Diffstat (limited to 'net')
144 files changed, 7532 insertions, 2227 deletions
diff --git a/net/Kconfig b/net/Kconfig index c2cdbce629bd..7b6cd340b72b 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -369,6 +369,7 @@ source "net/irda/Kconfig" source "net/bluetooth/Kconfig" source "net/rxrpc/Kconfig" source "net/kcm/Kconfig" +source "net/strparser/Kconfig" config FIB_RULES bool diff --git a/net/Makefile b/net/Makefile index 9bd20bb86cc6..4cafaa2b4667 100644 --- a/net/Makefile +++ b/net/Makefile @@ -35,6 +35,7 @@ obj-$(CONFIG_BT) += bluetooth/ obj-$(CONFIG_SUNRPC) += sunrpc/ obj-$(CONFIG_AF_RXRPC) += rxrpc/ obj-$(CONFIG_AF_KCM) += kcm/ +obj-$(CONFIG_STREAM_PARSER) += strparser/ obj-$(CONFIG_ATM) += atm/ obj-$(CONFIG_L2TP) += l2tp/ obj-$(CONFIG_DECNET) += decnet/ diff --git a/net/atm/mpc.c b/net/atm/mpc.c index 0e982222d425..3b3b1a292ec8 100644 --- a/net/atm/mpc.c +++ b/net/atm/mpc.c @@ -1007,7 +1007,7 @@ static int mpoa_event_listener(struct notifier_block *mpoa_notifier, if (!net_eq(dev_net(dev), &init_net)) return NOTIFY_DONE; - if (dev->name == NULL || strncmp(dev->name, "lec", 3)) + if (strncmp(dev->name, "lec", 3)) return NOTIFY_DONE; /* we are only interested in lec:s */ switch (event) { diff --git a/net/batman-adv/Kconfig b/net/batman-adv/Kconfig index 833bb145ba3c..f20742cbae6d 100644 --- a/net/batman-adv/Kconfig +++ b/net/batman-adv/Kconfig @@ -73,10 +73,21 @@ config BATMAN_ADV_MCAST reduce the air overhead while improving the reliability of multicast messages. -config BATMAN_ADV_DEBUG - bool "B.A.T.M.A.N. debugging" +config BATMAN_ADV_DEBUGFS + bool "batman-adv debugfs entries" depends on BATMAN_ADV depends on DEBUG_FS + default y + help + Enable this to export routing related debug tables via debugfs. + The information for each soft-interface and used hard-interface can be + found under batman_adv/ + + If unsure, say Y. + +config BATMAN_ADV_DEBUG + bool "B.A.T.M.A.N. debugging" + depends on BATMAN_ADV_DEBUGFS help This is an option for use by developers; most people should say N here. 
This enables compilation of support for diff --git a/net/batman-adv/Makefile b/net/batman-adv/Makefile index a83fc6c58d19..f724d3c98a81 100644 --- a/net/batman-adv/Makefile +++ b/net/batman-adv/Makefile @@ -24,14 +24,14 @@ batman-adv-$(CONFIG_BATMAN_ADV_BATMAN_V) += bat_v_elp.o batman-adv-$(CONFIG_BATMAN_ADV_BATMAN_V) += bat_v_ogm.o batman-adv-y += bitarray.o batman-adv-$(CONFIG_BATMAN_ADV_BLA) += bridge_loop_avoidance.o -batman-adv-$(CONFIG_DEBUG_FS) += debugfs.o +batman-adv-$(CONFIG_BATMAN_ADV_DEBUGFS) += debugfs.o batman-adv-$(CONFIG_BATMAN_ADV_DAT) += distributed-arp-table.o batman-adv-y += fragmentation.o batman-adv-y += gateway_client.o batman-adv-y += gateway_common.o batman-adv-y += hard-interface.o batman-adv-y += hash.o -batman-adv-y += icmp_socket.o +batman-adv-$(CONFIG_BATMAN_ADV_DEBUGFS) += icmp_socket.o batman-adv-$(CONFIG_BATMAN_ADV_DEBUG) += log.o batman-adv-y += main.o batman-adv-$(CONFIG_BATMAN_ADV_MCAST) += multicast.o diff --git a/net/batman-adv/bat_algo.c b/net/batman-adv/bat_algo.c index 81dbbf569bd4..623d04302aa2 100644 --- a/net/batman-adv/bat_algo.c +++ b/net/batman-adv/bat_algo.c @@ -20,12 +20,18 @@ #include <linux/errno.h> #include <linux/list.h> #include <linux/moduleparam.h> +#include <linux/netlink.h> #include <linux/printk.h> #include <linux/seq_file.h> +#include <linux/skbuff.h> #include <linux/stddef.h> #include <linux/string.h> +#include <net/genetlink.h> +#include <net/netlink.h> +#include <uapi/linux/batman_adv.h> #include "bat_algo.h" +#include "netlink.h" char batadv_routing_algo[20] = "BATMAN_IV"; static struct hlist_head batadv_algo_list; @@ -95,6 +101,7 @@ int batadv_algo_select(struct batadv_priv *bat_priv, char *name) return 0; } +#ifdef CONFIG_BATMAN_ADV_DEBUGFS int batadv_algo_seq_print_text(struct seq_file *seq, void *offset) { struct batadv_algo_ops *bat_algo_ops; @@ -107,6 +114,7 @@ int batadv_algo_seq_print_text(struct seq_file *seq, void *offset) return 0; } +#endif static int batadv_param_set_ra(const char *val, const struct kernel_param *kp) { @@ -138,3 +146,65 @@ static struct kparam_string batadv_param_string_ra = { module_param_cb(routing_algo, &batadv_param_ops_ra, &batadv_param_string_ra, 0644); + +/** + * batadv_algo_dump_entry - fill in information about one supported routing + * algorithm + * @msg: netlink message to be sent back + * @portid: Port to reply to + * @seq: Sequence number of message + * @bat_algo_ops: Algorithm to be dumped + * + * Return: Error number, or 0 on success + */ +static int batadv_algo_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, + struct batadv_algo_ops *bat_algo_ops) +{ + void *hdr; + + hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, + NLM_F_MULTI, BATADV_CMD_GET_ROUTING_ALGOS); + if (!hdr) + return -EMSGSIZE; + + if (nla_put_string(msg, BATADV_ATTR_ALGO_NAME, bat_algo_ops->name)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + return 0; + + nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +/** + * batadv_algo_dump - fill in information about supported routing + * algorithms + * @msg: netlink message to be sent back + * @cb: Parameters to the netlink request + * + * Return: Length of reply message. 
+ */ +int batadv_algo_dump(struct sk_buff *msg, struct netlink_callback *cb) +{ + int portid = NETLINK_CB(cb->skb).portid; + struct batadv_algo_ops *bat_algo_ops; + int skip = cb->args[0]; + int i = 0; + + hlist_for_each_entry(bat_algo_ops, &batadv_algo_list, list) { + if (i++ < skip) + continue; + + if (batadv_algo_dump_entry(msg, portid, cb->nlh->nlmsg_seq, + bat_algo_ops)) { + i--; + break; + } + } + + cb->args[0] = i; + + return msg->len; +} diff --git a/net/batman-adv/bat_algo.h b/net/batman-adv/bat_algo.h index 860d773dd8fa..3b5b69cdd12b 100644 --- a/net/batman-adv/bat_algo.h +++ b/net/batman-adv/bat_algo.h @@ -22,7 +22,9 @@ #include <linux/types.h> +struct netlink_callback; struct seq_file; +struct sk_buff; extern char batadv_routing_algo[]; extern struct list_head batadv_hardif_list; @@ -31,5 +33,6 @@ void batadv_algo_init(void); int batadv_algo_register(struct batadv_algo_ops *bat_algo_ops); int batadv_algo_select(struct batadv_priv *bat_priv, char *name); int batadv_algo_seq_print_text(struct seq_file *seq, void *offset); +int batadv_algo_dump(struct sk_buff *msg, struct netlink_callback *cb); #endif /* _NET_BATMAN_ADV_BAT_ALGO_H_ */ diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index 19b0abd6c640..e2d18d0b1f06 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -35,6 +35,7 @@ #include <linux/list.h> #include <linux/lockdep.h> #include <linux/netdevice.h> +#include <linux/netlink.h> #include <linux/pkt_sched.h> #include <linux/printk.h> #include <linux/random.h> @@ -48,12 +49,17 @@ #include <linux/string.h> #include <linux/types.h> #include <linux/workqueue.h> +#include <net/genetlink.h> +#include <net/netlink.h> +#include <uapi/linux/batman_adv.h> #include "bat_algo.h" #include "bitarray.h" +#include "gateway_client.h" #include "hard-interface.h" #include "hash.h" #include "log.h" +#include "netlink.h" #include "network-coding.h" #include "originator.h" #include "packet.h" @@ -318,17 +324,18 @@ batadv_iv_ogm_orig_get(struct batadv_priv *bat_priv, const u8 *addr) if (!orig_node->bat_iv.bcast_own_sum) goto free_orig_node; + kref_get(&orig_node->refcount); hash_added = batadv_hash_add(bat_priv->orig_hash, batadv_compare_orig, batadv_choose_orig, orig_node, &orig_node->hash_entry); if (hash_added != 0) - goto free_orig_node; + goto free_orig_node_hash; return orig_node; -free_orig_node: - /* free twice, as batadv_orig_node_new sets refcount to 2 */ +free_orig_node_hash: batadv_orig_node_put(orig_node); +free_orig_node: batadv_orig_node_put(orig_node); return NULL; @@ -528,36 +535,25 @@ static void batadv_iv_ogm_send_to_if(struct batadv_forw_packet *forw_packet, static void batadv_iv_ogm_emit(struct batadv_forw_packet *forw_packet) { struct net_device *soft_iface; - struct batadv_priv *bat_priv; - struct batadv_hard_iface *primary_if = NULL; if (!forw_packet->if_incoming) { pr_err("Error - can't forward packet: incoming iface not specified\n"); - goto out; + return; } soft_iface = forw_packet->if_incoming->soft_iface; - bat_priv = netdev_priv(soft_iface); if (WARN_ON(!forw_packet->if_outgoing)) - goto out; + return; if (WARN_ON(forw_packet->if_outgoing->soft_iface != soft_iface)) - goto out; + return; if (forw_packet->if_incoming->if_status != BATADV_IF_ACTIVE) - goto out; - - primary_if = batadv_primary_if_get_selected(bat_priv); - if (!primary_if) - goto out; + return; /* only for one specific outgoing interface */ batadv_iv_ogm_send_to_if(forw_packet, forw_packet->if_outgoing); - -out: - if (primary_if) - 
batadv_hardif_put(primary_if); } /** @@ -685,19 +681,12 @@ static void batadv_iv_ogm_aggregate_new(const unsigned char *packet_buff, struct batadv_forw_packet *forw_packet_aggr; unsigned char *skb_buff; unsigned int skb_size; + atomic_t *queue_left = own_packet ? NULL : &bat_priv->batman_queue_left; - /* own packet should always be scheduled */ - if (!own_packet) { - if (!batadv_atomic_dec_not_zero(&bat_priv->batman_queue_left)) { - batadv_dbg(BATADV_DBG_BATMAN, bat_priv, - "batman packet queue full\n"); - return; - } - } - - forw_packet_aggr = kmalloc(sizeof(*forw_packet_aggr), GFP_ATOMIC); + forw_packet_aggr = batadv_forw_packet_alloc(if_incoming, if_outgoing, + queue_left, bat_priv); if (!forw_packet_aggr) - goto out_nomem; + return; if (atomic_read(&bat_priv->aggregated_ogms) && packet_len < BATADV_MAX_AGGREGATION_BYTES) @@ -708,8 +697,11 @@ static void batadv_iv_ogm_aggregate_new(const unsigned char *packet_buff, skb_size += ETH_HLEN; forw_packet_aggr->skb = netdev_alloc_skb_ip_align(NULL, skb_size); - if (!forw_packet_aggr->skb) - goto out_free_forw_packet; + if (!forw_packet_aggr->skb) { + batadv_forw_packet_free(forw_packet_aggr); + return; + } + forw_packet_aggr->skb->priority = TC_PRIO_CONTROL; skb_reserve(forw_packet_aggr->skb, ETH_HLEN); @@ -717,12 +709,7 @@ static void batadv_iv_ogm_aggregate_new(const unsigned char *packet_buff, forw_packet_aggr->packet_len = packet_len; memcpy(skb_buff, packet_buff, packet_len); - kref_get(&if_incoming->refcount); - kref_get(&if_outgoing->refcount); forw_packet_aggr->own = own_packet; - forw_packet_aggr->if_incoming = if_incoming; - forw_packet_aggr->if_outgoing = if_outgoing; - forw_packet_aggr->num_packets = 0; forw_packet_aggr->direct_link_flags = BATADV_NO_FLAGS; forw_packet_aggr->send_time = send_time; @@ -741,13 +728,6 @@ static void batadv_iv_ogm_aggregate_new(const unsigned char *packet_buff, queue_delayed_work(batadv_event_workqueue, &forw_packet_aggr->delayed_work, send_time - jiffies); - - return; -out_free_forw_packet: - kfree(forw_packet_aggr); -out_nomem: - if (!own_packet) - atomic_inc(&bat_priv->batman_queue_left); } /* aggregate a new packet into the existing ogm packet */ @@ -1830,10 +1810,6 @@ static void batadv_iv_send_outstanding_bat_ogm_packet(struct work_struct *work) batadv_iv_ogm_schedule(forw_packet->if_incoming); out: - /* don't count own packet */ - if (!forw_packet->own) - atomic_inc(&bat_priv->batman_queue_left); - batadv_forw_packet_free(forw_packet); } @@ -1879,6 +1855,7 @@ static int batadv_iv_ogm_receive(struct sk_buff *skb, return NET_RX_SUCCESS; } +#ifdef CONFIG_BATMAN_ADV_DEBUGFS /** * batadv_iv_ogm_orig_print_neigh - print neighbors for the originator table * @orig_node: the orig_node for which the neighbors are printed @@ -1976,8 +1953,239 @@ next: if (batman_count == 0) seq_puts(seq, "No batman nodes in range ...\n"); } +#endif + +/** + * batadv_iv_ogm_neigh_get_tq_avg - Get the TQ average for a neighbour on a + * given outgoing interface. + * @neigh_node: Neighbour of interest + * @if_outgoing: Outgoing interface of interest + * @tq_avg: Pointer of where to store the TQ average + * + * Return: False if no average TQ available, otherwise true. 
+ */ +static bool +batadv_iv_ogm_neigh_get_tq_avg(struct batadv_neigh_node *neigh_node, + struct batadv_hard_iface *if_outgoing, + u8 *tq_avg) +{ + struct batadv_neigh_ifinfo *n_ifinfo; + + n_ifinfo = batadv_neigh_ifinfo_get(neigh_node, if_outgoing); + if (!n_ifinfo) + return false; + + *tq_avg = n_ifinfo->bat_iv.tq_avg; + batadv_neigh_ifinfo_put(n_ifinfo); + + return true; +} + +/** + * batadv_iv_ogm_orig_dump_subentry - Dump an originator subentry into a + * message + * @msg: Netlink message to dump into + * @portid: Port making netlink request + * @seq: Sequence number of netlink message + * @bat_priv: The bat priv with all the soft interface information + * @if_outgoing: Limit dump to entries with this outgoing interface + * @orig_node: Originator to dump + * @neigh_node: Single hops neighbour + * @best: Is the best originator + * + * Return: Error code, or 0 on success + */ +static int +batadv_iv_ogm_orig_dump_subentry(struct sk_buff *msg, u32 portid, u32 seq, + struct batadv_priv *bat_priv, + struct batadv_hard_iface *if_outgoing, + struct batadv_orig_node *orig_node, + struct batadv_neigh_node *neigh_node, + bool best) +{ + void *hdr; + u8 tq_avg; + unsigned int last_seen_msecs; + + last_seen_msecs = jiffies_to_msecs(jiffies - orig_node->last_seen); + + if (!batadv_iv_ogm_neigh_get_tq_avg(neigh_node, if_outgoing, &tq_avg)) + return 0; + + if (if_outgoing != BATADV_IF_DEFAULT && + if_outgoing != neigh_node->if_incoming) + return 0; + + hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, + NLM_F_MULTI, BATADV_CMD_GET_ORIGINATORS); + if (!hdr) + return -ENOBUFS; + + if (nla_put(msg, BATADV_ATTR_ORIG_ADDRESS, ETH_ALEN, + orig_node->orig) || + nla_put(msg, BATADV_ATTR_NEIGH_ADDRESS, ETH_ALEN, + neigh_node->addr) || + nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX, + neigh_node->if_incoming->net_dev->ifindex) || + nla_put_u8(msg, BATADV_ATTR_TQ, tq_avg) || + nla_put_u32(msg, BATADV_ATTR_LAST_SEEN_MSECS, + last_seen_msecs)) + goto nla_put_failure; + + if (best && nla_put_flag(msg, BATADV_ATTR_FLAG_BEST)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + return 0; + + nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +/** + * batadv_iv_ogm_orig_dump_entry - Dump an originator entry into a message + * @msg: Netlink message to dump into + * @portid: Port making netlink request + * @seq: Sequence number of netlink message + * @bat_priv: The bat priv with all the soft interface information + * @if_outgoing: Limit dump to entries with this outgoing interface + * @orig_node: Originator to dump + * @sub_s: Number of sub entries to skip + * + * This function assumes the caller holds rcu_read_lock(). 
+ * + * Return: Error code, or 0 on success + */ +static int +batadv_iv_ogm_orig_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, + struct batadv_priv *bat_priv, + struct batadv_hard_iface *if_outgoing, + struct batadv_orig_node *orig_node, int *sub_s) +{ + struct batadv_neigh_node *neigh_node_best; + struct batadv_neigh_node *neigh_node; + int sub = 0; + bool best; + u8 tq_avg_best; + + neigh_node_best = batadv_orig_router_get(orig_node, if_outgoing); + if (!neigh_node_best) + goto out; + + if (!batadv_iv_ogm_neigh_get_tq_avg(neigh_node_best, if_outgoing, + &tq_avg_best)) + goto out; + + if (tq_avg_best == 0) + goto out; + + hlist_for_each_entry_rcu(neigh_node, &orig_node->neigh_list, list) { + if (sub++ < *sub_s) + continue; + + best = (neigh_node == neigh_node_best); + + if (batadv_iv_ogm_orig_dump_subentry(msg, portid, seq, + bat_priv, if_outgoing, + orig_node, neigh_node, + best)) { + batadv_neigh_node_put(neigh_node_best); + + *sub_s = sub - 1; + return -EMSGSIZE; + } + } + + out: + if (neigh_node_best) + batadv_neigh_node_put(neigh_node_best); + + *sub_s = 0; + return 0; +} /** + * batadv_iv_ogm_orig_dump_bucket - Dump an originator bucket into a + * message + * @msg: Netlink message to dump into + * @portid: Port making netlink request + * @seq: Sequence number of netlink message + * @bat_priv: The bat priv with all the soft interface information + * @if_outgoing: Limit dump to entries with this outgoing interface + * @head: Bucket to be dumped + * @idx_s: Number of entries to be skipped + * @sub: Number of sub entries to be skipped + * + * Return: Error code, or 0 on success + */ +static int +batadv_iv_ogm_orig_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq, + struct batadv_priv *bat_priv, + struct batadv_hard_iface *if_outgoing, + struct hlist_head *head, int *idx_s, int *sub) +{ + struct batadv_orig_node *orig_node; + int idx = 0; + + rcu_read_lock(); + hlist_for_each_entry_rcu(orig_node, head, hash_entry) { + if (idx++ < *idx_s) + continue; + + if (batadv_iv_ogm_orig_dump_entry(msg, portid, seq, bat_priv, + if_outgoing, orig_node, + sub)) { + rcu_read_unlock(); + *idx_s = idx - 1; + return -EMSGSIZE; + } + } + rcu_read_unlock(); + + *idx_s = 0; + *sub = 0; + return 0; +} + +/** + * batadv_iv_ogm_orig_dump - Dump the originators into a message + * @msg: Netlink message to dump into + * @cb: Control block containing additional options + * @bat_priv: The bat priv with all the soft interface information + * @if_outgoing: Limit dump to entries with this outgoing interface + */ +static void +batadv_iv_ogm_orig_dump(struct sk_buff *msg, struct netlink_callback *cb, + struct batadv_priv *bat_priv, + struct batadv_hard_iface *if_outgoing) +{ + struct batadv_hashtable *hash = bat_priv->orig_hash; + struct hlist_head *head; + int bucket = cb->args[0]; + int idx = cb->args[1]; + int sub = cb->args[2]; + int portid = NETLINK_CB(cb->skb).portid; + + while (bucket < hash->size) { + head = &hash->table[bucket]; + + if (batadv_iv_ogm_orig_dump_bucket(msg, portid, + cb->nlh->nlmsg_seq, + bat_priv, if_outgoing, head, + &idx, &sub)) + break; + + bucket++; + } + + cb->args[0] = bucket; + cb->args[1] = idx; + cb->args[2] = sub; +} + +#ifdef CONFIG_BATMAN_ADV_DEBUGFS +/** * batadv_iv_hardif_neigh_print - print a single hop neighbour node * @seq: neighbour table seq_file struct * @hardif_neigh: hardif neighbour information @@ -2027,37 +2235,43 @@ static void batadv_iv_neigh_print(struct batadv_priv *bat_priv, if (batman_count == 0) seq_puts(seq, "No batman nodes in range ...\n"); } +#endif 
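/* [Example] The dump callbacks added in this patch all follow the standard
 * generic netlink dump/resume pattern shown verbatim in batadv_algo_dump()
 * and batadv_iv_ogm_orig_dump() above: a dump may need several skbs, so the
 * walker parks its position in cb->args[] and, on the next call, skips the
 * entries it has already emitted. Below is a minimal self-contained sketch
 * of that pattern; the my_* names are hypothetical and not part of
 * batman-adv.
 */
#include <linux/list.h>
#include <net/genetlink.h>
#include <net/netlink.h>

struct my_entry {
	struct list_head list;
	u32 val;
};

static LIST_HEAD(my_list);			/* hypothetical table to dump */
extern struct genl_family my_genl_family;	/* hypothetical genl family */

enum {
	MY_ATTR_UNSPEC,
	MY_ATTR_VAL,				/* u32 */
};

#define MY_CMD_GET_ENTRY 1			/* hypothetical genl command */

static int my_genl_dump(struct sk_buff *msg, struct netlink_callback *cb)
{
	int portid = NETLINK_CB(cb->skb).portid;
	int skip = cb->args[0];		/* entries emitted by earlier calls */
	struct my_entry *e;
	void *hdr;
	int i = 0;

	list_for_each_entry(e, &my_list, list) {
		if (i++ < skip)
			continue;	/* already sent in a previous skb */

		hdr = genlmsg_put(msg, portid, cb->nlh->nlmsg_seq,
				  &my_genl_family, NLM_F_MULTI,
				  MY_CMD_GET_ENTRY);
		if (!hdr) {
			i--;		/* skb full: retry this entry later */
			break;
		}

		if (nla_put_u32(msg, MY_ATTR_VAL, e->val)) {
			genlmsg_cancel(msg, hdr);
			i--;
			break;
		}

		genlmsg_end(msg, hdr);
	}

	/* resume point for the next call; a call that adds nothing returns
	 * msg->len == 0, which terminates the dump
	 */
	cb->args[0] = i;

	return msg->len;
}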
/** - * batadv_iv_ogm_neigh_cmp - compare the metrics of two neighbors + * batadv_iv_ogm_neigh_diff - calculate tq difference of two neighbors * @neigh1: the first neighbor object of the comparison * @if_outgoing1: outgoing interface for the first neighbor * @neigh2: the second neighbor object of the comparison * @if_outgoing2: outgoing interface for the second neighbor + * @diff: pointer to integer receiving the calculated difference * - * Return: a value less, equal to or greater than 0 if the metric via neigh1 is - * lower, the same as or higher than the metric via neigh2 + * The content of *@diff is only valid when this function returns true. + * It is less, equal to or greater than 0 if the metric via neigh1 is lower, + * the same as or higher than the metric via neigh2 + * + * Return: true when the difference could be calculated, false otherwise */ -static int batadv_iv_ogm_neigh_cmp(struct batadv_neigh_node *neigh1, - struct batadv_hard_iface *if_outgoing1, - struct batadv_neigh_node *neigh2, - struct batadv_hard_iface *if_outgoing2) +static bool batadv_iv_ogm_neigh_diff(struct batadv_neigh_node *neigh1, + struct batadv_hard_iface *if_outgoing1, + struct batadv_neigh_node *neigh2, + struct batadv_hard_iface *if_outgoing2, + int *diff) { struct batadv_neigh_ifinfo *neigh1_ifinfo, *neigh2_ifinfo; u8 tq1, tq2; - int diff; + bool ret = true; neigh1_ifinfo = batadv_neigh_ifinfo_get(neigh1, if_outgoing1); neigh2_ifinfo = batadv_neigh_ifinfo_get(neigh2, if_outgoing2); if (!neigh1_ifinfo || !neigh2_ifinfo) { - diff = 0; + ret = false; goto out; } tq1 = neigh1_ifinfo->bat_iv.tq_avg; tq2 = neigh2_ifinfo->bat_iv.tq_avg; - diff = tq1 - tq2; + *diff = (int)tq1 - (int)tq2; out: if (neigh1_ifinfo) @@ -2065,6 +2279,162 @@ out: if (neigh2_ifinfo) batadv_neigh_ifinfo_put(neigh2_ifinfo); + return ret; +} + +/** + * batadv_iv_ogm_neigh_dump_neigh - Dump a neighbour into a netlink message + * @msg: Netlink message to dump into + * @portid: Port making netlink request + * @seq: Sequence number of netlink message + * @hardif_neigh: Neighbour to be dumped + * + * Return: Error code, or 0 on success + */ +static int +batadv_iv_ogm_neigh_dump_neigh(struct sk_buff *msg, u32 portid, u32 seq, + struct batadv_hardif_neigh_node *hardif_neigh) +{ + void *hdr; + unsigned int last_seen_msecs; + + last_seen_msecs = jiffies_to_msecs(jiffies - hardif_neigh->last_seen); + + hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, + NLM_F_MULTI, BATADV_CMD_GET_NEIGHBORS); + if (!hdr) + return -ENOBUFS; + + if (nla_put(msg, BATADV_ATTR_NEIGH_ADDRESS, ETH_ALEN, + hardif_neigh->addr) || + nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX, + hardif_neigh->if_incoming->net_dev->ifindex) || + nla_put_u32(msg, BATADV_ATTR_LAST_SEEN_MSECS, + last_seen_msecs)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + return 0; + + nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +/** + * batadv_iv_ogm_neigh_dump_hardif - Dump the neighbours of a hard interface + * into a message + * @msg: Netlink message to dump into + * @portid: Port making netlink request + * @seq: Sequence number of netlink message + * @bat_priv: The bat priv with all the soft interface information + * @hard_iface: Hard interface to dump the neighbours for + * @idx_s: Number of entries to skip + * + * This function assumes the caller holds rcu_read_lock(). 
+ * + * Return: Error code, or 0 on success + */ +static int +batadv_iv_ogm_neigh_dump_hardif(struct sk_buff *msg, u32 portid, u32 seq, + struct batadv_priv *bat_priv, + struct batadv_hard_iface *hard_iface, + int *idx_s) +{ + struct batadv_hardif_neigh_node *hardif_neigh; + int idx = 0; + + hlist_for_each_entry_rcu(hardif_neigh, + &hard_iface->neigh_list, list) { + if (idx++ < *idx_s) + continue; + + if (batadv_iv_ogm_neigh_dump_neigh(msg, portid, seq, + hardif_neigh)) { + *idx_s = idx - 1; + return -EMSGSIZE; + } + } + + *idx_s = 0; + return 0; +} + +/** + * batadv_iv_ogm_neigh_dump - Dump the neighbours into a message + * @msg: Netlink message to dump into + * @cb: Control block containing additional options + * @bat_priv: The bat priv with all the soft interface information + * @single_hardif: Limit dump to this hard interface + */ +static void +batadv_iv_ogm_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb, + struct batadv_priv *bat_priv, + struct batadv_hard_iface *single_hardif) +{ + struct batadv_hard_iface *hard_iface; + int i_hardif = 0; + int i_hardif_s = cb->args[0]; + int idx = cb->args[1]; + int portid = NETLINK_CB(cb->skb).portid; + + rcu_read_lock(); + if (single_hardif) { + if (i_hardif_s == 0) { + if (batadv_iv_ogm_neigh_dump_hardif(msg, portid, + cb->nlh->nlmsg_seq, + bat_priv, + single_hardif, + &idx) == 0) + i_hardif++; + } + } else { + list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, + list) { + if (hard_iface->soft_iface != bat_priv->soft_iface) + continue; + + if (i_hardif++ < i_hardif_s) + continue; + + if (batadv_iv_ogm_neigh_dump_hardif(msg, portid, + cb->nlh->nlmsg_seq, + bat_priv, + hard_iface, &idx)) { + i_hardif--; + break; + } + } + } + rcu_read_unlock(); + + cb->args[0] = i_hardif; + cb->args[1] = idx; +} + +/** + * batadv_iv_ogm_neigh_cmp - compare the metrics of two neighbors + * @neigh1: the first neighbor object of the comparison + * @if_outgoing1: outgoing interface for the first neighbor + * @neigh2: the second neighbor object of the comparison + * @if_outgoing2: outgoing interface for the second neighbor + * + * Return: a value less, equal to or greater than 0 if the metric via neigh1 is + * lower, the same as or higher than the metric via neigh2 + */ +static int batadv_iv_ogm_neigh_cmp(struct batadv_neigh_node *neigh1, + struct batadv_hard_iface *if_outgoing1, + struct batadv_neigh_node *neigh2, + struct batadv_hard_iface *if_outgoing2) +{ + bool ret; + int diff; + + ret = batadv_iv_ogm_neigh_diff(neigh1, if_outgoing1, neigh2, + if_outgoing2, &diff); + if (!ret) + return 0; + return diff; } @@ -2085,36 +2455,341 @@ batadv_iv_ogm_neigh_is_sob(struct batadv_neigh_node *neigh1, struct batadv_neigh_node *neigh2, struct batadv_hard_iface *if_outgoing2) { - struct batadv_neigh_ifinfo *neigh1_ifinfo, *neigh2_ifinfo; - u8 tq1, tq2; bool ret; + int diff; - neigh1_ifinfo = batadv_neigh_ifinfo_get(neigh1, if_outgoing1); - neigh2_ifinfo = batadv_neigh_ifinfo_get(neigh2, if_outgoing2); + ret = batadv_iv_ogm_neigh_diff(neigh1, if_outgoing1, neigh2, + if_outgoing2, &diff); + if (!ret) + return false; - /* we can't say that the metric is better */ - if (!neigh1_ifinfo || !neigh2_ifinfo) { - ret = false; + ret = diff > -BATADV_TQ_SIMILARITY_THRESHOLD; + return ret; +} + +static void batadv_iv_iface_activate(struct batadv_hard_iface *hard_iface) +{ + /* begin scheduling originator messages on that interface */ + batadv_iv_ogm_schedule(hard_iface); +} + +static struct batadv_gw_node * +batadv_iv_gw_get_best_gw_node(struct batadv_priv *bat_priv) +{
+ struct batadv_neigh_node *router; + struct batadv_neigh_ifinfo *router_ifinfo; + struct batadv_gw_node *gw_node, *curr_gw = NULL; + u64 max_gw_factor = 0; + u64 tmp_gw_factor = 0; + u8 max_tq = 0; + u8 tq_avg; + struct batadv_orig_node *orig_node; + + rcu_read_lock(); + hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.list, list) { + orig_node = gw_node->orig_node; + router = batadv_orig_router_get(orig_node, BATADV_IF_DEFAULT); + if (!router) + continue; + + router_ifinfo = batadv_neigh_ifinfo_get(router, + BATADV_IF_DEFAULT); + if (!router_ifinfo) + goto next; + + if (!kref_get_unless_zero(&gw_node->refcount)) + goto next; + + tq_avg = router_ifinfo->bat_iv.tq_avg; + + switch (atomic_read(&bat_priv->gw.sel_class)) { + case 1: /* fast connection */ + tmp_gw_factor = tq_avg * tq_avg; + tmp_gw_factor *= gw_node->bandwidth_down; + tmp_gw_factor *= 100 * 100; + tmp_gw_factor >>= 18; + + if ((tmp_gw_factor > max_gw_factor) || + ((tmp_gw_factor == max_gw_factor) && + (tq_avg > max_tq))) { + if (curr_gw) + batadv_gw_node_put(curr_gw); + curr_gw = gw_node; + kref_get(&curr_gw->refcount); + } + break; + + default: /* 2: stable connection (use best statistic) + * 3: fast-switch (use best statistic but change as + * soon as a better gateway appears) + * XX: late-switch (use best statistic but change as + * soon as a better gateway appears which has + * $routing_class more tq points) + */ + if (tq_avg > max_tq) { + if (curr_gw) + batadv_gw_node_put(curr_gw); + curr_gw = gw_node; + kref_get(&curr_gw->refcount); + } + break; + } + + if (tq_avg > max_tq) + max_tq = tq_avg; + + if (tmp_gw_factor > max_gw_factor) + max_gw_factor = tmp_gw_factor; + + batadv_gw_node_put(gw_node); + +next: + batadv_neigh_node_put(router); + if (router_ifinfo) + batadv_neigh_ifinfo_put(router_ifinfo); + } + rcu_read_unlock(); + + return curr_gw; +} + +static bool batadv_iv_gw_is_eligible(struct batadv_priv *bat_priv, + struct batadv_orig_node *curr_gw_orig, + struct batadv_orig_node *orig_node) +{ + struct batadv_neigh_ifinfo *router_orig_ifinfo = NULL; + struct batadv_neigh_ifinfo *router_gw_ifinfo = NULL; + struct batadv_neigh_node *router_gw = NULL; + struct batadv_neigh_node *router_orig = NULL; + u8 gw_tq_avg, orig_tq_avg; + bool ret = false; + + /* dynamic re-election is performed only on fast or late switch */ + if (atomic_read(&bat_priv->gw.sel_class) <= 2) + return false; + + router_gw = batadv_orig_router_get(curr_gw_orig, BATADV_IF_DEFAULT); + if (!router_gw) { + ret = true; goto out; } - tq1 = neigh1_ifinfo->bat_iv.tq_avg; - tq2 = neigh2_ifinfo->bat_iv.tq_avg; - ret = (tq1 - tq2) > -BATADV_TQ_SIMILARITY_THRESHOLD; + router_gw_ifinfo = batadv_neigh_ifinfo_get(router_gw, + BATADV_IF_DEFAULT); + if (!router_gw_ifinfo) { + ret = true; + goto out; + } + + router_orig = batadv_orig_router_get(orig_node, BATADV_IF_DEFAULT); + if (!router_orig) + goto out; + + router_orig_ifinfo = batadv_neigh_ifinfo_get(router_orig, + BATADV_IF_DEFAULT); + if (!router_orig_ifinfo) + goto out; + + gw_tq_avg = router_gw_ifinfo->bat_iv.tq_avg; + orig_tq_avg = router_orig_ifinfo->bat_iv.tq_avg; + + /* the TQ value has to be better */ + if (orig_tq_avg < gw_tq_avg) + goto out; + /* if the routing class is greater than 3 the value tells us how much + * greater the TQ value of the new gateway must be + */ + if ((atomic_read(&bat_priv->gw.sel_class) > 3) && + (orig_tq_avg - gw_tq_avg < atomic_read(&bat_priv->gw.sel_class))) + goto out; + + batadv_dbg(BATADV_DBG_BATMAN, bat_priv, + "Restarting gateway selection: better gateway found (tq curr: 
%i, tq new: %i)\n", + gw_tq_avg, orig_tq_avg); + + ret = true; out: - if (neigh1_ifinfo) - batadv_neigh_ifinfo_put(neigh1_ifinfo); - if (neigh2_ifinfo) - batadv_neigh_ifinfo_put(neigh2_ifinfo); + if (router_gw_ifinfo) + batadv_neigh_ifinfo_put(router_gw_ifinfo); + if (router_orig_ifinfo) + batadv_neigh_ifinfo_put(router_orig_ifinfo); + if (router_gw) + batadv_neigh_node_put(router_gw); + if (router_orig) + batadv_neigh_node_put(router_orig); return ret; } -static void batadv_iv_iface_activate(struct batadv_hard_iface *hard_iface) +#ifdef CONFIG_BATMAN_ADV_DEBUGFS +/* fails if orig_node has no router */ +static int batadv_iv_gw_write_buffer_text(struct batadv_priv *bat_priv, + struct seq_file *seq, + const struct batadv_gw_node *gw_node) { - /* begin scheduling originator messages on that interface */ - batadv_iv_ogm_schedule(hard_iface); + struct batadv_gw_node *curr_gw; + struct batadv_neigh_node *router; + struct batadv_neigh_ifinfo *router_ifinfo = NULL; + int ret = -1; + + router = batadv_orig_router_get(gw_node->orig_node, BATADV_IF_DEFAULT); + if (!router) + goto out; + + router_ifinfo = batadv_neigh_ifinfo_get(router, BATADV_IF_DEFAULT); + if (!router_ifinfo) + goto out; + + curr_gw = batadv_gw_get_selected_gw_node(bat_priv); + + seq_printf(seq, "%s %pM (%3i) %pM [%10s]: %u.%u/%u.%u MBit\n", + (curr_gw == gw_node ? "=>" : " "), + gw_node->orig_node->orig, + router_ifinfo->bat_iv.tq_avg, router->addr, + router->if_incoming->net_dev->name, + gw_node->bandwidth_down / 10, + gw_node->bandwidth_down % 10, + gw_node->bandwidth_up / 10, + gw_node->bandwidth_up % 10); + ret = seq_has_overflowed(seq) ? -1 : 0; + + if (curr_gw) + batadv_gw_node_put(curr_gw); +out: + if (router_ifinfo) + batadv_neigh_ifinfo_put(router_ifinfo); + if (router) + batadv_neigh_node_put(router); + return ret; +} + +static void batadv_iv_gw_print(struct batadv_priv *bat_priv, + struct seq_file *seq) +{ + struct batadv_gw_node *gw_node; + int gw_count = 0; + + seq_puts(seq, + " Gateway (#/255) Nexthop [outgoingIF]: advertised uplink bandwidth\n"); + + rcu_read_lock(); + hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.list, list) { + /* fails if orig_node has no router */ + if (batadv_iv_gw_write_buffer_text(bat_priv, seq, gw_node) < 0) + continue; + + gw_count++; + } + rcu_read_unlock(); + + if (gw_count == 0) + seq_puts(seq, "No gateways in range ...\n"); +} +#endif + +/** + * batadv_iv_gw_dump_entry - Dump a gateway into a message + * @msg: Netlink message to dump into + * @portid: Port making netlink request + * @seq: Sequence number of netlink message + * @bat_priv: The bat priv with all the soft interface information + * @gw_node: Gateway to be dumped + * + * Return: Error code, or 0 on success + */ +static int batadv_iv_gw_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, + struct batadv_priv *bat_priv, + struct batadv_gw_node *gw_node) +{ + struct batadv_neigh_ifinfo *router_ifinfo = NULL; + struct batadv_neigh_node *router; + struct batadv_gw_node *curr_gw; + int ret = -EINVAL; + void *hdr; + + router = batadv_orig_router_get(gw_node->orig_node, BATADV_IF_DEFAULT); + if (!router) + goto out; + + router_ifinfo = batadv_neigh_ifinfo_get(router, BATADV_IF_DEFAULT); + if (!router_ifinfo) + goto out; + + curr_gw = batadv_gw_get_selected_gw_node(bat_priv); + + hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, + NLM_F_MULTI, BATADV_CMD_GET_GATEWAYS); + if (!hdr) { + ret = -ENOBUFS; + goto out; + } + + ret = -EMSGSIZE; + + if (curr_gw == gw_node) + if (nla_put_flag(msg, BATADV_ATTR_FLAG_BEST)) { + 
genlmsg_cancel(msg, hdr); + goto out; + } + + if (nla_put(msg, BATADV_ATTR_ORIG_ADDRESS, ETH_ALEN, + gw_node->orig_node->orig) || + nla_put_u8(msg, BATADV_ATTR_TQ, router_ifinfo->bat_iv.tq_avg) || + nla_put(msg, BATADV_ATTR_ROUTER, ETH_ALEN, + router->addr) || + nla_put_string(msg, BATADV_ATTR_HARD_IFNAME, + router->if_incoming->net_dev->name) || + nla_put_u32(msg, BATADV_ATTR_BANDWIDTH_DOWN, + gw_node->bandwidth_down) || + nla_put_u32(msg, BATADV_ATTR_BANDWIDTH_UP, + gw_node->bandwidth_up)) { + genlmsg_cancel(msg, hdr); + goto out; + } + + genlmsg_end(msg, hdr); + ret = 0; + +out: + if (router_ifinfo) + batadv_neigh_ifinfo_put(router_ifinfo); + if (router) + batadv_neigh_node_put(router); + return ret; +} + +/** + * batadv_iv_gw_dump - Dump gateways into a message + * @msg: Netlink message to dump into + * @cb: Control block containing additional options + * @bat_priv: The bat priv with all the soft interface information + */ +static void batadv_iv_gw_dump(struct sk_buff *msg, struct netlink_callback *cb, + struct batadv_priv *bat_priv) +{ + int portid = NETLINK_CB(cb->skb).portid; + struct batadv_gw_node *gw_node; + int idx_skip = cb->args[0]; + int idx = 0; + + rcu_read_lock(); + hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.list, list) { + if (idx++ < idx_skip) + continue; + + if (batadv_iv_gw_dump_entry(msg, portid, cb->nlh->nlmsg_seq, + bat_priv, gw_node)) { + idx_skip = idx - 1; + goto unlock; + } + } + + idx_skip = idx; +unlock: + rcu_read_unlock(); + + cb->args[0] = idx_skip; } static struct batadv_algo_ops batadv_batman_iv __read_mostly = { @@ -2129,14 +2804,28 @@ static struct batadv_algo_ops batadv_batman_iv __read_mostly = { .neigh = { .cmp = batadv_iv_ogm_neigh_cmp, .is_similar_or_better = batadv_iv_ogm_neigh_is_sob, +#ifdef CONFIG_BATMAN_ADV_DEBUGFS .print = batadv_iv_neigh_print, +#endif + .dump = batadv_iv_ogm_neigh_dump, }, .orig = { +#ifdef CONFIG_BATMAN_ADV_DEBUGFS .print = batadv_iv_ogm_orig_print, +#endif + .dump = batadv_iv_ogm_orig_dump, .free = batadv_iv_ogm_orig_free, .add_if = batadv_iv_ogm_orig_add_if, .del_if = batadv_iv_ogm_orig_del_if, }, + .gw = { + .get_best_gw_node = batadv_iv_gw_get_best_gw_node, + .is_eligible = batadv_iv_gw_is_eligible, +#ifdef CONFIG_BATMAN_ADV_DEBUGFS + .print = batadv_iv_gw_print, +#endif + .dump = batadv_iv_gw_dump, + }, }; int __init batadv_iv_init(void) diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c index 0366cbf5e444..e79f6f01182e 100644 --- a/net/batman-adv/bat_v.c +++ b/net/batman-adv/bat_v.c @@ -21,24 +21,38 @@ #include <linux/atomic.h> #include <linux/bug.h> #include <linux/cache.h> +#include <linux/errno.h> +#include <linux/if_ether.h> #include <linux/init.h> #include <linux/jiffies.h> +#include <linux/kernel.h> +#include <linux/kref.h> #include <linux/netdevice.h> +#include <linux/netlink.h> #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/seq_file.h> #include <linux/stddef.h> #include <linux/types.h> #include <linux/workqueue.h> +#include <net/genetlink.h> +#include <net/netlink.h> +#include <uapi/linux/batman_adv.h> #include "bat_algo.h" #include "bat_v_elp.h" #include "bat_v_ogm.h" +#include "gateway_client.h" +#include "gateway_common.h" #include "hard-interface.h" #include "hash.h" +#include "log.h" +#include "netlink.h" #include "originator.h" #include "packet.h" +struct sk_buff; + static void batadv_v_iface_activate(struct batadv_hard_iface *hard_iface) { struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); @@ -115,6 +129,7 @@ batadv_v_hardif_neigh_init(struct 
batadv_hardif_neigh_node *hardif_neigh) batadv_v_elp_throughput_metric_update); } +#ifdef CONFIG_BATMAN_ADV_DEBUGFS /** * batadv_v_orig_print_neigh - print neighbors for the originator table * @orig_node: the orig_node for which the neighbors are printed @@ -198,8 +213,142 @@ static void batadv_v_neigh_print(struct batadv_priv *bat_priv, if (batman_count == 0) seq_puts(seq, "No batman nodes in range ...\n"); } +#endif + +/** + * batadv_v_neigh_dump_neigh - Dump a neighbour into a message + * @msg: Netlink message to dump into + * @portid: Port making netlink request + * @seq: Sequence number of netlink message + * @hardif_neigh: Neighbour to dump + * + * Return: Error code, or 0 on success + */ +static int +batadv_v_neigh_dump_neigh(struct sk_buff *msg, u32 portid, u32 seq, + struct batadv_hardif_neigh_node *hardif_neigh) +{ + void *hdr; + unsigned int last_seen_msecs; + u32 throughput; + + last_seen_msecs = jiffies_to_msecs(jiffies - hardif_neigh->last_seen); + throughput = ewma_throughput_read(&hardif_neigh->bat_v.throughput); + throughput = throughput * 100; + + hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, NLM_F_MULTI, + BATADV_CMD_GET_NEIGHBORS); + if (!hdr) + return -ENOBUFS; + + if (nla_put(msg, BATADV_ATTR_NEIGH_ADDRESS, ETH_ALEN, + hardif_neigh->addr) || + nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX, + hardif_neigh->if_incoming->net_dev->ifindex) || + nla_put_u32(msg, BATADV_ATTR_LAST_SEEN_MSECS, + last_seen_msecs) || + nla_put_u32(msg, BATADV_ATTR_THROUGHPUT, throughput)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + return 0; + + nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} /** + * batadv_v_neigh_dump_hardif - Dump the neighbours of a hard interface into + * a message + * @msg: Netlink message to dump into + * @portid: Port making netlink request + * @seq: Sequence number of netlink message + * @bat_priv: The bat priv with all the soft interface information + * @hard_iface: The hard interface to be dumped + * @idx_s: Entries to be skipped + * + * This function assumes the caller holds rcu_read_lock(). 
+ * + * Return: Error code, or 0 on success + */ +static int +batadv_v_neigh_dump_hardif(struct sk_buff *msg, u32 portid, u32 seq, + struct batadv_priv *bat_priv, + struct batadv_hard_iface *hard_iface, + int *idx_s) +{ + struct batadv_hardif_neigh_node *hardif_neigh; + int idx = 0; + + hlist_for_each_entry_rcu(hardif_neigh, + &hard_iface->neigh_list, list) { + if (idx++ < *idx_s) + continue; + + if (batadv_v_neigh_dump_neigh(msg, portid, seq, hardif_neigh)) { + *idx_s = idx - 1; + return -EMSGSIZE; + } + } + + *idx_s = 0; + return 0; +} + +/** + * batadv_v_neigh_dump - Dump the neighbours of a hard interface into a + * message + * @msg: Netlink message to dump into + * @cb: Control block containing additional options + * @bat_priv: The bat priv with all the soft interface information + * @single_hardif: Limit dumping to this hard interface + */ +static void +batadv_v_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb, + struct batadv_priv *bat_priv, + struct batadv_hard_iface *single_hardif) +{ + struct batadv_hard_iface *hard_iface; + int i_hardif = 0; + int i_hardif_s = cb->args[0]; + int idx = cb->args[1]; + int portid = NETLINK_CB(cb->skb).portid; + + rcu_read_lock(); + if (single_hardif) { + if (i_hardif_s == 0) { + if (batadv_v_neigh_dump_hardif(msg, portid, + cb->nlh->nlmsg_seq, + bat_priv, single_hardif, + &idx) == 0) + i_hardif++; + } + } else { + list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { + if (hard_iface->soft_iface != bat_priv->soft_iface) + continue; + + if (i_hardif++ < i_hardif_s) + continue; + + if (batadv_v_neigh_dump_hardif(msg, portid, + cb->nlh->nlmsg_seq, + bat_priv, hard_iface, + &idx)) { + i_hardif--; + break; + } + } + } + rcu_read_unlock(); + + cb->args[0] = i_hardif; + cb->args[1] = idx; +} + +#ifdef CONFIG_BATMAN_ADV_DEBUGFS +/** * batadv_v_orig_print - print the originator table * @bat_priv: the bat priv with all the soft interface information * @seq: debugfs table seq_file struct @@ -265,6 +414,205 @@ next: if (batman_count == 0) seq_puts(seq, "No batman nodes in range ...\n"); } +#endif + +/** + * batadv_v_orig_dump_subentry - Dump an originator subentry into a + * message + * @msg: Netlink message to dump into + * @portid: Port making netlink request + * @seq: Sequence number of netlink message + * @bat_priv: The bat priv with all the soft interface information + * @if_outgoing: Limit dump to entries with this outgoing interface + * @orig_node: Originator to dump + * @neigh_node: Single hops neighbour + * @best: Is the best originator + * + * Return: Error code, or 0 on success + */ +static int +batadv_v_orig_dump_subentry(struct sk_buff *msg, u32 portid, u32 seq, + struct batadv_priv *bat_priv, + struct batadv_hard_iface *if_outgoing, + struct batadv_orig_node *orig_node, + struct batadv_neigh_node *neigh_node, + bool best) +{ + struct batadv_neigh_ifinfo *n_ifinfo; + unsigned int last_seen_msecs; + u32 throughput; + void *hdr; + + n_ifinfo = batadv_neigh_ifinfo_get(neigh_node, if_outgoing); + if (!n_ifinfo) + return 0; + + throughput = n_ifinfo->bat_v.throughput * 100; + + batadv_neigh_ifinfo_put(n_ifinfo); + + last_seen_msecs = jiffies_to_msecs(jiffies - orig_node->last_seen); + + if (if_outgoing != BATADV_IF_DEFAULT && + if_outgoing != neigh_node->if_incoming) + return 0; + + hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, NLM_F_MULTI, + BATADV_CMD_GET_ORIGINATORS); + if (!hdr) + return -ENOBUFS; + + if (nla_put(msg, BATADV_ATTR_ORIG_ADDRESS, ETH_ALEN, orig_node->orig) || + nla_put(msg, 
BATADV_ATTR_NEIGH_ADDRESS, ETH_ALEN, + neigh_node->addr) || + nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX, + neigh_node->if_incoming->net_dev->ifindex) || + nla_put_u32(msg, BATADV_ATTR_THROUGHPUT, throughput) || + nla_put_u32(msg, BATADV_ATTR_LAST_SEEN_MSECS, + last_seen_msecs)) + goto nla_put_failure; + + if (best && nla_put_flag(msg, BATADV_ATTR_FLAG_BEST)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + return 0; + + nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +/** + * batadv_v_orig_dump_entry - Dump an originator entry into a message + * @msg: Netlink message to dump into + * @portid: Port making netlink request + * @seq: Sequence number of netlink message + * @bat_priv: The bat priv with all the soft interface information + * @if_outgoing: Limit dump to entries with this outgoing interface + * @orig_node: Originator to dump + * @sub_s: Number of sub entries to skip + * + * This function assumes the caller holds rcu_read_lock(). + * + * Return: Error code, or 0 on success + */ +static int +batadv_v_orig_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, + struct batadv_priv *bat_priv, + struct batadv_hard_iface *if_outgoing, + struct batadv_orig_node *orig_node, int *sub_s) +{ + struct batadv_neigh_node *neigh_node_best; + struct batadv_neigh_node *neigh_node; + int sub = 0; + bool best; + + neigh_node_best = batadv_orig_router_get(orig_node, if_outgoing); + if (!neigh_node_best) + goto out; + + hlist_for_each_entry_rcu(neigh_node, &orig_node->neigh_list, list) { + if (sub++ < *sub_s) + continue; + + best = (neigh_node == neigh_node_best); + + if (batadv_v_orig_dump_subentry(msg, portid, seq, bat_priv, + if_outgoing, orig_node, + neigh_node, best)) { + batadv_neigh_node_put(neigh_node_best); + + *sub_s = sub - 1; + return -EMSGSIZE; + } + } + + out: + if (neigh_node_best) + batadv_neigh_node_put(neigh_node_best); + + *sub_s = 0; + return 0; +} + +/** + * batadv_v_orig_dump_bucket - Dump an originator bucket into a + * message + * @msg: Netlink message to dump into + * @portid: Port making netlink request + * @seq: Sequence number of netlink message + * @bat_priv: The bat priv with all the soft interface information + * @if_outgoing: Limit dump to entries with this outgoing interface + * @head: Bucket to be dumped + * @idx_s: Number of entries to be skipped + * @sub: Number of sub entries to be skipped + * + * Return: Error code, or 0 on success + */ +static int +batadv_v_orig_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq, + struct batadv_priv *bat_priv, + struct batadv_hard_iface *if_outgoing, + struct hlist_head *head, int *idx_s, int *sub) +{ + struct batadv_orig_node *orig_node; + int idx = 0; + + rcu_read_lock(); + hlist_for_each_entry_rcu(orig_node, head, hash_entry) { + if (idx++ < *idx_s) + continue; + + if (batadv_v_orig_dump_entry(msg, portid, seq, bat_priv, + if_outgoing, orig_node, sub)) { + rcu_read_unlock(); + *idx_s = idx - 1; + return -EMSGSIZE; + } + } + rcu_read_unlock(); + + *idx_s = 0; + *sub = 0; + return 0; +} + +/** + * batadv_v_orig_dump - Dump the originators into a message + * @msg: Netlink message to dump into + * @cb: Control block containing additional options + * @bat_priv: The bat priv with all the soft interface information + * @if_outgoing: Limit dump to entries with this outgoing interface + */ +static void +batadv_v_orig_dump(struct sk_buff *msg, struct netlink_callback *cb, + struct batadv_priv *bat_priv, + struct batadv_hard_iface *if_outgoing) +{ + struct batadv_hashtable *hash = bat_priv->orig_hash; + struct 
hlist_head *head; + int bucket = cb->args[0]; + int idx = cb->args[1]; + int sub = cb->args[2]; + int portid = NETLINK_CB(cb->skb).portid; + + while (bucket < hash->size) { + head = &hash->table[bucket]; + + if (batadv_v_orig_dump_bucket(msg, portid, + cb->nlh->nlmsg_seq, + bat_priv, if_outgoing, head, &idx, + &sub)) + break; + + bucket++; + } + + cb->args[0] = bucket; + cb->args[1] = idx; + cb->args[2] = sub; +} static int batadv_v_neigh_cmp(struct batadv_neigh_node *neigh1, struct batadv_hard_iface *if_outgoing1, @@ -320,6 +668,365 @@ err_ifinfo1: return ret; } +static ssize_t batadv_v_store_sel_class(struct batadv_priv *bat_priv, + char *buff, size_t count) +{ + u32 old_class, class; + + if (!batadv_parse_throughput(bat_priv->soft_iface, buff, + "B.A.T.M.A.N. V GW selection class", + &class)) + return -EINVAL; + + old_class = atomic_read(&bat_priv->gw.sel_class); + atomic_set(&bat_priv->gw.sel_class, class); + + if (old_class != class) + batadv_gw_reselect(bat_priv); + + return count; +} + +static ssize_t batadv_v_show_sel_class(struct batadv_priv *bat_priv, char *buff) +{ + u32 class = atomic_read(&bat_priv->gw.sel_class); + + return sprintf(buff, "%u.%u MBit\n", class / 10, class % 10); +} + +/** + * batadv_v_gw_throughput_get - retrieve the GW-bandwidth for a given GW + * @gw_node: the GW to retrieve the metric for + * @bw: the pointer where the metric will be stored. The metric is computed as + * the minimum between the GW advertised throughput and the path throughput to + * it in the mesh + * + * Return: 0 on success, -1 on failure + */ +static int batadv_v_gw_throughput_get(struct batadv_gw_node *gw_node, u32 *bw) +{ + struct batadv_neigh_ifinfo *router_ifinfo = NULL; + struct batadv_orig_node *orig_node; + struct batadv_neigh_node *router; + int ret = -1; + + orig_node = gw_node->orig_node; + router = batadv_orig_router_get(orig_node, BATADV_IF_DEFAULT); + if (!router) + goto out; + + router_ifinfo = batadv_neigh_ifinfo_get(router, BATADV_IF_DEFAULT); + if (!router_ifinfo) + goto out; + + /* the GW metric is computed as the minimum between the path throughput + * to reach the GW itself and the advertised bandwidth. 
+ * This gives us an approximation of the effective throughput that the + * client can expect via this particular GW node + */ + *bw = router_ifinfo->bat_v.throughput; + *bw = min_t(u32, *bw, gw_node->bandwidth_down); + + ret = 0; +out: + if (router) + batadv_neigh_node_put(router); + if (router_ifinfo) + batadv_neigh_ifinfo_put(router_ifinfo); + + return ret; +} + +/** + * batadv_v_gw_get_best_gw_node - retrieve the best GW node + * @bat_priv: the bat priv with all the soft interface information + * + * Return: the GW node having the best GW-metric, NULL if no GW is known + */ +static struct batadv_gw_node * +batadv_v_gw_get_best_gw_node(struct batadv_priv *bat_priv) +{ + struct batadv_gw_node *gw_node, *curr_gw = NULL; + u32 max_bw = 0, bw; + + rcu_read_lock(); + hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.list, list) { + if (!kref_get_unless_zero(&gw_node->refcount)) + continue; + + if (batadv_v_gw_throughput_get(gw_node, &bw) < 0) + goto next; + + if (curr_gw && (bw <= max_bw)) + goto next; + + if (curr_gw) + batadv_gw_node_put(curr_gw); + + curr_gw = gw_node; + kref_get(&curr_gw->refcount); + max_bw = bw; + +next: + batadv_gw_node_put(gw_node); + } + rcu_read_unlock(); + + return curr_gw; +} + +/** + * batadv_v_gw_is_eligible - check if an originator would be selected as GW + * @bat_priv: the bat priv with all the soft interface information + * @curr_gw_orig: originator representing the currently selected GW + * @orig_node: the originator representing the new candidate + * + * Return: true if orig_node can be selected as current GW, false otherwise + */ +static bool batadv_v_gw_is_eligible(struct batadv_priv *bat_priv, + struct batadv_orig_node *curr_gw_orig, + struct batadv_orig_node *orig_node) +{ + struct batadv_gw_node *curr_gw = NULL, *orig_gw = NULL; + u32 gw_throughput, orig_throughput, threshold; + bool ret = false; + + threshold = atomic_read(&bat_priv->gw.sel_class); + + curr_gw = batadv_gw_node_get(bat_priv, curr_gw_orig); + if (!curr_gw) { + ret = true; + goto out; + } + + if (batadv_v_gw_throughput_get(curr_gw, &gw_throughput) < 0) { + ret = true; + goto out; + } + + orig_gw = batadv_gw_node_get(bat_priv, orig_node); + if (!orig_gw) + goto out; + + if (batadv_v_gw_throughput_get(orig_gw, &orig_throughput) < 0) + goto out; + + if (orig_throughput < gw_throughput) + goto out; + + if ((orig_throughput - gw_throughput) < threshold) + goto out; + + batadv_dbg(BATADV_DBG_BATMAN, bat_priv, + "Restarting gateway selection: better gateway found (throughput curr: %u, throughput new: %u)\n", + gw_throughput, orig_throughput); + + ret = true; +out: + if (curr_gw) + batadv_gw_node_put(curr_gw); + if (orig_gw) + batadv_gw_node_put(orig_gw); + + return ret; +} + +#ifdef CONFIG_BATMAN_ADV_DEBUGFS +/* fails if orig_node has no router */ +static int batadv_v_gw_write_buffer_text(struct batadv_priv *bat_priv, + struct seq_file *seq, + const struct batadv_gw_node *gw_node) +{ + struct batadv_gw_node *curr_gw; + struct batadv_neigh_node *router; + struct batadv_neigh_ifinfo *router_ifinfo = NULL; + int ret = -1; + + router = batadv_orig_router_get(gw_node->orig_node, BATADV_IF_DEFAULT); + if (!router) + goto out; + + router_ifinfo = batadv_neigh_ifinfo_get(router, BATADV_IF_DEFAULT); + if (!router_ifinfo) + goto out; + + curr_gw = batadv_gw_get_selected_gw_node(bat_priv); + + seq_printf(seq, "%s %pM (%9u.%1u) %pM [%10s]: %u.%u/%u.%u MBit\n", + (curr_gw == gw_node ?
"=>" : " "), + gw_node->orig_node->orig, + router_ifinfo->bat_v.throughput / 10, + router_ifinfo->bat_v.throughput % 10, router->addr, + router->if_incoming->net_dev->name, + gw_node->bandwidth_down / 10, + gw_node->bandwidth_down % 10, + gw_node->bandwidth_up / 10, + gw_node->bandwidth_up % 10); + ret = seq_has_overflowed(seq) ? -1 : 0; + + if (curr_gw) + batadv_gw_node_put(curr_gw); +out: + if (router_ifinfo) + batadv_neigh_ifinfo_put(router_ifinfo); + if (router) + batadv_neigh_node_put(router); + return ret; +} + +/** + * batadv_v_gw_print - print the gateway list + * @bat_priv: the bat priv with all the soft interface information + * @seq: gateway table seq_file struct + */ +static void batadv_v_gw_print(struct batadv_priv *bat_priv, + struct seq_file *seq) +{ + struct batadv_gw_node *gw_node; + int gw_count = 0; + + seq_puts(seq, + " Gateway ( throughput) Nexthop [outgoingIF]: advertised uplink bandwidth\n"); + + rcu_read_lock(); + hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.list, list) { + /* fails if orig_node has no router */ + if (batadv_v_gw_write_buffer_text(bat_priv, seq, gw_node) < 0) + continue; + + gw_count++; + } + rcu_read_unlock(); + + if (gw_count == 0) + seq_puts(seq, "No gateways in range ...\n"); +} +#endif + +/** + * batadv_v_gw_dump_entry - Dump a gateway into a message + * @msg: Netlink message to dump into + * @portid: Port making netlink request + * @seq: Sequence number of netlink message + * @bat_priv: The bat priv with all the soft interface information + * @gw_node: Gateway to be dumped + * + * Return: Error code, or 0 on success + */ +static int batadv_v_gw_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, + struct batadv_priv *bat_priv, + struct batadv_gw_node *gw_node) +{ + struct batadv_neigh_ifinfo *router_ifinfo = NULL; + struct batadv_neigh_node *router; + struct batadv_gw_node *curr_gw; + int ret = -EINVAL; + void *hdr; + + router = batadv_orig_router_get(gw_node->orig_node, BATADV_IF_DEFAULT); + if (!router) + goto out; + + router_ifinfo = batadv_neigh_ifinfo_get(router, BATADV_IF_DEFAULT); + if (!router_ifinfo) + goto out; + + curr_gw = batadv_gw_get_selected_gw_node(bat_priv); + + hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, + NLM_F_MULTI, BATADV_CMD_GET_GATEWAYS); + if (!hdr) { + ret = -ENOBUFS; + goto out; + } + + ret = -EMSGSIZE; + + if (curr_gw == gw_node) { + if (nla_put_flag(msg, BATADV_ATTR_FLAG_BEST)) { + genlmsg_cancel(msg, hdr); + goto out; + } + } + + if (nla_put(msg, BATADV_ATTR_ORIG_ADDRESS, ETH_ALEN, + gw_node->orig_node->orig)) { + genlmsg_cancel(msg, hdr); + goto out; + } + + if (nla_put_u32(msg, BATADV_ATTR_THROUGHPUT, + router_ifinfo->bat_v.throughput)) { + genlmsg_cancel(msg, hdr); + goto out; + } + + if (nla_put(msg, BATADV_ATTR_ROUTER, ETH_ALEN, router->addr)) { + genlmsg_cancel(msg, hdr); + goto out; + } + + if (nla_put_string(msg, BATADV_ATTR_HARD_IFNAME, + router->if_incoming->net_dev->name)) { + genlmsg_cancel(msg, hdr); + goto out; + } + + if (nla_put_u32(msg, BATADV_ATTR_BANDWIDTH_DOWN, + gw_node->bandwidth_down)) { + genlmsg_cancel(msg, hdr); + goto out; + } + + if (nla_put_u32(msg, BATADV_ATTR_BANDWIDTH_UP, gw_node->bandwidth_up)) { + genlmsg_cancel(msg, hdr); + goto out; + } + + genlmsg_end(msg, hdr); + ret = 0; + +out: + if (router_ifinfo) + batadv_neigh_ifinfo_put(router_ifinfo); + if (router) + batadv_neigh_node_put(router); + return ret; +} + +/** + * batadv_v_gw_dump - Dump gateways into a message + * @msg: Netlink message to dump into + * @cb: Control block containing additional options + * 
@bat_priv: The bat priv with all the soft interface information + */ +static void batadv_v_gw_dump(struct sk_buff *msg, struct netlink_callback *cb, + struct batadv_priv *bat_priv) +{ + int portid = NETLINK_CB(cb->skb).portid; + struct batadv_gw_node *gw_node; + int idx_skip = cb->args[0]; + int idx = 0; + + rcu_read_lock(); + hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.list, list) { + if (idx++ < idx_skip) + continue; + + if (batadv_v_gw_dump_entry(msg, portid, cb->nlh->nlmsg_seq, + bat_priv, gw_node)) { + idx_skip = idx - 1; + goto unlock; + } + } + + idx_skip = idx; +unlock: + rcu_read_unlock(); + + cb->args[0] = idx_skip; +} + static struct batadv_algo_ops batadv_batman_v __read_mostly = { .name = "BATMAN_V", .iface = { @@ -333,10 +1040,26 @@ static struct batadv_algo_ops batadv_batman_v __read_mostly = { .hardif_init = batadv_v_hardif_neigh_init, .cmp = batadv_v_neigh_cmp, .is_similar_or_better = batadv_v_neigh_is_sob, +#ifdef CONFIG_BATMAN_ADV_DEBUGFS .print = batadv_v_neigh_print, +#endif + .dump = batadv_v_neigh_dump, }, .orig = { +#ifdef CONFIG_BATMAN_ADV_DEBUGFS .print = batadv_v_orig_print, +#endif + .dump = batadv_v_orig_dump, + }, + .gw = { + .store_sel_class = batadv_v_store_sel_class, + .show_sel_class = batadv_v_show_sel_class, + .get_best_gw_node = batadv_v_gw_get_best_gw_node, + .is_eligible = batadv_v_gw_is_eligible, +#ifdef CONFIG_BATMAN_ADV_DEBUGFS + .print = batadv_v_gw_print, +#endif + .dump = batadv_v_gw_dump, }, }; @@ -363,7 +1086,16 @@ void batadv_v_hardif_init(struct batadv_hard_iface *hard_iface) */ int batadv_v_mesh_init(struct batadv_priv *bat_priv) { - return batadv_v_ogm_init(bat_priv); + int ret = 0; + + ret = batadv_v_ogm_init(bat_priv); + if (ret < 0) + return ret; + + /* set default throughput difference threshold to 5Mbps */ + atomic_set(&bat_priv->gw.sel_class, 50); + + return 0; } /** diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c index 6fbba4eb0617..1aeeadca620c 100644 --- a/net/batman-adv/bat_v_ogm.c +++ b/net/batman-adv/bat_v_ogm.c @@ -73,13 +73,12 @@ struct batadv_orig_node *batadv_v_ogm_orig_get(struct batadv_priv *bat_priv, if (!orig_node) return NULL; + kref_get(&orig_node->refcount); hash_added = batadv_hash_add(bat_priv->orig_hash, batadv_compare_orig, batadv_choose_orig, orig_node, &orig_node->hash_entry); if (hash_added != 0) { - /* orig_node->refcounter is initialised to 2 by - * batadv_orig_node_new() - */ + /* remove refcnt for newly created orig_node and hash entry */ batadv_orig_node_put(orig_node); batadv_orig_node_put(orig_node); orig_node = NULL; diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index ad2ffe16d29f..e7f690b571ea 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -35,6 +35,7 @@ #include <linux/list.h> #include <linux/lockdep.h> #include <linux/netdevice.h> +#include <linux/netlink.h> #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/seq_file.h> @@ -45,12 +46,18 @@ #include <linux/string.h> #include <linux/workqueue.h> #include <net/arp.h> +#include <net/genetlink.h> +#include <net/netlink.h> +#include <net/sock.h> +#include <uapi/linux/batman_adv.h> #include "hard-interface.h" #include "hash.h" #include "log.h" +#include "netlink.h" #include "originator.h" #include "packet.h" +#include "soft-interface.h" #include "sysfs.h" #include "translation-table.h" @@ -519,11 +526,9 @@ batadv_bla_get_backbone_gw(struct batadv_priv *bat_priv, u8 *orig, atomic_set(&entry->wait_periods, 0); 
ether_addr_copy(entry->orig, orig); INIT_WORK(&entry->report_work, batadv_bla_loopdetect_report); - - /* one for the hash, one for returning */ kref_init(&entry->refcount); - kref_get(&entry->refcount); + kref_get(&entry->refcount); hash_added = batadv_hash_add(bat_priv->bla.backbone_hash, batadv_compare_backbone_gw, batadv_choose_backbone_gw, entry, @@ -711,12 +716,13 @@ static void batadv_bla_add_claim(struct batadv_priv *bat_priv, claim->lasttime = jiffies; kref_get(&backbone_gw->refcount); claim->backbone_gw = backbone_gw; - kref_init(&claim->refcount); - kref_get(&claim->refcount); + batadv_dbg(BATADV_DBG_BLA, bat_priv, "bla_add_claim(): adding new entry %pM, vid %d to hash ...\n", mac, BATADV_PRINT_VID(vid)); + + kref_get(&claim->refcount); hash_added = batadv_hash_add(bat_priv->bla.claim_hash, batadv_compare_claim, batadv_choose_claim, claim, @@ -1148,7 +1154,7 @@ static bool batadv_bla_process_claim(struct batadv_priv *bat_priv, /* Let the loopdetect frames on the mesh in any case. */ if (bla_dst->type == BATADV_CLAIM_TYPE_LOOPDETECT) - return 0; + return false; /* check if it is a claim frame. */ ret = batadv_check_claim_group(bat_priv, primary_if, hw_src, hw_dst, @@ -1990,6 +1996,7 @@ out: return ret; } +#ifdef CONFIG_BATMAN_ADV_DEBUGFS /** * batadv_bla_claim_table_seq_print_text - print the claim table in a seq file * @seq: seq file to print on @@ -2050,8 +2057,172 @@ out: batadv_hardif_put(primary_if); return 0; } +#endif + +/** + * batadv_bla_claim_dump_entry - dump one entry of the claim table + * to a netlink socket + * @msg: buffer for the message + * @portid: netlink port + * @seq: Sequence number of netlink message + * @primary_if: primary interface + * @claim: entry to dump + * + * Return: 0 or error code. + */ +static int +batadv_bla_claim_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, + struct batadv_hard_iface *primary_if, + struct batadv_bla_claim *claim) +{ + u8 *primary_addr = primary_if->net_dev->dev_addr; + u16 backbone_crc; + bool is_own; + void *hdr; + int ret = -EINVAL; + + hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, + NLM_F_MULTI, BATADV_CMD_GET_BLA_CLAIM); + if (!hdr) { + ret = -ENOBUFS; + goto out; + } + + is_own = batadv_compare_eth(claim->backbone_gw->orig, + primary_addr); + + spin_lock_bh(&claim->backbone_gw->crc_lock); + backbone_crc = claim->backbone_gw->crc; + spin_unlock_bh(&claim->backbone_gw->crc_lock); + + if (is_own) + if (nla_put_flag(msg, BATADV_ATTR_BLA_OWN)) { + genlmsg_cancel(msg, hdr); + goto out; + } + + if (nla_put(msg, BATADV_ATTR_BLA_ADDRESS, ETH_ALEN, claim->addr) || + nla_put_u16(msg, BATADV_ATTR_BLA_VID, claim->vid) || + nla_put(msg, BATADV_ATTR_BLA_BACKBONE, ETH_ALEN, + claim->backbone_gw->orig) || + nla_put_u16(msg, BATADV_ATTR_BLA_CRC, + backbone_crc)) { + genlmsg_cancel(msg, hdr); + goto out; + } + + genlmsg_end(msg, hdr); + ret = 0; + +out: + return ret; +} + +/** + * batadv_bla_claim_dump_bucket - dump one bucket of the claim table + * to a netlink socket + * @msg: buffer for the message + * @portid: netlink port + * @seq: Sequence number of netlink message + * @primary_if: primary interface + * @head: bucket to dump + * @idx_skip: How many entries to skip + * + * Return: always 0. 
+ */ +static int +batadv_bla_claim_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq, + struct batadv_hard_iface *primary_if, + struct hlist_head *head, int *idx_skip) +{ + struct batadv_bla_claim *claim; + int idx = 0; + + rcu_read_lock(); + hlist_for_each_entry_rcu(claim, head, hash_entry) { + if (idx++ < *idx_skip) + continue; + if (batadv_bla_claim_dump_entry(msg, portid, seq, + primary_if, claim)) { + *idx_skip = idx - 1; + goto unlock; + } + } + + *idx_skip = idx; +unlock: + rcu_read_unlock(); + return 0; +} /** + * batadv_bla_claim_dump - dump claim table to a netlink socket + * @msg: buffer for the message + * @cb: callback structure containing arguments + * + * Return: message length. + */ +int batadv_bla_claim_dump(struct sk_buff *msg, struct netlink_callback *cb) +{ + struct batadv_hard_iface *primary_if = NULL; + int portid = NETLINK_CB(cb->skb).portid; + struct net *net = sock_net(cb->skb->sk); + struct net_device *soft_iface; + struct batadv_hashtable *hash; + struct batadv_priv *bat_priv; + int bucket = cb->args[0]; + struct hlist_head *head; + int idx = cb->args[1]; + int ifindex; + int ret = 0; + + ifindex = batadv_netlink_get_ifindex(cb->nlh, + BATADV_ATTR_MESH_IFINDEX); + if (!ifindex) + return -EINVAL; + + soft_iface = dev_get_by_index(net, ifindex); + if (!soft_iface || !batadv_softif_is_valid(soft_iface)) { + ret = -ENODEV; + goto out; + } + + bat_priv = netdev_priv(soft_iface); + hash = bat_priv->bla.claim_hash; + + primary_if = batadv_primary_if_get_selected(bat_priv); + if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) { + ret = -ENOENT; + goto out; + } + + while (bucket < hash->size) { + head = &hash->table[bucket]; + + if (batadv_bla_claim_dump_bucket(msg, portid, + cb->nlh->nlmsg_seq, + primary_if, head, &idx)) + break; + bucket++; + } + + cb->args[0] = bucket; + cb->args[1] = idx; + + ret = msg->len; + +out: + if (primary_if) + batadv_hardif_put(primary_if); + + if (soft_iface) + dev_put(soft_iface); + + return ret; +} + +#ifdef CONFIG_BATMAN_ADV_DEBUGFS +/** * batadv_bla_backbone_table_seq_print_text - print the backbone table in a seq * file * @seq: seq file to print on @@ -2114,3 +2285,168 @@ out: batadv_hardif_put(primary_if); return 0; } +#endif + +/** + * batadv_bla_backbone_dump_entry - dump one entry of the backbone table + * to a netlink socket + * @msg: buffer for the message + * @portid: netlink port + * @seq: Sequence number of netlink message + * @primary_if: primary interface + * @backbone_gw: entry to dump + * + * Return: 0 or error code. 
+ */ +static int +batadv_bla_backbone_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, + struct batadv_hard_iface *primary_if, + struct batadv_bla_backbone_gw *backbone_gw) +{ + u8 *primary_addr = primary_if->net_dev->dev_addr; + u16 backbone_crc; + bool is_own; + int msecs; + void *hdr; + int ret = -EINVAL; + + hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, + NLM_F_MULTI, BATADV_CMD_GET_BLA_BACKBONE); + if (!hdr) { + ret = -ENOBUFS; + goto out; + } + + is_own = batadv_compare_eth(backbone_gw->orig, primary_addr); + + spin_lock_bh(&backbone_gw->crc_lock); + backbone_crc = backbone_gw->crc; + spin_unlock_bh(&backbone_gw->crc_lock); + + msecs = jiffies_to_msecs(jiffies - backbone_gw->lasttime); + + if (is_own) + if (nla_put_flag(msg, BATADV_ATTR_BLA_OWN)) { + genlmsg_cancel(msg, hdr); + goto out; + } + + if (nla_put(msg, BATADV_ATTR_BLA_BACKBONE, ETH_ALEN, + backbone_gw->orig) || + nla_put_u16(msg, BATADV_ATTR_BLA_VID, backbone_gw->vid) || + nla_put_u16(msg, BATADV_ATTR_BLA_CRC, + backbone_crc) || + nla_put_u32(msg, BATADV_ATTR_LAST_SEEN_MSECS, msecs)) { + genlmsg_cancel(msg, hdr); + goto out; + } + + genlmsg_end(msg, hdr); + ret = 0; + +out: + return ret; +} + +/** + * batadv_bla_backbone_dump_bucket - dump one bucket of the backbone table + * to a netlink socket + * @msg: buffer for the message + * @portid: netlink port + * @seq: Sequence number of netlink message + * @primary_if: primary interface + * @head: bucket to dump + * @idx_skip: How many entries to skip + * + * Return: always 0. + */ +static int +batadv_bla_backbone_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq, + struct batadv_hard_iface *primary_if, + struct hlist_head *head, int *idx_skip) +{ + struct batadv_bla_backbone_gw *backbone_gw; + int idx = 0; + + rcu_read_lock(); + hlist_for_each_entry_rcu(backbone_gw, head, hash_entry) { + if (idx++ < *idx_skip) + continue; + if (batadv_bla_backbone_dump_entry(msg, portid, seq, + primary_if, backbone_gw)) { + *idx_skip = idx - 1; + goto unlock; + } + } + + *idx_skip = idx; +unlock: + rcu_read_unlock(); + return 0; +} + +/** + * batadv_bla_backbone_dump - dump backbone table to a netlink socket + * @msg: buffer for the message + * @cb: callback structure containing arguments + * + * Return: message length. 
+ */ +int batadv_bla_backbone_dump(struct sk_buff *msg, struct netlink_callback *cb) +{ + struct batadv_hard_iface *primary_if = NULL; + int portid = NETLINK_CB(cb->skb).portid; + struct net *net = sock_net(cb->skb->sk); + struct net_device *soft_iface; + struct batadv_hashtable *hash; + struct batadv_priv *bat_priv; + int bucket = cb->args[0]; + struct hlist_head *head; + int idx = cb->args[1]; + int ifindex; + int ret = 0; + + ifindex = batadv_netlink_get_ifindex(cb->nlh, + BATADV_ATTR_MESH_IFINDEX); + if (!ifindex) + return -EINVAL; + + soft_iface = dev_get_by_index(net, ifindex); + if (!soft_iface || !batadv_softif_is_valid(soft_iface)) { + ret = -ENODEV; + goto out; + } + + bat_priv = netdev_priv(soft_iface); + hash = bat_priv->bla.backbone_hash; + + primary_if = batadv_primary_if_get_selected(bat_priv); + if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) { + ret = -ENOENT; + goto out; + } + + while (bucket < hash->size) { + head = &hash->table[bucket]; + + if (batadv_bla_backbone_dump_bucket(msg, portid, + cb->nlh->nlmsg_seq, + primary_if, head, &idx)) + break; + bucket++; + } + + cb->args[0] = bucket; + cb->args[1] = idx; + + ret = msg->len; + +out: + if (primary_if) + batadv_hardif_put(primary_if); + + if (soft_iface) + dev_put(soft_iface); + + return ret; +} diff --git a/net/batman-adv/bridge_loop_avoidance.h b/net/batman-adv/bridge_loop_avoidance.h index 0f01daeb359e..1ae93e46fb98 100644 --- a/net/batman-adv/bridge_loop_avoidance.h +++ b/net/batman-adv/bridge_loop_avoidance.h @@ -23,6 +23,7 @@ #include <linux/types.h> struct net_device; +struct netlink_callback; struct seq_file; struct sk_buff; @@ -35,8 +36,10 @@ bool batadv_bla_is_backbone_gw(struct sk_buff *skb, struct batadv_orig_node *orig_node, int hdr_size); int batadv_bla_claim_table_seq_print_text(struct seq_file *seq, void *offset); +int batadv_bla_claim_dump(struct sk_buff *msg, struct netlink_callback *cb); int batadv_bla_backbone_table_seq_print_text(struct seq_file *seq, void *offset); +int batadv_bla_backbone_dump(struct sk_buff *msg, struct netlink_callback *cb); bool batadv_bla_is_backbone_gw_orig(struct batadv_priv *bat_priv, u8 *orig, unsigned short vid); bool batadv_bla_check_bcast_duplist(struct batadv_priv *bat_priv, @@ -47,7 +50,7 @@ void batadv_bla_update_orig_address(struct batadv_priv *bat_priv, void batadv_bla_status_update(struct net_device *net_dev); int batadv_bla_init(struct batadv_priv *bat_priv); void batadv_bla_free(struct batadv_priv *bat_priv); - +int batadv_bla_claim_dump(struct sk_buff *msg, struct netlink_callback *cb); #define BATADV_BLA_CRC_INIT 0 #else /* ifdef CONFIG_BATMAN_ADV_BLA */ @@ -112,6 +115,18 @@ static inline void batadv_bla_free(struct batadv_priv *bat_priv) { } +static inline int batadv_bla_claim_dump(struct sk_buff *msg, + struct netlink_callback *cb) +{ + return -EOPNOTSUPP; +} + +static inline int batadv_bla_backbone_dump(struct sk_buff *msg, + struct netlink_callback *cb) +{ + return -EOPNOTSUPP; +} + #endif /* ifdef CONFIG_BATMAN_ADV_BLA */ #endif /* ifndef _NET_BATMAN_ADV_BLA_H_ */ diff --git a/net/batman-adv/debugfs.c b/net/batman-adv/debugfs.c index 1d68b6e63b96..b4ffba7dd583 100644 --- a/net/batman-adv/debugfs.c +++ b/net/batman-adv/debugfs.c @@ -31,6 +31,7 @@ #include <linux/stddef.h> #include <linux/stringify.h> #include <linux/sysfs.h> +#include <net/net_namespace.h> #include "bat_algo.h" #include "bridge_loop_avoidance.h" @@ -305,12 +306,16 @@ void batadv_debugfs_destroy(void) */ int batadv_debugfs_add_hardif(struct batadv_hard_iface *hard_iface) { + 
struct net *net = dev_net(hard_iface->net_dev); struct batadv_debuginfo **bat_debug; struct dentry *file; if (!batadv_debugfs) goto out; + if (net != &init_net) + return 0; + hard_iface->debug_dir = debugfs_create_dir(hard_iface->net_dev->name, batadv_debugfs); if (!hard_iface->debug_dir) @@ -341,6 +346,11 @@ out: */ void batadv_debugfs_del_hardif(struct batadv_hard_iface *hard_iface) { + struct net *net = dev_net(hard_iface->net_dev); + + if (net != &init_net) + return; + if (batadv_debugfs) { debugfs_remove_recursive(hard_iface->debug_dir); hard_iface->debug_dir = NULL; @@ -351,11 +361,15 @@ int batadv_debugfs_add_meshif(struct net_device *dev) { struct batadv_priv *bat_priv = netdev_priv(dev); struct batadv_debuginfo **bat_debug; + struct net *net = dev_net(dev); struct dentry *file; if (!batadv_debugfs) goto out; + if (net != &init_net) + return 0; + bat_priv->debug_dir = debugfs_create_dir(dev->name, batadv_debugfs); if (!bat_priv->debug_dir) goto out; @@ -392,6 +406,10 @@ out: void batadv_debugfs_del_meshif(struct net_device *dev) { struct batadv_priv *bat_priv = netdev_priv(dev); + struct net *net = dev_net(dev); + + if (net != &init_net) + return; batadv_debug_log_cleanup(bat_priv); diff --git a/net/batman-adv/debugfs.h b/net/batman-adv/debugfs.h index 1ab4e2e63afc..c68ff3dcb926 100644 --- a/net/batman-adv/debugfs.h +++ b/net/batman-adv/debugfs.h @@ -26,7 +26,7 @@ struct net_device; #define BATADV_DEBUGFS_SUBDIR "batman_adv" -#if IS_ENABLED(CONFIG_DEBUG_FS) +#if IS_ENABLED(CONFIG_BATMAN_ADV_DEBUGFS) void batadv_debugfs_init(void); void batadv_debugfs_destroy(void); diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c index b1cc8bfe11ac..e257efdc5d03 100644 --- a/net/batman-adv/distributed-arp-table.c +++ b/net/batman-adv/distributed-arp-table.c @@ -343,8 +343,8 @@ static void batadv_dat_entry_add(struct batadv_priv *bat_priv, __be32 ip, ether_addr_copy(dat_entry->mac_addr, mac_addr); dat_entry->last_update = jiffies; kref_init(&dat_entry->refcount); - kref_get(&dat_entry->refcount); + kref_get(&dat_entry->refcount); hash_added = batadv_hash_add(bat_priv->dat.hash, batadv_compare_dat, batadv_hash_dat, dat_entry, &dat_entry->hash_entry); @@ -795,6 +795,7 @@ void batadv_dat_free(struct batadv_priv *bat_priv) batadv_dat_hash_free(bat_priv); } +#ifdef CONFIG_BATMAN_ADV_DEBUGFS /** * batadv_dat_cache_seq_print_text - print the local DAT hash table * @seq: seq file to print on @@ -846,6 +847,7 @@ out: batadv_hardif_put(primary_if); return 0; } +#endif /** * batadv_arp_get_type - parse an ARP packet and gets the type diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c index 63a805d3f96e..de055d64debe 100644 --- a/net/batman-adv/gateway_client.c +++ b/net/batman-adv/gateway_client.c @@ -20,6 +20,7 @@ #include <linux/atomic.h> #include <linux/byteorder/generic.h> +#include <linux/errno.h> #include <linux/etherdevice.h> #include <linux/fs.h> #include <linux/if_ether.h> @@ -31,6 +32,7 @@ #include <linux/kref.h> #include <linux/list.h> #include <linux/netdevice.h> +#include <linux/netlink.h> #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/seq_file.h> @@ -39,13 +41,17 @@ #include <linux/spinlock.h> #include <linux/stddef.h> #include <linux/udp.h> +#include <net/sock.h> +#include <uapi/linux/batman_adv.h> #include "gateway_common.h" #include "hard-interface.h" #include "log.h" +#include "netlink.h" #include "originator.h" #include "packet.h" #include "routing.h" +#include "soft-interface.h" #include 
"sysfs.h" #include "translation-table.h" @@ -80,12 +86,12 @@ static void batadv_gw_node_release(struct kref *ref) * batadv_gw_node_put - decrement the gw_node refcounter and possibly release it * @gw_node: gateway node to free */ -static void batadv_gw_node_put(struct batadv_gw_node *gw_node) +void batadv_gw_node_put(struct batadv_gw_node *gw_node) { kref_put(&gw_node->refcount, batadv_gw_node_release); } -static struct batadv_gw_node * +struct batadv_gw_node * batadv_gw_get_selected_gw_node(struct batadv_priv *bat_priv) { struct batadv_gw_node *gw_node; @@ -164,86 +170,6 @@ void batadv_gw_reselect(struct batadv_priv *bat_priv) atomic_set(&bat_priv->gw.reselect, 1); } -static struct batadv_gw_node * -batadv_gw_get_best_gw_node(struct batadv_priv *bat_priv) -{ - struct batadv_neigh_node *router; - struct batadv_neigh_ifinfo *router_ifinfo; - struct batadv_gw_node *gw_node, *curr_gw = NULL; - u64 max_gw_factor = 0; - u64 tmp_gw_factor = 0; - u8 max_tq = 0; - u8 tq_avg; - struct batadv_orig_node *orig_node; - - rcu_read_lock(); - hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.list, list) { - orig_node = gw_node->orig_node; - router = batadv_orig_router_get(orig_node, BATADV_IF_DEFAULT); - if (!router) - continue; - - router_ifinfo = batadv_neigh_ifinfo_get(router, - BATADV_IF_DEFAULT); - if (!router_ifinfo) - goto next; - - if (!kref_get_unless_zero(&gw_node->refcount)) - goto next; - - tq_avg = router_ifinfo->bat_iv.tq_avg; - - switch (atomic_read(&bat_priv->gw.sel_class)) { - case 1: /* fast connection */ - tmp_gw_factor = tq_avg * tq_avg; - tmp_gw_factor *= gw_node->bandwidth_down; - tmp_gw_factor *= 100 * 100; - tmp_gw_factor >>= 18; - - if ((tmp_gw_factor > max_gw_factor) || - ((tmp_gw_factor == max_gw_factor) && - (tq_avg > max_tq))) { - if (curr_gw) - batadv_gw_node_put(curr_gw); - curr_gw = gw_node; - kref_get(&curr_gw->refcount); - } - break; - - default: /* 2: stable connection (use best statistic) - * 3: fast-switch (use best statistic but change as - * soon as a better gateway appears) - * XX: late-switch (use best statistic but change as - * soon as a better gateway appears which has - * $routing_class more tq points) - */ - if (tq_avg > max_tq) { - if (curr_gw) - batadv_gw_node_put(curr_gw); - curr_gw = gw_node; - kref_get(&curr_gw->refcount); - } - break; - } - - if (tq_avg > max_tq) - max_tq = tq_avg; - - if (tmp_gw_factor > max_gw_factor) - max_gw_factor = tmp_gw_factor; - - batadv_gw_node_put(gw_node); - -next: - batadv_neigh_node_put(router); - if (router_ifinfo) - batadv_neigh_ifinfo_put(router_ifinfo); - } - rcu_read_unlock(); - - return curr_gw; -} - /** * batadv_gw_check_client_stop - check if client mode has been switched off * @bat_priv: the bat priv with all the soft interface information @@ -287,12 +213,19 @@ void batadv_gw_election(struct batadv_priv *bat_priv) if (atomic_read(&bat_priv->gw.mode) != BATADV_GW_MODE_CLIENT) goto out; + if (!bat_priv->algo_ops->gw.get_best_gw_node) + goto out; + curr_gw = batadv_gw_get_selected_gw_node(bat_priv); if (!batadv_atomic_dec_not_zero(&bat_priv->gw.reselect) && curr_gw) goto out; - next_gw = batadv_gw_get_best_gw_node(bat_priv); + /* if gw.reselect is set to 1 it means that a previous call to + * gw.is_eligible() said that we have a new best GW, therefore it can + * now be picked from the list and selected + */ + next_gw = bat_priv->algo_ops->gw.get_best_gw_node(bat_priv); if (curr_gw == next_gw) goto out; @@ -360,70 +293,31 @@ out: void batadv_gw_check_election(struct batadv_priv *bat_priv, struct batadv_orig_node 
*orig_node) { - struct batadv_neigh_ifinfo *router_orig_tq = NULL; - struct batadv_neigh_ifinfo *router_gw_tq = NULL; struct batadv_orig_node *curr_gw_orig; - struct batadv_neigh_node *router_gw = NULL; - struct batadv_neigh_node *router_orig = NULL; - u8 gw_tq_avg, orig_tq_avg; + + /* abort immediately if the routing algorithm does not support gateway + * election + */ + if (!bat_priv->algo_ops->gw.is_eligible) + return; curr_gw_orig = batadv_gw_get_selected_orig(bat_priv); if (!curr_gw_orig) goto reselect; - router_gw = batadv_orig_router_get(curr_gw_orig, BATADV_IF_DEFAULT); - if (!router_gw) - goto reselect; - - router_gw_tq = batadv_neigh_ifinfo_get(router_gw, - BATADV_IF_DEFAULT); - if (!router_gw_tq) - goto reselect; - /* this node already is the gateway */ if (curr_gw_orig == orig_node) goto out; - router_orig = batadv_orig_router_get(orig_node, BATADV_IF_DEFAULT); - if (!router_orig) + if (!bat_priv->algo_ops->gw.is_eligible(bat_priv, curr_gw_orig, + orig_node)) goto out; - router_orig_tq = batadv_neigh_ifinfo_get(router_orig, - BATADV_IF_DEFAULT); - if (!router_orig_tq) - goto out; - - gw_tq_avg = router_gw_tq->bat_iv.tq_avg; - orig_tq_avg = router_orig_tq->bat_iv.tq_avg; - - /* the TQ value has to be better */ - if (orig_tq_avg < gw_tq_avg) - goto out; - - /* if the routing class is greater than 3 the value tells us how much - * greater the TQ value of the new gateway must be - */ - if ((atomic_read(&bat_priv->gw.sel_class) > 3) && - (orig_tq_avg - gw_tq_avg < atomic_read(&bat_priv->gw.sel_class))) - goto out; - - batadv_dbg(BATADV_DBG_BATMAN, bat_priv, - "Restarting gateway selection: better gateway found (tq curr: %i, tq new: %i)\n", - gw_tq_avg, orig_tq_avg); - reselect: batadv_gw_reselect(bat_priv); out: if (curr_gw_orig) batadv_orig_node_put(curr_gw_orig); - if (router_gw) - batadv_neigh_node_put(router_gw); - if (router_orig) - batadv_neigh_node_put(router_orig); - if (router_gw_tq) - batadv_neigh_ifinfo_put(router_gw_tq); - if (router_orig_tq) - batadv_neigh_ifinfo_put(router_orig_tq); } /** @@ -445,14 +339,15 @@ static void batadv_gw_node_add(struct batadv_priv *bat_priv, if (!gw_node) return; - kref_get(&orig_node->refcount); + kref_init(&gw_node->refcount); INIT_HLIST_NODE(&gw_node->list); + kref_get(&orig_node->refcount); gw_node->orig_node = orig_node; gw_node->bandwidth_down = ntohl(gateway->bandwidth_down); gw_node->bandwidth_up = ntohl(gateway->bandwidth_up); - kref_init(&gw_node->refcount); spin_lock_bh(&bat_priv->gw.list_lock); + kref_get(&gw_node->refcount); hlist_add_head_rcu(&gw_node->list, &bat_priv->gw.list); spin_unlock_bh(&bat_priv->gw.list_lock); @@ -463,6 +358,9 @@ static void batadv_gw_node_add(struct batadv_priv *bat_priv, ntohl(gateway->bandwidth_down) % 10, ntohl(gateway->bandwidth_up) / 10, ntohl(gateway->bandwidth_up) % 10); + + /* don't return reference to new gw_node */ + batadv_gw_node_put(gw_node); } /** @@ -472,9 +370,8 @@ static void batadv_gw_node_add(struct batadv_priv *bat_priv, * * Return: gateway node if found or NULL otherwise. 
*/ -static struct batadv_gw_node * -batadv_gw_node_get(struct batadv_priv *bat_priv, - struct batadv_orig_node *orig_node) +struct batadv_gw_node *batadv_gw_node_get(struct batadv_priv *bat_priv, + struct batadv_orig_node *orig_node) { struct batadv_gw_node *gw_node_tmp, *gw_node = NULL; @@ -585,81 +482,87 @@ void batadv_gw_node_free(struct batadv_priv *bat_priv) spin_unlock_bh(&bat_priv->gw.list_lock); } -/* fails if orig_node has no router */ -static int batadv_write_buffer_text(struct batadv_priv *bat_priv, - struct seq_file *seq, - const struct batadv_gw_node *gw_node) +#ifdef CONFIG_BATMAN_ADV_DEBUGFS +int batadv_gw_client_seq_print_text(struct seq_file *seq, void *offset) { - struct batadv_gw_node *curr_gw; - struct batadv_neigh_node *router; - struct batadv_neigh_ifinfo *router_ifinfo = NULL; - int ret = -1; + struct net_device *net_dev = (struct net_device *)seq->private; + struct batadv_priv *bat_priv = netdev_priv(net_dev); + struct batadv_hard_iface *primary_if; - router = batadv_orig_router_get(gw_node->orig_node, BATADV_IF_DEFAULT); - if (!router) - goto out; + primary_if = batadv_seq_print_text_primary_if_get(seq); + if (!primary_if) + return 0; - router_ifinfo = batadv_neigh_ifinfo_get(router, BATADV_IF_DEFAULT); - if (!router_ifinfo) - goto out; + seq_printf(seq, "[B.A.T.M.A.N. adv %s, MainIF/MAC: %s/%pM (%s %s)]\n", + BATADV_SOURCE_VERSION, primary_if->net_dev->name, + primary_if->net_dev->dev_addr, net_dev->name, + bat_priv->algo_ops->name); - curr_gw = batadv_gw_get_selected_gw_node(bat_priv); + batadv_hardif_put(primary_if); - seq_printf(seq, "%s %pM (%3i) %pM [%10s]: %u.%u/%u.%u MBit\n", - (curr_gw == gw_node ? "=>" : " "), - gw_node->orig_node->orig, - router_ifinfo->bat_iv.tq_avg, router->addr, - router->if_incoming->net_dev->name, - gw_node->bandwidth_down / 10, - gw_node->bandwidth_down % 10, - gw_node->bandwidth_up / 10, - gw_node->bandwidth_up % 10); - ret = seq_has_overflowed(seq) ? -1 : 0; + if (!bat_priv->algo_ops->gw.print) { + seq_puts(seq, + "No printing function for this routing protocol\n"); + return 0; + } - if (curr_gw) - batadv_gw_node_put(curr_gw); -out: - if (router_ifinfo) - batadv_neigh_ifinfo_put(router_ifinfo); - if (router) - batadv_neigh_node_put(router); - return ret; + bat_priv->algo_ops->gw.print(bat_priv, seq); + + return 0; } +#endif -int batadv_gw_client_seq_print_text(struct seq_file *seq, void *offset) +/** + * batadv_gw_dump - Dump gateways into a message + * @msg: Netlink message to dump into + * @cb: Control block containing additional options + * + * Return: Error code, or length of message + */ +int batadv_gw_dump(struct sk_buff *msg, struct netlink_callback *cb) { - struct net_device *net_dev = (struct net_device *)seq->private; - struct batadv_priv *bat_priv = netdev_priv(net_dev); - struct batadv_hard_iface *primary_if; - struct batadv_gw_node *gw_node; - int gw_count = 0; - - primary_if = batadv_seq_print_text_primary_if_get(seq); - if (!primary_if) + struct batadv_hard_iface *primary_if = NULL; + struct net *net = sock_net(cb->skb->sk); + struct net_device *soft_iface; + struct batadv_priv *bat_priv; + int ifindex; + int ret; + + ifindex = batadv_netlink_get_ifindex(cb->nlh, + BATADV_ATTR_MESH_IFINDEX); + if (!ifindex) + return -EINVAL; + + soft_iface = dev_get_by_index(net, ifindex); + if (!soft_iface || !batadv_softif_is_valid(soft_iface)) { + ret = -ENODEV; goto out; + } - seq_printf(seq, - " Gateway (#/255) Nexthop [outgoingIF]: advertised uplink bandwidth ... [B.A.T.M.A.N. 
adv %s, MainIF/MAC: %s/%pM (%s)]\n", - BATADV_SOURCE_VERSION, primary_if->net_dev->name, - primary_if->net_dev->dev_addr, net_dev->name); + bat_priv = netdev_priv(soft_iface); - rcu_read_lock(); - hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.list, list) { - /* fails if orig_node has no router */ - if (batadv_write_buffer_text(bat_priv, seq, gw_node) < 0) - continue; + primary_if = batadv_primary_if_get_selected(bat_priv); + if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) { + ret = -ENOENT; + goto out; + } - gw_count++; + if (!bat_priv->algo_ops->gw.dump) { + ret = -EOPNOTSUPP; + goto out; } - rcu_read_unlock(); - if (gw_count == 0) - seq_puts(seq, "No gateways in range ...\n"); + bat_priv->algo_ops->gw.dump(msg, cb, bat_priv); + + ret = msg->len; out: if (primary_if) batadv_hardif_put(primary_if); - return 0; + if (soft_iface) + dev_put(soft_iface); + + return ret; } /** diff --git a/net/batman-adv/gateway_client.h b/net/batman-adv/gateway_client.h index 582dd8c413c8..859166d03561 100644 --- a/net/batman-adv/gateway_client.h +++ b/net/batman-adv/gateway_client.h @@ -23,6 +23,7 @@ #include <linux/types.h> struct batadv_tvlv_gateway_data; +struct netlink_callback; struct seq_file; struct sk_buff; @@ -39,10 +40,16 @@ void batadv_gw_node_update(struct batadv_priv *bat_priv, void batadv_gw_node_delete(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node); void batadv_gw_node_free(struct batadv_priv *bat_priv); +void batadv_gw_node_put(struct batadv_gw_node *gw_node); +struct batadv_gw_node * +batadv_gw_get_selected_gw_node(struct batadv_priv *bat_priv); int batadv_gw_client_seq_print_text(struct seq_file *seq, void *offset); +int batadv_gw_dump(struct sk_buff *msg, struct netlink_callback *cb); bool batadv_gw_out_of_range(struct batadv_priv *bat_priv, struct sk_buff *skb); enum batadv_dhcp_recipient batadv_gw_dhcp_recipient_get(struct sk_buff *skb, unsigned int *header_len, u8 *chaddr); +struct batadv_gw_node *batadv_gw_node_get(struct batadv_priv *bat_priv, + struct batadv_orig_node *orig_node); #endif /* _NET_BATMAN_ADV_GATEWAY_CLIENT_H_ */ diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c index d7bc6a87bcc9..21184810d89f 100644 --- a/net/batman-adv/gateway_common.c +++ b/net/batman-adv/gateway_common.c @@ -241,10 +241,9 @@ static void batadv_gw_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv, batadv_gw_node_update(bat_priv, orig, &gateway); - /* restart gateway selection if fast or late switching was enabled */ + /* restart gateway selection */ if ((gateway.bandwidth_down != 0) && - (atomic_read(&bat_priv->gw.mode) == BATADV_GW_MODE_CLIENT) && - (atomic_read(&bat_priv->gw.sel_class) > 2)) + (atomic_read(&bat_priv->gw.mode) == BATADV_GW_MODE_CLIENT)) batadv_gw_check_election(bat_priv, orig); } diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index 1f9080840566..08ce36147c4c 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -35,7 +35,8 @@ #include <linux/rtnetlink.h> #include <linux/slab.h> #include <linux/spinlock.h> -#include <linux/workqueue.h> +#include <net/net_namespace.h> +#include <net/rtnetlink.h> #include "bat_v.h" #include "bridge_loop_avoidance.h" @@ -85,25 +86,55 @@ out: } /** + * batadv_getlink_net - return link net namespace (or use fallback) + * @netdev: net_device to check + * @fallback_net: returned in case get_link_net is not available for @netdev + * + * Return: result of rtnl_link_ops->get_link_net or @fallback_net + */ +static const struct net
*batadv_getlink_net(const struct net_device *netdev, + const struct net *fallback_net) +{ + if (!netdev->rtnl_link_ops) + return fallback_net; + + if (!netdev->rtnl_link_ops->get_link_net) + return fallback_net; + + return netdev->rtnl_link_ops->get_link_net(netdev); +} + +/** * batadv_mutual_parents - check if two devices are each others parent - * @dev1: 1st net_device - * @dev2: 2nd net_device + * @dev1: 1st net dev + * @net1: 1st devices netns + * @dev2: 2nd net dev + * @net2: 2nd devices netns * * veth devices come in pairs and each is the parent of the other! * * Return: true if the devices are each others parent, otherwise false */ static bool batadv_mutual_parents(const struct net_device *dev1, - const struct net_device *dev2) + const struct net *net1, + const struct net_device *dev2, + const struct net *net2) { int dev1_parent_iflink = dev_get_iflink(dev1); int dev2_parent_iflink = dev_get_iflink(dev2); + const struct net *dev1_parent_net; + const struct net *dev2_parent_net; + + dev1_parent_net = batadv_getlink_net(dev1, net1); + dev2_parent_net = batadv_getlink_net(dev2, net2); if (!dev1_parent_iflink || !dev2_parent_iflink) return false; return (dev1_parent_iflink == dev2->ifindex) && - (dev2_parent_iflink == dev1->ifindex); + (dev2_parent_iflink == dev1->ifindex) && + net_eq(dev1_parent_net, net2) && + net_eq(dev2_parent_net, net1); } /** @@ -121,8 +152,9 @@ static bool batadv_mutual_parents(const struct net_device *dev1, */ static bool batadv_is_on_batman_iface(const struct net_device *net_dev) { - struct net_device *parent_dev; struct net *net = dev_net(net_dev); + struct net_device *parent_dev; + const struct net *parent_net; bool ret; /* check if this is a batman-adv mesh interface */ @@ -134,13 +166,16 @@ static bool batadv_is_on_batman_iface(const struct net_device *net_dev) dev_get_iflink(net_dev) == net_dev->ifindex) return false; + parent_net = batadv_getlink_net(net_dev, net); + /* recurse over the parent device */ - parent_dev = __dev_get_by_index(net, dev_get_iflink(net_dev)); + parent_dev = __dev_get_by_index((struct net *)parent_net, + dev_get_iflink(net_dev)); /* if we got a NULL parent_dev there is something broken.. */ if (WARN(!parent_dev, "Cannot find parent device")) return false; - if (batadv_mutual_parents(net_dev, parent_dev)) + if (batadv_mutual_parents(net_dev, net, parent_dev, parent_net)) return false; ret = batadv_is_on_batman_iface(parent_dev); @@ -625,25 +660,6 @@ out: batadv_hardif_put(primary_if); } -/** - * batadv_hardif_remove_interface_finish - cleans up the remains of a hardif - * @work: work queue item - * - * Free the parts of the hard interface which can not be removed under - * rtnl lock (to prevent deadlock situations). 
- */ -static void batadv_hardif_remove_interface_finish(struct work_struct *work) -{ - struct batadv_hard_iface *hard_iface; - - hard_iface = container_of(work, struct batadv_hard_iface, - cleanup_work); - - batadv_debugfs_del_hardif(hard_iface); - batadv_sysfs_del_hardif(&hard_iface->hardif_obj); - batadv_hardif_put(hard_iface); -} - static struct batadv_hard_iface * batadv_hardif_add_interface(struct net_device *net_dev) { @@ -676,10 +692,9 @@ batadv_hardif_add_interface(struct net_device *net_dev) INIT_LIST_HEAD(&hard_iface->list); INIT_HLIST_HEAD(&hard_iface->neigh_list); - INIT_WORK(&hard_iface->cleanup_work, - batadv_hardif_remove_interface_finish); spin_lock_init(&hard_iface->neigh_list_lock); + kref_init(&hard_iface->refcount); hard_iface->num_bcasts = BATADV_NUM_BCASTS_DEFAULT; if (batadv_is_wifi_netdev(net_dev)) @@ -687,11 +702,8 @@ batadv_hardif_add_interface(struct net_device *net_dev) batadv_v_hardif_init(hard_iface); - /* extra reference for return */ - kref_init(&hard_iface->refcount); - kref_get(&hard_iface->refcount); - batadv_check_known_mac_addr(hard_iface->net_dev); + kref_get(&hard_iface->refcount); list_add_tail_rcu(&hard_iface->list, &batadv_hardif_list); return hard_iface; @@ -713,13 +725,15 @@ static void batadv_hardif_remove_interface(struct batadv_hard_iface *hard_iface) /* first deactivate interface */ if (hard_iface->if_status != BATADV_IF_NOT_IN_USE) batadv_hardif_disable_interface(hard_iface, - BATADV_IF_CLEANUP_AUTO); + BATADV_IF_CLEANUP_KEEP); if (hard_iface->if_status != BATADV_IF_NOT_IN_USE) return; hard_iface->if_status = BATADV_IF_TO_BE_REMOVED; - queue_work(batadv_event_workqueue, &hard_iface->cleanup_work); + batadv_debugfs_del_hardif(hard_iface); + batadv_sysfs_del_hardif(&hard_iface->hardif_obj); + batadv_hardif_put(hard_iface); } void batadv_hardif_remove_interfaces(void) diff --git a/net/batman-adv/icmp_socket.h b/net/batman-adv/icmp_socket.h index 618d5de06f20..e44a7da51431 100644 --- a/net/batman-adv/icmp_socket.h +++ b/net/batman-adv/icmp_socket.h @@ -26,9 +26,25 @@ struct batadv_icmp_header; #define BATADV_ICMP_SOCKET "socket" -void batadv_socket_init(void); int batadv_socket_setup(struct batadv_priv *bat_priv); + +#ifdef CONFIG_BATMAN_ADV_DEBUGFS + +void batadv_socket_init(void); void batadv_socket_receive_packet(struct batadv_icmp_header *icmph, size_t icmp_len); +#else + +static inline void batadv_socket_init(void) +{ +} + +static inline void +batadv_socket_receive_packet(struct batadv_icmp_header *icmph, size_t icmp_len) +{ +} + +#endif + #endif /* _NET_BATMAN_ADV_ICMP_SOCKET_H_ */ diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index fe4c5e29f96b..2c017ab47557 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -82,6 +82,12 @@ static void batadv_recv_handler_init(void); static int __init batadv_init(void) { + int ret; + + ret = batadv_tt_cache_init(); + if (ret < 0) + return ret; + INIT_LIST_HEAD(&batadv_hardif_list); batadv_algo_init(); @@ -93,9 +99,8 @@ static int __init batadv_init(void) batadv_tp_meter_init(); batadv_event_workqueue = create_singlethread_workqueue("bat_events"); - if (!batadv_event_workqueue) - return -ENOMEM; + goto err_create_wq; batadv_socket_init(); batadv_debugfs_init(); @@ -108,6 +113,11 @@ static int __init batadv_init(void) BATADV_SOURCE_VERSION, BATADV_COMPAT_VERSION); return 0; + +err_create_wq: + batadv_tt_cache_destroy(); + + return -ENOMEM; } static void __exit batadv_exit(void) @@ -123,6 +133,8 @@ static void __exit batadv_exit(void) batadv_event_workqueue = NULL; rcu_barrier(); 
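A pattern worth calling out, since it repeats across the backbone_gw, claim, dat_entry, gw_node, hard_iface, nc_node and orig_node hunks in this series: kref_init() now stands for the creator's (or returned) reference, and every structure that publishes the object takes its own explicit kref_get() immediately before the insertion, instead of a blanket kref_get() right after init. A compact sketch of that convention, with invented demo_* names and the usual RCU-deferred free assumed:

#include <linux/kref.h>
#include <linux/rcupdate.h>
#include <linux/rculist.h>
#include <linux/slab.h>

struct demo_node {
	struct kref refcount;
	struct rcu_head rcu;
	struct hlist_node list;
};

static void demo_node_release(struct kref *ref)
{
	struct demo_node *node = container_of(ref, struct demo_node, refcount);

	/* defer the free until concurrent RCU readers are done */
	kfree_rcu(node, rcu);
}

static void demo_node_put(struct demo_node *node)
{
	kref_put(&node->refcount, demo_node_release);
}

static struct demo_node *demo_node_new(struct hlist_head *head)
{
	struct demo_node *node;

	node = kzalloc(sizeof(*node), GFP_ATOMIC);
	if (!node)
		return NULL;

	/* reference owned by the caller */
	kref_init(&node->refcount);

	/* the list publishing the object takes its own reference
	 * right before the insertion
	 */
	kref_get(&node->refcount);
	hlist_add_head_rcu(&node->list, head);

	return node;	/* released again via demo_node_put() */
}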
+ + batadv_tt_cache_destroy(); } int batadv_mesh_init(struct net_device *soft_iface) @@ -270,6 +282,7 @@ bool batadv_is_my_mac(struct batadv_priv *bat_priv, const u8 *addr) return is_my_mac; } +#ifdef CONFIG_BATMAN_ADV_DEBUGFS /** * batadv_seq_print_text_primary_if_get - called from debugfs table printing * function that requires the primary interface @@ -305,6 +318,7 @@ batadv_seq_print_text_primary_if_get(struct seq_file *seq) out: return primary_if; } +#endif /** * batadv_max_header_len - calculate maximum encapsulation overhead for a @@ -638,3 +652,4 @@ MODULE_AUTHOR(BATADV_DRIVER_AUTHOR); MODULE_DESCRIPTION(BATADV_DRIVER_DESC); MODULE_SUPPORTED_DEVICE(BATADV_DRIVER_DEVICE); MODULE_VERSION(BATADV_SOURCE_VERSION); +MODULE_ALIAS_RTNL_LINK("batadv"); diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index 06a860845434..09af21e27639 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -24,7 +24,7 @@ #define BATADV_DRIVER_DEVICE "batman-adv" #ifndef BATADV_SOURCE_VERSION -#define BATADV_SOURCE_VERSION "2016.3" +#define BATADV_SOURCE_VERSION "2016.4" #endif /* B.A.T.M.A.N. parameters */ diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index cc915073a753..13661f43386f 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -528,7 +528,7 @@ update: } return !(mcast_data.flags & - (BATADV_MCAST_WANT_ALL_IPV4 + BATADV_MCAST_WANT_ALL_IPV6)); + (BATADV_MCAST_WANT_ALL_IPV4 | BATADV_MCAST_WANT_ALL_IPV6)); } /** @@ -1134,6 +1134,7 @@ void batadv_mcast_init(struct batadv_priv *bat_priv) BATADV_TVLV_HANDLER_OGM_CIFNOTFND); } +#ifdef CONFIG_BATMAN_ADV_DEBUGFS /** * batadv_mcast_flags_print_header - print own mcast flags to debugfs table * @bat_priv: the bat priv with all the soft interface information @@ -1234,6 +1235,7 @@ int batadv_mcast_flags_seq_print_text(struct seq_file *seq, void *offset) return 0; } +#endif /** * batadv_mcast_free - free the multicast optimizations structures diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index 231f8eaf075b..18831e72b0fb 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -18,6 +18,8 @@ #include "netlink.h" #include "main.h" +#include <linux/atomic.h> +#include <linux/byteorder/generic.h> #include <linux/errno.h> #include <linux/fs.h> #include <linux/genetlink.h> @@ -26,24 +28,33 @@ #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/printk.h> +#include <linux/rculist.h> +#include <linux/rcupdate.h> +#include <linux/skbuff.h> #include <linux/stddef.h> #include <linux/types.h> #include <net/genetlink.h> #include <net/netlink.h> +#include <net/sock.h> #include <uapi/linux/batman_adv.h> +#include "bat_algo.h" +#include "bridge_loop_avoidance.h" +#include "gateway_client.h" #include "hard-interface.h" +#include "originator.h" +#include "packet.h" #include "soft-interface.h" #include "tp_meter.h" +#include "translation-table.h" -struct sk_buff; - -static struct genl_family batadv_netlink_family = { +struct genl_family batadv_netlink_family = { .id = GENL_ID_GENERATE, .hdrsize = 0, .name = BATADV_NL_NAME, .version = 1, .maxattr = BATADV_ATTR_MAX, + .netnsok = true, }; /* multicast groups */ @@ -69,9 +80,44 @@ static struct nla_policy batadv_netlink_policy[NUM_BATADV_ATTR] = { [BATADV_ATTR_TPMETER_TEST_TIME] = { .type = NLA_U32 }, [BATADV_ATTR_TPMETER_BYTES] = { .type = NLA_U64 }, [BATADV_ATTR_TPMETER_COOKIE] = { .type = NLA_U32 }, + [BATADV_ATTR_ACTIVE] = { .type = NLA_FLAG }, + [BATADV_ATTR_TT_ADDRESS] = { .len = ETH_ALEN }, + [BATADV_ATTR_TT_TTVN] 
= { .type = NLA_U8 }, + [BATADV_ATTR_TT_LAST_TTVN] = { .type = NLA_U8 }, + [BATADV_ATTR_TT_CRC32] = { .type = NLA_U32 }, + [BATADV_ATTR_TT_VID] = { .type = NLA_U16 }, + [BATADV_ATTR_TT_FLAGS] = { .type = NLA_U32 }, + [BATADV_ATTR_FLAG_BEST] = { .type = NLA_FLAG }, + [BATADV_ATTR_LAST_SEEN_MSECS] = { .type = NLA_U32 }, + [BATADV_ATTR_NEIGH_ADDRESS] = { .len = ETH_ALEN }, + [BATADV_ATTR_TQ] = { .type = NLA_U8 }, + [BATADV_ATTR_THROUGHPUT] = { .type = NLA_U32 }, + [BATADV_ATTR_BANDWIDTH_UP] = { .type = NLA_U32 }, + [BATADV_ATTR_BANDWIDTH_DOWN] = { .type = NLA_U32 }, + [BATADV_ATTR_ROUTER] = { .len = ETH_ALEN }, + [BATADV_ATTR_BLA_OWN] = { .type = NLA_FLAG }, + [BATADV_ATTR_BLA_ADDRESS] = { .len = ETH_ALEN }, + [BATADV_ATTR_BLA_VID] = { .type = NLA_U16 }, + [BATADV_ATTR_BLA_BACKBONE] = { .len = ETH_ALEN }, + [BATADV_ATTR_BLA_CRC] = { .type = NLA_U16 }, }; /** + * batadv_netlink_get_ifindex - Extract an interface index from a message + * @nlh: Message header + * @attrtype: Attribute which holds an interface index + * + * Return: interface index, or 0. + */ +int +batadv_netlink_get_ifindex(const struct nlmsghdr *nlh, int attrtype) +{ + struct nlattr *attr = nlmsg_find_attr(nlh, GENL_HDRLEN, attrtype); + + return attr ? nla_get_u32(attr) : 0; +} + +/** * batadv_netlink_mesh_info_put - fill in generic information about mesh * interface * @msg: netlink message to be sent back @@ -93,8 +139,16 @@ batadv_netlink_mesh_info_put(struct sk_buff *msg, struct net_device *soft_iface) nla_put_u32(msg, BATADV_ATTR_MESH_IFINDEX, soft_iface->ifindex) || nla_put_string(msg, BATADV_ATTR_MESH_IFNAME, soft_iface->name) || nla_put(msg, BATADV_ATTR_MESH_ADDRESS, ETH_ALEN, - soft_iface->dev_addr)) + soft_iface->dev_addr) || + nla_put_u8(msg, BATADV_ATTR_TT_TTVN, + (u8)atomic_read(&bat_priv->tt.vn))) + goto out; + +#ifdef CONFIG_BATMAN_ADV_BLA + if (nla_put_u16(msg, BATADV_ATTR_BLA_CRC, + ntohs(bat_priv->bla.claim_dest.group))) goto out; +#endif primary_if = batadv_primary_if_get_selected(bat_priv); if (primary_if && primary_if->if_status == BATADV_IF_ACTIVE) { @@ -380,6 +434,106 @@ out: return ret; } +/** + * batadv_netlink_dump_hardif_entry - Dump one hard interface into a message + * @msg: Netlink message to dump into + * @portid: Port making netlink request + * @seq: Sequence number of netlink message + * @hard_iface: Hard interface to dump + * + * Return: error code, or 0 on success + */ +static int +batadv_netlink_dump_hardif_entry(struct sk_buff *msg, u32 portid, u32 seq, + struct batadv_hard_iface *hard_iface) +{ + struct net_device *net_dev = hard_iface->net_dev; + void *hdr; + + hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, NLM_F_MULTI, + BATADV_CMD_GET_HARDIFS); + if (!hdr) + return -EMSGSIZE; + + if (nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX, + net_dev->ifindex) || + nla_put_string(msg, BATADV_ATTR_HARD_IFNAME, + net_dev->name) || + nla_put(msg, BATADV_ATTR_HARD_ADDRESS, ETH_ALEN, + net_dev->dev_addr)) + goto nla_put_failure; + + if (hard_iface->if_status == BATADV_IF_ACTIVE) { + if (nla_put_flag(msg, BATADV_ATTR_ACTIVE)) + goto nla_put_failure; + } + + genlmsg_end(msg, hdr); + return 0; + + nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +/** + * batadv_netlink_dump_hardifs - Dump all hard interfaces into a message + * @msg: Netlink message to dump into + * @cb: Parameters from query + * + * Return: error code, or length of reply message on success + */ +static int +batadv_netlink_dump_hardifs(struct sk_buff *msg, struct netlink_callback *cb) +{ + struct net *net =
sock_net(cb->skb->sk); + struct net_device *soft_iface; + struct batadv_hard_iface *hard_iface; + int ifindex; + int portid = NETLINK_CB(cb->skb).portid; + int seq = cb->nlh->nlmsg_seq; + int skip = cb->args[0]; + int i = 0; + + ifindex = batadv_netlink_get_ifindex(cb->nlh, + BATADV_ATTR_MESH_IFINDEX); + if (!ifindex) + return -EINVAL; + + soft_iface = dev_get_by_index(net, ifindex); + if (!soft_iface) + return -ENODEV; + + if (!batadv_softif_is_valid(soft_iface)) { + dev_put(soft_iface); + return -ENODEV; + } + + rcu_read_lock(); + + list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { + if (hard_iface->soft_iface != soft_iface) + continue; + + if (i++ < skip) + continue; + + if (batadv_netlink_dump_hardif_entry(msg, portid, seq, + hard_iface)) { + i--; + break; + } + } + + rcu_read_unlock(); + + dev_put(soft_iface); + + cb->args[0] = i; + + return msg->len; +} + static struct genl_ops batadv_netlink_ops[] = { { .cmd = BATADV_CMD_GET_MESH_INFO, @@ -399,6 +553,61 @@ static struct genl_ops batadv_netlink_ops[] = { .policy = batadv_netlink_policy, .doit = batadv_netlink_tp_meter_cancel, }, + { + .cmd = BATADV_CMD_GET_ROUTING_ALGOS, + .flags = GENL_ADMIN_PERM, + .policy = batadv_netlink_policy, + .dumpit = batadv_algo_dump, + }, + { + .cmd = BATADV_CMD_GET_HARDIFS, + .flags = GENL_ADMIN_PERM, + .policy = batadv_netlink_policy, + .dumpit = batadv_netlink_dump_hardifs, + }, + { + .cmd = BATADV_CMD_GET_TRANSTABLE_LOCAL, + .flags = GENL_ADMIN_PERM, + .policy = batadv_netlink_policy, + .dumpit = batadv_tt_local_dump, + }, + { + .cmd = BATADV_CMD_GET_TRANSTABLE_GLOBAL, + .flags = GENL_ADMIN_PERM, + .policy = batadv_netlink_policy, + .dumpit = batadv_tt_global_dump, + }, + { + .cmd = BATADV_CMD_GET_ORIGINATORS, + .flags = GENL_ADMIN_PERM, + .policy = batadv_netlink_policy, + .dumpit = batadv_orig_dump, + }, + { + .cmd = BATADV_CMD_GET_NEIGHBORS, + .flags = GENL_ADMIN_PERM, + .policy = batadv_netlink_policy, + .dumpit = batadv_hardif_neigh_dump, + }, + { + .cmd = BATADV_CMD_GET_GATEWAYS, + .flags = GENL_ADMIN_PERM, + .policy = batadv_netlink_policy, + .dumpit = batadv_gw_dump, + }, + { + .cmd = BATADV_CMD_GET_BLA_CLAIM, + .flags = GENL_ADMIN_PERM, + .policy = batadv_netlink_policy, + .dumpit = batadv_bla_claim_dump, + }, + { + .cmd = BATADV_CMD_GET_BLA_BACKBONE, + .flags = GENL_ADMIN_PERM, + .policy = batadv_netlink_policy, + .dumpit = batadv_bla_backbone_dump, + }, + }; /** diff --git a/net/batman-adv/netlink.h b/net/batman-adv/netlink.h index 945653ab58c6..52eb16281aba 100644 --- a/net/batman-adv/netlink.h +++ b/net/batman-adv/netlink.h @@ -21,12 +21,18 @@ #include "main.h" #include <linux/types.h> +#include <net/genetlink.h> + +struct nlmsghdr; void batadv_netlink_register(void); void batadv_netlink_unregister(void); +int batadv_netlink_get_ifindex(const struct nlmsghdr *nlh, int attrtype); int batadv_netlink_tpmeter_notify(struct batadv_priv *bat_priv, const u8 *dst, u8 result, u32 test_time, u64 total_bytes, u32 cookie); +extern struct genl_family batadv_netlink_family; + #endif /* _NET_BATMAN_ADV_NETLINK_H_ */ diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c index 293ef4ffd4e1..e3baf697a35c 100644 --- a/net/batman-adv/network-coding.c +++ b/net/batman-adv/network-coding.c @@ -856,14 +856,12 @@ batadv_nc_get_nc_node(struct batadv_priv *bat_priv, if (!nc_node) return NULL; - kref_get(&orig_neigh_node->refcount); - /* Initialize nc_node */ INIT_LIST_HEAD(&nc_node->list); + kref_init(&nc_node->refcount); ether_addr_copy(nc_node->addr, orig_node->orig); + 
kref_get(&orig_neigh_node->refcount); nc_node->orig_node = orig_neigh_node; - kref_init(&nc_node->refcount); - kref_get(&nc_node->refcount); /* Select ingoing or outgoing coding node */ if (in_coding) { @@ -879,6 +877,7 @@ batadv_nc_get_nc_node(struct batadv_priv *bat_priv, /* Add nc_node to orig_node */ spin_lock_bh(lock); + kref_get(&nc_node->refcount); list_add_tail_rcu(&nc_node->list, list); spin_unlock_bh(lock); @@ -979,7 +978,6 @@ static struct batadv_nc_path *batadv_nc_get_path(struct batadv_priv *bat_priv, INIT_LIST_HEAD(&nc_path->packet_list); spin_lock_init(&nc_path->packet_list_lock); kref_init(&nc_path->refcount); - kref_get(&nc_path->refcount); nc_path->last_valid = jiffies; ether_addr_copy(nc_path->next_hop, dst); ether_addr_copy(nc_path->prev_hop, src); @@ -989,6 +987,7 @@ static struct batadv_nc_path *batadv_nc_get_path(struct batadv_priv *bat_priv, nc_path->next_hop); /* Add nc_path to hash table */ + kref_get(&nc_path->refcount); hash_added = batadv_hash_add(hash, batadv_nc_hash_compare, batadv_nc_hash_choose, &nc_path_key, &nc_path->hash_entry); @@ -1882,6 +1881,7 @@ void batadv_nc_mesh_free(struct batadv_priv *bat_priv) batadv_hash_destroy(bat_priv->nc.decoding_hash); } +#ifdef CONFIG_BATMAN_ADV_DEBUGFS /** * batadv_nc_nodes_seq_print_text - print the nc node information * @seq: seq file to print on @@ -1981,3 +1981,4 @@ int batadv_nc_init_debugfs(struct batadv_priv *bat_priv) out: return -ENOMEM; } +#endif diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index 3940b5d24421..5f3bfc41aeb1 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -28,11 +28,15 @@ #include <linux/list.h> #include <linux/lockdep.h> #include <linux/netdevice.h> +#include <linux/netlink.h> #include <linux/rculist.h> #include <linux/seq_file.h> +#include <linux/skbuff.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/workqueue.h> +#include <net/sock.h> +#include <uapi/linux/batman_adv.h> #include "bat_algo.h" #include "distributed-arp-table.h" @@ -42,8 +46,10 @@ #include "hash.h" #include "log.h" #include "multicast.h" +#include "netlink.h" #include "network-coding.h" #include "routing.h" +#include "soft-interface.h" #include "translation-table.h" /* hash class keys */ @@ -127,9 +133,9 @@ batadv_orig_node_vlan_new(struct batadv_orig_node *orig_node, goto out; kref_init(&vlan->refcount); - kref_get(&vlan->refcount); vlan->vid = vid; + kref_get(&vlan->refcount); hlist_add_head_rcu(&vlan->list, &orig_node->vlan_list); out: @@ -380,6 +386,7 @@ batadv_orig_ifinfo_new(struct batadv_orig_node *orig_node, orig_ifinfo->if_outgoing = if_outgoing; INIT_HLIST_NODE(&orig_ifinfo->list); kref_init(&orig_ifinfo->refcount); + kref_get(&orig_ifinfo->refcount); hlist_add_head_rcu(&orig_ifinfo->list, &orig_node->ifinfo_list); @@ -453,9 +460,9 @@ batadv_neigh_ifinfo_new(struct batadv_neigh_node *neigh, INIT_HLIST_NODE(&neigh_ifinfo->list); kref_init(&neigh_ifinfo->refcount); - kref_get(&neigh_ifinfo->refcount); neigh_ifinfo->if_outgoing = if_outgoing; + kref_get(&neigh_ifinfo->refcount); hlist_add_head_rcu(&neigh_ifinfo->list, &neigh->ifinfo_list); out: @@ -647,8 +654,8 @@ batadv_neigh_node_create(struct batadv_orig_node *orig_node, /* extra reference for return */ kref_init(&neigh_node->refcount); - kref_get(&neigh_node->refcount); + kref_get(&neigh_node->refcount); hlist_add_head_rcu(&neigh_node->list, &orig_node->neigh_list); batadv_dbg(BATADV_DBG_BATMAN, orig_node->bat_priv, @@ -686,6 +693,7 @@ batadv_neigh_node_get_or_create(struct 
batadv_orig_node *orig_node, return batadv_neigh_node_create(orig_node, hard_iface, neigh_addr); } +#ifdef CONFIG_BATMAN_ADV_DEBUGFS /** * batadv_hardif_neigh_seq_print_text - print the single hop neighbour list * @seq: neighbour table seq_file struct @@ -719,6 +727,84 @@ int batadv_hardif_neigh_seq_print_text(struct seq_file *seq, void *offset) bat_priv->algo_ops->neigh.print(bat_priv, seq); return 0; } +#endif + +/** + * batadv_hardif_neigh_dump - Dump to netlink the neighbor infos for a specific + * outgoing interface + * @msg: message to dump into + * @cb: parameters for the dump + * + * Return: 0 or error value + */ +int batadv_hardif_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb) +{ + struct net *net = sock_net(cb->skb->sk); + struct net_device *soft_iface; + struct net_device *hard_iface = NULL; + struct batadv_hard_iface *hardif = BATADV_IF_DEFAULT; + struct batadv_priv *bat_priv; + struct batadv_hard_iface *primary_if = NULL; + int ret; + int ifindex, hard_ifindex; + + ifindex = batadv_netlink_get_ifindex(cb->nlh, BATADV_ATTR_MESH_IFINDEX); + if (!ifindex) + return -EINVAL; + + soft_iface = dev_get_by_index(net, ifindex); + if (!soft_iface || !batadv_softif_is_valid(soft_iface)) { + ret = -ENODEV; + goto out; + } + + bat_priv = netdev_priv(soft_iface); + + primary_if = batadv_primary_if_get_selected(bat_priv); + if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) { + ret = -ENOENT; + goto out; + } + + hard_ifindex = batadv_netlink_get_ifindex(cb->nlh, + BATADV_ATTR_HARD_IFINDEX); + if (hard_ifindex) { + hard_iface = dev_get_by_index(net, hard_ifindex); + if (hard_iface) + hardif = batadv_hardif_get_by_netdev(hard_iface); + + if (!hardif) { + ret = -ENODEV; + goto out; + } + + if (hardif->soft_iface != soft_iface) { + ret = -ENOENT; + goto out; + } + } + + if (!bat_priv->algo_ops->neigh.dump) { + ret = -EOPNOTSUPP; + goto out; + } + + bat_priv->algo_ops->neigh.dump(msg, cb, bat_priv, hardif); + + ret = msg->len; + + out: + if (hardif) + batadv_hardif_put(hardif); + if (hard_iface) + dev_put(hard_iface); + if (primary_if) + batadv_hardif_put(primary_if); + if (soft_iface) + dev_put(soft_iface); + + return ret; +} /** * batadv_orig_ifinfo_release - release orig_ifinfo from lists and queue for @@ -905,7 +991,6 @@ struct batadv_orig_node *batadv_orig_node_new(struct batadv_priv *bat_priv, /* extra reference for return */ kref_init(&orig_node->refcount); - kref_get(&orig_node->refcount); orig_node->bat_priv = bat_priv; ether_addr_copy(orig_node->orig, addr); @@ -1256,6 +1341,7 @@ void batadv_purge_orig_ref(struct batadv_priv *bat_priv) _batadv_purge_orig(bat_priv); } +#ifdef CONFIG_BATMAN_ADV_DEBUGFS int batadv_orig_seq_print_text(struct seq_file *seq, void *offset) { struct net_device *net_dev = (struct net_device *)seq->private; @@ -1329,6 +1415,84 @@ out: batadv_hardif_put(hard_iface); return 0; } +#endif + +/** + * batadv_orig_dump - Dump to netlink the originator infos for a specific + * outgoing interface + * @msg: message to dump into + * @cb: parameters for the dump + * + * Return: 0 or error value + */ +int batadv_orig_dump(struct sk_buff *msg, struct netlink_callback *cb) +{ + struct net *net = sock_net(cb->skb->sk); + struct net_device *soft_iface; + struct net_device *hard_iface = NULL; + struct batadv_hard_iface *hardif = BATADV_IF_DEFAULT; + struct batadv_priv *bat_priv; + struct batadv_hard_iface *primary_if = NULL; + int ret; + int ifindex, hard_ifindex; + + ifindex = batadv_netlink_get_ifindex(cb->nlh, BATADV_ATTR_MESH_IFINDEX); + if (!ifindex) + 
return -EINVAL; + + soft_iface = dev_get_by_index(net, ifindex); + if (!soft_iface || !batadv_softif_is_valid(soft_iface)) { + ret = -ENODEV; + goto out; + } + + bat_priv = netdev_priv(soft_iface); + + primary_if = batadv_primary_if_get_selected(bat_priv); + if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) { + ret = -ENOENT; + goto out; + } + + hard_ifindex = batadv_netlink_get_ifindex(cb->nlh, + BATADV_ATTR_HARD_IFINDEX); + if (hard_ifindex) { + hard_iface = dev_get_by_index(net, hard_ifindex); + if (hard_iface) + hardif = batadv_hardif_get_by_netdev(hard_iface); + + if (!hardif) { + ret = -ENODEV; + goto out; + } + + if (hardif->soft_iface != soft_iface) { + ret = -ENOENT; + goto out; + } + } + + if (!bat_priv->algo_ops->orig.dump) { + ret = -EOPNOTSUPP; + goto out; + } + + bat_priv->algo_ops->orig.dump(msg, cb, bat_priv, hardif); + + ret = msg->len; + + out: + if (hardif) + batadv_hardif_put(hardif); + if (hard_iface) + dev_put(hard_iface); + if (primary_if) + batadv_hardif_put(primary_if); + if (soft_iface) + dev_put(soft_iface); + + return ret; +} int batadv_orig_hash_add_if(struct batadv_hard_iface *hard_iface, int max_if_num) diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h index 566306bf05dc..ebc56183f358 100644 --- a/net/batman-adv/originator.h +++ b/net/batman-adv/originator.h @@ -31,7 +31,9 @@ #include "hash.h" +struct netlink_callback; struct seq_file; +struct sk_buff; bool batadv_compare_orig(const struct hlist_node *node, const void *data2); int batadv_originator_init(struct batadv_priv *bat_priv); @@ -61,6 +63,7 @@ batadv_neigh_ifinfo_get(struct batadv_neigh_node *neigh, struct batadv_hard_iface *if_outgoing); void batadv_neigh_ifinfo_put(struct batadv_neigh_ifinfo *neigh_ifinfo); +int batadv_hardif_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb); int batadv_hardif_neigh_seq_print_text(struct seq_file *seq, void *offset); struct batadv_orig_ifinfo * @@ -72,6 +75,7 @@ batadv_orig_ifinfo_new(struct batadv_orig_node *orig_node, void batadv_orig_ifinfo_put(struct batadv_orig_ifinfo *orig_ifinfo); int batadv_orig_seq_print_text(struct seq_file *seq, void *offset); +int batadv_orig_dump(struct sk_buff *msg, struct netlink_callback *cb); int batadv_orig_hardif_seq_print_text(struct seq_file *seq, void *offset); int batadv_orig_hash_add_if(struct batadv_hard_iface *hard_iface, int max_if_num); diff --git a/net/batman-adv/packet.h b/net/batman-adv/packet.h index 6b011ff64dd8..6afc0b86950e 100644 --- a/net/batman-adv/packet.h +++ b/net/batman-adv/packet.h @@ -129,42 +129,6 @@ enum batadv_tt_data_flags { }; /** - * enum batadv_tt_client_flags - TT client specific flags - * @BATADV_TT_CLIENT_DEL: the client has to be deleted from the table - * @BATADV_TT_CLIENT_ROAM: the client roamed to/from another node and the new - * update telling its new real location has not been received/sent yet - * @BATADV_TT_CLIENT_WIFI: this client is connected through a wifi interface. - * This information is used by the "AP Isolation" feature - * @BATADV_TT_CLIENT_ISOLA: this client is considered "isolated". 
This - * information is used by the Extended Isolation feature - * @BATADV_TT_CLIENT_NOPURGE: this client should never be removed from the table - * @BATADV_TT_CLIENT_NEW: this client has been added to the local table but has - * not been announced yet - * @BATADV_TT_CLIENT_PENDING: this client is marked for removal but it is kept - * in the table for one more originator interval for consistency purposes - * @BATADV_TT_CLIENT_TEMP: this global client has been detected to be part of - * the network but no nnode has already announced it - * - * Bits from 0 to 7 are called _remote flags_ because they are sent on the wire. - * Bits from 8 to 15 are called _local flags_ because they are used for local - * computations only. - * - * Bits from 4 to 7 - a subset of remote flags - are ensured to be in sync with - * the other nodes in the network. To achieve this goal these flags are included - * in the TT CRC computation. - */ -enum batadv_tt_client_flags { - BATADV_TT_CLIENT_DEL = BIT(0), - BATADV_TT_CLIENT_ROAM = BIT(1), - BATADV_TT_CLIENT_WIFI = BIT(4), - BATADV_TT_CLIENT_ISOLA = BIT(5), - BATADV_TT_CLIENT_NOPURGE = BIT(8), - BATADV_TT_CLIENT_NEW = BIT(9), - BATADV_TT_CLIENT_PENDING = BIT(10), - BATADV_TT_CLIENT_TEMP = BIT(11), -}; - -/** * enum batadv_vlan_flags - flags for the four MSB of any vlan ID field * @BATADV_VLAN_HAS_TAG: whether the field contains a valid vlan tag or not */ diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c index 7602c001e92b..610f2c45edcd 100644 --- a/net/batman-adv/routing.c +++ b/net/batman-adv/routing.c @@ -74,11 +74,23 @@ static void _batadv_update_route(struct batadv_priv *bat_priv, if (!orig_ifinfo) return; - rcu_read_lock(); - curr_router = rcu_dereference(orig_ifinfo->router); - if (curr_router && !kref_get_unless_zero(&curr_router->refcount)) - curr_router = NULL; - rcu_read_unlock(); + spin_lock_bh(&orig_node->neigh_list_lock); + /* curr_router used earlier may not be the current orig_ifinfo->router + * anymore because it was dereferenced outside of the neigh_list_lock + * protected region. After the new best neighbor has replace the current + * best neighbor the reference counter needs to decrease. Consequently, + * the code needs to ensure the curr_router variable contains a pointer + * to the replaced best neighbor. + */ + curr_router = rcu_dereference_protected(orig_ifinfo->router, true); + + /* increase refcount of new best neighbor */ + if (neigh_node) + kref_get(&neigh_node->refcount); + + rcu_assign_pointer(orig_ifinfo->router, neigh_node); + spin_unlock_bh(&orig_node->neigh_list_lock); + batadv_orig_ifinfo_put(orig_ifinfo); /* route deleted */ if ((curr_router) && (!neigh_node)) { @@ -100,27 +112,6 @@ static void _batadv_update_route(struct batadv_priv *bat_priv, curr_router->addr); } - if (curr_router) - batadv_neigh_node_put(curr_router); - - spin_lock_bh(&orig_node->neigh_list_lock); - /* curr_router used earlier may not be the current orig_ifinfo->router - * anymore because it was dereferenced outside of the neigh_list_lock - * protected region. After the new best neighbor has replace the current - * best neighbor the reference counter needs to decrease. Consequently, - * the code needs to ensure the curr_router variable contains a pointer - * to the replaced best neighbor. 
- */ - curr_router = rcu_dereference_protected(orig_ifinfo->router, true); - - /* increase refcount of new best neighbor */ - if (neigh_node) - kref_get(&neigh_node->refcount); - - rcu_assign_pointer(orig_ifinfo->router, neigh_node); - spin_unlock_bh(&orig_node->neigh_list_lock); - batadv_orig_ifinfo_put(orig_ifinfo); - /* decrease refcount of previous best neighbor */ if (curr_router) batadv_neigh_node_put(curr_router); diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c index 6191159484df..8d4e1f578574 100644 --- a/net/batman-adv/send.c +++ b/net/batman-adv/send.c @@ -315,8 +315,7 @@ out: * * Wrap the given skb into a batman-adv unicast or unicast-4addr header * depending on whether BATADV_UNICAST or BATADV_UNICAST_4ADDR was supplied - * as packet_type. Then send this frame to the given orig_node and release a - * reference to this orig_node. + * as packet_type. Then send this frame to the given orig_node. * * Return: NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise. */ @@ -370,8 +369,6 @@ int batadv_send_skb_unicast(struct batadv_priv *bat_priv, ret = NET_XMIT_SUCCESS; out: - if (orig_node) - batadv_orig_node_put(orig_node); if (ret == NET_XMIT_DROP) kfree_skb(skb); return ret; @@ -403,6 +400,7 @@ int batadv_send_skb_via_tt_generic(struct batadv_priv *bat_priv, struct ethhdr *ethhdr = (struct ethhdr *)skb->data; struct batadv_orig_node *orig_node; u8 *src, *dst; + int ret; src = ethhdr->h_source; dst = ethhdr->h_dest; @@ -414,8 +412,13 @@ int batadv_send_skb_via_tt_generic(struct batadv_priv *bat_priv, } orig_node = batadv_transtable_search(bat_priv, src, dst, vid); - return batadv_send_skb_unicast(bat_priv, skb, packet_type, - packet_subtype, orig_node, vid); + ret = batadv_send_skb_unicast(bat_priv, skb, packet_type, + packet_subtype, orig_node, vid); + + if (orig_node) + batadv_orig_node_put(orig_node); + + return ret; } /** @@ -433,12 +436,25 @@ int batadv_send_skb_via_gw(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid) { struct batadv_orig_node *orig_node; + int ret; orig_node = batadv_gw_get_selected_orig(bat_priv); - return batadv_send_skb_unicast(bat_priv, skb, BATADV_UNICAST_4ADDR, - BATADV_P_DATA, orig_node, vid); + ret = batadv_send_skb_unicast(bat_priv, skb, BATADV_UNICAST_4ADDR, + BATADV_P_DATA, orig_node, vid); + + if (orig_node) + batadv_orig_node_put(orig_node); + + return ret; } +/** + * batadv_forw_packet_free - free a forwarding packet + * @forw_packet: The packet to free + * + * This frees a forwarding packet and releases any resources it might + * have claimed. + */ void batadv_forw_packet_free(struct batadv_forw_packet *forw_packet) { kfree_skb(forw_packet->skb); @@ -446,9 +462,73 @@ void batadv_forw_packet_free(struct batadv_forw_packet *forw_packet) batadv_hardif_put(forw_packet->if_incoming); if (forw_packet->if_outgoing) batadv_hardif_put(forw_packet->if_outgoing); + if (forw_packet->queue_left) + atomic_inc(forw_packet->queue_left); kfree(forw_packet); } +/** + * batadv_forw_packet_alloc - allocate a forwarding packet + * @if_incoming: The (optional) if_incoming to be grabbed + * @if_outgoing: The (optional) if_outgoing to be grabbed + * @queue_left: The (optional) queue counter to decrease + * @bat_priv: The bat_priv for the mesh of this forw_packet + * + * Allocates a forwarding packet and tries to get a reference to the + * (optional) if_incoming, if_outgoing and queue_left. If queue_left + * is NULL then bat_priv is optional, too. + * + * Return: An allocated forwarding packet on success, NULL otherwise. 
+ */ +struct batadv_forw_packet * +batadv_forw_packet_alloc(struct batadv_hard_iface *if_incoming, + struct batadv_hard_iface *if_outgoing, + atomic_t *queue_left, + struct batadv_priv *bat_priv) +{ + struct batadv_forw_packet *forw_packet; + const char *qname; + + if (queue_left && !batadv_atomic_dec_not_zero(queue_left)) { + qname = "unknown"; + + if (queue_left == &bat_priv->bcast_queue_left) + qname = "bcast"; + + if (queue_left == &bat_priv->batman_queue_left) + qname = "batman"; + + batadv_dbg(BATADV_DBG_BATMAN, bat_priv, + "%s queue is full\n", qname); + + return NULL; + } + + forw_packet = kmalloc(sizeof(*forw_packet), GFP_ATOMIC); + if (!forw_packet) + goto err; + + if (if_incoming) + kref_get(&if_incoming->refcount); + + if (if_outgoing) + kref_get(&if_outgoing->refcount); + + forw_packet->skb = NULL; + forw_packet->queue_left = queue_left; + forw_packet->if_incoming = if_incoming; + forw_packet->if_outgoing = if_outgoing; + forw_packet->num_packets = 0; + + return forw_packet; + +err: + if (queue_left) + atomic_inc(queue_left); + + return NULL; +} + static void _batadv_add_bcast_packet_to_list(struct batadv_priv *bat_priv, struct batadv_forw_packet *forw_packet, @@ -487,24 +567,20 @@ int batadv_add_bcast_packet_to_list(struct batadv_priv *bat_priv, struct batadv_bcast_packet *bcast_packet; struct sk_buff *newskb; - if (!batadv_atomic_dec_not_zero(&bat_priv->bcast_queue_left)) { - batadv_dbg(BATADV_DBG_BATMAN, bat_priv, - "bcast packet queue full\n"); - goto out; - } - primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if) - goto out_and_inc; - - forw_packet = kmalloc(sizeof(*forw_packet), GFP_ATOMIC); + goto err; + forw_packet = batadv_forw_packet_alloc(primary_if, NULL, + &bat_priv->bcast_queue_left, + bat_priv); + batadv_hardif_put(primary_if); if (!forw_packet) - goto out_and_inc; + goto err; newskb = skb_copy(skb, GFP_ATOMIC); if (!newskb) - goto packet_free; + goto err_packet_free; /* as we have a copy now, it is safe to decrease the TTL */ bcast_packet = (struct batadv_bcast_packet *)newskb->data; @@ -513,11 +589,6 @@ int batadv_add_bcast_packet_to_list(struct batadv_priv *bat_priv, skb_reset_mac_header(newskb); forw_packet->skb = newskb; - forw_packet->if_incoming = primary_if; - forw_packet->if_outgoing = NULL; - - /* how often did we send the bcast packet ? 
*/ - forw_packet->num_packets = 0; INIT_DELAYED_WORK(&forw_packet->delayed_work, batadv_send_outstanding_bcast_packet); @@ -525,13 +596,9 @@ int batadv_add_bcast_packet_to_list(struct batadv_priv *bat_priv, _batadv_add_bcast_packet_to_list(bat_priv, forw_packet, delay); return NETDEV_TX_OK; -packet_free: - kfree(forw_packet); -out_and_inc: - atomic_inc(&bat_priv->bcast_queue_left); -out: - if (primary_if) - batadv_hardif_put(primary_if); +err_packet_free: + batadv_forw_packet_free(forw_packet); +err: return NETDEV_TX_BUSY; } @@ -592,7 +659,6 @@ static void batadv_send_outstanding_bcast_packet(struct work_struct *work) out: batadv_forw_packet_free(forw_packet); - atomic_inc(&bat_priv->bcast_queue_left); } void @@ -633,9 +699,6 @@ batadv_purge_outstanding_packets(struct batadv_priv *bat_priv, if (pending) { hlist_del(&forw_packet->list); - if (!forw_packet->own) - atomic_inc(&bat_priv->bcast_queue_left); - batadv_forw_packet_free(forw_packet); } } @@ -663,9 +726,6 @@ batadv_purge_outstanding_packets(struct batadv_priv *bat_priv, if (pending) { hlist_del(&forw_packet->list); - if (!forw_packet->own) - atomic_inc(&bat_priv->batman_queue_left); - batadv_forw_packet_free(forw_packet); } } diff --git a/net/batman-adv/send.h b/net/batman-adv/send.h index 7cecb7563b45..999f78683d9e 100644 --- a/net/batman-adv/send.h +++ b/net/batman-adv/send.h @@ -28,6 +28,12 @@ struct sk_buff; void batadv_forw_packet_free(struct batadv_forw_packet *forw_packet); +struct batadv_forw_packet * +batadv_forw_packet_alloc(struct batadv_hard_iface *if_incoming, + struct batadv_hard_iface *if_outgoing, + atomic_t *queue_left, + struct batadv_priv *bat_priv); + int batadv_send_skb_to_orig(struct sk_buff *skb, struct batadv_orig_node *orig_node, struct batadv_hard_iface *recv_if); diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 7527c0652dd5..49e16b6e0ba3 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -39,6 +39,7 @@ #include <linux/random.h> #include <linux/rculist.h> #include <linux/rcupdate.h> +#include <linux/rtnetlink.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/socket.h> @@ -46,7 +47,6 @@ #include <linux/stddef.h> #include <linux/string.h> #include <linux/types.h> -#include <linux/workqueue.h> #include "bat_algo.h" #include "bridge_loop_avoidance.h" @@ -57,6 +57,7 @@ #include "hard-interface.h" #include "multicast.h" #include "network-coding.h" +#include "originator.h" #include "packet.h" #include "send.h" #include "sysfs.h" @@ -377,6 +378,8 @@ dropped: dropped_freed: batadv_inc_counter(bat_priv, BATADV_CNT_TX_DROPPED); end: + if (mcast_single_orig) + batadv_orig_node_put(mcast_single_orig); if (primary_if) batadv_hardif_put(primary_if); return NETDEV_TX_OK; @@ -591,6 +594,7 @@ int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid) } spin_lock_bh(&bat_priv->softif_vlan_list_lock); + kref_get(&vlan->refcount); hlist_add_head_rcu(&vlan->list, &bat_priv->softif_vlan_list); spin_unlock_bh(&bat_priv->softif_vlan_list_lock); @@ -601,6 +605,9 @@ int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid) bat_priv->soft_iface->dev_addr, vid, BATADV_NULL_IFINDEX, BATADV_NO_MARK); + /* don't return reference to new softif_vlan */ + batadv_softif_vlan_put(vlan); + return 0; } @@ -747,34 +754,6 @@ static void batadv_set_lockdep_class(struct net_device *dev) } /** - * batadv_softif_destroy_finish - cleans up the remains of a softif - * @work: work queue item - * - * Free the parts of the 
soft interface which can not be removed under - * rtnl lock (to prevent deadlock situations). - */ -static void batadv_softif_destroy_finish(struct work_struct *work) -{ - struct batadv_softif_vlan *vlan; - struct batadv_priv *bat_priv; - struct net_device *soft_iface; - - bat_priv = container_of(work, struct batadv_priv, - cleanup_work); - soft_iface = bat_priv->soft_iface; - - /* destroy the "untagged" VLAN */ - vlan = batadv_softif_vlan_get(bat_priv, BATADV_NO_FLAGS); - if (vlan) { - batadv_softif_destroy_vlan(bat_priv, vlan); - batadv_softif_vlan_put(vlan); - } - - batadv_sysfs_del_meshif(soft_iface); - unregister_netdev(soft_iface); -} - -/** * batadv_softif_init_late - late stage initialization of soft interface * @dev: registered network device to modify * @@ -791,7 +770,6 @@ static int batadv_softif_init_late(struct net_device *dev) bat_priv = netdev_priv(dev); bat_priv->soft_iface = dev; - INIT_WORK(&bat_priv->cleanup_work, batadv_softif_destroy_finish); /* batadv_interface_stats() needs to be available as soon as * register_netdevice() has been called @@ -1028,8 +1006,19 @@ struct net_device *batadv_softif_create(struct net *net, const char *name) void batadv_softif_destroy_sysfs(struct net_device *soft_iface) { struct batadv_priv *bat_priv = netdev_priv(soft_iface); + struct batadv_softif_vlan *vlan; - queue_work(batadv_event_workqueue, &bat_priv->cleanup_work); + ASSERT_RTNL(); + + /* destroy the "untagged" VLAN */ + vlan = batadv_softif_vlan_get(bat_priv, BATADV_NO_FLAGS); + if (vlan) { + batadv_softif_destroy_vlan(bat_priv, vlan); + batadv_softif_vlan_put(vlan); + } + + batadv_sysfs_del_meshif(soft_iface); + unregister_netdevice(soft_iface); } /** diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c index fe9ca94ddee2..02d96f224c60 100644 --- a/net/batman-adv/sysfs.c +++ b/net/batman-adv/sysfs.c @@ -37,6 +37,7 @@ #include <linux/stddef.h> #include <linux/string.h> #include <linux/stringify.h> +#include <linux/workqueue.h> #include "bridge_loop_avoidance.h" #include "distributed-arp-table.h" @@ -428,6 +429,13 @@ static ssize_t batadv_show_gw_mode(struct kobject *kobj, struct attribute *attr, struct batadv_priv *bat_priv = batadv_kobj_to_batpriv(kobj); int bytes_written; + /* GW mode is not available if the routing algorithm in use does not + * implement the GW API + */ + if (!bat_priv->algo_ops->gw.get_best_gw_node || + !bat_priv->algo_ops->gw.is_eligible) + return -ENOENT; + switch (atomic_read(&bat_priv->gw.mode)) { case BATADV_GW_MODE_CLIENT: bytes_written = sprintf(buff, "%s\n", @@ -455,6 +463,13 @@ static ssize_t batadv_store_gw_mode(struct kobject *kobj, char *curr_gw_mode_str; int gw_mode_tmp = -1; + /* toggling GW mode is allowed only if the routing algorithm in use + * provides the GW API + */ + if (!bat_priv->algo_ops->gw.get_best_gw_node || + !bat_priv->algo_ops->gw.is_eligible) + return -EINVAL; + if (buff[count - 1] == '\n') buff[count - 1] = '\0'; @@ -514,6 +529,50 @@ static ssize_t batadv_store_gw_mode(struct kobject *kobj, return count; } +static ssize_t batadv_show_gw_sel_class(struct kobject *kobj, + struct attribute *attr, char *buff) +{ + struct batadv_priv *bat_priv = batadv_kobj_to_batpriv(kobj); + + /* GW selection class is not available if the routing algorithm in use + * does not implement the GW API + */ + if (!bat_priv->algo_ops->gw.get_best_gw_node || + !bat_priv->algo_ops->gw.is_eligible) + return -ENOENT; + + if (bat_priv->algo_ops->gw.show_sel_class) + return bat_priv->algo_ops->gw.show_sel_class(bat_priv, buff); + + return sprintf(buff, 
"%i\n", atomic_read(&bat_priv->gw.sel_class)); +} + +static ssize_t batadv_store_gw_sel_class(struct kobject *kobj, + struct attribute *attr, char *buff, + size_t count) +{ + struct batadv_priv *bat_priv = batadv_kobj_to_batpriv(kobj); + + /* setting the GW selection class is allowed only if the routing + * algorithm in use implements the GW API + */ + if (!bat_priv->algo_ops->gw.get_best_gw_node || + !bat_priv->algo_ops->gw.is_eligible) + return -EINVAL; + + if (buff[count - 1] == '\n') + buff[count - 1] = '\0'; + + if (bat_priv->algo_ops->gw.store_sel_class) + return bat_priv->algo_ops->gw.store_sel_class(bat_priv, buff, + count); + + return __batadv_store_uint_attr(buff, count, 1, BATADV_TQ_MAX_VALUE, + batadv_post_gw_reselect, attr, + &bat_priv->gw.sel_class, + bat_priv->soft_iface); +} + static ssize_t batadv_show_gw_bwidth(struct kobject *kobj, struct attribute *attr, char *buff) { @@ -625,8 +684,8 @@ BATADV_ATTR_SIF_UINT(orig_interval, orig_interval, S_IRUGO | S_IWUSR, 2 * BATADV_JITTER, INT_MAX, NULL); BATADV_ATTR_SIF_UINT(hop_penalty, hop_penalty, S_IRUGO | S_IWUSR, 0, BATADV_TQ_MAX_VALUE, NULL); -BATADV_ATTR_SIF_UINT(gw_sel_class, gw.sel_class, S_IRUGO | S_IWUSR, 1, - BATADV_TQ_MAX_VALUE, batadv_post_gw_reselect); +static BATADV_ATTR(gw_sel_class, S_IRUGO | S_IWUSR, batadv_show_gw_sel_class, + batadv_store_gw_sel_class); static BATADV_ATTR(gw_bandwidth, S_IRUGO | S_IWUSR, batadv_show_gw_bwidth, batadv_store_gw_bwidth); #ifdef CONFIG_BATMAN_ADV_MCAST @@ -712,6 +771,8 @@ rem_attr: for (bat_attr = batadv_mesh_attrs; *bat_attr; ++bat_attr) sysfs_remove_file(bat_priv->mesh_obj, &((*bat_attr)->attr)); + kobject_uevent(bat_priv->mesh_obj, KOBJ_REMOVE); + kobject_del(bat_priv->mesh_obj); kobject_put(bat_priv->mesh_obj); bat_priv->mesh_obj = NULL; out: @@ -726,6 +787,8 @@ void batadv_sysfs_del_meshif(struct net_device *dev) for (bat_attr = batadv_mesh_attrs; *bat_attr; ++bat_attr) sysfs_remove_file(bat_priv->mesh_obj, &((*bat_attr)->attr)); + kobject_uevent(bat_priv->mesh_obj, KOBJ_REMOVE); + kobject_del(bat_priv->mesh_obj); kobject_put(bat_priv->mesh_obj); bat_priv->mesh_obj = NULL; } @@ -781,6 +844,10 @@ rem_attr: for (bat_attr = batadv_vlan_attrs; *bat_attr; ++bat_attr) sysfs_remove_file(vlan->kobj, &((*bat_attr)->attr)); + if (vlan->kobj != bat_priv->mesh_obj) { + kobject_uevent(vlan->kobj, KOBJ_REMOVE); + kobject_del(vlan->kobj); + } kobject_put(vlan->kobj); vlan->kobj = NULL; out: @@ -800,6 +867,10 @@ void batadv_sysfs_del_vlan(struct batadv_priv *bat_priv, for (bat_attr = batadv_vlan_attrs; *bat_attr; ++bat_attr) sysfs_remove_file(vlan->kobj, &((*bat_attr)->attr)); + if (vlan->kobj != bat_priv->mesh_obj) { + kobject_uevent(vlan->kobj, KOBJ_REMOVE); + kobject_del(vlan->kobj); + } kobject_put(vlan->kobj); vlan->kobj = NULL; } @@ -828,31 +899,31 @@ static ssize_t batadv_show_mesh_iface(struct kobject *kobj, return length; } -static ssize_t batadv_store_mesh_iface(struct kobject *kobj, - struct attribute *attr, char *buff, - size_t count) +/** + * batadv_store_mesh_iface_finish - store new hardif mesh_iface state + * @net_dev: netdevice to add/remove to/from batman-adv soft-interface + * @ifname: name of soft-interface to modify + * + * Changes the parts of the hard+soft interface which can not be modified under + * sysfs lock (to prevent deadlock situations). 
+ *
+ * Return: 0 on success, < 0 on failure
+ */
+static int batadv_store_mesh_iface_finish(struct net_device *net_dev,
+					  char ifname[IFNAMSIZ])
 {
-	struct net_device *net_dev = batadv_kobj_to_netdev(kobj);
 	struct net *net = dev_net(net_dev);
 	struct batadv_hard_iface *hard_iface;
-	int status_tmp = -1;
-	int ret = count;
+	int status_tmp;
+	int ret = 0;
+
+	ASSERT_RTNL();
 
 	hard_iface = batadv_hardif_get_by_netdev(net_dev);
 	if (!hard_iface)
-		return count;
-
-	if (buff[count - 1] == '\n')
-		buff[count - 1] = '\0';
-
-	if (strlen(buff) >= IFNAMSIZ) {
-		pr_err("Invalid parameter for 'mesh_iface' setting received: interface name too long '%s'\n",
-		       buff);
-		batadv_hardif_put(hard_iface);
-		return -EINVAL;
-	}
+		return 0;
 
-	if (strncmp(buff, "none", 4) == 0)
+	if (strncmp(ifname, "none", 4) == 0)
 		status_tmp = BATADV_IF_NOT_IN_USE;
 	else
 		status_tmp = BATADV_IF_I_WANT_YOU;
@@ -861,15 +932,13 @@ static ssize_t batadv_store_mesh_iface(struct kobject *kobj,
 		goto out;
 
 	if ((hard_iface->soft_iface) &&
-	    (strncmp(hard_iface->soft_iface->name, buff, IFNAMSIZ) == 0))
+	    (strncmp(hard_iface->soft_iface->name, ifname, IFNAMSIZ) == 0))
 		goto out;
 
-	rtnl_lock();
-
 	if (status_tmp == BATADV_IF_NOT_IN_USE) {
 		batadv_hardif_disable_interface(hard_iface,
 						BATADV_IF_CLEANUP_AUTO);
-		goto unlock;
+		goto out;
 	}
 
 	/* if the interface already is in use */
@@ -877,15 +946,71 @@ static ssize_t batadv_store_mesh_iface(struct kobject *kobj,
 		batadv_hardif_disable_interface(hard_iface,
 						BATADV_IF_CLEANUP_AUTO);
 
-	ret = batadv_hardif_enable_interface(hard_iface, net, buff);
-
-unlock:
-	rtnl_unlock();
+	ret = batadv_hardif_enable_interface(hard_iface, net, ifname);
 
 out:
 	batadv_hardif_put(hard_iface);
 	return ret;
 }
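batadv_store_mesh_iface_finish() above now runs with rtnl_lock already held (hence the ASSERT_RTNL() and the dropped rtnl_lock()/rtnl_unlock() pair), while the sysfs store handler further below only validates and copies its input and defers the rest to a work item, so the write never blocks on rtnl_lock from sysfs context. The general shape of such a sysfs-to-workqueue deferral, with hypothetical my_* names, is roughly:

/* Sketch of a sysfs-store-to-workqueue handoff: the store callback only
 * copies the parsed input and queues this item; the rtnl-locked part
 * runs later in process context. All my_* names are illustrative.
 */
struct my_store_work {
	struct work_struct work;
	struct net_device *net_dev;
	char ifname[IFNAMSIZ];
};

static void my_store_work_fn(struct work_struct *work)
{
	struct my_store_work *w = container_of(work, struct my_store_work,
					       work);

	rtnl_lock();
	my_apply_ifname(w->net_dev, w->ifname);	/* the rtnl-locked step */
	rtnl_unlock();

	dev_put(w->net_dev);	/* reference taken when the work was queued */
	kfree(w);
}

The store side, visible right after this, fills such an item, takes dev_hold() on the netdev, queues the work and returns count immediately.

+/**
+ * batadv_store_mesh_iface_work - store new hardif mesh_iface state
+ * @work: work queue item
+ *
+ * Changes the parts of the hard+soft interface which can not be modified under
+ * sysfs lock (to prevent deadlock situations).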
+ */ +static void batadv_store_mesh_iface_work(struct work_struct *work) +{ + struct batadv_store_mesh_work *store_work; + int ret; + + store_work = container_of(work, struct batadv_store_mesh_work, work); + + rtnl_lock(); + ret = batadv_store_mesh_iface_finish(store_work->net_dev, + store_work->soft_iface_name); + rtnl_unlock(); + + if (ret < 0) + pr_err("Failed to store new mesh_iface state %s for %s: %d\n", + store_work->soft_iface_name, store_work->net_dev->name, + ret); + + dev_put(store_work->net_dev); + kfree(store_work); +} + +static ssize_t batadv_store_mesh_iface(struct kobject *kobj, + struct attribute *attr, char *buff, + size_t count) +{ + struct net_device *net_dev = batadv_kobj_to_netdev(kobj); + struct batadv_store_mesh_work *store_work; + + if (buff[count - 1] == '\n') + buff[count - 1] = '\0'; + + if (strlen(buff) >= IFNAMSIZ) { + pr_err("Invalid parameter for 'mesh_iface' setting received: interface name too long '%s'\n", + buff); + return -EINVAL; + } + + store_work = kmalloc(sizeof(*store_work), GFP_KERNEL); + if (!store_work) + return -ENOMEM; + + dev_hold(net_dev); + INIT_WORK(&store_work->work, batadv_store_mesh_iface_work); + store_work->net_dev = net_dev; + strlcpy(store_work->soft_iface_name, buff, + sizeof(store_work->soft_iface_name)); + + queue_work(batadv_event_workqueue, &store_work->work); + + return count; +} + static ssize_t batadv_show_iface_status(struct kobject *kobj, struct attribute *attr, char *buff) { @@ -1048,6 +1173,8 @@ out: void batadv_sysfs_del_hardif(struct kobject **hardif_obj) { + kobject_uevent(*hardif_obj, KOBJ_REMOVE); + kobject_del(*hardif_obj); kobject_put(*hardif_obj); *hardif_obj = NULL; } diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index 7e6df7a4964a..7f663092f6de 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -22,12 +22,14 @@ #include <linux/bitops.h> #include <linux/bug.h> #include <linux/byteorder/generic.h> +#include <linux/cache.h> #include <linux/compiler.h> #include <linux/crc32c.h> #include <linux/errno.h> #include <linux/etherdevice.h> #include <linux/fs.h> #include <linux/if_ether.h> +#include <linux/init.h> #include <linux/jhash.h> #include <linux/jiffies.h> #include <linux/kernel.h> @@ -35,25 +37,39 @@ #include <linux/list.h> #include <linux/lockdep.h> #include <linux/netdevice.h> +#include <linux/netlink.h> #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/seq_file.h> +#include <linux/skbuff.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/stddef.h> #include <linux/string.h> #include <linux/workqueue.h> +#include <net/genetlink.h> +#include <net/netlink.h> +#include <net/sock.h> +#include <uapi/linux/batman_adv.h> #include "bridge_loop_avoidance.h" #include "hard-interface.h" #include "hash.h" #include "log.h" #include "multicast.h" +#include "netlink.h" #include "originator.h" #include "packet.h" #include "soft-interface.h" #include "tvlv.h" +static struct kmem_cache *batadv_tl_cache __read_mostly; +static struct kmem_cache *batadv_tg_cache __read_mostly; +static struct kmem_cache *batadv_tt_orig_cache __read_mostly; +static struct kmem_cache *batadv_tt_change_cache __read_mostly; +static struct kmem_cache *batadv_tt_req_cache __read_mostly; +static struct kmem_cache *batadv_tt_roam_cache __read_mostly; + /* hash class keys */ static struct lock_class_key batadv_tt_local_hash_lock_class_key; static struct lock_class_key batadv_tt_global_hash_lock_class_key; @@ -205,6 +221,20 @@ 
batadv_tt_global_hash_find(struct batadv_priv *bat_priv, const u8 *addr, } /** + * batadv_tt_local_entry_free_rcu - free the tt_local_entry + * @rcu: rcu pointer of the tt_local_entry + */ +static void batadv_tt_local_entry_free_rcu(struct rcu_head *rcu) +{ + struct batadv_tt_local_entry *tt_local_entry; + + tt_local_entry = container_of(rcu, struct batadv_tt_local_entry, + common.rcu); + + kmem_cache_free(batadv_tl_cache, tt_local_entry); +} + +/** * batadv_tt_local_entry_release - release tt_local_entry from lists and queue * for free after rcu grace period * @ref: kref pointer of the nc_node @@ -218,7 +248,7 @@ static void batadv_tt_local_entry_release(struct kref *ref) batadv_softif_vlan_put(tt_local_entry->vlan); - kfree_rcu(tt_local_entry, common.rcu); + call_rcu(&tt_local_entry->common.rcu, batadv_tt_local_entry_free_rcu); } /** @@ -234,6 +264,20 @@ batadv_tt_local_entry_put(struct batadv_tt_local_entry *tt_local_entry) } /** + * batadv_tt_global_entry_free_rcu - free the tt_global_entry + * @rcu: rcu pointer of the tt_global_entry + */ +static void batadv_tt_global_entry_free_rcu(struct rcu_head *rcu) +{ + struct batadv_tt_global_entry *tt_global_entry; + + tt_global_entry = container_of(rcu, struct batadv_tt_global_entry, + common.rcu); + + kmem_cache_free(batadv_tg_cache, tt_global_entry); +} + +/** * batadv_tt_global_entry_release - release tt_global_entry from lists and queue * for free after rcu grace period * @ref: kref pointer of the nc_node @@ -246,7 +290,8 @@ static void batadv_tt_global_entry_release(struct kref *ref) common.refcount); batadv_tt_global_del_orig_list(tt_global_entry); - kfree_rcu(tt_global_entry, common.rcu); + + call_rcu(&tt_global_entry->common.rcu, batadv_tt_global_entry_free_rcu); } /** @@ -384,6 +429,19 @@ static void batadv_tt_global_size_dec(struct batadv_orig_node *orig_node, } /** + * batadv_tt_orig_list_entry_free_rcu - free the orig_entry + * @rcu: rcu pointer of the orig_entry + */ +static void batadv_tt_orig_list_entry_free_rcu(struct rcu_head *rcu) +{ + struct batadv_tt_orig_list_entry *orig_entry; + + orig_entry = container_of(rcu, struct batadv_tt_orig_list_entry, rcu); + + kmem_cache_free(batadv_tt_orig_cache, orig_entry); +} + +/** * batadv_tt_orig_list_entry_release - release tt orig entry from lists and * queue for free after rcu grace period * @ref: kref pointer of the tt orig entry @@ -396,7 +454,7 @@ static void batadv_tt_orig_list_entry_release(struct kref *ref) refcount); batadv_orig_node_put(orig_entry->orig_node); - kfree_rcu(orig_entry, rcu); + call_rcu(&orig_entry->rcu, batadv_tt_orig_list_entry_free_rcu); } /** @@ -426,7 +484,7 @@ static void batadv_tt_local_event(struct batadv_priv *bat_priv, bool event_removed = false; bool del_op_requested, del_op_entry; - tt_change_node = kmalloc(sizeof(*tt_change_node), GFP_ATOMIC); + tt_change_node = kmem_cache_alloc(batadv_tt_change_cache, GFP_ATOMIC); if (!tt_change_node) return; @@ -467,8 +525,8 @@ static void batadv_tt_local_event(struct batadv_priv *bat_priv, continue; del: list_del(&entry->list); - kfree(entry); - kfree(tt_change_node); + kmem_cache_free(batadv_tt_change_cache, entry); + kmem_cache_free(batadv_tt_change_cache, tt_change_node); event_removed = true; goto unlock; } @@ -646,7 +704,7 @@ bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr, goto out; } - tt_local = kmalloc(sizeof(*tt_local), GFP_ATOMIC); + tt_local = kmem_cache_alloc(batadv_tl_cache, GFP_ATOMIC); if (!tt_local) goto out; @@ -656,7 +714,7 @@ bool batadv_tt_local_add(struct net_device 
*soft_iface, const u8 *addr, net_ratelimited_function(batadv_info, soft_iface, "adding TT local entry %pM to non-existent VLAN %d\n", addr, BATADV_PRINT_VID(vid)); - kfree(tt_local); + kmem_cache_free(batadv_tl_cache, tt_local); tt_local = NULL; goto out; } @@ -676,7 +734,6 @@ bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr, if (batadv_is_wifi_netdev(in_dev)) tt_local->common.flags |= BATADV_TT_CLIENT_WIFI; kref_init(&tt_local->common.refcount); - kref_get(&tt_local->common.refcount); tt_local->last_seen = jiffies; tt_local->common.added_at = tt_local->last_seen; tt_local->vlan = vlan; @@ -688,6 +745,7 @@ bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr, is_multicast_ether_addr(addr)) tt_local->common.flags |= BATADV_TT_CLIENT_NOPURGE; + kref_get(&tt_local->common.refcount); hash_added = batadv_hash_add(bat_priv->tt.local_hash, batadv_compare_tt, batadv_choose_tt, &tt_local->common, &tt_local->common.hash_entry); @@ -959,7 +1017,7 @@ static void batadv_tt_tvlv_container_update(struct batadv_priv *bat_priv) tt_diff_entries_count++; } list_del(&entry->list); - kfree(entry); + kmem_cache_free(batadv_tt_change_cache, entry); } spin_unlock_bh(&bat_priv->tt.changes_list_lock); @@ -989,6 +1047,7 @@ container_register: kfree(tt_data); } +#ifdef CONFIG_BATMAN_ADV_DEBUGFS int batadv_tt_local_seq_print_text(struct seq_file *seq, void *offset) { struct net_device *net_dev = (struct net_device *)seq->private; @@ -1056,6 +1115,165 @@ out: batadv_hardif_put(primary_if); return 0; } +#endif + +/** + * batadv_tt_local_dump_entry - Dump one TT local entry into a message + * @msg :Netlink message to dump into + * @portid: Port making netlink request + * @seq: Sequence number of netlink message + * @bat_priv: The bat priv with all the soft interface information + * @common: tt local & tt global common data + * + * Return: Error code, or 0 on success + */ +static int +batadv_tt_local_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, + struct batadv_priv *bat_priv, + struct batadv_tt_common_entry *common) +{ + void *hdr; + struct batadv_softif_vlan *vlan; + struct batadv_tt_local_entry *local; + unsigned int last_seen_msecs; + u32 crc; + + local = container_of(common, struct batadv_tt_local_entry, common); + last_seen_msecs = jiffies_to_msecs(jiffies - local->last_seen); + + vlan = batadv_softif_vlan_get(bat_priv, common->vid); + if (!vlan) + return 0; + + crc = vlan->tt.crc; + + batadv_softif_vlan_put(vlan); + + hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, + NLM_F_MULTI, + BATADV_CMD_GET_TRANSTABLE_LOCAL); + if (!hdr) + return -ENOBUFS; + + if (nla_put(msg, BATADV_ATTR_TT_ADDRESS, ETH_ALEN, common->addr) || + nla_put_u32(msg, BATADV_ATTR_TT_CRC32, crc) || + nla_put_u16(msg, BATADV_ATTR_TT_VID, common->vid) || + nla_put_u32(msg, BATADV_ATTR_TT_FLAGS, common->flags)) + goto nla_put_failure; + + if (!(common->flags & BATADV_TT_CLIENT_NOPURGE) && + nla_put_u32(msg, BATADV_ATTR_LAST_SEEN_MSECS, last_seen_msecs)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + return 0; + + nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +/** + * batadv_tt_local_dump_bucket - Dump one TT local bucket into a message + * @msg: Netlink message to dump into + * @portid: Port making netlink request + * @seq: Sequence number of netlink message + * @bat_priv: The bat priv with all the soft interface information + * @head: Pointer to the list containing the local tt entries + * @idx_s: Number of entries to skip + * + * Return: Error code, or 0 on 
success + */ +static int +batadv_tt_local_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq, + struct batadv_priv *bat_priv, + struct hlist_head *head, int *idx_s) +{ + struct batadv_tt_common_entry *common; + int idx = 0; + + rcu_read_lock(); + hlist_for_each_entry_rcu(common, head, hash_entry) { + if (idx++ < *idx_s) + continue; + + if (batadv_tt_local_dump_entry(msg, portid, seq, bat_priv, + common)) { + rcu_read_unlock(); + *idx_s = idx - 1; + return -EMSGSIZE; + } + } + rcu_read_unlock(); + + *idx_s = 0; + return 0; +} + +/** + * batadv_tt_local_dump - Dump TT local entries into a message + * @msg: Netlink message to dump into + * @cb: Parameters from query + * + * Return: Error code, or 0 on success + */ +int batadv_tt_local_dump(struct sk_buff *msg, struct netlink_callback *cb) +{ + struct net *net = sock_net(cb->skb->sk); + struct net_device *soft_iface; + struct batadv_priv *bat_priv; + struct batadv_hard_iface *primary_if = NULL; + struct batadv_hashtable *hash; + struct hlist_head *head; + int ret; + int ifindex; + int bucket = cb->args[0]; + int idx = cb->args[1]; + int portid = NETLINK_CB(cb->skb).portid; + + ifindex = batadv_netlink_get_ifindex(cb->nlh, BATADV_ATTR_MESH_IFINDEX); + if (!ifindex) + return -EINVAL; + + soft_iface = dev_get_by_index(net, ifindex); + if (!soft_iface || !batadv_softif_is_valid(soft_iface)) { + ret = -ENODEV; + goto out; + } + + bat_priv = netdev_priv(soft_iface); + + primary_if = batadv_primary_if_get_selected(bat_priv); + if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) { + ret = -ENOENT; + goto out; + } + + hash = bat_priv->tt.local_hash; + + while (bucket < hash->size) { + head = &hash->table[bucket]; + + if (batadv_tt_local_dump_bucket(msg, portid, cb->nlh->nlmsg_seq, + bat_priv, head, &idx)) + break; + + bucket++; + } + + ret = msg->len; + + out: + if (primary_if) + batadv_hardif_put(primary_if); + if (soft_iface) + dev_put(soft_iface); + + cb->args[0] = bucket; + cb->args[1] = idx; + + return ret; +} static void batadv_tt_local_set_pending(struct batadv_priv *bat_priv, @@ -1259,7 +1477,7 @@ static void batadv_tt_changes_list_free(struct batadv_priv *bat_priv) list_for_each_entry_safe(entry, safe, &bat_priv->tt.changes_list, list) { list_del(&entry->list); - kfree(entry); + kmem_cache_free(batadv_tt_change_cache, entry); } atomic_set(&bat_priv->tt.local_changes, 0); @@ -1341,7 +1559,7 @@ batadv_tt_global_orig_entry_add(struct batadv_tt_global_entry *tt_global, goto out; } - orig_entry = kzalloc(sizeof(*orig_entry), GFP_ATOMIC); + orig_entry = kmem_cache_zalloc(batadv_tt_orig_cache, GFP_ATOMIC); if (!orig_entry) goto out; @@ -1351,9 +1569,9 @@ batadv_tt_global_orig_entry_add(struct batadv_tt_global_entry *tt_global, orig_entry->orig_node = orig_node; orig_entry->ttvn = ttvn; kref_init(&orig_entry->refcount); - kref_get(&orig_entry->refcount); spin_lock_bh(&tt_global->list_lock); + kref_get(&orig_entry->refcount); hlist_add_head_rcu(&orig_entry->list, &tt_global->orig_list); spin_unlock_bh(&tt_global->list_lock); @@ -1411,7 +1629,8 @@ static bool batadv_tt_global_add(struct batadv_priv *bat_priv, goto out; if (!tt_global_entry) { - tt_global_entry = kzalloc(sizeof(*tt_global_entry), GFP_ATOMIC); + tt_global_entry = kmem_cache_zalloc(batadv_tg_cache, + GFP_ATOMIC); if (!tt_global_entry) goto out; @@ -1428,13 +1647,13 @@ static bool batadv_tt_global_add(struct batadv_priv *bat_priv, if (flags & BATADV_TT_CLIENT_ROAM) tt_global_entry->roam_at = jiffies; kref_init(&common->refcount); - kref_get(&common->refcount); common->added_at = 
jiffies; INIT_HLIST_HEAD(&tt_global_entry->orig_list); atomic_set(&tt_global_entry->orig_list_count, 0); spin_lock_init(&tt_global_entry->list_lock); + kref_get(&common->refcount); hash_added = batadv_hash_add(bat_priv->tt.global_hash, batadv_compare_tt, batadv_choose_tt, common, @@ -1579,6 +1798,7 @@ batadv_transtable_best_orig(struct batadv_priv *bat_priv, return best_entry; } +#ifdef CONFIG_BATMAN_ADV_DEBUGFS /** * batadv_tt_global_print_entry - print all orig nodes who announce the address * for this global entry @@ -1702,6 +1922,219 @@ out: batadv_hardif_put(primary_if); return 0; } +#endif + +/** + * batadv_tt_global_dump_subentry - Dump all TT local entries into a message + * @msg: Netlink message to dump into + * @portid: Port making netlink request + * @seq: Sequence number of netlink message + * @common: tt local & tt global common data + * @orig: Originator node announcing a non-mesh client + * @best: Is the best originator for the TT entry + * + * Return: Error code, or 0 on success + */ +static int +batadv_tt_global_dump_subentry(struct sk_buff *msg, u32 portid, u32 seq, + struct batadv_tt_common_entry *common, + struct batadv_tt_orig_list_entry *orig, + bool best) +{ + void *hdr; + struct batadv_orig_node_vlan *vlan; + u8 last_ttvn; + u32 crc; + + vlan = batadv_orig_node_vlan_get(orig->orig_node, + common->vid); + if (!vlan) + return 0; + + crc = vlan->tt.crc; + + batadv_orig_node_vlan_put(vlan); + + hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, + NLM_F_MULTI, + BATADV_CMD_GET_TRANSTABLE_GLOBAL); + if (!hdr) + return -ENOBUFS; + + last_ttvn = atomic_read(&orig->orig_node->last_ttvn); + + if (nla_put(msg, BATADV_ATTR_TT_ADDRESS, ETH_ALEN, common->addr) || + nla_put(msg, BATADV_ATTR_ORIG_ADDRESS, ETH_ALEN, + orig->orig_node->orig) || + nla_put_u8(msg, BATADV_ATTR_TT_TTVN, orig->ttvn) || + nla_put_u8(msg, BATADV_ATTR_TT_LAST_TTVN, last_ttvn) || + nla_put_u32(msg, BATADV_ATTR_TT_CRC32, crc) || + nla_put_u16(msg, BATADV_ATTR_TT_VID, common->vid) || + nla_put_u32(msg, BATADV_ATTR_TT_FLAGS, common->flags)) + goto nla_put_failure; + + if (best && nla_put_flag(msg, BATADV_ATTR_FLAG_BEST)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + return 0; + + nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +/** + * batadv_tt_global_dump_entry - Dump one TT global entry into a message + * @msg: Netlink message to dump into + * @portid: Port making netlink request + * @seq: Sequence number of netlink message + * @bat_priv: The bat priv with all the soft interface information + * @common: tt local & tt global common data + * @sub_s: Number of entries to skip + * + * This function assumes the caller holds rcu_read_lock(). 
+ *
+ * Return: Error code, or 0 on success
+ */
+static int
+batadv_tt_global_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
+			    struct batadv_priv *bat_priv,
+			    struct batadv_tt_common_entry *common, int *sub_s)
+{
+	struct batadv_tt_orig_list_entry *orig_entry, *best_entry;
+	struct batadv_tt_global_entry *global;
+	struct hlist_head *head;
+	int sub = 0;
+	bool best;
+
+	global = container_of(common, struct batadv_tt_global_entry, common);
+	best_entry = batadv_transtable_best_orig(bat_priv, global);
+	head = &global->orig_list;
+
+	hlist_for_each_entry_rcu(orig_entry, head, list) {
+		if (sub++ < *sub_s)
+			continue;
+
+		best = (orig_entry == best_entry);
+
+		if (batadv_tt_global_dump_subentry(msg, portid, seq, common,
+						   orig_entry, best)) {
+			*sub_s = sub - 1;
+			return -EMSGSIZE;
+		}
+	}
+
+	*sub_s = 0;
+	return 0;
+}
+
+/**
+ * batadv_tt_global_dump_bucket - Dump one TT global bucket into a message
+ * @msg: Netlink message to dump into
+ * @portid: Port making netlink request
+ * @seq: Sequence number of netlink message
+ * @bat_priv: The bat priv with all the soft interface information
+ * @head: Pointer to the list containing the global tt entries
+ * @idx_s: Number of entries to skip
+ * @sub: Number of sub-entries to skip
+ *
+ * Return: Error code, or 0 on success
+ */
+static int
+batadv_tt_global_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq,
+			     struct batadv_priv *bat_priv,
+			     struct hlist_head *head, int *idx_s, int *sub)
+{
+	struct batadv_tt_common_entry *common;
+	int idx = 0;
+
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(common, head, hash_entry) {
+		if (idx++ < *idx_s)
+			continue;
+
+		if (batadv_tt_global_dump_entry(msg, portid, seq, bat_priv,
+						common, sub)) {
+			rcu_read_unlock();
+			*idx_s = idx - 1;
+			return -EMSGSIZE;
+		}
+	}
+	rcu_read_unlock();
+
+	*idx_s = 0;
+	*sub = 0;
+	return 0;
+}
+
+/**
+ * batadv_tt_global_dump - Dump TT global entries into a message
+ * @msg: Netlink message to dump into
+ * @cb: Parameters from query
+ *
+ * Return: Error code, or length of message on success
+ */
+int batadv_tt_global_dump(struct sk_buff *msg, struct netlink_callback *cb)
+{
+	struct net *net = sock_net(cb->skb->sk);
+	struct net_device *soft_iface;
+	struct batadv_priv *bat_priv;
+	struct batadv_hard_iface *primary_if = NULL;
+	struct batadv_hashtable *hash;
+	struct hlist_head *head;
+	int ret;
+	int ifindex;
+	int bucket = cb->args[0];
+	int idx = cb->args[1];
+	int sub = cb->args[2];
+	int portid = NETLINK_CB(cb->skb).portid;
+
+	ifindex = batadv_netlink_get_ifindex(cb->nlh, BATADV_ATTR_MESH_IFINDEX);
+	if (!ifindex)
+		return -EINVAL;
+
+	soft_iface = dev_get_by_index(net, ifindex);
+	if (!soft_iface || !batadv_softif_is_valid(soft_iface)) {
+		ret = -ENODEV;
+		goto out;
+	}
+
+	bat_priv = netdev_priv(soft_iface);
+
+	primary_if = batadv_primary_if_get_selected(bat_priv);
+	if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	hash = bat_priv->tt.global_hash;
+
+	while (bucket < hash->size) {
+		head = &hash->table[bucket];
+
+		if (batadv_tt_global_dump_bucket(msg, portid,
+						 cb->nlh->nlmsg_seq, bat_priv,
+						 head, &idx, &sub))
+			break;
+
+		bucket++;
+	}
+
+	ret = msg->len;
+
+ out:
+	if (primary_if)
+		batadv_hardif_put(primary_if);
+	if (soft_iface)
+		dev_put(soft_iface);
+
+	cb->args[0] = bucket;
+	cb->args[1] = idx;
+	cb->args[2] = sub;
+
+	return ret;
+}
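batadv_tt_global_dump() above uses the standard netlink dump continuation scheme: the current hash bucket, the index inside the bucket and the per-entry sub-entry index are parked in cb->args[] whenever the message buffer fills up, so the next dump pass resumes exactly where -EMSGSIZE stopped it. Stripped to its skeleton (my_* names are illustrative, not batman-adv symbols):

/* Skeleton of a resumable netlink dump: cb->args[] survives between the
 * successive passes of one dump request, so a partially emitted table
 * walk can continue at the saved bucket/index.
 */
static int my_table_dump(struct sk_buff *msg, struct netlink_callback *cb)
{
	int bucket = cb->args[0];
	int idx = cb->args[1];

	while (bucket < my_hash->size) {
		/* stops early when msg is full, leaving idx pointing at
		 * the first entry that did not fit
		 */
		if (my_dump_bucket(msg, &my_hash->table[bucket], &idx))
			break;

		bucket++;
	}

	cb->args[0] = bucket;
	cb->args[1] = idx;

	return msg->len;	/* > 0: dump again; 0 ends the dump */
}

batadv_tt_local_dump() earlier follows the same two-level scheme, just without the sub-entry index.

 /**
  * _batadv_tt_global_del_orig_entry - remove and free an orig_entry
@@ -2280,7 +2713,7 @@ static void batadv_tt_req_node_release(struct kref *ref)
 	tt_req_node =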
container_of(ref, struct batadv_tt_req_node, refcount); - kfree(tt_req_node); + kmem_cache_free(batadv_tt_req_cache, tt_req_node); } /** @@ -2367,7 +2800,7 @@ batadv_tt_req_node_new(struct batadv_priv *bat_priv, goto unlock; } - tt_req_node = kmalloc(sizeof(*tt_req_node), GFP_ATOMIC); + tt_req_node = kmem_cache_alloc(batadv_tt_req_cache, GFP_ATOMIC); if (!tt_req_node) goto unlock; @@ -3104,7 +3537,7 @@ static void batadv_tt_roam_list_free(struct batadv_priv *bat_priv) list_for_each_entry_safe(node, safe, &bat_priv->tt.roam_list, list) { list_del(&node->list); - kfree(node); + kmem_cache_free(batadv_tt_roam_cache, node); } spin_unlock_bh(&bat_priv->tt.roam_list_lock); @@ -3121,7 +3554,7 @@ static void batadv_tt_roam_purge(struct batadv_priv *bat_priv) continue; list_del(&node->list); - kfree(node); + kmem_cache_free(batadv_tt_roam_cache, node); } spin_unlock_bh(&bat_priv->tt.roam_list_lock); } @@ -3162,7 +3595,8 @@ static bool batadv_tt_check_roam_count(struct batadv_priv *bat_priv, u8 *client) } if (!ret) { - tt_roam_node = kmalloc(sizeof(*tt_roam_node), GFP_ATOMIC); + tt_roam_node = kmem_cache_alloc(batadv_tt_roam_cache, + GFP_ATOMIC); if (!tt_roam_node) goto unlock; @@ -3865,3 +4299,85 @@ bool batadv_tt_global_is_isolated(struct batadv_priv *bat_priv, return ret; } + +/** + * batadv_tt_cache_init - Initialize tt memory object cache + * + * Return: 0 on success or negative error number in case of failure. + */ +int __init batadv_tt_cache_init(void) +{ + size_t tl_size = sizeof(struct batadv_tt_local_entry); + size_t tg_size = sizeof(struct batadv_tt_global_entry); + size_t tt_orig_size = sizeof(struct batadv_tt_orig_list_entry); + size_t tt_change_size = sizeof(struct batadv_tt_change_node); + size_t tt_req_size = sizeof(struct batadv_tt_req_node); + size_t tt_roam_size = sizeof(struct batadv_tt_roam_node); + + batadv_tl_cache = kmem_cache_create("batadv_tl_cache", tl_size, 0, + SLAB_HWCACHE_ALIGN, NULL); + if (!batadv_tl_cache) + return -ENOMEM; + + batadv_tg_cache = kmem_cache_create("batadv_tg_cache", tg_size, 0, + SLAB_HWCACHE_ALIGN, NULL); + if (!batadv_tg_cache) + goto err_tt_tl_destroy; + + batadv_tt_orig_cache = kmem_cache_create("batadv_tt_orig_cache", + tt_orig_size, 0, + SLAB_HWCACHE_ALIGN, NULL); + if (!batadv_tt_orig_cache) + goto err_tt_tg_destroy; + + batadv_tt_change_cache = kmem_cache_create("batadv_tt_change_cache", + tt_change_size, 0, + SLAB_HWCACHE_ALIGN, NULL); + if (!batadv_tt_change_cache) + goto err_tt_orig_destroy; + + batadv_tt_req_cache = kmem_cache_create("batadv_tt_req_cache", + tt_req_size, 0, + SLAB_HWCACHE_ALIGN, NULL); + if (!batadv_tt_req_cache) + goto err_tt_change_destroy; + + batadv_tt_roam_cache = kmem_cache_create("batadv_tt_roam_cache", + tt_roam_size, 0, + SLAB_HWCACHE_ALIGN, NULL); + if (!batadv_tt_roam_cache) + goto err_tt_req_destroy; + + return 0; + +err_tt_req_destroy: + kmem_cache_destroy(batadv_tt_req_cache); + batadv_tt_req_cache = NULL; +err_tt_change_destroy: + kmem_cache_destroy(batadv_tt_change_cache); + batadv_tt_change_cache = NULL; +err_tt_orig_destroy: + kmem_cache_destroy(batadv_tt_orig_cache); + batadv_tt_orig_cache = NULL; +err_tt_tg_destroy: + kmem_cache_destroy(batadv_tg_cache); + batadv_tg_cache = NULL; +err_tt_tl_destroy: + kmem_cache_destroy(batadv_tl_cache); + batadv_tl_cache = NULL; + + return -ENOMEM; +} + +/** + * batadv_tt_cache_destroy - Destroy tt memory object cache + */ +void batadv_tt_cache_destroy(void) +{ + kmem_cache_destroy(batadv_tl_cache); + kmem_cache_destroy(batadv_tg_cache); + 
kmem_cache_destroy(batadv_tt_orig_cache); + kmem_cache_destroy(batadv_tt_change_cache); + kmem_cache_destroy(batadv_tt_req_cache); + kmem_cache_destroy(batadv_tt_roam_cache); +} diff --git a/net/batman-adv/translation-table.h b/net/batman-adv/translation-table.h index 7c7e2c006bfe..783fdba84db2 100644 --- a/net/batman-adv/translation-table.h +++ b/net/batman-adv/translation-table.h @@ -22,8 +22,10 @@ #include <linux/types.h> +struct netlink_callback; struct net_device; struct seq_file; +struct sk_buff; int batadv_tt_init(struct batadv_priv *bat_priv); bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr, @@ -33,6 +35,8 @@ u16 batadv_tt_local_remove(struct batadv_priv *bat_priv, const char *message, bool roaming); int batadv_tt_local_seq_print_text(struct seq_file *seq, void *offset); int batadv_tt_global_seq_print_text(struct seq_file *seq, void *offset); +int batadv_tt_local_dump(struct sk_buff *msg, struct netlink_callback *cb); +int batadv_tt_global_dump(struct sk_buff *msg, struct netlink_callback *cb); void batadv_tt_global_del_orig(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, s32 match_vid, const char *message); @@ -59,4 +63,7 @@ bool batadv_tt_add_temporary_global_entry(struct batadv_priv *bat_priv, bool batadv_tt_global_is_isolated(struct batadv_priv *bat_priv, const u8 *addr, unsigned short vid); +int batadv_tt_cache_init(void); +void batadv_tt_cache_destroy(void); + #endif /* _NET_BATMAN_ADV_TRANSLATION_TABLE_H_ */ diff --git a/net/batman-adv/tvlv.c b/net/batman-adv/tvlv.c index 3d1cf0fb112d..77654f055f24 100644 --- a/net/batman-adv/tvlv.c +++ b/net/batman-adv/tvlv.c @@ -257,8 +257,13 @@ void batadv_tvlv_container_register(struct batadv_priv *bat_priv, spin_lock_bh(&bat_priv->tvlv.container_list_lock); tvlv_old = batadv_tvlv_container_get(bat_priv, type, version); batadv_tvlv_container_remove(bat_priv, tvlv_old); + + kref_get(&tvlv_new->refcount); hlist_add_head(&tvlv_new->list, &bat_priv->tvlv.container_list); spin_unlock_bh(&bat_priv->tvlv.container_list_lock); + + /* don't return reference to new tvlv_container */ + batadv_tvlv_container_put(tvlv_new); } /** @@ -542,8 +547,12 @@ void batadv_tvlv_handler_register(struct batadv_priv *bat_priv, INIT_HLIST_NODE(&tvlv_handler->list); spin_lock_bh(&bat_priv->tvlv.handler_list_lock); + kref_get(&tvlv_handler->refcount); hlist_add_head_rcu(&tvlv_handler->list, &bat_priv->tvlv.handler_list); spin_unlock_bh(&bat_priv->tvlv.handler_list_lock); + + /* don't return reference to new tvlv_handler */ + batadv_tvlv_handler_put(tvlv_handler); } /** diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index a64522c3b45d..b3dd1a381aad 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -28,6 +28,7 @@ #include <linux/if_ether.h> #include <linux/kref.h> #include <linux/netdevice.h> +#include <linux/netlink.h> #include <linux/sched.h> /* for linux/wait.h */ #include <linux/spinlock.h> #include <linux/types.h> @@ -132,7 +133,6 @@ struct batadv_hard_iface_bat_v { * @rcu: struct used for freeing in an RCU-safe manner * @bat_iv: per hard-interface B.A.T.M.A.N. IV data * @bat_v: per hard-interface B.A.T.M.A.N. 
V data - * @cleanup_work: work queue callback item for hard-interface deinit * @debug_dir: dentry for nc subdir in batman-adv directory in debugfs * @neigh_list: list of unique single hop neighbors via this interface * @neigh_list_lock: lock protecting neigh_list @@ -152,7 +152,6 @@ struct batadv_hard_iface { #ifdef CONFIG_BATMAN_ADV_BATMAN_V struct batadv_hard_iface_bat_v bat_v; #endif - struct work_struct cleanup_work; struct dentry *debug_dir; struct hlist_head neigh_list; /* neigh_list_lock protects: neigh_list */ @@ -1015,7 +1014,6 @@ struct batadv_priv_bat_v { * @forw_bcast_list_lock: lock protecting forw_bcast_list * @tp_list_lock: spinlock protecting @tp_list * @orig_work: work queue callback item for orig node purging - * @cleanup_work: work queue callback item for soft-interface deinit * @primary_if: one of the hard-interfaces assigned to this mesh interface * becomes the primary interface * @algo_ops: routing algorithm used by this mesh interface @@ -1074,7 +1072,6 @@ struct batadv_priv { spinlock_t tp_list_lock; /* protects tp_list */ atomic_t tp_num; struct delayed_work orig_work; - struct work_struct cleanup_work; struct batadv_hard_iface __rcu *primary_if; /* rcu protected pointer */ struct batadv_algo_ops *algo_ops; struct hlist_head softif_vlan_list; @@ -1379,6 +1376,7 @@ struct batadv_skb_cb { * locally generated packet * @if_outgoing: packet where the packet should be sent to, or NULL if * unspecified + * @queue_left: The queue (counter) this packet was applied to */ struct batadv_forw_packet { struct hlist_node list; @@ -1391,11 +1389,13 @@ struct batadv_forw_packet { struct delayed_work delayed_work; struct batadv_hard_iface *if_incoming; struct batadv_hard_iface *if_outgoing; + atomic_t *queue_left; }; /** * struct batadv_algo_iface_ops - mesh algorithm callbacks (interface specific) * @activate: start routing mechanisms when hard-interface is brought up + * (optional) * @enable: init routing info when hard-interface is enabled * @disable: de-init routing info when hard-interface is disabled * @update_mac: (re-)init mac addresses of the protocol information @@ -1413,11 +1413,13 @@ struct batadv_algo_iface_ops { /** * struct batadv_algo_neigh_ops - mesh algorithm callbacks (neighbour specific) * @hardif_init: called on creation of single hop entry + * (optional) * @cmp: compare the metrics of two neighbors for their respective outgoing * interfaces * @is_similar_or_better: check if neigh1 is equally similar or better than * neigh2 for their respective outgoing interface from the metric prospective * @print: print the single hop neighbor list (optional) + * @dump: dump neighbors to a netlink socket (optional) */ struct batadv_algo_neigh_ops { void (*hardif_init)(struct batadv_hardif_neigh_node *neigh); @@ -1429,26 +1431,64 @@ struct batadv_algo_neigh_ops { struct batadv_hard_iface *if_outgoing1, struct batadv_neigh_node *neigh2, struct batadv_hard_iface *if_outgoing2); +#ifdef CONFIG_BATMAN_ADV_DEBUGFS void (*print)(struct batadv_priv *priv, struct seq_file *seq); +#endif + void (*dump)(struct sk_buff *msg, struct netlink_callback *cb, + struct batadv_priv *priv, + struct batadv_hard_iface *hard_iface); }; /** * struct batadv_algo_orig_ops - mesh algorithm callbacks (originator specific) * @free: free the resources allocated by the routing algorithm for an orig_node - * object + * object (optional) * @add_if: ask the routing algorithm to apply the needed changes to the - * orig_node due to a new hard-interface being added into the mesh + * orig_node due to a new 
hard-interface being added into the mesh (optional) * @del_if: ask the routing algorithm to apply the needed changes to the - * orig_node due to an hard-interface being removed from the mesh + * orig_node due to an hard-interface being removed from the mesh (optional) * @print: print the originator table (optional) + * @dump: dump originators to a netlink socket (optional) */ struct batadv_algo_orig_ops { void (*free)(struct batadv_orig_node *orig_node); int (*add_if)(struct batadv_orig_node *orig_node, int max_if_num); int (*del_if)(struct batadv_orig_node *orig_node, int max_if_num, int del_if_num); +#ifdef CONFIG_BATMAN_ADV_DEBUGFS void (*print)(struct batadv_priv *priv, struct seq_file *seq, struct batadv_hard_iface *hard_iface); +#endif + void (*dump)(struct sk_buff *msg, struct netlink_callback *cb, + struct batadv_priv *priv, + struct batadv_hard_iface *hard_iface); +}; + +/** + * struct batadv_algo_gw_ops - mesh algorithm callbacks (GW specific) + * @store_sel_class: parse and stores a new GW selection class (optional) + * @show_sel_class: prints the current GW selection class (optional) + * @get_best_gw_node: select the best GW from the list of available nodes + * (optional) + * @is_eligible: check if a newly discovered GW is a potential candidate for + * the election as best GW (optional) + * @print: print the gateway table (optional) + * @dump: dump gateways to a netlink socket (optional) + */ +struct batadv_algo_gw_ops { + ssize_t (*store_sel_class)(struct batadv_priv *bat_priv, char *buff, + size_t count); + ssize_t (*show_sel_class)(struct batadv_priv *bat_priv, char *buff); + struct batadv_gw_node *(*get_best_gw_node) + (struct batadv_priv *bat_priv); + bool (*is_eligible)(struct batadv_priv *bat_priv, + struct batadv_orig_node *curr_gw_orig, + struct batadv_orig_node *orig_node); +#ifdef CONFIG_BATMAN_ADV_DEBUGFS + void (*print)(struct batadv_priv *bat_priv, struct seq_file *seq); +#endif + void (*dump)(struct sk_buff *msg, struct netlink_callback *cb, + struct batadv_priv *priv); }; /** @@ -1458,6 +1498,7 @@ struct batadv_algo_orig_ops { * @iface: callbacks related to interface handling * @neigh: callbacks related to neighbors handling * @orig: callbacks related to originators handling + * @gw: callbacks related to GW mode */ struct batadv_algo_ops { struct hlist_node list; @@ -1465,6 +1506,7 @@ struct batadv_algo_ops { struct batadv_algo_iface_ops iface; struct batadv_algo_neigh_ops neigh; struct batadv_algo_orig_ops orig; + struct batadv_algo_gw_ops gw; }; /** @@ -1564,4 +1606,17 @@ enum batadv_tvlv_handler_flags { BATADV_TVLV_HANDLER_OGM_CALLED = BIT(2), }; +/** + * struct batadv_store_mesh_work - Work queue item to detach add/del interface + * from sysfs locks + * @net_dev: netdevice to add/remove to/from batman-adv soft-interface + * @soft_iface_name: name of soft-interface to modify + * @work: work queue item + */ +struct batadv_store_mesh_work { + struct net_device *net_dev; + char soft_iface_name[IFNAMSIZ]; + struct work_struct work; +}; + #endif /* _NET_BATMAN_ADV_TYPES_H_ */ diff --git a/net/bridge/Makefile b/net/bridge/Makefile index a1cda5d4718d..0aefc011b668 100644 --- a/net/bridge/Makefile +++ b/net/bridge/Makefile @@ -20,4 +20,6 @@ bridge-$(CONFIG_BRIDGE_IGMP_SNOOPING) += br_multicast.o br_mdb.o bridge-$(CONFIG_BRIDGE_VLAN_FILTERING) += br_vlan.o +bridge-$(CONFIG_NET_SWITCHDEV) += br_switchdev.o + obj-$(CONFIG_NETFILTER) += netfilter/ diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index 63a83d8d7da3..32a02de39cd2 100644 --- 
a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -29,7 +29,8 @@ static inline int should_deliver(const struct net_bridge_port *p, vg = nbp_vlan_group_rcu(p); return ((p->flags & BR_HAIRPIN_MODE) || skb->dev != p->dev) && - br_allowed_egress(vg, skb) && p->state == BR_STATE_FORWARDING; + br_allowed_egress(vg, skb) && p->state == BR_STATE_FORWARDING && + nbp_switchdev_allowed_egress(p, skb); } int br_dev_queue_push_xmit(struct net *net, struct sock *sk, struct sk_buff *skb) diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index f2fede05d32c..1da3221845f1 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -545,6 +545,10 @@ int br_add_if(struct net_bridge *br, struct net_device *dev) if (err) goto err5; + err = nbp_switchdev_mark_set(p); + if (err) + goto err6; + dev_disable_lro(dev); list_add_rcu(&p->list, &br->port_list); @@ -566,7 +570,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev) err = nbp_vlan_init(p); if (err) { netdev_err(dev, "failed to initialize vlan filtering on this port\n"); - goto err6; + goto err7; } spin_lock_bh(&br->lock); @@ -589,12 +593,12 @@ int br_add_if(struct net_bridge *br, struct net_device *dev) return 0; -err6: +err7: list_del_rcu(&p->list); br_fdb_delete_by_port(br, p, 0, 1); nbp_update_port_count(br); +err6: netdev_upper_dev_unlink(dev, br->dev); - err5: dev->priv_flags &= ~IFF_BRIDGE_PORT; netdev_rx_handler_unregister(dev); diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 8e486203d133..3132cfc80e9d 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -145,6 +145,8 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb if (!br_allowed_ingress(p->br, nbp_vlan_group_rcu(p), skb, &vid)) goto out; + nbp_switchdev_frame_mark(p, skb); + /* insert into forwarding database after filtering to avoid spoofing */ br = p->br; if (p->flags & BR_LEARNING) diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index f2a29e467e78..190a5bc00f4a 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -1245,14 +1245,30 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) return 0; } -static size_t bridge_get_linkxstats_size(const struct net_device *dev) +static size_t br_get_linkxstats_size(const struct net_device *dev, int attr) { - struct net_bridge *br = netdev_priv(dev); + struct net_bridge_port *p = NULL; struct net_bridge_vlan_group *vg; struct net_bridge_vlan *v; + struct net_bridge *br; int numvls = 0; - vg = br_vlan_group(br); + switch (attr) { + case IFLA_STATS_LINK_XSTATS: + br = netdev_priv(dev); + vg = br_vlan_group(br); + break; + case IFLA_STATS_LINK_XSTATS_SLAVE: + p = br_port_get_rtnl(dev); + if (!p) + return 0; + br = p->br; + vg = nbp_vlan_group(p); + break; + default: + return 0; + } + if (vg) { /* we need to count all, even placeholder entries */ list_for_each_entry(v, &vg->vlan_list, vlist) @@ -1264,45 +1280,42 @@ static size_t bridge_get_linkxstats_size(const struct net_device *dev) nla_total_size(0); } -static size_t brport_get_linkxstats_size(const struct net_device *dev) -{ - return nla_total_size(sizeof(struct br_mcast_stats)) + - nla_total_size(0); -} - -static size_t br_get_linkxstats_size(const struct net_device *dev, int attr) +static int br_fill_linkxstats(struct sk_buff *skb, + const struct net_device *dev, + int *prividx, int attr) { - size_t retsize = 0; + struct nlattr *nla __maybe_unused; + struct net_bridge_port *p = NULL; + struct net_bridge_vlan_group *vg; + struct net_bridge_vlan *v; + struct 
net_bridge *br; + struct nlattr *nest; + int vl_idx = 0; switch (attr) { case IFLA_STATS_LINK_XSTATS: - retsize = bridge_get_linkxstats_size(dev); + br = netdev_priv(dev); + vg = br_vlan_group(br); break; case IFLA_STATS_LINK_XSTATS_SLAVE: - retsize = brport_get_linkxstats_size(dev); + p = br_port_get_rtnl(dev); + if (!p) + return 0; + br = p->br; + vg = nbp_vlan_group(p); break; + default: + return -EINVAL; } - return retsize; -} - -static int bridge_fill_linkxstats(struct sk_buff *skb, - const struct net_device *dev, - int *prividx) -{ - struct net_bridge *br = netdev_priv(dev); - struct nlattr *nla __maybe_unused; - struct net_bridge_vlan_group *vg; - struct net_bridge_vlan *v; - struct nlattr *nest; - int vl_idx = 0; - nest = nla_nest_start(skb, LINK_XSTATS_TYPE_BRIDGE); if (!nest) return -EMSGSIZE; - vg = br_vlan_group(br); if (vg) { + u16 pvid; + + pvid = br_get_pvid(vg); list_for_each_entry(v, &vg->vlan_list, vlist) { struct bridge_vlan_xstats vxi; struct br_vlan_stats stats; @@ -1311,6 +1324,9 @@ static int bridge_fill_linkxstats(struct sk_buff *skb, continue; memset(&vxi, 0, sizeof(vxi)); vxi.vid = v->vid; + vxi.flags = v->flags; + if (v->vid == pvid) + vxi.flags |= BRIDGE_VLAN_INFO_PVID; br_vlan_get_stats(v, &stats); vxi.rx_bytes = stats.rx_bytes; vxi.rx_packets = stats.rx_packets; @@ -1329,7 +1345,7 @@ static int bridge_fill_linkxstats(struct sk_buff *skb, BRIDGE_XSTATS_PAD); if (!nla) goto nla_put_failure; - br_multicast_get_stats(br, NULL, nla_data(nla)); + br_multicast_get_stats(br, p, nla_data(nla)); } #endif nla_nest_end(skb, nest); @@ -1344,52 +1360,6 @@ nla_put_failure: return -EMSGSIZE; } -static int brport_fill_linkxstats(struct sk_buff *skb, - const struct net_device *dev, - int *prividx) -{ - struct net_bridge_port *p = br_port_get_rtnl(dev); - struct nlattr *nla __maybe_unused; - struct nlattr *nest; - - if (!p) - return 0; - - nest = nla_nest_start(skb, LINK_XSTATS_TYPE_BRIDGE); - if (!nest) - return -EMSGSIZE; -#ifdef CONFIG_BRIDGE_IGMP_SNOOPING - nla = nla_reserve_64bit(skb, BRIDGE_XSTATS_MCAST, - sizeof(struct br_mcast_stats), - BRIDGE_XSTATS_PAD); - if (!nla) { - nla_nest_end(skb, nest); - return -EMSGSIZE; - } - br_multicast_get_stats(p->br, p, nla_data(nla)); -#endif - nla_nest_end(skb, nest); - - return 0; -} - -static int br_fill_linkxstats(struct sk_buff *skb, const struct net_device *dev, - int *prividx, int attr) -{ - int ret = -EINVAL; - - switch (attr) { - case IFLA_STATS_LINK_XSTATS: - ret = bridge_fill_linkxstats(skb, dev, prividx); - break; - case IFLA_STATS_LINK_XSTATS_SLAVE: - ret = brport_fill_linkxstats(skb, dev, prividx); - break; - } - - return ret; -} - static struct rtnl_af_ops br_af_ops __read_mostly = { .family = AF_BRIDGE, .get_link_af_size = br_get_link_af_size_filtered, diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index aac2a6e6b008..2379b2b865c9 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -251,6 +251,9 @@ struct net_bridge_port #ifdef CONFIG_BRIDGE_VLAN_FILTERING struct net_bridge_vlan_group __rcu *vlgrp; #endif +#ifdef CONFIG_NET_SWITCHDEV + int offload_fwd_mark; +#endif }; #define br_auto_port(p) ((p)->flags & BR_AUTO_MASK) @@ -359,6 +362,11 @@ struct net_bridge struct timer_list gc_timer; struct kobject *ifobj; u32 auto_cnt; + +#ifdef CONFIG_NET_SWITCHDEV + int offload_fwd_mark; +#endif + #ifdef CONFIG_BRIDGE_VLAN_FILTERING struct net_bridge_vlan_group __rcu *vlgrp; u8 vlan_enabled; @@ -381,6 +389,10 @@ struct br_input_skb_cb { #ifdef CONFIG_BRIDGE_VLAN_FILTERING bool vlan_filtered; #endif 
+ +#ifdef CONFIG_NET_SWITCHDEV + int offload_fwd_mark; +#endif }; #define BR_INPUT_SKB_CB(__skb) ((struct br_input_skb_cb *)(__skb)->cb) @@ -1034,4 +1046,29 @@ static inline int br_sysfs_addbr(struct net_device *dev) { return 0; } static inline void br_sysfs_delbr(struct net_device *dev) { return; } #endif /* CONFIG_SYSFS */ +/* br_switchdev.c */ +#ifdef CONFIG_NET_SWITCHDEV +int nbp_switchdev_mark_set(struct net_bridge_port *p); +void nbp_switchdev_frame_mark(const struct net_bridge_port *p, + struct sk_buff *skb); +bool nbp_switchdev_allowed_egress(const struct net_bridge_port *p, + const struct sk_buff *skb); +#else +static inline int nbp_switchdev_mark_set(struct net_bridge_port *p) +{ + return 0; +} + +static inline void nbp_switchdev_frame_mark(const struct net_bridge_port *p, + struct sk_buff *skb) +{ +} + +static inline bool nbp_switchdev_allowed_egress(const struct net_bridge_port *p, + const struct sk_buff *skb) +{ + return true; +} +#endif /* CONFIG_NET_SWITCHDEV */ + #endif diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c new file mode 100644 index 000000000000..f4097b900de1 --- /dev/null +++ b/net/bridge/br_switchdev.c @@ -0,0 +1,57 @@ +#include <linux/kernel.h> +#include <linux/list.h> +#include <linux/netdevice.h> +#include <linux/rtnetlink.h> +#include <linux/skbuff.h> +#include <net/switchdev.h> + +#include "br_private.h" + +static int br_switchdev_mark_get(struct net_bridge *br, struct net_device *dev) +{ + struct net_bridge_port *p; + + /* dev is yet to be added to the port list. */ + list_for_each_entry(p, &br->port_list, list) { + if (switchdev_port_same_parent_id(dev, p->dev)) + return p->offload_fwd_mark; + } + + return ++br->offload_fwd_mark; +} + +int nbp_switchdev_mark_set(struct net_bridge_port *p) +{ + struct switchdev_attr attr = { + .orig_dev = p->dev, + .id = SWITCHDEV_ATTR_ID_PORT_PARENT_ID, + }; + int err; + + ASSERT_RTNL(); + + err = switchdev_port_attr_get(p->dev, &attr); + if (err) { + if (err == -EOPNOTSUPP) + return 0; + return err; + } + + p->offload_fwd_mark = br_switchdev_mark_get(p->br, p->dev); + + return 0; +} + +void nbp_switchdev_frame_mark(const struct net_bridge_port *p, + struct sk_buff *skb) +{ + if (skb->offload_fwd_mark && !WARN_ON_ONCE(!p->offload_fwd_mark)) + BR_INPUT_SKB_CB(skb)->offload_fwd_mark = p->offload_fwd_mark; +} + +bool nbp_switchdev_allowed_egress(const struct net_bridge_port *p, + const struct sk_buff *skb) +{ + return !skb->offload_fwd_mark || + BR_INPUT_SKB_CB(skb)->offload_fwd_mark != p->offload_fwd_mark; +} diff --git a/net/core/dev.c b/net/core/dev.c index dd6ce598de89..1d5c6dda1988 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3355,16 +3355,6 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) else skb_dst_force(skb); -#ifdef CONFIG_NET_SWITCHDEV - /* Don't forward if offload device already forwarded */ - if (skb->offload_fwd_mark && - skb->offload_fwd_mark == dev->offload_fwd_mark) { - consume_skb(skb); - rc = NET_XMIT_SUCCESS; - goto out; - } -#endif - txq = netdev_pick_tx(dev, skb, accel_priv); q = rcu_dereference_bh(txq->qdisc); @@ -4292,15 +4282,25 @@ int netif_receive_skb(struct sk_buff *skb) } EXPORT_SYMBOL(netif_receive_skb); -/* Network device is going away, flush any packets still pending - * Called with irqs disabled. 
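br_switchdev_mark_get() above hands every bridge port that reports the same switchdev parent ID (i.e. sits on the same physical switch) one shared offload_fwd_mark, and allocates a fresh mark for the first port of a previously unseen switch. The forwarding-side consequence can be summarized as follows (simplified sketch of the ingress/egress pair, using the fields introduced in this patch):

	/* Would software egress to this port duplicate what the ASIC
	 * already forwarded? Simplified restatement, not the patch code. */
	static bool br_sw_already_forwarded(const struct sk_buff *skb,
					    const struct net_bridge_port *p)
	{
		return skb->offload_fwd_mark &&
		       BR_INPUT_SKB_CB(skb)->offload_fwd_mark ==
		       p->offload_fwd_mark;
	}

This is also why the generic check in __dev_queue_xmit() can be deleted in the dev.c hunk that follows: the decision moves from the device layer into the bridge, scoped per port group rather than per device.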
- */ -static void flush_backlog(void *arg) +struct flush_work { + struct net_device *dev; + struct work_struct work; +}; + +DEFINE_PER_CPU(struct flush_work, flush_works); + +/* Network device is going away, flush any packets still pending */ +static void flush_backlog(struct work_struct *work) { - struct net_device *dev = arg; - struct softnet_data *sd = this_cpu_ptr(&softnet_data); + struct flush_work *flush = container_of(work, typeof(*flush), work); + struct net_device *dev = flush->dev; struct sk_buff *skb, *tmp; + struct softnet_data *sd; + local_bh_disable(); + sd = this_cpu_ptr(&softnet_data); + + local_irq_disable(); rps_lock(sd); skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) { if (skb->dev == dev) { @@ -4310,6 +4310,7 @@ static void flush_backlog(void *arg) } } rps_unlock(sd); + local_irq_enable(); skb_queue_walk_safe(&sd->process_queue, skb, tmp) { if (skb->dev == dev) { @@ -4318,6 +4319,27 @@ static void flush_backlog(void *arg) input_queue_head_incr(sd); } } + local_bh_enable(); +} + +static void flush_all_backlogs(struct net_device *dev) +{ + unsigned int cpu; + + get_online_cpus(); + + for_each_online_cpu(cpu) { + struct flush_work *flush = per_cpu_ptr(&flush_works, cpu); + + INIT_WORK(&flush->work, flush_backlog); + flush->dev = dev; + queue_work_on(cpu, system_highpri_wq, &flush->work); + } + + for_each_online_cpu(cpu) + flush_work(&per_cpu_ptr(&flush_works, cpu)->work); + + put_online_cpus(); } static int napi_gro_complete(struct sk_buff *skb) @@ -4805,8 +4827,9 @@ static bool sd_has_rps_ipi_waiting(struct softnet_data *sd) static int process_backlog(struct napi_struct *napi, int quota) { - int work = 0; struct softnet_data *sd = container_of(napi, struct softnet_data, backlog); + bool again = true; + int work = 0; /* Check if we have pending ipi, its better to send them now, * not waiting net_rx_action() end. @@ -4817,23 +4840,20 @@ static int process_backlog(struct napi_struct *napi, int quota) } napi->weight = weight_p; - local_irq_disable(); - while (1) { + while (again) { struct sk_buff *skb; while ((skb = __skb_dequeue(&sd->process_queue))) { rcu_read_lock(); - local_irq_enable(); __netif_receive_skb(skb); rcu_read_unlock(); - local_irq_disable(); input_queue_head_incr(sd); - if (++work >= quota) { - local_irq_enable(); + if (++work >= quota) return work; - } + } + local_irq_disable(); rps_lock(sd); if (skb_queue_empty(&sd->input_pkt_queue)) { /* @@ -4845,16 +4865,14 @@ static int process_backlog(struct napi_struct *napi, int quota) * and we dont need an smp_mb() memory barrier. 
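flush_all_backlogs() above replaces the old on_each_cpu() IPI, which ran flush_backlog() with interrupts disabled, with one work item per online CPU, so the flush now runs in process context and disables interrupts only around the actual queue walks. The queue-everything-then-join pattern is generic and worth spelling out (illustrative body; usual workqueue/percpu headers assumed):

	static DEFINE_PER_CPU(struct work_struct, demo_works);

	static void demo_work_fn(struct work_struct *work)
	{
		/* per-CPU cleanup runs here, in process context */
	}

	static void demo_run_on_each_cpu(void)
	{
		unsigned int cpu;

		get_online_cpus();
		/* queue first so the CPUs make progress in parallel... */
		for_each_online_cpu(cpu) {
			struct work_struct *w = per_cpu_ptr(&demo_works, cpu);

			INIT_WORK(w, demo_work_fn);
			queue_work_on(cpu, system_highpri_wq, w);
		}
		/* ...then join them all */
		for_each_online_cpu(cpu)
			flush_work(per_cpu_ptr(&demo_works, cpu));
		put_online_cpus();
	}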
*/ napi->state = 0; - rps_unlock(sd); - - break; + again = false; + } else { + skb_queue_splice_tail_init(&sd->input_pkt_queue, + &sd->process_queue); } - - skb_queue_splice_tail_init(&sd->input_pkt_queue, - &sd->process_queue); rps_unlock(sd); + local_irq_enable(); } - local_irq_enable(); return work; } @@ -6707,7 +6725,7 @@ static void rollback_registered_many(struct list_head *head) unlist_netdevice(dev); dev->reg_state = NETREG_UNREGISTERING; - on_each_cpu(flush_backlog, dev, 1); + flush_all_backlogs(dev); } synchronize_net(); @@ -7625,6 +7643,9 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, INIT_LIST_HEAD(&dev->all_adj_list.lower); INIT_LIST_HEAD(&dev->ptype_all); INIT_LIST_HEAD(&dev->ptype_specific); +#ifdef CONFIG_NET_SCHED + hash_init(dev->qdisc_hash); +#endif dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; setup(dev); diff --git a/net/core/filter.c b/net/core/filter.c index cb06aceb512a..a83766be1ad2 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1350,14 +1350,18 @@ struct bpf_scratchpad { static DEFINE_PER_CPU(struct bpf_scratchpad, bpf_sp); +static inline int __bpf_try_make_writable(struct sk_buff *skb, + unsigned int write_len) +{ + return skb_ensure_writable(skb, write_len); +} + static inline int bpf_try_make_writable(struct sk_buff *skb, unsigned int write_len) { - int err; + int err = __bpf_try_make_writable(skb, write_len); - err = skb_ensure_writable(skb, write_len); bpf_compute_data_end(skb); - return err; } @@ -1976,8 +1980,8 @@ static u64 bpf_skb_change_type(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) u32 pkt_type = r2; /* We only allow a restricted subset to be changed for now. */ - if (unlikely(skb->pkt_type > PACKET_OTHERHOST || - pkt_type > PACKET_OTHERHOST)) + if (unlikely(!skb_pkt_type_ok(skb->pkt_type) || + !skb_pkt_type_ok(pkt_type))) return -EINVAL; skb->pkt_type = pkt_type; @@ -1992,6 +1996,92 @@ static const struct bpf_func_proto bpf_skb_change_type_proto = { .arg2_type = ARG_ANYTHING, }; +static u32 __bpf_skb_min_len(const struct sk_buff *skb) +{ + u32 min_len = skb_network_offset(skb); + + if (skb_transport_header_was_set(skb)) + min_len = skb_transport_offset(skb); + if (skb->ip_summed == CHECKSUM_PARTIAL) + min_len = skb_checksum_start_offset(skb) + + skb->csum_offset + sizeof(__sum16); + return min_len; +} + +static u32 __bpf_skb_max_len(const struct sk_buff *skb) +{ + return skb->dev ? skb->dev->mtu + skb->dev->hard_header_len : + 65536; +} + +static int bpf_skb_grow_rcsum(struct sk_buff *skb, unsigned int new_len) +{ + unsigned int old_len = skb->len; + int ret; + + ret = __skb_grow_rcsum(skb, new_len); + if (!ret) + memset(skb->data + old_len, 0, new_len - old_len); + return ret; +} + +static int bpf_skb_trim_rcsum(struct sk_buff *skb, unsigned int new_len) +{ + return __skb_trim_rcsum(skb, new_len); +} + +static u64 bpf_skb_change_tail(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5) +{ + struct sk_buff *skb = (struct sk_buff *)(long) r1; + u32 max_len = __bpf_skb_max_len(skb); + u32 min_len = __bpf_skb_min_len(skb); + u32 new_len = (u32) r2; + int ret; + + if (unlikely(flags || new_len > max_len || new_len < min_len)) + return -EINVAL; + if (skb->encapsulation) + return -ENOTSUPP; + + /* The basic idea of this helper is that it's performing the + * needed work to either grow or trim an skb, and eBPF program + * rewrites the rest via helpers like bpf_skb_store_bytes(), + * bpf_lX_csum_replace() and others rather than passing a raw + * buffer here. 
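From the program side, the new bpf_skb_change_tail() helper is invoked roughly like this (hypothetical tc classifier; SEC() and the helper declaration come from the usual BPF sample scaffolding, and the 64-byte target size is arbitrary):

	#include <linux/bpf.h>
	#include <linux/pkt_cls.h>

	SEC("classifier")
	int trim_reply(struct __sk_buff *skb)
	{
		/* Shrink the packet to a fixed-size control reply; the
		 * program is expected to rewrite the payload afterwards
		 * with bpf_skb_store_bytes() and friends. The helper
		 * fails on encapsulated skbs and out-of-range sizes. */
		if (bpf_skb_change_tail(skb, 64, 0) < 0)
			return TC_ACT_SHOT;
		return TC_ACT_OK;
	}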
This one is a slow path helper and intended + * for replies with control messages. + * + * Like in bpf_skb_change_proto(), we want to keep this rather + * minimal and without protocol specifics so that we are able + * to separate concerns as in bpf_skb_store_bytes() should only + * be the one responsible for writing buffers. + * + * It's really expected to be a slow path operation here for + * control message replies, so we're implicitly linearizing, + * uncloning and drop offloads from the skb by this. + */ + ret = __bpf_try_make_writable(skb, skb->len); + if (!ret) { + if (new_len > skb->len) + ret = bpf_skb_grow_rcsum(skb, new_len); + else if (new_len < skb->len) + ret = bpf_skb_trim_rcsum(skb, new_len); + if (!ret && skb_is_gso(skb)) + skb_gso_reset(skb); + } + + bpf_compute_data_end(skb); + return ret; +} + +static const struct bpf_func_proto bpf_skb_change_tail_proto = { + .func = bpf_skb_change_tail, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; + bool bpf_helper_changes_skb_data(void *func) { if (func == bpf_skb_vlan_push) @@ -2002,6 +2092,8 @@ bool bpf_helper_changes_skb_data(void *func) return true; if (func == bpf_skb_change_proto) return true; + if (func == bpf_skb_change_tail) + return true; if (func == bpf_l3_csum_replace) return true; if (func == bpf_l4_csum_replace) @@ -2282,7 +2374,6 @@ bpf_get_skb_set_tunnel_proto(enum bpf_func_id which) } } -#ifdef CONFIG_SOCK_CGROUP_DATA static u64 bpf_skb_under_cgroup(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) { struct sk_buff *skb = (struct sk_buff *)(long)r1; @@ -2303,7 +2394,7 @@ static u64 bpf_skb_under_cgroup(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) if (unlikely(!cgrp)) return -EAGAIN; - return cgroup_is_descendant(sock_cgroup_ptr(&sk->sk_cgrp_data), cgrp); + return sk_under_cgroup_hierarchy(sk, cgrp); } static const struct bpf_func_proto bpf_skb_under_cgroup_proto = { @@ -2314,7 +2405,41 @@ static const struct bpf_func_proto bpf_skb_under_cgroup_proto = { .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, }; -#endif + +static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff, + unsigned long off, unsigned long len) +{ + memcpy(dst_buff, src_buff + off, len); + return 0; +} + +static u64 bpf_xdp_event_output(u64 r1, u64 r2, u64 flags, u64 r4, + u64 meta_size) +{ + struct xdp_buff *xdp = (struct xdp_buff *)(long) r1; + struct bpf_map *map = (struct bpf_map *)(long) r2; + u64 xdp_size = (flags & BPF_F_CTXLEN_MASK) >> 32; + void *meta = (void *)(long) r4; + + if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK))) + return -EINVAL; + if (unlikely(xdp_size > (unsigned long)(xdp->data_end - xdp->data))) + return -EFAULT; + + return bpf_event_output(map, flags, meta, meta_size, xdp, xdp_size, + bpf_xdp_copy); +} + +static const struct bpf_func_proto bpf_xdp_event_output_proto = { + .func = bpf_xdp_event_output, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_STACK, + .arg5_type = ARG_CONST_STACK_SIZE, +}; static const struct bpf_func_proto * sk_filter_func_proto(enum bpf_func_id func_id) @@ -2368,6 +2493,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) return &bpf_skb_change_proto_proto; case BPF_FUNC_skb_change_type: return &bpf_skb_change_type_proto; + case BPF_FUNC_skb_change_tail: + return &bpf_skb_change_tail_proto; case BPF_FUNC_skb_get_tunnel_key: return &bpf_skb_get_tunnel_key_proto; case 
BPF_FUNC_skb_set_tunnel_key: @@ -2386,10 +2513,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) return &bpf_skb_event_output_proto; case BPF_FUNC_get_smp_processor_id: return &bpf_get_smp_processor_id_proto; -#ifdef CONFIG_SOCK_CGROUP_DATA case BPF_FUNC_skb_under_cgroup: return &bpf_skb_under_cgroup_proto; -#endif default: return sk_filter_func_proto(func_id); } @@ -2398,7 +2523,12 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) static const struct bpf_func_proto * xdp_func_proto(enum bpf_func_id func_id) { - return sk_filter_func_proto(func_id); + switch (func_id) { + case BPF_FUNC_perf_event_output: + return &bpf_xdp_event_output_proto; + default: + return sk_filter_func_proto(func_id); + } } static bool __is_valid_access(int off, int size, enum bpf_access_type type) diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 61ad43f61c5e..a2879c0f6c4c 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -6,6 +6,8 @@ #include <linux/if_vlan.h> #include <net/ip.h> #include <net/ipv6.h> +#include <net/gre.h> +#include <net/pptp.h> #include <linux/igmp.h> #include <linux/icmp.h> #include <linux/sctp.h> @@ -116,13 +118,16 @@ bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_dissector_key_addrs *key_addrs; struct flow_dissector_key_ports *key_ports; struct flow_dissector_key_tags *key_tags; + struct flow_dissector_key_vlan *key_vlan; struct flow_dissector_key_keyid *key_keyid; + bool skip_vlan = false; u8 ip_proto = 0; bool ret = false; if (!data) { data = skb->data; - proto = skb->protocol; + proto = skb_vlan_tag_present(skb) ? + skb->vlan_proto : skb->protocol; nhoff = skb_network_offset(skb); hlen = skb_headlen(skb); } @@ -241,23 +246,45 @@ ipv6: case htons(ETH_P_8021AD): case htons(ETH_P_8021Q): { const struct vlan_hdr *vlan; - struct vlan_hdr _vlan; - vlan = __skb_header_pointer(skb, nhoff, sizeof(_vlan), data, hlen, &_vlan); - if (!vlan) - goto out_bad; + if (skb_vlan_tag_present(skb)) + proto = skb->protocol; + + if (!skb_vlan_tag_present(skb) || + proto == cpu_to_be16(ETH_P_8021Q) || + proto == cpu_to_be16(ETH_P_8021AD)) { + struct vlan_hdr _vlan; + vlan = __skb_header_pointer(skb, nhoff, sizeof(_vlan), + data, hlen, &_vlan); + if (!vlan) + goto out_bad; + proto = vlan->h_vlan_encapsulated_proto; + nhoff += sizeof(*vlan); + if (skip_vlan) + goto again; + } + + skip_vlan = true; if (dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_VLANID)) { - key_tags = skb_flow_dissector_target(flow_dissector, - FLOW_DISSECTOR_KEY_VLANID, + FLOW_DISSECTOR_KEY_VLAN)) { + key_vlan = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_VLAN, target_container); - key_tags->vlan_id = skb_vlan_tag_get_id(skb); + if (skb_vlan_tag_present(skb)) { + key_vlan->vlan_id = skb_vlan_tag_get_id(skb); + key_vlan->vlan_priority = + (skb_vlan_tag_get_prio(skb) >> VLAN_PRIO_SHIFT); + } else { + key_vlan->vlan_id = ntohs(vlan->h_vlan_TCI) & + VLAN_VID_MASK; + key_vlan->vlan_priority = + (ntohs(vlan->h_vlan_TCI) & + VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; + } } - proto = vlan->h_vlan_encapsulated_proto; - nhoff += sizeof(*vlan); goto again; } case htons(ETH_P_PPP_SES): { @@ -338,32 +365,42 @@ mpls: ip_proto_again: switch (ip_proto) { case IPPROTO_GRE: { - struct gre_hdr { - __be16 flags; - __be16 proto; - } *hdr, _hdr; + struct gre_base_hdr *hdr, _hdr; + u16 gre_ver; + int offset = 0; hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) goto out_bad; - /* - * Only look inside GRE if version zero and no - * routing - */ - if 
(hdr->flags & (GRE_VERSION | GRE_ROUTING)) + + /* Only look inside GRE without routing */ + if (hdr->flags & GRE_ROUTING) break; - proto = hdr->proto; - nhoff += 4; + /* Only look inside GRE for version 0 and 1 */ + gre_ver = ntohs(hdr->flags & GRE_VERSION); + if (gre_ver > 1) + break; + + proto = hdr->protocol; + if (gre_ver) { + /* Version1 must be PPTP, and check the flags */ + if (!(proto == GRE_PROTO_PPP && (hdr->flags & GRE_KEY))) + break; + } + + offset += sizeof(struct gre_base_hdr); + if (hdr->flags & GRE_CSUM) - nhoff += 4; + offset += sizeof(((struct gre_full_hdr *)0)->csum) + + sizeof(((struct gre_full_hdr *)0)->reserved1); + if (hdr->flags & GRE_KEY) { const __be32 *keyid; __be32 _keyid; - keyid = __skb_header_pointer(skb, nhoff, sizeof(_keyid), + keyid = __skb_header_pointer(skb, nhoff + offset, sizeof(_keyid), data, hlen, &_keyid); - if (!keyid) goto out_bad; @@ -372,32 +409,65 @@ ip_proto_again: key_keyid = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_GRE_KEYID, target_container); - key_keyid->keyid = *keyid; + if (gre_ver == 0) + key_keyid->keyid = *keyid; + else + key_keyid->keyid = *keyid & GRE_PPTP_KEY_MASK; } - nhoff += 4; + offset += sizeof(((struct gre_full_hdr *)0)->key); } + if (hdr->flags & GRE_SEQ) - nhoff += 4; - if (proto == htons(ETH_P_TEB)) { - const struct ethhdr *eth; - struct ethhdr _eth; - - eth = __skb_header_pointer(skb, nhoff, - sizeof(_eth), - data, hlen, &_eth); - if (!eth) + offset += sizeof(((struct pptp_gre_header *)0)->seq); + + if (gre_ver == 0) { + if (proto == htons(ETH_P_TEB)) { + const struct ethhdr *eth; + struct ethhdr _eth; + + eth = __skb_header_pointer(skb, nhoff + offset, + sizeof(_eth), + data, hlen, &_eth); + if (!eth) + goto out_bad; + proto = eth->h_proto; + offset += sizeof(*eth); + + /* Cap headers that we access via pointers at the + * end of the Ethernet header as our maximum alignment + * at that point is only 2 bytes. + */ + if (NET_IP_ALIGN) + hlen = (nhoff + offset); + } + } else { /* version 1, must be PPTP */ + u8 _ppp_hdr[PPP_HDRLEN]; + u8 *ppp_hdr; + + if (hdr->flags & GRE_ACK) + offset += sizeof(((struct pptp_gre_header *)0)->ack); + + ppp_hdr = skb_header_pointer(skb, nhoff + offset, + sizeof(_ppp_hdr), _ppp_hdr); + if (!ppp_hdr) goto out_bad; - proto = eth->h_proto; - nhoff += sizeof(*eth); - - /* Cap headers that we access via pointers at the - * end of the Ethernet header as our maximum alignment - * at that point is only 2 bytes. 
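The offset arithmetic in this hunk tracks the fixed GRE option layout: each flag bit adds a four-byte field between the base header and the payload. A compact restatement for the version-0 case (illustrative helper, not part of the patch):

	static int gre_v0_options_len(__be16 flags)
	{
		int len = 0;

		if (flags & GRE_CSUM)
			len += 4;	/* 2-byte checksum + 2 reserved bytes */
		if (flags & GRE_KEY)
			len += 4;	/* 32-bit key */
		if (flags & GRE_SEQ)
			len += 4;	/* 32-bit sequence number */
		return len;
	}

For version 1 (PPTP) the dissector additionally skips the optional acknowledgment field and then reads the PPP header to pick the inner EtherType.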
- */ - if (NET_IP_ALIGN) - hlen = nhoff; + + switch (PPP_PROTOCOL(ppp_hdr)) { + case PPP_IP: + proto = htons(ETH_P_IP); + break; + case PPP_IPV6: + proto = htons(ETH_P_IPV6); + break; + default: + /* Could probably catch some more like MPLS */ + break; + } + + offset += PPP_HDRLEN; } + nhoff += offset; key_control->flags |= FLOW_DIS_ENCAPSULATION; if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) goto out_good; @@ -872,8 +942,8 @@ static const struct flow_dissector_key flow_keys_dissector_keys[] = { .offset = offsetof(struct flow_keys, ports), }, { - .key_id = FLOW_DISSECTOR_KEY_VLANID, - .offset = offsetof(struct flow_keys, tags), + .key_id = FLOW_DISSECTOR_KEY_VLAN, + .offset = offsetof(struct flow_keys, vlan), }, { .key_id = FLOW_DISSECTOR_KEY_FLOW_LABEL, diff --git a/net/core/neighbour.c b/net/core/neighbour.c index cf26e04c4046..2ae929f9bd06 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1148,7 +1148,8 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, } else goto out; } else { - if (lladdr == neigh->ha && new == NUD_STALE) + if (lladdr == neigh->ha && new == NUD_STALE && + !(flags & NEIGH_UPDATE_F_ADMIN)) new = old; } } diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 2c2eb1b629b1..1fe58167d39a 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -37,6 +37,8 @@ struct net init_net = { }; EXPORT_SYMBOL(init_net); +static bool init_net_initialized; + #define INITIAL_NET_GEN_PTRS 13 /* +1 for len +2 for rcu_head */ static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS; @@ -750,6 +752,8 @@ static int __init net_ns_init(void) if (setup_net(&init_net, &init_user_ns)) panic("Could not setup the initial network namespace"); + init_net_initialized = true; + rtnl_lock(); list_add_tail_rcu(&init_net.list, &net_namespace_list); rtnl_unlock(); @@ -811,15 +815,24 @@ static void __unregister_pernet_operations(struct pernet_operations *ops) static int __register_pernet_operations(struct list_head *list, struct pernet_operations *ops) { + if (!init_net_initialized) { + list_add_tail(&ops->list, list); + return 0; + } + return ops_init(ops, &init_net); } static void __unregister_pernet_operations(struct pernet_operations *ops) { - LIST_HEAD(net_exit_list); - list_add(&init_net.exit_list, &net_exit_list); - ops_exit_list(ops, &net_exit_list); - ops_free_list(ops, &net_exit_list); + if (!init_net_initialized) { + list_del(&ops->list); + } else { + LIST_HEAD(net_exit_list); + list_add(&init_net.exit_list, &net_exit_list); + ops_exit_list(ops, &net_exit_list); + ops_free_list(ops, &net_exit_list); + } } #endif /* CONFIG_NET_NS */ diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 189cc78c77eb..318fc5231b2b 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -704,6 +704,8 @@ int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics) } else if (i == RTAX_FEATURES - 1) { u32 user_features = metrics[i] & RTAX_FEATURE_MASK; + if (!user_features) + continue; BUILD_BUG_ON(RTAX_FEATURE_MASK & DST_FEATURE_MASK); if (nla_put_u32(skb, i + 1, user_features)) goto nla_put_failure; diff --git a/net/core/sock.c b/net/core/sock.c index 25dab8b60223..51a730485649 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1315,24 +1315,6 @@ static void sock_copy(struct sock *nsk, const struct sock *osk) #endif } -void sk_prot_clear_portaddr_nulls(struct sock *sk, int size) -{ - unsigned long nulls1, nulls2; - - nulls1 = offsetof(struct sock, __sk_common.skc_node.next); - nulls2 = offsetof(struct sock, 
__sk_common.skc_portaddr_node.next); - if (nulls1 > nulls2) - swap(nulls1, nulls2); - - if (nulls1 != 0) - memset((char *)sk, 0, nulls1); - memset((char *)sk + nulls1 + sizeof(void *), 0, - nulls2 - nulls1 - sizeof(void *)); - memset((char *)sk + nulls2 + sizeof(void *), 0, - size - nulls2 - sizeof(void *)); -} -EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls); - static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, int family) { @@ -1344,12 +1326,8 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO); if (!sk) return sk; - if (priority & __GFP_ZERO) { - if (prot->clear_sk) - prot->clear_sk(sk, prot->obj_size); - else - sk_prot_clear_nulls(sk, prot->obj_size); - } + if (priority & __GFP_ZERO) + sk_prot_clear_nulls(sk, prot->obj_size); } else sk = kmalloc(prot->obj_size, priority); diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 7e68bc6bc853..d8d267e9a872 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -61,27 +61,27 @@ const struct dsa_device_ops *dsa_device_ops[DSA_TAG_LAST] = { static DEFINE_MUTEX(dsa_switch_drivers_mutex); static LIST_HEAD(dsa_switch_drivers); -void register_switch_driver(struct dsa_switch_driver *drv) +void register_switch_driver(struct dsa_switch_ops *ops) { mutex_lock(&dsa_switch_drivers_mutex); - list_add_tail(&drv->list, &dsa_switch_drivers); + list_add_tail(&ops->list, &dsa_switch_drivers); mutex_unlock(&dsa_switch_drivers_mutex); } EXPORT_SYMBOL_GPL(register_switch_driver); -void unregister_switch_driver(struct dsa_switch_driver *drv) +void unregister_switch_driver(struct dsa_switch_ops *ops) { mutex_lock(&dsa_switch_drivers_mutex); - list_del_init(&drv->list); + list_del_init(&ops->list); mutex_unlock(&dsa_switch_drivers_mutex); } EXPORT_SYMBOL_GPL(unregister_switch_driver); -static struct dsa_switch_driver * +static struct dsa_switch_ops * dsa_switch_probe(struct device *parent, struct device *host_dev, int sw_addr, const char **_name, void **priv) { - struct dsa_switch_driver *ret; + struct dsa_switch_ops *ret; struct list_head *list; const char *name; @@ -90,13 +90,13 @@ dsa_switch_probe(struct device *parent, struct device *host_dev, int sw_addr, mutex_lock(&dsa_switch_drivers_mutex); list_for_each(list, &dsa_switch_drivers) { - struct dsa_switch_driver *drv; + struct dsa_switch_ops *ops; - drv = list_entry(list, struct dsa_switch_driver, list); + ops = list_entry(list, struct dsa_switch_ops, list); - name = drv->probe(parent, host_dev, sw_addr, priv); + name = ops->probe(parent, host_dev, sw_addr, priv); if (name != NULL) { - ret = drv; + ret = ops; break; } } @@ -117,7 +117,7 @@ static ssize_t temp1_input_show(struct device *dev, struct dsa_switch *ds = dev_get_drvdata(dev); int temp, ret; - ret = ds->drv->get_temp(ds, &temp); + ret = ds->ops->get_temp(ds, &temp); if (ret < 0) return ret; @@ -131,7 +131,7 @@ static ssize_t temp1_max_show(struct device *dev, struct dsa_switch *ds = dev_get_drvdata(dev); int temp, ret; - ret = ds->drv->get_temp_limit(ds, &temp); + ret = ds->ops->get_temp_limit(ds, &temp); if (ret < 0) return ret; @@ -149,7 +149,7 @@ static ssize_t temp1_max_store(struct device *dev, if (ret < 0) return ret; - ret = ds->drv->set_temp_limit(ds, DIV_ROUND_CLOSEST(temp, 1000)); + ret = ds->ops->set_temp_limit(ds, DIV_ROUND_CLOSEST(temp, 1000)); if (ret < 0) return ret; @@ -164,7 +164,7 @@ static ssize_t temp1_max_alarm_show(struct device *dev, bool alarm; int ret; - ret = ds->drv->get_temp_alarm(ds, &alarm); + ret = ds->ops->get_temp_alarm(ds, &alarm); if (ret < 
0) return ret; @@ -184,15 +184,15 @@ static umode_t dsa_hwmon_attrs_visible(struct kobject *kobj, { struct device *dev = container_of(kobj, struct device, kobj); struct dsa_switch *ds = dev_get_drvdata(dev); - struct dsa_switch_driver *drv = ds->drv; + struct dsa_switch_ops *ops = ds->ops; umode_t mode = attr->mode; if (index == 1) { - if (!drv->get_temp_limit) + if (!ops->get_temp_limit) mode = 0; - else if (!drv->set_temp_limit) + else if (!ops->set_temp_limit) mode &= ~S_IWUSR; - } else if (index == 2 && !drv->get_temp_alarm) { + } else if (index == 2 && !ops->get_temp_alarm) { mode = 0; } return mode; @@ -228,8 +228,8 @@ int dsa_cpu_dsa_setup(struct dsa_switch *ds, struct device *dev, genphy_config_init(phydev); genphy_read_status(phydev); - if (ds->drv->adjust_link) - ds->drv->adjust_link(ds, port, phydev); + if (ds->ops->adjust_link) + ds->ops->adjust_link(ds, port, phydev); } return 0; @@ -303,7 +303,7 @@ void dsa_cpu_port_ethtool_restore(struct dsa_switch *ds) static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent) { - struct dsa_switch_driver *drv = ds->drv; + struct dsa_switch_ops *ops = ds->ops; struct dsa_switch_tree *dst = ds->dst; struct dsa_chip_data *cd = ds->cd; bool valid_name_found = false; @@ -354,7 +354,10 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent) * switch. */ if (dst->cpu_switch == index) { - dst->tag_ops = dsa_resolve_tag_protocol(drv->tag_protocol); + enum dsa_tag_protocol tag_protocol; + + tag_protocol = ops->get_tag_protocol(ds); + dst->tag_ops = dsa_resolve_tag_protocol(tag_protocol); if (IS_ERR(dst->tag_ops)) { ret = PTR_ERR(dst->tag_ops); goto out; @@ -368,15 +371,15 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent) /* * Do basic register setup. */ - ret = drv->setup(ds); + ret = ops->setup(ds); if (ret < 0) goto out; - ret = drv->set_addr(ds, dst->master_netdev->dev_addr); + ret = ops->set_addr(ds, dst->master_netdev->dev_addr); if (ret < 0) goto out; - if (!ds->slave_mii_bus && drv->phy_read) { + if (!ds->slave_mii_bus && ops->phy_read) { ds->slave_mii_bus = devm_mdiobus_alloc(parent); if (!ds->slave_mii_bus) { ret = -ENOMEM; @@ -423,7 +426,7 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent) * register with hardware monitoring subsystem. * Treat registration error as non-fatal and ignore it. */ - if (drv->get_temp) { + if (ops->get_temp) { const char *netname = netdev_name(dst->master_netdev); char hname[IFNAMSIZ + 1]; int i, j; @@ -454,7 +457,7 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index, struct device *parent, struct device *host_dev) { struct dsa_chip_data *cd = dst->pd->chip + index; - struct dsa_switch_driver *drv; + struct dsa_switch_ops *ops; struct dsa_switch *ds; int ret; const char *name; @@ -463,8 +466,8 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index, /* * Probe for switch model. 
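With dsa_switch_driver renamed to dsa_switch_ops and the tag protocol moved behind a get_tag_protocol() callback, a legacy-probed switch driver now looks roughly like this (hypothetical driver: the names and trivial bodies are invented for illustration):

	#include <linux/module.h>
	#include <net/dsa.h>

	static enum dsa_tag_protocol foo_get_tag_protocol(struct dsa_switch *ds)
	{
		return DSA_TAG_PROTO_NONE;	/* this sketch adds no tag */
	}

	static const char *foo_probe(struct device *parent,
				     struct device *host_dev,
				     int sw_addr, void **priv)
	{
		return "foo";			/* non-NULL name == detected */
	}

	static int foo_setup(struct dsa_switch *ds)
	{
		return 0;			/* basic register setup */
	}

	static int foo_set_addr(struct dsa_switch *ds, u8 *addr)
	{
		return 0;
	}

	static struct dsa_switch_ops foo_switch_ops = {
		.get_tag_protocol = foo_get_tag_protocol,
		.probe		  = foo_probe,
		.setup		  = foo_setup,
		.set_addr	  = foo_set_addr,
	};

	static int __init foo_init(void)
	{
		register_switch_driver(&foo_switch_ops);
		return 0;
	}
	module_init(foo_init);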
*/ - drv = dsa_switch_probe(parent, host_dev, cd->sw_addr, &name, &priv); - if (drv == NULL) { + ops = dsa_switch_probe(parent, host_dev, cd->sw_addr, &name, &priv); + if (!ops) { netdev_err(dst->master_netdev, "[%d]: could not detect attached switch\n", index); return ERR_PTR(-EINVAL); @@ -483,7 +486,7 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index, ds->dst = dst; ds->index = index; ds->cd = cd; - ds->drv = drv; + ds->ops = ops; ds->priv = priv; ds->dev = parent; @@ -538,12 +541,12 @@ static void dsa_switch_destroy(struct dsa_switch *ds) ds->dsa_port_mask |= ~(1 << port); } - if (ds->slave_mii_bus && ds->drv->phy_read) + if (ds->slave_mii_bus && ds->ops->phy_read) mdiobus_unregister(ds->slave_mii_bus); } #ifdef CONFIG_PM_SLEEP -static int dsa_switch_suspend(struct dsa_switch *ds) +int dsa_switch_suspend(struct dsa_switch *ds) { int i, ret = 0; @@ -557,18 +560,19 @@ static int dsa_switch_suspend(struct dsa_switch *ds) return ret; } - if (ds->drv->suspend) - ret = ds->drv->suspend(ds); + if (ds->ops->suspend) + ret = ds->ops->suspend(ds); return ret; } +EXPORT_SYMBOL_GPL(dsa_switch_suspend); -static int dsa_switch_resume(struct dsa_switch *ds) +int dsa_switch_resume(struct dsa_switch *ds) { int i, ret = 0; - if (ds->drv->resume) - ret = ds->drv->resume(ds); + if (ds->ops->resume) + ret = ds->ops->resume(ds); if (ret) return ret; @@ -585,6 +589,7 @@ static int dsa_switch_resume(struct dsa_switch *ds) return 0; } +EXPORT_SYMBOL_GPL(dsa_switch_resume); #endif /* platform driver init and cleanup *****************************************/ @@ -1086,7 +1091,6 @@ static int dsa_resume(struct device *d) static SIMPLE_DEV_PM_OPS(dsa_pm_ops, dsa_suspend, dsa_resume); static const struct of_device_id dsa_of_match_table[] = { - { .compatible = "brcm,bcm7445-switch-v4.0" }, { .compatible = "marvell,dsa", }, {} }; diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index f30bad9678f0..8278385dcd21 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -294,25 +294,25 @@ static int dsa_ds_apply(struct dsa_switch_tree *dst, struct dsa_switch *ds) int err; /* Initialize ds->phys_mii_mask before registering the slave MDIO bus - * driver and before drv->setup() has run, since the switch drivers and + * driver and before ops->setup() has run, since the switch drivers and * the slave MDIO bus driver rely on these values for probing PHY * devices or not */ ds->phys_mii_mask = ds->enabled_port_mask; - err = ds->drv->setup(ds); + err = ds->ops->setup(ds); if (err < 0) return err; - err = ds->drv->set_addr(ds, dst->master_netdev->dev_addr); + err = ds->ops->set_addr(ds, dst->master_netdev->dev_addr); if (err < 0) return err; - err = ds->drv->set_addr(ds, dst->master_netdev->dev_addr); + err = ds->ops->set_addr(ds, dst->master_netdev->dev_addr); if (err < 0) return err; - if (!ds->slave_mii_bus && ds->drv->phy_read) { + if (!ds->slave_mii_bus && ds->ops->phy_read) { ds->slave_mii_bus = devm_mdiobus_alloc(ds->dev); if (!ds->slave_mii_bus) return -ENOMEM; @@ -374,7 +374,7 @@ static void dsa_ds_unapply(struct dsa_switch_tree *dst, struct dsa_switch *ds) dsa_user_port_unapply(port, index, ds); } - if (ds->slave_mii_bus && ds->drv->phy_read) + if (ds->slave_mii_bus && ds->ops->phy_read) mdiobus_unregister(ds->slave_mii_bus); } @@ -443,6 +443,7 @@ static int dsa_cpu_parse(struct device_node *port, u32 index, struct dsa_switch_tree *dst, struct dsa_switch *ds) { + enum dsa_tag_protocol tag_protocol; struct net_device *ethernet_dev; struct device_node *ethernet; @@ -465,7 +466,8 @@ static int dsa_cpu_parse(struct 
device_node *port, u32 index, dst->cpu_port = index; } - dst->tag_ops = dsa_resolve_tag_protocol(ds->drv->tag_protocol); + tag_protocol = ds->ops->get_tag_protocol(ds); + dst->tag_ops = dsa_resolve_tag_protocol(tag_protocol); if (IS_ERR(dst->tag_ops)) { dev_warn(ds->dev, "No tagger for this switch\n"); return PTR_ERR(dst->tag_ops); @@ -541,7 +543,7 @@ static int dsa_parse_ports_dn(struct device_node *ports, struct dsa_switch *ds) ds->ports[reg].dn = port; - /* Initialize enabled_port_mask now for drv->setup() + /* Initialize enabled_port_mask now for ops->setup() * to have access to a correct value, just like what * net/dsa/dsa.c::dsa_switch_setup_one does. */ diff --git a/net/dsa/slave.c b/net/dsa/slave.c index fc9196745225..9f6c2a20f6ff 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -28,7 +28,7 @@ static int dsa_slave_phy_read(struct mii_bus *bus, int addr, int reg) struct dsa_switch *ds = bus->priv; if (ds->phys_mii_mask & (1 << addr)) - return ds->drv->phy_read(ds, addr, reg); + return ds->ops->phy_read(ds, addr, reg); return 0xffff; } @@ -38,7 +38,7 @@ static int dsa_slave_phy_write(struct mii_bus *bus, int addr, int reg, u16 val) struct dsa_switch *ds = bus->priv; if (ds->phys_mii_mask & (1 << addr)) - return ds->drv->phy_write(ds, addr, reg, val); + return ds->ops->phy_write(ds, addr, reg, val); return 0; } @@ -98,14 +98,14 @@ static int dsa_slave_open(struct net_device *dev) goto clear_allmulti; } - if (ds->drv->port_enable) { - err = ds->drv->port_enable(ds, p->port, p->phy); + if (ds->ops->port_enable) { + err = ds->ops->port_enable(ds, p->port, p->phy); if (err) goto clear_promisc; } - if (ds->drv->port_stp_state_set) - ds->drv->port_stp_state_set(ds, p->port, stp_state); + if (ds->ops->port_stp_state_set) + ds->ops->port_stp_state_set(ds, p->port, stp_state); if (p->phy) phy_start(p->phy); @@ -144,11 +144,11 @@ static int dsa_slave_close(struct net_device *dev) if (!ether_addr_equal(dev->dev_addr, master->dev_addr)) dev_uc_del(master, dev->dev_addr); - if (ds->drv->port_disable) - ds->drv->port_disable(ds, p->port, p->phy); + if (ds->ops->port_disable) + ds->ops->port_disable(ds, p->port, p->phy); - if (ds->drv->port_stp_state_set) - ds->drv->port_stp_state_set(ds, p->port, BR_STATE_DISABLED); + if (ds->ops->port_stp_state_set) + ds->ops->port_stp_state_set(ds, p->port, BR_STATE_DISABLED); return 0; } @@ -209,13 +209,13 @@ static int dsa_slave_port_vlan_add(struct net_device *dev, struct dsa_switch *ds = p->parent; if (switchdev_trans_ph_prepare(trans)) { - if (!ds->drv->port_vlan_prepare || !ds->drv->port_vlan_add) + if (!ds->ops->port_vlan_prepare || !ds->ops->port_vlan_add) return -EOPNOTSUPP; - return ds->drv->port_vlan_prepare(ds, p->port, vlan, trans); + return ds->ops->port_vlan_prepare(ds, p->port, vlan, trans); } - ds->drv->port_vlan_add(ds, p->port, vlan, trans); + ds->ops->port_vlan_add(ds, p->port, vlan, trans); return 0; } @@ -226,10 +226,10 @@ static int dsa_slave_port_vlan_del(struct net_device *dev, struct dsa_slave_priv *p = netdev_priv(dev); struct dsa_switch *ds = p->parent; - if (!ds->drv->port_vlan_del) + if (!ds->ops->port_vlan_del) return -EOPNOTSUPP; - return ds->drv->port_vlan_del(ds, p->port, vlan); + return ds->ops->port_vlan_del(ds, p->port, vlan); } static int dsa_slave_port_vlan_dump(struct net_device *dev, @@ -239,8 +239,8 @@ static int dsa_slave_port_vlan_dump(struct net_device *dev, struct dsa_slave_priv *p = netdev_priv(dev); struct dsa_switch *ds = p->parent; - if (ds->drv->port_vlan_dump) - return ds->drv->port_vlan_dump(ds, p->port, 
vlan, cb); + if (ds->ops->port_vlan_dump) + return ds->ops->port_vlan_dump(ds, p->port, vlan, cb); return -EOPNOTSUPP; } @@ -253,13 +253,13 @@ static int dsa_slave_port_fdb_add(struct net_device *dev, struct dsa_switch *ds = p->parent; if (switchdev_trans_ph_prepare(trans)) { - if (!ds->drv->port_fdb_prepare || !ds->drv->port_fdb_add) + if (!ds->ops->port_fdb_prepare || !ds->ops->port_fdb_add) return -EOPNOTSUPP; - return ds->drv->port_fdb_prepare(ds, p->port, fdb, trans); + return ds->ops->port_fdb_prepare(ds, p->port, fdb, trans); } - ds->drv->port_fdb_add(ds, p->port, fdb, trans); + ds->ops->port_fdb_add(ds, p->port, fdb, trans); return 0; } @@ -271,8 +271,8 @@ static int dsa_slave_port_fdb_del(struct net_device *dev, struct dsa_switch *ds = p->parent; int ret = -EOPNOTSUPP; - if (ds->drv->port_fdb_del) - ret = ds->drv->port_fdb_del(ds, p->port, fdb); + if (ds->ops->port_fdb_del) + ret = ds->ops->port_fdb_del(ds, p->port, fdb); return ret; } @@ -284,8 +284,8 @@ static int dsa_slave_port_fdb_dump(struct net_device *dev, struct dsa_slave_priv *p = netdev_priv(dev); struct dsa_switch *ds = p->parent; - if (ds->drv->port_fdb_dump) - return ds->drv->port_fdb_dump(ds, p->port, fdb, cb); + if (ds->ops->port_fdb_dump) + return ds->ops->port_fdb_dump(ds, p->port, fdb, cb); return -EOPNOTSUPP; } @@ -308,9 +308,9 @@ static int dsa_slave_stp_state_set(struct net_device *dev, struct dsa_switch *ds = p->parent; if (switchdev_trans_ph_prepare(trans)) - return ds->drv->port_stp_state_set ? 0 : -EOPNOTSUPP; + return ds->ops->port_stp_state_set ? 0 : -EOPNOTSUPP; - ds->drv->port_stp_state_set(ds, p->port, attr->u.stp_state); + ds->ops->port_stp_state_set(ds, p->port, attr->u.stp_state); return 0; } @@ -326,8 +326,8 @@ static int dsa_slave_vlan_filtering(struct net_device *dev, if (switchdev_trans_ph_prepare(trans)) return 0; - if (ds->drv->port_vlan_filtering) - return ds->drv->port_vlan_filtering(ds, p->port, + if (ds->ops->port_vlan_filtering) + return ds->ops->port_vlan_filtering(ds, p->port, attr->u.vlan_filtering); return 0; @@ -365,8 +365,8 @@ static int dsa_slave_ageing_time(struct net_device *dev, ds->ports[p->port].ageing_time = ageing_time; ageing_time = dsa_fastest_ageing_time(ds, ageing_time); - if (ds->drv->set_ageing_time) - return ds->drv->set_ageing_time(ds, ageing_time); + if (ds->ops->set_ageing_time) + return ds->ops->set_ageing_time(ds, ageing_time); return 0; } @@ -481,8 +481,8 @@ static int dsa_slave_bridge_port_join(struct net_device *dev, p->bridge_dev = br; - if (ds->drv->port_bridge_join) - ret = ds->drv->port_bridge_join(ds, p->port, br); + if (ds->ops->port_bridge_join) + ret = ds->ops->port_bridge_join(ds, p->port, br); return ret == -EOPNOTSUPP ? 
0 : ret; } @@ -493,16 +493,16 @@ static void dsa_slave_bridge_port_leave(struct net_device *dev) struct dsa_switch *ds = p->parent; - if (ds->drv->port_bridge_leave) - ds->drv->port_bridge_leave(ds, p->port); + if (ds->ops->port_bridge_leave) + ds->ops->port_bridge_leave(ds, p->port); p->bridge_dev = NULL; /* Port left the bridge, put in BR_STATE_DISABLED by the bridge layer, * so allow it to be in BR_STATE_FORWARDING to be kept functional */ - if (ds->drv->port_stp_state_set) - ds->drv->port_stp_state_set(ds, p->port, BR_STATE_FORWARDING); + if (ds->ops->port_stp_state_set) + ds->ops->port_stp_state_set(ds, p->port, BR_STATE_FORWARDING); } static int dsa_slave_port_attr_get(struct net_device *dev, @@ -605,8 +605,8 @@ static int dsa_slave_get_regs_len(struct net_device *dev) struct dsa_slave_priv *p = netdev_priv(dev); struct dsa_switch *ds = p->parent; - if (ds->drv->get_regs_len) - return ds->drv->get_regs_len(ds, p->port); + if (ds->ops->get_regs_len) + return ds->ops->get_regs_len(ds, p->port); return -EOPNOTSUPP; } @@ -617,8 +617,8 @@ dsa_slave_get_regs(struct net_device *dev, struct ethtool_regs *regs, void *_p) struct dsa_slave_priv *p = netdev_priv(dev); struct dsa_switch *ds = p->parent; - if (ds->drv->get_regs) - ds->drv->get_regs(ds, p->port, regs, _p); + if (ds->ops->get_regs) + ds->ops->get_regs(ds, p->port, regs, _p); } static int dsa_slave_nway_reset(struct net_device *dev) @@ -651,8 +651,8 @@ static int dsa_slave_get_eeprom_len(struct net_device *dev) if (ds->cd && ds->cd->eeprom_len) return ds->cd->eeprom_len; - if (ds->drv->get_eeprom_len) - return ds->drv->get_eeprom_len(ds); + if (ds->ops->get_eeprom_len) + return ds->ops->get_eeprom_len(ds); return 0; } @@ -663,8 +663,8 @@ static int dsa_slave_get_eeprom(struct net_device *dev, struct dsa_slave_priv *p = netdev_priv(dev); struct dsa_switch *ds = p->parent; - if (ds->drv->get_eeprom) - return ds->drv->get_eeprom(ds, eeprom, data); + if (ds->ops->get_eeprom) + return ds->ops->get_eeprom(ds, eeprom, data); return -EOPNOTSUPP; } @@ -675,8 +675,8 @@ static int dsa_slave_set_eeprom(struct net_device *dev, struct dsa_slave_priv *p = netdev_priv(dev); struct dsa_switch *ds = p->parent; - if (ds->drv->set_eeprom) - return ds->drv->set_eeprom(ds, eeprom, data); + if (ds->ops->set_eeprom) + return ds->ops->set_eeprom(ds, eeprom, data); return -EOPNOTSUPP; } @@ -694,8 +694,8 @@ static void dsa_slave_get_strings(struct net_device *dev, strncpy(data + len, "tx_bytes", len); strncpy(data + 2 * len, "rx_packets", len); strncpy(data + 3 * len, "rx_bytes", len); - if (ds->drv->get_strings != NULL) - ds->drv->get_strings(ds, p->port, data + 4 * len); + if (ds->ops->get_strings) + ds->ops->get_strings(ds, p->port, data + 4 * len); } } @@ -714,8 +714,8 @@ static void dsa_cpu_port_get_ethtool_stats(struct net_device *dev, dst->master_ethtool_ops.get_ethtool_stats(dev, stats, data); } - if (ds->drv->get_ethtool_stats) - ds->drv->get_ethtool_stats(ds, cpu_port, data + count); + if (ds->ops->get_ethtool_stats) + ds->ops->get_ethtool_stats(ds, cpu_port, data + count); } static int dsa_cpu_port_get_sset_count(struct net_device *dev, int sset) @@ -727,8 +727,8 @@ static int dsa_cpu_port_get_sset_count(struct net_device *dev, int sset) if (dst->master_ethtool_ops.get_sset_count) count += dst->master_ethtool_ops.get_sset_count(dev, sset); - if (sset == ETH_SS_STATS && ds->drv->get_sset_count) - count += ds->drv->get_sset_count(ds); + if (sset == ETH_SS_STATS && ds->ops->get_sset_count) + count += ds->ops->get_sset_count(ds); return count; } @@ 
-755,14 +755,14 @@ static void dsa_cpu_port_get_strings(struct net_device *dev, dst->master_ethtool_ops.get_strings(dev, stringset, data); } - if (stringset == ETH_SS_STATS && ds->drv->get_strings) { + if (stringset == ETH_SS_STATS && ds->ops->get_strings) { ndata = data + mcount * len; /* This function copies ETH_GSTRINGS_LEN bytes, we will mangle * the output after to prepend our CPU port prefix we * constructed earlier */ - ds->drv->get_strings(ds, cpu_port, ndata); - count = ds->drv->get_sset_count(ds); + ds->ops->get_strings(ds, cpu_port, ndata); + count = ds->ops->get_sset_count(ds); for (i = 0; i < count; i++) { memmove(ndata + (i * len + sizeof(pfx)), ndata + i * len, len - sizeof(pfx)); @@ -782,8 +782,8 @@ static void dsa_slave_get_ethtool_stats(struct net_device *dev, data[1] = dev->stats.tx_bytes; data[2] = dev->stats.rx_packets; data[3] = dev->stats.rx_bytes; - if (ds->drv->get_ethtool_stats != NULL) - ds->drv->get_ethtool_stats(ds, p->port, data + 4); + if (ds->ops->get_ethtool_stats) + ds->ops->get_ethtool_stats(ds, p->port, data + 4); } static int dsa_slave_get_sset_count(struct net_device *dev, int sset) @@ -795,8 +795,8 @@ static int dsa_slave_get_sset_count(struct net_device *dev, int sset) int count; count = 4; - if (ds->drv->get_sset_count != NULL) - count += ds->drv->get_sset_count(ds); + if (ds->ops->get_sset_count) + count += ds->ops->get_sset_count(ds); return count; } @@ -809,8 +809,8 @@ static void dsa_slave_get_wol(struct net_device *dev, struct ethtool_wolinfo *w) struct dsa_slave_priv *p = netdev_priv(dev); struct dsa_switch *ds = p->parent; - if (ds->drv->get_wol) - ds->drv->get_wol(ds, p->port, w); + if (ds->ops->get_wol) + ds->ops->get_wol(ds, p->port, w); } static int dsa_slave_set_wol(struct net_device *dev, struct ethtool_wolinfo *w) @@ -819,8 +819,8 @@ static int dsa_slave_set_wol(struct net_device *dev, struct ethtool_wolinfo *w) struct dsa_switch *ds = p->parent; int ret = -EOPNOTSUPP; - if (ds->drv->set_wol) - ret = ds->drv->set_wol(ds, p->port, w); + if (ds->ops->set_wol) + ret = ds->ops->set_wol(ds, p->port, w); return ret; } @@ -831,10 +831,10 @@ static int dsa_slave_set_eee(struct net_device *dev, struct ethtool_eee *e) struct dsa_switch *ds = p->parent; int ret; - if (!ds->drv->set_eee) + if (!ds->ops->set_eee) return -EOPNOTSUPP; - ret = ds->drv->set_eee(ds, p->port, p->phy, e); + ret = ds->ops->set_eee(ds, p->port, p->phy, e); if (ret) return ret; @@ -850,10 +850,10 @@ static int dsa_slave_get_eee(struct net_device *dev, struct ethtool_eee *e) struct dsa_switch *ds = p->parent; int ret; - if (!ds->drv->get_eee) + if (!ds->ops->get_eee) return -EOPNOTSUPP; - ret = ds->drv->get_eee(ds, p->port, e); + ret = ds->ops->get_eee(ds, p->port, e); if (ret) return ret; @@ -988,8 +988,8 @@ static void dsa_slave_adjust_link(struct net_device *dev) p->old_pause = p->phy->pause; } - if (ds->drv->adjust_link && status_changed) - ds->drv->adjust_link(ds, p->port, p->phy); + if (ds->ops->adjust_link && status_changed) + ds->ops->adjust_link(ds, p->port, p->phy); if (status_changed) phy_print_status(p->phy); @@ -1004,8 +1004,8 @@ static int dsa_slave_fixed_link_update(struct net_device *dev, if (dev) { p = netdev_priv(dev); ds = p->parent; - if (ds->drv->fixed_link_update) - ds->drv->fixed_link_update(ds, p->port, status); + if (ds->ops->fixed_link_update) + ds->ops->fixed_link_update(ds, p->port, status); } return 0; @@ -1062,8 +1062,8 @@ static int dsa_slave_phy_setup(struct dsa_slave_priv *p, phy_dn = port_dn; } - if (ds->drv->get_phy_flags) - phy_flags = 
ds->drv->get_phy_flags(ds, p->port); + if (ds->ops->get_phy_flags) + phy_flags = ds->ops->get_phy_flags(ds, p->port); if (phy_dn) { int phy_id = of_mdio_parse_addr(&slave_dev->dev, phy_dn); diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 55513e654d79..e94b47be0019 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -211,24 +211,19 @@ int inet_listen(struct socket *sock, int backlog) * we can only allow the backlog to be adjusted. */ if (old_state != TCP_LISTEN) { - /* Check special setups for testing purpose to enable TFO w/o - * requiring TCP_FASTOPEN sockopt. + /* Enable TFO w/o requiring TCP_FASTOPEN socket option. * Note that only TCP sockets (SOCK_STREAM) will reach here. - * Also fastopenq may already been allocated because this - * socket was in TCP_LISTEN state previously but was - * shutdown() (rather than close()). + * Also fastopen backlog may already been set via the option + * because the socket was in TCP_LISTEN state previously but + * was shutdown() rather than close(). */ - if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) != 0 && + if ((sysctl_tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) && + (sysctl_tcp_fastopen & TFO_SERVER_ENABLE) && !inet_csk(sk)->icsk_accept_queue.fastopenq.max_qlen) { - if ((sysctl_tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) != 0) - fastopen_queue_tune(sk, backlog); - else if ((sysctl_tcp_fastopen & - TFO_SERVER_WO_SOCKOPT2) != 0) - fastopen_queue_tune(sk, - ((uint)sysctl_tcp_fastopen) >> 16); - + fastopen_queue_tune(sk, backlog); tcp_fastopen_init_key_once(true); } + err = inet_csk_listen_start(sk, backlog); if (err) goto out; @@ -921,6 +916,8 @@ const struct proto_ops inet_stream_ops = { .mmap = sock_no_mmap, .sendpage = inet_sendpage, .splice_read = tcp_splice_read, + .read_sock = tcp_read_sock, + .peek_len = tcp_peek_len, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_sock_common_setsockopt, .compat_getsockopt = compat_sock_common_getsockopt, diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index ef2ebeb89d0f..317c31939732 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -93,9 +93,6 @@ struct fib_table *fib_new_table(struct net *net, u32 id) return NULL; switch (id) { - case RT_TABLE_LOCAL: - rcu_assign_pointer(net->ipv4.fib_local, tb); - break; case RT_TABLE_MAIN: rcu_assign_pointer(net->ipv4.fib_main, tb); break; @@ -137,9 +134,6 @@ static void fib_replace_table(struct net *net, struct fib_table *old, { #ifdef CONFIG_IP_MULTIPLE_TABLES switch (new->tb_id) { - case RT_TABLE_LOCAL: - rcu_assign_pointer(net->ipv4.fib_local, new); - break; case RT_TABLE_MAIN: rcu_assign_pointer(net->ipv4.fib_main, new); break; @@ -1249,7 +1243,6 @@ static void ip_fib_net_exit(struct net *net) rtnl_lock(); #ifdef CONFIG_IP_MULTIPLE_TABLES - RCU_INIT_POINTER(net->ipv4.fib_local, NULL); RCU_INIT_POINTER(net->ipv4.fib_main, NULL); RCU_INIT_POINTER(net->ipv4.fib_default, NULL); #endif diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 539fa264e67d..8066ccc48a17 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -1576,7 +1576,8 @@ static bool fib_good_nh(const struct fib_nh *nh) rcu_read_lock_bh(); - n = __ipv4_neigh_lookup_noref(nh->nh_dev, nh->nh_gw); + n = __ipv4_neigh_lookup_noref(nh->nh_dev, + (__force u32)nh->nh_gw); if (n) state = n->nud_state; diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 9b4ca87f70ba..606cc3e85d2b 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -472,6 +472,15 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc, continue; } + 
/* Based on RFC3376 5.1. Should not send source-list change + * records when there is a filter mode change. + */ + if (((gdeleted && pmc->sfmode == MCAST_EXCLUDE) || + (!gdeleted && pmc->crcount)) && + (type == IGMPV3_ALLOW_NEW_SOURCES || + type == IGMPV3_BLOCK_OLD_SOURCES) && psf->sf_crcount) + goto decrease_sf_crcount; + /* clear marks on query responses */ if (isquery) psf->sf_gsresp = 0; @@ -499,6 +508,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc, scount++; stotal++; if ((type == IGMPV3_ALLOW_NEW_SOURCES || type == IGMPV3_BLOCK_OLD_SOURCES) && psf->sf_crcount) { +decrease_sf_crcount: psf->sf_crcount--; if ((sdeleted || gdeleted) && psf->sf_crcount == 0) { if (psf_prev) diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 38c2c47fe0e8..abfbe492ebfe 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -45,6 +45,7 @@ struct inet_diag_entry { u16 family; u16 userlocks; u32 ifindex; + u32 mark; }; static DEFINE_MUTEX(inet_diag_table_mutex); @@ -580,6 +581,14 @@ static int inet_diag_bc_run(const struct nlattr *_bc, yes = 0; break; } + case INET_DIAG_BC_MARK_COND: { + struct inet_diag_markcond *cond; + + cond = (struct inet_diag_markcond *)(op + 1); + if ((entry->mark & cond->mask) != cond->mark) + yes = 0; + break; + } } if (yes) { @@ -624,6 +633,12 @@ int inet_diag_bc_sk(const struct nlattr *bc, struct sock *sk) entry.dport = ntohs(inet->inet_dport); entry.ifindex = sk->sk_bound_dev_if; entry.userlocks = sk_fullsock(sk) ? sk->sk_userlocks : 0; + if (sk_fullsock(sk)) + entry.mark = sk->sk_mark; + else if (sk->sk_state == TCP_NEW_SYN_RECV) + entry.mark = inet_rsk(inet_reqsk(sk))->ir_mark; + else + entry.mark = 0; return inet_diag_bc_run(bc, &entry); } @@ -706,10 +721,25 @@ static bool valid_port_comparison(const struct inet_diag_bc_op *op, return true; } -static int inet_diag_bc_audit(const void *bytecode, int bytecode_len) +static bool valid_markcond(const struct inet_diag_bc_op *op, int len, + int *min_len) +{ + *min_len += sizeof(struct inet_diag_markcond); + return len >= *min_len; +} + +static int inet_diag_bc_audit(const struct nlattr *attr, + const struct sk_buff *skb) { - const void *bc = bytecode; - int len = bytecode_len; + bool net_admin = netlink_net_capable(skb, CAP_NET_ADMIN); + const void *bytecode, *bc; + int bytecode_len, len; + + if (!attr || nla_len(attr) < sizeof(struct inet_diag_bc_op)) + return -EINVAL; + + bytecode = bc = nla_data(attr); + len = bytecode_len = nla_len(attr); while (len > 0) { int min_len = sizeof(struct inet_diag_bc_op); @@ -732,6 +762,12 @@ static int inet_diag_bc_audit(const void *bytecode, int bytecode_len) if (!valid_port_comparison(bc, len, &min_len)) return -EINVAL; break; + case INET_DIAG_BC_MARK_COND: + if (!net_admin) + return -EPERM; + if (!valid_markcond(bc, len, &min_len)) + return -EINVAL; + break; case INET_DIAG_BC_AUTO: case INET_DIAG_BC_JMP: case INET_DIAG_BC_NOP: @@ -1020,13 +1056,13 @@ static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh) if (nlh->nlmsg_flags & NLM_F_DUMP) { if (nlmsg_attrlen(nlh, hdrlen)) { struct nlattr *attr; + int err; attr = nlmsg_find_attr(nlh, hdrlen, INET_DIAG_REQ_BYTECODE); - if (!attr || - nla_len(attr) < sizeof(struct inet_diag_bc_op) || - inet_diag_bc_audit(nla_data(attr), nla_len(attr))) - return -EINVAL; + err = inet_diag_bc_audit(attr, skb); + if (err) + return err; } { struct netlink_dump_control c = { @@ -1051,13 +1087,13 @@ static int inet_diag_handler_cmd(struct sk_buff *skb, struct nlmsghdr *h) h->nlmsg_flags & 
NLM_F_DUMP) { if (nlmsg_attrlen(h, hdrlen)) { struct nlattr *attr; + int err; attr = nlmsg_find_attr(h, hdrlen, INET_DIAG_REQ_BYTECODE); - if (!attr || - nla_len(attr) < sizeof(struct inet_diag_bc_op) || - inet_diag_bc_audit(nla_data(attr), nla_len(attr))) - return -EINVAL; + err = inet_diag_bc_audit(attr, skb); + if (err) + return err; } { struct netlink_dump_control c = { diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index 1d71c40eaaf3..071a785c65eb 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -85,7 +85,6 @@ /* Define the timeout for waiting for a DHCP/BOOTP/RARP reply */ #define CONF_OPEN_RETRIES 2 /* (Re)open devices twice */ #define CONF_SEND_RETRIES 6 /* Send six requests per open */ -#define CONF_INTER_TIMEOUT (HZ) /* Inter-device timeout: 1 second */ #define CONF_BASE_TIMEOUT (HZ*2) /* Initial timeout: 2 seconds */ #define CONF_TIMEOUT_RANDOM (HZ) /* Maximum amount of randomization */ #define CONF_TIMEOUT_MULT *7/4 /* Rate of timeout growth */ @@ -188,7 +187,7 @@ struct ic_device { }; static struct ic_device *ic_first_dev __initdata; /* List of open device */ -static struct net_device *ic_dev __initdata; /* Selected device */ +static struct ic_device *ic_dev __initdata; /* Selected device */ static bool __init ic_is_init_dev(struct net_device *dev) { @@ -307,7 +306,7 @@ static void __init ic_close_devs(void) while ((d = next)) { next = d->next; dev = d->dev; - if (dev != ic_dev && !netdev_uses_dsa(dev)) { + if ((!ic_dev || dev != ic_dev->dev) && !netdev_uses_dsa(dev)) { pr_debug("IP-Config: Downing %s\n", dev->name); dev_change_flags(dev, d->flags); } @@ -372,7 +371,7 @@ static int __init ic_setup_if(void) int err; memset(&ir, 0, sizeof(ir)); - strcpy(ir.ifr_ifrn.ifrn_name, ic_dev->name); + strcpy(ir.ifr_ifrn.ifrn_name, ic_dev->dev->name); set_sockaddr(sin, ic_myaddr, 0); if ((err = ic_devinet_ioctl(SIOCSIFADDR, &ir)) < 0) { pr_err("IP-Config: Unable to set interface address (%d)\n", @@ -396,7 +395,7 @@ static int __init ic_setup_if(void) * out, we'll try to muddle along. */ if (ic_dev_mtu != 0) { - strcpy(ir.ifr_name, ic_dev->name); + strcpy(ir.ifr_name, ic_dev->dev->name); ir.ifr_mtu = ic_dev_mtu; if ((err = ic_dev_ioctl(SIOCSIFMTU, &ir)) < 0) pr_err("IP-Config: Unable to set interface mtu to %d (%d)\n", @@ -568,7 +567,7 @@ ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt goto drop_unlock; /* We have a winner! */ - ic_dev = dev; + ic_dev = d; if (ic_myaddr == NONE) ic_myaddr = tip; ic_servaddr = sip; @@ -655,8 +654,6 @@ static struct packet_type bootp_packet_type __initdata = { .func = ic_bootp_recv, }; -static __be32 ic_dev_xid; /* Device under configuration */ - /* * Initialize DHCP/BOOTP extension fields in the request. */ @@ -666,14 +663,14 @@ static const u8 ic_bootp_cookie[4] = { 99, 130, 83, 99 }; #ifdef IPCONFIG_DHCP static void __init -ic_dhcp_init_options(u8 *options) +ic_dhcp_init_options(u8 *options, struct ic_device *d) { u8 mt = ((ic_servaddr == NONE) ? 
DHCPDISCOVER : DHCPREQUEST); u8 *e = options; int len; - pr_debug("DHCP: Sending message type %d\n", mt); + pr_debug("DHCP: Sending message type %d (%s)\n", mt, d->dev->name); memcpy(e, ic_bootp_cookie, 4); /* RFC1048 Magic Cookie */ e += 4; @@ -857,7 +854,7 @@ static void __init ic_bootp_send_if(struct ic_device *d, unsigned long jiffies_d /* add DHCP options or BOOTP extensions */ #ifdef IPCONFIG_DHCP if (ic_proto_enabled & IC_USE_DHCP) - ic_dhcp_init_options(b->exten); + ic_dhcp_init_options(b->exten, d); else #endif ic_bootp_init_ext(b->exten); @@ -1033,14 +1030,8 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str /* Is it a reply to our BOOTP request? */ if (b->op != BOOTP_REPLY || b->xid != d->xid) { - net_err_ratelimited("DHCP/BOOTP: Reply not for us, op[%x] xid[%x]\n", - b->op, b->xid); - goto drop_unlock; - } - - /* Is it a reply for the device we are configuring? */ - if (b->xid != ic_dev_xid) { - net_err_ratelimited("DHCP/BOOTP: Ignoring delayed packet\n"); + net_err_ratelimited("DHCP/BOOTP: Reply not for us on %s, op[%x] xid[%x]\n", + d->dev->name, b->op, b->xid); goto drop_unlock; } @@ -1075,7 +1066,7 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str } } - pr_debug("DHCP: Got message type %d\n", mt); + pr_debug("DHCP: Got message type %d (%s)\n", mt, d->dev->name); switch (mt) { case DHCPOFFER: @@ -1130,7 +1121,7 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str } /* We have a winner! */ - ic_dev = dev; + ic_dev = d; ic_myaddr = b->your_ip; ic_servaddr = b->server_ip; ic_addrservaddr = b->iph.saddr; @@ -1225,9 +1216,6 @@ static int __init ic_dynamic(void) timeout = CONF_BASE_TIMEOUT + (timeout % (unsigned int) CONF_TIMEOUT_RANDOM); for (;;) { #ifdef IPCONFIG_BOOTP - /* Track the device we are configuring */ - ic_dev_xid = d->xid; - if (do_bootp && (d->able & IC_BOOTP)) ic_bootp_send_if(d, jiffies - start_jiffies); #endif @@ -1236,15 +1224,19 @@ static int __init ic_dynamic(void) ic_rarp_send_if(d); #endif - jiff = jiffies + (d->next ? CONF_INTER_TIMEOUT : timeout); - while (time_before(jiffies, jiff) && !ic_got_reply) - schedule_timeout_uninterruptible(1); + if (!d->next) { + jiff = jiffies + timeout; + while (time_before(jiffies, jiff) && !ic_got_reply) + schedule_timeout_uninterruptible(1); + } #ifdef IPCONFIG_DHCP /* DHCP isn't done until we get a DHCPACK. */ if ((ic_got_reply & IC_BOOTP) && (ic_proto_enabled & IC_USE_DHCP) && ic_dhcp_msgtype != DHCPACK) { ic_got_reply = 0; + /* continue on device that got the reply */ + d = ic_dev; pr_cont(","); continue; } @@ -1487,7 +1479,7 @@ static int __init ip_auto_config(void) #endif /* IPCONFIG_DYNAMIC */ } else { /* Device selected manually or only one device -> use it */ - ic_dev = ic_first_dev->dev; + ic_dev = ic_first_dev; } addr = root_nfs_parse_addr(root_server_path); @@ -1501,14 +1493,6 @@ static int __init ip_auto_config(void) return -1; /* - * Close all network devices except the device we've - * autoconfigured and set up routes. - */ - ic_close_devs(); - if (ic_setup_if() < 0 || ic_setup_routes() < 0) - return -1; - - /* * Record which protocol was actually used. 
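Two behavioural changes in ipconfig are worth restating: requests now go out on all candidate devices back to back, with a single timeout wait after the last one (the fixed one-second inter-device pause is gone), and when a DHCP round yields an offer but no DHCPACK yet, the next round resumes on the device that produced the reply. In outline (helper names invented for illustration):

	static void ic_one_round(struct ic_device *first, unsigned long timeout)
	{
		struct ic_device *d;

		for (d = first; d; d = d->next) {
			ic_send_request(d);		/* BOOTP/DHCP/RARP */
			if (!d->next)			/* single wait, at the end */
				ic_wait_for_reply(timeout);
		}
	}

Separately, ic_dev now points at the selected struct ic_device rather than the bare net_device, and ic_close_devs() moves after ic_setup_if()/ic_setup_routes() so the chosen device is still open while it is being configured.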
*/ #ifdef IPCONFIG_DYNAMIC @@ -1522,7 +1506,7 @@ static int __init ip_auto_config(void) pr_info("IP-Config: Complete:\n"); pr_info(" device=%s, hwaddr=%*phC, ipaddr=%pI4, mask=%pI4, gw=%pI4\n", - ic_dev->name, ic_dev->addr_len, ic_dev->dev_addr, + ic_dev->dev->name, ic_dev->dev->addr_len, ic_dev->dev->dev_addr, &ic_myaddr, &ic_netmask, &ic_gateway); pr_info(" host=%s, domain=%s, nis-domain=%s\n", utsname()->nodename, ic_domain, utsname()->domainname); @@ -1542,7 +1526,18 @@ static int __init ip_auto_config(void) pr_cont("\n"); #endif /* !SILENT */ - return 0; + /* + * Close all network devices except the device we've + * autoconfigured and set up routes. + */ + if (ic_setup_if() < 0 || ic_setup_routes() < 0) + err = -1; + else + err = 0; + + ic_close_devs(); + + return err; } late_initcall(ip_auto_config); diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 9f665b63a927..1ed015e4bc79 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -257,6 +257,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPSpuriousRTOs", LINUX_MIB_TCPSPURIOUSRTOS), SNMP_MIB_ITEM("TCPMD5NotFound", LINUX_MIB_TCPMD5NOTFOUND), SNMP_MIB_ITEM("TCPMD5Unexpected", LINUX_MIB_TCPMD5UNEXPECTED), + SNMP_MIB_ITEM("TCPMD5Failure", LINUX_MIB_TCPMD5FAILURE), SNMP_MIB_ITEM("TCPSackShifted", LINUX_MIB_SACKSHIFTED), SNMP_MIB_ITEM("TCPSackMerged", LINUX_MIB_SACKMERGED), SNMP_MIB_ITEM("TCPSackShiftFallback", LINUX_MIB_SACKSHIFTFALLBACK), diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index ffbb218de520..77311a92275c 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1570,6 +1570,12 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, } EXPORT_SYMBOL(tcp_read_sock); +int tcp_peek_len(struct socket *sock) +{ + return tcp_inq(sock->sk); +} +EXPORT_SYMBOL(tcp_peek_len); + /* * This routine copies from a sock struct into the user buffer. * @@ -3092,23 +3098,6 @@ struct tcp_md5sig_pool *tcp_get_md5sig_pool(void) } EXPORT_SYMBOL(tcp_get_md5sig_pool); -int tcp_md5_hash_header(struct tcp_md5sig_pool *hp, - const struct tcphdr *th) -{ - struct scatterlist sg; - struct tcphdr hdr; - - /* We are not allowed to change tcphdr, make a local copy */ - memcpy(&hdr, th, sizeof(hdr)); - hdr.check = 0; - - /* options aren't included in the hash */ - sg_init_one(&sg, &hdr, sizeof(hdr)); - ahash_request_set_crypt(hp->md5_req, &sg, NULL, sizeof(hdr)); - return crypto_ahash_update(hp->md5_req); -} -EXPORT_SYMBOL(tcp_md5_hash_header); - int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp, const struct sk_buff *skb, unsigned int header_len) { diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 3ebf45b38bc3..8cd02c0b056c 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4392,12 +4392,9 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb, if (tcp_prune_queue(sk) < 0) return -1; - if (!sk_rmem_schedule(sk, skb, size)) { + while (!sk_rmem_schedule(sk, skb, size)) { if (!tcp_prune_ofo_queue(sk)) return -1; - - if (!sk_rmem_schedule(sk, skb, size)) - return -1; } } return 0; @@ -4874,29 +4871,41 @@ static void tcp_collapse_ofo_queue(struct sock *sk) } /* - * Purge the out-of-order queue. - * Return true if queue was pruned. + * Clean the out-of-order queue to make room. + * We drop high sequences packets to : + * 1) Let a chance for holes to be filled. + * 2) not add too big latencies if thousands of packets sit there. + * (But if application shrinks SO_RCVBUF, we could still end up + * freeing whole queue here) + * + * Return true if queue has shrunk. 
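The new tcp_peek_len() above simply exposes tcp_inq() — the number of bytes already queued for reading — through struct proto_ops, so stream consumers such as the strparser added elsewhere in this series can size their reads without consuming data. The userspace analogue is the FIONREAD ioctl; a self-contained demo on a socketpair:

    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <sys/socket.h>
    #include <unistd.h>

    int main(void)
    {
            int sv[2], queued = 0;

            if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) < 0)
                    return 1;
            write(sv[0], "hello", 5);
            if (ioctl(sv[1], FIONREAD, &queued) == 0)       /* peek, don't consume */
                    printf("%d bytes queued\n", queued);    /* prints 5 */
            close(sv[0]);
            close(sv[1]);
            return 0;
    }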
*/ static bool tcp_prune_ofo_queue(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - bool res = false; + struct sk_buff *skb; - if (!skb_queue_empty(&tp->out_of_order_queue)) { - NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED); - __skb_queue_purge(&tp->out_of_order_queue); + if (skb_queue_empty(&tp->out_of_order_queue)) + return false; - /* Reset SACK state. A conforming SACK implementation will - * do the same at a timeout based retransmit. When a connection - * is in a sad state like this, we care only about integrity - * of the connection not performance. - */ - if (tp->rx_opt.sack_ok) - tcp_sack_reset(&tp->rx_opt); + NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED); + + while ((skb = __skb_dequeue_tail(&tp->out_of_order_queue)) != NULL) { + tcp_drop(sk, skb); sk_mem_reclaim(sk); - res = true; + if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf && + !tcp_under_memory_pressure(sk)) + break; } - return res; + + /* Reset SACK state. A conforming SACK implementation will + * do the same at a timeout based retransmit. When a connection + * is in a sad state like this, we care only about integrity + * of the connection not performance. + */ + if (tp->rx_opt.sack_ok) + tcp_sack_reset(&tp->rx_opt); + return true; } /* Reduce allocated memory if we can, trying to get diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 7158d4f8dae4..a75bf48d7950 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1175,6 +1175,7 @@ static bool tcp_v4_inbound_md5_hash(const struct sock *sk, NULL, skb); if (genhash || memcmp(hash_location, newhash, 16) != 0) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE); net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n", &iph->saddr, ntohs(th->source), &iph->daddr, ntohs(th->dest), @@ -1537,6 +1538,34 @@ bool tcp_prequeue(struct sock *sk, struct sk_buff *skb) } EXPORT_SYMBOL(tcp_prequeue); +bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) +{ + u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf; + + /* Only socket owner can try to collapse/prune rx queues + * to reduce memory overhead, so add a little headroom here. + * Few sockets backlog are possibly concurrently non empty. + */ + limit += 64*1024; + + /* In case all data was pulled from skb frags (in __pskb_pull_tail()), + * we can fix skb->truesize to its real value to avoid future drops. + * This is valid because skb is not yet charged to the socket. + * It has been noticed pure SACK packets were sometimes dropped + * (if cooked by drivers without copybreak feature). 
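The rewritten tcp_prune_ofo_queue() above stops purging the whole out-of-order queue: it drops entries from the tail — the highest sequence numbers — only until rmem is back under sk_rcvbuf and memory pressure clears, leaving earlier segments in place so holes can still be filled. The "trim newest until under budget" loop, reduced to a standalone sketch with made-up sizes:

    #include <stdio.h>

    int main(void)
    {
            unsigned truesize[] = { 300, 300, 300, 300 };   /* queued, oldest first */
            unsigned used = 1200, budget = 1000;
            int tail = 4;

            while (tail > 0 && used > budget) {             /* drop newest first */
                    tail--;
                    used -= truesize[tail];
                    printf("dropped entry %d, used now %u\n", tail, used);
            }
            return 0;
    }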
+ */ + if (!skb->data_len) + skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); + + if (unlikely(sk_add_backlog(sk, skb, limit))) { + bh_unlock_sock(sk); + __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP); + return true; + } + return false; +} +EXPORT_SYMBOL(tcp_add_backlog); + /* * From tcp_input.c */ @@ -1608,6 +1637,7 @@ process: sk = req->rsk_listener; if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) { + sk_drops_add(sk, skb); reqsk_put(req); goto discard_it; } @@ -1666,10 +1696,7 @@ process: if (!sock_owned_by_user(sk)) { if (!tcp_prequeue(sk, skb)) ret = tcp_v4_do_rcv(sk, skb); - } else if (unlikely(sk_add_backlog(sk, skb, - sk->sk_rcvbuf + sk->sk_sndbuf))) { - bh_unlock_sock(sk); - __NET_INC_STATS(net, LINUX_MIB_TCPBACKLOGDROP); + } else if (tcp_add_backlog(sk, skb)) { goto discard_and_relse; } bh_unlock_sock(sk); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index bdaef7fd6e47..8b45794eb6b2 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2776,7 +2776,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk) max_segs = tcp_tso_autosize(sk, tcp_current_mss(sk)); tcp_for_write_queue_from(skb, sk) { - __u8 sacked = TCP_SKB_CB(skb)->sacked; + __u8 sacked; int segs; if (skb == tcp_send_head(sk)) @@ -2788,6 +2788,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk) segs = tp->snd_cwnd - tcp_packets_in_flight(tp); if (segs <= 0) return; + sacked = TCP_SKB_CB(skb)->sacked; /* In case tcp_shift_skb_data() have aggregated large skbs, * we need to make sure not sending too bigs TSO packets */ diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 5fdcb8d108d4..058c31286ce1 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -114,6 +114,7 @@ #include <net/busy_poll.h> #include "udp_impl.h" #include <net/sock_reuseport.h> +#include <net/addrconf.h> struct udp_table udp_table __read_mostly; EXPORT_SYMBOL(udp_table); @@ -2192,6 +2193,20 @@ unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait) } EXPORT_SYMBOL(udp_poll); +int udp_abort(struct sock *sk, int err) +{ + lock_sock(sk); + + sk->sk_err = err; + sk->sk_error_report(sk); + udp_disconnect(sk, 0); + + release_sock(sk); + + return 0; +} +EXPORT_SYMBOL_GPL(udp_abort); + struct proto udp_prot = { .name = "UDP", .owner = THIS_MODULE, @@ -2221,7 +2236,7 @@ struct proto udp_prot = { .compat_setsockopt = compat_udp_setsockopt, .compat_getsockopt = compat_udp_getsockopt, #endif - .clear_sk = sk_prot_clear_portaddr_nulls, + .diag_destroy = udp_abort, }; EXPORT_SYMBOL(udp_prot); diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c index 3d5ccf4b1412..8a9f6e535caa 100644 --- a/net/ipv4/udp_diag.c +++ b/net/ipv4/udp_diag.c @@ -165,12 +165,88 @@ static void udp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, r->idiag_wqueue = sk_wmem_alloc_get(sk); } +#ifdef CONFIG_INET_DIAG_DESTROY +static int __udp_diag_destroy(struct sk_buff *in_skb, + const struct inet_diag_req_v2 *req, + struct udp_table *tbl) +{ + struct net *net = sock_net(in_skb->sk); + struct sock *sk; + int err; + + rcu_read_lock(); + + if (req->sdiag_family == AF_INET) + sk = __udp4_lib_lookup(net, + req->id.idiag_dst[0], req->id.idiag_dport, + req->id.idiag_src[0], req->id.idiag_sport, + req->id.idiag_if, tbl, NULL); +#if IS_ENABLED(CONFIG_IPV6) + else if (req->sdiag_family == AF_INET6) { + if (ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_dst) && + ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_src)) + sk = __udp4_lib_lookup(net, + req->id.idiag_dst[0], req->id.idiag_dport, + req->id.idiag_src[0], req->id.idiag_sport, 
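tcp_add_backlog() above folds the admission check into one helper: the budget is sk_rcvbuf + sk_sndbuf plus 64 KB of headroom, applied after optionally shrinking a fully-linear skb's truesize to its real buffer size so pure ACK/SACK packets are not dropped for memory they no longer hold. A hedged model of the accounting:

    #include <stdbool.h>
    #include <stdio.h>

    struct sk_model {
            unsigned rcvbuf, sndbuf;
            unsigned backlog;       /* truesize already charged to the backlog */
    };

    static bool add_backlog(struct sk_model *sk, unsigned truesize)
    {
            unsigned limit = sk->rcvbuf + sk->sndbuf + 64 * 1024;   /* headroom */

            if (sk->backlog + truesize > limit)
                    return false;   /* caller drops and bumps TCPBACKLOGDROP */
            sk->backlog += truesize;
            return true;
    }

    int main(void)
    {
            struct sk_model sk = { 87380, 16384, 0 };

            printf("%s\n", add_backlog(&sk, 2048) ? "queued" : "dropped");
            return 0;
    }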
+ req->id.idiag_if, tbl, NULL); + + else + sk = __udp6_lib_lookup(net, + (struct in6_addr *)req->id.idiag_dst, + req->id.idiag_dport, + (struct in6_addr *)req->id.idiag_src, + req->id.idiag_sport, + req->id.idiag_if, tbl, NULL); + } +#endif + else { + rcu_read_unlock(); + return -EINVAL; + } + + if (sk && !atomic_inc_not_zero(&sk->sk_refcnt)) + sk = NULL; + + rcu_read_unlock(); + + if (!sk) + return -ENOENT; + + if (sock_diag_check_cookie(sk, req->id.idiag_cookie)) { + sock_put(sk); + return -ENOENT; + } + + err = sock_diag_destroy(sk, ECONNABORTED); + + sock_put(sk); + + return err; +} + +static int udp_diag_destroy(struct sk_buff *in_skb, + const struct inet_diag_req_v2 *req) +{ + return __udp_diag_destroy(in_skb, req, &udp_table); +} + +static int udplite_diag_destroy(struct sk_buff *in_skb, + const struct inet_diag_req_v2 *req) +{ + return __udp_diag_destroy(in_skb, req, &udplite_table); +} + +#endif + static const struct inet_diag_handler udp_diag_handler = { .dump = udp_diag_dump, .dump_one = udp_diag_dump_one, .idiag_get_info = udp_diag_get_info, .idiag_type = IPPROTO_UDP, .idiag_info_size = 0, +#ifdef CONFIG_INET_DIAG_DESTROY + .destroy = udp_diag_destroy, +#endif }; static void udplite_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, @@ -192,6 +268,9 @@ static const struct inet_diag_handler udplite_diag_handler = { .idiag_get_info = udp_diag_get_info, .idiag_type = IPPROTO_UDPLITE, .idiag_info_size = 0, +#ifdef CONFIG_INET_DIAG_DESTROY + .destroy = udplite_diag_destroy, +#endif }; static int __init udp_diag_init(void) diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c index 2eea073e27ef..af817158d830 100644 --- a/net/ipv4/udplite.c +++ b/net/ipv4/udplite.c @@ -60,7 +60,6 @@ struct proto udplite_prot = { .compat_setsockopt = compat_udp_setsockopt, .compat_getsockopt = compat_udp_getsockopt, #endif - .clear_sk = sk_prot_clear_portaddr_nulls, }; EXPORT_SYMBOL(udplite_prot); diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index b454055ba625..46ad699937fd 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -545,6 +545,8 @@ const struct proto_ops inet6_stream_ops = { .mmap = sock_no_mmap, .sendpage = inet_sendpage, .splice_read = tcp_splice_read, + .read_sock = tcp_read_sock, + .peek_len = tcp_peek_len, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_sock_common_setsockopt, .compat_getsockopt = compat_sock_common_getsockopt, diff --git a/net/ipv6/ila/ila_common.c b/net/ipv6/ila/ila_common.c index ec9efbcdad35..aba0998ddbfb 100644 --- a/net/ipv6/ila/ila_common.c +++ b/net/ipv6/ila/ila_common.c @@ -172,6 +172,5 @@ static void __exit ila_fini(void) module_init(ila_init); module_exit(ila_fini); -MODULE_ALIAS_RTNL_LWT(ILA); MODULE_AUTHOR("Tom Herbert <tom@herbertland.com>"); MODULE_LICENSE("GPL"); diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 704274cbd495..397e1ed3daa3 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -61,12 +61,12 @@ static bool log_ecn_error = true; module_param(log_ecn_error, bool, 0644); MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); -#define HASH_SIZE_SHIFT 5 -#define HASH_SIZE (1 << HASH_SIZE_SHIFT) +#define IP6_GRE_HASH_SIZE_SHIFT 5 +#define IP6_GRE_HASH_SIZE (1 << IP6_GRE_HASH_SIZE_SHIFT) static int ip6gre_net_id __read_mostly; struct ip6gre_net { - struct ip6_tnl __rcu *tunnels[4][HASH_SIZE]; + struct ip6_tnl __rcu *tunnels[4][IP6_GRE_HASH_SIZE]; struct net_device *fb_tunnel_dev; }; @@ -96,12 +96,12 @@ static void ip6gre_tnl_link_config(struct ip6_tnl *t, int set_mtu); will match 
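__udp_diag_destroy() above routes an AF_INET6 request through the IPv4 lookup when both endpoints are v4-mapped (::ffff:a.b.c.d). The test behind ipv6_addr_v4mapped() amounts to a 12-byte prefix compare; a standalone version:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* ::ffff:a.b.c.d — ten zero bytes, then 0xff 0xff, then the IPv4 address */
    static bool addr_is_v4mapped(const uint8_t a[16])
    {
            static const uint8_t prefix[12] =
                    { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xff, 0xff };

            return memcmp(a, prefix, sizeof(prefix)) == 0;
    }

    int main(void)
    {
            uint8_t mapped[16] =
                    { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xff, 0xff, 192, 0, 2, 1 };

            printf("%s\n", addr_is_v4mapped(mapped) ? "v4-mapped" : "native v6");
            return 0;
    }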
fallback tunnel. */ -#define HASH_KEY(key) (((__force u32)key^((__force u32)key>>4))&(HASH_SIZE - 1)) +#define HASH_KEY(key) (((__force u32)key^((__force u32)key>>4))&(IP6_GRE_HASH_SIZE - 1)) static u32 HASH_ADDR(const struct in6_addr *addr) { u32 hash = ipv6_addr_hash(addr); - return hash_32(hash, HASH_SIZE_SHIFT); + return hash_32(hash, IP6_GRE_HASH_SIZE_SHIFT); } #define tunnels_r_l tunnels[3] @@ -1087,7 +1087,7 @@ static void ip6gre_destroy_tunnels(struct net *net, struct list_head *head) for (prio = 0; prio < 4; prio++) { int h; - for (h = 0; h < HASH_SIZE; h++) { + for (h = 0; h < IP6_GRE_HASH_SIZE; h++) { struct ip6_tnl *t; t = rtnl_dereference(ign->tunnels[prio][h]); diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 7b0481e3738f..2050217df565 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -64,8 +64,8 @@ MODULE_LICENSE("GPL"); MODULE_ALIAS_RTNL_LINK("ip6tnl"); MODULE_ALIAS_NETDEV("ip6tnl0"); -#define HASH_SIZE_SHIFT 5 -#define HASH_SIZE (1 << HASH_SIZE_SHIFT) +#define IP6_TUNNEL_HASH_SIZE_SHIFT 5 +#define IP6_TUNNEL_HASH_SIZE (1 << IP6_TUNNEL_HASH_SIZE_SHIFT) static bool log_ecn_error = true; module_param(log_ecn_error, bool, 0644); @@ -75,7 +75,7 @@ static u32 HASH(const struct in6_addr *addr1, const struct in6_addr *addr2) { u32 hash = ipv6_addr_hash(addr1) ^ ipv6_addr_hash(addr2); - return hash_32(hash, HASH_SIZE_SHIFT); + return hash_32(hash, IP6_TUNNEL_HASH_SIZE_SHIFT); } static int ip6_tnl_dev_init(struct net_device *dev); @@ -87,7 +87,7 @@ struct ip6_tnl_net { /* the IPv6 tunnel fallback device */ struct net_device *fb_tnl_dev; /* lists for storing tunnels in use */ - struct ip6_tnl __rcu *tnls_r_l[HASH_SIZE]; + struct ip6_tnl __rcu *tnls_r_l[IP6_TUNNEL_HASH_SIZE]; struct ip6_tnl __rcu *tnls_wc[1]; struct ip6_tnl __rcu **tnls[2]; }; @@ -2031,7 +2031,7 @@ static void __net_exit ip6_tnl_destroy_tunnels(struct net *net) if (dev->rtnl_link_ops == &ip6_link_ops) unregister_netdevice_queue(dev, &list); - for (h = 0; h < HASH_SIZE; h++) { + for (h = 0; h < IP6_TUNNEL_HASH_SIZE; h++) { t = rtnl_dereference(ip6n->tnls_r_l[h]); while (t) { /* If dev is in the same netns, it has already diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index d90a11f14040..cc7e05898307 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -50,14 +50,14 @@ #include <net/net_namespace.h> #include <net/netns/generic.h> -#define HASH_SIZE_SHIFT 5 -#define HASH_SIZE (1 << HASH_SIZE_SHIFT) +#define IP6_VTI_HASH_SIZE_SHIFT 5 +#define IP6_VTI_HASH_SIZE (1 << IP6_VTI_HASH_SIZE_SHIFT) static u32 HASH(const struct in6_addr *addr1, const struct in6_addr *addr2) { u32 hash = ipv6_addr_hash(addr1) ^ ipv6_addr_hash(addr2); - return hash_32(hash, HASH_SIZE_SHIFT); + return hash_32(hash, IP6_VTI_HASH_SIZE_SHIFT); } static int vti6_dev_init(struct net_device *dev); @@ -69,7 +69,7 @@ struct vti6_net { /* the vti6 tunnel fallback device */ struct net_device *fb_tnl_dev; /* lists for storing tunnels in use */ - struct ip6_tnl __rcu *tnls_r_l[HASH_SIZE]; + struct ip6_tnl __rcu *tnls_r_l[IP6_VTI_HASH_SIZE]; struct ip6_tnl __rcu *tnls_wc[1]; struct ip6_tnl __rcu **tnls[2]; }; @@ -1040,7 +1040,7 @@ static void __net_exit vti6_destroy_tunnels(struct vti6_net *ip6n) struct ip6_tnl *t; LIST_HEAD(list); - for (h = 0; h < HASH_SIZE; h++) { + for (h = 0; h < IP6_VTI_HASH_SIZE; h++) { t = rtnl_dereference(ip6n->tnls_r_l[h]); while (t) { unregister_netdevice_queue(t->dev, &list); diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index d64ee7e83664..75c1fc54f188 100644 --- a/net/ipv6/mcast.c +++ 
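The IP6_GRE_/IP6_TUNNEL_/IP6_VTI_HASH_SIZE renames above exist to stop the tunnel drivers from clashing with the generic HASH_SIZE name used elsewhere in the tree; the bucket math is untouched: hash_32() folds a 32-bit hash down to the top "shift" bits. A minimal stand-in for that bucketing:

    #include <stdint.h>
    #include <stdio.h>

    #define TUNNEL_HASH_SHIFT 5
    #define TUNNEL_HASH_SIZE  (1u << TUNNEL_HASH_SHIFT)

    /* Same shape as the kernel's hash_32(): multiply by a 32-bit golden-ratio
     * constant, keep the top TUNNEL_HASH_SHIFT bits. */
    static uint32_t bucket_of(uint32_t hash)
    {
            return (hash * 0x61C88647u) >> (32 - TUNNEL_HASH_SHIFT);
    }

    int main(void)
    {
            printf("bucket %u of %u\n", bucket_of(0xdeadbeef), TUNNEL_HASH_SIZE);
            return 0;
    }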
b/net/ipv6/mcast.c @@ -1739,6 +1739,15 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc, continue; } + /* Based on RFC3810 6.1. Should not send source-list change + * records when there is a filter mode change. + */ + if (((gdeleted && pmc->mca_sfmode == MCAST_EXCLUDE) || + (!gdeleted && pmc->mca_crcount)) && + (type == MLD2_ALLOW_NEW_SOURCES || + type == MLD2_BLOCK_OLD_SOURCES) && psf->sf_crcount) + goto decrease_sf_crcount; + /* clear marks on query responses */ if (isquery) psf->sf_gsresp = 0; @@ -1766,6 +1775,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc, scount++; stotal++; if ((type == MLD2_ALLOW_NEW_SOURCES || type == MLD2_BLOCK_OLD_SOURCES) && psf->sf_crcount) { +decrease_sf_crcount: psf->sf_crcount--; if ((sdeleted || gdeleted) && psf->sf_crcount == 0) { if (psf_prev) diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 182b6a9be29d..b1cdf8009d29 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -62,7 +62,7 @@ For comments look at net/ipv4/ip_gre.c --ANK */ -#define HASH_SIZE 16 +#define IP6_SIT_HASH_SIZE 16 #define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF) static bool log_ecn_error = true; @@ -78,9 +78,9 @@ static struct rtnl_link_ops sit_link_ops __read_mostly; static int sit_net_id __read_mostly; struct sit_net { - struct ip_tunnel __rcu *tunnels_r_l[HASH_SIZE]; - struct ip_tunnel __rcu *tunnels_r[HASH_SIZE]; - struct ip_tunnel __rcu *tunnels_l[HASH_SIZE]; + struct ip_tunnel __rcu *tunnels_r_l[IP6_SIT_HASH_SIZE]; + struct ip_tunnel __rcu *tunnels_r[IP6_SIT_HASH_SIZE]; + struct ip_tunnel __rcu *tunnels_l[IP6_SIT_HASH_SIZE]; struct ip_tunnel __rcu *tunnels_wc[1]; struct ip_tunnel __rcu **tunnels[4]; @@ -1126,7 +1126,7 @@ static int ipip6_tunnel_update_6rd(struct ip_tunnel *t, } #endif -bool ipip6_valid_ip_proto(u8 ipproto) +static bool ipip6_valid_ip_proto(u8 ipproto) { return ipproto == IPPROTO_IPV6 || ipproto == IPPROTO_IPIP || @@ -1783,7 +1783,7 @@ static void __net_exit sit_destroy_tunnels(struct net *net, for (prio = 1; prio < 4; prio++) { int h; - for (h = 0; h < HASH_SIZE; h++) { + for (h = 0; h < IP6_SIT_HASH_SIZE; h++) { struct ip_tunnel *t; t = rtnl_dereference(sitn->tunnels[prio][h]); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 94f4f89d73e7..04529a3d42cb 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -671,6 +671,7 @@ static bool tcp_v6_inbound_md5_hash(const struct sock *sk, NULL, skb); if (genhash || memcmp(hash_location, newhash, 16) != 0) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE); net_info_ratelimited("MD5 Hash %s for [%pI6c]:%u->[%pI6c]:%u\n", genhash ? 
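The mcast.c fix above follows RFC 3810 section 6.1: while a filter-mode change is still being retransmitted, ALLOW_NEW_SOURCES/BLOCK_OLD_SOURCES source-list records are suppressed and only their retransmit counters decremented. The guard condition, restated as a standalone predicate (field names mirror the kernel's; the record-type values follow RFC 3810):

    #include <stdbool.h>
    #include <stdio.h>

    enum { MLD2_ALLOW_NEW_SOURCES = 5, MLD2_BLOCK_OLD_SOURCES = 6 };
    enum { MCAST_EXCLUDE = 0 };

    static bool suppress_source_change(bool gdeleted, int mca_sfmode,
                                       int mca_crcount, int type, int sf_crcount)
    {
            bool mode_change_pending = (gdeleted && mca_sfmode == MCAST_EXCLUDE) ||
                                       (!gdeleted && mca_crcount);

            return mode_change_pending &&
                   (type == MLD2_ALLOW_NEW_SOURCES ||
                    type == MLD2_BLOCK_OLD_SOURCES) && sf_crcount;
    }

    int main(void)
    {
            printf("%d\n", suppress_source_change(false, MCAST_EXCLUDE, 2,
                                                  MLD2_ALLOW_NEW_SOURCES, 1)); /* 1 */
            return 0;
    }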
"failed" : "mismatch", &ip6h->saddr, ntohs(th->source), @@ -1415,6 +1416,7 @@ process: sk = req->rsk_listener; tcp_v6_fill_cb(skb, hdr, th); if (tcp_v6_inbound_md5_hash(sk, skb)) { + sk_drops_add(sk, skb); reqsk_put(req); goto discard_it; } @@ -1471,10 +1473,7 @@ process: if (!sock_owned_by_user(sk)) { if (!tcp_prequeue(sk, skb)) ret = tcp_v6_do_rcv(sk, skb); - } else if (unlikely(sk_add_backlog(sk, skb, - sk->sk_rcvbuf + sk->sk_sndbuf))) { - bh_unlock_sock(sk); - __NET_INC_STATS(net, LINUX_MIB_TCPBACKLOGDROP); + } else if (tcp_add_backlog(sk, skb)) { goto discard_and_relse; } bh_unlock_sock(sk); @@ -1868,17 +1867,6 @@ void tcp6_proc_exit(struct net *net) } #endif -static void tcp_v6_clear_sk(struct sock *sk, int size) -{ - struct inet_sock *inet = inet_sk(sk); - - /* we do not want to clear pinet6 field, because of RCU lookups */ - sk_prot_clear_nulls(sk, offsetof(struct inet_sock, pinet6)); - - size -= offsetof(struct inet_sock, pinet6) + sizeof(inet->pinet6); - memset(&inet->pinet6 + 1, 0, size); -} - struct proto tcpv6_prot = { .name = "TCPv6", .owner = THIS_MODULE, @@ -1920,7 +1908,6 @@ struct proto tcpv6_prot = { .compat_setsockopt = compat_tcp_setsockopt, .compat_getsockopt = compat_tcp_getsockopt, #endif - .clear_sk = tcp_v6_clear_sk, .diag_destroy = tcp_abort, }; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 19ac3a1c308d..9aa7c1c7a9ce 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1424,17 +1424,6 @@ void udp6_proc_exit(struct net *net) } #endif /* CONFIG_PROC_FS */ -void udp_v6_clear_sk(struct sock *sk, int size) -{ - struct inet_sock *inet = inet_sk(sk); - - /* we do not want to clear pinet6 field, because of RCU lookups */ - sk_prot_clear_portaddr_nulls(sk, offsetof(struct inet_sock, pinet6)); - - size -= offsetof(struct inet_sock, pinet6) + sizeof(inet->pinet6); - memset(&inet->pinet6 + 1, 0, size); -} - /* ------------------------------------------------------------------------ */ struct proto udpv6_prot = { @@ -1465,7 +1454,7 @@ struct proto udpv6_prot = { .compat_setsockopt = compat_udpv6_setsockopt, .compat_getsockopt = compat_udpv6_getsockopt, #endif - .clear_sk = udp_v6_clear_sk, + .diag_destroy = udp_abort, }; static struct inet_protosw udpv6_protosw = { diff --git a/net/ipv6/udp_impl.h b/net/ipv6/udp_impl.h index 0682c031ccdc..f6eb1ab34f4b 100644 --- a/net/ipv6/udp_impl.h +++ b/net/ipv6/udp_impl.h @@ -29,8 +29,6 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock, int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); void udpv6_destroy_sock(struct sock *sk); -void udp_v6_clear_sk(struct sock *sk, int size); - #ifdef CONFIG_PROC_FS int udp6_seq_show(struct seq_file *seq, void *v); #endif diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c index fd6ef414899b..47d0d2b87106 100644 --- a/net/ipv6/udplite.c +++ b/net/ipv6/udplite.c @@ -55,7 +55,6 @@ struct proto udplitev6_prot = { .compat_setsockopt = compat_udpv6_setsockopt, .compat_getsockopt = compat_udpv6_getsockopt, #endif - .clear_sk = udp_v6_clear_sk, }; static struct inet_protosw udplite6_protosw = { diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c index 8d2f7c9b491d..db639690c205 100644 --- a/net/irda/af_irda.c +++ b/net/irda/af_irda.c @@ -845,9 +845,6 @@ static int irda_accept(struct socket *sock, struct socket *newsock, int flags) if (sock->state != SS_UNCONNECTED) goto out; - if ((sk = sock->sk) == NULL) - goto out; - err = -EOPNOTSUPP; if ((sk->sk_type != SOCK_STREAM) && (sk->sk_type != SOCK_SEQPACKET) && (sk->sk_type != SOCK_DGRAM)) diff --git 
a/net/kcm/Kconfig b/net/kcm/Kconfig index 5db94d940ecc..87fca36e6c47 100644 --- a/net/kcm/Kconfig +++ b/net/kcm/Kconfig @@ -3,6 +3,7 @@ config AF_KCM tristate "KCM sockets" depends on INET select BPF_SYSCALL + select STREAM_PARSER ---help--- KCM (Kernel Connection Multiplexor) sockets provide a method for multiplexing messages of a message based application diff --git a/net/kcm/kcmproc.c b/net/kcm/kcmproc.c index 16c2e03bd388..bf75c9231cca 100644 --- a/net/kcm/kcmproc.c +++ b/net/kcm/kcmproc.c @@ -155,8 +155,8 @@ static void kcm_format_psock(struct kcm_psock *psock, struct seq_file *seq, seq_printf(seq, " psock-%-5u %-10llu %-16llu %-10llu %-16llu %-8d %-8d %-8d %-8d ", psock->index, - psock->stats.rx_msgs, - psock->stats.rx_bytes, + psock->strp.stats.rx_msgs, + psock->strp.stats.rx_bytes, psock->stats.tx_msgs, psock->stats.tx_bytes, psock->sk->sk_receive_queue.qlen, @@ -170,14 +170,27 @@ static void kcm_format_psock(struct kcm_psock *psock, struct seq_file *seq, if (psock->tx_stopped) seq_puts(seq, "TxStop "); - if (psock->rx_stopped) + if (psock->strp.rx_stopped) seq_puts(seq, "RxStop "); if (psock->tx_kcm) seq_printf(seq, "Rsvd-%d ", psock->tx_kcm->index); - if (psock->ready_rx_msg) - seq_puts(seq, "RdyRx "); + if (!psock->strp.rx_paused && !psock->ready_rx_msg) { + if (psock->sk->sk_receive_queue.qlen) { + if (psock->strp.rx_need_bytes) + seq_printf(seq, "RxWait=%u ", + psock->strp.rx_need_bytes); + else + seq_printf(seq, "RxWait "); + } + } else { + if (psock->strp.rx_paused) + seq_puts(seq, "RxPause "); + + if (psock->ready_rx_msg) + seq_puts(seq, "RdyRx "); + } seq_puts(seq, "\n"); } @@ -275,6 +288,7 @@ static int kcm_stats_seq_show(struct seq_file *seq, void *v) { struct kcm_psock_stats psock_stats; struct kcm_mux_stats mux_stats; + struct strp_aggr_stats strp_stats; struct kcm_mux *mux; struct kcm_psock *psock; struct net *net = seq->private; @@ -282,20 +296,28 @@ static int kcm_stats_seq_show(struct seq_file *seq, void *v) memset(&mux_stats, 0, sizeof(mux_stats)); memset(&psock_stats, 0, sizeof(psock_stats)); + memset(&strp_stats, 0, sizeof(strp_stats)); mutex_lock(&knet->mutex); aggregate_mux_stats(&knet->aggregate_mux_stats, &mux_stats); aggregate_psock_stats(&knet->aggregate_psock_stats, &psock_stats); + aggregate_strp_stats(&knet->aggregate_strp_stats, + &strp_stats); list_for_each_entry_rcu(mux, &knet->mux_list, kcm_mux_list) { spin_lock_bh(&mux->lock); aggregate_mux_stats(&mux->stats, &mux_stats); aggregate_psock_stats(&mux->aggregate_psock_stats, &psock_stats); - list_for_each_entry(psock, &mux->psocks, psock_list) + aggregate_strp_stats(&mux->aggregate_strp_stats, + &strp_stats); + list_for_each_entry(psock, &mux->psocks, psock_list) { aggregate_psock_stats(&psock->stats, &psock_stats); + save_strp_stats(&psock->strp, &strp_stats); + } + spin_unlock_bh(&mux->lock); } @@ -328,7 +350,7 @@ static int kcm_stats_seq_show(struct seq_file *seq, void *v) mux_stats.rx_ready_drops); seq_printf(seq, - "%-8s %-10s %-16s %-10s %-16s %-10s %-10s %-10s %-10s %-10s %-10s %-10s %-10s %-10s\n", + "%-8s %-10s %-16s %-10s %-16s %-10s %-10s %-10s %-10s %-10s %-10s %-10s %-10s %-10s %-10s %-10s\n", "Psock", "RX-Msgs", "RX-Bytes", @@ -337,6 +359,8 @@ static int kcm_stats_seq_show(struct seq_file *seq, void *v) "Reserved", "Unreserved", "RX-Aborts", + "RX-Intr", + "RX-Unrecov", "RX-MemFail", "RX-NeedMor", "RX-BadLen", @@ -345,20 +369,22 @@ static int kcm_stats_seq_show(struct seq_file *seq, void *v) "TX-Aborts"); seq_printf(seq, - "%-8s %-10llu %-16llu %-10llu %-16llu %-10llu %-10llu %-10u %-10u 
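With AF_KCM now selecting STREAM_PARSER (above), message framing moves out of kcmsock.c into the shared strparser: a parse callback — for KCM, the attached BPF program — inspects the bytes accumulated so far and returns the full frame length, 0 for "need more data", or a negative error. A plain-C sketch of such a parser for an assumed 4-byte big-endian length-prefixed protocol (the wire format is an assumption for illustration):

    #include <arpa/inet.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    static int parse_frame(const uint8_t *data, size_t avail)
    {
            uint32_t body;

            if (avail < 4)
                    return 0;                       /* need more header bytes */
            memcpy(&body, data, sizeof(body));
            body = ntohl(body);
            if (body > (1u << 20))
                    return -1;                      /* message too big, abort stream */
            return 4 + (int)body;                   /* header + payload */
    }

    int main(void)
    {
            uint8_t buf[8] = { 0, 0, 0, 4, 'a', 'b', 'c', 'd' };

            printf("frame length: %d\n", parse_frame(buf, sizeof(buf)));    /* 8 */
            return 0;
    }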
%-10u %-10u %-10u %-10u %-10u\n", + "%-8s %-10llu %-16llu %-10llu %-16llu %-10llu %-10llu %-10u %-10u %-10u %-10u %-10u %-10u %-10u %-10u %-10u\n", "", - psock_stats.rx_msgs, - psock_stats.rx_bytes, + strp_stats.rx_msgs, + strp_stats.rx_bytes, psock_stats.tx_msgs, psock_stats.tx_bytes, psock_stats.reserved, psock_stats.unreserved, - psock_stats.rx_aborts, - psock_stats.rx_mem_fail, - psock_stats.rx_need_more_hdr, - psock_stats.rx_bad_hdr_len, - psock_stats.rx_msg_too_big, - psock_stats.rx_msg_timeouts, + strp_stats.rx_aborts, + strp_stats.rx_interrupted, + strp_stats.rx_unrecov_intr, + strp_stats.rx_mem_fail, + strp_stats.rx_need_more_hdr, + strp_stats.rx_bad_hdr_len, + strp_stats.rx_msg_too_big, + strp_stats.rx_msg_timeouts, psock_stats.tx_aborts); return 0; diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index cb39e05b166c..2632ac748371 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -1,3 +1,13 @@ +/* + * Kernel Connection Multiplexor + * + * Copyright (c) 2016 Tom Herbert <tom@herbertland.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + */ + #include <linux/bpf.h> #include <linux/errno.h> #include <linux/errqueue.h> @@ -16,7 +26,6 @@ #include <net/kcm.h> #include <net/netns/generic.h> #include <net/sock.h> -#include <net/tcp.h> #include <uapi/linux/kcm.h> unsigned int kcm_net_id; @@ -35,38 +44,12 @@ static inline struct kcm_tx_msg *kcm_tx_msg(struct sk_buff *skb) return (struct kcm_tx_msg *)skb->cb; } -static inline struct kcm_rx_msg *kcm_rx_msg(struct sk_buff *skb) -{ - return (struct kcm_rx_msg *)((void *)skb->cb + - offsetof(struct qdisc_skb_cb, data)); -} - static void report_csk_error(struct sock *csk, int err) { csk->sk_err = EPIPE; csk->sk_error_report(csk); } -/* Callback lock held */ -static void kcm_abort_rx_psock(struct kcm_psock *psock, int err, - struct sk_buff *skb) -{ - struct sock *csk = psock->sk; - - /* Unrecoverable error in receive */ - - del_timer(&psock->rx_msg_timer); - - if (psock->rx_stopped) - return; - - psock->rx_stopped = 1; - KCM_STATS_INCR(psock->stats.rx_aborts); - - /* Report an error on the lower socket */ - report_csk_error(csk, err); -} - static void kcm_abort_tx_psock(struct kcm_psock *psock, int err, bool wakeup_kcm) { @@ -109,12 +92,13 @@ static void kcm_abort_tx_psock(struct kcm_psock *psock, int err, static void kcm_update_rx_mux_stats(struct kcm_mux *mux, struct kcm_psock *psock) { - KCM_STATS_ADD(mux->stats.rx_bytes, - psock->stats.rx_bytes - psock->saved_rx_bytes); + STRP_STATS_ADD(mux->stats.rx_bytes, + psock->strp.stats.rx_bytes - + psock->saved_rx_bytes); mux->stats.rx_msgs += - psock->stats.rx_msgs - psock->saved_rx_msgs; - psock->saved_rx_msgs = psock->stats.rx_msgs; - psock->saved_rx_bytes = psock->stats.rx_bytes; + psock->strp.stats.rx_msgs - psock->saved_rx_msgs; + psock->saved_rx_msgs = psock->strp.stats.rx_msgs; + psock->saved_rx_bytes = psock->strp.stats.rx_bytes; } static void kcm_update_tx_mux_stats(struct kcm_mux *mux, @@ -167,11 +151,11 @@ static void kcm_rcv_ready(struct kcm_sock *kcm) */ list_del(&psock->psock_ready_list); psock->ready_rx_msg = NULL; - /* Commit clearing of ready_rx_msg for queuing work */ smp_mb(); - queue_work(kcm_wq, &psock->rx_work); + strp_unpause(&psock->strp); + strp_check_rcv(&psock->strp); } /* Buffer limit is okay now, add to ready list */ @@ -285,6 +269,7 @@ static struct kcm_sock *reserve_rx_kcm(struct kcm_psock *psock, if 
(list_empty(&mux->kcm_rx_waiters)) { psock->ready_rx_msg = head; + strp_pause(&psock->strp); list_add_tail(&psock->psock_ready_list, &mux->psocks_ready); spin_unlock_bh(&mux->rx_lock); @@ -353,346 +338,60 @@ static void unreserve_rx_kcm(struct kcm_psock *psock, spin_unlock_bh(&mux->rx_lock); } -static void kcm_start_rx_timer(struct kcm_psock *psock) -{ - if (psock->sk->sk_rcvtimeo) - mod_timer(&psock->rx_msg_timer, psock->sk->sk_rcvtimeo); -} - -/* Macro to invoke filter function. */ -#define KCM_RUN_FILTER(prog, ctx) \ - (*prog->bpf_func)(ctx, prog->insnsi) - -/* Lower socket lock held */ -static int kcm_tcp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb, - unsigned int orig_offset, size_t orig_len) -{ - struct kcm_psock *psock = (struct kcm_psock *)desc->arg.data; - struct kcm_rx_msg *rxm; - struct kcm_sock *kcm; - struct sk_buff *head, *skb; - size_t eaten = 0, cand_len; - ssize_t extra; - int err; - bool cloned_orig = false; - - if (psock->ready_rx_msg) - return 0; - - head = psock->rx_skb_head; - if (head) { - /* Message already in progress */ - - rxm = kcm_rx_msg(head); - if (unlikely(rxm->early_eaten)) { - /* Already some number of bytes on the receive sock - * data saved in rx_skb_head, just indicate they - * are consumed. - */ - eaten = orig_len <= rxm->early_eaten ? - orig_len : rxm->early_eaten; - rxm->early_eaten -= eaten; - - return eaten; - } - - if (unlikely(orig_offset)) { - /* Getting data with a non-zero offset when a message is - * in progress is not expected. If it does happen, we - * need to clone and pull since we can't deal with - * offsets in the skbs for a message expect in the head. - */ - orig_skb = skb_clone(orig_skb, GFP_ATOMIC); - if (!orig_skb) { - KCM_STATS_INCR(psock->stats.rx_mem_fail); - desc->error = -ENOMEM; - return 0; - } - if (!pskb_pull(orig_skb, orig_offset)) { - KCM_STATS_INCR(psock->stats.rx_mem_fail); - kfree_skb(orig_skb); - desc->error = -ENOMEM; - return 0; - } - cloned_orig = true; - orig_offset = 0; - } - - if (!psock->rx_skb_nextp) { - /* We are going to append to the frags_list of head. - * Need to unshare the frag_list. - */ - err = skb_unclone(head, GFP_ATOMIC); - if (err) { - KCM_STATS_INCR(psock->stats.rx_mem_fail); - desc->error = err; - return 0; - } - - if (unlikely(skb_shinfo(head)->frag_list)) { - /* We can't append to an sk_buff that already - * has a frag_list. We create a new head, point - * the frag_list of that to the old head, and - * then are able to use the old head->next for - * appending to the message. 
- */ - if (WARN_ON(head->next)) { - desc->error = -EINVAL; - return 0; - } - - skb = alloc_skb(0, GFP_ATOMIC); - if (!skb) { - KCM_STATS_INCR(psock->stats.rx_mem_fail); - desc->error = -ENOMEM; - return 0; - } - skb->len = head->len; - skb->data_len = head->len; - skb->truesize = head->truesize; - *kcm_rx_msg(skb) = *kcm_rx_msg(head); - psock->rx_skb_nextp = &head->next; - skb_shinfo(skb)->frag_list = head; - psock->rx_skb_head = skb; - head = skb; - } else { - psock->rx_skb_nextp = - &skb_shinfo(head)->frag_list; - } - } - } - - while (eaten < orig_len) { - /* Always clone since we will consume something */ - skb = skb_clone(orig_skb, GFP_ATOMIC); - if (!skb) { - KCM_STATS_INCR(psock->stats.rx_mem_fail); - desc->error = -ENOMEM; - break; - } - - cand_len = orig_len - eaten; - - head = psock->rx_skb_head; - if (!head) { - head = skb; - psock->rx_skb_head = head; - /* Will set rx_skb_nextp on next packet if needed */ - psock->rx_skb_nextp = NULL; - rxm = kcm_rx_msg(head); - memset(rxm, 0, sizeof(*rxm)); - rxm->offset = orig_offset + eaten; - } else { - /* Unclone since we may be appending to an skb that we - * already share a frag_list with. - */ - err = skb_unclone(skb, GFP_ATOMIC); - if (err) { - KCM_STATS_INCR(psock->stats.rx_mem_fail); - desc->error = err; - break; - } - - rxm = kcm_rx_msg(head); - *psock->rx_skb_nextp = skb; - psock->rx_skb_nextp = &skb->next; - head->data_len += skb->len; - head->len += skb->len; - head->truesize += skb->truesize; - } - - if (!rxm->full_len) { - ssize_t len; - - len = KCM_RUN_FILTER(psock->bpf_prog, head); - - if (!len) { - /* Need more header to determine length */ - if (!rxm->accum_len) { - /* Start RX timer for new message */ - kcm_start_rx_timer(psock); - } - rxm->accum_len += cand_len; - eaten += cand_len; - KCM_STATS_INCR(psock->stats.rx_need_more_hdr); - WARN_ON(eaten != orig_len); - break; - } else if (len > psock->sk->sk_rcvbuf) { - /* Message length exceeds maximum allowed */ - KCM_STATS_INCR(psock->stats.rx_msg_too_big); - desc->error = -EMSGSIZE; - psock->rx_skb_head = NULL; - kcm_abort_rx_psock(psock, EMSGSIZE, head); - break; - } else if (len <= (ssize_t)head->len - - skb->len - rxm->offset) { - /* Length must be into new skb (and also - * greater than zero) - */ - KCM_STATS_INCR(psock->stats.rx_bad_hdr_len); - desc->error = -EPROTO; - psock->rx_skb_head = NULL; - kcm_abort_rx_psock(psock, EPROTO, head); - break; - } - - rxm->full_len = len; - } - - extra = (ssize_t)(rxm->accum_len + cand_len) - rxm->full_len; - - if (extra < 0) { - /* Message not complete yet. */ - if (rxm->full_len - rxm->accum_len > - tcp_inq(psock->sk)) { - /* Don't have the whole messages in the socket - * buffer. Set psock->rx_need_bytes to wait for - * the rest of the message. Also, set "early - * eaten" since we've already buffered the skb - * but don't consume yet per tcp_read_sock. - */ - - if (!rxm->accum_len) { - /* Start RX timer for new message */ - kcm_start_rx_timer(psock); - } - - psock->rx_need_bytes = rxm->full_len - - rxm->accum_len; - rxm->accum_len += cand_len; - rxm->early_eaten = cand_len; - KCM_STATS_ADD(psock->stats.rx_bytes, cand_len); - desc->count = 0; /* Stop reading socket */ - break; - } - rxm->accum_len += cand_len; - eaten += cand_len; - WARN_ON(eaten != orig_len); - break; - } - - /* Positive extra indicates ore bytes than needed for the - * message - */ - - WARN_ON(extra > cand_len); - - eaten += (cand_len - extra); - - /* Hurray, we have a new message! 
*/ - del_timer(&psock->rx_msg_timer); - psock->rx_skb_head = NULL; - KCM_STATS_INCR(psock->stats.rx_msgs); - -try_queue: - kcm = reserve_rx_kcm(psock, head); - if (!kcm) { - /* Unable to reserve a KCM, message is held in psock. */ - break; - } - - if (kcm_queue_rcv_skb(&kcm->sk, head)) { - /* Should mean socket buffer full */ - unreserve_rx_kcm(psock, false); - goto try_queue; - } - } - - if (cloned_orig) - kfree_skb(orig_skb); - - KCM_STATS_ADD(psock->stats.rx_bytes, eaten); - - return eaten; -} - -/* Called with lock held on lower socket */ -static int psock_tcp_read_sock(struct kcm_psock *psock) -{ - read_descriptor_t desc; - - desc.arg.data = psock; - desc.error = 0; - desc.count = 1; /* give more than one skb per call */ - - /* sk should be locked here, so okay to do tcp_read_sock */ - tcp_read_sock(psock->sk, &desc, kcm_tcp_recv); - - unreserve_rx_kcm(psock, true); - - return desc.error; -} - /* Lower sock lock held */ -static void psock_tcp_data_ready(struct sock *sk) +static void psock_data_ready(struct sock *sk) { struct kcm_psock *psock; read_lock_bh(&sk->sk_callback_lock); psock = (struct kcm_psock *)sk->sk_user_data; - if (unlikely(!psock || psock->rx_stopped)) - goto out; + if (likely(psock)) + strp_data_ready(&psock->strp); - if (psock->ready_rx_msg) - goto out; - - if (psock->rx_need_bytes) { - if (tcp_inq(sk) >= psock->rx_need_bytes) - psock->rx_need_bytes = 0; - else - goto out; - } - - if (psock_tcp_read_sock(psock) == -ENOMEM) - queue_delayed_work(kcm_wq, &psock->rx_delayed_work, 0); - -out: read_unlock_bh(&sk->sk_callback_lock); } -static void do_psock_rx_work(struct kcm_psock *psock) +/* Called with lower sock held */ +static void kcm_rcv_strparser(struct strparser *strp, struct sk_buff *skb) { - read_descriptor_t rd_desc; - struct sock *csk = psock->sk; - - /* We need the read lock to synchronize with psock_tcp_data_ready. We - * need the socket lock for calling tcp_read_sock. - */ - lock_sock(csk); - read_lock_bh(&csk->sk_callback_lock); - - if (unlikely(csk->sk_user_data != psock)) - goto out; - - if (unlikely(psock->rx_stopped)) - goto out; - - if (psock->ready_rx_msg) - goto out; - - rd_desc.arg.data = psock; + struct kcm_psock *psock = container_of(strp, struct kcm_psock, strp); + struct kcm_sock *kcm; - if (psock_tcp_read_sock(psock) == -ENOMEM) - queue_delayed_work(kcm_wq, &psock->rx_delayed_work, 0); +try_queue: + kcm = reserve_rx_kcm(psock, skb); + if (!kcm) { + /* Unable to reserve a KCM, message is held in psock and strp + * is paused. 
+ */ + return; + } -out: - read_unlock_bh(&csk->sk_callback_lock); - release_sock(csk); + if (kcm_queue_rcv_skb(&kcm->sk, skb)) { + /* Should mean socket buffer full */ + unreserve_rx_kcm(psock, false); + goto try_queue; + } } -static void psock_rx_work(struct work_struct *w) +static int kcm_parse_func_strparser(struct strparser *strp, struct sk_buff *skb) { - do_psock_rx_work(container_of(w, struct kcm_psock, rx_work)); + struct kcm_psock *psock = container_of(strp, struct kcm_psock, strp); + struct bpf_prog *prog = psock->bpf_prog; + + return (*prog->bpf_func)(skb, prog->insnsi); } -static void psock_rx_delayed_work(struct work_struct *w) +static int kcm_read_sock_done(struct strparser *strp, int err) { - do_psock_rx_work(container_of(w, struct kcm_psock, - rx_delayed_work.work)); + struct kcm_psock *psock = container_of(strp, struct kcm_psock, strp); + + unreserve_rx_kcm(psock, true); + + return err; } -static void psock_tcp_state_change(struct sock *sk) +static void psock_state_change(struct sock *sk) { /* TCP only does a POLLIN for a half close. Do a POLLHUP here * since application will normally not poll with POLLIN @@ -702,7 +401,7 @@ static void psock_tcp_state_change(struct sock *sk) report_csk_error(sk, EPIPE); } -static void psock_tcp_write_space(struct sock *sk) +static void psock_write_space(struct sock *sk) { struct kcm_psock *psock; struct kcm_mux *mux; @@ -713,14 +412,13 @@ static void psock_tcp_write_space(struct sock *sk) psock = (struct kcm_psock *)sk->sk_user_data; if (unlikely(!psock)) goto out; - mux = psock->mux; spin_lock_bh(&mux->lock); /* Check if the socket is reserved so someone is waiting for sending. */ kcm = psock->tx_kcm; - if (kcm) + if (kcm && !unlikely(kcm->tx_stopped)) queue_work(kcm_wq, &kcm->tx_work); spin_unlock_bh(&mux->lock); @@ -1411,7 +1109,7 @@ static int kcm_recvmsg(struct socket *sock, struct msghdr *msg, struct kcm_sock *kcm = kcm_sk(sk); int err = 0; long timeo; - struct kcm_rx_msg *rxm; + struct strp_rx_msg *rxm; int copied = 0; struct sk_buff *skb; @@ -1425,7 +1123,7 @@ static int kcm_recvmsg(struct socket *sock, struct msghdr *msg, /* Okay, have a message on the receive queue */ - rxm = kcm_rx_msg(skb); + rxm = strp_rx_msg(skb); if (len > rxm->full_len) len = rxm->full_len; @@ -1481,7 +1179,7 @@ static ssize_t kcm_splice_read(struct socket *sock, loff_t *ppos, struct sock *sk = sock->sk; struct kcm_sock *kcm = kcm_sk(sk); long timeo; - struct kcm_rx_msg *rxm; + struct strp_rx_msg *rxm; int err = 0; ssize_t copied; struct sk_buff *skb; @@ -1498,7 +1196,7 @@ static ssize_t kcm_splice_read(struct socket *sock, loff_t *ppos, /* Okay, have a message on the receive queue */ - rxm = kcm_rx_msg(skb); + rxm = strp_rx_msg(skb); if (len > rxm->full_len) len = rxm->full_len; @@ -1674,15 +1372,6 @@ static void init_kcm_sock(struct kcm_sock *kcm, struct kcm_mux *mux) spin_unlock_bh(&mux->rx_lock); } -static void kcm_rx_msg_timeout(unsigned long arg) -{ - struct kcm_psock *psock = (struct kcm_psock *)arg; - - /* Message assembly timed out */ - KCM_STATS_INCR(psock->stats.rx_msg_timeouts); - kcm_abort_rx_psock(psock, ETIMEDOUT, NULL); -} - static int kcm_attach(struct socket *sock, struct socket *csock, struct bpf_prog *prog) { @@ -1692,19 +1381,13 @@ static int kcm_attach(struct socket *sock, struct socket *csock, struct kcm_psock *psock = NULL, *tpsock; struct list_head *head; int index = 0; - - if (csock->ops->family != PF_INET && - csock->ops->family != PF_INET6) - return -EINVAL; + struct strp_callbacks cb; + int err; csk = csock->sk; if (!csk) 
return -EINVAL; - /* Only support TCP for now */ - if (csk->sk_protocol != IPPROTO_TCP) - return -EINVAL; - psock = kmem_cache_zalloc(kcm_psockp, GFP_KERNEL); if (!psock) return -ENOMEM; @@ -1713,11 +1396,16 @@ static int kcm_attach(struct socket *sock, struct socket *csock, psock->sk = csk; psock->bpf_prog = prog; - setup_timer(&psock->rx_msg_timer, kcm_rx_msg_timeout, - (unsigned long)psock); + cb.rcv_msg = kcm_rcv_strparser; + cb.abort_parser = NULL; + cb.parse_msg = kcm_parse_func_strparser; + cb.read_sock_done = kcm_read_sock_done; - INIT_WORK(&psock->rx_work, psock_rx_work); - INIT_DELAYED_WORK(&psock->rx_delayed_work, psock_rx_delayed_work); + err = strp_init(&psock->strp, csk, &cb); + if (err) { + kmem_cache_free(kcm_psockp, psock); + return err; + } sock_hold(csk); @@ -1726,9 +1414,9 @@ static int kcm_attach(struct socket *sock, struct socket *csock, psock->save_write_space = csk->sk_write_space; psock->save_state_change = csk->sk_state_change; csk->sk_user_data = psock; - csk->sk_data_ready = psock_tcp_data_ready; - csk->sk_write_space = psock_tcp_write_space; - csk->sk_state_change = psock_tcp_state_change; + csk->sk_data_ready = psock_data_ready; + csk->sk_write_space = psock_write_space; + csk->sk_state_change = psock_state_change; write_unlock_bh(&csk->sk_callback_lock); /* Finished initialization, now add the psock to the MUX. */ @@ -1750,7 +1438,7 @@ static int kcm_attach(struct socket *sock, struct socket *csock, spin_unlock_bh(&mux->lock); /* Schedule RX work in case there are already bytes queued */ - queue_work(kcm_wq, &psock->rx_work); + strp_check_rcv(&psock->strp); return 0; } @@ -1790,6 +1478,8 @@ static void kcm_unattach(struct kcm_psock *psock) struct sock *csk = psock->sk; struct kcm_mux *mux = psock->mux; + lock_sock(csk); + /* Stop getting callbacks from TCP socket. After this there should * be no way to reserve a kcm for this psock. 
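kcm_attach() above now fills a struct strp_callbacks — rcv_msg to deliver a completed message, parse_msg to compute frame length, read_sock_done to run after each read pass — and hands it to strp_init(). The underlying pattern is a function-pointer table; a self-contained sketch (the struct and names below are illustrative, not the strparser API):

    #include <stdio.h>

    struct parser;

    struct parser_callbacks {
            int  (*parse_msg)(struct parser *p, const char *data, int len);
            void (*rcv_msg)(struct parser *p, const char *msg, int len);
    };

    struct parser {
            struct parser_callbacks cb;
    };

    static int my_parse(struct parser *p, const char *data, int len)
    {
            (void)p; (void)data;
            return len;                     /* trivially: one buffer = one message */
    }

    static void my_rcv(struct parser *p, const char *msg, int len)
    {
            (void)p;
            printf("got %.*s (%d bytes)\n", len, msg, len);
    }

    int main(void)
    {
            struct parser p = { { my_parse, my_rcv } };
            const char *data = "ping";
            int n = p.cb.parse_msg(&p, data, 4);

            if (n > 0)
                    p.cb.rcv_msg(&p, data, n);
            return 0;
    }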
*/ @@ -1798,7 +1488,7 @@ static void kcm_unattach(struct kcm_psock *psock) csk->sk_data_ready = psock->save_data_ready; csk->sk_write_space = psock->save_write_space; csk->sk_state_change = psock->save_state_change; - psock->rx_stopped = 1; + strp_stop(&psock->strp); if (WARN_ON(psock->rx_kcm)) { write_unlock_bh(&csk->sk_callback_lock); @@ -1821,18 +1511,17 @@ static void kcm_unattach(struct kcm_psock *psock) write_unlock_bh(&csk->sk_callback_lock); - del_timer_sync(&psock->rx_msg_timer); - cancel_work_sync(&psock->rx_work); - cancel_delayed_work_sync(&psock->rx_delayed_work); + /* Call strp_done without sock lock */ + release_sock(csk); + strp_done(&psock->strp); + lock_sock(csk); bpf_prog_put(psock->bpf_prog); - kfree_skb(psock->rx_skb_head); - psock->rx_skb_head = NULL; - spin_lock_bh(&mux->lock); aggregate_psock_stats(&psock->stats, &mux->aggregate_psock_stats); + save_strp_stats(&psock->strp, &mux->aggregate_strp_stats); KCM_STATS_INCR(mux->stats.psock_unattach); @@ -1875,6 +1564,8 @@ no_reserved: fput(csk->sk_socket->file); kmem_cache_free(kcm_psockp, psock); } + + release_sock(csk); } static int kcm_unattach_ioctl(struct socket *sock, struct kcm_unattach *info) @@ -1915,6 +1606,7 @@ static int kcm_unattach_ioctl(struct socket *sock, struct kcm_unattach *info) spin_unlock_bh(&mux->lock); + /* Lower socket lock should already be held */ kcm_unattach(psock); err = 0; @@ -2072,6 +1764,8 @@ static void release_mux(struct kcm_mux *mux) aggregate_mux_stats(&mux->stats, &knet->aggregate_mux_stats); aggregate_psock_stats(&mux->aggregate_psock_stats, &knet->aggregate_psock_stats); + aggregate_strp_stats(&mux->aggregate_strp_stats, + &knet->aggregate_strp_stats); list_del_rcu(&mux->kcm_mux_list); knet->count--; mutex_unlock(&knet->mutex); @@ -2151,6 +1845,13 @@ static int kcm_release(struct socket *sock) * it will just return. */ __skb_queue_purge(&sk->sk_write_queue); + + /* Set tx_stopped. This is checked when psock is bound to a kcm and we + * get a writespace callback. This prevents further work being queued + * from the callback (unbinding the psock occurs after canceling work. 
+ */ + kcm->tx_stopped = 1; + release_sock(sk); spin_lock_bh(&mux->lock); diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index 232cb92033e8..34eff77982cf 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -177,7 +177,7 @@ static int pppol2tp_recv_payload_hook(struct sk_buff *skb) if (!pskb_may_pull(skb, 2)) return 1; - if ((skb->data[0] == 0xff) && (skb->data[1] == 0x03)) + if ((skb->data[0] == PPP_ALLSTATIONS) && (skb->data[1] == PPP_UI)) skb_pull(skb, 2); return 0; @@ -282,7 +282,6 @@ static void pppol2tp_session_sock_put(struct l2tp_session *session) static int pppol2tp_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) { - static const unsigned char ppph[2] = { 0xff, 0x03 }; struct sock *sk = sock->sk; struct sk_buff *skb; int error; @@ -312,7 +311,7 @@ static int pppol2tp_sendmsg(struct socket *sock, struct msghdr *m, error = -ENOMEM; skb = sock_wmalloc(sk, NET_SKB_PAD + sizeof(struct iphdr) + uhlen + session->hdr_len + - sizeof(ppph) + total_len, + 2 + total_len, /* 2 bytes for PPP_ALLSTATIONS & PPP_UI */ 0, GFP_KERNEL); if (!skb) goto error_put_sess_tun; @@ -325,8 +324,8 @@ static int pppol2tp_sendmsg(struct socket *sock, struct msghdr *m, skb_reserve(skb, uhlen); /* Add PPP header */ - skb->data[0] = ppph[0]; - skb->data[1] = ppph[1]; + skb->data[0] = PPP_ALLSTATIONS; + skb->data[1] = PPP_UI; skb_put(skb, 2); /* Copy user data into skb */ @@ -369,7 +368,6 @@ error: */ static int pppol2tp_xmit(struct ppp_channel *chan, struct sk_buff *skb) { - static const u8 ppph[2] = { 0xff, 0x03 }; struct sock *sk = (struct sock *) chan->private; struct sock *sk_tun; struct l2tp_session *session; @@ -398,14 +396,14 @@ static int pppol2tp_xmit(struct ppp_channel *chan, struct sk_buff *skb) sizeof(struct iphdr) + /* IP header */ uhlen + /* UDP header (if L2TP_ENCAPTYPE_UDP) */ session->hdr_len + /* L2TP header */ - sizeof(ppph); /* PPP header */ + 2; /* 2 bytes for PPP_ALLSTATIONS & PPP_UI */ if (skb_cow_head(skb, headroom)) goto abort_put_sess_tun; /* Setup PPP header */ - __skb_push(skb, sizeof(ppph)); - skb->data[0] = ppph[0]; - skb->data[1] = ppph[1]; + __skb_push(skb, 2); + skb->data[0] = PPP_ALLSTATIONS; + skb->data[1] = PPP_UI; local_bh_disable(); l2tp_xmit_skb(session, skb, session->hdr_len); @@ -440,7 +438,7 @@ static void pppol2tp_session_close(struct l2tp_session *session) BUG_ON(session->magic != L2TP_SESSION_MAGIC); if (sock) { - inet_shutdown(sock, 2); + inet_shutdown(sock, SEND_SHUTDOWN); /* Don't let the session go away before our socket does */ l2tp_session_inc_refcount(session); } diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h index ba5fc1f01e53..42a41ae405ba 100644 --- a/net/mac80211/driver-ops.h +++ b/net/mac80211/driver-ops.h @@ -1088,13 +1088,13 @@ static inline void drv_leave_ibss(struct ieee80211_local *local, } static inline u32 drv_get_expected_throughput(struct ieee80211_local *local, - struct ieee80211_sta *sta) + struct sta_info *sta) { u32 ret = 0; - trace_drv_get_expected_throughput(sta); - if (local->ops->get_expected_throughput) - ret = local->ops->get_expected_throughput(&local->hw, sta); + trace_drv_get_expected_throughput(&sta->sta); + if (local->ops->get_expected_throughput && sta->uploaded) + ret = local->ops->get_expected_throughput(&local->hw, &sta->sta); trace_drv_return_u32(local, ret); return ret; diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c index 8f9c3bde835f..fa7d37cf0351 100644 --- a/net/mac80211/mesh_hwmp.c +++ b/net/mac80211/mesh_hwmp.c @@ -326,22 +326,33 @@ static u32 
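The l2tp_ppp.c hunks above swap the bare 0xff/0x03 bytes for PPP_ALLSTATIONS and PPP_UI from <linux/ppp_defs.h> — the PPP address ("all stations") and control ("unnumbered information") fields that prefix each frame. Building that 2-byte header in a standalone sketch:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    #define PPP_ALLSTATIONS 0xff    /* address: all stations */
    #define PPP_UI          0x03    /* control: unnumbered information */

    int main(void)
    {
            uint8_t frame[2 + 4];

            frame[0] = PPP_ALLSTATIONS;     /* was the magic ppph[0] */
            frame[1] = PPP_UI;              /* was the magic ppph[1] */
            memcpy(frame + 2, "data", 4);   /* payload follows the header */

            printf("%02x %02x %.4s\n", frame[0], frame[1], (char *)frame + 2);
            return 0;
    }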
airtime_link_metric_get(struct ieee80211_local *local, u32 tx_time, estimated_retx; u64 result; - if (sta->mesh->fail_avg >= 100) - return MAX_METRIC; + /* Try to get rate based on HW/SW RC algorithm. + * Rate is returned in units of Kbps, correct this + * to comply with airtime calculation units + * Round up in case we get rate < 100Kbps + */ + rate = DIV_ROUND_UP(sta_get_expected_throughput(sta), 100); - sta_set_rate_info_tx(sta, &sta->tx_stats.last_rate, &rinfo); - rate = cfg80211_calculate_bitrate(&rinfo); - if (WARN_ON(!rate)) - return MAX_METRIC; + if (rate) { + err = 0; + } else { + if (sta->mesh->fail_avg >= 100) + return MAX_METRIC; - err = (sta->mesh->fail_avg << ARITH_SHIFT) / 100; + sta_set_rate_info_tx(sta, &sta->tx_stats.last_rate, &rinfo); + rate = cfg80211_calculate_bitrate(&rinfo); + if (WARN_ON(!rate)) + return MAX_METRIC; + + err = (sta->mesh->fail_avg << ARITH_SHIFT) / 100; + } /* bitrate is in units of 100 Kbps, while we need rate in units of * 1Mbps. This will be corrected on tx_time computation. */ tx_time = (device_constant + 10 * test_frame_len / rate); estimated_retx = ((1 << (2 * ARITH_SHIFT)) / (s_unit - err)); - result = (tx_time * estimated_retx) >> (2 * ARITH_SHIFT) ; + result = (tx_time * estimated_retx) >> (2 * ARITH_SHIFT); return (u32)result; } diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 76b737dcc36f..19f14c907d74 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -2279,11 +2279,7 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo) if (test_sta_flag(sta, WLAN_STA_TDLS_PEER)) sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_TDLS_PEER); - /* check if the driver has a SW RC implementation */ - if (ref && ref->ops->get_expected_throughput) - thr = ref->ops->get_expected_throughput(sta->rate_ctrl_priv); - else - thr = drv_get_expected_throughput(local, &sta->sta); + thr = sta_get_expected_throughput(sta); if (thr != 0) { sinfo->filled |= BIT(NL80211_STA_INFO_EXPECTED_THROUGHPUT); @@ -2291,6 +2287,25 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo) } } +u32 sta_get_expected_throughput(struct sta_info *sta) +{ + struct ieee80211_sub_if_data *sdata = sta->sdata; + struct ieee80211_local *local = sdata->local; + struct rate_control_ref *ref = NULL; + u32 thr = 0; + + if (test_sta_flag(sta, WLAN_STA_RATE_CONTROL)) + ref = local->rate_ctrl; + + /* check if the driver has a SW RC implementation */ + if (ref && ref->ops->get_expected_throughput) + thr = ref->ops->get_expected_throughput(sta->rate_ctrl_priv); + else + thr = drv_get_expected_throughput(local, sta); + + return thr; +} + unsigned long ieee80211_sta_last_active(struct sta_info *sta) { struct ieee80211_sta_rx_stats *stats = sta_get_last_rx_stats(sta); diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index 78b0ef32dddd..0556be3e3628 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -712,6 +712,8 @@ void sta_set_rate_info_tx(struct sta_info *sta, struct rate_info *rinfo); void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo); +u32 sta_get_expected_throughput(struct sta_info *sta); + void ieee80211_sta_expire(struct ieee80211_sub_if_data *sdata, unsigned long exp_time); u8 sta_info_tx_streams(struct sta_info *sta); diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 502396694f47..1d0746dfea57 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -2334,7 +2334,6 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata, struct mesh_path 
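The hunk above makes the mesh airtime metric prefer the rate-control algorithm's expected throughput — returned in Kbps, hence DIV_ROUND_UP(..., 100) to get the 100 Kbps units the formula wants — and fall back to the last TX rate plus fail average only when that is unavailable. The fixed-point computation in the same shape, but with illustrative constants (the real ones live in mesh_hwmp.c); fail_avg must stay below 100 to avoid a zero divisor:

    #include <stdint.h>
    #include <stdio.h>

    #define ARITH_SHIFT     8               /* fixed-point scale, as in mesh_hwmp.c */
    #define DEVICE_CONSTANT 100             /* illustrative */
    #define TEST_FRAME_LEN  8192            /* illustrative */
    #define S_UNIT          (1 << ARITH_SHIFT)

    static uint32_t airtime_metric(uint32_t rate_100kbps, uint32_t fail_avg_pct)
    {
            uint32_t err = (fail_avg_pct << ARITH_SHIFT) / 100;
            uint32_t tx_time = DEVICE_CONSTANT + 10 * TEST_FRAME_LEN / rate_100kbps;
            uint64_t retx = (1ull << (2 * ARITH_SHIFT)) / (S_UNIT - err);

            return (uint32_t)((tx_time * retx) >> (2 * ARITH_SHIFT));
    }

    int main(void)
    {
            /* 54 Mbps link (540 x 100 Kbps), 10% failure average */
            printf("metric = %u\n", airtime_metric(540, 10));
            return 0;
    }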
__maybe_unused *mppath = NULL, *mpath = NULL; const u8 *encaps_data; int encaps_len, skip_header_bytes; - int nh_pos, h_pos; bool wme_sta = false, authorized = false; bool tdls_peer; bool multicast; @@ -2640,13 +2639,7 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata, encaps_len = 0; } - nh_pos = skb_network_header(skb) - skb->data; - h_pos = skb_transport_header(skb) - skb->data; - skb_pull(skb, skip_header_bytes); - nh_pos -= skip_header_bytes; - h_pos -= skip_header_bytes; - head_need = hdrlen + encaps_len + meshhdrlen - skb_headroom(skb); /* @@ -2672,18 +2665,12 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata, } } - if (encaps_data) { + if (encaps_data) memcpy(skb_push(skb, encaps_len), encaps_data, encaps_len); - nh_pos += encaps_len; - h_pos += encaps_len; - } #ifdef CONFIG_MAC80211_MESH - if (meshhdrlen > 0) { + if (meshhdrlen > 0) memcpy(skb_push(skb, meshhdrlen), &mesh_hdr, meshhdrlen); - nh_pos += meshhdrlen; - h_pos += meshhdrlen; - } #endif if (ieee80211_is_data_qos(fc)) { @@ -2699,15 +2686,7 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata, } else memcpy(skb_push(skb, hdrlen), &hdr, hdrlen); - nh_pos += hdrlen; - h_pos += hdrlen; - - /* Update skb pointers to various headers since this modified frame - * is going to go through Linux networking code that may potentially - * need things like pointer to IP header. */ skb_reset_mac_header(skb); - skb_set_network_header(skb, nh_pos); - skb_set_transport_header(skb, h_pos); info = IEEE80211_SKB_CB(skb); memset(info, 0, sizeof(*info)); @@ -4390,9 +4369,6 @@ void __ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata, int ac = ieee802_1d_to_ac[tid & 7]; skb_reset_mac_header(skb); - skb_reset_network_header(skb); - skb_reset_transport_header(skb); - skb_set_queue_mapping(skb, ac); skb->priority = tid; diff --git a/net/netlink/diag.c b/net/netlink/diag.c index 8dd836a8dd60..3e3e2534478a 100644 --- a/net/netlink/diag.c +++ b/net/netlink/diag.c @@ -63,43 +63,75 @@ out_nlmsg_trim: static int __netlink_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, int protocol, int s_num) { + struct rhashtable_iter *hti = (void *)cb->args[2]; struct netlink_table *tbl = &nl_table[protocol]; - struct rhashtable *ht = &tbl->hash; - const struct bucket_table *htbl = rht_dereference_rcu(ht->tbl, ht); struct net *net = sock_net(skb->sk); struct netlink_diag_req *req; struct netlink_sock *nlsk; struct sock *sk; - int ret = 0, num = 0, i; + int num = 2; + int ret = 0; req = nlmsg_data(cb->nlh); - for (i = 0; i < htbl->size; i++) { - struct rhash_head *pos; + if (s_num > 1) + goto mc_list; - rht_for_each_entry_rcu(nlsk, pos, htbl, i, node) { - sk = (struct sock *)nlsk; + num--; - if (!net_eq(sock_net(sk), net)) - continue; - if (num < s_num) { - num++; + if (!hti) { + hti = kmalloc(sizeof(*hti), GFP_KERNEL); + if (!hti) + return -ENOMEM; + + cb->args[2] = (long)hti; + } + + if (!s_num) + rhashtable_walk_enter(&tbl->hash, hti); + + ret = rhashtable_walk_start(hti); + if (ret == -EAGAIN) + ret = 0; + if (ret) + goto stop; + + while ((nlsk = rhashtable_walk_next(hti))) { + if (IS_ERR(nlsk)) { + ret = PTR_ERR(nlsk); + if (ret == -EAGAIN) { + ret = 0; continue; } + break; + } - if (sk_diag_fill(sk, skb, req, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - NLM_F_MULTI, - sock_i_ino(sk)) < 0) { - ret = 1; - goto done; - } + sk = (struct sock *)nlsk; - num++; + if (!net_eq(sock_net(sk), net)) + continue; + + if (sk_diag_fill(sk, skb, req, + 
NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI, + sock_i_ino(sk)) < 0) { + ret = 1; + break; } } +stop: + rhashtable_walk_stop(hti); + if (ret) + goto done; + + rhashtable_walk_exit(hti); + cb->args[2] = 0; + num++; + +mc_list: + read_lock(&nl_table_lock); sk_for_each_bound(sk, &tbl->mc_list) { if (sk_hashed(sk)) continue; @@ -116,13 +148,14 @@ static int __netlink_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, NLM_F_MULTI, sock_i_ino(sk)) < 0) { ret = 1; - goto done; + break; } num++; } + read_unlock(&nl_table_lock); + done: cb->args[0] = num; - cb->args[1] = protocol; return ret; } @@ -131,20 +164,20 @@ static int netlink_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct netlink_diag_req *req; int s_num = cb->args[0]; + int err = 0; req = nlmsg_data(cb->nlh); - rcu_read_lock(); - read_lock(&nl_table_lock); - if (req->sdiag_protocol == NDIAG_PROTO_ALL) { int i; for (i = cb->args[1]; i < MAX_LINKS; i++) { - if (__netlink_diag_dump(skb, cb, i, s_num)) + err = __netlink_diag_dump(skb, cb, i, s_num); + if (err) break; s_num = 0; } + cb->args[1] = i; } else { if (req->sdiag_protocol >= MAX_LINKS) { read_unlock(&nl_table_lock); @@ -152,13 +185,22 @@ static int netlink_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) return -ENOENT; } - __netlink_diag_dump(skb, cb, req->sdiag_protocol, s_num); + err = __netlink_diag_dump(skb, cb, req->sdiag_protocol, s_num); } - read_unlock(&nl_table_lock); - rcu_read_unlock(); + return err < 0 ? err : skb->len; +} + +static int netlink_diag_dump_done(struct netlink_callback *cb) +{ + struct rhashtable_iter *hti = (void *)cb->args[2]; + + if (cb->args[0] == 1) + rhashtable_walk_exit(hti); - return skb->len; + kfree(hti); + + return 0; } static int netlink_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) @@ -172,6 +214,7 @@ static int netlink_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) if (h->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = netlink_diag_dump, + .done = netlink_diag_dump_done, }; return netlink_dump_start(net->diag_nlsk, skb, h, &c); } else diff --git a/net/rds/ib.h b/net/rds/ib.h index 046f7508c06b..45ac8e8e58f4 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -333,6 +333,7 @@ void rds_ib_conn_path_shutdown(struct rds_conn_path *cp); void rds_ib_state_change(struct sock *sk); int rds_ib_listen_init(void); void rds_ib_listen_stop(void); +__printf(2, 3) void __rds_ib_conn_error(struct rds_connection *conn, const char *, ...); int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, struct rdma_cm_event *event); diff --git a/net/rds/rds.h b/net/rds/rds.h index b2d17f0fafa8..fd0bccb2f9f9 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -688,6 +688,7 @@ void __rds_conn_error(struct rds_connection *conn, const char *, ...); #define rds_conn_error(conn, fmt...) \ __rds_conn_error(conn, KERN_WARNING "RDS: " fmt) +__printf(2, 3) void __rds_conn_path_error(struct rds_conn_path *cp, const char *, ...); #define rds_conn_path_error(cp, fmt...) 
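The netlink diag dump above becomes resumable: a heap-allocated rhashtable_iter is parked in cb->args[2] between dump callbacks and released in the new .done handler, instead of holding locks across the whole table walk. The general shape — an opaque cursor surviving across chunked calls — as a standalone sketch (the long array mirrors netlink's cb->args; everything else is illustrative):

    #include <stdio.h>
    #include <stdlib.h>

    struct walk_state { int next; };        /* stand-in for rhashtable_iter */

    /* Emit up to 'budget' items per pass; return 1 while more remain. */
    static int dump_chunk(long *args, int total, int budget)
    {
            struct walk_state *st = (struct walk_state *)args[2];

            if (!st) {                      /* first pass: allocate the cursor */
                    st = calloc(1, sizeof(*st));
                    if (!st)
                            return 0;
                    args[2] = (long)st;
            }
            while (st->next < total && budget--)
                    printf("item %d\n", st->next++);
            return st->next < total;
    }

    static void dump_done(long *args)       /* mirrors netlink_diag_dump_done */
    {
            free((struct walk_state *)args[2]);
            args[2] = 0;
    }

    int main(void)
    {
            long args[3] = { 0, 0, 0 };

            while (dump_chunk(args, 5, 2))
                    ;                       /* kernel sends one skb per chunk */
            dump_done(args);
            return 0;
    }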
\ __rds_conn_path_error(cp, KERN_WARNING "RDS: " fmt) diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 88effadd4b16..c7cf356b42b8 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -22,6 +22,7 @@ #include <net/net_namespace.h> #include <net/sock.h> #include <net/af_rxrpc.h> +#define CREATE_TRACE_POINTS #include "ar-internal.h" MODULE_DESCRIPTION("RxRPC network protocol"); diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index ff83fb1ddd47..c761124961cc 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -255,6 +255,9 @@ enum rxrpc_conn_flag { RXRPC_CONN_HAS_IDR, /* Has a client conn ID assigned */ RXRPC_CONN_IN_SERVICE_CONNS, /* Conn is in peer->service_conns */ RXRPC_CONN_IN_CLIENT_CONNS, /* Conn is in local->client_conns */ + RXRPC_CONN_EXPOSED, /* Conn has extra ref for exposure */ + RXRPC_CONN_DONT_REUSE, /* Don't reuse this connection */ + RXRPC_CONN_COUNTED, /* Counted by rxrpc_nr_client_conns */ }; /* @@ -265,6 +268,17 @@ enum rxrpc_conn_event { }; /* + * The connection cache state. + */ +enum rxrpc_conn_cache_state { + RXRPC_CONN_CLIENT_INACTIVE, /* Conn is not yet listed */ + RXRPC_CONN_CLIENT_WAITING, /* Conn is on wait list, waiting for capacity */ + RXRPC_CONN_CLIENT_ACTIVE, /* Conn is on active list, doing calls */ + RXRPC_CONN_CLIENT_CULLED, /* Conn is culled and delisted, doing calls */ + RXRPC_CONN_CLIENT_IDLE, /* Conn is on idle list, doing mostly nothing */ +}; + +/* * The connection protocol state. */ enum rxrpc_conn_proto_state { @@ -276,6 +290,7 @@ enum rxrpc_conn_proto_state { RXRPC_CONN_REMOTELY_ABORTED, /* Conn aborted by peer */ RXRPC_CONN_LOCALLY_ABORTED, /* Conn aborted locally */ RXRPC_CONN_NETWORK_ERROR, /* Conn terminated by network error */ + RXRPC_CONN_LOCAL_ERROR, /* Conn terminated by local error */ RXRPC_CONN__NR_STATES }; @@ -288,23 +303,33 @@ struct rxrpc_connection { struct rxrpc_conn_proto proto; struct rxrpc_conn_parameters params; - spinlock_t channel_lock; + atomic_t usage; + struct rcu_head rcu; + struct list_head cache_link; + spinlock_t channel_lock; + unsigned char active_chans; /* Mask of active channels */ +#define RXRPC_ACTIVE_CHANS_MASK ((1 << RXRPC_MAXCALLS) - 1) + struct list_head waiting_calls; /* Calls waiting for channels */ struct rxrpc_channel { struct rxrpc_call __rcu *call; /* Active call */ u32 call_id; /* ID of current call */ u32 call_counter; /* Call ID counter */ u32 last_call; /* ID of last call */ - u32 last_result; /* Result of last call (0/abort) */ + u8 last_type; /* Type of last packet */ + u16 last_service_id; + union { + u32 last_seq; + u32 last_abort; + }; } channels[RXRPC_MAXCALLS]; - wait_queue_head_t channel_wq; /* queue to wait for channel to become available */ - struct rcu_head rcu; struct work_struct processor; /* connection event processor */ union { struct rb_node client_node; /* Node in local->client_conns */ struct rb_node service_node; /* Node in peer->service_conns */ }; + struct list_head proc_link; /* link in procfs list */ struct list_head link; /* link in master connection list */ struct sk_buff_head rx_queue; /* received conn-level packets */ const struct rxrpc_security *security; /* applied security module */ @@ -313,17 +338,16 @@ struct rxrpc_connection { struct rxrpc_crypt csum_iv; /* packet checksum base */ unsigned long flags; unsigned long events; - unsigned long put_time; /* Time at which last put */ + unsigned long idle_timestamp; /* Time at which last became idle */ spinlock_t state_lock; /* state-change lock */ - atomic_t usage; + 
enum rxrpc_conn_cache_state cache_state : 8; enum rxrpc_conn_proto_state state : 8; /* current state of connection */ u32 local_abort; /* local abort code */ u32 remote_abort; /* remote abort code */ int error; /* local error incurred */ int debug_id; /* debug ID for printks */ atomic_t serial; /* packet serial number counter */ - atomic_t hi_serial; /* highest serial number received */ - atomic_t avail_chans; /* number of channels available */ + unsigned int hi_serial; /* highest serial number received */ u8 size_align; /* data size alignment (for security) */ u8 header_size; /* rxrpc + security header size */ u8 security_size; /* security header size */ @@ -341,10 +365,11 @@ enum rxrpc_call_flag { RXRPC_CALL_RCVD_LAST, /* all packets received */ RXRPC_CALL_RUN_RTIMER, /* Tx resend timer started */ RXRPC_CALL_TX_SOFT_ACK, /* sent some soft ACKs */ - RXRPC_CALL_PROC_BUSY, /* the processor is busy */ RXRPC_CALL_INIT_ACCEPT, /* acceptance was initiated */ RXRPC_CALL_HAS_USERID, /* has a user ID attached */ RXRPC_CALL_EXPECT_OOS, /* expect out of sequence packets */ + RXRPC_CALL_IS_SERVICE, /* Call is service call */ + RXRPC_CALL_EXPOSED, /* The call was exposed to the world */ }; /* @@ -402,6 +427,7 @@ enum rxrpc_call_state { struct rxrpc_call { struct rcu_head rcu; struct rxrpc_connection *conn; /* connection carrying call */ + struct rxrpc_peer *peer; /* Peer record for remote address */ struct rxrpc_sock *socket; /* socket responsible */ struct timer_list lifetimer; /* lifetime remaining on call */ struct timer_list deadspan; /* reap timer for re-ACK'ing, etc */ @@ -410,13 +436,14 @@ struct rxrpc_call { struct work_struct destroyer; /* call destroyer */ struct work_struct processor; /* packet processor and ACK generator */ struct list_head link; /* link in master call list */ + struct list_head chan_wait_link; /* Link in conn->waiting_calls */ struct hlist_node error_link; /* link in error distribution list */ struct list_head accept_link; /* calls awaiting acceptance */ struct rb_node sock_node; /* node in socket call tree */ struct sk_buff_head rx_queue; /* received packets */ struct sk_buff_head rx_oos_queue; /* packets received out of sequence */ struct sk_buff *tx_pending; /* Tx socket buffer being filled */ - wait_queue_head_t tx_waitq; /* wait for Tx window space to become available */ + wait_queue_head_t waitq; /* Wait queue for channel or Tx */ __be32 crypto_buf[2]; /* Temporary packet crypto buffer */ unsigned long user_call_ID; /* user-defined call ID */ unsigned long creation_jif; /* time of call creation */ @@ -432,8 +459,10 @@ struct rxrpc_call { int error_report; /* Network error (ICMP/local transport) */ int error; /* Local error incurred */ enum rxrpc_call_state state : 8; /* current state of call */ + u16 service_id; /* service ID */ + u32 call_id; /* call ID on connection */ + u32 cid; /* connection ID plus channel index */ int debug_id; /* debug ID for printks */ - u8 channel; /* connection channel occupied by this call */ /* transmission-phase ACK management */ u8 acks_head; /* offset into window of first entry */ @@ -455,19 +484,13 @@ struct rxrpc_call { rxrpc_seq_t ackr_win_top; /* top of ACK window (rx_data_eaten is bottom) */ rxrpc_seq_t ackr_prev_seq; /* previous sequence number received */ u8 ackr_reason; /* reason to ACK */ + u16 ackr_skew; /* skew on packet being ACK'd */ rxrpc_serial_t ackr_serial; /* serial of packet being ACK'd */ atomic_t ackr_not_idle; /* number of packets in Rx queue */ /* received packet records, 1 bit per record */ #define 
RXRPC_ACKR_WINDOW_ASZ DIV_ROUND_UP(RXRPC_MAXACKS, BITS_PER_LONG) unsigned long ackr_window[RXRPC_ACKR_WINDOW_ASZ + 1]; - - u8 in_clientflag; /* Copy of conn->in_clientflag */ - struct rxrpc_local *local; /* Local endpoint. */ - u32 call_id; /* call ID on connection */ - u32 cid; /* connection ID plus channel index */ - u32 epoch; /* epoch of this connection */ - u16 service_id; /* service ID */ }; /* @@ -484,6 +507,8 @@ static inline void rxrpc_abort_call(struct rxrpc_call *call, u32 abort_code) write_unlock_bh(&call->state_lock); } +#include <trace/events/rxrpc.h> + /* * af_rxrpc.c */ @@ -502,8 +527,8 @@ int rxrpc_reject_call(struct rxrpc_sock *); /* * call_event.c */ -void __rxrpc_propose_ACK(struct rxrpc_call *, u8, u32, bool); -void rxrpc_propose_ACK(struct rxrpc_call *, u8, u32, bool); +void __rxrpc_propose_ACK(struct rxrpc_call *, u8, u16, u32, bool); +void rxrpc_propose_ACK(struct rxrpc_call *, u8, u16, u32, bool); void rxrpc_process_call(struct work_struct *); /* @@ -528,15 +553,32 @@ void rxrpc_release_calls_on_socket(struct rxrpc_sock *); void __rxrpc_put_call(struct rxrpc_call *); void __exit rxrpc_destroy_all_calls(void); +static inline bool rxrpc_is_service_call(const struct rxrpc_call *call) +{ + return test_bit(RXRPC_CALL_IS_SERVICE, &call->flags); +} + +static inline bool rxrpc_is_client_call(const struct rxrpc_call *call) +{ + return !rxrpc_is_service_call(call); +} + /* * conn_client.c */ +extern unsigned int rxrpc_max_client_connections; +extern unsigned int rxrpc_reap_client_connections; +extern unsigned int rxrpc_conn_idle_client_expiry; +extern unsigned int rxrpc_conn_idle_client_fast_expiry; extern struct idr rxrpc_client_conn_ids; void rxrpc_destroy_client_conn_ids(void); int rxrpc_connect_call(struct rxrpc_call *, struct rxrpc_conn_parameters *, struct sockaddr_rxrpc *, gfp_t); -void rxrpc_unpublish_client_conn(struct rxrpc_connection *); +void rxrpc_expose_client_call(struct rxrpc_call *); +void rxrpc_disconnect_client_call(struct rxrpc_call *); +void rxrpc_put_client_conn(struct rxrpc_connection *); +void __exit rxrpc_destroy_all_client_connections(void); /* * conn_event.c @@ -550,15 +592,17 @@ void rxrpc_reject_packets(struct rxrpc_local *); */ extern unsigned int rxrpc_connection_expiry; extern struct list_head rxrpc_connections; +extern struct list_head rxrpc_connection_proc_list; extern rwlock_t rxrpc_connection_lock; int rxrpc_extract_addr_from_skb(struct sockaddr_rxrpc *, struct sk_buff *); struct rxrpc_connection *rxrpc_alloc_connection(gfp_t); struct rxrpc_connection *rxrpc_find_connection_rcu(struct rxrpc_local *, struct sk_buff *); -void __rxrpc_disconnect_call(struct rxrpc_call *); +void __rxrpc_disconnect_call(struct rxrpc_connection *, struct rxrpc_call *); void rxrpc_disconnect_call(struct rxrpc_call *); -void rxrpc_put_connection(struct rxrpc_connection *); +void rxrpc_kill_connection(struct rxrpc_connection *); +void __rxrpc_put_connection(struct rxrpc_connection *); void __exit rxrpc_destroy_all_connections(void); static inline bool rxrpc_conn_is_client(const struct rxrpc_connection *conn) @@ -582,6 +626,21 @@ struct rxrpc_connection *rxrpc_get_connection_maybe(struct rxrpc_connection *con return atomic_inc_not_zero(&conn->usage) ? 
conn : NULL; } +static inline void rxrpc_put_connection(struct rxrpc_connection *conn) +{ + if (!conn) + return; + + if (rxrpc_conn_is_client(conn)) { + if (atomic_dec_and_test(&conn->usage)) + rxrpc_put_client_conn(conn); + } else { + if (atomic_dec_return(&conn->usage) == 1) + __rxrpc_put_connection(conn); + } +} + + static inline bool rxrpc_queue_conn(struct rxrpc_connection *conn) { if (!rxrpc_get_connection_maybe(conn)) @@ -697,9 +756,10 @@ struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *, struct sockaddr_rxrpc *, gfp_t); struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *, gfp_t); -static inline void rxrpc_get_peer(struct rxrpc_peer *peer) +static inline struct rxrpc_peer *rxrpc_get_peer(struct rxrpc_peer *peer) { atomic_inc(&peer->usage); + return peer; } static inline @@ -747,6 +807,11 @@ int rxrpc_init_server_conn_security(struct rxrpc_connection *); * skbuff.c */ void rxrpc_packet_destructor(struct sk_buff *); +void rxrpc_new_skb(struct sk_buff *); +void rxrpc_see_skb(struct sk_buff *); +void rxrpc_get_skb(struct sk_buff *); +void rxrpc_free_skb(struct sk_buff *); +void rxrpc_purge_queue(struct sk_buff_head *); /* * sysctl.c @@ -894,44 +959,6 @@ do { \ #endif /* __KDEBUGALL */ -/* - * socket buffer accounting / leak finding - */ -static inline void __rxrpc_new_skb(struct sk_buff *skb, const char *fn) -{ - //_net("new skb %p %s [%d]", skb, fn, atomic_read(&rxrpc_n_skbs)); - //atomic_inc(&rxrpc_n_skbs); -} - -#define rxrpc_new_skb(skb) __rxrpc_new_skb((skb), __func__) - -static inline void __rxrpc_kill_skb(struct sk_buff *skb, const char *fn) -{ - //_net("kill skb %p %s [%d]", skb, fn, atomic_read(&rxrpc_n_skbs)); - //atomic_dec(&rxrpc_n_skbs); -} - -#define rxrpc_kill_skb(skb) __rxrpc_kill_skb((skb), __func__) - -static inline void __rxrpc_free_skb(struct sk_buff *skb, const char *fn) -{ - if (skb) { - CHECK_SLAB_OKAY(&skb->users); - //_net("free skb %p %s [%d]", - // skb, fn, atomic_read(&rxrpc_n_skbs)); - //atomic_dec(&rxrpc_n_skbs); - kfree_skb(skb); - } -} - -#define rxrpc_free_skb(skb) __rxrpc_free_skb((skb), __func__) - -static inline void rxrpc_purge_queue(struct sk_buff_head *list) -{ - struct sk_buff *skb; - while ((skb = skb_dequeue((list))) != NULL) - rxrpc_free_skb(skb); -} #define rxrpc_get_call(CALL) \ do { \ diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 9bae21e66d65..669ac79d3b44 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -203,6 +203,7 @@ void rxrpc_accept_incoming_calls(struct rxrpc_local *local) _net("incoming call skb %p", skb); + rxrpc_see_skb(skb); sp = rxrpc_skb(skb); /* Set up a response packet header in case we need it */ diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index e60cf65c2232..5292bcfd8816 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -25,7 +25,7 @@ * propose an ACK be sent */ void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, - u32 serial, bool immediate) + u16 skew, u32 serial, bool immediate) { unsigned long expiry; s8 prior = rxrpc_ack_priority[ack_reason]; @@ -44,8 +44,10 @@ void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, /* update DELAY, IDLE, REQUESTED and PING_RESPONSE ACK serial * numbers */ if (prior == rxrpc_ack_priority[call->ackr_reason]) { - if (prior <= 4) + if (prior <= 4) { + call->ackr_skew = skew; call->ackr_serial = serial; + } if (immediate) goto cancel_timer; return; @@ -103,13 +105,13 @@ cancel_timer: * propose an ACK be sent, locking the call structure */ void rxrpc_propose_ACK(struct rxrpc_call 
*call, u8 ack_reason, - u32 serial, bool immediate) + u16 skew, u32 serial, bool immediate) { s8 prior = rxrpc_ack_priority[ack_reason]; if (prior > rxrpc_ack_priority[call->ackr_reason]) { spin_lock_bh(&call->lock); - __rxrpc_propose_ACK(call, ack_reason, serial, immediate); + __rxrpc_propose_ACK(call, ack_reason, skew, serial, immediate); spin_unlock_bh(&call->lock); } } @@ -191,6 +193,8 @@ static void rxrpc_resend(struct rxrpc_call *call) stop = true; sp->resend_at = jiffies + 3; } else { + if (rxrpc_is_client_call(call)) + rxrpc_expose_client_call(call); sp->resend_at = jiffies + rxrpc_resend_timeout; } @@ -376,7 +380,7 @@ static void rxrpc_rotate_tx_window(struct rxrpc_call *call, u32 hard) call->acks_hard++; } - wake_up(&call->tx_waitq); + wake_up(&call->waitq); } /* @@ -407,6 +411,7 @@ static int rxrpc_drain_rx_oos_queue(struct rxrpc_call *call) skb = skb_dequeue(&call->rx_oos_queue); if (skb) { + rxrpc_see_skb(skb); sp = rxrpc_skb(skb); _debug("drain OOS packet %d [%d]", @@ -427,6 +432,7 @@ static int rxrpc_drain_rx_oos_queue(struct rxrpc_call *call) /* find out what the next packet is */ skb = skb_peek(&call->rx_oos_queue); + rxrpc_see_skb(skb); if (skb) call->rx_first_oos = rxrpc_skb(skb)->hdr.seq; else @@ -576,6 +582,7 @@ process_further: if (!skb) return -EAGAIN; + rxrpc_see_skb(skb); _net("deferred skb %p", skb); sp = rxrpc_skb(skb); @@ -625,7 +632,7 @@ process_further: if (ack.reason == RXRPC_ACK_PING) { _proto("Rx ACK %%%u PING Request", latest); rxrpc_propose_ACK(call, RXRPC_ACK_PING_RESPONSE, - sp->hdr.serial, true); + skb->priority, sp->hdr.serial, true); } /* discard any out-of-order or duplicate ACKs */ @@ -832,11 +839,6 @@ void rxrpc_process_call(struct work_struct *work) call->debug_id, rxrpc_call_states[call->state], call->events, (jiffies - call->creation_jif) / (HZ / 10)); - if (test_and_set_bit(RXRPC_CALL_PROC_BUSY, &call->flags)) { - _debug("XXXXXXXXXXXXX RUNNING ON MULTIPLE CPUS XXXXXXXXXXXXX"); - return; - } - if (!call->conn) goto skip_msg_init; @@ -1155,8 +1157,7 @@ skip_msg_init: goto maybe_reschedule; send_ACK_with_skew: - ack.maxSkew = htons(atomic_read(&call->conn->hi_serial) - - ntohl(ack.serial)); + ack.maxSkew = htons(call->ackr_skew); send_ACK: mtu = call->conn->params.peer->if_mtu; mtu -= call->conn->params.peer->hdrsize; @@ -1246,7 +1247,8 @@ send_message_2: case RXRPC_CALL_SERVER_ACK_REQUEST: _debug("start ACK timer"); rxrpc_propose_ACK(call, RXRPC_ACK_DELAY, - call->ackr_serial, false); + call->ackr_skew, call->ackr_serial, + false); default: break; } @@ -1281,7 +1283,6 @@ maybe_reschedule: } error: - clear_bit(RXRPC_CALL_PROC_BUSY, &call->flags); kfree(acks); /* because we don't want two CPUs both processing the work item for one diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index ae057e0740f3..e7cbcc4a87cf 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -127,10 +127,11 @@ static struct rxrpc_call *rxrpc_alloc_call(gfp_t gfp) INIT_WORK(&call->destroyer, &rxrpc_destroy_call); INIT_WORK(&call->processor, &rxrpc_process_call); INIT_LIST_HEAD(&call->link); + INIT_LIST_HEAD(&call->chan_wait_link); INIT_LIST_HEAD(&call->accept_link); skb_queue_head_init(&call->rx_queue); skb_queue_head_init(&call->rx_oos_queue); - init_waitqueue_head(&call->tx_waitq); + init_waitqueue_head(&call->waitq); spin_lock_init(&call->lock); rwlock_init(&call->state_lock); atomic_set(&call->usage, 1); @@ -167,10 +168,7 @@ static struct rxrpc_call *rxrpc_alloc_client_call(struct rxrpc_sock *rx, sock_hold(&rx->sk); call->socket = rx; 
call->rx_data_post = 1; - - call->local = rx->local; call->service_id = srx->srx_service; - call->in_clientflag = 0; _leave(" = %p", call); return call; @@ -318,11 +316,12 @@ struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx, chan = sp->hdr.cid & RXRPC_CHANNELMASK; candidate->socket = rx; candidate->conn = conn; + candidate->peer = conn->params.peer; candidate->cid = sp->hdr.cid; candidate->call_id = sp->hdr.callNumber; - candidate->channel = chan; candidate->rx_data_post = 0; candidate->state = RXRPC_CALL_SERVER_ACCEPTING; + candidate->flags |= (1 << RXRPC_CALL_IS_SERVICE); if (conn->security_ix > 0) candidate->state = RXRPC_CALL_SERVER_SECURING; @@ -332,7 +331,7 @@ struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx, call = rcu_dereference_protected(conn->channels[chan].call, lockdep_is_held(&conn->channel_lock)); - _debug("channel[%u] is %p", candidate->channel, call); + _debug("channel[%u] is %p", candidate->cid & RXRPC_CHANNELMASK, call); if (call && call->call_id == sp->hdr.callNumber) { /* already set; must've been a duplicate packet */ _debug("extant call [%d]", call->state); @@ -360,7 +359,7 @@ struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx, call->debug_id, rxrpc_call_states[call->state]); if (call->state >= RXRPC_CALL_COMPLETE) { - __rxrpc_disconnect_call(call); + __rxrpc_disconnect_call(conn, call); } else { spin_unlock(&conn->channel_lock); kmem_cache_free(rxrpc_call_jar, candidate); @@ -387,6 +386,7 @@ struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx, rcu_assign_pointer(conn->channels[chan].call, call); sock_hold(&rx->sk); rxrpc_get_connection(conn); + rxrpc_get_peer(call->peer); spin_unlock(&conn->channel_lock); spin_lock(&conn->params.peer->lock); @@ -397,10 +397,7 @@ struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx, list_add_tail(&call->link, &rxrpc_calls); write_unlock_bh(&rxrpc_call_lock); - call->local = conn->params.local; - call->epoch = conn->proto.epoch; call->service_id = conn->params.service_id; - call->in_clientflag = RXRPC_CLIENT_INITIATED; _net("CALL incoming %d on CONN %d", call->debug_id, call->conn->debug_id); @@ -569,12 +566,6 @@ void rxrpc_release_calls_on_socket(struct rxrpc_sock *rx) read_lock_bh(&rx->call_lock); - /* mark all the calls as no longer wanting incoming packets */ - for (p = rb_first(&rx->calls); p; p = rb_next(p)) { - call = rb_entry(p, struct rxrpc_call, sock_node); - rxrpc_mark_call_released(call); - } - /* kill the not-yet-accepted incoming calls */ list_for_each_entry(call, &rx->secureq, accept_link) { rxrpc_mark_call_released(call); @@ -584,6 +575,12 @@ void rxrpc_release_calls_on_socket(struct rxrpc_sock *rx) rxrpc_mark_call_released(call); } + /* mark all the calls as no longer wanting incoming packets */ + for (p = rb_first(&rx->calls); p; p = rb_next(p)) { + call = rb_entry(p, struct rxrpc_call, sock_node); + rxrpc_mark_call_released(call); + } + read_unlock_bh(&rx->call_lock); _leave(""); } @@ -616,6 +613,7 @@ static void rxrpc_rcu_destroy_call(struct rcu_head *rcu) struct rxrpc_call *call = container_of(rcu, struct rxrpc_call, rcu); rxrpc_purge_queue(&call->rx_queue); + rxrpc_put_peer(call->peer); kmem_cache_free(rxrpc_call_jar, call); } @@ -682,8 +680,8 @@ static void rxrpc_destroy_call(struct work_struct *work) struct rxrpc_call *call = container_of(work, struct rxrpc_call, destroyer); - _enter("%p{%d,%d,%p}", - call, atomic_read(&call->usage), call->channel, call->conn); + _enter("%p{%d,%x,%p}", + call, atomic_read(&call->usage), call->cid, call->conn); 
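The hunks above drop the cached call->channel field because the channel index is always recoverable from the low bits of the cid (RXRPC_MAXCALLS is 4, so RXRPC_CHANNELMASK is 3 and RXRPC_CIDSHIFT is 2, as used elsewhere in this patch). A minimal userspace sketch of that packing; the constants are restated here for illustration rather than included from the kernel headers, and conn_id/channel are made-up values:

#include <stdint.h>
#include <stdio.h>

#define RXRPC_MAXCALLS    4
#define RXRPC_CHANNELMASK (RXRPC_MAXCALLS - 1)  /* low two bits of the cid */
#define RXRPC_CIDSHIFT    2                     /* log2(RXRPC_MAXCALLS) */

int main(void)
{
        uint32_t conn_id = 0x12345;     /* hypothetical ID from the conn IDR */
        unsigned int channel = 3;       /* one of four per-conn call slots */
        uint32_t cid = (conn_id << RXRPC_CIDSHIFT) | channel;

        /* What the patched code computes instead of reading call->channel */
        printf("cid=%08x channel=%u conn=%05x\n",
               (unsigned)cid,
               (unsigned)(cid & RXRPC_CHANNELMASK),
               (unsigned)(cid >> RXRPC_CIDSHIFT));
        return 0;
}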
ASSERTCMP(call->state, ==, RXRPC_CALL_DEAD); diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index 9e91f27b0d0f..349402b08e5a 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -7,6 +7,68 @@ * modify it under the terms of the GNU General Public Licence * as published by the Free Software Foundation; either version * 2 of the Licence, or (at your option) any later version. + * + * + * Client connections need to be cached for a little while after they've made a + * call so as to handle retransmitted DATA packets in case the server didn't + * receive the final ACK or terminating ABORT we sent it. + * + * Client connections can be in one of a number of cache states: + * + * (1) INACTIVE - The connection is not held in any list and may not have been + * exposed to the world. If it has been previously exposed, it was + * discarded from the idle list after expiring. + * + * (2) WAITING - The connection is waiting for the number of client conns to + * drop below the maximum capacity. Calls may be in progress upon it from + * when it was active and got culled. + * + * The connection is on the rxrpc_waiting_client_conns list which is kept + * in to-be-granted order. Culled conns with waiters go to the back of + * the queue just like new conns. + * + * (3) ACTIVE - The connection has at least one call in progress upon it, it + * may freely grant available channels to new calls and calls may be + * waiting on it for channels to become available. + * + * The connection is on the rxrpc_active_client_conns list which is kept + * in activation order for culling purposes. + * + * rxrpc_nr_active_client_conns is also incremented to account for it. + * + * (4) CULLED - The connection got summarily culled to try and free up + * capacity. Calls currently in progress on the connection are allowed to + * continue, but new calls will have to wait. There can be no waiters in + * this state - the conn would have to go to the WAITING state instead. + * + * (5) IDLE - The connection has no calls in progress upon it and must have + * been exposed to the world (ie. the EXPOSED flag must be set). When it + * expires, the EXPOSED flag is cleared and the connection transitions to + * the INACTIVE state. + * + * The connection is on the rxrpc_idle_client_conns list which is kept in + * order of how soon they'll expire. + * + * There are flags of relevance to the cache: + * + * (1) EXPOSED - The connection ID got exposed to the world. If this flag is + * set, an extra ref is added to the connection preventing it from being + * reaped when it has no calls outstanding. This flag is cleared and the + * ref dropped when a conn is discarded from the idle list. + * + * This allows us to move terminal call state retransmission to the + * connection and to discard the call immediately we think it is done + * with. It also gives us a chance to reuse the connection. + * + * (2) DONT_REUSE - The connection should be discarded as soon as possible and + * should not be reused. This is set when an exclusive connection is used + * or a call ID counter overflows. + * + * The caching state may only be changed if the cache lock is held. + * + * There are two idle client connection expiry durations. If the total number + * of connections is below the reap threshold, we use the normal duration; if + * it's above, we use the fast duration.
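The two expiry durations just described amount to a one-line rule. A userspace sketch of it, not kernel code: HZ is faked and the thresholds mirror the tunable defaults this patch adds at the top of conn_client.c:

#include <stdio.h>

#define HZ 100  /* stand-in; the kernel's tick rate is config-dependent */

static unsigned int rxrpc_reap_client_connections = 900;
static unsigned int rxrpc_conn_idle_client_expiry = 2 * 60 * HZ;
static unsigned int rxrpc_conn_idle_client_fast_expiry = 2 * HZ;

/* Pick the idle expiry the way the discard routine does: over the reap
 * threshold, idle conns get only a short grace period. */
static unsigned int idle_expiry(unsigned int nr_conns)
{
        return nr_conns > rxrpc_reap_client_connections ?
                rxrpc_conn_idle_client_fast_expiry :
                rxrpc_conn_idle_client_expiry;
}

int main(void)
{
        printf("850 cached conns: idle for %u jiffies before discard\n",
               idle_expiry(850));
        printf("950 cached conns: idle for %u jiffies before discard\n",
               idle_expiry(950));
        return 0;
}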
*/ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt @@ -16,22 +78,37 @@ #include <linux/timer.h> #include "ar-internal.h" +__read_mostly unsigned int rxrpc_max_client_connections = 1000; +__read_mostly unsigned int rxrpc_reap_client_connections = 900; +__read_mostly unsigned int rxrpc_conn_idle_client_expiry = 2 * 60 * HZ; +__read_mostly unsigned int rxrpc_conn_idle_client_fast_expiry = 2 * HZ; + +static unsigned int rxrpc_nr_client_conns; +static unsigned int rxrpc_nr_active_client_conns; +static __read_mostly bool rxrpc_kill_all_client_conns; + +static DEFINE_SPINLOCK(rxrpc_client_conn_cache_lock); +static DEFINE_SPINLOCK(rxrpc_client_conn_discard_mutex); +static LIST_HEAD(rxrpc_waiting_client_conns); +static LIST_HEAD(rxrpc_active_client_conns); +static LIST_HEAD(rxrpc_idle_client_conns); + /* * We use machine-unique IDs for our client connections. */ DEFINE_IDR(rxrpc_client_conn_ids); static DEFINE_SPINLOCK(rxrpc_conn_id_lock); +static void rxrpc_cull_active_client_conns(void); +static void rxrpc_discard_expired_client_conns(struct work_struct *); + +static DECLARE_DELAYED_WORK(rxrpc_client_conn_reap, + rxrpc_discard_expired_client_conns); + /* * Get a connection ID and epoch for a client connection from the global pool. * The connection struct pointer is then recorded in the idr radix tree. The * epoch is changed if this wraps. - * - * TODO: The IDR tree gets very expensive on memory if the connection IDs are - * widely scattered throughout the number space, so we shall need to retire - * connections that have, say, an ID more than four times the maximum number of - * client conns away from the current allocation point to try and keep the IDs - * concentrated. We will also need to retire connections from an old epoch. */ static int rxrpc_get_client_connection_id(struct rxrpc_connection *conn, gfp_t gfp) @@ -114,8 +191,7 @@ void rxrpc_destroy_client_conn_ids(void) } /* - * Allocate a client connection. The caller must take care to clear any - * padding bytes in *cp. + * Allocate a client connection. */ static struct rxrpc_connection * rxrpc_alloc_client_connection(struct rxrpc_conn_parameters *cp, gfp_t gfp) @@ -131,6 +207,10 @@ rxrpc_alloc_client_connection(struct rxrpc_conn_parameters *cp, gfp_t gfp) return ERR_PTR(-ENOMEM); } + atomic_set(&conn->usage, 1); + if (cp->exclusive) + __set_bit(RXRPC_CONN_DONT_REUSE, &conn->flags); + conn->params = *cp; conn->out_clientflag = RXRPC_CLIENT_INITIATED; conn->state = RXRPC_CONN_CLIENT; @@ -148,7 +228,7 @@ rxrpc_alloc_client_connection(struct rxrpc_conn_parameters *cp, gfp_t gfp) goto error_2; write_lock(&rxrpc_connection_lock); - list_add_tail(&conn->link, &rxrpc_connections); + list_add_tail(&conn->proc_link, &rxrpc_connection_proc_list); write_unlock(&rxrpc_connection_lock); /* We steal the caller's peer ref. */ @@ -170,32 +250,68 @@ error_0: } /* - * find a connection for a call - * - called in process context with IRQs enabled + * Determine if a connection may be reused.
*/ -int rxrpc_connect_call(struct rxrpc_call *call, - struct rxrpc_conn_parameters *cp, - struct sockaddr_rxrpc *srx, - gfp_t gfp) +static bool rxrpc_may_reuse_conn(struct rxrpc_connection *conn) +{ + int id_cursor, id, distance, limit; + + if (test_bit(RXRPC_CONN_DONT_REUSE, &conn->flags)) + goto dont_reuse; + + if (conn->proto.epoch != rxrpc_epoch) + goto mark_dont_reuse; + + /* The IDR tree gets very expensive on memory if the connection IDs are + * widely scattered throughout the number space, so we shall want to + * kill off connections that, say, have an ID more than about four + * times the maximum number of client conns away from the current + * allocation point to try and keep the IDs concentrated. + */ + id_cursor = READ_ONCE(rxrpc_client_conn_ids.cur); + id = conn->proto.cid >> RXRPC_CIDSHIFT; + distance = id - id_cursor; + if (distance < 0) + distance = -distance; + limit = round_up(rxrpc_max_client_connections, IDR_SIZE) * 4; + if (distance > limit) + goto mark_dont_reuse; + + return true; + +mark_dont_reuse: + set_bit(RXRPC_CONN_DONT_REUSE, &conn->flags); +dont_reuse: + return false; +} + +/* + * Create or find a client connection to use for a call. + * + * If we return with a connection, the call will be on its waiting list. It's + * left to the caller to assign a channel and wake up the call. + */ +static int rxrpc_get_client_conn(struct rxrpc_call *call, + struct rxrpc_conn_parameters *cp, + struct sockaddr_rxrpc *srx, + gfp_t gfp) { struct rxrpc_connection *conn, *candidate = NULL; struct rxrpc_local *local = cp->local; struct rb_node *p, **pp, *parent; long diff; - int chan; - - DECLARE_WAITQUEUE(myself, current); + int ret = -ENOMEM; _enter("{%d,%lx},", call->debug_id, call->user_call_ID); cp->peer = rxrpc_lookup_peer(cp->local, srx, gfp); if (!cp->peer) - return -ENOMEM; + goto error; + /* If the connection is not meant to be exclusive, search the available + * connections to see if the connection we want to use already exists. + */ if (!cp->exclusive) { - /* Search for a existing client connection unless this is going - * to be a connection that's used exclusively for a single call. - */ _debug("search 1"); spin_lock(&local->client_conns_lock); p = local->client_conns.rb_node; @@ -206,39 +322,55 @@ int rxrpc_connect_call(struct rxrpc_call *call, diff = (cmp(peer) ?: cmp(key) ?: cmp(security_level)); - if (diff < 0) +#undef cmp + if (diff < 0) { p = p->rb_left; - else if (diff > 0) + } else if (diff > 0) { p = p->rb_right; - else - goto found_extant_conn; + } else { + if (rxrpc_may_reuse_conn(conn) && + rxrpc_get_connection_maybe(conn)) + goto found_extant_conn; + /* The connection needs replacing. It's better + * to effect that when we have something to + * replace it with so that we don't have to + * rebalance the tree twice. + */ + break; + } } spin_unlock(&local->client_conns_lock); } - /* We didn't find a connection or we want an exclusive one. */ - _debug("get new conn"); + /* There wasn't a connection yet or we need an exclusive connection. + * We need to create a candidate and then potentially redo the search + * in case we're racing with another thread also trying to connect on a + * shareable connection. + */ + _debug("new conn"); candidate = rxrpc_alloc_client_connection(cp, gfp); - if (!candidate) { - _leave(" = -ENOMEM"); - return -ENOMEM; + if (IS_ERR(candidate)) { + ret = PTR_ERR(candidate); + goto error_peer; } + /* Add the call to the new connection's waiting list in case we're + * going to have to wait for the connection to come live. 
It's our + * connection, so we want first dibs on the channel slots. We would + * normally have to take channel_lock but we do this before anyone else + * can see the connection. + */ + list_add_tail(&call->chan_wait_link, &candidate->waiting_calls); + if (cp->exclusive) { - /* Assign the call on an exclusive connection to channel 0 and - * don't add the connection to the endpoint's shareable conn - * lookup tree. - */ - _debug("exclusive chan 0"); - conn = candidate; - atomic_set(&conn->avail_chans, RXRPC_MAXCALLS - 1); - spin_lock(&conn->channel_lock); - chan = 0; - goto found_channel; + call->conn = candidate; + _leave(" = 0 [exclusive %d]", candidate->debug_id); + return 0; } - /* We need to redo the search before attempting to add a new connection - * lest we race with someone else adding a conflicting instance. + /* Publish the new connection for userspace to find. We need to redo + * the search before doing this lest we race with someone else adding a + * conflicting instance. */ _debug("search 2"); spin_lock(&local->client_conns_lock); @@ -249,124 +381,672 @@ int rxrpc_connect_call(struct rxrpc_call *call, parent = *pp; conn = rb_entry(parent, struct rxrpc_connection, client_node); +#define cmp(X) ((long)conn->params.X - (long)candidate->params.X) diff = (cmp(peer) ?: cmp(key) ?: cmp(security_level)); - if (diff < 0) +#undef cmp + if (diff < 0) { pp = &(*pp)->rb_left; - else if (diff > 0) + } else if (diff > 0) { pp = &(*pp)->rb_right; - else - goto found_extant_conn; + } else { + if (rxrpc_may_reuse_conn(conn) && + rxrpc_get_connection_maybe(conn)) + goto found_extant_conn; + /* The old connection is from an outdated epoch. */ + _debug("replace conn"); + clear_bit(RXRPC_CONN_IN_CLIENT_CONNS, &conn->flags); + rb_replace_node(&conn->client_node, + &candidate->client_node, + &local->client_conns); + goto candidate_published; + } } - /* The second search also failed; simply add the new connection with - * the new call in channel 0. Note that we need to take the channel - * lock before dropping the client conn lock. - */ _debug("new conn"); - set_bit(RXRPC_CONN_IN_CLIENT_CONNS, &candidate->flags); rb_link_node(&candidate->client_node, parent, pp); rb_insert_color(&candidate->client_node, &local->client_conns); -attached: - conn = candidate; - candidate = NULL; - atomic_set(&conn->avail_chans, RXRPC_MAXCALLS - 1); - spin_lock(&conn->channel_lock); +candidate_published: + set_bit(RXRPC_CONN_IN_CLIENT_CONNS, &candidate->flags); + call->conn = candidate; spin_unlock(&local->client_conns_lock); - chan = 0; + _leave(" = 0 [new %d]", candidate->debug_id); + return 0; -found_channel: - _debug("found chan"); - call->conn = conn; - call->channel = chan; - call->epoch = conn->proto.epoch; - call->cid = conn->proto.cid | chan; - call->call_id = ++conn->channels[chan].call_counter; - conn->channels[chan].call_id = call->call_id; - rcu_assign_pointer(conn->channels[chan].call, call); + /* We come here if we found a suitable connection already in existence. + * Discard any candidate we may have allocated, and try to get a + * channel on this one. 
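The ID-distance test in rxrpc_may_reuse_conn() above keeps the IDR dense by retiring conns whose IDs have drifted more than four conn-limits (rounded up to the IDR granularity) away from the allocation cursor. A hedged userspace sketch of just that arithmetic, assuming IDR_SIZE is 256 as in the idr.h of this era and reimplementing round_up() locally:

#include <stdbool.h>
#include <stdio.h>

#define IDR_SIZE 256    /* assumed value of 1 << IDR_BITS */
#define round_up(x, y) ((((x) + (y) - 1) / (y)) * (y))

static unsigned int rxrpc_max_client_connections = 1000;

static bool id_may_reuse(int id, int id_cursor)
{
        int distance = id - id_cursor;
        int limit;

        if (distance < 0)
                distance = -distance;
        limit = round_up(rxrpc_max_client_connections, IDR_SIZE) * 4;
        return distance <= limit;       /* beyond this, mark DONT_REUSE */
}

int main(void)
{
        printf("id 100 vs cursor 200: reuse=%d\n", id_may_reuse(100, 200));
        printf("id 100 vs cursor 9000: reuse=%d\n", id_may_reuse(100, 9000));
        return 0;
}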
+ */ +found_extant_conn: + _debug("found conn"); + spin_unlock(&local->client_conns_lock); - _net("CONNECT call %d on conn %d", call->debug_id, conn->debug_id); + rxrpc_put_connection(candidate); + candidate = NULL; + spin_lock(&conn->channel_lock); + call->conn = conn; + list_add(&call->chan_wait_link, &conn->waiting_calls); spin_unlock(&conn->channel_lock); + _leave(" = 0 [extant %d]", conn->debug_id); + return 0; + +error_peer: rxrpc_put_peer(cp->peer); cp->peer = NULL; - _leave(" = %p {u=%d}", conn, atomic_read(&conn->usage)); - return 0; +error: + _leave(" = %d", ret); + return ret; +} - /* We found a potentially suitable connection already in existence. If - * we can reuse it (ie. its usage count hasn't been reduced to 0 by the - * reaper), discard any candidate we may have allocated, and try to get - * a channel on this one, otherwise we have to replace it. - */ -found_extant_conn: - _debug("found conn"); - if (!rxrpc_get_connection_maybe(conn)) { - set_bit(RXRPC_CONN_IN_CLIENT_CONNS, &candidate->flags); - rb_replace_node(&conn->client_node, - &candidate->client_node, - &local->client_conns); - clear_bit(RXRPC_CONN_IN_CLIENT_CONNS, &conn->flags); - goto attached; +/* + * Activate a connection. + */ +static void rxrpc_activate_conn(struct rxrpc_connection *conn) +{ + conn->cache_state = RXRPC_CONN_CLIENT_ACTIVE; + rxrpc_nr_active_client_conns++; + list_move_tail(&conn->cache_link, &rxrpc_active_client_conns); +} + +/* + * Attempt to animate a connection for a new call. + * + * If it's not exclusive, the connection is in the endpoint tree, and we're in + * the conn's list of those waiting to grab a channel. There is, however, a + * limit on the number of live connections allowed at any one time, so we may + * have to wait for capacity to become available. + * + * Note that a connection on the waiting queue might *also* have active + * channels if it has been culled to make space and then re-requested by a new + * call. + */ +static void rxrpc_animate_client_conn(struct rxrpc_connection *conn) +{ + unsigned int nr_conns; + + _enter("%d,%d", conn->debug_id, conn->cache_state); + + if (conn->cache_state == RXRPC_CONN_CLIENT_ACTIVE) + goto out; + + spin_lock(&rxrpc_client_conn_cache_lock); + + nr_conns = rxrpc_nr_client_conns; + if (!test_and_set_bit(RXRPC_CONN_COUNTED, &conn->flags)) + rxrpc_nr_client_conns = nr_conns + 1; + + switch (conn->cache_state) { + case RXRPC_CONN_CLIENT_ACTIVE: + case RXRPC_CONN_CLIENT_WAITING: + break; + + case RXRPC_CONN_CLIENT_INACTIVE: + case RXRPC_CONN_CLIENT_CULLED: + case RXRPC_CONN_CLIENT_IDLE: + if (nr_conns >= rxrpc_max_client_connections) + goto wait_for_capacity; + goto activate_conn; + + default: + BUG(); } - spin_unlock(&local->client_conns_lock); +out_unlock: + spin_unlock(&rxrpc_client_conn_cache_lock); +out: + _leave(" [%d]", conn->cache_state); + return; - rxrpc_put_connection(candidate); +activate_conn: + _debug("activate"); + rxrpc_activate_conn(conn); + goto out_unlock; + +wait_for_capacity: + _debug("wait"); + conn->cache_state = RXRPC_CONN_CLIENT_WAITING; + list_move_tail(&conn->cache_link, &rxrpc_waiting_client_conns); + goto out_unlock; +} + +/* + * Deactivate a channel. + */ +static void rxrpc_deactivate_one_channel(struct rxrpc_connection *conn, + unsigned int channel) +{ + struct rxrpc_channel *chan = &conn->channels[channel]; + + rcu_assign_pointer(chan->call, NULL); + conn->active_chans &= ~(1 << channel); +} + +/* + * Assign a channel to the call at the front of the queue and wake the call up. 
+ * We don't increment the callNumber counter until this number has been exposed + * to the world. + */ +static void rxrpc_activate_one_channel(struct rxrpc_connection *conn, + unsigned int channel) +{ + struct rxrpc_channel *chan = &conn->channels[channel]; + struct rxrpc_call *call = list_entry(conn->waiting_calls.next, + struct rxrpc_call, chan_wait_link); + u32 call_id = chan->call_counter + 1; + + list_del_init(&call->chan_wait_link); + conn->active_chans |= 1 << channel; + call->peer = rxrpc_get_peer(conn->params.peer); + call->cid = conn->proto.cid | channel; + call->call_id = call_id; + + _net("CONNECT call %08x:%08x as call %d on conn %d", + call->cid, call->call_id, call->debug_id, conn->debug_id); + + /* Paired with the read barrier in rxrpc_wait_for_channel(). This + * orders cid and epoch in the connection wrt to call_id without the + * need to take the channel_lock. + * + * We provisionally assign a callNumber at this point, but we don't + * confirm it until the call is about to be exposed. + * + * TODO: Pair with a barrier in the data_ready handler when that looks + * at the call ID through a connection channel. + */ + smp_wmb(); + chan->call_id = call_id; + rcu_assign_pointer(chan->call, call); + wake_up(&call->waitq); +} + +/* + * Assign channels and callNumbers to waiting calls. + */ +static void rxrpc_activate_channels(struct rxrpc_connection *conn) +{ + unsigned char mask; + + _enter("%d", conn->debug_id); + + if (conn->cache_state != RXRPC_CONN_CLIENT_ACTIVE || + conn->active_chans == RXRPC_ACTIVE_CHANS_MASK) + return; + + spin_lock(&conn->channel_lock); + + while (!list_empty(&conn->waiting_calls) && + (mask = ~conn->active_chans, + mask &= RXRPC_ACTIVE_CHANS_MASK, + mask != 0)) + rxrpc_activate_one_channel(conn, __ffs(mask)); + + spin_unlock(&conn->channel_lock); + _leave(""); +} + +/* + * Wait for a callNumber and a channel to be granted to a call. + */ +static int rxrpc_wait_for_channel(struct rxrpc_call *call, gfp_t gfp) +{ + int ret = 0; + + _enter("%d", call->debug_id); + + if (!call->call_id) { + DECLARE_WAITQUEUE(myself, current); - if (!atomic_add_unless(&conn->avail_chans, -1, 0)) { if (!gfpflags_allow_blocking(gfp)) { - rxrpc_put_connection(conn); - _leave(" = -EAGAIN"); - return -EAGAIN; + ret = -EAGAIN; + goto out; } - add_wait_queue(&conn->channel_wq, &myself); + add_wait_queue_exclusive(&call->waitq, &myself); for (;;) { set_current_state(TASK_INTERRUPTIBLE); - if (atomic_add_unless(&conn->avail_chans, -1, 0)) + if (call->call_id) break; - if (signal_pending(current)) - goto interrupted; + if (signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } schedule(); } - remove_wait_queue(&conn->channel_wq, &myself); + remove_wait_queue(&call->waitq, &myself); __set_current_state(TASK_RUNNING); } - /* The connection allegedly now has a free channel and we can now - * attach the call to it. - */ + /* Paired with the write barrier in rxrpc_activate_one_channel(). 
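The smp_wmb()/smp_rmb() pairing noted above guarantees that a waiter which observes the published channel also observes the call ID written before it. A userspace model of the same publish/consume ordering; C11 release/acquire atomics stand in for the kernel barriers, and a spin loop stands in for sleeping on call->waitq:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

struct chan {
        uint32_t call_id;               /* plain data, written first */
        _Atomic(uintptr_t) call;        /* publication point */
};

static void activate(struct chan *ch, uintptr_t call, uint32_t id)
{
        ch->call_id = id;
        /* release orders the call_id store before the pointer store,
         * like the smp_wmb() in rxrpc_activate_one_channel() */
        atomic_store_explicit(&ch->call, call, memory_order_release);
}

static uint32_t wait_side(struct chan *ch)
{
        while (!atomic_load_explicit(&ch->call, memory_order_acquire))
                ;       /* rxrpc sleeps on call->waitq instead of spinning */
        /* acquire pairs with the release, like the smp_rmb() in
         * rxrpc_wait_for_channel() */
        return ch->call_id;
}

int main(void)
{
        struct chan ch = { .call = 0 };

        activate(&ch, (uintptr_t)&ch, 42);
        printf("call_id=%u\n", (unsigned)wait_side(&ch));
        return 0;
}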
*/ + smp_rmb(); + +out: + _leave(" = %d", ret); + return ret; +} + +/* + * find a connection for a call + * - called in process context with IRQs enabled + */ +int rxrpc_connect_call(struct rxrpc_call *call, + struct rxrpc_conn_parameters *cp, + struct sockaddr_rxrpc *srx, + gfp_t gfp) +{ + int ret; + + _enter("{%d,%lx},", call->debug_id, call->user_call_ID); + + rxrpc_discard_expired_client_conns(NULL); + rxrpc_cull_active_client_conns(); + + ret = rxrpc_get_client_conn(call, cp, srx, gfp); + if (ret < 0) + return ret; + + rxrpc_animate_client_conn(call->conn); + rxrpc_activate_channels(call->conn); + + ret = rxrpc_wait_for_channel(call, gfp); + if (ret < 0) + rxrpc_disconnect_client_call(call); + + _leave(" = %d", ret); + return ret; +} + +/* + * Note that a connection is about to be exposed to the world. Once it is + * exposed, we maintain an extra ref on it that stops it from being summarily + * discarded before it's (a) had a chance to deal with retransmission and (b) + * had a chance at re-use (the per-connection security negotiation is + * expensive). + */ +static void rxrpc_expose_client_conn(struct rxrpc_connection *conn) +{ + if (!test_and_set_bit(RXRPC_CONN_EXPOSED, &conn->flags)) + rxrpc_get_connection(conn); +} + +/* + * Note that a call, and thus a connection, is about to be exposed to the + * world. + */ +void rxrpc_expose_client_call(struct rxrpc_call *call) +{ + struct rxrpc_connection *conn = call->conn; + struct rxrpc_channel *chan = + &conn->channels[call->cid & RXRPC_CHANNELMASK]; + + if (!test_and_set_bit(RXRPC_CALL_EXPOSED, &call->flags)) { + /* Mark the call ID as being used. If the callNumber counter + * exceeds ~2 billion, we kill the connection after its + * outstanding calls have finished so that the counter doesn't + * wrap. + */ + chan->call_counter++; + if (chan->call_counter >= INT_MAX) + set_bit(RXRPC_CONN_DONT_REUSE, &conn->flags); + rxrpc_expose_client_conn(conn); + } +} + +/* + * Disconnect a client call. + */ +void rxrpc_disconnect_client_call(struct rxrpc_call *call) +{ + unsigned int channel = call->cid & RXRPC_CHANNELMASK; + struct rxrpc_connection *conn = call->conn; + struct rxrpc_channel *chan = &conn->channels[channel]; + + call->conn = NULL; + spin_lock(&conn->channel_lock); - for (chan = 0; chan < RXRPC_MAXCALLS; chan++) - if (!conn->channels[chan].call) - goto found_channel; - BUG(); + /* Calls that have never actually been assigned a channel can simply be + * discarded. If the conn didn't get used either, it will follow + * immediately unless someone else grabs it in the meantime. + */ + if (!list_empty(&call->chan_wait_link)) { + _debug("call is waiting"); + ASSERTCMP(call->call_id, ==, 0); + ASSERT(!test_bit(RXRPC_CALL_EXPOSED, &call->flags)); + list_del_init(&call->chan_wait_link); + + /* We must deactivate or idle the connection if it's now + * waiting for nothing. + */ + spin_lock(&rxrpc_client_conn_cache_lock); + if (conn->cache_state == RXRPC_CONN_CLIENT_WAITING && + list_empty(&conn->waiting_calls) && + !conn->active_chans) + goto idle_connection; + goto out; + } + + ASSERTCMP(rcu_access_pointer(chan->call), ==, call); + ASSERTCMP(atomic_read(&conn->usage), >=, 2); + + /* If a client call was exposed to the world, we save the result for + * retransmission. + * + * We use a barrier here so that the call number and abort code can be + * read without needing to take a lock. + * + * TODO: Make the incoming packet handler check this and handle + * terminal retransmission without requiring access to the call. 
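The counter handling in rxrpc_expose_client_call() above only commits a callNumber once the call is exposed, and retires the connection before the 32-bit counter can wrap. A small sketch of that guard under the same INT_MAX threshold; the struct here is a cut-down stand-in, not the kernel's struct rxrpc_channel:

#include <limits.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct channel { uint32_t call_counter; };

static void expose_call(struct channel *chan, bool *dont_reuse)
{
        chan->call_counter++;
        if (chan->call_counter >= INT_MAX)
                *dont_reuse = true;     /* conn dies once its calls finish */
}

int main(void)
{
        struct channel chan = { .call_counter = INT_MAX - 1 };
        bool dont_reuse = false;

        expose_call(&chan, &dont_reuse);
        printf("counter=%u dont_reuse=%d\n",
               (unsigned)chan.call_counter, dont_reuse);
        return 0;
}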
+ */ + if (test_bit(RXRPC_CALL_EXPOSED, &call->flags)) { + _debug("exposed %u,%u", call->call_id, call->local_abort); + __rxrpc_disconnect_call(conn, call); + } + + /* See if we can pass the channel directly to another call. */ + if (conn->cache_state == RXRPC_CONN_CLIENT_ACTIVE && + !list_empty(&conn->waiting_calls)) { + _debug("pass chan"); + rxrpc_activate_one_channel(conn, channel); + goto out_2; + } + + /* Things are more complex and we need the cache lock. We might be + * able to simply idle the conn or it might now be lurking on the wait + * list. It might even get moved back to the active list whilst we're + * waiting for the lock. + */ + spin_lock(&rxrpc_client_conn_cache_lock); + + switch (conn->cache_state) { + case RXRPC_CONN_CLIENT_ACTIVE: + if (list_empty(&conn->waiting_calls)) { + rxrpc_deactivate_one_channel(conn, channel); + if (!conn->active_chans) { + rxrpc_nr_active_client_conns--; + goto idle_connection; + } + goto out; + } + + _debug("pass chan 2"); + rxrpc_activate_one_channel(conn, channel); + goto out; + + case RXRPC_CONN_CLIENT_CULLED: + rxrpc_deactivate_one_channel(conn, channel); + ASSERT(list_empty(&conn->waiting_calls)); + if (!conn->active_chans) + goto idle_connection; + goto out; + + case RXRPC_CONN_CLIENT_WAITING: + rxrpc_deactivate_one_channel(conn, channel); + goto out; + + default: + BUG(); + } -interrupted: - remove_wait_queue(&conn->channel_wq, &myself); - __set_current_state(TASK_RUNNING); +out: + spin_unlock(&rxrpc_client_conn_cache_lock); +out_2: + spin_unlock(&conn->channel_lock); rxrpc_put_connection(conn); - rxrpc_put_peer(cp->peer); - cp->peer = NULL; - _leave(" = -ERESTARTSYS"); - return -ERESTARTSYS; + _leave(""); + return; + +idle_connection: + /* As no channels remain active, the connection gets deactivated + * immediately or moved to the idle list for a short while. + */ + if (test_bit(RXRPC_CONN_EXPOSED, &conn->flags)) { + _debug("make idle"); + conn->idle_timestamp = jiffies; + conn->cache_state = RXRPC_CONN_CLIENT_IDLE; + list_move_tail(&conn->cache_link, &rxrpc_idle_client_conns); + if (rxrpc_idle_client_conns.next == &conn->cache_link && + !rxrpc_kill_all_client_conns) + queue_delayed_work(rxrpc_workqueue, + &rxrpc_client_conn_reap, + rxrpc_conn_idle_client_expiry); + } else { + _debug("make inactive"); + conn->cache_state = RXRPC_CONN_CLIENT_INACTIVE; + list_del_init(&conn->cache_link); + } + goto out; } /* - * Remove a client connection from the local endpoint's tree, thereby removing - * it as a target for reuse for new client calls. + * Clean up a dead client connection. 
*/ -void rxrpc_unpublish_client_conn(struct rxrpc_connection *conn) +static struct rxrpc_connection * +rxrpc_put_one_client_conn(struct rxrpc_connection *conn) { + struct rxrpc_connection *next; struct rxrpc_local *local = conn->params.local; + unsigned int nr_conns; - spin_lock(&local->client_conns_lock); - if (test_and_clear_bit(RXRPC_CONN_IN_CLIENT_CONNS, &conn->flags)) - rb_erase(&conn->client_node, &local->client_conns); - spin_unlock(&local->client_conns_lock); + if (test_bit(RXRPC_CONN_IN_CLIENT_CONNS, &conn->flags)) { + spin_lock(&local->client_conns_lock); + if (test_and_clear_bit(RXRPC_CONN_IN_CLIENT_CONNS, + &conn->flags)) + rb_erase(&conn->client_node, &local->client_conns); + spin_unlock(&local->client_conns_lock); + } rxrpc_put_client_connection_id(conn); + + ASSERTCMP(conn->cache_state, ==, RXRPC_CONN_CLIENT_INACTIVE); + + if (!test_bit(RXRPC_CONN_COUNTED, &conn->flags)) + return NULL; + + spin_lock(&rxrpc_client_conn_cache_lock); + nr_conns = --rxrpc_nr_client_conns; + + next = NULL; + if (nr_conns < rxrpc_max_client_connections && + !list_empty(&rxrpc_waiting_client_conns)) { + next = list_entry(rxrpc_waiting_client_conns.next, + struct rxrpc_connection, cache_link); + rxrpc_get_connection(next); + rxrpc_activate_conn(next); + } + + spin_unlock(&rxrpc_client_conn_cache_lock); + rxrpc_kill_connection(conn); + + if (next) + rxrpc_activate_channels(next); + + /* We need to get rid of the temporary ref we took upon next, but we + * can't call rxrpc_put_connection() recursively. + */ + return next; +} + +/* + * Clean up dead client connections. + */ +void rxrpc_put_client_conn(struct rxrpc_connection *conn) +{ + struct rxrpc_connection *next; + + do { + _enter("%p{u=%d,d=%d}", + conn, atomic_read(&conn->usage), conn->debug_id); + + next = rxrpc_put_one_client_conn(conn); + + if (!next) + break; + conn = next; + } while (atomic_dec_and_test(&conn->usage)); + + _leave(""); +} + +/* + * Kill the longest-active client connections to make room for new ones. + */ +static void rxrpc_cull_active_client_conns(void) +{ + struct rxrpc_connection *conn; + unsigned int nr_conns = rxrpc_nr_client_conns; + unsigned int nr_active, limit; + + _enter(""); + + ASSERTCMP(nr_conns, >=, 0); + if (nr_conns < rxrpc_max_client_connections) { + _leave(" [ok]"); + return; + } + limit = rxrpc_reap_client_connections; + + spin_lock(&rxrpc_client_conn_cache_lock); + nr_active = rxrpc_nr_active_client_conns; + + while (nr_active > limit) { + ASSERT(!list_empty(&rxrpc_active_client_conns)); + conn = list_entry(rxrpc_active_client_conns.next, + struct rxrpc_connection, cache_link); + ASSERTCMP(conn->cache_state, ==, RXRPC_CONN_CLIENT_ACTIVE); + + if (list_empty(&conn->waiting_calls)) { + conn->cache_state = RXRPC_CONN_CLIENT_CULLED; + list_del_init(&conn->cache_link); + } else { + conn->cache_state = RXRPC_CONN_CLIENT_WAITING; + list_move_tail(&conn->cache_link, + &rxrpc_waiting_client_conns); + } + + nr_active--; + } + + rxrpc_nr_active_client_conns = nr_active; + spin_unlock(&rxrpc_client_conn_cache_lock); + ASSERTCMP(nr_active, >=, 0); + _leave(" [culled]"); +} + +/* + * Discard expired client connections from the idle list. Each conn in the + * idle list has been exposed and holds an extra ref because of that. + * + * This may be called from conn setup or from a work item so cannot be + * considered non-reentrant.
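rxrpc_put_one_client_conn() above may activate a waiting connection and hand back a temporary ref on it, which is why rxrpc_put_client_conn() loops rather than recursing. A generic userspace sketch of that iterate-don't-recurse put pattern, with plain ints standing in for atomics:

#include <stdio.h>

struct obj {
        int refs;
        struct obj *next;       /* successor that cleanup takes a ref on */
};

static struct obj *put_one(struct obj *o)
{
        struct obj *next = o->next;

        printf("destroying obj with %d refs left\n", o->refs);
        /* destruction may have activated 'next' and taken a ref on it */
        return next;
}

static void put_obj(struct obj *o)
{
        do {
                struct obj *next = put_one(o);

                if (!next)
                        break;
                o = next;
        } while (--o->refs == 0);       /* models atomic_dec_and_test() */
}

int main(void)
{
        struct obj c = { .refs = 1, .next = NULL };
        struct obj b = { .refs = 1, .next = &c };
        struct obj a = { .refs = 0, .next = &b };       /* already hit zero */

        put_obj(&a);    /* tears down a, then b, then c, without recursion */
        return 0;
}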
+ */ +static void rxrpc_discard_expired_client_conns(struct work_struct *work) +{ + struct rxrpc_connection *conn; + unsigned long expiry, conn_expires_at, now; + unsigned int nr_conns; + bool did_discard = false; + + _enter("%c", work ? 'w' : 'n'); + + if (list_empty(&rxrpc_idle_client_conns)) { + _leave(" [empty]"); + return; + } + + /* Don't double up on the discarding */ + if (!spin_trylock(&rxrpc_client_conn_discard_mutex)) { + _leave(" [already]"); + return; + } + + /* We keep an estimate of what the number of conns ought to be after + * we've discarded some so that we don't overdo the discarding. + */ + nr_conns = rxrpc_nr_client_conns; + +next: + spin_lock(&rxrpc_client_conn_cache_lock); + + if (list_empty(&rxrpc_idle_client_conns)) + goto out; + + conn = list_entry(rxrpc_idle_client_conns.next, + struct rxrpc_connection, cache_link); + ASSERT(test_bit(RXRPC_CONN_EXPOSED, &conn->flags)); + + if (!rxrpc_kill_all_client_conns) { + /* If the number of connections is over the reap limit, we + * expedite discard by reducing the expiry timeout. We must, + * however, have at least a short grace period to be able to do + * final-ACK or ABORT retransmission. + */ + expiry = rxrpc_conn_idle_client_expiry; + if (nr_conns > rxrpc_reap_client_connections) + expiry = rxrpc_conn_idle_client_fast_expiry; + + conn_expires_at = conn->idle_timestamp + expiry; + + now = READ_ONCE(jiffies); + if (time_after(conn_expires_at, now)) + goto not_yet_expired; + } + + _debug("discard conn %d", conn->debug_id); + if (!test_and_clear_bit(RXRPC_CONN_EXPOSED, &conn->flags)) + BUG(); + conn->cache_state = RXRPC_CONN_CLIENT_INACTIVE; + list_del_init(&conn->cache_link); + + spin_unlock(&rxrpc_client_conn_cache_lock); + + /* When we cleared the EXPOSED flag, we took on responsibility for the + * reference that that had on the usage count. We deal with that here. + * If someone re-sets the flag and re-gets the ref, that's fine. + */ + rxrpc_put_connection(conn); + did_discard = true; + nr_conns--; + goto next; + +not_yet_expired: + /* The connection at the front of the queue hasn't yet expired, so + * schedule the work item for that point if we discarded something. + * + * We don't worry if the work item is already scheduled - it can look + * after rescheduling itself at a later time. We could cancel it, but + * then things get messier. + */ + _debug("not yet"); + if (!rxrpc_kill_all_client_conns) + queue_delayed_work(rxrpc_workqueue, + &rxrpc_client_conn_reap, + conn_expires_at - now); + +out: + spin_unlock(&rxrpc_client_conn_cache_lock); + spin_unlock(&rxrpc_client_conn_discard_mutex); + _leave(""); +} + +/* + * Preemptively destroy all the client connection records rather than waiting + * for them to time out + */ +void __exit rxrpc_destroy_all_client_connections(void) +{ + _enter(""); + + spin_lock(&rxrpc_client_conn_cache_lock); + rxrpc_kill_all_client_conns = true; + spin_unlock(&rxrpc_client_conn_cache_lock); + + cancel_delayed_work(&rxrpc_client_conn_reap); + + if (!queue_delayed_work(rxrpc_workqueue, &rxrpc_client_conn_reap, 0)) + _debug("destroy: queue failed"); + + _leave(""); } diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index cee0f35bc1cf..6296374df840 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -25,6 +25,114 @@ #include "ar-internal.h" /* + * Retransmit terminal ACK or ABORT of the previous call. 
+ */ +static void rxrpc_conn_retransmit(struct rxrpc_connection *conn, + struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + struct rxrpc_channel *chan; + struct msghdr msg; + struct kvec iov; + struct { + struct rxrpc_wire_header whdr; + union { + struct { + __be32 code; + } abort; + struct { + struct rxrpc_ackpacket ack; + u8 padding[3]; + struct rxrpc_ackinfo info; + }; + }; + } __attribute__((packed)) pkt; + size_t len; + u32 serial, mtu, call_id; + + _enter("%d", conn->debug_id); + + chan = &conn->channels[sp->hdr.cid & RXRPC_CHANNELMASK]; + + /* If the last call got moved on whilst we were waiting to run, just + * ignore this packet. + */ + call_id = READ_ONCE(chan->last_call); + /* Sync with __rxrpc_disconnect_call() */ + smp_rmb(); + if (call_id != sp->hdr.callNumber) + return; + + msg.msg_name = &conn->params.peer->srx.transport; + msg.msg_namelen = conn->params.peer->srx.transport_len; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = 0; + + pkt.whdr.epoch = htonl(sp->hdr.epoch); + pkt.whdr.cid = htonl(sp->hdr.cid); + pkt.whdr.callNumber = htonl(sp->hdr.callNumber); + pkt.whdr.seq = 0; + pkt.whdr.type = chan->last_type; + pkt.whdr.flags = conn->out_clientflag; + pkt.whdr.userStatus = 0; + pkt.whdr.securityIndex = conn->security_ix; + pkt.whdr._rsvd = 0; + pkt.whdr.serviceId = htons(chan->last_service_id); + + len = sizeof(pkt.whdr); + switch (chan->last_type) { + case RXRPC_PACKET_TYPE_ABORT: + pkt.abort.code = htonl(chan->last_abort); + len += sizeof(pkt.abort); + break; + + case RXRPC_PACKET_TYPE_ACK: + mtu = conn->params.peer->if_mtu; + mtu -= conn->params.peer->hdrsize; + pkt.ack.bufferSpace = 0; + pkt.ack.maxSkew = htons(skb->priority); + pkt.ack.firstPacket = htonl(chan->last_seq); + pkt.ack.previousPacket = htonl(chan->last_seq - 1); + pkt.ack.serial = htonl(sp->hdr.serial); + pkt.ack.reason = RXRPC_ACK_DUPLICATE; + pkt.ack.nAcks = 0; + pkt.info.rxMTU = htonl(rxrpc_rx_mtu); + pkt.info.maxMTU = htonl(mtu); + pkt.info.rwind = htonl(rxrpc_rx_window_size); + pkt.info.jumbo_max = htonl(rxrpc_rx_jumbo_max); + len += sizeof(pkt.ack) + sizeof(pkt.info); + break; + } + + /* Resync with __rxrpc_disconnect_call() and check that the last call + * didn't get advanced whilst we were filling out the packets. 
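rxrpc_conn_retransmit() above builds the whole retransmission in one packed on-stack struct so a single kvec covers the wire header plus the ACK or ABORT body with no padding between them. An abbreviated userspace illustration of that layout; the field set is cut down from the real struct rxrpc_wire_header, and the type value 4 for ABORT is taken from the rx wire protocol:

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

struct mini_pkt {
        struct {
                uint32_t cid;
                uint32_t call_number;
                uint8_t  type;          /* e.g. ABORT */
        } __attribute__((packed)) whdr;
        uint32_t abort_code;
} __attribute__((packed));

int main(void)
{
        struct mini_pkt pkt = {
                .whdr.cid         = htonl(0x12345678),
                .whdr.call_number = htonl(7),
                .whdr.type        = 4,  /* RXRPC_PACKET_TYPE_ABORT */
                .abort_code       = htonl(17),
        };
        const unsigned char *p = (const void *)&pkt;

        /* sizeof == 13: packing removed the padding a kvec must not carry */
        printf("len=%zu first bytes: %02x %02x %02x %02x\n",
               sizeof(pkt), p[0], p[1], p[2], p[3]);
        return 0;
}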
+ */ + smp_rmb(); + if (READ_ONCE(chan->last_call) != call_id) + return; + + iov.iov_base = &pkt; + iov.iov_len = len; + + serial = atomic_inc_return(&conn->serial); + pkt.whdr.serial = htonl(serial); + + switch (chan->last_type) { + case RXRPC_PACKET_TYPE_ABORT: + _proto("Tx ABORT %%%u { %d } [re]", serial, conn->local_abort); + break; + case RXRPC_PACKET_TYPE_ACK: + _proto("Tx ACK %%%u [re]", serial); + break; + } + + kernel_sendmsg(conn->params.local->socket, &msg, &iov, 1, len); + _leave(""); + return; +} + +/* * pass a connection-level abort onto all calls on that connection */ static void rxrpc_abort_calls(struct rxrpc_connection *conn, int state, @@ -166,6 +274,12 @@ static int rxrpc_process_event(struct rxrpc_connection *conn, _enter("{%d},{%u,%%%u},", conn->debug_id, sp->hdr.type, sp->hdr.serial); switch (sp->hdr.type) { + case RXRPC_PACKET_TYPE_DATA: + case RXRPC_PACKET_TYPE_ACK: + rxrpc_conn_retransmit(conn, skb); + rxrpc_free_skb(skb); + return 0; + case RXRPC_PACKET_TYPE_ABORT: if (skb_copy_bits(skb, 0, &wtmp, sizeof(wtmp)) < 0) return -EPROTO; @@ -277,6 +391,7 @@ void rxrpc_process_connection(struct work_struct *work) /* go through the conn-level event packets, releasing the ref on this * connection that each one has when we've finished with it */ while ((skb = skb_dequeue(&conn->rx_queue))) { + rxrpc_see_skb(skb); ret = rxrpc_process_event(conn, skb, &abort_code); switch (ret) { case -EPROTO: @@ -365,6 +480,7 @@ void rxrpc_reject_packets(struct rxrpc_local *local) whdr.type = RXRPC_PACKET_TYPE_ABORT; while ((skb = skb_dequeue(&local->reject_queue))) { + rxrpc_see_skb(skb); sp = rxrpc_skb(skb); switch (sa.sa.sa_family) { case AF_INET: diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index 896d84493a05..5b45b6c367e7 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -1,6 +1,6 @@ -/* RxRPC virtual connection handler +/* RxRPC virtual connection handler, common bits. * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Copyright (C) 2007, 2016 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * This program is free software; you can redistribute it and/or @@ -15,8 +15,6 @@ #include <linux/slab.h> #include <linux/net.h> #include <linux/skbuff.h> -#include <net/sock.h> -#include <net/af_rxrpc.h> #include "ar-internal.h" /* @@ -27,9 +25,12 @@ unsigned int rxrpc_connection_expiry = 10 * 60; static void rxrpc_connection_reaper(struct work_struct *work); LIST_HEAD(rxrpc_connections); +LIST_HEAD(rxrpc_connection_proc_list); DEFINE_RWLOCK(rxrpc_connection_lock); static DECLARE_DELAYED_WORK(rxrpc_connection_reap, rxrpc_connection_reaper); +static void rxrpc_destroy_connection(struct rcu_head *); + /* * allocate a new connection */ @@ -41,21 +42,19 @@ struct rxrpc_connection *rxrpc_alloc_connection(gfp_t gfp) conn = kzalloc(sizeof(struct rxrpc_connection), gfp); if (conn) { + INIT_LIST_HEAD(&conn->cache_link); spin_lock_init(&conn->channel_lock); - init_waitqueue_head(&conn->channel_wq); + INIT_LIST_HEAD(&conn->waiting_calls); INIT_WORK(&conn->processor, &rxrpc_process_connection); + INIT_LIST_HEAD(&conn->proc_link); INIT_LIST_HEAD(&conn->link); skb_queue_head_init(&conn->rx_queue); conn->security = &rxrpc_no_security; spin_lock_init(&conn->state_lock); - /* We maintain an extra ref on the connection whilst it is - * on the rxrpc_connections list. 
- */ - atomic_set(&conn->usage, 2); conn->debug_id = atomic_inc_return(&rxrpc_debug_id); - atomic_set(&conn->avail_chans, RXRPC_MAXCALLS); conn->size_align = 4; conn->header_size = sizeof(struct rxrpc_wire_header); + conn->idle_timestamp = jiffies; } _leave(" = %p{%d}", conn, conn ? conn->debug_id : 0); @@ -153,25 +152,32 @@ not_found: * terminates. The caller must hold the channel_lock and must release the * call's ref on the connection. */ -void __rxrpc_disconnect_call(struct rxrpc_call *call) +void __rxrpc_disconnect_call(struct rxrpc_connection *conn, + struct rxrpc_call *call) { - struct rxrpc_connection *conn = call->conn; - struct rxrpc_channel *chan = &conn->channels[call->channel]; + struct rxrpc_channel *chan = + &conn->channels[call->cid & RXRPC_CHANNELMASK]; - _enter("%d,%d", conn->debug_id, call->channel); + _enter("%d,%x", conn->debug_id, call->cid); if (rcu_access_pointer(chan->call) == call) { /* Save the result of the call so that we can repeat it if necessary * through the channel, whilst disposing of the actual call record. */ - chan->last_result = call->local_abort; + chan->last_service_id = call->service_id; + if (call->local_abort) { + chan->last_abort = call->local_abort; + chan->last_type = RXRPC_PACKET_TYPE_ABORT; + } else { + chan->last_seq = call->rx_data_eaten; + chan->last_type = RXRPC_PACKET_TYPE_ACK; + } + /* Sync with rxrpc_conn_retransmit(). */ smp_wmb(); chan->last_call = chan->call_id; chan->call_id = chan->call_counter; rcu_assign_pointer(chan->call, NULL); - atomic_inc(&conn->avail_chans); - wake_up(&conn->channel_wq); } _leave(""); @@ -185,34 +191,52 @@ void rxrpc_disconnect_call(struct rxrpc_call *call) { struct rxrpc_connection *conn = call->conn; + if (rxrpc_is_client_call(call)) + return rxrpc_disconnect_client_call(call); + spin_lock(&conn->channel_lock); - __rxrpc_disconnect_call(call); + __rxrpc_disconnect_call(conn, call); spin_unlock(&conn->channel_lock); call->conn = NULL; + conn->idle_timestamp = jiffies; rxrpc_put_connection(conn); } /* - * release a virtual connection + * Kill off a connection. */ -void rxrpc_put_connection(struct rxrpc_connection *conn) +void rxrpc_kill_connection(struct rxrpc_connection *conn) { - if (!conn) - return; + ASSERT(!rcu_access_pointer(conn->channels[0].call) && + !rcu_access_pointer(conn->channels[1].call) && + !rcu_access_pointer(conn->channels[2].call) && + !rcu_access_pointer(conn->channels[3].call)); + ASSERT(list_empty(&conn->cache_link)); - _enter("%p{u=%d,d=%d}", - conn, atomic_read(&conn->usage), conn->debug_id); + write_lock(&rxrpc_connection_lock); + list_del_init(&conn->proc_link); + write_unlock(&rxrpc_connection_lock); - ASSERTCMP(atomic_read(&conn->usage), >, 1); + /* Drain the Rx queue. Note that even though we've unpublished, an + * incoming packet could still be being added to our Rx queue, so we + * will need to drain it again in the RCU cleanup handler. + */ + rxrpc_purge_queue(&conn->rx_queue); - conn->put_time = ktime_get_seconds(); - if (atomic_dec_return(&conn->usage) == 1) { - _debug("zombie"); - rxrpc_queue_delayed_work(&rxrpc_connection_reap, 0); - } + /* Leave final destruction to RCU. The connection processor work item + * must carry a ref on the connection to prevent us getting here whilst + * it is queued or running. 
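
The smp_wmb() in __rxrpc_disconnect_call() pairs with the smp_rmb() in rxrpc_conn_retransmit(): the cached terminal state (last_type plus last_abort or last_seq) must be visible before last_call advances, and the reader re-checks last_call after consuming that state. A rough userspace analogue using C11 release/acquire atomics — all names here are invented for the sketch:

#include <stdatomic.h>
#include <stdint.h>

struct chan_model {
	uint32_t last_abort;		/* terminal state, plain fields */
	uint8_t last_type;
	_Atomic uint32_t last_call;	/* publication marker */
};

/* Writer: fill in terminal state, then advance last_call (the smp_wmb()
 * analogue is the release store).
 */
static void publish(struct chan_model *c, uint32_t call_id, uint32_t abort_code)
{
	c->last_abort = abort_code;
	c->last_type = 4;		/* "abort", for the sketch */
	atomic_store_explicit(&c->last_call, call_id, memory_order_release);
}

/* Reader: sample last_call, use the state, then re-check (the smp_rmb()
 * analogue is the acquire load).
 */
static int retransmit_for(struct chan_model *c, uint32_t pkt_call_id,
			  uint32_t *abort_out)
{
	if (atomic_load_explicit(&c->last_call, memory_order_acquire) !=
	    pkt_call_id)
		return 0;		/* channel moved on: ignore packet */
	*abort_out = c->last_abort;	/* safe: published before last_call */
	if (atomic_load_explicit(&c->last_call, memory_order_acquire) !=
	    pkt_call_id)
		return 0;		/* advanced mid-build: drop it */
	return 1;
}

int main(void)
{
	struct chan_model c = { 0 };
	uint32_t code;

	publish(&c, 7, 17);
	return retransmit_for(&c, 7, &code) ? 0 : 1;
}
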
+ */ + call_rcu(&conn->rcu, rxrpc_destroy_connection); +} - _leave(""); +/* + * release a virtual connection + */ +void __rxrpc_put_connection(struct rxrpc_connection *conn) +{ + rxrpc_queue_delayed_work(&rxrpc_connection_reap, 0); } /* @@ -242,19 +266,19 @@ static void rxrpc_destroy_connection(struct rcu_head *rcu) } /* - * reap dead connections + * reap dead service connections */ static void rxrpc_connection_reaper(struct work_struct *work) { struct rxrpc_connection *conn, *_p; - unsigned long reap_older_than, earliest, put_time, now; + unsigned long reap_older_than, earliest, idle_timestamp, now; LIST_HEAD(graveyard); _enter(""); - now = ktime_get_seconds(); - reap_older_than = now - rxrpc_connection_expiry; + now = jiffies; + reap_older_than = now - rxrpc_connection_expiry * HZ; earliest = ULONG_MAX; write_lock(&rxrpc_connection_lock); @@ -263,10 +287,14 @@ static void rxrpc_connection_reaper(struct work_struct *work) if (likely(atomic_read(&conn->usage) > 1)) continue; - put_time = READ_ONCE(conn->put_time); - if (time_after(put_time, reap_older_than)) { - if (time_before(put_time, earliest)) - earliest = put_time; + idle_timestamp = READ_ONCE(conn->idle_timestamp); + _debug("reap CONN %d { u=%d,t=%ld }", + conn->debug_id, atomic_read(&conn->usage), + (long)reap_older_than - (long)idle_timestamp); + + if (time_after(idle_timestamp, reap_older_than)) { + if (time_before(idle_timestamp, earliest)) + earliest = idle_timestamp; continue; } @@ -277,7 +305,7 @@ static void rxrpc_connection_reaper(struct work_struct *work) continue; if (rxrpc_conn_is_client(conn)) - rxrpc_unpublish_client_conn(conn); + BUG(); else rxrpc_unpublish_service_conn(conn); @@ -287,9 +315,9 @@ static void rxrpc_connection_reaper(struct work_struct *work) if (earliest != ULONG_MAX) { _debug("reschedule reaper %ld", (long) earliest - now); - ASSERTCMP(earliest, >, now); + ASSERT(time_after(earliest, now)); rxrpc_queue_delayed_work(&rxrpc_connection_reap, - (earliest - now) * HZ); + earliest - now); } while (!list_empty(&graveyard)) { @@ -298,16 +326,15 @@ static void rxrpc_connection_reaper(struct work_struct *work) list_del_init(&conn->link); ASSERTCMP(atomic_read(&conn->usage), ==, 0); - skb_queue_purge(&conn->rx_queue); - call_rcu(&conn->rcu, rxrpc_destroy_connection); + rxrpc_kill_connection(conn); } _leave(""); } /* - * preemptively destroy all the connection records rather than waiting for them - * to time out + * preemptively destroy all the service connection records rather than + * waiting for them to time out */ void __exit rxrpc_destroy_all_connections(void) { @@ -316,6 +343,8 @@ void __exit rxrpc_destroy_all_connections(void) _enter(""); + rxrpc_destroy_all_client_connections(); + rxrpc_connection_expiry = 0; cancel_delayed_work(&rxrpc_connection_reap); rxrpc_queue_delayed_work(&rxrpc_connection_reap, 0); @@ -330,6 +359,8 @@ void __exit rxrpc_destroy_all_connections(void) write_unlock(&rxrpc_connection_lock); BUG_ON(leak); + ASSERT(list_empty(&rxrpc_connection_proc_list)); + /* Make sure the local and peer records pinned by any dying connections * are released. */ diff --git a/net/rxrpc/conn_service.c b/net/rxrpc/conn_service.c index fd9027ccba8f..316a92107fee 100644 --- a/net/rxrpc/conn_service.c +++ b/net/rxrpc/conn_service.c @@ -185,8 +185,14 @@ struct rxrpc_connection *rxrpc_incoming_connection(struct rxrpc_local *local, rxrpc_get_local(local); + /* We maintain an extra ref on the connection whilst it is on + * the rxrpc_connections list. 
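
The reaper above now measures idleness in jiffies via time_after() rather than wall-clock seconds, and reschedules itself with a raw tick delta. time_after() stays correct across counter wrap because it reduces to a signed subtraction; here is a standalone sketch of that idiom (the helper is redefined locally, it is not the kernel macro):

#include <stdint.h>
#include <stdio.h>

/* Wrap-safe "a is after b", the same trick as the kernel's time_after(). */
static int time_after_u32(uint32_t a, uint32_t b)
{
	return (int32_t)(b - a) < 0;
}

int main(void)
{
	uint32_t now = 0xfffffff0u;		/* just before wrap */
	uint32_t idle = now - 100;		/* went idle 100 ticks ago */
	uint32_t expiry = 1000;
	uint32_t reap_older_than = now - expiry;

	/* Not yet expired: idle timestamp is after the reap threshold. */
	printf("keep conn: %d\n", time_after_u32(idle, reap_older_than));

	now += 2000;				/* counter has wrapped... */
	reap_older_than = now - expiry;
	/* ...but the signed comparison still says the conn is stale. */
	printf("keep conn: %d\n", time_after_u32(idle, reap_older_than));
	return 0;
}
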
+ */ + atomic_set(&conn->usage, 2); + write_lock(&rxrpc_connection_lock); list_add_tail(&conn->link, &rxrpc_connections); + list_add_tail(&conn->proc_link, &rxrpc_connection_proc_list); write_unlock(&rxrpc_connection_lock); /* Make the connection a target for incoming packets. */ diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 70bb77818dea..5e683dd21ab9 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -125,6 +125,7 @@ static int rxrpc_fast_process_data(struct rxrpc_call *call, bool terminal; int ret, ackbit, ack; u32 serial; + u16 skew; u8 flags; _enter("{%u,%u},,{%u}", call->rx_data_post, call->rx_first_oos, seq); @@ -133,6 +134,7 @@ static int rxrpc_fast_process_data(struct rxrpc_call *call, ASSERTCMP(sp->call, ==, NULL); flags = sp->hdr.flags; serial = sp->hdr.serial; + skew = skb->priority; spin_lock(&call->lock); @@ -231,7 +233,7 @@ static int rxrpc_fast_process_data(struct rxrpc_call *call, spin_unlock(&call->lock); atomic_inc(&call->ackr_not_idle); - rxrpc_propose_ACK(call, RXRPC_ACK_DELAY, serial, false); + rxrpc_propose_ACK(call, RXRPC_ACK_DELAY, skew, serial, false); _leave(" = 0 [posted]"); return 0; @@ -244,7 +246,7 @@ out: discard_and_ack: _debug("discard and ACK packet %p", skb); - __rxrpc_propose_ACK(call, ack, serial, true); + __rxrpc_propose_ACK(call, ack, skew, serial, true); discard: spin_unlock(&call->lock); rxrpc_free_skb(skb); @@ -252,7 +254,7 @@ discard: return 0; enqueue_and_ack: - __rxrpc_propose_ACK(call, ack, serial, true); + __rxrpc_propose_ACK(call, ack, skew, serial, true); enqueue_packet: _net("defer skb %p", skb); spin_unlock(&call->lock); @@ -304,7 +306,7 @@ void rxrpc_fast_process_packet(struct rxrpc_call *call, struct sk_buff *skb) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); __be32 wtmp; - u32 hi_serial, abort_code; + u32 abort_code; _enter("%p,%p", call, skb); @@ -321,18 +323,12 @@ void rxrpc_fast_process_packet(struct rxrpc_call *call, struct sk_buff *skb) } #endif - /* track the latest serial number on this connection for ACK packet - * information */ - hi_serial = atomic_read(&call->conn->hi_serial); - while (sp->hdr.serial > hi_serial) - hi_serial = atomic_cmpxchg(&call->conn->hi_serial, hi_serial, - sp->hdr.serial); - /* request ACK generation for any ACK or DATA packet that requests * it */ if (sp->hdr.flags & RXRPC_REQUEST_ACK) { _proto("ACK Requested on %%%u", sp->hdr.serial); - rxrpc_propose_ACK(call, RXRPC_ACK_REQUESTED, sp->hdr.serial, false); + rxrpc_propose_ACK(call, RXRPC_ACK_REQUESTED, + skb->priority, sp->hdr.serial, false); } switch (sp->hdr.type) { @@ -570,7 +566,8 @@ done: /* * post connection-level events to the connection - * - this includes challenges, responses and some aborts + * - this includes challenges, responses, some aborts and call terminal packet + * retransmission. 
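
The deleted hi_serial tracking above used an atomic compare-exchange loop so that any context could push a running maximum; the rewrite funnels the update through one path in rxrpc_data_ready(), where a plain store suffices. For reference, the retired pattern corresponds to something like this in C11 atomics (a sketch, not the kernel code):

#include <stdatomic.h>
#include <stdint.h>

/* Retired pattern: lock-free running maximum via compare-exchange. */
static void note_serial(_Atomic uint32_t *hi_serial, uint32_t serial)
{
	uint32_t cur = atomic_load(hi_serial);

	while (serial > cur) {
		/* On failure, cur is reloaded and the test repeats. */
		if (atomic_compare_exchange_weak(hi_serial, &cur, serial))
			break;
	}
}

int main(void)
{
	_Atomic uint32_t hi = 5;

	note_serial(&hi, 3);	/* no-op: not a new maximum */
	note_serial(&hi, 9);	/* hi becomes 9 */
	return atomic_load(&hi) == 9 ? 0 : 1;
}
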
*/ static void rxrpc_post_packet_to_conn(struct rxrpc_connection *conn, struct sk_buff *skb) @@ -637,7 +634,7 @@ void rxrpc_data_ready(struct sock *sk) struct rxrpc_skb_priv *sp; struct rxrpc_local *local = sk->sk_user_data; struct sk_buff *skb; - int ret; + int ret, skew; _enter("%p", sk); @@ -700,25 +697,64 @@ void rxrpc_data_ready(struct sock *sk) rcu_read_lock(); conn = rxrpc_find_connection_rcu(local, skb); - if (!conn) + if (!conn) { + skb->priority = 0; goto cant_route_call; + } + + /* Note the serial number skew here */ + skew = (int)sp->hdr.serial - (int)conn->hi_serial; + if (skew >= 0) { + if (skew > 0) + conn->hi_serial = sp->hdr.serial; + skb->priority = 0; + } else { + skew = -skew; + skb->priority = min(skew, 65535); + } if (sp->hdr.callNumber == 0) { /* Connection-level packet */ _debug("CONN %p {%d}", conn, conn->debug_id); rxrpc_post_packet_to_conn(conn, skb); + goto out_unlock; } else { /* Call-bound packets are routed by connection channel. */ unsigned int channel = sp->hdr.cid & RXRPC_CHANNELMASK; struct rxrpc_channel *chan = &conn->channels[channel]; - struct rxrpc_call *call = rcu_dereference(chan->call); + struct rxrpc_call *call; + + /* Ignore really old calls */ + if (sp->hdr.callNumber < chan->last_call) + goto discard_unlock; + + if (sp->hdr.callNumber == chan->last_call) { + /* For the previous service call, if completed + * successfully, we discard all further packets. + */ + if (rxrpc_conn_is_service(conn) && + (chan->last_type == RXRPC_PACKET_TYPE_ACK || + sp->hdr.type == RXRPC_PACKET_TYPE_ABORT)) + goto discard_unlock; + + /* But otherwise we need to retransmit the final packet + * from data cached in the connection record. + */ + rxrpc_post_packet_to_conn(conn, skb); + goto out_unlock; + } + call = rcu_dereference(chan->call); if (!call || atomic_read(&call->usage) == 0) goto cant_route_call; rxrpc_post_packet_to_call(call, skb); + goto out_unlock; } +discard_unlock: + rxrpc_free_skb(skb); +out_unlock: rcu_read_unlock(); out: return; diff --git a/net/rxrpc/local_event.c b/net/rxrpc/local_event.c index 31a3f86ef2f6..bcc6593b4cdb 100644 --- a/net/rxrpc/local_event.c +++ b/net/rxrpc/local_event.c @@ -93,6 +93,7 @@ void rxrpc_process_local_events(struct rxrpc_local *local) if (skb) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + rxrpc_see_skb(skb); _debug("{%d},{%u}", local->debug_id, sp->hdr.type); switch (sp->hdr.type) { diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index f4bda06b7d2d..8a9917cba6fe 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -218,11 +218,11 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) ret = 0; } else if (cmd != RXRPC_CMD_SEND_DATA) { ret = -EINVAL; - } else if (!call->in_clientflag && + } else if (rxrpc_is_client_call(call) && call->state != RXRPC_CALL_CLIENT_SEND_REQUEST) { /* request phase complete for this client call */ ret = -EPROTO; - } else if (call->in_clientflag && + } else if (rxrpc_is_service_call(call) && call->state != RXRPC_CALL_SERVER_ACK_REQUEST && call->state != RXRPC_CALL_SERVER_SEND_REPLY) { /* Reply phase not begun or not complete for service call. 
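
rxrpc_data_ready() above derives the skew as a signed difference between the packet's serial and the highest serial seen on the connection, clamping it to 65535 so it fits the 16-bit maxSkew ACK field it is later carried toward via skb->priority. A small standalone model of that computation:

#include <stdint.h>
#include <stdio.h>

/* How far behind the highest-seen serial is this packet? Clamped to what
 * fits in a 16-bit field; a new high-water mark yields a skew of zero.
 */
static uint32_t serial_skew(uint32_t *hi_serial, uint32_t serial)
{
	int skew = (int)serial - (int)*hi_serial;

	if (skew >= 0) {
		if (skew > 0)
			*hi_serial = serial;	/* new high-water mark */
		return 0;
	}
	skew = -skew;
	return skew < 65535 ? skew : 65535;
}

int main(void)
{
	uint32_t hi = 0;

	printf("%u\n", serial_skew(&hi, 5));	/* 0, hi becomes 5 */
	printf("%u\n", serial_skew(&hi, 3));	/* 2: packet is 2 behind */
	printf("%u\n", serial_skew(&hi, 5));	/* 0: equal, hi unchanged */
	return 0;
}
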
*/ @@ -390,7 +390,7 @@ static int rxrpc_wait_for_tx_window(struct rxrpc_sock *rx, call->acks_winsz), *timeo); - add_wait_queue(&call->tx_waitq, &myself); + add_wait_queue(&call->waitq, &myself); for (;;) { set_current_state(TASK_INTERRUPTIBLE); @@ -408,7 +408,7 @@ static int rxrpc_wait_for_tx_window(struct rxrpc_sock *rx, lock_sock(&rx->sk); } - remove_wait_queue(&call->tx_waitq, &myself); + remove_wait_queue(&call->waitq, &myself); set_current_state(TASK_RUNNING); _leave(" = %d", ret); return ret; @@ -482,6 +482,8 @@ static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb, if (try_to_del_timer_sync(&call->ack_timer) >= 0) { /* the packet may be freed by rxrpc_process_call() before this * returns */ + if (rxrpc_is_client_call(call)) + rxrpc_expose_client_call(call); ret = rxrpc_send_data_packet(call->conn, skb); _net("sent skb %p", skb); } else { @@ -548,6 +550,7 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, skb = call->tx_pending; call->tx_pending = NULL; + rxrpc_see_skb(skb); copied = 0; do { diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c index ced5f07444e5..060fb4892c39 100644 --- a/net/rxrpc/proc.c +++ b/net/rxrpc/proc.c @@ -46,7 +46,9 @@ static void rxrpc_call_seq_stop(struct seq_file *seq, void *v) static int rxrpc_call_seq_show(struct seq_file *seq, void *v) { - struct rxrpc_connection *conn; + struct rxrpc_local *local; + struct rxrpc_sock *rx; + struct rxrpc_peer *peer; struct rxrpc_call *call; char lbuff[4 + 4 + 4 + 4 + 5 + 1], rbuff[4 + 4 + 4 + 4 + 5 + 1]; @@ -60,15 +62,24 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v) call = list_entry(v, struct rxrpc_call, link); - sprintf(lbuff, "%pI4:%u", - &call->local->srx.transport.sin.sin_addr, - ntohs(call->local->srx.transport.sin.sin_port)); + rx = READ_ONCE(call->socket); + if (rx) { + local = READ_ONCE(rx->local); + if (local) + sprintf(lbuff, "%pI4:%u", + &local->srx.transport.sin.sin_addr, + ntohs(local->srx.transport.sin.sin_port)); + else + strcpy(lbuff, "no_local"); + } else { + strcpy(lbuff, "no_socket"); + } - conn = call->conn; - if (conn) + peer = call->peer; + if (peer) sprintf(rbuff, "%pI4:%u", - &conn->params.peer->srx.transport.sin.sin_addr, - ntohs(conn->params.peer->srx.transport.sin.sin_port)); + &peer->srx.transport.sin.sin_addr, + ntohs(peer->srx.transport.sin.sin_port)); else strcpy(rbuff, "no_connection"); @@ -80,7 +91,7 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v) call->service_id, call->cid, call->call_id, - call->in_clientflag ? "Svc" : "Clt", + rxrpc_is_service_call(call) ? 
"Svc" : "Clt", atomic_read(&call->usage), rxrpc_call_states[call->state], call->remote_abort ?: call->local_abort, @@ -115,13 +126,13 @@ const struct file_operations rxrpc_call_seq_fops = { static void *rxrpc_connection_seq_start(struct seq_file *seq, loff_t *_pos) { read_lock(&rxrpc_connection_lock); - return seq_list_start_head(&rxrpc_connections, *_pos); + return seq_list_start_head(&rxrpc_connection_proc_list, *_pos); } static void *rxrpc_connection_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - return seq_list_next(v, &rxrpc_connections, pos); + return seq_list_next(v, &rxrpc_connection_proc_list, pos); } static void rxrpc_connection_seq_stop(struct seq_file *seq, void *v) @@ -134,7 +145,7 @@ static int rxrpc_connection_seq_show(struct seq_file *seq, void *v) struct rxrpc_connection *conn; char lbuff[4 + 4 + 4 + 4 + 5 + 1], rbuff[4 + 4 + 4 + 4 + 5 + 1]; - if (v == &rxrpc_connections) { + if (v == &rxrpc_connection_proc_list) { seq_puts(seq, "Proto Local Remote " " SvID ConnID End Use State Key " @@ -143,7 +154,7 @@ static int rxrpc_connection_seq_show(struct seq_file *seq, void *v) return 0; } - conn = list_entry(v, struct rxrpc_connection, link); + conn = list_entry(v, struct rxrpc_connection, proc_link); sprintf(lbuff, "%pI4:%u", &conn->params.local->srx.transport.sin.sin_addr, @@ -165,7 +176,7 @@ static int rxrpc_connection_seq_show(struct seq_file *seq, void *v) rxrpc_conn_states[conn->state], key_serial(conn->params.key), atomic_read(&conn->serial), - atomic_read(&conn->hi_serial)); + conn->hi_serial); return 0; } diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 9ed66d533002..b964c2d49a88 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -111,6 +111,7 @@ int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, } peek_next_packet: + rxrpc_see_skb(skb); sp = rxrpc_skb(skb); call = sp->call; ASSERT(call != NULL); diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index 63afa9e9cc08..89f475febfd7 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -275,7 +275,7 @@ static int rxkad_secure_packet(struct rxrpc_call *call, memcpy(&iv, call->conn->csum_iv.x, sizeof(iv)); /* calculate the security checksum */ - x = call->channel << (32 - RXRPC_CIDSHIFT); + x = (call->cid & RXRPC_CHANNELMASK) << (32 - RXRPC_CIDSHIFT); x |= sp->hdr.seq & 0x3fffffff; call->crypto_buf[0] = htonl(sp->hdr.callNumber); call->crypto_buf[1] = htonl(x); @@ -507,7 +507,7 @@ static int rxkad_verify_packet(struct rxrpc_call *call, memcpy(&iv, call->conn->csum_iv.x, sizeof(iv)); /* validate the security checksum */ - x = call->channel << (32 - RXRPC_CIDSHIFT); + x = (call->cid & RXRPC_CHANNELMASK) << (32 - RXRPC_CIDSHIFT); x |= sp->hdr.seq & 0x3fffffff; call->crypto_buf[0] = htonl(call->call_id); call->crypto_buf[1] = htonl(x); diff --git a/net/rxrpc/skbuff.c b/net/rxrpc/skbuff.c index 06c51d4b622d..fbd8c74d9505 100644 --- a/net/rxrpc/skbuff.c +++ b/net/rxrpc/skbuff.c @@ -53,9 +53,9 @@ static void rxrpc_request_final_ACK(struct rxrpc_call *call) /* * drop the bottom ACK off of the call ACK window and advance the window */ -static void rxrpc_hard_ACK_data(struct rxrpc_call *call, - struct rxrpc_skb_priv *sp) +static void rxrpc_hard_ACK_data(struct rxrpc_call *call, struct sk_buff *skb) { + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); int loop; u32 seq; @@ -91,8 +91,8 @@ static void rxrpc_hard_ACK_data(struct rxrpc_call *call, * its Tx bufferage. 
*/ _debug("send Rx idle ACK"); - __rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, sp->hdr.serial, - false); + __rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, + skb->priority, sp->hdr.serial, false); } spin_unlock_bh(&call->lock); @@ -125,7 +125,7 @@ void rxrpc_kernel_data_consumed(struct rxrpc_call *call, struct sk_buff *skb) ASSERTCMP(sp->hdr.seq, >, call->rx_data_eaten); call->rx_data_recv = sp->hdr.seq; - rxrpc_hard_ACK_data(call, sp); + rxrpc_hard_ACK_data(call, skb); } EXPORT_SYMBOL(rxrpc_kernel_data_consumed); @@ -163,3 +163,65 @@ void rxrpc_kernel_free_skb(struct sk_buff *skb) rxrpc_free_skb(skb); } EXPORT_SYMBOL(rxrpc_kernel_free_skb); + +/* + * Note the existence of a new-to-us socket buffer (allocated or dequeued). + */ +void rxrpc_new_skb(struct sk_buff *skb) +{ + const void *here = __builtin_return_address(0); + int n = atomic_inc_return(&rxrpc_n_skbs); + trace_rxrpc_skb(skb, 0, atomic_read(&skb->users), n, here); +} + +/* + * Note the re-emergence of a socket buffer from a queue or buffer. + */ +void rxrpc_see_skb(struct sk_buff *skb) +{ + const void *here = __builtin_return_address(0); + if (skb) { + int n = atomic_read(&rxrpc_n_skbs); + trace_rxrpc_skb(skb, 1, atomic_read(&skb->users), n, here); + } +} + +/* + * Note the addition of a ref on a socket buffer. + */ +void rxrpc_get_skb(struct sk_buff *skb) +{ + const void *here = __builtin_return_address(0); + int n = atomic_inc_return(&rxrpc_n_skbs); + trace_rxrpc_skb(skb, 2, atomic_read(&skb->users), n, here); + skb_get(skb); +} + +/* + * Note the destruction of a socket buffer. + */ +void rxrpc_free_skb(struct sk_buff *skb) +{ + const void *here = __builtin_return_address(0); + if (skb) { + int n; + CHECK_SLAB_OKAY(&skb->users); + n = atomic_dec_return(&rxrpc_n_skbs); + trace_rxrpc_skb(skb, 3, atomic_read(&skb->users), n, here); + kfree_skb(skb); + } +} + +/* + * Clear a queue of socket buffers. 
+ */ +void rxrpc_purge_queue(struct sk_buff_head *list) +{ + const void *here = __builtin_return_address(0); + struct sk_buff *skb; + while ((skb = skb_dequeue((list))) != NULL) { + int n = atomic_dec_return(&rxrpc_n_skbs); + trace_rxrpc_skb(skb, 4, atomic_read(&skb->users), n, here); + kfree_skb(skb); + } +} diff --git a/net/rxrpc/sysctl.c b/net/rxrpc/sysctl.c index 03ad08774d4e..dc380af8a81e 100644 --- a/net/rxrpc/sysctl.c +++ b/net/rxrpc/sysctl.c @@ -62,6 +62,22 @@ static struct ctl_table rxrpc_sysctl_table[] = { .proc_handler = proc_dointvec_ms_jiffies, .extra1 = (void *)&one, }, + { + .procname = "idle_conn_expiry", + .data = &rxrpc_conn_idle_client_expiry, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_ms_jiffies, + .extra1 = (void *)&one, + }, + { + .procname = "idle_conn_fast_expiry", + .data = &rxrpc_conn_idle_client_fast_expiry, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_ms_jiffies, + .extra1 = (void *)&one, + }, /* Values measured in seconds but used in jiffies */ { @@ -81,17 +97,24 @@ static struct ctl_table rxrpc_sysctl_table[] = { .extra1 = (void *)&one, }, - /* Values measured in seconds */ + /* Non-time values */ + { + .procname = "max_client_conns", + .data = &rxrpc_max_client_connections, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = (void *)&rxrpc_reap_client_connections, + }, { - .procname = "connection_expiry", - .data = &rxrpc_connection_expiry, + .procname = "reap_client_conns", + .data = &rxrpc_reap_client_connections, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = (void *)&one, + .extra2 = (void *)&rxrpc_max_client_connections, }, - - /* Non-time values */ { .procname = "max_backlog", .data = &rxrpc_max_backlog, diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index 691409de3e1a..59a8d3150ae2 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -43,7 +43,8 @@ static int tcf_vlan(struct sk_buff *skb, const struct tc_action *a, goto drop; break; case TCA_VLAN_ACT_PUSH: - err = skb_vlan_push(skb, v->tcfv_push_proto, v->tcfv_push_vid); + err = skb_vlan_push(skb, v->tcfv_push_proto, v->tcfv_push_vid | + (v->tcfv_push_prio << VLAN_PRIO_SHIFT)); if (err) goto drop; break; @@ -65,6 +66,7 @@ static const struct nla_policy vlan_policy[TCA_VLAN_MAX + 1] = { [TCA_VLAN_PARMS] = { .len = sizeof(struct tc_vlan) }, [TCA_VLAN_PUSH_VLAN_ID] = { .type = NLA_U16 }, [TCA_VLAN_PUSH_VLAN_PROTOCOL] = { .type = NLA_U16 }, + [TCA_VLAN_PUSH_VLAN_PRIORITY] = { .type = NLA_U8 }, }; static int tcf_vlan_init(struct net *net, struct nlattr *nla, @@ -78,6 +80,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, int action; __be16 push_vid = 0; __be16 push_proto = 0; + u8 push_prio = 0; bool exists = false; int ret = 0, err; @@ -123,6 +126,9 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, } else { push_proto = htons(ETH_P_8021Q); } + + if (tb[TCA_VLAN_PUSH_VLAN_PRIORITY]) + push_prio = nla_get_u8(tb[TCA_VLAN_PUSH_VLAN_PRIORITY]); break; default: if (exists) @@ -150,6 +156,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, v->tcfv_action = action; v->tcfv_push_vid = push_vid; + v->tcfv_push_prio = push_prio; v->tcfv_push_proto = push_proto; v->tcf_action = parm->action; @@ -181,7 +188,9 @@ static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a, if (v->tcfv_action == TCA_VLAN_ACT_PUSH && (nla_put_u16(skb, TCA_VLAN_PUSH_VLAN_ID, v->tcfv_push_vid) || 
nla_put_be16(skb, TCA_VLAN_PUSH_VLAN_PROTOCOL, - v->tcfv_push_proto))) + v->tcfv_push_proto) || + (nla_put_u8(skb, TCA_VLAN_PUSH_VLAN_PRIORITY, + v->tcfv_push_prio)))) goto nla_put_failure; tcf_tm_dump(&t, &v->tcf_tm); diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c index 0b8c3ace671f..eb219b78cd49 100644 --- a/net/sched/cls_basic.c +++ b/net/sched/cls_basic.c @@ -138,10 +138,12 @@ static int basic_set_parms(struct net *net, struct tcf_proto *tp, struct tcf_exts e; struct tcf_ematch_tree t; - tcf_exts_init(&e, TCA_BASIC_ACT, TCA_BASIC_POLICE); - err = tcf_exts_validate(net, tp, tb, est, &e, ovr); + err = tcf_exts_init(&e, TCA_BASIC_ACT, TCA_BASIC_POLICE); if (err < 0) return err; + err = tcf_exts_validate(net, tp, tb, est, &e, ovr); + if (err < 0) + goto errout; err = tcf_em_tree_validate(tp, tb[TCA_BASIC_EMATCHES], &t); if (err < 0) @@ -189,7 +191,10 @@ static int basic_change(struct net *net, struct sk_buff *in_skb, if (!fnew) return -ENOBUFS; - tcf_exts_init(&fnew->exts, TCA_BASIC_ACT, TCA_BASIC_POLICE); + err = tcf_exts_init(&fnew->exts, TCA_BASIC_ACT, TCA_BASIC_POLICE); + if (err < 0) + goto errout; + err = -EINVAL; if (handle) { fnew->handle = handle; @@ -226,6 +231,7 @@ static int basic_change(struct net *net, struct sk_buff *in_skb, return 0; errout: + tcf_exts_destroy(&fnew->exts); kfree(fnew); return err; } diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index c3002c2c68bb..4742f415ee5b 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -311,17 +311,19 @@ static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp, if ((!is_bpf && !is_ebpf) || (is_bpf && is_ebpf)) return -EINVAL; - tcf_exts_init(&exts, TCA_BPF_ACT, TCA_BPF_POLICE); - ret = tcf_exts_validate(net, tp, tb, est, &exts, ovr); + ret = tcf_exts_init(&exts, TCA_BPF_ACT, TCA_BPF_POLICE); if (ret < 0) return ret; + ret = tcf_exts_validate(net, tp, tb, est, &exts, ovr); + if (ret < 0) + goto errout; if (tb[TCA_BPF_FLAGS]) { u32 bpf_flags = nla_get_u32(tb[TCA_BPF_FLAGS]); if (bpf_flags & ~TCA_BPF_FLAG_ACT_DIRECT) { - tcf_exts_destroy(&exts); - return -EINVAL; + ret = -EINVAL; + goto errout; } have_exts = bpf_flags & TCA_BPF_FLAG_ACT_DIRECT; @@ -331,10 +333,8 @@ static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp, ret = is_bpf ? 
cls_bpf_prog_from_ops(tb, prog) : cls_bpf_prog_from_efd(tb, prog, tp); - if (ret < 0) { - tcf_exts_destroy(&exts); - return ret; - } + if (ret < 0) + goto errout; if (tb[TCA_BPF_CLASSID]) { prog->res.classid = nla_get_u32(tb[TCA_BPF_CLASSID]); @@ -343,6 +343,10 @@ static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp, tcf_exts_change(tp, &prog->exts, &exts); return 0; + +errout: + tcf_exts_destroy(&exts); + return ret; } static u32 cls_bpf_grab_new_handle(struct tcf_proto *tp, @@ -388,7 +392,9 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, if (!prog) return -ENOBUFS; - tcf_exts_init(&prog->exts, TCA_BPF_ACT, TCA_BPF_POLICE); + ret = tcf_exts_init(&prog->exts, TCA_BPF_ACT, TCA_BPF_POLICE); + if (ret < 0) + goto errout; if (oldprog) { if (handle && oldprog->handle != handle) { @@ -420,9 +426,10 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, *arg = (unsigned long) prog; return 0; + errout: + tcf_exts_destroy(&prog->exts); kfree(prog); - return ret; } diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c index 4c85bd3a750c..85233c470035 100644 --- a/net/sched/cls_cgroup.c +++ b/net/sched/cls_cgroup.c @@ -93,7 +93,9 @@ static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb, if (!new) return -ENOBUFS; - tcf_exts_init(&new->exts, TCA_CGROUP_ACT, TCA_CGROUP_POLICE); + err = tcf_exts_init(&new->exts, TCA_CGROUP_ACT, TCA_CGROUP_POLICE); + if (err < 0) + goto errout; new->handle = handle; new->tp = tp; err = nla_parse_nested(tb, TCA_CGROUP_MAX, tca[TCA_OPTIONS], @@ -101,10 +103,14 @@ static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb, if (err < 0) goto errout; - tcf_exts_init(&e, TCA_CGROUP_ACT, TCA_CGROUP_POLICE); - err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr); + err = tcf_exts_init(&e, TCA_CGROUP_ACT, TCA_CGROUP_POLICE); if (err < 0) goto errout; + err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr); + if (err < 0) { + tcf_exts_destroy(&e); + goto errout; + } err = tcf_em_tree_validate(tp, tb[TCA_CGROUP_EMATCHES], &t); if (err < 0) { @@ -120,6 +126,7 @@ static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb, call_rcu(&head->rcu, cls_cgroup_destroy_rcu); return 0; errout: + tcf_exts_destroy(&new->exts); kfree(new); return err; } diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index fbfec6a18839..2c1ae549edbf 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -418,10 +418,12 @@ static int flow_change(struct net *net, struct sk_buff *in_skb, return -EOPNOTSUPP; } - tcf_exts_init(&e, TCA_FLOW_ACT, TCA_FLOW_POLICE); + err = tcf_exts_init(&e, TCA_FLOW_ACT, TCA_FLOW_POLICE); + if (err < 0) + goto err1; err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr); if (err < 0) - return err; + goto err1; err = tcf_em_tree_validate(tp, tb[TCA_FLOW_EMATCHES], &t); if (err < 0) @@ -432,13 +434,15 @@ static int flow_change(struct net *net, struct sk_buff *in_skb, if (!fnew) goto err2; - tcf_exts_init(&fnew->exts, TCA_FLOW_ACT, TCA_FLOW_POLICE); + err = tcf_exts_init(&fnew->exts, TCA_FLOW_ACT, TCA_FLOW_POLICE); + if (err < 0) + goto err3; fold = (struct flow_filter *)*arg; if (fold) { err = -EINVAL; if (fold->handle != handle && handle) - goto err2; + goto err3; /* Copy fold into fnew */ fnew->tp = fold->tp; @@ -458,31 +462,31 @@ static int flow_change(struct net *net, struct sk_buff *in_skb, if (tb[TCA_FLOW_MODE]) mode = nla_get_u32(tb[TCA_FLOW_MODE]); if (mode != FLOW_MODE_HASH && nkeys > 1) - goto err2; + goto err3; if (mode == FLOW_MODE_HASH) 
perturb_period = fold->perturb_period; if (tb[TCA_FLOW_PERTURB]) { if (mode != FLOW_MODE_HASH) - goto err2; + goto err3; perturb_period = nla_get_u32(tb[TCA_FLOW_PERTURB]) * HZ; } } else { err = -EINVAL; if (!handle) - goto err2; + goto err3; if (!tb[TCA_FLOW_KEYS]) - goto err2; + goto err3; mode = FLOW_MODE_MAP; if (tb[TCA_FLOW_MODE]) mode = nla_get_u32(tb[TCA_FLOW_MODE]); if (mode != FLOW_MODE_HASH && nkeys > 1) - goto err2; + goto err3; if (tb[TCA_FLOW_PERTURB]) { if (mode != FLOW_MODE_HASH) - goto err2; + goto err3; perturb_period = nla_get_u32(tb[TCA_FLOW_PERTURB]) * HZ; } @@ -542,6 +546,8 @@ static int flow_change(struct net *net, struct sk_buff *in_skb, call_rcu(&fold->rcu, flow_destroy_filter); return 0; +err3: + tcf_exts_destroy(&fnew->exts); err2: tcf_em_tree_destroy(&t); kfree(fnew); diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 5060801a2f6d..cf9ad5b50889 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -28,6 +28,7 @@ struct fl_flow_key { struct flow_dissector_key_control control; struct flow_dissector_key_basic basic; struct flow_dissector_key_eth_addrs eth; + struct flow_dissector_key_vlan vlan; struct flow_dissector_key_addrs ipaddrs; union { struct flow_dissector_key_ipv4_addrs ipv4; @@ -293,6 +294,10 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = { [TCA_FLOWER_KEY_TCP_DST] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_UDP_SRC] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_UDP_DST] = { .type = NLA_U16 }, + [TCA_FLOWER_KEY_VLAN_ID] = { .type = NLA_U16 }, + [TCA_FLOWER_KEY_VLAN_PRIO] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_VLAN_ETH_TYPE] = { .type = NLA_U16 }, + }; static void fl_set_key_val(struct nlattr **tb, @@ -308,9 +313,29 @@ static void fl_set_key_val(struct nlattr **tb, memcpy(mask, nla_data(tb[mask_type]), len); } +static void fl_set_key_vlan(struct nlattr **tb, + struct flow_dissector_key_vlan *key_val, + struct flow_dissector_key_vlan *key_mask) +{ +#define VLAN_PRIORITY_MASK 0x7 + + if (tb[TCA_FLOWER_KEY_VLAN_ID]) { + key_val->vlan_id = + nla_get_u16(tb[TCA_FLOWER_KEY_VLAN_ID]) & VLAN_VID_MASK; + key_mask->vlan_id = VLAN_VID_MASK; + } + if (tb[TCA_FLOWER_KEY_VLAN_PRIO]) { + key_val->vlan_priority = + nla_get_u8(tb[TCA_FLOWER_KEY_VLAN_PRIO]) & + VLAN_PRIORITY_MASK; + key_mask->vlan_priority = VLAN_PRIORITY_MASK; + } +} + static int fl_set_key(struct net *net, struct nlattr **tb, struct fl_flow_key *key, struct fl_flow_key *mask) { + __be16 ethertype; #ifdef CONFIG_NET_CLS_IND if (tb[TCA_FLOWER_INDEV]) { int err = tcf_change_indev(net, tb[TCA_FLOWER_INDEV]); @@ -328,9 +353,20 @@ static int fl_set_key(struct net *net, struct nlattr **tb, mask->eth.src, TCA_FLOWER_KEY_ETH_SRC_MASK, sizeof(key->eth.src)); - fl_set_key_val(tb, &key->basic.n_proto, TCA_FLOWER_KEY_ETH_TYPE, - &mask->basic.n_proto, TCA_FLOWER_UNSPEC, - sizeof(key->basic.n_proto)); + if (tb[TCA_FLOWER_KEY_ETH_TYPE]) { + ethertype = nla_get_be16(tb[TCA_FLOWER_KEY_ETH_TYPE]); + + if (ethertype == htons(ETH_P_8021Q)) { + fl_set_key_vlan(tb, &key->vlan, &mask->vlan); + fl_set_key_val(tb, &key->basic.n_proto, + TCA_FLOWER_KEY_VLAN_ETH_TYPE, + &mask->basic.n_proto, TCA_FLOWER_UNSPEC, + sizeof(key->basic.n_proto)); + } else { + key->basic.n_proto = ethertype; + mask->basic.n_proto = cpu_to_be16(~0); + } + } if (key->basic.n_proto == htons(ETH_P_IP) || key->basic.n_proto == htons(ETH_P_IPV6)) { @@ -404,12 +440,10 @@ static int fl_init_hashtable(struct cls_fl_head *head, #define FL_KEY_MEMBER_OFFSET(member) offsetof(struct fl_flow_key, member) #define 
FL_KEY_MEMBER_SIZE(member) (sizeof(((struct fl_flow_key *) 0)->member)) -#define FL_KEY_MEMBER_END_OFFSET(member) \ - (FL_KEY_MEMBER_OFFSET(member) + FL_KEY_MEMBER_SIZE(member)) -#define FL_KEY_IN_RANGE(mask, member) \ - (FL_KEY_MEMBER_OFFSET(member) <= (mask)->range.end && \ - FL_KEY_MEMBER_END_OFFSET(member) >= (mask)->range.start) +#define FL_KEY_IS_MASKED(mask, member) \ + memchr_inv(((char *)mask) + FL_KEY_MEMBER_OFFSET(member), \ + 0, FL_KEY_MEMBER_SIZE(member)) \ #define FL_KEY_SET(keys, cnt, id, member) \ do { \ @@ -418,9 +452,9 @@ static int fl_init_hashtable(struct cls_fl_head *head, cnt++; \ } while(0); -#define FL_KEY_SET_IF_IN_RANGE(mask, keys, cnt, id, member) \ +#define FL_KEY_SET_IF_MASKED(mask, keys, cnt, id, member) \ do { \ - if (FL_KEY_IN_RANGE(mask, member)) \ + if (FL_KEY_IS_MASKED(mask, member)) \ FL_KEY_SET(keys, cnt, id, member); \ } while(0); @@ -432,14 +466,16 @@ static void fl_init_dissector(struct cls_fl_head *head, FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_CONTROL, control); FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_BASIC, basic); - FL_KEY_SET_IF_IN_RANGE(mask, keys, cnt, - FLOW_DISSECTOR_KEY_ETH_ADDRS, eth); - FL_KEY_SET_IF_IN_RANGE(mask, keys, cnt, - FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4); - FL_KEY_SET_IF_IN_RANGE(mask, keys, cnt, - FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6); - FL_KEY_SET_IF_IN_RANGE(mask, keys, cnt, - FLOW_DISSECTOR_KEY_PORTS, tp); + FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, + FLOW_DISSECTOR_KEY_ETH_ADDRS, eth); + FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, + FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4); + FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, + FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6); + FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, + FLOW_DISSECTOR_KEY_PORTS, tp); + FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, + FLOW_DISSECTOR_KEY_VLAN, vlan); skb_flow_dissector_init(&head->dissector, keys, cnt); } @@ -478,10 +514,12 @@ static int fl_set_parms(struct net *net, struct tcf_proto *tp, struct tcf_exts e; int err; - tcf_exts_init(&e, TCA_FLOWER_ACT, 0); - err = tcf_exts_validate(net, tp, tb, est, &e, ovr); + err = tcf_exts_init(&e, TCA_FLOWER_ACT, 0); if (err < 0) return err; + err = tcf_exts_validate(net, tp, tb, est, &e, ovr); + if (err < 0) + goto errout; if (tb[TCA_FLOWER_CLASSID]) { f->res.classid = nla_get_u32(tb[TCA_FLOWER_CLASSID]); @@ -550,7 +588,9 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, if (!fnew) return -ENOBUFS; - tcf_exts_init(&fnew->exts, TCA_FLOWER_ACT, 0); + err = tcf_exts_init(&fnew->exts, TCA_FLOWER_ACT, 0); + if (err < 0) + goto errout; if (!handle) { handle = fl_grab_new_handle(tp, head); @@ -614,6 +654,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, return 0; errout: + tcf_exts_destroy(&fnew->exts); kfree(fnew); return err; } @@ -668,6 +709,29 @@ static int fl_dump_key_val(struct sk_buff *skb, return 0; } +static int fl_dump_key_vlan(struct sk_buff *skb, + struct flow_dissector_key_vlan *vlan_key, + struct flow_dissector_key_vlan *vlan_mask) +{ + int err; + + if (!memchr_inv(vlan_mask, 0, sizeof(*vlan_mask))) + return 0; + if (vlan_mask->vlan_id) { + err = nla_put_u16(skb, TCA_FLOWER_KEY_VLAN_ID, + vlan_key->vlan_id); + if (err) + return err; + } + if (vlan_mask->vlan_priority) { + err = nla_put_u8(skb, TCA_FLOWER_KEY_VLAN_PRIO, + vlan_key->vlan_priority); + if (err) + return err; + } + return 0; +} + static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, struct sk_buff *skb, struct tcmsg *t) { @@ -712,6 +776,10 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, 
unsigned long fh, &mask->basic.n_proto, TCA_FLOWER_UNSPEC, sizeof(key->basic.n_proto))) goto nla_put_failure; + + if (fl_dump_key_vlan(skb, &key->vlan, &mask->vlan)) + goto nla_put_failure; + if ((key->basic.n_proto == htons(ETH_P_IP) || key->basic.n_proto == htons(ETH_P_IPV6)) && fl_dump_key_val(skb, &key->basic.ip_proto, TCA_FLOWER_KEY_IP_PROTO, diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c index f23a3b68bba6..cc0bda945800 100644 --- a/net/sched/cls_fw.c +++ b/net/sched/cls_fw.c @@ -195,10 +195,12 @@ fw_change_attrs(struct net *net, struct tcf_proto *tp, struct fw_filter *f, u32 mask; int err; - tcf_exts_init(&e, TCA_FW_ACT, TCA_FW_POLICE); - err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr); + err = tcf_exts_init(&e, TCA_FW_ACT, TCA_FW_POLICE); if (err < 0) return err; + err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr); + if (err < 0) + goto errout; if (tb[TCA_FW_CLASSID]) { f->res.classid = nla_get_u32(tb[TCA_FW_CLASSID]); @@ -270,10 +272,15 @@ static int fw_change(struct net *net, struct sk_buff *in_skb, #endif /* CONFIG_NET_CLS_IND */ fnew->tp = f->tp; - tcf_exts_init(&fnew->exts, TCA_FW_ACT, TCA_FW_POLICE); + err = tcf_exts_init(&fnew->exts, TCA_FW_ACT, TCA_FW_POLICE); + if (err < 0) { + kfree(fnew); + return err; + } err = fw_change_attrs(net, tp, fnew, tb, tca, base, ovr); if (err < 0) { + tcf_exts_destroy(&fnew->exts); kfree(fnew); return err; } @@ -313,7 +320,9 @@ static int fw_change(struct net *net, struct sk_buff *in_skb, if (f == NULL) return -ENOBUFS; - tcf_exts_init(&f->exts, TCA_FW_ACT, TCA_FW_POLICE); + err = tcf_exts_init(&f->exts, TCA_FW_ACT, TCA_FW_POLICE); + if (err < 0) + goto errout; f->id = handle; f->tp = tp; @@ -328,6 +337,7 @@ static int fw_change(struct net *net, struct sk_buff *in_skb, return 0; errout: + tcf_exts_destroy(&f->exts); kfree(f); return err; } diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c index 08a3b0a6f5ab..c91e65d81a48 100644 --- a/net/sched/cls_route.c +++ b/net/sched/cls_route.c @@ -383,17 +383,19 @@ static int route4_set_parms(struct net *net, struct tcf_proto *tp, struct nlattr **tb, struct nlattr *est, int new, bool ovr) { - int err; u32 id = 0, to = 0, nhandle = 0x8000; struct route4_filter *fp; unsigned int h1; struct route4_bucket *b; struct tcf_exts e; + int err; - tcf_exts_init(&e, TCA_ROUTE4_ACT, TCA_ROUTE4_POLICE); - err = tcf_exts_validate(net, tp, tb, est, &e, ovr); + err = tcf_exts_init(&e, TCA_ROUTE4_ACT, TCA_ROUTE4_POLICE); if (err < 0) return err; + err = tcf_exts_validate(net, tp, tb, est, &e, ovr); + if (err < 0) + goto errout; err = -EINVAL; if (tb[TCA_ROUTE4_TO]) { @@ -503,7 +505,10 @@ static int route4_change(struct net *net, struct sk_buff *in_skb, if (!f) goto errout; - tcf_exts_init(&f->exts, TCA_ROUTE4_ACT, TCA_ROUTE4_POLICE); + err = tcf_exts_init(&f->exts, TCA_ROUTE4_ACT, TCA_ROUTE4_POLICE); + if (err < 0) + goto errout; + if (fold) { f->id = fold->id; f->iif = fold->iif; @@ -557,6 +562,7 @@ static int route4_change(struct net *net, struct sk_buff *in_skb, return 0; errout: + tcf_exts_destroy(&f->exts); kfree(f); return err; } diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h index f9c9fc075fe6..4f05a19fb073 100644 --- a/net/sched/cls_rsvp.h +++ b/net/sched/cls_rsvp.h @@ -487,10 +487,12 @@ static int rsvp_change(struct net *net, struct sk_buff *in_skb, if (err < 0) return err; - tcf_exts_init(&e, TCA_RSVP_ACT, TCA_RSVP_POLICE); - err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr); + err = tcf_exts_init(&e, TCA_RSVP_ACT, TCA_RSVP_POLICE); if (err < 0) return 
err; + err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr); + if (err < 0) + goto errout2; f = (struct rsvp_filter *)*arg; if (f) { @@ -506,7 +508,11 @@ static int rsvp_change(struct net *net, struct sk_buff *in_skb, goto errout2; } - tcf_exts_init(&n->exts, TCA_RSVP_ACT, TCA_RSVP_POLICE); + err = tcf_exts_init(&n->exts, TCA_RSVP_ACT, TCA_RSVP_POLICE); + if (err < 0) { + kfree(n); + goto errout2; + } if (tb[TCA_RSVP_CLASSID]) { n->res.classid = nla_get_u32(tb[TCA_RSVP_CLASSID]); @@ -530,7 +536,9 @@ static int rsvp_change(struct net *net, struct sk_buff *in_skb, if (f == NULL) goto errout2; - tcf_exts_init(&f->exts, TCA_RSVP_ACT, TCA_RSVP_POLICE); + err = tcf_exts_init(&f->exts, TCA_RSVP_ACT, TCA_RSVP_POLICE); + if (err < 0) + goto errout; h2 = 16; if (tb[TCA_RSVP_SRC]) { memcpy(f->src, nla_data(tb[TCA_RSVP_SRC]), sizeof(f->src)); @@ -627,6 +635,7 @@ insert: goto insert; errout: + tcf_exts_destroy(&f->exts); kfree(f); errout2: tcf_exts_destroy(&e); diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index 944c8ff45055..d9500709831f 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -219,10 +219,10 @@ static const struct nla_policy tcindex_policy[TCA_TCINDEX_MAX + 1] = { [TCA_TCINDEX_CLASSID] = { .type = NLA_U32 }, }; -static void tcindex_filter_result_init(struct tcindex_filter_result *r) +static int tcindex_filter_result_init(struct tcindex_filter_result *r) { memset(r, 0, sizeof(*r)); - tcf_exts_init(&r->exts, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); + return tcf_exts_init(&r->exts, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); } static void __tcindex_partial_destroy(struct rcu_head *head) @@ -233,23 +233,57 @@ static void __tcindex_partial_destroy(struct rcu_head *head) kfree(p); } +static void tcindex_free_perfect_hash(struct tcindex_data *cp) +{ + int i; + + for (i = 0; i < cp->hash; i++) + tcf_exts_destroy(&cp->perfect[i].exts); + kfree(cp->perfect); +} + +static int tcindex_alloc_perfect_hash(struct tcindex_data *cp) +{ + int i, err = 0; + + cp->perfect = kcalloc(cp->hash, sizeof(struct tcindex_filter_result), + GFP_KERNEL); + if (!cp->perfect) + return -ENOMEM; + + for (i = 0; i < cp->hash; i++) { + err = tcf_exts_init(&cp->perfect[i].exts, + TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); + if (err < 0) + goto errout; + } + + return 0; + +errout: + tcindex_free_perfect_hash(cp); + return err; +} + static int tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, u32 handle, struct tcindex_data *p, struct tcindex_filter_result *r, struct nlattr **tb, struct nlattr *est, bool ovr) { - int err, balloc = 0; struct tcindex_filter_result new_filter_result, *old_r = r; struct tcindex_filter_result cr; - struct tcindex_data *cp, *oldp; + struct tcindex_data *cp = NULL, *oldp; struct tcindex_filter *f = NULL; /* make gcc behave */ + int err, balloc = 0; struct tcf_exts e; - tcf_exts_init(&e, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); - err = tcf_exts_validate(net, tp, tb, est, &e, ovr); + err = tcf_exts_init(&e, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); if (err < 0) return err; + err = tcf_exts_validate(net, tp, tb, est, &e, ovr); + if (err < 0) + goto errout; err = -ENOMEM; /* tcindex_data attributes must look atomic to classifier/lookup so @@ -270,19 +304,20 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, if (p->perfect) { int i; - cp->perfect = kmemdup(p->perfect, - sizeof(*r) * cp->hash, GFP_KERNEL); - if (!cp->perfect) + if (tcindex_alloc_perfect_hash(cp) < 0) goto errout; for (i = 0; i < cp->hash; i++) - 
tcf_exts_init(&cp->perfect[i].exts, - TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); + cp->perfect[i].res = p->perfect[i].res; balloc = 1; } cp->h = p->h; - tcindex_filter_result_init(&new_filter_result); - tcindex_filter_result_init(&cr); + err = tcindex_filter_result_init(&new_filter_result); + if (err < 0) + goto errout1; + err = tcindex_filter_result_init(&cr); + if (err < 0) + goto errout1; if (old_r) cr.res = r->res; @@ -338,15 +373,8 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, err = -ENOMEM; if (!cp->perfect && !cp->h) { if (valid_perfect_hash(cp)) { - int i; - - cp->perfect = kcalloc(cp->hash, sizeof(*r), GFP_KERNEL); - if (!cp->perfect) + if (tcindex_alloc_perfect_hash(cp) < 0) goto errout_alloc; - for (i = 0; i < cp->hash; i++) - tcf_exts_init(&cp->perfect[i].exts, - TCA_TCINDEX_ACT, - TCA_TCINDEX_POLICE); balloc = 1; } else { struct tcindex_filter __rcu **hash; @@ -373,8 +401,12 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, if (!f) goto errout_alloc; f->key = handle; - tcindex_filter_result_init(&f->result); f->next = NULL; + err = tcindex_filter_result_init(&f->result); + if (err < 0) { + kfree(f); + goto errout_alloc; + } } if (tb[TCA_TCINDEX_CLASSID]) { @@ -387,8 +419,13 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, else tcf_exts_change(tp, &cr.exts, &e); - if (old_r && old_r != r) - tcindex_filter_result_init(old_r); + if (old_r && old_r != r) { + err = tcindex_filter_result_init(old_r); + if (err < 0) { + kfree(f); + goto errout_alloc; + } + } oldp = p; r->res = cr.res; @@ -415,9 +452,12 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, errout_alloc: if (balloc == 1) - kfree(cp->perfect); + tcindex_free_perfect_hash(cp); else if (balloc == 2) kfree(cp->h); +errout1: + tcf_exts_destroy(&cr.exts); + tcf_exts_destroy(&new_filter_result.exts); errout: kfree(cp); tcf_exts_destroy(&e); diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index ffe593efe930..a29263a9d8c1 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -709,13 +709,15 @@ static int u32_set_parms(struct net *net, struct tcf_proto *tp, struct tc_u_knode *n, struct nlattr **tb, struct nlattr *est, bool ovr) { - int err; struct tcf_exts e; + int err; - tcf_exts_init(&e, TCA_U32_ACT, TCA_U32_POLICE); - err = tcf_exts_validate(net, tp, tb, est, &e, ovr); + err = tcf_exts_init(&e, TCA_U32_ACT, TCA_U32_POLICE); if (err < 0) return err; + err = tcf_exts_validate(net, tp, tb, est, &e, ovr); + if (err < 0) + goto errout; err = -EINVAL; if (tb[TCA_U32_LINK]) { @@ -833,7 +835,10 @@ static struct tc_u_knode *u32_init_knode(struct tcf_proto *tp, new->tp = tp; memcpy(&new->sel, s, sizeof(*s) + s->nkeys*sizeof(struct tc_u32_key)); - tcf_exts_init(&new->exts, TCA_U32_ACT, TCA_U32_POLICE); + if (tcf_exts_init(&new->exts, TCA_U32_ACT, TCA_U32_POLICE)) { + kfree(new); + return NULL; + } return new; } @@ -985,9 +990,12 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, n->handle = handle; n->fshift = s->hmask ? 
ffs(ntohl(s->hmask)) - 1 : 0; n->flags = flags; - tcf_exts_init(&n->exts, TCA_U32_ACT, TCA_U32_POLICE); n->tp = tp; + err = tcf_exts_init(&n->exts, TCA_U32_ACT, TCA_U32_POLICE); + if (err < 0) + goto errout; + #ifdef CONFIG_CLS_U32_MARK n->pcpu_success = alloc_percpu(u32); if (!n->pcpu_success) { @@ -1028,9 +1036,10 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, errhw: #ifdef CONFIG_CLS_U32_MARK free_percpu(n->pcpu_success); -errout: #endif +errout: + tcf_exts_destroy(&n->exts); #ifdef CONFIG_CLS_U32_PERF free_percpu(n->pf); #endif diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 12ebde845523..d677b3484d81 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -29,6 +29,7 @@ #include <linux/hrtimer.h> #include <linux/lockdep.h> #include <linux/slab.h> +#include <linux/hashtable.h> #include <net/net_namespace.h> #include <net/sock.h> @@ -259,37 +260,40 @@ static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle) { struct Qdisc *q; + if (!qdisc_dev(root)) + return (root->handle == handle ? root : NULL); + if (!(root->flags & TCQ_F_BUILTIN) && root->handle == handle) return root; - list_for_each_entry_rcu(q, &root->list, list) { + hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle) { if (q->handle == handle) return q; } return NULL; } -void qdisc_list_add(struct Qdisc *q) +void qdisc_hash_add(struct Qdisc *q) { if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) { struct Qdisc *root = qdisc_dev(q)->qdisc; WARN_ON_ONCE(root == &noop_qdisc); ASSERT_RTNL(); - list_add_tail_rcu(&q->list, &root->list); + hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle); } } -EXPORT_SYMBOL(qdisc_list_add); +EXPORT_SYMBOL(qdisc_hash_add); -void qdisc_list_del(struct Qdisc *q) +void qdisc_hash_del(struct Qdisc *q) { if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) { ASSERT_RTNL(); - list_del_rcu(&q->list); + hash_del_rcu(&q->hash); } } -EXPORT_SYMBOL(qdisc_list_del); +EXPORT_SYMBOL(qdisc_hash_del); struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle) { @@ -998,7 +1002,7 @@ qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue, goto err_out4; } - qdisc_list_add(sch); + qdisc_hash_add(sch); return sch; } @@ -1431,10 +1435,11 @@ err_out: static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb, struct netlink_callback *cb, - int *q_idx_p, int s_q_idx) + int *q_idx_p, int s_q_idx, bool recur) { int ret = 0, q_idx = *q_idx_p; struct Qdisc *q; + int b; if (!root) return 0; @@ -1449,7 +1454,17 @@ static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb, goto done; q_idx++; } - list_for_each_entry(q, &root->list, list) { + + /* If dumping singletons, there is no qdisc_dev(root) and the singleton + * itself has already been dumped. 
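
The sch_api conversion above replaces the per-root linked list of qdiscs with a per-device hashtable keyed on the 32-bit handle, so lookup walks a single bucket instead of every qdisc. A toy model of that keyed lookup, with plain singly linked buckets standing in for the kernel's hlist/RCU machinery (names and hash function are illustrative):

#include <stdint.h>
#include <stdio.h>

#define HASH_BITS	4
#define HASH_SIZE	(1u << HASH_BITS)

struct qdisc_model {
	uint32_t handle;
	struct qdisc_model *next;	/* bucket chain */
};

static struct qdisc_model *buckets[HASH_SIZE];

static unsigned int bucket_of(uint32_t handle)
{
	return (handle * 2654435761u) >> (32 - HASH_BITS);	/* Knuth */
}

static void hash_add(struct qdisc_model *q)
{
	unsigned int b = bucket_of(q->handle);

	q->next = buckets[b];
	buckets[b] = q;
}

static struct qdisc_model *hash_lookup(uint32_t handle)
{
	struct qdisc_model *q;

	/* Only the matching bucket is walked, not every qdisc. */
	for (q = buckets[bucket_of(handle)]; q; q = q->next)
		if (q->handle == handle)
			return q;
	return NULL;
}

int main(void)
{
	struct qdisc_model a = { .handle = 0x8001 }, b = { .handle = 0x8002 };

	hash_add(&a);
	hash_add(&b);
	printf("found %p\n", (void *)hash_lookup(0x8002));
	return 0;
}
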
+ * + * If we've already dumped the top-level (ingress) qdisc above and the global + * qdisc hashtable, we don't want to hit it again + */ + if (!qdisc_dev(root) || !recur) + goto out; + + hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) { if (q_idx < s_q_idx) { q_idx++; continue; @@ -1490,13 +1505,13 @@ static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb) s_q_idx = 0; q_idx = 0; - if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx) < 0) + if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx, true) < 0) goto done; dev_queue = dev_ingress_queue(dev); if (dev_queue && tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb, - &q_idx, s_q_idx) < 0) + &q_idx, s_q_idx, false) < 0) goto done; cont: @@ -1765,6 +1780,7 @@ static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb, int *t_p, int s_t) { struct Qdisc *q; + int b; if (!root) return 0; @@ -1772,7 +1788,10 @@ static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb, if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0) return -1; - list_for_each_entry(q, &root->list, list) { + if (!qdisc_dev(root)) + return 0; + + hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) { if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0) return -1; } diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 657c13362b19..0d21b567ff27 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -423,7 +423,6 @@ struct Qdisc noop_qdisc = { .dequeue = noop_dequeue, .flags = TCQ_F_BUILTIN, .ops = &noop_qdisc_ops, - .list = LIST_HEAD_INIT(noop_qdisc.list), .q.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock), .dev_queue = &noop_netdev_queue, .running = SEQCNT_ZERO(noop_qdisc.running), @@ -613,7 +612,6 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p); sch->padded = (char *) sch - (char *) p; } - INIT_LIST_HEAD(&sch->list); skb_queue_head_init(&sch->q); spin_lock_init(&sch->busylock); @@ -701,7 +699,7 @@ void qdisc_destroy(struct Qdisc *qdisc) return; #ifdef CONFIG_NET_SCHED - qdisc_list_del(qdisc); + qdisc_hash_del(qdisc); qdisc_put_stab(rtnl_dereference(qdisc->stab)); #endif @@ -789,6 +787,10 @@ static void attach_default_qdiscs(struct net_device *dev) qdisc->ops->attach(qdisc); } } +#ifdef CONFIG_NET_SCHED + if (dev->qdisc) + qdisc_hash_add(dev->qdisc); +#endif } static void transition_one_qdisc(struct net_device *dev, diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index 3ddc7bd74ecb..000f1d36128e 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -142,8 +142,6 @@ struct hfsc_class { link-sharing, max(myf, cfmin) */ u64 cl_myf; /* my fit-time (calculated from this class's own upperlimit curve) */ - u64 cl_myfadj; /* my fit-time adjustment (to cancel - history dependence) */ u64 cl_cfmin; /* earliest children's fit-time (used with cl_myf to obtain cl_f) */ u64 cl_cvtmin; /* minimal virtual time among the @@ -151,11 +149,8 @@ struct hfsc_class { (monotonic within a period) */ u64 cl_vtadj; /* intra-period cumulative vt adjustment */ - u64 cl_vtoff; /* inter-period cumulative vt offset */ - u64 cl_cvtmax; /* max child's vt in the last period */ - u64 cl_cvtoff; /* cumulative cvtmax of all periods */ - u64 cl_pcvtoff; /* parent's cvtoff at initialization - time */ + u64 cl_cvtoff; /* largest virtual time seen among + the children */ struct internal_sc cl_rsc; /* internal real-time service curve */ struct internal_sc cl_fsc; /* internal fair service curve */ @@ -701,28 +696,16 @@ 
init_vf(struct hfsc_class *cl, unsigned int len) } else { /* * first child for a new parent backlog period. - * add parent's cvtmax to cvtoff to make a new - * vt (vtoff + vt) larger than the vt in the - * last period for all children. + * initialize cl_vt to the highest value seen + * among the siblings. this is analogous to + * what cur_time would provide in realtime case. */ - vt = cl->cl_parent->cl_cvtmax; - cl->cl_parent->cl_cvtoff += vt; - cl->cl_parent->cl_cvtmax = 0; + cl->cl_vt = cl->cl_parent->cl_cvtoff; cl->cl_parent->cl_cvtmin = 0; - cl->cl_vt = 0; } - cl->cl_vtoff = cl->cl_parent->cl_cvtoff - - cl->cl_pcvtoff; - /* update the virtual curve */ - vt = cl->cl_vt + cl->cl_vtoff; - rtsc_min(&cl->cl_virtual, &cl->cl_fsc, vt, - cl->cl_total); - if (cl->cl_virtual.x == vt) { - cl->cl_virtual.x -= cl->cl_vtoff; - cl->cl_vtoff = 0; - } + rtsc_min(&cl->cl_virtual, &cl->cl_fsc, cl->cl_vt, cl->cl_total); cl->cl_vtadj = 0; cl->cl_vtperiod++; /* increment vt period */ @@ -745,7 +728,6 @@ init_vf(struct hfsc_class *cl, unsigned int len) /* compute myf */ cl->cl_myf = rtsc_y2x(&cl->cl_ulimit, cl->cl_total); - cl->cl_myfadj = 0; } } @@ -779,8 +761,7 @@ update_vf(struct hfsc_class *cl, unsigned int len, u64 cur_time) go_passive = 0; /* update vt */ - cl->cl_vt = rtsc_y2x(&cl->cl_virtual, cl->cl_total) - - cl->cl_vtoff + cl->cl_vtadj; + cl->cl_vt = rtsc_y2x(&cl->cl_virtual, cl->cl_total) + cl->cl_vtadj; /* * if vt of the class is smaller than cvtmin, @@ -795,9 +776,9 @@ update_vf(struct hfsc_class *cl, unsigned int len, u64 cur_time) if (go_passive) { /* no more active child, going passive */ - /* update cvtmax of the parent class */ - if (cl->cl_vt > cl->cl_parent->cl_cvtmax) - cl->cl_parent->cl_cvtmax = cl->cl_vt; + /* update cvtoff of the parent class */ + if (cl->cl_vt > cl->cl_parent->cl_cvtoff) + cl->cl_parent->cl_cvtoff = cl->cl_vt; /* remove this class from the vt tree */ vttree_remove(cl); @@ -813,9 +794,10 @@ update_vf(struct hfsc_class *cl, unsigned int len, u64 cur_time) /* update f */ if (cl->cl_flags & HFSC_USC) { + cl->cl_myf = rtsc_y2x(&cl->cl_ulimit, cl->cl_total); +#if 0 cl->cl_myf = cl->cl_myfadj + rtsc_y2x(&cl->cl_ulimit, cl->cl_total); -#if 0 /* * This code causes classes to stay way under their * limit when multiple classes are used at gigabit @@ -940,7 +922,7 @@ static void hfsc_change_fsc(struct hfsc_class *cl, struct tc_service_curve *fsc) { sc2isc(fsc, &cl->cl_fsc); - rtsc_init(&cl->cl_virtual, &cl->cl_fsc, cl->cl_vtoff + cl->cl_vt, cl->cl_total); + rtsc_init(&cl->cl_virtual, &cl->cl_fsc, cl->cl_vt, cl->cl_total); cl->cl_flags |= HFSC_FSC; } @@ -1094,7 +1076,6 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (parent->level == 0) hfsc_purge_queue(sch, parent); hfsc_adjust_levels(parent); - cl->cl_pcvtoff = parent->cl_cvtoff; sch_tree_unlock(sch); qdisc_class_hash_grow(sch, &q->clhash); @@ -1482,16 +1463,12 @@ hfsc_reset_class(struct hfsc_class *cl) cl->cl_e = 0; cl->cl_vt = 0; cl->cl_vtadj = 0; - cl->cl_vtoff = 0; cl->cl_cvtmin = 0; - cl->cl_cvtmax = 0; cl->cl_cvtoff = 0; - cl->cl_pcvtoff = 0; cl->cl_vtperiod = 0; cl->cl_parentperiod = 0; cl->cl_f = 0; cl->cl_myf = 0; - cl->cl_myfadj = 0; cl->cl_cfmin = 0; cl->cl_nactive = 0; diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c index b9439827c172..2bc8d7f8df16 100644 --- a/net/sched/sch_mq.c +++ b/net/sched/sch_mq.c @@ -88,7 +88,7 @@ static void mq_attach(struct Qdisc *sch) qdisc_destroy(old); #ifdef CONFIG_NET_SCHED if (ntx < dev->real_num_tx_queues) - qdisc_list_add(qdisc); + qdisc_hash_add(qdisc); 
#endif } diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index 549c66359924..b5c502c78143 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -182,7 +182,7 @@ static void mqprio_attach(struct Qdisc *sch) if (old) qdisc_destroy(old); if (ntx < dev->real_num_tx_queues) - qdisc_list_add(qdisc); + qdisc_hash_add(qdisc); } kfree(priv->qdiscs); priv->qdiscs = NULL; diff --git a/net/strparser/Kconfig b/net/strparser/Kconfig new file mode 100644 index 000000000000..6cff3f6d0c3a --- /dev/null +++ b/net/strparser/Kconfig @@ -0,0 +1,4 @@ + +config STREAM_PARSER + tristate + default n diff --git a/net/strparser/Makefile b/net/strparser/Makefile new file mode 100644 index 000000000000..858a126ebaa0 --- /dev/null +++ b/net/strparser/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_STREAM_PARSER) += strparser.o diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c new file mode 100644 index 000000000000..5c7549b5b92c --- /dev/null +++ b/net/strparser/strparser.c @@ -0,0 +1,510 @@ +/* + * Stream Parser + * + * Copyright (c) 2016 Tom Herbert <tom@herbertland.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + */ + +#include <linux/bpf.h> +#include <linux/errno.h> +#include <linux/errqueue.h> +#include <linux/file.h> +#include <linux/in.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/net.h> +#include <linux/netdevice.h> +#include <linux/poll.h> +#include <linux/rculist.h> +#include <linux/skbuff.h> +#include <linux/socket.h> +#include <linux/uaccess.h> +#include <linux/workqueue.h> +#include <net/strparser.h> +#include <net/netns/generic.h> +#include <net/sock.h> + +static struct workqueue_struct *strp_wq; + +struct _strp_rx_msg { + /* Internal cb structure. struct strp_rx_msg must be first for passing + * to upper layer. 
+ */
+	struct strp_rx_msg strp;
+	int accum_len;
+	int early_eaten;
+};
+
+static inline struct _strp_rx_msg *_strp_rx_msg(struct sk_buff *skb)
+{
+	return (struct _strp_rx_msg *)((void *)skb->cb +
+		offsetof(struct qdisc_skb_cb, data));
+}
+
+/* Lower lock held */
+static void strp_abort_rx_strp(struct strparser *strp, int err)
+{
+	struct sock *csk = strp->sk;
+
+	/* Unrecoverable error in receive */
+
+	del_timer(&strp->rx_msg_timer);
+
+	if (strp->rx_stopped)
+		return;
+
+	strp->rx_stopped = 1;
+
+	/* Report an error on the lower socket */
+	csk->sk_err = err;
+	csk->sk_error_report(csk);
+}
+
+static void strp_start_rx_timer(struct strparser *strp)
+{
+	if (strp->sk->sk_rcvtimeo)
+		mod_timer(&strp->rx_msg_timer, strp->sk->sk_rcvtimeo);
+}
+
+/* Lower lock held */
+static void strp_parser_err(struct strparser *strp, int err,
+			    read_descriptor_t *desc)
+{
+	desc->error = err;
+	kfree_skb(strp->rx_skb_head);
+	strp->rx_skb_head = NULL;
+	strp->cb.abort_parser(strp, err);
+}
+
+static inline int strp_peek_len(struct strparser *strp)
+{
+	struct socket *sock = strp->sk->sk_socket;
+
+	return sock->ops->peek_len(sock);
+}
+
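An aside on the callback contract that strp_recv() below enforces: parse_msg returns 0 when it cannot yet determine the message length, a negative errno (for example -ESTRPIPE) to abort parsing, and otherwise the total length of the message that starts at the recorded offset. A minimal sketch of such a callback, assuming a hypothetical protocol that prefixes every message with a 4-byte big-endian length covering header plus payload; the example_* names are invented here, and strp_rx_msg() is the cb-area accessor expected from the companion include/net/strparser.h header, which is outside this net/-only diff:

static int example_parse_msg(struct strparser *strp, struct sk_buff *skb)
{
	struct strp_rx_msg *rxm = strp_rx_msg(skb);
	__be32 len_field;
	int err;

	/* Not even the length header is available yet; return 0 so
	 * strparser accumulates more data and calls us again.
	 */
	if (skb->len < rxm->offset + sizeof(len_field))
		return 0;

	err = skb_copy_bits(skb, rxm->offset, &len_field,
			    sizeof(len_field));
	if (err < 0)
		return err;	/* a negative return aborts the parser */

	/* Positive return: total message length, header included */
	return be32_to_cpu(len_field);
}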
+/* Lower socket lock held */
+static int strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb,
+		     unsigned int orig_offset, size_t orig_len)
+{
+	struct strparser *strp = (struct strparser *)desc->arg.data;
+	struct _strp_rx_msg *rxm;
+	struct sk_buff *head, *skb;
+	size_t eaten = 0, cand_len;
+	ssize_t extra;
+	int err;
+	bool cloned_orig = false;
+
+	if (strp->rx_paused)
+		return 0;
+
+	head = strp->rx_skb_head;
+	if (head) {
+		/* Message already in progress */
+
+		rxm = _strp_rx_msg(head);
+		if (unlikely(rxm->early_eaten)) {
+			/* Some bytes of the receive sock's data were already
+			 * saved in rx_skb_head; just indicate that they are
+			 * consumed.
+			 */
+			eaten = orig_len <= rxm->early_eaten ?
+				orig_len : rxm->early_eaten;
+			rxm->early_eaten -= eaten;
+
+			return eaten;
+		}
+
+		if (unlikely(orig_offset)) {
+			/* Getting data with a non-zero offset when a message is
+			 * in progress is not expected. If it does happen, we
+			 * need to clone and pull since we can't deal with
+			 * offsets in the skbs for a message except in the head.
+			 */
+			orig_skb = skb_clone(orig_skb, GFP_ATOMIC);
+			if (!orig_skb) {
+				STRP_STATS_INCR(strp->stats.rx_mem_fail);
+				desc->error = -ENOMEM;
+				return 0;
+			}
+			if (!pskb_pull(orig_skb, orig_offset)) {
+				STRP_STATS_INCR(strp->stats.rx_mem_fail);
+				kfree_skb(orig_skb);
+				desc->error = -ENOMEM;
+				return 0;
+			}
+			cloned_orig = true;
+			orig_offset = 0;
+		}
+
+		if (!strp->rx_skb_nextp) {
+			/* We are going to append to the frags_list of head.
+			 * Need to unshare the frag_list.
+			 */
+			err = skb_unclone(head, GFP_ATOMIC);
+			if (err) {
+				STRP_STATS_INCR(strp->stats.rx_mem_fail);
+				desc->error = err;
+				return 0;
+			}
+
+			if (unlikely(skb_shinfo(head)->frag_list)) {
+				/* We can't append to an sk_buff that already
+				 * has a frag_list. We create a new head, point
+				 * the frag_list of that to the old head, and
+				 * then are able to use the old head->next for
+				 * appending to the message.
+				 */
+				if (WARN_ON(head->next)) {
+					desc->error = -EINVAL;
+					return 0;
+				}
+
+				skb = alloc_skb(0, GFP_ATOMIC);
+				if (!skb) {
+					STRP_STATS_INCR(strp->stats.rx_mem_fail);
+					desc->error = -ENOMEM;
+					return 0;
+				}
+				skb->len = head->len;
+				skb->data_len = head->len;
+				skb->truesize = head->truesize;
+				*_strp_rx_msg(skb) = *_strp_rx_msg(head);
+				strp->rx_skb_nextp = &head->next;
+				skb_shinfo(skb)->frag_list = head;
+				strp->rx_skb_head = skb;
+				head = skb;
+			} else {
+				strp->rx_skb_nextp =
+					&skb_shinfo(head)->frag_list;
+			}
+		}
+	}
+
+	while (eaten < orig_len) {
+		/* Always clone since we will consume something */
+		skb = skb_clone(orig_skb, GFP_ATOMIC);
+		if (!skb) {
+			STRP_STATS_INCR(strp->stats.rx_mem_fail);
+			desc->error = -ENOMEM;
+			break;
+		}
+
+		cand_len = orig_len - eaten;
+
+		head = strp->rx_skb_head;
+		if (!head) {
+			head = skb;
+			strp->rx_skb_head = head;
+			/* Will set rx_skb_nextp on next packet if needed */
+			strp->rx_skb_nextp = NULL;
+			rxm = _strp_rx_msg(head);
+			memset(rxm, 0, sizeof(*rxm));
+			rxm->strp.offset = orig_offset + eaten;
+		} else {
+			/* Unclone since we may be appending to an skb that we
+			 * already share a frag_list with.
+			 */
+			err = skb_unclone(skb, GFP_ATOMIC);
+			if (err) {
+				STRP_STATS_INCR(strp->stats.rx_mem_fail);
+				desc->error = err;
+				break;
+			}
+
+			rxm = _strp_rx_msg(head);
+			*strp->rx_skb_nextp = skb;
+			strp->rx_skb_nextp = &skb->next;
+			head->data_len += skb->len;
+			head->len += skb->len;
+			head->truesize += skb->truesize;
+		}
+
+		if (!rxm->strp.full_len) {
+			ssize_t len;
+
+			len = (*strp->cb.parse_msg)(strp, head);
+
+			if (!len) {
+				/* Need more header to determine length */
+				if (!rxm->accum_len) {
+					/* Start RX timer for new message */
+					strp_start_rx_timer(strp);
+				}
+				rxm->accum_len += cand_len;
+				eaten += cand_len;
+				STRP_STATS_INCR(strp->stats.rx_need_more_hdr);
+				WARN_ON(eaten != orig_len);
+				break;
+			} else if (len < 0) {
+				if (len == -ESTRPIPE && rxm->accum_len) {
+					len = -ENODATA;
+					strp->rx_unrecov_intr = 1;
+				} else {
+					strp->rx_interrupted = 1;
+				}
+				strp_parser_err(strp, len, desc);
+				break;
+			} else if (len > strp->sk->sk_rcvbuf) {
+				/* Message length exceeds maximum allowed */
+				STRP_STATS_INCR(strp->stats.rx_msg_too_big);
+				strp_parser_err(strp, -EMSGSIZE, desc);
+				break;
+			} else if (len <= (ssize_t)head->len -
+					  skb->len - rxm->strp.offset) {
+				/* Length must be into new skb (and also
+				 * greater than zero)
+				 */
+				STRP_STATS_INCR(strp->stats.rx_bad_hdr_len);
+				strp_parser_err(strp, -EPROTO, desc);
+				break;
+			}
+
+			rxm->strp.full_len = len;
+		}
+
+		extra = (ssize_t)(rxm->accum_len + cand_len) -
+			rxm->strp.full_len;
+
+		if (extra < 0) {
+			/* Message not complete yet. */
+			if (rxm->strp.full_len - rxm->accum_len >
+			    strp_peek_len(strp)) {
+				/* Don't have the whole message in the socket
+				 * buffer. Set strp->rx_need_bytes to wait for
+				 * the rest of the message. Also, set "early
+				 * eaten" since we've already buffered the skb
+				 * but don't consume yet per strp_read_sock.
+				 */
+
+				if (!rxm->accum_len) {
+					/* Start RX timer for new message */
+					strp_start_rx_timer(strp);
+				}
+
+				strp->rx_need_bytes = rxm->strp.full_len -
+						      rxm->accum_len;
+				rxm->accum_len += cand_len;
+				rxm->early_eaten = cand_len;
+				STRP_STATS_ADD(strp->stats.rx_bytes, cand_len);
+				desc->count = 0; /* Stop reading socket */
+				break;
+			}
+			rxm->accum_len += cand_len;
+			eaten += cand_len;
+			WARN_ON(eaten != orig_len);
+			break;
+		}
+
+		/* Positive extra indicates more bytes than needed for the
+		 * message
+		 */
+
+		WARN_ON(extra > cand_len);
+
+		eaten += (cand_len - extra);
+
+		/* Hurray, we have a new message! */
+		del_timer(&strp->rx_msg_timer);
+		strp->rx_skb_head = NULL;
+		STRP_STATS_INCR(strp->stats.rx_msgs);
+
+		/* Give skb to upper layer */
+		strp->cb.rcv_msg(strp, head);
+
+		if (unlikely(strp->rx_paused)) {
+			/* Upper layer paused strp */
+			break;
+		}
+	}
+
+	if (cloned_orig)
+		kfree_skb(orig_skb);
+
+	STRP_STATS_ADD(strp->stats.rx_bytes, eaten);
+
+	return eaten;
+}
+
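When strp_recv() completes a message, the head skb handed to rcv_msg carries a strp_rx_msg in its cb area whose offset and full_len locate the message inside the (possibly frag_list-chained) skb, and the callee takes ownership of the skb. A sketch of a consumer under the same hypothetical example_* naming as above, again not part of the patch:

static void example_rcv_msg(struct strparser *strp, struct sk_buff *skb)
{
	struct strp_rx_msg *rxm = strp_rx_msg(skb);

	/* The message starts rxm->offset bytes into the skb and is
	 * rxm->full_len bytes long.
	 */
	pr_debug("got message: offset %d len %d\n",
		 rxm->offset, rxm->full_len);

	/* Queue for a reader or process in place, then release */
	kfree_skb(skb);
}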
+static int default_read_sock_done(struct strparser *strp, int err)
+{
+	return err;
+}
+
+/* Called with lock held on lower socket */
+static int strp_read_sock(struct strparser *strp)
+{
+	struct socket *sock = strp->sk->sk_socket;
+	read_descriptor_t desc;
+
+	desc.arg.data = strp;
+	desc.error = 0;
+	desc.count = 1; /* give more than one skb per call */
+
+	/* sk should be locked here, so okay to do read_sock */
+	sock->ops->read_sock(strp->sk, &desc, strp_recv);
+
+	desc.error = strp->cb.read_sock_done(strp, desc.error);
+
+	return desc.error;
+}
+
+/* Lower sock lock held */
+void strp_data_ready(struct strparser *strp)
+{
+	if (unlikely(strp->rx_stopped))
+		return;
+
+	/* This check is needed to synchronize with do_strp_rx_work.
+	 * do_strp_rx_work acquires a process lock (lock_sock) whereas
+	 * the lock held here is bh_lock_sock. The two locks can be
+	 * held by different threads at the same time, but bh_lock_sock
+	 * allows a thread in BH context to safely check if the process
+	 * lock is held. In this case, if the lock is held, queue work.
+	 */
+	if (sock_owned_by_user(strp->sk)) {
+		queue_work(strp_wq, &strp->rx_work);
+		return;
+	}
+
+	if (strp->rx_paused)
+		return;
+
+	if (strp->rx_need_bytes) {
+		if (strp_peek_len(strp) >= strp->rx_need_bytes)
+			strp->rx_need_bytes = 0;
+		else
+			return;
+	}
+
+	if (strp_read_sock(strp) == -ENOMEM)
+		queue_work(strp_wq, &strp->rx_work);
+}
+EXPORT_SYMBOL_GPL(strp_data_ready);
+
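strp_data_ready() is meant to be called from the lower socket's sk_data_ready callback, typically in BH context with bh_lock_sock held, while the workqueue covers the cases the softirq path cannot handle. A sketch of how an upper layer might attach a parser to a connected TCP socket, loosely following the pattern KCM uses (hypothetical example_* names building on the callbacks sketched above; detach and error paths elided):

struct example_psock {
	struct strparser strp;
	void (*saved_data_ready)(struct sock *sk);
};

static void example_data_ready(struct sock *sk)
{
	struct example_psock *psock;

	read_lock_bh(&sk->sk_callback_lock);
	psock = (struct example_psock *)sk->sk_user_data;
	if (likely(psock))
		strp_data_ready(&psock->strp);
	read_unlock_bh(&sk->sk_callback_lock);
}

static int example_attach(struct example_psock *psock, struct sock *csk)
{
	struct strp_callbacks cb = {
		.rcv_msg	= example_rcv_msg,
		.parse_msg	= example_parse_msg,
		/* read_sock_done and abort_parser default in strp_init() */
	};
	int err;

	err = strp_init(&psock->strp, csk, &cb);
	if (err)
		return err;

	write_lock_bh(&csk->sk_callback_lock);
	psock->saved_data_ready = csk->sk_data_ready;
	csk->sk_user_data = psock;
	csk->sk_data_ready = example_data_ready;
	write_unlock_bh(&csk->sk_callback_lock);

	/* Pick up anything that arrived before the hook was installed */
	strp_check_rcv(&psock->strp);
	return 0;
}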
+static void do_strp_rx_work(struct strparser *strp)
+{
+	read_descriptor_t rd_desc;
+	struct sock *csk = strp->sk;
+
+	/* We need the read lock to synchronize with strp_data_ready. We
+	 * need the socket lock for calling strp_read_sock.
+	 */
+	lock_sock(csk);
+
+	if (unlikely(strp->rx_stopped))
+		goto out;
+
+	if (strp->rx_paused)
+		goto out;
+
+	rd_desc.arg.data = strp;
+
+	if (strp_read_sock(strp) == -ENOMEM)
+		queue_work(strp_wq, &strp->rx_work);
+
+out:
+	release_sock(csk);
+}
+
+static void strp_rx_work(struct work_struct *w)
+{
+	do_strp_rx_work(container_of(w, struct strparser, rx_work));
+}
+
+static void strp_rx_msg_timeout(unsigned long arg)
+{
+	struct strparser *strp = (struct strparser *)arg;
+
+	/* Message assembly timed out */
+	STRP_STATS_INCR(strp->stats.rx_msg_timeouts);
+	lock_sock(strp->sk);
+	strp->cb.abort_parser(strp, ETIMEDOUT);
+	release_sock(strp->sk);
+}
+
+int strp_init(struct strparser *strp, struct sock *csk,
+	      struct strp_callbacks *cb)
+{
+	struct socket *sock = csk->sk_socket;
+
+	if (!cb || !cb->rcv_msg || !cb->parse_msg)
+		return -EINVAL;
+
+	if (!sock->ops->read_sock || !sock->ops->peek_len)
+		return -EAFNOSUPPORT;
+
+	memset(strp, 0, sizeof(*strp));
+
+	strp->sk = csk;
+
+	setup_timer(&strp->rx_msg_timer, strp_rx_msg_timeout,
+		    (unsigned long)strp);
+
+	INIT_WORK(&strp->rx_work, strp_rx_work);
+
+	strp->cb.rcv_msg = cb->rcv_msg;
+	strp->cb.parse_msg = cb->parse_msg;
+	strp->cb.read_sock_done = cb->read_sock_done ? : default_read_sock_done;
+	strp->cb.abort_parser = cb->abort_parser ? : strp_abort_rx_strp;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(strp_init);
+
+void strp_unpause(struct strparser *strp)
+{
+	strp->rx_paused = 0;
+
+	/* Sync setting rx_paused with RX work */
+	smp_mb();
+
+	queue_work(strp_wq, &strp->rx_work);
+}
+EXPORT_SYMBOL_GPL(strp_unpause);
+
+/* strp must already be stopped so that strp_recv will no longer be called.
+ * Note that strp_done is not called with the lower socket held.
+ */
+void strp_done(struct strparser *strp)
+{
+	WARN_ON(!strp->rx_stopped);
+
+	del_timer_sync(&strp->rx_msg_timer);
+	cancel_work_sync(&strp->rx_work);
+
+	if (strp->rx_skb_head) {
+		kfree_skb(strp->rx_skb_head);
+		strp->rx_skb_head = NULL;
+	}
+}
+EXPORT_SYMBOL_GPL(strp_done);
+
+void strp_stop(struct strparser *strp)
+{
+	strp->rx_stopped = 1;
+}
+EXPORT_SYMBOL_GPL(strp_stop);
+
+void strp_check_rcv(struct strparser *strp)
+{
+	queue_work(strp_wq, &strp->rx_work);
+}
+EXPORT_SYMBOL_GPL(strp_check_rcv);
+
+static int __init strp_mod_init(void)
+{
+	strp_wq = create_singlethread_workqueue("kstrp");
+	if (unlikely(!strp_wq))
+		return -ENOMEM;
+
+	return 0;
+}
+
+static void __exit strp_mod_exit(void)
+{
+}
+module_init(strp_mod_init);
+module_exit(strp_mod_exit);
+MODULE_LICENSE("GPL");
diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index a5fc9dd24aa9..1031a0327fff 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -1292,12 +1292,10 @@ bool switchdev_port_same_parent_id(struct net_device *a, struct switchdev_attr a_attr = { .orig_dev = a, .id = SWITCHDEV_ATTR_ID_PORT_PARENT_ID, - .flags = SWITCHDEV_F_NO_RECURSE, }; struct switchdev_attr b_attr = { .orig_dev = b, .id = SWITCHDEV_ATTR_ID_PORT_PARENT_ID, - .flags = SWITCHDEV_F_NO_RECURSE, }; if (switchdev_port_attr_get(a, &a_attr) || @@ -1306,89 +1304,4 @@ bool switchdev_port_same_parent_id(struct net_device *a, return netdev_phys_item_id_same(&a_attr.u.ppid, &b_attr.u.ppid); } - -static u32 switchdev_port_fwd_mark_get(struct net_device *dev, - struct net_device *group_dev) -{ - struct net_device *lower_dev; - struct list_head *iter; - - netdev_for_each_lower_dev(group_dev, lower_dev, iter) { - if (lower_dev == dev) - continue; - if (switchdev_port_same_parent_id(dev, lower_dev)) - return lower_dev->offload_fwd_mark; - return
switchdev_port_fwd_mark_get(dev, lower_dev); - } - - return dev->ifindex; -} EXPORT_SYMBOL_GPL(switchdev_port_same_parent_id); - -static void switchdev_port_fwd_mark_reset(struct net_device *group_dev, - u32 old_mark, u32 *reset_mark) -{ - struct net_device *lower_dev; - struct list_head *iter; - - netdev_for_each_lower_dev(group_dev, lower_dev, iter) { - if (lower_dev->offload_fwd_mark == old_mark) { - if (!*reset_mark) - *reset_mark = lower_dev->ifindex; - lower_dev->offload_fwd_mark = *reset_mark; - } - switchdev_port_fwd_mark_reset(lower_dev, old_mark, reset_mark); - } -} - -/** - * switchdev_port_fwd_mark_set - Set port offload forwarding mark - * - * @dev: port device - * @group_dev: containing device - * @joining: true if dev is joining group; false if leaving group - * - * An ungrouped port's offload mark is just its ifindex. A grouped - * port's (member of a bridge, for example) offload mark is the ifindex - * of one of the ports in the group with the same parent (switch) ID. - * Ports on the same device in the same group will have the same mark. - * - * Example: - * - * br0 ifindex=9 - * sw1p1 ifindex=2 mark=2 - * sw1p2 ifindex=3 mark=2 - * sw2p1 ifindex=4 mark=5 - * sw2p2 ifindex=5 mark=5 - * - * If sw2p2 leaves the bridge, we'll have: - * - * br0 ifindex=9 - * sw1p1 ifindex=2 mark=2 - * sw1p2 ifindex=3 mark=2 - * sw2p1 ifindex=4 mark=4 - * sw2p2 ifindex=5 mark=5 - */ -void switchdev_port_fwd_mark_set(struct net_device *dev, - struct net_device *group_dev, - bool joining) -{ - u32 mark = dev->ifindex; - u32 reset_mark = 0; - - if (group_dev) { - ASSERT_RTNL(); - if (joining) - mark = switchdev_port_fwd_mark_get(dev, group_dev); - else if (dev->offload_fwd_mark == mark) - /* Ohoh, this port was the mark reference port, - * but it's leaving the group, so reset the - * mark for the remaining ports in the group. - */ - switchdev_port_fwd_mark_reset(group_dev, mark, - &reset_mark); - } - - dev->offload_fwd_mark = mark; -} -EXPORT_SYMBOL_GPL(switchdev_port_fwd_mark_set); diff --git a/net/sysctl_net.c b/net/sysctl_net.c index 46a71c701e7c..5bc1a3d57401 100644 --- a/net/sysctl_net.c +++ b/net/sysctl_net.c @@ -42,26 +42,37 @@ static int net_ctl_permissions(struct ctl_table_header *head, struct ctl_table *table) { struct net *net = container_of(head->set, struct net, sysctls); - kuid_t root_uid = make_kuid(net->user_ns, 0); - kgid_t root_gid = make_kgid(net->user_ns, 0); /* Allow network administrator to have same access as root. 
*/ - if (ns_capable_noaudit(net->user_ns, CAP_NET_ADMIN) || - uid_eq(root_uid, current_euid())) { + if (ns_capable(net->user_ns, CAP_NET_ADMIN)) { int mode = (table->mode >> 6) & 7; return (mode << 6) | (mode << 3) | mode; } - /* Allow netns root group to have the same access as the root group */ - if (in_egroup_p(root_gid)) { - int mode = (table->mode >> 3) & 7; - return (mode << 3) | mode; - } + return table->mode; } +static void net_ctl_set_ownership(struct ctl_table_header *head, + struct ctl_table *table, + kuid_t *uid, kgid_t *gid) +{ + struct net *net = container_of(head->set, struct net, sysctls); + kuid_t ns_root_uid; + kgid_t ns_root_gid; + + ns_root_uid = make_kuid(net->user_ns, 0); + if (uid_valid(ns_root_uid)) + *uid = ns_root_uid; + + ns_root_gid = make_kgid(net->user_ns, 0); + if (gid_valid(ns_root_gid)) + *gid = ns_root_gid; +} + static struct ctl_table_root net_sysctl_root = { .lookup = net_ctl_header_lookup, .permissions = net_ctl_permissions, + .set_ownership = net_ctl_set_ownership, }; static int __net_init sysctl_net_init(struct net *net) diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index 65b1bbf133bd..975dbeb60ab0 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -42,6 +42,7 @@ #include "monitor.h" #include "bcast.h" #include "netlink.h" +#include "udp_media.h" #define MAX_ADDR_STR 60 @@ -56,6 +57,13 @@ static struct tipc_media * const media_info_array[] = { NULL }; +static struct tipc_bearer *bearer_get(struct net *net, int bearer_id) +{ + struct tipc_net *tn = tipc_net(net); + + return rcu_dereference_rtnl(tn->bearer_list[bearer_id]); +} + static void bearer_disable(struct net *net, struct tipc_bearer *b); /** @@ -323,6 +331,7 @@ restart: b->domain = disc_domain; b->net_plane = bearer_id + 'A'; b->priority = priority; + test_and_set_bit_lock(0, &b->up); res = tipc_disc_create(net, b, &b->bcast_addr, &skb); if (res) { @@ -360,15 +369,24 @@ static int tipc_reset_bearer(struct net *net, struct tipc_bearer *b) */ void tipc_bearer_reset_all(struct net *net) { - struct tipc_net *tn = tipc_net(net); struct tipc_bearer *b; int i; for (i = 0; i < MAX_BEARERS; i++) { - b = rcu_dereference_rtnl(tn->bearer_list[i]); + b = bearer_get(net, i); + if (b) + clear_bit_unlock(0, &b->up); + } + for (i = 0; i < MAX_BEARERS; i++) { + b = bearer_get(net, i); if (b) tipc_reset_bearer(net, b); } + for (i = 0; i < MAX_BEARERS; i++) { + b = bearer_get(net, i); + if (b) + test_and_set_bit_lock(0, &b->up); + } } /** @@ -382,8 +400,9 @@ static void bearer_disable(struct net *net, struct tipc_bearer *b) int bearer_id = b->identity; pr_info("Disabling bearer <%s>\n", b->name); - b->media->disable_media(b); + clear_bit_unlock(0, &b->up); tipc_node_delete_links(net, bearer_id); + b->media->disable_media(b); RCU_INIT_POINTER(b->media_ptr, NULL); if (b->link_req) tipc_disc_delete(b->link_req); @@ -440,22 +459,16 @@ int tipc_l2_send_msg(struct net *net, struct sk_buff *skb, { struct net_device *dev; int delta; - void *tipc_ptr; dev = (struct net_device *)rcu_dereference_rtnl(b->media_ptr); if (!dev) return 0; - /* Send RESET message even if bearer is detached from device */ - tipc_ptr = rcu_dereference_rtnl(dev->tipc_ptr); - if (unlikely(!tipc_ptr && !msg_is_reset(buf_msg(skb)))) - goto drop; - - delta = dev->hard_header_len - skb_headroom(skb); - if ((delta > 0) && - pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) - goto drop; - + delta = SKB_DATA_ALIGN(dev->hard_header_len - skb_headroom(skb)); + if ((delta > 0) && pskb_expand_head(skb, delta, 0, GFP_ATOMIC)) { + 
kfree_skb(skb); + return 0; + } skb_reset_network_header(skb); skb->dev = dev; skb->protocol = htons(ETH_P_TIPC); @@ -463,9 +476,6 @@ int tipc_l2_send_msg(struct net *net, struct sk_buff *skb, dev->dev_addr, skb->len); dev_queue_xmit(skb); return 0; -drop: - kfree_skb(skb); - return 0; } int tipc_bearer_mtu(struct net *net, u32 bearer_id) @@ -487,12 +497,12 @@ void tipc_bearer_xmit_skb(struct net *net, u32 bearer_id, struct sk_buff *skb, struct tipc_media_addr *dest) { - struct tipc_net *tn = tipc_net(net); + struct tipc_msg *hdr = buf_msg(skb); struct tipc_bearer *b; rcu_read_lock(); - b = rcu_dereference_rtnl(tn->bearer_list[bearer_id]); - if (likely(b)) + b = bearer_get(net, bearer_id); + if (likely(b && (test_bit(0, &b->up) || msg_is_reset(hdr)))) b->media->send_msg(net, skb, b, dest); else kfree_skb(skb); @@ -505,7 +515,6 @@ void tipc_bearer_xmit(struct net *net, u32 bearer_id, struct sk_buff_head *xmitq, struct tipc_media_addr *dst) { - struct tipc_net *tn = net_generic(net, tipc_net_id); struct tipc_bearer *b; struct sk_buff *skb, *tmp; @@ -513,12 +522,15 @@ void tipc_bearer_xmit(struct net *net, u32 bearer_id, return; rcu_read_lock(); - b = rcu_dereference_rtnl(tn->bearer_list[bearer_id]); + b = bearer_get(net, bearer_id); if (unlikely(!b)) __skb_queue_purge(xmitq); skb_queue_walk_safe(xmitq, skb, tmp) { __skb_dequeue(xmitq); - b->media->send_msg(net, skb, b, dst); + if (likely(test_bit(0, &b->up) || msg_is_reset(buf_msg(skb)))) + b->media->send_msg(net, skb, b, dst); + else + kfree_skb(skb); } rcu_read_unlock(); } @@ -535,8 +547,8 @@ void tipc_bearer_bc_xmit(struct net *net, u32 bearer_id, struct tipc_msg *hdr; rcu_read_lock(); - b = rcu_dereference_rtnl(tn->bearer_list[bearer_id]); - if (unlikely(!b)) + b = bearer_get(net, bearer_id); + if (unlikely(!b || !test_bit(0, &b->up))) __skb_queue_purge(xmitq); skb_queue_walk_safe(xmitq, skb, tmp) { hdr = buf_msg(skb); @@ -566,7 +578,8 @@ static int tipc_l2_rcv_msg(struct sk_buff *skb, struct net_device *dev, rcu_read_lock(); b = rcu_dereference_rtnl(dev->tipc_ptr); - if (likely(b && (skb->pkt_type <= PACKET_BROADCAST))) { + if (likely(b && test_bit(0, &b->up) && + (skb->pkt_type <= PACKET_BROADCAST))) { skb->next = NULL; tipc_rcv(dev_net(dev), skb, b); rcu_read_unlock(); @@ -591,18 +604,9 @@ static int tipc_l2_device_event(struct notifier_block *nb, unsigned long evt, { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); - struct tipc_net *tn = tipc_net(net); struct tipc_bearer *b; - int i; b = rtnl_dereference(dev->tipc_ptr); - if (!b) { - for (i = 0; i < MAX_BEARERS; b = NULL, i++) { - b = rtnl_dereference(tn->bearer_list[i]); - if (b && (b->media_ptr == dev)) - break; - } - } if (!b) return NOTIFY_DONE; @@ -613,11 +617,10 @@ static int tipc_l2_device_event(struct notifier_block *nb, unsigned long evt, if (netif_carrier_ok(dev)) break; case NETDEV_UP: - rcu_assign_pointer(dev->tipc_ptr, b); + test_and_set_bit_lock(0, &b->up); break; case NETDEV_GOING_DOWN: - RCU_INIT_POINTER(dev->tipc_ptr, NULL); - synchronize_net(); + clear_bit_unlock(0, &b->up); tipc_reset_bearer(net, b); break; case NETDEV_CHANGEMTU: @@ -709,6 +712,14 @@ static int __tipc_nl_add_bearer(struct tipc_nl_msg *msg, goto prop_msg_full; nla_nest_end(msg->skb, prop); + +#ifdef CONFIG_TIPC_MEDIA_UDP + if (bearer->media->type_id == TIPC_MEDIA_TYPE_UDP) { + if (tipc_udp_nl_add_bearer_data(msg, bearer)) + goto attr_msg_full; + } +#endif + nla_nest_end(msg->skb, attrs); genlmsg_end(msg->skb, hdr); @@ -895,6 +906,49 @@ int 
tipc_nl_bearer_enable(struct sk_buff *skb, struct genl_info *info) return 0; } +int tipc_nl_bearer_add(struct sk_buff *skb, struct genl_info *info) +{ + int err; + char *name; + struct tipc_bearer *b; + struct nlattr *attrs[TIPC_NLA_BEARER_MAX + 1]; + struct net *net = sock_net(skb->sk); + + if (!info->attrs[TIPC_NLA_BEARER]) + return -EINVAL; + + err = nla_parse_nested(attrs, TIPC_NLA_BEARER_MAX, + info->attrs[TIPC_NLA_BEARER], + tipc_nl_bearer_policy); + if (err) + return err; + + if (!attrs[TIPC_NLA_BEARER_NAME]) + return -EINVAL; + name = nla_data(attrs[TIPC_NLA_BEARER_NAME]); + + rtnl_lock(); + b = tipc_bearer_find(net, name); + if (!b) { + rtnl_unlock(); + return -EINVAL; + } + +#ifdef CONFIG_TIPC_MEDIA_UDP + if (attrs[TIPC_NLA_BEARER_UDP_OPTS]) { + err = tipc_udp_nl_bearer_add(b, + attrs[TIPC_NLA_BEARER_UDP_OPTS]); + if (err) { + rtnl_unlock(); + return err; + } + } +#endif + rtnl_unlock(); + + return 0; +} + int tipc_nl_bearer_set(struct sk_buff *skb, struct genl_info *info) { int err; diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h index 43757f1f9cb3..78892e2f53e3 100644 --- a/net/tipc/bearer.h +++ b/net/tipc/bearer.h @@ -150,6 +150,7 @@ struct tipc_bearer { u32 identity; struct tipc_link_req *link_req; char net_plane; + unsigned long up; }; struct tipc_bearer_names { @@ -180,6 +181,7 @@ int tipc_nl_bearer_enable(struct sk_buff *skb, struct genl_info *info); int tipc_nl_bearer_dump(struct sk_buff *skb, struct netlink_callback *cb); int tipc_nl_bearer_get(struct sk_buff *skb, struct genl_info *info); int tipc_nl_bearer_set(struct sk_buff *skb, struct genl_info *info); +int tipc_nl_bearer_add(struct sk_buff *skb, struct genl_info *info); int tipc_nl_media_dump(struct sk_buff *skb, struct netlink_callback *cb); int tipc_nl_media_get(struct sk_buff *skb, struct genl_info *info); diff --git a/net/tipc/link.c b/net/tipc/link.c index 877d94f34814..2c6e1b9e024b 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -807,7 +807,7 @@ void link_prepare_wakeup(struct tipc_link *l) skb_queue_walk_safe(&l->wakeupq, skb, tmp) { imp = TIPC_SKB_CB(skb)->chain_imp; - lim = l->window + l->backlog[imp].limit; + lim = l->backlog[imp].limit; pnd[imp] += TIPC_SKB_CB(skb)->chain_sz; if ((pnd[imp] + l->backlog[imp].len) >= lim) break; @@ -873,9 +873,11 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, struct sk_buff *skb, *_skb, *bskb; /* Match msg importance against this and all higher backlog limits: */ - for (i = imp; i <= TIPC_SYSTEM_IMPORTANCE; i++) { - if (unlikely(l->backlog[i].len >= l->backlog[i].limit)) - return link_schedule_user(l, list); + if (!skb_queue_empty(backlogq)) { + for (i = imp; i <= TIPC_SYSTEM_IMPORTANCE; i++) { + if (unlikely(l->backlog[i].len >= l->backlog[i].limit)) + return link_schedule_user(l, list); + } } if (unlikely(msg_size(hdr) > mtu)) { skb_queue_purge(list); @@ -1692,10 +1694,10 @@ void tipc_link_set_queue_limits(struct tipc_link *l, u32 win) int max_bulk = TIPC_MAX_PUBLICATIONS / (l->mtu / ITEM_SIZE); l->window = win; - l->backlog[TIPC_LOW_IMPORTANCE].limit = win / 2; - l->backlog[TIPC_MEDIUM_IMPORTANCE].limit = win; - l->backlog[TIPC_HIGH_IMPORTANCE].limit = win / 2 * 3; - l->backlog[TIPC_CRITICAL_IMPORTANCE].limit = win * 2; + l->backlog[TIPC_LOW_IMPORTANCE].limit = max_t(u16, 50, win); + l->backlog[TIPC_MEDIUM_IMPORTANCE].limit = max_t(u16, 100, win * 2); + l->backlog[TIPC_HIGH_IMPORTANCE].limit = max_t(u16, 150, win * 3); + l->backlog[TIPC_CRITICAL_IMPORTANCE].limit = max_t(u16, 200, win * 4); l->backlog[TIPC_SYSTEM_IMPORTANCE].limit = 
max_bulk; } diff --git a/net/tipc/net.h b/net/tipc/net.h index 77a7a118911d..c7c254902873 100644 --- a/net/tipc/net.h +++ b/net/tipc/net.h @@ -39,6 +39,8 @@ #include <net/genetlink.h> +extern const struct nla_policy tipc_nl_net_policy[]; + int tipc_net_start(struct net *net, u32 addr); void tipc_net_stop(struct net *net); diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c index a84daec0afe9..3200059d14b2 100644 --- a/net/tipc/netlink.c +++ b/net/tipc/netlink.c @@ -41,6 +41,7 @@ #include "link.h" #include "node.h" #include "net.h" +#include "udp_media.h" #include <net/genetlink.h> static const struct nla_policy tipc_nl_policy[TIPC_NLA_MAX + 1] = { @@ -161,6 +162,11 @@ static const struct genl_ops tipc_genl_v2_ops[] = { .policy = tipc_nl_policy, }, { + .cmd = TIPC_NL_BEARER_ADD, + .doit = tipc_nl_bearer_add, + .policy = tipc_nl_policy, + }, + { .cmd = TIPC_NL_BEARER_SET, .doit = tipc_nl_bearer_set, .policy = tipc_nl_policy, @@ -238,6 +244,18 @@ static const struct genl_ops tipc_genl_v2_ops[] = { .dumpit = tipc_nl_node_dump_monitor_peer, .policy = tipc_nl_policy, }, + { + .cmd = TIPC_NL_PEER_REMOVE, + .doit = tipc_nl_peer_rm, + .policy = tipc_nl_policy, + }, +#ifdef CONFIG_TIPC_MEDIA_UDP + { + .cmd = TIPC_NL_UDP_GET_REMOTEIP, + .dumpit = tipc_udp_nl_dump_remoteip, + .policy = tipc_nl_policy, + }, +#endif }; int tipc_nlmsg_parse(const struct nlmsghdr *nlh, struct nlattr ***attr) diff --git a/net/tipc/node.c b/net/tipc/node.c index 21974191e425..7e8b75fd1a02 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -1553,6 +1553,69 @@ discard: kfree_skb(skb); } +int tipc_nl_peer_rm(struct sk_buff *skb, struct genl_info *info) +{ + struct net *net = sock_net(skb->sk); + struct tipc_net *tn = net_generic(net, tipc_net_id); + struct nlattr *attrs[TIPC_NLA_NET_MAX + 1]; + struct tipc_node *peer; + u32 addr; + int err; + int i; + + /* We identify the peer by its net */ + if (!info->attrs[TIPC_NLA_NET]) + return -EINVAL; + + err = nla_parse_nested(attrs, TIPC_NLA_NET_MAX, + info->attrs[TIPC_NLA_NET], + tipc_nl_net_policy); + if (err) + return err; + + if (!attrs[TIPC_NLA_NET_ADDR]) + return -EINVAL; + + addr = nla_get_u32(attrs[TIPC_NLA_NET_ADDR]); + + if (in_own_node(net, addr)) + return -ENOTSUPP; + + spin_lock_bh(&tn->node_list_lock); + peer = tipc_node_find(net, addr); + if (!peer) { + spin_unlock_bh(&tn->node_list_lock); + return -ENXIO; + } + + tipc_node_write_lock(peer); + if (peer->state != SELF_DOWN_PEER_DOWN && + peer->state != SELF_DOWN_PEER_LEAVING) { + tipc_node_write_unlock(peer); + err = -EBUSY; + goto err_out; + } + + for (i = 0; i < MAX_BEARERS; i++) { + struct tipc_link_entry *le = &peer->links[i]; + + if (le->link) { + kfree(le->link); + le->link = NULL; + peer->link_cnt--; + } + } + tipc_node_write_unlock(peer); + tipc_node_delete(peer); + + err = 0; +err_out: + tipc_node_put(peer); + spin_unlock_bh(&tn->node_list_lock); + + return err; +} + int tipc_nl_node_dump(struct sk_buff *skb, struct netlink_callback *cb) { int err; diff --git a/net/tipc/node.h b/net/tipc/node.h index d69fdfcc0ec9..4578b34c7dca 100644 --- a/net/tipc/node.h +++ b/net/tipc/node.h @@ -77,6 +77,7 @@ int tipc_nl_node_dump_link(struct sk_buff *skb, struct netlink_callback *cb); int tipc_nl_node_reset_link_stats(struct sk_buff *skb, struct genl_info *info); int tipc_nl_node_get_link(struct sk_buff *skb, struct genl_info *info); int tipc_nl_node_set_link(struct sk_buff *skb, struct genl_info *info); +int tipc_nl_peer_rm(struct sk_buff *skb, struct genl_info *info); int tipc_nl_node_set_monitor(struct sk_buff *skb, 
struct genl_info *info); int tipc_nl_node_get_monitor(struct sk_buff *skb, struct genl_info *info); diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index ae7e14cae085..dd274687a53d 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -49,6 +49,7 @@ #include "core.h" #include "bearer.h" #include "netlink.h" +#include "msg.h" /* IANA assigned UDP port */ #define UDP_PORT_DEFAULT 6118 @@ -70,6 +71,13 @@ struct udp_media_addr { }; }; +/* struct udp_replicast - container for UDP remote addresses */ +struct udp_replicast { + struct udp_media_addr addr; + struct rcu_head rcu; + struct list_head list; +}; + /** * struct udp_bearer - ip/udp bearer data structure * @bearer: associated generic tipc bearer @@ -82,8 +90,20 @@ struct udp_bearer { struct socket *ubsock; u32 ifindex; struct work_struct work; + struct udp_replicast rcast; }; +static int tipc_udp_is_mcast_addr(struct udp_media_addr *addr) +{ + if (ntohs(addr->proto) == ETH_P_IP) + return ipv4_is_multicast(addr->ipv4.s_addr); +#if IS_ENABLED(CONFIG_IPV6) + else + return ipv6_addr_is_multicast(&addr->ipv6); +#endif + return 0; +} + /* udp_media_addr_set - convert a ip/udp address to a TIPC media address */ static void tipc_udp_media_addr_set(struct tipc_media_addr *addr, struct udp_media_addr *ua) @@ -91,15 +111,9 @@ static void tipc_udp_media_addr_set(struct tipc_media_addr *addr, memset(addr, 0, sizeof(struct tipc_media_addr)); addr->media_id = TIPC_MEDIA_TYPE_UDP; memcpy(addr->value, ua, sizeof(struct udp_media_addr)); - if (ntohs(ua->proto) == ETH_P_IP) { - if (ipv4_is_multicast(ua->ipv4.s_addr)) - addr->broadcast = 1; - } else if (ntohs(ua->proto) == ETH_P_IPV6) { - if (ipv6_addr_type(&ua->ipv6) & IPV6_ADDR_MULTICAST) - addr->broadcast = 1; - } else { - pr_err("Invalid UDP media address\n"); - } + + if (tipc_udp_is_mcast_addr(ua)) + addr->broadcast = 1; } /* tipc_udp_addr2str - convert ip/udp address to string */ @@ -140,28 +154,13 @@ static int tipc_udp_addr2msg(char *msg, struct tipc_media_addr *a) } /* tipc_send_msg - enqueue a send request */ -static int tipc_udp_send_msg(struct net *net, struct sk_buff *skb, - struct tipc_bearer *b, - struct tipc_media_addr *dest) +static int tipc_udp_xmit(struct net *net, struct sk_buff *skb, + struct udp_bearer *ub, struct udp_media_addr *src, + struct udp_media_addr *dst) { int ttl, err = 0; - struct udp_bearer *ub; - struct udp_media_addr *dst = (struct udp_media_addr *)&dest->value; - struct udp_media_addr *src = (struct udp_media_addr *)&b->addr.value; struct rtable *rt; - if (skb_headroom(skb) < UDP_MIN_HEADROOM) { - err = pskb_expand_head(skb, UDP_MIN_HEADROOM, 0, GFP_ATOMIC); - if (err) - goto tx_error; - } - - skb_set_inner_protocol(skb, htons(ETH_P_TIPC)); - ub = rcu_dereference_rtnl(b->media_ptr); - if (!ub) { - err = -ENODEV; - goto tx_error; - } if (dst->proto == htons(ETH_P_IP)) { struct flowi4 fl = { .daddr = dst->ipv4.s_addr, @@ -207,29 +206,178 @@ tx_error: return err; } +static int tipc_udp_send_msg(struct net *net, struct sk_buff *skb, + struct tipc_bearer *b, + struct tipc_media_addr *addr) +{ + struct udp_media_addr *src = (struct udp_media_addr *)&b->addr.value; + struct udp_media_addr *dst = (struct udp_media_addr *)&addr->value; + struct udp_replicast *rcast; + struct udp_bearer *ub; + int err = 0; + + if (skb_headroom(skb) < UDP_MIN_HEADROOM) { + err = pskb_expand_head(skb, UDP_MIN_HEADROOM, 0, GFP_ATOMIC); + if (err) + goto out; + } + + skb_set_inner_protocol(skb, htons(ETH_P_TIPC)); + ub = rcu_dereference_rtnl(b->media_ptr); + if (!ub) { + err = 
-ENODEV; + goto out; + } + + if (!addr->broadcast || list_empty(&ub->rcast.list)) + return tipc_udp_xmit(net, skb, ub, src, dst); + + /* Replicast, send an skb to each configured IP address */ + list_for_each_entry_rcu(rcast, &ub->rcast.list, list) { + struct sk_buff *_skb; + + _skb = pskb_copy(skb, GFP_ATOMIC); + if (!_skb) { + err = -ENOMEM; + goto out; + } + + err = tipc_udp_xmit(net, _skb, ub, src, &rcast->addr); + if (err) { + kfree_skb(_skb); + goto out; + } + } + err = 0; +out: + kfree_skb(skb); + return err; +} + +static bool tipc_udp_is_known_peer(struct tipc_bearer *b, + struct udp_media_addr *addr) +{ + struct udp_replicast *rcast, *tmp; + struct udp_bearer *ub; + + ub = rcu_dereference_rtnl(b->media_ptr); + if (!ub) { + pr_err_ratelimited("UDP bearer instance not found\n"); + return false; + } + + list_for_each_entry_safe(rcast, tmp, &ub->rcast.list, list) { + if (!memcmp(&rcast->addr, addr, sizeof(struct udp_media_addr))) + return true; + } + + return false; +} + +static int tipc_udp_rcast_add(struct tipc_bearer *b, + struct udp_media_addr *addr) +{ + struct udp_replicast *rcast; + struct udp_bearer *ub; + + ub = rcu_dereference_rtnl(b->media_ptr); + if (!ub) + return -ENODEV; + + rcast = kmalloc(sizeof(*rcast), GFP_ATOMIC); + if (!rcast) + return -ENOMEM; + + memcpy(&rcast->addr, addr, sizeof(struct udp_media_addr)); + + if (ntohs(addr->proto) == ETH_P_IP) + pr_info("New replicast peer: %pI4\n", &rcast->addr.ipv4); +#if IS_ENABLED(CONFIG_IPV6) + else if (ntohs(addr->proto) == ETH_P_IPV6) + pr_info("New replicast peer: %pI6\n", &rcast->addr.ipv6); +#endif + + list_add_rcu(&rcast->list, &ub->rcast.list); + return 0; +} + +static int tipc_udp_rcast_disc(struct tipc_bearer *b, struct sk_buff *skb) +{ + struct udp_media_addr src = {0}; + struct udp_media_addr *dst; + + dst = (struct udp_media_addr *)&b->bcast_addr.value; + if (tipc_udp_is_mcast_addr(dst)) + return 0; + + src.port = udp_hdr(skb)->source; + + if (ip_hdr(skb)->version == 4) { + struct iphdr *iphdr = ip_hdr(skb); + + src.proto = htons(ETH_P_IP); + src.ipv4.s_addr = iphdr->saddr; + if (ipv4_is_multicast(iphdr->daddr)) + return 0; +#if IS_ENABLED(CONFIG_IPV6) + } else if (ip_hdr(skb)->version == 6) { + struct ipv6hdr *iphdr = ipv6_hdr(skb); + + src.proto = htons(ETH_P_IPV6); + src.ipv6 = iphdr->saddr; + if (ipv6_addr_is_multicast(&iphdr->daddr)) + return 0; +#endif + } else { + return 0; + } + + if (likely(tipc_udp_is_known_peer(b, &src))) + return 0; + + return tipc_udp_rcast_add(b, &src); +} + /* tipc_udp_recv - read data from bearer socket */ static int tipc_udp_recv(struct sock *sk, struct sk_buff *skb) { struct udp_bearer *ub; struct tipc_bearer *b; + struct tipc_msg *hdr; + int err; ub = rcu_dereference_sk_user_data(sk); if (!ub) { pr_err_ratelimited("Failed to get UDP bearer reference"); - kfree_skb(skb); - return 0; + goto out; } - skb_pull(skb, sizeof(struct udphdr)); + hdr = buf_msg(skb); + rcu_read_lock(); b = rcu_dereference_rtnl(ub->bearer); + if (!b) + goto rcu_out; - if (b) { + if (b && test_bit(0, &b->up)) { tipc_rcv(sock_net(sk), skb, b); rcu_read_unlock(); return 0; } + + if (unlikely(msg_user(hdr) == LINK_CONFIG)) { + err = tipc_udp_rcast_disc(b, skb); + if (err) + goto rcu_out; + } + + tipc_rcv(sock_net(sk), skb, b); rcu_read_unlock(); + return 0; + +rcu_out: + rcu_read_unlock(); +out: kfree_skb(skb); return 0; } @@ -241,15 +389,11 @@ static int enable_mcast(struct udp_bearer *ub, struct udp_media_addr *remote) struct sock *sk = ub->ubsock->sk; if (ntohs(remote->proto) == ETH_P_IP) { - if 
(!ipv4_is_multicast(remote->ipv4.s_addr)) - return 0; mreqn.imr_multiaddr = remote->ipv4; mreqn.imr_ifindex = ub->ifindex; err = ip_mc_join_group(sk, &mreqn); #if IS_ENABLED(CONFIG_IPV6) } else { - if (!ipv6_addr_is_multicast(&remote->ipv6)) - return 0; err = ipv6_stub->ipv6_sock_mc_join(sk, ub->ifindex, &remote->ipv6); #endif @@ -257,75 +401,234 @@ static int enable_mcast(struct udp_bearer *ub, struct udp_media_addr *remote) return err; } -/** - * parse_options - build local/remote addresses from configuration - * @attrs: netlink config data - * @ub: UDP bearer instance - * @local: local bearer IP address/port - * @remote: peer or multicast IP/port - */ -static int parse_options(struct nlattr *attrs[], struct udp_bearer *ub, - struct udp_media_addr *local, - struct udp_media_addr *remote) +static int __tipc_nl_add_udp_addr(struct sk_buff *skb, + struct udp_media_addr *addr, int nla_t) { - struct nlattr *opts[TIPC_NLA_UDP_MAX + 1]; - struct sockaddr_storage sa_local, sa_remote; + if (ntohs(addr->proto) == ETH_P_IP) { + struct sockaddr_in ip4; - if (!attrs[TIPC_NLA_BEARER_UDP_OPTS]) - goto err; - if (nla_parse_nested(opts, TIPC_NLA_UDP_MAX, - attrs[TIPC_NLA_BEARER_UDP_OPTS], - tipc_nl_udp_policy)) - goto err; - if (opts[TIPC_NLA_UDP_LOCAL] && opts[TIPC_NLA_UDP_REMOTE]) { - nla_memcpy(&sa_local, opts[TIPC_NLA_UDP_LOCAL], - sizeof(sa_local)); - nla_memcpy(&sa_remote, opts[TIPC_NLA_UDP_REMOTE], - sizeof(sa_remote)); + ip4.sin_family = AF_INET; + ip4.sin_port = addr->port; + ip4.sin_addr.s_addr = addr->ipv4.s_addr; + if (nla_put(skb, nla_t, sizeof(ip4), &ip4)) + return -EMSGSIZE; + +#if IS_ENABLED(CONFIG_IPV6) + } else if (ntohs(addr->proto) == ETH_P_IPV6) { + struct sockaddr_in6 ip6; + + ip6.sin6_family = AF_INET6; + ip6.sin6_port = addr->port; + memcpy(&ip6.sin6_addr, &addr->ipv6, sizeof(struct in6_addr)); + if (nla_put(skb, nla_t, sizeof(ip6), &ip6)) + return -EMSGSIZE; +#endif + } + + return 0; +} + +int tipc_udp_nl_dump_remoteip(struct sk_buff *skb, struct netlink_callback *cb) +{ + u32 bid = cb->args[0]; + u32 skip_cnt = cb->args[1]; + u32 portid = NETLINK_CB(cb->skb).portid; + struct udp_replicast *rcast, *tmp; + struct tipc_bearer *b; + struct udp_bearer *ub; + void *hdr; + int err; + int i; + + if (!bid && !skip_cnt) { + struct net *net = sock_net(skb->sk); + struct nlattr *battrs[TIPC_NLA_BEARER_MAX + 1]; + struct nlattr **attrs; + char *bname; + + err = tipc_nlmsg_parse(cb->nlh, &attrs); + if (err) + return err; + + if (!attrs[TIPC_NLA_BEARER]) + return -EINVAL; + + err = nla_parse_nested(battrs, TIPC_NLA_BEARER_MAX, + attrs[TIPC_NLA_BEARER], + tipc_nl_bearer_policy); + if (err) + return err; + + if (!battrs[TIPC_NLA_BEARER_NAME]) + return -EINVAL; + + bname = nla_data(battrs[TIPC_NLA_BEARER_NAME]); + + rtnl_lock(); + b = tipc_bearer_find(net, bname); + if (!b) { + rtnl_unlock(); + return -EINVAL; + } + bid = b->identity; } else { -err: - pr_err("Invalid UDP bearer configuration"); + struct net *net = sock_net(skb->sk); + struct tipc_net *tn = net_generic(net, tipc_net_id); + + rtnl_lock(); + b = rtnl_dereference(tn->bearer_list[bid]); + if (!b) { + rtnl_unlock(); + return -EINVAL; + } + } + + ub = rcu_dereference_rtnl(b->media_ptr); + if (!ub) { + rtnl_unlock(); return -EINVAL; } - if ((sa_local.ss_family & sa_remote.ss_family) == AF_INET) { - struct sockaddr_in *ip4; - - ip4 = (struct sockaddr_in *)&sa_local; - local->proto = htons(ETH_P_IP); - local->port = ip4->sin_port; - local->ipv4.s_addr = ip4->sin_addr.s_addr; - - ip4 = (struct sockaddr_in *)&sa_remote; - remote->proto = 
htons(ETH_P_IP); - remote->port = ip4->sin_port; - remote->ipv4.s_addr = ip4->sin_addr.s_addr; + + i = 0; + list_for_each_entry_safe(rcast, tmp, &ub->rcast.list, list) { + if (i < skip_cnt) + goto count; + + hdr = genlmsg_put(skb, portid, cb->nlh->nlmsg_seq, + &tipc_genl_family, NLM_F_MULTI, + TIPC_NL_BEARER_GET); + if (!hdr) + goto done; + + err = __tipc_nl_add_udp_addr(skb, &rcast->addr, + TIPC_NLA_UDP_REMOTE); + if (err) { + genlmsg_cancel(skb, hdr); + goto done; + } + genlmsg_end(skb, hdr); +count: + i++; + } +done: + rtnl_unlock(); + cb->args[0] = bid; + cb->args[1] = i; + + return skb->len; +} + +int tipc_udp_nl_add_bearer_data(struct tipc_nl_msg *msg, struct tipc_bearer *b) +{ + struct udp_media_addr *src = (struct udp_media_addr *)&b->addr.value; + struct udp_media_addr *dst; + struct udp_bearer *ub; + struct nlattr *nest; + + ub = rcu_dereference_rtnl(b->media_ptr); + if (!ub) + return -ENODEV; + + nest = nla_nest_start(msg->skb, TIPC_NLA_BEARER_UDP_OPTS); + if (!nest) + goto msg_full; + + if (__tipc_nl_add_udp_addr(msg->skb, src, TIPC_NLA_UDP_LOCAL)) + goto msg_full; + + dst = (struct udp_media_addr *)&b->bcast_addr.value; + if (__tipc_nl_add_udp_addr(msg->skb, dst, TIPC_NLA_UDP_REMOTE)) + goto msg_full; + + if (!list_empty(&ub->rcast.list)) { + if (nla_put_flag(msg->skb, TIPC_NLA_UDP_MULTI_REMOTEIP)) + goto msg_full; + } + + nla_nest_end(msg->skb, nest); + return 0; +msg_full: + nla_nest_cancel(msg->skb, nest); + return -EMSGSIZE; +} + +/** + * tipc_parse_udp_addr - build udp media address from netlink data + * @nlattr: netlink attribute containing sockaddr storage aligned address + * @addr: tipc media address to fill with address, port and protocol type + * @scope_id: IPv6 scope id pointer, not NULL indicates it's required + */ + +static int tipc_parse_udp_addr(struct nlattr *nla, struct udp_media_addr *addr, + u32 *scope_id) +{ + struct sockaddr_storage sa; + + nla_memcpy(&sa, nla, sizeof(sa)); + if (sa.ss_family == AF_INET) { + struct sockaddr_in *ip4 = (struct sockaddr_in *)&sa; + + addr->proto = htons(ETH_P_IP); + addr->port = ip4->sin_port; + addr->ipv4.s_addr = ip4->sin_addr.s_addr; return 0; #if IS_ENABLED(CONFIG_IPV6) - } else if ((sa_local.ss_family & sa_remote.ss_family) == AF_INET6) { - int atype; - struct sockaddr_in6 *ip6; + } else if (sa.ss_family == AF_INET6) { + struct sockaddr_in6 *ip6 = (struct sockaddr_in6 *)&sa; - ip6 = (struct sockaddr_in6 *)&sa_local; - atype = ipv6_addr_type(&ip6->sin6_addr); - if (__ipv6_addr_needs_scope_id(atype) && !ip6->sin6_scope_id) - return -EINVAL; + addr->proto = htons(ETH_P_IPV6); + addr->port = ip6->sin6_port; + memcpy(&addr->ipv6, &ip6->sin6_addr, sizeof(struct in6_addr)); + + /* Scope ID is only interesting for local addresses */ + if (scope_id) { + int atype; - local->proto = htons(ETH_P_IPV6); - local->port = ip6->sin6_port; - memcpy(&local->ipv6, &ip6->sin6_addr, sizeof(struct in6_addr)); - ub->ifindex = ip6->sin6_scope_id; + atype = ipv6_addr_type(&ip6->sin6_addr); + if (__ipv6_addr_needs_scope_id(atype) && + !ip6->sin6_scope_id) { + return -EINVAL; + } + + *scope_id = ip6->sin6_scope_id ? 
: 0; + } - ip6 = (struct sockaddr_in6 *)&sa_remote; - remote->proto = htons(ETH_P_IPV6); - remote->port = ip6->sin6_port; - memcpy(&remote->ipv6, &ip6->sin6_addr, sizeof(struct in6_addr)); return 0; #endif } return -EADDRNOTAVAIL; } +int tipc_udp_nl_bearer_add(struct tipc_bearer *b, struct nlattr *attr) +{ + int err; + struct udp_media_addr addr = {0}; + struct nlattr *opts[TIPC_NLA_UDP_MAX + 1]; + struct udp_media_addr *dst; + + if (nla_parse_nested(opts, TIPC_NLA_UDP_MAX, attr, tipc_nl_udp_policy)) + return -EINVAL; + + if (!opts[TIPC_NLA_UDP_REMOTE]) + return -EINVAL; + + err = tipc_parse_udp_addr(opts[TIPC_NLA_UDP_REMOTE], &addr, NULL); + if (err) + return err; + + dst = (struct udp_media_addr *)&b->bcast_addr.value; + if (tipc_udp_is_mcast_addr(dst)) { + pr_err("Can't add remote ip to TIPC UDP multicast bearer\n"); + return -EINVAL; + } + + if (tipc_udp_is_known_peer(b, &addr)) + return 0; + + return tipc_udp_rcast_add(b, &addr); +} + /** * tipc_udp_enable - callback to create a new udp bearer instance * @net: network namespace @@ -340,18 +643,37 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b, { int err = -EINVAL; struct udp_bearer *ub; - struct udp_media_addr *remote; + struct udp_media_addr remote = {0}; struct udp_media_addr local = {0}; struct udp_port_cfg udp_conf = {0}; struct udp_tunnel_sock_cfg tuncfg = {NULL}; + struct nlattr *opts[TIPC_NLA_UDP_MAX + 1]; ub = kzalloc(sizeof(*ub), GFP_ATOMIC); if (!ub) return -ENOMEM; - remote = (struct udp_media_addr *)&b->bcast_addr.value; - memset(remote, 0, sizeof(struct udp_media_addr)); - err = parse_options(attrs, ub, &local, remote); + INIT_LIST_HEAD(&ub->rcast.list); + + if (!attrs[TIPC_NLA_BEARER_UDP_OPTS]) + goto err; + + if (nla_parse_nested(opts, TIPC_NLA_UDP_MAX, + attrs[TIPC_NLA_BEARER_UDP_OPTS], + tipc_nl_udp_policy)) + goto err; + + if (!opts[TIPC_NLA_UDP_LOCAL] || !opts[TIPC_NLA_UDP_REMOTE]) { + pr_err("Invalid UDP bearer configuration"); + return -EINVAL; + } + + err = tipc_parse_udp_addr(opts[TIPC_NLA_UDP_LOCAL], &local, + &ub->ifindex); + if (err) + goto err; + + err = tipc_parse_udp_addr(opts[TIPC_NLA_UDP_REMOTE], &remote, NULL); if (err) goto err; @@ -396,9 +718,18 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b, tuncfg.encap_destroy = NULL; setup_udp_tunnel_sock(net, ub->ubsock, &tuncfg); - err = enable_mcast(ub, remote); + /** + * The bcast media address port is used for all peers and the ip + * is used if it's a multicast address. + */ + memcpy(&b->bcast_addr.value, &remote, sizeof(remote)); + if (tipc_udp_is_mcast_addr(&remote)) + err = enable_mcast(ub, &remote); + else + err = tipc_udp_rcast_add(b, &remote); if (err) goto err; + return 0; err: if (ub->ubsock) @@ -411,6 +742,12 @@ err: static void cleanup_bearer(struct work_struct *work) { struct udp_bearer *ub = container_of(work, struct udp_bearer, work); + struct udp_replicast *rcast, *tmp; + + list_for_each_entry_safe(rcast, tmp, &ub->rcast.list, list) { + list_del_rcu(&rcast->list); + kfree_rcu(rcast, rcu); + } if (ub->ubsock) udp_tunnel_sock_release(ub->ubsock); diff --git a/net/tipc/udp_media.h b/net/tipc/udp_media.h new file mode 100644 index 000000000000..281bbae87726 --- /dev/null +++ b/net/tipc/udp_media.h @@ -0,0 +1,46 @@ +/* + * net/tipc/udp_media.h: Include file for UDP bearer media + * + * Copyright (c) 1996-2006, 2013-2016, Ericsson AB + * Copyright (c) 2005, 2010-2011, Wind River Systems + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifdef CONFIG_TIPC_MEDIA_UDP +#ifndef _TIPC_UDP_MEDIA_H +#define _TIPC_UDP_MEDIA_H + +int tipc_udp_nl_bearer_add(struct tipc_bearer *b, struct nlattr *attr); +int tipc_udp_nl_add_bearer_data(struct tipc_nl_msg *msg, struct tipc_bearer *b); +int tipc_udp_nl_dump_remoteip(struct sk_buff *skb, struct netlink_callback *cb); + +#endif +#endif diff --git a/net/wireless/core.c b/net/wireless/core.c index 7645e97362c0..2029b49a1df3 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -906,6 +906,8 @@ void cfg80211_unregister_wdev(struct wireless_dev *wdev) if (WARN_ON(wdev->netdev)) return; + nl80211_notify_iface(rdev, wdev, NL80211_CMD_DEL_INTERFACE); + list_del_rcu(&wdev->list); rdev->devlist_generation++; @@ -1079,6 +1081,8 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb, wdev->iftype == NL80211_IFTYPE_P2P_CLIENT || wdev->iftype == NL80211_IFTYPE_ADHOC) && !wdev->use_4addr) dev->priv_flags |= IFF_DONT_BRIDGE; + + nl80211_notify_iface(rdev, wdev, NL80211_CMD_NEW_INTERFACE); break; case NETDEV_GOING_DOWN: cfg80211_leave(rdev, wdev); @@ -1157,6 +1161,8 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb, * remove and clean it up. 
*/ if (!list_empty(&wdev->list)) { + nl80211_notify_iface(rdev, wdev, + NL80211_CMD_DEL_INTERFACE); sysfs_remove_link(&dev->dev.kobj, "phy80211"); list_del_rcu(&wdev->list); rdev->devlist_generation++; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index f02653a08993..499785778983 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -2751,7 +2751,7 @@ static int nl80211_new_interface(struct sk_buff *skb, struct genl_info *info) struct cfg80211_registered_device *rdev = info->user_ptr[0]; struct vif_params params; struct wireless_dev *wdev; - struct sk_buff *msg, *event; + struct sk_buff *msg; int err; enum nl80211_iftype type = NL80211_IFTYPE_UNSPECIFIED; u32 flags; @@ -2855,20 +2855,15 @@ static int nl80211_new_interface(struct sk_buff *skb, struct genl_info *info) return -ENOBUFS; } - event = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (event) { - if (nl80211_send_iface(event, 0, 0, 0, - rdev, wdev, false) < 0) { - nlmsg_free(event); - goto out; - } - - genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), - event, 0, NL80211_MCGRP_CONFIG, - GFP_KERNEL); - } + /* + * For wdevs which have no associated netdev object (e.g. of type + * NL80211_IFTYPE_P2P_DEVICE), emit the NEW_INTERFACE event here. + * For all other types, the event will be generated from the + * netdev notifier + */ + if (!wdev->netdev) + nl80211_notify_iface(rdev, wdev, NL80211_CMD_NEW_INTERFACE); -out: return genlmsg_reply(msg, info); } @@ -2876,18 +2871,10 @@ static int nl80211_del_interface(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev = info->user_ptr[0]; struct wireless_dev *wdev = info->user_ptr[1]; - struct sk_buff *msg; - int status; if (!rdev->ops->del_virtual_intf) return -EOPNOTSUPP; - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (msg && nl80211_send_iface(msg, 0, 0, 0, rdev, wdev, true) < 0) { - nlmsg_free(msg); - msg = NULL; - } - /* * If we remove a wireless device without a netdev then clear * user_ptr[1] so that nl80211_post_doit won't dereference it @@ -2898,15 +2885,7 @@ static int nl80211_del_interface(struct sk_buff *skb, struct genl_info *info) if (!wdev->netdev) info->user_ptr[1] = NULL; - status = rdev_del_virtual_intf(rdev, wdev); - if (status >= 0 && msg) - genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), - msg, 0, NL80211_MCGRP_CONFIG, - GFP_KERNEL); - else - nlmsg_free(msg); - - return status; + return rdev_del_virtual_intf(rdev, wdev); } static int nl80211_set_noack_map(struct sk_buff *skb, struct genl_info *info) @@ -5374,6 +5353,18 @@ static int nl80211_check_s32(const struct nlattr *nla, s32 min, s32 max, s32 *ou return 0; } +static int nl80211_check_power_mode(const struct nlattr *nla, + enum nl80211_mesh_power_mode min, + enum nl80211_mesh_power_mode max, + enum nl80211_mesh_power_mode *out) +{ + u32 val = nla_get_u32(nla); + if (val < min || val > max) + return -EINVAL; + *out = val; + return 0; +} + static int nl80211_parse_mesh_config(struct genl_info *info, struct mesh_config *cfg, u32 *mask_out) @@ -5518,7 +5509,7 @@ do { \ NL80211_MESH_POWER_ACTIVE, NL80211_MESH_POWER_MAX, mask, NL80211_MESHCONF_POWER_MODE, - nl80211_check_u32); + nl80211_check_power_mode); FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshAwakeWindowDuration, 0, 65535, mask, NL80211_MESHCONF_AWAKE_WINDOW, nl80211_check_u16); @@ -7773,12 +7764,13 @@ static int nl80211_join_ibss(struct sk_buff *skb, struct genl_info *info) ibss.beacon_interval = 100; - if (info->attrs[NL80211_ATTR_BEACON_INTERVAL]) { + if 
(info->attrs[NL80211_ATTR_BEACON_INTERVAL]) ibss.beacon_interval = nla_get_u32(info->attrs[NL80211_ATTR_BEACON_INTERVAL]); - if (ibss.beacon_interval < 1 || ibss.beacon_interval > 10000) - return -EINVAL; - } + + err = cfg80211_validate_beacon_int(rdev, ibss.beacon_interval); + if (err) + return err; if (!rdev->ops->join_ibss) return -EOPNOTSUPP; @@ -9252,9 +9244,10 @@ static int nl80211_join_mesh(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NL80211_ATTR_BEACON_INTERVAL]) { setup.beacon_interval = nla_get_u32(info->attrs[NL80211_ATTR_BEACON_INTERVAL]); - if (setup.beacon_interval < 10 || - setup.beacon_interval > 10000) - return -EINVAL; + + err = cfg80211_validate_beacon_int(rdev, setup.beacon_interval); + if (err) + return err; } if (info->attrs[NL80211_ATTR_DTIM_PERIOD]) { @@ -11847,6 +11840,29 @@ void nl80211_notify_wiphy(struct cfg80211_registered_device *rdev, NL80211_MCGRP_CONFIG, GFP_KERNEL); } +void nl80211_notify_iface(struct cfg80211_registered_device *rdev, + struct wireless_dev *wdev, + enum nl80211_commands cmd) +{ + struct sk_buff *msg; + + WARN_ON(cmd != NL80211_CMD_NEW_INTERFACE && + cmd != NL80211_CMD_DEL_INTERFACE); + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return; + + if (nl80211_send_iface(msg, 0, 0, 0, rdev, wdev, + cmd == NL80211_CMD_DEL_INTERFACE) < 0) { + nlmsg_free(msg); + return; + } + + genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0, + NL80211_MCGRP_CONFIG, GFP_KERNEL); +} + static int nl80211_add_scan_req(struct sk_buff *msg, struct cfg80211_registered_device *rdev) { diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h index a63f402b10b7..7e3821d7fcc5 100644 --- a/net/wireless/nl80211.h +++ b/net/wireless/nl80211.h @@ -7,6 +7,9 @@ int nl80211_init(void); void nl80211_exit(void); void nl80211_notify_wiphy(struct cfg80211_registered_device *rdev, enum nl80211_commands cmd); +void nl80211_notify_iface(struct cfg80211_registered_device *rdev, + struct wireless_dev *wdev, + enum nl80211_commands cmd); void nl80211_send_scan_start(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev); struct sk_buff *nl80211_build_scan_msg(struct cfg80211_registered_device *rdev, diff --git a/net/wireless/util.c b/net/wireless/util.c index b7d1592bd5b8..0675f513e7b9 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -1559,7 +1559,7 @@ int cfg80211_validate_beacon_int(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev; int res = 0; - if (!beacon_int) + if (beacon_int < 10 || beacon_int > 10000) return -EINVAL; list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) { |
