From 61792b677415b77c8db04991c22966bb8de7603e Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 23 Oct 2018 16:47:16 +0200 Subject: netfilter: ipv6: fix oops when defragmenting locally generated fragments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Unlike ipv4 and normal ipv6 defrag, netfilter ipv6 defragmentation did not save/restore skb->dst. This causes oops when handling locally generated ipv6 fragments, as output path needs a valid dst. Reported-by: Maciej Żenczykowski Fixes: 84379c9afe01 ("netfilter: ipv6: nf_defrag: drop skb dst before queueing") Signed-off-by: Florian Westphal Reviewed-by: Eric Dumazet Signed-off-by: Pablo Neira Ayuso --- net/ipv6/netfilter/nf_conntrack_reasm.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index b8ac369f98ad..d219979c3e52 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -587,11 +587,16 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user) */ ret = -EINPROGRESS; if (fq->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && - fq->q.meat == fq->q.len && - nf_ct_frag6_reasm(fq, skb, dev)) - ret = 0; - else + fq->q.meat == fq->q.len) { + unsigned long orefdst = skb->_skb_refdst; + + skb->_skb_refdst = 0UL; + if (nf_ct_frag6_reasm(fq, skb, dev)) + ret = 0; + skb->_skb_refdst = orefdst; + } else { skb_dst_drop(skb); + } out_unlock: spin_unlock_bh(&fq->q.lock); -- cgit From 5e91c9d9cd3fd557226ca75fed58816b9eee7e07 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 22 Oct 2018 21:49:45 +0200 Subject: netfilter: nft_osf: check if attribute is present If the attribute is not sent, eg. old libnftnl binary, then tb[NFTA_OSF_TTL] is NULL and kernel crashes from the _init path. Fixes: a218dc82f0b5 ("netfilter: nft_osf: Add ttl option support") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_osf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nft_osf.c b/net/netfilter/nft_osf.c index ca5e5d8c5ef8..b13618c764ec 100644 --- a/net/netfilter/nft_osf.c +++ b/net/netfilter/nft_osf.c @@ -50,7 +50,7 @@ static int nft_osf_init(const struct nft_ctx *ctx, int err; u8 ttl; - if (nla_get_u8(tb[NFTA_OSF_TTL])) { + if (tb[NFTA_OSF_TTL]) { ttl = nla_get_u8(tb[NFTA_OSF_TTL]); if (ttl > 2) return -EINVAL; -- cgit From 4269fea768a11a447d8de620ce420f2214d4685c Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 26 Oct 2018 11:14:28 +0200 Subject: Revert "netfilter: nft_numgen: add map lookups for numgen random operations" Laura found a better way to do this from userspace without requiring kernel infrastructure, revert this. Fixes: 978d8f9055c3 ("netfilter: nft_numgen: add map lookups for numgen random operations") Signed-off-by: Laura Garcia Liebana Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_numgen.c | 127 --------------------------------------------- 1 file changed, 127 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_numgen.c b/net/netfilter/nft_numgen.c index 649d1700ec5b..3cc1b3dc3c3c 100644 --- a/net/netfilter/nft_numgen.c +++ b/net/netfilter/nft_numgen.c @@ -24,7 +24,6 @@ struct nft_ng_inc { u32 modulus; atomic_t counter; u32 offset; - struct nft_set *map; }; static u32 nft_ng_inc_gen(struct nft_ng_inc *priv) @@ -48,34 +47,11 @@ static void nft_ng_inc_eval(const struct nft_expr *expr, regs->data[priv->dreg] = nft_ng_inc_gen(priv); } -static void nft_ng_inc_map_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) -{ - struct nft_ng_inc *priv = nft_expr_priv(expr); - const struct nft_set *map = priv->map; - const struct nft_set_ext *ext; - u32 result; - bool found; - - result = nft_ng_inc_gen(priv); - found = map->ops->lookup(nft_net(pkt), map, &result, &ext); - - if (!found) - return; - - nft_data_copy(®s->data[priv->dreg], - nft_set_ext_data(ext), map->dlen); -} - static const struct nla_policy nft_ng_policy[NFTA_NG_MAX + 1] = { [NFTA_NG_DREG] = { .type = NLA_U32 }, [NFTA_NG_MODULUS] = { .type = NLA_U32 }, [NFTA_NG_TYPE] = { .type = NLA_U32 }, [NFTA_NG_OFFSET] = { .type = NLA_U32 }, - [NFTA_NG_SET_NAME] = { .type = NLA_STRING, - .len = NFT_SET_MAXNAMELEN - 1 }, - [NFTA_NG_SET_ID] = { .type = NLA_U32 }, }; static int nft_ng_inc_init(const struct nft_ctx *ctx, @@ -101,22 +77,6 @@ static int nft_ng_inc_init(const struct nft_ctx *ctx, NFT_DATA_VALUE, sizeof(u32)); } -static int nft_ng_inc_map_init(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nlattr * const tb[]) -{ - struct nft_ng_inc *priv = nft_expr_priv(expr); - u8 genmask = nft_genmask_next(ctx->net); - - nft_ng_inc_init(ctx, expr, tb); - - priv->map = nft_set_lookup_global(ctx->net, ctx->table, - tb[NFTA_NG_SET_NAME], - tb[NFTA_NG_SET_ID], genmask); - - return PTR_ERR_OR_ZERO(priv->map); -} - static int nft_ng_dump(struct sk_buff *skb, enum nft_registers dreg, u32 modulus, enum nft_ng_types type, u32 offset) { @@ -143,27 +103,10 @@ static int nft_ng_inc_dump(struct sk_buff *skb, const struct nft_expr *expr) priv->offset); } -static int nft_ng_inc_map_dump(struct sk_buff *skb, - const struct nft_expr *expr) -{ - const struct nft_ng_inc *priv = nft_expr_priv(expr); - - if (nft_ng_dump(skb, priv->dreg, priv->modulus, - NFT_NG_INCREMENTAL, priv->offset) || - nla_put_string(skb, NFTA_NG_SET_NAME, priv->map->name)) - goto nla_put_failure; - - return 0; - -nla_put_failure: - return -1; -} - struct nft_ng_random { enum nft_registers dreg:8; u32 modulus; u32 offset; - struct nft_set *map; }; static u32 nft_ng_random_gen(struct nft_ng_random *priv) @@ -183,25 +126,6 @@ static void nft_ng_random_eval(const struct nft_expr *expr, regs->data[priv->dreg] = nft_ng_random_gen(priv); } -static void nft_ng_random_map_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) -{ - struct nft_ng_random *priv = nft_expr_priv(expr); - const struct nft_set *map = priv->map; - const struct nft_set_ext *ext; - u32 result; - bool found; - - result = nft_ng_random_gen(priv); - found = map->ops->lookup(nft_net(pkt), map, &result, &ext); - if (!found) - return; - - nft_data_copy(®s->data[priv->dreg], - nft_set_ext_data(ext), map->dlen); -} - static int nft_ng_random_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) @@ -226,21 +150,6 @@ static int nft_ng_random_init(const struct nft_ctx *ctx, NFT_DATA_VALUE, sizeof(u32)); } -static int nft_ng_random_map_init(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nlattr * const tb[]) -{ - struct nft_ng_random *priv = nft_expr_priv(expr); - u8 genmask = nft_genmask_next(ctx->net); - - nft_ng_random_init(ctx, expr, tb); - priv->map = nft_set_lookup_global(ctx->net, ctx->table, - tb[NFTA_NG_SET_NAME], - tb[NFTA_NG_SET_ID], genmask); - - return PTR_ERR_OR_ZERO(priv->map); -} - static int nft_ng_random_dump(struct sk_buff *skb, const struct nft_expr *expr) { const struct nft_ng_random *priv = nft_expr_priv(expr); @@ -249,22 +158,6 @@ static int nft_ng_random_dump(struct sk_buff *skb, const struct nft_expr *expr) priv->offset); } -static int nft_ng_random_map_dump(struct sk_buff *skb, - const struct nft_expr *expr) -{ - const struct nft_ng_random *priv = nft_expr_priv(expr); - - if (nft_ng_dump(skb, priv->dreg, priv->modulus, - NFT_NG_RANDOM, priv->offset) || - nla_put_string(skb, NFTA_NG_SET_NAME, priv->map->name)) - goto nla_put_failure; - - return 0; - -nla_put_failure: - return -1; -} - static struct nft_expr_type nft_ng_type; static const struct nft_expr_ops nft_ng_inc_ops = { .type = &nft_ng_type, @@ -274,14 +167,6 @@ static const struct nft_expr_ops nft_ng_inc_ops = { .dump = nft_ng_inc_dump, }; -static const struct nft_expr_ops nft_ng_inc_map_ops = { - .type = &nft_ng_type, - .size = NFT_EXPR_SIZE(sizeof(struct nft_ng_inc)), - .eval = nft_ng_inc_map_eval, - .init = nft_ng_inc_map_init, - .dump = nft_ng_inc_map_dump, -}; - static const struct nft_expr_ops nft_ng_random_ops = { .type = &nft_ng_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_ng_random)), @@ -290,14 +175,6 @@ static const struct nft_expr_ops nft_ng_random_ops = { .dump = nft_ng_random_dump, }; -static const struct nft_expr_ops nft_ng_random_map_ops = { - .type = &nft_ng_type, - .size = NFT_EXPR_SIZE(sizeof(struct nft_ng_random)), - .eval = nft_ng_random_map_eval, - .init = nft_ng_random_map_init, - .dump = nft_ng_random_map_dump, -}; - static const struct nft_expr_ops * nft_ng_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { @@ -312,12 +189,8 @@ nft_ng_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) switch (type) { case NFT_NG_INCREMENTAL: - if (tb[NFTA_NG_SET_NAME]) - return &nft_ng_inc_map_ops; return &nft_ng_inc_ops; case NFT_NG_RANDOM: - if (tb[NFTA_NG_SET_NAME]) - return &nft_ng_random_map_ops; return &nft_ng_random_ops; } -- cgit From 439cd39ea136d2c026805264d58a91f36b6b64ca Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Sat, 14 Jul 2018 21:59:43 +0200 Subject: netfilter: ipset: list:set: Decrease refcount synchronously on deletion and replace Commit 45040978c899 ("netfilter: ipset: Fix set:list type crash when flush/dump set in parallel") postponed decreasing set reference counters to the RCU callback. An 'ipset del' command can terminate before the RCU grace period is elapsed, and if sets are listed before then, the reference counter shown in userspace will be wrong: # ipset create h hash:ip; ipset create l list:set; ipset add l # ipset del l h; ipset list h Name: h Type: hash:ip Revision: 4 Header: family inet hashsize 1024 maxelem 65536 Size in memory: 88 References: 1 Number of entries: 0 Members: # sleep 1; ipset list h Name: h Type: hash:ip Revision: 4 Header: family inet hashsize 1024 maxelem 65536 Size in memory: 88 References: 0 Number of entries: 0 Members: Fix this by making the reference count update synchronous again. As a result, when sets are listed, ip_set_name_byindex() might now fetch a set whose reference count is already zero. Instead of relying on the reference count to protect against concurrent set renaming, grab ip_set_ref_lock as reader and copy the name, while holding the same lock in ip_set_rename() as writer instead. Reported-by: Li Shuang Fixes: 45040978c899 ("netfilter: ipset: Fix set:list type crash when flush/dump set in parallel") Signed-off-by: Stefano Brivio Signed-off-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipset/ip_set_core.c | 23 +++++++++++------------ net/netfilter/ipset/ip_set_list_set.c | 17 +++++++++++------ 2 files changed, 22 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index bc4bd247bb7d..fa15a831aeee 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -693,21 +693,20 @@ ip_set_put_byindex(struct net *net, ip_set_id_t index) EXPORT_SYMBOL_GPL(ip_set_put_byindex); /* Get the name of a set behind a set index. - * We assume the set is referenced, so it does exist and - * can't be destroyed. The set cannot be renamed due to - * the referencing either. - * + * Set itself is protected by RCU, but its name isn't: to protect against + * renaming, grab ip_set_ref_lock as reader (see ip_set_rename()) and copy the + * name. */ -const char * -ip_set_name_byindex(struct net *net, ip_set_id_t index) +void +ip_set_name_byindex(struct net *net, ip_set_id_t index, char *name) { - const struct ip_set *set = ip_set_rcu_get(net, index); + struct ip_set *set = ip_set_rcu_get(net, index); BUG_ON(!set); - BUG_ON(set->ref == 0); - /* Referenced, so it's safe */ - return set->name; + read_lock_bh(&ip_set_ref_lock); + strncpy(name, set->name, IPSET_MAXNAMELEN); + read_unlock_bh(&ip_set_ref_lock); } EXPORT_SYMBOL_GPL(ip_set_name_byindex); @@ -1153,7 +1152,7 @@ static int ip_set_rename(struct net *net, struct sock *ctnl, if (!set) return -ENOENT; - read_lock_bh(&ip_set_ref_lock); + write_lock_bh(&ip_set_ref_lock); if (set->ref != 0) { ret = -IPSET_ERR_REFERENCED; goto out; @@ -1170,7 +1169,7 @@ static int ip_set_rename(struct net *net, struct sock *ctnl, strncpy(set->name, name2, IPSET_MAXNAMELEN); out: - read_unlock_bh(&ip_set_ref_lock); + write_unlock_bh(&ip_set_ref_lock); return ret; } diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c index 072a658fde04..4eef55da0878 100644 --- a/net/netfilter/ipset/ip_set_list_set.c +++ b/net/netfilter/ipset/ip_set_list_set.c @@ -148,9 +148,7 @@ __list_set_del_rcu(struct rcu_head * rcu) { struct set_elem *e = container_of(rcu, struct set_elem, rcu); struct ip_set *set = e->set; - struct list_set *map = set->data; - ip_set_put_byindex(map->net, e->id); ip_set_ext_destroy(set, e); kfree(e); } @@ -158,15 +156,21 @@ __list_set_del_rcu(struct rcu_head * rcu) static inline void list_set_del(struct ip_set *set, struct set_elem *e) { + struct list_set *map = set->data; + set->elements--; list_del_rcu(&e->list); + ip_set_put_byindex(map->net, e->id); call_rcu(&e->rcu, __list_set_del_rcu); } static inline void -list_set_replace(struct set_elem *e, struct set_elem *old) +list_set_replace(struct ip_set *set, struct set_elem *e, struct set_elem *old) { + struct list_set *map = set->data; + list_replace_rcu(&old->list, &e->list); + ip_set_put_byindex(map->net, old->id); call_rcu(&old->rcu, __list_set_del_rcu); } @@ -298,7 +302,7 @@ list_set_uadd(struct ip_set *set, void *value, const struct ip_set_ext *ext, INIT_LIST_HEAD(&e->list); list_set_init_extensions(set, ext, e); if (n) - list_set_replace(e, n); + list_set_replace(set, e, n); else if (next) list_add_tail_rcu(&e->list, &next->list); else if (prev) @@ -486,6 +490,7 @@ list_set_list(const struct ip_set *set, const struct list_set *map = set->data; struct nlattr *atd, *nested; u32 i = 0, first = cb->args[IPSET_CB_ARG0]; + char name[IPSET_MAXNAMELEN]; struct set_elem *e; int ret = 0; @@ -504,8 +509,8 @@ list_set_list(const struct ip_set *set, nested = ipset_nest_start(skb, IPSET_ATTR_DATA); if (!nested) goto nla_put_failure; - if (nla_put_string(skb, IPSET_ATTR_NAME, - ip_set_name_byindex(map->net, e->id))) + ip_set_name_byindex(map->net, e->id, name); + if (nla_put_string(skb, IPSET_ATTR_NAME, name)) goto nla_put_failure; if (ip_set_put_extensions(skb, set, e, true)) goto nla_put_failure; -- cgit From 886503f34d63e681662057448819edb5b1057a97 Mon Sep 17 00:00:00 2001 From: Eric Westbrook Date: Tue, 28 Aug 2018 15:14:42 -0600 Subject: netfilter: ipset: actually allow allowable CIDR 0 in hash:net,port,net Allow /0 as advertised for hash:net,port,net sets. For "hash:net,port,net", ipset(8) says that "either subnet is permitted to be a /0 should you wish to match port between all destinations." Make that statement true. Before: # ipset create cidrzero hash:net,port,net # ipset add cidrzero 0.0.0.0/0,12345,0.0.0.0/0 ipset v6.34: The value of the CIDR parameter of the IP address is invalid # ipset create cidrzero6 hash:net,port,net family inet6 # ipset add cidrzero6 ::/0,12345,::/0 ipset v6.34: The value of the CIDR parameter of the IP address is invalid After: # ipset create cidrzero hash:net,port,net # ipset add cidrzero 0.0.0.0/0,12345,0.0.0.0/0 # ipset test cidrzero 192.168.205.129,12345,172.16.205.129 192.168.205.129,tcp:12345,172.16.205.129 is in set cidrzero. # ipset create cidrzero6 hash:net,port,net family inet6 # ipset add cidrzero6 ::/0,12345,::/0 # ipset test cidrzero6 fe80::1,12345,ff00::1 fe80::1,tcp:12345,ff00::1 is in set cidrzero6. See also: https://bugzilla.kernel.org/show_bug.cgi?id=200897 https://github.com/ewestbrook/linux/commit/df7ff6efb0934ab6acc11f003ff1a7580d6c1d9c Signed-off-by: Eric Westbrook Signed-off-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipset/ip_set_hash_netportnet.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c index d391485a6acd..613e18e720a4 100644 --- a/net/netfilter/ipset/ip_set_hash_netportnet.c +++ b/net/netfilter/ipset/ip_set_hash_netportnet.c @@ -213,13 +213,13 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_CIDR]) { e.cidr[0] = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (!e.cidr[0] || e.cidr[0] > HOST_MASK) + if (e.cidr[0] > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; } if (tb[IPSET_ATTR_CIDR2]) { e.cidr[1] = nla_get_u8(tb[IPSET_ATTR_CIDR2]); - if (!e.cidr[1] || e.cidr[1] > HOST_MASK) + if (e.cidr[1] > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; } @@ -493,13 +493,13 @@ hash_netportnet6_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_CIDR]) { e.cidr[0] = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (!e.cidr[0] || e.cidr[0] > HOST_MASK) + if (e.cidr[0] > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; } if (tb[IPSET_ATTR_CIDR2]) { e.cidr[1] = nla_get_u8(tb[IPSET_ATTR_CIDR2]); - if (!e.cidr[1] || e.cidr[1] > HOST_MASK) + if (e.cidr[1] > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; } -- cgit From ed956f3947a01ff9875cd908d7c1ef1fe7f47bf0 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Mon, 22 Oct 2018 23:30:40 +0200 Subject: netfilter: ipset: fix ip_set_list allocation failure ip_set_create() and ip_set_net_init() attempt to allocate physically contiguous memory for ip_set_list. If memory is fragmented, the allocations could easily fail: vzctl: page allocation failure: order:7, mode:0xc0d0 Call Trace: dump_stack+0x19/0x1b warn_alloc_failed+0x110/0x180 __alloc_pages_nodemask+0x7bf/0xc60 alloc_pages_current+0x98/0x110 kmalloc_order+0x18/0x40 kmalloc_order_trace+0x26/0xa0 __kmalloc+0x279/0x290 ip_set_net_init+0x4b/0x90 [ip_set] ops_init+0x3b/0xb0 setup_net+0xbb/0x170 copy_net_ns+0xf1/0x1c0 create_new_namespaces+0xf9/0x180 copy_namespaces+0x8e/0xd0 copy_process+0xb61/0x1a00 do_fork+0x91/0x320 Use kvcalloc() to fallback to 0-order allocations if high order page isn't available. Signed-off-by: Andrey Ryabinin Signed-off-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipset/ip_set_core.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index fa15a831aeee..68db946df151 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -960,7 +960,7 @@ static int ip_set_create(struct net *net, struct sock *ctnl, /* Wraparound */ goto cleanup; - list = kcalloc(i, sizeof(struct ip_set *), GFP_KERNEL); + list = kvcalloc(i, sizeof(struct ip_set *), GFP_KERNEL); if (!list) goto cleanup; /* nfnl mutex is held, both lists are valid */ @@ -972,7 +972,7 @@ static int ip_set_create(struct net *net, struct sock *ctnl, /* Use new list */ index = inst->ip_set_max; inst->ip_set_max = i; - kfree(tmp); + kvfree(tmp); ret = 0; } else if (ret) { goto cleanup; @@ -2058,7 +2058,7 @@ ip_set_net_init(struct net *net) if (inst->ip_set_max >= IPSET_INVALID_ID) inst->ip_set_max = IPSET_INVALID_ID - 1; - list = kcalloc(inst->ip_set_max, sizeof(struct ip_set *), GFP_KERNEL); + list = kvcalloc(inst->ip_set_max, sizeof(struct ip_set *), GFP_KERNEL); if (!list) return -ENOMEM; inst->is_deleted = false; @@ -2086,7 +2086,7 @@ ip_set_net_exit(struct net *net) } } nfnl_unlock(NFNL_SUBSYS_IPSET); - kfree(rcu_dereference_protected(inst->ip_set_list, 1)); + kvfree(rcu_dereference_protected(inst->ip_set_list, 1)); } static struct pernet_operations ip_set_net_ops = { -- cgit From 7de414a9dd91426318df7b63da024b2b07e53df5 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 1 Nov 2018 12:02:37 -0700 Subject: net: drop skb on failure in ip_check_defrag() Most callers of pskb_trim_rcsum() simply drop the skb when it fails, however, ip_check_defrag() still continues to pass the skb up to stack. This is suspicious. In ip_check_defrag(), after we learn the skb is an IP fragment, passing the skb to callers makes no sense, because callers expect fragments are defrag'ed on success. So, dropping the skb when we can't defrag it is reasonable. Note, prior to commit 88078d98d1bb, this is not a big problem as checksum will be fixed up anyway. After it, the checksum is not correct on failure. Found this during code review. Fixes: 88078d98d1bb ("net: pskb_trim_rcsum() and CHECKSUM_COMPLETE are friends") Cc: Eric Dumazet Signed-off-by: Cong Wang Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/ip_fragment.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 9b0158fa431f..d6ee343fdb86 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -722,10 +722,14 @@ struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *skb, u32 user) if (ip_is_fragment(&iph)) { skb = skb_share_check(skb, GFP_ATOMIC); if (skb) { - if (!pskb_may_pull(skb, netoff + iph.ihl * 4)) - return skb; - if (pskb_trim_rcsum(skb, netoff + len)) - return skb; + if (!pskb_may_pull(skb, netoff + iph.ihl * 4)) { + kfree_skb(skb); + return NULL; + } + if (pskb_trim_rcsum(skb, netoff + len)) { + kfree_skb(skb); + return NULL; + } memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); if (ip_defrag(net, skb, user)) return NULL; -- cgit From 49682bfa1e0e448a711471a5db83be0df1fb39a2 Mon Sep 17 00:00:00 2001 From: Mathieu Malaterre Date: Wed, 31 Oct 2018 13:16:58 +0100 Subject: net: document skb parameter in function 'skb_gso_size_check' Remove kernel-doc warning: net/core/skbuff.c:4953: warning: Function parameter or member 'skb' not described in 'skb_gso_size_check' Signed-off-by: Mathieu Malaterre Signed-off-by: David S. Miller --- net/core/skbuff.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 946de0e24c87..b4ee5c8b928f 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4944,6 +4944,8 @@ static unsigned int skb_gso_mac_seglen(const struct sk_buff *skb) * * This is a helper to do that correctly considering GSO_BY_FRAGS. * + * @skb: GSO skb + * * @seg_len: The segmented length (from skb_gso_*_seglen). In the * GSO_BY_FRAGS case this will be [header sizes + GSO_BY_FRAGS]. * -- cgit From 2384d02520ff2a916169b2fd85ea50e923ed56c2 Mon Sep 17 00:00:00 2001 From: Jeff Barnhill <0xeffeff@gmail.com> Date: Fri, 2 Nov 2018 20:23:57 +0000 Subject: net/ipv6: Add anycast addresses to a global hashtable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit icmp6_send() function is expensive on systems with a large number of interfaces. Every time it’s called, it has to verify that the source address does not correspond to an existing anycast address by looping through every device and every anycast address on the device. This can result in significant delays for a CPU when there are a large number of neighbors and ND timers are frequently timing out and calling neigh_invalidate(). Add anycast addresses to a global hashtable to allow quick searching for matching anycast addresses. This is based on inet6_addr_lst in addrconf.c. Signed-off-by: Jeff Barnhill <0xeffeff@gmail.com> Signed-off-by: David S. Miller --- net/ipv6/af_inet6.c | 5 ++++ net/ipv6/anycast.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 81 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 3f4d61017a69..f0cd291034f0 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -1001,6 +1001,9 @@ static int __init inet6_init(void) err = ip6_flowlabel_init(); if (err) goto ip6_flowlabel_fail; + err = ipv6_anycast_init(); + if (err) + goto ipv6_anycast_fail; err = addrconf_init(); if (err) goto addrconf_fail; @@ -1091,6 +1094,8 @@ ipv6_frag_fail: ipv6_exthdrs_fail: addrconf_cleanup(); addrconf_fail: + ipv6_anycast_cleanup(); +ipv6_anycast_fail: ip6_flowlabel_cleanup(); ip6_flowlabel_fail: ndisc_late_cleanup(); diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index 4e0ff7031edd..7698637cf827 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -44,8 +44,22 @@ #include +#define IN6_ADDR_HSIZE_SHIFT 8 +#define IN6_ADDR_HSIZE BIT(IN6_ADDR_HSIZE_SHIFT) +/* anycast address hash table + */ +static struct hlist_head inet6_acaddr_lst[IN6_ADDR_HSIZE]; +static DEFINE_SPINLOCK(acaddr_hash_lock); + static int ipv6_dev_ac_dec(struct net_device *dev, const struct in6_addr *addr); +static u32 inet6_acaddr_hash(struct net *net, const struct in6_addr *addr) +{ + u32 val = ipv6_addr_hash(addr) ^ net_hash_mix(net); + + return hash_32(val, IN6_ADDR_HSIZE_SHIFT); +} + /* * socket join an anycast group */ @@ -204,16 +218,39 @@ void ipv6_sock_ac_close(struct sock *sk) rtnl_unlock(); } +static void ipv6_add_acaddr_hash(struct net *net, struct ifacaddr6 *aca) +{ + unsigned int hash = inet6_acaddr_hash(net, &aca->aca_addr); + + spin_lock(&acaddr_hash_lock); + hlist_add_head_rcu(&aca->aca_addr_lst, &inet6_acaddr_lst[hash]); + spin_unlock(&acaddr_hash_lock); +} + +static void ipv6_del_acaddr_hash(struct ifacaddr6 *aca) +{ + spin_lock(&acaddr_hash_lock); + hlist_del_init_rcu(&aca->aca_addr_lst); + spin_unlock(&acaddr_hash_lock); +} + static void aca_get(struct ifacaddr6 *aca) { refcount_inc(&aca->aca_refcnt); } +static void aca_free_rcu(struct rcu_head *h) +{ + struct ifacaddr6 *aca = container_of(h, struct ifacaddr6, rcu); + + fib6_info_release(aca->aca_rt); + kfree(aca); +} + static void aca_put(struct ifacaddr6 *ac) { if (refcount_dec_and_test(&ac->aca_refcnt)) { - fib6_info_release(ac->aca_rt); - kfree(ac); + call_rcu(&ac->rcu, aca_free_rcu); } } @@ -229,6 +266,7 @@ static struct ifacaddr6 *aca_alloc(struct fib6_info *f6i, aca->aca_addr = *addr; fib6_info_hold(f6i); aca->aca_rt = f6i; + INIT_HLIST_NODE(&aca->aca_addr_lst); aca->aca_users = 1; /* aca_tstamp should be updated upon changes */ aca->aca_cstamp = aca->aca_tstamp = jiffies; @@ -285,6 +323,8 @@ int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr) aca_get(aca); write_unlock_bh(&idev->lock); + ipv6_add_acaddr_hash(net, aca); + ip6_ins_rt(net, f6i); addrconf_join_solict(idev->dev, &aca->aca_addr); @@ -325,6 +365,7 @@ int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr) else idev->ac_list = aca->aca_next; write_unlock_bh(&idev->lock); + ipv6_del_acaddr_hash(aca); addrconf_leave_solict(idev, &aca->aca_addr); ip6_del_rt(dev_net(idev->dev), aca->aca_rt); @@ -352,6 +393,8 @@ void ipv6_ac_destroy_dev(struct inet6_dev *idev) idev->ac_list = aca->aca_next; write_unlock_bh(&idev->lock); + ipv6_del_acaddr_hash(aca); + addrconf_leave_solict(idev, &aca->aca_addr); ip6_del_rt(dev_net(idev->dev), aca->aca_rt); @@ -390,17 +433,25 @@ static bool ipv6_chk_acast_dev(struct net_device *dev, const struct in6_addr *ad bool ipv6_chk_acast_addr(struct net *net, struct net_device *dev, const struct in6_addr *addr) { + unsigned int hash = inet6_acaddr_hash(net, addr); + struct net_device *nh_dev; + struct ifacaddr6 *aca; bool found = false; rcu_read_lock(); if (dev) found = ipv6_chk_acast_dev(dev, addr); else - for_each_netdev_rcu(net, dev) - if (ipv6_chk_acast_dev(dev, addr)) { + hlist_for_each_entry_rcu(aca, &inet6_acaddr_lst[hash], + aca_addr_lst) { + nh_dev = fib6_info_nh_dev(aca->aca_rt); + if (!nh_dev || !net_eq(dev_net(nh_dev), net)) + continue; + if (ipv6_addr_equal(&aca->aca_addr, addr)) { found = true; break; } + } rcu_read_unlock(); return found; } @@ -539,4 +590,25 @@ void ac6_proc_exit(struct net *net) { remove_proc_entry("anycast6", net->proc_net); } + +/* Init / cleanup code + */ +int __init ipv6_anycast_init(void) +{ + int i; + + for (i = 0; i < IN6_ADDR_HSIZE; i++) + INIT_HLIST_HEAD(&inet6_acaddr_lst[i]); + return 0; +} + +void ipv6_anycast_cleanup(void) +{ + int i; + + spin_lock(&acaddr_hash_lock); + for (i = 0; i < IN6_ADDR_HSIZE; i++) + WARN_ON(!hlist_empty(&inet6_acaddr_lst[i])); + spin_unlock(&acaddr_hash_lock); +} #endif -- cgit From c7e86acfcee30794dc99a0759924bf7b9d43f1ca Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 1 Nov 2018 13:39:53 +0000 Subject: rxrpc: Fix lockup due to no error backoff after ack transmit error If the network becomes (partially) unavailable, say by disabling IPv6, the background ACK transmission routine can get itself into a tizzy by proposing immediate ACK retransmission. Since we're in the call event processor, that happens immediately without returning to the workqueue manager. The condition should clear after a while when either the network comes back or the call times out. Fix this by: (1) When re-proposing an ACK on failed Tx, don't schedule it immediately. This will allow a certain amount of time to elapse before we try again. (2) Enforce a return to the workqueue manager after a certain number of iterations of the call processing loop. (3) Add a backoff delay that increases the delay on deferred ACKs by a jiffy per failed transmission to a limit of HZ. The backoff delay is cleared on a successful return from kernel_sendmsg(). (4) Cancel calls immediately if the opening sendmsg fails. The layer above can arrange retransmission or rotate to another server. Fixes: 248f219cb8bc ("rxrpc: Rewrite the data and ack handling code") Signed-off-by: David Howells Signed-off-by: David S. Miller --- net/rxrpc/ar-internal.h | 1 + net/rxrpc/call_event.c | 18 ++++++++++++++---- net/rxrpc/output.c | 35 +++++++++++++++++++++++++++++++---- 3 files changed, 46 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 382196e57a26..bc628acf4f4f 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -611,6 +611,7 @@ struct rxrpc_call { * not hard-ACK'd packet follows this. */ rxrpc_seq_t tx_top; /* Highest Tx slot allocated. */ + u16 tx_backoff; /* Delay to insert due to Tx failure */ /* TCP-style slow-start congestion control [RFC5681]. Since the SMSS * is fixed, we keep these numbers in terms of segments (ie. DATA diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 8e7434e92097..468efc3660c0 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -123,6 +123,7 @@ static void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, else ack_at = expiry; + ack_at += READ_ONCE(call->tx_backoff); ack_at += now; if (time_before(ack_at, call->ack_at)) { WRITE_ONCE(call->ack_at, ack_at); @@ -311,6 +312,7 @@ void rxrpc_process_call(struct work_struct *work) container_of(work, struct rxrpc_call, processor); rxrpc_serial_t *send_ack; unsigned long now, next, t; + unsigned int iterations = 0; rxrpc_see_call(call); @@ -319,6 +321,11 @@ void rxrpc_process_call(struct work_struct *work) call->debug_id, rxrpc_call_states[call->state], call->events); recheck_state: + /* Limit the number of times we do this before returning to the manager */ + iterations++; + if (iterations > 5) + goto requeue; + if (test_and_clear_bit(RXRPC_CALL_EV_ABORT, &call->events)) { rxrpc_send_abort_packet(call); goto recheck_state; @@ -447,13 +454,16 @@ recheck_state: rxrpc_reduce_call_timer(call, next, now, rxrpc_timer_restart); /* other events may have been raised since we started checking */ - if (call->events && call->state < RXRPC_CALL_COMPLETE) { - __rxrpc_queue_call(call); - goto out; - } + if (call->events && call->state < RXRPC_CALL_COMPLETE) + goto requeue; out_put: rxrpc_put_call(call, rxrpc_call_put); out: _leave(""); + return; + +requeue: + __rxrpc_queue_call(call); + goto out; } diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 189418888839..736aa9281100 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -34,6 +34,21 @@ struct rxrpc_abort_buffer { static const char rxrpc_keepalive_string[] = ""; +/* + * Increase Tx backoff on transmission failure and clear it on success. + */ +static void rxrpc_tx_backoff(struct rxrpc_call *call, int ret) +{ + if (ret < 0) { + u16 tx_backoff = READ_ONCE(call->tx_backoff); + + if (tx_backoff < HZ) + WRITE_ONCE(call->tx_backoff, tx_backoff + 1); + } else { + WRITE_ONCE(call->tx_backoff, 0); + } +} + /* * Arrange for a keepalive ping a certain time after we last transmitted. This * lets the far side know we're still interested in this call and helps keep @@ -210,6 +225,7 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping, else trace_rxrpc_tx_packet(call->debug_id, &pkt->whdr, rxrpc_tx_point_call_ack); + rxrpc_tx_backoff(call, ret); if (call->state < RXRPC_CALL_COMPLETE) { if (ret < 0) { @@ -218,7 +234,7 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping, rxrpc_propose_ACK(call, pkt->ack.reason, ntohs(pkt->ack.maxSkew), ntohl(pkt->ack.serial), - true, true, + false, true, rxrpc_propose_ack_retry_tx); } else { spin_lock_bh(&call->lock); @@ -300,7 +316,7 @@ int rxrpc_send_abort_packet(struct rxrpc_call *call) else trace_rxrpc_tx_packet(call->debug_id, &pkt.whdr, rxrpc_tx_point_call_abort); - + rxrpc_tx_backoff(call, ret); rxrpc_put_connection(conn); return ret; @@ -413,6 +429,7 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb, else trace_rxrpc_tx_packet(call->debug_id, &whdr, rxrpc_tx_point_call_data_nofrag); + rxrpc_tx_backoff(call, ret); if (ret == -EMSGSIZE) goto send_fragmentable; @@ -445,9 +462,18 @@ done: rxrpc_reduce_call_timer(call, expect_rx_by, nowj, rxrpc_timer_set_for_normal); } - } - rxrpc_set_keepalive(call); + rxrpc_set_keepalive(call); + } else { + /* Cancel the call if the initial transmission fails, + * particularly if that's due to network routing issues that + * aren't going away anytime soon. The layer above can arrange + * the retransmission. + */ + if (!test_and_set_bit(RXRPC_CALL_BEGAN_RX_TIMER, &call->flags)) + rxrpc_set_call_completion(call, RXRPC_CALL_LOCAL_ERROR, + RX_USER_ABORT, ret); + } _leave(" = %d [%u]", ret, call->peer->maxdata); return ret; @@ -506,6 +532,7 @@ send_fragmentable: else trace_rxrpc_tx_packet(call->debug_id, &whdr, rxrpc_tx_point_call_data_frag); + rxrpc_tx_backoff(call, ret); up_write(&conn->params.local->defrag_sem); goto done; -- cgit From 54451f60c8fa061af9051a53be9786393947367c Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Sun, 21 Oct 2018 00:00:08 +0900 Subject: netfilter: xt_IDLETIMER: add sysfs filename checking routine When IDLETIMER rule is added, sysfs file is created under /sys/class/xt_idletimer/timers/ But some label name shouldn't be used. ".", "..", "power", "uevent", "subsystem", etc... So that sysfs filename checking routine is needed. test commands: %iptables -I INPUT -j IDLETIMER --timeout 1 --label "power" splat looks like: [95765.423132] sysfs: cannot create duplicate filename '/devices/virtual/xt_idletimer/timers/power' [95765.433418] CPU: 0 PID: 8446 Comm: iptables Not tainted 4.19.0-rc6+ #20 [95765.449755] Call Trace: [95765.449755] dump_stack+0xc9/0x16b [95765.449755] ? show_regs_print_info+0x5/0x5 [95765.449755] sysfs_warn_dup+0x74/0x90 [95765.449755] sysfs_add_file_mode_ns+0x352/0x500 [95765.449755] sysfs_create_file_ns+0x179/0x270 [95765.449755] ? sysfs_add_file_mode_ns+0x500/0x500 [95765.449755] ? idletimer_tg_checkentry+0x3e5/0xb1b [xt_IDLETIMER] [95765.449755] ? rcu_read_lock_sched_held+0x114/0x130 [95765.449755] ? __kmalloc_track_caller+0x211/0x2b0 [95765.449755] ? memcpy+0x34/0x50 [95765.449755] idletimer_tg_checkentry+0x4e2/0xb1b [xt_IDLETIMER] [ ... ] Fixes: 0902b469bd25 ("netfilter: xtables: idletimer target implementation") Signed-off-by: Taehee Yoo Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_IDLETIMER.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'net') diff --git a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c index c6acfc2d9c84..eb4cbd244c3d 100644 --- a/net/netfilter/xt_IDLETIMER.c +++ b/net/netfilter/xt_IDLETIMER.c @@ -114,6 +114,22 @@ static void idletimer_tg_expired(struct timer_list *t) schedule_work(&timer->work); } +static int idletimer_check_sysfs_name(const char *name, unsigned int size) +{ + int ret; + + ret = xt_check_proc_name(name, size); + if (ret < 0) + return ret; + + if (!strcmp(name, "power") || + !strcmp(name, "subsystem") || + !strcmp(name, "uevent")) + return -EINVAL; + + return 0; +} + static int idletimer_tg_create(struct idletimer_tg_info *info) { int ret; @@ -124,6 +140,10 @@ static int idletimer_tg_create(struct idletimer_tg_info *info) goto out; } + ret = idletimer_check_sysfs_name(info->label, sizeof(info->label)); + if (ret < 0) + goto out_free_timer; + sysfs_attr_init(&info->timer->attr.attr); info->timer->attr.attr.name = kstrdup(info->label, GFP_KERNEL); if (!info->timer->attr.attr.name) { -- cgit From 8a02bdd50b2ecb6d62121d2958d3ea186cc88ce7 Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Tue, 30 Oct 2018 22:43:42 +0100 Subject: netfilter: ipset: Fix calling ip_set() macro at dumping The ip_set() macro is called when either ip_set_ref_lock held only or no lock/nfnl mutex is held at dumping. Take this into account properly. Also, use Pablo's suggestion to use rcu_dereference_raw(), the ref_netlink protects the set. Signed-off-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipset/ip_set_core.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 68db946df151..1577f2f76060 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -55,11 +55,15 @@ MODULE_AUTHOR("Jozsef Kadlecsik "); MODULE_DESCRIPTION("core IP set support"); MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_IPSET); -/* When the nfnl mutex is held: */ +/* When the nfnl mutex or ip_set_ref_lock is held: */ #define ip_set_dereference(p) \ - rcu_dereference_protected(p, lockdep_nfnl_is_held(NFNL_SUBSYS_IPSET)) + rcu_dereference_protected(p, \ + lockdep_nfnl_is_held(NFNL_SUBSYS_IPSET) || \ + lockdep_is_held(&ip_set_ref_lock)) #define ip_set(inst, id) \ ip_set_dereference((inst)->ip_set_list)[id] +#define ip_set_ref_netlink(inst,id) \ + rcu_dereference_raw((inst)->ip_set_list)[id] /* The set types are implemented in modules and registered set types * can be found in ip_set_type_list. Adding/deleting types is @@ -1251,7 +1255,7 @@ ip_set_dump_done(struct netlink_callback *cb) struct ip_set_net *inst = (struct ip_set_net *)cb->args[IPSET_CB_NET]; ip_set_id_t index = (ip_set_id_t)cb->args[IPSET_CB_INDEX]; - struct ip_set *set = ip_set(inst, index); + struct ip_set *set = ip_set_ref_netlink(inst, index); if (set->variant->uref) set->variant->uref(set, cb, false); @@ -1440,7 +1444,7 @@ next_set: release_refcount: /* If there was an error or set is done, release set */ if (ret || !cb->args[IPSET_CB_ARG0]) { - set = ip_set(inst, index); + set = ip_set_ref_netlink(inst, index); if (set->variant->uref) set->variant->uref(set, cb, false); pr_debug("release set %s\n", set->name); -- cgit From a95a7774d51e13f9cf4b7285666829b68852f07a Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 2 Nov 2018 00:11:34 +0100 Subject: netfilter: conntrack: add nf_{tcp,udp,sctp,icmp,dccp,icmpv6,generic}_pernet() Expose these functions to access conntrack protocol tracker netns area, nfnetlink_cttimeout needs this. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_proto_dccp.c | 13 ++++--------- net/netfilter/nf_conntrack_proto_generic.c | 11 +++-------- net/netfilter/nf_conntrack_proto_icmp.c | 11 +++-------- net/netfilter/nf_conntrack_proto_icmpv6.c | 11 +++-------- net/netfilter/nf_conntrack_proto_sctp.c | 11 +++-------- net/netfilter/nf_conntrack_proto_tcp.c | 15 +++++---------- net/netfilter/nf_conntrack_proto_udp.c | 11 +++-------- 7 files changed, 24 insertions(+), 59 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index 171e9e122e5f..023c1445bc39 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -384,11 +384,6 @@ dccp_state_table[CT_DCCP_ROLE_MAX + 1][DCCP_PKT_SYNCACK + 1][CT_DCCP_MAX + 1] = }, }; -static inline struct nf_dccp_net *dccp_pernet(struct net *net) -{ - return &net->ct.nf_ct_proto.dccp; -} - static noinline bool dccp_new(struct nf_conn *ct, const struct sk_buff *skb, const struct dccp_hdr *dh) @@ -401,7 +396,7 @@ dccp_new(struct nf_conn *ct, const struct sk_buff *skb, state = dccp_state_table[CT_DCCP_ROLE_CLIENT][dh->dccph_type][CT_DCCP_NONE]; switch (state) { default: - dn = dccp_pernet(net); + dn = nf_dccp_pernet(net); if (dn->dccp_loose == 0) { msg = "not picking up existing connection "; goto out_invalid; @@ -568,7 +563,7 @@ static int dccp_packet(struct nf_conn *ct, struct sk_buff *skb, timeouts = nf_ct_timeout_lookup(ct); if (!timeouts) - timeouts = dccp_pernet(nf_ct_net(ct))->dccp_timeout; + timeouts = nf_dccp_pernet(nf_ct_net(ct))->dccp_timeout; nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[new_state]); return NF_ACCEPT; @@ -681,7 +676,7 @@ static int nlattr_to_dccp(struct nlattr *cda[], struct nf_conn *ct) static int dccp_timeout_nlattr_to_obj(struct nlattr *tb[], struct net *net, void *data) { - struct nf_dccp_net *dn = dccp_pernet(net); + struct nf_dccp_net *dn = nf_dccp_pernet(net); unsigned int *timeouts = data; int i; @@ -814,7 +809,7 @@ static int dccp_kmemdup_sysctl_table(struct net *net, struct nf_proto_net *pn, static int dccp_init_net(struct net *net) { - struct nf_dccp_net *dn = dccp_pernet(net); + struct nf_dccp_net *dn = nf_dccp_pernet(net); struct nf_proto_net *pn = &dn->pn; if (!pn->users) { diff --git a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c index e10e867e0b55..5da19d5fbc76 100644 --- a/net/netfilter/nf_conntrack_proto_generic.c +++ b/net/netfilter/nf_conntrack_proto_generic.c @@ -27,11 +27,6 @@ static bool nf_generic_should_process(u8 proto) } } -static inline struct nf_generic_net *generic_pernet(struct net *net) -{ - return &net->ct.nf_ct_proto.generic; -} - static bool generic_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, struct net *net, struct nf_conntrack_tuple *tuple) @@ -58,7 +53,7 @@ static int generic_packet(struct nf_conn *ct, } if (!timeout) - timeout = &generic_pernet(nf_ct_net(ct))->timeout; + timeout = &nf_generic_pernet(nf_ct_net(ct))->timeout; nf_ct_refresh_acct(ct, ctinfo, skb, *timeout); return NF_ACCEPT; @@ -72,7 +67,7 @@ static int generic_packet(struct nf_conn *ct, static int generic_timeout_nlattr_to_obj(struct nlattr *tb[], struct net *net, void *data) { - struct nf_generic_net *gn = generic_pernet(net); + struct nf_generic_net *gn = nf_generic_pernet(net); unsigned int *timeout = data; if (!timeout) @@ -138,7 +133,7 @@ static int generic_kmemdup_sysctl_table(struct nf_proto_net *pn, static int generic_init_net(struct net *net) { - struct nf_generic_net *gn = generic_pernet(net); + struct nf_generic_net *gn = nf_generic_pernet(net); struct nf_proto_net *pn = &gn->pn; gn->timeout = nf_ct_generic_timeout; diff --git a/net/netfilter/nf_conntrack_proto_icmp.c b/net/netfilter/nf_conntrack_proto_icmp.c index 3598520bd19b..de64d8a5fdfd 100644 --- a/net/netfilter/nf_conntrack_proto_icmp.c +++ b/net/netfilter/nf_conntrack_proto_icmp.c @@ -25,11 +25,6 @@ static const unsigned int nf_ct_icmp_timeout = 30*HZ; -static inline struct nf_icmp_net *icmp_pernet(struct net *net) -{ - return &net->ct.nf_ct_proto.icmp; -} - static bool icmp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, struct net *net, struct nf_conntrack_tuple *tuple) { @@ -103,7 +98,7 @@ static int icmp_packet(struct nf_conn *ct, } if (!timeout) - timeout = &icmp_pernet(nf_ct_net(ct))->timeout; + timeout = &nf_icmp_pernet(nf_ct_net(ct))->timeout; nf_ct_refresh_acct(ct, ctinfo, skb, *timeout); return NF_ACCEPT; @@ -275,7 +270,7 @@ static int icmp_timeout_nlattr_to_obj(struct nlattr *tb[], struct net *net, void *data) { unsigned int *timeout = data; - struct nf_icmp_net *in = icmp_pernet(net); + struct nf_icmp_net *in = nf_icmp_pernet(net); if (tb[CTA_TIMEOUT_ICMP_TIMEOUT]) { if (!timeout) @@ -337,7 +332,7 @@ static int icmp_kmemdup_sysctl_table(struct nf_proto_net *pn, static int icmp_init_net(struct net *net) { - struct nf_icmp_net *in = icmp_pernet(net); + struct nf_icmp_net *in = nf_icmp_pernet(net); struct nf_proto_net *pn = &in->pn; in->timeout = nf_ct_icmp_timeout; diff --git a/net/netfilter/nf_conntrack_proto_icmpv6.c b/net/netfilter/nf_conntrack_proto_icmpv6.c index 378618feed5d..a15eefb8e317 100644 --- a/net/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/netfilter/nf_conntrack_proto_icmpv6.c @@ -30,11 +30,6 @@ static const unsigned int nf_ct_icmpv6_timeout = 30*HZ; -static inline struct nf_icmp_net *icmpv6_pernet(struct net *net) -{ - return &net->ct.nf_ct_proto.icmpv6; -} - static bool icmpv6_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, struct net *net, @@ -87,7 +82,7 @@ static bool icmpv6_invert_tuple(struct nf_conntrack_tuple *tuple, static unsigned int *icmpv6_get_timeouts(struct net *net) { - return &icmpv6_pernet(net)->timeout; + return &nf_icmpv6_pernet(net)->timeout; } /* Returns verdict for packet, or -1 for invalid. */ @@ -286,7 +281,7 @@ static int icmpv6_timeout_nlattr_to_obj(struct nlattr *tb[], struct net *net, void *data) { unsigned int *timeout = data; - struct nf_icmp_net *in = icmpv6_pernet(net); + struct nf_icmp_net *in = nf_icmpv6_pernet(net); if (!timeout) timeout = icmpv6_get_timeouts(net); @@ -348,7 +343,7 @@ static int icmpv6_kmemdup_sysctl_table(struct nf_proto_net *pn, static int icmpv6_init_net(struct net *net) { - struct nf_icmp_net *in = icmpv6_pernet(net); + struct nf_icmp_net *in = nf_icmpv6_pernet(net); struct nf_proto_net *pn = &in->pn; in->timeout = nf_ct_icmpv6_timeout; diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index 3d719d3eb9a3..d53e3e78f605 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -146,11 +146,6 @@ static const u8 sctp_conntracks[2][11][SCTP_CONNTRACK_MAX] = { } }; -static inline struct nf_sctp_net *sctp_pernet(struct net *net) -{ - return &net->ct.nf_ct_proto.sctp; -} - #ifdef CONFIG_NF_CONNTRACK_PROCFS /* Print out the private part of the conntrack. */ static void sctp_print_conntrack(struct seq_file *s, struct nf_conn *ct) @@ -480,7 +475,7 @@ static int sctp_packet(struct nf_conn *ct, timeouts = nf_ct_timeout_lookup(ct); if (!timeouts) - timeouts = sctp_pernet(nf_ct_net(ct))->timeouts; + timeouts = nf_sctp_pernet(nf_ct_net(ct))->timeouts; nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[new_state]); @@ -599,7 +594,7 @@ static int sctp_timeout_nlattr_to_obj(struct nlattr *tb[], struct net *net, void *data) { unsigned int *timeouts = data; - struct nf_sctp_net *sn = sctp_pernet(net); + struct nf_sctp_net *sn = nf_sctp_pernet(net); int i; /* set default SCTP timeouts. */ @@ -736,7 +731,7 @@ static int sctp_kmemdup_sysctl_table(struct nf_proto_net *pn, static int sctp_init_net(struct net *net) { - struct nf_sctp_net *sn = sctp_pernet(net); + struct nf_sctp_net *sn = nf_sctp_pernet(net); struct nf_proto_net *pn = &sn->pn; if (!pn->users) { diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 1bcf9984d45e..4dcbd51a8e97 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -272,11 +272,6 @@ static const u8 tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = { } }; -static inline struct nf_tcp_net *tcp_pernet(struct net *net) -{ - return &net->ct.nf_ct_proto.tcp; -} - #ifdef CONFIG_NF_CONNTRACK_PROCFS /* Print out the private part of the conntrack. */ static void tcp_print_conntrack(struct seq_file *s, struct nf_conn *ct) @@ -475,7 +470,7 @@ static bool tcp_in_window(const struct nf_conn *ct, const struct tcphdr *tcph) { struct net *net = nf_ct_net(ct); - struct nf_tcp_net *tn = tcp_pernet(net); + struct nf_tcp_net *tn = nf_tcp_pernet(net); struct ip_ct_tcp_state *sender = &state->seen[dir]; struct ip_ct_tcp_state *receiver = &state->seen[!dir]; const struct nf_conntrack_tuple *tuple = &ct->tuplehash[dir].tuple; @@ -767,7 +762,7 @@ static noinline bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb, { enum tcp_conntrack new_state; struct net *net = nf_ct_net(ct); - const struct nf_tcp_net *tn = tcp_pernet(net); + const struct nf_tcp_net *tn = nf_tcp_pernet(net); const struct ip_ct_tcp_state *sender = &ct->proto.tcp.seen[0]; const struct ip_ct_tcp_state *receiver = &ct->proto.tcp.seen[1]; @@ -841,7 +836,7 @@ static int tcp_packet(struct nf_conn *ct, const struct nf_hook_state *state) { struct net *net = nf_ct_net(ct); - struct nf_tcp_net *tn = tcp_pernet(net); + struct nf_tcp_net *tn = nf_tcp_pernet(net); struct nf_conntrack_tuple *tuple; enum tcp_conntrack new_state, old_state; unsigned int index, *timeouts; @@ -1283,7 +1278,7 @@ static unsigned int tcp_nlattr_tuple_size(void) static int tcp_timeout_nlattr_to_obj(struct nlattr *tb[], struct net *net, void *data) { - struct nf_tcp_net *tn = tcp_pernet(net); + struct nf_tcp_net *tn = nf_tcp_pernet(net); unsigned int *timeouts = data; int i; @@ -1508,7 +1503,7 @@ static int tcp_kmemdup_sysctl_table(struct nf_proto_net *pn, static int tcp_init_net(struct net *net) { - struct nf_tcp_net *tn = tcp_pernet(net); + struct nf_tcp_net *tn = nf_tcp_pernet(net); struct nf_proto_net *pn = &tn->pn; if (!pn->users) { diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c index a7aa70370913..c879d8d78cfd 100644 --- a/net/netfilter/nf_conntrack_proto_udp.c +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -32,14 +32,9 @@ static const unsigned int udp_timeouts[UDP_CT_MAX] = { [UDP_CT_REPLIED] = 180*HZ, }; -static inline struct nf_udp_net *udp_pernet(struct net *net) -{ - return &net->ct.nf_ct_proto.udp; -} - static unsigned int *udp_get_timeouts(struct net *net) { - return udp_pernet(net)->timeouts; + return nf_udp_pernet(net)->timeouts; } static void udp_error_log(const struct sk_buff *skb, @@ -212,7 +207,7 @@ static int udp_timeout_nlattr_to_obj(struct nlattr *tb[], struct net *net, void *data) { unsigned int *timeouts = data; - struct nf_udp_net *un = udp_pernet(net); + struct nf_udp_net *un = nf_udp_pernet(net); if (!timeouts) timeouts = un->timeouts; @@ -292,7 +287,7 @@ static int udp_kmemdup_sysctl_table(struct nf_proto_net *pn, static int udp_init_net(struct net *net) { - struct nf_udp_net *un = udp_pernet(net); + struct nf_udp_net *un = nf_udp_pernet(net); struct nf_proto_net *pn = &un->pn; if (!pn->users) { -- cgit From 8866df9264a34e675b4ee8a151db819b87cce2d3 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 2 Nov 2018 00:14:00 +0100 Subject: netfilter: nfnetlink_cttimeout: pass default timeout policy to obj_to_nlattr Otherwise, we hit a NULL pointer deference since handlers always assume default timeout policy is passed. netlink: 24 bytes leftover after parsing attributes in process `syz-executor2'. kasan: CONFIG_KASAN_INLINE enabled kasan: GPF could be caused by NULL-ptr deref or user memory access general protection fault: 0000 [#1] PREEMPT SMP KASAN CPU: 0 PID: 9575 Comm: syz-executor1 Not tainted 4.19.0+ #312 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:icmp_timeout_obj_to_nlattr+0x77/0x170 net/netfilter/nf_conntrack_proto_icmp.c:297 Fixes: c779e849608a ("netfilter: conntrack: remove get_timeout() indirection") Reported-by: Eric Dumazet Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nfnetlink_cttimeout.c | 47 +++++++++++++++++++++++++++++++------ 1 file changed, 40 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c index e7a50af1b3d6..a518eb162344 100644 --- a/net/netfilter/nfnetlink_cttimeout.c +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -382,7 +382,8 @@ err: static int cttimeout_default_fill_info(struct net *net, struct sk_buff *skb, u32 portid, u32 seq, u32 type, int event, u16 l3num, - const struct nf_conntrack_l4proto *l4proto) + const struct nf_conntrack_l4proto *l4proto, + const unsigned int *timeouts) { struct nlmsghdr *nlh; struct nfgenmsg *nfmsg; @@ -408,7 +409,7 @@ cttimeout_default_fill_info(struct net *net, struct sk_buff *skb, u32 portid, if (!nest_parms) goto nla_put_failure; - ret = l4proto->ctnl_timeout.obj_to_nlattr(skb, NULL); + ret = l4proto->ctnl_timeout.obj_to_nlattr(skb, timeouts); if (ret < 0) goto nla_put_failure; @@ -430,6 +431,7 @@ static int cttimeout_default_get(struct net *net, struct sock *ctnl, struct netlink_ext_ack *extack) { const struct nf_conntrack_l4proto *l4proto; + unsigned int *timeouts = NULL; struct sk_buff *skb2; int ret, err; __u16 l3num; @@ -442,12 +444,44 @@ static int cttimeout_default_get(struct net *net, struct sock *ctnl, l4num = nla_get_u8(cda[CTA_TIMEOUT_L4PROTO]); l4proto = nf_ct_l4proto_find_get(l4num); - /* This protocol is not supported, skip. */ - if (l4proto->l4proto != l4num) { - err = -EOPNOTSUPP; + err = -EOPNOTSUPP; + if (l4proto->l4proto != l4num) goto err; + + switch (l4proto->l4proto) { + case IPPROTO_ICMP: + timeouts = &nf_icmp_pernet(net)->timeout; + break; + case IPPROTO_TCP: + timeouts = nf_tcp_pernet(net)->timeouts; + break; + case IPPROTO_UDP: + timeouts = nf_udp_pernet(net)->timeouts; + break; + case IPPROTO_DCCP: +#ifdef CONFIG_NF_CT_PROTO_DCCP + timeouts = nf_dccp_pernet(net)->dccp_timeout; +#endif + break; + case IPPROTO_ICMPV6: + timeouts = &nf_icmpv6_pernet(net)->timeout; + break; + case IPPROTO_SCTP: +#ifdef CONFIG_NF_CT_PROTO_SCTP + timeouts = nf_sctp_pernet(net)->timeouts; +#endif + break; + case 255: + timeouts = &nf_generic_pernet(net)->timeout; + break; + default: + WARN_ON_ONCE(1); + break; } + if (!timeouts) + goto err; + skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (skb2 == NULL) { err = -ENOMEM; @@ -458,8 +492,7 @@ static int cttimeout_default_get(struct net *net, struct sock *ctnl, nlh->nlmsg_seq, NFNL_MSG_TYPE(nlh->nlmsg_type), IPCTNL_MSG_TIMEOUT_DEFAULT_SET, - l3num, - l4proto); + l3num, l4proto, timeouts); if (ret <= 0) { kfree_skb(skb2); err = -ENOMEM; -- cgit From e4844c9c62a0fe47980d6c3d4b7a096a5d755925 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 2 Nov 2018 11:33:37 +0100 Subject: netfilter: nft_compat: ebtables 'nat' table is normal chain type Unlike ip(6)tables, the ebtables nat table has no special properties. This bug causes 'ebtables -A' to fail when using a target such as 'snat' (ebt_snat target sets ".table = "nat"'). Targets that have no table restrictions work fine. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_compat.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index 768292eac2a4..9d0ede474224 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -54,9 +54,11 @@ static bool nft_xt_put(struct nft_xt *xt) return false; } -static int nft_compat_chain_validate_dependency(const char *tablename, - const struct nft_chain *chain) +static int nft_compat_chain_validate_dependency(const struct nft_ctx *ctx, + const char *tablename) { + enum nft_chain_types type = NFT_CHAIN_T_DEFAULT; + const struct nft_chain *chain = ctx->chain; const struct nft_base_chain *basechain; if (!tablename || @@ -64,9 +66,12 @@ static int nft_compat_chain_validate_dependency(const char *tablename, return 0; basechain = nft_base_chain(chain); - if (strcmp(tablename, "nat") == 0 && - basechain->type->type != NFT_CHAIN_T_NAT) - return -EINVAL; + if (strcmp(tablename, "nat") == 0) { + if (ctx->family != NFPROTO_BRIDGE) + type = NFT_CHAIN_T_NAT; + if (basechain->type->type != type) + return -EINVAL; + } return 0; } @@ -342,8 +347,7 @@ static int nft_target_validate(const struct nft_ctx *ctx, if (target->hooks && !(hook_mask & target->hooks)) return -EINVAL; - ret = nft_compat_chain_validate_dependency(target->table, - ctx->chain); + ret = nft_compat_chain_validate_dependency(ctx, target->table); if (ret < 0) return ret; } @@ -590,8 +594,7 @@ static int nft_match_validate(const struct nft_ctx *ctx, if (match->hooks && !(hook_mask & match->hooks)) return -EINVAL; - ret = nft_compat_chain_validate_dependency(match->table, - ctx->chain); + ret = nft_compat_chain_validate_dependency(ctx, match->table); if (ret < 0) return ret; } -- cgit From f393808dc64149ccd0e5a8427505ba2974a59854 Mon Sep 17 00:00:00 2001 From: Vasily Khoruzhick Date: Thu, 25 Oct 2018 12:15:43 -0700 Subject: netfilter: conntrack: fix calculation of next bucket number in early_drop If there's no entry to drop in bucket that corresponds to the hash, early_drop() should look for it in other buckets. But since it increments hash instead of bucket number, it actually looks in the same bucket 8 times: hsize is 16k by default (14 bits) and hash is 32-bit value, so reciprocal_scale(hash, hsize) returns the same value for hash..hash+7 in most cases. Fix it by increasing bucket number instead of hash and rename _hash to bucket to avoid future confusion. Fixes: 3e86638e9a0b ("netfilter: conntrack: consider ct netns in early_drop logic") Cc: # v4.7+ Signed-off-by: Vasily Khoruzhick Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_core.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index ca1168d67fac..e92e749aff53 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1073,19 +1073,22 @@ static unsigned int early_drop_list(struct net *net, return drops; } -static noinline int early_drop(struct net *net, unsigned int _hash) +static noinline int early_drop(struct net *net, unsigned int hash) { - unsigned int i; + unsigned int i, bucket; for (i = 0; i < NF_CT_EVICTION_RANGE; i++) { struct hlist_nulls_head *ct_hash; - unsigned int hash, hsize, drops; + unsigned int hsize, drops; rcu_read_lock(); nf_conntrack_get_ht(&ct_hash, &hsize); - hash = reciprocal_scale(_hash++, hsize); + if (!i) + bucket = reciprocal_scale(hash, hsize); + else + bucket = (bucket + 1) % hsize; - drops = early_drop_list(net, &ct_hash[hash]); + drops = early_drop_list(net, &ct_hash[bucket]); rcu_read_unlock(); if (drops) { -- cgit From fe60faa5063822f2d555f4f326c7dd72a60929bf Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 31 Oct 2018 08:39:13 -0700 Subject: net: do not abort bulk send on BQL status Before calling dev_hard_start_xmit(), upper layers tried to cook optimal skb list based on BQL budget. Problem is that GSO packets can end up comsuming more than the BQL budget. Breaking the loop is not useful, since requeued packets are ahead of any packets still in the qdisc. It is also more expensive, since next TX completion will push these packets later, while skbs are not in cpu caches. It is also a behavior difference with TSO packets, that can break the BQL limit by a large amount. Note that drivers should use __netdev_tx_sent_queue() in order to have optimal xmit_more support, and avoid useless atomic operations as shown in the following patch. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 77d43ae2a7bb..0ffcbdd55fa9 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3272,7 +3272,7 @@ struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *de } skb = next; - if (netif_xmit_stopped(txq) && skb) { + if (netif_tx_queue_stopped(txq) && skb) { rc = NETDEV_TX_BUSY; break; } -- cgit From a277d516de5f498c91d91189717ef7e01102ad27 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 2 Nov 2018 16:36:55 +0100 Subject: openvswitch: fix linking without CONFIG_NF_CONNTRACK_LABELS When CONFIG_CC_OPTIMIZE_FOR_DEBUGGING is enabled, the compiler fails to optimize out a dead code path, which leads to a link failure: net/openvswitch/conntrack.o: In function `ovs_ct_set_labels': conntrack.c:(.text+0x2e60): undefined reference to `nf_connlabels_replace' In this configuration, we can take a shortcut, and completely remove the contrack label code. This may also help the regular optimization. Signed-off-by: Arnd Bergmann Signed-off-by: David S. Miller --- net/openvswitch/conntrack.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 6bec37ab4472..a4660c48ff01 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -1203,7 +1203,8 @@ static int ovs_ct_commit(struct net *net, struct sw_flow_key *key, &info->labels.mask); if (err) return err; - } else if (labels_nonzero(&info->labels.mask)) { + } else if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) && + labels_nonzero(&info->labels.mask)) { err = ovs_ct_set_labels(ct, key, &info->labels.value, &info->labels.mask); if (err) -- cgit From 12480e3b16982c4026de10dd8155823219cd6391 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 3 Nov 2018 14:01:31 +0800 Subject: sctp: define SCTP_SS_DEFAULT for Stream schedulers According to rfc8260#section-4.3.2, SCTP_SS_DEFAULT is required to defined as SCTP_SS_FCFS or SCTP_SS_RR. SCTP_SS_FCFS is used for SCTP_SS_DEFAULT's value in this patch. Fixes: 5bbbbe32a431 ("sctp: introduce stream scheduler foundations") Reported-by: Jianwen Ji Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/sctp/outqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index 9cb854b05342..c37e1c2dec9d 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -212,7 +212,7 @@ void sctp_outq_init(struct sctp_association *asoc, struct sctp_outq *q) INIT_LIST_HEAD(&q->retransmit); INIT_LIST_HEAD(&q->sacked); INIT_LIST_HEAD(&q->abandoned); - sctp_sched_set_sched(asoc, SCTP_SS_FCFS); + sctp_sched_set_sched(asoc, SCTP_SS_DEFAULT); } /* Free the outqueue structure and any related pending chunks. -- cgit From 5d7a5bcb67c70cbc904057ef52d3fcfeb24420bb Mon Sep 17 00:00:00 2001 From: Frank Sorenson Date: Tue, 30 Oct 2018 15:10:40 -0500 Subject: sunrpc: correct the computation for page_ptr when truncating When truncating the encode buffer, the page_ptr is getting advanced, causing the next page to be skipped while encoding. The page is still included in the response, so the response contains a page of bogus data. We need to adjust the page_ptr backwards to ensure we encode the next page into the correct place. We saw this triggered when concurrent directory modifications caused nfsd4_encode_direct_fattr() to return nfserr_noent, and the resulting call to xdr_truncate_encode() corrupted the READDIR reply. Signed-off-by: Frank Sorenson Cc: stable@vger.kernel.org Signed-off-by: J. Bruce Fields --- net/sunrpc/xdr.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 2bbb8d38d2bf..5cfb9e0a18dc 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -673,11 +673,10 @@ void xdr_truncate_encode(struct xdr_stream *xdr, size_t len) WARN_ON_ONCE(xdr->iov); return; } - if (fraglen) { + if (fraglen) xdr->end = head->iov_base + head->iov_len; - xdr->page_ptr--; - } /* (otherwise assume xdr->end is already set) */ + xdr->page_ptr--; head->iov_len = len; buf->len = len; xdr->p = head->iov_base + head->iov_len; -- cgit From 6915ed86cca6a0b422e8ada05ec7009d2074b88a Mon Sep 17 00:00:00 2001 From: Jeff Barnhill <0xeffeff@gmail.com> Date: Mon, 5 Nov 2018 20:36:45 +0000 Subject: net/ipv6: Move anycast init/cleanup functions out of CONFIG_PROC_FS Move the anycast.c init and cleanup functions which were inadvertently added inside the CONFIG_PROC_FS definition. Fixes: 2384d02520ff ("net/ipv6: Add anycast addresses to a global hashtable") Signed-off-by: Jeff Barnhill <0xeffeff@gmail.com> Signed-off-by: David S. Miller --- net/ipv6/anycast.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index 7698637cf827..94999058e110 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -590,6 +590,7 @@ void ac6_proc_exit(struct net *net) { remove_proc_entry("anycast6", net->proc_net); } +#endif /* Init / cleanup code */ @@ -611,4 +612,3 @@ void ipv6_anycast_cleanup(void) WARN_ON(!hlist_empty(&inet6_acaddr_lst[i])); spin_unlock(&acaddr_hash_lock); } -#endif -- cgit From 5e1acb4afacc6229946c3d2b7ffc422b53fb2448 Mon Sep 17 00:00:00 2001 From: Alexey Kodanev Date: Fri, 2 Nov 2018 19:11:04 +0300 Subject: rtnetlink: restore handling of dumpit return value in rtnl_dump_all() For non-zero return from dumpit() we should break the loop in rtnl_dump_all() and return the result. Otherwise, e.g., we could get the memory leak in inet6_dump_fib() [1]. The pointer to the allocated struct fib6_walker there (saved in cb->args) can be lost, reset on the next iteration. Fix it by partially restoring the previous behavior before commit c63586dc9b3e ("net: rtnl_dump_all needs to propagate error from dumpit function"). The returned error from dumpit() is still passed further. [1]: unreferenced object 0xffff88001322a200 (size 96): comm "sshd", pid 1484, jiffies 4296032768 (age 1432.542s) hex dump (first 32 bytes): 00 01 00 00 00 00 ad de 00 02 00 00 00 00 ad de ................ 18 09 41 36 00 88 ff ff 18 09 41 36 00 88 ff ff ..A6......A6.... backtrace: [<0000000095846b39>] kmem_cache_alloc_trace+0x151/0x220 [<000000007d12709f>] inet6_dump_fib+0x68d/0x940 [<000000002775a316>] rtnl_dump_all+0x1d9/0x2d0 [<00000000d7cd302b>] netlink_dump+0x945/0x11a0 [<000000002f43485f>] __netlink_dump_start+0x55d/0x800 [<00000000f76bbeec>] rtnetlink_rcv_msg+0x4fa/0xa00 [<000000009b5761f3>] netlink_rcv_skb+0x29c/0x420 [<0000000087a1dae1>] rtnetlink_rcv+0x15/0x20 [<00000000691b703b>] netlink_unicast+0x4e3/0x6c0 [<00000000b5be0204>] netlink_sendmsg+0x7f2/0xba0 [<0000000096d2aa60>] sock_sendmsg+0xba/0xf0 [<000000008c1b786f>] __sys_sendto+0x1e4/0x330 [<0000000019587b3f>] __x64_sys_sendto+0xe1/0x1a0 [<00000000071f4d56>] do_syscall_64+0x9f/0x300 [<000000002737577f>] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [<0000000057587684>] 0xffffffffffffffff Fixes: c63586dc9b3e ("net: rtnl_dump_all needs to propagate error from dumpit function") Signed-off-by: Alexey Kodanev Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index e01274bd5e3e..33d9227a8b80 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3367,7 +3367,7 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb) cb->seq = 0; } ret = dumpit(skb, cb); - if (ret < 0) + if (ret) break; } cb->family = idx; -- cgit From e22d0bfa09a56e427a1a950ccb85621c86b343a4 Mon Sep 17 00:00:00 2001 From: Alexey Kodanev Date: Fri, 2 Nov 2018 19:11:05 +0300 Subject: ipv6: properly check return value in inet6_dump_all() Make sure we call fib6_dump_end() if it happens that skb->len is zero. rtnl_dump_all() can reset cb->args on the next loop iteration there. Fixes: 08e814c9e8eb ("net/ipv6: Bail early if user only wants cloned entries") Fixes: ae677bbb4441 ("net: Don't return invalid table id error when dumping all families") Signed-off-by: Alexey Kodanev Signed-off-by: David S. Miller --- net/ipv6/ip6_fib.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 1b8bc008b53b..ae3786132c23 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -591,7 +591,7 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) /* fib entries are never clones */ if (arg.filter.flags & RTM_F_CLONED) - return skb->len; + goto out; w = (void *)cb->args[2]; if (!w) { @@ -621,7 +621,7 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) tb = fib6_get_table(net, arg.filter.table_id); if (!tb) { if (arg.filter.dump_all_families) - return skb->len; + goto out; NL_SET_ERR_MSG_MOD(cb->extack, "FIB table does not exist"); return -ENOENT; -- cgit From d016b4a3562b745fd9fa387a47d8de62ccd7e241 Mon Sep 17 00:00:00 2001 From: "Matwey V. Kornilov" Date: Fri, 2 Nov 2018 21:19:36 +0300 Subject: net: core: netpoll: Enable netconsole IPv6 link local address There is no reason to discard using source link local address when remote netconsole IPv6 address is set to be link local one. The patch allows administrators to use IPv6 netconsole without explicitly configuring source address: netconsole=@/,@fe80::5054:ff:fe2f:6012/ Signed-off-by: Matwey V. Kornilov Signed-off-by: David S. Miller --- net/core/netpoll.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 5da9552b186b..2b9fdbc43205 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -717,7 +717,8 @@ int netpoll_setup(struct netpoll *np) read_lock_bh(&idev->lock); list_for_each_entry(ifp, &idev->addr_list, if_list) { - if (ipv6_addr_type(&ifp->addr) & IPV6_ADDR_LINKLOCAL) + if (!!(ipv6_addr_type(&ifp->addr) & IPV6_ADDR_LINKLOCAL) != + !!(ipv6_addr_type(&np->remote_ip.in6) & IPV6_ADDR_LINKLOCAL)) continue; np->local_ip.in6 = ifp->addr; err = 0; -- cgit From c34c1287778b080ed692c0a46a8e345206cc29e6 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Sun, 4 Nov 2018 22:37:15 -0800 Subject: sock_diag: fix autoloading of the raw_diag module IPPROTO_RAW isn't registred as an inet protocol, so inet_protos[protocol] is always NULL for it. Cc: Cyrill Gorcunov Cc: Xin Long Fixes: bf2ae2e4bf93 ("sock_diag: request _diag module only when the family or proto has been registered") Signed-off-by: Andrei Vagin Reviewed-by: Cyrill Gorcunov Signed-off-by: David S. Miller --- net/core/sock.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/core/sock.c b/net/core/sock.c index 6fcc4bc07d19..080a880a1761 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3279,6 +3279,7 @@ int sock_load_diag_module(int family, int protocol) #ifdef CONFIG_INET if (family == AF_INET && + protocol != IPPROTO_RAW && !rcu_access_pointer(inet_protos[protocol])) return -ENOENT; #endif -- cgit From 97adaddaa6db7a8af81b9b11e30cbe3628cd6700 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Mon, 5 Nov 2018 22:31:41 +0900 Subject: net: bpfilter: fix iptables failure if bpfilter_umh is disabled When iptables command is executed, ip_{set/get}sockopt() try to upload bpfilter.ko if bpfilter is enabled. if it couldn't find bpfilter.ko, command is failed. bpfilter.ko is generated if CONFIG_BPFILTER_UMH is enabled. ip_{set/get}sockopt() only checks CONFIG_BPFILTER. So that if CONFIG_BPFILTER is enabled and CONFIG_BPFILTER_UMH is disabled, iptables command is always failed. test config: CONFIG_BPFILTER=y # CONFIG_BPFILTER_UMH is not set test command: %iptables -L iptables: No chain/target/match by that name. Fixes: d2ba09c17a06 ("net: add skeleton of bpfilter kernel module") Signed-off-by: Taehee Yoo Signed-off-by: David S. Miller --- net/ipv4/ip_sockglue.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 26c36cccabdc..fffcc130900e 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -1246,7 +1246,7 @@ int ip_setsockopt(struct sock *sk, int level, return -ENOPROTOOPT; err = do_ip_setsockopt(sk, level, optname, optval, optlen); -#ifdef CONFIG_BPFILTER +#if IS_ENABLED(CONFIG_BPFILTER_UMH) if (optname >= BPFILTER_IPT_SO_SET_REPLACE && optname < BPFILTER_IPT_SET_MAX) err = bpfilter_ip_set_sockopt(sk, optname, optval, optlen); @@ -1559,7 +1559,7 @@ int ip_getsockopt(struct sock *sk, int level, int err; err = do_ip_getsockopt(sk, level, optname, optval, optlen, 0); -#ifdef CONFIG_BPFILTER +#if IS_ENABLED(CONFIG_BPFILTER_UMH) if (optname >= BPFILTER_IPT_SO_GET_INFO && optname < BPFILTER_IPT_GET_MAX) err = bpfilter_ip_get_sockopt(sk, optname, optval, optlen); @@ -1596,7 +1596,7 @@ int compat_ip_getsockopt(struct sock *sk, int level, int optname, err = do_ip_getsockopt(sk, level, optname, optval, optlen, MSG_CMSG_COMPAT); -#ifdef CONFIG_BPFILTER +#if IS_ENABLED(CONFIG_BPFILTER_UMH) if (optname >= BPFILTER_IPT_SO_GET_INFO && optname < BPFILTER_IPT_GET_MAX) err = bpfilter_ip_get_sockopt(sk, optname, optval, optlen); -- cgit From 025911a5f4e36955498ed50806ad1b02f0f76288 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Thu, 8 Nov 2018 02:04:57 +0000 Subject: SUNRPC: drop pointless static qualifier in xdr_get_next_encode_buffer() There is no need to have the '__be32 *p' variable static since new value always be assigned before use it. Signed-off-by: YueHaibing Cc: stable@vger.kernel.org Signed-off-by: J. Bruce Fields --- net/sunrpc/xdr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 5cfb9e0a18dc..f302c6eb8779 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -546,7 +546,7 @@ EXPORT_SYMBOL_GPL(xdr_commit_encode); static __be32 *xdr_get_next_encode_buffer(struct xdr_stream *xdr, size_t nbytes) { - static __be32 *p; + __be32 *p; int space_left; int frag1bytes, frag2bytes; -- cgit From 0d5b9311baf27bb545f187f12ecfd558220c607d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 8 Nov 2018 17:34:27 -0800 Subject: inet: frags: better deal with smp races MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Multiple cpus might attempt to insert a new fragment in rhashtable, if for example RPS is buggy, as reported by 배석진 in https://patchwork.ozlabs.org/patch/994601/ We use rhashtable_lookup_get_insert_key() instead of rhashtable_insert_fast() to let cpus losing the race free their own inet_frag_queue and use the one that was inserted by another cpu. Fixes: 648700f76b03 ("inet: frags: use rhashtables for reassembly units") Signed-off-by: Eric Dumazet Reported-by: 배석진 Signed-off-by: David S. Miller --- net/ipv4/inet_fragment.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index bcb11f3a27c0..760a9e52e02b 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -178,21 +178,22 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf, } static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf, - void *arg) + void *arg, + struct inet_frag_queue **prev) { struct inet_frags *f = nf->f; struct inet_frag_queue *q; - int err; q = inet_frag_alloc(nf, f, arg); - if (!q) + if (!q) { + *prev = ERR_PTR(-ENOMEM); return NULL; - + } mod_timer(&q->timer, jiffies + nf->timeout); - err = rhashtable_insert_fast(&nf->rhashtable, &q->node, - f->rhash_params); - if (err < 0) { + *prev = rhashtable_lookup_get_insert_key(&nf->rhashtable, &q->key, + &q->node, f->rhash_params); + if (*prev) { q->flags |= INET_FRAG_COMPLETE; inet_frag_kill(q); inet_frag_destroy(q); @@ -204,22 +205,22 @@ static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf, /* TODO : call from rcu_read_lock() and no longer use refcount_inc_not_zero() */ struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key) { - struct inet_frag_queue *fq; + struct inet_frag_queue *fq = NULL, *prev; if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh) return NULL; rcu_read_lock(); - fq = rhashtable_lookup(&nf->rhashtable, key, nf->f->rhash_params); - if (fq) { + prev = rhashtable_lookup(&nf->rhashtable, key, nf->f->rhash_params); + if (!prev) + fq = inet_frag_create(nf, key, &prev); + if (prev && !IS_ERR(prev)) { + fq = prev; if (!refcount_inc_not_zero(&fq->refcnt)) fq = NULL; - rcu_read_unlock(); - return fq; } rcu_read_unlock(); - - return inet_frag_create(nf, key); + return fq; } EXPORT_SYMBOL(inet_frag_find); -- cgit From a43608fa77213ad5ac5f75994254b9f65d57cfa0 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Wed, 24 Oct 2018 10:27:12 +0200 Subject: can: raw: check for CAN FD capable netdev in raw_sendmsg() When the socket is CAN FD enabled it can handle CAN FD frame transmissions. Add an additional check in raw_sendmsg() as a CAN2.0 CAN driver (non CAN FD) should never see a CAN FD frame. Due to the commonly used can_dropped_invalid_skb() function the CAN 2.0 driver would drop that CAN FD frame anyway - but with this patch the user gets a proper -EINVAL return code. Signed-off-by: Oliver Hartkopp Cc: linux-stable Signed-off-by: Marc Kleine-Budde --- net/can/raw.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/can/raw.c b/net/can/raw.c index 1051eee82581..3aab7664933f 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -745,18 +745,19 @@ static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) } else ifindex = ro->ifindex; - if (ro->fd_frames) { + dev = dev_get_by_index(sock_net(sk), ifindex); + if (!dev) + return -ENXIO; + + err = -EINVAL; + if (ro->fd_frames && dev->mtu == CANFD_MTU) { if (unlikely(size != CANFD_MTU && size != CAN_MTU)) - return -EINVAL; + goto put_dev; } else { if (unlikely(size != CAN_MTU)) - return -EINVAL; + goto put_dev; } - dev = dev_get_by_index(sock_net(sk), ifindex); - if (!dev) - return -ENXIO; - skb = sock_alloc_send_skb(sk, size + sizeof(struct can_skb_priv), msg->msg_flags & MSG_DONTWAIT, &err); if (!skb) -- cgit From 62230715fd2453b3ba948c9d83cfb3ada9169169 Mon Sep 17 00:00:00 2001 From: 배석진 Date: Fri, 9 Nov 2018 16:53:06 -0800 Subject: flow_dissector: do not dissect l4 ports for fragments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Only first fragment has the sport/dport information, not the following ones. If we want consistent hash for all fragments, we need to ignore ports even for first fragment. This bug is visible for IPv6 traffic, if incoming fragments do not have a flow label, since skb_get_hash() will give different results for first fragment and following ones. It is also visible if any routing rule wants dissection and sport or dport. See commit 5e5d6fed3741 ("ipv6: route: dissect flow in input path if fib rules need it") for details. [edumazet] rewrote the changelog completely. Fixes: 06635a35d13d ("flow_dissect: use programable dissector in skb_flow_dissect and friends") Signed-off-by: 배석진 Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/flow_dissector.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 676f3ad629f9..588f475019d4 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -1166,8 +1166,8 @@ ip_proto_again: break; } - if (dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_PORTS)) { + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS) && + !(key_control->flags & FLOW_DIS_IS_FRAGMENT)) { key_ports = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_PORTS, target_container); -- cgit From 63c82997f5c0f3e1b914af43d82f712a86bc5f3a Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 9 Nov 2018 21:06:26 -0800 Subject: net: sched: cls_flower: validate nested enc_opts_policy to avoid warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit TCA_FLOWER_KEY_ENC_OPTS and TCA_FLOWER_KEY_ENC_OPTS_MASK can only currently contain further nested attributes, which are parsed by hand, so the policy is never actually used resulting in a W=1 build warning: net/sched/cls_flower.c:492:1: warning: ‘enc_opts_policy’ defined but not used [-Wunused-const-variable=] enc_opts_policy[TCA_FLOWER_KEY_ENC_OPTS_MAX + 1] = { Add the validation anyway to avoid potential bugs when other attributes are added and to make the attribute structure slightly more clear. Validation will also set extact to point to bad attribute on error. Fixes: 0a6e77784f49 ("net/sched: allow flower to match tunnel options") Signed-off-by: Jakub Kicinski Acked-by: Simon Horman Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/cls_flower.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 9aada2d0ef06..c6c327874abc 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -709,11 +709,23 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key, struct netlink_ext_ack *extack) { const struct nlattr *nla_enc_key, *nla_opt_key, *nla_opt_msk = NULL; - int option_len, key_depth, msk_depth = 0; + int err, option_len, key_depth, msk_depth = 0; + + err = nla_validate_nested(tb[TCA_FLOWER_KEY_ENC_OPTS], + TCA_FLOWER_KEY_ENC_OPTS_MAX, + enc_opts_policy, extack); + if (err) + return err; nla_enc_key = nla_data(tb[TCA_FLOWER_KEY_ENC_OPTS]); if (tb[TCA_FLOWER_KEY_ENC_OPTS_MASK]) { + err = nla_validate_nested(tb[TCA_FLOWER_KEY_ENC_OPTS_MASK], + TCA_FLOWER_KEY_ENC_OPTS_MAX, + enc_opts_policy, extack); + if (err) + return err; + nla_opt_msk = nla_data(tb[TCA_FLOWER_KEY_ENC_OPTS_MASK]); msk_depth = nla_len(tb[TCA_FLOWER_KEY_ENC_OPTS_MASK]); } -- cgit From 7ab412d33b4c7ff3e0148d3db25dd861edd1283d Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Sat, 10 Nov 2018 17:30:24 -0500 Subject: tipc: fix link re-establish failure When a link failure is detected locally, the link is reset, the flag link->in_session is set to false, and a RESET_MSG with the 'stopping' bit set is sent to the peer. The purpose of this bit is to inform the peer that this endpoint just is going down, and that the peer should handle the reception of this particular RESET message as a local failure. This forces the peer to accept another RESET or ACTIVATE message from this endpoint before it can re-establish the link. This again is necessary to ensure that link session numbers are properly exchanged before the link comes up again. If a failure is detected locally at the same time at the peer endpoint this will do the same, which is also a correct behavior. However, when receiving such messages, the endpoints will not distinguish between 'stopping' RESETs and ordinary ones when it comes to updating session numbers. Both endpoints will copy the received session number and set their 'in_session' flags to true at the reception, while they are still expecting another RESET from the peer before they can go ahead and re-establish. This is contradictory, since, after applying the validation check referred to below, the 'in_session' flag will cause rejection of all such messages, and the link will never come up again. We now fix this by not only handling received RESET/STOPPING messages as a local failure, but also by omitting to set a new session number and the 'in_session' flag in such cases. Fixes: 7ea817f4e832 ("tipc: check session number before accepting link protocol messages") Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/link.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/tipc/link.c b/net/tipc/link.c index 201c3b5bc96b..836727e363c4 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -1594,14 +1594,17 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, if (in_range(peers_prio, l->priority + 1, TIPC_MAX_LINK_PRI)) l->priority = peers_prio; - /* ACTIVATE_MSG serves as PEER_RESET if link is already down */ - if (msg_peer_stopping(hdr)) + /* If peer is going down we want full re-establish cycle */ + if (msg_peer_stopping(hdr)) { rc = tipc_link_fsm_evt(l, LINK_FAILURE_EVT); - else if ((mtyp == RESET_MSG) || !link_is_up(l)) + break; + } + /* ACTIVATE_MSG serves as PEER_RESET if link is already down */ + if (mtyp == RESET_MSG || !link_is_up(l)) rc = tipc_link_fsm_evt(l, LINK_PEER_RESET_EVT); /* ACTIVATE_MSG takes up link if it was already locally reset */ - if ((mtyp == ACTIVATE_MSG) && (l->state == LINK_ESTABLISHING)) + if (mtyp == ACTIVATE_MSG && l->state == LINK_ESTABLISHING) rc = TIPC_LINK_UP_EVT; l->peer_session = msg_session(hdr); -- cgit From 7236ead1b14923f3ba35cd29cce13246be83f451 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 10 Nov 2018 16:22:29 -0800 Subject: act_mirred: clear skb->tstamp on redirect If sch_fq is used at ingress, skbs that might have been timestamped by net_timestamp_set() if a packet capture is requesting timestamps could be delayed by arbitrary amount of time, since sch_fq time base is MONOTONIC. Fix this problem by moving code from sch_netem.c to act_mirred.c. Fixes: fb420d5d91c1 ("tcp/fq: move back to CLOCK_MONOTONIC") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/sched/act_mirred.c | 3 ++- net/sched/sch_netem.c | 9 --------- 2 files changed, 2 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 1dae5f2b358f..c8cf4d10c435 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -258,7 +258,8 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a, if (is_redirect) { skb2->tc_redirected = 1; skb2->tc_from_ingress = skb2->tc_at_ingress; - + if (skb2->tc_from_ingress) + skb2->tstamp = 0; /* let's the caller reinsert the packet, if possible */ if (use_reinsert) { res->ingress = want_ingress; diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 57b3ad9394ad..2c38e3d07924 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -648,15 +648,6 @@ deliver: */ skb->dev = qdisc_dev(sch); -#ifdef CONFIG_NET_CLS_ACT - /* - * If it's at ingress let's pretend the delay is - * from the network (tstamp will be updated). - */ - if (skb->tc_redirected && skb->tc_from_ingress) - skb->tstamp = 0; -#endif - if (q->slot.slot_next) { q->slot.packets_left--; q->slot.bytes_left -= qdisc_pkt_len(skb); -- cgit From f4156f9656feac21f4de712fac94fae964c5d402 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Tue, 30 Oct 2018 12:17:10 +0100 Subject: batman-adv: Use explicit tvlv padding for ELP packets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The announcement messages of batman-adv COMPAT_VERSION 15 have the possibility to announce additional information via a dynamic TVLV part. This part is optional for the ELP packets and currently not parsed by the Linux implementation. Still out-of-tree versions are using it to transport things like neighbor hashes to optimize the rebroadcast behavior. Since the ELP broadcast packets are smaller than the minimal ethernet packet, it often has to be padded. This is often done (as specified in RFC894) with octets of zero and thus work perfectly fine with the TVLV part (making it a zero length and thus empty). But not all ethernet compatible hardware seems to follow this advice. To avoid ambiguous situations when parsing the TVLV header, just force the 4 bytes (TVLV length + padding) after the required ELP header to zero. Fixes: d6f94d91f766 ("batman-adv: ELP - adding basic infrastructure") Reported-by: Linus Lüssing Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/bat_v_elp.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c index 9f481cfdf77d..e8090f099eb8 100644 --- a/net/batman-adv/bat_v_elp.c +++ b/net/batman-adv/bat_v_elp.c @@ -352,19 +352,21 @@ out: */ int batadv_v_elp_iface_enable(struct batadv_hard_iface *hard_iface) { + static const size_t tvlv_padding = sizeof(__be32); struct batadv_elp_packet *elp_packet; unsigned char *elp_buff; u32 random_seqno; size_t size; int res = -ENOMEM; - size = ETH_HLEN + NET_IP_ALIGN + BATADV_ELP_HLEN; + size = ETH_HLEN + NET_IP_ALIGN + BATADV_ELP_HLEN + tvlv_padding; hard_iface->bat_v.elp_skb = dev_alloc_skb(size); if (!hard_iface->bat_v.elp_skb) goto out; skb_reserve(hard_iface->bat_v.elp_skb, ETH_HLEN + NET_IP_ALIGN); - elp_buff = skb_put_zero(hard_iface->bat_v.elp_skb, BATADV_ELP_HLEN); + elp_buff = skb_put_zero(hard_iface->bat_v.elp_skb, + BATADV_ELP_HLEN + tvlv_padding); elp_packet = (struct batadv_elp_packet *)elp_buff; elp_packet->packet_type = BATADV_ELP; -- cgit From d7d8bbb40a5b1f682ee6589e212934f4c6b8ad60 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Wed, 7 Nov 2018 23:09:12 +0100 Subject: batman-adv: Expand merged fragment buffer for full packet MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The complete size ("total_size") of the fragmented packet is stored in the fragment header and in the size of the fragment chain. When the fragments are ready for merge, the skbuff's tail of the first fragment is expanded to have enough room after the data pointer for at least total_size. This means that it gets expanded by total_size - first_skb->len. But this is ignoring the fact that after expanding the buffer, the fragment header is pulled by from this buffer. Assuming that the tailroom of the buffer was already 0, the buffer after the data pointer of the skbuff is now only total_size - len(fragment_header) large. When the merge function is then processing the remaining fragments, the code to copy the data over to the merged skbuff will cause an skb_over_panic when it tries to actually put enough data to fill the total_size bytes of the packet. The size of the skb_pull must therefore also be taken into account when the buffer's tailroom is expanded. Fixes: 610bfc6bc99b ("batman-adv: Receive fragmented packets and merge") Reported-by: Martin Weinelt Co-authored-by: Linus Lüssing Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/fragmentation.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c index 0fddc17106bd..5b71a289d04f 100644 --- a/net/batman-adv/fragmentation.c +++ b/net/batman-adv/fragmentation.c @@ -275,7 +275,7 @@ batadv_frag_merge_packets(struct hlist_head *chain) kfree(entry); packet = (struct batadv_frag_packet *)skb_out->data; - size = ntohs(packet->total_size); + size = ntohs(packet->total_size) + hdr_size; /* Make room for the rest of the fragments. */ if (pskb_expand_head(skb_out, 0, size - skb_out->len, GFP_ATOMIC) < 0) { -- cgit From a652a4bc21695a57c3b8d13d222a6f8b41f100aa Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 12 Nov 2018 15:30:52 -0500 Subject: SUNRPC: Fix a Oops when destroying the RPCSEC_GSS credential cache Commit 07d02a67b7fa causes a use-after free in the RPCSEC_GSS credential destroy code, because the call to get_rpccred() in gss_destroying_context() will now always fail to increment the refcount. While we could just replace the get_rpccred() with a refcount_set(), that would have the unfortunate consequence of resurrecting a credential in the credential cache for which we are in the process of destroying the RPCSEC_GSS context. Rather than do this, we choose to make a copy that is never added to the cache and use that to destroy the context. Fixes: 07d02a67b7fa ("SUNRPC: Simplify lookup code") Signed-off-by: Trond Myklebust --- net/sunrpc/auth_gss/auth_gss.c | 61 +++++++++++++++++++++++++++++------------- 1 file changed, 42 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 30f970cdc7f6..5d3f252659f1 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -1239,36 +1239,59 @@ gss_create(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt) return &gss_auth->rpc_auth; } +static struct gss_cred * +gss_dup_cred(struct gss_auth *gss_auth, struct gss_cred *gss_cred) +{ + struct gss_cred *new; + + /* Make a copy of the cred so that we can reference count it */ + new = kzalloc(sizeof(*gss_cred), GFP_NOIO); + if (new) { + struct auth_cred acred = { + .uid = gss_cred->gc_base.cr_uid, + }; + struct gss_cl_ctx *ctx = + rcu_dereference_protected(gss_cred->gc_ctx, 1); + + rpcauth_init_cred(&new->gc_base, &acred, + &gss_auth->rpc_auth, + &gss_nullops); + new->gc_base.cr_flags = 1UL << RPCAUTH_CRED_UPTODATE; + new->gc_service = gss_cred->gc_service; + new->gc_principal = gss_cred->gc_principal; + kref_get(&gss_auth->kref); + rcu_assign_pointer(new->gc_ctx, ctx); + gss_get_ctx(ctx); + } + return new; +} + /* - * gss_destroying_context will cause the RPCSEC_GSS to send a NULL RPC call + * gss_send_destroy_context will cause the RPCSEC_GSS to send a NULL RPC call * to the server with the GSS control procedure field set to * RPC_GSS_PROC_DESTROY. This should normally cause the server to release * all RPCSEC_GSS state associated with that context. */ -static int -gss_destroying_context(struct rpc_cred *cred) +static void +gss_send_destroy_context(struct rpc_cred *cred) { struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base); struct gss_auth *gss_auth = container_of(cred->cr_auth, struct gss_auth, rpc_auth); struct gss_cl_ctx *ctx = rcu_dereference_protected(gss_cred->gc_ctx, 1); + struct gss_cred *new; struct rpc_task *task; - if (test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) == 0) - return 0; - - ctx->gc_proc = RPC_GSS_PROC_DESTROY; - cred->cr_ops = &gss_nullops; - - /* Take a reference to ensure the cred will be destroyed either - * by the RPC call or by the put_rpccred() below */ - get_rpccred(cred); + new = gss_dup_cred(gss_auth, gss_cred); + if (new) { + ctx->gc_proc = RPC_GSS_PROC_DESTROY; - task = rpc_call_null(gss_auth->client, cred, RPC_TASK_ASYNC|RPC_TASK_SOFT); - if (!IS_ERR(task)) - rpc_put_task(task); + task = rpc_call_null(gss_auth->client, &new->gc_base, + RPC_TASK_ASYNC|RPC_TASK_SOFT); + if (!IS_ERR(task)) + rpc_put_task(task); - put_rpccred(cred); - return 1; + put_rpccred(&new->gc_base); + } } /* gss_destroy_cred (and gss_free_ctx) are used to clean up after failure @@ -1330,8 +1353,8 @@ static void gss_destroy_cred(struct rpc_cred *cred) { - if (gss_destroying_context(cred)) - return; + if (test_and_clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) != 0) + gss_send_destroy_context(cred); gss_destroy_nullcred(cred); } -- cgit From e3d5e573a54dabdc0f9f3cb039d799323372b251 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 12 Nov 2018 16:06:51 -0500 Subject: SUNRPC: Fix a bogus get/put in generic_key_to_expire() Signed-off-by: Trond Myklebust --- net/sunrpc/auth_generic.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'net') diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c index d8831b988b1e..ab4a3be1542a 100644 --- a/net/sunrpc/auth_generic.c +++ b/net/sunrpc/auth_generic.c @@ -281,13 +281,7 @@ static bool generic_key_to_expire(struct rpc_cred *cred) { struct auth_cred *acred = &container_of(cred, struct generic_cred, gc_base)->acred; - bool ret; - - get_rpccred(cred); - ret = test_bit(RPC_CRED_KEY_EXPIRE_SOON, &acred->ac_flags); - put_rpccred(cred); - - return ret; + return test_bit(RPC_CRED_KEY_EXPIRE_SOON, &acred->ac_flags); } static const struct rpc_credops generic_credops = { -- cgit From f8504f4ca0a0e9f84546ef86e00b24d2ea9a0bd2 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 13 Nov 2018 01:08:25 +0800 Subject: l2tp: fix a sock refcnt leak in l2tp_tunnel_register This issue happens when trying to add an existent tunnel. It doesn't call sock_put() before returning -EEXIST to release the sock refcnt that was held by calling sock_hold() before the existence check. This patch is to fix it by holding the sock after doing the existence check. Fixes: f6cd651b056f ("l2tp: fix race in duplicate tunnel detection") Reported-by: Jianlin Shi Signed-off-by: Xin Long Reviewed-by: Guillaume Nault Signed-off-by: David S. Miller --- net/l2tp/l2tp_core.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 82cdf9020b53..26f1d435696a 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1490,12 +1490,7 @@ int l2tp_tunnel_register(struct l2tp_tunnel *tunnel, struct net *net, goto err_sock; } - sk = sock->sk; - - sock_hold(sk); - tunnel->sock = sk; tunnel->l2tp_net = net; - pn = l2tp_pernet(net); spin_lock_bh(&pn->l2tp_tunnel_list_lock); @@ -1510,6 +1505,10 @@ int l2tp_tunnel_register(struct l2tp_tunnel *tunnel, struct net *net, list_add_rcu(&tunnel->list, &pn->l2tp_tunnel_list); spin_unlock_bh(&pn->l2tp_tunnel_list_lock); + sk = sock->sk; + sock_hold(sk); + tunnel->sock = sk; + if (tunnel->encap == L2TP_ENCAPTYPE_UDP) { struct udp_tunnel_sock_cfg udp_cfg = { .sk_user_data = tunnel, -- cgit From 7150ceaacb27f7b3bf494e72cd4be4e11612dfff Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 12 Nov 2018 22:33:22 +0000 Subject: rxrpc: Fix life check The life-checking function, which is used by kAFS to make sure that a call is still live in the event of a pending signal, only samples the received packet serial number counter; it doesn't actually provoke a change in the counter, rather relying on the server to happen to give us a packet in the time window. Fix this by adding a function to force a ping to be transmitted. kAFS then keeps track of whether there's been a stall, and if so, uses the new function to ping the server, resetting the timeout to allow the reply to come back. If there's a stall, a ping and the call is *still* stalled in the same place after another period, then the call will be aborted. Fixes: bc5e3a546d55 ("rxrpc: Use MSG_WAITALL to tell sendmsg() to temporarily ignore signals") Fixes: f4d15fb6f99a ("rxrpc: Provide functions for allowing cleaner handling of signals") Signed-off-by: David Howells Signed-off-by: David S. Miller --- net/rxrpc/af_rxrpc.c | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 64362d078da8..a2522f9d71e2 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -375,16 +375,35 @@ EXPORT_SYMBOL(rxrpc_kernel_end_call); * getting ACKs from the server. Returns a number representing the life state * which can be compared to that returned by a previous call. * - * If this is a client call, ping ACKs will be sent to the server to find out - * whether it's still responsive and whether the call is still alive on the - * server. + * If the life state stalls, rxrpc_kernel_probe_life() should be called and + * then 2RTT waited. */ -u32 rxrpc_kernel_check_life(struct socket *sock, struct rxrpc_call *call) +u32 rxrpc_kernel_check_life(const struct socket *sock, + const struct rxrpc_call *call) { return call->acks_latest; } EXPORT_SYMBOL(rxrpc_kernel_check_life); +/** + * rxrpc_kernel_probe_life - Poke the peer to see if it's still alive + * @sock: The socket the call is on + * @call: The call to check + * + * In conjunction with rxrpc_kernel_check_life(), allow a kernel service to + * find out whether a call is still alive by pinging it. This should cause the + * life state to be bumped in about 2*RTT. + * + * The must be called in TASK_RUNNING state on pain of might_sleep() objecting. + */ +void rxrpc_kernel_probe_life(struct socket *sock, struct rxrpc_call *call) +{ + rxrpc_propose_ACK(call, RXRPC_ACK_PING, 0, 0, true, false, + rxrpc_propose_ack_ping_for_check_life); + rxrpc_send_ack_packet(call, true, NULL); +} +EXPORT_SYMBOL(rxrpc_kernel_probe_life); + /** * rxrpc_kernel_get_epoch - Retrieve the epoch value from a call. * @sock: The socket the call is on -- cgit From 08e14fe429a07475ee9f29a283945d602e4a6d92 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 12 Nov 2018 16:17:16 -0800 Subject: net_sched: sch_fq: ensure maxrate fq parameter applies to EDT flows When EDT conversion happened, fq lost the ability to enfore a maxrate for all flows. It kept it for non EDT flows. This commit restores the functionality. Tested: tc qd replace dev eth0 root fq maxrate 500Mbit netperf -P0 -H host -- -O THROUGHPUT 489.75 Fixes: ab408b6dc744 ("tcp: switch tcp and sch_fq to new earliest departure time model") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/sched/sch_fq.c | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index 4b1af706896c..25a7cf6d380f 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -469,22 +469,29 @@ begin: goto begin; } prefetch(&skb->end); - f->credit -= qdisc_pkt_len(skb); + plen = qdisc_pkt_len(skb); + f->credit -= plen; - if (ktime_to_ns(skb->tstamp) || !q->rate_enable) + if (!q->rate_enable) goto out; rate = q->flow_max_rate; - if (skb->sk) - rate = min(skb->sk->sk_pacing_rate, rate); - - if (rate <= q->low_rate_threshold) { - f->credit = 0; - plen = qdisc_pkt_len(skb); - } else { - plen = max(qdisc_pkt_len(skb), q->quantum); - if (f->credit > 0) - goto out; + + /* If EDT time was provided for this skb, we need to + * update f->time_next_packet only if this qdisc enforces + * a flow max rate. + */ + if (!skb->tstamp) { + if (skb->sk) + rate = min(skb->sk->sk_pacing_rate, rate); + + if (rate <= q->low_rate_threshold) { + f->credit = 0; + } else { + plen = max(plen, q->quantum); + if (f->credit > 0) + goto out; + } } if (rate != ~0UL) { u64 len = (u64)plen * NSEC_PER_SEC; -- cgit From 761f60261b4401aa368d71d431b4c218af0efcee Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 14 Nov 2018 00:48:28 +0800 Subject: ipv6: fix a dst leak when removing its exception These is no need to hold dst before calling rt6_remove_exception_rt(). The call to dst_hold_safe() in ip6_link_failure() was for ip6_del_rt(), which has been removed in Commit 93531c674315 ("net/ipv6: separate handling of FIB entries from dst based routes"). Otherwise, it will cause a dst leak. This patch is to simply remove the dst_hold_safe() call before calling rt6_remove_exception_rt() and also do the same in ip6_del_cached_rt(). It's safe, because the removal of the exception that holds its dst's refcnt is protected by rt6_exception_lock. Fixes: 93531c674315 ("net/ipv6: separate handling of FIB entries from dst based routes") Fixes: 23fb93a4d3f1 ("net/ipv6: Cleanup exception and cache route handling") Reported-by: Li Shuang Signed-off-by: Xin Long Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/route.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 2a7423c39456..14b422f35504 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2232,8 +2232,7 @@ static void ip6_link_failure(struct sk_buff *skb) if (rt) { rcu_read_lock(); if (rt->rt6i_flags & RTF_CACHE) { - if (dst_hold_safe(&rt->dst)) - rt6_remove_exception_rt(rt); + rt6_remove_exception_rt(rt); } else { struct fib6_info *from; struct fib6_node *fn; @@ -3214,8 +3213,8 @@ static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg) if (cfg->fc_flags & RTF_GATEWAY && !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway)) goto out; - if (dst_hold_safe(&rt->dst)) - rc = rt6_remove_exception_rt(rt); + + rc = rt6_remove_exception_rt(rt); out: return rc; } -- cgit From 19ab69107d3ecfb7cd3e38ad262a881be40c01a3 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Wed, 14 Nov 2018 12:17:25 +0100 Subject: net/sched: act_pedit: fix memory leak when IDR allocation fails tcf_idr_check_alloc() can return a negative value, on allocation failures (-ENOMEM) or IDR exhaustion (-ENOSPC): don't leak keys_ex in these cases. Fixes: 0190c1d452a9 ("net: sched: atomically check-allocate action") Signed-off-by: Davide Caratti Acked-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/act_pedit.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index da3dd0f68cc2..2b372a06b432 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -201,7 +201,8 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, goto out_release; } } else { - return err; + ret = err; + goto out_free; } p = to_pedit(*a); -- cgit From 95506588d2c1d72ca29adef8ae9bf771bcfb4ced Mon Sep 17 00:00:00 2001 From: Slavomir Kaslev Date: Fri, 16 Nov 2018 11:27:53 +0200 Subject: socket: do a generic_file_splice_read when proto_ops has no splice_read splice(2) fails with -EINVAL when called reading on a socket with no splice_read set in its proto_ops (such as vsock sockets). Switch this to fallbacks to a generic_file_splice_read instead. Signed-off-by: Slavomir Kaslev Signed-off-by: David S. Miller --- net/socket.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/socket.c b/net/socket.c index 593826e11a53..334fcc617ef2 100644 --- a/net/socket.c +++ b/net/socket.c @@ -853,7 +853,7 @@ static ssize_t sock_splice_read(struct file *file, loff_t *ppos, struct socket *sock = file->private_data; if (unlikely(!sock->ops->splice_read)) - return -EINVAL; + return generic_file_splice_read(file, ppos, pipe, len, flags); return sock->ops->splice_read(sock, ppos, pipe, len, flags); } -- cgit From 9d332e69c1dc74dcd748de7cbd2dac5c61bda265 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Fri, 16 Nov 2018 18:50:01 +0200 Subject: net: bridge: fix vlan stats use-after-free on destruction Syzbot reported a use-after-free of the global vlan context on port vlan destruction. When I added per-port vlan stats I missed the fact that the global vlan context can be freed before the per-port vlan rcu callback. There're a few different ways to deal with this, I've chosen to add a new private flag that is set only when per-port stats are allocated so we can directly check it on destruction without dereferencing the global context at all. The new field in net_bridge_vlan uses a hole. v2: cosmetic change, move the check to br_process_vlan_info where the other checks are done v3: add change log in the patch, add private (in-kernel only) flags in a hole in net_bridge_vlan struct and use that instead of mixing user-space flags with private flags Fixes: 9163a0fc1f0c ("net: bridge: add support for per-port vlan stats") Reported-by: syzbot+04681da557a0e49a52e5@syzkaller.appspotmail.com Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_private.h | 7 +++++++ net/bridge/br_vlan.c | 3 ++- 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 2920e06a5403..04c19a37e500 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -102,12 +102,18 @@ struct br_tunnel_info { struct metadata_dst *tunnel_dst; }; +/* private vlan flags */ +enum { + BR_VLFLAG_PER_PORT_STATS = BIT(0), +}; + /** * struct net_bridge_vlan - per-vlan entry * * @vnode: rhashtable member * @vid: VLAN id * @flags: bridge vlan flags + * @priv_flags: private (in-kernel) bridge vlan flags * @stats: per-cpu VLAN statistics * @br: if MASTER flag set, this points to a bridge struct * @port: if MASTER flag unset, this points to a port struct @@ -127,6 +133,7 @@ struct net_bridge_vlan { struct rhash_head tnode; u16 vid; u16 flags; + u16 priv_flags; struct br_vlan_stats __percpu *stats; union { struct net_bridge *br; diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 8c9297a01947..e84be08b8285 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -197,7 +197,7 @@ static void nbp_vlan_rcu_free(struct rcu_head *rcu) v = container_of(rcu, struct net_bridge_vlan, rcu); WARN_ON(br_vlan_is_master(v)); /* if we had per-port stats configured then free them here */ - if (v->brvlan->stats != v->stats) + if (v->priv_flags & BR_VLFLAG_PER_PORT_STATS) free_percpu(v->stats); v->stats = NULL; kfree(v); @@ -264,6 +264,7 @@ static int __vlan_add(struct net_bridge_vlan *v, u16 flags) err = -ENOMEM; goto out_filt; } + v->priv_flags |= BR_VLFLAG_PER_PORT_STATS; } else { v->stats = masterv->stats; } -- cgit From 16f7eb2b77b55da816c4e207f3f9440a8cafc00a Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Fri, 16 Nov 2018 16:58:19 +0100 Subject: ip_tunnel: don't force DF when MTU is locked The various types of tunnels running over IPv4 can ask to set the DF bit to do PMTU discovery. However, PMTU discovery is subject to the threshold set by the net.ipv4.route.min_pmtu sysctl, and is also disabled on routes with "mtu lock". In those cases, we shouldn't set the DF bit. This patch makes setting the DF bit conditional on the route's MTU locking state. This issue seems to be older than git history. Signed-off-by: Sabrina Dubroca Reviewed-by: Stefano Brivio Signed-off-by: David S. Miller --- net/ipv4/ip_tunnel_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index dde671e97829..c248e0dccbe1 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -80,7 +80,7 @@ void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, iph->version = 4; iph->ihl = sizeof(struct iphdr) >> 2; - iph->frag_off = df; + iph->frag_off = ip_mtu_locked(&rt->dst) ? 0 : df; iph->protocol = proto; iph->tos = tos; iph->daddr = dst; -- cgit From 33d9a2c72f086cbf1087b2fd2d1a15aa9df14a7f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 17 Nov 2018 21:57:02 -0800 Subject: net-gro: reset skb->pkt_type in napi_reuse_skb() eth_type_trans() assumes initial value for skb->pkt_type is PACKET_HOST. This is indeed the value right after a fresh skb allocation. However, it is possible that GRO merged a packet with a different value (like PACKET_OTHERHOST in case macvlan is used), so we need to make sure napi->skb will have pkt_type set back to PACKET_HOST. Otherwise, valid packets might be dropped by the stack because their pkt_type is not PACKET_HOST. napi_reuse_skb() was added in commit 96e93eab2033 ("gro: Add internal interfaces for VLAN"), but this bug always has been there. Fixes: 96e93eab2033 ("gro: Add internal interfaces for VLAN") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 0ffcbdd55fa9..066aa902d85c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5655,6 +5655,10 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) skb->vlan_tci = 0; skb->dev = napi->dev; skb->skb_iif = 0; + + /* eth_type_trans() assumes pkt_type is PACKET_HOST */ + skb->pkt_type = PACKET_HOST; + skb->encapsulation = 0; skb_shinfo(skb)->gso_type = 0; skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); -- cgit From adba75be0d23cca92a028749d92c60c8909bbdb3 Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Fri, 16 Nov 2018 16:55:04 -0500 Subject: tipc: fix lockdep warning when reinitilaizing sockets We get the following warning: [ 47.926140] 32-bit node address hash set to 2010a0a [ 47.927202] [ 47.927433] ================================ [ 47.928050] WARNING: inconsistent lock state [ 47.928661] 4.19.0+ #37 Tainted: G E [ 47.929346] -------------------------------- [ 47.929954] inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage. [ 47.930116] swapper/3/0 [HC0[0]:SC1[3]:HE1:SE0] takes: [ 47.930116] 00000000af8bc31e (&(&ht->lock)->rlock){+.?.}, at: rhashtable_walk_enter+0x36/0xb0 [ 47.930116] {SOFTIRQ-ON-W} state was registered at: [ 47.930116] _raw_spin_lock+0x29/0x60 [ 47.930116] rht_deferred_worker+0x556/0x810 [ 47.930116] process_one_work+0x1f5/0x540 [ 47.930116] worker_thread+0x64/0x3e0 [ 47.930116] kthread+0x112/0x150 [ 47.930116] ret_from_fork+0x3a/0x50 [ 47.930116] irq event stamp: 14044 [ 47.930116] hardirqs last enabled at (14044): [] __local_bh_enable_ip+0x7a/0xf0 [ 47.938117] hardirqs last disabled at (14043): [] __local_bh_enable_ip+0x41/0xf0 [ 47.938117] softirqs last enabled at (14028): [] irq_enter+0x5e/0x60 [ 47.938117] softirqs last disabled at (14029): [] irq_exit+0xb5/0xc0 [ 47.938117] [ 47.938117] other info that might help us debug this: [ 47.938117] Possible unsafe locking scenario: [ 47.938117] [ 47.938117] CPU0 [ 47.938117] ---- [ 47.938117] lock(&(&ht->lock)->rlock); [ 47.938117] [ 47.938117] lock(&(&ht->lock)->rlock); [ 47.938117] [ 47.938117] *** DEADLOCK *** [ 47.938117] [ 47.938117] 2 locks held by swapper/3/0: [ 47.938117] #0: 0000000062c64f90 ((&d->timer)){+.-.}, at: call_timer_fn+0x5/0x280 [ 47.938117] #1: 00000000ee39619c (&(&d->lock)->rlock){+.-.}, at: tipc_disc_timeout+0xc8/0x540 [tipc] [ 47.938117] [ 47.938117] stack backtrace: [ 47.938117] CPU: 3 PID: 0 Comm: swapper/3 Tainted: G E 4.19.0+ #37 [ 47.938117] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 [ 47.938117] Call Trace: [ 47.938117] [ 47.938117] dump_stack+0x5e/0x8b [ 47.938117] print_usage_bug+0x1ed/0x1ff [ 47.938117] mark_lock+0x5b5/0x630 [ 47.938117] __lock_acquire+0x4c0/0x18f0 [ 47.938117] ? lock_acquire+0xa6/0x180 [ 47.938117] lock_acquire+0xa6/0x180 [ 47.938117] ? rhashtable_walk_enter+0x36/0xb0 [ 47.938117] _raw_spin_lock+0x29/0x60 [ 47.938117] ? rhashtable_walk_enter+0x36/0xb0 [ 47.938117] rhashtable_walk_enter+0x36/0xb0 [ 47.938117] tipc_sk_reinit+0xb0/0x410 [tipc] [ 47.938117] ? mark_held_locks+0x6f/0x90 [ 47.938117] ? __local_bh_enable_ip+0x7a/0xf0 [ 47.938117] ? lockdep_hardirqs_on+0x20/0x1a0 [ 47.938117] tipc_net_finalize+0xbf/0x180 [tipc] [ 47.938117] tipc_disc_timeout+0x509/0x540 [tipc] [ 47.938117] ? call_timer_fn+0x5/0x280 [ 47.938117] ? tipc_disc_msg_xmit.isra.19+0xa0/0xa0 [tipc] [ 47.938117] ? tipc_disc_msg_xmit.isra.19+0xa0/0xa0 [tipc] [ 47.938117] call_timer_fn+0xa1/0x280 [ 47.938117] ? tipc_disc_msg_xmit.isra.19+0xa0/0xa0 [tipc] [ 47.938117] run_timer_softirq+0x1f2/0x4d0 [ 47.938117] __do_softirq+0xfc/0x413 [ 47.938117] irq_exit+0xb5/0xc0 [ 47.938117] smp_apic_timer_interrupt+0xac/0x210 [ 47.938117] apic_timer_interrupt+0xf/0x20 [ 47.938117] [ 47.938117] RIP: 0010:default_idle+0x1c/0x140 [ 47.938117] Code: 90 90 90 90 90 90 90 90 90 90 90 90 90 90 0f 1f 44 00 00 41 54 55 53 65 8b 2d d8 2b 74 65 0f 1f 44 00 00 e8 c6 2c 8b ff fb f4 <65> 8b 2d c5 2b 74 65 0f 1f 44 00 00 5b 5d 41 5c c3 65 8b 05 b4 2b [ 47.938117] RSP: 0018:ffffaf6ac0207ec8 EFLAGS: 00000206 ORIG_RAX: ffffffffffffff13 [ 47.938117] RAX: ffff8f5b3735e200 RBX: 0000000000000003 RCX: 0000000000000001 [ 47.938117] RDX: 0000000000000001 RSI: 0000000000000001 RDI: ffff8f5b3735e200 [ 47.938117] RBP: 0000000000000003 R08: 0000000000000001 R09: 0000000000000000 [ 47.938117] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 [ 47.938117] R13: 0000000000000000 R14: ffff8f5b3735e200 R15: ffff8f5b3735e200 [ 47.938117] ? default_idle+0x1a/0x140 [ 47.938117] do_idle+0x1bc/0x280 [ 47.938117] cpu_startup_entry+0x19/0x20 [ 47.938117] start_secondary+0x187/0x1c0 [ 47.938117] secondary_startup_64+0xa4/0xb0 The reason seems to be that tipc_net_finalize()->tipc_sk_reinit() is calling the function rhashtable_walk_enter() within a timer interrupt. We fix this by executing tipc_net_finalize() in work queue context. Acked-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/discover.c | 19 ++++++++++--------- net/tipc/net.c | 45 +++++++++++++++++++++++++++++++++++++-------- net/tipc/net.h | 2 +- 3 files changed, 48 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/tipc/discover.c b/net/tipc/discover.c index 2830709957bd..c138d68e8a69 100644 --- a/net/tipc/discover.c +++ b/net/tipc/discover.c @@ -166,7 +166,8 @@ static bool tipc_disc_addr_trial_msg(struct tipc_discoverer *d, /* Apply trial address if we just left trial period */ if (!trial && !self) { - tipc_net_finalize(net, tn->trial_addr); + tipc_sched_net_finalize(net, tn->trial_addr); + msg_set_prevnode(buf_msg(d->skb), tn->trial_addr); msg_set_type(buf_msg(d->skb), DSC_REQ_MSG); } @@ -300,14 +301,12 @@ static void tipc_disc_timeout(struct timer_list *t) goto exit; } - /* Trial period over ? */ - if (!time_before(jiffies, tn->addr_trial_end)) { - /* Did we just leave it ? */ - if (!tipc_own_addr(net)) - tipc_net_finalize(net, tn->trial_addr); - - msg_set_type(buf_msg(d->skb), DSC_REQ_MSG); - msg_set_prevnode(buf_msg(d->skb), tipc_own_addr(net)); + /* Did we just leave trial period ? */ + if (!time_before(jiffies, tn->addr_trial_end) && !tipc_own_addr(net)) { + mod_timer(&d->timer, jiffies + TIPC_DISC_INIT); + spin_unlock_bh(&d->lock); + tipc_sched_net_finalize(net, tn->trial_addr); + return; } /* Adjust timeout interval according to discovery phase */ @@ -319,6 +318,8 @@ static void tipc_disc_timeout(struct timer_list *t) d->timer_intv = TIPC_DISC_SLOW; else if (!d->num_nodes && d->timer_intv > TIPC_DISC_FAST) d->timer_intv = TIPC_DISC_FAST; + msg_set_type(buf_msg(d->skb), DSC_REQ_MSG); + msg_set_prevnode(buf_msg(d->skb), tn->trial_addr); } mod_timer(&d->timer, jiffies + d->timer_intv); diff --git a/net/tipc/net.c b/net/tipc/net.c index 62199cf5a56c..f076edb74338 100644 --- a/net/tipc/net.c +++ b/net/tipc/net.c @@ -104,6 +104,14 @@ * - A local spin_lock protecting the queue of subscriber events. */ +struct tipc_net_work { + struct work_struct work; + struct net *net; + u32 addr; +}; + +static void tipc_net_finalize(struct net *net, u32 addr); + int tipc_net_init(struct net *net, u8 *node_id, u32 addr) { if (tipc_own_id(net)) { @@ -119,17 +127,38 @@ int tipc_net_init(struct net *net, u8 *node_id, u32 addr) return 0; } -void tipc_net_finalize(struct net *net, u32 addr) +static void tipc_net_finalize(struct net *net, u32 addr) { struct tipc_net *tn = tipc_net(net); - if (!cmpxchg(&tn->node_addr, 0, addr)) { - tipc_set_node_addr(net, addr); - tipc_named_reinit(net); - tipc_sk_reinit(net); - tipc_nametbl_publish(net, TIPC_CFG_SRV, addr, addr, - TIPC_CLUSTER_SCOPE, 0, addr); - } + if (cmpxchg(&tn->node_addr, 0, addr)) + return; + tipc_set_node_addr(net, addr); + tipc_named_reinit(net); + tipc_sk_reinit(net); + tipc_nametbl_publish(net, TIPC_CFG_SRV, addr, addr, + TIPC_CLUSTER_SCOPE, 0, addr); +} + +static void tipc_net_finalize_work(struct work_struct *work) +{ + struct tipc_net_work *fwork; + + fwork = container_of(work, struct tipc_net_work, work); + tipc_net_finalize(fwork->net, fwork->addr); + kfree(fwork); +} + +void tipc_sched_net_finalize(struct net *net, u32 addr) +{ + struct tipc_net_work *fwork = kzalloc(sizeof(*fwork), GFP_ATOMIC); + + if (!fwork) + return; + INIT_WORK(&fwork->work, tipc_net_finalize_work); + fwork->net = net; + fwork->addr = addr; + schedule_work(&fwork->work); } void tipc_net_stop(struct net *net) diff --git a/net/tipc/net.h b/net/tipc/net.h index 09ad02b50bb1..b7f2e364eb99 100644 --- a/net/tipc/net.h +++ b/net/tipc/net.h @@ -42,7 +42,7 @@ extern const struct nla_policy tipc_nl_net_policy[]; int tipc_net_init(struct net *net, u8 *node_id, u32 addr); -void tipc_net_finalize(struct net *net, u32 addr); +void tipc_sched_net_finalize(struct net *net, u32 addr); void tipc_net_stop(struct net *net); int tipc_nl_net_dump(struct sk_buff *skb, struct netlink_callback *cb); int tipc_nl_net_set(struct sk_buff *skb, struct genl_info *info); -- cgit From 1c1274a56999fbdf9cf84e332b28448bb2d55221 Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Sat, 17 Nov 2018 12:17:06 -0500 Subject: tipc: don't assume linear buffer when reading ancillary data The code for reading ancillary data from a received buffer is assuming the buffer is linear. To make this assumption true we have to linearize the buffer before message data is read. Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/socket.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 636e6131769d..b57b1be7252b 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -1555,16 +1555,17 @@ static void tipc_sk_set_orig_addr(struct msghdr *m, struct sk_buff *skb) /** * tipc_sk_anc_data_recv - optionally capture ancillary data for received message * @m: descriptor for message info - * @msg: received message header + * @skb: received message buffer * @tsk: TIPC port associated with message * * Note: Ancillary data is not captured if not requested by receiver. * * Returns 0 if successful, otherwise errno */ -static int tipc_sk_anc_data_recv(struct msghdr *m, struct tipc_msg *msg, +static int tipc_sk_anc_data_recv(struct msghdr *m, struct sk_buff *skb, struct tipc_sock *tsk) { + struct tipc_msg *msg; u32 anc_data[3]; u32 err; u32 dest_type; @@ -1573,6 +1574,7 @@ static int tipc_sk_anc_data_recv(struct msghdr *m, struct tipc_msg *msg, if (likely(m->msg_controllen == 0)) return 0; + msg = buf_msg(skb); /* Optionally capture errored message object(s) */ err = msg ? msg_errcode(msg) : 0; @@ -1583,6 +1585,9 @@ static int tipc_sk_anc_data_recv(struct msghdr *m, struct tipc_msg *msg, if (res) return res; if (anc_data[1]) { + if (skb_linearize(skb)) + return -ENOMEM; + msg = buf_msg(skb); res = put_cmsg(m, SOL_TIPC, TIPC_RETDATA, anc_data[1], msg_data(msg)); if (res) @@ -1744,9 +1749,10 @@ static int tipc_recvmsg(struct socket *sock, struct msghdr *m, /* Collect msg meta data, including error code and rejected data */ tipc_sk_set_orig_addr(m, skb); - rc = tipc_sk_anc_data_recv(m, hdr, tsk); + rc = tipc_sk_anc_data_recv(m, skb, tsk); if (unlikely(rc)) goto exit; + hdr = buf_msg(skb); /* Capture data if non-error msg, otherwise just set return value */ if (likely(!err)) { @@ -1856,9 +1862,10 @@ static int tipc_recvstream(struct socket *sock, struct msghdr *m, /* Collect msg meta data, incl. error code and rejected data */ if (!copied) { tipc_sk_set_orig_addr(m, skb); - rc = tipc_sk_anc_data_recv(m, hdr, tsk); + rc = tipc_sk_anc_data_recv(m, skb, tsk); if (rc) break; + hdr = buf_msg(skb); } /* Copy data if msg ok, otherwise return error/partial data */ -- cgit From 7ddacfa564870cdd97275fd87decb6174abc6380 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sun, 18 Nov 2018 10:45:30 -0800 Subject: ipv6: Fix PMTU updates for UDP/raw sockets in presence of VRF Preethi reported that PMTU discovery for UDP/raw applications is not working in the presence of VRF when the socket is not bound to a device. The problem is that ip6_sk_update_pmtu does not consider the L3 domain of the skb device if the socket is not bound. Update the function to set oif to the L3 master device if relevant. Fixes: ca254490c8df ("net: Add VRF support to IPv6 stack") Reported-by: Preethi Ramachandra Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/route.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 14b422f35504..059f0531f7c1 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2359,10 +2359,13 @@ EXPORT_SYMBOL_GPL(ip6_update_pmtu); void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu) { + int oif = sk->sk_bound_dev_if; struct dst_entry *dst; - ip6_update_pmtu(skb, sock_net(sk), mtu, - sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid); + if (!oif && skb->dev) + oif = l3mdev_master_ifindex(skb->dev); + + ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid); dst = __sk_dst_get(sk); if (!dst || !dst->obsolete || -- cgit From 7e241f647dc7087a0401418a187f3f5b527cc690 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 8 Nov 2018 15:55:37 +0100 Subject: libceph: fall back to sendmsg for slab pages skb_can_coalesce() allows coalescing neighboring slab objects into a single frag: return page == skb_frag_page(frag) && off == frag->page_offset + skb_frag_size(frag); ceph_tcp_sendpage() can be handed slab pages. One example of this is XFS: it passes down sector sized slab objects for its metadata I/O. If the kernel client is co-located on the OSD node, the skb may go through loopback and pop on the receive side with the exact same set of frags. When tcp_recvmsg() attempts to copy out such a frag, hardened usercopy complains because the size exceeds the object's allocated size: usercopy: kernel memory exposure attempt detected from ffff9ba917f20a00 (kmalloc-512) (1024 bytes) Although skb_can_coalesce() could be taught to return false if the resulting frag would cross a slab object boundary, we already have a fallback for non-refcounted pages. Utilize it for slab pages too. Cc: stable@vger.kernel.org # 4.8+ Signed-off-by: Ilya Dryomov --- net/ceph/messenger.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 57fcc6b4bf6e..2f126eff275d 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -580,9 +580,15 @@ static int ceph_tcp_sendpage(struct socket *sock, struct page *page, struct bio_vec bvec; int ret; - /* sendpage cannot properly handle pages with page_count == 0, - * we need to fallback to sendmsg if that's the case */ - if (page_count(page) >= 1) + /* + * sendpage cannot properly handle pages with page_count == 0, + * we need to fall back to sendmsg if that's the case. + * + * Same goes for slab pages: skb_can_coalesce() allows + * coalescing neighboring slab objects into a single frag which + * triggers one of hardened usercopy checks. + */ + if (page_count(page) >= 1 && !PageSlab(page)) return __ceph_tcp_sendpage(sock, page, offset, size, more); bvec.bv_page = page; -- cgit From 02968ccf0125d39b08ecef5946300a8a873c0942 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 18 Nov 2018 15:07:38 +0800 Subject: sctp: count sk_wmem_alloc by skb truesize in sctp_packet_transmit Now sctp increases sk_wmem_alloc by 1 when doing set_owner_w for the skb allocked in sctp_packet_transmit and decreases by 1 when freeing this skb. But when this skb goes through networking stack, some subcomponents might change skb->truesize and add the same amount on sk_wmem_alloc. However sctp doesn't know the amount to decrease by, it would cause a leak on sk->sk_wmem_alloc and the sock can never be freed. Xiumei found this issue when it hit esp_output_head() by using sctp over ipsec, where skb->truesize is added and so is sk->sk_wmem_alloc. Since sctp has used sk_wmem_queued to count for writable space since Commit cd305c74b0f8 ("sctp: use sk_wmem_queued to check for writable space"), it's ok to fix it by counting sk_wmem_alloc by skb truesize in sctp_packet_transmit. Fixes: cac2661c53f3 ("esp4: Avoid skb_cow_data whenever possible") Reported-by: Xiumei Mu Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/sctp/output.c | 21 +-------------------- 1 file changed, 1 insertion(+), 20 deletions(-) (limited to 'net') diff --git a/net/sctp/output.c b/net/sctp/output.c index 67939ad99c01..88dfa6ae1fb4 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -396,25 +396,6 @@ finish: return retval; } -static void sctp_packet_release_owner(struct sk_buff *skb) -{ - sk_free(skb->sk); -} - -static void sctp_packet_set_owner_w(struct sk_buff *skb, struct sock *sk) -{ - skb_orphan(skb); - skb->sk = sk; - skb->destructor = sctp_packet_release_owner; - - /* - * The data chunks have already been accounted for in sctp_sendmsg(), - * therefore only reserve a single byte to keep socket around until - * the packet has been transmitted. - */ - refcount_inc(&sk->sk_wmem_alloc); -} - static void sctp_packet_gso_append(struct sk_buff *head, struct sk_buff *skb) { if (SCTP_OUTPUT_CB(head)->last == head) @@ -601,7 +582,7 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) if (!head) goto out; skb_reserve(head, packet->overhead + MAX_HEADER); - sctp_packet_set_owner_w(head, sk); + skb_set_owner_w(head, sk); /* set sctp header */ sh = skb_push(head, sizeof(struct sctphdr)); -- cgit From cc3ccf26f0649089b3a34a2781977755ea36e72c Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 18 Nov 2018 15:21:53 +0800 Subject: sctp: not allow to set asoc prsctp_enable by sockopt As rfc7496#section4.5 says about SCTP_PR_SUPPORTED: This socket option allows the enabling or disabling of the negotiation of PR-SCTP support for future associations. For existing associations, it allows one to query whether or not PR-SCTP support was negotiated on a particular association. It means only sctp sock's prsctp_enable can be set. Note that for the limitation of SCTP_{CURRENT|ALL}_ASSOC, we will add it when introducing SCTP_{FUTURE|CURRENT|ALL}_ASSOC for linux sctp in another patchset. v1->v2: - drop the params.assoc_id check as Neil suggested. Fixes: 28aa4c26fce2 ("sctp: add SCTP_PR_SUPPORTED on sctp sockopt") Reported-by: Ying Xu Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/sctp/socket.c | 26 +++++--------------------- 1 file changed, 5 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 739f3e50120d..bf618d1b41fd 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -3940,32 +3940,16 @@ static int sctp_setsockopt_pr_supported(struct sock *sk, unsigned int optlen) { struct sctp_assoc_value params; - struct sctp_association *asoc; - int retval = -EINVAL; if (optlen != sizeof(params)) - goto out; - - if (copy_from_user(¶ms, optval, optlen)) { - retval = -EFAULT; - goto out; - } - - asoc = sctp_id2assoc(sk, params.assoc_id); - if (asoc) { - asoc->prsctp_enable = !!params.assoc_value; - } else if (!params.assoc_id) { - struct sctp_sock *sp = sctp_sk(sk); + return -EINVAL; - sp->ep->prsctp_enable = !!params.assoc_value; - } else { - goto out; - } + if (copy_from_user(¶ms, optval, optlen)) + return -EFAULT; - retval = 0; + sctp_sk(sk)->ep->prsctp_enable = !!params.assoc_value; -out: - return retval; + return 0; } static int sctp_setsockopt_default_prinfo(struct sock *sk, -- cgit From 69fec325a64383667b8a35df5d48d6ce52fb2782 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 18 Nov 2018 16:14:47 +0800 Subject: Revert "sctp: remove sctp_transport_pmtu_check" This reverts commit 22d7be267eaa8114dcc28d66c1c347f667d7878a. The dst's mtu in transport can be updated by a non sctp place like in xfrm where the MTU information didn't get synced between asoc, transport and dst, so it is still needed to do the pmtu check in sctp_packet_config. Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/sctp/output.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/sctp/output.c b/net/sctp/output.c index 88dfa6ae1fb4..b0e74a3e77ec 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -118,6 +118,9 @@ void sctp_packet_config(struct sctp_packet *packet, __u32 vtag, sctp_transport_route(tp, NULL, sp); if (asoc->param_flags & SPP_PMTUD_ENABLE) sctp_assoc_sync_pmtu(asoc); + } else if (!sctp_transport_pmtu_check(tp)) { + if (asoc->param_flags & SPP_PMTUD_ENABLE) + sctp_assoc_sync_pmtu(asoc); } if (asoc->pmtu_pending) { -- cgit From e1e46479847e66f78f79d8c24d5169a5954b3fc2 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 18 Nov 2018 21:59:49 +0800 Subject: sctp: not increase stream's incnt before sending addstrm_in request Different from processing the addstrm_out request, The receiver handles an addstrm_in request by sending back an addstrm_out request to the sender who will increase its stream's in and incnt later. Now stream->incnt has been increased since it sent out the addstrm_in request in sctp_send_add_streams(), with the wrong stream->incnt will even cause crash when copying stream info from the old stream's in to the new one's in sctp_process_strreset_addstrm_out(). This patch is to fix it by simply removing the stream->incnt change from sctp_send_add_streams(). Fixes: 242bd2d519d7 ("sctp: implement sender-side procedures for Add Incoming/Outgoing Streams Request Parameter") Reported-by: Jianwen Ji Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/sctp/stream.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/sctp/stream.c b/net/sctp/stream.c index ffb940d3b57c..3892e7630f3a 100644 --- a/net/sctp/stream.c +++ b/net/sctp/stream.c @@ -535,7 +535,6 @@ int sctp_send_add_streams(struct sctp_association *asoc, goto out; } - stream->incnt = incnt; stream->outcnt = outcnt; asoc->strreset_outstanding = !!out + !!in; -- cgit From cadf9df27e7cf40e390e060a1c71bb86ecde798b Mon Sep 17 00:00:00 2001 From: Stephen Mallon Date: Tue, 20 Nov 2018 19:15:02 +1100 Subject: tcp: Fix SOF_TIMESTAMPING_RX_HARDWARE to use the latest timestamp during TCP coalescing During tcp coalescing ensure that the skb hardware timestamp refers to the highest sequence number data. Previously only the software timestamp was updated during coalescing. Signed-off-by: Stephen Mallon Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 2868ef28ce52..e695584bb33f 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4363,6 +4363,7 @@ static bool tcp_try_coalesce(struct sock *sk, if (TCP_SKB_CB(from)->has_rxtstamp) { TCP_SKB_CB(to)->has_rxtstamp = true; to->tstamp = from->tstamp; + skb_hwtstamps(to)->hwtstamp = skb_hwtstamps(from)->hwtstamp; } return true; -- cgit From f2cbd485282014132851bf37cb2ca624a456275d Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Tue, 20 Nov 2018 22:18:44 +0100 Subject: net/sched: act_police: fix race condition on state variables after 'police' configuration parameters were converted to use RCU instead of spinlock, the state variables used to compute the traffic rate (namely 'tcfp_toks', 'tcfp_ptoks' and 'tcfp_t_c') are erroneously read/updated in the traffic path without any protection. Use a dedicated spinlock to avoid race conditions on these variables, and ensure proper cache-line alignment. In this way, 'police' is still faster than what we observed when 'tcf_lock' was used in the traffic path _ i.e. reverting commit 2d550dbad83c ("net/sched: act_police: don't use spinlock in the data path"). Moreover, we preserve the throughput improvement that was obtained after 'police' started using per-cpu counters, when 'avrate' is used instead of 'rate'. Changes since v1 (thanks to Eric Dumazet): - call ktime_get_ns() before acquiring the lock in the traffic path - use a dedicated spinlock instead of tcf_lock - improve cache-line usage Fixes: 2d550dbad83c ("net/sched: act_police: don't use spinlock in the data path") Reported-and-suggested-by: Eric Dumazet Signed-off-by: Davide Caratti Reviewed-by: Eric Dumazet --- net/sched/act_police.c | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 052855d47354..ee4665a5a022 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -27,10 +27,7 @@ struct tcf_police_params { u32 tcfp_ewma_rate; s64 tcfp_burst; u32 tcfp_mtu; - s64 tcfp_toks; - s64 tcfp_ptoks; s64 tcfp_mtu_ptoks; - s64 tcfp_t_c; struct psched_ratecfg rate; bool rate_present; struct psched_ratecfg peak; @@ -41,6 +38,11 @@ struct tcf_police_params { struct tcf_police { struct tc_action common; struct tcf_police_params __rcu *params; + + spinlock_t tcfp_lock ____cacheline_aligned_in_smp; + s64 tcfp_toks; + s64 tcfp_ptoks; + s64 tcfp_t_c; }; #define to_police(pc) ((struct tcf_police *)pc) @@ -186,12 +188,9 @@ static int tcf_police_init(struct net *net, struct nlattr *nla, } new->tcfp_burst = PSCHED_TICKS2NS(parm->burst); - new->tcfp_toks = new->tcfp_burst; - if (new->peak_present) { + if (new->peak_present) new->tcfp_mtu_ptoks = (s64)psched_l2t_ns(&new->peak, new->tcfp_mtu); - new->tcfp_ptoks = new->tcfp_mtu_ptoks; - } if (tb[TCA_POLICE_AVRATE]) new->tcfp_ewma_rate = nla_get_u32(tb[TCA_POLICE_AVRATE]); @@ -207,7 +206,12 @@ static int tcf_police_init(struct net *net, struct nlattr *nla, } spin_lock_bh(&police->tcf_lock); - new->tcfp_t_c = ktime_get_ns(); + spin_lock_bh(&police->tcfp_lock); + police->tcfp_t_c = ktime_get_ns(); + police->tcfp_toks = new->tcfp_burst; + if (new->peak_present) + police->tcfp_ptoks = new->tcfp_mtu_ptoks; + spin_unlock_bh(&police->tcfp_lock); police->tcf_action = parm->action; rcu_swap_protected(police->params, new, @@ -257,25 +261,28 @@ static int tcf_police_act(struct sk_buff *skb, const struct tc_action *a, } now = ktime_get_ns(); - toks = min_t(s64, now - p->tcfp_t_c, p->tcfp_burst); + spin_lock_bh(&police->tcfp_lock); + toks = min_t(s64, now - police->tcfp_t_c, p->tcfp_burst); if (p->peak_present) { - ptoks = toks + p->tcfp_ptoks; + ptoks = toks + police->tcfp_ptoks; if (ptoks > p->tcfp_mtu_ptoks) ptoks = p->tcfp_mtu_ptoks; ptoks -= (s64)psched_l2t_ns(&p->peak, qdisc_pkt_len(skb)); } - toks += p->tcfp_toks; + toks += police->tcfp_toks; if (toks > p->tcfp_burst) toks = p->tcfp_burst; toks -= (s64)psched_l2t_ns(&p->rate, qdisc_pkt_len(skb)); if ((toks|ptoks) >= 0) { - p->tcfp_t_c = now; - p->tcfp_toks = toks; - p->tcfp_ptoks = ptoks; + police->tcfp_t_c = now; + police->tcfp_toks = toks; + police->tcfp_ptoks = ptoks; + spin_unlock_bh(&police->tcfp_lock); ret = p->tcfp_result; goto inc_drops; } + spin_unlock_bh(&police->tcfp_lock); } inc_overlimits: -- cgit From b5dd186d10ba59e6b5ba60e42b3b083df56df6f3 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Tue, 20 Nov 2018 11:39:56 +0000 Subject: net: skb_scrub_packet(): Scrub offload_fwd_mark When a packet is trapped and the corresponding SKB marked as already-forwarded, it retains this marking even after it is forwarded across veth links into another bridge. There, since it ingresses the bridge over veth, which doesn't have offload_fwd_mark, it triggers a warning in nbp_switchdev_frame_mark(). Then nbp_switchdev_allowed_egress() decides not to allow egress from this bridge through another veth, because the SKB is already marked, and the mark (of 0) of course matches. Thus the packet is incorrectly blocked. Solve by resetting offload_fwd_mark() in skb_scrub_packet(). That function is called from tunnels and also from veth, and thus catches the cases where traffic is forwarded between bridges and transformed in a way that invalidates the marking. Fixes: 6bc506b4fb06 ("bridge: switchdev: Add forward mark support for stacked devices") Fixes: abf4bb6b63d0 ("skbuff: Add the offload_mr_fwd_mark field") Signed-off-by: Petr Machata Suggested-by: Ido Schimmel Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/skbuff.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index b4ee5c8b928f..a8217e221e19 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4854,6 +4854,11 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet) nf_reset(skb); nf_reset_trace(skb); +#ifdef CONFIG_NET_SWITCHDEV + skb->offload_fwd_mark = 0; + skb->offload_mr_fwd_mark = 0; +#endif + if (!xnet) return; -- cgit From 86de5921a3d5dd246df661e09bdd0a6131b39ae3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 20 Nov 2018 05:53:59 -0800 Subject: tcp: defer SACK compression after DupThresh Jean-Louis reported a TCP regression and bisected to recent SACK compression. After a loss episode (receiver not able to keep up and dropping packets because its backlog is full), linux TCP stack is sending a single SACK (DUPACK). Sender waits a full RTO timer before recovering losses. While RFC 6675 says in section 5, "Algorithm Details", (2) If DupAcks < DupThresh but IsLost (HighACK + 1) returns true -- indicating at least three segments have arrived above the current cumulative acknowledgment point, which is taken to indicate loss -- go to step (4). ... (4) Invoke fast retransmit and enter loss recovery as follows: there are old TCP stacks not implementing this strategy, and still counting the dupacks before starting fast retransmit. While these stacks probably perform poorly when receivers implement LRO/GRO, we should be a little more gentle to them. This patch makes sure we do not enable SACK compression unless 3 dupacks have been sent since last rcv_nxt update. Ideally we should even rearm the timer to send one or two more DUPACK if no more packets are coming, but that will be work aiming for linux-4.21. Many thanks to Jean-Louis for bisecting the issue, providing packet captures and testing this patch. Fixes: 5d9f4262b7ea ("tcp: add SACK compression") Reported-by: Jean-Louis Dupond Tested-by: Jean-Louis Dupond Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 14 ++++++++++++-- net/ipv4/tcp_output.c | 6 +++--- net/ipv4/tcp_timer.c | 2 +- 3 files changed, 16 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index e695584bb33f..1e37c1388189 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4268,7 +4268,7 @@ static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq) * If the sack array is full, forget about the last one. */ if (this_sack >= TCP_NUM_SACKS) { - if (tp->compressed_ack) + if (tp->compressed_ack > TCP_FASTRETRANS_THRESH) tcp_send_ack(sk); this_sack--; tp->rx_opt.num_sacks--; @@ -5189,7 +5189,17 @@ send_now: if (!tcp_is_sack(tp) || tp->compressed_ack >= sock_net(sk)->ipv4.sysctl_tcp_comp_sack_nr) goto send_now; - tp->compressed_ack++; + + if (tp->compressed_ack_rcv_nxt != tp->rcv_nxt) { + tp->compressed_ack_rcv_nxt = tp->rcv_nxt; + if (tp->compressed_ack > TCP_FASTRETRANS_THRESH) + NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED, + tp->compressed_ack - TCP_FASTRETRANS_THRESH); + tp->compressed_ack = 0; + } + + if (++tp->compressed_ack <= TCP_FASTRETRANS_THRESH) + goto send_now; if (hrtimer_is_queued(&tp->compressed_ack_timer)) return; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 9c34b97d365d..3f510cad0b3e 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -180,10 +180,10 @@ static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts, { struct tcp_sock *tp = tcp_sk(sk); - if (unlikely(tp->compressed_ack)) { + if (unlikely(tp->compressed_ack > TCP_FASTRETRANS_THRESH)) { NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED, - tp->compressed_ack); - tp->compressed_ack = 0; + tp->compressed_ack - TCP_FASTRETRANS_THRESH); + tp->compressed_ack = TCP_FASTRETRANS_THRESH; if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1) __sock_put(sk); } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 676020663ce8..5f8b6d3cd855 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -740,7 +740,7 @@ static enum hrtimer_restart tcp_compressed_ack_kick(struct hrtimer *timer) bh_lock_sock(sk); if (!sock_owned_by_user(sk)) { - if (tp->compressed_ack) + if (tp->compressed_ack > TCP_FASTRETRANS_THRESH) tcp_send_ack(sk); } else { if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, -- cgit From f07920ad9c6f5781c90ac4915f8254d999d8c1cc Mon Sep 17 00:00:00 2001 From: Hans Wippel Date: Tue, 20 Nov 2018 16:46:39 +0100 Subject: net/smc: abort CLC connection in smc_release In case of a non-blocking SMC socket, the initial CLC handshake is performed over a blocking TCP connection in a worker. If the SMC socket is released, smc_release has to wait for the blocking CLC socket operations (e.g., kernel_connect) inside the worker. This patch aborts a CLC connection when the respective non-blocking SMC socket is released to avoid waiting on socket operations or timeouts. Signed-off-by: Hans Wippel Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/af_smc.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 80e2119f1c70..84f67f601838 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -127,6 +127,8 @@ static int smc_release(struct socket *sock) smc = smc_sk(sk); /* cleanup for a dangling non-blocking connect */ + if (smc->connect_info && sk->sk_state == SMC_INIT) + tcp_abort(smc->clcsock->sk, ECONNABORTED); flush_work(&smc->connect_work); kfree(smc->connect_info); smc->connect_info = NULL; -- cgit From ee05ff7af26509f39360534a5225ee714416cdfd Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Tue, 20 Nov 2018 16:46:40 +0100 Subject: net/smc: use queue pair number when matching link group When searching for an existing link group the queue pair number is also to be taken into consideration. When the SMC server sends a new number in a CLC packet (keeping all other values equal) then a new link group is to be created on the SMC client side. Signed-off-by: Karsten Graul Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/af_smc.c | 9 +++++---- net/smc/smc_core.c | 10 ++++++---- net/smc/smc_core.h | 2 +- 3 files changed, 12 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 84f67f601838..5fbaf1901571 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -549,7 +549,8 @@ static int smc_connect_rdma(struct smc_sock *smc, mutex_lock(&smc_create_lgr_pending); local_contact = smc_conn_create(smc, false, aclc->hdr.flag, ibdev, - ibport, &aclc->lcl, NULL, 0); + ibport, ntoh24(aclc->qpn), &aclc->lcl, + NULL, 0); if (local_contact < 0) { if (local_contact == -ENOMEM) reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/ @@ -620,7 +621,7 @@ static int smc_connect_ism(struct smc_sock *smc, int rc = 0; mutex_lock(&smc_create_lgr_pending); - local_contact = smc_conn_create(smc, true, aclc->hdr.flag, NULL, 0, + local_contact = smc_conn_create(smc, true, aclc->hdr.flag, NULL, 0, 0, NULL, ismdev, aclc->gid); if (local_contact < 0) return smc_connect_abort(smc, SMC_CLC_DECL_MEM, 0); @@ -1085,7 +1086,7 @@ static int smc_listen_rdma_init(struct smc_sock *new_smc, int *local_contact) { /* allocate connection / link group */ - *local_contact = smc_conn_create(new_smc, false, 0, ibdev, ibport, + *local_contact = smc_conn_create(new_smc, false, 0, ibdev, ibport, 0, &pclc->lcl, NULL, 0); if (*local_contact < 0) { if (*local_contact == -ENOMEM) @@ -1109,7 +1110,7 @@ static int smc_listen_ism_init(struct smc_sock *new_smc, struct smc_clc_msg_smcd *pclc_smcd; pclc_smcd = smc_get_clc_msg_smcd(pclc); - *local_contact = smc_conn_create(new_smc, true, 0, NULL, 0, NULL, + *local_contact = smc_conn_create(new_smc, true, 0, NULL, 0, 0, NULL, ismdev, pclc_smcd->gid); if (*local_contact < 0) { if (*local_contact == -ENOMEM) diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 18daebcef181..3c023de58afd 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -559,7 +559,7 @@ out: static bool smcr_lgr_match(struct smc_link_group *lgr, struct smc_clc_msg_local *lcl, - enum smc_lgr_role role) + enum smc_lgr_role role, u32 clcqpn) { return !memcmp(lgr->peer_systemid, lcl->id_for_peer, SMC_SYSTEMID_LEN) && @@ -567,7 +567,9 @@ static bool smcr_lgr_match(struct smc_link_group *lgr, SMC_GID_SIZE) && !memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_mac, lcl->mac, sizeof(lcl->mac)) && - lgr->role == role; + lgr->role == role && + (lgr->role == SMC_SERV || + lgr->lnk[SMC_SINGLE_LINK].peer_qpn == clcqpn); } static bool smcd_lgr_match(struct smc_link_group *lgr, @@ -578,7 +580,7 @@ static bool smcd_lgr_match(struct smc_link_group *lgr, /* create a new SMC connection (and a new link group if necessary) */ int smc_conn_create(struct smc_sock *smc, bool is_smcd, int srv_first_contact, - struct smc_ib_device *smcibdev, u8 ibport, + struct smc_ib_device *smcibdev, u8 ibport, u32 clcqpn, struct smc_clc_msg_local *lcl, struct smcd_dev *smcd, u64 peer_gid) { @@ -603,7 +605,7 @@ int smc_conn_create(struct smc_sock *smc, bool is_smcd, int srv_first_contact, list_for_each_entry(lgr, &smc_lgr_list.list, list) { write_lock_bh(&lgr->conns_lock); if ((is_smcd ? smcd_lgr_match(lgr, smcd, peer_gid) : - smcr_lgr_match(lgr, lcl, role)) && + smcr_lgr_match(lgr, lcl, role, clcqpn)) && !lgr->sync_err && lgr->vlan_id == vlan_id && (role == SMC_CLNT || diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index c156674733c9..5bc6cbaf0ed5 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -262,7 +262,7 @@ int smc_vlan_by_tcpsk(struct socket *clcsock, unsigned short *vlan_id); void smc_conn_free(struct smc_connection *conn); int smc_conn_create(struct smc_sock *smc, bool is_smcd, int srv_first_contact, - struct smc_ib_device *smcibdev, u8 ibport, + struct smc_ib_device *smcibdev, u8 ibport, u32 clcqpn, struct smc_clc_msg_local *lcl, struct smcd_dev *smcd, u64 peer_gid); void smcd_conn_free(struct smc_connection *conn); -- cgit From 0512f69e388c963dbe955d4bd9ae0f7d88d2dc54 Mon Sep 17 00:00:00 2001 From: Hans Wippel Date: Tue, 20 Nov 2018 16:46:41 +0100 Subject: net/smc: add SMC-D shutdown signal When a SMC-D link group is freed, a shutdown signal should be sent to the peer to indicate that the link group is invalid. This patch adds the shutdown signal to the SMC code. Signed-off-by: Hans Wippel Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_core.c | 10 ++++++++-- net/smc/smc_core.h | 3 ++- net/smc/smc_ism.c | 43 ++++++++++++++++++++++++++++++++----------- net/smc/smc_ism.h | 1 + 4 files changed, 43 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 3c023de58afd..1c9fa7f0261a 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -184,6 +184,8 @@ free: if (!lgr->is_smcd && lnk->state != SMC_LNK_INACTIVE) smc_llc_link_inactive(lnk); + if (lgr->is_smcd) + smc_ism_signal_shutdown(lgr); smc_lgr_free(lgr); } } @@ -485,7 +487,7 @@ void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport) } /* Called when SMC-D device is terminated or peer is lost */ -void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid) +void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid, unsigned short vlan) { struct smc_link_group *lgr, *l; LIST_HEAD(lgr_free_list); @@ -495,7 +497,7 @@ void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid) list_for_each_entry_safe(lgr, l, &smc_lgr_list.list, list) { if (lgr->is_smcd && lgr->smcd == dev && (!peer_gid || lgr->peer_gid == peer_gid) && - !list_empty(&lgr->list)) { + (vlan == VLAN_VID_MASK || lgr->vlan_id == vlan)) { __smc_lgr_terminate(lgr); list_move(&lgr->list, &lgr_free_list); } @@ -506,6 +508,8 @@ void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid) list_for_each_entry_safe(lgr, l, &lgr_free_list, list) { list_del_init(&lgr->list); cancel_delayed_work_sync(&lgr->free_work); + if (!peer_gid && vlan == VLAN_VID_MASK) /* dev terminated? */ + smc_ism_signal_shutdown(lgr); smc_lgr_free(lgr); } } @@ -1026,6 +1030,8 @@ void smc_core_exit(void) smc_llc_link_inactive(lnk); } cancel_delayed_work_sync(&lgr->free_work); + if (lgr->is_smcd) + smc_ism_signal_shutdown(lgr); smc_lgr_free(lgr); /* free link group */ } } diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 5bc6cbaf0ed5..cf98f4d6093e 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -247,7 +247,8 @@ void smc_lgr_free(struct smc_link_group *lgr); void smc_lgr_forget(struct smc_link_group *lgr); void smc_lgr_terminate(struct smc_link_group *lgr); void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport); -void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid); +void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid, + unsigned short vlan); int smc_buf_create(struct smc_sock *smc, bool is_smcd); int smc_uncompress_bufsize(u8 compressed); int smc_rmb_rtoken_handling(struct smc_connection *conn, diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index e36f21ce7252..2fff79db1a59 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -187,22 +187,28 @@ struct smc_ism_event_work { #define ISM_EVENT_REQUEST 0x0001 #define ISM_EVENT_RESPONSE 0x0002 #define ISM_EVENT_REQUEST_IR 0x00000001 +#define ISM_EVENT_CODE_SHUTDOWN 0x80 #define ISM_EVENT_CODE_TESTLINK 0x83 +union smcd_sw_event_info { + u64 info; + struct { + u8 uid[SMC_LGR_ID_SIZE]; + unsigned short vlan_id; + u16 code; + }; +}; + static void smcd_handle_sw_event(struct smc_ism_event_work *wrk) { - union { - u64 info; - struct { - u32 uid; - unsigned short vlanid; - u16 code; - }; - } ev_info; + union smcd_sw_event_info ev_info; + ev_info.info = wrk->event.info; switch (wrk->event.code) { + case ISM_EVENT_CODE_SHUTDOWN: /* Peer shut down DMBs */ + smc_smcd_terminate(wrk->smcd, wrk->event.tok, ev_info.vlan_id); + break; case ISM_EVENT_CODE_TESTLINK: /* Activity timer */ - ev_info.info = wrk->event.info; if (ev_info.code == ISM_EVENT_REQUEST) { ev_info.code = ISM_EVENT_RESPONSE; wrk->smcd->ops->signal_event(wrk->smcd, @@ -215,6 +221,21 @@ static void smcd_handle_sw_event(struct smc_ism_event_work *wrk) } } +int smc_ism_signal_shutdown(struct smc_link_group *lgr) +{ + int rc; + union smcd_sw_event_info ev_info; + + memcpy(ev_info.uid, lgr->id, SMC_LGR_ID_SIZE); + ev_info.vlan_id = lgr->vlan_id; + ev_info.code = ISM_EVENT_REQUEST; + rc = lgr->smcd->ops->signal_event(lgr->smcd, lgr->peer_gid, + ISM_EVENT_REQUEST_IR, + ISM_EVENT_CODE_SHUTDOWN, + ev_info.info); + return rc; +} + /* worker for SMC-D events */ static void smc_ism_event_work(struct work_struct *work) { @@ -223,7 +244,7 @@ static void smc_ism_event_work(struct work_struct *work) switch (wrk->event.type) { case ISM_EVENT_GID: /* GID event, token is peer GID */ - smc_smcd_terminate(wrk->smcd, wrk->event.tok); + smc_smcd_terminate(wrk->smcd, wrk->event.tok, VLAN_VID_MASK); break; case ISM_EVENT_DMB: break; @@ -289,7 +310,7 @@ void smcd_unregister_dev(struct smcd_dev *smcd) spin_unlock(&smcd_dev_list.lock); flush_workqueue(smcd->event_wq); destroy_workqueue(smcd->event_wq); - smc_smcd_terminate(smcd, 0); + smc_smcd_terminate(smcd, 0, VLAN_VID_MASK); device_del(&smcd->dev); } diff --git a/net/smc/smc_ism.h b/net/smc/smc_ism.h index aee45b860b79..4da946cbfa29 100644 --- a/net/smc/smc_ism.h +++ b/net/smc/smc_ism.h @@ -45,4 +45,5 @@ int smc_ism_register_dmb(struct smc_link_group *lgr, int buf_size, int smc_ism_unregister_dmb(struct smcd_dev *dev, struct smc_buf_desc *dmb_desc); int smc_ism_write(struct smcd_dev *dev, const struct smc_ism_position *pos, void *data, size_t len); +int smc_ism_signal_shutdown(struct smc_link_group *lgr); #endif -- cgit From b9a22dd9811dbcddb5623c499e5b736400059df6 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Tue, 20 Nov 2018 16:46:42 +0100 Subject: net/smc: atomic SMCD cursor handling Running uperf tests with SMCD on LPARs results in corrupted cursors. SMCD cursors should be treated atomically to fix cursor corruption. Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_cdc.c | 26 ++++++++++++++---------- net/smc/smc_cdc.h | 60 +++++++++++++++++++++++++++++++++++++++++-------------- 2 files changed, 60 insertions(+), 26 deletions(-) (limited to 'net') diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index ed5dcf03fe0b..db83332ac1c8 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -81,7 +81,7 @@ static inline void smc_cdc_add_pending_send(struct smc_connection *conn, sizeof(struct smc_cdc_msg) > SMC_WR_BUF_SIZE, "must increase SMC_WR_BUF_SIZE to at least sizeof(struct smc_cdc_msg)"); BUILD_BUG_ON_MSG( - sizeof(struct smc_cdc_msg) != SMC_WR_TX_SIZE, + offsetofend(struct smc_cdc_msg, reserved) > SMC_WR_TX_SIZE, "must adapt SMC_WR_TX_SIZE to sizeof(struct smc_cdc_msg); if not all smc_wr upper layer protocols use the same message size any more, must start to set link->wr_tx_sges[i].length on each individual smc_wr_tx_send()"); BUILD_BUG_ON_MSG( sizeof(struct smc_cdc_tx_pend) > SMC_WR_TX_PEND_PRIV_SIZE, @@ -177,23 +177,24 @@ void smc_cdc_tx_dismiss_slots(struct smc_connection *conn) int smcd_cdc_msg_send(struct smc_connection *conn) { struct smc_sock *smc = container_of(conn, struct smc_sock, conn); + union smc_host_cursor curs; struct smcd_cdc_msg cdc; int rc, diff; memset(&cdc, 0, sizeof(cdc)); cdc.common.type = SMC_CDC_MSG_TYPE; - cdc.prod_wrap = conn->local_tx_ctrl.prod.wrap; - cdc.prod_count = conn->local_tx_ctrl.prod.count; - - cdc.cons_wrap = conn->local_tx_ctrl.cons.wrap; - cdc.cons_count = conn->local_tx_ctrl.cons.count; - cdc.prod_flags = conn->local_tx_ctrl.prod_flags; - cdc.conn_state_flags = conn->local_tx_ctrl.conn_state_flags; + curs.acurs.counter = atomic64_read(&conn->local_tx_ctrl.prod.acurs); + cdc.prod.wrap = curs.wrap; + cdc.prod.count = curs.count; + curs.acurs.counter = atomic64_read(&conn->local_tx_ctrl.cons.acurs); + cdc.cons.wrap = curs.wrap; + cdc.cons.count = curs.count; + cdc.cons.prod_flags = conn->local_tx_ctrl.prod_flags; + cdc.cons.conn_state_flags = conn->local_tx_ctrl.conn_state_flags; rc = smcd_tx_ism_write(conn, &cdc, sizeof(cdc), 0, 1); if (rc) return rc; - smc_curs_copy(&conn->rx_curs_confirmed, &conn->local_tx_ctrl.cons, - conn); + smc_curs_copy(&conn->rx_curs_confirmed, &curs, conn); /* Calculate transmitted data and increment free send buffer space */ diff = smc_curs_diff(conn->sndbuf_desc->len, &conn->tx_curs_fin, &conn->tx_curs_sent); @@ -331,13 +332,16 @@ static void smc_cdc_msg_recv(struct smc_sock *smc, struct smc_cdc_msg *cdc) static void smcd_cdc_rx_tsklet(unsigned long data) { struct smc_connection *conn = (struct smc_connection *)data; + struct smcd_cdc_msg *data_cdc; struct smcd_cdc_msg cdc; struct smc_sock *smc; if (!conn) return; - memcpy(&cdc, conn->rmb_desc->cpu_addr, sizeof(cdc)); + data_cdc = (struct smcd_cdc_msg *)conn->rmb_desc->cpu_addr; + smcd_curs_copy(&cdc.prod, &data_cdc->prod, conn); + smcd_curs_copy(&cdc.cons, &data_cdc->cons, conn); smc = container_of(conn, struct smc_sock, conn); smc_cdc_msg_recv(smc, (struct smc_cdc_msg *)&cdc); } diff --git a/net/smc/smc_cdc.h b/net/smc/smc_cdc.h index 934df4473a7c..b5bfe38c7f9b 100644 --- a/net/smc/smc_cdc.h +++ b/net/smc/smc_cdc.h @@ -48,21 +48,31 @@ struct smc_cdc_msg { struct smc_cdc_producer_flags prod_flags; struct smc_cdc_conn_state_flags conn_state_flags; u8 reserved[18]; -} __packed; /* format defined in RFC7609 */ +}; + +/* SMC-D cursor format */ +union smcd_cdc_cursor { + struct { + u16 wrap; + u32 count; + struct smc_cdc_producer_flags prod_flags; + struct smc_cdc_conn_state_flags conn_state_flags; + } __packed; +#ifdef KERNEL_HAS_ATOMIC64 + atomic64_t acurs; /* for atomic processing */ +#else + u64 acurs; /* for atomic processing */ +#endif +} __aligned(8); /* CDC message for SMC-D */ struct smcd_cdc_msg { struct smc_wr_rx_hdr common; /* Type = 0xFE */ u8 res1[7]; - u16 prod_wrap; - u32 prod_count; - u8 res2[2]; - u16 cons_wrap; - u32 cons_count; - struct smc_cdc_producer_flags prod_flags; - struct smc_cdc_conn_state_flags conn_state_flags; + union smcd_cdc_cursor prod; + union smcd_cdc_cursor cons; u8 res3[8]; -} __packed; +} __aligned(8); static inline bool smc_cdc_rxed_any_close(struct smc_connection *conn) { @@ -135,6 +145,21 @@ static inline void smc_curs_copy_net(union smc_cdc_cursor *tgt, #endif } +static inline void smcd_curs_copy(union smcd_cdc_cursor *tgt, + union smcd_cdc_cursor *src, + struct smc_connection *conn) +{ +#ifndef KERNEL_HAS_ATOMIC64 + unsigned long flags; + + spin_lock_irqsave(&conn->acurs_lock, flags); + tgt->acurs = src->acurs; + spin_unlock_irqrestore(&conn->acurs_lock, flags); +#else + atomic64_set(&tgt->acurs, atomic64_read(&src->acurs)); +#endif +} + /* calculate cursor difference between old and new, where old <= new */ static inline int smc_curs_diff(unsigned int size, union smc_host_cursor *old, @@ -222,12 +247,17 @@ static inline void smcr_cdc_msg_to_host(struct smc_host_cdc_msg *local, static inline void smcd_cdc_msg_to_host(struct smc_host_cdc_msg *local, struct smcd_cdc_msg *peer) { - local->prod.wrap = peer->prod_wrap; - local->prod.count = peer->prod_count; - local->cons.wrap = peer->cons_wrap; - local->cons.count = peer->cons_count; - local->prod_flags = peer->prod_flags; - local->conn_state_flags = peer->conn_state_flags; + union smc_host_cursor temp; + + temp.wrap = peer->prod.wrap; + temp.count = peer->prod.count; + atomic64_set(&local->prod.acurs, atomic64_read(&temp.acurs)); + + temp.wrap = peer->cons.wrap; + temp.count = peer->cons.count; + atomic64_set(&local->cons.acurs, atomic64_read(&temp.acurs)); + local->prod_flags = peer->cons.prod_flags; + local->conn_state_flags = peer->cons.conn_state_flags; } static inline void smc_cdc_msg_to_host(struct smc_host_cdc_msg *local, -- cgit From e438bae43c1e08e688c09c410407b59fc1c173b4 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Tue, 20 Nov 2018 16:46:43 +0100 Subject: net/smc: use after free fix in smc_wr_tx_put_slot() In smc_wr_tx_put_slot() field pend->idx is used after being cleared. That means always idx 0 is cleared in the wr_tx_mask. This results in a broken administration of available WR send payload buffers. Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_wr.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index 3c458d279855..c2694750a6a8 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -215,12 +215,14 @@ int smc_wr_tx_put_slot(struct smc_link *link, pend = container_of(wr_pend_priv, struct smc_wr_tx_pend, priv); if (pend->idx < link->wr_tx_cnt) { + u32 idx = pend->idx; + /* clear the full struct smc_wr_tx_pend including .priv */ memset(&link->wr_tx_pends[pend->idx], 0, sizeof(link->wr_tx_pends[pend->idx])); memset(&link->wr_tx_bufs[pend->idx], 0, sizeof(link->wr_tx_bufs[pend->idx])); - test_and_clear_bit(pend->idx, link->wr_tx_mask); + test_and_clear_bit(idx, link->wr_tx_mask); return 1; } -- cgit From 5cd8d46ea1562be80063f53c7c6a5f40224de623 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Tue, 20 Nov 2018 13:00:18 -0500 Subject: packet: copy user buffers before orphan or clone tpacket_snd sends packets with user pages linked into skb frags. It notifies that pages can be reused when the skb is released by setting skb->destructor to tpacket_destruct_skb. This can cause data corruption if the skb is orphaned (e.g., on transmit through veth) or cloned (e.g., on mirror to another psock). Create a kernel-private copy of data in these cases, same as tun/tap zerocopy transmission. Reuse that infrastructure: mark the skb as SKBTX_ZEROCOPY_FRAG, which will trigger copy in skb_orphan_frags(_rx). Unlike other zerocopy packets, do not set shinfo destructor_arg to struct ubuf_info. tpacket_destruct_skb already uses that ptr to notify when the original skb is released and a timestamp is recorded. Do not change this timestamp behavior. The ubuf_info->callback is not needed anyway, as no zerocopy notification is expected. Mark destructor_arg as not-a-uarg by setting the lower bit to 1. The resulting value is not a valid ubuf_info pointer, nor a valid tpacket_snd frame address. Add skb_zcopy_.._nouarg helpers for this. The fix relies on features introduced in commit 52267790ef52 ("sock: add MSG_ZEROCOPY"), so can be backported as is only to 4.14. Tested with from `./in_netns.sh ./txring_overwrite` from http://github.com/wdebruij/kerneltools/tests Fixes: 69e3c75f4d54 ("net: TX_RING and packet mmap") Reported-by: Anand H. Krishnan Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/packet/af_packet.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index ec3095f13aae..a74650e98f42 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2394,7 +2394,7 @@ static void tpacket_destruct_skb(struct sk_buff *skb) void *ph; __u32 ts; - ph = skb_shinfo(skb)->destructor_arg; + ph = skb_zcopy_get_nouarg(skb); packet_dec_pending(&po->tx_ring); ts = __packet_set_timestamp(po, ph, skb); @@ -2461,7 +2461,7 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, skb->mark = po->sk.sk_mark; skb->tstamp = sockc->transmit_time; sock_tx_timestamp(&po->sk, sockc->tsflags, &skb_shinfo(skb)->tx_flags); - skb_shinfo(skb)->destructor_arg = ph.raw; + skb_zcopy_set_nouarg(skb, ph.raw); skb_reserve(skb, hlen); skb_reset_network_header(skb); -- cgit From 896585d48e8e9ba44cd1754fbce8537feffcc1a5 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Wed, 21 Nov 2018 21:52:33 +0800 Subject: net/ipv6: re-do dad when interface has IFF_NOARP flag change When we add a new IPv6 address, we should also join corresponding solicited-node multicast address, unless the interface has IFF_NOARP flag, as function addrconf_join_solict() did. But if we remove IFF_NOARP flag later, we do not do dad and add the mcast address. So we will drop corresponding neighbour discovery message that came from other nodes. A typical example is after creating a ipvlan with mode l3, setting up an ipv6 address and changing the mode to l2. Then we will not be able to ping this address as the interface doesn't join related solicited-node mcast address. Fix it by re-doing dad when interface changed IFF_NOARP flag. Then we will add corresponding mcast group and check if there is a duplicate address on the network. Reported-by: Jianlin Shi Reviewed-by: Stefano Brivio Signed-off-by: Hangbin Liu Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 63a808d5af15..045597b9a7c0 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -179,7 +179,7 @@ static void addrconf_dad_start(struct inet6_ifaddr *ifp); static void addrconf_dad_work(struct work_struct *w); static void addrconf_dad_completed(struct inet6_ifaddr *ifp, bool bump_id, bool send_na); -static void addrconf_dad_run(struct inet6_dev *idev); +static void addrconf_dad_run(struct inet6_dev *idev, bool restart); static void addrconf_rs_timer(struct timer_list *t); static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifa); static void ipv6_ifa_notify(int event, struct inet6_ifaddr *ifa); @@ -3439,6 +3439,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct netdev_notifier_change_info *change_info; struct netdev_notifier_changeupper_info *info; struct inet6_dev *idev = __in6_dev_get(dev); struct net *net = dev_net(dev); @@ -3513,7 +3514,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, break; } - if (idev) { + if (!IS_ERR_OR_NULL(idev)) { if (idev->if_flags & IF_READY) { /* device is already configured - * but resend MLD reports, we might @@ -3521,6 +3522,9 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, * multicast snooping switches */ ipv6_mc_up(idev); + change_info = ptr; + if (change_info->flags_changed & IFF_NOARP) + addrconf_dad_run(idev, true); rt6_sync_up(dev, RTNH_F_LINKDOWN); break; } @@ -3555,7 +3559,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, if (!IS_ERR_OR_NULL(idev)) { if (run_pending) - addrconf_dad_run(idev); + addrconf_dad_run(idev, false); /* Device has an address by now */ rt6_sync_up(dev, RTNH_F_DEAD); @@ -4173,16 +4177,19 @@ static void addrconf_dad_completed(struct inet6_ifaddr *ifp, bool bump_id, addrconf_verify_rtnl(); } -static void addrconf_dad_run(struct inet6_dev *idev) +static void addrconf_dad_run(struct inet6_dev *idev, bool restart) { struct inet6_ifaddr *ifp; read_lock_bh(&idev->lock); list_for_each_entry(ifp, &idev->addr_list, if_list) { spin_lock(&ifp->lock); - if (ifp->flags & IFA_F_TENTATIVE && - ifp->state == INET6_IFADDR_STATE_DAD) + if ((ifp->flags & IFA_F_TENTATIVE && + ifp->state == INET6_IFADDR_STATE_DAD) || restart) { + if (restart) + ifp->state = INET6_IFADDR_STATE_PREDAD; addrconf_dad_kick(ifp); + } spin_unlock(&ifp->lock); } read_unlock_bh(&idev->lock); -- cgit From 605108acfe6233b72e2f803aa1cb59a2af3001ca Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 21 Nov 2018 18:21:35 +0100 Subject: net: don't keep lonely packets forever in the gro hash Eric noted that with UDP GRO and NAPI timeout, we could keep a single UDP packet inside the GRO hash forever, if the related NAPI instance calls napi_gro_complete() at an higher frequency than the NAPI timeout. Willem noted that even TCP packets could be trapped there, till the next retransmission. This patch tries to address the issue, flushing the old packets - those with a NAPI_GRO_CB age before the current jiffy - before scheduling the NAPI timeout. The rationale is that such a timeout should be well below a jiffy and we are not flushing packets eligible for sane GRO. v1 -> v2: - clarified the commit message and comment RFC -> v1: - added 'Fixes tags', cleaned-up the wording. Reported-by: Eric Dumazet Fixes: 3b47d30396ba ("net: gro: add a per device gro flush timer") Signed-off-by: Paolo Abeni Acked-by: Willem de Bruijn Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 066aa902d85c..ddc551f24ba2 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5970,11 +5970,14 @@ bool napi_complete_done(struct napi_struct *n, int work_done) if (work_done) timeout = n->dev->gro_flush_timeout; + /* When the NAPI instance uses a timeout and keeps postponing + * it, we need to bound somehow the time packets are kept in + * the GRO layer + */ + napi_gro_flush(n, !!timeout); if (timeout) hrtimer_start(&n->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED); - else - napi_gro_flush(n, false); } if (unlikely(!list_empty(&n->poll_list))) { /* If n->poll_list is not empty, we need to mask irqs */ -- cgit From 484afd1bd3fc6f9f5347289fc8b285aa65f67054 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Wed, 21 Nov 2018 18:23:53 +0100 Subject: net/sched: act_police: add missing spinlock initialization commit f2cbd4852820 ("net/sched: act_police: fix race condition on state variables") introduces a new spinlock, but forgets its initialization. Ensure that tcf_police_init() initializes 'tcfp_lock' every time a 'police' action is newly created, to avoid the following lockdep splat: INFO: trying to register non-static key. the code is fine but needs lockdep annotation. turning off the locking correctness validator. <...> Call Trace: dump_stack+0x85/0xcb register_lock_class+0x581/0x590 __lock_acquire+0xd4/0x1330 ? tcf_police_init+0x2fa/0x650 [act_police] ? lock_acquire+0x9e/0x1a0 lock_acquire+0x9e/0x1a0 ? tcf_police_init+0x2fa/0x650 [act_police] ? tcf_police_init+0x55a/0x650 [act_police] _raw_spin_lock_bh+0x34/0x40 ? tcf_police_init+0x2fa/0x650 [act_police] tcf_police_init+0x2fa/0x650 [act_police] tcf_action_init_1+0x384/0x4c0 tcf_action_init+0xf6/0x160 tcf_action_add+0x73/0x170 tc_ctl_action+0x122/0x160 rtnetlink_rcv_msg+0x2a4/0x490 ? netlink_deliver_tap+0x99/0x400 ? validate_linkmsg+0x370/0x370 netlink_rcv_skb+0x4d/0x130 netlink_unicast+0x196/0x230 netlink_sendmsg+0x2e5/0x3e0 sock_sendmsg+0x36/0x40 ___sys_sendmsg+0x280/0x2f0 ? _raw_spin_unlock+0x24/0x30 ? handle_pte_fault+0xafe/0xf30 ? find_held_lock+0x2d/0x90 ? syscall_trace_enter+0x1df/0x360 ? __sys_sendmsg+0x5e/0xa0 __sys_sendmsg+0x5e/0xa0 do_syscall_64+0x60/0x210 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x7f1841c7cf10 Code: c3 48 8b 05 82 6f 2c 00 f7 db 64 89 18 48 83 cb ff eb dd 0f 1f 80 00 00 00 00 83 3d 8d d0 2c 00 00 75 10 b8 2e 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 31 c3 48 83 ec 08 e8 ae cc 00 00 48 89 04 24 RSP: 002b:00007ffcf9df4d68 EFLAGS: 00000246 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 0000000000000001 RCX: 00007f1841c7cf10 RDX: 0000000000000000 RSI: 00007ffcf9df4dc0 RDI: 0000000000000003 RBP: 000000005bf56105 R08: 0000000000000002 R09: 00007ffcf9df8edc R10: 00007ffcf9df47e0 R11: 0000000000000246 R12: 0000000000671be0 R13: 00007ffcf9df4e84 R14: 0000000000000008 R15: 0000000000000000 Fixes: f2cbd4852820 ("net/sched: act_police: fix race condition on state variables") Reported-by: Cong Wang Signed-off-by: Davide Caratti Acked-by: Cong Wang Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/act_police.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/sched/act_police.c b/net/sched/act_police.c index ee4665a5a022..37c9b8f0e10f 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -124,6 +124,7 @@ static int tcf_police_init(struct net *net, struct nlattr *nla, return ret; } ret = ACT_P_CREATED; + spin_lock_init(&(to_police(*a)->tcfp_lock)); } else if (!ovr) { tcf_idr_release(*a, bind); return -EEXIST; -- cgit