From 49682bfa1e0e448a711471a5db83be0df1fb39a2 Mon Sep 17 00:00:00 2001
From: Mathieu Malaterre
Date: Wed, 31 Oct 2018 13:16:58 +0100
Subject: net: document skb parameter in function 'skb_gso_size_check'

Remove kernel-doc warning:

  net/core/skbuff.c:4953: warning: Function parameter or member 'skb' not described in 'skb_gso_size_check'

Signed-off-by: Mathieu Malaterre
Signed-off-by: David S. Miller
---
 net/core/skbuff.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net/core')

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 946de0e24c87..b4ee5c8b928f 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -4944,6 +4944,8 @@ static unsigned int skb_gso_mac_seglen(const struct sk_buff *skb)
  *
  * This is a helper to do that correctly considering GSO_BY_FRAGS.
  *
+ * @skb: GSO skb
+ *
  * @seg_len: The segmented length (from skb_gso_*_seglen). In the
  *           GSO_BY_FRAGS case this will be [header sizes + GSO_BY_FRAGS].
  *
-- cgit

From fe60faa5063822f2d555f4f326c7dd72a60929bf Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Wed, 31 Oct 2018 08:39:13 -0700
Subject: net: do not abort bulk send on BQL status

Before calling dev_hard_start_xmit(), upper layers tried to cook an
optimal skb list based on the BQL budget. The problem is that GSO
packets can end up consuming more than the BQL budget.

Breaking the loop is not useful, since requeued packets are ahead of
any packets still in the qdisc. It is also more expensive, since the
next TX completion will push these packets later, while the skbs are
no longer in cpu caches. It is also a behavior difference with TSO
packets, which can break the BQL limit by a large amount.

Note that drivers should use __netdev_tx_sent_queue() in order to have
optimal xmit_more support, and avoid useless atomic operations as shown
in the following patch.

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/core/dev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 77d43ae2a7bb..0ffcbdd55fa9 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3272,7 +3272,7 @@ struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *de
 		}

 		skb = next;
-		if (netif_xmit_stopped(txq) && skb) {
+		if (netif_tx_queue_stopped(txq) && skb) {
 			rc = NETDEV_TX_BUSY;
 			break;
 		}
-- cgit
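As a rough illustration of the __netdev_tx_sent_queue() recommendation
above (not taken from any real driver; the foo_* helpers and the
single-queue setup are placeholders), a transmit routine could fold the
BQL accounting into the xmit_more doorbell decision like this:

static netdev_tx_t foo_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct netdev_queue *txq = netdev_get_tx_queue(dev, 0);

	/* hypothetical helper: post the skb descriptors to the HW TX ring */
	foo_post_descriptors(dev, skb);

	/* Account the bytes in BQL and decide whether to ring the doorbell.
	 * With xmit_more set, the doorbell is deferred unless BQL just
	 * stopped the queue.
	 */
	if (__netdev_tx_sent_queue(txq, skb->len, skb->xmit_more))
		foo_ring_tx_doorbell(dev);	/* hypothetical doorbell write */

	return NETDEV_TX_OK;
}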
From 5e1acb4afacc6229946c3d2b7ffc422b53fb2448 Mon Sep 17 00:00:00 2001
From: Alexey Kodanev
Date: Fri, 2 Nov 2018 19:11:04 +0300
Subject: rtnetlink: restore handling of dumpit return value in rtnl_dump_all()

For a non-zero return from dumpit() we should break the loop in
rtnl_dump_all() and return the result. Otherwise, e.g., we could get
the memory leak in inet6_dump_fib() [1]. The pointer to the allocated
struct fib6_walker there (saved in cb->args) can be lost, since
cb->args is reset on the next iteration.

Fix it by partially restoring the previous behavior before commit
c63586dc9b3e ("net: rtnl_dump_all needs to propagate error from dumpit
function"). The returned error from dumpit() is still passed further.

[1]:
unreferenced object 0xffff88001322a200 (size 96):
  comm "sshd", pid 1484, jiffies 4296032768 (age 1432.542s)
  hex dump (first 32 bytes):
    00 01 00 00 00 00 ad de 00 02 00 00 00 00 ad de  ................
    18 09 41 36 00 88 ff ff 18 09 41 36 00 88 ff ff  ..A6......A6....
  backtrace:
    [<0000000095846b39>] kmem_cache_alloc_trace+0x151/0x220
    [<000000007d12709f>] inet6_dump_fib+0x68d/0x940
    [<000000002775a316>] rtnl_dump_all+0x1d9/0x2d0
    [<00000000d7cd302b>] netlink_dump+0x945/0x11a0
    [<000000002f43485f>] __netlink_dump_start+0x55d/0x800
    [<00000000f76bbeec>] rtnetlink_rcv_msg+0x4fa/0xa00
    [<000000009b5761f3>] netlink_rcv_skb+0x29c/0x420
    [<0000000087a1dae1>] rtnetlink_rcv+0x15/0x20
    [<00000000691b703b>] netlink_unicast+0x4e3/0x6c0
    [<00000000b5be0204>] netlink_sendmsg+0x7f2/0xba0
    [<0000000096d2aa60>] sock_sendmsg+0xba/0xf0
    [<000000008c1b786f>] __sys_sendto+0x1e4/0x330
    [<0000000019587b3f>] __x64_sys_sendto+0xe1/0x1a0
    [<00000000071f4d56>] do_syscall_64+0x9f/0x300
    [<000000002737577f>] entry_SYSCALL_64_after_hwframe+0x44/0xa9
    [<0000000057587684>] 0xffffffffffffffff

Fixes: c63586dc9b3e ("net: rtnl_dump_all needs to propagate error from dumpit function")
Signed-off-by: Alexey Kodanev
Signed-off-by: David S. Miller
---
 net/core/rtnetlink.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index e01274bd5e3e..33d9227a8b80 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -3367,7 +3367,7 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb)
 			cb->seq = 0;
 		}
 		ret = dumpit(skb, cb);
-		if (ret < 0)
+		if (ret)
 			break;
 	}
 	cb->family = idx;
-- cgit

From d016b4a3562b745fd9fa387a47d8de62ccd7e241 Mon Sep 17 00:00:00 2001
From: "Matwey V. Kornilov"
Date: Fri, 2 Nov 2018 21:19:36 +0300
Subject: net: core: netpoll: Enable netconsole IPv6 link local address

There is no reason to discard using the source link-local address when
the remote netconsole IPv6 address is set to a link-local one. The
patch allows administrators to use IPv6 netconsole without explicitly
configuring the source address:

  netconsole=@/,@fe80::5054:ff:fe2f:6012/

Signed-off-by: Matwey V. Kornilov
Signed-off-by: David S. Miller
---
 net/core/netpoll.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 5da9552b186b..2b9fdbc43205 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -717,7 +717,8 @@ int netpoll_setup(struct netpoll *np)

 				read_lock_bh(&idev->lock);
 				list_for_each_entry(ifp, &idev->addr_list, if_list) {
-					if (ipv6_addr_type(&ifp->addr) & IPV6_ADDR_LINKLOCAL)
+					if (!!(ipv6_addr_type(&ifp->addr) & IPV6_ADDR_LINKLOCAL) !=
+					    !!(ipv6_addr_type(&np->remote_ip.in6) & IPV6_ADDR_LINKLOCAL))
 						continue;
 					np->local_ip.in6 = ifp->addr;
 					err = 0;
-- cgit
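For reference, the example above follows the netconsole parameter
syntax documented in Documentation/networking/netconsole.txt, which is
roughly:

  netconsole=[+][src-port]@[src-ip]/[<dev>],[tgt-port]@<tgt-ip>/[tgt-macaddr]

With this patch the src-ip field can stay empty even when the target is
an IPv6 link-local address; netpoll then selects a link-local source
address on the outgoing interface to match the scope of the target.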
From c34c1287778b080ed692c0a46a8e345206cc29e6 Mon Sep 17 00:00:00 2001
From: Andrei Vagin
Date: Sun, 4 Nov 2018 22:37:15 -0800
Subject: sock_diag: fix autoloading of the raw_diag module

IPPROTO_RAW isn't registered as an inet protocol, so
inet_protos[protocol] is always NULL for it.

Cc: Cyrill Gorcunov
Cc: Xin Long
Fixes: bf2ae2e4bf93 ("sock_diag: request _diag module only when the family or proto has been registered")
Signed-off-by: Andrei Vagin
Reviewed-by: Cyrill Gorcunov
Signed-off-by: David S. Miller
---
 net/core/sock.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net/core')

diff --git a/net/core/sock.c b/net/core/sock.c
index 6fcc4bc07d19..080a880a1761 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -3279,6 +3279,7 @@ int sock_load_diag_module(int family, int protocol)

 #ifdef CONFIG_INET
 	if (family == AF_INET &&
+	    protocol != IPPROTO_RAW &&
 	    !rcu_access_pointer(inet_protos[protocol]))
 		return -ENOENT;
 #endif
-- cgit

From 62230715fd2453b3ba948c9d83cfb3ada9169169 Mon Sep 17 00:00:00 2001
From: 배석진
Date: Fri, 9 Nov 2018 16:53:06 -0800
Subject: flow_dissector: do not dissect l4 ports for fragments
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Only the first fragment has the sport/dport information, not the
following ones. If we want a consistent hash for all fragments, we need
to ignore the ports even for the first fragment.

This bug is visible for IPv6 traffic, if incoming fragments do not have
a flow label, since skb_get_hash() will give different results for the
first fragment and the following ones.

It is also visible if any routing rule wants dissection and sport or
dport. See commit 5e5d6fed3741 ("ipv6: route: dissect flow in input
path if fib rules need it") for details.

[edumazet] rewrote the changelog completely.

Fixes: 06635a35d13d ("flow_dissect: use programable dissector in skb_flow_dissect and friends")
Signed-off-by: 배석진
Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/core/flow_dissector.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net/core')

diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 676f3ad629f9..588f475019d4 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -1166,8 +1166,8 @@ ip_proto_again:
 		break;
 	}

-	if (dissector_uses_key(flow_dissector,
-			       FLOW_DISSECTOR_KEY_PORTS)) {
+	if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS) &&
+	    !(key_control->flags & FLOW_DIS_IS_FRAGMENT)) {
 		key_ports = skb_flow_dissector_target(flow_dissector,
 						      FLOW_DISSECTOR_KEY_PORTS,
 						      target_container);
-- cgit
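For context, the FLOW_DIS_IS_FRAGMENT flag tested above is set whenever
the dissector sees a fragment, first or not. The IPv4 branch of
__skb_flow_dissect() does roughly the following (abridged sketch, not
the verbatim source; the IPv6 fragment header case is analogous):

	if (ip_is_fragment(iph)) {
		key_control->flags |= FLOW_DIS_IS_FRAGMENT;

		if (iph->frag_off & htons(IP_OFFSET)) {
			/* non-first fragment: no L4 header to parse at all */
			fdret = FLOW_DISSECT_RET_OUT_GOOD;
			break;
		}

		key_control->flags |= FLOW_DIS_FIRST_FRAG;
		if (!(flags & FLOW_DISSECTOR_F_PARSE_1ST_FRAG)) {
			fdret = FLOW_DISSECT_RET_OUT_GOOD;
			break;
		}
	}

Since the flag covers the first fragment as well, the check added above
skips the port keys for every fragment of a datagram, which is what
makes the resulting hash consistent.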
From 33d9a2c72f086cbf1087b2fd2d1a15aa9df14a7f Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Sat, 17 Nov 2018 21:57:02 -0800
Subject: net-gro: reset skb->pkt_type in napi_reuse_skb()

eth_type_trans() assumes the initial value of skb->pkt_type is
PACKET_HOST. This is indeed the value right after a fresh skb
allocation.

However, it is possible that GRO merged a packet with a different
value (like PACKET_OTHERHOST in case macvlan is used), so we need to
make sure napi->skb will have pkt_type set back to PACKET_HOST.

Otherwise, valid packets might be dropped by the stack because their
pkt_type is not PACKET_HOST.

napi_reuse_skb() was added in commit 96e93eab2033 ("gro: Add internal
interfaces for VLAN"), but this bug has always been there.

Fixes: 96e93eab2033 ("gro: Add internal interfaces for VLAN")
Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/core/dev.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 0ffcbdd55fa9..066aa902d85c 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5655,6 +5655,10 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
 	skb->vlan_tci = 0;
 	skb->dev = napi->dev;
 	skb->skb_iif = 0;
+
+	/* eth_type_trans() assumes pkt_type is PACKET_HOST */
+	skb->pkt_type = PACKET_HOST;
+
 	skb->encapsulation = 0;
 	skb_shinfo(skb)->gso_type = 0;
 	skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
-- cgit
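The eth_type_trans() assumption mentioned in the changelog can be seen
in its address classification: pkt_type is only overwritten when the
destination MAC is not the device's own address, so a recycled skb that
still carries PACKET_OTHERHOST would keep it. Abridged sketch of that
logic (not the verbatim source):

	if (unlikely(!ether_addr_equal_64bits(eth->h_dest, dev->dev_addr))) {
		if (unlikely(is_multicast_ether_addr_64bits(eth->h_dest))) {
			if (ether_addr_equal_64bits(eth->h_dest, dev->broadcast))
				skb->pkt_type = PACKET_BROADCAST;
			else
				skb->pkt_type = PACKET_MULTICAST;
		} else {
			skb->pkt_type = PACKET_OTHERHOST;
		}
	}
	/* else: the frame is addressed to us and pkt_type is left alone,
	 * on the assumption that it is already PACKET_HOST
	 */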
From b5dd186d10ba59e6b5ba60e42b3b083df56df6f3 Mon Sep 17 00:00:00 2001
From: Petr Machata
Date: Tue, 20 Nov 2018 11:39:56 +0000
Subject: net: skb_scrub_packet(): Scrub offload_fwd_mark

When a packet is trapped and the corresponding SKB marked as
already-forwarded, it retains this marking even after it is forwarded
across veth links into another bridge. There, since it ingresses the
bridge over veth, which doesn't have offload_fwd_mark, it triggers a
warning in nbp_switchdev_frame_mark().

Then nbp_switchdev_allowed_egress() decides not to allow egress from
this bridge through another veth, because the SKB is already marked,
and the mark (of 0) of course matches. Thus the packet is incorrectly
blocked.

Solve this by resetting offload_fwd_mark in skb_scrub_packet(). That
function is called from tunnels and also from veth, and thus catches
the cases where traffic is forwarded between bridges and transformed
in a way that invalidates the marking.

Fixes: 6bc506b4fb06 ("bridge: switchdev: Add forward mark support for stacked devices")
Fixes: abf4bb6b63d0 ("skbuff: Add the offload_mr_fwd_mark field")
Signed-off-by: Petr Machata
Suggested-by: Ido Schimmel
Acked-by: Jiri Pirko
Signed-off-by: David S. Miller
---
 net/core/skbuff.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'net/core')

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index b4ee5c8b928f..a8217e221e19 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -4854,6 +4854,11 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet)
 	nf_reset(skb);
 	nf_reset_trace(skb);

+#ifdef CONFIG_NET_SWITCHDEV
+	skb->offload_fwd_mark = 0;
+	skb->offload_mr_fwd_mark = 0;
+#endif
+
 	if (!xnet)
 		return;
-- cgit

From 605108acfe6233b72e2f803aa1cb59a2af3001ca Mon Sep 17 00:00:00 2001
From: Paolo Abeni
Date: Wed, 21 Nov 2018 18:21:35 +0100
Subject: net: don't keep lonely packets forever in the gro hash

Eric noted that with UDP GRO and NAPI timeout, we could keep a single
UDP packet inside the GRO hash forever, if the related NAPI instance
calls napi_gro_complete() at a higher frequency than the NAPI timeout.
Willem noted that even TCP packets could be trapped there, till the
next retransmission.

This patch tries to address the issue, flushing the old packets -
those with a NAPI_GRO_CB age before the current jiffy - before
scheduling the NAPI timeout. The rationale is that such a timeout
should be well below a jiffy and we are not flushing packets eligible
for sane GRO.

v1 -> v2:
 - clarified the commit message and comment

RFC -> v1:
 - added 'Fixes' tags, cleaned-up the wording.

Reported-by: Eric Dumazet
Fixes: 3b47d30396ba ("net: gro: add a per device gro flush timer")
Signed-off-by: Paolo Abeni
Acked-by: Willem de Bruijn
Acked-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/core/dev.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 066aa902d85c..ddc551f24ba2 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5970,11 +5970,14 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
 		if (work_done)
 			timeout = n->dev->gro_flush_timeout;

+		/* When the NAPI instance uses a timeout and keeps postponing
+		 * it, we need to bound somehow the time packets are kept in
+		 * the GRO layer
+		 */
+		napi_gro_flush(n, !!timeout);
 		if (timeout)
 			hrtimer_start(&n->timer, ns_to_ktime(timeout),
 				      HRTIMER_MODE_REL_PINNED);
-		else
-			napi_gro_flush(n, false);
 	}
 	if (unlikely(!list_empty(&n->poll_list))) {
 		/* If n->poll_list is not empty, we need to mask irqs */
-- cgit
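For reference, the per-device timeout involved here is the
gro_flush_timeout attribute added by the commit named in the Fixes tag.
It is usually set through sysfs; the device name and value below are
only examples, and the unit is nanoseconds:

  # hold GRO packets for up to 20us before the flush timer fires;
  # writing 0 disables the timer and restores immediate flushing
  echo 20000 > /sys/class/net/eth0/gro_flush_timeout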