From 8d50b53d66a8a6ae41bafbdcabe401467803f33a Mon Sep 17 00:00:00 2001
From: "David S. Miller"
Date: Wed, 30 Jul 2008 02:37:46 -0700
Subject: pkt_sched: Fix OOPS on ingress qdisc add.

Bug report from Steven Jan Springl. Issuing the following command
causes a kernel oops:

	tc qdisc add dev eth0 handle ffff: ingress

The problem mostly stems from all of the special case handling of
ingress qdiscs.

So, to fix this, do the grafting operation the same way we do for TX
qdiscs.  Which means that dev_activate() and dev_deactivate() now do
the "qdisc_sleeping <--> qdisc" transitions on dev->rx_queue too.

Future simplifications are possible now, mainly because it is
impossible for dev_queue->{qdisc,qdisc_sleeping} to be NULL.  There
are NULL checks all over to handle the ingress qdisc special case
that used to exist before this commit.

Signed-off-by: David S. Miller
---
 net/core/dev.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index 8d13a9b9f1df..63d6bcddbf46 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2100,7 +2100,7 @@ static int ing_filter(struct sk_buff *skb)
 	rxq = &dev->rx_queue;
 
 	q = rxq->qdisc;
-	if (q) {
+	if (q != &noop_qdisc) {
 		spin_lock(qdisc_lock(q));
 		result = qdisc_enqueue_root(skb, q);
 		spin_unlock(qdisc_lock(q));
@@ -2113,7 +2113,7 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb,
 					 struct packet_type **pt_prev,
 					 int *ret, struct net_device *orig_dev)
 {
-	if (!skb->dev->rx_queue.qdisc)
+	if (skb->dev->rx_queue.qdisc == &noop_qdisc)
 		goto out;
 
 	if (*pt_prev) {
--
cgit

From c3f26a269c2421f97f10cf8ed05d5099b573af4d Mon Sep 17 00:00:00 2001
From: "David S. Miller"
Date: Thu, 31 Jul 2008 16:58:50 -0700
Subject: netdev: Fix lockdep warnings in multiqueue configurations.

When support for multiple TX queues was added, the netif_tx_lock()
routines were converted to iterate over all TX queues and grab each
queue's spinlock.  This causes heartburn for lockdep, and it is not a
healthy thing to do with lots of TX queues anyway.

So modify this to use a top-level lock and a "frozen" state for the
individual TX queues.

Signed-off-by: David S. Miller
---
 net/core/dev.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index 63d6bcddbf46..69320a56a084 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4200,6 +4200,7 @@ static void netdev_init_queues(struct net_device *dev)
 {
 	netdev_init_one_queue(dev, &dev->rx_queue, NULL);
 	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
+	spin_lock_init(&dev->tx_global_lock);
 }
 
 /**
--
cgit

From 5fb662297b8a4bdadd60371c34b760efca948ebc Mon Sep 17 00:00:00 2001
From: "David S. Miller"
Date: Sat, 2 Aug 2008 20:02:43 -0700
Subject: pkt_sched: Use qdisc_lock() on already sampled root qdisc.

Based upon a bug report by Jeff Kirsher.

Don't use qdisc_root_lock() in these cases, as the root qdisc could
have been changed and we'd thus lock the wrong object.

Tested by Emil S Tantilov, who confirms that this seems to fix the
problem.

Signed-off-by: David S. Miller
---
 net/core/dev.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index 69320a56a084..da7acacf02b5 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1796,7 +1796,7 @@ gso:
 	skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_EGRESS);
 #endif
 	if (q->enqueue) {
-		spinlock_t *root_lock = qdisc_root_lock(q);
+		spinlock_t *root_lock = qdisc_lock(q);
 
 		spin_lock(root_lock);
@@ -1995,7 +1995,7 @@ static void net_tx_action(struct softirq_action *h)
 			smp_mb__before_clear_bit();
 			clear_bit(__QDISC_STATE_SCHED, &q->state);
 
-			root_lock = qdisc_root_lock(q);
+			root_lock = qdisc_lock(q);
 			if (spin_trylock(root_lock)) {
 				qdisc_run(q);
 				spin_unlock(root_lock);
--
cgit
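The last fix above is easier to follow next to the two helpers involved.
A rough sketch of their definitions from include/net/sch_generic.h of
this era follows (illustrative, not verbatim): qdisc_lock() returns the
lock embedded in the qdisc you already hold a pointer to, while
qdisc_root_lock() re-resolves the root qdisc at call time, which is
exactly what goes wrong once the root may have been replaced.

/* Illustrative sketch (not verbatim) of the helpers from
 * include/net/sch_generic.h around 2.6.27.
 */
static inline spinlock_t *qdisc_lock(struct Qdisc *qdisc)
{
	/* Lock embedded in the qdisc we already sampled. */
	return &qdisc->q.lock;
}

static inline struct Qdisc *qdisc_root(struct Qdisc *qdisc)
{
	/* Re-reads the queue's current root qdisc pointer, which may
	 * no longer be the qdisc we sampled earlier.
	 */
	return qdisc->dev_queue->qdisc;
}

static inline spinlock_t *qdisc_root_lock(struct Qdisc *qdisc)
{
	return qdisc_lock(qdisc_root(qdisc));
}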
From e5a4a72d4f88f4389e9340d383ca67031d1b8536 Mon Sep 17 00:00:00 2001
From: Lennert Buytenhek
Date: Sun, 3 Aug 2008 01:23:10 -0700
Subject: net: use software GSO for SG+CSUM capable netdevices

If a netdevice does not support hardware GSO, allowing the stack to
use GSO anyway and then splitting the GSO skb into MSS-sized pieces
as it is handed to the netdevice for transmitting is likely still a
win as far as throughput and/or CPU usage are concerned, since it
reduces the number of trips through the output path.

This patch enables the use of GSO on any netdevice that supports SG.
If a GSO skb is then sent to a netdevice that supports SG but does
not support hardware GSO, net/core/dev.c:dev_hard_start_xmit() will
take care of doing the necessary GSO segmentation in software.

Signed-off-by: Lennert Buytenhek
Signed-off-by: Herbert Xu
Signed-off-by: David S. Miller
---
 net/core/dev.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index da7acacf02b5..cbf80098980c 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3988,6 +3988,10 @@ int register_netdevice(struct net_device *dev)
 		}
 	}
 
+	/* Enable software GSO if SG is supported. */
+	if (dev->features & NETIF_F_SG)
+		dev->features |= NETIF_F_GSO;
+
 	netdev_initialize_kobject(dev);
 	ret = netdev_register_kobject(dev);
 	if (ret)
--
cgit

From 6e583ce5242f32e925dcb198f7123256d0798370 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger
Date: Sun, 3 Aug 2008 21:29:57 -0700
Subject: net: eliminate refcounting in backlog queue

Avoid the overhead of atomic increment/decrement on each received
packet.  This helps the performance of non-NAPI devices (like
loopback).

Use a cleanup function to walk the queue on each CPU and clean out
any leftover packets.

Signed-off-by: Stephen Hemminger
Signed-off-by: David S. Miller
---
 net/core/dev.c | 23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index cbf80098980c..fc6c9881eca8 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1909,7 +1909,6 @@ int netif_rx(struct sk_buff *skb)
 	if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
 		if (queue->input_pkt_queue.qlen) {
 enqueue:
-			dev_hold(skb->dev);
 			__skb_queue_tail(&queue->input_pkt_queue, skb);
 			local_irq_restore(flags);
 			return NET_RX_SUCCESS;
@@ -2270,6 +2269,20 @@ out:
 	return ret;
 }
 
+/* Network device is going away, flush any packets still pending */
+static void flush_backlog(void *arg)
+{
+	struct net_device *dev = arg;
+	struct softnet_data *queue = &__get_cpu_var(softnet_data);
+	struct sk_buff *skb, *tmp;
+
+	skb_queue_walk_safe(&queue->input_pkt_queue, skb, tmp)
+		if (skb->dev == dev) {
+			__skb_unlink(skb, &queue->input_pkt_queue);
+			kfree_skb(skb);
+		}
+}
+
 static int process_backlog(struct napi_struct *napi, int quota)
 {
 	int work = 0;
@@ -2279,7 +2292,6 @@ static int process_backlog(struct napi_struct *napi, int quota)
 	napi->weight = weight_p;
 	do {
 		struct sk_buff *skb;
-		struct net_device *dev;
 
 		local_irq_disable();
 		skb = __skb_dequeue(&queue->input_pkt_queue);
@@ -2288,14 +2300,9 @@ static int process_backlog(struct napi_struct *napi, int quota)
 			local_irq_enable();
 			break;
 		}
-		local_irq_enable();
-
-		dev = skb->dev;
-
+		local_irq_enable();
 		netif_receive_skb(skb);
-
-		dev_put(dev);
 	} while (++work < quota && jiffies == start_time);
 
 	return work;
@@ -4169,6 +4176,8 @@ void netdev_run_todo(void)
 
 		dev->reg_state = NETREG_UNREGISTERED;
 
+		on_each_cpu(flush_backlog, dev, 1);
+
 		netdev_wait_allrefs(dev);
 
 		/* paranoia */
--
cgit
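For the software-GSO change two commits up, the interesting machinery
is already in dev_hard_start_xmit(): when a GSO skb reaches a device
without hardware GSO, it is segmented in software and the segments are
transmitted one at a time. A condensed, hedged sketch of that path
follows (simplified from net/core/dev.c of this era; error unwinding
and the DEV_GSO_CB bookkeeping are glossed over):

/* Condensed sketch of the dev_hard_start_xmit() GSO fallback of this
 * era; simplified, not the verbatim kernel code.
 */
static int hard_start_xmit_sketch(struct sk_buff *skb,
				  struct net_device *dev)
{
	if (netif_needs_gso(dev, skb)) {
		/* Software GSO: split skb into a list of MSS-sized
		 * segments chained via skb->next.
		 */
		if (unlikely(dev_gso_segment(skb)))
			goto out_kfree_skb;
		if (skb->next)
			goto gso;
	}
	return dev->hard_start_xmit(skb, dev);

gso:
	do {
		struct sk_buff *nskb = skb->next;
		int rc;

		skb->next = nskb->next;
		nskb->next = NULL;
		rc = dev->hard_start_xmit(nskb, dev);
		if (unlikely(rc)) {
			/* Put the segment back; caller will requeue. */
			nskb->next = skb->next;
			skb->next = nskb;
			return rc;
		}
	} while (skb->next);

out_kfree_skb:
	kfree_skb(skb);	/* the now-empty GSO shell */
	return 0;
}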
From c27f339af90bb874a7a9c680b17abfd32d4a727b Mon Sep 17 00:00:00 2001
From: Jarek Poplawski
Date: Mon, 4 Aug 2008 22:39:11 -0700
Subject: net_sched: Add qdisc __NET_XMIT_BYPASS flag

Patrick McHardy noticed that it would be nicer to handle
NET_XMIT_BYPASS as NET_XMIT_SUCCESS, using an internal qdisc flag
__NET_XMIT_BYPASS, and to remove the mapping from dev_queue_xmit().

David Miller spotted a serious bug in the first version of this
patch.

Signed-off-by: Jarek Poplawski
Signed-off-by: David S. Miller
---
 net/core/dev.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index fc6c9881eca8..01993ad74e76 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1805,7 +1805,6 @@ gso:
 
 		spin_unlock(root_lock);
 
-		rc = rc == NET_XMIT_BYPASS ? NET_XMIT_SUCCESS : rc;
 		goto out;
 	}
--
cgit

From cc9bd5cebc0825e0fabc0186ab85806a0891104f Mon Sep 17 00:00:00 2001
From: Joe Eykholt
Date: Wed, 2 Jul 2008 18:22:00 -0700
Subject: net/core: Uninline skb_bond().

Otherwise, subsequent changes would need multiple return values.

Signed-off-by: Joe Eykholt
Signed-off-by: Jay Vosburgh
Signed-off-by: Jeff Garzik
---
 net/core/dev.c | 28 ++++++++--------------------
 1 file changed, 8 insertions(+), 20 deletions(-)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index 01993ad74e76..4a09833331f1 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1939,22 +1939,6 @@ int netif_rx_ni(struct sk_buff *skb)
 
 EXPORT_SYMBOL(netif_rx_ni);
 
-static inline struct net_device *skb_bond(struct sk_buff *skb)
-{
-	struct net_device *dev = skb->dev;
-
-	if (dev->master) {
-		if (skb_bond_should_drop(skb)) {
-			kfree_skb(skb);
-			return NULL;
-		}
-		skb->dev = dev->master;
-	}
-
-	return dev;
-}
-
-
 static void net_tx_action(struct softirq_action *h)
 {
 	struct softnet_data *sd = &__get_cpu_var(softnet_data);
@@ -2194,10 +2178,14 @@ int netif_receive_skb(struct sk_buff *skb)
 	if (!skb->iif)
 		skb->iif = skb->dev->ifindex;
 
-	orig_dev = skb_bond(skb);
-
-	if (!orig_dev)
-		return NET_RX_DROP;
+	orig_dev = skb->dev;
+	if (orig_dev->master) {
+		if (skb_bond_should_drop(skb)) {
+			kfree_skb(skb);
+			return NET_RX_DROP;
+		}
+		skb->dev = orig_dev->master;
+	}
 
 	__get_cpu_var(netdev_rx_stat).total++;
--
cgit
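The open-coded test above, like the old skb_bond(), defers the policy
decision to skb_bond_should_drop(). Roughly (a hedged sketch of the
helper from include/linux/netdevice.h of this era, with the flag
handling simplified), a frame is dropped when its slave is inactive
unless the bonding mode needs to see it:

/* Hedged sketch of skb_bond_should_drop() from this era; simplified,
 * not verbatim.  Drop frames that arrived on an inactive bonding
 * slave, except for traffic the bonding mode itself must see.
 */
static inline int skb_bond_should_drop_sketch(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	struct net_device *master = dev->master;

	if (master && (dev->priv_flags & IFF_SLAVE_INACTIVE)) {
		/* ALB mode: unicast to an inactive slave is still valid. */
		if ((master->priv_flags & IFF_MASTER_ALB) &&
		    skb->pkt_type != PACKET_BROADCAST &&
		    skb->pkt_type != PACKET_MULTICAST)
			return 0;
		/* 802.3ad: LACPDUs (ETH_P_SLOW) must always get through. */
		if ((master->priv_flags & IFF_MASTER_8023AD) &&
		    skb->protocol == htons(ETH_P_SLOW))
			return 0;
		return 1;
	}
	return 0;
}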
From 0d7a3681232f545c6a59f77e60f7667673ef0e93 Mon Sep 17 00:00:00 2001
From: Joe Eykholt
Date: Wed, 2 Jul 2008 18:22:01 -0700
Subject: net/core: Allow certain receives on inactive slave.

Allow a packet_type that specifies an exact device to receive frames
even on an inactive bonding slave device.  This is important for some
L2 protocols, such as LLDP and FCoE.  This can eventually be used for
the bonding special cases as well.

Signed-off-by: Joe Eykholt
Signed-off-by: Jay Vosburgh
Signed-off-by: Jeff Garzik
---
 net/core/dev.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index 4a09833331f1..dab97c7cf275 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2165,6 +2165,7 @@ int netif_receive_skb(struct sk_buff *skb)
 {
 	struct packet_type *ptype, *pt_prev;
 	struct net_device *orig_dev;
+	struct net_device *null_or_orig;
 	int ret = NET_RX_DROP;
 	__be16 type;
 
@@ -2178,13 +2179,13 @@ int netif_receive_skb(struct sk_buff *skb)
 	if (!skb->iif)
 		skb->iif = skb->dev->ifindex;
 
+	null_or_orig = NULL;
 	orig_dev = skb->dev;
 	if (orig_dev->master) {
-		if (skb_bond_should_drop(skb)) {
-			kfree_skb(skb);
-			return NET_RX_DROP;
-		}
-		skb->dev = orig_dev->master;
+		if (skb_bond_should_drop(skb))
+			null_or_orig = orig_dev; /* deliver only exact match */
+		else
+			skb->dev = orig_dev->master;
 	}
 
 	__get_cpu_var(netdev_rx_stat).total++;
@@ -2209,7 +2210,7 @@ int netif_receive_skb(struct sk_buff *skb)
 #endif
 
 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
-		if (!ptype->dev || ptype->dev == skb->dev) {
+		if (ptype->dev == null_or_orig || ptype->dev == skb->dev) {
 			if (pt_prev)
 				ret = deliver_skb(skb, pt_prev, orig_dev);
 			pt_prev = ptype;
@@ -2234,7 +2235,7 @@ ncls:
 	list_for_each_entry_rcu(ptype,
 			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
 		if (ptype->type == type &&
-		    (!ptype->dev || ptype->dev == skb->dev)) {
+		    (ptype->dev == null_or_orig || ptype->dev == skb->dev)) {
 			if (pt_prev)
 				ret = deliver_skb(skb, pt_prev, orig_dev);
 			pt_prev = ptype;
--
cgit
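The null_or_orig trick above is compact but subtle. Normally
null_or_orig stays NULL, so the first test matches wildcard handlers
(ptype->dev == NULL); in the should-drop case it is set to the slave
device, which both disables the wildcard match and enables an exact
match on the slave. A hypothetical helper (not in the kernel tree)
that spells the predicate out:

/* Hypothetical helper, not in the kernel tree: the delivery test that
 * netif_receive_skb() open-codes after this patch.
 */
static bool ptype_wants_skb(const struct packet_type *ptype,
			    const struct sk_buff *skb,
			    const struct net_device *null_or_orig)
{
	/* null_or_orig == NULL  => wildcard handlers match too;
	 * null_or_orig == slave => only handlers bound to that slave.
	 */
	return ptype->dev == null_or_orig || ptype->dev == skb->dev;
}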
From f982307f22db96201e41540295f24e8dcc10c78f Mon Sep 17 00:00:00 2001
From: Joe Eykholt
Date: Wed, 2 Jul 2008 18:22:02 -0700
Subject: net/core: Allow receive on active slaves.

If a packet_type specifies an active slave to bonding (and not just
any interface), allow it to receive frames that came in on that
interface.

Signed-off-by: Joe Eykholt
Signed-off-by: Jay Vosburgh
Signed-off-by: Jeff Garzik
---
 net/core/dev.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index dab97c7cf275..600bb23c4c2e 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2210,7 +2210,8 @@ int netif_receive_skb(struct sk_buff *skb)
 #endif
 
 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
-		if (ptype->dev == null_or_orig || ptype->dev == skb->dev) {
+		if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
+		    ptype->dev == orig_dev) {
 			if (pt_prev)
 				ret = deliver_skb(skb, pt_prev, orig_dev);
 			pt_prev = ptype;
@@ -2235,7 +2236,8 @@ ncls:
 	list_for_each_entry_rcu(ptype,
 			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
 		if (ptype->type == type &&
-		    (ptype->dev == null_or_orig || ptype->dev == skb->dev)) {
+		    (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
+		     ptype->dev == orig_dev)) {
 			if (pt_prev)
 				ret = deliver_skb(skb, pt_prev, orig_dev);
 			pt_prev = ptype;
--
cgit
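An exact-match handler of the kind these last two patches cater to
would be registered along the following lines (a hypothetical sketch:
the function names and the raw 0x8906 FCoE ethertype are illustrative,
not taken from the patches above):

/* Hypothetical registration sketch for an exact-device packet_type,
 * the pattern used by FCoE/LLDP-style L2 protocols.  With the two
 * patches above, this handler sees frames from its bound interface
 * even when that interface is a bonding slave.
 */
static int fcoe_rcv_sketch(struct sk_buff *skb, struct net_device *dev,
			   struct packet_type *ptype,
			   struct net_device *orig_dev)
{
	/* Real protocol processing would go here. */
	kfree_skb(skb);
	return 0;
}

static struct packet_type fcoe_ptype_sketch = {
	.type = __constant_htons(0x8906),	/* FCoE ethertype */
	.func = fcoe_rcv_sketch,
};

static void fcoe_attach_sketch(struct net_device *netdev)
{
	/* Binding .dev makes this an exact-match handler: frames are
	 * delivered only for this device, even on a bonding slave.
	 */
	fcoe_ptype_sketch.dev = netdev;
	dev_add_pack(&fcoe_ptype_sketch);
}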