From 7aa0045dadb6ef37485ea9f2a7d28278ca588b51 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 26 Oct 2017 18:24:28 -0700 Subject: net_sched: introduce a workqueue for RCU callbacks of tc filter This patch introduces a dedicated workqueue for tc filters so that each tc filter's RCU callback could defer their action destroy work to this workqueue. The helper tcf_queue_work() is introduced for them to use. Because we hold RTNL lock when calling tcf_block_put(), we can not simply flush works inside it, therefore we have to defer it again to this workqueue and make sure all flying RCU callbacks have already queued their work before this one, in other words, to ensure this is the last one to execute to prevent any use-after-free. On the other hand, this makes tcf_block_put() ugly and harder to understand. Since David and Eric strongly dislike adding synchronize_rcu(), this is probably the only solution that could make everyone happy. Please also see the code comments below. Reported-by: Chris Mi Cc: Daniel Borkmann Cc: Jiri Pirko Cc: John Fastabend Cc: Jamal Hadi Salim Cc: "Paul E. McKenney" Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/cls_api.c | 68 +++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 51 insertions(+), 17 deletions(-) (limited to 'net/sched/cls_api.c') diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 0b2219adf520..045d13679ad6 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -77,6 +77,8 @@ out: } EXPORT_SYMBOL(register_tcf_proto_ops); +static struct workqueue_struct *tc_filter_wq; + int unregister_tcf_proto_ops(struct tcf_proto_ops *ops) { struct tcf_proto_ops *t; @@ -86,6 +88,7 @@ int unregister_tcf_proto_ops(struct tcf_proto_ops *ops) * tcf_proto_ops's destroy() handler. 
*/ rcu_barrier(); + flush_workqueue(tc_filter_wq); write_lock(&cls_mod_lock); list_for_each_entry(t, &tcf_proto_base, head) { @@ -100,6 +103,12 @@ int unregister_tcf_proto_ops(struct tcf_proto_ops *ops) } EXPORT_SYMBOL(unregister_tcf_proto_ops); +bool tcf_queue_work(struct work_struct *work) +{ + return queue_work(tc_filter_wq, work); +} +EXPORT_SYMBOL(tcf_queue_work); + /* Select new prio value from the range, managed by kernel. */ static inline u32 tcf_auto_prio(struct tcf_proto *tp) @@ -266,23 +275,30 @@ err_chain_create: } EXPORT_SYMBOL(tcf_block_get); -void tcf_block_put(struct tcf_block *block) +static void tcf_block_put_final(struct work_struct *work) { + struct tcf_block *block = container_of(work, struct tcf_block, work); struct tcf_chain *chain, *tmp; - if (!block) - return; - - /* XXX: Standalone actions are not allowed to jump to any chain, and - * bound actions should be all removed after flushing. However, - * filters are destroyed in RCU callbacks, we have to hold the chains - * first, otherwise we would always race with RCU callbacks on this list - * without proper locking. - */ + /* At this point, all the chains should have refcnt == 1. */ + rtnl_lock(); + list_for_each_entry_safe(chain, tmp, &block->chain_list, list) + tcf_chain_put(chain); + rtnl_unlock(); + kfree(block); +} - /* Wait for existing RCU callbacks to cool down. */ - rcu_barrier(); +/* XXX: Standalone actions are not allowed to jump to any chain, and bound + * actions should be all removed after flushing. However, filters are destroyed + * in RCU callbacks, we have to hold the chains first, otherwise we would + * always race with RCU callbacks on this list without proper locking. + */ +static void tcf_block_put_deferred(struct work_struct *work) +{ + struct tcf_block *block = container_of(work, struct tcf_block, work); + struct tcf_chain *chain; + rtnl_lock(); /* Hold a refcnt for all chains, except 0, in case they are gone. 
*/ list_for_each_entry(chain, &block->chain_list, list) if (chain->index) @@ -292,13 +308,27 @@ void tcf_block_put(struct tcf_block *block) list_for_each_entry(chain, &block->chain_list, list) tcf_chain_flush(chain); - /* Wait for RCU callbacks to release the reference count. */ + INIT_WORK(&block->work, tcf_block_put_final); + /* Wait for RCU callbacks to release the reference count and make + * sure their works have been queued before this. + */ rcu_barrier(); + tcf_queue_work(&block->work); + rtnl_unlock(); +} - /* At this point, all the chains should have refcnt == 1. */ - list_for_each_entry_safe(chain, tmp, &block->chain_list, list) - tcf_chain_put(chain); - kfree(block); +void tcf_block_put(struct tcf_block *block) +{ + if (!block) + return; + + INIT_WORK(&block->work, tcf_block_put_deferred); + /* Wait for existing RCU callbacks to cool down, make sure their works + * have been queued before this. We can not flush pending works here + * because we are holding the RTNL lock. + */ + rcu_barrier(); + tcf_queue_work(&block->work); } EXPORT_SYMBOL(tcf_block_put); @@ -1030,6 +1060,10 @@ EXPORT_SYMBOL(tcf_exts_get_dev); static int __init tc_filter_init(void) { + tc_filter_wq = alloc_ordered_workqueue("tc_filter_workqueue", 0); + if (!tc_filter_wq) + return -ENOMEM; + rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_ctl_tfilter, NULL, 0); rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_ctl_tfilter, NULL, 0); rtnl_register(PF_UNSPEC, RTM_GETTFILTER, tc_ctl_tfilter, -- cgit From 2d132eba1d972ea6c0e47286e4c821b4a3c5b84d Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 26 Oct 2017 18:24:40 -0700 Subject: net_sched: add rtnl assertion to tcf_exts_destroy() After previous patches, it is now safe to claim that tcf_exts_destroy() is always called with RTNL lock. Cc: Daniel Borkmann Cc: Jiri Pirko Cc: John Fastabend Cc: Jamal Hadi Salim Cc: "Paul E. McKenney" Signed-off-by: Cong Wang Signed-off-by: David S. 
Miller --- net/sched/cls_api.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/sched/cls_api.c') diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 045d13679ad6..231181c602ed 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -909,6 +909,7 @@ void tcf_exts_destroy(struct tcf_exts *exts) #ifdef CONFIG_NET_CLS_ACT LIST_HEAD(actions); + ASSERT_RTNL(); tcf_exts_to_list(exts, &actions); tcf_action_destroy(&actions, TCA_ACT_UNBIND); kfree(exts->actions); -- cgit From 822e86d997e4d8f942818ea6ac1711f59a66ebef Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 30 Oct 2017 11:10:09 -0700 Subject: net_sched: remove tcf_block_put_deferred() In commit 7aa0045dadb6 ("net_sched: introduce a workqueue for RCU callbacks of tc filter") I defer tcf_chain_flush() to a workqueue, this causes a use-after-free because qdisc is already destroyed after we queue this work. The tcf_block_put_deferred() is no longer necessary after we get RTNL for each tc filter destroy work, no others could jump in at this point. Same for tcf_chain_hold(), we are fully serialized now. This also reduces one indirection therefore makes the code more readable. Note this brings back a rcu_barrier(), however comparing to the code prior to commit 7aa0045dadb6 we still reduced one rcu_barrier(). For net-next, we can consider to refcnt tcf block to avoid it. Fixes: 7aa0045dadb6 ("net_sched: introduce a workqueue for RCU callbacks of tc filter") Cc: Daniel Borkmann Cc: Jiri Pirko Cc: John Fastabend Cc: Jamal Hadi Salim Cc: "Paul E. McKenney" Cc: Eric Dumazet Signed-off-by: Cong Wang Signed-off-by: David S. 
Miller --- net/sched/cls_api.c | 37 ++++++++----------------------------- 1 file changed, 8 insertions(+), 29 deletions(-) (limited to 'net/sched/cls_api.c') diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 231181c602ed..b2d310745487 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -280,8 +280,8 @@ static void tcf_block_put_final(struct work_struct *work) struct tcf_block *block = container_of(work, struct tcf_block, work); struct tcf_chain *chain, *tmp; - /* At this point, all the chains should have refcnt == 1. */ rtnl_lock(); + /* Only chain 0 should be still here. */ list_for_each_entry_safe(chain, tmp, &block->chain_list, list) tcf_chain_put(chain); rtnl_unlock(); @@ -289,23 +289,17 @@ static void tcf_block_put_final(struct work_struct *work) } /* XXX: Standalone actions are not allowed to jump to any chain, and bound - * actions should be all removed after flushing. However, filters are destroyed - * in RCU callbacks, we have to hold the chains first, otherwise we would - * always race with RCU callbacks on this list without proper locking. + * actions should be all removed after flushing. However, filters are now + * destroyed in tc filter workqueue with RTNL lock, they can not race here. */ -static void tcf_block_put_deferred(struct work_struct *work) +void tcf_block_put(struct tcf_block *block) { - struct tcf_block *block = container_of(work, struct tcf_block, work); - struct tcf_chain *chain; + struct tcf_chain *chain, *tmp; - rtnl_lock(); - /* Hold a refcnt for all chains, except 0, in case they are gone. */ - list_for_each_entry(chain, &block->chain_list, list) - if (chain->index) - tcf_chain_hold(chain); + if (!block) + return; - /* No race on the list, because no chain could be destroyed. 
*/ - list_for_each_entry(chain, &block->chain_list, list) + list_for_each_entry_safe(chain, tmp, &block->chain_list, list) tcf_chain_flush(chain); INIT_WORK(&block->work, tcf_block_put_final); @@ -314,21 +308,6 @@ static void tcf_block_put_deferred(struct work_struct *work) */ rcu_barrier(); tcf_queue_work(&block->work); - rtnl_unlock(); -} - -void tcf_block_put(struct tcf_block *block) -{ - if (!block) - return; - - INIT_WORK(&block->work, tcf_block_put_deferred); - /* Wait for existing RCU callbacks to cool down, make sure their works - * have been queued before this. We can not flush pending works here - * because we are holding the RTNL lock. - */ - rcu_barrier(); - tcf_queue_work(&block->work); } EXPORT_SYMBOL(tcf_block_put); -- cgit From e4b95c41df36befcfd117210900cd790bc2cd048 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 6 Nov 2017 13:47:19 -0800 Subject: net_sched: introduce tcf_exts_get_net() and tcf_exts_put_net() Instead of holding netns refcnt in tc actions, we can minimize the holding time by saving it in struct tcf_exts instead. This means we can just hold netns refcnt right before call_rcu() and release it after tcf_exts_destroy() is done. However, because on netns cleanup path we call tcf_proto_destroy() too, obviously we can not hold netns for a zero refcnt, in this case we have to do cleanup synchronously. It is fine for RCU too, the caller cleanup_net() already waits for a grace period. For other cases, refcnt is non-zero and we can safely grab it as normal and release it after we are done. This patch provides two new API for each filter to use: tcf_exts_get_net() and tcf_exts_put_net(). 
And all filters now can use the following pattern:

	void __destroy_filter() {
		tcf_exts_destroy();
		tcf_exts_put_net();  // <== release netns refcnt
		kfree();
	}
	void some_work() {
		rtnl_lock();
		__destroy_filter();
		rtnl_unlock();
	}
	void some_rcu_callback() {
		tcf_queue_work(some_work);
	}

	if (tcf_exts_get_net())  // <== hold netns refcnt
		call_rcu(some_rcu_callback);
	else
		__destroy_filter();

Cc: Lucas Bates
Cc: Jamal Hadi Salim
Cc: Jiri Pirko
Signed-off-by: Cong Wang
Signed-off-by: David S. Miller
---
 net/sched/cls_api.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net/sched/cls_api.c')

diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index b2d310745487..ecbb019efcbd 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -927,6 +927,7 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb,
 			exts->actions[i++] = act;
 			exts->nr_actions = i;
 		}
+		exts->net = net;
 	}
 #else
 	if ((exts->action && tb[exts->action]) ||
-- cgit