summaryrefslogtreecommitdiff
path: root/net/sched
diff options
context:
space:
mode:
Diffstat (limited to 'net/sched')
-rw-r--r--net/sched/Kconfig14
-rw-r--r--net/sched/Makefile1
-rw-r--r--net/sched/act_api.c16
-rw-r--r--net/sched/act_mirred.c28
-rw-r--r--net/sched/bpf_qdisc.c475
-rw-r--r--net/sched/cls_api.c66
-rw-r--r--net/sched/sch_api.c11
-rw-r--r--net/sched/sch_codel.c7
-rw-r--r--net/sched/sch_drr.c16
-rw-r--r--net/sched/sch_ets.c17
-rw-r--r--net/sched/sch_fq.c2
-rw-r--r--net/sched/sch_fq_codel.c8
-rw-r--r--net/sched/sch_fq_pie.c2
-rw-r--r--net/sched/sch_frag.c10
-rw-r--r--net/sched/sch_generic.c7
-rw-r--r--net/sched/sch_hfsc.c46
-rw-r--r--net/sched/sch_hhf.c2
-rw-r--r--net/sched/sch_htb.c13
-rw-r--r--net/sched/sch_pie.c2
-rw-r--r--net/sched/sch_qfq.c18
-rw-r--r--net/sched/sch_sfq.c66
21 files changed, 723 insertions, 104 deletions
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 8180d0c12fce..ad914d2b2e22 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -403,6 +403,18 @@ config NET_SCH_ETS
If unsure, say N.
+config NET_SCH_BPF
+ bool "BPF-based Qdisc"
+ depends on BPF_SYSCALL && BPF_JIT && DEBUG_INFO_BTF
+ help
+ This option allows BPF-based queueing disiplines. With BPF struct_ops,
+ users can implement supported operators in Qdisc_ops using BPF programs.
+ The queue holding skb can be built with BPF maps or graphs.
+
+ Say Y here if you want to use BPF-based Qdisc.
+
+ If unsure, say N.
+
menuconfig NET_SCH_DEFAULT
bool "Allow override default queue discipline"
help
@@ -784,7 +796,7 @@ config NET_ACT_SKBEDIT
config NET_ACT_CSUM
tristate "Checksum Updating"
depends on NET_CLS_ACT && INET
- select LIBCRC32C
+ select NET_CRC32C
help
Say Y here to update some common checksum after some direct
packet alterations.
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 82c3f78ca486..904d784902d1 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -62,6 +62,7 @@ obj-$(CONFIG_NET_SCH_FQ_PIE) += sch_fq_pie.o
obj-$(CONFIG_NET_SCH_CBS) += sch_cbs.o
obj-$(CONFIG_NET_SCH_ETF) += sch_etf.o
obj-$(CONFIG_NET_SCH_TAPRIO) += sch_taprio.o
+obj-$(CONFIG_NET_SCH_BPF) += bpf_qdisc.o
obj-$(CONFIG_NET_CLS_U32) += cls_u32.o
obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 839790043256..057e20cef375 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -1461,17 +1461,29 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla,
struct netlink_ext_ack *extack)
{
struct tc_action_ops *ops[TCA_ACT_MAX_PRIO] = {};
- struct nlattr *tb[TCA_ACT_MAX_PRIO + 1];
+ struct nlattr *tb[TCA_ACT_MAX_PRIO + 2];
struct tc_action *act;
size_t sz = 0;
int err;
int i;
- err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX_PRIO, nla, NULL,
+ err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX_PRIO + 1, nla, NULL,
extack);
if (err < 0)
return err;
+ /* The nested attributes are parsed as types, but they are really an
+ * array of actions. So we parse one more than we can handle, and return
+ * an error if the last one is set (as that indicates that the request
+ * contained more than the maximum number of actions).
+ */
+ if (tb[TCA_ACT_MAX_PRIO + 1]) {
+ NL_SET_ERR_MSG_FMT(extack,
+ "Only %d actions supported per filter",
+ TCA_ACT_MAX_PRIO);
+ return -EINVAL;
+ }
+
for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) {
struct tc_action_ops *a_o;
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 5b3814365924..5f01f567c934 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -30,7 +30,29 @@ static LIST_HEAD(mirred_list);
static DEFINE_SPINLOCK(mirred_list_lock);
#define MIRRED_NEST_LIMIT 4
-static DEFINE_PER_CPU(unsigned int, mirred_nest_level);
+
+#ifndef CONFIG_PREEMPT_RT
+static u8 tcf_mirred_nest_level_inc_return(void)
+{
+ return __this_cpu_inc_return(softnet_data.xmit.sched_mirred_nest);
+}
+
+static void tcf_mirred_nest_level_dec(void)
+{
+ __this_cpu_dec(softnet_data.xmit.sched_mirred_nest);
+}
+
+#else
+static u8 tcf_mirred_nest_level_inc_return(void)
+{
+ return current->net_xmit.sched_mirred_nest++;
+}
+
+static void tcf_mirred_nest_level_dec(void)
+{
+ current->net_xmit.sched_mirred_nest--;
+}
+#endif
static bool tcf_mirred_is_act_redirect(int action)
{
@@ -423,7 +445,7 @@ TC_INDIRECT_SCOPE int tcf_mirred_act(struct sk_buff *skb,
int m_eaction;
u32 blockid;
- nest_level = __this_cpu_inc_return(mirred_nest_level);
+ nest_level = tcf_mirred_nest_level_inc_return();
if (unlikely(nest_level > MIRRED_NEST_LIMIT)) {
net_warn_ratelimited("Packet exceeded mirred recursion limit on dev %s\n",
netdev_name(skb->dev));
@@ -454,7 +476,7 @@ TC_INDIRECT_SCOPE int tcf_mirred_act(struct sk_buff *skb,
retval);
dec_nest_level:
- __this_cpu_dec(mirred_nest_level);
+ tcf_mirred_nest_level_dec();
return retval;
}
diff --git a/net/sched/bpf_qdisc.c b/net/sched/bpf_qdisc.c
new file mode 100644
index 000000000000..7ea8b54b2ab1
--- /dev/null
+++ b/net/sched/bpf_qdisc.c
@@ -0,0 +1,475 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/types.h>
+#include <linux/bpf_verifier.h>
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/filter.h>
+#include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
+
+#define QDISC_OP_IDX(op) (offsetof(struct Qdisc_ops, op) / sizeof(void (*)(void)))
+#define QDISC_MOFF_IDX(moff) (moff / sizeof(void (*)(void)))
+
+static struct bpf_struct_ops bpf_Qdisc_ops;
+
+struct bpf_sched_data {
+ struct qdisc_watchdog watchdog;
+};
+
+struct bpf_sk_buff_ptr {
+ struct sk_buff *skb;
+};
+
+static int bpf_qdisc_init(struct btf *btf)
+{
+ return 0;
+}
+
+BTF_ID_LIST_SINGLE(bpf_qdisc_ids, struct, Qdisc)
+BTF_ID_LIST_SINGLE(bpf_sk_buff_ids, struct, sk_buff)
+BTF_ID_LIST_SINGLE(bpf_sk_buff_ptr_ids, struct, bpf_sk_buff_ptr)
+
+static bool bpf_qdisc_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ struct btf *btf = prog->aux->attach_btf;
+ u32 arg;
+
+ arg = btf_ctx_arg_idx(btf, prog->aux->attach_func_proto, off);
+ if (prog->aux->attach_st_ops_member_off == offsetof(struct Qdisc_ops, enqueue)) {
+ if (arg == 2 && type == BPF_READ) {
+ info->reg_type = PTR_TO_BTF_ID | PTR_TRUSTED;
+ info->btf = btf;
+ info->btf_id = bpf_sk_buff_ptr_ids[0];
+ return true;
+ }
+ }
+
+ return bpf_tracing_btf_ctx_access(off, size, type, prog, info);
+}
+
+static int bpf_qdisc_qdisc_access(struct bpf_verifier_log *log,
+ const struct bpf_reg_state *reg,
+ int off, size_t *end)
+{
+ switch (off) {
+ case offsetof(struct Qdisc, limit):
+ *end = offsetofend(struct Qdisc, limit);
+ break;
+ case offsetof(struct Qdisc, q) + offsetof(struct qdisc_skb_head, qlen):
+ *end = offsetof(struct Qdisc, q) + offsetofend(struct qdisc_skb_head, qlen);
+ break;
+ case offsetof(struct Qdisc, qstats) ... offsetofend(struct Qdisc, qstats) - 1:
+ *end = offsetofend(struct Qdisc, qstats);
+ break;
+ default:
+ return -EACCES;
+ }
+
+ return 0;
+}
+
+static int bpf_qdisc_sk_buff_access(struct bpf_verifier_log *log,
+ const struct bpf_reg_state *reg,
+ int off, size_t *end)
+{
+ switch (off) {
+ case offsetof(struct sk_buff, tstamp):
+ *end = offsetofend(struct sk_buff, tstamp);
+ break;
+ case offsetof(struct sk_buff, cb) + offsetof(struct qdisc_skb_cb, data[0]) ...
+ offsetof(struct sk_buff, cb) + offsetof(struct qdisc_skb_cb,
+ data[QDISC_CB_PRIV_LEN - 1]):
+ *end = offsetof(struct sk_buff, cb) +
+ offsetofend(struct qdisc_skb_cb, data[QDISC_CB_PRIV_LEN - 1]);
+ break;
+ default:
+ return -EACCES;
+ }
+
+ return 0;
+}
+
+static int bpf_qdisc_btf_struct_access(struct bpf_verifier_log *log,
+ const struct bpf_reg_state *reg,
+ int off, int size)
+{
+ const struct btf_type *t, *skbt, *qdisct;
+ size_t end;
+ int err;
+
+ skbt = btf_type_by_id(reg->btf, bpf_sk_buff_ids[0]);
+ qdisct = btf_type_by_id(reg->btf, bpf_qdisc_ids[0]);
+ t = btf_type_by_id(reg->btf, reg->btf_id);
+
+ if (t == skbt) {
+ err = bpf_qdisc_sk_buff_access(log, reg, off, &end);
+ } else if (t == qdisct) {
+ err = bpf_qdisc_qdisc_access(log, reg, off, &end);
+ } else {
+ bpf_log(log, "only read is supported\n");
+ return -EACCES;
+ }
+
+ if (err) {
+ bpf_log(log, "no write support to %s at off %d\n",
+ btf_name_by_offset(reg->btf, t->name_off), off);
+ return -EACCES;
+ }
+
+ if (off + size > end) {
+ bpf_log(log,
+ "write access at off %d with size %d beyond the member of %s ended at %zu\n",
+ off, size, btf_name_by_offset(reg->btf, t->name_off), end);
+ return -EACCES;
+ }
+
+ return 0;
+}
+
+BTF_ID_LIST(bpf_qdisc_init_prologue_ids)
+BTF_ID(func, bpf_qdisc_init_prologue)
+
+static int bpf_qdisc_gen_prologue(struct bpf_insn *insn_buf, bool direct_write,
+ const struct bpf_prog *prog)
+{
+ struct bpf_insn *insn = insn_buf;
+
+ if (prog->aux->attach_st_ops_member_off != offsetof(struct Qdisc_ops, init))
+ return 0;
+
+ /* r6 = r1; // r6 will be "u64 *ctx". r1 is "u64 *ctx".
+ * r2 = r1[16]; // r2 will be "struct netlink_ext_ack *extack"
+ * r1 = r1[0]; // r1 will be "struct Qdisc *sch"
+ * r0 = bpf_qdisc_init_prologue(r1, r2);
+ * if r0 == 0 goto pc+1;
+ * BPF_EXIT;
+ * r1 = r6; // r1 will be "u64 *ctx".
+ */
+ *insn++ = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1);
+ *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, 16);
+ *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, 0);
+ *insn++ = BPF_CALL_KFUNC(0, bpf_qdisc_init_prologue_ids[0]);
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1);
+ *insn++ = BPF_EXIT_INSN();
+ *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_6);
+ *insn++ = prog->insnsi[0];
+
+ return insn - insn_buf;
+}
+
+BTF_ID_LIST(bpf_qdisc_reset_destroy_epilogue_ids)
+BTF_ID(func, bpf_qdisc_reset_destroy_epilogue)
+
+static int bpf_qdisc_gen_epilogue(struct bpf_insn *insn_buf, const struct bpf_prog *prog,
+ s16 ctx_stack_off)
+{
+ struct bpf_insn *insn = insn_buf;
+
+ if (prog->aux->attach_st_ops_member_off != offsetof(struct Qdisc_ops, reset) &&
+ prog->aux->attach_st_ops_member_off != offsetof(struct Qdisc_ops, destroy))
+ return 0;
+
+ /* r1 = stack[ctx_stack_off]; // r1 will be "u64 *ctx"
+ * r1 = r1[0]; // r1 will be "struct Qdisc *sch"
+ * r0 = bpf_qdisc_reset_destroy_epilogue(r1);
+ * BPF_EXIT;
+ */
+ *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_FP, ctx_stack_off);
+ *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, 0);
+ *insn++ = BPF_CALL_KFUNC(0, bpf_qdisc_reset_destroy_epilogue_ids[0]);
+ *insn++ = BPF_EXIT_INSN();
+
+ return insn - insn_buf;
+}
+
+__bpf_kfunc_start_defs();
+
+/* bpf_skb_get_hash - Get the flow hash of an skb.
+ * @skb: The skb to get the flow hash from.
+ */
+__bpf_kfunc u32 bpf_skb_get_hash(struct sk_buff *skb)
+{
+ return skb_get_hash(skb);
+}
+
+/* bpf_kfree_skb - Release an skb's reference and drop it immediately.
+ * @skb: The skb whose reference to be released and dropped.
+ */
+__bpf_kfunc void bpf_kfree_skb(struct sk_buff *skb)
+{
+ kfree_skb(skb);
+}
+
+/* bpf_qdisc_skb_drop - Drop an skb by adding it to a deferred free list.
+ * @skb: The skb whose reference to be released and dropped.
+ * @to_free_list: The list of skbs to be dropped.
+ */
+__bpf_kfunc void bpf_qdisc_skb_drop(struct sk_buff *skb,
+ struct bpf_sk_buff_ptr *to_free_list)
+{
+ __qdisc_drop(skb, (struct sk_buff **)to_free_list);
+}
+
+/* bpf_qdisc_watchdog_schedule - Schedule a qdisc to a later time using a timer.
+ * @sch: The qdisc to be scheduled.
+ * @expire: The expiry time of the timer.
+ * @delta_ns: The slack range of the timer.
+ */
+__bpf_kfunc void bpf_qdisc_watchdog_schedule(struct Qdisc *sch, u64 expire, u64 delta_ns)
+{
+ struct bpf_sched_data *q = qdisc_priv(sch);
+
+ qdisc_watchdog_schedule_range_ns(&q->watchdog, expire, delta_ns);
+}
+
+/* bpf_qdisc_init_prologue - Hidden kfunc called in prologue of .init. */
+__bpf_kfunc int bpf_qdisc_init_prologue(struct Qdisc *sch,
+ struct netlink_ext_ack *extack)
+{
+ struct bpf_sched_data *q = qdisc_priv(sch);
+ struct net_device *dev = qdisc_dev(sch);
+ struct Qdisc *p;
+
+ qdisc_watchdog_init(&q->watchdog, sch);
+
+ if (sch->parent != TC_H_ROOT) {
+ /* If qdisc_lookup() returns NULL, it means .init is called by
+ * qdisc_create_dflt() in mq/mqprio_init and the parent qdisc
+ * has not been added to qdisc_hash yet.
+ */
+ p = qdisc_lookup(dev, TC_H_MAJ(sch->parent));
+ if (p && !(p->flags & TCQ_F_MQROOT)) {
+ NL_SET_ERR_MSG(extack, "BPF qdisc only supported on root or mq");
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+/* bpf_qdisc_reset_destroy_epilogue - Hidden kfunc called in epilogue of .reset
+ * and .destroy
+ */
+__bpf_kfunc void bpf_qdisc_reset_destroy_epilogue(struct Qdisc *sch)
+{
+ struct bpf_sched_data *q = qdisc_priv(sch);
+
+ qdisc_watchdog_cancel(&q->watchdog);
+}
+
+/* bpf_qdisc_bstats_update - Update Qdisc basic statistics
+ * @sch: The qdisc from which an skb is dequeued.
+ * @skb: The skb to be dequeued.
+ */
+__bpf_kfunc void bpf_qdisc_bstats_update(struct Qdisc *sch, const struct sk_buff *skb)
+{
+ bstats_update(&sch->bstats, skb);
+}
+
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(qdisc_kfunc_ids)
+BTF_ID_FLAGS(func, bpf_skb_get_hash, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_kfree_skb, KF_RELEASE)
+BTF_ID_FLAGS(func, bpf_qdisc_skb_drop, KF_RELEASE)
+BTF_ID_FLAGS(func, bpf_dynptr_from_skb, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_qdisc_watchdog_schedule, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_qdisc_init_prologue, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_qdisc_reset_destroy_epilogue, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_qdisc_bstats_update, KF_TRUSTED_ARGS)
+BTF_KFUNCS_END(qdisc_kfunc_ids)
+
+BTF_SET_START(qdisc_common_kfunc_set)
+BTF_ID(func, bpf_skb_get_hash)
+BTF_ID(func, bpf_kfree_skb)
+BTF_ID(func, bpf_dynptr_from_skb)
+BTF_SET_END(qdisc_common_kfunc_set)
+
+BTF_SET_START(qdisc_enqueue_kfunc_set)
+BTF_ID(func, bpf_qdisc_skb_drop)
+BTF_ID(func, bpf_qdisc_watchdog_schedule)
+BTF_SET_END(qdisc_enqueue_kfunc_set)
+
+BTF_SET_START(qdisc_dequeue_kfunc_set)
+BTF_ID(func, bpf_qdisc_watchdog_schedule)
+BTF_ID(func, bpf_qdisc_bstats_update)
+BTF_SET_END(qdisc_dequeue_kfunc_set)
+
+enum qdisc_ops_kf_flags {
+ QDISC_OPS_KF_COMMON = 0,
+ QDISC_OPS_KF_ENQUEUE = 1 << 0,
+ QDISC_OPS_KF_DEQUEUE = 1 << 1,
+};
+
+static const u32 qdisc_ops_context_flags[] = {
+ [QDISC_OP_IDX(enqueue)] = QDISC_OPS_KF_ENQUEUE,
+ [QDISC_OP_IDX(dequeue)] = QDISC_OPS_KF_DEQUEUE,
+ [QDISC_OP_IDX(init)] = QDISC_OPS_KF_COMMON,
+ [QDISC_OP_IDX(reset)] = QDISC_OPS_KF_COMMON,
+ [QDISC_OP_IDX(destroy)] = QDISC_OPS_KF_COMMON,
+};
+
+static int bpf_qdisc_kfunc_filter(const struct bpf_prog *prog, u32 kfunc_id)
+{
+ u32 moff, flags;
+
+ if (!btf_id_set8_contains(&qdisc_kfunc_ids, kfunc_id))
+ return 0;
+
+ if (prog->aux->st_ops != &bpf_Qdisc_ops)
+ return -EACCES;
+
+ moff = prog->aux->attach_st_ops_member_off;
+ flags = qdisc_ops_context_flags[QDISC_MOFF_IDX(moff)];
+
+ if ((flags & QDISC_OPS_KF_ENQUEUE) &&
+ btf_id_set_contains(&qdisc_enqueue_kfunc_set, kfunc_id))
+ return 0;
+
+ if ((flags & QDISC_OPS_KF_DEQUEUE) &&
+ btf_id_set_contains(&qdisc_dequeue_kfunc_set, kfunc_id))
+ return 0;
+
+ if (btf_id_set_contains(&qdisc_common_kfunc_set, kfunc_id))
+ return 0;
+
+ return -EACCES;
+}
+
+static const struct btf_kfunc_id_set bpf_qdisc_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &qdisc_kfunc_ids,
+ .filter = bpf_qdisc_kfunc_filter,
+};
+
+static const struct bpf_verifier_ops bpf_qdisc_verifier_ops = {
+ .get_func_proto = bpf_base_func_proto,
+ .is_valid_access = bpf_qdisc_is_valid_access,
+ .btf_struct_access = bpf_qdisc_btf_struct_access,
+ .gen_prologue = bpf_qdisc_gen_prologue,
+ .gen_epilogue = bpf_qdisc_gen_epilogue,
+};
+
+static int bpf_qdisc_init_member(const struct btf_type *t,
+ const struct btf_member *member,
+ void *kdata, const void *udata)
+{
+ const struct Qdisc_ops *uqdisc_ops;
+ struct Qdisc_ops *qdisc_ops;
+ u32 moff;
+
+ uqdisc_ops = (const struct Qdisc_ops *)udata;
+ qdisc_ops = (struct Qdisc_ops *)kdata;
+
+ moff = __btf_member_bit_offset(t, member) / 8;
+ switch (moff) {
+ case offsetof(struct Qdisc_ops, priv_size):
+ if (uqdisc_ops->priv_size)
+ return -EINVAL;
+ qdisc_ops->priv_size = sizeof(struct bpf_sched_data);
+ return 1;
+ case offsetof(struct Qdisc_ops, peek):
+ qdisc_ops->peek = qdisc_peek_dequeued;
+ return 0;
+ case offsetof(struct Qdisc_ops, id):
+ if (bpf_obj_name_cpy(qdisc_ops->id, uqdisc_ops->id,
+ sizeof(qdisc_ops->id)) <= 0)
+ return -EINVAL;
+ return 1;
+ }
+
+ return 0;
+}
+
+static int bpf_qdisc_reg(void *kdata, struct bpf_link *link)
+{
+ return register_qdisc(kdata);
+}
+
+static void bpf_qdisc_unreg(void *kdata, struct bpf_link *link)
+{
+ return unregister_qdisc(kdata);
+}
+
+static int bpf_qdisc_validate(void *kdata)
+{
+ struct Qdisc_ops *ops = (struct Qdisc_ops *)kdata;
+
+ if (!ops->enqueue || !ops->dequeue || !ops->init ||
+ !ops->reset || !ops->destroy)
+ return -EINVAL;
+
+ return 0;
+}
+
+static int Qdisc_ops__enqueue(struct sk_buff *skb__ref, struct Qdisc *sch,
+ struct sk_buff **to_free)
+{
+ return 0;
+}
+
+static struct sk_buff *Qdisc_ops__dequeue(struct Qdisc *sch)
+{
+ return NULL;
+}
+
+static int Qdisc_ops__init(struct Qdisc *sch, struct nlattr *arg,
+ struct netlink_ext_ack *extack)
+{
+ return 0;
+}
+
+static void Qdisc_ops__reset(struct Qdisc *sch)
+{
+}
+
+static void Qdisc_ops__destroy(struct Qdisc *sch)
+{
+}
+
+static struct Qdisc_ops __bpf_ops_qdisc_ops = {
+ .enqueue = Qdisc_ops__enqueue,
+ .dequeue = Qdisc_ops__dequeue,
+ .init = Qdisc_ops__init,
+ .reset = Qdisc_ops__reset,
+ .destroy = Qdisc_ops__destroy,
+};
+
+static struct bpf_struct_ops bpf_Qdisc_ops = {
+ .verifier_ops = &bpf_qdisc_verifier_ops,
+ .reg = bpf_qdisc_reg,
+ .unreg = bpf_qdisc_unreg,
+ .validate = bpf_qdisc_validate,
+ .init_member = bpf_qdisc_init_member,
+ .init = bpf_qdisc_init,
+ .name = "Qdisc_ops",
+ .cfi_stubs = &__bpf_ops_qdisc_ops,
+ .owner = THIS_MODULE,
+};
+
+BTF_ID_LIST(bpf_sk_buff_dtor_ids)
+BTF_ID(func, bpf_kfree_skb)
+
+static int __init bpf_qdisc_kfunc_init(void)
+{
+ int ret;
+ const struct btf_id_dtor_kfunc skb_kfunc_dtors[] = {
+ {
+ .btf_id = bpf_sk_buff_ids[0],
+ .kfunc_btf_id = bpf_sk_buff_dtor_ids[0]
+ },
+ };
+
+ ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &bpf_qdisc_kfunc_set);
+ ret = ret ?: register_btf_id_dtor_kfuncs(skb_kfunc_dtors,
+ ARRAY_SIZE(skb_kfunc_dtors),
+ THIS_MODULE);
+ ret = ret ?: register_bpf_struct_ops(&bpf_Qdisc_ops, Qdisc_ops);
+
+ return ret;
+}
+late_initcall(bpf_qdisc_kfunc_init);
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 4f648af8cfaa..ecec0a1e1c1a 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -2057,6 +2057,7 @@ static int tcf_fill_node(struct net *net, struct sk_buff *skb,
struct tcmsg *tcm;
struct nlmsghdr *nlh;
unsigned char *b = skb_tail_pointer(skb);
+ int ret = -EMSGSIZE;
nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
if (!nlh)
@@ -2101,11 +2102,45 @@ static int tcf_fill_node(struct net *net, struct sk_buff *skb,
return skb->len;
+cls_op_not_supp:
+ ret = -EOPNOTSUPP;
out_nlmsg_trim:
nla_put_failure:
-cls_op_not_supp:
nlmsg_trim(skb, b);
- return -1;
+ return ret;
+}
+
+static struct sk_buff *tfilter_notify_prep(struct net *net,
+ struct sk_buff *oskb,
+ struct nlmsghdr *n,
+ struct tcf_proto *tp,
+ struct tcf_block *block,
+ struct Qdisc *q, u32 parent,
+ void *fh, int event,
+ u32 portid, bool rtnl_held,
+ struct netlink_ext_ack *extack)
+{
+ unsigned int size = oskb ? max(NLMSG_GOODSIZE, oskb->len) : NLMSG_GOODSIZE;
+ struct sk_buff *skb;
+ int ret;
+
+retry:
+ skb = alloc_skb(size, GFP_KERNEL);
+ if (!skb)
+ return ERR_PTR(-ENOBUFS);
+
+ ret = tcf_fill_node(net, skb, tp, block, q, parent, fh, portid,
+ n->nlmsg_seq, n->nlmsg_flags, event, false,
+ rtnl_held, extack);
+ if (ret <= 0) {
+ kfree_skb(skb);
+ if (ret == -EMSGSIZE) {
+ size += NLMSG_GOODSIZE;
+ goto retry;
+ }
+ return ERR_PTR(-EINVAL);
+ }
+ return skb;
}
static int tfilter_notify(struct net *net, struct sk_buff *oskb,
@@ -2121,16 +2156,10 @@ static int tfilter_notify(struct net *net, struct sk_buff *oskb,
if (!unicast && !rtnl_notify_needed(net, n->nlmsg_flags, RTNLGRP_TC))
return 0;
- skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
- if (!skb)
- return -ENOBUFS;
-
- if (tcf_fill_node(net, skb, tp, block, q, parent, fh, portid,
- n->nlmsg_seq, n->nlmsg_flags, event,
- false, rtnl_held, extack) <= 0) {
- kfree_skb(skb);
- return -EINVAL;
- }
+ skb = tfilter_notify_prep(net, oskb, n, tp, block, q, parent, fh, event,
+ portid, rtnl_held, extack);
+ if (IS_ERR(skb))
+ return PTR_ERR(skb);
if (unicast)
err = rtnl_unicast(skb, net, portid);
@@ -2153,16 +2182,11 @@ static int tfilter_del_notify(struct net *net, struct sk_buff *oskb,
if (!rtnl_notify_needed(net, n->nlmsg_flags, RTNLGRP_TC))
return tp->ops->delete(tp, fh, last, rtnl_held, extack);
- skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
- if (!skb)
- return -ENOBUFS;
-
- if (tcf_fill_node(net, skb, tp, block, q, parent, fh, portid,
- n->nlmsg_seq, n->nlmsg_flags, RTM_DELTFILTER,
- false, rtnl_held, extack) <= 0) {
+ skb = tfilter_notify_prep(net, oskb, n, tp, block, q, parent, fh,
+ RTM_DELTFILTER, portid, rtnl_held, extack);
+ if (IS_ERR(skb)) {
NL_SET_ERR_MSG(extack, "Failed to build del event notification");
- kfree_skb(skb);
- return -EINVAL;
+ return PTR_ERR(skb);
}
err = tp->ops->delete(tp, fh, last, rtnl_held, extack);
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index f74a097f54ae..c5e3673aadbe 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -25,6 +25,7 @@
#include <linux/hrtimer.h>
#include <linux/slab.h>
#include <linux/hashtable.h>
+#include <linux/bpf.h>
#include <net/netdev_lock.h>
#include <net/net_namespace.h>
@@ -207,7 +208,7 @@ static struct Qdisc_ops *qdisc_lookup_default(const char *name)
for (q = qdisc_base; q; q = q->next) {
if (!strcmp(name, q->id)) {
- if (!try_module_get(q->owner))
+ if (!bpf_try_module_get(q, q->owner))
q = NULL;
break;
}
@@ -237,7 +238,7 @@ int qdisc_set_default(const char *name)
if (ops) {
/* Set new default */
- module_put(default_qdisc_ops->owner);
+ bpf_module_put(default_qdisc_ops, default_qdisc_ops->owner);
default_qdisc_ops = ops;
}
write_unlock(&qdisc_mod_lock);
@@ -359,7 +360,7 @@ static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
read_lock(&qdisc_mod_lock);
for (q = qdisc_base; q; q = q->next) {
if (nla_strcmp(kind, q->id) == 0) {
- if (!try_module_get(q->owner))
+ if (!bpf_try_module_get(q, q->owner))
q = NULL;
break;
}
@@ -1370,7 +1371,7 @@ err_out3:
netdev_put(dev, &sch->dev_tracker);
qdisc_free(sch);
err_out2:
- module_put(ops->owner);
+ bpf_module_put(ops, ops->owner);
err_out:
*errp = err;
return NULL;
@@ -1782,7 +1783,7 @@ static void request_qdisc_module(struct nlattr *kind)
ops = qdisc_lookup_ops(kind);
if (ops) {
- module_put(ops->owner);
+ bpf_module_put(ops, ops->owner);
return;
}
diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c
index 81189d02fee7..c93761040c6e 100644
--- a/net/sched/sch_codel.c
+++ b/net/sched/sch_codel.c
@@ -65,10 +65,7 @@ static struct sk_buff *codel_qdisc_dequeue(struct Qdisc *sch)
&q->stats, qdisc_pkt_len, codel_get_enqueue_time,
drop_func, dequeue_func);
- /* We cant call qdisc_tree_reduce_backlog() if our qlen is 0,
- * or HTB crashes. Defer it for next round.
- */
- if (q->stats.drop_count && sch->q.qlen) {
+ if (q->stats.drop_count) {
qdisc_tree_reduce_backlog(sch, q->stats.drop_count, q->stats.drop_len);
q->stats.drop_count = 0;
q->stats.drop_len = 0;
@@ -147,7 +144,7 @@ static int codel_change(struct Qdisc *sch, struct nlattr *opt,
qlen = sch->q.qlen;
while (sch->q.qlen > sch->limit) {
- struct sk_buff *skb = __qdisc_dequeue_head(&sch->q);
+ struct sk_buff *skb = qdisc_dequeue_internal(sch, true);
dropped += qdisc_pkt_len(skb);
qdisc_qstats_backlog_dec(sch, skb);
diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c
index c69b999fae17..9b6d79bd8737 100644
--- a/net/sched/sch_drr.c
+++ b/net/sched/sch_drr.c
@@ -35,6 +35,11 @@ struct drr_sched {
struct Qdisc_class_hash clhash;
};
+static bool cl_is_active(struct drr_class *cl)
+{
+ return !list_empty(&cl->alist);
+}
+
static struct drr_class *drr_find_class(struct Qdisc *sch, u32 classid)
{
struct drr_sched *q = qdisc_priv(sch);
@@ -105,6 +110,7 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
return -ENOBUFS;
gnet_stats_basic_sync_init(&cl->bstats);
+ INIT_LIST_HEAD(&cl->alist);
cl->common.classid = classid;
cl->quantum = quantum;
cl->qdisc = qdisc_create_dflt(sch->dev_queue,
@@ -229,7 +235,7 @@ static void drr_qlen_notify(struct Qdisc *csh, unsigned long arg)
{
struct drr_class *cl = (struct drr_class *)arg;
- list_del(&cl->alist);
+ list_del_init(&cl->alist);
}
static int drr_dump_class(struct Qdisc *sch, unsigned long arg,
@@ -336,7 +342,6 @@ static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch,
struct drr_sched *q = qdisc_priv(sch);
struct drr_class *cl;
int err = 0;
- bool first;
cl = drr_classify(skb, sch, &err);
if (cl == NULL) {
@@ -346,7 +351,6 @@ static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch,
return err;
}
- first = !cl->qdisc->q.qlen;
err = qdisc_enqueue(skb, cl->qdisc, to_free);
if (unlikely(err != NET_XMIT_SUCCESS)) {
if (net_xmit_drop_count(err)) {
@@ -356,7 +360,7 @@ static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch,
return err;
}
- if (first) {
+ if (!cl_is_active(cl)) {
list_add_tail(&cl->alist, &q->active);
cl->deficit = cl->quantum;
}
@@ -390,7 +394,7 @@ static struct sk_buff *drr_dequeue(struct Qdisc *sch)
if (unlikely(skb == NULL))
goto out;
if (cl->qdisc->q.qlen == 0)
- list_del(&cl->alist);
+ list_del_init(&cl->alist);
bstats_update(&cl->bstats, skb);
qdisc_bstats_update(sch, skb);
@@ -431,7 +435,7 @@ static void drr_reset_qdisc(struct Qdisc *sch)
for (i = 0; i < q->clhash.hashsize; i++) {
hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
if (cl->qdisc->q.qlen)
- list_del(&cl->alist);
+ list_del_init(&cl->alist);
qdisc_reset(cl->qdisc);
}
}
diff --git a/net/sched/sch_ets.c b/net/sched/sch_ets.c
index 516038a44163..2c069f0181c6 100644
--- a/net/sched/sch_ets.c
+++ b/net/sched/sch_ets.c
@@ -74,6 +74,11 @@ static const struct nla_policy ets_class_policy[TCA_ETS_MAX + 1] = {
[TCA_ETS_QUANTA_BAND] = { .type = NLA_U32 },
};
+static bool cl_is_active(struct ets_class *cl)
+{
+ return !list_empty(&cl->alist);
+}
+
static int ets_quantum_parse(struct Qdisc *sch, const struct nlattr *attr,
unsigned int *quantum,
struct netlink_ext_ack *extack)
@@ -293,7 +298,7 @@ static void ets_class_qlen_notify(struct Qdisc *sch, unsigned long arg)
* to remove them.
*/
if (!ets_class_is_strict(q, cl) && sch->q.qlen)
- list_del(&cl->alist);
+ list_del_init(&cl->alist);
}
static int ets_class_dump(struct Qdisc *sch, unsigned long arg,
@@ -416,7 +421,6 @@ static int ets_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
struct ets_sched *q = qdisc_priv(sch);
struct ets_class *cl;
int err = 0;
- bool first;
cl = ets_classify(skb, sch, &err);
if (!cl) {
@@ -426,7 +430,6 @@ static int ets_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
return err;
}
- first = !cl->qdisc->q.qlen;
err = qdisc_enqueue(skb, cl->qdisc, to_free);
if (unlikely(err != NET_XMIT_SUCCESS)) {
if (net_xmit_drop_count(err)) {
@@ -436,7 +439,7 @@ static int ets_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
return err;
}
- if (first && !ets_class_is_strict(q, cl)) {
+ if (!cl_is_active(cl) && !ets_class_is_strict(q, cl)) {
list_add_tail(&cl->alist, &q->active);
cl->deficit = cl->quantum;
}
@@ -488,7 +491,7 @@ static struct sk_buff *ets_qdisc_dequeue(struct Qdisc *sch)
if (unlikely(!skb))
goto out;
if (cl->qdisc->q.qlen == 0)
- list_del(&cl->alist);
+ list_del_init(&cl->alist);
return ets_qdisc_dequeue_skb(sch, skb);
}
@@ -657,7 +660,7 @@ static int ets_qdisc_change(struct Qdisc *sch, struct nlattr *opt,
}
for (i = q->nbands; i < oldbands; i++) {
if (i >= q->nstrict && q->classes[i].qdisc->q.qlen)
- list_del(&q->classes[i].alist);
+ list_del_init(&q->classes[i].alist);
qdisc_tree_flush_backlog(q->classes[i].qdisc);
}
WRITE_ONCE(q->nstrict, nstrict);
@@ -713,7 +716,7 @@ static void ets_qdisc_reset(struct Qdisc *sch)
for (band = q->nstrict; band < q->nbands; band++) {
if (q->classes[band].qdisc->q.qlen)
- list_del(&q->classes[band].alist);
+ list_del_init(&q->classes[band].alist);
}
for (band = 0; band < q->nbands; band++)
qdisc_reset(q->classes[band].qdisc);
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index 2ca5332cfcc5..902ff5470607 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -1136,7 +1136,7 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt,
sch_tree_lock(sch);
}
while (sch->q.qlen > sch->limit) {
- struct sk_buff *skb = fq_dequeue(sch);
+ struct sk_buff *skb = qdisc_dequeue_internal(sch, false);
if (!skb)
break;
diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c
index 799f5397ad4c..2a0f3a513bfa 100644
--- a/net/sched/sch_fq_codel.c
+++ b/net/sched/sch_fq_codel.c
@@ -315,10 +315,8 @@ begin:
}
qdisc_bstats_update(sch, skb);
flow->deficit -= qdisc_pkt_len(skb);
- /* We cant call qdisc_tree_reduce_backlog() if our qlen is 0,
- * or HTB crashes. Defer it for next round.
- */
- if (q->cstats.drop_count && sch->q.qlen) {
+
+ if (q->cstats.drop_count) {
qdisc_tree_reduce_backlog(sch, q->cstats.drop_count,
q->cstats.drop_len);
q->cstats.drop_count = 0;
@@ -443,7 +441,7 @@ static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt,
while (sch->q.qlen > sch->limit ||
q->memory_usage > q->memory_limit) {
- struct sk_buff *skb = fq_codel_dequeue(sch);
+ struct sk_buff *skb = qdisc_dequeue_internal(sch, false);
q->cstats.drop_len += qdisc_pkt_len(skb);
rtnl_kfree_skbs(skb, skb);
diff --git a/net/sched/sch_fq_pie.c b/net/sched/sch_fq_pie.c
index f3b8203d3e85..df7fac95ab15 100644
--- a/net/sched/sch_fq_pie.c
+++ b/net/sched/sch_fq_pie.c
@@ -366,7 +366,7 @@ static int fq_pie_change(struct Qdisc *sch, struct nlattr *opt,
/* Drop excess packets if new limit is lower */
while (sch->q.qlen > sch->limit) {
- struct sk_buff *skb = fq_pie_qdisc_dequeue(sch);
+ struct sk_buff *skb = qdisc_dequeue_internal(sch, false);
len_dropped += qdisc_pkt_len(skb);
num_dropped += 1;
diff --git a/net/sched/sch_frag.c b/net/sched/sch_frag.c
index ce63414185fd..d1d87dce7f3f 100644
--- a/net/sched/sch_frag.c
+++ b/net/sched/sch_frag.c
@@ -16,14 +16,18 @@ struct sch_frag_data {
unsigned int l2_len;
u8 l2_data[VLAN_ETH_HLEN];
int (*xmit)(struct sk_buff *skb);
+ local_lock_t bh_lock;
};
-static DEFINE_PER_CPU(struct sch_frag_data, sch_frag_data_storage);
+static DEFINE_PER_CPU(struct sch_frag_data, sch_frag_data_storage) = {
+ .bh_lock = INIT_LOCAL_LOCK(bh_lock),
+};
static int sch_frag_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct sch_frag_data *data = this_cpu_ptr(&sch_frag_data_storage);
+ lockdep_assert_held(&data->bh_lock);
if (skb_cow_head(skb, data->l2_len) < 0) {
kfree_skb(skb);
return -ENOMEM;
@@ -95,6 +99,7 @@ static int sch_fragment(struct net *net, struct sk_buff *skb,
struct rtable sch_frag_rt = { 0 };
unsigned long orig_dst;
+ local_lock_nested_bh(&sch_frag_data_storage.bh_lock);
sch_frag_prepare_frag(skb, xmit);
dst_init(&sch_frag_rt.dst, &sch_frag_dst_ops, NULL,
DST_OBSOLETE_NONE, DST_NOCOUNT);
@@ -105,11 +110,13 @@ static int sch_fragment(struct net *net, struct sk_buff *skb,
IPCB(skb)->frag_max_size = mru;
ret = ip_do_fragment(net, skb->sk, skb, sch_frag_xmit);
+ local_unlock_nested_bh(&sch_frag_data_storage.bh_lock);
refdst_drop(orig_dst);
} else if (skb_protocol(skb, true) == htons(ETH_P_IPV6)) {
unsigned long orig_dst;
struct rt6_info sch_frag_rt;
+ local_lock_nested_bh(&sch_frag_data_storage.bh_lock);
sch_frag_prepare_frag(skb, xmit);
memset(&sch_frag_rt, 0, sizeof(sch_frag_rt));
dst_init(&sch_frag_rt.dst, &sch_frag_dst_ops, NULL,
@@ -122,6 +129,7 @@ static int sch_fragment(struct net *net, struct sk_buff *skb,
ret = ipv6_stub->ipv6_fragment(net, skb->sk, skb,
sch_frag_xmit);
+ local_unlock_nested_bh(&sch_frag_data_storage.bh_lock);
refdst_drop(orig_dst);
} else {
net_warn_ratelimited("Fail frag %s: eth=%x, MRU=%d, MTU=%d\n",
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 514b1b6ac681..08e0e3aff976 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -24,6 +24,7 @@
#include <linux/if_vlan.h>
#include <linux/skb_array.h>
#include <linux/if_macvlan.h>
+#include <linux/bpf.h>
#include <net/sch_generic.h>
#include <net/pkt_sched.h>
#include <net/dst.h>
@@ -1001,14 +1002,14 @@ struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue,
{
struct Qdisc *sch;
- if (!try_module_get(ops->owner)) {
+ if (!bpf_try_module_get(ops, ops->owner)) {
NL_SET_ERR_MSG(extack, "Failed to increase module reference counter");
return NULL;
}
sch = qdisc_alloc(dev_queue, ops, extack);
if (IS_ERR(sch)) {
- module_put(ops->owner);
+ bpf_module_put(ops, ops->owner);
return NULL;
}
sch->parent = parentid;
@@ -1078,7 +1079,7 @@ static void __qdisc_destroy(struct Qdisc *qdisc)
ops->destroy(qdisc);
lockdep_unregister_key(&qdisc->root_lock_key);
- module_put(ops->owner);
+ bpf_module_put(ops, ops->owner);
netdev_put(dev, &qdisc->dev_tracker);
trace_qdisc_destroy(qdisc);
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index c287bf8423b4..5a7745170e84 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -175,6 +175,11 @@ struct hfsc_sched {
#define HT_INFINITY 0xffffffffffffffffULL /* infinite time value */
+static bool cl_in_el_or_vttree(struct hfsc_class *cl)
+{
+ return ((cl->cl_flags & HFSC_FSC) && cl->cl_nactive) ||
+ ((cl->cl_flags & HFSC_RSC) && !RB_EMPTY_NODE(&cl->el_node));
+}
/*
* eligible tree holds backlogged classes being sorted by their eligible times.
@@ -203,7 +208,10 @@ eltree_insert(struct hfsc_class *cl)
static inline void
eltree_remove(struct hfsc_class *cl)
{
- rb_erase(&cl->el_node, &cl->sched->eligible);
+ if (!RB_EMPTY_NODE(&cl->el_node)) {
+ rb_erase(&cl->el_node, &cl->sched->eligible);
+ RB_CLEAR_NODE(&cl->el_node);
+ }
}
static inline void
@@ -958,6 +966,7 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
if (cl != NULL) {
int old_flags;
+ int len = 0;
if (parentid) {
if (cl->cl_parent &&
@@ -988,9 +997,13 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
if (usc != NULL)
hfsc_change_usc(cl, usc, cur_time);
+ if (cl->qdisc->q.qlen != 0)
+ len = qdisc_peek_len(cl->qdisc);
+ /* Check queue length again since some qdisc implementations
+ * (e.g., netem/codel) might empty the queue during the peek
+ * operation.
+ */
if (cl->qdisc->q.qlen != 0) {
- int len = qdisc_peek_len(cl->qdisc);
-
if (cl->cl_flags & HFSC_RSC) {
if (old_flags & HFSC_RSC)
update_ed(cl, len);
@@ -1032,6 +1045,8 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
if (cl == NULL)
return -ENOBUFS;
+ RB_CLEAR_NODE(&cl->el_node);
+
err = tcf_block_get(&cl->block, &cl->filter_list, sch, extack);
if (err) {
kfree(cl);
@@ -1220,7 +1235,8 @@ hfsc_qlen_notify(struct Qdisc *sch, unsigned long arg)
/* vttree is now handled in update_vf() so that update_vf(cl, 0, 0)
* needs to be called explicitly to remove a class from vttree.
*/
- update_vf(cl, 0, 0);
+ if (cl->cl_nactive)
+ update_vf(cl, 0, 0);
if (cl->cl_flags & HFSC_RSC)
eltree_remove(cl);
}
@@ -1560,7 +1576,10 @@ hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free)
return err;
}
- if (first) {
+ sch->qstats.backlog += len;
+ sch->q.qlen++;
+
+ if (first && !cl_in_el_or_vttree(cl)) {
if (cl->cl_flags & HFSC_RSC)
init_ed(cl, len);
if (cl->cl_flags & HFSC_FSC)
@@ -1575,9 +1594,6 @@ hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free)
}
- sch->qstats.backlog += len;
- sch->q.qlen++;
-
return NET_XMIT_SUCCESS;
}
@@ -1632,10 +1648,16 @@ hfsc_dequeue(struct Qdisc *sch)
if (cl->qdisc->q.qlen != 0) {
/* update ed */
next_len = qdisc_peek_len(cl->qdisc);
- if (realtime)
- update_ed(cl, next_len);
- else
- update_d(cl, next_len);
+ /* Check queue length again since some qdisc implementations
+ * (e.g., netem/codel) might empty the queue during the peek
+ * operation.
+ */
+ if (cl->qdisc->q.qlen != 0) {
+ if (realtime)
+ update_ed(cl, next_len);
+ else
+ update_d(cl, next_len);
+ }
} else {
/* the class becomes passive */
eltree_remove(cl);
diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c
index 44d9efe1a96a..5aa434b46707 100644
--- a/net/sched/sch_hhf.c
+++ b/net/sched/sch_hhf.c
@@ -564,7 +564,7 @@ static int hhf_change(struct Qdisc *sch, struct nlattr *opt,
qlen = sch->q.qlen;
prev_backlog = sch->qstats.backlog;
while (sch->q.qlen > sch->limit) {
- struct sk_buff *skb = hhf_dequeue(sch);
+ struct sk_buff *skb = qdisc_dequeue_internal(sch, false);
rtnl_kfree_skbs(skb, skb);
}
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index c31bc5489bdd..14bf71f57057 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -348,7 +348,8 @@ static void htb_add_to_wait_tree(struct htb_sched *q,
*/
static inline void htb_next_rb_node(struct rb_node **n)
{
- *n = rb_next(*n);
+ if (*n)
+ *n = rb_next(*n);
}
/**
@@ -609,8 +610,8 @@ static inline void htb_activate(struct htb_sched *q, struct htb_class *cl)
*/
static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl)
{
- WARN_ON(!cl->prio_activity);
-
+ if (!cl->prio_activity)
+ return;
htb_deactivate_prios(q, cl);
cl->prio_activity = 0;
}
@@ -1738,8 +1739,7 @@ static int htb_delete(struct Qdisc *sch, unsigned long arg,
if (cl->parent)
cl->parent->children--;
- if (cl->prio_activity)
- htb_deactivate(q, cl);
+ htb_deactivate(q, cl);
if (cl->cmode != HTB_CAN_SEND)
htb_safe_rb_erase(&cl->pq_node,
@@ -1947,8 +1947,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
/* turn parent into inner node */
qdisc_purge_queue(parent->leaf.q);
parent_qdisc = parent->leaf.q;
- if (parent->prio_activity)
- htb_deactivate(q, parent);
+ htb_deactivate(q, parent);
/* remove from evt list because of level change */
if (parent->cmode != HTB_CAN_SEND) {
diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c
index 3771d000b30d..ff49a6c97033 100644
--- a/net/sched/sch_pie.c
+++ b/net/sched/sch_pie.c
@@ -195,7 +195,7 @@ static int pie_change(struct Qdisc *sch, struct nlattr *opt,
/* Drop excess packets if new limit is lower */
qlen = sch->q.qlen;
while (sch->q.qlen > sch->limit) {
- struct sk_buff *skb = __qdisc_dequeue_head(&sch->q);
+ struct sk_buff *skb = qdisc_dequeue_internal(sch, true);
dropped += qdisc_pkt_len(skb);
qdisc_qstats_backlog_dec(sch, skb);
diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c
index 2cfbc977fe6d..bf1282cb22eb 100644
--- a/net/sched/sch_qfq.c
+++ b/net/sched/sch_qfq.c
@@ -202,6 +202,11 @@ struct qfq_sched {
*/
enum update_reason {enqueue, requeue};
+static bool cl_is_active(struct qfq_class *cl)
+{
+ return !list_empty(&cl->alist);
+}
+
static struct qfq_class *qfq_find_class(struct Qdisc *sch, u32 classid)
{
struct qfq_sched *q = qdisc_priv(sch);
@@ -347,7 +352,7 @@ static void qfq_deactivate_class(struct qfq_sched *q, struct qfq_class *cl)
struct qfq_aggregate *agg = cl->agg;
- list_del(&cl->alist); /* remove from RR queue of the aggregate */
+ list_del_init(&cl->alist); /* remove from RR queue of the aggregate */
if (list_empty(&agg->active)) /* agg is now inactive */
qfq_deactivate_agg(q, agg);
}
@@ -474,6 +479,7 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
gnet_stats_basic_sync_init(&cl->bstats);
cl->common.classid = classid;
cl->deficit = lmax;
+ INIT_LIST_HEAD(&cl->alist);
cl->qdisc = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
classid, NULL);
@@ -982,7 +988,7 @@ static struct sk_buff *agg_dequeue(struct qfq_aggregate *agg,
cl->deficit -= (int) len;
if (cl->qdisc->q.qlen == 0) /* no more packets, remove from list */
- list_del(&cl->alist);
+ list_del_init(&cl->alist);
else if (cl->deficit < qdisc_pkt_len(cl->qdisc->ops->peek(cl->qdisc))) {
cl->deficit += agg->lmax;
list_move_tail(&cl->alist, &agg->active);
@@ -1214,7 +1220,6 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
struct qfq_class *cl;
struct qfq_aggregate *agg;
int err = 0;
- bool first;
cl = qfq_classify(skb, sch, &err);
if (cl == NULL) {
@@ -1236,7 +1241,6 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
}
gso_segs = skb_is_gso(skb) ? skb_shinfo(skb)->gso_segs : 1;
- first = !cl->qdisc->q.qlen;
err = qdisc_enqueue(skb, cl->qdisc, to_free);
if (unlikely(err != NET_XMIT_SUCCESS)) {
pr_debug("qfq_enqueue: enqueue failed %d\n", err);
@@ -1252,8 +1256,8 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
++sch->q.qlen;
agg = cl->agg;
- /* if the queue was not empty, then done here */
- if (!first) {
+ /* if the class is active, then done here */
+ if (cl_is_active(cl)) {
if (unlikely(skb == cl->qdisc->ops->peek(cl->qdisc)) &&
list_first_entry(&agg->active, struct qfq_class, alist)
== cl && cl->deficit < len)
@@ -1415,6 +1419,8 @@ static void qfq_qlen_notify(struct Qdisc *sch, unsigned long arg)
struct qfq_sched *q = qdisc_priv(sch);
struct qfq_class *cl = (struct qfq_class *)arg;
+ if (list_empty(&cl->alist))
+ return;
qfq_deactivate_class(q, cl);
}
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index 9ed197e96396..b912ad99aa15 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -631,6 +631,15 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt,
struct red_parms *p = NULL;
struct sk_buff *to_free = NULL;
struct sk_buff *tail = NULL;
+ unsigned int maxflows;
+ unsigned int quantum;
+ unsigned int divisor;
+ int perturb_period;
+ u8 headdrop;
+ u8 maxdepth;
+ int limit;
+ u8 flags;
+
if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
return -EINVAL;
@@ -652,39 +661,64 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt,
if (!p)
return -ENOMEM;
}
- if (ctl->limit == 1) {
- NL_SET_ERR_MSG_MOD(extack, "invalid limit");
- return -EINVAL;
- }
+
sch_tree_lock(sch);
+
+ limit = q->limit;
+ divisor = q->divisor;
+ headdrop = q->headdrop;
+ maxdepth = q->maxdepth;
+ maxflows = q->maxflows;
+ perturb_period = q->perturb_period;
+ quantum = q->quantum;
+ flags = q->flags;
+
+ /* update and validate configuration */
if (ctl->quantum)
- q->quantum = ctl->quantum;
- WRITE_ONCE(q->perturb_period, ctl->perturb_period * HZ);
+ quantum = ctl->quantum;
+ perturb_period = ctl->perturb_period * HZ;
if (ctl->flows)
- q->maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS);
+ maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS);
if (ctl->divisor) {
- q->divisor = ctl->divisor;
- q->maxflows = min_t(u32, q->maxflows, q->divisor);
+ divisor = ctl->divisor;
+ maxflows = min_t(u32, maxflows, divisor);
}
if (ctl_v1) {
if (ctl_v1->depth)
- q->maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH);
+ maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH);
if (p) {
- swap(q->red_parms, p);
- red_set_parms(q->red_parms,
+ red_set_parms(p,
ctl_v1->qth_min, ctl_v1->qth_max,
ctl_v1->Wlog,
ctl_v1->Plog, ctl_v1->Scell_log,
NULL,
ctl_v1->max_P);
}
- q->flags = ctl_v1->flags;
- q->headdrop = ctl_v1->headdrop;
+ flags = ctl_v1->flags;
+ headdrop = ctl_v1->headdrop;
}
if (ctl->limit) {
- q->limit = min_t(u32, ctl->limit, q->maxdepth * q->maxflows);
- q->maxflows = min_t(u32, q->maxflows, q->limit);
+ limit = min_t(u32, ctl->limit, maxdepth * maxflows);
+ maxflows = min_t(u32, maxflows, limit);
}
+ if (limit == 1) {
+ sch_tree_unlock(sch);
+ kfree(p);
+ NL_SET_ERR_MSG_MOD(extack, "invalid limit");
+ return -EINVAL;
+ }
+
+ /* commit configuration */
+ q->limit = limit;
+ q->divisor = divisor;
+ q->headdrop = headdrop;
+ q->maxdepth = maxdepth;
+ q->maxflows = maxflows;
+ WRITE_ONCE(q->perturb_period, perturb_period);
+ q->quantum = quantum;
+ q->flags = flags;
+ if (p)
+ swap(q->red_parms, p);
qlen = sch->q.qlen;
while (sch->q.qlen > q->limit) {