Diffstat (limited to 'kernel')
-rw-r--r--  kernel/audit.c | 6
-rw-r--r--  kernel/bpf/Makefile | 3
-rw-r--r--  kernel/bpf/bpf_struct_ops.c | 2
-rw-r--r--  kernel/bpf/btf.c | 52
-rw-r--r--  kernel/bpf/cgroup.c | 70
-rw-r--r--  kernel/bpf/core.c | 29
-rw-r--r--  kernel/bpf/dmabuf_iter.c | 150
-rw-r--r--  kernel/bpf/hashtab.c | 148
-rw-r--r--  kernel/bpf/helpers.c | 133
-rw-r--r--  kernel/bpf/syscall.c | 10
-rw-r--r--  kernel/bpf/sysfs_btf.c | 32
-rw-r--r--  kernel/bpf/verifier.c | 636
-rw-r--r--  kernel/cgroup/cgroup-internal.h | 6
-rw-r--r--  kernel/cgroup/cgroup.c | 148
-rw-r--r--  kernel/cgroup/cpuset.c | 90
-rw-r--r--  kernel/cgroup/misc.c | 4
-rw-r--r--  kernel/cgroup/rstat.c | 460
-rw-r--r--  kernel/configs/debug.config | 5
-rw-r--r--  kernel/configs/xen.config | 3
-rw-r--r--  kernel/dma/direct.c | 44
-rw-r--r--  kernel/dma/mapping.c | 18
-rw-r--r--  kernel/entry/common.c | 49
-rw-r--r--  kernel/irq/autoprobe.c | 26
-rw-r--r--  kernel/irq/chip.c | 631
-rw-r--r--  kernel/irq/cpuhotplug.c | 12
-rw-r--r--  kernel/irq/debugfs.c | 7
-rw-r--r--  kernel/irq/generic-chip.c | 47
-rw-r--r--  kernel/irq/internals.h | 48
-rw-r--r--  kernel/irq/irqdesc.c | 176
-rw-r--r--  kernel/irq/irqdomain.c | 130
-rw-r--r--  kernel/irq/manage.c | 1166
-rw-r--r--  kernel/irq/msi.c | 192
-rw-r--r--  kernel/irq/pm.c | 38
-rw-r--r--  kernel/irq/proc.c | 67
-rw-r--r--  kernel/irq/resend.c | 50
-rw-r--r--  kernel/irq/spurious.c | 104
-rw-r--r--  kernel/kcsan/kcsan_test.c | 2
-rw-r--r--  kernel/kexec_core.c | 54
-rw-r--r--  kernel/kexec_file.c | 33
-rw-r--r--  kernel/panic.c | 30
-rw-r--r--  kernel/power/energy_model.c | 72
-rw-r--r--  kernel/power/hibernate.c | 23
-rw-r--r--  kernel/power/main.c | 8
-rw-r--r--  kernel/power/power.h | 4
-rw-r--r--  kernel/power/process.c | 8
-rw-r--r--  kernel/power/wakelock.c | 3
-rw-r--r--  kernel/rcu/rcutorture.c | 2
-rw-r--r--  kernel/sched/cpufreq_schedutil.c | 9
-rw-r--r--  kernel/sched/ext.c | 1797
-rw-r--r--  kernel/sched/ext.h | 13
-rw-r--r--  kernel/sched/ext_idle.c | 307
-rw-r--r--  kernel/sched/ext_idle.h | 3
-rw-r--r--  kernel/sched/sched.h | 6
-rw-r--r--  kernel/sched/topology.c | 25
-rw-r--r--  kernel/signal.c | 11
-rw-r--r--  kernel/sysctl-test.c | 49
-rw-r--r--  kernel/sysctl.c | 108
-rw-r--r--  kernel/time/alarmtimer.c | 84
-rw-r--r--  kernel/time/clocksource.c | 2
-rw-r--r--  kernel/time/jiffies.c | 5
-rw-r--r--  kernel/time/posix-timers.c | 23
-rw-r--r--  kernel/time/sleep_timeout.c | 2
-rw-r--r--  kernel/time/timer.c | 78
-rw-r--r--  kernel/trace/bpf_trace.c | 321
-rw-r--r--  kernel/trace/trace.c | 36
-rw-r--r--  kernel/trace/trace_stack.c | 22
-rw-r--r--  kernel/trace/trace_uprobe.c | 2
-rw-r--r--  kernel/workqueue.c | 17
68 files changed, 4250 insertions, 3701 deletions
diff --git a/kernel/audit.c b/kernel/audit.c
index 5f5bf85bcc90..61b5744d0bb6 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1956,8 +1956,8 @@ static inline int audit_expand(struct audit_buffer *ab, int extra)
* will be called a second time. Currently, we assume that a printk
* can't format message larger than 1024 bytes, so we don't either.
*/
-static void audit_log_vformat(struct audit_buffer *ab, const char *fmt,
- va_list args)
+static __printf(2, 0)
+void audit_log_vformat(struct audit_buffer *ab, const char *fmt, va_list args)
{
int len, avail;
struct sk_buff *skb;
@@ -2285,7 +2285,7 @@ void audit_log_path_denied(int type, const char *operation)
{
struct audit_buffer *ab;
- if (!audit_enabled || audit_dummy_context())
+ if (!audit_enabled)
return;
/* Generate log with subject, operation, outcome. */
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 70502f038b92..3a335c50e6e3 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -53,6 +53,9 @@ obj-$(CONFIG_BPF_SYSCALL) += relo_core.o
obj-$(CONFIG_BPF_SYSCALL) += btf_iter.o
obj-$(CONFIG_BPF_SYSCALL) += btf_relocate.o
obj-$(CONFIG_BPF_SYSCALL) += kmem_cache_iter.o
+ifeq ($(CONFIG_DMA_SHARED_BUFFER),y)
+obj-$(CONFIG_BPF_SYSCALL) += dmabuf_iter.o
+endif
CFLAGS_REMOVE_percpu_freelist.o = $(CC_FLAGS_FTRACE)
CFLAGS_REMOVE_bpf_lru_list.o = $(CC_FLAGS_FTRACE)
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index db13ee70d94d..96113633e391 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -601,7 +601,7 @@ int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
if (model->ret_size > 0)
flags |= BPF_TRAMP_F_RET_FENTRY_RET;
- size = arch_bpf_trampoline_size(model, flags, tlinks, NULL);
+ size = arch_bpf_trampoline_size(model, flags, tlinks, stub_func);
if (size <= 0)
return size ? : -EFAULT;
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 16ba36f34dfa..1d2cf898e21e 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -26,6 +26,7 @@
#include <linux/bsearch.h>
#include <linux/kobject.h>
#include <linux/sysfs.h>
+#include <linux/overflow.h>
#include <net/netfilter/nf_bpf_link.h>
@@ -3957,7 +3958,7 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type
/* This needs to be kzalloc to zero out padding and unused fields, see
* comment in btf_record_equal.
*/
- rec = kzalloc(offsetof(struct btf_record, fields[cnt]), GFP_KERNEL | __GFP_NOWARN);
+ rec = kzalloc(struct_size(rec, fields, cnt), GFP_KERNEL | __GFP_NOWARN);
if (!rec)
return ERR_PTR(-ENOMEM);
@@ -5583,7 +5584,7 @@ btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf)
if (id < 0)
continue;
- new_aof = krealloc(aof, offsetof(struct btf_id_set, ids[aof->cnt + 1]),
+ new_aof = krealloc(aof, struct_size(new_aof, ids, aof->cnt + 1),
GFP_KERNEL | __GFP_NOWARN);
if (!new_aof) {
ret = -ENOMEM;
@@ -5610,7 +5611,7 @@ btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf)
if (ret != BTF_FIELD_FOUND)
continue;
- new_aof = krealloc(aof, offsetof(struct btf_id_set, ids[aof->cnt + 1]),
+ new_aof = krealloc(aof, struct_size(new_aof, ids, aof->cnt + 1),
GFP_KERNEL | __GFP_NOWARN);
if (!new_aof) {
ret = -ENOMEM;
@@ -5647,7 +5648,7 @@ btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf)
continue;
parse:
tab_cnt = tab ? tab->cnt : 0;
- new_tab = krealloc(tab, offsetof(struct btf_struct_metas, types[tab_cnt + 1]),
+ new_tab = krealloc(tab, struct_size(new_tab, types, tab_cnt + 1),
GFP_KERNEL | __GFP_NOWARN);
if (!new_tab) {
ret = -ENOMEM;
@@ -6383,16 +6384,15 @@ struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog)
return prog->aux->attach_btf;
}
-static bool is_int_ptr(struct btf *btf, const struct btf_type *t)
+static bool is_void_or_int_ptr(struct btf *btf, const struct btf_type *t)
{
/* skip modifiers */
t = btf_type_skip_modifiers(btf, t->type, NULL);
-
- return btf_type_is_int(t);
+ return btf_type_is_void(t) || btf_type_is_int(t);
}
-static u32 get_ctx_arg_idx(struct btf *btf, const struct btf_type *func_proto,
- int off)
+u32 btf_ctx_arg_idx(struct btf *btf, const struct btf_type *func_proto,
+ int off)
{
const struct btf_param *args;
const struct btf_type *t;
@@ -6541,6 +6541,7 @@ static const struct bpf_raw_tp_null_args raw_tp_null_args[] = {
{ "xprt_put_cong", 0x10 },
/* tcp */
{ "tcp_send_reset", 0x11 },
+ { "tcp_sendmsg_locked", 0x100 },
/* tegra_apb_dma */
{ "tegra_dma_tx_status", 0x100 },
/* timer_migration */
@@ -6671,7 +6672,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
tname, off);
return false;
}
- arg = get_ctx_arg_idx(btf, t, off);
+ arg = btf_ctx_arg_idx(btf, t, off);
args = (const struct btf_param *)(t + 1);
/* if (t == NULL) Fall back to default BPF prog with
* MAX_BPF_FUNC_REG_ARGS u64 arguments.
@@ -6776,14 +6777,11 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
}
}
- if (t->type == 0)
- /* This is a pointer to void.
- * It is the same as scalar from the verifier safety pov.
- * No further pointer walking is allowed.
- */
- return true;
-
- if (is_int_ptr(btf, t))
+ /*
+ * If it's a pointer to void, it's the same as scalar from the verifier
+ * safety POV. Either way, no further pointer walking is allowed.
+ */
+ if (is_void_or_int_ptr(btf, t))
return true;
/* this is a pointer to another type */
@@ -6829,10 +6827,10 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
/* Is this a func with potential NULL args? */
if (strcmp(tname, raw_tp_null_args[i].func))
continue;
- if (raw_tp_null_args[i].mask & (0x1 << (arg * 4)))
+ if (raw_tp_null_args[i].mask & (0x1ULL << (arg * 4)))
info->reg_type |= PTR_MAYBE_NULL;
/* Is the current arg IS_ERR? */
- if (raw_tp_null_args[i].mask & (0x2 << (arg * 4)))
+ if (raw_tp_null_args[i].mask & (0x2ULL << (arg * 4)))
ptr_err_raw_tp = true;
break;
}
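
For context, each tracepoint argument owns a 4-bit nibble in these masks: bit 0x1 marks an argument that may be NULL and bit 0x2 marks one that may be an IS_ERR() pointer, which is why the shifts above must be 64-bit once higher argument indices are involved. A small worked check for the tcp_sendmsg_locked entry added earlier in this file (illustrative snippet, not part of the patch):

	/* { "tcp_sendmsg_locked", 0x100 }: which argument may be NULL? */
	u64 mask = 0x100;
	int arg = 2;						/* third tracepoint argument, zero-based */
	bool may_be_null = mask & (0x1ULL << (arg * 4));	/* 0x1ULL << 8 == 0x100 -> true  */
	bool may_be_err  = mask & (0x2ULL << (arg * 4));	/* 0x2ULL << 8 == 0x200 -> false */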
@@ -7662,7 +7660,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog)
return 0;
if (!prog->aux->func_info) {
- bpf_log(log, "Verifier bug\n");
+ verifier_bug(env, "func_info undefined");
return -EFAULT;
}
@@ -7686,7 +7684,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog)
tname = btf_name_by_offset(btf, fn_t->name_off);
if (prog->aux->func_info_aux[subprog].unreliable) {
- bpf_log(log, "Verifier bug in function %s()\n", tname);
+ verifier_bug(env, "unreliable BTF for function %s()", tname);
return -EFAULT;
}
if (prog_type == BPF_PROG_TYPE_EXT)
@@ -8563,7 +8561,7 @@ static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook,
/* Grow set */
set = krealloc(tab->sets[hook],
- offsetof(struct btf_id_set8, pairs[set_cnt + add_set->cnt]),
+ struct_size(set, pairs, set_cnt + add_set->cnt),
GFP_KERNEL | __GFP_NOWARN);
if (!set) {
ret = -ENOMEM;
@@ -8849,7 +8847,7 @@ int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_c
}
tab = krealloc(btf->dtor_kfunc_tab,
- offsetof(struct btf_id_dtor_kfunc_tab, dtors[tab_cnt + add_cnt]),
+ struct_size(tab, dtors, tab_cnt + add_cnt),
GFP_KERNEL | __GFP_NOWARN);
if (!tab) {
ret = -ENOMEM;
@@ -9407,8 +9405,7 @@ btf_add_struct_ops(struct btf *btf, struct bpf_struct_ops *st_ops,
tab = btf->struct_ops_tab;
if (!tab) {
- tab = kzalloc(offsetof(struct btf_struct_ops_tab, ops[4]),
- GFP_KERNEL);
+ tab = kzalloc(struct_size(tab, ops, 4), GFP_KERNEL);
if (!tab)
return -ENOMEM;
tab->capacity = 4;
@@ -9421,8 +9418,7 @@ btf_add_struct_ops(struct btf *btf, struct bpf_struct_ops *st_ops,
if (tab->cnt == tab->capacity) {
new_tab = krealloc(tab,
- offsetof(struct btf_struct_ops_tab,
- ops[tab->capacity * 2]),
+ struct_size(tab, ops, tab->capacity * 2),
GFP_KERNEL);
if (!new_tab)
return -ENOMEM;
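
The offsetof(struct X, flex[n]) allocations converted above size a header plus a trailing flexible array by hand; struct_size() from <linux/overflow.h> computes that kind of size while saturating to SIZE_MAX instead of wrapping when n overflows, so a huge count makes the allocation fail rather than come up short. A minimal sketch of the pattern with a hypothetical struct (not from this file):

	#include <linux/overflow.h>
	#include <linux/slab.h>
	#include <linux/types.h>

	struct flex_tab {
		u32 cnt;
		u64 items[];		/* trailing flexible array */
	};

	static struct flex_tab *flex_tab_alloc(u32 cnt)
	{
		/* header + cnt * sizeof(u64), saturating on overflow so kzalloc()
		 * fails instead of under-allocating
		 */
		struct flex_tab *tab = kzalloc(struct_size(tab, items, cnt), GFP_KERNEL);

		if (tab)
			tab->cnt = cnt;
		return tab;
	}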
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 84f58f3d028a..9122c39870bf 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -41,6 +41,19 @@ static int __init cgroup_bpf_wq_init(void)
}
core_initcall(cgroup_bpf_wq_init);
+static int cgroup_bpf_lifetime_notify(struct notifier_block *nb,
+ unsigned long action, void *data);
+
+static struct notifier_block cgroup_bpf_lifetime_nb = {
+ .notifier_call = cgroup_bpf_lifetime_notify,
+};
+
+void __init cgroup_bpf_lifetime_notifier_init(void)
+{
+ BUG_ON(blocking_notifier_chain_register(&cgroup_lifetime_notifier,
+ &cgroup_bpf_lifetime_nb));
+}
+
/* __always_inline is necessary to prevent indirect call through run_prog
* function pointer.
*/
@@ -206,7 +219,7 @@ bpf_cgroup_atype_find(enum bpf_attach_type attach_type, u32 attach_btf_id)
}
#endif /* CONFIG_BPF_LSM */
-void cgroup_bpf_offline(struct cgroup *cgrp)
+static void cgroup_bpf_offline(struct cgroup *cgrp)
{
cgroup_get(cgrp);
percpu_ref_kill(&cgrp->bpf.refcnt);
@@ -491,7 +504,7 @@ static void activate_effective_progs(struct cgroup *cgrp,
* cgroup_bpf_inherit() - inherit effective programs from parent
* @cgrp: the cgroup to modify
*/
-int cgroup_bpf_inherit(struct cgroup *cgrp)
+static int cgroup_bpf_inherit(struct cgroup *cgrp)
{
/* has to use macro instead of const int, since compiler thinks
* that array below is variable length
@@ -534,6 +547,27 @@ cleanup:
return -ENOMEM;
}
+static int cgroup_bpf_lifetime_notify(struct notifier_block *nb,
+ unsigned long action, void *data)
+{
+ struct cgroup *cgrp = data;
+ int ret = 0;
+
+ if (cgrp->root != &cgrp_dfl_root)
+ return NOTIFY_OK;
+
+ switch (action) {
+ case CGROUP_LIFETIME_ONLINE:
+ ret = cgroup_bpf_inherit(cgrp);
+ break;
+ case CGROUP_LIFETIME_OFFLINE:
+ cgroup_bpf_offline(cgrp);
+ break;
+ }
+
+ return notifier_from_errno(ret);
+}
+
static int update_effective_progs(struct cgroup *cgrp,
enum cgroup_bpf_attach_type atype)
{
@@ -1653,10 +1687,6 @@ cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
if (func_proto)
return func_proto;
- func_proto = cgroup_current_func_proto(func_id, prog);
- if (func_proto)
- return func_proto;
-
switch (func_id) {
case BPF_FUNC_perf_event_output:
return &bpf_event_output_data_proto;
@@ -2204,10 +2234,6 @@ sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
if (func_proto)
return func_proto;
- func_proto = cgroup_current_func_proto(func_id, prog);
- if (func_proto)
- return func_proto;
-
switch (func_id) {
case BPF_FUNC_sysctl_get_name:
return &bpf_sysctl_get_name_proto;
@@ -2351,10 +2377,6 @@ cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
if (func_proto)
return func_proto;
- func_proto = cgroup_current_func_proto(func_id, prog);
- if (func_proto)
- return func_proto;
-
switch (func_id) {
#ifdef CONFIG_NET
case BPF_FUNC_get_netns_cookie:
@@ -2601,23 +2623,3 @@ cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return NULL;
}
}
-
-/* Common helpers for cgroup hooks with valid process context. */
-const struct bpf_func_proto *
-cgroup_current_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
-{
- switch (func_id) {
- case BPF_FUNC_get_current_uid_gid:
- return &bpf_get_current_uid_gid_proto;
- case BPF_FUNC_get_current_comm:
- return &bpf_get_current_comm_proto;
-#ifdef CONFIG_CGROUP_NET_CLASSID
- case BPF_FUNC_get_cgroup_classid:
- return &bpf_get_cgroup_classid_curr_proto;
-#endif
- case BPF_FUNC_current_task_under_cgroup:
- return &bpf_current_task_under_cgroup_proto;
- default:
- return NULL;
- }
-}
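
With the notifier conversion above, cgroup.c no longer calls into BPF directly on cgroup creation and destruction; cgroup-BPF state is now driven by CGROUP_LIFETIME_ONLINE/OFFLINE events delivered over the cgroup_lifetime_notifier chain. A hedged sketch of the same pattern for a hypothetical additional client, assuming the notifier head and lifetime events are visible to it (every name other than the chain, the events and the notifier API is illustrative):

	static int my_cgroup_lifetime_notify(struct notifier_block *nb,
					     unsigned long action, void *data)
	{
		/* data is the struct cgroup going online or offline */
		switch (action) {
		case CGROUP_LIFETIME_ONLINE:
			/* set up per-cgroup state here; report failure via
			 * notifier_from_errno() as cgroup_bpf_lifetime_notify() does
			 */
			break;
		case CGROUP_LIFETIME_OFFLINE:
			/* tear the per-cgroup state down */
			break;
		}
		return NOTIFY_OK;
	}

	static struct notifier_block my_cgroup_lifetime_nb = {
		.notifier_call = my_cgroup_lifetime_notify,
	};

	static int __init my_cgroup_client_init(void)
	{
		return blocking_notifier_chain_register(&cgroup_lifetime_notifier,
							&my_cgroup_lifetime_nb);
	}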
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index ba6b6118cf50..c20babbf998f 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2358,8 +2358,8 @@ static unsigned int __bpf_prog_ret0_warn(const void *ctx,
return 0;
}
-bool bpf_prog_map_compatible(struct bpf_map *map,
- const struct bpf_prog *fp)
+static bool __bpf_prog_map_compatible(struct bpf_map *map,
+ const struct bpf_prog *fp)
{
enum bpf_prog_type prog_type = resolve_prog_type(fp);
bool ret;
@@ -2368,14 +2368,6 @@ bool bpf_prog_map_compatible(struct bpf_map *map,
if (fp->kprobe_override)
return false;
- /* XDP programs inserted into maps are not guaranteed to run on
- * a particular netdev (and can run outside driver context entirely
- * in the case of devmap and cpumap). Until device checks
- * are implemented, prohibit adding dev-bound programs to program maps.
- */
- if (bpf_prog_is_dev_bound(aux))
- return false;
-
spin_lock(&map->owner.lock);
if (!map->owner.type) {
/* There's no owner yet where we could check for
@@ -2409,6 +2401,19 @@ bool bpf_prog_map_compatible(struct bpf_map *map,
return ret;
}
+bool bpf_prog_map_compatible(struct bpf_map *map, const struct bpf_prog *fp)
+{
+ /* XDP programs inserted into maps are not guaranteed to run on
+ * a particular netdev (and can run outside driver context entirely
+ * in the case of devmap and cpumap). Until device checks
+ * are implemented, prohibit adding dev-bound programs to program maps.
+ */
+ if (bpf_prog_is_dev_bound(fp->aux))
+ return false;
+
+ return __bpf_prog_map_compatible(map, fp);
+}
+
static int bpf_check_tail_call(const struct bpf_prog *fp)
{
struct bpf_prog_aux *aux = fp->aux;
@@ -2421,7 +2426,7 @@ static int bpf_check_tail_call(const struct bpf_prog *fp)
if (!map_type_contains_progs(map))
continue;
- if (!bpf_prog_map_compatible(map, fp)) {
+ if (!__bpf_prog_map_compatible(map, fp)) {
ret = -EINVAL;
goto out;
}
@@ -2469,7 +2474,7 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
/* In case of BPF to BPF calls, verifier did all the prep
* work with regards to JITing, etc.
*/
- bool jit_needed = false;
+ bool jit_needed = fp->jit_requested;
if (fp->bpf_func)
goto finalize;
diff --git a/kernel/bpf/dmabuf_iter.c b/kernel/bpf/dmabuf_iter.c
new file mode 100644
index 000000000000..4dd7ef7c145c
--- /dev/null
+++ b/kernel/bpf/dmabuf_iter.c
@@ -0,0 +1,150 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2025 Google LLC */
+#include <linux/bpf.h>
+#include <linux/btf_ids.h>
+#include <linux/dma-buf.h>
+#include <linux/kernel.h>
+#include <linux/seq_file.h>
+
+static void *dmabuf_iter_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ if (*pos)
+ return NULL;
+
+ return dma_buf_iter_begin();
+}
+
+static void *dmabuf_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct dma_buf *dmabuf = v;
+
+ ++*pos;
+
+ return dma_buf_iter_next(dmabuf);
+}
+
+struct bpf_iter__dmabuf {
+ __bpf_md_ptr(struct bpf_iter_meta *, meta);
+ __bpf_md_ptr(struct dma_buf *, dmabuf);
+};
+
+static int __dmabuf_seq_show(struct seq_file *seq, void *v, bool in_stop)
+{
+ struct bpf_iter_meta meta = {
+ .seq = seq,
+ };
+ struct bpf_iter__dmabuf ctx = {
+ .meta = &meta,
+ .dmabuf = v,
+ };
+ struct bpf_prog *prog = bpf_iter_get_info(&meta, in_stop);
+
+ if (prog)
+ return bpf_iter_run_prog(prog, &ctx);
+
+ return 0;
+}
+
+static int dmabuf_iter_seq_show(struct seq_file *seq, void *v)
+{
+ return __dmabuf_seq_show(seq, v, false);
+}
+
+static void dmabuf_iter_seq_stop(struct seq_file *seq, void *v)
+{
+ struct dma_buf *dmabuf = v;
+
+ if (dmabuf)
+ dma_buf_put(dmabuf);
+}
+
+static const struct seq_operations dmabuf_iter_seq_ops = {
+ .start = dmabuf_iter_seq_start,
+ .next = dmabuf_iter_seq_next,
+ .stop = dmabuf_iter_seq_stop,
+ .show = dmabuf_iter_seq_show,
+};
+
+static void bpf_iter_dmabuf_show_fdinfo(const struct bpf_iter_aux_info *aux,
+ struct seq_file *seq)
+{
+ seq_puts(seq, "dmabuf iter\n");
+}
+
+static const struct bpf_iter_seq_info dmabuf_iter_seq_info = {
+ .seq_ops = &dmabuf_iter_seq_ops,
+ .init_seq_private = NULL,
+ .fini_seq_private = NULL,
+ .seq_priv_size = 0,
+};
+
+static struct bpf_iter_reg bpf_dmabuf_reg_info = {
+ .target = "dmabuf",
+ .feature = BPF_ITER_RESCHED,
+ .show_fdinfo = bpf_iter_dmabuf_show_fdinfo,
+ .ctx_arg_info_size = 1,
+ .ctx_arg_info = {
+ { offsetof(struct bpf_iter__dmabuf, dmabuf),
+ PTR_TO_BTF_ID_OR_NULL },
+ },
+ .seq_info = &dmabuf_iter_seq_info,
+};
+
+DEFINE_BPF_ITER_FUNC(dmabuf, struct bpf_iter_meta *meta, struct dma_buf *dmabuf)
+BTF_ID_LIST_SINGLE(bpf_dmabuf_btf_id, struct, dma_buf)
+
+static int __init dmabuf_iter_init(void)
+{
+ bpf_dmabuf_reg_info.ctx_arg_info[0].btf_id = bpf_dmabuf_btf_id[0];
+ return bpf_iter_reg_target(&bpf_dmabuf_reg_info);
+}
+
+late_initcall(dmabuf_iter_init);
+
+struct bpf_iter_dmabuf {
+ /*
+ * opaque iterator state; having __u64 here allows to preserve correct
+ * alignment requirements in vmlinux.h, generated from BTF
+ */
+ __u64 __opaque[1];
+} __aligned(8);
+
+/* Non-opaque version of bpf_iter_dmabuf */
+struct bpf_iter_dmabuf_kern {
+ struct dma_buf *dmabuf;
+} __aligned(8);
+
+__bpf_kfunc_start_defs();
+
+__bpf_kfunc int bpf_iter_dmabuf_new(struct bpf_iter_dmabuf *it)
+{
+ struct bpf_iter_dmabuf_kern *kit = (void *)it;
+
+ BUILD_BUG_ON(sizeof(*kit) > sizeof(*it));
+ BUILD_BUG_ON(__alignof__(*kit) != __alignof__(*it));
+
+ kit->dmabuf = NULL;
+ return 0;
+}
+
+__bpf_kfunc struct dma_buf *bpf_iter_dmabuf_next(struct bpf_iter_dmabuf *it)
+{
+ struct bpf_iter_dmabuf_kern *kit = (void *)it;
+
+ if (kit->dmabuf)
+ kit->dmabuf = dma_buf_iter_next(kit->dmabuf);
+ else
+ kit->dmabuf = dma_buf_iter_begin();
+
+ return kit->dmabuf;
+}
+
+__bpf_kfunc void bpf_iter_dmabuf_destroy(struct bpf_iter_dmabuf *it)
+{
+ struct bpf_iter_dmabuf_kern *kit = (void *)it;
+
+ if (kit->dmabuf)
+ dma_buf_put(kit->dmabuf);
+}
+
+__bpf_kfunc_end_defs();
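
Taken together, the new file exposes the same walk over the system's dma-bufs in two forms: a classic seq_file-backed bpf_iter target ("dmabuf") and the open-coded bpf_iter_dmabuf_{new,next,destroy} kfuncs. A rough BPF-side sketch of the open-coded form, assuming the kfunc externs are available (via vmlinux.h or explicit __ksym declarations as below) and that the program is sleepable, since the kfuncs are registered KF_SLEEPABLE:

	#include <vmlinux.h>
	#include <bpf/bpf_helpers.h>

	char LICENSE[] SEC("license") = "GPL";

	extern int bpf_iter_dmabuf_new(struct bpf_iter_dmabuf *it) __weak __ksym;
	extern struct dma_buf *bpf_iter_dmabuf_next(struct bpf_iter_dmabuf *it) __weak __ksym;
	extern void bpf_iter_dmabuf_destroy(struct bpf_iter_dmabuf *it) __weak __ksym;

	SEC("syscall")
	int count_dmabufs(void *ctx)
	{
		struct bpf_iter_dmabuf it;
		struct dma_buf *d;
		int cnt = 0;

		if (bpf_iter_dmabuf_new(&it))
			return 0;
		while ((d = bpf_iter_dmabuf_next(&it)) != NULL)
			cnt++;
		bpf_iter_dmabuf_destroy(&it);
		bpf_printk("dma-bufs seen: %d", cnt);
		return 0;
	}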
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 92b606d60020..71f9931ac64c 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -175,20 +175,30 @@ static bool htab_is_percpu(const struct bpf_htab *htab)
htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
}
+static inline bool is_fd_htab(const struct bpf_htab *htab)
+{
+ return htab->map.map_type == BPF_MAP_TYPE_HASH_OF_MAPS;
+}
+
+static inline void *htab_elem_value(struct htab_elem *l, u32 key_size)
+{
+ return l->key + round_up(key_size, 8);
+}
+
static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size,
void __percpu *pptr)
{
- *(void __percpu **)(l->key + roundup(key_size, 8)) = pptr;
+ *(void __percpu **)htab_elem_value(l, key_size) = pptr;
}
static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size)
{
- return *(void __percpu **)(l->key + roundup(key_size, 8));
+ return *(void __percpu **)htab_elem_value(l, key_size);
}
static void *fd_htab_map_get_ptr(const struct bpf_map *map, struct htab_elem *l)
{
- return *(void **)(l->key + roundup(map->key_size, 8));
+ return *(void **)htab_elem_value(l, map->key_size);
}
static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i)
@@ -196,9 +206,13 @@ static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i)
return (struct htab_elem *) (htab->elems + i * (u64)htab->elem_size);
}
+/* Both percpu and fd htab support in-place update, so no need for
+ * extra elem. LRU itself can remove the least used element, so
+ * there is no need for an extra elem during map_update.
+ */
static bool htab_has_extra_elems(struct bpf_htab *htab)
{
- return !htab_is_percpu(htab) && !htab_is_lru(htab);
+ return !htab_is_percpu(htab) && !htab_is_lru(htab) && !is_fd_htab(htab);
}
static void htab_free_prealloced_timers_and_wq(struct bpf_htab *htab)
@@ -215,10 +229,10 @@ static void htab_free_prealloced_timers_and_wq(struct bpf_htab *htab)
elem = get_htab_elem(htab, i);
if (btf_record_has_field(htab->map.record, BPF_TIMER))
bpf_obj_free_timer(htab->map.record,
- elem->key + round_up(htab->map.key_size, 8));
+ htab_elem_value(elem, htab->map.key_size));
if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE))
bpf_obj_free_workqueue(htab->map.record,
- elem->key + round_up(htab->map.key_size, 8));
+ htab_elem_value(elem, htab->map.key_size));
cond_resched();
}
}
@@ -245,7 +259,8 @@ static void htab_free_prealloced_fields(struct bpf_htab *htab)
cond_resched();
}
} else {
- bpf_obj_free_fields(htab->map.record, elem->key + round_up(htab->map.key_size, 8));
+ bpf_obj_free_fields(htab->map.record,
+ htab_elem_value(elem, htab->map.key_size));
cond_resched();
}
cond_resched();
@@ -453,8 +468,6 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
{
bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
- bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH ||
- attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
/* percpu_lru means each cpu has its own LRU list.
* it is different from BPF_MAP_TYPE_PERCPU_HASH where
* the map's value itself is percpu. percpu_lru has
@@ -549,10 +562,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
if (err)
goto free_map_locked;
- if (!percpu && !lru) {
- /* lru itself can remove the least used element, so
- * there is no need for an extra elem during map_update.
- */
+ if (htab_has_extra_elems(htab)) {
err = alloc_extra_elems(htab);
if (err)
goto free_prealloc;
@@ -670,7 +680,7 @@ static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
struct htab_elem *l = __htab_map_lookup_elem(map, key);
if (l)
- return l->key + round_up(map->key_size, 8);
+ return htab_elem_value(l, map->key_size);
return NULL;
}
@@ -709,7 +719,7 @@ static __always_inline void *__htab_lru_map_lookup_elem(struct bpf_map *map,
if (l) {
if (mark)
bpf_lru_node_set_ref(&l->lru_node);
- return l->key + round_up(map->key_size, 8);
+ return htab_elem_value(l, map->key_size);
}
return NULL;
@@ -763,7 +773,7 @@ static void check_and_free_fields(struct bpf_htab *htab,
for_each_possible_cpu(cpu)
bpf_obj_free_fields(htab->map.record, per_cpu_ptr(pptr, cpu));
} else {
- void *map_value = elem->key + round_up(htab->map.key_size, 8);
+ void *map_value = htab_elem_value(elem, htab->map.key_size);
bpf_obj_free_fields(htab->map.record, map_value);
}
@@ -968,8 +978,7 @@ static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr,
static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab)
{
- return htab->map.map_type == BPF_MAP_TYPE_HASH_OF_MAPS &&
- BITS_PER_LONG == 64;
+ return is_fd_htab(htab) && BITS_PER_LONG == 64;
}
static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
@@ -1039,11 +1048,9 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
htab_elem_set_ptr(l_new, key_size, pptr);
} else if (fd_htab_map_needs_adjust(htab)) {
size = round_up(size, 8);
- memcpy(l_new->key + round_up(key_size, 8), value, size);
+ memcpy(htab_elem_value(l_new, key_size), value, size);
} else {
- copy_map_value(&htab->map,
- l_new->key + round_up(key_size, 8),
- value);
+ copy_map_value(&htab->map, htab_elem_value(l_new, key_size), value);
}
l_new->hash = hash;
@@ -1072,10 +1079,9 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value,
u64 map_flags)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
- struct htab_elem *l_new = NULL, *l_old;
+ struct htab_elem *l_new, *l_old;
struct hlist_nulls_head *head;
unsigned long flags;
- void *old_map_ptr;
struct bucket *b;
u32 key_size, hash;
int ret;
@@ -1106,7 +1112,7 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value,
if (l_old) {
/* grab the element lock and update value in place */
copy_map_value_locked(map,
- l_old->key + round_up(key_size, 8),
+ htab_elem_value(l_old, key_size),
value, false);
return 0;
}
@@ -1134,7 +1140,7 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value,
* and update element in place
*/
copy_map_value_locked(map,
- l_old->key + round_up(key_size, 8),
+ htab_elem_value(l_old, key_size),
value, false);
ret = 0;
goto err;
@@ -1156,24 +1162,14 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value,
hlist_nulls_del_rcu(&l_old->hash_node);
/* l_old has already been stashed in htab->extra_elems, free
- * its special fields before it is available for reuse. Also
- * save the old map pointer in htab of maps before unlock
- * and release it after unlock.
+ * its special fields before it is available for reuse.
*/
- old_map_ptr = NULL;
- if (htab_is_prealloc(htab)) {
- if (map->ops->map_fd_put_ptr)
- old_map_ptr = fd_htab_map_get_ptr(map, l_old);
+ if (htab_is_prealloc(htab))
check_and_free_fields(htab, l_old);
- }
}
htab_unlock_bucket(b, flags);
- if (l_old) {
- if (old_map_ptr)
- map->ops->map_fd_put_ptr(map, old_map_ptr, true);
- if (!htab_is_prealloc(htab))
- free_htab_elem(htab, l_old);
- }
+ if (l_old && !htab_is_prealloc(htab))
+ free_htab_elem(htab, l_old);
return 0;
err:
htab_unlock_bucket(b, flags);
@@ -1220,8 +1216,7 @@ static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value
l_new = prealloc_lru_pop(htab, key, hash);
if (!l_new)
return -ENOMEM;
- copy_map_value(&htab->map,
- l_new->key + round_up(map->key_size, 8), value);
+ copy_map_value(&htab->map, htab_elem_value(l_new, map->key_size), value);
ret = htab_lock_bucket(b, &flags);
if (ret)
@@ -1255,13 +1250,14 @@ err_lock_bucket:
return ret;
}
-static long __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
+static long htab_map_update_elem_in_place(struct bpf_map *map, void *key,
void *value, u64 map_flags,
- bool onallcpus)
+ bool percpu, bool onallcpus)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
- struct htab_elem *l_new = NULL, *l_old;
+ struct htab_elem *l_new, *l_old;
struct hlist_nulls_head *head;
+ void *old_map_ptr = NULL;
unsigned long flags;
struct bucket *b;
u32 key_size, hash;
@@ -1292,21 +1288,29 @@ static long __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
goto err;
if (l_old) {
- /* per-cpu hash map can update value in-place */
- pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
- value, onallcpus);
+ /* Update value in-place */
+ if (percpu) {
+ pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
+ value, onallcpus);
+ } else {
+ void **inner_map_pptr = htab_elem_value(l_old, key_size);
+
+ old_map_ptr = *inner_map_pptr;
+ WRITE_ONCE(*inner_map_pptr, *(void **)value);
+ }
} else {
l_new = alloc_htab_elem(htab, key, value, key_size,
- hash, true, onallcpus, NULL);
+ hash, percpu, onallcpus, NULL);
if (IS_ERR(l_new)) {
ret = PTR_ERR(l_new);
goto err;
}
hlist_nulls_add_head_rcu(&l_new->hash_node, head);
}
- ret = 0;
err:
htab_unlock_bucket(b, flags);
+ if (old_map_ptr)
+ map->ops->map_fd_put_ptr(map, old_map_ptr, true);
return ret;
}
@@ -1383,7 +1387,7 @@ err_lock_bucket:
static long htab_percpu_map_update_elem(struct bpf_map *map, void *key,
void *value, u64 map_flags)
{
- return __htab_percpu_map_update_elem(map, key, value, map_flags, false);
+ return htab_map_update_elem_in_place(map, key, value, map_flags, true, false);
}
static long htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
@@ -1500,10 +1504,10 @@ static void htab_free_malloced_timers_and_wq(struct bpf_htab *htab)
/* We only free timer on uref dropping to zero */
if (btf_record_has_field(htab->map.record, BPF_TIMER))
bpf_obj_free_timer(htab->map.record,
- l->key + round_up(htab->map.key_size, 8));
+ htab_elem_value(l, htab->map.key_size));
if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE))
bpf_obj_free_workqueue(htab->map.record,
- l->key + round_up(htab->map.key_size, 8));
+ htab_elem_value(l, htab->map.key_size));
}
cond_resched_rcu();
}
@@ -1615,15 +1619,12 @@ static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
off += roundup_value_size;
}
} else {
- u32 roundup_key_size = round_up(map->key_size, 8);
+ void *src = htab_elem_value(l, map->key_size);
if (flags & BPF_F_LOCK)
- copy_map_value_locked(map, value, l->key +
- roundup_key_size,
- true);
+ copy_map_value_locked(map, value, src, true);
else
- copy_map_value(map, value, l->key +
- roundup_key_size);
+ copy_map_value(map, value, src);
/* Zeroing special fields in the temp buffer */
check_and_init_map_value(map, value);
}
@@ -1680,12 +1681,12 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,
bool is_percpu)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
- u32 bucket_cnt, total, key_size, value_size, roundup_key_size;
void *keys = NULL, *values = NULL, *value, *dst_key, *dst_val;
void __user *uvalues = u64_to_user_ptr(attr->batch.values);
void __user *ukeys = u64_to_user_ptr(attr->batch.keys);
void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch);
u32 batch, max_count, size, bucket_size, map_id;
+ u32 bucket_cnt, total, key_size, value_size;
struct htab_elem *node_to_free = NULL;
u64 elem_map_flags, map_flags;
struct hlist_nulls_head *head;
@@ -1720,7 +1721,6 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,
return -ENOENT;
key_size = htab->map.key_size;
- roundup_key_size = round_up(htab->map.key_size, 8);
value_size = htab->map.value_size;
size = round_up(value_size, 8);
if (is_percpu)
@@ -1812,8 +1812,8 @@ again_nocopy:
off += size;
}
} else {
- value = l->key + roundup_key_size;
- if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) {
+ value = htab_elem_value(l, key_size);
+ if (is_fd_htab(htab)) {
struct bpf_map **inner_map = value;
/* Actual value is the id of the inner map */
@@ -2063,11 +2063,11 @@ static void *bpf_hash_map_seq_next(struct seq_file *seq, void *v, loff_t *pos)
static int __bpf_hash_map_seq_show(struct seq_file *seq, struct htab_elem *elem)
{
struct bpf_iter_seq_hash_map_info *info = seq->private;
- u32 roundup_key_size, roundup_value_size;
struct bpf_iter__bpf_map_elem ctx = {};
struct bpf_map *map = info->map;
struct bpf_iter_meta meta;
int ret = 0, off = 0, cpu;
+ u32 roundup_value_size;
struct bpf_prog *prog;
void __percpu *pptr;
@@ -2077,10 +2077,9 @@ static int __bpf_hash_map_seq_show(struct seq_file *seq, struct htab_elem *elem)
ctx.meta = &meta;
ctx.map = info->map;
if (elem) {
- roundup_key_size = round_up(map->key_size, 8);
ctx.key = elem->key;
if (!info->percpu_value_buf) {
- ctx.value = elem->key + roundup_key_size;
+ ctx.value = htab_elem_value(elem, map->key_size);
} else {
roundup_value_size = round_up(map->value_size, 8);
pptr = htab_elem_get_ptr(elem, map->key_size);
@@ -2165,7 +2164,6 @@ static long bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_
struct hlist_nulls_head *head;
struct hlist_nulls_node *n;
struct htab_elem *elem;
- u32 roundup_key_size;
int i, num_elems = 0;
void __percpu *pptr;
struct bucket *b;
@@ -2180,7 +2178,6 @@ static long bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_
is_percpu = htab_is_percpu(htab);
- roundup_key_size = round_up(map->key_size, 8);
/* migration has been disabled, so percpu value prepared here will be
* the same as the one seen by the bpf program with
* bpf_map_lookup_elem().
@@ -2196,7 +2193,7 @@ static long bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_
pptr = htab_elem_get_ptr(elem, map->key_size);
val = this_cpu_ptr(pptr);
} else {
- val = elem->key + roundup_key_size;
+ val = htab_elem_value(elem, map->key_size);
}
num_elems++;
ret = callback_fn((u64)(long)map, (u64)(long)key,
@@ -2411,8 +2408,8 @@ int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
ret = __htab_lru_percpu_map_update_elem(map, key, value,
map_flags, true);
else
- ret = __htab_percpu_map_update_elem(map, key, value, map_flags,
- true);
+ ret = htab_map_update_elem_in_place(map, key, value, map_flags,
+ true, true);
rcu_read_unlock();
return ret;
@@ -2536,24 +2533,23 @@ int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value)
return ret;
}
-/* only called from syscall */
+/* Only called from syscall */
int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file,
void *key, void *value, u64 map_flags)
{
void *ptr;
int ret;
- u32 ufd = *(u32 *)value;
- ptr = map->ops->map_fd_get_ptr(map, map_file, ufd);
+ ptr = map->ops->map_fd_get_ptr(map, map_file, *(int *)value);
if (IS_ERR(ptr))
return PTR_ERR(ptr);
/* The htab bucket lock is always held during update operations in fd
* htab map, and the following rcu_read_lock() is only used to avoid
- * the WARN_ON_ONCE in htab_map_update_elem().
+ * the WARN_ON_ONCE in htab_map_update_elem_in_place().
*/
rcu_read_lock();
- ret = htab_map_update_elem(map, key, &ptr, map_flags);
+ ret = htab_map_update_elem_in_place(map, key, &ptr, map_flags, false, false);
rcu_read_unlock();
if (ret)
map->ops->map_fd_put_ptr(map, ptr, false);
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index e3a2662f4e33..b71e428ad936 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -23,6 +23,7 @@
#include <linux/btf_ids.h>
#include <linux/bpf_mem_alloc.h>
#include <linux/kasan.h>
+#include <linux/bpf_verifier.h>
#include "../../lib/kstrtox.h"
@@ -129,7 +130,8 @@ const struct bpf_func_proto bpf_map_peek_elem_proto = {
BPF_CALL_3(bpf_map_lookup_percpu_elem, struct bpf_map *, map, void *, key, u32, cpu)
{
- WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
+ WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
+ !rcu_read_lock_bh_held());
return (unsigned long) map->ops->map_lookup_percpu_elem(map, key, cpu);
}
@@ -1713,16 +1715,6 @@ void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr)
memset(ptr, 0, sizeof(*ptr));
}
-static int bpf_dynptr_check_off_len(const struct bpf_dynptr_kern *ptr, u32 offset, u32 len)
-{
- u32 size = __bpf_dynptr_size(ptr);
-
- if (len > size || offset > size - len)
- return -E2BIG;
-
- return 0;
-}
-
BPF_CALL_4(bpf_dynptr_from_mem, void *, data, u32, size, u64, flags, struct bpf_dynptr_kern *, ptr)
{
int err;
@@ -1809,8 +1801,8 @@ static const struct bpf_func_proto bpf_dynptr_read_proto = {
.arg5_type = ARG_ANYTHING,
};
-static int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u32 offset, void *src,
- u32 len, u64 flags)
+int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u32 offset, void *src,
+ u32 len, u64 flags)
{
enum bpf_dynptr_type type;
int err;
@@ -1912,6 +1904,12 @@ const struct bpf_func_proto bpf_probe_read_user_str_proto __weak;
const struct bpf_func_proto bpf_probe_read_kernel_proto __weak;
const struct bpf_func_proto bpf_probe_read_kernel_str_proto __weak;
const struct bpf_func_proto bpf_task_pt_regs_proto __weak;
+const struct bpf_func_proto bpf_perf_event_read_proto __weak;
+const struct bpf_func_proto bpf_send_signal_proto __weak;
+const struct bpf_func_proto bpf_send_signal_thread_proto __weak;
+const struct bpf_func_proto bpf_get_task_stack_sleepable_proto __weak;
+const struct bpf_func_proto bpf_get_task_stack_proto __weak;
+const struct bpf_func_proto bpf_get_branch_snapshot_proto __weak;
const struct bpf_func_proto *
bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
@@ -1965,6 +1963,8 @@ bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_get_current_pid_tgid_proto;
case BPF_FUNC_get_ns_current_pid_tgid:
return &bpf_get_ns_current_pid_tgid_proto;
+ case BPF_FUNC_get_current_uid_gid:
+ return &bpf_get_current_uid_gid_proto;
default:
break;
}
@@ -2022,7 +2022,21 @@ bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_get_current_cgroup_id_proto;
case BPF_FUNC_get_current_ancestor_cgroup_id:
return &bpf_get_current_ancestor_cgroup_id_proto;
+ case BPF_FUNC_current_task_under_cgroup:
+ return &bpf_current_task_under_cgroup_proto;
#endif
+#ifdef CONFIG_CGROUP_NET_CLASSID
+ case BPF_FUNC_get_cgroup_classid:
+ return &bpf_get_cgroup_classid_curr_proto;
+#endif
+ case BPF_FUNC_task_storage_get:
+ if (bpf_prog_check_recur(prog))
+ return &bpf_task_storage_get_recur_proto;
+ return &bpf_task_storage_get_proto;
+ case BPF_FUNC_task_storage_delete:
+ if (bpf_prog_check_recur(prog))
+ return &bpf_task_storage_delete_recur_proto;
+ return &bpf_task_storage_delete_proto;
default:
break;
}
@@ -2037,6 +2051,8 @@ bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_get_current_task_proto;
case BPF_FUNC_get_current_task_btf:
return &bpf_get_current_task_btf_proto;
+ case BPF_FUNC_get_current_comm:
+ return &bpf_get_current_comm_proto;
case BPF_FUNC_probe_read_user:
return &bpf_probe_read_user_proto;
case BPF_FUNC_probe_read_kernel:
@@ -2047,6 +2063,10 @@ bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_probe_read_kernel_str:
return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
NULL : &bpf_probe_read_kernel_str_proto;
+ case BPF_FUNC_copy_from_user:
+ return &bpf_copy_from_user_proto;
+ case BPF_FUNC_copy_from_user_task:
+ return &bpf_copy_from_user_task_proto;
case BPF_FUNC_snprintf_btf:
return &bpf_snprintf_btf_proto;
case BPF_FUNC_snprintf:
@@ -2057,6 +2077,19 @@ bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return bpf_get_trace_vprintk_proto();
case BPF_FUNC_perf_event_read_value:
return bpf_get_perf_event_read_value_proto();
+ case BPF_FUNC_perf_event_read:
+ return &bpf_perf_event_read_proto;
+ case BPF_FUNC_send_signal:
+ return &bpf_send_signal_proto;
+ case BPF_FUNC_send_signal_thread:
+ return &bpf_send_signal_thread_proto;
+ case BPF_FUNC_get_task_stack:
+ return prog->sleepable ? &bpf_get_task_stack_sleepable_proto
+ : &bpf_get_task_stack_proto;
+ case BPF_FUNC_get_branch_snapshot:
+ return &bpf_get_branch_snapshot_proto;
+ case BPF_FUNC_find_vma:
+ return &bpf_find_vma_proto;
default:
return NULL;
}
@@ -2293,6 +2326,26 @@ __bpf_kfunc struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head)
return __bpf_list_del(head, true);
}
+__bpf_kfunc struct bpf_list_node *bpf_list_front(struct bpf_list_head *head)
+{
+ struct list_head *h = (struct list_head *)head;
+
+ if (list_empty(h) || unlikely(!h->next))
+ return NULL;
+
+ return (struct bpf_list_node *)h->next;
+}
+
+__bpf_kfunc struct bpf_list_node *bpf_list_back(struct bpf_list_head *head)
+{
+ struct list_head *h = (struct list_head *)head;
+
+ if (list_empty(h) || unlikely(!h->next))
+ return NULL;
+
+ return (struct bpf_list_node *)h->prev;
+}
+
__bpf_kfunc struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root,
struct bpf_rb_node *node)
{
@@ -2366,6 +2419,33 @@ __bpf_kfunc struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root)
return (struct bpf_rb_node *)rb_first_cached(r);
}
+__bpf_kfunc struct bpf_rb_node *bpf_rbtree_root(struct bpf_rb_root *root)
+{
+ struct rb_root_cached *r = (struct rb_root_cached *)root;
+
+ return (struct bpf_rb_node *)r->rb_root.rb_node;
+}
+
+__bpf_kfunc struct bpf_rb_node *bpf_rbtree_left(struct bpf_rb_root *root, struct bpf_rb_node *node)
+{
+ struct bpf_rb_node_kern *node_internal = (struct bpf_rb_node_kern *)node;
+
+ if (READ_ONCE(node_internal->owner) != root)
+ return NULL;
+
+ return (struct bpf_rb_node *)node_internal->rb_node.rb_left;
+}
+
+__bpf_kfunc struct bpf_rb_node *bpf_rbtree_right(struct bpf_rb_root *root, struct bpf_rb_node *node)
+{
+ struct bpf_rb_node_kern *node_internal = (struct bpf_rb_node_kern *)node;
+
+ if (READ_ONCE(node_internal->owner) != root)
+ return NULL;
+
+ return (struct bpf_rb_node *)node_internal->rb_node.rb_right;
+}
+
/**
* bpf_task_acquire - Acquire a reference to a task. A task acquired by this
* kfunc which is not stored in a map as a kptr, must be released by calling
@@ -2923,9 +3003,9 @@ __bpf_kfunc int bpf_wq_start(struct bpf_wq *wq, unsigned int flags)
__bpf_kfunc int bpf_wq_set_callback_impl(struct bpf_wq *wq,
int (callback_fn)(void *map, int *key, void *value),
unsigned int flags,
- void *aux__ign)
+ void *aux__prog)
{
- struct bpf_prog_aux *aux = (struct bpf_prog_aux *)aux__ign;
+ struct bpf_prog_aux *aux = (struct bpf_prog_aux *)aux__prog;
struct bpf_async_kern *async = (struct bpf_async_kern *)wq;
if (flags)
@@ -3194,6 +3274,10 @@ __bpf_kfunc void bpf_local_irq_restore(unsigned long *flags__irq_flag)
local_irq_restore(*flags__irq_flag);
}
+__bpf_kfunc void __bpf_trap(void)
+{
+}
+
__bpf_kfunc_end_defs();
BTF_KFUNCS_START(generic_btf_ids)
@@ -3209,11 +3293,16 @@ BTF_ID_FLAGS(func, bpf_list_push_front_impl)
BTF_ID_FLAGS(func, bpf_list_push_back_impl)
BTF_ID_FLAGS(func, bpf_list_pop_front, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_list_pop_back, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_list_front, KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_list_back, KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_task_release, KF_RELEASE)
BTF_ID_FLAGS(func, bpf_rbtree_remove, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_rbtree_add_impl)
BTF_ID_FLAGS(func, bpf_rbtree_first, KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_rbtree_root, KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_rbtree_left, KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_rbtree_right, KF_RET_NULL)
#ifdef CONFIG_CGROUPS
BTF_ID_FLAGS(func, bpf_cgroup_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
@@ -3294,6 +3383,20 @@ BTF_ID_FLAGS(func, bpf_iter_kmem_cache_next, KF_ITER_NEXT | KF_RET_NULL | KF_SLE
BTF_ID_FLAGS(func, bpf_iter_kmem_cache_destroy, KF_ITER_DESTROY | KF_SLEEPABLE)
BTF_ID_FLAGS(func, bpf_local_irq_save)
BTF_ID_FLAGS(func, bpf_local_irq_restore)
+BTF_ID_FLAGS(func, bpf_probe_read_user_dynptr)
+BTF_ID_FLAGS(func, bpf_probe_read_kernel_dynptr)
+BTF_ID_FLAGS(func, bpf_probe_read_user_str_dynptr)
+BTF_ID_FLAGS(func, bpf_probe_read_kernel_str_dynptr)
+BTF_ID_FLAGS(func, bpf_copy_from_user_dynptr, KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_copy_from_user_str_dynptr, KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_copy_from_user_task_dynptr, KF_SLEEPABLE | KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_copy_from_user_task_str_dynptr, KF_SLEEPABLE | KF_TRUSTED_ARGS)
+#ifdef CONFIG_DMA_SHARED_BUFFER
+BTF_ID_FLAGS(func, bpf_iter_dmabuf_new, KF_ITER_NEW | KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_iter_dmabuf_next, KF_ITER_NEXT | KF_RET_NULL | KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_iter_dmabuf_destroy, KF_ITER_DESTROY | KF_SLEEPABLE)
+#endif
+BTF_ID_FLAGS(func, __bpf_trap)
BTF_KFUNCS_END(common_btf_ids)
static const struct btf_kfunc_id_set common_kfunc_set = {
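
The new bpf_list_front()/bpf_list_back() and bpf_rbtree_root()/bpf_rbtree_left()/bpf_rbtree_right() kfuncs return non-owning references, so a program can now inspect a list head or walk a tree in place instead of having to pop or remove nodes. A rough BPF-side sketch of an in-place rbtree search, modeled on selftest conventions; the node layout, the lock/root declarations and the private()/__contains()/can_loop/container_of scaffolding are assumed to come from the selftests' bpf_experimental.h, and extern __ksym declarations may still be needed for the new kfuncs:

	#include <vmlinux.h>
	#include <bpf/bpf_helpers.h>
	#include "bpf_experimental.h"

	struct node_data {
		struct bpf_rb_node node;
		long key;
	};

	private(A) struct bpf_spin_lock glock;
	private(A) struct bpf_rb_root groot __contains(node_data, node);

	SEC("tc")
	int rbtree_search(struct __sk_buff *skb)
	{
		struct bpf_rb_node *n;
		long target = 42;
		int found = 0;

		bpf_spin_lock(&glock);
		n = bpf_rbtree_root(&groot);
		while (can_loop && n) {
			struct node_data *d = container_of(n, struct node_data, node);

			if (d->key == target) {
				found = 1;
				break;
			}
			/* non-owning refs: descend without removing anything */
			n = d->key < target ? bpf_rbtree_right(&groot, n)
					    : bpf_rbtree_left(&groot, n);
		}
		bpf_spin_unlock(&glock);
		return found;
	}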
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 64c3393e8270..4b5f29168618 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -36,6 +36,7 @@
#include <linux/memcontrol.h>
#include <linux/trace_events.h>
#include <linux/tracepoint.h>
+#include <linux/overflow.h>
#include <net/netfilter/nf_bpf_link.h>
#include <net/netkit.h>
@@ -693,7 +694,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec)
if (IS_ERR_OR_NULL(rec))
return NULL;
- size = offsetof(struct btf_record, fields[rec->cnt]);
+ size = struct_size(rec, fields, rec->cnt);
new_rec = kmemdup(rec, size, GFP_KERNEL | __GFP_NOWARN);
if (!new_rec)
return ERR_PTR(-ENOMEM);
@@ -748,7 +749,7 @@ bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *r
return false;
if (rec_a->cnt != rec_b->cnt)
return false;
- size = offsetof(struct btf_record, fields[rec_a->cnt]);
+ size = struct_size(rec_a, fields, rec_a->cnt);
/* btf_parse_fields uses kzalloc to allocate a btf_record, so unused
* members are zeroed out. So memcmp is safe to do without worrying
* about padding/unused fields.
@@ -3799,14 +3800,14 @@ static int bpf_perf_link_fill_kprobe(const struct perf_event *event,
static int bpf_perf_link_fill_uprobe(const struct perf_event *event,
struct bpf_link_info *info)
{
+ u64 ref_ctr_offset, offset;
char __user *uname;
- u64 addr, offset;
u32 ulen, type;
int err;
uname = u64_to_user_ptr(info->perf_event.uprobe.file_name);
ulen = info->perf_event.uprobe.name_len;
- err = bpf_perf_link_fill_common(event, uname, &ulen, &offset, &addr,
+ err = bpf_perf_link_fill_common(event, uname, &ulen, &offset, &ref_ctr_offset,
&type, NULL);
if (err)
return err;
@@ -3818,6 +3819,7 @@ static int bpf_perf_link_fill_uprobe(const struct perf_event *event,
info->perf_event.uprobe.name_len = ulen;
info->perf_event.uprobe.offset = offset;
info->perf_event.uprobe.cookie = event->bpf_cookie;
+ info->perf_event.uprobe.ref_ctr_offset = ref_ctr_offset;
return 0;
}
#endif
diff --git a/kernel/bpf/sysfs_btf.c b/kernel/bpf/sysfs_btf.c
index 81d6cf90584a..941d0d2427e3 100644
--- a/kernel/bpf/sysfs_btf.c
+++ b/kernel/bpf/sysfs_btf.c
@@ -7,14 +7,46 @@
#include <linux/kobject.h>
#include <linux/init.h>
#include <linux/sysfs.h>
+#include <linux/mm.h>
+#include <linux/io.h>
+#include <linux/btf.h>
/* See scripts/link-vmlinux.sh, gen_btf() func for details */
extern char __start_BTF[];
extern char __stop_BTF[];
+static int btf_sysfs_vmlinux_mmap(struct file *filp, struct kobject *kobj,
+ const struct bin_attribute *attr,
+ struct vm_area_struct *vma)
+{
+ unsigned long pages = PAGE_ALIGN(attr->size) >> PAGE_SHIFT;
+ size_t vm_size = vma->vm_end - vma->vm_start;
+ phys_addr_t addr = virt_to_phys(__start_BTF);
+ unsigned long pfn = addr >> PAGE_SHIFT;
+
+ if (attr->private != __start_BTF || !PAGE_ALIGNED(addr))
+ return -EINVAL;
+
+ if (vma->vm_pgoff)
+ return -EINVAL;
+
+ if (vma->vm_flags & (VM_WRITE | VM_EXEC | VM_MAYSHARE))
+ return -EACCES;
+
+ if (pfn + pages < pfn)
+ return -EINVAL;
+
+ if ((vm_size >> PAGE_SHIFT) > pages)
+ return -EINVAL;
+
+ vm_flags_mod(vma, VM_DONTDUMP, VM_MAYEXEC | VM_MAYWRITE);
+ return remap_pfn_range(vma, vma->vm_start, pfn, vm_size, vma->vm_page_prot);
+}
+
static struct bin_attribute bin_attr_btf_vmlinux __ro_after_init = {
.attr = { .name = "vmlinux", .mode = 0444, },
.read_new = sysfs_bin_attr_simple_read,
+ .mmap = btf_sysfs_vmlinux_mmap,
};
struct kobject *btf_kobj;
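
With the new mmap handler, userspace can map the raw vmlinux BTF instead of read()ing /sys/kernel/btf/vmlinux into a buffer. Per the checks above, the mapping must start at offset 0, must not exceed the page-aligned BTF size, and must be neither writable, executable nor shared. A minimal userspace sketch (error handling trimmed):

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/mman.h>
	#include <sys/stat.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("/sys/kernel/btf/vmlinux", O_RDONLY);
		struct stat st;
		void *btf;

		if (fd < 0 || fstat(fd, &st))
			return 1;
		/* PROT_READ + MAP_PRIVATE only: VM_WRITE, VM_EXEC and VM_MAYSHARE are rejected */
		btf = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
		if (btf == MAP_FAILED)
			return 1;
		printf("mapped %lld bytes of vmlinux BTF at %p\n", (long long)st.st_size, btf);
		munmap(btf, st.st_size);
		close(fd);
		return 0;
	}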
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 54c6953a8b84..a7d6e0c5928b 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -322,6 +322,7 @@ struct bpf_kfunc_call_arg_meta {
struct btf *arg_btf;
u32 arg_btf_id;
bool arg_owning_ref;
+ bool arg_prog;
struct {
struct btf_field *field;
@@ -1923,11 +1924,8 @@ static struct bpf_verifier_state *get_loop_entry(struct bpf_verifier_env *env,
u32 steps = 0;
while (topmost && topmost->loop_entry) {
- if (steps++ > st->dfs_depth) {
- WARN_ONCE(true, "verifier bug: infinite loop in get_loop_entry\n");
- verbose(env, "verifier bug: infinite loop in get_loop_entry()\n");
+ if (verifier_bug_if(steps++ > st->dfs_depth, env, "infinite loop"))
return ERR_PTR(-EFAULT);
- }
topmost = topmost->loop_entry;
}
return topmost;
@@ -3459,12 +3457,11 @@ static int mark_reg_read(struct bpf_verifier_env *env,
/* if read wasn't screened by an earlier write ... */
if (writes && state->live & REG_LIVE_WRITTEN)
break;
- if (parent->live & REG_LIVE_DONE) {
- verbose(env, "verifier BUG type %s var_off %lld off %d\n",
- reg_type_str(env, parent->type),
- parent->var_off.value, parent->off);
+ if (verifier_bug_if(parent->live & REG_LIVE_DONE, env,
+ "type %s var_off %lld off %d",
+ reg_type_str(env, parent->type),
+ parent->var_off.value, parent->off))
return -EFAULT;
- }
/* The first condition is more likely to be true than the
* second, checked it first.
*/
@@ -3649,16 +3646,16 @@ static int insn_def_regno(const struct bpf_insn *insn)
case BPF_ST:
return -1;
case BPF_STX:
- if ((BPF_MODE(insn->code) == BPF_ATOMIC ||
- BPF_MODE(insn->code) == BPF_PROBE_ATOMIC) &&
- (insn->imm & BPF_FETCH)) {
+ if (BPF_MODE(insn->code) == BPF_ATOMIC ||
+ BPF_MODE(insn->code) == BPF_PROBE_ATOMIC) {
if (insn->imm == BPF_CMPXCHG)
return BPF_REG_0;
- else
+ else if (insn->imm == BPF_LOAD_ACQ)
+ return insn->dst_reg;
+ else if (insn->imm & BPF_FETCH)
return insn->src_reg;
- } else {
- return -1;
}
+ return -1;
default:
return insn->dst_reg;
}
@@ -3857,14 +3854,14 @@ static int push_insn_history(struct bpf_verifier_env *env, struct bpf_verifier_s
/* atomic instructions push insn_flags twice, for READ and
* WRITE sides, but they should agree on stack slot
*/
- WARN_ONCE((env->cur_hist_ent->flags & insn_flags) &&
- (env->cur_hist_ent->flags & insn_flags) != insn_flags,
- "verifier insn history bug: insn_idx %d cur flags %x new flags %x\n",
- env->insn_idx, env->cur_hist_ent->flags, insn_flags);
+ verifier_bug_if((env->cur_hist_ent->flags & insn_flags) &&
+ (env->cur_hist_ent->flags & insn_flags) != insn_flags,
+ env, "insn history: insn_idx %d cur flags %x new flags %x",
+ env->insn_idx, env->cur_hist_ent->flags, insn_flags);
env->cur_hist_ent->flags |= insn_flags;
- WARN_ONCE(env->cur_hist_ent->linked_regs != 0,
- "verifier insn history bug: insn_idx %d linked_regs != 0: %#llx\n",
- env->insn_idx, env->cur_hist_ent->linked_regs);
+ verifier_bug_if(env->cur_hist_ent->linked_regs != 0, env,
+ "insn history: insn_idx %d linked_regs: %#llx",
+ env->insn_idx, env->cur_hist_ent->linked_regs);
env->cur_hist_ent->linked_regs = linked_regs;
return 0;
}
@@ -3987,8 +3984,7 @@ static inline u32 bt_empty(struct backtrack_state *bt)
static inline int bt_subprog_enter(struct backtrack_state *bt)
{
if (bt->frame == MAX_CALL_FRAMES - 1) {
- verbose(bt->env, "BUG subprog enter from frame %d\n", bt->frame);
- WARN_ONCE(1, "verifier backtracking bug");
+ verifier_bug(bt->env, "subprog enter from frame %d", bt->frame);
return -EFAULT;
}
bt->frame++;
@@ -3998,8 +3994,7 @@ static inline int bt_subprog_enter(struct backtrack_state *bt)
static inline int bt_subprog_exit(struct backtrack_state *bt)
{
if (bt->frame == 0) {
- verbose(bt->env, "BUG subprog exit from frame 0\n");
- WARN_ONCE(1, "verifier backtracking bug");
+ verifier_bug(bt->env, "subprog exit from frame 0");
return -EFAULT;
}
bt->frame--;
@@ -4277,14 +4272,15 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
* should be literally next instruction in
* caller program
*/
- WARN_ONCE(idx + 1 != subseq_idx, "verifier backtracking bug");
+ verifier_bug_if(idx + 1 != subseq_idx, env,
+ "extra insn from subprog");
/* r1-r5 are invalidated after subprog call,
* so for global func call it shouldn't be set
* anymore
*/
if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) {
- verbose(env, "BUG regs %x\n", bt_reg_mask(bt));
- WARN_ONCE(1, "verifier backtracking bug");
+ verifier_bug(env, "global subprog unexpected regs %x",
+ bt_reg_mask(bt));
return -EFAULT;
}
/* global subprog always sets R0 */
@@ -4298,16 +4294,17 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
* the current frame should be zero by now
*/
if (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) {
- verbose(env, "BUG regs %x\n", bt_reg_mask(bt));
- WARN_ONCE(1, "verifier backtracking bug");
+ verifier_bug(env, "static subprog unexpected regs %x",
+ bt_reg_mask(bt));
return -EFAULT;
}
/* we are now tracking register spills correctly,
* so any instance of leftover slots is a bug
*/
if (bt_stack_mask(bt) != 0) {
- verbose(env, "BUG stack slots %llx\n", bt_stack_mask(bt));
- WARN_ONCE(1, "verifier backtracking bug (subprog leftover stack slots)");
+ verifier_bug(env,
+ "static subprog leftover stack slots %llx",
+ bt_stack_mask(bt));
return -EFAULT;
}
/* propagate r1-r5 to the caller */
@@ -4330,13 +4327,13 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
* not actually arguments passed directly to callback subprogs
*/
if (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) {
- verbose(env, "BUG regs %x\n", bt_reg_mask(bt));
- WARN_ONCE(1, "verifier backtracking bug");
+ verifier_bug(env, "callback unexpected regs %x",
+ bt_reg_mask(bt));
return -EFAULT;
}
if (bt_stack_mask(bt) != 0) {
- verbose(env, "BUG stack slots %llx\n", bt_stack_mask(bt));
- WARN_ONCE(1, "verifier backtracking bug (callback leftover stack slots)");
+ verifier_bug(env, "callback leftover stack slots %llx",
+ bt_stack_mask(bt));
return -EFAULT;
}
/* clear r1-r5 in callback subprog's mask */
@@ -4355,11 +4352,11 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
/* regular helper call sets R0 */
bt_clear_reg(bt, BPF_REG_0);
if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) {
- /* if backtracing was looking for registers R1-R5
+ /* if backtracking was looking for registers R1-R5
* they should have been found already.
*/
- verbose(env, "BUG regs %x\n", bt_reg_mask(bt));
- WARN_ONCE(1, "verifier backtracking bug");
+ verifier_bug(env, "backtracking call unexpected regs %x",
+ bt_reg_mask(bt));
return -EFAULT;
}
} else if (opcode == BPF_EXIT) {
@@ -4377,8 +4374,8 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
for (i = BPF_REG_1; i <= BPF_REG_5; i++)
bt_clear_reg(bt, i);
if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) {
- verbose(env, "BUG regs %x\n", bt_reg_mask(bt));
- WARN_ONCE(1, "verifier backtracking bug");
+ verifier_bug(env, "backtracking exit unexpected regs %x",
+ bt_reg_mask(bt));
return -EFAULT;
}
@@ -4413,8 +4410,10 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
* before it would be equally necessary to
* propagate it to dreg.
*/
- bt_set_reg(bt, dreg);
- bt_set_reg(bt, sreg);
+ if (!hist || !(hist->flags & INSN_F_SRC_REG_STACK))
+ bt_set_reg(bt, sreg);
+ if (!hist || !(hist->flags & INSN_F_DST_REG_STACK))
+ bt_set_reg(bt, dreg);
} else if (BPF_SRC(insn->code) == BPF_K) {
/* dreg <cond> K
* Only dreg still needs precision before
@@ -4719,9 +4718,8 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno)
return 0;
}
- verbose(env, "BUG backtracking func entry subprog %d reg_mask %x stack_mask %llx\n",
- st->frame[0]->subprogno, bt_reg_mask(bt), bt_stack_mask(bt));
- WARN_ONCE(1, "verifier backtracking bug");
+ verifier_bug(env, "backtracking func entry subprog %d reg_mask %x stack_mask %llx",
+ st->frame[0]->subprogno, bt_reg_mask(bt), bt_stack_mask(bt));
return -EFAULT;
}
@@ -4757,8 +4755,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno)
* It means the backtracking missed the spot where
* particular register was initialized with a constant.
*/
- verbose(env, "BUG backtracking idx %d\n", i);
- WARN_ONCE(1, "verifier backtracking bug");
+ verifier_bug(env, "backtracking idx %d", i);
return -EFAULT;
}
}
@@ -4783,12 +4780,10 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno)
bitmap_from_u64(mask, bt_frame_stack_mask(bt, fr));
for_each_set_bit(i, mask, 64) {
- if (i >= func->allocated_stack / BPF_REG_SIZE) {
- verbose(env, "BUG backtracking (stack slot %d, total slots %d)\n",
- i, func->allocated_stack / BPF_REG_SIZE);
- WARN_ONCE(1, "verifier backtracking bug (stack slot out of bounds)");
+ if (verifier_bug_if(i >= func->allocated_stack / BPF_REG_SIZE,
+ env, "stack slot %d, total slots %d",
+ i, func->allocated_stack / BPF_REG_SIZE))
return -EFAULT;
- }
if (!is_spilled_scalar_reg(&func->stack[i])) {
bt_clear_frame_slot(bt, fr, i);
@@ -6561,21 +6556,18 @@ continue_func:
/* find the callee */
next_insn = i + insn[i].imm + 1;
sidx = find_subprog(env, next_insn);
- if (sidx < 0) {
- WARN_ONCE(1, "verifier bug. No program starts at insn %d\n",
- next_insn);
+ if (verifier_bug_if(sidx < 0, env, "callee not found at insn %d", next_insn))
return -EFAULT;
- }
if (subprog[sidx].is_async_cb) {
if (subprog[sidx].has_tail_call) {
- verbose(env, "verifier bug. subprog has tail_call and async cb\n");
+ verifier_bug(env, "subprog has tail_call and async cb");
return -EFAULT;
}
/* async callbacks don't increase bpf prog stack size unless called directly */
if (!bpf_pseudo_call(insn + i))
continue;
if (subprog[sidx].is_exception_cb) {
- verbose(env, "insn %d cannot call exception cb directly\n", i);
+ verbose(env, "insn %d cannot call exception cb directly", i);
return -EINVAL;
}
}
@@ -6675,11 +6667,8 @@ static int get_callee_stack_depth(struct bpf_verifier_env *env,
int start = idx + insn->imm + 1, subprog;
subprog = find_subprog(env, start);
- if (subprog < 0) {
- WARN_ONCE(1, "verifier bug. No program starts at insn %d\n",
- start);
+ if (verifier_bug_if(subprog < 0, env, "get stack depth: no program at insn %d", start))
return -EFAULT;
- }
return env->subprog_info[subprog].stack_depth;
}
#endif
@@ -7984,7 +7973,7 @@ static int check_stack_range_initialized(
slot = -i - 1;
spi = slot / BPF_REG_SIZE;
if (state->allocated_stack <= slot) {
- verbose(env, "verifier bug: allocated_stack too small\n");
+ verbose(env, "allocated_stack too small\n");
return -EFAULT;
}
@@ -8413,7 +8402,7 @@ static int process_timer_func(struct bpf_verifier_env *env, int regno,
return -EINVAL;
}
if (meta->map_ptr) {
- verbose(env, "verifier bug. Two map pointers in a timer helper\n");
+ verifier_bug(env, "Two map pointers in a timer helper");
return -EFAULT;
}
meta->map_uid = reg->map_uid;
@@ -10285,8 +10274,7 @@ static int setup_func_entry(struct bpf_verifier_env *env, int subprog, int calls
}
if (state->frame[state->curframe + 1]) {
- verbose(env, "verifier bug. Frame %d already allocated\n",
- state->curframe + 1);
+ verifier_bug(env, "Frame %d already allocated", state->curframe + 1);
return -EFAULT;
}
@@ -10400,8 +10388,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog,
if (err)
return err;
} else {
- bpf_log(log, "verifier bug: unrecognized arg#%d type %d\n",
- i, arg->arg_type);
+ verifier_bug(env, "unrecognized arg#%d type %d", i, arg->arg_type);
return -EFAULT;
}
}
@@ -10464,13 +10451,13 @@ static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *ins
env->subprog_info[subprog].is_cb = true;
if (bpf_pseudo_kfunc_call(insn) &&
!is_callback_calling_kfunc(insn->imm)) {
- verbose(env, "verifier bug: kfunc %s#%d not marked as callback-calling\n",
- func_id_name(insn->imm), insn->imm);
+ verifier_bug(env, "kfunc %s#%d not marked as callback-calling",
+ func_id_name(insn->imm), insn->imm);
return -EFAULT;
} else if (!bpf_pseudo_kfunc_call(insn) &&
!is_callback_calling_function(insn->imm)) { /* helper */
- verbose(env, "verifier bug: helper %s#%d not marked as callback-calling\n",
- func_id_name(insn->imm), insn->imm);
+ verifier_bug(env, "helper %s#%d not marked as callback-calling",
+ func_id_name(insn->imm), insn->imm);
return -EFAULT;
}
@@ -10522,10 +10509,9 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
target_insn = *insn_idx + insn->imm + 1;
subprog = find_subprog(env, target_insn);
- if (subprog < 0) {
- verbose(env, "verifier bug. No program starts at insn %d\n", target_insn);
+ if (verifier_bug_if(subprog < 0, env, "target of func call at insn %d is not a program",
+ target_insn))
return -EFAULT;
- }
caller = state->frame[state->curframe];
err = btf_check_subprog_call(env, subprog, caller->regs);
@@ -11124,7 +11110,7 @@ static int check_bpf_snprintf_call(struct bpf_verifier_env *env,
err = fmt_map->ops->map_direct_value_addr(fmt_map, &fmt_addr,
fmt_map_off);
if (err) {
- verbose(env, "verifier bug\n");
+ verbose(env, "failed to retrieve map value address\n");
return -EFAULT;
}
fmt = (char *)(long)fmt_addr + fmt_map_off;
@@ -11897,6 +11883,11 @@ static bool is_kfunc_arg_irq_flag(const struct btf *btf, const struct btf_param
return btf_param_match_suffix(btf, arg, "__irq_flag");
}
+static bool is_kfunc_arg_prog(const struct btf *btf, const struct btf_param *arg)
+{
+ return btf_param_match_suffix(btf, arg, "__prog");
+}
+
static bool is_kfunc_arg_scalar_with_name(const struct btf *btf,
const struct btf_param *arg,
const char *name)
@@ -11987,6 +11978,16 @@ static bool is_kfunc_arg_res_spin_lock(const struct btf *btf, const struct btf_p
return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_RES_SPIN_LOCK_ID);
}
+static bool is_rbtree_node_type(const struct btf_type *t)
+{
+ return t == btf_type_by_id(btf_vmlinux, kf_arg_btf_ids[KF_ARG_RB_NODE_ID]);
+}
+
+static bool is_list_node_type(const struct btf_type *t)
+{
+ return t == btf_type_by_id(btf_vmlinux, kf_arg_btf_ids[KF_ARG_LIST_NODE_ID]);
+}
+
static bool is_kfunc_arg_callback(struct bpf_verifier_env *env, const struct btf *btf,
const struct btf_param *arg)
{
@@ -12069,6 +12070,8 @@ enum special_kfunc_type {
KF_bpf_list_push_back_impl,
KF_bpf_list_pop_front,
KF_bpf_list_pop_back,
+ KF_bpf_list_front,
+ KF_bpf_list_back,
KF_bpf_cast_to_kern_ctx,
KF_bpf_rdonly_cast,
KF_bpf_rcu_read_lock,
@@ -12076,6 +12079,9 @@ enum special_kfunc_type {
KF_bpf_rbtree_remove,
KF_bpf_rbtree_add_impl,
KF_bpf_rbtree_first,
+ KF_bpf_rbtree_root,
+ KF_bpf_rbtree_left,
+ KF_bpf_rbtree_right,
KF_bpf_dynptr_from_skb,
KF_bpf_dynptr_from_xdp,
KF_bpf_dynptr_slice,
@@ -12101,41 +12107,9 @@ enum special_kfunc_type {
KF_bpf_res_spin_unlock,
KF_bpf_res_spin_lock_irqsave,
KF_bpf_res_spin_unlock_irqrestore,
+ KF___bpf_trap,
};
-BTF_SET_START(special_kfunc_set)
-BTF_ID(func, bpf_obj_new_impl)
-BTF_ID(func, bpf_obj_drop_impl)
-BTF_ID(func, bpf_refcount_acquire_impl)
-BTF_ID(func, bpf_list_push_front_impl)
-BTF_ID(func, bpf_list_push_back_impl)
-BTF_ID(func, bpf_list_pop_front)
-BTF_ID(func, bpf_list_pop_back)
-BTF_ID(func, bpf_cast_to_kern_ctx)
-BTF_ID(func, bpf_rdonly_cast)
-BTF_ID(func, bpf_rbtree_remove)
-BTF_ID(func, bpf_rbtree_add_impl)
-BTF_ID(func, bpf_rbtree_first)
-#ifdef CONFIG_NET
-BTF_ID(func, bpf_dynptr_from_skb)
-BTF_ID(func, bpf_dynptr_from_xdp)
-#endif
-BTF_ID(func, bpf_dynptr_slice)
-BTF_ID(func, bpf_dynptr_slice_rdwr)
-BTF_ID(func, bpf_dynptr_clone)
-BTF_ID(func, bpf_percpu_obj_new_impl)
-BTF_ID(func, bpf_percpu_obj_drop_impl)
-BTF_ID(func, bpf_throw)
-BTF_ID(func, bpf_wq_set_callback_impl)
-#ifdef CONFIG_CGROUPS
-BTF_ID(func, bpf_iter_css_task_new)
-#endif
-#ifdef CONFIG_BPF_LSM
-BTF_ID(func, bpf_set_dentry_xattr)
-BTF_ID(func, bpf_remove_dentry_xattr)
-#endif
-BTF_SET_END(special_kfunc_set)
-
BTF_ID_LIST(special_kfunc_list)
BTF_ID(func, bpf_obj_new_impl)
BTF_ID(func, bpf_obj_drop_impl)
@@ -12144,6 +12118,8 @@ BTF_ID(func, bpf_list_push_front_impl)
BTF_ID(func, bpf_list_push_back_impl)
BTF_ID(func, bpf_list_pop_front)
BTF_ID(func, bpf_list_pop_back)
+BTF_ID(func, bpf_list_front)
+BTF_ID(func, bpf_list_back)
BTF_ID(func, bpf_cast_to_kern_ctx)
BTF_ID(func, bpf_rdonly_cast)
BTF_ID(func, bpf_rcu_read_lock)
@@ -12151,6 +12127,9 @@ BTF_ID(func, bpf_rcu_read_unlock)
BTF_ID(func, bpf_rbtree_remove)
BTF_ID(func, bpf_rbtree_add_impl)
BTF_ID(func, bpf_rbtree_first)
+BTF_ID(func, bpf_rbtree_root)
+BTF_ID(func, bpf_rbtree_left)
+BTF_ID(func, bpf_rbtree_right)
#ifdef CONFIG_NET
BTF_ID(func, bpf_dynptr_from_skb)
BTF_ID(func, bpf_dynptr_from_xdp)
@@ -12194,6 +12173,7 @@ BTF_ID(func, bpf_res_spin_lock)
BTF_ID(func, bpf_res_spin_unlock)
BTF_ID(func, bpf_res_spin_lock_irqsave)
BTF_ID(func, bpf_res_spin_unlock_irqrestore)
+BTF_ID(func, __bpf_trap)
static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta)
{
@@ -12579,14 +12559,19 @@ static bool is_bpf_list_api_kfunc(u32 btf_id)
return btf_id == special_kfunc_list[KF_bpf_list_push_front_impl] ||
btf_id == special_kfunc_list[KF_bpf_list_push_back_impl] ||
btf_id == special_kfunc_list[KF_bpf_list_pop_front] ||
- btf_id == special_kfunc_list[KF_bpf_list_pop_back];
+ btf_id == special_kfunc_list[KF_bpf_list_pop_back] ||
+ btf_id == special_kfunc_list[KF_bpf_list_front] ||
+ btf_id == special_kfunc_list[KF_bpf_list_back];
}
static bool is_bpf_rbtree_api_kfunc(u32 btf_id)
{
return btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl] ||
btf_id == special_kfunc_list[KF_bpf_rbtree_remove] ||
- btf_id == special_kfunc_list[KF_bpf_rbtree_first];
+ btf_id == special_kfunc_list[KF_bpf_rbtree_first] ||
+ btf_id == special_kfunc_list[KF_bpf_rbtree_root] ||
+ btf_id == special_kfunc_list[KF_bpf_rbtree_left] ||
+ btf_id == special_kfunc_list[KF_bpf_rbtree_right];
}
static bool is_bpf_iter_num_api_kfunc(u32 btf_id)
@@ -12686,7 +12671,9 @@ static bool check_kfunc_is_graph_node_api(struct bpf_verifier_env *env,
break;
case BPF_RB_NODE:
ret = (kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_remove] ||
- kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl]);
+ kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl] ||
+ kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_left] ||
+ kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_right]);
break;
default:
verbose(env, "verifier internal error: unexpected graph node argument type %s\n",
@@ -12906,6 +12893,17 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
if (is_kfunc_arg_ignore(btf, &args[i]))
continue;
+ if (is_kfunc_arg_prog(btf, &args[i])) {
+ /* Used to reject repeated use of __prog. */
+ if (meta->arg_prog) {
+ verbose(env, "Only 1 prog->aux argument supported per-kfunc\n");
+ return -EFAULT;
+ }
+ meta->arg_prog = true;
+ cur_aux(env)->arg_prog = regno;
+ continue;
+ }
+
if (btf_type_is_scalar(t)) {
if (reg->type != SCALAR_VALUE) {
verbose(env, "R%d is not a scalar\n", regno);
@@ -13200,22 +13198,22 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
return ret;
break;
case KF_ARG_PTR_TO_RB_NODE:
- if (meta->func_id == special_kfunc_list[KF_bpf_rbtree_remove]) {
- if (!type_is_non_owning_ref(reg->type) || reg->ref_obj_id) {
- verbose(env, "rbtree_remove node input must be non-owning ref\n");
+ if (meta->func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) {
+ if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
+ verbose(env, "arg#%d expected pointer to allocated object\n", i);
return -EINVAL;
}
- if (in_rbtree_lock_required_cb(env)) {
- verbose(env, "rbtree_remove not allowed in rbtree cb\n");
+ if (!reg->ref_obj_id) {
+ verbose(env, "allocated object must be referenced\n");
return -EINVAL;
}
} else {
- if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
- verbose(env, "arg#%d expected pointer to allocated object\n", i);
+ if (!type_is_non_owning_ref(reg->type) && !reg->ref_obj_id) {
+ verbose(env, "%s can only take non-owning or refcounted bpf_rb_node pointer\n", func_name);
return -EINVAL;
}
- if (!reg->ref_obj_id) {
- verbose(env, "allocated object must be referenced\n");
+ if (in_rbtree_lock_required_cb(env)) {
+ verbose(env, "%s not allowed in rbtree cb\n", func_name);
return -EINVAL;
}
}
@@ -13420,6 +13418,178 @@ static int fetch_kfunc_meta(struct bpf_verifier_env *env,
return 0;
}
+/* check special kfuncs and return:
+ * 1 - not fall-through to 'else' branch, continue verification
+ * 0 - fall-through to 'else' branch
+ * < 0 - not fall-through to 'else' branch, return error
+ */
+static int check_special_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_call_arg_meta *meta,
+ struct bpf_reg_state *regs, struct bpf_insn_aux_data *insn_aux,
+ const struct btf_type *ptr_type, struct btf *desc_btf)
+{
+ const struct btf_type *ret_t;
+ int err = 0;
+
+ if (meta->btf != btf_vmlinux)
+ return 0;
+
+ if (meta->func_id == special_kfunc_list[KF_bpf_obj_new_impl] ||
+ meta->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
+ struct btf_struct_meta *struct_meta;
+ struct btf *ret_btf;
+ u32 ret_btf_id;
+
+ if (meta->func_id == special_kfunc_list[KF_bpf_obj_new_impl] && !bpf_global_ma_set)
+ return -ENOMEM;
+
+ if (((u64)(u32)meta->arg_constant.value) != meta->arg_constant.value) {
+ verbose(env, "local type ID argument must be in range [0, U32_MAX]\n");
+ return -EINVAL;
+ }
+
+ ret_btf = env->prog->aux->btf;
+ ret_btf_id = meta->arg_constant.value;
+
+ /* This may be NULL due to user not supplying a BTF */
+ if (!ret_btf) {
+ verbose(env, "bpf_obj_new/bpf_percpu_obj_new requires prog BTF\n");
+ return -EINVAL;
+ }
+
+ ret_t = btf_type_by_id(ret_btf, ret_btf_id);
+ if (!ret_t || !__btf_type_is_struct(ret_t)) {
+ verbose(env, "bpf_obj_new/bpf_percpu_obj_new type ID argument must be of a struct\n");
+ return -EINVAL;
+ }
+
+ if (meta->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
+ if (ret_t->size > BPF_GLOBAL_PERCPU_MA_MAX_SIZE) {
+ verbose(env, "bpf_percpu_obj_new type size (%d) is greater than %d\n",
+ ret_t->size, BPF_GLOBAL_PERCPU_MA_MAX_SIZE);
+ return -EINVAL;
+ }
+
+ if (!bpf_global_percpu_ma_set) {
+ mutex_lock(&bpf_percpu_ma_lock);
+ if (!bpf_global_percpu_ma_set) {
+ /* Charge memory allocated with bpf_global_percpu_ma to
+ * root memcg. The obj_cgroup for root memcg is NULL.
+ */
+ err = bpf_mem_alloc_percpu_init(&bpf_global_percpu_ma, NULL);
+ if (!err)
+ bpf_global_percpu_ma_set = true;
+ }
+ mutex_unlock(&bpf_percpu_ma_lock);
+ if (err)
+ return err;
+ }
+
+ mutex_lock(&bpf_percpu_ma_lock);
+ err = bpf_mem_alloc_percpu_unit_init(&bpf_global_percpu_ma, ret_t->size);
+ mutex_unlock(&bpf_percpu_ma_lock);
+ if (err)
+ return err;
+ }
+
+ struct_meta = btf_find_struct_meta(ret_btf, ret_btf_id);
+ if (meta->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
+ if (!__btf_type_is_scalar_struct(env, ret_btf, ret_t, 0)) {
+ verbose(env, "bpf_percpu_obj_new type ID argument must be of a struct of scalars\n");
+ return -EINVAL;
+ }
+
+ if (struct_meta) {
+ verbose(env, "bpf_percpu_obj_new type ID argument must not contain special fields\n");
+ return -EINVAL;
+ }
+ }
+
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC;
+ regs[BPF_REG_0].btf = ret_btf;
+ regs[BPF_REG_0].btf_id = ret_btf_id;
+ if (meta->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl])
+ regs[BPF_REG_0].type |= MEM_PERCPU;
+
+ insn_aux->obj_new_size = ret_t->size;
+ insn_aux->kptr_struct_meta = struct_meta;
+ } else if (meta->func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl]) {
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC;
+ regs[BPF_REG_0].btf = meta->arg_btf;
+ regs[BPF_REG_0].btf_id = meta->arg_btf_id;
+
+ insn_aux->kptr_struct_meta =
+ btf_find_struct_meta(meta->arg_btf,
+ meta->arg_btf_id);
+ } else if (is_list_node_type(ptr_type)) {
+ struct btf_field *field = meta->arg_list_head.field;
+
+ mark_reg_graph_node(regs, BPF_REG_0, &field->graph_root);
+ } else if (is_rbtree_node_type(ptr_type)) {
+ struct btf_field *field = meta->arg_rbtree_root.field;
+
+ mark_reg_graph_node(regs, BPF_REG_0, &field->graph_root);
+ } else if (meta->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx]) {
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ regs[BPF_REG_0].type = PTR_TO_BTF_ID | PTR_TRUSTED;
+ regs[BPF_REG_0].btf = desc_btf;
+ regs[BPF_REG_0].btf_id = meta->ret_btf_id;
+ } else if (meta->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) {
+ ret_t = btf_type_by_id(desc_btf, meta->arg_constant.value);
+ if (!ret_t || !btf_type_is_struct(ret_t)) {
+ verbose(env,
+ "kfunc bpf_rdonly_cast type ID argument must be of a struct\n");
+ return -EINVAL;
+ }
+
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ regs[BPF_REG_0].type = PTR_TO_BTF_ID | PTR_UNTRUSTED;
+ regs[BPF_REG_0].btf = desc_btf;
+ regs[BPF_REG_0].btf_id = meta->arg_constant.value;
+ } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_slice] ||
+ meta->func_id == special_kfunc_list[KF_bpf_dynptr_slice_rdwr]) {
+ enum bpf_type_flag type_flag = get_dynptr_type_flag(meta->initialized_dynptr.type);
+
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+
+ if (!meta->arg_constant.found) {
+ verbose(env, "verifier internal error: bpf_dynptr_slice(_rdwr) no constant size\n");
+ return -EFAULT;
+ }
+
+ regs[BPF_REG_0].mem_size = meta->arg_constant.value;
+
+ /* PTR_MAYBE_NULL will be added when is_kfunc_ret_null is checked */
+ regs[BPF_REG_0].type = PTR_TO_MEM | type_flag;
+
+ if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_slice]) {
+ regs[BPF_REG_0].type |= MEM_RDONLY;
+ } else {
+ /* this will set env->seen_direct_write to true */
+ if (!may_access_direct_pkt_data(env, NULL, BPF_WRITE)) {
+ verbose(env, "the prog does not allow writes to packet data\n");
+ return -EINVAL;
+ }
+ }
+
+ if (!meta->initialized_dynptr.id) {
+ verbose(env, "verifier internal error: no dynptr id\n");
+ return -EFAULT;
+ }
+ regs[BPF_REG_0].dynptr_id = meta->initialized_dynptr.id;
+
+ /* we don't need to set BPF_REG_0's ref obj id
+ * because packet slices are not refcounted (see
+ * dynptr_type_refcounted)
+ */
+ } else {
+ return 0;
+ }
+
+ return 1;
+}
+
static int check_return_code(struct bpf_verifier_env *env, int regno, const char *reg_name);
static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
@@ -13434,7 +13604,6 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
struct bpf_insn_aux_data *insn_aux;
int err, insn_idx = *insn_idx_p;
const struct btf_param *args;
- const struct btf_type *ret_t;
struct btf *desc_btf;
/* skip for now, but return error when we find this in fixup_kfunc_call */
@@ -13476,6 +13645,9 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
return err;
}
__mark_btf_func_reg_size(env, regs, BPF_REG_0, sizeof(u32));
+ } else if (!insn->off && insn->imm == special_kfunc_list[KF___bpf_trap]) {
+ verbose(env, "unexpected __bpf_trap() due to uninitialized variable?\n");
+ return -EFAULT;
}
if (is_kfunc_destructive(&meta) && !capable(CAP_SYS_BOOT)) {
@@ -13654,165 +13826,10 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
mark_btf_func_reg_size(env, BPF_REG_0, t->size);
} else if (btf_type_is_ptr(t)) {
ptr_type = btf_type_skip_modifiers(desc_btf, t->type, &ptr_type_id);
-
- if (meta.btf == btf_vmlinux && btf_id_set_contains(&special_kfunc_set, meta.func_id)) {
- if (meta.func_id == special_kfunc_list[KF_bpf_obj_new_impl] ||
- meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
- struct btf_struct_meta *struct_meta;
- struct btf *ret_btf;
- u32 ret_btf_id;
-
- if (meta.func_id == special_kfunc_list[KF_bpf_obj_new_impl] && !bpf_global_ma_set)
- return -ENOMEM;
-
- if (((u64)(u32)meta.arg_constant.value) != meta.arg_constant.value) {
- verbose(env, "local type ID argument must be in range [0, U32_MAX]\n");
- return -EINVAL;
- }
-
- ret_btf = env->prog->aux->btf;
- ret_btf_id = meta.arg_constant.value;
-
- /* This may be NULL due to user not supplying a BTF */
- if (!ret_btf) {
- verbose(env, "bpf_obj_new/bpf_percpu_obj_new requires prog BTF\n");
- return -EINVAL;
- }
-
- ret_t = btf_type_by_id(ret_btf, ret_btf_id);
- if (!ret_t || !__btf_type_is_struct(ret_t)) {
- verbose(env, "bpf_obj_new/bpf_percpu_obj_new type ID argument must be of a struct\n");
- return -EINVAL;
- }
-
- if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
- if (ret_t->size > BPF_GLOBAL_PERCPU_MA_MAX_SIZE) {
- verbose(env, "bpf_percpu_obj_new type size (%d) is greater than %d\n",
- ret_t->size, BPF_GLOBAL_PERCPU_MA_MAX_SIZE);
- return -EINVAL;
- }
-
- if (!bpf_global_percpu_ma_set) {
- mutex_lock(&bpf_percpu_ma_lock);
- if (!bpf_global_percpu_ma_set) {
- /* Charge memory allocated with bpf_global_percpu_ma to
- * root memcg. The obj_cgroup for root memcg is NULL.
- */
- err = bpf_mem_alloc_percpu_init(&bpf_global_percpu_ma, NULL);
- if (!err)
- bpf_global_percpu_ma_set = true;
- }
- mutex_unlock(&bpf_percpu_ma_lock);
- if (err)
- return err;
- }
-
- mutex_lock(&bpf_percpu_ma_lock);
- err = bpf_mem_alloc_percpu_unit_init(&bpf_global_percpu_ma, ret_t->size);
- mutex_unlock(&bpf_percpu_ma_lock);
- if (err)
- return err;
- }
-
- struct_meta = btf_find_struct_meta(ret_btf, ret_btf_id);
- if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
- if (!__btf_type_is_scalar_struct(env, ret_btf, ret_t, 0)) {
- verbose(env, "bpf_percpu_obj_new type ID argument must be of a struct of scalars\n");
- return -EINVAL;
- }
-
- if (struct_meta) {
- verbose(env, "bpf_percpu_obj_new type ID argument must not contain special fields\n");
- return -EINVAL;
- }
- }
-
- mark_reg_known_zero(env, regs, BPF_REG_0);
- regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC;
- regs[BPF_REG_0].btf = ret_btf;
- regs[BPF_REG_0].btf_id = ret_btf_id;
- if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl])
- regs[BPF_REG_0].type |= MEM_PERCPU;
-
- insn_aux->obj_new_size = ret_t->size;
- insn_aux->kptr_struct_meta = struct_meta;
- } else if (meta.func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl]) {
- mark_reg_known_zero(env, regs, BPF_REG_0);
- regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC;
- regs[BPF_REG_0].btf = meta.arg_btf;
- regs[BPF_REG_0].btf_id = meta.arg_btf_id;
-
- insn_aux->kptr_struct_meta =
- btf_find_struct_meta(meta.arg_btf,
- meta.arg_btf_id);
- } else if (meta.func_id == special_kfunc_list[KF_bpf_list_pop_front] ||
- meta.func_id == special_kfunc_list[KF_bpf_list_pop_back]) {
- struct btf_field *field = meta.arg_list_head.field;
-
- mark_reg_graph_node(regs, BPF_REG_0, &field->graph_root);
- } else if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_remove] ||
- meta.func_id == special_kfunc_list[KF_bpf_rbtree_first]) {
- struct btf_field *field = meta.arg_rbtree_root.field;
-
- mark_reg_graph_node(regs, BPF_REG_0, &field->graph_root);
- } else if (meta.func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx]) {
- mark_reg_known_zero(env, regs, BPF_REG_0);
- regs[BPF_REG_0].type = PTR_TO_BTF_ID | PTR_TRUSTED;
- regs[BPF_REG_0].btf = desc_btf;
- regs[BPF_REG_0].btf_id = meta.ret_btf_id;
- } else if (meta.func_id == special_kfunc_list[KF_bpf_rdonly_cast]) {
- ret_t = btf_type_by_id(desc_btf, meta.arg_constant.value);
- if (!ret_t || !btf_type_is_struct(ret_t)) {
- verbose(env,
- "kfunc bpf_rdonly_cast type ID argument must be of a struct\n");
- return -EINVAL;
- }
-
- mark_reg_known_zero(env, regs, BPF_REG_0);
- regs[BPF_REG_0].type = PTR_TO_BTF_ID | PTR_UNTRUSTED;
- regs[BPF_REG_0].btf = desc_btf;
- regs[BPF_REG_0].btf_id = meta.arg_constant.value;
- } else if (meta.func_id == special_kfunc_list[KF_bpf_dynptr_slice] ||
- meta.func_id == special_kfunc_list[KF_bpf_dynptr_slice_rdwr]) {
- enum bpf_type_flag type_flag = get_dynptr_type_flag(meta.initialized_dynptr.type);
-
- mark_reg_known_zero(env, regs, BPF_REG_0);
-
- if (!meta.arg_constant.found) {
- verbose(env, "verifier internal error: bpf_dynptr_slice(_rdwr) no constant size\n");
- return -EFAULT;
- }
-
- regs[BPF_REG_0].mem_size = meta.arg_constant.value;
-
- /* PTR_MAYBE_NULL will be added when is_kfunc_ret_null is checked */
- regs[BPF_REG_0].type = PTR_TO_MEM | type_flag;
-
- if (meta.func_id == special_kfunc_list[KF_bpf_dynptr_slice]) {
- regs[BPF_REG_0].type |= MEM_RDONLY;
- } else {
- /* this will set env->seen_direct_write to true */
- if (!may_access_direct_pkt_data(env, NULL, BPF_WRITE)) {
- verbose(env, "the prog does not allow writes to packet data\n");
- return -EINVAL;
- }
- }
-
- if (!meta.initialized_dynptr.id) {
- verbose(env, "verifier internal error: no dynptr id\n");
- return -EFAULT;
- }
- regs[BPF_REG_0].dynptr_id = meta.initialized_dynptr.id;
-
- /* we don't need to set BPF_REG_0's ref obj id
- * because packet slices are not refcounted (see
- * dynptr_type_refcounted)
- */
- } else {
- verbose(env, "kernel function %s unhandled dynamic return type\n",
- meta.func_name);
- return -EFAULT;
- }
+ err = check_special_kfunc(env, &meta, regs, insn_aux, ptr_type, desc_btf);
+ if (err) {
+ if (err < 0)
+ return err;
} else if (btf_type_is_void(ptr_type)) {
/* kfunc returning 'void *' is equivalent to returning scalar */
mark_reg_unknown(env, regs, BPF_REG_0);
@@ -13881,14 +13898,14 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
if (is_kfunc_ret_null(&meta))
regs[BPF_REG_0].id = id;
regs[BPF_REG_0].ref_obj_id = id;
- } else if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_first]) {
+ } else if (is_rbtree_node_type(ptr_type) || is_list_node_type(ptr_type)) {
ref_set_non_owning(env, &regs[BPF_REG_0]);
}
if (reg_may_point_to_spin_lock(&regs[BPF_REG_0]) && !regs[BPF_REG_0].id)
regs[BPF_REG_0].id = ++env->id_gen;
} else if (btf_type_is_void(t)) {
- if (meta.btf == btf_vmlinux && btf_id_set_contains(&special_kfunc_set, meta.func_id)) {
+ if (meta.btf == btf_vmlinux) {
if (meta.func_id == special_kfunc_list[KF_bpf_obj_drop_impl] ||
meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_drop_impl]) {
insn_aux->kptr_struct_meta =
@@ -16377,6 +16394,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
struct bpf_reg_state *eq_branch_regs;
struct linked_regs linked_regs = {};
u8 opcode = BPF_OP(insn->code);
+ int insn_flags = 0;
bool is_jmp32;
int pred = -1;
int err;
@@ -16435,6 +16453,11 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
insn->src_reg);
return -EACCES;
}
+
+ if (src_reg->type == PTR_TO_STACK)
+ insn_flags |= INSN_F_SRC_REG_STACK;
+ if (dst_reg->type == PTR_TO_STACK)
+ insn_flags |= INSN_F_DST_REG_STACK;
} else {
if (insn->src_reg != BPF_REG_0) {
verbose(env, "BPF_JMP/JMP32 uses reserved fields\n");
@@ -16444,6 +16467,15 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
memset(src_reg, 0, sizeof(*src_reg));
src_reg->type = SCALAR_VALUE;
__mark_reg_known(src_reg, insn->imm);
+
+ if (dst_reg->type == PTR_TO_STACK)
+ insn_flags |= INSN_F_DST_REG_STACK;
+ }
+
+ if (insn_flags) {
+ err = push_insn_history(env, this_branch, insn_flags, 0);
+ if (err)
+ return err;
}
is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32;
@@ -19659,10 +19691,9 @@ process_bpf_exit:
return err;
break;
} else {
- if (WARN_ON_ONCE(env->cur_state->loop_entry)) {
- verbose(env, "verifier bug: env->cur_state->loop_entry != NULL\n");
+ if (verifier_bug_if(env->cur_state->loop_entry, env,
+ "broken loop detection"))
return -EFAULT;
- }
do_print_state = true;
continue;
}
@@ -20720,10 +20751,9 @@ static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env,
if (bpf_pseudo_kfunc_call(&insn))
continue;
- if (WARN_ON(load_reg == -1)) {
- verbose(env, "verifier bug. zext_dst is set, but no reg is defined\n");
+ if (verifier_bug_if(load_reg == -1, env,
+ "zext_dst is set, but no reg is defined"))
return -EFAULT;
- }
zext_patch[0] = insn;
zext_patch[1].dst_reg = load_reg;
@@ -21040,11 +21070,9 @@ static int jit_subprogs(struct bpf_verifier_env *env)
* propagated in any case.
*/
subprog = find_subprog(env, i + insn->imm + 1);
- if (subprog < 0) {
- WARN_ONCE(1, "verifier bug. No program starts at insn %d\n",
- i + insn->imm + 1);
+ if (verifier_bug_if(subprog < 0, env, "No program to jit at insn %d",
+ i + insn->imm + 1))
return -EFAULT;
- }
/* temporarily remember subprog id inside insn instead of
* aux_data, since next loop will split up all insns into funcs
*/
@@ -21487,13 +21515,17 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) {
insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1);
*cnt = 1;
- } else if (is_bpf_wq_set_callback_impl_kfunc(desc->func_id)) {
- struct bpf_insn ld_addrs[2] = { BPF_LD_IMM64(BPF_REG_4, (long)env->prog->aux) };
+ }
- insn_buf[0] = ld_addrs[0];
- insn_buf[1] = ld_addrs[1];
- insn_buf[2] = *insn;
- *cnt = 3;
+ if (env->insn_aux_data[insn_idx].arg_prog) {
+ u32 regno = env->insn_aux_data[insn_idx].arg_prog;
+ struct bpf_insn ld_addrs[2] = { BPF_LD_IMM64(regno, (long)env->prog->aux) };
+ int idx = *cnt;
+
+ insn_buf[idx++] = ld_addrs[0];
+ insn_buf[idx++] = ld_addrs[1];
+ insn_buf[idx++] = *insn;
+ *cnt = idx;
}
return 0;
}
@@ -22403,7 +22435,7 @@ next_insn:
continue;
/* We need two slots in case timed may_goto is supported. */
if (stack_slots > slots) {
- verbose(env, "verifier bug: stack_slots supports may_goto only\n");
+ verifier_bug(env, "stack_slots supports may_goto only");
return -EFAULT;
}
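A note for readers of the verifier.c hunks above: the open-coded verbose("BUG ...") + WARN_ONCE() pairs are collapsed into single verifier_bug()/verifier_bug_if() calls. The macros themselves are defined outside this diff (in include/linux/bpf_verifier.h); the sketch below is only an approximation of what such helpers amount to, not the authoritative definition.

/* Illustrative sketch only -- see include/linux/bpf_verifier.h for the
 * real definitions, which may differ in detail.  One call site now both
 * logs through the verifier log and fires a one-time kernel warning.
 */
#define verifier_bug_if(cond, env, fmt, args...)			\
({									\
	bool __cond = (cond);						\
	if (unlikely(__cond)) {						\
		verbose(env, "verifier bug: " fmt "\n", ##args);	\
		WARN_ONCE(1, "verifier bug: " fmt "\n", ##args);	\
	}								\
	__cond;								\
})

#define verifier_bug(env, fmt, args...) verifier_bug_if(1, env, fmt, ##args)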
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index 95ab39e1ec8f..b14e61c64a34 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -270,9 +270,9 @@ int cgroup_task_count(const struct cgroup *cgrp);
/*
* rstat.c
*/
-int cgroup_rstat_init(struct cgroup *cgrp);
-void cgroup_rstat_exit(struct cgroup *cgrp);
-void cgroup_rstat_boot(void);
+int css_rstat_init(struct cgroup_subsys_state *css);
+void css_rstat_exit(struct cgroup_subsys_state *css);
+int ss_rstat_init(struct cgroup_subsys *ss);
void cgroup_base_stat_cputime_show(struct seq_file *seq);
/*
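The hunk above only renames the rstat entry points from per-cgroup to per-css. For orientation, a controller-side user of the new API would look roughly like the sketch below; only css_rstat_updated() and the ->css_rstat_flush callback are real interfaces touched by this series, every other name is invented for illustration.

/* Hypothetical controller using the per-css rstat API. */
struct foo_cpu_stat {
	u64 events;
};

struct foo_css {
	struct cgroup_subsys_state css;
	struct foo_cpu_stat __percpu *pcpu;	/* per-CPU deltas */
	u64 events;				/* flushed total */
};

static void foo_charge(struct foo_css *fcss)
{
	int cpu = get_cpu();	/* pin the CPU across the update pair */

	this_cpu_inc(fcss->pcpu->events);
	/* link the css into the per-CPU updated tree (see rstat.c below) */
	css_rstat_updated(&fcss->css, cpu);
	put_cpu();
}

/* ->css_rstat_flush: fold one CPU's delta, called under the ss rstat lock.
 * A real controller would also propagate the delta to css->parent and use
 * proper synchronization when reading a remote CPU's counter; this is a
 * deliberately simplified sketch.
 */
static void foo_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
{
	struct foo_css *fcss = container_of(css, struct foo_css, css);
	struct foo_cpu_stat *pstat = per_cpu_ptr(fcss->pcpu, cpu);

	fcss->events += pstat->events;
	pstat->events = 0;
}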
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 63e5b90da1f3..a723b7dc6e4e 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -95,6 +95,9 @@ EXPORT_SYMBOL_GPL(cgroup_mutex);
EXPORT_SYMBOL_GPL(css_set_lock);
#endif
+struct blocking_notifier_head cgroup_lifetime_notifier =
+ BLOCKING_NOTIFIER_INIT(cgroup_lifetime_notifier);
+
DEFINE_SPINLOCK(trace_cgroup_path_lock);
char trace_cgroup_path[TRACE_CGROUP_PATH_LEN];
static bool cgroup_debug __read_mostly;
@@ -161,10 +164,14 @@ static struct static_key_true *cgroup_subsys_on_dfl_key[] = {
};
#undef SUBSYS
-static DEFINE_PER_CPU(struct cgroup_rstat_cpu, cgrp_dfl_root_rstat_cpu);
+static DEFINE_PER_CPU(struct css_rstat_cpu, root_rstat_cpu);
+static DEFINE_PER_CPU(struct cgroup_rstat_base_cpu, root_rstat_base_cpu);
/* the default hierarchy */
-struct cgroup_root cgrp_dfl_root = { .cgrp.rstat_cpu = &cgrp_dfl_root_rstat_cpu };
+struct cgroup_root cgrp_dfl_root = {
+ .cgrp.self.rstat_cpu = &root_rstat_cpu,
+ .cgrp.rstat_base_cpu = &root_rstat_base_cpu,
+};
EXPORT_SYMBOL_GPL(cgrp_dfl_root);
/*
@@ -1335,6 +1342,7 @@ static void cgroup_destroy_root(struct cgroup_root *root)
{
struct cgroup *cgrp = &root->cgrp;
struct cgrp_cset_link *link, *tmp_link;
+ int ret;
trace_cgroup_destroy_root(root);
@@ -1343,6 +1351,10 @@ static void cgroup_destroy_root(struct cgroup_root *root)
BUG_ON(atomic_read(&root->nr_cgrps));
BUG_ON(!list_empty(&cgrp->self.children));
+ ret = blocking_notifier_call_chain(&cgroup_lifetime_notifier,
+ CGROUP_LIFETIME_OFFLINE, cgrp);
+ WARN_ON_ONCE(notifier_to_errno(ret));
+
/* Rebind all subsystems back to the default hierarchy */
WARN_ON(rebind_subsystems(&cgrp_dfl_root, root->subsys_mask));
@@ -1371,7 +1383,6 @@ static void cgroup_destroy_root(struct cgroup_root *root)
cgroup_unlock();
- cgroup_rstat_exit(cgrp);
kernfs_destroy_root(root->kf_root);
cgroup_free_root(root);
}
@@ -1715,7 +1726,7 @@ static void css_clear_dir(struct cgroup_subsys_state *css)
css->flags &= ~CSS_VISIBLE;
- if (!css->ss) {
+ if (css_is_self(css)) {
if (cgroup_on_dfl(cgrp)) {
cgroup_addrm_files(css, cgrp,
cgroup_base_files, false);
@@ -1747,7 +1758,7 @@ static int css_populate_dir(struct cgroup_subsys_state *css)
if (css->flags & CSS_VISIBLE)
return 0;
- if (!css->ss) {
+ if (css_is_self(css)) {
if (cgroup_on_dfl(cgrp)) {
ret = cgroup_addrm_files(css, cgrp,
cgroup_base_files, true);
@@ -1876,13 +1887,6 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
}
spin_unlock_irq(&css_set_lock);
- if (ss->css_rstat_flush) {
- list_del_rcu(&css->rstat_css_node);
- synchronize_rcu();
- list_add_rcu(&css->rstat_css_node,
- &dcgrp->rstat_css_list);
- }
-
/* default hierarchy doesn't enable controllers by default */
dst_root->subsys_mask |= 1 << ssid;
if (dst_root == &cgrp_dfl_root) {
@@ -2065,7 +2069,6 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
cgrp->dom_cgrp = cgrp;
cgrp->max_descendants = INT_MAX;
cgrp->max_depth = INT_MAX;
- INIT_LIST_HEAD(&cgrp->rstat_css_list);
prev_cputime_init(&cgrp->prev_cputime);
for_each_subsys(ss, ssid)
@@ -2146,7 +2149,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
if (ret)
goto destroy_root;
- ret = cgroup_rstat_init(root_cgrp);
+ ret = css_rstat_init(&root_cgrp->self);
if (ret)
goto destroy_root;
@@ -2154,10 +2157,9 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
if (ret)
goto exit_stats;
- if (root == &cgrp_dfl_root) {
- ret = cgroup_bpf_inherit(root_cgrp);
- WARN_ON_ONCE(ret);
- }
+ ret = blocking_notifier_call_chain(&cgroup_lifetime_notifier,
+ CGROUP_LIFETIME_ONLINE, root_cgrp);
+ WARN_ON_ONCE(notifier_to_errno(ret));
trace_cgroup_setup_root(root);
@@ -2188,7 +2190,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
goto out;
exit_stats:
- cgroup_rstat_exit(root_cgrp);
+ css_rstat_exit(&root_cgrp->self);
destroy_root:
kernfs_destroy_root(root->kf_root);
root->kf_root = NULL;
@@ -5444,8 +5446,9 @@ static void css_free_rwork_fn(struct work_struct *work)
struct cgroup *cgrp = css->cgroup;
percpu_ref_exit(&css->refcnt);
+ css_rstat_exit(css);
- if (ss) {
+ if (!css_is_self(css)) {
/* css free path */
struct cgroup_subsys_state *parent = css->parent;
int id = css->id;
@@ -5474,7 +5477,6 @@ static void css_free_rwork_fn(struct work_struct *work)
cgroup_put(cgroup_parent(cgrp));
kernfs_put(cgrp->kn);
psi_cgroup_free(cgrp);
- cgroup_rstat_exit(cgrp);
kfree(cgrp);
} else {
/*
@@ -5499,14 +5501,10 @@ static void css_release_work_fn(struct work_struct *work)
css->flags |= CSS_RELEASED;
list_del_rcu(&css->sibling);
- if (ss) {
+ if (!css_is_self(css)) {
struct cgroup *parent_cgrp;
- /* css release path */
- if (!list_empty(&css->rstat_css_node)) {
- cgroup_rstat_flush(cgrp);
- list_del_rcu(&css->rstat_css_node);
- }
+ css_rstat_flush(css);
cgroup_idr_replace(&ss->css_idr, NULL, css->id);
if (ss->css_released)
@@ -5532,7 +5530,7 @@ static void css_release_work_fn(struct work_struct *work)
/* cgroup release path */
TRACE_CGROUP_PATH(release, cgrp);
- cgroup_rstat_flush(cgrp);
+ css_rstat_flush(&cgrp->self);
spin_lock_irq(&css_set_lock);
for (tcgrp = cgroup_parent(cgrp); tcgrp;
@@ -5580,7 +5578,6 @@ static void init_and_link_css(struct cgroup_subsys_state *css,
css->id = -1;
INIT_LIST_HEAD(&css->sibling);
INIT_LIST_HEAD(&css->children);
- INIT_LIST_HEAD(&css->rstat_css_node);
css->serial_nr = css_serial_nr_next++;
atomic_set(&css->online_cnt, 0);
@@ -5589,9 +5586,6 @@ static void init_and_link_css(struct cgroup_subsys_state *css,
css_get(css->parent);
}
- if (ss->css_rstat_flush)
- list_add_rcu(&css->rstat_css_node, &cgrp->rstat_css_list);
-
BUG_ON(cgroup_css(cgrp, ss));
}
@@ -5684,6 +5678,10 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
goto err_free_css;
css->id = err;
+ err = css_rstat_init(css);
+ if (err)
+ goto err_free_css;
+
/* @css is ready to be brought online now, make it visible */
list_add_tail_rcu(&css->sibling, &parent_css->children);
cgroup_idr_replace(&ss->css_idr, css, css->id);
@@ -5697,7 +5695,6 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
err_list_del:
list_del_rcu(&css->sibling);
err_free_css:
- list_del_rcu(&css->rstat_css_node);
INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn);
queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork);
return ERR_PTR(err);
@@ -5713,7 +5710,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
struct cgroup_root *root = parent->root;
struct cgroup *cgrp, *tcgrp;
struct kernfs_node *kn;
- int level = parent->level + 1;
+ int i, level = parent->level + 1;
int ret;
/* allocate the cgroup and its ID, 0 is reserved for the root */
@@ -5725,17 +5722,13 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
if (ret)
goto out_free_cgrp;
- ret = cgroup_rstat_init(cgrp);
- if (ret)
- goto out_cancel_ref;
-
/* create the directory */
kn = kernfs_create_dir_ns(parent->kn, name, mode,
current_fsuid(), current_fsgid(),
cgrp, NULL);
if (IS_ERR(kn)) {
ret = PTR_ERR(kn);
- goto out_stat_exit;
+ goto out_cancel_ref;
}
cgrp->kn = kn;
@@ -5745,15 +5738,20 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
cgrp->root = root;
cgrp->level = level;
- ret = psi_cgroup_alloc(cgrp);
+ /*
+ * Now that init_cgroup_housekeeping() has been called and cgrp->self
+ * is set up, it is safe to perform rstat initialization on it.
+ */
+ ret = css_rstat_init(&cgrp->self);
if (ret)
goto out_kernfs_remove;
- if (cgrp->root == &cgrp_dfl_root) {
- ret = cgroup_bpf_inherit(cgrp);
- if (ret)
- goto out_psi_free;
- }
+ ret = psi_cgroup_alloc(cgrp);
+ if (ret)
+ goto out_stat_exit;
+
+ for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp))
+ cgrp->ancestors[tcgrp->level] = tcgrp;
/*
* New cgroup inherits effective freeze counter, and
@@ -5771,24 +5769,6 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
set_bit(CGRP_FROZEN, &cgrp->flags);
}
- spin_lock_irq(&css_set_lock);
- for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
- cgrp->ancestors[tcgrp->level] = tcgrp;
-
- if (tcgrp != cgrp) {
- tcgrp->nr_descendants++;
-
- /*
- * If the new cgroup is frozen, all ancestor cgroups
- * get a new frozen descendant, but their state can't
- * change because of this.
- */
- if (cgrp->freezer.e_freeze)
- tcgrp->freezer.nr_frozen_descendants++;
- }
- }
- spin_unlock_irq(&css_set_lock);
-
if (notify_on_release(parent))
set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
@@ -5797,7 +5777,29 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
cgrp->self.serial_nr = css_serial_nr_next++;
+ ret = blocking_notifier_call_chain_robust(&cgroup_lifetime_notifier,
+ CGROUP_LIFETIME_ONLINE,
+ CGROUP_LIFETIME_OFFLINE, cgrp);
+ ret = notifier_to_errno(ret);
+ if (ret)
+ goto out_psi_free;
+
/* allocation complete, commit to creation */
+ spin_lock_irq(&css_set_lock);
+ for (i = 0; i < level; i++) {
+ tcgrp = cgrp->ancestors[i];
+ tcgrp->nr_descendants++;
+
+ /*
+ * If the new cgroup is frozen, all ancestor cgroups get a new
+ * frozen descendant, but their state can't change because of
+ * this.
+ */
+ if (cgrp->freezer.e_freeze)
+ tcgrp->freezer.nr_frozen_descendants++;
+ }
+ spin_unlock_irq(&css_set_lock);
+
list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children);
atomic_inc(&root->nr_cgrps);
cgroup_get_live(parent);
@@ -5815,10 +5817,10 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
out_psi_free:
psi_cgroup_free(cgrp);
+out_stat_exit:
+ css_rstat_exit(&cgrp->self);
out_kernfs_remove:
kernfs_remove(cgrp->kn);
-out_stat_exit:
- cgroup_rstat_exit(cgrp);
out_cancel_ref:
percpu_ref_exit(&cgrp->self.refcnt);
out_free_cgrp:
@@ -6015,7 +6017,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
struct cgroup *tcgrp, *parent = cgroup_parent(cgrp);
struct cgroup_subsys_state *css;
struct cgrp_cset_link *link;
- int ssid;
+ int ssid, ret;
lockdep_assert_held(&cgroup_mutex);
@@ -6073,8 +6075,9 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
cgroup1_check_for_release(parent);
- if (cgrp->root == &cgrp_dfl_root)
- cgroup_bpf_offline(cgrp);
+ ret = blocking_notifier_call_chain(&cgroup_lifetime_notifier,
+ CGROUP_LIFETIME_OFFLINE, cgrp);
+ WARN_ON_ONCE(notifier_to_errno(ret));
/* put the base reference */
percpu_ref_kill(&cgrp->self.refcnt);
@@ -6136,6 +6139,9 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
} else {
css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL);
BUG_ON(css->id < 0);
+
+ BUG_ON(ss_rstat_init(ss));
+ BUG_ON(css_rstat_init(css));
}
/* Update the init_css_set to contain a subsys
@@ -6184,6 +6190,8 @@ int __init cgroup_init_early(void)
ss->id, ss->name);
WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN,
"cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]);
+ WARN(ss->early_init && ss->css_rstat_flush,
+ "cgroup rstat cannot be used with early init subsystem\n");
ss->id = i;
ss->name = cgroup_subsys_name[i];
@@ -6212,7 +6220,7 @@ int __init cgroup_init(void)
BUG_ON(cgroup_init_cftypes(NULL, cgroup_psi_files));
BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files));
- cgroup_rstat_boot();
+ BUG_ON(ss_rstat_init(NULL));
get_user_ns(init_cgroup_ns.user_ns);
@@ -6225,6 +6233,8 @@ int __init cgroup_init(void)
hash_add(css_set_table, &init_css_set.hlist,
css_set_hash(init_css_set.subsys));
+ cgroup_bpf_lifetime_notifier_init();
+
BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));
cgroup_unlock();
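The cgroup.c changes above replace the hard-coded cgroup_bpf_inherit()/cgroup_bpf_offline() calls with a blocking notifier chain (cgroup_lifetime_notifier with CGROUP_LIFETIME_ONLINE/OFFLINE events) and call cgroup_bpf_lifetime_notifier_init() at boot. That init function lives in kernel/bpf/cgroup.c and is not shown in this excerpt; a plausible shape for such a consumer, assuming it simply mirrors the removed calls, is:

/* Sketch of a cgroup_lifetime_notifier consumer.  Only the notifier
 * chain and the CGROUP_LIFETIME_* events come from this patch; the
 * callback body below is an assumption about kernel/bpf/cgroup.c.
 */
static int cgroup_bpf_lifetime_notify(struct notifier_block *nb,
				      unsigned long action, void *data)
{
	struct cgroup *cgrp = data;
	int ret = 0;

	/* the removed code only acted on the default hierarchy */
	if (cgrp->root != &cgrp_dfl_root)
		return NOTIFY_OK;

	switch (action) {
	case CGROUP_LIFETIME_ONLINE:
		ret = cgroup_bpf_inherit(cgrp);
		break;
	case CGROUP_LIFETIME_OFFLINE:
		cgroup_bpf_offline(cgrp);
		break;
	}

	return notifier_from_errno(ret);
}

static struct notifier_block cgroup_bpf_lifetime_nb = {
	.notifier_call = cgroup_bpf_lifetime_notify,
};

void __init cgroup_bpf_lifetime_notifier_init(void)
{
	blocking_notifier_chain_register(&cgroup_lifetime_notifier,
					 &cgroup_bpf_lifetime_nb);
}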
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 24b70ea3e6ce..6d3ac19cc2ac 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -192,6 +192,20 @@ static inline void notify_partition_change(struct cpuset *cs, int old_prs)
WRITE_ONCE(cs->prs_err, PERR_NONE);
}
+/*
+ * The top_cpuset is always synchronized to cpu_active_mask and we should avoid
+ * using cpu_online_mask as much as possible. An active CPU is always an online
+ * CPU, but not vice versa. cpu_active_mask and cpu_online_mask can differ
+ * during hotplug operations. A CPU is marked active at the last stage of CPU
+ * bringup (CPUHP_AP_ACTIVE). It is also the stage where cpuset hotplug code
+ * will be called to update the sched domains so that the scheduler can move
+ * a normal task to a newly active CPU or move tasks away from a newly
+ * inactivated CPU. The online bit is set much earlier in the CPU bringup
+ * process and cleared much later in CPU teardown.
+ *
+ * If cpu_online_mask is used while a hotunplug operation is happening in
+ * parallel, we may leave an offline CPU in cpus_allowed or some other masks.
+ */
static struct cpuset top_cpuset = {
.flags = BIT(CS_ONLINE) | BIT(CS_CPU_EXCLUSIVE) |
BIT(CS_MEM_EXCLUSIVE) | BIT(CS_SCHED_LOAD_BALANCE),
@@ -355,18 +369,18 @@ static inline bool partition_is_populated(struct cpuset *cs,
* appropriate cpus.
*
* One way or another, we guarantee to return some non-empty subset
- * of cpu_online_mask.
+ * of cpu_active_mask.
*
* Call with callback_lock or cpuset_mutex held.
*/
-static void guarantee_online_cpus(struct task_struct *tsk,
+static void guarantee_active_cpus(struct task_struct *tsk,
struct cpumask *pmask)
{
const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
struct cpuset *cs;
- if (WARN_ON(!cpumask_and(pmask, possible_mask, cpu_online_mask)))
- cpumask_copy(pmask, cpu_online_mask);
+ if (WARN_ON(!cpumask_and(pmask, possible_mask, cpu_active_mask)))
+ cpumask_copy(pmask, cpu_active_mask);
rcu_read_lock();
cs = task_cs(tsk);
@@ -1390,14 +1404,12 @@ static int compute_effective_exclusive_cpumask(struct cpuset *cs,
if (sibling == cs)
continue;
- if (!cpumask_empty(sibling->exclusive_cpus) &&
- cpumask_intersects(xcpus, sibling->exclusive_cpus)) {
+ if (cpumask_intersects(xcpus, sibling->exclusive_cpus)) {
cpumask_andnot(xcpus, xcpus, sibling->exclusive_cpus);
retval++;
continue;
}
- if (!cpumask_empty(sibling->effective_xcpus) &&
- cpumask_intersects(xcpus, sibling->effective_xcpus)) {
+ if (cpumask_intersects(xcpus, sibling->effective_xcpus)) {
cpumask_andnot(xcpus, xcpus, sibling->effective_xcpus);
retval++;
}
@@ -1441,13 +1453,15 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs,
* The requested exclusive_cpus must not be allocated to other
* partitions and it can't use up all the root's effective_cpus.
*
- * Note that if there is any local partition root above it or
- * remote partition root underneath it, its exclusive_cpus must
- * have overlapped with subpartitions_cpus.
+ * The effective_xcpus mask can contain offline CPUs, but at least one
+ * online CPU must be present before the partition can be enabled.
+ *
+ * Note that creating a remote partition with any local partition root
+ * above it or remote partition root underneath it is not allowed.
*/
compute_effective_exclusive_cpumask(cs, tmp->new_cpus, NULL);
- if (cpumask_empty(tmp->new_cpus) ||
- cpumask_intersects(tmp->new_cpus, subpartitions_cpus) ||
+ WARN_ON_ONCE(cpumask_intersects(tmp->new_cpus, subpartitions_cpus));
+ if (!cpumask_intersects(tmp->new_cpus, cpu_active_mask) ||
cpumask_subset(top_cpuset.effective_cpus, tmp->new_cpus))
return PERR_INVCPUS;
@@ -1543,6 +1557,7 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *xcpus,
* left in the top cpuset.
*/
if (adding) {
+ WARN_ON_ONCE(cpumask_intersects(tmp->addmask, subpartitions_cpus));
if (!capable(CAP_SYS_ADMIN))
cs->prs_err = PERR_ACCESS;
else if (cpumask_intersects(tmp->addmask, subpartitions_cpus) ||
@@ -1652,7 +1667,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
bool nocpu;
lockdep_assert_held(&cpuset_mutex);
- WARN_ON_ONCE(is_remote_partition(cs));
+ WARN_ON_ONCE(is_remote_partition(cs)); /* For local partition only */
/*
* new_prs will only be changed for the partcmd_update and
@@ -1698,7 +1713,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
* exclusive_cpus not set. Sibling conflict should only happen
* if exclusive_cpus isn't set.
*/
- xcpus = tmp->new_cpus;
+ xcpus = tmp->delmask;
if (compute_effective_exclusive_cpumask(cs, xcpus, NULL))
WARN_ON_ONCE(!cpumask_empty(cs->exclusive_cpus));
@@ -1719,9 +1734,20 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
if (nocpu)
return PERR_NOCPUS;
- deleting = cpumask_and(tmp->delmask, xcpus, parent->effective_xcpus);
- if (deleting)
- subparts_delta++;
+ /*
+ * This function will only be called when all the preliminary
+ * checks have passed. At this point, the following condition
+ * should hold.
+ *
+ * (cs->effective_xcpus & cpu_active_mask) ⊆ parent->effective_cpus
+ *
+ * Warn if it is not the case.
+ */
+ cpumask_and(tmp->new_cpus, xcpus, cpu_active_mask);
+ WARN_ON_ONCE(!cpumask_subset(tmp->new_cpus, parent->effective_cpus));
+
+ deleting = true;
+ subparts_delta++;
new_prs = (cmd == partcmd_enable) ? PRS_ROOT : PRS_ISOLATED;
} else if (cmd == partcmd_disable) {
/*
@@ -1776,6 +1802,15 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
parent->effective_xcpus);
}
/*
+ * The new CPUs to be removed from parent's effective CPUs
+ * must be present.
+ */
+ if (deleting) {
+ cpumask_and(tmp->new_cpus, tmp->delmask, cpu_active_mask);
+ WARN_ON_ONCE(!cpumask_subset(tmp->new_cpus, parent->effective_cpus));
+ }
+
+ /*
* Make partition invalid if parent's effective_cpus could
* become empty and there are tasks in the parent.
*/
@@ -2265,7 +2300,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
bool force = false;
int old_prs = cs->partition_root_state;
- /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
+ /* top_cpuset.cpus_allowed tracks cpu_active_mask; it's read-only */
if (cs == &top_cpuset)
return -EACCES;
@@ -3084,7 +3119,7 @@ static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task)
lockdep_assert_held(&cpuset_mutex);
if (cs != &top_cpuset)
- guarantee_online_cpus(task, cpus_attach);
+ guarantee_active_cpus(task, cpus_attach);
else
cpumask_andnot(cpus_attach, task_cpu_possible_mask(task),
subpartitions_cpus);
@@ -3526,11 +3561,7 @@ out_unlock:
* will call rebuild_sched_domains_locked(). That is not needed
* in the default hierarchy where only changes in partition
* will cause repartitioning.
- *
- * If the cpuset has the 'sched.partition' flag enabled, simulate
- * turning 'sched.partition" off.
*/
-
static void cpuset_css_offline(struct cgroup_subsys_state *css)
{
struct cpuset *cs = css_cs(css);
@@ -3548,6 +3579,11 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
cpus_read_unlock();
}
+/*
+ * If a dying cpuset has 'cpus.partition' enabled, turn it back into a
+ * member so that its exclusive CPUs are returned to the pool for use by
+ * other online cpusets.
+ */
static void cpuset_css_killed(struct cgroup_subsys_state *css)
{
struct cpuset *cs = css_cs(css);
@@ -4028,7 +4064,7 @@ void __init cpuset_init_smp(void)
*
* Description: Returns the cpumask_var_t cpus_allowed of the cpuset
* attached to the specified @tsk. Guaranteed to return some non-empty
- * subset of cpu_online_mask, even if this means going outside the
+ * subset of cpu_active_mask, even if this means going outside the
* tasks cpuset, except when the task is in the top cpuset.
**/
@@ -4042,7 +4078,7 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
cs = task_cs(tsk);
if (cs != &top_cpuset)
- guarantee_online_cpus(tsk, pmask);
+ guarantee_active_cpus(tsk, pmask);
/*
* Tasks in the top cpuset won't get update to their cpumasks
* when a hotplug online/offline event happens. So we include all
@@ -4056,7 +4092,7 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
* allowable online cpu left, we fall back to all possible cpus.
*/
cpumask_andnot(pmask, possible_mask, subpartitions_cpus);
- if (!cpumask_intersects(pmask, cpu_online_mask))
+ if (!cpumask_intersects(pmask, cpu_active_mask))
cpumask_copy(pmask, possible_mask);
}
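The cpuset.c changes above consistently replace cpu_online_mask with cpu_active_mask. The relationship the new comment near top_cpuset relies on can be stated as a one-line invariant; the helper below is illustration only and is not part of the patch.

/* Every active CPU is online; the two masks only diverge transiently
 * during hotplug, which is why cpuset now keys everything off
 * cpu_active_mask and falls back to a safe mask when the intersection
 * comes up empty in the middle of a hot-unplug.
 */
static void cpuset_active_subset_of_online_check(void)
{
	WARN_ON_ONCE(!cpumask_subset(cpu_active_mask, cpu_online_mask));
}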
diff --git a/kernel/cgroup/misc.c b/kernel/cgroup/misc.c
index 2fa3a4fb2aaf..6a01d91ea4cb 100644
--- a/kernel/cgroup/misc.c
+++ b/kernel/cgroup/misc.c
@@ -24,6 +24,10 @@ static const char *const misc_res_name[] = {
/* AMD SEV-ES ASIDs resource */
"sev_es",
#endif
+#ifdef CONFIG_INTEL_TDX_HOST
+ /* Intel TDX HKIDs resource */
+ "tdx",
+#endif
};
/* Root misc cgroup */
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index b2239156b7de..ce4752ab9e09 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -9,18 +9,52 @@
#include <trace/events/cgroup.h>
-static DEFINE_SPINLOCK(cgroup_rstat_lock);
-static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock);
+static DEFINE_SPINLOCK(rstat_base_lock);
+static DEFINE_PER_CPU(raw_spinlock_t, rstat_base_cpu_lock);
static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu);
-static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu)
+/*
+ * Determines whether a given css can participate in rstat.
+ * css's that are cgroup::self use rstat for base stats.
+ * Other css's associated with a subsystem use rstat only when
+ * they define the ss->css_rstat_flush callback.
+ */
+static inline bool css_uses_rstat(struct cgroup_subsys_state *css)
+{
+ return css_is_self(css) || css->ss->css_rstat_flush != NULL;
+}
+
+static struct css_rstat_cpu *css_rstat_cpu(
+ struct cgroup_subsys_state *css, int cpu)
+{
+ return per_cpu_ptr(css->rstat_cpu, cpu);
+}
+
+static struct cgroup_rstat_base_cpu *cgroup_rstat_base_cpu(
+ struct cgroup *cgrp, int cpu)
{
- return per_cpu_ptr(cgrp->rstat_cpu, cpu);
+ return per_cpu_ptr(cgrp->rstat_base_cpu, cpu);
+}
+
+static spinlock_t *ss_rstat_lock(struct cgroup_subsys *ss)
+{
+ if (ss)
+ return &ss->rstat_ss_lock;
+
+ return &rstat_base_lock;
+}
+
+static raw_spinlock_t *ss_rstat_cpu_lock(struct cgroup_subsys *ss, int cpu)
+{
+ if (ss)
+ return per_cpu_ptr(ss->rstat_ss_cpu_lock, cpu);
+
+ return per_cpu_ptr(&rstat_base_cpu_lock, cpu);
}
/*
- * Helper functions for rstat per CPU lock (cgroup_rstat_cpu_lock).
+ * Helper functions for rstat per CPU locks.
*
* This makes it easier to diagnose locking issues and contention in
* production environments. The parameter @fast_path determine the
@@ -28,20 +62,23 @@ static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu)
* operations without handling high-frequency fast-path "update" events.
*/
static __always_inline
-unsigned long _cgroup_rstat_cpu_lock(raw_spinlock_t *cpu_lock, int cpu,
- struct cgroup *cgrp, const bool fast_path)
+unsigned long _css_rstat_cpu_lock(struct cgroup_subsys_state *css, int cpu,
+ const bool fast_path)
{
+ struct cgroup *cgrp = css->cgroup;
+ raw_spinlock_t *cpu_lock;
unsigned long flags;
bool contended;
/*
- * The _irqsave() is needed because cgroup_rstat_lock is
- * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
- * this lock with the _irq() suffix only disables interrupts on
- * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
- * interrupts on both configurations. The _irqsave() ensures
- * that interrupts are always disabled and later restored.
+ * The _irqsave() is needed because the locks used for flushing are
+ * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring this lock
+ * with the _irq() suffix only disables interrupts on a non-PREEMPT_RT
+ * kernel. The raw_spinlock_t below disables interrupts on both
+ * configurations. The _irqsave() ensures that interrupts are always
+ * disabled and later restored.
*/
+ cpu_lock = ss_rstat_cpu_lock(css->ss, cpu);
contended = !raw_spin_trylock_irqsave(cpu_lock, flags);
if (contended) {
if (fast_path)
@@ -61,50 +98,59 @@ unsigned long _cgroup_rstat_cpu_lock(raw_spinlock_t *cpu_lock, int cpu,
}
static __always_inline
-void _cgroup_rstat_cpu_unlock(raw_spinlock_t *cpu_lock, int cpu,
- struct cgroup *cgrp, unsigned long flags,
- const bool fast_path)
+void _css_rstat_cpu_unlock(struct cgroup_subsys_state *css, int cpu,
+ unsigned long flags, const bool fast_path)
{
+ struct cgroup *cgrp = css->cgroup;
+ raw_spinlock_t *cpu_lock;
+
if (fast_path)
trace_cgroup_rstat_cpu_unlock_fastpath(cgrp, cpu, false);
else
trace_cgroup_rstat_cpu_unlock(cgrp, cpu, false);
+ cpu_lock = ss_rstat_cpu_lock(css->ss, cpu);
raw_spin_unlock_irqrestore(cpu_lock, flags);
}
/**
- * cgroup_rstat_updated - keep track of updated rstat_cpu
- * @cgrp: target cgroup
+ * css_rstat_updated - keep track of updated rstat_cpu
+ * @css: target cgroup subsystem state
* @cpu: cpu on which rstat_cpu was updated
*
- * @cgrp's rstat_cpu on @cpu was updated. Put it on the parent's matching
- * rstat_cpu->updated_children list. See the comment on top of
- * cgroup_rstat_cpu definition for details.
+ * @css's rstat_cpu on @cpu was updated. Put it on the parent's matching
+ * rstat_cpu->updated_children list. See the comment on top of
+ * css_rstat_cpu definition for details.
*/
-__bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
+__bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu)
{
- raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
unsigned long flags;
/*
+ * Since bpf programs can call this function, prevent access to
+ * uninitialized rstat pointers.
+ */
+ if (!css_uses_rstat(css))
+ return;
+
+ /*
* Speculative already-on-list test. This may race leading to
* temporary inaccuracies, which is fine.
*
* Because @parent's updated_children is terminated with @parent
- * instead of NULL, we can tell whether @cgrp is on the list by
+ * instead of NULL, we can tell whether @css is on the list by
* testing the next pointer for NULL.
*/
- if (data_race(cgroup_rstat_cpu(cgrp, cpu)->updated_next))
+ if (data_race(css_rstat_cpu(css, cpu)->updated_next))
return;
- flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, cgrp, true);
+ flags = _css_rstat_cpu_lock(css, cpu, true);
- /* put @cgrp and all ancestors on the corresponding updated lists */
+ /* put @css and all ancestors on the corresponding updated lists */
while (true) {
- struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
- struct cgroup *parent = cgroup_parent(cgrp);
- struct cgroup_rstat_cpu *prstatc;
+ struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu);
+ struct cgroup_subsys_state *parent = css->parent;
+ struct css_rstat_cpu *prstatc;
/*
* Both additions and removals are bottom-up. If a cgroup
@@ -115,53 +161,78 @@ __bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
/* Root has no parent to link it to, but mark it busy */
if (!parent) {
- rstatc->updated_next = cgrp;
+ rstatc->updated_next = css;
break;
}
- prstatc = cgroup_rstat_cpu(parent, cpu);
+ prstatc = css_rstat_cpu(parent, cpu);
rstatc->updated_next = prstatc->updated_children;
- prstatc->updated_children = cgrp;
+ prstatc->updated_children = css;
- cgrp = parent;
+ css = parent;
}
- _cgroup_rstat_cpu_unlock(cpu_lock, cpu, cgrp, flags, true);
+ _css_rstat_cpu_unlock(css, cpu, flags, true);
}
/**
- * cgroup_rstat_push_children - push children cgroups into the given list
+ * css_rstat_push_children - push children css's into the given list
* @head: current head of the list (= subtree root)
* @child: first child of the root
* @cpu: target cpu
- * Return: A new singly linked list of cgroups to be flush
+ * Return: A new singly linked list of css's to be flushed
*
- * Iteratively traverse down the cgroup_rstat_cpu updated tree level by
+ * Iteratively traverse down the css_rstat_cpu updated tree level by
* level and push all the parents first before their next level children
- * into a singly linked list built from the tail backward like "pushing"
- * cgroups into a stack. The root is pushed by the caller.
+ * into a singly linked list via the rstat_flush_next pointer built from the
+ * tail backward like "pushing" css's into a stack. The root is pushed by
+ * the caller.
*/
-static struct cgroup *cgroup_rstat_push_children(struct cgroup *head,
- struct cgroup *child, int cpu)
+static struct cgroup_subsys_state *css_rstat_push_children(
+ struct cgroup_subsys_state *head,
+ struct cgroup_subsys_state *child, int cpu)
{
- struct cgroup *chead = child; /* Head of child cgroup level */
- struct cgroup *ghead = NULL; /* Head of grandchild cgroup level */
- struct cgroup *parent, *grandchild;
- struct cgroup_rstat_cpu *crstatc;
+ struct cgroup_subsys_state *cnext = child; /* Next head of child css level */
+ struct cgroup_subsys_state *ghead = NULL; /* Head of grandchild css level */
+ struct cgroup_subsys_state *parent, *grandchild;
+ struct css_rstat_cpu *crstatc;
child->rstat_flush_next = NULL;
+ /*
+ * The subsystem rstat lock must be held for the whole duration from
+ * here, where the rstat_flush_next list is constructed, until the
+ * list is consumed later in css_rstat_flush().
+ */
+ lockdep_assert_held(ss_rstat_lock(head->ss));
+
+ /*
+ * Notation: -> updated_next pointer
+ * => rstat_flush_next pointer
+ *
+ * Assuming the following sample updated_children lists:
+ * P: C1 -> C2 -> P
+ * C1: G11 -> G12 -> C1
+ * C2: G21 -> G22 -> C2
+ *
+ * After 1st iteration:
+ * head => C2 => C1 => NULL
+ * ghead => G21 => G11 => NULL
+ *
+ * After 2nd iteration:
+ * head => G12 => G11 => G22 => G21 => C2 => C1 => NULL
+ */
next_level:
- while (chead) {
- child = chead;
- chead = child->rstat_flush_next;
- parent = cgroup_parent(child);
+ while (cnext) {
+ child = cnext;
+ cnext = child->rstat_flush_next;
+ parent = child->parent;
- /* updated_next is parent cgroup terminated */
+ /* updated_next is parent cgroup terminated if !NULL */
while (child != parent) {
child->rstat_flush_next = head;
head = child;
- crstatc = cgroup_rstat_cpu(child, cpu);
+ crstatc = css_rstat_cpu(child, cpu);
grandchild = crstatc->updated_children;
if (grandchild != child) {
/* Push the grand child to the next level */
@@ -175,7 +246,7 @@ next_level:
}
if (ghead) {
- chead = ghead;
+ cnext = ghead;
ghead = NULL;
goto next_level;
}
@@ -183,31 +254,31 @@ next_level:
}
/**
- * cgroup_rstat_updated_list - return a list of updated cgroups to be flushed
- * @root: root of the cgroup subtree to traverse
+ * css_rstat_updated_list - build a list of updated css's to be flushed
+ * @root: root of the css subtree to traverse
* @cpu: target cpu
- * Return: A singly linked list of cgroups to be flushed
+ * Return: A singly linked list of css's to be flushed
*
* Walks the updated rstat_cpu tree on @cpu from @root. During traversal,
- * each returned cgroup is unlinked from the updated tree.
+ * each returned css is unlinked from the updated tree.
*
* The only ordering guarantee is that, for a parent and a child pair
* covered by a given traversal, the child is before its parent in
* the list.
*
* Note that updated_children is self terminated and points to a list of
- * child cgroups if not empty. Whereas updated_next is like a sibling link
- * within the children list and terminated by the parent cgroup. An exception
- * here is the cgroup root whose updated_next can be self terminated.
+ * child css's if not empty. Whereas updated_next is like a sibling link
+ * within the children list and terminated by the parent css. An exception
+ * here is the css root whose updated_next can be self terminated.
*/
-static struct cgroup *cgroup_rstat_updated_list(struct cgroup *root, int cpu)
+static struct cgroup_subsys_state *css_rstat_updated_list(
+ struct cgroup_subsys_state *root, int cpu)
{
- raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
- struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(root, cpu);
- struct cgroup *head = NULL, *parent, *child;
+ struct css_rstat_cpu *rstatc = css_rstat_cpu(root, cpu);
+ struct cgroup_subsys_state *head = NULL, *parent, *child;
unsigned long flags;
- flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, root, false);
+ flags = _css_rstat_cpu_lock(root, cpu, false);
/* Return NULL if this subtree is not on-list */
if (!rstatc->updated_next)
@@ -217,17 +288,17 @@ static struct cgroup *cgroup_rstat_updated_list(struct cgroup *root, int cpu)
* Unlink @root from its parent. As the updated_children list is
* singly linked, we have to walk it to find the removal point.
*/
- parent = cgroup_parent(root);
+ parent = root->parent;
if (parent) {
- struct cgroup_rstat_cpu *prstatc;
- struct cgroup **nextp;
+ struct css_rstat_cpu *prstatc;
+ struct cgroup_subsys_state **nextp;
- prstatc = cgroup_rstat_cpu(parent, cpu);
+ prstatc = css_rstat_cpu(parent, cpu);
nextp = &prstatc->updated_children;
while (*nextp != root) {
- struct cgroup_rstat_cpu *nrstatc;
+ struct css_rstat_cpu *nrstatc;
- nrstatc = cgroup_rstat_cpu(*nextp, cpu);
+ nrstatc = css_rstat_cpu(*nextp, cpu);
WARN_ON_ONCE(*nextp == parent);
nextp = &nrstatc->updated_next;
}
@@ -242,16 +313,16 @@ static struct cgroup *cgroup_rstat_updated_list(struct cgroup *root, int cpu)
child = rstatc->updated_children;
rstatc->updated_children = root;
if (child != root)
- head = cgroup_rstat_push_children(head, child, cpu);
+ head = css_rstat_push_children(head, child, cpu);
unlock_ret:
- _cgroup_rstat_cpu_unlock(cpu_lock, cpu, root, flags, false);
+ _css_rstat_cpu_unlock(root, cpu, flags, false);
return head;
}
/*
* A hook for bpf stat collectors to attach to and flush their stats.
- * Together with providing bpf kfuncs for cgroup_rstat_updated() and
- * cgroup_rstat_flush(), this enables a complete workflow where bpf progs that
+ * Together with providing bpf kfuncs for css_rstat_updated() and
+ * css_rstat_flush(), this enables a complete workflow where bpf progs that
* collect cgroup stats can integrate with rstat for efficient flushing.
*
* A static noinline declaration here could cause the compiler to optimize away
@@ -271,7 +342,7 @@ __weak noinline void bpf_rstat_flush(struct cgroup *cgrp,
__bpf_hook_end();
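To make the bpf workflow described above concrete, here is a minimal BPF-side sketch. It is loosely modeled on the rstat selftests; the attach points, program names and the elided per-cpu counter are illustrative assumptions — only the css_rstat_updated() kfunc and the bpf_rstat_flush() hook come from this file.

```c
/* Sketch only: attach points and field accesses are illustrative. */
#include <vmlinux.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

/* kfunc registered by this file (see bpf_rstat_kfunc_ids below) */
extern void css_rstat_updated(struct cgroup_subsys_state *css, int cpu) __ksym;

/* Update path: mark the cgroup's self css as having pending per-cpu data. */
SEC("fentry/cgroup_attach_task")
int BPF_PROG(on_attach, struct cgroup *dst_cgrp, struct task_struct *leader)
{
	/* ... bump a per-cpu counter keyed by dst_cgrp->kn->id here ... */
	css_rstat_updated(&dst_cgrp->self, bpf_get_smp_processor_id());
	return 0;
}

/* Flush path: invoked for each (cgroup, cpu) pair by css_rstat_flush(). */
SEC("fentry/bpf_rstat_flush")
int BPF_PROG(on_flush, struct cgroup *cgrp, struct cgroup *parent, int cpu)
{
	/* ... fold the per-cpu counter into a global aggregate here ... */
	return 0;
}

char _license[] SEC("license") = "GPL";
```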
/*
- * Helper functions for locking cgroup_rstat_lock.
+ * Helper functions for locking.
*
* This makes it easier to diagnose locking issues and contention in
 * production environments. The parameter @cpu_in_loop indicates lock
@@ -279,115 +350,186 @@ __bpf_hook_end();
* value -1 is used when obtaining the main lock else this is the CPU
* number processed last.
*/
-static inline void __cgroup_rstat_lock(struct cgroup *cgrp, int cpu_in_loop)
- __acquires(&cgroup_rstat_lock)
+static inline void __css_rstat_lock(struct cgroup_subsys_state *css,
+ int cpu_in_loop)
+ __acquires(ss_rstat_lock(css->ss))
{
+ struct cgroup *cgrp = css->cgroup;
+ spinlock_t *lock;
bool contended;
- contended = !spin_trylock_irq(&cgroup_rstat_lock);
+ lock = ss_rstat_lock(css->ss);
+ contended = !spin_trylock_irq(lock);
if (contended) {
trace_cgroup_rstat_lock_contended(cgrp, cpu_in_loop, contended);
- spin_lock_irq(&cgroup_rstat_lock);
+ spin_lock_irq(lock);
}
trace_cgroup_rstat_locked(cgrp, cpu_in_loop, contended);
}
-static inline void __cgroup_rstat_unlock(struct cgroup *cgrp, int cpu_in_loop)
- __releases(&cgroup_rstat_lock)
+static inline void __css_rstat_unlock(struct cgroup_subsys_state *css,
+ int cpu_in_loop)
+ __releases(ss_rstat_lock(css->ss))
{
+ struct cgroup *cgrp = css->cgroup;
+ spinlock_t *lock;
+
+ lock = ss_rstat_lock(css->ss);
trace_cgroup_rstat_unlock(cgrp, cpu_in_loop, false);
- spin_unlock_irq(&cgroup_rstat_lock);
+ spin_unlock_irq(lock);
}
/**
- * cgroup_rstat_flush - flush stats in @cgrp's subtree
- * @cgrp: target cgroup
+ * css_rstat_flush - flush stats in @css's rstat subtree
+ * @css: target cgroup subsystem state
*
- * Collect all per-cpu stats in @cgrp's subtree into the global counters
- * and propagate them upwards. After this function returns, all cgroups in
- * the subtree have up-to-date ->stat.
+ * Collect all per-cpu stats in @css's subtree into the global counters
+ * and propagate them upwards. After this function returns, all rstat
+ * nodes in the subtree have up-to-date ->stat.
*
- * This also gets all cgroups in the subtree including @cgrp off the
+ * This also gets all rstat nodes in the subtree including @css off the
* ->updated_children lists.
*
* This function may block.
*/
-__bpf_kfunc void cgroup_rstat_flush(struct cgroup *cgrp)
+__bpf_kfunc void css_rstat_flush(struct cgroup_subsys_state *css)
{
int cpu;
+ bool is_self = css_is_self(css);
+
+ /*
+ * Since bpf programs can call this function, prevent access to
+ * uninitialized rstat pointers.
+ */
+ if (!css_uses_rstat(css))
+ return;
might_sleep();
for_each_possible_cpu(cpu) {
- struct cgroup *pos;
+ struct cgroup_subsys_state *pos;
/* Reacquire for each CPU to avoid disabling IRQs too long */
- __cgroup_rstat_lock(cgrp, cpu);
- pos = cgroup_rstat_updated_list(cgrp, cpu);
+ __css_rstat_lock(css, cpu);
+ pos = css_rstat_updated_list(css, cpu);
for (; pos; pos = pos->rstat_flush_next) {
- struct cgroup_subsys_state *css;
-
- cgroup_base_stat_flush(pos, cpu);
- bpf_rstat_flush(pos, cgroup_parent(pos), cpu);
-
- rcu_read_lock();
- list_for_each_entry_rcu(css, &pos->rstat_css_list,
- rstat_css_node)
- css->ss->css_rstat_flush(css, cpu);
- rcu_read_unlock();
+ if (is_self) {
+ cgroup_base_stat_flush(pos->cgroup, cpu);
+ bpf_rstat_flush(pos->cgroup,
+ cgroup_parent(pos->cgroup), cpu);
+ } else
+ pos->ss->css_rstat_flush(pos, cpu);
}
- __cgroup_rstat_unlock(cgrp, cpu);
+ __css_rstat_unlock(css, cpu);
if (!cond_resched())
cpu_relax();
}
}
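For the subsystem (non-self) branch above, a controller is expected to pair css_rstat_updated() on its update path with a ->css_rstat_flush() callback and to call css_rstat_flush() before reporting totals. A rough sketch of that contract; the counter layout and all foo_* names are hypothetical.

```c
/* Hypothetical controller-side sketch of the update/flush contract. */
struct foo_css {
	struct cgroup_subsys_state css;
	u64 total;			/* flushed, hierarchical value */
	u64 __percpu *pending;		/* per-cpu deltas, alloc_percpu()'d
					 * at css_alloc time (not shown) */
};

static void foo_charge(struct foo_css *fcss, u64 delta)
{
	int cpu = get_cpu();	/* pin the CPU for update + notification */

	*this_cpu_ptr(fcss->pending) += delta;
	/* Put this css (and lazily its ancestors) on cpu's updated tree. */
	css_rstat_updated(&fcss->css, cpu);
	put_cpu();
}

/* ->css_rstat_flush() callback, called by css_rstat_flush() for every css
 * on the per-cpu updated list; propagation to css->parent is elided. */
static void foo_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
{
	struct foo_css *fcss = container_of(css, struct foo_css, css);
	u64 *pcpu = per_cpu_ptr(fcss->pending, cpu);

	fcss->total += *pcpu;
	*pcpu = 0;
}

static u64 foo_current_read(struct cgroup_subsys_state *css, struct cftype *cft)
{
	/* Bring ->total up to date before reporting it. May sleep. */
	css_rstat_flush(css);
	return container_of(css, struct foo_css, css)->total;
}
```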
-int cgroup_rstat_init(struct cgroup *cgrp)
+int css_rstat_init(struct cgroup_subsys_state *css)
{
+ struct cgroup *cgrp = css->cgroup;
int cpu;
+ bool is_self = css_is_self(css);
+
+ if (is_self) {
+ /* the root cgrp has rstat_base_cpu preallocated */
+ if (!cgrp->rstat_base_cpu) {
+ cgrp->rstat_base_cpu = alloc_percpu(struct cgroup_rstat_base_cpu);
+ if (!cgrp->rstat_base_cpu)
+ return -ENOMEM;
+ }
+ } else if (css->ss->css_rstat_flush == NULL)
+ return 0;
+
+ /* the root cgrp's self css has rstat_cpu preallocated */
+ if (!css->rstat_cpu) {
+ css->rstat_cpu = alloc_percpu(struct css_rstat_cpu);
+ if (!css->rstat_cpu) {
+ if (is_self)
+ free_percpu(cgrp->rstat_base_cpu);
- /* the root cgrp has rstat_cpu preallocated */
- if (!cgrp->rstat_cpu) {
- cgrp->rstat_cpu = alloc_percpu(struct cgroup_rstat_cpu);
- if (!cgrp->rstat_cpu)
return -ENOMEM;
+ }
}
/* ->updated_children list is self terminated */
for_each_possible_cpu(cpu) {
- struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
+ struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu);
+
+ rstatc->updated_children = css;
+
+ if (is_self) {
+ struct cgroup_rstat_base_cpu *rstatbc;
- rstatc->updated_children = cgrp;
- u64_stats_init(&rstatc->bsync);
+ rstatbc = cgroup_rstat_base_cpu(cgrp, cpu);
+ u64_stats_init(&rstatbc->bsync);
+ }
}
return 0;
}
-void cgroup_rstat_exit(struct cgroup *cgrp)
+void css_rstat_exit(struct cgroup_subsys_state *css)
{
int cpu;
- cgroup_rstat_flush(cgrp);
+ if (!css_uses_rstat(css))
+ return;
+
+ css_rstat_flush(css);
/* sanity check */
for_each_possible_cpu(cpu) {
- struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
+ struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu);
- if (WARN_ON_ONCE(rstatc->updated_children != cgrp) ||
+ if (WARN_ON_ONCE(rstatc->updated_children != css) ||
WARN_ON_ONCE(rstatc->updated_next))
return;
}
- free_percpu(cgrp->rstat_cpu);
- cgrp->rstat_cpu = NULL;
+ if (css_is_self(css)) {
+ struct cgroup *cgrp = css->cgroup;
+
+ free_percpu(cgrp->rstat_base_cpu);
+ cgrp->rstat_base_cpu = NULL;
+ }
+
+ free_percpu(css->rstat_cpu);
+ css->rstat_cpu = NULL;
}
-void __init cgroup_rstat_boot(void)
+/**
+ * ss_rstat_init - subsystem-specific rstat initialization
+ * @ss: target subsystem
+ *
+ * If @ss is NULL, the static locks associated with the base stats
+ * are initialized. If @ss is non-NULL, the subsystem-specific locks
+ * are initialized.
+ */
+int __init ss_rstat_init(struct cgroup_subsys *ss)
{
int cpu;
+#ifdef CONFIG_SMP
+ /*
+ * On uniprocessor machines, arch_spinlock_t is defined as an empty
+ * struct. Avoid allocating a size of zero by having this block
+ * excluded in this case. It's acceptable to leave the subsystem locks
+ * uninitialized since the associated lock functions are no-ops in the
+ * non-smp case.
+ */
+ if (ss) {
+ ss->rstat_ss_cpu_lock = alloc_percpu(raw_spinlock_t);
+ if (!ss->rstat_ss_cpu_lock)
+ return -ENOMEM;
+ }
+#endif
+
+ spin_lock_init(ss_rstat_lock(ss));
for_each_possible_cpu(cpu)
- raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu));
+ raw_spin_lock_init(ss_rstat_cpu_lock(ss, cpu));
+
+ return 0;
}
/*
@@ -420,9 +562,9 @@ static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
{
- struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
+ struct cgroup_rstat_base_cpu *rstatbc = cgroup_rstat_base_cpu(cgrp, cpu);
struct cgroup *parent = cgroup_parent(cgrp);
- struct cgroup_rstat_cpu *prstatc;
+ struct cgroup_rstat_base_cpu *prstatbc;
struct cgroup_base_stat delta;
unsigned seq;
@@ -432,15 +574,15 @@ static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
/* fetch the current per-cpu values */
do {
- seq = __u64_stats_fetch_begin(&rstatc->bsync);
- delta = rstatc->bstat;
- } while (__u64_stats_fetch_retry(&rstatc->bsync, seq));
+ seq = __u64_stats_fetch_begin(&rstatbc->bsync);
+ delta = rstatbc->bstat;
+ } while (__u64_stats_fetch_retry(&rstatbc->bsync, seq));
/* propagate per-cpu delta to cgroup and per-cpu global statistics */
- cgroup_base_stat_sub(&delta, &rstatc->last_bstat);
+ cgroup_base_stat_sub(&delta, &rstatbc->last_bstat);
cgroup_base_stat_add(&cgrp->bstat, &delta);
- cgroup_base_stat_add(&rstatc->last_bstat, &delta);
- cgroup_base_stat_add(&rstatc->subtree_bstat, &delta);
+ cgroup_base_stat_add(&rstatbc->last_bstat, &delta);
+ cgroup_base_stat_add(&rstatbc->subtree_bstat, &delta);
/* propagate cgroup and per-cpu global delta to parent (unless that's root) */
if (cgroup_parent(parent)) {
@@ -449,73 +591,73 @@ static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
cgroup_base_stat_add(&parent->bstat, &delta);
cgroup_base_stat_add(&cgrp->last_bstat, &delta);
- delta = rstatc->subtree_bstat;
- prstatc = cgroup_rstat_cpu(parent, cpu);
- cgroup_base_stat_sub(&delta, &rstatc->last_subtree_bstat);
- cgroup_base_stat_add(&prstatc->subtree_bstat, &delta);
- cgroup_base_stat_add(&rstatc->last_subtree_bstat, &delta);
+ delta = rstatbc->subtree_bstat;
+ prstatbc = cgroup_rstat_base_cpu(parent, cpu);
+ cgroup_base_stat_sub(&delta, &rstatbc->last_subtree_bstat);
+ cgroup_base_stat_add(&prstatbc->subtree_bstat, &delta);
+ cgroup_base_stat_add(&rstatbc->last_subtree_bstat, &delta);
}
}
-static struct cgroup_rstat_cpu *
+static struct cgroup_rstat_base_cpu *
cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp, unsigned long *flags)
{
- struct cgroup_rstat_cpu *rstatc;
+ struct cgroup_rstat_base_cpu *rstatbc;
- rstatc = get_cpu_ptr(cgrp->rstat_cpu);
- *flags = u64_stats_update_begin_irqsave(&rstatc->bsync);
- return rstatc;
+ rstatbc = get_cpu_ptr(cgrp->rstat_base_cpu);
+ *flags = u64_stats_update_begin_irqsave(&rstatbc->bsync);
+ return rstatbc;
}
static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp,
- struct cgroup_rstat_cpu *rstatc,
+ struct cgroup_rstat_base_cpu *rstatbc,
unsigned long flags)
{
- u64_stats_update_end_irqrestore(&rstatc->bsync, flags);
- cgroup_rstat_updated(cgrp, smp_processor_id());
- put_cpu_ptr(rstatc);
+ u64_stats_update_end_irqrestore(&rstatbc->bsync, flags);
+ css_rstat_updated(&cgrp->self, smp_processor_id());
+ put_cpu_ptr(rstatbc);
}
void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec)
{
- struct cgroup_rstat_cpu *rstatc;
+ struct cgroup_rstat_base_cpu *rstatbc;
unsigned long flags;
- rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);
- rstatc->bstat.cputime.sum_exec_runtime += delta_exec;
- cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags);
+ rstatbc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);
+ rstatbc->bstat.cputime.sum_exec_runtime += delta_exec;
+ cgroup_base_stat_cputime_account_end(cgrp, rstatbc, flags);
}
void __cgroup_account_cputime_field(struct cgroup *cgrp,
enum cpu_usage_stat index, u64 delta_exec)
{
- struct cgroup_rstat_cpu *rstatc;
+ struct cgroup_rstat_base_cpu *rstatbc;
unsigned long flags;
- rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);
+ rstatbc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);
switch (index) {
case CPUTIME_NICE:
- rstatc->bstat.ntime += delta_exec;
+ rstatbc->bstat.ntime += delta_exec;
fallthrough;
case CPUTIME_USER:
- rstatc->bstat.cputime.utime += delta_exec;
+ rstatbc->bstat.cputime.utime += delta_exec;
break;
case CPUTIME_SYSTEM:
case CPUTIME_IRQ:
case CPUTIME_SOFTIRQ:
- rstatc->bstat.cputime.stime += delta_exec;
+ rstatbc->bstat.cputime.stime += delta_exec;
break;
#ifdef CONFIG_SCHED_CORE
case CPUTIME_FORCEIDLE:
- rstatc->bstat.forceidle_sum += delta_exec;
+ rstatbc->bstat.forceidle_sum += delta_exec;
break;
#endif
default:
break;
}
- cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags);
+ cgroup_base_stat_cputime_account_end(cgrp, rstatbc, flags);
}
/*
@@ -574,12 +716,12 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq)
struct cgroup_base_stat bstat;
if (cgroup_parent(cgrp)) {
- cgroup_rstat_flush(cgrp);
- __cgroup_rstat_lock(cgrp, -1);
+ css_rstat_flush(&cgrp->self);
+ __css_rstat_lock(&cgrp->self, -1);
bstat = cgrp->bstat;
cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime,
&bstat.cputime.utime, &bstat.cputime.stime);
- __cgroup_rstat_unlock(cgrp, -1);
+ __css_rstat_unlock(&cgrp->self, -1);
} else {
root_cgroup_cputime(&bstat);
}
@@ -601,10 +743,10 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq)
cgroup_force_idle_show(seq, &bstat);
}
-/* Add bpf kfuncs for cgroup_rstat_updated() and cgroup_rstat_flush() */
+/* Add bpf kfuncs for css_rstat_updated() and css_rstat_flush() */
BTF_KFUNCS_START(bpf_rstat_kfunc_ids)
-BTF_ID_FLAGS(func, cgroup_rstat_updated)
-BTF_ID_FLAGS(func, cgroup_rstat_flush, KF_SLEEPABLE)
+BTF_ID_FLAGS(func, css_rstat_updated)
+BTF_ID_FLAGS(func, css_rstat_flush, KF_SLEEPABLE)
BTF_KFUNCS_END(bpf_rstat_kfunc_ids)
static const struct btf_kfunc_id_set bpf_rstat_kfunc_set = {
diff --git a/kernel/configs/debug.config b/kernel/configs/debug.config
index 8aafd050b754..e81327d2cd63 100644
--- a/kernel/configs/debug.config
+++ b/kernel/configs/debug.config
@@ -112,3 +112,8 @@ CONFIG_BRANCH_PROFILE_NONE=y
CONFIG_DYNAMIC_FTRACE=y
CONFIG_FTRACE=y
CONFIG_FUNCTION_TRACER=y
+#
+# Preemption
+#
+CONFIG_DEBUG_PREEMPT=y
+CONFIG_PREEMPT=y
diff --git a/kernel/configs/xen.config b/kernel/configs/xen.config
index 6878b9a49be8..1875a0a5047a 100644
--- a/kernel/configs/xen.config
+++ b/kernel/configs/xen.config
@@ -13,6 +13,8 @@ CONFIG_SCSI=y
CONFIG_FB=y
CONFIG_INPUT_MISC=y
CONFIG_MEMORY_HOTPLUG=y
+CONFIG_MEMORY_HOTREMOVE=y
+CONFIG_ZONE_DEVICE=y
CONFIG_TTY=y
# Technically not required but otherwise produces
# pretty useless systems starting from allnoconfig
@@ -47,3 +49,4 @@ CONFIG_XEN_GNTDEV=m
CONFIG_XEN_GRANT_DEV_ALLOC=m
CONFIG_SWIOTLB_XEN=y
CONFIG_XEN_PRIVCMD=m
+CONFIG_XEN_UNPOPULATED_ALLOC=y
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index b8fe0b3d0ffb..24c359d9c879 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -13,6 +13,7 @@
#include <linux/vmalloc.h>
#include <linux/set_memory.h>
#include <linux/slab.h>
+#include <linux/pci-p2pdma.h>
#include "direct.h"
/*
@@ -462,34 +463,33 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
enum dma_data_direction dir, unsigned long attrs)
{
struct pci_p2pdma_map_state p2pdma_state = {};
- enum pci_p2pdma_map_type map;
struct scatterlist *sg;
int i, ret;
for_each_sg(sgl, sg, nents, i) {
- if (is_pci_p2pdma_page(sg_page(sg))) {
- map = pci_p2pdma_map_segment(&p2pdma_state, dev, sg);
- switch (map) {
- case PCI_P2PDMA_MAP_BUS_ADDR:
- continue;
- case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
- /*
- * Any P2P mapping that traverses the PCI
- * host bridge must be mapped with CPU physical
- * address and not PCI bus addresses. This is
- * done with dma_direct_map_page() below.
- */
- break;
- default:
- ret = -EREMOTEIO;
+ switch (pci_p2pdma_state(&p2pdma_state, dev, sg_page(sg))) {
+ case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
+ /*
+ * Any P2P mapping that traverses the PCI host bridge
+ * must be mapped with CPU physical address and not PCI
+ * bus addresses.
+ */
+ break;
+ case PCI_P2PDMA_MAP_NONE:
+ sg->dma_address = dma_direct_map_page(dev, sg_page(sg),
+ sg->offset, sg->length, dir, attrs);
+ if (sg->dma_address == DMA_MAPPING_ERROR) {
+ ret = -EIO;
goto out_unmap;
}
- }
-
- sg->dma_address = dma_direct_map_page(dev, sg_page(sg),
- sg->offset, sg->length, dir, attrs);
- if (sg->dma_address == DMA_MAPPING_ERROR) {
- ret = -EIO;
+ break;
+ case PCI_P2PDMA_MAP_BUS_ADDR:
+ sg->dma_address = pci_p2pdma_bus_addr_map(&p2pdma_state,
+ sg_phys(sg));
+ sg_dma_mark_bus_address(sg);
+ continue;
+ default:
+ ret = -EREMOTEIO;
goto out_unmap;
}
sg_dma_len(sg) = sg->length;
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index 051a32988040..107e4a4d251d 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -443,6 +443,24 @@ bool __dma_need_sync(struct device *dev, dma_addr_t dma_addr)
}
EXPORT_SYMBOL_GPL(__dma_need_sync);
+/**
+ * dma_need_unmap - does this device need dma_unmap_* operations
+ * @dev: device to check
+ *
+ * If this function returns %false, drivers can skip calling dma_unmap_* after
+ * finishing an I/O. This function must be called after all mappings that might
+ * need to be unmapped have been performed.
+ */
+bool dma_need_unmap(struct device *dev)
+{
+ if (!dma_map_direct(dev, get_dma_ops(dev)))
+ return true;
+ if (!dev->dma_skip_sync)
+ return true;
+ return IS_ENABLED(CONFIG_DMA_API_DEBUG);
+}
+EXPORT_SYMBOL_GPL(dma_need_unmap);
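As a rough illustration of the intended use, a driver could cache the result once all mappings for an I/O are set up and skip the unmap work on its completion path. The request structure and function names below are hypothetical; only dma_need_unmap(), dma_map_page(), dma_mapping_error() and dma_unmap_page() are existing APIs.

```c
#include <linux/dma-mapping.h>

/* Hypothetical request structure; only the dma_* calls are real APIs. */
struct my_request {
	struct device	*dev;
	struct page	*page;
	dma_addr_t	addr;
	size_t		len;
	bool		need_unmap;
};

static int my_request_map(struct my_request *req)
{
	req->addr = dma_map_page(req->dev, req->page, 0, req->len,
				 DMA_TO_DEVICE);
	if (dma_mapping_error(req->dev, req->addr))
		return -EIO;

	/* Query only after all mappings for this I/O have been performed. */
	req->need_unmap = dma_need_unmap(req->dev);
	return 0;
}

static void my_request_done(struct my_request *req)
{
	if (req->need_unmap)
		dma_unmap_page(req->dev, req->addr, req->len, DMA_TO_DEVICE);
}
```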
+
static void dma_setup_need_sync(struct device *dev)
{
const struct dma_map_ops *ops = get_dma_ops(dev);
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index 20154572ede9..a8dd1f27417c 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -146,7 +146,7 @@ static inline bool report_single_step(unsigned long work)
return work & SYSCALL_WORK_SYSCALL_EXIT_TRAP;
}
-static void syscall_exit_work(struct pt_regs *regs, unsigned long work)
+void syscall_exit_work(struct pt_regs *regs, unsigned long work)
{
bool step;
@@ -173,53 +173,6 @@ static void syscall_exit_work(struct pt_regs *regs, unsigned long work)
ptrace_report_syscall_exit(regs, step);
}
-/*
- * Syscall specific exit to user mode preparation. Runs with interrupts
- * enabled.
- */
-static void syscall_exit_to_user_mode_prepare(struct pt_regs *regs)
-{
- unsigned long work = READ_ONCE(current_thread_info()->syscall_work);
- unsigned long nr = syscall_get_nr(current, regs);
-
- CT_WARN_ON(ct_state() != CT_STATE_KERNEL);
-
- if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
- if (WARN(irqs_disabled(), "syscall %lu left IRQs disabled", nr))
- local_irq_enable();
- }
-
- rseq_syscall(regs);
-
- /*
- * Do one-time syscall specific work. If these work items are
- * enabled, we want to run them exactly once per syscall exit with
- * interrupts enabled.
- */
- if (unlikely(work & SYSCALL_WORK_EXIT))
- syscall_exit_work(regs, work);
-}
-
-static __always_inline void __syscall_exit_to_user_mode_work(struct pt_regs *regs)
-{
- syscall_exit_to_user_mode_prepare(regs);
- local_irq_disable_exit_to_user();
- exit_to_user_mode_prepare(regs);
-}
-
-void syscall_exit_to_user_mode_work(struct pt_regs *regs)
-{
- __syscall_exit_to_user_mode_work(regs);
-}
-
-__visible noinstr void syscall_exit_to_user_mode(struct pt_regs *regs)
-{
- instrumentation_begin();
- __syscall_exit_to_user_mode_work(regs);
- instrumentation_end();
- exit_to_user_mode();
-}
-
noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs)
{
enter_from_user_mode(regs);
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index ae60cae24e9a..d0af8a8b3ae6 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -43,18 +43,16 @@ unsigned long probe_irq_on(void)
* flush such a longstanding irq before considering it as spurious.
*/
for_each_irq_desc_reverse(i, desc) {
- raw_spin_lock_irq(&desc->lock);
+ guard(raw_spinlock_irq)(&desc->lock);
if (!desc->action && irq_settings_can_probe(desc)) {
/*
* Some chips need to know about probing in
* progress:
*/
if (desc->irq_data.chip->irq_set_type)
- desc->irq_data.chip->irq_set_type(&desc->irq_data,
- IRQ_TYPE_PROBE);
+ desc->irq_data.chip->irq_set_type(&desc->irq_data, IRQ_TYPE_PROBE);
irq_activate_and_startup(desc, IRQ_NORESEND);
}
- raw_spin_unlock_irq(&desc->lock);
}
/* Wait for longstanding interrupts to trigger. */
@@ -66,13 +64,12 @@ unsigned long probe_irq_on(void)
* happened in the previous stage, it may have masked itself)
*/
for_each_irq_desc_reverse(i, desc) {
- raw_spin_lock_irq(&desc->lock);
+ guard(raw_spinlock_irq)(&desc->lock);
if (!desc->action && irq_settings_can_probe(desc)) {
desc->istate |= IRQS_AUTODETECT | IRQS_WAITING;
if (irq_activate_and_startup(desc, IRQ_NORESEND))
desc->istate |= IRQS_PENDING;
}
- raw_spin_unlock_irq(&desc->lock);
}
/*
@@ -84,18 +81,16 @@ unsigned long probe_irq_on(void)
* Now filter out any obviously spurious interrupts
*/
for_each_irq_desc(i, desc) {
- raw_spin_lock_irq(&desc->lock);
-
+ guard(raw_spinlock_irq)(&desc->lock);
if (desc->istate & IRQS_AUTODETECT) {
/* It triggered already - consider it spurious. */
if (!(desc->istate & IRQS_WAITING)) {
desc->istate &= ~IRQS_AUTODETECT;
irq_shutdown_and_deactivate(desc);
- } else
- if (i < 32)
- mask |= 1 << i;
+ } else if (i < 32) {
+ mask |= 1 << i;
+ }
}
- raw_spin_unlock_irq(&desc->lock);
}
return mask;
@@ -121,7 +116,7 @@ unsigned int probe_irq_mask(unsigned long val)
int i;
for_each_irq_desc(i, desc) {
- raw_spin_lock_irq(&desc->lock);
+ guard(raw_spinlock_irq)(&desc->lock);
if (desc->istate & IRQS_AUTODETECT) {
if (i < 16 && !(desc->istate & IRQS_WAITING))
mask |= 1 << i;
@@ -129,7 +124,6 @@ unsigned int probe_irq_mask(unsigned long val)
desc->istate &= ~IRQS_AUTODETECT;
irq_shutdown_and_deactivate(desc);
}
- raw_spin_unlock_irq(&desc->lock);
}
mutex_unlock(&probing_active);
@@ -160,8 +154,7 @@ int probe_irq_off(unsigned long val)
struct irq_desc *desc;
for_each_irq_desc(i, desc) {
- raw_spin_lock_irq(&desc->lock);
-
+ guard(raw_spinlock_irq)(&desc->lock);
if (desc->istate & IRQS_AUTODETECT) {
if (!(desc->istate & IRQS_WAITING)) {
if (!nr_of_irqs)
@@ -171,7 +164,6 @@ int probe_irq_off(unsigned long val)
desc->istate &= ~IRQS_AUTODETECT;
irq_shutdown_and_deactivate(desc);
}
- raw_spin_unlock_irq(&desc->lock);
}
mutex_unlock(&probing_active);
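The conversions in this file (and in the irq core files below) use the lock guards from <linux/cleanup.h>, which drop the lock automatically when the guarded scope ends. A self-contained sketch of the two idioms being introduced, with the lock and data purely illustrative:

```c
#include <linux/cleanup.h>
#include <linux/spinlock.h>

static DEFINE_RAW_SPINLOCK(example_lock);	/* illustrative lock and state */
static int example_state;

static int example_set(int val)
{
	/* Released automatically at every return from this scope. */
	guard(raw_spinlock_irq)(&example_lock);

	if (val < 0)
		return -EINVAL;		/* no explicit unlock needed */
	example_state = val;
	return 0;
}

static int example_get(void)
{
	int val;

	/* Or bound the critical section with an explicit block. */
	scoped_guard(raw_spinlock_irq, &example_lock)
		val = example_state;

	return val;
}
```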
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 36cf1b09cc84..b0e0a7332993 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -34,98 +34,80 @@ struct irqaction chained_action = {
};
/**
- * irq_set_chip - set the irq chip for an irq
- * @irq: irq number
- * @chip: pointer to irq chip description structure
+ * irq_set_chip - set the irq chip for an irq
+ * @irq: irq number
+ * @chip: pointer to irq chip description structure
*/
int irq_set_chip(unsigned int irq, const struct irq_chip *chip)
{
- unsigned long flags;
- struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
+ int ret = -EINVAL;
- if (!desc)
- return -EINVAL;
-
- desc->irq_data.chip = (struct irq_chip *)(chip ?: &no_irq_chip);
- irq_put_desc_unlock(desc, flags);
- /*
- * For !CONFIG_SPARSE_IRQ make the irq show up in
- * allocated_irqs.
- */
- irq_mark_irq(irq);
- return 0;
+ scoped_irqdesc_get_and_lock(irq, 0) {
+ scoped_irqdesc->irq_data.chip = (struct irq_chip *)(chip ?: &no_irq_chip);
+ ret = 0;
+ }
+ /* For !CONFIG_SPARSE_IRQ make the irq show up in allocated_irqs. */
+ if (!ret)
+ irq_mark_irq(irq);
+ return ret;
}
EXPORT_SYMBOL(irq_set_chip);
/**
- * irq_set_irq_type - set the irq trigger type for an irq
- * @irq: irq number
- * @type: IRQ_TYPE_{LEVEL,EDGE}_* value - see include/linux/irq.h
+ * irq_set_irq_type - set the irq trigger type for an irq
+ * @irq: irq number
+ * @type: IRQ_TYPE_{LEVEL,EDGE}_* value - see include/linux/irq.h
*/
int irq_set_irq_type(unsigned int irq, unsigned int type)
{
- unsigned long flags;
- struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
- int ret = 0;
-
- if (!desc)
- return -EINVAL;
-
- ret = __irq_set_trigger(desc, type);
- irq_put_desc_busunlock(desc, flags);
- return ret;
+ scoped_irqdesc_get_and_buslock(irq, IRQ_GET_DESC_CHECK_GLOBAL)
+ return __irq_set_trigger(scoped_irqdesc, type);
+ return -EINVAL;
}
EXPORT_SYMBOL(irq_set_irq_type);
/**
- * irq_set_handler_data - set irq handler data for an irq
- * @irq: Interrupt number
- * @data: Pointer to interrupt specific data
+ * irq_set_handler_data - set irq handler data for an irq
+ * @irq: Interrupt number
+ * @data: Pointer to interrupt specific data
*
- * Set the hardware irq controller data for an irq
+ * Set the hardware irq controller data for an irq
*/
int irq_set_handler_data(unsigned int irq, void *data)
{
- unsigned long flags;
- struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
-
- if (!desc)
- return -EINVAL;
- desc->irq_common_data.handler_data = data;
- irq_put_desc_unlock(desc, flags);
- return 0;
+ scoped_irqdesc_get_and_lock(irq, 0) {
+ scoped_irqdesc->irq_common_data.handler_data = data;
+ return 0;
+ }
+ return -EINVAL;
}
EXPORT_SYMBOL(irq_set_handler_data);
/**
- * irq_set_msi_desc_off - set MSI descriptor data for an irq at offset
- * @irq_base: Interrupt number base
- * @irq_offset: Interrupt number offset
- * @entry: Pointer to MSI descriptor data
+ * irq_set_msi_desc_off - set MSI descriptor data for an irq at offset
+ * @irq_base: Interrupt number base
+ * @irq_offset: Interrupt number offset
+ * @entry: Pointer to MSI descriptor data
*
- * Set the MSI descriptor entry for an irq at offset
+ * Set the MSI descriptor entry for an irq at offset
*/
-int irq_set_msi_desc_off(unsigned int irq_base, unsigned int irq_offset,
- struct msi_desc *entry)
-{
- unsigned long flags;
- struct irq_desc *desc = irq_get_desc_lock(irq_base + irq_offset, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
-
- if (!desc)
- return -EINVAL;
- desc->irq_common_data.msi_desc = entry;
- if (entry && !irq_offset)
- entry->irq = irq_base;
- irq_put_desc_unlock(desc, flags);
- return 0;
+int irq_set_msi_desc_off(unsigned int irq_base, unsigned int irq_offset, struct msi_desc *entry)
+{
+ scoped_irqdesc_get_and_lock(irq_base + irq_offset, IRQ_GET_DESC_CHECK_GLOBAL) {
+ scoped_irqdesc->irq_common_data.msi_desc = entry;
+ if (entry && !irq_offset)
+ entry->irq = irq_base;
+ return 0;
+ }
+ return -EINVAL;
}
/**
- * irq_set_msi_desc - set MSI descriptor data for an irq
- * @irq: Interrupt number
- * @entry: Pointer to MSI descriptor data
+ * irq_set_msi_desc - set MSI descriptor data for an irq
+ * @irq: Interrupt number
+ * @entry: Pointer to MSI descriptor data
*
- * Set the MSI descriptor entry for an irq
+ * Set the MSI descriptor entry for an irq
*/
int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry)
{
@@ -133,22 +115,19 @@ int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry)
}
/**
- * irq_set_chip_data - set irq chip data for an irq
- * @irq: Interrupt number
- * @data: Pointer to chip specific data
+ * irq_set_chip_data - set irq chip data for an irq
+ * @irq: Interrupt number
+ * @data: Pointer to chip specific data
*
- * Set the hardware irq chip data for an irq
+ * Set the hardware irq chip data for an irq
*/
int irq_set_chip_data(unsigned int irq, void *data)
{
- unsigned long flags;
- struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
-
- if (!desc)
- return -EINVAL;
- desc->irq_data.chip_data = data;
- irq_put_desc_unlock(desc, flags);
- return 0;
+ scoped_irqdesc_get_and_lock(irq, 0) {
+ scoped_irqdesc->irq_data.chip_data = data;
+ return 0;
+ }
+ return -EINVAL;
}
EXPORT_SYMBOL(irq_set_chip_data);
@@ -223,6 +202,19 @@ __irq_startup_managed(struct irq_desc *desc, const struct cpumask *aff,
return IRQ_STARTUP_ABORT;
return IRQ_STARTUP_MANAGED;
}
+
+void irq_startup_managed(struct irq_desc *desc)
+{
+ /*
+ * Only start it up when the disable depth is 1, so that a disable,
+ * hotunplug, hotplug sequence does not end up enabling it during
+ * hotplug unconditionally.
+ */
+ desc->depth--;
+ if (!desc->depth)
+ irq_startup(desc, IRQ_RESEND, IRQ_START_COND);
+}
+
#else
static __always_inline int
__irq_startup_managed(struct irq_desc *desc, const struct cpumask *aff,
@@ -290,6 +282,7 @@ int irq_startup(struct irq_desc *desc, bool resend, bool force)
ret = __irq_startup(desc);
break;
case IRQ_STARTUP_ABORT:
+ desc->depth = 1;
irqd_set_managed_shutdown(d);
return 0;
}
@@ -322,7 +315,13 @@ void irq_shutdown(struct irq_desc *desc)
{
if (irqd_is_started(&desc->irq_data)) {
clear_irq_resend(desc);
- desc->depth = 1;
+ /*
+ * Increment disable depth, so that a managed shutdown on
+ * CPU hotunplug preserves the actual disabled state when the
+ * CPU comes back online. See irq_startup_managed().
+ */
+ desc->depth++;
+
if (desc->irq_data.chip->irq_shutdown) {
desc->irq_data.chip->irq_shutdown(&desc->irq_data);
irq_state_set_disabled(desc);
@@ -450,48 +449,6 @@ void unmask_threaded_irq(struct irq_desc *desc)
unmask_irq(desc);
}
-/*
- * handle_nested_irq - Handle a nested irq from a irq thread
- * @irq: the interrupt number
- *
- * Handle interrupts which are nested into a threaded interrupt
- * handler. The handler function is called inside the calling
- * threads context.
- */
-void handle_nested_irq(unsigned int irq)
-{
- struct irq_desc *desc = irq_to_desc(irq);
- struct irqaction *action;
- irqreturn_t action_ret;
-
- might_sleep();
-
- raw_spin_lock_irq(&desc->lock);
-
- desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
-
- action = desc->action;
- if (unlikely(!action || irqd_irq_disabled(&desc->irq_data))) {
- desc->istate |= IRQS_PENDING;
- raw_spin_unlock_irq(&desc->lock);
- return;
- }
-
- kstat_incr_irqs_this_cpu(desc);
- atomic_inc(&desc->threads_active);
- raw_spin_unlock_irq(&desc->lock);
-
- action_ret = IRQ_NONE;
- for_each_action_of_desc(desc, action)
- action_ret |= action->thread_fn(action->irq, action->dev_id);
-
- if (!irq_settings_no_debug(desc))
- note_interrupt(desc, action_ret);
-
- wake_threads_waitq(desc);
-}
-EXPORT_SYMBOL_GPL(handle_nested_irq);
-
static bool irq_check_poll(struct irq_desc *desc)
{
if (!(desc->istate & IRQS_POLL_INPROGRESS))
@@ -499,7 +456,7 @@ static bool irq_check_poll(struct irq_desc *desc)
return irq_wait_for_poll(desc);
}
-static bool irq_may_run(struct irq_desc *desc)
+static bool irq_can_handle_pm(struct irq_desc *desc)
{
unsigned int mask = IRQD_IRQ_INPROGRESS | IRQD_WAKEUP_ARMED;
@@ -524,77 +481,111 @@ static bool irq_may_run(struct irq_desc *desc)
return irq_check_poll(desc);
}
+static inline bool irq_can_handle_actions(struct irq_desc *desc)
+{
+ desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
+
+ if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) {
+ desc->istate |= IRQS_PENDING;
+ return false;
+ }
+ return true;
+}
+
+static inline bool irq_can_handle(struct irq_desc *desc)
+{
+ if (!irq_can_handle_pm(desc))
+ return false;
+
+ return irq_can_handle_actions(desc);
+}
+
/**
- * handle_simple_irq - Simple and software-decoded IRQs.
- * @desc: the interrupt description structure for this irq
- *
- * Simple interrupts are either sent from a demultiplexing interrupt
- * handler or come from hardware, where no interrupt hardware control
- * is necessary.
+ * handle_nested_irq - Handle a nested irq from an irq thread
+ * @irq: the interrupt number
*
- * Note: The caller is expected to handle the ack, clear, mask and
- * unmask issues if necessary.
+ * Handle interrupts which are nested into a threaded interrupt
+ * handler. The handler function is called inside the calling threads
+ * context.
*/
-void handle_simple_irq(struct irq_desc *desc)
+void handle_nested_irq(unsigned int irq)
{
- raw_spin_lock(&desc->lock);
+ struct irq_desc *desc = irq_to_desc(irq);
+ struct irqaction *action;
+ irqreturn_t action_ret;
- if (!irq_may_run(desc))
- goto out_unlock;
+ might_sleep();
- desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
+ scoped_guard(raw_spinlock_irq, &desc->lock) {
+ if (!irq_can_handle_actions(desc))
+ return;
- if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) {
- desc->istate |= IRQS_PENDING;
- goto out_unlock;
+ action = desc->action;
+ kstat_incr_irqs_this_cpu(desc);
+ atomic_inc(&desc->threads_active);
}
+ action_ret = IRQ_NONE;
+ for_each_action_of_desc(desc, action)
+ action_ret |= action->thread_fn(action->irq, action->dev_id);
+
+ if (!irq_settings_no_debug(desc))
+ note_interrupt(desc, action_ret);
+
+ wake_threads_waitq(desc);
+}
+EXPORT_SYMBOL_GPL(handle_nested_irq);
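For context, handle_nested_irq() is typically invoked from the threaded handler of a slow-bus (e.g. I2C) interrupt controller, where the nested handlers may sleep as well. A sketch of that pattern; the my_chip structure and its status callback are assumptions, not part of this patch.

```c
#include <linux/bitops.h>
#include <linux/interrupt.h>
#include <linux/irq.h>
#include <linux/irqdesc.h>
#include <linux/irqdomain.h>

/* Hypothetical driver state for a slow-bus interrupt controller. */
struct my_chip {
	struct irq_domain *domain;
	unsigned int nr_irqs;
	unsigned long (*read_status)(struct my_chip *chip);	/* may sleep */
};

/* Threaded primary handler: runs in sleepable context, so the nested
 * handlers dispatched below may talk to the bus as well. */
static irqreturn_t my_chip_irq_thread(int irq, void *data)
{
	struct my_chip *chip = data;
	unsigned long status = chip->read_status(chip);
	int bit;

	for_each_set_bit(bit, &status, chip->nr_irqs)
		handle_nested_irq(irq_find_mapping(chip->domain, bit));

	return status ? IRQ_HANDLED : IRQ_NONE;
}
```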
+
+/**
+ * handle_simple_irq - Simple and software-decoded IRQs.
+ * @desc: the interrupt description structure for this irq
+ *
+ * Simple interrupts are either sent from a demultiplexing interrupt
+ * handler or come from hardware, where no interrupt hardware control is
+ * necessary.
+ *
+ * Note: The caller is expected to handle the ack, clear, mask and unmask
+ * issues if necessary.
+ */
+void handle_simple_irq(struct irq_desc *desc)
+{
+ guard(raw_spinlock)(&desc->lock);
+
+ if (!irq_can_handle(desc))
+ return;
+
kstat_incr_irqs_this_cpu(desc);
handle_irq_event(desc);
-
-out_unlock:
- raw_spin_unlock(&desc->lock);
}
EXPORT_SYMBOL_GPL(handle_simple_irq);
/**
- * handle_untracked_irq - Simple and software-decoded IRQs.
- * @desc: the interrupt description structure for this irq
+ * handle_untracked_irq - Simple and software-decoded IRQs.
+ * @desc: the interrupt description structure for this irq
*
- * Untracked interrupts are sent from a demultiplexing interrupt
- * handler when the demultiplexer does not know which device it its
- * multiplexed irq domain generated the interrupt. IRQ's handled
- * through here are not subjected to stats tracking, randomness, or
- * spurious interrupt detection.
+ * Untracked interrupts are sent from a demultiplexing interrupt handler
+ * when the demultiplexer does not know which device in its multiplexed irq
+ * domain generated the interrupt. IRQs handled through here are not
+ * subjected to stats tracking, randomness, or spurious interrupt
+ * detection.
*
- * Note: Like handle_simple_irq, the caller is expected to handle
- * the ack, clear, mask and unmask issues if necessary.
+ * Note: Like handle_simple_irq, the caller is expected to handle the ack,
+ * clear, mask and unmask issues if necessary.
*/
void handle_untracked_irq(struct irq_desc *desc)
{
- raw_spin_lock(&desc->lock);
-
- if (!irq_may_run(desc))
- goto out_unlock;
-
- desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
+ scoped_guard(raw_spinlock, &desc->lock) {
+ if (!irq_can_handle(desc))
+ return;
- if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) {
- desc->istate |= IRQS_PENDING;
- goto out_unlock;
+ desc->istate &= ~IRQS_PENDING;
+ irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS);
}
- desc->istate &= ~IRQS_PENDING;
- irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS);
- raw_spin_unlock(&desc->lock);
-
__handle_irq_event_percpu(desc);
- raw_spin_lock(&desc->lock);
- irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);
-
-out_unlock:
- raw_spin_unlock(&desc->lock);
+ scoped_guard(raw_spinlock, &desc->lock)
+ irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);
}
EXPORT_SYMBOL_GPL(handle_untracked_irq);
@@ -617,40 +608,26 @@ static void cond_unmask_irq(struct irq_desc *desc)
}
/**
- * handle_level_irq - Level type irq handler
- * @desc: the interrupt description structure for this irq
+ * handle_level_irq - Level type irq handler
+ * @desc: the interrupt description structure for this irq
*
- * Level type interrupts are active as long as the hardware line has
- * the active level. This may require to mask the interrupt and unmask
- * it after the associated handler has acknowledged the device, so the
- * interrupt line is back to inactive.
+ * Level type interrupts are active as long as the hardware line has the
+ * active level. This may require masking the interrupt and unmasking it after
+ * the associated handler has acknowledged the device, so the interrupt
+ * line is back to inactive.
*/
void handle_level_irq(struct irq_desc *desc)
{
- raw_spin_lock(&desc->lock);
+ guard(raw_spinlock)(&desc->lock);
mask_ack_irq(desc);
- if (!irq_may_run(desc))
- goto out_unlock;
-
- desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
-
- /*
- * If its disabled or no action available
- * keep it masked and get out of here
- */
- if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) {
- desc->istate |= IRQS_PENDING;
- goto out_unlock;
- }
+ if (!irq_can_handle(desc))
+ return;
kstat_incr_irqs_this_cpu(desc);
handle_irq_event(desc);
cond_unmask_irq(desc);
-
-out_unlock:
- raw_spin_unlock(&desc->lock);
}
EXPORT_SYMBOL_GPL(handle_level_irq);
@@ -675,42 +652,43 @@ static void cond_unmask_eoi_irq(struct irq_desc *desc, struct irq_chip *chip)
}
}
+static inline void cond_eoi_irq(struct irq_chip *chip, struct irq_data *data)
+{
+ if (!(chip->flags & IRQCHIP_EOI_IF_HANDLED))
+ chip->irq_eoi(data);
+}
+
/**
- * handle_fasteoi_irq - irq handler for transparent controllers
- * @desc: the interrupt description structure for this irq
+ * handle_fasteoi_irq - irq handler for transparent controllers
+ * @desc: the interrupt description structure for this irq
*
- * Only a single callback will be issued to the chip: an ->eoi()
- * call when the interrupt has been serviced. This enables support
- * for modern forms of interrupt handlers, which handle the flow
- * details in hardware, transparently.
+ * Only a single callback will be issued to the chip: an ->eoi() call when
+ * the interrupt has been serviced. This enables support for modern forms
+ * of interrupt handlers, which handle the flow details in hardware,
+ * transparently.
*/
void handle_fasteoi_irq(struct irq_desc *desc)
{
struct irq_chip *chip = desc->irq_data.chip;
- raw_spin_lock(&desc->lock);
+ guard(raw_spinlock)(&desc->lock);
/*
* When an affinity change races with IRQ handling, the next interrupt
* can arrive on the new CPU before the original CPU has completed
* handling the previous one - it may need to be resent.
*/
- if (!irq_may_run(desc)) {
+ if (!irq_can_handle_pm(desc)) {
if (irqd_needs_resend_when_in_progress(&desc->irq_data))
desc->istate |= IRQS_PENDING;
- goto out;
+ cond_eoi_irq(chip, &desc->irq_data);
+ return;
}
- desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
-
- /*
- * If its disabled or no action available
- * then mask it and get out of here:
- */
- if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) {
- desc->istate |= IRQS_PENDING;
+ if (!irq_can_handle_actions(desc)) {
mask_irq(desc);
- goto out;
+ cond_eoi_irq(chip, &desc->irq_data);
+ return;
}
kstat_incr_irqs_this_cpu(desc);
@@ -726,13 +704,6 @@ void handle_fasteoi_irq(struct irq_desc *desc)
*/
if (unlikely(desc->istate & IRQS_PENDING))
check_irq_resend(desc, false);
-
- raw_spin_unlock(&desc->lock);
- return;
-out:
- if (!(chip->flags & IRQCHIP_EOI_IF_HANDLED))
- chip->irq_eoi(&desc->irq_data);
- raw_spin_unlock(&desc->lock);
}
EXPORT_SYMBOL_GPL(handle_fasteoi_irq);
@@ -770,40 +741,27 @@ void handle_fasteoi_nmi(struct irq_desc *desc)
EXPORT_SYMBOL_GPL(handle_fasteoi_nmi);
/**
- * handle_edge_irq - edge type IRQ handler
- * @desc: the interrupt description structure for this irq
+ * handle_edge_irq - edge type IRQ handler
+ * @desc: the interrupt description structure for this irq
*
- * Interrupt occurs on the falling and/or rising edge of a hardware
- * signal. The occurrence is latched into the irq controller hardware
- * and must be acked in order to be reenabled. After the ack another
- * interrupt can happen on the same source even before the first one
- * is handled by the associated event handler. If this happens it
- * might be necessary to disable (mask) the interrupt depending on the
- * controller hardware. This requires to reenable the interrupt inside
- * of the loop which handles the interrupts which have arrived while
- * the handler was running. If all pending interrupts are handled, the
- * loop is left.
+ * Interrupt occurs on the falling and/or rising edge of a hardware
+ * signal. The occurrence is latched into the irq controller hardware and
+ * must be acked in order to be reenabled. After the ack another interrupt
+ * can happen on the same source even before the first one is handled by
+ * the associated event handler. If this happens it might be necessary to
+ * disable (mask) the interrupt depending on the controller hardware. This
+ * requires reenabling the interrupt inside the loop which handles the
+ * interrupts which have arrived while the handler was running. If all
+ * pending interrupts are handled, the loop is left.
*/
void handle_edge_irq(struct irq_desc *desc)
{
- raw_spin_lock(&desc->lock);
-
- desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
-
- if (!irq_may_run(desc)) {
- desc->istate |= IRQS_PENDING;
- mask_ack_irq(desc);
- goto out_unlock;
- }
+ guard(raw_spinlock)(&desc->lock);
- /*
- * If its disabled or no action available then mask it and get
- * out of here.
- */
- if (irqd_irq_disabled(&desc->irq_data) || !desc->action) {
+ if (!irq_can_handle(desc)) {
desc->istate |= IRQS_PENDING;
mask_ack_irq(desc);
- goto out_unlock;
+ return;
}
kstat_incr_irqs_this_cpu(desc);
@@ -814,7 +772,7 @@ void handle_edge_irq(struct irq_desc *desc)
do {
if (unlikely(!desc->action)) {
mask_irq(desc);
- goto out_unlock;
+ return;
}
/*
@@ -830,11 +788,7 @@ void handle_edge_irq(struct irq_desc *desc)
handle_irq_event(desc);
- } while ((desc->istate & IRQS_PENDING) &&
- !irqd_irq_disabled(&desc->irq_data));
-
-out_unlock:
- raw_spin_unlock(&desc->lock);
+ } while ((desc->istate & IRQS_PENDING) && !irqd_irq_disabled(&desc->irq_data));
}
EXPORT_SYMBOL(handle_edge_irq);
@@ -1007,35 +961,23 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle,
}
}
-void
-__irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
- const char *name)
+void __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
+ const char *name)
{
- unsigned long flags;
- struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, 0);
-
- if (!desc)
- return;
-
- __irq_do_set_handler(desc, handle, is_chained, name);
- irq_put_desc_busunlock(desc, flags);
+ scoped_irqdesc_get_and_lock(irq, 0)
+ __irq_do_set_handler(scoped_irqdesc, handle, is_chained, name);
}
EXPORT_SYMBOL_GPL(__irq_set_handler);
-void
-irq_set_chained_handler_and_data(unsigned int irq, irq_flow_handler_t handle,
- void *data)
+void irq_set_chained_handler_and_data(unsigned int irq, irq_flow_handler_t handle,
+ void *data)
{
- unsigned long flags;
- struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, 0);
-
- if (!desc)
- return;
-
- desc->irq_common_data.handler_data = data;
- __irq_do_set_handler(desc, handle, 1, NULL);
+ scoped_irqdesc_get_and_buslock(irq, 0) {
+ struct irq_desc *desc = scoped_irqdesc;
- irq_put_desc_busunlock(desc, flags);
+ desc->irq_common_data.handler_data = data;
+ __irq_do_set_handler(desc, handle, 1, NULL);
+ }
}
EXPORT_SYMBOL_GPL(irq_set_chained_handler_and_data);
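A typical user of irq_set_chained_handler_and_data() is a GPIO or secondary interrupt-controller driver installing a chained flow handler on its parent interrupt. A minimal sketch, with the my_gpio driver state and register layout invented for illustration:

```c
#include <linux/bitops.h>
#include <linux/io.h>
#include <linux/irq.h>
#include <linux/irqchip/chained_irq.h>
#include <linux/irqdesc.h>
#include <linux/irqdomain.h>

struct my_gpio {			/* hypothetical driver state */
	struct irq_domain *domain;
	void __iomem *status_reg;
};

/* Chained flow handler: runs from the parent interrupt's flow handler in
 * hard-irq context, so it only dispatches into the child domain. */
static void my_gpio_irq_handler(struct irq_desc *desc)
{
	struct my_gpio *g = irq_desc_get_handler_data(desc);
	struct irq_chip *chip = irq_desc_get_chip(desc);
	unsigned long status;
	int bit;

	chained_irq_enter(chip, desc);
	status = readl(g->status_reg);
	for_each_set_bit(bit, &status, 32)
		generic_handle_domain_irq(g->domain, bit);
	chained_irq_exit(chip, desc);
}

static void my_gpio_setup(struct my_gpio *g, unsigned int parent_irq)
{
	/* Stores @g as the handler data and installs the chained handler. */
	irq_set_chained_handler_and_data(parent_irq, my_gpio_irq_handler, g);
}
```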
@@ -1050,38 +992,34 @@ EXPORT_SYMBOL_GPL(irq_set_chip_and_handler_name);
void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
{
- unsigned long flags, trigger, tmp;
- struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
-
- if (!desc)
- return;
-
- /*
- * Warn when a driver sets the no autoenable flag on an already
- * active interrupt.
- */
- WARN_ON_ONCE(!desc->depth && (set & _IRQ_NOAUTOEN));
-
- irq_settings_clr_and_set(desc, clr, set);
+ scoped_irqdesc_get_and_lock(irq, 0) {
+ struct irq_desc *desc = scoped_irqdesc;
+ unsigned long trigger, tmp;
+ /*
+ * Warn when a driver sets the no autoenable flag on an already
+ * active interrupt.
+ */
+ WARN_ON_ONCE(!desc->depth && (set & _IRQ_NOAUTOEN));
- trigger = irqd_get_trigger_type(&desc->irq_data);
+ irq_settings_clr_and_set(desc, clr, set);
- irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU |
- IRQD_TRIGGER_MASK | IRQD_LEVEL);
- if (irq_settings_has_no_balance_set(desc))
- irqd_set(&desc->irq_data, IRQD_NO_BALANCING);
- if (irq_settings_is_per_cpu(desc))
- irqd_set(&desc->irq_data, IRQD_PER_CPU);
- if (irq_settings_is_level(desc))
- irqd_set(&desc->irq_data, IRQD_LEVEL);
+ trigger = irqd_get_trigger_type(&desc->irq_data);
- tmp = irq_settings_get_trigger_mask(desc);
- if (tmp != IRQ_TYPE_NONE)
- trigger = tmp;
+ irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU |
+ IRQD_TRIGGER_MASK | IRQD_LEVEL);
+ if (irq_settings_has_no_balance_set(desc))
+ irqd_set(&desc->irq_data, IRQD_NO_BALANCING);
+ if (irq_settings_is_per_cpu(desc))
+ irqd_set(&desc->irq_data, IRQD_PER_CPU);
+ if (irq_settings_is_level(desc))
+ irqd_set(&desc->irq_data, IRQD_LEVEL);
- irqd_set(&desc->irq_data, trigger);
+ tmp = irq_settings_get_trigger_mask(desc);
+ if (tmp != IRQ_TYPE_NONE)
+ trigger = tmp;
- irq_put_desc_unlock(desc, flags);
+ irqd_set(&desc->irq_data, trigger);
+ }
}
EXPORT_SYMBOL_GPL(irq_modify_status);
@@ -1094,25 +1032,21 @@ EXPORT_SYMBOL_GPL(irq_modify_status);
*/
void irq_cpu_online(void)
{
- struct irq_desc *desc;
- struct irq_chip *chip;
- unsigned long flags;
unsigned int irq;
for_each_active_irq(irq) {
- desc = irq_to_desc(irq);
+ struct irq_desc *desc = irq_to_desc(irq);
+ struct irq_chip *chip;
+
if (!desc)
continue;
- raw_spin_lock_irqsave(&desc->lock, flags);
-
+ guard(raw_spinlock_irqsave)(&desc->lock);
chip = irq_data_get_irq_chip(&desc->irq_data);
if (chip && chip->irq_cpu_online &&
(!(chip->flags & IRQCHIP_ONOFFLINE_ENABLED) ||
!irqd_irq_disabled(&desc->irq_data)))
chip->irq_cpu_online(&desc->irq_data);
-
- raw_spin_unlock_irqrestore(&desc->lock, flags);
}
}
@@ -1124,25 +1058,21 @@ void irq_cpu_online(void)
*/
void irq_cpu_offline(void)
{
- struct irq_desc *desc;
- struct irq_chip *chip;
- unsigned long flags;
unsigned int irq;
for_each_active_irq(irq) {
- desc = irq_to_desc(irq);
+ struct irq_desc *desc = irq_to_desc(irq);
+ struct irq_chip *chip;
+
if (!desc)
continue;
- raw_spin_lock_irqsave(&desc->lock, flags);
-
+ guard(raw_spinlock_irqsave)(&desc->lock);
chip = irq_data_get_irq_chip(&desc->irq_data);
if (chip && chip->irq_cpu_offline &&
(!(chip->flags & IRQCHIP_ONOFFLINE_ENABLED) ||
!irqd_irq_disabled(&desc->irq_data)))
chip->irq_cpu_offline(&desc->irq_data);
-
- raw_spin_unlock_irqrestore(&desc->lock, flags);
}
}
#endif
@@ -1151,102 +1081,69 @@ void irq_cpu_offline(void)
#ifdef CONFIG_IRQ_FASTEOI_HIERARCHY_HANDLERS
/**
- * handle_fasteoi_ack_irq - irq handler for edge hierarchy
- * stacked on transparent controllers
+ * handle_fasteoi_ack_irq - irq handler for edge hierarchy stacked on
+ * transparent controllers
*
- * @desc: the interrupt description structure for this irq
+ * @desc: the interrupt description structure for this irq
*
- * Like handle_fasteoi_irq(), but for use with hierarchy where
- * the irq_chip also needs to have its ->irq_ack() function
- * called.
+ * Like handle_fasteoi_irq(), but for use with hierarchy where the irq_chip
+ * also needs to have its ->irq_ack() function called.
*/
void handle_fasteoi_ack_irq(struct irq_desc *desc)
{
struct irq_chip *chip = desc->irq_data.chip;
- raw_spin_lock(&desc->lock);
-
- if (!irq_may_run(desc))
- goto out;
+ guard(raw_spinlock)(&desc->lock);
- desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
+ if (!irq_can_handle_pm(desc)) {
+ cond_eoi_irq(chip, &desc->irq_data);
+ return;
+ }
- /*
- * If its disabled or no action available
- * then mask it and get out of here:
- */
- if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) {
- desc->istate |= IRQS_PENDING;
+ if (unlikely(!irq_can_handle_actions(desc))) {
mask_irq(desc);
- goto out;
+ cond_eoi_irq(chip, &desc->irq_data);
+ return;
}
kstat_incr_irqs_this_cpu(desc);
if (desc->istate & IRQS_ONESHOT)
mask_irq(desc);
- /* Start handling the irq */
desc->irq_data.chip->irq_ack(&desc->irq_data);
handle_irq_event(desc);
cond_unmask_eoi_irq(desc, chip);
-
- raw_spin_unlock(&desc->lock);
- return;
-out:
- if (!(chip->flags & IRQCHIP_EOI_IF_HANDLED))
- chip->irq_eoi(&desc->irq_data);
- raw_spin_unlock(&desc->lock);
}
EXPORT_SYMBOL_GPL(handle_fasteoi_ack_irq);
/**
- * handle_fasteoi_mask_irq - irq handler for level hierarchy
- * stacked on transparent controllers
+ * handle_fasteoi_mask_irq - irq handler for level hierarchy stacked on
+ * transparent controllers
*
- * @desc: the interrupt description structure for this irq
+ * @desc: the interrupt description structure for this irq
*
- * Like handle_fasteoi_irq(), but for use with hierarchy where
- * the irq_chip also needs to have its ->irq_mask_ack() function
- * called.
+ * Like handle_fasteoi_irq(), but for use with hierarchy where the irq_chip
+ * also needs to have its ->irq_mask_ack() function called.
*/
void handle_fasteoi_mask_irq(struct irq_desc *desc)
{
struct irq_chip *chip = desc->irq_data.chip;
- raw_spin_lock(&desc->lock);
+ guard(raw_spinlock)(&desc->lock);
mask_ack_irq(desc);
- if (!irq_may_run(desc))
- goto out;
-
- desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
-
- /*
- * If its disabled or no action available
- * then mask it and get out of here:
- */
- if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) {
- desc->istate |= IRQS_PENDING;
- mask_irq(desc);
- goto out;
+ if (!irq_can_handle(desc)) {
+ cond_eoi_irq(chip, &desc->irq_data);
+ return;
}
kstat_incr_irqs_this_cpu(desc);
- if (desc->istate & IRQS_ONESHOT)
- mask_irq(desc);
handle_irq_event(desc);
cond_unmask_eoi_irq(desc, chip);
-
- raw_spin_unlock(&desc->lock);
- return;
-out:
- if (!(chip->flags & IRQCHIP_EOI_IF_HANDLED))
- chip->irq_eoi(&desc->irq_data);
- raw_spin_unlock(&desc->lock);
}
EXPORT_SYMBOL_GPL(handle_fasteoi_mask_irq);
diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c
index 15a7654eff68..f07529ae4895 100644
--- a/kernel/irq/cpuhotplug.c
+++ b/kernel/irq/cpuhotplug.c
@@ -177,9 +177,8 @@ void irq_migrate_all_off_this_cpu(void)
bool affinity_broken;
desc = irq_to_desc(irq);
- raw_spin_lock(&desc->lock);
- affinity_broken = migrate_one_irq(desc);
- raw_spin_unlock(&desc->lock);
+ scoped_guard(raw_spinlock, &desc->lock)
+ affinity_broken = migrate_one_irq(desc);
if (affinity_broken) {
pr_debug_ratelimited("IRQ %u: no longer affine to CPU%u\n",
@@ -219,7 +218,7 @@ static void irq_restore_affinity_of_irq(struct irq_desc *desc, unsigned int cpu)
return;
if (irqd_is_managed_and_shutdown(data))
- irq_startup(desc, IRQ_RESEND, IRQ_START_COND);
+ irq_startup_managed(desc);
/*
* If the interrupt can only be directed to a single target
@@ -244,9 +243,8 @@ int irq_affinity_online_cpu(unsigned int cpu)
irq_lock_sparse();
for_each_active_irq(irq) {
desc = irq_to_desc(irq);
- raw_spin_lock_irq(&desc->lock);
- irq_restore_affinity_of_irq(desc, cpu);
- raw_spin_unlock_irq(&desc->lock);
+ scoped_guard(raw_spinlock_irq, &desc->lock)
+ irq_restore_affinity_of_irq(desc, cpu);
}
irq_unlock_sparse();
diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c
index ca142b9a4db3..3527defd2890 100644
--- a/kernel/irq/debugfs.c
+++ b/kernel/irq/debugfs.c
@@ -160,7 +160,7 @@ static int irq_debug_show(struct seq_file *m, void *p)
struct irq_desc *desc = m->private;
struct irq_data *data;
- raw_spin_lock_irq(&desc->lock);
+ guard(raw_spinlock_irq)(&desc->lock);
data = irq_desc_get_irq_data(desc);
seq_printf(m, "handler: %ps\n", desc->handle_irq);
seq_printf(m, "device: %s\n", desc->dev_name);
@@ -178,7 +178,6 @@ static int irq_debug_show(struct seq_file *m, void *p)
seq_printf(m, "node: %d\n", irq_data_get_node(data));
irq_debug_show_masks(m, desc);
irq_debug_show_data(m, data, 0);
- raw_spin_unlock_irq(&desc->lock);
return 0;
}
@@ -226,12 +225,12 @@ void irq_debugfs_copy_devname(int irq, struct device *dev)
void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *desc)
{
- char name [10];
+ char name [12];
if (!irq_dir || !desc || desc->debugfs_file)
return;
- sprintf(name, "%d", irq);
+ sprintf(name, "%u", irq);
desc->debugfs_file = debugfs_create_file(name, 0644, irq_dir, desc,
&dfs_irq_ops);
}
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index c4a8bca5f2b0..bf59e37d650a 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -40,10 +40,9 @@ void irq_gc_mask_disable_reg(struct irq_data *d)
struct irq_chip_type *ct = irq_data_get_chip_type(d);
u32 mask = d->mask;
- irq_gc_lock(gc);
+ guard(raw_spinlock)(&gc->lock);
irq_reg_writel(gc, mask, ct->regs.disable);
*ct->mask_cache &= ~mask;
- irq_gc_unlock(gc);
}
EXPORT_SYMBOL_GPL(irq_gc_mask_disable_reg);
@@ -60,10 +59,9 @@ void irq_gc_mask_set_bit(struct irq_data *d)
struct irq_chip_type *ct = irq_data_get_chip_type(d);
u32 mask = d->mask;
- irq_gc_lock(gc);
+ guard(raw_spinlock)(&gc->lock);
*ct->mask_cache |= mask;
irq_reg_writel(gc, *ct->mask_cache, ct->regs.mask);
- irq_gc_unlock(gc);
}
EXPORT_SYMBOL_GPL(irq_gc_mask_set_bit);
@@ -80,10 +78,9 @@ void irq_gc_mask_clr_bit(struct irq_data *d)
struct irq_chip_type *ct = irq_data_get_chip_type(d);
u32 mask = d->mask;
- irq_gc_lock(gc);
+ guard(raw_spinlock)(&gc->lock);
*ct->mask_cache &= ~mask;
irq_reg_writel(gc, *ct->mask_cache, ct->regs.mask);
- irq_gc_unlock(gc);
}
EXPORT_SYMBOL_GPL(irq_gc_mask_clr_bit);
@@ -100,10 +97,9 @@ void irq_gc_unmask_enable_reg(struct irq_data *d)
struct irq_chip_type *ct = irq_data_get_chip_type(d);
u32 mask = d->mask;
- irq_gc_lock(gc);
+ guard(raw_spinlock)(&gc->lock);
irq_reg_writel(gc, mask, ct->regs.enable);
*ct->mask_cache |= mask;
- irq_gc_unlock(gc);
}
EXPORT_SYMBOL_GPL(irq_gc_unmask_enable_reg);
@@ -117,9 +113,8 @@ void irq_gc_ack_set_bit(struct irq_data *d)
struct irq_chip_type *ct = irq_data_get_chip_type(d);
u32 mask = d->mask;
- irq_gc_lock(gc);
+ guard(raw_spinlock)(&gc->lock);
irq_reg_writel(gc, mask, ct->regs.ack);
- irq_gc_unlock(gc);
}
EXPORT_SYMBOL_GPL(irq_gc_ack_set_bit);
@@ -133,9 +128,8 @@ void irq_gc_ack_clr_bit(struct irq_data *d)
struct irq_chip_type *ct = irq_data_get_chip_type(d);
u32 mask = ~d->mask;
- irq_gc_lock(gc);
+ guard(raw_spinlock)(&gc->lock);
irq_reg_writel(gc, mask, ct->regs.ack);
- irq_gc_unlock(gc);
}
/**
@@ -156,11 +150,10 @@ void irq_gc_mask_disable_and_ack_set(struct irq_data *d)
struct irq_chip_type *ct = irq_data_get_chip_type(d);
u32 mask = d->mask;
- irq_gc_lock(gc);
+ guard(raw_spinlock)(&gc->lock);
irq_reg_writel(gc, mask, ct->regs.disable);
*ct->mask_cache &= ~mask;
irq_reg_writel(gc, mask, ct->regs.ack);
- irq_gc_unlock(gc);
}
EXPORT_SYMBOL_GPL(irq_gc_mask_disable_and_ack_set);
@@ -174,9 +167,8 @@ void irq_gc_eoi(struct irq_data *d)
struct irq_chip_type *ct = irq_data_get_chip_type(d);
u32 mask = d->mask;
- irq_gc_lock(gc);
+ guard(raw_spinlock)(&gc->lock);
irq_reg_writel(gc, mask, ct->regs.eoi);
- irq_gc_unlock(gc);
}
/**
@@ -196,12 +188,11 @@ int irq_gc_set_wake(struct irq_data *d, unsigned int on)
if (!(mask & gc->wake_enabled))
return -EINVAL;
- irq_gc_lock(gc);
+ guard(raw_spinlock)(&gc->lock);
if (on)
gc->wake_active |= mask;
else
gc->wake_active &= ~mask;
- irq_gc_unlock(gc);
return 0;
}
EXPORT_SYMBOL_GPL(irq_gc_set_wake);
@@ -288,7 +279,6 @@ int irq_domain_alloc_generic_chips(struct irq_domain *d,
{
struct irq_domain_chip_generic *dgc;
struct irq_chip_generic *gc;
- unsigned long flags;
int numchips, i;
size_t dgc_sz;
size_t gc_sz;
@@ -340,9 +330,8 @@ int irq_domain_alloc_generic_chips(struct irq_domain *d,
goto err;
}
- raw_spin_lock_irqsave(&gc_lock, flags);
- list_add_tail(&gc->list, &gc_list);
- raw_spin_unlock_irqrestore(&gc_lock, flags);
+ scoped_guard (raw_spinlock_irqsave, &gc_lock)
+ list_add_tail(&gc->list, &gc_list);
/* Calc pointer to the next generic chip */
tmp += gc_sz;
}
@@ -459,7 +448,6 @@ int irq_map_generic_chip(struct irq_domain *d, unsigned int virq,
struct irq_chip_generic *gc;
struct irq_chip_type *ct;
struct irq_chip *chip;
- unsigned long flags;
int idx;
gc = __irq_get_domain_generic_chip(d, hw_irq);
@@ -479,9 +467,8 @@ int irq_map_generic_chip(struct irq_domain *d, unsigned int virq,
/* We only init the cache for the first mapping of a generic chip */
if (!gc->installed) {
- raw_spin_lock_irqsave(&gc->lock, flags);
+ guard(raw_spinlock_irqsave)(&gc->lock);
irq_gc_init_mask_cache(gc, dgc->gc_flags);
- raw_spin_unlock_irqrestore(&gc->lock, flags);
}
/* Mark the interrupt as installed */
@@ -548,9 +535,8 @@ void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk,
struct irq_chip *chip = &ct->chip;
unsigned int i;
- raw_spin_lock(&gc_lock);
- list_add_tail(&gc->list, &gc_list);
- raw_spin_unlock(&gc_lock);
+ scoped_guard (raw_spinlock, &gc_lock)
+ list_add_tail(&gc->list, &gc_list);
irq_gc_init_mask_cache(gc, flags);
@@ -616,9 +602,8 @@ void irq_remove_generic_chip(struct irq_chip_generic *gc, u32 msk,
{
unsigned int i, virq;
- raw_spin_lock(&gc_lock);
- list_del(&gc->list);
- raw_spin_unlock(&gc_lock);
+ scoped_guard (raw_spinlock, &gc_lock)
+ list_del(&gc->list);
for (i = 0; msk; msk >>= 1, i++) {
if (!(msk & 0x01))
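The generic-chip hunks above show the two shapes side by side: guard() keeps the lock until the enclosing scope ends, while scoped_guard() covers only the statement that follows, which keeps one-line critical sections such as the gc_list updates tight. A sketch of the two shapes, not compilable in isolation and reusing identifiers from the hunks above purely as placeholders:

static void whole_scope_example(struct irq_chip_generic *gc, u32 mask, u32 reg)
{
        guard(raw_spinlock)(&gc->lock);         /* held until the function returns */
        irq_reg_writel(gc, mask, reg);
}

static void single_statement_example(struct irq_chip_generic *gc)
{
        scoped_guard(raw_spinlock, &gc_lock)    /* held only for the next statement */
                list_add_tail(&gc->list, &gc_list);
        /* gc_lock is already released at this point */
}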
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index b0290849c395..aebfe225c9a6 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -87,6 +87,7 @@ extern void __enable_irq(struct irq_desc *desc);
extern int irq_activate(struct irq_desc *desc);
extern int irq_activate_and_startup(struct irq_desc *desc, bool resend);
extern int irq_startup(struct irq_desc *desc, bool resend, bool force);
+extern void irq_startup_managed(struct irq_desc *desc);
extern void irq_shutdown(struct irq_desc *desc);
extern void irq_shutdown_and_deactivate(struct irq_desc *desc);
@@ -141,6 +142,10 @@ extern int irq_setup_affinity(struct irq_desc *desc);
static inline int irq_setup_affinity(struct irq_desc *desc) { return 0; }
#endif
+
+#define for_each_action_of_desc(desc, act) \
+ for (act = desc->action; act; act = act->next)
+
/* Inline functions for support of irq chips on slow busses */
static inline void chip_bus_lock(struct irq_desc *desc)
{
@@ -160,38 +165,33 @@ static inline void chip_bus_sync_unlock(struct irq_desc *desc)
#define IRQ_GET_DESC_CHECK_GLOBAL (_IRQ_DESC_CHECK)
#define IRQ_GET_DESC_CHECK_PERCPU (_IRQ_DESC_CHECK | _IRQ_DESC_PERCPU)
-#define for_each_action_of_desc(desc, act) \
- for (act = desc->action; act; act = act->next)
-
-struct irq_desc *
-__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus,
- unsigned int check);
+struct irq_desc *__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus,
+ unsigned int check);
void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus);
-static inline struct irq_desc *
-irq_get_desc_buslock(unsigned int irq, unsigned long *flags, unsigned int check)
-{
- return __irq_get_desc_lock(irq, flags, true, check);
-}
+__DEFINE_CLASS_IS_CONDITIONAL(irqdesc_lock, true);
+__DEFINE_UNLOCK_GUARD(irqdesc_lock, struct irq_desc,
+ __irq_put_desc_unlock(_T->lock, _T->flags, _T->bus),
+ unsigned long flags; bool bus);
-static inline void
-irq_put_desc_busunlock(struct irq_desc *desc, unsigned long flags)
+static inline class_irqdesc_lock_t class_irqdesc_lock_constructor(unsigned int irq, bool bus,
+ unsigned int check)
{
- __irq_put_desc_unlock(desc, flags, true);
-}
+ class_irqdesc_lock_t _t = { .bus = bus, };
-static inline struct irq_desc *
-irq_get_desc_lock(unsigned int irq, unsigned long *flags, unsigned int check)
-{
- return __irq_get_desc_lock(irq, flags, false, check);
-}
+ _t.lock = __irq_get_desc_lock(irq, &_t.flags, bus, check);
-static inline void
-irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags)
-{
- __irq_put_desc_unlock(desc, flags, false);
+ return _t;
}
+#define scoped_irqdesc_get_and_lock(_irq, _check) \
+ scoped_guard(irqdesc_lock, _irq, false, _check)
+
+#define scoped_irqdesc_get_and_buslock(_irq, _check) \
+ scoped_guard(irqdesc_lock, _irq, true, _check)
+
+#define scoped_irqdesc ((struct irq_desc *)(__guard_ptr(irqdesc_lock)(&scope)))
+
#define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors)
static inline unsigned int irqd_get(struct irq_data *d)
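The new irqdesc_lock guard class packages descriptor lookup, optional bus locking, and desc->lock into one conditional scope: if __irq_get_desc_lock() fails, the guarded body is skipped entirely, so the statement after the scope becomes the error path, and an early return inside the body drops the lock (and bus lock) automatically. The usage shape the manage.c conversions below rely on, shown here as an illustration only and mirroring the irq_set_parent() hunk further down:

static int example_set_parent(int irq, int parent_irq)  /* hypothetical caller */
{
        scoped_irqdesc_get_and_lock(irq, 0) {
                scoped_irqdesc->parent_irq = parent_irq;
                return 0;               /* lock released on return */
        }
        return -EINVAL;                 /* reached only if the descriptor lookup failed */
}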
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 4258cd6bd3b4..b64c57b44c20 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -246,8 +246,7 @@ static struct kobject *irq_kobj_base;
#define IRQ_ATTR_RO(_name) \
static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
-static ssize_t per_cpu_count_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
+static ssize_t per_cpu_count_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
{
struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj);
ssize_t ret = 0;
@@ -257,112 +256,83 @@ static ssize_t per_cpu_count_show(struct kobject *kobj,
for_each_possible_cpu(cpu) {
unsigned int c = irq_desc_kstat_cpu(desc, cpu);
- ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%u", p, c);
+ ret += sysfs_emit_at(buf, ret, "%s%u", p, c);
p = ",";
}
- ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n");
+ ret += sysfs_emit_at(buf, ret, "\n");
return ret;
}
IRQ_ATTR_RO(per_cpu_count);
-static ssize_t chip_name_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
+static ssize_t chip_name_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
{
struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj);
- ssize_t ret = 0;
-
- raw_spin_lock_irq(&desc->lock);
- if (desc->irq_data.chip && desc->irq_data.chip->name) {
- ret = scnprintf(buf, PAGE_SIZE, "%s\n",
- desc->irq_data.chip->name);
- }
- raw_spin_unlock_irq(&desc->lock);
- return ret;
+ guard(raw_spinlock_irq)(&desc->lock);
+ if (desc->irq_data.chip && desc->irq_data.chip->name)
+ return sysfs_emit(buf, "%s\n", desc->irq_data.chip->name);
+ return 0;
}
IRQ_ATTR_RO(chip_name);
-static ssize_t hwirq_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
+static ssize_t hwirq_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
{
struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj);
- ssize_t ret = 0;
- raw_spin_lock_irq(&desc->lock);
+ guard(raw_spinlock_irq)(&desc->lock);
if (desc->irq_data.domain)
- ret = sprintf(buf, "%lu\n", desc->irq_data.hwirq);
- raw_spin_unlock_irq(&desc->lock);
-
- return ret;
+ return sysfs_emit(buf, "%lu\n", desc->irq_data.hwirq);
+ return 0;
}
IRQ_ATTR_RO(hwirq);
-static ssize_t type_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
+static ssize_t type_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
{
struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj);
- ssize_t ret = 0;
- raw_spin_lock_irq(&desc->lock);
- ret = sprintf(buf, "%s\n",
- irqd_is_level_type(&desc->irq_data) ? "level" : "edge");
- raw_spin_unlock_irq(&desc->lock);
-
- return ret;
+ guard(raw_spinlock_irq)(&desc->lock);
+ return sysfs_emit(buf, "%s\n", irqd_is_level_type(&desc->irq_data) ? "level" : "edge");
}
IRQ_ATTR_RO(type);
-static ssize_t wakeup_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
+static ssize_t wakeup_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
{
struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj);
- ssize_t ret = 0;
-
- raw_spin_lock_irq(&desc->lock);
- ret = sprintf(buf, "%s\n", str_enabled_disabled(irqd_is_wakeup_set(&desc->irq_data)));
- raw_spin_unlock_irq(&desc->lock);
-
- return ret;
+ guard(raw_spinlock_irq)(&desc->lock);
+ return sysfs_emit(buf, "%s\n", str_enabled_disabled(irqd_is_wakeup_set(&desc->irq_data)));
}
IRQ_ATTR_RO(wakeup);
-static ssize_t name_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
+static ssize_t name_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
{
struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj);
- ssize_t ret = 0;
- raw_spin_lock_irq(&desc->lock);
+ guard(raw_spinlock_irq)(&desc->lock);
if (desc->name)
- ret = scnprintf(buf, PAGE_SIZE, "%s\n", desc->name);
- raw_spin_unlock_irq(&desc->lock);
-
- return ret;
+ return sysfs_emit(buf, "%s\n", desc->name);
+ return 0;
}
IRQ_ATTR_RO(name);
-static ssize_t actions_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
+static ssize_t actions_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
{
struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj);
struct irqaction *action;
ssize_t ret = 0;
char *p = "";
- raw_spin_lock_irq(&desc->lock);
- for_each_action_of_desc(desc, action) {
- ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%s",
- p, action->name);
- p = ",";
+ scoped_guard(raw_spinlock_irq, &desc->lock) {
+ for_each_action_of_desc(desc, action) {
+ ret += sysfs_emit_at(buf, ret, "%s%s", p, action->name);
+ p = ",";
+ }
}
- raw_spin_unlock_irq(&desc->lock);
if (ret)
- ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n");
-
+ ret += sysfs_emit_at(buf, ret, "\n");
return ret;
}
IRQ_ATTR_RO(actions);
@@ -418,19 +388,14 @@ static int __init irq_sysfs_init(void)
int irq;
/* Prevent concurrent irq alloc/free */
- irq_lock_sparse();
-
+ guard(mutex)(&sparse_irq_lock);
irq_kobj_base = kobject_create_and_add("irq", kernel_kobj);
- if (!irq_kobj_base) {
- irq_unlock_sparse();
+ if (!irq_kobj_base)
return -ENOMEM;
- }
/* Add the already allocated interrupts */
for_each_irq_desc(irq, desc)
irq_sysfs_add(irq, desc);
- irq_unlock_sparse();
-
return 0;
}
postcore_initcall(irq_sysfs_init);
@@ -573,12 +538,12 @@ err:
return -ENOMEM;
}
-static int irq_expand_nr_irqs(unsigned int nr)
+static bool irq_expand_nr_irqs(unsigned int nr)
{
if (nr > MAX_SPARSE_IRQS)
- return -ENOMEM;
+ return false;
nr_irqs = nr;
- return 0;
+ return true;
}
int __init early_irq_init(void)
@@ -656,11 +621,9 @@ EXPORT_SYMBOL(irq_to_desc);
static void free_desc(unsigned int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
- unsigned long flags;
- raw_spin_lock_irqsave(&desc->lock, flags);
- desc_set_defaults(irq, desc, irq_desc_get_node(desc), NULL, NULL);
- raw_spin_unlock_irqrestore(&desc->lock, flags);
+ scoped_guard(raw_spinlock_irqsave, &desc->lock)
+ desc_set_defaults(irq, desc, irq_desc_get_node(desc), NULL, NULL);
delete_irq_desc(irq);
}
@@ -679,16 +642,15 @@ static inline int alloc_descs(unsigned int start, unsigned int cnt, int node,
return start;
}
-static int irq_expand_nr_irqs(unsigned int nr)
+static inline bool irq_expand_nr_irqs(unsigned int nr)
{
- return -ENOMEM;
+ return false;
}
void irq_mark_irq(unsigned int irq)
{
- mutex_lock(&sparse_irq_lock);
+ guard(mutex)(&sparse_irq_lock);
irq_insert_desc(irq, irq_desc + irq);
- mutex_unlock(&sparse_irq_lock);
}
#ifdef CONFIG_GENERIC_IRQ_LEGACY
@@ -827,11 +789,9 @@ void irq_free_descs(unsigned int from, unsigned int cnt)
if (from >= nr_irqs || (from + cnt) > nr_irqs)
return;
- mutex_lock(&sparse_irq_lock);
+ guard(mutex)(&sparse_irq_lock);
for (i = 0; i < cnt; i++)
free_desc(from + i);
-
- mutex_unlock(&sparse_irq_lock);
}
EXPORT_SYMBOL_GPL(irq_free_descs);
@@ -848,11 +808,10 @@ EXPORT_SYMBOL_GPL(irq_free_descs);
*
* Returns the first irq number or error code
*/
-int __ref
-__irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node,
- struct module *owner, const struct irq_affinity_desc *affinity)
+int __ref __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node,
+ struct module *owner, const struct irq_affinity_desc *affinity)
{
- int start, ret;
+ int start;
if (!cnt)
return -EINVAL;
@@ -870,22 +829,17 @@ __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node,
from = arch_dynirq_lower_bound(from);
}
- mutex_lock(&sparse_irq_lock);
+ guard(mutex)(&sparse_irq_lock);
start = irq_find_free_area(from, cnt);
- ret = -EEXIST;
if (irq >=0 && start != irq)
- goto unlock;
+ return -EEXIST;
if (start + cnt > nr_irqs) {
- ret = irq_expand_nr_irqs(start + cnt);
- if (ret)
- goto unlock;
+ if (!irq_expand_nr_irqs(start + cnt))
+ return -ENOMEM;
}
- ret = alloc_descs(start, cnt, node, affinity, owner);
-unlock:
- mutex_unlock(&sparse_irq_lock);
- return ret;
+ return alloc_descs(start, cnt, node, affinity, owner);
}
EXPORT_SYMBOL_GPL(__irq_alloc_descs);
@@ -900,27 +854,27 @@ unsigned int irq_get_next_irq(unsigned int offset)
return irq_find_at_or_after(offset);
}
-struct irq_desc *
-__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus,
- unsigned int check)
+struct irq_desc *__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus,
+ unsigned int check)
{
- struct irq_desc *desc = irq_to_desc(irq);
+ struct irq_desc *desc;
- if (desc) {
- if (check & _IRQ_DESC_CHECK) {
- if ((check & _IRQ_DESC_PERCPU) &&
- !irq_settings_is_per_cpu_devid(desc))
- return NULL;
-
- if (!(check & _IRQ_DESC_PERCPU) &&
- irq_settings_is_per_cpu_devid(desc))
- return NULL;
- }
+ desc = irq_to_desc(irq);
+ if (!desc)
+ return NULL;
+
+ if (check & _IRQ_DESC_CHECK) {
+ if ((check & _IRQ_DESC_PERCPU) && !irq_settings_is_per_cpu_devid(desc))
+ return NULL;
- if (bus)
- chip_bus_lock(desc);
- raw_spin_lock_irqsave(&desc->lock, *flags);
+ if (!(check & _IRQ_DESC_PERCPU) && irq_settings_is_per_cpu_devid(desc))
+ return NULL;
}
+
+ if (bus)
+ chip_bus_lock(desc);
+ raw_spin_lock_irqsave(&desc->lock, *flags);
+
return desc;
}
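The sysfs show() conversions above also switch from scnprintf(buf, PAGE_SIZE - ret, ...) to sysfs_emit()/sysfs_emit_at(), which enforce the PAGE_SIZE bound internally so the callbacks no longer have to track the remaining space themselves. A hypothetical attribute following the same pattern, shown for illustration only (a "depth" attribute is not part of the patch):

static ssize_t depth_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
{
        struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj);

        guard(raw_spinlock_irq)(&desc->lock);
        return sysfs_emit(buf, "%u\n", desc->depth);
}
IRQ_ATTR_RO(depth);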
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 9d5c8651492d..c8b6de09047b 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -480,33 +480,6 @@ struct irq_domain *irq_domain_create_simple(struct fwnode_handle *fwnode,
}
EXPORT_SYMBOL_GPL(irq_domain_create_simple);
-/**
- * irq_domain_add_legacy() - Allocate and register a legacy revmap irq_domain.
- * @of_node: pointer to interrupt controller's device tree node.
- * @size: total number of irqs in legacy mapping
- * @first_irq: first number of irq block assigned to the domain
- * @first_hwirq: first hwirq number to use for the translation. Should normally
- * be '0', but a positive integer can be used if the effective
- * hwirqs numbering does not begin at zero.
- * @ops: map/unmap domain callbacks
- * @host_data: Controller private data pointer
- *
- * Note: the map() callback will be called before this function returns
- * for all legacy interrupts except 0 (which is always the invalid irq for
- * a legacy controller).
- */
-struct irq_domain *irq_domain_add_legacy(struct device_node *of_node,
- unsigned int size,
- unsigned int first_irq,
- irq_hw_number_t first_hwirq,
- const struct irq_domain_ops *ops,
- void *host_data)
-{
- return irq_domain_create_legacy(of_node_to_fwnode(of_node), size,
- first_irq, first_hwirq, ops, host_data);
-}
-EXPORT_SYMBOL_GPL(irq_domain_add_legacy);
-
struct irq_domain *irq_domain_create_legacy(struct fwnode_handle *fwnode,
unsigned int size,
unsigned int first_irq,
@@ -885,7 +858,7 @@ void of_phandle_args_to_fwspec(struct device_node *np, const u32 *args,
{
int i;
- fwspec->fwnode = of_node_to_fwnode(np);
+ fwspec->fwnode = of_fwnode_handle(np);
fwspec->param_count = count;
for (i = 0; i < count; i++)
@@ -1133,6 +1106,31 @@ int irq_domain_xlate_twocell(struct irq_domain *d, struct device_node *ctrlr,
EXPORT_SYMBOL_GPL(irq_domain_xlate_twocell);
/**
+ * irq_domain_xlate_twothreecell() - Generic xlate for direct two or three cell bindings
+ * @d: Interrupt domain involved in the translation
+ * @ctrlr: The device tree node for the device whose interrupt is translated
+ * @intspec: The interrupt specifier data from the device tree
+ * @intsize: The number of entries in @intspec
+ * @out_hwirq: Pointer to storage for the hardware interrupt number
+ * @out_type: Pointer to storage for the interrupt type
+ *
+ * Device Tree interrupt specifier translation function for two or three
+ * cell bindings, where the cell values map directly to the hardware
+ * interrupt number and the type specifier.
+ */
+int irq_domain_xlate_twothreecell(struct irq_domain *d, struct device_node *ctrlr,
+ const u32 *intspec, unsigned int intsize,
+ irq_hw_number_t *out_hwirq, unsigned int *out_type)
+{
+ struct irq_fwspec fwspec;
+
+ of_phandle_args_to_fwspec(ctrlr, intspec, intsize, &fwspec);
+
+ return irq_domain_translate_twothreecell(d, &fwspec, out_hwirq, out_type);
+}
+EXPORT_SYMBOL_GPL(irq_domain_xlate_twothreecell);
+
+/**
* irq_domain_xlate_onetwocell() - Generic xlate for one or two cell bindings
* @d: Interrupt domain involved in the translation
* @ctrlr: The device tree node for the device whose interrupt is translated
@@ -1216,6 +1214,37 @@ int irq_domain_translate_twocell(struct irq_domain *d,
}
EXPORT_SYMBOL_GPL(irq_domain_translate_twocell);
+/**
+ * irq_domain_translate_twothreecell() - Generic translate for direct two or three cell
+ * bindings
+ * @d: Interrupt domain involved in the translation
+ * @fwspec: The firmware interrupt specifier to translate
+ * @out_hwirq: Pointer to storage for the hardware interrupt number
+ * @out_type: Pointer to storage for the interrupt type
+ *
+ * Firmware interrupt specifier translation function for two or three cell
+ * specifications, where the parameter values map directly to the hardware
+ * interrupt number and the type specifier.
+ */
+int irq_domain_translate_twothreecell(struct irq_domain *d, struct irq_fwspec *fwspec,
+ unsigned long *out_hwirq, unsigned int *out_type)
+{
+ if (fwspec->param_count == 2) {
+ *out_hwirq = fwspec->param[0];
+ *out_type = fwspec->param[1] & IRQ_TYPE_SENSE_MASK;
+ return 0;
+ }
+
+ if (fwspec->param_count == 3) {
+ *out_hwirq = fwspec->param[1];
+ *out_type = fwspec->param[2] & IRQ_TYPE_SENSE_MASK;
+ return 0;
+ }
+
+ return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(irq_domain_translate_twothreecell);
+
int irq_domain_alloc_descs(int virq, unsigned int cnt, irq_hw_number_t hwirq,
int node, const struct irq_affinity_desc *affinity)
{
@@ -1252,47 +1281,6 @@ void irq_domain_reset_irq_data(struct irq_data *irq_data)
EXPORT_SYMBOL_GPL(irq_domain_reset_irq_data);
#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
-/**
- * irq_domain_create_hierarchy - Add a irqdomain into the hierarchy
- * @parent: Parent irq domain to associate with the new domain
- * @flags: Irq domain flags associated to the domain
- * @size: Size of the domain. See below
- * @fwnode: Optional fwnode of the interrupt controller
- * @ops: Pointer to the interrupt domain callbacks
- * @host_data: Controller private data pointer
- *
- * If @size is 0 a tree domain is created, otherwise a linear domain.
- *
- * If successful the parent is associated to the new domain and the
- * domain flags are set.
- * Returns pointer to IRQ domain, or NULL on failure.
- */
-struct irq_domain *irq_domain_create_hierarchy(struct irq_domain *parent,
- unsigned int flags,
- unsigned int size,
- struct fwnode_handle *fwnode,
- const struct irq_domain_ops *ops,
- void *host_data)
-{
- struct irq_domain_info info = {
- .fwnode = fwnode,
- .size = size,
- .hwirq_max = size,
- .ops = ops,
- .host_data = host_data,
- .domain_flags = flags,
- .parent = parent,
- };
- struct irq_domain *d;
-
- if (!info.size)
- info.hwirq_max = ~0U;
-
- d = irq_domain_instantiate(&info);
- return IS_ERR(d) ? NULL : d;
-}
-EXPORT_SYMBOL_GPL(irq_domain_create_hierarchy);
-
static void irq_domain_insert_irq(int virq)
{
struct irq_data *data;
@@ -2008,7 +1996,7 @@ static void irq_domain_check_hierarchy(struct irq_domain *domain)
domain->flags |= IRQ_DOMAIN_FLAG_HIERARCHY;
}
#else /* CONFIG_IRQ_DOMAIN_HIERARCHY */
-/**
+/*
* irq_domain_get_irq_data - Get irq_data associated with @virq and @domain
* @domain: domain to match
* @virq: IRQ number to get irq_data
@@ -2022,7 +2010,7 @@ struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain,
}
EXPORT_SYMBOL_GPL(irq_domain_get_irq_data);
-/**
+/*
* irq_domain_set_info - Set the complete data for a @virq in @domain
* @domain: Interrupt domain to match
* @virq: IRQ number
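For reference, the irq_domain_translate_twothreecell() helper added above maps the specifier cells as follows; the values here are made up for illustration, and the first cell of a three-cell specifier is simply not consumed by the helper:

/* two cells:   hwirq = param[0], type = param[1] & IRQ_TYPE_SENSE_MASK */
struct irq_fwspec two   = { .param_count = 2, .param = { 7, IRQ_TYPE_LEVEL_HIGH } };

/* three cells: hwirq = param[1], type = param[2] & IRQ_TYPE_SENSE_MASK */
struct irq_fwspec three = { .param_count = 3, .param = { 0, 7, IRQ_TYPE_EDGE_RISING } };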
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 753eef8e041c..c94837382037 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -43,8 +43,6 @@ static void __synchronize_hardirq(struct irq_desc *desc, bool sync_chip)
bool inprogress;
do {
- unsigned long flags;
-
/*
* Wait until we're out of the critical section. This might
* give the wrong answer due to the lack of memory barriers.
@@ -53,7 +51,7 @@ static void __synchronize_hardirq(struct irq_desc *desc, bool sync_chip)
cpu_relax();
/* Ok, that indicated we're done: double-check carefully. */
- raw_spin_lock_irqsave(&desc->lock, flags);
+ guard(raw_spinlock_irqsave)(&desc->lock);
inprogress = irqd_irq_inprogress(&desc->irq_data);
/*
@@ -69,33 +67,30 @@ static void __synchronize_hardirq(struct irq_desc *desc, bool sync_chip)
__irq_get_irqchip_state(irqd, IRQCHIP_STATE_ACTIVE,
&inprogress);
}
- raw_spin_unlock_irqrestore(&desc->lock, flags);
-
/* Oops, that failed? */
} while (inprogress);
}
/**
- * synchronize_hardirq - wait for pending hard IRQ handlers (on other CPUs)
- * @irq: interrupt number to wait for
+ * synchronize_hardirq - wait for pending hard IRQ handlers (on other CPUs)
+ * @irq: interrupt number to wait for
*
- * This function waits for any pending hard IRQ handlers for this
- * interrupt to complete before returning. If you use this
- * function while holding a resource the IRQ handler may need you
- * will deadlock. It does not take associated threaded handlers
- * into account.
+ * This function waits for any pending hard IRQ handlers for this interrupt
+ * to complete before returning. If you use this function while holding a
+ * resource the IRQ handler may need you will deadlock. It does not take
+ * associated threaded handlers into account.
*
- * Do not use this for shutdown scenarios where you must be sure
- * that all parts (hardirq and threaded handler) have completed.
+ * Do not use this for shutdown scenarios where you must be sure that all
+ * parts (hardirq and threaded handler) have completed.
*
- * Returns: false if a threaded handler is active.
+ * Returns: false if a threaded handler is active.
*
- * This function may be called - with care - from IRQ context.
+ * This function may be called - with care - from IRQ context.
*
- * It does not check whether there is an interrupt in flight at the
- * hardware level, but not serviced yet, as this might deadlock when
- * called with interrupts disabled and the target CPU of the interrupt
- * is the current CPU.
+ * It does not check whether there is an interrupt in flight at the
+ * hardware level, but not serviced yet, as this might deadlock when called
+ * with interrupts disabled and the target CPU of the interrupt is the
+ * current CPU.
*/
bool synchronize_hardirq(unsigned int irq)
{
@@ -121,19 +116,19 @@ static void __synchronize_irq(struct irq_desc *desc)
}
/**
- * synchronize_irq - wait for pending IRQ handlers (on other CPUs)
- * @irq: interrupt number to wait for
+ * synchronize_irq - wait for pending IRQ handlers (on other CPUs)
+ * @irq: interrupt number to wait for
*
- * This function waits for any pending IRQ handlers for this interrupt
- * to complete before returning. If you use this function while
- * holding a resource the IRQ handler may need you will deadlock.
+ * This function waits for any pending IRQ handlers for this interrupt to
+ * complete before returning. If you use this function while holding a
+ * resource the IRQ handler may need you will deadlock.
*
- * Can only be called from preemptible code as it might sleep when
- * an interrupt thread is associated to @irq.
+ * Can only be called from preemptible code as it might sleep when
+ * an interrupt thread is associated to @irq.
*
- * It optionally makes sure (when the irq chip supports that method)
- * that the interrupt is not pending in any CPU and waiting for
- * service.
+ * It optionally makes sure (when the irq chip supports that method)
+ * that the interrupt is not pending in any CPU and waiting for
+ * service.
*/
void synchronize_irq(unsigned int irq)
{
@@ -156,8 +151,8 @@ static bool __irq_can_set_affinity(struct irq_desc *desc)
}
/**
- * irq_can_set_affinity - Check if the affinity of a given irq can be set
- * @irq: Interrupt to check
+ * irq_can_set_affinity - Check if the affinity of a given irq can be set
+ * @irq: Interrupt to check
*
*/
int irq_can_set_affinity(unsigned int irq)
@@ -181,13 +176,13 @@ bool irq_can_set_affinity_usr(unsigned int irq)
}
/**
- * irq_set_thread_affinity - Notify irq threads to adjust affinity
- * @desc: irq descriptor which has affinity changed
+ * irq_set_thread_affinity - Notify irq threads to adjust affinity
+ * @desc: irq descriptor which has affinity changed
*
- * We just set IRQTF_AFFINITY and delegate the affinity setting
- * to the interrupt thread itself. We can not call
- * set_cpus_allowed_ptr() here as we hold desc->lock and this
- * code can be called from hard interrupt context.
+ * Just set IRQTF_AFFINITY and delegate the affinity setting to the
+ * interrupt thread itself. We can not call set_cpus_allowed_ptr() here as
+ * we hold desc->lock and this code can be called from hard interrupt
+ * context.
*/
static void irq_set_thread_affinity(struct irq_desc *desc)
{
@@ -400,14 +395,8 @@ int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask,
* an interrupt which is already started or which has already been configured
* as managed will also fail, as these mean invalid init state or double init.
*/
-int irq_update_affinity_desc(unsigned int irq,
- struct irq_affinity_desc *affinity)
+int irq_update_affinity_desc(unsigned int irq, struct irq_affinity_desc *affinity)
{
- struct irq_desc *desc;
- unsigned long flags;
- bool activated;
- int ret = 0;
-
/*
* Supporting this with the reservation scheme used by x86 needs
* some more thought. Fail it for now.
@@ -415,60 +404,50 @@ int irq_update_affinity_desc(unsigned int irq,
if (IS_ENABLED(CONFIG_GENERIC_IRQ_RESERVATION_MODE))
return -EOPNOTSUPP;
- desc = irq_get_desc_buslock(irq, &flags, 0);
- if (!desc)
- return -EINVAL;
-
- /* Requires the interrupt to be shut down */
- if (irqd_is_started(&desc->irq_data)) {
- ret = -EBUSY;
- goto out_unlock;
- }
+ scoped_irqdesc_get_and_buslock(irq, 0) {
+ struct irq_desc *desc = scoped_irqdesc;
+ bool activated;
- /* Interrupts which are already managed cannot be modified */
- if (irqd_affinity_is_managed(&desc->irq_data)) {
- ret = -EBUSY;
- goto out_unlock;
- }
-
- /*
- * Deactivate the interrupt. That's required to undo
- * anything an earlier activation has established.
- */
- activated = irqd_is_activated(&desc->irq_data);
- if (activated)
- irq_domain_deactivate_irq(&desc->irq_data);
+ /* Requires the interrupt to be shut down */
+ if (irqd_is_started(&desc->irq_data))
+ return -EBUSY;
- if (affinity->is_managed) {
- irqd_set(&desc->irq_data, IRQD_AFFINITY_MANAGED);
- irqd_set(&desc->irq_data, IRQD_MANAGED_SHUTDOWN);
- }
+ /* Interrupts which are already managed cannot be modified */
+ if (irqd_affinity_is_managed(&desc->irq_data))
+ return -EBUSY;
+ /*
+ * Deactivate the interrupt. That's required to undo
+ * anything an earlier activation has established.
+ */
+ activated = irqd_is_activated(&desc->irq_data);
+ if (activated)
+ irq_domain_deactivate_irq(&desc->irq_data);
- cpumask_copy(desc->irq_common_data.affinity, &affinity->mask);
+ if (affinity->is_managed) {
+ irqd_set(&desc->irq_data, IRQD_AFFINITY_MANAGED);
+ irqd_set(&desc->irq_data, IRQD_MANAGED_SHUTDOWN);
+ }
- /* Restore the activation state */
- if (activated)
- irq_domain_activate_irq(&desc->irq_data, false);
+ cpumask_copy(desc->irq_common_data.affinity, &affinity->mask);
-out_unlock:
- irq_put_desc_busunlock(desc, flags);
- return ret;
+ /* Restore the activation state */
+ if (activated)
+ irq_domain_activate_irq(&desc->irq_data, false);
+ return 0;
+ }
+ return -EINVAL;
}
static int __irq_set_affinity(unsigned int irq, const struct cpumask *mask,
bool force)
{
struct irq_desc *desc = irq_to_desc(irq);
- unsigned long flags;
- int ret;
if (!desc)
return -EINVAL;
- raw_spin_lock_irqsave(&desc->lock, flags);
- ret = irq_set_affinity_locked(irq_desc_get_irq_data(desc), mask, force);
- raw_spin_unlock_irqrestore(&desc->lock, flags);
- return ret;
+ guard(raw_spinlock_irqsave)(&desc->lock);
+ return irq_set_affinity_locked(irq_desc_get_irq_data(desc), mask, force);
}
/**
@@ -501,39 +480,36 @@ int irq_force_affinity(unsigned int irq, const struct cpumask *cpumask)
}
EXPORT_SYMBOL_GPL(irq_force_affinity);
-int __irq_apply_affinity_hint(unsigned int irq, const struct cpumask *m,
- bool setaffinity)
+int __irq_apply_affinity_hint(unsigned int irq, const struct cpumask *m, bool setaffinity)
{
- unsigned long flags;
- struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
+ int ret = -EINVAL;
- if (!desc)
- return -EINVAL;
- desc->affinity_hint = m;
- irq_put_desc_unlock(desc, flags);
- if (m && setaffinity)
+ scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_GLOBAL) {
+ scoped_irqdesc->affinity_hint = m;
+ ret = 0;
+ }
+
+ if (!ret && m && setaffinity)
__irq_set_affinity(irq, m, false);
- return 0;
+ return ret;
}
EXPORT_SYMBOL_GPL(__irq_apply_affinity_hint);
static void irq_affinity_notify(struct work_struct *work)
{
- struct irq_affinity_notify *notify =
- container_of(work, struct irq_affinity_notify, work);
+ struct irq_affinity_notify *notify = container_of(work, struct irq_affinity_notify, work);
struct irq_desc *desc = irq_to_desc(notify->irq);
cpumask_var_t cpumask;
- unsigned long flags;
if (!desc || !alloc_cpumask_var(&cpumask, GFP_KERNEL))
goto out;
- raw_spin_lock_irqsave(&desc->lock, flags);
- if (irq_move_pending(&desc->irq_data))
- irq_get_pending(cpumask, desc);
- else
- cpumask_copy(cpumask, desc->irq_common_data.affinity);
- raw_spin_unlock_irqrestore(&desc->lock, flags);
+ scoped_guard(raw_spinlock_irqsave, &desc->lock) {
+ if (irq_move_pending(&desc->irq_data))
+ irq_get_pending(cpumask, desc);
+ else
+ cpumask_copy(cpumask, desc->irq_common_data.affinity);
+ }
notify->notify(notify, cpumask);
@@ -543,22 +519,20 @@ out:
}
/**
- * irq_set_affinity_notifier - control notification of IRQ affinity changes
- * @irq: Interrupt for which to enable/disable notification
- * @notify: Context for notification, or %NULL to disable
- * notification. Function pointers must be initialised;
- * the other fields will be initialised by this function.
- *
- * Must be called in process context. Notification may only be enabled
- * after the IRQ is allocated and must be disabled before the IRQ is
- * freed using free_irq().
+ * irq_set_affinity_notifier - control notification of IRQ affinity changes
+ * @irq: Interrupt for which to enable/disable notification
+ * @notify: Context for notification, or %NULL to disable
+ * notification. Function pointers must be initialised;
+ * the other fields will be initialised by this function.
+ *
+ * Must be called in process context. Notification may only be enabled
+ * after the IRQ is allocated and must be disabled before the IRQ is freed
+ * using free_irq().
*/
-int
-irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify)
+int irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify)
{
struct irq_desc *desc = irq_to_desc(irq);
struct irq_affinity_notify *old_notify;
- unsigned long flags;
/* The release function is promised process context */
might_sleep();
@@ -573,10 +547,10 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify)
INIT_WORK(&notify->work, irq_affinity_notify);
}
- raw_spin_lock_irqsave(&desc->lock, flags);
- old_notify = desc->affinity_notify;
- desc->affinity_notify = notify;
- raw_spin_unlock_irqrestore(&desc->lock, flags);
+ scoped_guard(raw_spinlock_irqsave, &desc->lock) {
+ old_notify = desc->affinity_notify;
+ desc->affinity_notify = notify;
+ }
if (old_notify) {
if (cancel_work_sync(&old_notify->work)) {
@@ -597,7 +571,8 @@ EXPORT_SYMBOL_GPL(irq_set_affinity_notifier);
int irq_setup_affinity(struct irq_desc *desc)
{
struct cpumask *set = irq_default_affinity;
- int ret, node = irq_desc_get_node(desc);
+ int node = irq_desc_get_node(desc);
+
static DEFINE_RAW_SPINLOCK(mask_lock);
static struct cpumask mask;
@@ -605,7 +580,7 @@ int irq_setup_affinity(struct irq_desc *desc)
if (!__irq_can_set_affinity(desc))
return 0;
- raw_spin_lock(&mask_lock);
+ guard(raw_spinlock)(&mask_lock);
/*
* Preserve the managed affinity setting and a userspace affinity
* setup, but make sure that one of the targets is online.
@@ -630,9 +605,7 @@ int irq_setup_affinity(struct irq_desc *desc)
if (cpumask_intersects(&mask, nodemask))
cpumask_and(&mask, &mask, nodemask);
}
- ret = irq_do_set_affinity(&desc->irq_data, &mask, false);
- raw_spin_unlock(&mask_lock);
- return ret;
+ return irq_do_set_affinity(&desc->irq_data, &mask, false);
}
#else
/* Wrapper for ALPHA specific affinity selector magic */
@@ -645,44 +618,36 @@ int irq_setup_affinity(struct irq_desc *desc)
/**
- * irq_set_vcpu_affinity - Set vcpu affinity for the interrupt
- * @irq: interrupt number to set affinity
- * @vcpu_info: vCPU specific data or pointer to a percpu array of vCPU
- * specific data for percpu_devid interrupts
- *
- * This function uses the vCPU specific data to set the vCPU
- * affinity for an irq. The vCPU specific data is passed from
- * outside, such as KVM. One example code path is as below:
- * KVM -> IOMMU -> irq_set_vcpu_affinity().
+ * irq_set_vcpu_affinity - Set vcpu affinity for the interrupt
+ * @irq: interrupt number to set affinity
+ * @vcpu_info: vCPU specific data or pointer to a percpu array of vCPU
+ * specific data for percpu_devid interrupts
+ *
+ * This function uses the vCPU specific data to set the vCPU affinity for
+ * an irq. The vCPU specific data is passed from outside, such as KVM. One
+ * example code path is as below: KVM -> IOMMU -> irq_set_vcpu_affinity().
*/
int irq_set_vcpu_affinity(unsigned int irq, void *vcpu_info)
{
- unsigned long flags;
- struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
- struct irq_data *data;
- struct irq_chip *chip;
- int ret = -ENOSYS;
+ scoped_irqdesc_get_and_lock(irq, 0) {
+ struct irq_desc *desc = scoped_irqdesc;
+ struct irq_data *data;
+ struct irq_chip *chip;
- if (!desc)
- return -EINVAL;
-
- data = irq_desc_get_irq_data(desc);
- do {
- chip = irq_data_get_irq_chip(data);
- if (chip && chip->irq_set_vcpu_affinity)
- break;
-#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
- data = data->parent_data;
-#else
- data = NULL;
-#endif
- } while (data);
+ data = irq_desc_get_irq_data(desc);
+ do {
+ chip = irq_data_get_irq_chip(data);
+ if (chip && chip->irq_set_vcpu_affinity)
+ break;
- if (data)
- ret = chip->irq_set_vcpu_affinity(data, vcpu_info);
- irq_put_desc_unlock(desc, flags);
+ data = irqd_get_parent_data(data);
+ } while (data);
- return ret;
+ if (!data)
+ return -ENOSYS;
+ return chip->irq_set_vcpu_affinity(data, vcpu_info);
+ }
+ return -EINVAL;
}
EXPORT_SYMBOL_GPL(irq_set_vcpu_affinity);
@@ -694,26 +659,23 @@ void __disable_irq(struct irq_desc *desc)
static int __disable_irq_nosync(unsigned int irq)
{
- unsigned long flags;
- struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
-
- if (!desc)
- return -EINVAL;
- __disable_irq(desc);
- irq_put_desc_busunlock(desc, flags);
- return 0;
+ scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_GLOBAL) {
+ __disable_irq(scoped_irqdesc);
+ return 0;
+ }
+ return -EINVAL;
}
/**
- * disable_irq_nosync - disable an irq without waiting
- * @irq: Interrupt to disable
+ * disable_irq_nosync - disable an irq without waiting
+ * @irq: Interrupt to disable
*
- * Disable the selected interrupt line. Disables and Enables are
- * nested.
- * Unlike disable_irq(), this function does not ensure existing
- * instances of the IRQ handler have completed before returning.
+ * Disable the selected interrupt line. Disables and Enables are
+ * nested.
+ * Unlike disable_irq(), this function does not ensure existing
+ * instances of the IRQ handler have completed before returning.
*
- * This function may be called from IRQ context.
+ * This function may be called from IRQ context.
*/
void disable_irq_nosync(unsigned int irq)
{
@@ -722,17 +684,17 @@ void disable_irq_nosync(unsigned int irq)
EXPORT_SYMBOL(disable_irq_nosync);
/**
- * disable_irq - disable an irq and wait for completion
- * @irq: Interrupt to disable
+ * disable_irq - disable an irq and wait for completion
+ * @irq: Interrupt to disable
+ *
+ * Disable the selected interrupt line. Enables and Disables are nested.
*
- * Disable the selected interrupt line. Enables and Disables are
- * nested.
- * This function waits for any pending IRQ handlers for this interrupt
- * to complete before returning. If you use this function while
- * holding a resource the IRQ handler may need you will deadlock.
+ * This function waits for any pending IRQ handlers for this interrupt to
+ * complete before returning. If you use this function while holding a
+ * resource the IRQ handler may need you will deadlock.
*
- * Can only be called from preemptible code as it might sleep when
- * an interrupt thread is associated to @irq.
+ * Can only be called from preemptible code as it might sleep when an
+ * interrupt thread is associated to @irq.
*
*/
void disable_irq(unsigned int irq)
@@ -744,40 +706,39 @@ void disable_irq(unsigned int irq)
EXPORT_SYMBOL(disable_irq);
/**
- * disable_hardirq - disables an irq and waits for hardirq completion
- * @irq: Interrupt to disable
+ * disable_hardirq - disables an irq and waits for hardirq completion
+ * @irq: Interrupt to disable
*
- * Disable the selected interrupt line. Enables and Disables are
- * nested.
- * This function waits for any pending hard IRQ handlers for this
- * interrupt to complete before returning. If you use this function while
- * holding a resource the hard IRQ handler may need you will deadlock.
+ * Disable the selected interrupt line. Enables and Disables are nested.
*
- * When used to optimistically disable an interrupt from atomic context
- * the return value must be checked.
+ * This function waits for any pending hard IRQ handlers for this interrupt
+ * to complete before returning. If you use this function while holding a
+ * resource the hard IRQ handler may need you will deadlock.
*
- * Returns: false if a threaded handler is active.
+ * When used to optimistically disable an interrupt from atomic context the
+ * return value must be checked.
*
- * This function may be called - with care - from IRQ context.
+ * Returns: false if a threaded handler is active.
+ *
+ * This function may be called - with care - from IRQ context.
*/
bool disable_hardirq(unsigned int irq)
{
if (!__disable_irq_nosync(irq))
return synchronize_hardirq(irq);
-
return false;
}
EXPORT_SYMBOL_GPL(disable_hardirq);
/**
- * disable_nmi_nosync - disable an nmi without waiting
- * @irq: Interrupt to disable
- *
- * Disable the selected interrupt line. Disables and enables are
- * nested.
- * The interrupt to disable must have been requested through request_nmi.
- * Unlike disable_nmi(), this function does not ensure existing
- * instances of the IRQ handler have completed before returning.
+ * disable_nmi_nosync - disable an nmi without waiting
+ * @irq: Interrupt to disable
+ *
+ * Disable the selected interrupt line. Disables and enables are nested.
+ *
+ * The interrupt to disable must have been requested through request_nmi.
+ * Unlike disable_nmi(), this function does not ensure existing
+ * instances of the IRQ handler have completed before returning.
*/
void disable_nmi_nosync(unsigned int irq)
{
@@ -817,41 +778,34 @@ void __enable_irq(struct irq_desc *desc)
}
/**
- * enable_irq - enable handling of an irq
- * @irq: Interrupt to enable
+ * enable_irq - enable handling of an irq
+ * @irq: Interrupt to enable
*
- * Undoes the effect of one call to disable_irq(). If this
- * matches the last disable, processing of interrupts on this
- * IRQ line is re-enabled.
+ * Undoes the effect of one call to disable_irq(). If this matches the
+ * last disable, processing of interrupts on this IRQ line is re-enabled.
*
- * This function may be called from IRQ context only when
- * desc->irq_data.chip->bus_lock and desc->chip->bus_sync_unlock are NULL !
+ * This function may be called from IRQ context only when
+ * desc->irq_data.chip->bus_lock and desc->chip->bus_sync_unlock are NULL !
*/
void enable_irq(unsigned int irq)
{
- unsigned long flags;
- struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
+ scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_GLOBAL) {
+ struct irq_desc *desc = scoped_irqdesc;
- if (!desc)
- return;
- if (WARN(!desc->irq_data.chip,
- KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq))
- goto out;
-
- __enable_irq(desc);
-out:
- irq_put_desc_busunlock(desc, flags);
+ if (WARN(!desc->irq_data.chip, "enable_irq before setup/request_irq: irq %u\n", irq))
+ return;
+ __enable_irq(desc);
+ }
}
EXPORT_SYMBOL(enable_irq);
/**
- * enable_nmi - enable handling of an nmi
- * @irq: Interrupt to enable
+ * enable_nmi - enable handling of an nmi
+ * @irq: Interrupt to enable
*
- * The interrupt to enable must have been requested through request_nmi.
- * Undoes the effect of one call to disable_nmi(). If this
- * matches the last disable, processing of interrupts on this
- * IRQ line is re-enabled.
+ * The interrupt to enable must have been requested through request_nmi.
+ * Undoes the effect of one call to disable_nmi(). If this matches the last
+ * disable, processing of interrupts on this IRQ line is re-enabled.
*/
void enable_nmi(unsigned int irq)
{
@@ -873,65 +827,59 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on)
}
/**
- * irq_set_irq_wake - control irq power management wakeup
- * @irq: interrupt to control
- * @on: enable/disable power management wakeup
- *
- * Enable/disable power management wakeup mode, which is
- * disabled by default. Enables and disables must match,
- * just as they match for non-wakeup mode support.
- *
- * Wakeup mode lets this IRQ wake the system from sleep
- * states like "suspend to RAM".
- *
- * Note: irq enable/disable state is completely orthogonal
- * to the enable/disable state of irq wake. An irq can be
- * disabled with disable_irq() and still wake the system as
- * long as the irq has wake enabled. If this does not hold,
- * then the underlying irq chip and the related driver need
- * to be investigated.
+ * irq_set_irq_wake - control irq power management wakeup
+ * @irq: interrupt to control
+ * @on: enable/disable power management wakeup
+ *
+ * Enable/disable power management wakeup mode, which is disabled by
+ * default. Enables and disables must match, just as they match for
+ * non-wakeup mode support.
+ *
+ * Wakeup mode lets this IRQ wake the system from sleep states like
+ * "suspend to RAM".
+ *
+ * Note: irq enable/disable state is completely orthogonal to the
+ * enable/disable state of irq wake. An irq can be disabled with
+ * disable_irq() and still wake the system as long as the irq has wake
+ * enabled. If this does not hold, then the underlying irq chip and the
+ * related driver need to be investigated.
*/
int irq_set_irq_wake(unsigned int irq, unsigned int on)
{
- unsigned long flags;
- struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
- int ret = 0;
+ scoped_irqdesc_get_and_buslock(irq, IRQ_GET_DESC_CHECK_GLOBAL) {
+ struct irq_desc *desc = scoped_irqdesc;
+ int ret = 0;
- if (!desc)
- return -EINVAL;
-
- /* Don't use NMIs as wake up interrupts please */
- if (irq_is_nmi(desc)) {
- ret = -EINVAL;
- goto out_unlock;
- }
+ /* Don't use NMIs as wake up interrupts please */
+ if (irq_is_nmi(desc))
+ return -EINVAL;
- /* wakeup-capable irqs can be shared between drivers that
- * don't need to have the same sleep mode behaviors.
- */
- if (on) {
- if (desc->wake_depth++ == 0) {
- ret = set_irq_wake_real(irq, on);
- if (ret)
- desc->wake_depth = 0;
- else
- irqd_set(&desc->irq_data, IRQD_WAKEUP_STATE);
- }
- } else {
- if (desc->wake_depth == 0) {
- WARN(1, "Unbalanced IRQ %d wake disable\n", irq);
- } else if (--desc->wake_depth == 0) {
- ret = set_irq_wake_real(irq, on);
- if (ret)
- desc->wake_depth = 1;
- else
- irqd_clear(&desc->irq_data, IRQD_WAKEUP_STATE);
+ /*
+ * wakeup-capable irqs can be shared between drivers that
+ * don't need to have the same sleep mode behaviors.
+ */
+ if (on) {
+ if (desc->wake_depth++ == 0) {
+ ret = set_irq_wake_real(irq, on);
+ if (ret)
+ desc->wake_depth = 0;
+ else
+ irqd_set(&desc->irq_data, IRQD_WAKEUP_STATE);
+ }
+ } else {
+ if (desc->wake_depth == 0) {
+ WARN(1, "Unbalanced IRQ %d wake disable\n", irq);
+ } else if (--desc->wake_depth == 0) {
+ ret = set_irq_wake_real(irq, on);
+ if (ret)
+ desc->wake_depth = 1;
+ else
+ irqd_clear(&desc->irq_data, IRQD_WAKEUP_STATE);
+ }
}
+ return ret;
}
-
-out_unlock:
- irq_put_desc_busunlock(desc, flags);
- return ret;
+ return -EINVAL;
}
EXPORT_SYMBOL(irq_set_irq_wake);
@@ -940,22 +888,17 @@ EXPORT_SYMBOL(irq_set_irq_wake);
* particular irq has been exclusively allocated or is available
* for driver use.
*/
-int can_request_irq(unsigned int irq, unsigned long irqflags)
+bool can_request_irq(unsigned int irq, unsigned long irqflags)
{
- unsigned long flags;
- struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
- int canrequest = 0;
+ scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_GLOBAL) {
+ struct irq_desc *desc = scoped_irqdesc;
- if (!desc)
- return 0;
-
- if (irq_settings_can_request(desc)) {
- if (!desc->action ||
- irqflags & desc->action->flags & IRQF_SHARED)
- canrequest = 1;
+ if (irq_settings_can_request(desc)) {
+ if (!desc->action || irqflags & desc->action->flags & IRQF_SHARED)
+ return true;
+ }
}
- irq_put_desc_unlock(desc, flags);
- return canrequest;
+ return false;
}
int __irq_set_trigger(struct irq_desc *desc, unsigned long flags)
@@ -1016,16 +959,11 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned long flags)
#ifdef CONFIG_HARDIRQS_SW_RESEND
int irq_set_parent(int irq, int parent_irq)
{
- unsigned long flags;
- struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
-
- if (!desc)
- return -EINVAL;
-
- desc->parent_irq = parent_irq;
-
- irq_put_desc_unlock(desc, flags);
- return 0;
+ scoped_irqdesc_get_and_lock(irq, 0) {
+ scoped_irqdesc->parent_irq = parent_irq;
+ return 0;
+ }
+ return -EINVAL;
}
EXPORT_SYMBOL_GPL(irq_set_parent);
#endif
@@ -1079,19 +1017,19 @@ static void irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *a
return;
}
- raw_spin_lock_irq(&desc->lock);
- /*
- * This code is triggered unconditionally. Check the affinity
- * mask pointer. For CPU_MASK_OFFSTACK=n this is optimized out.
- */
- if (cpumask_available(desc->irq_common_data.affinity)) {
- const struct cpumask *m;
+ scoped_guard(raw_spinlock_irq, &desc->lock) {
+ /*
+ * This code is triggered unconditionally. Check the affinity
+ * mask pointer. For CPU_MASK_OFFSTACK=n this is optimized out.
+ */
+ if (cpumask_available(desc->irq_common_data.affinity)) {
+ const struct cpumask *m;
- m = irq_data_get_effective_affinity_mask(&desc->irq_data);
- cpumask_copy(mask, m);
- valid = true;
+ m = irq_data_get_effective_affinity_mask(&desc->irq_data);
+ cpumask_copy(mask, m);
+ valid = true;
+ }
}
- raw_spin_unlock_irq(&desc->lock);
if (valid)
set_cpus_allowed_ptr(current, mask);
@@ -1259,9 +1197,8 @@ static void irq_wake_secondary(struct irq_desc *desc, struct irqaction *action)
if (WARN_ON_ONCE(!secondary))
return;
- raw_spin_lock_irq(&desc->lock);
+ guard(raw_spinlock_irq)(&desc->lock);
__irq_wake_thread(desc, secondary);
- raw_spin_unlock_irq(&desc->lock);
}
/*
@@ -1334,21 +1271,19 @@ static int irq_thread(void *data)
}
/**
- * irq_wake_thread - wake the irq thread for the action identified by dev_id
- * @irq: Interrupt line
- * @dev_id: Device identity for which the thread should be woken
- *
+ * irq_wake_thread - wake the irq thread for the action identified by dev_id
+ * @irq: Interrupt line
+ * @dev_id: Device identity for which the thread should be woken
*/
void irq_wake_thread(unsigned int irq, void *dev_id)
{
struct irq_desc *desc = irq_to_desc(irq);
struct irqaction *action;
- unsigned long flags;
if (!desc || WARN_ON(irq_settings_is_per_cpu_devid(desc)))
return;
- raw_spin_lock_irqsave(&desc->lock, flags);
+ guard(raw_spinlock_irqsave)(&desc->lock);
for_each_action_of_desc(desc, action) {
if (action->dev_id == dev_id) {
if (action->thread)
@@ -1356,7 +1291,6 @@ void irq_wake_thread(unsigned int irq, void *dev_id)
break;
}
}
- raw_spin_unlock_irqrestore(&desc->lock, flags);
}
EXPORT_SYMBOL_GPL(irq_wake_thread);
@@ -1987,9 +1921,8 @@ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id)
* There is no interrupt on the fly anymore. Deactivate it
* completely.
*/
- raw_spin_lock_irqsave(&desc->lock, flags);
- irq_domain_deactivate_irq(&desc->irq_data);
- raw_spin_unlock_irqrestore(&desc->lock, flags);
+ scoped_guard(raw_spinlock_irqsave, &desc->lock)
+ irq_domain_deactivate_irq(&desc->irq_data);
irq_release_resources(desc);
chip_bus_sync_unlock(desc);
@@ -2005,20 +1938,19 @@ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id)
}
/**
- * free_irq - free an interrupt allocated with request_irq
- * @irq: Interrupt line to free
- * @dev_id: Device identity to free
+ * free_irq - free an interrupt allocated with request_irq
+ * @irq: Interrupt line to free
+ * @dev_id: Device identity to free
*
- * Remove an interrupt handler. The handler is removed and if the
- * interrupt line is no longer in use by any driver it is disabled.
- * On a shared IRQ the caller must ensure the interrupt is disabled
- * on the card it drives before calling this function. The function
- * does not return until any executing interrupts for this IRQ
- * have completed.
+ * Remove an interrupt handler. The handler is removed and if the interrupt
+ * line is no longer in use by any driver it is disabled. On a shared IRQ
+ * the caller must ensure the interrupt is disabled on the card it drives
+ * before calling this function. The function does not return until any
+ * executing interrupts for this IRQ have completed.
*
- * This function must not be called from interrupt context.
+ * This function must not be called from interrupt context.
*
- * Returns the devname argument passed to request_irq.
+ * Returns the devname argument passed to request_irq.
*/
const void *free_irq(unsigned int irq, void *dev_id)
{
@@ -2075,8 +2007,6 @@ static const void *__cleanup_nmi(unsigned int irq, struct irq_desc *desc)
const void *free_nmi(unsigned int irq, void *dev_id)
{
struct irq_desc *desc = irq_to_desc(irq);
- unsigned long flags;
- const void *devname;
if (!desc || WARN_ON(!irq_is_nmi(desc)))
return NULL;
@@ -2088,53 +2018,46 @@ const void *free_nmi(unsigned int irq, void *dev_id)
if (WARN_ON(desc->depth == 0))
disable_nmi_nosync(irq);
- raw_spin_lock_irqsave(&desc->lock, flags);
-
+ guard(raw_spinlock_irqsave)(&desc->lock);
irq_nmi_teardown(desc);
- devname = __cleanup_nmi(irq, desc);
-
- raw_spin_unlock_irqrestore(&desc->lock, flags);
-
- return devname;
+ return __cleanup_nmi(irq, desc);
}
/**
- * request_threaded_irq - allocate an interrupt line
- * @irq: Interrupt line to allocate
- * @handler: Function to be called when the IRQ occurs.
- * Primary handler for threaded interrupts.
- * If handler is NULL and thread_fn != NULL
- * the default primary handler is installed.
- * @thread_fn: Function called from the irq handler thread
- * If NULL, no irq thread is created
- * @irqflags: Interrupt type flags
- * @devname: An ascii name for the claiming device
- * @dev_id: A cookie passed back to the handler function
- *
- * This call allocates interrupt resources and enables the
- * interrupt line and IRQ handling. From the point this
- * call is made your handler function may be invoked. Since
- * your handler function must clear any interrupt the board
- * raises, you must take care both to initialise your hardware
- * and to set up the interrupt handler in the right order.
- *
- * If you want to set up a threaded irq handler for your device
- * then you need to supply @handler and @thread_fn. @handler is
- * still called in hard interrupt context and has to check
- * whether the interrupt originates from the device. If yes it
- * needs to disable the interrupt on the device and return
- * IRQ_WAKE_THREAD which will wake up the handler thread and run
- * @thread_fn. This split handler design is necessary to support
- * shared interrupts.
- *
- * Dev_id must be globally unique. Normally the address of the
- * device data structure is used as the cookie. Since the handler
- * receives this value it makes sense to use it.
- *
- * If your interrupt is shared you must pass a non NULL dev_id
- * as this is required when freeing the interrupt.
- *
- * Flags:
+ * request_threaded_irq - allocate an interrupt line
+ * @irq: Interrupt line to allocate
+ * @handler: Function to be called when the IRQ occurs.
+ * Primary handler for threaded interrupts.
+ * If handler is NULL and thread_fn != NULL
+ * the default primary handler is installed.
+ * @thread_fn: Function called from the irq handler thread
+ * If NULL, no irq thread is created
+ * @irqflags: Interrupt type flags
+ * @devname: An ascii name for the claiming device
+ * @dev_id: A cookie passed back to the handler function
+ *
+ * This call allocates interrupt resources and enables the interrupt line
+ * and IRQ handling. From the point this call is made your handler function
+ * may be invoked. Since your handler function must clear any interrupt the
+ * board raises, you must take care both to initialise your hardware and to
+ * set up the interrupt handler in the right order.
+ *
+ * If you want to set up a threaded irq handler for your device then you
+ * need to supply @handler and @thread_fn. @handler is still called in hard
+ * interrupt context and has to check whether the interrupt originates from
+ * the device. If yes it needs to disable the interrupt on the device and
+ * return IRQ_WAKE_THREAD which will wake up the handler thread and run
+ * @thread_fn. This split handler design is necessary to support shared
+ * interrupts.
+ *
+ * @dev_id must be globally unique. Normally the address of the device data
+ * structure is used as the cookie. Since the handler receives this value
+ * it makes sense to use it.
+ *
+ * If your interrupt is shared you must pass a non NULL dev_id as this is
+ * required when freeing the interrupt.
+ *
+ * Flags:
*
* IRQF_SHARED Interrupt is shared
* IRQF_TRIGGER_* Specify active edge(s) or level
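A hypothetical driver-side sketch of the split handler design described in the comment above; every demo_* identifier and struct demo_device are invented for the illustration:

static irqreturn_t demo_hardirq(int irq, void *dev_id)
{
        struct demo_device *dev = dev_id;

        if (!demo_irq_is_ours(dev))             /* shared line: not our interrupt */
                return IRQ_NONE;
        demo_mask_device_irq(dev);              /* quiesce the device */
        return IRQ_WAKE_THREAD;                 /* defer the heavy lifting */
}

static irqreturn_t demo_thread_fn(int irq, void *dev_id)
{
        struct demo_device *dev = dev_id;

        demo_process_events(dev);               /* may sleep here */
        demo_unmask_device_irq(dev);
        return IRQ_HANDLED;
}

static int demo_request(struct demo_device *dev)
{
        return request_threaded_irq(dev->irq, demo_hardirq, demo_thread_fn,
                                    IRQF_SHARED, "demo", dev);
}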
@@ -2232,21 +2155,20 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
EXPORT_SYMBOL(request_threaded_irq);
/**
- * request_any_context_irq - allocate an interrupt line
- * @irq: Interrupt line to allocate
- * @handler: Function to be called when the IRQ occurs.
- * Threaded handler for threaded interrupts.
- * @flags: Interrupt type flags
- * @name: An ascii name for the claiming device
- * @dev_id: A cookie passed back to the handler function
- *
- * This call allocates interrupt resources and enables the
- * interrupt line and IRQ handling. It selects either a
- * hardirq or threaded handling method depending on the
- * context.
- *
- * On failure, it returns a negative value. On success,
- * it returns either IRQC_IS_HARDIRQ or IRQC_IS_NESTED.
+ * request_any_context_irq - allocate an interrupt line
+ * @irq: Interrupt line to allocate
+ * @handler: Function to be called when the IRQ occurs.
+ * Threaded handler for threaded interrupts.
+ * @flags: Interrupt type flags
+ * @name: An ascii name for the claiming device
+ * @dev_id: A cookie passed back to the handler function
+ *
+ * This call allocates interrupt resources and enables the interrupt line
+ * and IRQ handling. It selects either a hardirq or threaded handling
+ * method depending on the context.
+ *
+ * Returns: A negative value on failure, or either IRQC_IS_HARDIRQ or
+ * IRQC_IS_NESTED on success.
*/
int request_any_context_irq(unsigned int irq, irq_handler_t handler,
unsigned long flags, const char *name, void *dev_id)
@@ -2273,37 +2195,35 @@ int request_any_context_irq(unsigned int irq, irq_handler_t handler,
EXPORT_SYMBOL_GPL(request_any_context_irq);
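A minimal sketch of the return-value convention documented above, reusing the hypothetical foo_dev and foo_thread_fn from the previous example.

static int foo_request_any_context(struct foo_dev *foo)
{
	int ret = request_any_context_irq(foo->irq, foo_thread_fn, 0,
					  "foo", foo);

	if (ret < 0)
		return ret;

	/* Positive returns (IRQC_IS_HARDIRQ or IRQC_IS_NESTED) mean success. */
	return 0;
}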
/**
- * request_nmi - allocate an interrupt line for NMI delivery
- * @irq: Interrupt line to allocate
- * @handler: Function to be called when the IRQ occurs.
- * Threaded handler for threaded interrupts.
- * @irqflags: Interrupt type flags
- * @name: An ascii name for the claiming device
- * @dev_id: A cookie passed back to the handler function
- *
- * This call allocates interrupt resources and enables the
- * interrupt line and IRQ handling. It sets up the IRQ line
- * to be handled as an NMI.
- *
- * An interrupt line delivering NMIs cannot be shared and IRQ handling
- * cannot be threaded.
- *
- * Interrupt lines requested for NMI delivering must produce per cpu
- * interrupts and have auto enabling setting disabled.
- *
- * Dev_id must be globally unique. Normally the address of the
- * device data structure is used as the cookie. Since the handler
- * receives this value it makes sense to use it.
- *
- * If the interrupt line cannot be used to deliver NMIs, function
- * will fail and return a negative value.
+ * request_nmi - allocate an interrupt line for NMI delivery
+ * @irq: Interrupt line to allocate
+ * @handler: Function to be called when the IRQ occurs.
+ * Threaded handler for threaded interrupts.
+ * @irqflags: Interrupt type flags
+ * @name: An ascii name for the claiming device
+ * @dev_id: A cookie passed back to the handler function
+ *
+ * This call allocates interrupt resources and enables the interrupt line
+ * and IRQ handling. It sets up the IRQ line to be handled as an NMI.
+ *
+ * An interrupt line delivering NMIs cannot be shared and IRQ handling
+ * cannot be threaded.
+ *
+ * Interrupt lines requested for NMI delivery must produce per-CPU
+ * interrupts and have the auto-enable setting disabled.
+ *
+ * @dev_id must be globally unique. Normally the address of the device data
+ * structure is used as the cookie. Since the handler receives this value
+ * it makes sense to use it.
+ *
+ * If the interrupt line cannot be used to deliver NMIs, the function will
+ * fail and return a negative value.
*/
int request_nmi(unsigned int irq, irq_handler_t handler,
unsigned long irqflags, const char *name, void *dev_id)
{
struct irqaction *action;
struct irq_desc *desc;
- unsigned long flags;
int retval;
if (irq == IRQ_NOTCONNECTED)
@@ -2345,21 +2265,17 @@ int request_nmi(unsigned int irq, irq_handler_t handler,
if (retval)
goto err_irq_setup;
- raw_spin_lock_irqsave(&desc->lock, flags);
-
- /* Setup NMI state */
- desc->istate |= IRQS_NMI;
- retval = irq_nmi_setup(desc);
- if (retval) {
- __cleanup_nmi(irq, desc);
- raw_spin_unlock_irqrestore(&desc->lock, flags);
- return -EINVAL;
+ scoped_guard(raw_spinlock_irqsave, &desc->lock) {
+ /* Setup NMI state */
+ desc->istate |= IRQS_NMI;
+ retval = irq_nmi_setup(desc);
+ if (retval) {
+ __cleanup_nmi(irq, desc);
+ return -EINVAL;
+ }
+ return 0;
}
- raw_spin_unlock_irqrestore(&desc->lock, flags);
-
- return 0;
-
err_irq_setup:
irq_chip_pm_put(&desc->irq_data);
err_out:
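The scoped_guard() conversion in request_nmi() above uses the lock guards from <linux/cleanup.h>: the lock is dropped automatically whenever the guarded scope is left, including via early returns. A minimal standalone sketch with hypothetical data:

#include <linux/cleanup.h>
#include <linux/spinlock.h>

static int foo_update_state(raw_spinlock_t *lock, unsigned int *state,
			    unsigned int val)
{
	scoped_guard(raw_spinlock_irqsave, lock) {
		/* The lock is released on every exit path of this scope. */
		if (*state == val)
			return -EBUSY;
		*state = val;
	}
	/*
	 * guard(raw_spinlock_irqsave)(lock) would instead hold the lock
	 * until the end of the enclosing function.
	 */
	return 0;
}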
@@ -2370,35 +2286,25 @@ err_out:
void enable_percpu_irq(unsigned int irq, unsigned int type)
{
- unsigned int cpu = smp_processor_id();
- unsigned long flags;
- struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_PERCPU);
+ scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_PERCPU) {
+ struct irq_desc *desc = scoped_irqdesc;
- if (!desc)
- return;
-
- /*
- * If the trigger type is not specified by the caller, then
- * use the default for this interrupt.
- */
- type &= IRQ_TYPE_SENSE_MASK;
- if (type == IRQ_TYPE_NONE)
- type = irqd_get_trigger_type(&desc->irq_data);
-
- if (type != IRQ_TYPE_NONE) {
- int ret;
-
- ret = __irq_set_trigger(desc, type);
-
- if (ret) {
- WARN(1, "failed to set type for IRQ%d\n", irq);
- goto out;
+ /*
+ * If the trigger type is not specified by the caller, then
+ * use the default for this interrupt.
+ */
+ type &= IRQ_TYPE_SENSE_MASK;
+ if (type == IRQ_TYPE_NONE)
+ type = irqd_get_trigger_type(&desc->irq_data);
+
+ if (type != IRQ_TYPE_NONE) {
+ if (__irq_set_trigger(desc, type)) {
+ WARN(1, "failed to set type for IRQ%d\n", irq);
+ return;
+ }
}
+ irq_percpu_enable(desc, smp_processor_id());
}
-
- irq_percpu_enable(desc, cpu);
-out:
- irq_put_desc_unlock(desc, flags);
}
EXPORT_SYMBOL_GPL(enable_percpu_irq);
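Because enable_percpu_irq() only acts on the calling CPU, drivers typically invoke it from a per-CPU context such as a CPU-hotplug "starting" callback (wired up with cpuhp_setup_state() or similar). The IRQ number and callback names below are illustrative assumptions.

#include <linux/interrupt.h>

static unsigned int foo_ppi_irq;	/* hypothetical per-CPU interrupt */

static int foo_starting_cpu(unsigned int cpu)
{
	/* IRQ_TYPE_NONE keeps the trigger type that is already configured. */
	enable_percpu_irq(foo_ppi_irq, IRQ_TYPE_NONE);
	return 0;
}

static int foo_dying_cpu(unsigned int cpu)
{
	disable_percpu_irq(foo_ppi_irq);
	return 0;
}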
@@ -2416,33 +2322,16 @@ void enable_percpu_nmi(unsigned int irq, unsigned int type)
*/
bool irq_percpu_is_enabled(unsigned int irq)
{
- unsigned int cpu = smp_processor_id();
- struct irq_desc *desc;
- unsigned long flags;
- bool is_enabled;
-
- desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_PERCPU);
- if (!desc)
- return false;
-
- is_enabled = cpumask_test_cpu(cpu, desc->percpu_enabled);
- irq_put_desc_unlock(desc, flags);
-
- return is_enabled;
+ scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_PERCPU)
+ return cpumask_test_cpu(smp_processor_id(), scoped_irqdesc->percpu_enabled);
+ return false;
}
EXPORT_SYMBOL_GPL(irq_percpu_is_enabled);
void disable_percpu_irq(unsigned int irq)
{
- unsigned int cpu = smp_processor_id();
- unsigned long flags;
- struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_PERCPU);
-
- if (!desc)
- return;
-
- irq_percpu_disable(desc, cpu);
- irq_put_desc_unlock(desc, flags);
+ scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_PERCPU)
+ irq_percpu_disable(scoped_irqdesc, smp_processor_id());
}
EXPORT_SYMBOL_GPL(disable_percpu_irq);
@@ -2458,71 +2347,47 @@ static struct irqaction *__free_percpu_irq(unsigned int irq, void __percpu *dev_
{
struct irq_desc *desc = irq_to_desc(irq);
struct irqaction *action;
- unsigned long flags;
WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq);
if (!desc)
return NULL;
- raw_spin_lock_irqsave(&desc->lock, flags);
+ scoped_guard(raw_spinlock_irqsave, &desc->lock) {
+ action = desc->action;
+ if (!action || action->percpu_dev_id != dev_id) {
+ WARN(1, "Trying to free already-free IRQ %d\n", irq);
+ return NULL;
+ }
- action = desc->action;
- if (!action || action->percpu_dev_id != dev_id) {
- WARN(1, "Trying to free already-free IRQ %d\n", irq);
- goto bad;
- }
+ if (!cpumask_empty(desc->percpu_enabled)) {
+ WARN(1, "percpu IRQ %d still enabled on CPU%d!\n",
+ irq, cpumask_first(desc->percpu_enabled));
+ return NULL;
+ }
- if (!cpumask_empty(desc->percpu_enabled)) {
- WARN(1, "percpu IRQ %d still enabled on CPU%d!\n",
- irq, cpumask_first(desc->percpu_enabled));
- goto bad;
+ /* Found it - now remove it from the list of entries: */
+ desc->action = NULL;
+ desc->istate &= ~IRQS_NMI;
}
- /* Found it - now remove it from the list of entries: */
- desc->action = NULL;
-
- desc->istate &= ~IRQS_NMI;
-
- raw_spin_unlock_irqrestore(&desc->lock, flags);
-
unregister_handler_proc(irq, action);
-
irq_chip_pm_put(&desc->irq_data);
module_put(desc->owner);
return action;
-
-bad:
- raw_spin_unlock_irqrestore(&desc->lock, flags);
- return NULL;
}
/**
- * remove_percpu_irq - free a per-cpu interrupt
- * @irq: Interrupt line to free
- * @act: irqaction for the interrupt
+ * free_percpu_irq - free an interrupt allocated with request_percpu_irq
+ * @irq: Interrupt line to free
+ * @dev_id: Device identity to free
*
- * Used to remove interrupts statically setup by the early boot process.
- */
-void remove_percpu_irq(unsigned int irq, struct irqaction *act)
-{
- struct irq_desc *desc = irq_to_desc(irq);
-
- if (desc && irq_settings_is_per_cpu_devid(desc))
- __free_percpu_irq(irq, act->percpu_dev_id);
-}
-
-/**
- * free_percpu_irq - free an interrupt allocated with request_percpu_irq
- * @irq: Interrupt line to free
- * @dev_id: Device identity to free
+ * Remove a percpu interrupt handler. The handler is removed, but the
+ * interrupt line is not disabled. This must be done on each CPU before
+ * calling this function. The function does not return until any executing
+ * interrupts for this IRQ have completed.
*
- * Remove a percpu interrupt handler. The handler is removed, but
- * the interrupt line is not disabled. This must be done on each
- * CPU before calling this function. The function does not return
- * until any executing interrupts for this IRQ have completed.
- *
- * This function must not be called from interrupt context.
+ * This function must not be called from interrupt context.
*/
void free_percpu_irq(unsigned int irq, void __percpu *dev_id)
{
@@ -2551,9 +2416,9 @@ void free_percpu_nmi(unsigned int irq, void __percpu *dev_id)
}
/**
- * setup_percpu_irq - setup a per-cpu interrupt
- * @irq: Interrupt line to setup
- * @act: irqaction for the interrupt
+ * setup_percpu_irq - setup a per-cpu interrupt
+ * @irq: Interrupt line to setup
+ * @act: irqaction for the interrupt
*
* Used to statically setup per-cpu interrupts in the early boot process.
*/
@@ -2578,21 +2443,20 @@ int setup_percpu_irq(unsigned int irq, struct irqaction *act)
}
/**
- * __request_percpu_irq - allocate a percpu interrupt line
- * @irq: Interrupt line to allocate
- * @handler: Function to be called when the IRQ occurs.
- * @flags: Interrupt type flags (IRQF_TIMER only)
- * @devname: An ascii name for the claiming device
- * @dev_id: A percpu cookie passed back to the handler function
- *
- * This call allocates interrupt resources and enables the
- * interrupt on the local CPU. If the interrupt is supposed to be
- * enabled on other CPUs, it has to be done on each CPU using
- * enable_percpu_irq().
- *
- * Dev_id must be globally unique. It is a per-cpu variable, and
- * the handler gets called with the interrupted CPU's instance of
- * that variable.
+ * __request_percpu_irq - allocate a percpu interrupt line
+ * @irq: Interrupt line to allocate
+ * @handler: Function to be called when the IRQ occurs.
+ * @flags: Interrupt type flags (IRQF_TIMER only)
+ * @devname: An ascii name for the claiming device
+ * @dev_id: A percpu cookie passed back to the handler function
+ *
+ * This call allocates interrupt resources and enables the interrupt on the
+ * local CPU. If the interrupt is supposed to be enabled on other CPUs, it
+ * has to be done on each CPU using enable_percpu_irq().
+ *
+ * @dev_id must be globally unique. It is a per-cpu variable, and
+ * the handler gets called with the interrupted CPU's instance of
+ * that variable.
*/
int __request_percpu_irq(unsigned int irq, irq_handler_t handler,
unsigned long flags, const char *devname,
@@ -2640,32 +2504,31 @@ int __request_percpu_irq(unsigned int irq, irq_handler_t handler,
EXPORT_SYMBOL_GPL(__request_percpu_irq);
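A sketch of the per-CPU cookie contract spelled out above: the handler receives the interrupted CPU's instance of the per-cpu variable directly. The structure and names are illustrative; request_percpu_irq() is the flag-less wrapper around __request_percpu_irq().

#include <linux/interrupt.h>
#include <linux/percpu.h>

struct foo_pcpu {
	unsigned long count;
};

static DEFINE_PER_CPU(struct foo_pcpu, foo_pcpu_data);

static irqreturn_t foo_percpu_handler(int irq, void *dev_id)
{
	/* dev_id already points at this CPU's foo_pcpu_data instance. */
	struct foo_pcpu *p = dev_id;

	p->count++;
	return IRQ_HANDLED;
}

static int foo_request_ppi(unsigned int irq)
{
	/*
	 * Enables the interrupt on the local CPU only; other CPUs need
	 * enable_percpu_irq() as shown earlier.
	 */
	return request_percpu_irq(irq, foo_percpu_handler, "foo-ppi",
				  &foo_pcpu_data);
}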
/**
- * request_percpu_nmi - allocate a percpu interrupt line for NMI delivery
- * @irq: Interrupt line to allocate
- * @handler: Function to be called when the IRQ occurs.
- * @name: An ascii name for the claiming device
- * @dev_id: A percpu cookie passed back to the handler function
+ * request_percpu_nmi - allocate a percpu interrupt line for NMI delivery
+ * @irq: Interrupt line to allocate
+ * @handler: Function to be called when the IRQ occurs.
+ * @name: An ascii name for the claiming device
+ * @dev_id: A percpu cookie passed back to the handler function
*
- * This call allocates interrupt resources for a per CPU NMI. Per CPU NMIs
- * have to be setup on each CPU by calling prepare_percpu_nmi() before
- * being enabled on the same CPU by using enable_percpu_nmi().
+ * This call allocates interrupt resources for a per CPU NMI. Per CPU NMIs
+ * have to be setup on each CPU by calling prepare_percpu_nmi() before
+ * being enabled on the same CPU by using enable_percpu_nmi().
*
- * Dev_id must be globally unique. It is a per-cpu variable, and
- * the handler gets called with the interrupted CPU's instance of
- * that variable.
+ * @dev_id must be globally unique. It is a per-cpu variable, and the
+ * handler gets called with the interrupted CPU's instance of that
+ * variable.
*
- * Interrupt lines requested for NMI delivering should have auto enabling
- * setting disabled.
+ * Interrupt lines requested for NMI delivery should have the auto-enable
+ * setting disabled.
*
- * If the interrupt line cannot be used to deliver NMIs, function
- * will fail returning a negative value.
+ * If the interrupt line cannot be used to deliver NMIs, the function
+ * will fail and return a negative value.
*/
int request_percpu_nmi(unsigned int irq, irq_handler_t handler,
const char *name, void __percpu *dev_id)
{
struct irqaction *action;
struct irq_desc *desc;
- unsigned long flags;
int retval;
if (!handler)
@@ -2701,10 +2564,8 @@ int request_percpu_nmi(unsigned int irq, irq_handler_t handler,
if (retval)
goto err_irq_setup;
- raw_spin_lock_irqsave(&desc->lock, flags);
- desc->istate |= IRQS_NMI;
- raw_spin_unlock_irqrestore(&desc->lock, flags);
-
+ scoped_guard(raw_spinlock_irqsave, &desc->lock)
+ desc->istate |= IRQS_NMI;
return 0;
err_irq_setup:
@@ -2716,79 +2577,55 @@ err_out:
}
/**
- * prepare_percpu_nmi - performs CPU local setup for NMI delivery
- * @irq: Interrupt line to prepare for NMI delivery
+ * prepare_percpu_nmi - performs CPU local setup for NMI delivery
+ * @irq: Interrupt line to prepare for NMI delivery
*
- * This call prepares an interrupt line to deliver NMI on the current CPU,
- * before that interrupt line gets enabled with enable_percpu_nmi().
+ * This call prepares an interrupt line to deliver NMI on the current CPU,
+ * before that interrupt line gets enabled with enable_percpu_nmi().
*
- * As a CPU local operation, this should be called from non-preemptible
- * context.
+ * As a CPU local operation, this should be called from non-preemptible
+ * context.
*
- * If the interrupt line cannot be used to deliver NMIs, function
- * will fail returning a negative value.
+ * If the interrupt line cannot be used to deliver NMIs, the function will
+ * fail and return a negative value.
*/
int prepare_percpu_nmi(unsigned int irq)
{
- unsigned long flags;
- struct irq_desc *desc;
- int ret = 0;
+ int ret = -EINVAL;
WARN_ON(preemptible());
- desc = irq_get_desc_lock(irq, &flags,
- IRQ_GET_DESC_CHECK_PERCPU);
- if (!desc)
- return -EINVAL;
-
- if (WARN(!irq_is_nmi(desc),
- KERN_ERR "prepare_percpu_nmi called for a non-NMI interrupt: irq %u\n",
- irq)) {
- ret = -EINVAL;
- goto out;
- }
+ scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_PERCPU) {
+ if (WARN(!irq_is_nmi(scoped_irqdesc),
+ "prepare_percpu_nmi called for a non-NMI interrupt: irq %u\n", irq))
+ return -EINVAL;
- ret = irq_nmi_setup(desc);
- if (ret) {
- pr_err("Failed to setup NMI delivery: irq %u\n", irq);
- goto out;
+ ret = irq_nmi_setup(scoped_irqdesc);
+ if (ret)
+ pr_err("Failed to setup NMI delivery: irq %u\n", irq);
}
-
-out:
- irq_put_desc_unlock(desc, flags);
return ret;
}
/**
- * teardown_percpu_nmi - undoes NMI setup of IRQ line
- * @irq: Interrupt line from which CPU local NMI configuration should be
- * removed
- *
- * This call undoes the setup done by prepare_percpu_nmi().
+ * teardown_percpu_nmi - undoes NMI setup of IRQ line
+ * @irq: Interrupt line from which CPU local NMI configuration should be removed
*
- * IRQ line should not be enabled for the current CPU.
+ * This call undoes the setup done by prepare_percpu_nmi().
*
- * As a CPU local operation, this should be called from non-preemptible
- * context.
+ * The IRQ line should not be enabled for the current CPU.
+ * As a CPU local operation, this should be called from non-preemptible
+ * context.
*/
void teardown_percpu_nmi(unsigned int irq)
{
- unsigned long flags;
- struct irq_desc *desc;
-
WARN_ON(preemptible());
- desc = irq_get_desc_lock(irq, &flags,
- IRQ_GET_DESC_CHECK_PERCPU);
- if (!desc)
- return;
-
- if (WARN_ON(!irq_is_nmi(desc)))
- goto out;
-
- irq_nmi_teardown(desc);
-out:
- irq_put_desc_unlock(desc, flags);
+ scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_PERCPU) {
+ if (WARN_ON(!irq_is_nmi(scoped_irqdesc)))
+ return;
+ irq_nmi_teardown(scoped_irqdesc);
+ }
}
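Putting the per-CPU NMI calls above together, the expected lifecycle is a single request_percpu_nmi() followed by a prepare/enable pair on every CPU, undone in reverse order. The split into probe-time and per-CPU helpers below, reusing foo_percpu_handler and foo_pcpu_data from the earlier sketch, is an illustrative assumption.

#include <linux/interrupt.h>

/* Once, from the driver probe path. */
static int foo_request_nmi_line(unsigned int irq)
{
	return request_percpu_nmi(irq, foo_percpu_handler, "foo-nmi",
				  &foo_pcpu_data);
}

/* On each CPU, from a non-preemptible per-CPU context. */
static int foo_nmi_starting(unsigned int irq)
{
	int ret = prepare_percpu_nmi(irq);

	if (ret)
		return ret;
	enable_percpu_nmi(irq, IRQ_TYPE_NONE);
	return 0;
}

/* Mirror image when the CPU goes down. */
static void foo_nmi_dying(unsigned int irq)
{
	disable_percpu_nmi(irq);
	teardown_percpu_nmi(irq);
}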
static int __irq_get_irqchip_state(struct irq_data *data, enum irqchip_irq_state which, bool *state)
@@ -2815,87 +2652,62 @@ static int __irq_get_irqchip_state(struct irq_data *data, enum irqchip_irq_state
}
/**
- * irq_get_irqchip_state - returns the irqchip state of a interrupt.
- * @irq: Interrupt line that is forwarded to a VM
- * @which: One of IRQCHIP_STATE_* the caller wants to know about
- * @state: a pointer to a boolean where the state is to be stored
+ * irq_get_irqchip_state - returns the irqchip state of an interrupt.
+ * @irq: Interrupt line that is forwarded to a VM
+ * @which: One of IRQCHIP_STATE_* the caller wants to know about
+ * @state: a pointer to a boolean where the state is to be stored
*
- * This call snapshots the internal irqchip state of an
- * interrupt, returning into @state the bit corresponding to
- * stage @which
+ * This call snapshots the internal irqchip state of an interrupt,
+ * returning into @state the bit corresponding to stage @which
*
- * This function should be called with preemption disabled if the
- * interrupt controller has per-cpu registers.
+ * This function should be called with preemption disabled if the interrupt
+ * controller has per-cpu registers.
*/
-int irq_get_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
- bool *state)
+int irq_get_irqchip_state(unsigned int irq, enum irqchip_irq_state which, bool *state)
{
- struct irq_desc *desc;
- struct irq_data *data;
- unsigned long flags;
- int err = -EINVAL;
-
- desc = irq_get_desc_buslock(irq, &flags, 0);
- if (!desc)
- return err;
+ scoped_irqdesc_get_and_buslock(irq, 0) {
+ struct irq_data *data = irq_desc_get_irq_data(scoped_irqdesc);
- data = irq_desc_get_irq_data(desc);
-
- err = __irq_get_irqchip_state(data, which, state);
-
- irq_put_desc_busunlock(desc, flags);
- return err;
+ return __irq_get_irqchip_state(data, which, state);
+ }
+ return -EINVAL;
}
EXPORT_SYMBOL_GPL(irq_get_irqchip_state);
/**
- * irq_set_irqchip_state - set the state of a forwarded interrupt.
- * @irq: Interrupt line that is forwarded to a VM
- * @which: State to be restored (one of IRQCHIP_STATE_*)
- * @val: Value corresponding to @which
+ * irq_set_irqchip_state - set the state of a forwarded interrupt.
+ * @irq: Interrupt line that is forwarded to a VM
+ * @which: State to be restored (one of IRQCHIP_STATE_*)
+ * @val: Value corresponding to @which
*
- * This call sets the internal irqchip state of an interrupt,
- * depending on the value of @which.
+ * This call sets the internal irqchip state of an interrupt, depending on
+ * the value of @which.
*
- * This function should be called with migration disabled if the
- * interrupt controller has per-cpu registers.
+ * This function should be called with migration disabled if the interrupt
+ * controller has per-cpu registers.
*/
-int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
- bool val)
+int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which, bool val)
{
- struct irq_desc *desc;
- struct irq_data *data;
- struct irq_chip *chip;
- unsigned long flags;
- int err = -EINVAL;
+ scoped_irqdesc_get_and_buslock(irq, 0) {
+ struct irq_data *data = irq_desc_get_irq_data(scoped_irqdesc);
+ struct irq_chip *chip;
- desc = irq_get_desc_buslock(irq, &flags, 0);
- if (!desc)
- return err;
+ do {
+ chip = irq_data_get_irq_chip(data);
- data = irq_desc_get_irq_data(desc);
+ if (WARN_ON_ONCE(!chip))
+ return -ENODEV;
- do {
- chip = irq_data_get_irq_chip(data);
- if (WARN_ON_ONCE(!chip)) {
- err = -ENODEV;
- goto out_unlock;
- }
- if (chip->irq_set_irqchip_state)
- break;
-#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
- data = data->parent_data;
-#else
- data = NULL;
-#endif
- } while (data);
+ if (chip->irq_set_irqchip_state)
+ break;
- if (data)
- err = chip->irq_set_irqchip_state(data, which, val);
+ data = irqd_get_parent_data(data);
+ } while (data);
-out_unlock:
- irq_put_desc_busunlock(desc, flags);
- return err;
+ if (data)
+ return chip->irq_set_irqchip_state(data, which, val);
+ }
+ return -EINVAL;
}
EXPORT_SYMBOL_GPL(irq_set_irqchip_state);
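A sketch of how a hypervisor-side caller might combine the two accessors above to snapshot and clear the pending state of an interrupt forwarded to a guest; the helper name is hypothetical.

#include <linux/interrupt.h>

static int foo_save_and_clear_pending(unsigned int host_irq, bool *was_pending)
{
	int err = irq_get_irqchip_state(host_irq, IRQCHIP_STATE_PENDING,
					was_pending);

	if (err)
		return err;
	if (*was_pending)
		err = irq_set_irqchip_state(host_irq, IRQCHIP_STATE_PENDING,
					    false);
	return err;
}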
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index c05ba7ca00fa..9febe797a5f6 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -59,7 +59,8 @@ struct msi_ctrl {
static void msi_domain_free_locked(struct device *dev, struct msi_ctrl *ctrl);
static unsigned int msi_domain_get_hwsize(struct device *dev, unsigned int domid);
static inline int msi_sysfs_create_group(struct device *dev);
-
+static int msi_domain_prepare_irqs(struct irq_domain *domain, struct device *dev,
+ int nvec, msi_alloc_info_t *arg);
/**
* msi_alloc_desc - Allocate an initialized msi_desc
@@ -343,26 +344,30 @@ int msi_setup_device_data(struct device *dev)
}
/**
- * msi_lock_descs - Lock the MSI descriptor storage of a device
+ * __msi_lock_descs - Lock the MSI descriptor storage of a device
* @dev: Device to operate on
+ *
+ * Internal function for guard(msi_descs_lock). Don't use in code.
*/
-void msi_lock_descs(struct device *dev)
+void __msi_lock_descs(struct device *dev)
{
mutex_lock(&dev->msi.data->mutex);
}
-EXPORT_SYMBOL_GPL(msi_lock_descs);
+EXPORT_SYMBOL_GPL(__msi_lock_descs);
/**
- * msi_unlock_descs - Unlock the MSI descriptor storage of a device
+ * __msi_unlock_descs - Unlock the MSI descriptor storage of a device
* @dev: Device to operate on
+ *
+ * Internal function for guard(msi_descs_lock). Don't use in code.
*/
-void msi_unlock_descs(struct device *dev)
+void __msi_unlock_descs(struct device *dev)
{
/* Invalidate the index which was cached by the iterator */
dev->msi.data->__iter_idx = MSI_XA_MAX_INDEX;
mutex_unlock(&dev->msi.data->mutex);
}
-EXPORT_SYMBOL_GPL(msi_unlock_descs);
+EXPORT_SYMBOL_GPL(__msi_unlock_descs);
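The double-underscore rename above exists so that locking goes through guard(msi_descs_lock) rather than open-coded lock/unlock pairs. The guard class itself is declared next to the prototypes in the MSI header; a rough sketch of what such a declaration looks like (illustrative, not copied from this patch):

#include <linux/cleanup.h>
#include <linux/device.h>

/* Binds the lock/unlock helpers to guard(msi_descs_lock)(dev). */
DEFINE_LOCK_GUARD_1(msi_descs_lock, struct device,
		    __msi_lock_descs(_T->lock),
		    __msi_unlock_descs(_T->lock));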
static struct msi_desc *msi_find_desc(struct msi_device_data *md, unsigned int domid,
enum msi_desc_filter filter)
@@ -448,7 +453,6 @@ EXPORT_SYMBOL_GPL(msi_next_desc);
unsigned int msi_domain_get_virq(struct device *dev, unsigned int domid, unsigned int index)
{
struct msi_desc *desc;
- unsigned int ret = 0;
bool pcimsi = false;
struct xarray *xa;
@@ -462,7 +466,7 @@ unsigned int msi_domain_get_virq(struct device *dev, unsigned int domid, unsigne
if (dev_is_pci(dev) && domid == MSI_DEFAULT_DOMAIN)
pcimsi = to_pci_dev(dev)->msi_enabled;
- msi_lock_descs(dev);
+ guard(msi_descs_lock)(dev);
xa = &dev->msi.data->__domains[domid].store;
desc = xa_load(xa, pcimsi ? 0 : index);
if (desc && desc->irq) {
@@ -471,16 +475,12 @@ unsigned int msi_domain_get_virq(struct device *dev, unsigned int domid, unsigne
* PCI-MSIX and platform MSI use a descriptor per
* interrupt.
*/
- if (pcimsi) {
- if (index < desc->nvec_used)
- ret = desc->irq + index;
- } else {
- ret = desc->irq;
- }
+ if (!pcimsi)
+ return desc->irq;
+ if (index < desc->nvec_used)
+ return desc->irq + index;
}
-
- msi_unlock_descs(dev);
- return ret;
+ return 0;
}
EXPORT_SYMBOL_GPL(msi_domain_get_virq);
@@ -796,6 +796,10 @@ static int msi_domain_ops_prepare(struct irq_domain *domain, struct device *dev,
return 0;
}
+static void msi_domain_ops_teardown(struct irq_domain *domain, msi_alloc_info_t *arg)
+{
+}
+
static void msi_domain_ops_set_desc(msi_alloc_info_t *arg,
struct msi_desc *desc)
{
@@ -821,6 +825,7 @@ static struct msi_domain_ops msi_domain_ops_default = {
.get_hwirq = msi_domain_ops_get_hwirq,
.msi_init = msi_domain_ops_init,
.msi_prepare = msi_domain_ops_prepare,
+ .msi_teardown = msi_domain_ops_teardown,
.set_desc = msi_domain_ops_set_desc,
};
@@ -842,6 +847,8 @@ static void msi_domain_update_dom_ops(struct msi_domain_info *info)
ops->msi_init = msi_domain_ops_default.msi_init;
if (ops->msi_prepare == NULL)
ops->msi_prepare = msi_domain_ops_default.msi_prepare;
+ if (ops->msi_teardown == NULL)
+ ops->msi_teardown = msi_domain_ops_default.msi_teardown;
if (ops->set_desc == NULL)
ops->set_desc = msi_domain_ops_default.set_desc;
}
@@ -905,6 +912,32 @@ struct irq_domain *msi_create_irq_domain(struct fwnode_handle *fwnode,
}
/**
+ * msi_create_parent_irq_domain - Create an MSI-parent interrupt domain
+ * @info: MSI irqdomain creation info
+ * @msi_parent_ops: MSI parent callbacks and configuration
+ *
+ * Return: pointer to the created &struct irq_domain or %NULL on failure
+ */
+struct irq_domain *msi_create_parent_irq_domain(struct irq_domain_info *info,
+ const struct msi_parent_ops *msi_parent_ops)
+{
+ struct irq_domain *d;
+
+ info->hwirq_max = max(info->hwirq_max, info->size);
+ info->size = info->hwirq_max;
+ info->domain_flags |= IRQ_DOMAIN_FLAG_MSI_PARENT;
+ info->bus_token = msi_parent_ops->bus_select_token;
+
+ d = irq_domain_instantiate(info);
+ if (IS_ERR(d))
+ return NULL;
+
+ d->msi_parent_ops = msi_parent_ops;
+ return d;
+}
+EXPORT_SYMBOL_GPL(msi_create_parent_irq_domain);
+
+/**
* msi_parent_init_dev_msi_info - Delegate initialization of device MSI info down
* in the domain hierarchy
* @dev: The device for which the domain should be created
@@ -998,9 +1031,8 @@ bool msi_create_device_irq_domain(struct device *dev, unsigned int domid,
void *chip_data)
{
struct irq_domain *domain, *parent = dev->msi.domain;
- struct fwnode_handle *fwnode, *fwnalloced = NULL;
- struct msi_domain_template *bundle;
const struct msi_parent_ops *pops;
+ struct fwnode_handle *fwnode;
if (!irq_domain_is_msi_parent(parent))
return false;
@@ -1008,7 +1040,8 @@ bool msi_create_device_irq_domain(struct device *dev, unsigned int domid,
if (domid >= MSI_MAX_DEVICE_IRQDOMAINS)
return false;
- bundle = kmemdup(template, sizeof(*bundle), GFP_KERNEL);
+ struct msi_domain_template *bundle __free(kfree) =
+ kmemdup(template, sizeof(*bundle), GFP_KERNEL);
if (!bundle)
return false;
@@ -1017,6 +1050,7 @@ bool msi_create_device_irq_domain(struct device *dev, unsigned int domid,
bundle->info.ops = &bundle->ops;
bundle->info.data = domain_data;
bundle->info.chip_data = chip_data;
+ bundle->info.alloc_data = &bundle->alloc_info;
pops = parent->msi_parent_ops;
snprintf(bundle->name, sizeof(bundle->name), "%s%s-%s",
@@ -1031,41 +1065,43 @@ bool msi_create_device_irq_domain(struct device *dev, unsigned int domid,
* node as they are not guaranteed to have a fwnode. They are never
* looked up and always handled in the context of the device.
*/
- if (bundle->info.flags & MSI_FLAG_USE_DEV_FWNODE)
- fwnode = dev->fwnode;
+ struct fwnode_handle *fwnode_alloced __free(irq_domain_free_fwnode) = NULL;
+
+ if (!(bundle->info.flags & MSI_FLAG_USE_DEV_FWNODE))
+ fwnode = fwnode_alloced = irq_domain_alloc_named_fwnode(bundle->name);
else
- fwnode = fwnalloced = irq_domain_alloc_named_fwnode(bundle->name);
+ fwnode = dev->fwnode;
if (!fwnode)
- goto free_bundle;
+ return false;
if (msi_setup_device_data(dev))
- goto free_fwnode;
-
- msi_lock_descs(dev);
+ return false;
+ guard(msi_descs_lock)(dev);
if (WARN_ON_ONCE(msi_get_device_domain(dev, domid)))
- goto fail;
+ return false;
if (!pops->init_dev_msi_info(dev, parent, parent, &bundle->info))
- goto fail;
+ return false;
domain = __msi_create_irq_domain(fwnode, &bundle->info, IRQ_DOMAIN_FLAG_MSI_DEVICE, parent);
if (!domain)
- goto fail;
+ return false;
domain->dev = dev;
dev->msi.data->__domains[domid].domain = domain;
- msi_unlock_descs(dev);
- return true;
-fail:
- msi_unlock_descs(dev);
-free_fwnode:
- irq_domain_free_fwnode(fwnalloced);
-free_bundle:
- kfree(bundle);
- return false;
+ if (msi_domain_prepare_irqs(domain, dev, hwsize, &bundle->alloc_info)) {
+ dev->msi.data->__domains[domid].domain = NULL;
+ irq_domain_remove(domain);
+ return false;
+ }
+
+ /* @bundle and @fwnode_alloced are now in use. Prevent cleanup */
+ retain_and_null_ptr(bundle);
+ retain_and_null_ptr(fwnode_alloced);
+ return true;
}
/**
@@ -1079,23 +1115,21 @@ void msi_remove_device_irq_domain(struct device *dev, unsigned int domid)
struct msi_domain_info *info;
struct irq_domain *domain;
- msi_lock_descs(dev);
-
+ guard(msi_descs_lock)(dev);
domain = msi_get_device_domain(dev, domid);
-
if (!domain || !irq_domain_is_msi_device(domain))
- goto unlock;
+ return;
dev->msi.data->__domains[domid].domain = NULL;
info = domain->host_data;
+
+ info->ops->msi_teardown(domain, info->alloc_data);
+
if (irq_domain_is_msi_device(domain))
fwnode = domain->fwnode;
irq_domain_remove(domain);
irq_domain_free_fwnode(fwnode);
kfree(container_of(info, struct msi_domain_template, info));
-
-unlock:
- msi_unlock_descs(dev);
}
/**
@@ -1111,16 +1145,14 @@ bool msi_match_device_irq_domain(struct device *dev, unsigned int domid,
{
struct msi_domain_info *info;
struct irq_domain *domain;
- bool ret = false;
- msi_lock_descs(dev);
+ guard(msi_descs_lock)(dev);
domain = msi_get_device_domain(dev, domid);
if (domain && irq_domain_is_msi_device(domain)) {
info = domain->host_data;
- ret = info->bus_token == bus_token;
+ return info->bus_token == bus_token;
}
- msi_unlock_descs(dev);
- return ret;
+ return false;
}
static int msi_domain_prepare_irqs(struct irq_domain *domain, struct device *dev,
@@ -1238,6 +1270,24 @@ static int msi_init_virq(struct irq_domain *domain, int virq, unsigned int vflag
return 0;
}
+static int populate_alloc_info(struct irq_domain *domain, struct device *dev,
+ unsigned int nirqs, msi_alloc_info_t *arg)
+{
+ struct msi_domain_info *info = domain->host_data;
+
+ /*
+ * If the caller has provided a template alloc info, use that. Once
+ * all users of msi_create_irq_domain() have been eliminated, this
+ * should be the only source of allocation information, and the
+ * prepare call below should be finally removed.
+ */
+ if (!info->alloc_data)
+ return msi_domain_prepare_irqs(domain, dev, nirqs, arg);
+
+ *arg = *info->alloc_data;
+ return 0;
+}
+
static int __msi_domain_alloc_irqs(struct device *dev, struct irq_domain *domain,
struct msi_ctrl *ctrl)
{
@@ -1250,7 +1300,7 @@ static int __msi_domain_alloc_irqs(struct device *dev, struct irq_domain *domain
unsigned long idx;
int i, ret, virq;
- ret = msi_domain_prepare_irqs(domain, dev, ctrl->nirqs, &arg);
+ ret = populate_alloc_info(domain, dev, ctrl->nirqs, &arg);
if (ret)
return ret;
@@ -1391,12 +1441,9 @@ int msi_domain_alloc_irqs_range_locked(struct device *dev, unsigned int domid,
int msi_domain_alloc_irqs_range(struct device *dev, unsigned int domid,
unsigned int first, unsigned int last)
{
- int ret;
- msi_lock_descs(dev);
- ret = msi_domain_alloc_irqs_range_locked(dev, domid, first, last);
- msi_unlock_descs(dev);
- return ret;
+ guard(msi_descs_lock)(dev);
+ return msi_domain_alloc_irqs_range_locked(dev, domid, first, last);
}
EXPORT_SYMBOL_GPL(msi_domain_alloc_irqs_range);
@@ -1500,12 +1547,8 @@ struct msi_map msi_domain_alloc_irq_at(struct device *dev, unsigned int domid, u
const struct irq_affinity_desc *affdesc,
union msi_instance_cookie *icookie)
{
- struct msi_map map;
-
- msi_lock_descs(dev);
- map = __msi_domain_alloc_irq_at(dev, domid, index, affdesc, icookie);
- msi_unlock_descs(dev);
- return map;
+ guard(msi_descs_lock)(dev);
+ return __msi_domain_alloc_irq_at(dev, domid, index, affdesc, icookie);
}
/**
@@ -1542,13 +1585,11 @@ int msi_device_domain_alloc_wired(struct irq_domain *domain, unsigned int hwirq,
icookie.value = ((u64)type << 32) | hwirq;
- msi_lock_descs(dev);
+ guard(msi_descs_lock)(dev);
if (WARN_ON_ONCE(msi_get_device_domain(dev, domid) != domain))
map.index = -EINVAL;
else
map = __msi_domain_alloc_irq_at(dev, domid, MSI_ANY_INDEX, NULL, &icookie);
- msi_unlock_descs(dev);
-
return map.index >= 0 ? map.virq : map.index;
}
@@ -1641,9 +1682,8 @@ void msi_domain_free_irqs_range_locked(struct device *dev, unsigned int domid,
void msi_domain_free_irqs_range(struct device *dev, unsigned int domid,
unsigned int first, unsigned int last)
{
- msi_lock_descs(dev);
+ guard(msi_descs_lock)(dev);
msi_domain_free_irqs_range_locked(dev, domid, first, last);
- msi_unlock_descs(dev);
}
EXPORT_SYMBOL_GPL(msi_domain_free_irqs_all);
@@ -1673,9 +1713,8 @@ void msi_domain_free_irqs_all_locked(struct device *dev, unsigned int domid)
*/
void msi_domain_free_irqs_all(struct device *dev, unsigned int domid)
{
- msi_lock_descs(dev);
+ guard(msi_descs_lock)(dev);
msi_domain_free_irqs_all_locked(dev, domid);
- msi_unlock_descs(dev);
}
/**
@@ -1694,12 +1733,11 @@ void msi_device_domain_free_wired(struct irq_domain *domain, unsigned int virq)
if (WARN_ON_ONCE(!dev || !desc || domain->bus_token != DOMAIN_BUS_WIRED_TO_MSI))
return;
- msi_lock_descs(dev);
- if (!WARN_ON_ONCE(msi_get_device_domain(dev, MSI_DEFAULT_DOMAIN) != domain)) {
- msi_domain_free_irqs_range_locked(dev, MSI_DEFAULT_DOMAIN, desc->msi_index,
- desc->msi_index);
- }
- msi_unlock_descs(dev);
+ guard(msi_descs_lock)(dev);
+ if (WARN_ON_ONCE(msi_get_device_domain(dev, MSI_DEFAULT_DOMAIN) != domain))
+ return;
+ msi_domain_free_irqs_range_locked(dev, MSI_DEFAULT_DOMAIN, desc->msi_index,
+ desc->msi_index);
}
/**
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index c556bc49d213..445912d51033 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -46,8 +46,7 @@ void irq_pm_install_action(struct irq_desc *desc, struct irqaction *action)
desc->cond_suspend_depth++;
WARN_ON_ONCE(desc->no_suspend_depth &&
- (desc->no_suspend_depth +
- desc->cond_suspend_depth) != desc->nr_actions);
+ (desc->no_suspend_depth + desc->cond_suspend_depth) != desc->nr_actions);
}
/*
@@ -134,14 +133,12 @@ void suspend_device_irqs(void)
int irq;
for_each_irq_desc(irq, desc) {
- unsigned long flags;
bool sync;
if (irq_settings_is_nested_thread(desc))
continue;
- raw_spin_lock_irqsave(&desc->lock, flags);
- sync = suspend_device_irq(desc);
- raw_spin_unlock_irqrestore(&desc->lock, flags);
+ scoped_guard(raw_spinlock_irqsave, &desc->lock)
+ sync = suspend_device_irq(desc);
if (sync)
synchronize_irq(irq);
@@ -186,18 +183,15 @@ static void resume_irqs(bool want_early)
int irq;
for_each_irq_desc(irq, desc) {
- unsigned long flags;
- bool is_early = desc->action &&
- desc->action->flags & IRQF_EARLY_RESUME;
+ bool is_early = desc->action && desc->action->flags & IRQF_EARLY_RESUME;
if (!is_early && want_early)
continue;
if (irq_settings_is_nested_thread(desc))
continue;
- raw_spin_lock_irqsave(&desc->lock, flags);
+ guard(raw_spinlock_irqsave)(&desc->lock);
resume_irq(desc);
- raw_spin_unlock_irqrestore(&desc->lock, flags);
}
}
@@ -207,22 +201,16 @@ static void resume_irqs(bool want_early)
*/
void rearm_wake_irq(unsigned int irq)
{
- unsigned long flags;
- struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
+ scoped_irqdesc_get_and_buslock(irq, IRQ_GET_DESC_CHECK_GLOBAL) {
+ struct irq_desc *desc = scoped_irqdesc;
- if (!desc)
- return;
-
- if (!(desc->istate & IRQS_SUSPENDED) ||
- !irqd_is_wakeup_set(&desc->irq_data))
- goto unlock;
-
- desc->istate &= ~IRQS_SUSPENDED;
- irqd_set(&desc->irq_data, IRQD_WAKEUP_ARMED);
- __enable_irq(desc);
+ if (!(desc->istate & IRQS_SUSPENDED) || !irqd_is_wakeup_set(&desc->irq_data))
+ return;
-unlock:
- irq_put_desc_busunlock(desc, flags);
+ desc->istate &= ~IRQS_SUSPENDED;
+ irqd_set(&desc->irq_data, IRQD_WAKEUP_ARMED);
+ __enable_irq(desc);
+ }
}
/**
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 8e29809de38d..29c2404e743b 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -81,20 +81,18 @@ static int show_irq_affinity(int type, struct seq_file *m)
static int irq_affinity_hint_proc_show(struct seq_file *m, void *v)
{
struct irq_desc *desc = irq_to_desc((long)m->private);
- unsigned long flags;
cpumask_var_t mask;
if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
return -ENOMEM;
- raw_spin_lock_irqsave(&desc->lock, flags);
- if (desc->affinity_hint)
- cpumask_copy(mask, desc->affinity_hint);
- raw_spin_unlock_irqrestore(&desc->lock, flags);
+ scoped_guard(raw_spinlock_irq, &desc->lock) {
+ if (desc->affinity_hint)
+ cpumask_copy(mask, desc->affinity_hint);
+ }
seq_printf(m, "%*pb\n", cpumask_pr_args(mask));
free_cpumask_var(mask);
-
return 0;
}
@@ -295,32 +293,26 @@ static int irq_spurious_proc_show(struct seq_file *m, void *v)
#define MAX_NAMELEN 128
-static int name_unique(unsigned int irq, struct irqaction *new_action)
+static bool name_unique(unsigned int irq, struct irqaction *new_action)
{
struct irq_desc *desc = irq_to_desc(irq);
struct irqaction *action;
- unsigned long flags;
- int ret = 1;
- raw_spin_lock_irqsave(&desc->lock, flags);
+ guard(raw_spinlock_irq)(&desc->lock);
for_each_action_of_desc(desc, action) {
if ((action != new_action) && action->name &&
- !strcmp(new_action->name, action->name)) {
- ret = 0;
- break;
- }
+ !strcmp(new_action->name, action->name))
+ return false;
}
- raw_spin_unlock_irqrestore(&desc->lock, flags);
- return ret;
+ return true;
}
void register_handler_proc(unsigned int irq, struct irqaction *action)
{
- char name [MAX_NAMELEN];
+ char name[MAX_NAMELEN];
struct irq_desc *desc = irq_to_desc(irq);
- if (!desc->dir || action->dir || !action->name ||
- !name_unique(irq, action))
+ if (!desc->dir || action->dir || !action->name || !name_unique(irq, action))
return;
snprintf(name, MAX_NAMELEN, "%s", action->name);
@@ -347,17 +339,16 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
* added, not when the descriptor is created, so multiple
* tasks might try to register at the same time.
*/
- mutex_lock(&register_lock);
+ guard(mutex)(&register_lock);
if (desc->dir)
- goto out_unlock;
-
- sprintf(name, "%d", irq);
+ return;
/* create /proc/irq/1234 */
+ sprintf(name, "%u", irq);
desc->dir = proc_mkdir(name, root_irq_dir);
if (!desc->dir)
- goto out_unlock;
+ return;
#ifdef CONFIG_SMP
umode_t umode = S_IRUGO;
@@ -366,31 +357,27 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
umode |= S_IWUSR;
/* create /proc/irq/<irq>/smp_affinity */
- proc_create_data("smp_affinity", umode, desc->dir,
- &irq_affinity_proc_ops, irqp);
+ proc_create_data("smp_affinity", umode, desc->dir, &irq_affinity_proc_ops, irqp);
/* create /proc/irq/<irq>/affinity_hint */
proc_create_single_data("affinity_hint", 0444, desc->dir,
- irq_affinity_hint_proc_show, irqp);
+ irq_affinity_hint_proc_show, irqp);
/* create /proc/irq/<irq>/smp_affinity_list */
proc_create_data("smp_affinity_list", umode, desc->dir,
&irq_affinity_list_proc_ops, irqp);
- proc_create_single_data("node", 0444, desc->dir, irq_node_proc_show,
- irqp);
+ proc_create_single_data("node", 0444, desc->dir, irq_node_proc_show, irqp);
# ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
proc_create_single_data("effective_affinity", 0444, desc->dir,
- irq_effective_aff_proc_show, irqp);
+ irq_effective_aff_proc_show, irqp);
proc_create_single_data("effective_affinity_list", 0444, desc->dir,
- irq_effective_aff_list_proc_show, irqp);
+ irq_effective_aff_list_proc_show, irqp);
# endif
#endif
proc_create_single_data("spurious", 0444, desc->dir,
- irq_spurious_proc_show, (void *)(long)irq);
+ irq_spurious_proc_show, (void *)(long)irq);
-out_unlock:
- mutex_unlock(&register_lock);
}
void unregister_irq_proc(unsigned int irq, struct irq_desc *desc)
@@ -468,7 +455,6 @@ int show_interrupts(struct seq_file *p, void *v)
int i = *(loff_t *) v, j;
struct irqaction *action;
struct irq_desc *desc;
- unsigned long flags;
if (i > ACTUAL_NR_IRQS)
return 0;
@@ -487,13 +473,13 @@ int show_interrupts(struct seq_file *p, void *v)
seq_putc(p, '\n');
}
- rcu_read_lock();
+ guard(rcu)();
desc = irq_to_desc(i);
if (!desc || irq_settings_is_hidden(desc))
- goto outsparse;
+ return 0;
if (!desc->action || irq_desc_is_chained(desc) || !desc->kstat_irqs)
- goto outsparse;
+ return 0;
seq_printf(p, "%*d:", prec, i);
for_each_online_cpu(j) {
@@ -503,7 +489,7 @@ int show_interrupts(struct seq_file *p, void *v)
}
seq_putc(p, ' ');
- raw_spin_lock_irqsave(&desc->lock, flags);
+ guard(raw_spinlock_irq)(&desc->lock);
if (desc->irq_data.chip) {
if (desc->irq_data.chip->irq_print_chip)
desc->irq_data.chip->irq_print_chip(&desc->irq_data, p);
@@ -532,9 +518,6 @@ int show_interrupts(struct seq_file *p, void *v)
}
seq_putc(p, '\n');
- raw_spin_unlock_irqrestore(&desc->lock, flags);
-outsparse:
- rcu_read_unlock();
return 0;
}
#endif
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index 1b7fa72968bd..ca9cc1b806a9 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -30,18 +30,17 @@ static DEFINE_RAW_SPINLOCK(irq_resend_lock);
*/
static void resend_irqs(struct tasklet_struct *unused)
{
- struct irq_desc *desc;
-
- raw_spin_lock_irq(&irq_resend_lock);
+ guard(raw_spinlock_irq)(&irq_resend_lock);
while (!hlist_empty(&irq_resend_list)) {
- desc = hlist_entry(irq_resend_list.first, struct irq_desc,
- resend_node);
+ struct irq_desc *desc;
+
+ desc = hlist_entry(irq_resend_list.first, struct irq_desc, resend_node);
hlist_del_init(&desc->resend_node);
+
raw_spin_unlock(&irq_resend_lock);
desc->handle_irq(desc);
raw_spin_lock(&irq_resend_lock);
}
- raw_spin_unlock_irq(&irq_resend_lock);
}
/* Tasklet to handle resend: */
@@ -75,19 +74,18 @@ static int irq_sw_resend(struct irq_desc *desc)
}
/* Add to resend_list and activate the softirq: */
- raw_spin_lock(&irq_resend_lock);
- if (hlist_unhashed(&desc->resend_node))
- hlist_add_head(&desc->resend_node, &irq_resend_list);
- raw_spin_unlock(&irq_resend_lock);
+ scoped_guard(raw_spinlock, &irq_resend_lock) {
+ if (hlist_unhashed(&desc->resend_node))
+ hlist_add_head(&desc->resend_node, &irq_resend_list);
+ }
tasklet_schedule(&resend_tasklet);
return 0;
}
void clear_irq_resend(struct irq_desc *desc)
{
- raw_spin_lock(&irq_resend_lock);
+ guard(raw_spinlock)(&irq_resend_lock);
hlist_del_init(&desc->resend_node);
- raw_spin_unlock(&irq_resend_lock);
}
void irq_resend_init(struct irq_desc *desc)
@@ -172,30 +170,24 @@ int check_irq_resend(struct irq_desc *desc, bool inject)
*/
int irq_inject_interrupt(unsigned int irq)
{
- struct irq_desc *desc;
- unsigned long flags;
- int err;
+ int err = -EINVAL;
/* Try the state injection hardware interface first */
if (!irq_set_irqchip_state(irq, IRQCHIP_STATE_PENDING, true))
return 0;
/* That failed, try via the resend mechanism */
- desc = irq_get_desc_buslock(irq, &flags, 0);
- if (!desc)
- return -EINVAL;
+ scoped_irqdesc_get_and_buslock(irq, 0) {
+ struct irq_desc *desc = scoped_irqdesc;
- /*
- * Only try to inject when the interrupt is:
- * - not NMI type
- * - activated
- */
- if (irq_is_nmi(desc) || !irqd_is_activated(&desc->irq_data))
- err = -EINVAL;
- else
- err = check_irq_resend(desc, true);
-
- irq_put_desc_busunlock(desc, flags);
+ /*
+ * Only try to inject when the interrupt is:
+ * - not NMI type
+ * - activated
+ */
+ if (!irq_is_nmi(desc) && irqd_is_activated(&desc->irq_data))
+ err = check_irq_resend(desc, true);
+ }
return err;
}
EXPORT_SYMBOL_GPL(irq_inject_interrupt);
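A short sketch of a debug or selftest caller for the injection helper above; the wrapper name and message are illustrative.

#include <linux/interrupt.h>
#include <linux/printk.h>

static void foo_selftest_kick(unsigned int irq)
{
	int err = irq_inject_interrupt(irq);

	if (err)
		pr_warn("foo: could not inject IRQ %u (%d)\n", irq, err);
}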
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 02b2daf07441..8f26982e7300 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -34,8 +34,9 @@ static atomic_t irq_poll_active;
* true and let the handler run.
*/
bool irq_wait_for_poll(struct irq_desc *desc)
- __must_hold(&desc->lock)
{
+ lockdep_assert_held(&desc->lock);
+
if (WARN_ONCE(irq_poll_cpu == smp_processor_id(),
"irq poll in progress on cpu %d for irq %d\n",
smp_processor_id(), desc->irq_data.irq))
@@ -59,37 +60,35 @@ bool irq_wait_for_poll(struct irq_desc *desc)
/*
* Recovery handler for misrouted interrupts.
*/
-static int try_one_irq(struct irq_desc *desc, bool force)
+static bool try_one_irq(struct irq_desc *desc, bool force)
{
- irqreturn_t ret = IRQ_NONE;
struct irqaction *action;
+ bool ret = false;
- raw_spin_lock(&desc->lock);
+ guard(raw_spinlock)(&desc->lock);
/*
* PER_CPU, nested thread interrupts and interrupts explicitly
* marked polled are excluded from polling.
*/
- if (irq_settings_is_per_cpu(desc) ||
- irq_settings_is_nested_thread(desc) ||
+ if (irq_settings_is_per_cpu(desc) || irq_settings_is_nested_thread(desc) ||
irq_settings_is_polled(desc))
- goto out;
+ return false;
/*
* Do not poll disabled interrupts unless the spurious
* disabled poller asks explicitly.
*/
if (irqd_irq_disabled(&desc->irq_data) && !force)
- goto out;
+ return false;
/*
* All handlers must agree on IRQF_SHARED, so we test just the
* first.
*/
action = desc->action;
- if (!action || !(action->flags & IRQF_SHARED) ||
- (action->flags & __IRQF_TIMER))
- goto out;
+ if (!action || !(action->flags & IRQF_SHARED) || (action->flags & __IRQF_TIMER))
+ return false;
/* Already running on another processor */
if (irqd_irq_inprogress(&desc->irq_data)) {
@@ -98,21 +97,19 @@ static int try_one_irq(struct irq_desc *desc, bool force)
* CPU to go looking for our mystery interrupt too
*/
desc->istate |= IRQS_PENDING;
- goto out;
+ return false;
}
/* Mark it poll in progress */
desc->istate |= IRQS_POLL_INPROGRESS;
do {
if (handle_irq_event(desc) == IRQ_HANDLED)
- ret = IRQ_HANDLED;
+ ret = true;
/* Make sure that there is still a valid action */
action = desc->action;
} while ((desc->istate & IRQS_PENDING) && action);
desc->istate &= ~IRQS_POLL_INPROGRESS;
-out:
- raw_spin_unlock(&desc->lock);
- return ret == IRQ_HANDLED;
+ return ret;
}
static int misrouted_irq(int irq)
@@ -157,8 +154,7 @@ static void poll_spurious_irqs(struct timer_list *unused)
continue;
/* Racy but it doesn't matter */
- state = desc->istate;
- barrier();
+ state = READ_ONCE(desc->istate);
if (!(state & IRQS_SPURIOUS_DISABLED))
continue;
@@ -168,8 +164,7 @@ static void poll_spurious_irqs(struct timer_list *unused)
}
out:
atomic_dec(&irq_poll_active);
- mod_timer(&poll_spurious_irq_timer,
- jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
+ mod_timer(&poll_spurious_irq_timer, jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
}
static inline int bad_action_ret(irqreturn_t action_ret)
@@ -193,17 +188,13 @@ static void __report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret)
{
unsigned int irq = irq_desc_get_irq(desc);
struct irqaction *action;
- unsigned long flags;
- if (bad_action_ret(action_ret)) {
- printk(KERN_ERR "irq event %d: bogus return value %x\n",
- irq, action_ret);
- } else {
- printk(KERN_ERR "irq %d: nobody cared (try booting with "
- "the \"irqpoll\" option)\n", irq);
- }
+ if (bad_action_ret(action_ret))
+ pr_err("irq event %d: bogus return value %x\n", irq, action_ret);
+ else
+ pr_err("irq %d: nobody cared (try booting with the \"irqpoll\" option)\n", irq);
dump_stack();
- printk(KERN_ERR "handlers:\n");
+ pr_err("handlers:\n");
/*
* We need to take desc->lock here. note_interrupt() is called
@@ -211,15 +202,13 @@ static void __report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret)
* with something else removing an action. It's ok to take
* desc->lock here. See synchronize_irq().
*/
- raw_spin_lock_irqsave(&desc->lock, flags);
+ guard(raw_spinlock_irqsave)(&desc->lock);
for_each_action_of_desc(desc, action) {
- printk(KERN_ERR "[<%p>] %ps", action->handler, action->handler);
+ pr_err("[<%p>] %ps", action->handler, action->handler);
if (action->thread_fn)
- printk(KERN_CONT " threaded [<%p>] %ps",
- action->thread_fn, action->thread_fn);
- printk(KERN_CONT "\n");
+ pr_cont(" threaded [<%p>] %ps", action->thread_fn, action->thread_fn);
+ pr_cont("\n");
}
- raw_spin_unlock_irqrestore(&desc->lock, flags);
}
static void report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret)
@@ -232,18 +221,17 @@ static void report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret)
}
}
-static inline int
-try_misrouted_irq(unsigned int irq, struct irq_desc *desc,
- irqreturn_t action_ret)
+static inline bool try_misrouted_irq(unsigned int irq, struct irq_desc *desc,
+ irqreturn_t action_ret)
{
struct irqaction *action;
if (!irqfixup)
- return 0;
+ return false;
/* We didn't actually handle the IRQ - see if it was misrouted? */
if (action_ret == IRQ_NONE)
- return 1;
+ return true;
/*
* But for 'irqfixup == 2' we also do it for handled interrupts if
@@ -251,19 +239,16 @@ try_misrouted_irq(unsigned int irq, struct irq_desc *desc,
* traditional PC timer interrupt.. Legacy)
*/
if (irqfixup < 2)
- return 0;
+ return false;
if (!irq)
- return 1;
+ return true;
/*
* Since we don't get the descriptor lock, "action" can
- * change under us. We don't really care, but we don't
- * want to follow a NULL pointer. So tell the compiler to
- * just load it once by using a barrier.
+ * change under us.
*/
- action = desc->action;
- barrier();
+ action = READ_ONCE(desc->action);
return action && (action->flags & IRQF_IRQPOLL);
}
@@ -273,8 +258,7 @@ void note_interrupt(struct irq_desc *desc, irqreturn_t action_ret)
{
unsigned int irq;
- if (desc->istate & IRQS_POLL_INPROGRESS ||
- irq_settings_is_polled(desc))
+ if (desc->istate & IRQS_POLL_INPROGRESS || irq_settings_is_polled(desc))
return;
if (bad_action_ret(action_ret)) {
@@ -420,13 +404,12 @@ void note_interrupt(struct irq_desc *desc, irqreturn_t action_ret)
/*
* Now kill the IRQ
*/
- printk(KERN_EMERG "Disabling IRQ #%d\n", irq);
+ pr_emerg("Disabling IRQ #%d\n", irq);
desc->istate |= IRQS_SPURIOUS_DISABLED;
desc->depth++;
irq_disable(desc);
- mod_timer(&poll_spurious_irq_timer,
- jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
+ mod_timer(&poll_spurious_irq_timer, jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
}
desc->irqs_unhandled = 0;
}
@@ -436,11 +419,9 @@ bool noirqdebug __read_mostly;
int noirqdebug_setup(char *str)
{
noirqdebug = 1;
- printk(KERN_INFO "IRQ lockup detection disabled\n");
-
+ pr_info("IRQ lockup detection disabled\n");
return 1;
}
-
__setup("noirqdebug", noirqdebug_setup);
module_param(noirqdebug, bool, 0644);
MODULE_PARM_DESC(noirqdebug, "Disable irq lockup detection when true");
@@ -452,12 +433,10 @@ static int __init irqfixup_setup(char *str)
return 1;
}
irqfixup = 1;
- printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n");
- printk(KERN_WARNING "This may impact system performance.\n");
-
+ pr_warn("Misrouted IRQ fixup support enabled.\n");
+ pr_warn("This may impact system performance.\n");
return 1;
}
-
__setup("irqfixup", irqfixup_setup);
module_param(irqfixup, int, 0644);
@@ -468,11 +447,8 @@ static int __init irqpoll_setup(char *str)
return 1;
}
irqfixup = 2;
- printk(KERN_WARNING "Misrouted IRQ fixup and polling support "
- "enabled\n");
- printk(KERN_WARNING "This may significantly impact system "
- "performance\n");
+ pr_warn("Misrouted IRQ fixup and polling support enabled\n");
+ pr_warn("This may significantly impact system performance\n");
return 1;
}
-
__setup("irqpoll", irqpoll_setup);
diff --git a/kernel/kcsan/kcsan_test.c b/kernel/kcsan/kcsan_test.c
index 6ce73cceaf53..c2871180edcc 100644
--- a/kernel/kcsan/kcsan_test.c
+++ b/kernel/kcsan/kcsan_test.c
@@ -1501,7 +1501,7 @@ static int access_thread(void *arg)
}
} while (!torture_must_stop());
timer_delete_sync(&timer);
- destroy_timer_on_stack(&timer);
+ timer_destroy_on_stack(&timer);
torture_kthread_stopping("access_thread");
return 0;
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 3e62b944c883..9c59fa480b0b 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -877,6 +877,60 @@ int kimage_load_segment(struct kimage *image,
return result;
}
+void *kimage_map_segment(struct kimage *image,
+ unsigned long addr, unsigned long size)
+{
+ unsigned long src_page_addr, dest_page_addr = 0;
+ unsigned long eaddr = addr + size;
+ kimage_entry_t *ptr, entry;
+ struct page **src_pages;
+ unsigned int npages;
+ void *vaddr = NULL;
+ int i;
+
+ /*
+ * Collect the source pages and map them in a contiguous VA range.
+ */
+ npages = PFN_UP(eaddr) - PFN_DOWN(addr);
+ src_pages = kmalloc_array(npages, sizeof(*src_pages), GFP_KERNEL);
+ if (!src_pages) {
+ pr_err("Could not allocate ima pages array.\n");
+ return NULL;
+ }
+
+ i = 0;
+ for_each_kimage_entry(image, ptr, entry) {
+ if (entry & IND_DESTINATION) {
+ dest_page_addr = entry & PAGE_MASK;
+ } else if (entry & IND_SOURCE) {
+ if (dest_page_addr >= addr && dest_page_addr < eaddr) {
+ src_page_addr = entry & PAGE_MASK;
+ src_pages[i++] =
+ virt_to_page(__va(src_page_addr));
+ if (i == npages)
+ break;
+ dest_page_addr += PAGE_SIZE;
+ }
+ }
+ }
+
+ /* Sanity check. */
+ WARN_ON(i < npages);
+
+ vaddr = vmap(src_pages, npages, VM_MAP, PAGE_KERNEL);
+ kfree(src_pages);
+
+ if (!vaddr)
+ pr_err("Could not map ima buffer.\n");
+
+ return vaddr;
+}
+
+void kimage_unmap_segment(void *segment_buffer)
+{
+ vunmap(segment_buffer);
+}
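An illustrative consumer of the new mapping helpers, e.g. code that patches data into an already-loaded kexec segment; the function name and error handling are assumptions, not part of this patch.

#include <linux/kexec.h>
#include <linux/string.h>

static int foo_write_segment(struct kimage *image, unsigned long addr,
			     unsigned long size, const void *src)
{
	void *dst = kimage_map_segment(image, addr, size);

	if (!dst)
		return -ENOMEM;

	memcpy(dst, src, size);
	kimage_unmap_segment(dst);
	return 0;
}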
+
struct kexec_load_limit {
/* Mutex protects the limit count. */
struct mutex mutex;
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index fba686487e3b..0adb645072aa 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -38,6 +38,21 @@ void set_kexec_sig_enforced(void)
}
#endif
+#ifdef CONFIG_IMA_KEXEC
+static bool check_ima_segment_index(struct kimage *image, int i)
+{
+	return image->is_ima_segment_index_set && i == image->ima_segment_index;
+}
+#else
+static bool check_ima_segment_index(struct kimage *image, int i)
+{
+ return false;
+}
+#endif
+
static int kexec_calculate_store_digests(struct kimage *image);
/* Maximum size in bytes for kernel/initrd files. */
@@ -186,6 +201,15 @@ kimage_validate_signature(struct kimage *image)
}
#endif
+static int kexec_post_load(struct kimage *image, unsigned long flags)
+{
+#ifdef CONFIG_IMA_KEXEC
+ if (!(flags & KEXEC_FILE_ON_CRASH))
+ ima_kexec_post_load(image);
+#endif
+ return machine_kexec_post_load(image);
+}
+
/*
* In file mode list of segments is prepared by kernel. Copy relevant
* data from user space, do error checking, prepare segment list
@@ -413,7 +437,7 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
kimage_terminate(image);
- ret = machine_kexec_post_load(image);
+ ret = kexec_post_load(image, flags);
if (ret)
goto out;
@@ -776,6 +800,13 @@ static int kexec_calculate_store_digests(struct kimage *image)
if (ksegment->kbuf == pi->purgatory_buf)
continue;
+ /*
+ * Skip the segment if ima_segment_index is set and matches
+ * the current index
+ */
+ if (check_ima_segment_index(image, i))
+ continue;
+
ret = crypto_shash_update(desc, ksegment->kbuf,
ksegment->bufsz);
if (ret)
diff --git a/kernel/panic.c b/kernel/panic.c
index a3889f38153d..047ea3215312 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -97,6 +97,36 @@ static const struct ctl_table kern_panic_table[] = {
},
#endif
{
+ .procname = "panic",
+ .data = &panic_timeout,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "panic_on_oops",
+ .data = &panic_on_oops,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "panic_print",
+ .data = &panic_print,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
+ },
+ {
+ .procname = "panic_on_warn",
+ .data = &panic_on_warn,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ {
.procname = "warn_limit",
.data = &warn_limit,
.maxlen = sizeof(warn_limit),
diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
index d9b7e2b38c7a..ea7995a25780 100644
--- a/kernel/power/energy_model.c
+++ b/kernel/power/energy_model.c
@@ -233,6 +233,10 @@ static int em_compute_costs(struct device *dev, struct em_perf_state *table,
unsigned long prev_cost = ULONG_MAX;
int i, ret;
+	/* This is needed only for CPUs used by EAS; skip other devices */
+ if (!_is_cpu_device(dev))
+ return 0;
+
/* Compute the cost of each performance state. */
for (i = nr_states - 1; i >= 0; i--) {
unsigned long power_res, cost;
@@ -698,10 +702,12 @@ static int em_recalc_and_update(struct device *dev, struct em_perf_domain *pd,
{
int ret;
- ret = em_compute_costs(dev, em_table->state, NULL, pd->nr_perf_states,
- pd->flags);
- if (ret)
- goto free_em_table;
+ if (!em_is_artificial(pd)) {
+ ret = em_compute_costs(dev, em_table->state, NULL,
+ pd->nr_perf_states, pd->flags);
+ if (ret)
+ goto free_em_table;
+ }
ret = em_dev_update_perf_domain(dev, em_table);
if (ret)
@@ -721,10 +727,24 @@ free_em_table:
* Adjustment of CPU performance values after boot, when all CPUs capacites
* are correctly calculated.
*/
-static void em_adjust_new_capacity(struct device *dev,
+static void em_adjust_new_capacity(unsigned int cpu, struct device *dev,
struct em_perf_domain *pd)
{
+ unsigned long cpu_capacity = arch_scale_cpu_capacity(cpu);
struct em_perf_table *em_table;
+ struct em_perf_state *table;
+ unsigned long em_max_perf;
+
+ rcu_read_lock();
+ table = em_perf_state_from_pd(pd);
+ em_max_perf = table[pd->nr_perf_states - 1].performance;
+ rcu_read_unlock();
+
+ if (em_max_perf == cpu_capacity)
+ return;
+
+ pr_debug("updating cpu%d cpu_cap=%lu old capacity=%lu\n", cpu,
+ cpu_capacity, em_max_perf);
em_table = em_table_dup(pd);
if (!em_table) {
@@ -737,12 +757,27 @@ static void em_adjust_new_capacity(struct device *dev,
em_recalc_and_update(dev, pd, em_table);
}
+/**
+ * em_adjust_cpu_capacity() - Adjust the EM for a CPU after a capacity update.
+ * @cpu: Target CPU.
+ *
+ * Adjust the existing EM for @cpu after a capacity update under the assumption
+ * that the capacity has been updated in the same way for all of the CPUs in
+ * the same perf domain.
+ */
+void em_adjust_cpu_capacity(unsigned int cpu)
+{
+ struct device *dev = get_cpu_device(cpu);
+ struct em_perf_domain *pd;
+
+ pd = em_pd_get(dev);
+ if (pd)
+ em_adjust_new_capacity(cpu, dev, pd);
+}
+
static void em_check_capacity_update(void)
{
cpumask_var_t cpu_done_mask;
- struct em_perf_state *table;
- struct em_perf_domain *pd;
- unsigned long cpu_capacity;
int cpu;
if (!zalloc_cpumask_var(&cpu_done_mask, GFP_KERNEL)) {
@@ -753,7 +788,7 @@ static void em_check_capacity_update(void)
/* Check if the CPU capacity has changed, then update the EM */
for_each_possible_cpu(cpu) {
struct cpufreq_policy *policy;
- unsigned long em_max_perf;
+ struct em_perf_domain *pd;
struct device *dev;
if (cpumask_test_cpu(cpu, cpu_done_mask))
@@ -776,24 +811,7 @@ static void em_check_capacity_update(void)
cpumask_or(cpu_done_mask, cpu_done_mask,
em_span_cpus(pd));
- cpu_capacity = arch_scale_cpu_capacity(cpu);
-
- rcu_read_lock();
- table = em_perf_state_from_pd(pd);
- em_max_perf = table[pd->nr_perf_states - 1].performance;
- rcu_read_unlock();
-
- /*
- * Check if the CPU capacity has been adjusted during boot
- * and trigger the update for new performance values.
- */
- if (em_max_perf == cpu_capacity)
- continue;
-
- pr_debug("updating cpu%d cpu_cap=%lu old capacity=%lu\n",
- cpu, cpu_capacity, em_max_perf);
-
- em_adjust_new_capacity(dev, pd);
+ em_adjust_new_capacity(cpu, dev, pd);
}
free_cpumask_var(cpu_done_mask);
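The reworked em_adjust_new_capacity() above returns early when the highest performance state already equals the CPU capacity, so the table is only rebuilt on a real change. A simplified sketch of that comparison, using a made-up perf_state type in place of the EM table:

#include <stdbool.h>
#include <stdio.h>

struct perf_state { unsigned long performance; };

/* Return true when a rebuild is actually needed. */
static bool em_needs_update(const struct perf_state *table, int nr_states,
			    unsigned long cpu_capacity)
{
	/* The highest state sits at the end of the table, as in the EM core. */
	unsigned long em_max_perf = table[nr_states - 1].performance;

	return em_max_perf != cpu_capacity;
}

int main(void)
{
	struct perf_state table[] = { { 300 }, { 700 }, { 1024 } };

	printf("cap=1024 -> update? %d\n", em_needs_update(table, 3, 1024));
	printf("cap=871  -> update? %d\n", em_needs_update(table, 3, 871));
	return 0;
}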
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 338c9917d4ee..519fb09de5e0 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -90,6 +90,11 @@ void hibernate_release(void)
atomic_inc(&hibernate_atomic);
}
+bool hibernation_in_progress(void)
+{
+ return !atomic_read(&hibernate_atomic);
+}
+
bool hibernation_available(void)
{
return nohibernate == 0 &&
@@ -133,10 +138,15 @@ bool system_entering_hibernation(void)
EXPORT_SYMBOL(system_entering_hibernation);
#ifdef CONFIG_PM_DEBUG
+static unsigned int pm_test_delay = 5;
+module_param(pm_test_delay, uint, 0644);
+MODULE_PARM_DESC(pm_test_delay,
+ "Number of seconds to wait before resuming from hibernation test");
static void hibernation_debug_sleep(void)
{
- pr_info("debug: Waiting for 5 seconds.\n");
- mdelay(5000);
+ pr_info("hibernation debug: Waiting for %d second(s).\n",
+ pm_test_delay);
+ mdelay(pm_test_delay * 1000);
}
static int hibernation_test(int level)
@@ -757,7 +767,7 @@ int hibernate(void)
* Query for the compression algorithm support if compression is enabled.
*/
if (!nocompress) {
- strscpy(hib_comp_algo, hibernate_compressor, sizeof(hib_comp_algo));
+ strscpy(hib_comp_algo, hibernate_compressor);
if (!crypto_has_acomp(hib_comp_algo, 0, CRYPTO_ALG_ASYNC)) {
pr_err("%s compression is not available\n", hib_comp_algo);
return -EOPNOTSUPP;
@@ -1013,9 +1023,9 @@ static int software_resume(void)
*/
if (!(swsusp_header_flags & SF_NOCOMPRESS_MODE)) {
if (swsusp_header_flags & SF_COMPRESSION_ALG_LZ4)
- strscpy(hib_comp_algo, COMPRESSION_ALGO_LZ4, sizeof(hib_comp_algo));
+ strscpy(hib_comp_algo, COMPRESSION_ALGO_LZ4);
else
- strscpy(hib_comp_algo, COMPRESSION_ALGO_LZO, sizeof(hib_comp_algo));
+ strscpy(hib_comp_algo, COMPRESSION_ALGO_LZO);
if (!crypto_has_acomp(hib_comp_algo, 0, CRYPTO_ALG_ASYNC)) {
pr_err("%s compression is not available\n", hib_comp_algo);
error = -EOPNOTSUPP;
@@ -1470,8 +1480,7 @@ static int hibernate_compressor_param_set(const char *compressor,
if (index >= 0) {
ret = param_set_copystring(comp_alg_enabled[index], kp);
if (!ret)
- strscpy(hib_comp_algo, comp_alg_enabled[index],
- sizeof(hib_comp_algo));
+ strscpy(hib_comp_algo, comp_alg_enabled[index]);
} else {
ret = index;
}
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 0b0e76324c43..3d484630505a 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -557,6 +557,10 @@ static int __init pm_debugfs_init(void)
late_initcall(pm_debugfs_init);
#endif /* CONFIG_DEBUG_FS */
+bool pm_sleep_transition_in_progress(void)
+{
+ return pm_suspend_in_progress() || hibernation_in_progress();
+}
#endif /* CONFIG_PM_SLEEP */
#ifdef CONFIG_PM_SLEEP_DEBUG
@@ -594,7 +598,7 @@ power_attr(pm_print_times);
static inline void pm_print_times_init(void)
{
- pm_print_times_enabled = !!initcall_debug;
+ pm_print_times_enabled = initcall_debug;
}
static ssize_t pm_wakeup_irq_show(struct kobject *kobj,
@@ -613,7 +617,7 @@ bool pm_debug_messages_on __read_mostly;
bool pm_debug_messages_should_print(void)
{
- return pm_debug_messages_on && pm_suspend_target_state != PM_SUSPEND_ON;
+ return pm_debug_messages_on && pm_sleep_transition_in_progress();
}
EXPORT_SYMBOL_GPL(pm_debug_messages_should_print);
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 2eb81662b8fa..cb1d71562002 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -75,10 +75,14 @@ extern void enable_restore_image_protection(void);
static inline void enable_restore_image_protection(void) {}
#endif /* CONFIG_STRICT_KERNEL_RWX */
+extern bool hibernation_in_progress(void);
+
#else /* !CONFIG_HIBERNATION */
static inline void hibernate_reserved_size_init(void) {}
static inline void hibernate_image_size_init(void) {}
+
+static inline bool hibernation_in_progress(void) { return false; }
#endif /* !CONFIG_HIBERNATION */
#define power_attr(_name) \
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 66ac067d9ae6..dc0dfc349f22 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -189,7 +189,7 @@ void thaw_processes(void)
oom_killer_enable();
- pr_info("Restarting tasks ... ");
+ pr_info("Restarting tasks: Starting\n");
__usermodehelper_set_disable_depth(UMH_FREEZING);
thaw_workqueues();
@@ -208,7 +208,7 @@ void thaw_processes(void)
usermodehelper_enable();
schedule();
- pr_cont("done.\n");
+ pr_info("Restarting tasks: Done\n");
trace_suspend_resume(TPS("thaw_processes"), 0, false);
}
@@ -217,7 +217,7 @@ void thaw_kernel_threads(void)
struct task_struct *g, *p;
pm_nosig_freezing = false;
- pr_info("Restarting kernel threads ... ");
+ pr_info("Restarting kernel threads ...\n");
thaw_workqueues();
@@ -229,5 +229,5 @@ void thaw_kernel_threads(void)
read_unlock(&tasklist_lock);
schedule();
- pr_cont("done.\n");
+ pr_info("Done restarting kernel threads.\n");
}
diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c
index 52571dcad768..4e941999a53b 100644
--- a/kernel/power/wakelock.c
+++ b/kernel/power/wakelock.c
@@ -49,6 +49,9 @@ ssize_t pm_show_wakelocks(char *buf, bool show_active)
len += sysfs_emit_at(buf, len, "%s ", wl->name);
}
+ if (len > 0)
+ --len;
+
len += sysfs_emit_at(buf, len, "\n");
mutex_unlock(&wakelocks_lock);
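The wakelock change above backs up over the trailing separator before appending the final newline, so the list reads "a b c\n" rather than "a b c \n". A small userspace sketch of the same buffer-offset trick using snprintf():

#include <stdio.h>
#include <string.h>

/* Emit "name1 name2 ...\n" without a trailing space before the newline. */
static int emit_names(char *buf, size_t size, const char **names, int nr)
{
	int len = 0;

	for (int i = 0; i < nr; i++)
		len += snprintf(buf + len, size - len, "%s ", names[i]);

	if (len > 0)
		--len;		/* drop the separator after the last entry */

	len += snprintf(buf + len, size - len, "\n");
	return len;
}

int main(void)
{
	const char *locks[] = { "wlan", "display", "audio" };
	char buf[64];

	int len = emit_names(buf, sizeof(buf), locks, 3);
	printf("%d bytes: \"%s\"", len, buf);
	return 0;
}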
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index b5db1ac32a3d..70ec0f21abc3 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -2371,7 +2371,7 @@ rcu_torture_reader(void *arg)
} while (!torture_must_stop());
if (irqreader && cur_ops->irq_capable) {
timer_delete_sync(&t);
- destroy_timer_on_stack(&t);
+ timer_destroy_on_stack(&t);
}
tick_dep_clear_task(current, TICK_DEP_BIT_RCU);
torture_kthread_stopping("rcu_torture_reader");
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 816f07f9d30f..461242ec958a 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -630,7 +630,7 @@ static const struct kobj_type sugov_tunables_ktype = {
/********************** cpufreq governor interface *********************/
-struct cpufreq_governor schedutil_gov;
+static struct cpufreq_governor schedutil_gov;
static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy)
{
@@ -909,7 +909,7 @@ static void sugov_limits(struct cpufreq_policy *policy)
WRITE_ONCE(sg_policy->limits_changed, true);
}
-struct cpufreq_governor schedutil_gov = {
+static struct cpufreq_governor schedutil_gov = {
.name = "schedutil",
.owner = THIS_MODULE,
.flags = CPUFREQ_GOV_DYNAMIC_SWITCHING,
@@ -927,4 +927,9 @@ struct cpufreq_governor *cpufreq_default_governor(void)
}
#endif
+bool sugov_is_governor(struct cpufreq_policy *policy)
+{
+ return policy->governor == &schedutil_gov;
+}
+
cpufreq_governor_init(schedutil_gov);
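Making schedutil_gov static and exporting sugov_is_governor() replaces an exported object with a narrow predicate. A plain-C sketch of that encapsulation, with invented governor/policy types:

#include <stdbool.h>
#include <stdio.h>

struct governor { const char *name; };
struct policy { const struct governor *governor; };

/* File-local instance; other files ask questions through a predicate. */
static const struct governor my_gov = { .name = "schedutil" };

bool policy_uses_my_gov(const struct policy *policy)
{
	return policy->governor == &my_gov;
}

int main(void)
{
	struct policy p = { .governor = &my_gov };
	struct policy q = { .governor = NULL };

	printf("%d %d\n", policy_uses_my_gov(&p), policy_uses_my_gov(&q));
	return 0;
}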
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index f5133249fd4d..2c41c78be61e 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -26,7 +26,7 @@ enum scx_consts {
* Iterating all tasks may take a while. Periodically drop
* scx_tasks_lock to avoid causing e.g. CSD and RCU stalls.
*/
- SCX_OPS_TASK_ITER_BATCH = 32,
+ SCX_TASK_ITER_BATCH = 32,
};
enum scx_exit_kind {
@@ -44,9 +44,9 @@ enum scx_exit_kind {
};
/*
- * An exit code can be specified when exiting with scx_bpf_exit() or
- * scx_ops_exit(), corresponding to exit_kind UNREG_BPF and UNREG_KERN
- * respectively. The codes are 64bit of the format:
+ * An exit code can be specified when exiting with scx_bpf_exit() or scx_exit(),
+ * corresponding to exit_kind UNREG_BPF and UNREG_KERN respectively. The codes
+ * are 64-bit values in the format:
*
* Bits: [63 .. 48 47 .. 32 31 .. 0]
* [ SYS ACT ] [ SYS RSN ] [ USR ]
@@ -163,16 +163,21 @@ enum scx_ops_flags {
/*
* CPU cgroup support flags
*/
- SCX_OPS_HAS_CGROUP_WEIGHT = 1LLU << 16, /* DEPRECATED, will be removed on 6.18 */
+ SCX_OPS_HAS_CGROUP_WEIGHT = 1LLU << 16, /* DEPRECATED, will be removed on 6.18 */
- SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE |
- SCX_OPS_ENQ_LAST |
- SCX_OPS_ENQ_EXITING |
- SCX_OPS_ENQ_MIGRATION_DISABLED |
- SCX_OPS_ALLOW_QUEUED_WAKEUP |
- SCX_OPS_SWITCH_PARTIAL |
- SCX_OPS_BUILTIN_IDLE_PER_NODE |
- SCX_OPS_HAS_CGROUP_WEIGHT,
+ SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE |
+ SCX_OPS_ENQ_LAST |
+ SCX_OPS_ENQ_EXITING |
+ SCX_OPS_ENQ_MIGRATION_DISABLED |
+ SCX_OPS_ALLOW_QUEUED_WAKEUP |
+ SCX_OPS_SWITCH_PARTIAL |
+ SCX_OPS_BUILTIN_IDLE_PER_NODE |
+ SCX_OPS_HAS_CGROUP_WEIGHT,
+
+ /* high 8 bits are internal, don't include in SCX_OPS_ALL_FLAGS */
+ __SCX_OPS_INTERNAL_MASK = 0xffLLU << 56,
+
+ SCX_OPS_HAS_CPU_PREEMPT = 1LLU << 56,
};
/* argument container for ops.init_task() */
@@ -368,6 +373,15 @@ struct sched_ext_ops {
* @running: A task is starting to run on its associated CPU
* @p: task starting to run
*
+ * Note that this callback may be called from a CPU other than the
+ * one the task is going to run on. This can happen when a task
+	 * property is changed (e.g., affinity), since scx_next_task_scx(),
+ * which triggers this callback, may run on a CPU different from
+ * the task's assigned CPU.
+ *
+ * Therefore, always use scx_bpf_task_cpu(@p) to determine the
+ * target CPU the task is going to use.
+ *
* See ->runnable() for explanation on the task state notifiers.
*/
void (*running)(struct task_struct *p);
@@ -377,6 +391,15 @@ struct sched_ext_ops {
* @p: task stopping to run
* @runnable: is task @p still runnable?
*
+ * Note that this callback may be called from a CPU other than the
+ * one the task was running on. This can happen when a task
+	 * property is changed (e.g., affinity), since dequeue_task_scx(),
+ * which triggers this callback, may run on a CPU different from
+ * the task's assigned CPU.
+ *
+ * Therefore, always use scx_bpf_task_cpu(@p) to retrieve the CPU
+ * the task was running on.
+ *
* See ->runnable() for explanation on the task state notifiers. If
* !@runnable, ->quiescent() will be invoked after this operation
* returns.
@@ -465,6 +488,7 @@ struct sched_ext_ops {
* idle CPU tracking and the following helpers become unavailable:
*
* - scx_bpf_select_cpu_dfl()
+ * - scx_bpf_select_cpu_and()
* - scx_bpf_test_and_clear_cpu_idle()
* - scx_bpf_pick_idle_cpu()
*
@@ -728,6 +752,9 @@ struct sched_ext_ops {
* BPF scheduler is enabled.
*/
char name[SCX_OPS_NAME_LEN];
+
+ /* internal use only, must be NULL */
+ void *priv;
};
enum scx_opi {
@@ -739,6 +766,98 @@ enum scx_opi {
SCX_OPI_END = SCX_OP_IDX(init),
};
+/*
+ * Collection of event counters. Event types are placed in descending order.
+ */
+struct scx_event_stats {
+ /*
+ * If ops.select_cpu() returns a CPU which can't be used by the task,
+ * the core scheduler code silently picks a fallback CPU.
+ */
+ s64 SCX_EV_SELECT_CPU_FALLBACK;
+
+ /*
+ * When dispatching to a local DSQ, the CPU may have gone offline in
+ * the meantime. In this case, the task is bounced to the global DSQ.
+ */
+ s64 SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE;
+
+ /*
+ * If SCX_OPS_ENQ_LAST is not set, the number of times that a task
+ * continued to run because there were no other tasks on the CPU.
+ */
+ s64 SCX_EV_DISPATCH_KEEP_LAST;
+
+ /*
+ * If SCX_OPS_ENQ_EXITING is not set, the number of times that a task
+ * is dispatched to a local DSQ when exiting.
+ */
+ s64 SCX_EV_ENQ_SKIP_EXITING;
+
+ /*
+ * If SCX_OPS_ENQ_MIGRATION_DISABLED is not set, the number of times a
+ * migration disabled task skips ops.enqueue() and is dispatched to its
+ * local DSQ.
+ */
+ s64 SCX_EV_ENQ_SKIP_MIGRATION_DISABLED;
+
+ /*
+ * Total number of times a task's time slice was refilled with the
+ * default value (SCX_SLICE_DFL).
+ */
+ s64 SCX_EV_REFILL_SLICE_DFL;
+
+ /*
+ * The total duration of bypass modes in nanoseconds.
+ */
+ s64 SCX_EV_BYPASS_DURATION;
+
+ /*
+ * The number of tasks dispatched in the bypassing mode.
+ */
+ s64 SCX_EV_BYPASS_DISPATCH;
+
+ /*
+ * The number of times the bypassing mode has been activated.
+ */
+ s64 SCX_EV_BYPASS_ACTIVATE;
+};
+
+struct scx_sched {
+ struct sched_ext_ops ops;
+ DECLARE_BITMAP(has_op, SCX_OPI_END);
+
+ /*
+ * Dispatch queues.
+ *
+ * The global DSQ (%SCX_DSQ_GLOBAL) is split per-node for scalability.
+ * This is to avoid live-locking in bypass mode where all tasks are
+ * dispatched to %SCX_DSQ_GLOBAL and all CPUs consume from it. If
+ * per-node split isn't sufficient, it can be further split.
+ */
+ struct rhashtable dsq_hash;
+ struct scx_dispatch_q **global_dsqs;
+
+ /*
+ * The event counters are in a per-CPU variable to minimize the
+ * accounting overhead. A system-wide view on the event counter is
+ * constructed when requested by scx_bpf_events().
+ */
+ struct scx_event_stats __percpu *event_stats_cpu;
+
+ bool warned_zero_slice;
+
+ atomic_t exit_kind;
+ struct scx_exit_info *exit_info;
+
+ struct kobject kobj;
+
+ struct kthread_worker *helper;
+ struct irq_work error_irq_work;
+ struct kthread_work disable_work;
+ struct rcu_work rcu_work;
+};
+
enum scx_wake_flags {
/* expose select WF_* flags as enums */
SCX_WAKE_FORK = WF_FORK,
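scx_sched.event_stats_cpu above keeps one scx_event_stats copy per CPU and only folds them together when a system-wide view is requested. A single-threaded userspace sketch of that split between a cheap hot-path increment and an on-demand aggregation (array indices stand in for CPUs):

#include <stdio.h>

#define NR_CPUS 4

struct event_stats {
	long long select_cpu_fallback;
	long long dispatch_keep_last;
};

/* One private copy per CPU; stands in for the kernel's percpu allocation. */
static struct event_stats stats_cpu[NR_CPUS];

/* Hot path: bump only the current CPU's copy, no shared cacheline. */
static void add_fallback(int cpu, long long cnt)
{
	stats_cpu[cpu].select_cpu_fallback += cnt;
}

/* Slow path: fold every CPU's copy into one system-wide view on request. */
static struct event_stats read_events(void)
{
	struct event_stats sum = { 0 };

	for (int cpu = 0; cpu < NR_CPUS; cpu++) {
		sum.select_cpu_fallback += stats_cpu[cpu].select_cpu_fallback;
		sum.dispatch_keep_last += stats_cpu[cpu].dispatch_keep_last;
	}
	return sum;
}

int main(void)
{
	add_fallback(0, 1);
	add_fallback(2, 3);
	stats_cpu[1].dispatch_keep_last += 2;

	struct event_stats sum = read_events();
	printf("fallback=%lld keep_last=%lld\n",
	       sum.select_cpu_fallback, sum.dispatch_keep_last);
	return 0;
}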
@@ -838,18 +957,18 @@ enum scx_tg_flags {
SCX_TG_INITED = 1U << 1,
};
-enum scx_ops_enable_state {
- SCX_OPS_ENABLING,
- SCX_OPS_ENABLED,
- SCX_OPS_DISABLING,
- SCX_OPS_DISABLED,
+enum scx_enable_state {
+ SCX_ENABLING,
+ SCX_ENABLED,
+ SCX_DISABLING,
+ SCX_DISABLED,
};
-static const char *scx_ops_enable_state_str[] = {
- [SCX_OPS_ENABLING] = "enabling",
- [SCX_OPS_ENABLED] = "enabled",
- [SCX_OPS_DISABLING] = "disabling",
- [SCX_OPS_DISABLED] = "disabled",
+static const char *scx_enable_state_str[] = {
+ [SCX_ENABLING] = "enabling",
+ [SCX_ENABLED] = "enabled",
+ [SCX_DISABLING] = "disabling",
+ [SCX_DISABLED] = "disabled",
};
/*
@@ -898,6 +1017,16 @@ enum scx_ops_state {
#define SCX_OPSS_QSEQ_MASK (~SCX_OPSS_STATE_MASK)
/*
+ * NOTE: sched_ext is in the process of growing multiple scheduler support and
+ * scx_root usage is in a transitional state. Naked dereferences are safe if the
+ * caller is one of the tasks attached to SCX and explicit RCU dereference is
+ * necessary otherwise. Naked scx_root dereferences trigger sparse warnings but
+ * are used as temporary markers to indicate that the dereferences need to be
+ * updated to point to the associated scheduler instances rather than scx_root.
+ */
+static struct scx_sched __rcu *scx_root;
+
+/*
* During exit, a task may schedule after losing its PIDs. When disabling the
* BPF scheduler, we need to be able to iterate tasks in every state to
* guarantee system safety. Maintain a dedicated task list which contains every
@@ -907,33 +1036,17 @@ static DEFINE_SPINLOCK(scx_tasks_lock);
static LIST_HEAD(scx_tasks);
/* ops enable/disable */
-static struct kthread_worker *scx_ops_helper;
-static DEFINE_MUTEX(scx_ops_enable_mutex);
-DEFINE_STATIC_KEY_FALSE(__scx_ops_enabled);
+static DEFINE_MUTEX(scx_enable_mutex);
+DEFINE_STATIC_KEY_FALSE(__scx_enabled);
DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem);
-static atomic_t scx_ops_enable_state_var = ATOMIC_INIT(SCX_OPS_DISABLED);
+static atomic_t scx_enable_state_var = ATOMIC_INIT(SCX_DISABLED);
static unsigned long scx_in_softlockup;
-static atomic_t scx_ops_breather_depth = ATOMIC_INIT(0);
-static int scx_ops_bypass_depth;
-static bool scx_ops_init_task_enabled;
+static atomic_t scx_breather_depth = ATOMIC_INIT(0);
+static int scx_bypass_depth;
+static bool scx_init_task_enabled;
static bool scx_switching_all;
DEFINE_STATIC_KEY_FALSE(__scx_switched_all);
-static struct sched_ext_ops scx_ops;
-static bool scx_warned_zero_slice;
-
-DEFINE_STATIC_KEY_FALSE(scx_ops_allow_queued_wakeup);
-static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_last);
-static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_exiting);
-static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_migration_disabled);
-static DEFINE_STATIC_KEY_FALSE(scx_ops_cpu_preempt);
-
-static struct static_key_false scx_has_op[SCX_OPI_END] =
- { [0 ... SCX_OPI_END-1] = STATIC_KEY_FALSE_INIT };
-
-static atomic_t scx_exit_kind = ATOMIC_INIT(SCX_EXIT_DONE);
-static struct scx_exit_info *scx_exit_info;
-
static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0);
static atomic_long_t scx_hotplug_seq = ATOMIC_LONG_INIT(0);
@@ -947,7 +1060,7 @@ static atomic_long_t scx_enable_seq = ATOMIC_LONG_INIT(0);
/*
* The maximum amount of time in jiffies that a task may be runnable without
* being scheduled on a CPU. If this timeout is exceeded, it will trigger
- * scx_ops_error().
+ * scx_error().
*/
static unsigned long scx_watchdog_timeout;
@@ -973,23 +1086,12 @@ static unsigned long __percpu *scx_kick_cpus_pnt_seqs;
*/
static DEFINE_PER_CPU(struct task_struct *, direct_dispatch_task);
-/*
- * Dispatch queues.
- *
- * The global DSQ (%SCX_DSQ_GLOBAL) is split per-node for scalability. This is
- * to avoid live-locking in bypass mode where all tasks are dispatched to
- * %SCX_DSQ_GLOBAL and all CPUs consume from it. If per-node split isn't
- * sufficient, it can be further split.
- */
-static struct scx_dispatch_q **global_dsqs;
-
static const struct rhashtable_params dsq_hash_params = {
.key_len = sizeof_field(struct scx_dispatch_q, id),
.key_offset = offsetof(struct scx_dispatch_q, id),
.head_offset = offsetof(struct scx_dispatch_q, hash_node),
};
-static struct rhashtable dsq_hash;
static LLIST_HEAD(dsqs_to_free);
/* dispatch buf */
@@ -1036,27 +1138,46 @@ static struct scx_dump_data scx_dump_data = {
/* /sys/kernel/sched_ext interface */
static struct kset *scx_kset;
-static struct kobject *scx_root_kobj;
#define CREATE_TRACE_POINTS
#include <trace/events/sched_ext.h>
static void process_ddsp_deferred_locals(struct rq *rq);
static void scx_bpf_kick_cpu(s32 cpu, u64 flags);
-static __printf(3, 4) void scx_ops_exit_kind(enum scx_exit_kind kind,
- s64 exit_code,
- const char *fmt, ...);
+static void scx_vexit(struct scx_sched *sch, enum scx_exit_kind kind,
+ s64 exit_code, const char *fmt, va_list args);
+
+static __printf(4, 5) void scx_exit(struct scx_sched *sch,
+ enum scx_exit_kind kind, s64 exit_code,
+ const char *fmt, ...)
+{
+ va_list args;
+
+ va_start(args, fmt);
+ scx_vexit(sch, kind, exit_code, fmt, args);
+ va_end(args);
+}
-#define scx_ops_error_kind(err, fmt, args...) \
- scx_ops_exit_kind((err), 0, fmt, ##args)
+static __printf(3, 4) void scx_kf_exit(enum scx_exit_kind kind, s64 exit_code,
+ const char *fmt, ...)
+{
+ struct scx_sched *sch;
+ va_list args;
-#define scx_ops_exit(code, fmt, args...) \
- scx_ops_exit_kind(SCX_EXIT_UNREG_KERN, (code), fmt, ##args)
+ rcu_read_lock();
+ sch = rcu_dereference(scx_root);
+ if (sch) {
+ va_start(args, fmt);
+ scx_vexit(sch, kind, exit_code, fmt, args);
+ va_end(args);
+ }
+ rcu_read_unlock();
+}
-#define scx_ops_error(fmt, args...) \
- scx_ops_error_kind(SCX_EXIT_ERROR, fmt, ##args)
+#define scx_error(sch, fmt, args...) scx_exit((sch), SCX_EXIT_ERROR, 0, fmt, ##args)
+#define scx_kf_error(fmt, args...) scx_kf_exit(SCX_EXIT_ERROR, 0, fmt, ##args)
-#define SCX_HAS_OP(op) static_branch_likely(&scx_has_op[SCX_OP_IDX(op)])
+#define SCX_HAS_OP(sch, op) test_bit(SCX_OP_IDX(op), (sch)->has_op)
static long jiffies_delta_msecs(unsigned long at, unsigned long now)
{
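scx_exit() above is a thin variadic wrapper that packages its arguments into a va_list and forwards them to scx_vexit(), which does the real work. The same wrapper shape in standalone C, using a made-up report_error()/report_verror() pair:

#include <stdarg.h>
#include <stdio.h>

/* The "v" variant does the real work on an already-started va_list... */
static void report_verror(int code, const char *fmt, va_list args)
{
	fprintf(stderr, "error %d: ", code);
	vfprintf(stderr, fmt, args);
	fputc('\n', stderr);
}

/* ...and the variadic front end only packages its arguments, like scx_exit(). */
static __attribute__((format(printf, 2, 3)))
void report_error(int code, const char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	report_verror(code, fmt, args);
	va_end(args);
}

int main(void)
{
	report_error(22, "invalid CPU %d for %s", 99, "task0");
	return 0;
}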
@@ -1086,12 +1207,14 @@ static bool u32_before(u32 a, u32 b)
static struct scx_dispatch_q *find_global_dsq(struct task_struct *p)
{
- return global_dsqs[cpu_to_node(task_cpu(p))];
+ struct scx_sched *sch = scx_root;
+
+ return sch->global_dsqs[cpu_to_node(task_cpu(p))];
}
-static struct scx_dispatch_q *find_user_dsq(u64 dsq_id)
+static struct scx_dispatch_q *find_user_dsq(struct scx_sched *sch, u64 dsq_id)
{
- return rhashtable_lookup_fast(&dsq_hash, &dsq_id, dsq_hash_params);
+ return rhashtable_lookup_fast(&sch->dsq_hash, &dsq_id, dsq_hash_params);
}
/*
@@ -1147,30 +1270,30 @@ static inline struct rq *scx_locked_rq(void)
return __this_cpu_read(locked_rq);
}
-#define SCX_CALL_OP(mask, op, rq, args...) \
+#define SCX_CALL_OP(sch, mask, op, rq, args...) \
do { \
update_locked_rq(rq); \
if (mask) { \
scx_kf_allow(mask); \
- scx_ops.op(args); \
+ (sch)->ops.op(args); \
scx_kf_disallow(mask); \
} else { \
- scx_ops.op(args); \
+ (sch)->ops.op(args); \
} \
update_locked_rq(NULL); \
} while (0)
-#define SCX_CALL_OP_RET(mask, op, rq, args...) \
+#define SCX_CALL_OP_RET(sch, mask, op, rq, args...) \
({ \
- __typeof__(scx_ops.op(args)) __ret; \
+ __typeof__((sch)->ops.op(args)) __ret; \
\
update_locked_rq(rq); \
if (mask) { \
scx_kf_allow(mask); \
- __ret = scx_ops.op(args); \
+ __ret = (sch)->ops.op(args); \
scx_kf_disallow(mask); \
} else { \
- __ret = scx_ops.op(args); \
+ __ret = (sch)->ops.op(args); \
} \
update_locked_rq(NULL); \
__ret; \
@@ -1187,31 +1310,31 @@ do { \
* scx_kf_allowed_on_arg_tasks() to test whether the invocation is allowed on
* the specific task.
*/
-#define SCX_CALL_OP_TASK(mask, op, rq, task, args...) \
+#define SCX_CALL_OP_TASK(sch, mask, op, rq, task, args...) \
do { \
BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \
current->scx.kf_tasks[0] = task; \
- SCX_CALL_OP(mask, op, rq, task, ##args); \
+ SCX_CALL_OP((sch), mask, op, rq, task, ##args); \
current->scx.kf_tasks[0] = NULL; \
} while (0)
-#define SCX_CALL_OP_TASK_RET(mask, op, rq, task, args...) \
+#define SCX_CALL_OP_TASK_RET(sch, mask, op, rq, task, args...) \
({ \
- __typeof__(scx_ops.op(task, ##args)) __ret; \
+ __typeof__((sch)->ops.op(task, ##args)) __ret; \
BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \
current->scx.kf_tasks[0] = task; \
- __ret = SCX_CALL_OP_RET(mask, op, rq, task, ##args); \
+ __ret = SCX_CALL_OP_RET((sch), mask, op, rq, task, ##args); \
current->scx.kf_tasks[0] = NULL; \
__ret; \
})
-#define SCX_CALL_OP_2TASKS_RET(mask, op, rq, task0, task1, args...) \
+#define SCX_CALL_OP_2TASKS_RET(sch, mask, op, rq, task0, task1, args...) \
({ \
- __typeof__(scx_ops.op(task0, task1, ##args)) __ret; \
+ __typeof__((sch)->ops.op(task0, task1, ##args)) __ret; \
BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \
current->scx.kf_tasks[0] = task0; \
current->scx.kf_tasks[1] = task1; \
- __ret = SCX_CALL_OP_RET(mask, op, rq, task0, task1, ##args); \
+ __ret = SCX_CALL_OP_RET((sch), mask, op, rq, task0, task1, ##args); \
current->scx.kf_tasks[0] = NULL; \
current->scx.kf_tasks[1] = NULL; \
__ret; \
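SCX_CALL_OP_RET() relies on a GNU statement expression plus __typeof__ so the macro can return whatever type the wrapped op returns. A minimal userspace sketch of that construct, assuming GCC/Clang extensions; the logging is only a stand-in for the rq/kf bookkeeping done by the real macro.

#include <stdio.h>

/*
 * GNU statement expression returning a value whose type is inferred with
 * __typeof__, the same shape as SCX_CALL_OP_RET() above.
 */
#define CALL_LOGGED(fn, ...)						\
({									\
	__typeof__(fn(__VA_ARGS__)) __ret;				\
									\
	fprintf(stderr, "calling %s\n", #fn);				\
	__ret = fn(__VA_ARGS__);					\
	fprintf(stderr, "%s returned\n", #fn);				\
	__ret;								\
})

static int add(int a, int b)
{
	return a + b;
}

int main(void)
{
	int v = CALL_LOGGED(add, 2, 3);

	printf("v=%d\n", v);
	return 0;
}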
@@ -1221,8 +1344,8 @@ do { \
static __always_inline bool scx_kf_allowed(u32 mask)
{
if (unlikely(!(current->scx.kf_mask & mask))) {
- scx_ops_error("kfunc with mask 0x%x called from an operation only allowing 0x%x",
- mask, current->scx.kf_mask);
+ scx_kf_error("kfunc with mask 0x%x called from an operation only allowing 0x%x",
+ mask, current->scx.kf_mask);
return false;
}
@@ -1235,13 +1358,13 @@ static __always_inline bool scx_kf_allowed(u32 mask)
*/
if (unlikely(highest_bit(mask) == SCX_KF_CPU_RELEASE &&
(current->scx.kf_mask & higher_bits(SCX_KF_CPU_RELEASE)))) {
- scx_ops_error("cpu_release kfunc called from a nested operation");
+ scx_kf_error("cpu_release kfunc called from a nested operation");
return false;
}
if (unlikely(highest_bit(mask) == SCX_KF_DISPATCH &&
(current->scx.kf_mask & higher_bits(SCX_KF_DISPATCH)))) {
- scx_ops_error("dispatch kfunc called from a nested operation");
+ scx_kf_error("dispatch kfunc called from a nested operation");
return false;
}
@@ -1257,18 +1380,13 @@ static __always_inline bool scx_kf_allowed_on_arg_tasks(u32 mask,
if (unlikely((p != current->scx.kf_tasks[0] &&
p != current->scx.kf_tasks[1]))) {
- scx_ops_error("called on a task not being operated on");
+ scx_kf_error("called on a task not being operated on");
return false;
}
return true;
}
-static bool scx_kf_allowed_if_unlocked(void)
-{
- return !current->scx.kf_mask;
-}
-
/**
* nldsq_next_task - Iterate to the next task in a non-local DSQ
* @dsq: user dsq being iterated
@@ -1436,15 +1554,15 @@ static void scx_task_iter_stop(struct scx_task_iter *iter)
* @iter: iterator to walk
*
* Visit the next task. See scx_task_iter_start() for details. Locks are dropped
- * and re-acquired every %SCX_OPS_TASK_ITER_BATCH iterations to avoid causing
- * stalls by holding scx_tasks_lock for too long.
+ * and re-acquired every %SCX_TASK_ITER_BATCH iterations to avoid causing stalls
+ * by holding scx_tasks_lock for too long.
*/
static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter)
{
struct list_head *cursor = &iter->cursor.tasks_node;
struct sched_ext_entity *pos;
- if (!(++iter->cnt % SCX_OPS_TASK_ITER_BATCH)) {
+ if (!(++iter->cnt % SCX_TASK_ITER_BATCH)) {
scx_task_iter_unlock(iter);
cond_resched();
scx_task_iter_relock(iter);
@@ -1515,91 +1633,29 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter)
return p;
}
-/*
- * Collection of event counters. Event types are placed in descending order.
- */
-struct scx_event_stats {
- /*
- * If ops.select_cpu() returns a CPU which can't be used by the task,
- * the core scheduler code silently picks a fallback CPU.
- */
- s64 SCX_EV_SELECT_CPU_FALLBACK;
-
- /*
- * When dispatching to a local DSQ, the CPU may have gone offline in
- * the meantime. In this case, the task is bounced to the global DSQ.
- */
- s64 SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE;
-
- /*
- * If SCX_OPS_ENQ_LAST is not set, the number of times that a task
- * continued to run because there were no other tasks on the CPU.
- */
- s64 SCX_EV_DISPATCH_KEEP_LAST;
-
- /*
- * If SCX_OPS_ENQ_EXITING is not set, the number of times that a task
- * is dispatched to a local DSQ when exiting.
- */
- s64 SCX_EV_ENQ_SKIP_EXITING;
-
- /*
- * If SCX_OPS_ENQ_MIGRATION_DISABLED is not set, the number of times a
- * migration disabled task skips ops.enqueue() and is dispatched to its
- * local DSQ.
- */
- s64 SCX_EV_ENQ_SKIP_MIGRATION_DISABLED;
-
- /*
- * The total number of tasks enqueued (or pick_task-ed) with a
- * default time slice (SCX_SLICE_DFL).
- */
- s64 SCX_EV_ENQ_SLICE_DFL;
-
- /*
- * The total duration of bypass modes in nanoseconds.
- */
- s64 SCX_EV_BYPASS_DURATION;
-
- /*
- * The number of tasks dispatched in the bypassing mode.
- */
- s64 SCX_EV_BYPASS_DISPATCH;
-
- /*
- * The number of times the bypassing mode has been activated.
- */
- s64 SCX_EV_BYPASS_ACTIVATE;
-};
-
-/*
- * The event counter is organized by a per-CPU variable to minimize the
- * accounting overhead without synchronization. A system-wide view on the
- * event counter is constructed when requested by scx_bpf_get_event_stat().
- */
-static DEFINE_PER_CPU(struct scx_event_stats, event_stats_cpu);
-
/**
* scx_add_event - Increase an event counter for 'name' by 'cnt'
+ * @sch: scx_sched to account events for
* @name: an event name defined in struct scx_event_stats
 * @cnt: the number of times the event occurred
*
* This can be used when preemption is not disabled.
*/
-#define scx_add_event(name, cnt) do { \
- this_cpu_add(event_stats_cpu.name, cnt); \
- trace_sched_ext_event(#name, cnt); \
+#define scx_add_event(sch, name, cnt) do { \
+ this_cpu_add((sch)->event_stats_cpu->name, (cnt)); \
+ trace_sched_ext_event(#name, (cnt)); \
} while(0)
/**
* __scx_add_event - Increase an event counter for 'name' by 'cnt'
+ * @sch: scx_sched to account events for
* @name: an event name defined in struct scx_event_stats
 * @cnt: the number of times the event occurred
*
* This should be used only when preemption is disabled.
*/
-#define __scx_add_event(name, cnt) do { \
- __this_cpu_add(event_stats_cpu.name, cnt); \
+#define __scx_add_event(sch, name, cnt) do { \
+ __this_cpu_add((sch)->event_stats_cpu->name, (cnt)); \
trace_sched_ext_event(#name, cnt); \
} while(0)
@@ -1624,25 +1680,25 @@ static DEFINE_PER_CPU(struct scx_event_stats, event_stats_cpu);
} while (0)
-static void scx_bpf_events(struct scx_event_stats *events, size_t events__sz);
+static void scx_read_events(struct scx_sched *sch,
+ struct scx_event_stats *events);
-static enum scx_ops_enable_state scx_ops_enable_state(void)
+static enum scx_enable_state scx_enable_state(void)
{
- return atomic_read(&scx_ops_enable_state_var);
+ return atomic_read(&scx_enable_state_var);
}
-static enum scx_ops_enable_state
-scx_ops_set_enable_state(enum scx_ops_enable_state to)
+static enum scx_enable_state scx_set_enable_state(enum scx_enable_state to)
{
- return atomic_xchg(&scx_ops_enable_state_var, to);
+ return atomic_xchg(&scx_enable_state_var, to);
}
-static bool scx_ops_tryset_enable_state(enum scx_ops_enable_state to,
- enum scx_ops_enable_state from)
+static bool scx_tryset_enable_state(enum scx_enable_state to,
+ enum scx_enable_state from)
{
int from_v = from;
- return atomic_try_cmpxchg(&scx_ops_enable_state_var, &from_v, to);
+ return atomic_try_cmpxchg(&scx_enable_state_var, &from_v, to);
}
static bool scx_rq_bypassing(struct rq *rq)
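scx_set_enable_state()/scx_tryset_enable_state() above implement a small state machine on one atomic variable: an unconditional exchange and a compare-and-swap that only succeeds from an expected state. A C11-atomics sketch of the same pair (simplified: one variable, default memory ordering):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

enum state { ENABLING, ENABLED, DISABLING, DISABLED };

static _Atomic int state_var = DISABLED;

/* Unconditional transition; returns the previous state. */
static int set_state(int to)
{
	return atomic_exchange(&state_var, to);
}

/* Only advance if nobody changed the state underneath us. */
static bool tryset_state(int to, int from)
{
	return atomic_compare_exchange_strong(&state_var, &from, to);
}

int main(void)
{
	printf("old=%d\n", set_state(ENABLING));
	printf("ENABLING->ENABLED:  %d\n", tryset_state(ENABLED, ENABLING));
	printf("ENABLING->DISABLED: %d\n", tryset_state(DISABLED, ENABLING));
	return 0;
}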
@@ -1667,8 +1723,14 @@ static void wait_ops_state(struct task_struct *p, unsigned long opss)
} while (atomic_long_read_acquire(&p->scx.ops_state) == opss);
}
+static inline bool __cpu_valid(s32 cpu)
+{
+ return likely(cpu >= 0 && cpu < nr_cpu_ids && cpu_possible(cpu));
+}
+
/**
- * ops_cpu_valid - Verify a cpu number
+ * ops_cpu_valid - Verify a cpu number, to be used on ops input args
+ * @sch: scx_sched to abort on error
* @cpu: cpu number which came from a BPF ops
* @where: extra information reported on error
*
@@ -1676,35 +1738,52 @@ static void wait_ops_state(struct task_struct *p, unsigned long opss)
* Verify that it is in range and one of the possible cpus. If invalid, trigger
* an ops error.
*/
-static bool ops_cpu_valid(s32 cpu, const char *where)
+static bool ops_cpu_valid(struct scx_sched *sch, s32 cpu, const char *where)
{
- if (likely(cpu >= 0 && cpu < nr_cpu_ids && cpu_possible(cpu))) {
+ if (__cpu_valid(cpu)) {
return true;
} else {
- scx_ops_error("invalid CPU %d%s%s", cpu,
- where ? " " : "", where ?: "");
+ scx_error(sch, "invalid CPU %d%s%s", cpu, where ? " " : "", where ?: "");
+ return false;
+ }
+}
+
+/**
+ * kf_cpu_valid - Verify a CPU number, to be used on kfunc input args
+ * @cpu: cpu number which came from a BPF ops
+ * @where: extra information reported on error
+ *
+ * The same as ops_cpu_valid() but @sch is implicit.
+ */
+static bool kf_cpu_valid(u32 cpu, const char *where)
+{
+ if (__cpu_valid(cpu)) {
+ return true;
+ } else {
+ scx_kf_error("invalid CPU %d%s%s", cpu, where ? " " : "", where ?: "");
return false;
}
}
/**
* ops_sanitize_err - Sanitize a -errno value
+ * @sch: scx_sched to error out on error
* @ops_name: operation to blame on failure
* @err: -errno value to sanitize
*
- * Verify @err is a valid -errno. If not, trigger scx_ops_error() and return
+ * Verify @err is a valid -errno. If not, trigger scx_error() and return
* -%EPROTO. This is necessary because returning a rogue -errno up the chain can
 * cause misbehaviors. For example, a large negative return from
* ops.init_task() triggers an oops when passed up the call chain because the
* value fails IS_ERR() test after being encoded with ERR_PTR() and then is
* handled as a pointer.
*/
-static int ops_sanitize_err(const char *ops_name, s32 err)
+static int ops_sanitize_err(struct scx_sched *sch, const char *ops_name, s32 err)
{
if (err < 0 && err >= -MAX_ERRNO)
return err;
- scx_ops_error("ops.%s() returned an invalid errno %d", ops_name, err);
+ scx_error(sch, "ops.%s() returned an invalid errno %d", ops_name, err);
return -EPROTO;
}
@@ -1811,7 +1890,7 @@ static void touch_core_sched_dispatch(struct rq *rq, struct task_struct *p)
lockdep_assert_rq_held(rq);
#ifdef CONFIG_SCHED_CORE
- if (SCX_HAS_OP(core_sched_before))
+ if (unlikely(SCX_HAS_OP(scx_root, core_sched_before)))
touch_core_sched(rq, p);
#endif
}
@@ -1849,8 +1928,14 @@ static void dsq_mod_nr(struct scx_dispatch_q *dsq, s32 delta)
WRITE_ONCE(dsq->nr, dsq->nr + delta);
}
-static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p,
- u64 enq_flags)
+static void refill_task_slice_dfl(struct task_struct *p)
+{
+ p->scx.slice = SCX_SLICE_DFL;
+ __scx_add_event(scx_root, SCX_EV_REFILL_SLICE_DFL, 1);
+}
+
+static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq,
+ struct task_struct *p, u64 enq_flags)
{
bool is_local = dsq->id == SCX_DSQ_LOCAL;
@@ -1861,7 +1946,7 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p,
if (!is_local) {
raw_spin_lock(&dsq->lock);
if (unlikely(dsq->id == SCX_DSQ_INVALID)) {
- scx_ops_error("attempting to dispatch to a destroyed dsq");
+ scx_error(sch, "attempting to dispatch to a destroyed dsq");
/* fall back to the global dsq */
raw_spin_unlock(&dsq->lock);
dsq = find_global_dsq(p);
@@ -1878,7 +1963,7 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p,
* disallow any internal DSQ from doing vtime ordering of
* tasks.
*/
- scx_ops_error("cannot use vtime ordering for built-in DSQs");
+ scx_error(sch, "cannot use vtime ordering for built-in DSQs");
enq_flags &= ~SCX_ENQ_DSQ_PRIQ;
}
@@ -1892,8 +1977,8 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p,
*/
if (unlikely(RB_EMPTY_ROOT(&dsq->priq) &&
nldsq_next_task(dsq, NULL, false)))
- scx_ops_error("DSQ ID 0x%016llx already had FIFO-enqueued tasks",
- dsq->id);
+ scx_error(sch, "DSQ ID 0x%016llx already had FIFO-enqueued tasks",
+ dsq->id);
p->scx.dsq_flags |= SCX_TASK_DSQ_ON_PRIQ;
rb_add(&p->scx.dsq_priq, &dsq->priq, scx_dsq_priq_less);
@@ -1914,8 +1999,8 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p,
} else {
/* a FIFO DSQ shouldn't be using PRIQ enqueuing */
if (unlikely(!RB_EMPTY_ROOT(&dsq->priq)))
- scx_ops_error("DSQ ID 0x%016llx already had PRIQ-enqueued tasks",
- dsq->id);
+ scx_error(sch, "DSQ ID 0x%016llx already had PRIQ-enqueued tasks",
+ dsq->id);
if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT))
list_add(&p->scx.dsq_list.node, &dsq->list);
@@ -2030,7 +2115,8 @@ static void dispatch_dequeue(struct rq *rq, struct task_struct *p)
raw_spin_unlock(&dsq->lock);
}
-static struct scx_dispatch_q *find_dsq_for_dispatch(struct rq *rq, u64 dsq_id,
+static struct scx_dispatch_q *find_dsq_for_dispatch(struct scx_sched *sch,
+ struct rq *rq, u64 dsq_id,
struct task_struct *p)
{
struct scx_dispatch_q *dsq;
@@ -2041,7 +2127,7 @@ static struct scx_dispatch_q *find_dsq_for_dispatch(struct rq *rq, u64 dsq_id,
if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) {
s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK;
- if (!ops_cpu_valid(cpu, "in SCX_DSQ_LOCAL_ON dispatch verdict"))
+ if (!ops_cpu_valid(sch, cpu, "in SCX_DSQ_LOCAL_ON dispatch verdict"))
return find_global_dsq(p);
return &cpu_rq(cpu)->scx.local_dsq;
@@ -2050,11 +2136,11 @@ static struct scx_dispatch_q *find_dsq_for_dispatch(struct rq *rq, u64 dsq_id,
if (dsq_id == SCX_DSQ_GLOBAL)
dsq = find_global_dsq(p);
else
- dsq = find_user_dsq(dsq_id);
+ dsq = find_user_dsq(sch, dsq_id);
if (unlikely(!dsq)) {
- scx_ops_error("non-existent DSQ 0x%llx for %s[%d]",
- dsq_id, p->comm, p->pid);
+ scx_error(sch, "non-existent DSQ 0x%llx for %s[%d]",
+ dsq_id, p->comm, p->pid);
return find_global_dsq(p);
}
@@ -2075,12 +2161,12 @@ static void mark_direct_dispatch(struct task_struct *ddsp_task,
/* @p must match the task on the enqueue path */
if (unlikely(p != ddsp_task)) {
if (IS_ERR(ddsp_task))
- scx_ops_error("%s[%d] already direct-dispatched",
- p->comm, p->pid);
+ scx_kf_error("%s[%d] already direct-dispatched",
+ p->comm, p->pid);
else
- scx_ops_error("scheduling for %s[%d] but trying to direct-dispatch %s[%d]",
- ddsp_task->comm, ddsp_task->pid,
- p->comm, p->pid);
+ scx_kf_error("scheduling for %s[%d] but trying to direct-dispatch %s[%d]",
+ ddsp_task->comm, ddsp_task->pid,
+ p->comm, p->pid);
return;
}
@@ -2091,11 +2177,12 @@ static void mark_direct_dispatch(struct task_struct *ddsp_task,
p->scx.ddsp_enq_flags = enq_flags;
}
-static void direct_dispatch(struct task_struct *p, u64 enq_flags)
+static void direct_dispatch(struct scx_sched *sch, struct task_struct *p,
+ u64 enq_flags)
{
struct rq *rq = task_rq(p);
struct scx_dispatch_q *dsq =
- find_dsq_for_dispatch(rq, p->scx.ddsp_dsq_id, p);
+ find_dsq_for_dispatch(sch, rq, p->scx.ddsp_dsq_id, p);
touch_core_sched_dispatch(rq, p);
@@ -2136,7 +2223,8 @@ static void direct_dispatch(struct task_struct *p, u64 enq_flags)
return;
}
- dispatch_enqueue(dsq, p, p->scx.ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS);
+ dispatch_enqueue(sch, dsq, p,
+ p->scx.ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS);
}
static bool scx_rq_online(struct rq *rq)
@@ -2154,6 +2242,7 @@ static bool scx_rq_online(struct rq *rq)
static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
int sticky_cpu)
{
+ struct scx_sched *sch = scx_root;
struct task_struct **ddsp_taskp;
unsigned long qseq;
@@ -2172,7 +2261,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
goto local;
if (scx_rq_bypassing(rq)) {
- __scx_add_event(SCX_EV_BYPASS_DISPATCH, 1);
+ __scx_add_event(sch, SCX_EV_BYPASS_DISPATCH, 1);
goto global;
}
@@ -2180,20 +2269,20 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
goto direct;
/* see %SCX_OPS_ENQ_EXITING */
- if (!static_branch_unlikely(&scx_ops_enq_exiting) &&
+ if (!(sch->ops.flags & SCX_OPS_ENQ_EXITING) &&
unlikely(p->flags & PF_EXITING)) {
- __scx_add_event(SCX_EV_ENQ_SKIP_EXITING, 1);
+ __scx_add_event(sch, SCX_EV_ENQ_SKIP_EXITING, 1);
goto local;
}
/* see %SCX_OPS_ENQ_MIGRATION_DISABLED */
- if (!static_branch_unlikely(&scx_ops_enq_migration_disabled) &&
+ if (!(sch->ops.flags & SCX_OPS_ENQ_MIGRATION_DISABLED) &&
is_migration_disabled(p)) {
- __scx_add_event(SCX_EV_ENQ_SKIP_MIGRATION_DISABLED, 1);
+ __scx_add_event(sch, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED, 1);
goto local;
}
- if (!SCX_HAS_OP(enqueue))
+ if (unlikely(!SCX_HAS_OP(sch, enqueue)))
goto global;
/* DSQ bypass didn't trigger, enqueue on the BPF scheduler */
@@ -2206,7 +2295,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
WARN_ON_ONCE(*ddsp_taskp);
*ddsp_taskp = p;
- SCX_CALL_OP_TASK(SCX_KF_ENQUEUE, enqueue, rq, p, enq_flags);
+ SCX_CALL_OP_TASK(sch, SCX_KF_ENQUEUE, enqueue, rq, p, enq_flags);
*ddsp_taskp = NULL;
if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID)
@@ -2220,7 +2309,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
return;
direct:
- direct_dispatch(p, enq_flags);
+ direct_dispatch(sch, p, enq_flags);
return;
local:
@@ -2230,17 +2319,15 @@ local:
* higher priority it becomes from scx_prio_less()'s POV.
*/
touch_core_sched(rq, p);
- p->scx.slice = SCX_SLICE_DFL;
- __scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1);
+ refill_task_slice_dfl(p);
local_norefill:
- dispatch_enqueue(&rq->scx.local_dsq, p, enq_flags);
+ dispatch_enqueue(sch, &rq->scx.local_dsq, p, enq_flags);
return;
global:
touch_core_sched(rq, p); /* see the comment in local: */
- p->scx.slice = SCX_SLICE_DFL;
- __scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1);
- dispatch_enqueue(find_global_dsq(p), p, enq_flags);
+ refill_task_slice_dfl(p);
+ dispatch_enqueue(sch, find_global_dsq(p), p, enq_flags);
}
static bool task_runnable(const struct task_struct *p)
@@ -2258,7 +2345,7 @@ static void set_task_runnable(struct rq *rq, struct task_struct *p)
}
/*
- * list_add_tail() must be used. scx_ops_bypass() depends on tasks being
+ * list_add_tail() must be used. scx_bypass() depends on tasks being
* appended to the runnable_list.
*/
list_add_tail(&p->scx.runnable_node, &rq->scx.runnable_list);
@@ -2273,6 +2360,7 @@ static void clr_task_runnable(struct task_struct *p, bool reset_runnable_at)
static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags)
{
+ struct scx_sched *sch = scx_root;
int sticky_cpu = p->scx.sticky_cpu;
if (enq_flags & ENQUEUE_WAKEUP)
@@ -2302,8 +2390,8 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags
rq->scx.nr_running++;
add_nr_running(rq, 1);
- if (SCX_HAS_OP(runnable) && !task_on_rq_migrating(p))
- SCX_CALL_OP_TASK(SCX_KF_REST, runnable, rq, p, enq_flags);
+ if (SCX_HAS_OP(sch, runnable) && !task_on_rq_migrating(p))
+ SCX_CALL_OP_TASK(sch, SCX_KF_REST, runnable, rq, p, enq_flags);
if (enq_flags & SCX_ENQ_WAKEUP)
touch_core_sched(rq, p);
@@ -2314,11 +2402,12 @@ out:
if ((enq_flags & SCX_ENQ_CPU_SELECTED) &&
unlikely(cpu_of(rq) != p->scx.selected_cpu))
- __scx_add_event(SCX_EV_SELECT_CPU_FALLBACK, 1);
+ __scx_add_event(sch, SCX_EV_SELECT_CPU_FALLBACK, 1);
}
static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags)
{
+ struct scx_sched *sch = scx_root;
unsigned long opss;
/* dequeue is always temporary, don't reset runnable_at */
@@ -2337,8 +2426,9 @@ static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags)
*/
BUG();
case SCX_OPSS_QUEUED:
- if (SCX_HAS_OP(dequeue))
- SCX_CALL_OP_TASK(SCX_KF_REST, dequeue, rq, p, deq_flags);
+ if (SCX_HAS_OP(sch, dequeue))
+ SCX_CALL_OP_TASK(sch, SCX_KF_REST, dequeue, rq,
+ p, deq_flags);
if (atomic_long_try_cmpxchg(&p->scx.ops_state, &opss,
SCX_OPSS_NONE))
@@ -2366,6 +2456,8 @@ static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags)
static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags)
{
+ struct scx_sched *sch = scx_root;
+
if (!(p->scx.flags & SCX_TASK_QUEUED)) {
WARN_ON_ONCE(task_runnable(p));
return true;
@@ -2385,13 +2477,13 @@ static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags
* information meaningful to the BPF scheduler and can be suppressed by
* skipping the callbacks if the task is !QUEUED.
*/
- if (SCX_HAS_OP(stopping) && task_current(rq, p)) {
+ if (SCX_HAS_OP(sch, stopping) && task_current(rq, p)) {
update_curr_scx(rq);
- SCX_CALL_OP_TASK(SCX_KF_REST, stopping, rq, p, false);
+ SCX_CALL_OP_TASK(sch, SCX_KF_REST, stopping, rq, p, false);
}
- if (SCX_HAS_OP(quiescent) && !task_on_rq_migrating(p))
- SCX_CALL_OP_TASK(SCX_KF_REST, quiescent, rq, p, deq_flags);
+ if (SCX_HAS_OP(sch, quiescent) && !task_on_rq_migrating(p))
+ SCX_CALL_OP_TASK(sch, SCX_KF_REST, quiescent, rq, p, deq_flags);
if (deq_flags & SCX_DEQ_SLEEP)
p->scx.flags |= SCX_TASK_DEQD_FOR_SLEEP;
@@ -2408,20 +2500,23 @@ static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags
static void yield_task_scx(struct rq *rq)
{
+ struct scx_sched *sch = scx_root;
struct task_struct *p = rq->curr;
- if (SCX_HAS_OP(yield))
- SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, yield, rq, p, NULL);
+ if (SCX_HAS_OP(sch, yield))
+ SCX_CALL_OP_2TASKS_RET(sch, SCX_KF_REST, yield, rq, p, NULL);
else
p->scx.slice = 0;
}
static bool yield_to_task_scx(struct rq *rq, struct task_struct *to)
{
+ struct scx_sched *sch = scx_root;
struct task_struct *from = rq->curr;
- if (SCX_HAS_OP(yield))
- return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, yield, rq, from, to);
+ if (SCX_HAS_OP(sch, yield))
+ return SCX_CALL_OP_2TASKS_RET(sch, SCX_KF_REST, yield, rq,
+ from, to);
else
return false;
}
@@ -2501,7 +2596,8 @@ static void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags,
*
* The caller must ensure that @p and @rq are on different CPUs.
*/
-static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq,
+static bool task_can_run_on_remote_rq(struct scx_sched *sch,
+ struct task_struct *p, struct rq *rq,
bool enforce)
{
int cpu = cpu_of(rq);
@@ -2522,8 +2618,8 @@ static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq,
*/
if (unlikely(is_migration_disabled(p))) {
if (enforce)
- scx_ops_error("SCX_DSQ_LOCAL[_ON] cannot move migration disabled %s[%d] from CPU %d to %d",
- p->comm, p->pid, task_cpu(p), cpu);
+ scx_error(sch, "SCX_DSQ_LOCAL[_ON] cannot move migration disabled %s[%d] from CPU %d to %d",
+ p->comm, p->pid, task_cpu(p), cpu);
return false;
}
@@ -2535,14 +2631,15 @@ static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq,
*/
if (!task_allowed_on_cpu(p, cpu)) {
if (enforce)
- scx_ops_error("SCX_DSQ_LOCAL[_ON] target CPU %d not allowed for %s[%d]",
- cpu, p->comm, p->pid);
+ scx_error(sch, "SCX_DSQ_LOCAL[_ON] target CPU %d not allowed for %s[%d]",
+ cpu, p->comm, p->pid);
return false;
}
if (!scx_rq_online(rq)) {
if (enforce)
- __scx_add_event(SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE, 1);
+ __scx_add_event(scx_root,
+ SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE, 1);
return false;
}
@@ -2614,12 +2711,13 @@ static bool consume_remote_task(struct rq *this_rq, struct task_struct *p,
}
#else /* CONFIG_SMP */
static inline void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags, struct rq *src_rq, struct rq *dst_rq) { WARN_ON_ONCE(1); }
-static inline bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, bool enforce) { return false; }
+static inline bool task_can_run_on_remote_rq(struct scx_sched *sch, struct task_struct *p, struct rq *rq, bool enforce) { return false; }
static inline bool consume_remote_task(struct rq *this_rq, struct task_struct *p, struct scx_dispatch_q *dsq, struct rq *task_rq) { return false; }
#endif /* CONFIG_SMP */
/**
* move_task_between_dsqs() - Move a task from one DSQ to another
+ * @sch: scx_sched being operated on
* @p: target task
* @enq_flags: %SCX_ENQ_*
* @src_dsq: DSQ @p is currently on, must not be a local DSQ
@@ -2633,7 +2731,8 @@ static inline bool consume_remote_task(struct rq *this_rq, struct task_struct *p
* On return, @src_dsq is unlocked and only @p's new task_rq, which is the
* return value, is locked.
*/
-static struct rq *move_task_between_dsqs(struct task_struct *p, u64 enq_flags,
+static struct rq *move_task_between_dsqs(struct scx_sched *sch,
+ struct task_struct *p, u64 enq_flags,
struct scx_dispatch_q *src_dsq,
struct scx_dispatch_q *dst_dsq)
{
@@ -2646,7 +2745,7 @@ static struct rq *move_task_between_dsqs(struct task_struct *p, u64 enq_flags,
if (dst_dsq->id == SCX_DSQ_LOCAL) {
dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq);
if (src_rq != dst_rq &&
- unlikely(!task_can_run_on_remote_rq(p, dst_rq, true))) {
+ unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) {
dst_dsq = find_global_dsq(p);
dst_rq = src_rq;
}
@@ -2680,7 +2779,7 @@ static struct rq *move_task_between_dsqs(struct task_struct *p, u64 enq_flags,
p->scx.dsq = NULL;
raw_spin_unlock(&src_dsq->lock);
- dispatch_enqueue(dst_dsq, p, enq_flags);
+ dispatch_enqueue(sch, dst_dsq, p, enq_flags);
}
return dst_rq;
@@ -2692,13 +2791,13 @@ static struct rq *move_task_between_dsqs(struct task_struct *p, u64 enq_flags,
* to the bypass mode can take a long time. Inject artificial delays while the
* bypass mode is switching to guarantee timely completion.
*/
-static void scx_ops_breather(struct rq *rq)
+static void scx_breather(struct rq *rq)
{
u64 until;
lockdep_assert_rq_held(rq);
- if (likely(!atomic_read(&scx_ops_breather_depth)))
+ if (likely(!atomic_read(&scx_breather_depth)))
return;
raw_spin_rq_unlock(rq);
@@ -2707,25 +2806,26 @@ static void scx_ops_breather(struct rq *rq)
do {
int cnt = 1024;
- while (atomic_read(&scx_ops_breather_depth) && --cnt)
+ while (atomic_read(&scx_breather_depth) && --cnt)
cpu_relax();
- } while (atomic_read(&scx_ops_breather_depth) &&
+ } while (atomic_read(&scx_breather_depth) &&
time_before64(ktime_get_ns(), until));
raw_spin_rq_lock(rq);
}
-static bool consume_dispatch_q(struct rq *rq, struct scx_dispatch_q *dsq)
+static bool consume_dispatch_q(struct scx_sched *sch, struct rq *rq,
+ struct scx_dispatch_q *dsq)
{
struct task_struct *p;
retry:
/*
- * This retry loop can repeatedly race against scx_ops_bypass()
- * dequeueing tasks from @dsq trying to put the system into the bypass
- * mode. On some multi-socket machines (e.g. 2x Intel 8480c), this can
- * live-lock the machine into soft lockups. Give a breather.
+ * This retry loop can repeatedly race against scx_bypass() dequeueing
+ * tasks from @dsq trying to put the system into the bypass mode. On
+ * some multi-socket machines (e.g. 2x Intel 8480c), this can live-lock
+ * the machine into soft lockups. Give a breather.
*/
- scx_ops_breather(rq);
+ scx_breather(rq);
/*
* The caller can't expect to successfully consume a task if the task's
@@ -2747,7 +2847,7 @@ retry:
return true;
}
- if (task_can_run_on_remote_rq(p, rq, false)) {
+ if (task_can_run_on_remote_rq(sch, p, rq, false)) {
if (likely(consume_remote_task(rq, p, dsq, task_rq)))
return true;
goto retry;
@@ -2758,15 +2858,16 @@ retry:
return false;
}
-static bool consume_global_dsq(struct rq *rq)
+static bool consume_global_dsq(struct scx_sched *sch, struct rq *rq)
{
int node = cpu_to_node(cpu_of(rq));
- return consume_dispatch_q(rq, global_dsqs[node]);
+ return consume_dispatch_q(sch, rq, sch->global_dsqs[node]);
}
/**
* dispatch_to_local_dsq - Dispatch a task to a local dsq
+ * @sch: scx_sched being operated on
* @rq: current rq which is locked
* @dst_dsq: destination DSQ
* @p: task to dispatch
@@ -2779,7 +2880,8 @@ static bool consume_global_dsq(struct rq *rq)
* The caller must have exclusive ownership of @p (e.g. through
* %SCX_OPSS_DISPATCHING).
*/
-static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq,
+static void dispatch_to_local_dsq(struct scx_sched *sch, struct rq *rq,
+ struct scx_dispatch_q *dst_dsq,
struct task_struct *p, u64 enq_flags)
{
struct rq *src_rq = task_rq(p);
@@ -2795,14 +2897,15 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq,
* If dispatching to @rq that @p is already on, no lock dancing needed.
*/
if (rq == src_rq && rq == dst_rq) {
- dispatch_enqueue(dst_dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS);
+ dispatch_enqueue(sch, dst_dsq, p,
+ enq_flags | SCX_ENQ_CLEAR_OPSS);
return;
}
#ifdef CONFIG_SMP
if (src_rq != dst_rq &&
- unlikely(!task_can_run_on_remote_rq(p, dst_rq, true))) {
- dispatch_enqueue(find_global_dsq(p), p,
+ unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) {
+ dispatch_enqueue(sch, find_global_dsq(p), p,
enq_flags | SCX_ENQ_CLEAR_OPSS);
return;
}
@@ -2840,7 +2943,8 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq,
*/
if (src_rq == dst_rq) {
p->scx.holding_cpu = -1;
- dispatch_enqueue(&dst_rq->scx.local_dsq, p, enq_flags);
+ dispatch_enqueue(sch, &dst_rq->scx.local_dsq, p,
+ enq_flags);
} else {
move_remote_task_to_local_dsq(p, enq_flags,
src_rq, dst_rq);
@@ -2882,7 +2986,8 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq,
* was valid in the first place. Make sure that the task is still owned by the
* BPF scheduler and claim the ownership before dispatching.
*/
-static void finish_dispatch(struct rq *rq, struct task_struct *p,
+static void finish_dispatch(struct scx_sched *sch, struct rq *rq,
+ struct task_struct *p,
unsigned long qseq_at_dispatch,
u64 dsq_id, u64 enq_flags)
{
@@ -2935,15 +3040,15 @@ retry:
BUG_ON(!(p->scx.flags & SCX_TASK_QUEUED));
- dsq = find_dsq_for_dispatch(this_rq(), dsq_id, p);
+ dsq = find_dsq_for_dispatch(sch, this_rq(), dsq_id, p);
if (dsq->id == SCX_DSQ_LOCAL)
- dispatch_to_local_dsq(rq, dsq, p, enq_flags);
+ dispatch_to_local_dsq(sch, rq, dsq, p, enq_flags);
else
- dispatch_enqueue(dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS);
+ dispatch_enqueue(sch, dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS);
}
-static void flush_dispatch_buf(struct rq *rq)
+static void flush_dispatch_buf(struct scx_sched *sch, struct rq *rq)
{
struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
u32 u;
@@ -2951,7 +3056,7 @@ static void flush_dispatch_buf(struct rq *rq)
for (u = 0; u < dspc->cursor; u++) {
struct scx_dsp_buf_ent *ent = &dspc->buf[u];
- finish_dispatch(rq, ent->task, ent->qseq, ent->dsq_id,
+ finish_dispatch(sch, rq, ent->task, ent->qseq, ent->dsq_id,
ent->enq_flags);
}
@@ -2961,6 +3066,7 @@ static void flush_dispatch_buf(struct rq *rq)
static int balance_one(struct rq *rq, struct task_struct *prev)
{
+ struct scx_sched *sch = scx_root;
struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
bool prev_on_scx = prev->sched_class == &ext_sched_class;
bool prev_on_rq = prev->scx.flags & SCX_TASK_QUEUED;
@@ -2970,7 +3076,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
rq->scx.flags |= SCX_RQ_IN_BALANCE;
rq->scx.flags &= ~(SCX_RQ_BAL_PENDING | SCX_RQ_BAL_KEEP);
- if (static_branch_unlikely(&scx_ops_cpu_preempt) &&
+ if ((sch->ops.flags & SCX_OPS_HAS_CPU_PREEMPT) &&
unlikely(rq->scx.cpu_released)) {
/*
* If the previous sched_class for the current CPU was not SCX,
@@ -2978,8 +3084,9 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
* core. This callback complements ->cpu_release(), which is
* emitted in switch_class().
*/
- if (SCX_HAS_OP(cpu_acquire))
- SCX_CALL_OP(SCX_KF_REST, cpu_acquire, rq, cpu_of(rq), NULL);
+ if (SCX_HAS_OP(sch, cpu_acquire))
+ SCX_CALL_OP(sch, SCX_KF_REST, cpu_acquire, rq,
+ cpu_of(rq), NULL);
rq->scx.cpu_released = false;
}
@@ -2993,8 +3100,8 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
* scheduler wants to handle this explicitly, it should
* implement ->cpu_release().
*
- * See scx_ops_disable_workfn() for the explanation on the
- * bypassing test.
+ * See scx_disable_workfn() for the explanation on the bypassing
+ * test.
*/
if (prev_on_rq && prev->scx.slice && !scx_rq_bypassing(rq)) {
rq->scx.flags |= SCX_RQ_BAL_KEEP;
@@ -3006,10 +3113,11 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
if (rq->scx.local_dsq.nr)
goto has_tasks;
- if (consume_global_dsq(rq))
+ if (consume_global_dsq(sch, rq))
goto has_tasks;
- if (!SCX_HAS_OP(dispatch) || scx_rq_bypassing(rq) || !scx_rq_online(rq))
+ if (unlikely(!SCX_HAS_OP(sch, dispatch)) ||
+ scx_rq_bypassing(rq) || !scx_rq_online(rq))
goto no_tasks;
dspc->rq = rq;
@@ -3024,10 +3132,10 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
do {
dspc->nr_tasks = 0;
- SCX_CALL_OP(SCX_KF_DISPATCH, dispatch, rq, cpu_of(rq),
- prev_on_scx ? prev : NULL);
+ SCX_CALL_OP(sch, SCX_KF_DISPATCH, dispatch, rq,
+ cpu_of(rq), prev_on_scx ? prev : NULL);
- flush_dispatch_buf(rq);
+ flush_dispatch_buf(sch, rq);
if (prev_on_rq && prev->scx.slice) {
rq->scx.flags |= SCX_RQ_BAL_KEEP;
@@ -3035,7 +3143,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
}
if (rq->scx.local_dsq.nr)
goto has_tasks;
- if (consume_global_dsq(rq))
+ if (consume_global_dsq(sch, rq))
goto has_tasks;
/*
@@ -3058,10 +3166,10 @@ no_tasks:
* Didn't find another task to run. Keep running @prev unless
* %SCX_OPS_ENQ_LAST is in effect.
*/
- if (prev_on_rq && (!static_branch_unlikely(&scx_ops_enq_last) ||
- scx_rq_bypassing(rq))) {
+ if (prev_on_rq &&
+ (!(sch->ops.flags & SCX_OPS_ENQ_LAST) || scx_rq_bypassing(rq))) {
rq->scx.flags |= SCX_RQ_BAL_KEEP;
- __scx_add_event(SCX_EV_DISPATCH_KEEP_LAST, 1);
+ __scx_add_event(sch, SCX_EV_DISPATCH_KEEP_LAST, 1);
goto has_tasks;
}
rq->scx.flags &= ~SCX_RQ_IN_BALANCE;
@@ -3121,18 +3229,22 @@ static void process_ddsp_deferred_locals(struct rq *rq)
*/
while ((p = list_first_entry_or_null(&rq->scx.ddsp_deferred_locals,
struct task_struct, scx.dsq_list.node))) {
+ struct scx_sched *sch = scx_root;
struct scx_dispatch_q *dsq;
list_del_init(&p->scx.dsq_list.node);
- dsq = find_dsq_for_dispatch(rq, p->scx.ddsp_dsq_id, p);
+ dsq = find_dsq_for_dispatch(sch, rq, p->scx.ddsp_dsq_id, p);
if (!WARN_ON_ONCE(dsq->id != SCX_DSQ_LOCAL))
- dispatch_to_local_dsq(rq, dsq, p, p->scx.ddsp_enq_flags);
+ dispatch_to_local_dsq(sch, rq, dsq, p,
+ p->scx.ddsp_enq_flags);
}
}
static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first)
{
+ struct scx_sched *sch = scx_root;
+
if (p->scx.flags & SCX_TASK_QUEUED) {
/*
* Core-sched might decide to execute @p before it is
@@ -3145,8 +3257,8 @@ static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first)
p->se.exec_start = rq_clock_task(rq);
/* see dequeue_task_scx() on why we skip when !QUEUED */
- if (SCX_HAS_OP(running) && (p->scx.flags & SCX_TASK_QUEUED))
- SCX_CALL_OP_TASK(SCX_KF_REST, running, rq, p);
+ if (SCX_HAS_OP(sch, running) && (p->scx.flags & SCX_TASK_QUEUED))
+ SCX_CALL_OP_TASK(sch, SCX_KF_REST, running, rq, p);
clr_task_runnable(p, true);
@@ -3189,6 +3301,7 @@ preempt_reason_from_class(const struct sched_class *class)
static void switch_class(struct rq *rq, struct task_struct *next)
{
+ struct scx_sched *sch = scx_root;
const struct sched_class *next_class = next->sched_class;
#ifdef CONFIG_SMP
@@ -3199,7 +3312,7 @@ static void switch_class(struct rq *rq, struct task_struct *next)
*/
smp_store_release(&rq->scx.pnt_seq, rq->scx.pnt_seq + 1);
#endif
- if (!static_branch_unlikely(&scx_ops_cpu_preempt))
+ if (!(sch->ops.flags & SCX_OPS_HAS_CPU_PREEMPT))
return;
/*
@@ -3221,13 +3334,14 @@ static void switch_class(struct rq *rq, struct task_struct *next)
* next time that balance_scx() is invoked.
*/
if (!rq->scx.cpu_released) {
- if (SCX_HAS_OP(cpu_release)) {
+ if (SCX_HAS_OP(sch, cpu_release)) {
struct scx_cpu_release_args args = {
.reason = preempt_reason_from_class(next_class),
.task = next,
};
- SCX_CALL_OP(SCX_KF_CPU_RELEASE, cpu_release, rq, cpu_of(rq), &args);
+ SCX_CALL_OP(sch, SCX_KF_CPU_RELEASE, cpu_release, rq,
+ cpu_of(rq), &args);
}
rq->scx.cpu_released = true;
}
@@ -3236,11 +3350,12 @@ static void switch_class(struct rq *rq, struct task_struct *next)
static void put_prev_task_scx(struct rq *rq, struct task_struct *p,
struct task_struct *next)
{
+ struct scx_sched *sch = scx_root;
update_curr_scx(rq);
/* see dequeue_task_scx() on why we skip when !QUEUED */
- if (SCX_HAS_OP(stopping) && (p->scx.flags & SCX_TASK_QUEUED))
- SCX_CALL_OP_TASK(SCX_KF_REST, stopping, rq, p, true);
+ if (SCX_HAS_OP(sch, stopping) && (p->scx.flags & SCX_TASK_QUEUED))
+ SCX_CALL_OP_TASK(sch, SCX_KF_REST, stopping, rq, p, true);
if (p->scx.flags & SCX_TASK_QUEUED) {
set_task_runnable(rq, p);
@@ -3252,7 +3367,8 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p,
* DSQ.
*/
if (p->scx.slice && !scx_rq_bypassing(rq)) {
- dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD);
+ dispatch_enqueue(sch, &rq->scx.local_dsq, p,
+ SCX_ENQ_HEAD);
goto switch_class;
}
@@ -3263,7 +3379,7 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p,
* which should trigger an explicit follow-up scheduling event.
*/
if (sched_class_above(&ext_sched_class, next->sched_class)) {
- WARN_ON_ONCE(!static_branch_unlikely(&scx_ops_enq_last));
+ WARN_ON_ONCE(!(sch->ops.flags & SCX_OPS_ENQ_LAST));
do_enqueue_task(rq, p, SCX_ENQ_LAST, -1);
} else {
do_enqueue_task(rq, p, 0, -1);
@@ -3316,7 +3432,7 @@ static struct task_struct *pick_task_scx(struct rq *rq)
* Can happen while enabling as SCX_RQ_BAL_PENDING assertion is
* conditional on scx_enabled() and may have been skipped.
*/
- WARN_ON_ONCE(scx_ops_enable_state() == SCX_OPS_ENABLED);
+ WARN_ON_ONCE(scx_enable_state() == SCX_ENABLED);
keep_prev = false;
}
@@ -3327,10 +3443,8 @@ static struct task_struct *pick_task_scx(struct rq *rq)
*/
if (keep_prev) {
p = prev;
- if (!p->scx.slice) {
- p->scx.slice = SCX_SLICE_DFL;
- __scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1);
- }
+ if (!p->scx.slice)
+ refill_task_slice_dfl(p);
} else {
p = first_local_task(rq);
if (!p) {
@@ -3340,13 +3454,14 @@ static struct task_struct *pick_task_scx(struct rq *rq)
}
if (unlikely(!p->scx.slice)) {
- if (!scx_rq_bypassing(rq) && !scx_warned_zero_slice) {
+ struct scx_sched *sch = scx_root;
+
+ if (!scx_rq_bypassing(rq) && !sch->warned_zero_slice) {
printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in %s()\n",
p->comm, p->pid, __func__);
- scx_warned_zero_slice = true;
+ sch->warned_zero_slice = true;
}
- p->scx.slice = SCX_SLICE_DFL;
- __scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1);
+ refill_task_slice_dfl(p);
}
}
@@ -3375,13 +3490,17 @@ static struct task_struct *pick_task_scx(struct rq *rq)
bool scx_prio_less(const struct task_struct *a, const struct task_struct *b,
bool in_fi)
{
+ struct scx_sched *sch = scx_root;
+
/*
* The const qualifiers are dropped from task_struct pointers when
* calling ops.core_sched_before(). Accesses are controlled by the
* verifier.
*/
- if (SCX_HAS_OP(core_sched_before) && !scx_rq_bypassing(task_rq(a)))
- return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, core_sched_before, NULL,
+ if (SCX_HAS_OP(sch, core_sched_before) &&
+ !scx_rq_bypassing(task_rq(a)))
+ return SCX_CALL_OP_2TASKS_RET(sch, SCX_KF_REST, core_sched_before,
+ NULL,
(struct task_struct *)a,
(struct task_struct *)b);
else
@@ -3393,6 +3512,7 @@ bool scx_prio_less(const struct task_struct *a, const struct task_struct *b,
static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags)
{
+ struct scx_sched *sch = scx_root;
bool rq_bypass;
/*
@@ -3409,7 +3529,7 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag
return prev_cpu;
rq_bypass = scx_rq_bypassing(task_rq(p));
- if (SCX_HAS_OP(select_cpu) && !rq_bypass) {
+ if (likely(SCX_HAS_OP(sch, select_cpu)) && !rq_bypass) {
s32 cpu;
struct task_struct **ddsp_taskp;
@@ -3417,29 +3537,30 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag
WARN_ON_ONCE(*ddsp_taskp);
*ddsp_taskp = p;
- cpu = SCX_CALL_OP_TASK_RET(SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU,
- select_cpu, NULL, p, prev_cpu, wake_flags);
+ cpu = SCX_CALL_OP_TASK_RET(sch,
+ SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU,
+ select_cpu, NULL, p, prev_cpu,
+ wake_flags);
p->scx.selected_cpu = cpu;
*ddsp_taskp = NULL;
- if (ops_cpu_valid(cpu, "from ops.select_cpu()"))
+ if (ops_cpu_valid(sch, cpu, "from ops.select_cpu()"))
return cpu;
else
return prev_cpu;
} else {
s32 cpu;
- cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, 0);
+ cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, NULL, 0);
if (cpu >= 0) {
- p->scx.slice = SCX_SLICE_DFL;
+ refill_task_slice_dfl(p);
p->scx.ddsp_dsq_id = SCX_DSQ_LOCAL;
- __scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1);
} else {
cpu = prev_cpu;
}
p->scx.selected_cpu = cpu;
if (rq_bypass)
- __scx_add_event(SCX_EV_BYPASS_DISPATCH, 1);
+ __scx_add_event(sch, SCX_EV_BYPASS_DISPATCH, 1);
return cpu;
}
}
@@ -3452,6 +3573,8 @@ static void task_woken_scx(struct rq *rq, struct task_struct *p)
static void set_cpus_allowed_scx(struct task_struct *p,
struct affinity_context *ac)
{
+ struct scx_sched *sch = scx_root;
+
set_cpus_allowed_common(p, ac);
/*
@@ -3462,28 +3585,38 @@ static void set_cpus_allowed_scx(struct task_struct *p,
* Fine-grained memory write control is enforced by BPF making the const
* designation pointless. Cast it away when calling the operation.
*/
- if (SCX_HAS_OP(set_cpumask))
- SCX_CALL_OP_TASK(SCX_KF_REST, set_cpumask, NULL,
+ if (SCX_HAS_OP(sch, set_cpumask))
+ SCX_CALL_OP_TASK(sch, SCX_KF_REST, set_cpumask, NULL,
p, (struct cpumask *)p->cpus_ptr);
}
static void handle_hotplug(struct rq *rq, bool online)
{
+ struct scx_sched *sch = scx_root;
int cpu = cpu_of(rq);
atomic_long_inc(&scx_hotplug_seq);
+ /*
+ * scx_root updates are protected by cpus_read_lock() and will stay
+ * stable here. Note that we can't depend on scx_enabled() test as the
+ * hotplug ops need to be enabled before __scx_enabled is set.
+ */
+ if (unlikely(!sch))
+ return;
+
if (scx_enabled())
- scx_idle_update_selcpu_topology(&scx_ops);
+ scx_idle_update_selcpu_topology(&sch->ops);
- if (online && SCX_HAS_OP(cpu_online))
- SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_online, NULL, cpu);
- else if (!online && SCX_HAS_OP(cpu_offline))
- SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_offline, NULL, cpu);
+ if (online && SCX_HAS_OP(sch, cpu_online))
+ SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cpu_online, NULL, cpu);
+ else if (!online && SCX_HAS_OP(sch, cpu_offline))
+ SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cpu_offline, NULL, cpu);
else
- scx_ops_exit(SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG,
- "cpu %d going %s, exiting scheduler", cpu,
- online ? "online" : "offline");
+ scx_exit(sch, SCX_EXIT_UNREG_KERN,
+ SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG,
+ "cpu %d going %s, exiting scheduler", cpu,
+ online ? "online" : "offline");
}
void scx_rq_activate(struct rq *rq)
@@ -3510,11 +3643,16 @@ static void rq_offline_scx(struct rq *rq)
static bool check_rq_for_timeouts(struct rq *rq)
{
+ struct scx_sched *sch;
struct task_struct *p;
struct rq_flags rf;
bool timed_out = false;
rq_lock_irqsave(rq, &rf);
+ sch = rcu_dereference_bh(scx_root);
+ if (unlikely(!sch))
+ goto out_unlock;
+
list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) {
unsigned long last_runnable = p->scx.runnable_at;
@@ -3522,16 +3660,15 @@ static bool check_rq_for_timeouts(struct rq *rq)
last_runnable + scx_watchdog_timeout))) {
u32 dur_ms = jiffies_to_msecs(jiffies - last_runnable);
- scx_ops_error_kind(SCX_EXIT_ERROR_STALL,
- "%s[%d] failed to run for %u.%03us",
- p->comm, p->pid,
- dur_ms / 1000, dur_ms % 1000);
+ scx_exit(sch, SCX_EXIT_ERROR_STALL, 0,
+ "%s[%d] failed to run for %u.%03us",
+ p->comm, p->pid, dur_ms / 1000, dur_ms % 1000);
timed_out = true;
break;
}
}
+out_unlock:
rq_unlock_irqrestore(rq, &rf);
-
return timed_out;
}
@@ -3553,19 +3690,24 @@ static void scx_watchdog_workfn(struct work_struct *work)
void scx_tick(struct rq *rq)
{
+ struct scx_sched *sch;
unsigned long last_check;
if (!scx_enabled())
return;
+ sch = rcu_dereference_bh(scx_root);
+ if (unlikely(!sch))
+ return;
+
last_check = READ_ONCE(scx_watchdog_timestamp);
if (unlikely(time_after(jiffies,
last_check + READ_ONCE(scx_watchdog_timeout)))) {
u32 dur_ms = jiffies_to_msecs(jiffies - last_check);
- scx_ops_error_kind(SCX_EXIT_ERROR_STALL,
- "watchdog failed to check in for %u.%03us",
- dur_ms / 1000, dur_ms % 1000);
+ scx_exit(sch, SCX_EXIT_ERROR_STALL, 0,
+ "watchdog failed to check in for %u.%03us",
+ dur_ms / 1000, dur_ms % 1000);
}
update_other_load_avgs(rq);
@@ -3573,6 +3715,8 @@ void scx_tick(struct rq *rq)
static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued)
{
+ struct scx_sched *sch = scx_root;
+
update_curr_scx(rq);
/*
@@ -3582,8 +3726,8 @@ static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued)
if (scx_rq_bypassing(rq)) {
curr->scx.slice = 0;
touch_core_sched(rq, curr);
- } else if (SCX_HAS_OP(tick)) {
- SCX_CALL_OP_TASK(SCX_KF_REST, tick, rq, curr);
+ } else if (SCX_HAS_OP(sch, tick)) {
+ SCX_CALL_OP_TASK(sch, SCX_KF_REST, tick, rq, curr);
}
if (!curr->scx.slice)
@@ -3648,21 +3792,23 @@ static void scx_set_task_state(struct task_struct *p, enum scx_task_state state)
p->scx.flags |= state << SCX_TASK_STATE_SHIFT;
}
-static int scx_ops_init_task(struct task_struct *p, struct task_group *tg, bool fork)
+static int scx_init_task(struct task_struct *p, struct task_group *tg, bool fork)
{
+ struct scx_sched *sch = scx_root;
int ret;
p->scx.disallow = false;
- if (SCX_HAS_OP(init_task)) {
+ if (SCX_HAS_OP(sch, init_task)) {
struct scx_init_task_args args = {
SCX_INIT_TASK_ARGS_CGROUP(tg)
.fork = fork,
};
- ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init_task, NULL, p, &args);
+ ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, init_task, NULL,
+ p, &args);
if (unlikely(ret)) {
- ret = ops_sanitize_err("init_task", ret);
+ ret = ops_sanitize_err(sch, "init_task", ret);
return ret;
}
}
@@ -3690,8 +3836,8 @@ static int scx_ops_init_task(struct task_struct *p, struct task_group *tg, bool
task_rq_unlock(rq, p, &rf);
} else if (p->policy == SCHED_EXT) {
- scx_ops_error("ops.init_task() set task->scx.disallow for %s[%d] during fork",
- p->comm, p->pid);
+ scx_error(sch, "ops.init_task() set task->scx.disallow for %s[%d] during fork",
+ p->comm, p->pid);
}
}
@@ -3699,8 +3845,9 @@ static int scx_ops_init_task(struct task_struct *p, struct task_group *tg, bool
return 0;
}
-static void scx_ops_enable_task(struct task_struct *p)
+static void scx_enable_task(struct task_struct *p)
{
+ struct scx_sched *sch = scx_root;
struct rq *rq = task_rq(p);
u32 weight;
@@ -3717,28 +3864,31 @@ static void scx_ops_enable_task(struct task_struct *p)
p->scx.weight = sched_weight_to_cgroup(weight);
- if (SCX_HAS_OP(enable))
- SCX_CALL_OP_TASK(SCX_KF_REST, enable, rq, p);
+ if (SCX_HAS_OP(sch, enable))
+ SCX_CALL_OP_TASK(sch, SCX_KF_REST, enable, rq, p);
scx_set_task_state(p, SCX_TASK_ENABLED);
- if (SCX_HAS_OP(set_weight))
- SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, rq, p, p->scx.weight);
+ if (SCX_HAS_OP(sch, set_weight))
+ SCX_CALL_OP_TASK(sch, SCX_KF_REST, set_weight, rq,
+ p, p->scx.weight);
}
-static void scx_ops_disable_task(struct task_struct *p)
+static void scx_disable_task(struct task_struct *p)
{
+ struct scx_sched *sch = scx_root;
struct rq *rq = task_rq(p);
lockdep_assert_rq_held(rq);
WARN_ON_ONCE(scx_get_task_state(p) != SCX_TASK_ENABLED);
- if (SCX_HAS_OP(disable))
- SCX_CALL_OP_TASK(SCX_KF_REST, disable, rq, p);
+ if (SCX_HAS_OP(sch, disable))
+ SCX_CALL_OP_TASK(sch, SCX_KF_REST, disable, rq, p);
scx_set_task_state(p, SCX_TASK_READY);
}
-static void scx_ops_exit_task(struct task_struct *p)
+static void scx_exit_task(struct task_struct *p)
{
+ struct scx_sched *sch = scx_root;
struct scx_exit_task_args args = {
.cancelled = false,
};
@@ -3754,15 +3904,16 @@ static void scx_ops_exit_task(struct task_struct *p)
case SCX_TASK_READY:
break;
case SCX_TASK_ENABLED:
- scx_ops_disable_task(p);
+ scx_disable_task(p);
break;
default:
WARN_ON_ONCE(true);
return;
}
- if (SCX_HAS_OP(exit_task))
- SCX_CALL_OP_TASK(SCX_KF_REST, exit_task, task_rq(p), p, &args);
+ if (SCX_HAS_OP(sch, exit_task))
+ SCX_CALL_OP_TASK(sch, SCX_KF_REST, exit_task, task_rq(p),
+ p, &args);
scx_set_task_state(p, SCX_TASK_NONE);
}
@@ -3794,15 +3945,15 @@ int scx_fork(struct task_struct *p)
{
percpu_rwsem_assert_held(&scx_fork_rwsem);
- if (scx_ops_init_task_enabled)
- return scx_ops_init_task(p, task_group(p), true);
+ if (scx_init_task_enabled)
+ return scx_init_task(p, task_group(p), true);
else
return 0;
}
void scx_post_fork(struct task_struct *p)
{
- if (scx_ops_init_task_enabled) {
+ if (scx_init_task_enabled) {
scx_set_task_state(p, SCX_TASK_READY);
/*
@@ -3815,7 +3966,7 @@ void scx_post_fork(struct task_struct *p)
struct rq *rq;
rq = task_rq_lock(p, &rf);
- scx_ops_enable_task(p);
+ scx_enable_task(p);
task_rq_unlock(rq, p, &rf);
}
}
@@ -3835,7 +3986,7 @@ void scx_cancel_fork(struct task_struct *p)
rq = task_rq_lock(p, &rf);
WARN_ON_ONCE(scx_get_task_state(p) >= SCX_TASK_READY);
- scx_ops_exit_task(p);
+ scx_exit_task(p);
task_rq_unlock(rq, p, &rf);
}
@@ -3851,15 +4002,15 @@ void sched_ext_free(struct task_struct *p)
spin_unlock_irqrestore(&scx_tasks_lock, flags);
/*
- * @p is off scx_tasks and wholly ours. scx_ops_enable()'s READY ->
- * ENABLED transitions can't race us. Disable ops for @p.
+ * @p is off scx_tasks and wholly ours. scx_enable()'s READY -> ENABLED
+ * transitions can't race us. Disable ops for @p.
*/
if (scx_get_task_state(p) != SCX_TASK_NONE) {
struct rq_flags rf;
struct rq *rq;
rq = task_rq_lock(p, &rf);
- scx_ops_exit_task(p);
+ scx_exit_task(p);
task_rq_unlock(rq, p, &rf);
}
}
@@ -3867,11 +4018,14 @@ void sched_ext_free(struct task_struct *p)
static void reweight_task_scx(struct rq *rq, struct task_struct *p,
const struct load_weight *lw)
{
+ struct scx_sched *sch = scx_root;
+
lockdep_assert_rq_held(task_rq(p));
p->scx.weight = sched_weight_to_cgroup(scale_load_down(lw->weight));
- if (SCX_HAS_OP(set_weight))
- SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, rq, p, p->scx.weight);
+ if (SCX_HAS_OP(sch, set_weight))
+ SCX_CALL_OP_TASK(sch, SCX_KF_REST, set_weight, rq,
+ p, p->scx.weight);
}
static void prio_changed_scx(struct rq *rq, struct task_struct *p, int oldprio)
@@ -3880,20 +4034,22 @@ static void prio_changed_scx(struct rq *rq, struct task_struct *p, int oldprio)
static void switching_to_scx(struct rq *rq, struct task_struct *p)
{
- scx_ops_enable_task(p);
+ struct scx_sched *sch = scx_root;
+
+ scx_enable_task(p);
/*
* set_cpus_allowed_scx() is not called while @p is associated with a
* different scheduler class. Keep the BPF scheduler up-to-date.
*/
- if (SCX_HAS_OP(set_cpumask))
- SCX_CALL_OP_TASK(SCX_KF_REST, set_cpumask, rq,
+ if (SCX_HAS_OP(sch, set_cpumask))
+ SCX_CALL_OP_TASK(sch, SCX_KF_REST, set_cpumask, rq,
p, (struct cpumask *)p->cpus_ptr);
}
static void switched_from_scx(struct rq *rq, struct task_struct *p)
{
- scx_ops_disable_task(p);
+ scx_disable_task(p);
}
static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p,int wake_flags) {}
@@ -3938,6 +4094,7 @@ static bool scx_cgroup_enabled;
int scx_tg_online(struct task_group *tg)
{
+ struct scx_sched *sch = scx_root;
int ret = 0;
WARN_ON_ONCE(tg->scx_flags & (SCX_TG_ONLINE | SCX_TG_INITED));
@@ -3945,14 +4102,14 @@ int scx_tg_online(struct task_group *tg)
percpu_down_read(&scx_cgroup_rwsem);
if (scx_cgroup_enabled) {
- if (SCX_HAS_OP(cgroup_init)) {
+ if (SCX_HAS_OP(sch, cgroup_init)) {
struct scx_cgroup_init_args args =
{ .weight = tg->scx_weight };
- ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_init, NULL,
- tg->css.cgroup, &args);
+ ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, cgroup_init,
+ NULL, tg->css.cgroup, &args);
if (ret)
- ret = ops_sanitize_err("cgroup_init", ret);
+ ret = ops_sanitize_err(sch, "cgroup_init", ret);
}
if (ret == 0)
tg->scx_flags |= SCX_TG_ONLINE | SCX_TG_INITED;
@@ -3966,12 +4123,16 @@ int scx_tg_online(struct task_group *tg)
void scx_tg_offline(struct task_group *tg)
{
+ struct scx_sched *sch = scx_root;
+
WARN_ON_ONCE(!(tg->scx_flags & SCX_TG_ONLINE));
percpu_down_read(&scx_cgroup_rwsem);
- if (SCX_HAS_OP(cgroup_exit) && (tg->scx_flags & SCX_TG_INITED))
- SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_exit, NULL, tg->css.cgroup);
+ if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_exit) &&
+ (tg->scx_flags & SCX_TG_INITED))
+ SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_exit, NULL,
+ tg->css.cgroup);
tg->scx_flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED);
percpu_up_read(&scx_cgroup_rwsem);
@@ -3979,6 +4140,7 @@ void scx_tg_offline(struct task_group *tg)
int scx_cgroup_can_attach(struct cgroup_taskset *tset)
{
+ struct scx_sched *sch = scx_root;
struct cgroup_subsys_state *css;
struct task_struct *p;
int ret;
@@ -4003,8 +4165,9 @@ int scx_cgroup_can_attach(struct cgroup_taskset *tset)
if (from == to)
continue;
- if (SCX_HAS_OP(cgroup_prep_move)) {
- ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_prep_move, NULL,
+ if (SCX_HAS_OP(sch, cgroup_prep_move)) {
+ ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED,
+ cgroup_prep_move, NULL,
p, from, css->cgroup);
if (ret)
goto err;
@@ -4017,18 +4180,21 @@ int scx_cgroup_can_attach(struct cgroup_taskset *tset)
err:
cgroup_taskset_for_each(p, css, tset) {
- if (SCX_HAS_OP(cgroup_cancel_move) && p->scx.cgrp_moving_from)
- SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_cancel_move, NULL,
+ if (SCX_HAS_OP(sch, cgroup_cancel_move) &&
+ p->scx.cgrp_moving_from)
+ SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_cancel_move, NULL,
p, p->scx.cgrp_moving_from, css->cgroup);
p->scx.cgrp_moving_from = NULL;
}
percpu_up_read(&scx_cgroup_rwsem);
- return ops_sanitize_err("cgroup_prep_move", ret);
+ return ops_sanitize_err(sch, "cgroup_prep_move", ret);
}
void scx_cgroup_move_task(struct task_struct *p)
{
+ struct scx_sched *sch = scx_root;
+
if (!scx_cgroup_enabled)
return;
@@ -4036,9 +4202,11 @@ void scx_cgroup_move_task(struct task_struct *p)
* @p must have ops.cgroup_prep_move() called on it and thus
* cgrp_moving_from set.
*/
- if (SCX_HAS_OP(cgroup_move) && !WARN_ON_ONCE(!p->scx.cgrp_moving_from))
- SCX_CALL_OP_TASK(SCX_KF_UNLOCKED, cgroup_move, NULL,
- p, p->scx.cgrp_moving_from, tg_cgrp(task_group(p)));
+ if (SCX_HAS_OP(sch, cgroup_move) &&
+ !WARN_ON_ONCE(!p->scx.cgrp_moving_from))
+ SCX_CALL_OP_TASK(sch, SCX_KF_UNLOCKED, cgroup_move, NULL,
+ p, p->scx.cgrp_moving_from,
+ tg_cgrp(task_group(p)));
p->scx.cgrp_moving_from = NULL;
}
@@ -4049,6 +4217,7 @@ void scx_cgroup_finish_attach(void)
void scx_cgroup_cancel_attach(struct cgroup_taskset *tset)
{
+ struct scx_sched *sch = scx_root;
struct cgroup_subsys_state *css;
struct task_struct *p;
@@ -4056,8 +4225,9 @@ void scx_cgroup_cancel_attach(struct cgroup_taskset *tset)
goto out_unlock;
cgroup_taskset_for_each(p, css, tset) {
- if (SCX_HAS_OP(cgroup_cancel_move) && p->scx.cgrp_moving_from)
- SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_cancel_move, NULL,
+ if (SCX_HAS_OP(sch, cgroup_cancel_move) &&
+ p->scx.cgrp_moving_from)
+ SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_cancel_move, NULL,
p, p->scx.cgrp_moving_from, css->cgroup);
p->scx.cgrp_moving_from = NULL;
}
@@ -4067,11 +4237,13 @@ out_unlock:
void scx_group_set_weight(struct task_group *tg, unsigned long weight)
{
+ struct scx_sched *sch = scx_root;
+
percpu_down_read(&scx_cgroup_rwsem);
if (scx_cgroup_enabled && tg->scx_weight != weight) {
- if (SCX_HAS_OP(cgroup_set_weight))
- SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_set_weight, NULL,
+ if (SCX_HAS_OP(sch, cgroup_set_weight))
+ SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_set_weight, NULL,
tg_cgrp(tg), weight);
tg->scx_weight = weight;
}
@@ -4160,29 +4332,6 @@ static void init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id)
dsq->id = dsq_id;
}
-static struct scx_dispatch_q *create_dsq(u64 dsq_id, int node)
-{
- struct scx_dispatch_q *dsq;
- int ret;
-
- if (dsq_id & SCX_DSQ_FLAG_BUILTIN)
- return ERR_PTR(-EINVAL);
-
- dsq = kmalloc_node(sizeof(*dsq), GFP_KERNEL, node);
- if (!dsq)
- return ERR_PTR(-ENOMEM);
-
- init_dsq(dsq, dsq_id);
-
- ret = rhashtable_lookup_insert_fast(&dsq_hash, &dsq->hash_node,
- dsq_hash_params);
- if (ret) {
- kfree(dsq);
- return ERR_PTR(ret);
- }
- return dsq;
-}
-
static void free_dsq_irq_workfn(struct irq_work *irq_work)
{
struct llist_node *to_free = llist_del_all(&dsqs_to_free);
@@ -4194,26 +4343,27 @@ static void free_dsq_irq_workfn(struct irq_work *irq_work)
static DEFINE_IRQ_WORK(free_dsq_irq_work, free_dsq_irq_workfn);
-static void destroy_dsq(u64 dsq_id)
+static void destroy_dsq(struct scx_sched *sch, u64 dsq_id)
{
struct scx_dispatch_q *dsq;
unsigned long flags;
rcu_read_lock();
- dsq = find_user_dsq(dsq_id);
+ dsq = find_user_dsq(sch, dsq_id);
if (!dsq)
goto out_unlock_rcu;
raw_spin_lock_irqsave(&dsq->lock, flags);
if (dsq->nr) {
- scx_ops_error("attempting to destroy in-use dsq 0x%016llx (nr=%u)",
- dsq->id, dsq->nr);
+ scx_error(sch, "attempting to destroy in-use dsq 0x%016llx (nr=%u)",
+ dsq->id, dsq->nr);
goto out_unlock_dsq;
}
- if (rhashtable_remove_fast(&dsq_hash, &dsq->hash_node, dsq_hash_params))
+ if (rhashtable_remove_fast(&sch->dsq_hash, &dsq->hash_node,
+ dsq_hash_params))
goto out_unlock_dsq;
/*
@@ -4233,7 +4383,7 @@ out_unlock_rcu:
}
#ifdef CONFIG_EXT_GROUP_SCHED
-static void scx_cgroup_exit(void)
+static void scx_cgroup_exit(struct scx_sched *sch)
{
struct cgroup_subsys_state *css;
@@ -4253,14 +4403,15 @@ static void scx_cgroup_exit(void)
continue;
tg->scx_flags &= ~SCX_TG_INITED;
- if (!scx_ops.cgroup_exit)
+ if (!sch->ops.cgroup_exit)
continue;
if (WARN_ON_ONCE(!css_tryget(css)))
continue;
rcu_read_unlock();
- SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_exit, NULL, css->cgroup);
+ SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_exit, NULL,
+ css->cgroup);
rcu_read_lock();
css_put(css);
@@ -4268,7 +4419,7 @@ static void scx_cgroup_exit(void)
rcu_read_unlock();
}
-static int scx_cgroup_init(void)
+static int scx_cgroup_init(struct scx_sched *sch)
{
struct cgroup_subsys_state *css;
int ret;
@@ -4288,7 +4439,7 @@ static int scx_cgroup_init(void)
(SCX_TG_ONLINE | SCX_TG_INITED)) != SCX_TG_ONLINE)
continue;
- if (!scx_ops.cgroup_init) {
+ if (!sch->ops.cgroup_init) {
tg->scx_flags |= SCX_TG_INITED;
continue;
}
@@ -4297,11 +4448,11 @@ static int scx_cgroup_init(void)
continue;
rcu_read_unlock();
- ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_init, NULL,
+ ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, cgroup_init, NULL,
css->cgroup, &args);
if (ret) {
css_put(css);
- scx_ops_error("ops.cgroup_init() failed (%d)", ret);
+ scx_error(sch, "ops.cgroup_init() failed (%d)", ret);
return ret;
}
tg->scx_flags |= SCX_TG_INITED;
@@ -4318,8 +4469,8 @@ static int scx_cgroup_init(void)
}
#else
-static void scx_cgroup_exit(void) {}
-static int scx_cgroup_init(void) { return 0; }
+static void scx_cgroup_exit(struct scx_sched *sch) {}
+static int scx_cgroup_init(struct scx_sched *sch) { return 0; }
#endif
@@ -4336,8 +4487,7 @@ static int scx_cgroup_init(void) { return 0; }
static ssize_t scx_attr_state_show(struct kobject *kobj,
struct kobj_attribute *ka, char *buf)
{
- return sysfs_emit(buf, "%s\n",
- scx_ops_enable_state_str[scx_ops_enable_state()]);
+ return sysfs_emit(buf, "%s\n", scx_enable_state_str[scx_enable_state()]);
}
SCX_ATTR(state);
@@ -4382,15 +4532,51 @@ static const struct attribute_group scx_global_attr_group = {
.attrs = scx_global_attrs,
};
+static void free_exit_info(struct scx_exit_info *ei);
+
+static void scx_sched_free_rcu_work(struct work_struct *work)
+{
+ struct rcu_work *rcu_work = to_rcu_work(work);
+ struct scx_sched *sch = container_of(rcu_work, struct scx_sched, rcu_work);
+ struct rhashtable_iter rht_iter;
+ struct scx_dispatch_q *dsq;
+ int node;
+
+ kthread_stop(sch->helper->task);
+ free_percpu(sch->event_stats_cpu);
+
+ for_each_node_state(node, N_POSSIBLE)
+ kfree(sch->global_dsqs[node]);
+ kfree(sch->global_dsqs);
+
+ rhashtable_walk_enter(&sch->dsq_hash, &rht_iter);
+ do {
+ rhashtable_walk_start(&rht_iter);
+
+ while ((dsq = rhashtable_walk_next(&rht_iter)) && !IS_ERR(dsq))
+ destroy_dsq(sch, dsq->id);
+
+ rhashtable_walk_stop(&rht_iter);
+ } while (dsq == ERR_PTR(-EAGAIN));
+ rhashtable_walk_exit(&rht_iter);
+
+ rhashtable_free_and_destroy(&sch->dsq_hash, NULL, NULL);
+ free_exit_info(sch->exit_info);
+ kfree(sch);
+}
+
static void scx_kobj_release(struct kobject *kobj)
{
- kfree(kobj);
+ struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj);
+
+ INIT_RCU_WORK(&sch->rcu_work, scx_sched_free_rcu_work);
+ queue_rcu_work(system_unbound_wq, &sch->rcu_work);
}
static ssize_t scx_attr_ops_show(struct kobject *kobj,
struct kobj_attribute *ka, char *buf)
{
- return sysfs_emit(buf, "%s\n", scx_ops.name);
+ return sysfs_emit(buf, "%s\n", scx_root->ops.name);
}
SCX_ATTR(ops);
@@ -4401,16 +4587,17 @@ SCX_ATTR(ops);
static ssize_t scx_attr_events_show(struct kobject *kobj,
struct kobj_attribute *ka, char *buf)
{
+ struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj);
struct scx_event_stats events;
int at = 0;
- scx_bpf_events(&events, sizeof(events));
+ scx_read_events(sch, &events);
at += scx_attr_event_show(buf, at, &events, SCX_EV_SELECT_CPU_FALLBACK);
at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_KEEP_LAST);
at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SKIP_EXITING);
at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED);
- at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SLICE_DFL);
+ at += scx_attr_event_show(buf, at, &events, SCX_EV_REFILL_SLICE_DFL);
at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_DURATION);
at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_DISPATCH);
at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_ACTIVATE);
@@ -4433,7 +4620,7 @@ static const struct kobj_type scx_ktype = {
static int scx_uevent(const struct kobject *kobj, struct kobj_uevent_env *env)
{
- return add_uevent_var(env, "SCXOPS=%s", scx_ops.name);
+ return add_uevent_var(env, "SCXOPS=%s", scx_root->ops.name);
}
static const struct kset_uevent_ops scx_uevent_ops = {
@@ -4446,14 +4633,20 @@ static const struct kset_uevent_ops scx_uevent_ops = {
*/
bool task_should_scx(int policy)
{
- if (!scx_enabled() ||
- unlikely(scx_ops_enable_state() == SCX_OPS_DISABLING))
+ if (!scx_enabled() || unlikely(scx_enable_state() == SCX_DISABLING))
return false;
if (READ_ONCE(scx_switching_all))
return true;
return policy == SCHED_EXT;
}
+bool scx_allow_ttwu_queue(const struct task_struct *p)
+{
+ return !scx_enabled() ||
+ (scx_root->ops.flags & SCX_OPS_ALLOW_QUEUED_WAKEUP) ||
+ p->sched_class != &ext_sched_class;
+}
+
/**
* scx_softlockup - sched_ext softlockup handler
* @dur_s: number of seconds of CPU stuck due to soft lockup
@@ -4466,40 +4659,48 @@ bool task_should_scx(int policy)
*/
void scx_softlockup(u32 dur_s)
{
- switch (scx_ops_enable_state()) {
- case SCX_OPS_ENABLING:
- case SCX_OPS_ENABLED:
+ struct scx_sched *sch;
+
+ rcu_read_lock();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ goto out_unlock;
+
+ switch (scx_enable_state()) {
+ case SCX_ENABLING:
+ case SCX_ENABLED:
break;
default:
- return;
+ goto out_unlock;
}
- /* allow only one instance, cleared at the end of scx_ops_bypass() */
+ /* allow only one instance, cleared at the end of scx_bypass() */
if (test_and_set_bit(0, &scx_in_softlockup))
- return;
+ goto out_unlock;
printk_deferred(KERN_ERR "sched_ext: Soft lockup - CPU%d stuck for %us, disabling \"%s\"\n",
- smp_processor_id(), dur_s, scx_ops.name);
+ smp_processor_id(), dur_s, scx_root->ops.name);
/*
* Some CPUs may be trapped in the dispatch paths. Enable breather
-	 * immediately; otherwise, we might not even be able to get to
-	 * scx_ops_bypass().
+	 * immediately; otherwise, we might not even be able to get to scx_bypass().
*/
- atomic_inc(&scx_ops_breather_depth);
+ atomic_inc(&scx_breather_depth);
- scx_ops_error("soft lockup - CPU#%d stuck for %us",
- smp_processor_id(), dur_s);
+ scx_error(sch, "soft lockup - CPU#%d stuck for %us", smp_processor_id(), dur_s);
+out_unlock:
+ rcu_read_unlock();
}
static void scx_clear_softlockup(void)
{
if (test_and_clear_bit(0, &scx_in_softlockup))
- atomic_dec(&scx_ops_breather_depth);
+ atomic_dec(&scx_breather_depth);
}
/**
- * scx_ops_bypass - [Un]bypass scx_ops and guarantee forward progress
+ * scx_bypass - [Un]bypass scx_ops and guarantee forward progress
* @bypass: true for bypass, false for unbypass
*
* Bypassing guarantees that all runnable tasks make forward progress without
@@ -4529,32 +4730,36 @@ static void scx_clear_softlockup(void)
*
* - scx_prio_less() reverts to the default core_sched_at order.
*/
-static void scx_ops_bypass(bool bypass)
+static void scx_bypass(bool bypass)
{
static DEFINE_RAW_SPINLOCK(bypass_lock);
static unsigned long bypass_timestamp;
-
- int cpu;
+ struct scx_sched *sch;
unsigned long flags;
+ int cpu;
raw_spin_lock_irqsave(&bypass_lock, flags);
+ sch = rcu_dereference_bh(scx_root);
+
if (bypass) {
- scx_ops_bypass_depth++;
- WARN_ON_ONCE(scx_ops_bypass_depth <= 0);
- if (scx_ops_bypass_depth != 1)
+ scx_bypass_depth++;
+ WARN_ON_ONCE(scx_bypass_depth <= 0);
+ if (scx_bypass_depth != 1)
goto unlock;
bypass_timestamp = ktime_get_ns();
- scx_add_event(SCX_EV_BYPASS_ACTIVATE, 1);
+ if (sch)
+ scx_add_event(sch, SCX_EV_BYPASS_ACTIVATE, 1);
} else {
- scx_ops_bypass_depth--;
- WARN_ON_ONCE(scx_ops_bypass_depth < 0);
- if (scx_ops_bypass_depth != 0)
+ scx_bypass_depth--;
+ WARN_ON_ONCE(scx_bypass_depth < 0);
+ if (scx_bypass_depth != 0)
goto unlock;
- scx_add_event(SCX_EV_BYPASS_DURATION,
- ktime_get_ns() - bypass_timestamp);
+ if (sch)
+ scx_add_event(sch, SCX_EV_BYPASS_DURATION,
+ ktime_get_ns() - bypass_timestamp);
}
- atomic_inc(&scx_ops_breather_depth);
+ atomic_inc(&scx_breather_depth);
/*
* No task property is changing. We just need to make sure all currently
@@ -4612,7 +4817,7 @@ static void scx_ops_bypass(bool bypass)
raw_spin_rq_unlock(rq);
}
- atomic_dec(&scx_ops_breather_depth);
+ atomic_dec(&scx_breather_depth);
unlock:
raw_spin_unlock_irqrestore(&bypass_lock, flags);
scx_clear_softlockup();
@@ -4668,42 +4873,36 @@ static const char *scx_exit_reason(enum scx_exit_kind kind)
}
}
-static void scx_ops_disable_workfn(struct kthread_work *work)
+static void scx_disable_workfn(struct kthread_work *work)
{
- struct scx_exit_info *ei = scx_exit_info;
+ struct scx_sched *sch = container_of(work, struct scx_sched, disable_work);
+ struct scx_exit_info *ei = sch->exit_info;
struct scx_task_iter sti;
struct task_struct *p;
- struct rhashtable_iter rht_iter;
- struct scx_dispatch_q *dsq;
- int i, kind, cpu;
+ int kind, cpu;
- kind = atomic_read(&scx_exit_kind);
+ kind = atomic_read(&sch->exit_kind);
while (true) {
- /*
- * NONE indicates that a new scx_ops has been registered since
- * disable was scheduled - don't kill the new ops. DONE
- * indicates that the ops has already been disabled.
- */
- if (kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE)
+ if (kind == SCX_EXIT_DONE) /* already disabled? */
return;
- if (atomic_try_cmpxchg(&scx_exit_kind, &kind, SCX_EXIT_DONE))
+ WARN_ON_ONCE(kind == SCX_EXIT_NONE);
+ if (atomic_try_cmpxchg(&sch->exit_kind, &kind, SCX_EXIT_DONE))
break;
}
ei->kind = kind;
ei->reason = scx_exit_reason(ei->kind);
/* guarantee forward progress by bypassing scx_ops */
- scx_ops_bypass(true);
+ scx_bypass(true);
- switch (scx_ops_set_enable_state(SCX_OPS_DISABLING)) {
- case SCX_OPS_DISABLING:
+ switch (scx_set_enable_state(SCX_DISABLING)) {
+ case SCX_DISABLING:
WARN_ONCE(true, "sched_ext: duplicate disabling instance?");
break;
- case SCX_OPS_DISABLED:
+ case SCX_DISABLED:
pr_warn("sched_ext: ops error detected without ops (%s)\n",
- scx_exit_info->msg);
- WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_DISABLED) !=
- SCX_OPS_DISABLING);
+ sch->exit_info->msg);
+ WARN_ON_ONCE(scx_set_enable_state(SCX_DISABLED) != SCX_DISABLING);
goto done;
default:
break;
@@ -4714,17 +4913,17 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
* we can safely use blocking synchronization constructs. Actually
* disable ops.
*/
- mutex_lock(&scx_ops_enable_mutex);
+ mutex_lock(&scx_enable_mutex);
static_branch_disable(&__scx_switched_all);
WRITE_ONCE(scx_switching_all, false);
/*
* Shut down cgroup support before tasks so that the cgroup attach path
- * doesn't race against scx_ops_exit_task().
+ * doesn't race against scx_exit_task().
*/
scx_cgroup_lock();
- scx_cgroup_exit();
+ scx_cgroup_exit(sch);
scx_cgroup_unlock();
/*
@@ -4733,7 +4932,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
*/
percpu_down_write(&scx_fork_rwsem);
- scx_ops_init_task_enabled = false;
+ scx_init_task_enabled = false;
scx_task_iter_start(&sti);
while ((p = scx_task_iter_next_locked(&sti))) {
@@ -4753,7 +4952,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
sched_enq_and_set_task(&ctx);
check_class_changed(task_rq(p), p, old_class, p->prio);
- scx_ops_exit_task(p);
+ scx_exit_task(p);
}
scx_task_iter_stop(&sti);
percpu_up_write(&scx_fork_rwsem);
@@ -4768,98 +4967,71 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
}
/* no task is on scx, turn off all the switches and flush in-progress calls */
- static_branch_disable(&__scx_ops_enabled);
- for (i = SCX_OPI_BEGIN; i < SCX_OPI_END; i++)
- static_branch_disable(&scx_has_op[i]);
- static_branch_disable(&scx_ops_allow_queued_wakeup);
- static_branch_disable(&scx_ops_enq_last);
- static_branch_disable(&scx_ops_enq_exiting);
- static_branch_disable(&scx_ops_enq_migration_disabled);
- static_branch_disable(&scx_ops_cpu_preempt);
+ static_branch_disable(&__scx_enabled);
+ bitmap_zero(sch->has_op, SCX_OPI_END);
scx_idle_disable();
synchronize_rcu();
if (ei->kind >= SCX_EXIT_ERROR) {
pr_err("sched_ext: BPF scheduler \"%s\" disabled (%s)\n",
- scx_ops.name, ei->reason);
+ sch->ops.name, ei->reason);
if (ei->msg[0] != '\0')
- pr_err("sched_ext: %s: %s\n", scx_ops.name, ei->msg);
+ pr_err("sched_ext: %s: %s\n", sch->ops.name, ei->msg);
#ifdef CONFIG_STACKTRACE
stack_trace_print(ei->bt, ei->bt_len, 2);
#endif
} else {
pr_info("sched_ext: BPF scheduler \"%s\" disabled (%s)\n",
- scx_ops.name, ei->reason);
+ sch->ops.name, ei->reason);
}
- if (scx_ops.exit)
- SCX_CALL_OP(SCX_KF_UNLOCKED, exit, NULL, ei);
+ if (sch->ops.exit)
+ SCX_CALL_OP(sch, SCX_KF_UNLOCKED, exit, NULL, ei);
cancel_delayed_work_sync(&scx_watchdog_work);
/*
- * Delete the kobject from the hierarchy eagerly in addition to just
- * dropping a reference. Otherwise, if the object is deleted
- * asynchronously, sysfs could observe an object of the same name still
- * in the hierarchy when another scheduler is loaded.
+ * scx_root clearing must be inside cpus_read_lock(). See
+ * handle_hotplug().
*/
- kobject_del(scx_root_kobj);
- kobject_put(scx_root_kobj);
- scx_root_kobj = NULL;
-
- memset(&scx_ops, 0, sizeof(scx_ops));
-
- rhashtable_walk_enter(&dsq_hash, &rht_iter);
- do {
- rhashtable_walk_start(&rht_iter);
-
- while ((dsq = rhashtable_walk_next(&rht_iter)) && !IS_ERR(dsq))
- destroy_dsq(dsq->id);
+ cpus_read_lock();
+ RCU_INIT_POINTER(scx_root, NULL);
+ cpus_read_unlock();
- rhashtable_walk_stop(&rht_iter);
- } while (dsq == ERR_PTR(-EAGAIN));
- rhashtable_walk_exit(&rht_iter);
+ /*
+ * Delete the kobject from the hierarchy synchronously. Otherwise, sysfs
+ * could observe an object of the same name still in the hierarchy when
+ * the next scheduler is loaded.
+ */
+ kobject_del(&sch->kobj);
free_percpu(scx_dsp_ctx);
scx_dsp_ctx = NULL;
scx_dsp_max_batch = 0;
- free_exit_info(scx_exit_info);
- scx_exit_info = NULL;
-
- mutex_unlock(&scx_ops_enable_mutex);
+ mutex_unlock(&scx_enable_mutex);
- WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_DISABLED) !=
- SCX_OPS_DISABLING);
+ WARN_ON_ONCE(scx_set_enable_state(SCX_DISABLED) != SCX_DISABLING);
done:
- scx_ops_bypass(false);
-}
-
-static DEFINE_KTHREAD_WORK(scx_ops_disable_work, scx_ops_disable_workfn);
-
-static void schedule_scx_ops_disable_work(void)
-{
- struct kthread_worker *helper = READ_ONCE(scx_ops_helper);
-
- /*
- * We may be called spuriously before the first bpf_sched_ext_reg(). If
- * scx_ops_helper isn't set up yet, there's nothing to do.
- */
- if (helper)
- kthread_queue_work(helper, &scx_ops_disable_work);
+ scx_bypass(false);
}
-static void scx_ops_disable(enum scx_exit_kind kind)
+static void scx_disable(enum scx_exit_kind kind)
{
int none = SCX_EXIT_NONE;
+ struct scx_sched *sch;
if (WARN_ON_ONCE(kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE))
kind = SCX_EXIT_ERROR;
- atomic_try_cmpxchg(&scx_exit_kind, &none, kind);
-
- schedule_scx_ops_disable_work();
+ rcu_read_lock();
+ sch = rcu_dereference(scx_root);
+ if (sch) {
+ atomic_try_cmpxchg(&sch->exit_kind, &none, kind);
+ kthread_queue_work(sch->helper, &sch->disable_work);
+ }
+ rcu_read_unlock();
}
static void dump_newline(struct seq_buf *s)
@@ -4977,6 +5149,7 @@ static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx,
struct task_struct *p, char marker)
{
static unsigned long bt[SCX_EXIT_BT_LEN];
+ struct scx_sched *sch = scx_root;
char dsq_id_buf[19] = "(n/a)";
unsigned long ops_state = atomic_long_read(&p->scx.ops_state);
unsigned int bt_len = 0;
@@ -4999,9 +5172,9 @@ static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx,
p->scx.dsq_vtime, p->scx.slice, p->scx.weight);
dump_line(s, " cpus=%*pb", cpumask_pr_args(p->cpus_ptr));
- if (SCX_HAS_OP(dump_task)) {
+ if (SCX_HAS_OP(sch, dump_task)) {
ops_dump_init(s, " ");
- SCX_CALL_OP(SCX_KF_REST, dump_task, NULL, dctx, p);
+ SCX_CALL_OP(sch, SCX_KF_REST, dump_task, NULL, dctx, p);
ops_dump_exit();
}
@@ -5018,6 +5191,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
{
static DEFINE_SPINLOCK(dump_lock);
static const char trunc_marker[] = "\n\n~~~~ TRUNCATED ~~~~\n";
+ struct scx_sched *sch = scx_root;
struct scx_dump_ctx dctx = {
.kind = ei->kind,
.exit_code = ei->exit_code,
@@ -5046,9 +5220,9 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
dump_stack_trace(&s, " ", ei->bt, ei->bt_len);
}
- if (SCX_HAS_OP(dump)) {
+ if (SCX_HAS_OP(sch, dump)) {
ops_dump_init(&s, "");
- SCX_CALL_OP(SCX_KF_UNLOCKED, dump, NULL, &dctx);
+ SCX_CALL_OP(sch, SCX_KF_UNLOCKED, dump, NULL, &dctx);
ops_dump_exit();
}
@@ -5069,7 +5243,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
idle = list_empty(&rq->scx.runnable_list) &&
rq->curr->sched_class == &idle_sched_class;
- if (idle && !SCX_HAS_OP(dump_cpu))
+ if (idle && !SCX_HAS_OP(sch, dump_cpu))
goto next;
/*
@@ -5103,9 +5277,10 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
cpumask_pr_args(rq->scx.cpus_to_wait));
used = seq_buf_used(&ns);
- if (SCX_HAS_OP(dump_cpu)) {
+ if (SCX_HAS_OP(sch, dump_cpu)) {
ops_dump_init(&ns, " ");
- SCX_CALL_OP(SCX_KF_REST, dump_cpu, NULL, &dctx, cpu, idle);
+ SCX_CALL_OP(sch, SCX_KF_REST, dump_cpu, NULL,
+ &dctx, cpu, idle);
ops_dump_exit();
}
@@ -5139,13 +5314,13 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
dump_line(&s, "Event counters");
dump_line(&s, "--------------");
- scx_bpf_events(&events, sizeof(events));
+ scx_read_events(sch, &events);
scx_dump_event(s, &events, SCX_EV_SELECT_CPU_FALLBACK);
scx_dump_event(s, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
scx_dump_event(s, &events, SCX_EV_DISPATCH_KEEP_LAST);
scx_dump_event(s, &events, SCX_EV_ENQ_SKIP_EXITING);
scx_dump_event(s, &events, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED);
- scx_dump_event(s, &events, SCX_EV_ENQ_SLICE_DFL);
+ scx_dump_event(s, &events, SCX_EV_REFILL_SLICE_DFL);
scx_dump_event(s, &events, SCX_EV_BYPASS_DURATION);
scx_dump_event(s, &events, SCX_EV_BYPASS_DISPATCH);
scx_dump_event(s, &events, SCX_EV_BYPASS_ACTIVATE);
@@ -5157,27 +5332,25 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
spin_unlock_irqrestore(&dump_lock, flags);
}
-static void scx_ops_error_irq_workfn(struct irq_work *irq_work)
+static void scx_error_irq_workfn(struct irq_work *irq_work)
{
- struct scx_exit_info *ei = scx_exit_info;
+ struct scx_sched *sch = container_of(irq_work, struct scx_sched, error_irq_work);
+ struct scx_exit_info *ei = sch->exit_info;
if (ei->kind >= SCX_EXIT_ERROR)
- scx_dump_state(ei, scx_ops.exit_dump_len);
+ scx_dump_state(ei, sch->ops.exit_dump_len);
- schedule_scx_ops_disable_work();
+ kthread_queue_work(sch->helper, &sch->disable_work);
}
-static DEFINE_IRQ_WORK(scx_ops_error_irq_work, scx_ops_error_irq_workfn);
-
-static __printf(3, 4) void scx_ops_exit_kind(enum scx_exit_kind kind,
- s64 exit_code,
- const char *fmt, ...)
+static void scx_vexit(struct scx_sched *sch,
+ enum scx_exit_kind kind, s64 exit_code,
+ const char *fmt, va_list args)
{
- struct scx_exit_info *ei = scx_exit_info;
+ struct scx_exit_info *ei = sch->exit_info;
int none = SCX_EXIT_NONE;
- va_list args;
- if (!atomic_try_cmpxchg(&scx_exit_kind, &none, kind))
+ if (!atomic_try_cmpxchg(&sch->exit_kind, &none, kind))
return;
ei->exit_code = exit_code;
@@ -5185,31 +5358,98 @@ static __printf(3, 4) void scx_ops_exit_kind(enum scx_exit_kind kind,
if (kind >= SCX_EXIT_ERROR)
ei->bt_len = stack_trace_save(ei->bt, SCX_EXIT_BT_LEN, 1);
#endif
- va_start(args, fmt);
vscnprintf(ei->msg, SCX_EXIT_MSG_LEN, fmt, args);
- va_end(args);
/*
* Set ei->kind and ->reason for scx_dump_state(). They'll be set again
- * in scx_ops_disable_workfn().
+ * in scx_disable_workfn().
*/
ei->kind = kind;
ei->reason = scx_exit_reason(ei->kind);
- irq_work_queue(&scx_ops_error_irq_work);
+ irq_work_queue(&sch->error_irq_work);
}
-static struct kthread_worker *scx_create_rt_helper(const char *name)
+static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops)
{
- struct kthread_worker *helper;
+ struct scx_sched *sch;
+ int node, ret;
- helper = kthread_run_worker(0, name);
- if (helper)
- sched_set_fifo(helper->task);
- return helper;
-}
+ sch = kzalloc(sizeof(*sch), GFP_KERNEL);
+ if (!sch)
+ return ERR_PTR(-ENOMEM);
+
+ sch->exit_info = alloc_exit_info(ops->exit_dump_len);
+ if (!sch->exit_info) {
+ ret = -ENOMEM;
+ goto err_free_sch;
+ }
+
+ ret = rhashtable_init(&sch->dsq_hash, &dsq_hash_params);
+ if (ret < 0)
+ goto err_free_ei;
+
+ sch->global_dsqs = kcalloc(nr_node_ids, sizeof(sch->global_dsqs[0]),
+ GFP_KERNEL);
+ if (!sch->global_dsqs) {
+ ret = -ENOMEM;
+ goto err_free_hash;
+ }
+
+ for_each_node_state(node, N_POSSIBLE) {
+ struct scx_dispatch_q *dsq;
+
+ dsq = kzalloc_node(sizeof(*dsq), GFP_KERNEL, node);
+ if (!dsq) {
+ ret = -ENOMEM;
+ goto err_free_gdsqs;
+ }
+
+ init_dsq(dsq, SCX_DSQ_GLOBAL);
+ sch->global_dsqs[node] = dsq;
+ }
-static void check_hotplug_seq(const struct sched_ext_ops *ops)
+ sch->event_stats_cpu = alloc_percpu(struct scx_event_stats);
+ if (!sch->event_stats_cpu)
+ goto err_free_gdsqs;
+
+ sch->helper = kthread_run_worker(0, "sched_ext_helper");
+ if (!sch->helper)
+ goto err_free_event_stats;
+ sched_set_fifo(sch->helper->task);
+
+ atomic_set(&sch->exit_kind, SCX_EXIT_NONE);
+ init_irq_work(&sch->error_irq_work, scx_error_irq_workfn);
+ kthread_init_work(&sch->disable_work, scx_disable_workfn);
+ sch->ops = *ops;
+ ops->priv = sch;
+
+ sch->kobj.kset = scx_kset;
+ ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root");
+ if (ret < 0)
+ goto err_stop_helper;
+
+ return sch;
+
+err_stop_helper:
+ kthread_stop(sch->helper->task);
+err_free_event_stats:
+ free_percpu(sch->event_stats_cpu);
+err_free_gdsqs:
+ for_each_node_state(node, N_POSSIBLE)
+ kfree(sch->global_dsqs[node]);
+ kfree(sch->global_dsqs);
+err_free_hash:
+ rhashtable_free_and_destroy(&sch->dsq_hash, NULL, NULL);
+err_free_ei:
+ free_exit_info(sch->exit_info);
+err_free_sch:
+ kfree(sch);
+ return ERR_PTR(ret);
+}
+
+static void check_hotplug_seq(struct scx_sched *sch,
+ const struct sched_ext_ops *ops)
{
unsigned long long global_hotplug_seq;
@@ -5221,21 +5461,22 @@ static void check_hotplug_seq(const struct sched_ext_ops *ops)
if (ops->hotplug_seq) {
global_hotplug_seq = atomic_long_read(&scx_hotplug_seq);
if (ops->hotplug_seq != global_hotplug_seq) {
- scx_ops_exit(SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG,
- "expected hotplug seq %llu did not match actual %llu",
- ops->hotplug_seq, global_hotplug_seq);
+ scx_exit(sch, SCX_EXIT_UNREG_KERN,
+ SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG,
+ "expected hotplug seq %llu did not match actual %llu",
+ ops->hotplug_seq, global_hotplug_seq);
}
}
}
-static int validate_ops(const struct sched_ext_ops *ops)
+static int validate_ops(struct scx_sched *sch, const struct sched_ext_ops *ops)
{
/*
* It doesn't make sense to specify the SCX_OPS_ENQ_LAST flag if the
* ops.enqueue() callback isn't implemented.
*/
if ((ops->flags & SCX_OPS_ENQ_LAST) && !ops->enqueue) {
- scx_ops_error("SCX_OPS_ENQ_LAST requires ops.enqueue() to be implemented");
+ scx_error(sch, "SCX_OPS_ENQ_LAST requires ops.enqueue() to be implemented");
return -EINVAL;
}
@@ -5245,7 +5486,7 @@ static int validate_ops(const struct sched_ext_ops *ops)
*/
if ((ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE) &&
(ops->update_idle && !(ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE))) {
- scx_ops_error("SCX_OPS_BUILTIN_IDLE_PER_NODE requires CPU idle selection enabled");
+ scx_error(sch, "SCX_OPS_BUILTIN_IDLE_PER_NODE requires CPU idle selection enabled");
return -EINVAL;
}
@@ -5255,12 +5496,13 @@ static int validate_ops(const struct sched_ext_ops *ops)
return 0;
}
-static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
+static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
{
+ struct scx_sched *sch;
struct scx_task_iter sti;
struct task_struct *p;
unsigned long timeout;
- int i, cpu, node, ret;
+ int i, cpu, ret;
if (!cpumask_equal(housekeeping_cpumask(HK_TYPE_DOMAIN),
cpu_possible_mask)) {
@@ -5268,87 +5510,25 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
return -EINVAL;
}
- mutex_lock(&scx_ops_enable_mutex);
-
- /*
- * Clear event counters so a new scx scheduler gets
- * fresh event counter values.
- */
- for_each_possible_cpu(cpu) {
- struct scx_event_stats *e = per_cpu_ptr(&event_stats_cpu, cpu);
- memset(e, 0, sizeof(*e));
- }
-
- if (!scx_ops_helper) {
- WRITE_ONCE(scx_ops_helper,
- scx_create_rt_helper("sched_ext_ops_helper"));
- if (!scx_ops_helper) {
- ret = -ENOMEM;
- goto err_unlock;
- }
- }
-
- if (!global_dsqs) {
- struct scx_dispatch_q **dsqs;
+ mutex_lock(&scx_enable_mutex);
- dsqs = kcalloc(nr_node_ids, sizeof(dsqs[0]), GFP_KERNEL);
- if (!dsqs) {
- ret = -ENOMEM;
- goto err_unlock;
- }
-
- for_each_node_state(node, N_POSSIBLE) {
- struct scx_dispatch_q *dsq;
-
- dsq = kzalloc_node(sizeof(*dsq), GFP_KERNEL, node);
- if (!dsq) {
- for_each_node_state(node, N_POSSIBLE)
- kfree(dsqs[node]);
- kfree(dsqs);
- ret = -ENOMEM;
- goto err_unlock;
- }
-
- init_dsq(dsq, SCX_DSQ_GLOBAL);
- dsqs[node] = dsq;
- }
-
- global_dsqs = dsqs;
- }
-
- if (scx_ops_enable_state() != SCX_OPS_DISABLED) {
+ if (scx_enable_state() != SCX_DISABLED) {
ret = -EBUSY;
goto err_unlock;
}
- scx_root_kobj = kzalloc(sizeof(*scx_root_kobj), GFP_KERNEL);
- if (!scx_root_kobj) {
- ret = -ENOMEM;
+ sch = scx_alloc_and_add_sched(ops);
+ if (IS_ERR(sch)) {
+ ret = PTR_ERR(sch);
goto err_unlock;
}
- scx_root_kobj->kset = scx_kset;
- ret = kobject_init_and_add(scx_root_kobj, &scx_ktype, NULL, "root");
- if (ret < 0)
- goto err;
-
- scx_exit_info = alloc_exit_info(ops->exit_dump_len);
- if (!scx_exit_info) {
- ret = -ENOMEM;
- goto err_del;
- }
-
/*
- * Set scx_ops, transition to ENABLING and clear exit info to arm the
- * disable path. Failure triggers full disabling from here on.
+ * Transition to ENABLING and clear exit info to arm the disable path.
+ * Failure triggers full disabling from here on.
*/
- scx_ops = *ops;
-
- WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_ENABLING) !=
- SCX_OPS_DISABLED);
-
- atomic_set(&scx_exit_kind, SCX_EXIT_NONE);
- scx_warned_zero_slice = false;
+ WARN_ON_ONCE(scx_set_enable_state(SCX_ENABLING) != SCX_DISABLED);
+ WARN_ON_ONCE(scx_root);
atomic_long_set(&scx_nr_rejected, 0);
@@ -5361,28 +5541,34 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
*/
cpus_read_lock();
+ /*
+ * Make the scheduler instance visible. Must be inside cpus_read_lock().
+ * See handle_hotplug().
+ */
+ rcu_assign_pointer(scx_root, sch);
+
scx_idle_enable(ops);
- if (scx_ops.init) {
- ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init, NULL);
+ if (sch->ops.init) {
+ ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, init, NULL);
if (ret) {
- ret = ops_sanitize_err("init", ret);
+ ret = ops_sanitize_err(sch, "init", ret);
cpus_read_unlock();
- scx_ops_error("ops.init() failed (%d)", ret);
+ scx_error(sch, "ops.init() failed (%d)", ret);
goto err_disable;
}
}
for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++)
if (((void (**)(void))ops)[i])
- static_branch_enable_cpuslocked(&scx_has_op[i]);
+ set_bit(i, sch->has_op);
- check_hotplug_seq(ops);
+ check_hotplug_seq(sch, ops);
scx_idle_update_selcpu_topology(ops);
cpus_read_unlock();
- ret = validate_ops(ops);
+ ret = validate_ops(sch, ops);
if (ret)
goto err_disable;
@@ -5407,27 +5593,19 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
scx_watchdog_timeout / 2);
/*
- * Once __scx_ops_enabled is set, %current can be switched to SCX
- * anytime. This can lead to stalls as some BPF schedulers (e.g.
- * userspace scheduling) may not function correctly before all tasks are
- * switched. Init in bypass mode to guarantee forward progress.
+ * Once __scx_enabled is set, %current can be switched to SCX anytime.
+ * This can lead to stalls as some BPF schedulers (e.g. userspace
+ * scheduling) may not function correctly before all tasks are switched.
+ * Init in bypass mode to guarantee forward progress.
*/
- scx_ops_bypass(true);
+ scx_bypass(true);
for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++)
if (((void (**)(void))ops)[i])
- static_branch_enable(&scx_has_op[i]);
-
- if (ops->flags & SCX_OPS_ALLOW_QUEUED_WAKEUP)
- static_branch_enable(&scx_ops_allow_queued_wakeup);
- if (ops->flags & SCX_OPS_ENQ_LAST)
- static_branch_enable(&scx_ops_enq_last);
- if (ops->flags & SCX_OPS_ENQ_EXITING)
- static_branch_enable(&scx_ops_enq_exiting);
- if (ops->flags & SCX_OPS_ENQ_MIGRATION_DISABLED)
- static_branch_enable(&scx_ops_enq_migration_disabled);
- if (scx_ops.cpu_acquire || scx_ops.cpu_release)
- static_branch_enable(&scx_ops_cpu_preempt);
+ set_bit(i, sch->has_op);
+
+ if (sch->ops.cpu_acquire || sch->ops.cpu_release)
+ sch->ops.flags |= SCX_OPS_HAS_CPU_PREEMPT;
/*
* Lock out forks, cgroup on/offlining and moves before opening the
@@ -5435,8 +5613,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
*/
percpu_down_write(&scx_fork_rwsem);
- WARN_ON_ONCE(scx_ops_init_task_enabled);
- scx_ops_init_task_enabled = true;
+ WARN_ON_ONCE(scx_init_task_enabled);
+ scx_init_task_enabled = true;
/*
* Enable ops for every task. Fork is excluded by scx_fork_rwsem
@@ -5445,14 +5623,14 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
* tasks. Prep all tasks first and then enable them with preemption
* disabled.
*
- * All cgroups should be initialized before scx_ops_init_task() so that
- * the BPF scheduler can reliably track each task's cgroup membership
- * from scx_ops_init_task(). Lock out cgroup on/offlining and task
- * migrations while tasks are being initialized so that
- * scx_cgroup_can_attach() never sees uninitialized tasks.
+ * All cgroups should be initialized before scx_init_task() so that the
+ * BPF scheduler can reliably track each task's cgroup membership from
+ * scx_init_task(). Lock out cgroup on/offlining and task migrations
+ * while tasks are being initialized so that scx_cgroup_can_attach()
+ * never sees uninitialized tasks.
*/
scx_cgroup_lock();
- ret = scx_cgroup_init();
+ ret = scx_cgroup_init(sch);
if (ret)
goto err_disable_unlock_all;
@@ -5468,13 +5646,13 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
scx_task_iter_unlock(&sti);
- ret = scx_ops_init_task(p, task_group(p), false);
+ ret = scx_init_task(p, task_group(p), false);
if (ret) {
put_task_struct(p);
scx_task_iter_relock(&sti);
scx_task_iter_stop(&sti);
- scx_ops_error("ops.init_task() failed (%d) for %s[%d]",
- ret, p->comm, p->pid);
+ scx_error(sch, "ops.init_task() failed (%d) for %s[%d]",
+ ret, p->comm, p->pid);
goto err_disable_unlock_all;
}
@@ -5492,7 +5670,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
* all eligible tasks.
*/
WRITE_ONCE(scx_switching_all, !(ops->flags & SCX_OPS_SWITCH_PARTIAL));
- static_branch_enable(&__scx_ops_enabled);
+ static_branch_enable(&__scx_enabled);
/*
* We're fully committed and can't fail. The task READY -> ENABLED
@@ -5523,10 +5701,10 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
scx_task_iter_stop(&sti);
percpu_up_write(&scx_fork_rwsem);
- scx_ops_bypass(false);
+ scx_bypass(false);
- if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLED, SCX_OPS_ENABLING)) {
- WARN_ON_ONCE(atomic_read(&scx_exit_kind) == SCX_EXIT_NONE);
+ if (!scx_tryset_enable_state(SCX_ENABLED, SCX_ENABLING)) {
+ WARN_ON_ONCE(atomic_read(&sch->exit_kind) == SCX_EXIT_NONE);
goto err_disable;
}
@@ -5534,44 +5712,35 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
static_branch_enable(&__scx_switched_all);
pr_info("sched_ext: BPF scheduler \"%s\" enabled%s\n",
- scx_ops.name, scx_switched_all() ? "" : " (partial)");
- kobject_uevent(scx_root_kobj, KOBJ_ADD);
- mutex_unlock(&scx_ops_enable_mutex);
+ sch->ops.name, scx_switched_all() ? "" : " (partial)");
+ kobject_uevent(&sch->kobj, KOBJ_ADD);
+ mutex_unlock(&scx_enable_mutex);
atomic_long_inc(&scx_enable_seq);
return 0;
-err_del:
- kobject_del(scx_root_kobj);
-err:
- kobject_put(scx_root_kobj);
- scx_root_kobj = NULL;
- if (scx_exit_info) {
- free_exit_info(scx_exit_info);
- scx_exit_info = NULL;
- }
err_unlock:
- mutex_unlock(&scx_ops_enable_mutex);
+ mutex_unlock(&scx_enable_mutex);
return ret;
err_disable_unlock_all:
scx_cgroup_unlock();
percpu_up_write(&scx_fork_rwsem);
- scx_ops_bypass(false);
+ scx_bypass(false);
err_disable:
- mutex_unlock(&scx_ops_enable_mutex);
+ mutex_unlock(&scx_enable_mutex);
/*
* Returning an error code here would not pass all the error information
- * to userspace. Record errno using scx_ops_error() for cases
- * scx_ops_error() wasn't already invoked and exit indicating success so
- * that the error is notified through ops.exit() with all the details.
+ * to userspace. Record errno using scx_error() for cases scx_error()
+ * wasn't already invoked and exit indicating success so that the error
+ * is notified through ops.exit() with all the details.
*
- * Flush scx_ops_disable_work to ensure that error is reported before
- * init completion.
+ * Flush scx_disable_work to ensure that error is reported before init
+ * completion. sch's base reference will be put by bpf_scx_unreg().
*/
- scx_ops_error("scx_ops_enable() failed (%d)", ret);
- kthread_flush_work(&scx_ops_disable_work);
+ scx_error(sch, "scx_enable() failed (%d)", ret);
+ kthread_flush_work(&sch->disable_work);
return 0;
}
@@ -5622,21 +5791,8 @@ static int bpf_scx_btf_struct_access(struct bpf_verifier_log *log,
return -EACCES;
}
-static const struct bpf_func_proto *
-bpf_scx_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
-{
- switch (func_id) {
- case BPF_FUNC_task_storage_get:
- return &bpf_task_storage_get_proto;
- case BPF_FUNC_task_storage_delete:
- return &bpf_task_storage_delete_proto;
- default:
- return bpf_base_func_proto(func_id, prog);
- }
-}
-
static const struct bpf_verifier_ops bpf_scx_verifier_ops = {
- .get_func_proto = bpf_scx_get_func_proto,
+ .get_func_proto = bpf_base_func_proto,
.is_valid_access = bpf_scx_is_valid_access,
.btf_struct_access = bpf_scx_btf_struct_access,
};
@@ -5715,13 +5871,17 @@ static int bpf_scx_check_member(const struct btf_type *t,
static int bpf_scx_reg(void *kdata, struct bpf_link *link)
{
- return scx_ops_enable(kdata, link);
+ return scx_enable(kdata, link);
}
static void bpf_scx_unreg(void *kdata, struct bpf_link *link)
{
- scx_ops_disable(SCX_EXIT_UNREG);
- kthread_flush_work(&scx_ops_disable_work);
+ struct sched_ext_ops *ops = kdata;
+ struct scx_sched *sch = ops->priv;
+
+ scx_disable(SCX_EXIT_UNREG);
+ kthread_flush_work(&sch->disable_work);
+ kobject_put(&sch->kobj);
}
static int bpf_scx_init(struct btf *btf)
@@ -5843,10 +6003,7 @@ static struct bpf_struct_ops bpf_sched_ext_ops = {
static void sysrq_handle_sched_ext_reset(u8 key)
{
- if (scx_ops_helper)
- scx_ops_disable(SCX_EXIT_SYSRQ);
- else
- pr_info("sched_ext: BPF scheduler not yet used\n");
+ scx_disable(SCX_EXIT_SYSRQ);
}
static const struct sysrq_key_op sysrq_sched_ext_reset_op = {
@@ -5994,13 +6151,14 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work)
*/
void print_scx_info(const char *log_lvl, struct task_struct *p)
{
- enum scx_ops_enable_state state = scx_ops_enable_state();
+ struct scx_sched *sch = scx_root;
+ enum scx_enable_state state = scx_enable_state();
const char *all = READ_ONCE(scx_switching_all) ? "+all" : "";
char runnable_at_buf[22] = "?";
struct sched_class *class;
unsigned long runnable_at;
- if (state == SCX_OPS_DISABLED)
+ if (state == SCX_DISABLED)
return;
/*
@@ -6009,8 +6167,8 @@ void print_scx_info(const char *log_lvl, struct task_struct *p)
*/
if (copy_from_kernel_nofault(&class, &p->sched_class, sizeof(class)) ||
class != &ext_sched_class) {
- printk("%sSched_ext: %s (%s%s)", log_lvl, scx_ops.name,
- scx_ops_enable_state_str[state], all);
+ printk("%sSched_ext: %s (%s%s)", log_lvl, sch->ops.name,
+ scx_enable_state_str[state], all);
return;
}
@@ -6021,7 +6179,7 @@ void print_scx_info(const char *log_lvl, struct task_struct *p)
/* print everything onto one line to conserve console space */
printk("%sSched_ext: %s (%s%s), task: runnable_at=%s",
- log_lvl, scx_ops.name, scx_ops_enable_state_str[state], all,
+ log_lvl, sch->ops.name, scx_enable_state_str[state], all,
runnable_at_buf);
}
@@ -6037,12 +6195,12 @@ static int scx_pm_handler(struct notifier_block *nb, unsigned long event, void *
case PM_HIBERNATION_PREPARE:
case PM_SUSPEND_PREPARE:
case PM_RESTORE_PREPARE:
- scx_ops_bypass(true);
+ scx_bypass(true);
break;
case PM_POST_HIBERNATION:
case PM_POST_SUSPEND:
case PM_POST_RESTORE:
- scx_ops_bypass(false);
+ scx_bypass(false);
break;
}
@@ -6065,7 +6223,6 @@ void __init init_sched_ext_class(void)
WRITE_ONCE(v, SCX_ENQ_WAKEUP | SCX_DEQ_SLEEP | SCX_KICK_PREEMPT |
SCX_TG_ONLINE);
- BUG_ON(rhashtable_init(&dsq_hash, &dsq_hash_params));
scx_idle_init_masks();
scx_kick_cpus_pnt_seqs =
@@ -6109,12 +6266,12 @@ static bool scx_dsq_insert_preamble(struct task_struct *p, u64 enq_flags)
lockdep_assert_irqs_disabled();
if (unlikely(!p)) {
- scx_ops_error("called with NULL task");
+ scx_kf_error("called with NULL task");
return false;
}
if (unlikely(enq_flags & __SCX_ENQ_INTERNAL_MASK)) {
- scx_ops_error("invalid enq_flags 0x%llx", enq_flags);
+ scx_kf_error("invalid enq_flags 0x%llx", enq_flags);
return false;
}
@@ -6134,7 +6291,7 @@ static void scx_dsq_insert_commit(struct task_struct *p, u64 dsq_id,
}
if (unlikely(dspc->cursor >= scx_dsp_max_batch)) {
- scx_ops_error("dispatch buffer overflow");
+ scx_kf_error("dispatch buffer overflow");
return;
}
@@ -6266,6 +6423,7 @@ static const struct btf_kfunc_id_set scx_kfunc_set_enqueue_dispatch = {
static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit,
struct task_struct *p, u64 dsq_id, u64 enq_flags)
{
+ struct scx_sched *sch = scx_root;
struct scx_dispatch_q *src_dsq = kit->dsq, *dst_dsq;
struct rq *this_rq, *src_rq, *locked_rq;
bool dispatched = false;
@@ -6300,7 +6458,7 @@ static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit,
* cause similar live-lock conditions as consume_dispatch_q(). Insert a
* breather if necessary.
*/
- scx_ops_breather(src_rq);
+ scx_breather(src_rq);
locked_rq = src_rq;
raw_spin_lock(&src_dsq->lock);
@@ -6318,7 +6476,7 @@ static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit,
}
/* @p is still on $src_dsq and stable, determine the destination */
- dst_dsq = find_dsq_for_dispatch(this_rq, dsq_id, p);
+ dst_dsq = find_dsq_for_dispatch(sch, this_rq, dsq_id, p);
/*
* Apply vtime and slice updates before moving so that the new time is
@@ -6331,7 +6489,7 @@ static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit,
p->scx.slice = kit->slice;
/* execute move */
- locked_rq = move_task_between_dsqs(p, enq_flags, src_dsq, dst_dsq);
+ locked_rq = move_task_between_dsqs(sch, p, enq_flags, src_dsq, dst_dsq);
dispatched = true;
out:
if (in_balance) {
@@ -6379,7 +6537,7 @@ __bpf_kfunc void scx_bpf_dispatch_cancel(void)
if (dspc->cursor > 0)
dspc->cursor--;
else
- scx_ops_error("dispatch buffer underflow");
+ scx_kf_error("dispatch buffer underflow");
}
/**
@@ -6398,21 +6556,22 @@ __bpf_kfunc void scx_bpf_dispatch_cancel(void)
*/
__bpf_kfunc bool scx_bpf_dsq_move_to_local(u64 dsq_id)
{
+ struct scx_sched *sch = scx_root;
struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
struct scx_dispatch_q *dsq;
if (!scx_kf_allowed(SCX_KF_DISPATCH))
return false;
- flush_dispatch_buf(dspc->rq);
+ flush_dispatch_buf(sch, dspc->rq);
- dsq = find_user_dsq(dsq_id);
+ dsq = find_user_dsq(sch, dsq_id);
if (unlikely(!dsq)) {
- scx_ops_error("invalid DSQ ID 0x%016llx", dsq_id);
+ scx_error(sch, "invalid DSQ ID 0x%016llx", dsq_id);
return false;
}
- if (consume_dispatch_q(dspc->rq, dsq)) {
+ if (consume_dispatch_q(sch, dspc->rq, dsq)) {
/*
* A successfully consumed task can be dequeued before it starts
* running while the CPU is trying to migrate other dispatched
@@ -6666,10 +6825,36 @@ __bpf_kfunc_start_defs();
*/
__bpf_kfunc s32 scx_bpf_create_dsq(u64 dsq_id, s32 node)
{
+ struct scx_dispatch_q *dsq;
+ struct scx_sched *sch;
+ s32 ret;
+
if (unlikely(node >= (int)nr_node_ids ||
(node < 0 && node != NUMA_NO_NODE)))
return -EINVAL;
- return PTR_ERR_OR_ZERO(create_dsq(dsq_id, node));
+
+ if (unlikely(dsq_id & SCX_DSQ_FLAG_BUILTIN))
+ return -EINVAL;
+
+ dsq = kmalloc_node(sizeof(*dsq), GFP_KERNEL, node);
+ if (!dsq)
+ return -ENOMEM;
+
+ init_dsq(dsq, dsq_id);
+
+ rcu_read_lock();
+
+ sch = rcu_dereference(scx_root);
+ if (sch)
+ ret = rhashtable_lookup_insert_fast(&sch->dsq_hash, &dsq->hash_node,
+ dsq_hash_params);
+ else
+ ret = -ENODEV;
+
+ rcu_read_unlock();
+ if (ret)
+ kfree(dsq);
+ return ret;
}
__bpf_kfunc_end_defs();
@@ -6708,7 +6893,7 @@ __bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags)
struct rq *this_rq;
unsigned long irq_flags;
- if (!ops_cpu_valid(cpu, NULL))
+ if (!kf_cpu_valid(cpu, NULL))
return;
local_irq_save(irq_flags);
@@ -6732,7 +6917,7 @@ __bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags)
struct rq *target_rq = cpu_rq(cpu);
if (unlikely(flags & (SCX_KICK_PREEMPT | SCX_KICK_WAIT)))
- scx_ops_error("PREEMPT/WAIT cannot be used with SCX_KICK_IDLE");
+ scx_kf_error("PREEMPT/WAIT cannot be used with SCX_KICK_IDLE");
if (raw_spin_rq_trylock(target_rq)) {
if (can_skip_idle_kick(target_rq)) {
@@ -6765,23 +6950,30 @@ out:
*/
__bpf_kfunc s32 scx_bpf_dsq_nr_queued(u64 dsq_id)
{
+ struct scx_sched *sch;
struct scx_dispatch_q *dsq;
s32 ret;
preempt_disable();
+ sch = rcu_dereference_sched(scx_root);
+ if (unlikely(!sch)) {
+ ret = -ENODEV;
+ goto out;
+ }
+
if (dsq_id == SCX_DSQ_LOCAL) {
ret = READ_ONCE(this_rq()->scx.local_dsq.nr);
goto out;
} else if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) {
s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK;
- if (ops_cpu_valid(cpu, NULL)) {
+ if (ops_cpu_valid(sch, cpu, NULL)) {
ret = READ_ONCE(cpu_rq(cpu)->scx.local_dsq.nr);
goto out;
}
} else {
- dsq = find_user_dsq(dsq_id);
+ dsq = find_user_dsq(sch, dsq_id);
if (dsq) {
ret = READ_ONCE(dsq->nr);
goto out;
@@ -6804,7 +6996,13 @@ out:
*/
__bpf_kfunc void scx_bpf_destroy_dsq(u64 dsq_id)
{
- destroy_dsq(dsq_id);
+ struct scx_sched *sch;
+
+ rcu_read_lock();
+ sch = rcu_dereference(scx_root);
+ if (sch)
+ destroy_dsq(sch, dsq_id);
+ rcu_read_unlock();
}
/**
@@ -6821,6 +7019,7 @@ __bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id,
u64 flags)
{
struct bpf_iter_scx_dsq_kern *kit = (void *)it;
+ struct scx_sched *sch;
BUILD_BUG_ON(sizeof(struct bpf_iter_scx_dsq_kern) >
sizeof(struct bpf_iter_scx_dsq));
@@ -6833,10 +7032,14 @@ __bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id,
*/
kit->dsq = NULL;
+ sch = rcu_dereference_check(scx_root, rcu_read_lock_bh_held());
+ if (unlikely(!sch))
+ return -ENODEV;
+
if (flags & ~__SCX_DSQ_ITER_USER_FLAGS)
return -EINVAL;
- kit->dsq = find_user_dsq(dsq_id);
+ kit->dsq = find_user_dsq(sch, dsq_id);
if (!kit->dsq)
return -ENOENT;
@@ -6926,21 +7129,20 @@ static s32 __bstr_format(u64 *data_buf, char *line_buf, size_t line_size,
if (data__sz % 8 || data__sz > MAX_BPRINTF_VARARGS * 8 ||
(data__sz && !data)) {
- scx_ops_error("invalid data=%p and data__sz=%u",
- (void *)data, data__sz);
+ scx_kf_error("invalid data=%p and data__sz=%u", (void *)data, data__sz);
return -EINVAL;
}
ret = copy_from_kernel_nofault(data_buf, data, data__sz);
if (ret < 0) {
- scx_ops_error("failed to read data fields (%d)", ret);
+ scx_kf_error("failed to read data fields (%d)", ret);
return ret;
}
ret = bpf_bprintf_prepare(fmt, UINT_MAX, data_buf, data__sz / 8,
&bprintf_data);
if (ret < 0) {
- scx_ops_error("format preparation failed (%d)", ret);
+ scx_kf_error("format preparation failed (%d)", ret);
return ret;
}
@@ -6948,8 +7150,7 @@ static s32 __bstr_format(u64 *data_buf, char *line_buf, size_t line_size,
bprintf_data.bin_args);
bpf_bprintf_cleanup(&bprintf_data);
if (ret < 0) {
- scx_ops_error("(\"%s\", %p, %u) failed to format",
- fmt, data, data__sz);
+ scx_kf_error("(\"%s\", %p, %u) failed to format", fmt, data, data__sz);
return ret;
}
@@ -6982,8 +7183,7 @@ __bpf_kfunc void scx_bpf_exit_bstr(s64 exit_code, char *fmt,
raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags);
if (bstr_format(&scx_exit_bstr_buf, fmt, data, data__sz) >= 0)
- scx_ops_exit_kind(SCX_EXIT_UNREG_BPF, exit_code, "%s",
- scx_exit_bstr_buf.line);
+ scx_kf_exit(SCX_EXIT_UNREG_BPF, exit_code, "%s", scx_exit_bstr_buf.line);
raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags);
}
@@ -7003,8 +7203,7 @@ __bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data,
raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags);
if (bstr_format(&scx_exit_bstr_buf, fmt, data, data__sz) >= 0)
- scx_ops_exit_kind(SCX_EXIT_ERROR_BPF, 0, "%s",
- scx_exit_bstr_buf.line);
+ scx_kf_exit(SCX_EXIT_ERROR_BPF, 0, "%s", scx_exit_bstr_buf.line);
raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags);
}
@@ -7028,7 +7227,7 @@ __bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data,
s32 ret;
if (raw_smp_processor_id() != dd->cpu) {
- scx_ops_error("scx_bpf_dump() must only be called from ops.dump() and friends");
+ scx_kf_error("scx_bpf_dump() must only be called from ops.dump() and friends");
return;
}
@@ -7069,7 +7268,7 @@ __bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data,
*/
__bpf_kfunc u32 scx_bpf_cpuperf_cap(s32 cpu)
{
- if (ops_cpu_valid(cpu, NULL))
+ if (kf_cpu_valid(cpu, NULL))
return arch_scale_cpu_capacity(cpu);
else
return SCX_CPUPERF_ONE;
@@ -7091,7 +7290,7 @@ __bpf_kfunc u32 scx_bpf_cpuperf_cap(s32 cpu)
*/
__bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu)
{
- if (ops_cpu_valid(cpu, NULL))
+ if (kf_cpu_valid(cpu, NULL))
return arch_scale_freq_capacity(cpu);
else
return SCX_CPUPERF_ONE;
@@ -7114,11 +7313,11 @@ __bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu)
__bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf)
{
if (unlikely(perf > SCX_CPUPERF_ONE)) {
- scx_ops_error("Invalid cpuperf target %u for CPU %d", perf, cpu);
+ scx_kf_error("Invalid cpuperf target %u for CPU %d", perf, cpu);
return;
}
- if (ops_cpu_valid(cpu, NULL)) {
+ if (kf_cpu_valid(cpu, NULL)) {
struct rq *rq = cpu_rq(cpu), *locked_rq = scx_locked_rq();
struct rq_flags rf;
@@ -7127,7 +7326,7 @@ __bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf)
* to the corresponding CPU to prevent ABBA deadlocks.
*/
if (locked_rq && rq != locked_rq) {
- scx_ops_error("Invalid target CPU %d", cpu);
+ scx_kf_error("Invalid target CPU %d", cpu);
return;
}
@@ -7222,7 +7421,7 @@ __bpf_kfunc s32 scx_bpf_task_cpu(const struct task_struct *p)
*/
__bpf_kfunc struct rq *scx_bpf_cpu_rq(s32 cpu)
{
- if (!ops_cpu_valid(cpu, NULL))
+ if (!kf_cpu_valid(cpu, NULL))
return NULL;
return cpu_rq(cpu);
@@ -7318,6 +7517,27 @@ __bpf_kfunc u64 scx_bpf_now(void)
return clock;
}
+static void scx_read_events(struct scx_sched *sch, struct scx_event_stats *events)
+{
+ struct scx_event_stats *e_cpu;
+ int cpu;
+
+ /* Aggregate per-CPU event counters into @events. */
+ memset(events, 0, sizeof(*events));
+ for_each_possible_cpu(cpu) {
+ e_cpu = per_cpu_ptr(sch->event_stats_cpu, cpu);
+ scx_agg_event(events, e_cpu, SCX_EV_SELECT_CPU_FALLBACK);
+ scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
+ scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_KEEP_LAST);
+ scx_agg_event(events, e_cpu, SCX_EV_ENQ_SKIP_EXITING);
+ scx_agg_event(events, e_cpu, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED);
+ scx_agg_event(events, e_cpu, SCX_EV_REFILL_SLICE_DFL);
+ scx_agg_event(events, e_cpu, SCX_EV_BYPASS_DURATION);
+ scx_agg_event(events, e_cpu, SCX_EV_BYPASS_DISPATCH);
+ scx_agg_event(events, e_cpu, SCX_EV_BYPASS_ACTIVATE);
+ }
+}
+
/*
 * scx_bpf_events - Get the system-wide event counters
* @events: output buffer from a BPF program
@@ -7326,23 +7546,16 @@ __bpf_kfunc u64 scx_bpf_now(void)
__bpf_kfunc void scx_bpf_events(struct scx_event_stats *events,
size_t events__sz)
{
- struct scx_event_stats e_sys, *e_cpu;
- int cpu;
+ struct scx_sched *sch;
+ struct scx_event_stats e_sys;
- /* Aggregate per-CPU event counters into the system-wide counters. */
- memset(&e_sys, 0, sizeof(e_sys));
- for_each_possible_cpu(cpu) {
- e_cpu = per_cpu_ptr(&event_stats_cpu, cpu);
- scx_agg_event(&e_sys, e_cpu, SCX_EV_SELECT_CPU_FALLBACK);
- scx_agg_event(&e_sys, e_cpu, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
- scx_agg_event(&e_sys, e_cpu, SCX_EV_DISPATCH_KEEP_LAST);
- scx_agg_event(&e_sys, e_cpu, SCX_EV_ENQ_SKIP_EXITING);
- scx_agg_event(&e_sys, e_cpu, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED);
- scx_agg_event(&e_sys, e_cpu, SCX_EV_ENQ_SLICE_DFL);
- scx_agg_event(&e_sys, e_cpu, SCX_EV_BYPASS_DURATION);
- scx_agg_event(&e_sys, e_cpu, SCX_EV_BYPASS_DISPATCH);
- scx_agg_event(&e_sys, e_cpu, SCX_EV_BYPASS_ACTIVATE);
- }
+ rcu_read_lock();
+ sch = rcu_dereference(scx_root);
+ if (sch)
+ scx_read_events(sch, &e_sys);
+ else
+ memset(&e_sys, 0, sizeof(e_sys));
+ rcu_read_unlock();
/*
* We cannot entirely trust a BPF-provided size since a BPF program
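
As a rough illustration of the reworked kfunc surface above (not part of this patch), a BPF scheduler might create a user DSQ at init time and read the aggregated event counters as sketched below. Only the scx_bpf_create_dsq() and scx_bpf_events() signatures come from the code above; MY_DSQ_ID, the function names and the surrounding sched_ext BPF scaffolding (vmlinux.h, common.bpf.h) are assumptions.

#define MY_DSQ_ID	0x100	/* hypothetical user DSQ ID, no builtin flag bit */

/* typically called from ops.init() */
static s32 example_create_dsq(void)
{
	/* 0 on success, -errno on failure (e.g. invalid or duplicate dsq_id) */
	return scx_bpf_create_dsq(MY_DSQ_ID, -1 /* NUMA_NO_NODE */);
}

/* dump the system-wide counters aggregated by scx_read_events() */
static void example_dump_events(void)
{
	struct scx_event_stats ev;

	/* the kernel copies at most min(sizeof(ev), events__sz) bytes */
	scx_bpf_events(&ev, sizeof(ev));
	bpf_printk("bypass activations: %lld", ev.SCX_EV_BYPASS_ACTIVATE);
}
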
diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h
index 1bda96b19a1b..6e5072f57771 100644
--- a/kernel/sched/ext.h
+++ b/kernel/sched/ext.h
@@ -8,6 +8,11 @@
*/
#ifdef CONFIG_SCHED_CLASS_EXT
+static inline bool scx_kf_allowed_if_unlocked(void)
+{
+ return !current->scx.kf_mask;
+}
+
DECLARE_STATIC_KEY_FALSE(scx_ops_allow_queued_wakeup);
void scx_tick(struct rq *rq);
@@ -21,6 +26,7 @@ void scx_rq_activate(struct rq *rq);
void scx_rq_deactivate(struct rq *rq);
int scx_check_setscheduler(struct task_struct *p, int policy);
bool task_should_scx(int policy);
+bool scx_allow_ttwu_queue(const struct task_struct *p);
void init_sched_ext_class(void);
static inline u32 scx_cpuperf_target(s32 cpu)
@@ -36,13 +42,6 @@ static inline bool task_on_scx(const struct task_struct *p)
return scx_enabled() && p->sched_class == &ext_sched_class;
}
-static inline bool scx_allow_ttwu_queue(const struct task_struct *p)
-{
- return !scx_enabled() ||
- static_branch_likely(&scx_ops_allow_queued_wakeup) ||
- p->sched_class != &ext_sched_class;
-}
-
#ifdef CONFIG_SCHED_CORE
bool scx_prio_less(const struct task_struct *a, const struct task_struct *b,
bool in_fi);
diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c
index e67a19a071c1..66da03cc0b33 100644
--- a/kernel/sched/ext_idle.c
+++ b/kernel/sched/ext_idle.c
@@ -47,6 +47,13 @@ static struct scx_idle_cpus scx_idle_global_masks;
static struct scx_idle_cpus **scx_idle_node_masks;
/*
+ * Local per-CPU cpumasks (used to generate temporary idle cpumasks).
+ */
+static DEFINE_PER_CPU(cpumask_var_t, local_idle_cpumask);
+static DEFINE_PER_CPU(cpumask_var_t, local_llc_idle_cpumask);
+static DEFINE_PER_CPU(cpumask_var_t, local_numa_idle_cpumask);
+
+/*
* Return the idle masks associated to a target @node.
*
* NUMA_NO_NODE identifies the global idle cpumask.
@@ -392,6 +399,14 @@ void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops)
}
/*
+ * Return true if @p can run on all possible CPUs, false otherwise.
+ */
+static inline bool task_affinity_all(const struct task_struct *p)
+{
+ return p->nr_cpus_allowed >= num_possible_cpus();
+}
+
+/*
* Built-in CPU idle selection policy:
*
* 1. Prioritize full-idle cores:
@@ -403,13 +418,15 @@ void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops)
* branch prediction optimizations.
*
* 3. Pick a CPU within the same LLC (Last-Level Cache):
- * - if the above conditions aren't met, pick a CPU that shares the same LLC
- * to maintain cache locality.
+ * - if the above conditions aren't met, pick a CPU that shares the same
+ * LLC, if the LLC domain is a subset of @cpus_allowed, to maintain
+ * cache locality.
*
* 4. Pick a CPU within the same NUMA node, if enabled:
- * - choose a CPU from the same NUMA node to reduce memory access latency.
+ * - choose a CPU from the same NUMA node, if the node cpumask is a
+ * subset of @cpus_allowed, to reduce memory access latency.
*
- * 5. Pick any idle CPU usable by the task.
+ * 5. Pick any idle CPU within the @cpus_allowed domain.
*
* Step 3 and 4 are performed only if the system has, respectively,
* multiple LLCs / multiple NUMA nodes (see scx_selcpu_topo_llc and
@@ -424,35 +441,77 @@ void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops)
* NOTE: tasks that can only run on 1 CPU are excluded by this logic, because
* we never call ops.select_cpu() for them, see select_task_rq().
*/
-s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64 flags)
+s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
+ const struct cpumask *cpus_allowed, u64 flags)
{
- const struct cpumask *llc_cpus = NULL;
- const struct cpumask *numa_cpus = NULL;
+ const struct cpumask *llc_cpus = NULL, *numa_cpus = NULL;
+ const struct cpumask *allowed = cpus_allowed ?: p->cpus_ptr;
int node = scx_cpu_node_if_enabled(prev_cpu);
s32 cpu;
+ preempt_disable();
+
+ /*
+ * Determine the subset of CPUs usable by @p within @cpus_allowed.
+ */
+ if (allowed != p->cpus_ptr) {
+ struct cpumask *local_cpus = this_cpu_cpumask_var_ptr(local_idle_cpumask);
+
+ if (task_affinity_all(p)) {
+ allowed = cpus_allowed;
+ } else if (cpumask_and(local_cpus, cpus_allowed, p->cpus_ptr)) {
+ allowed = local_cpus;
+ } else {
+ cpu = -EBUSY;
+ goto out_enable;
+ }
+
+ /*
+ * If @prev_cpu is not in the allowed CPUs, skip topology
+ * optimizations and try to pick any idle CPU usable by the
+ * task.
+ *
+ * If %SCX_OPS_BUILTIN_IDLE_PER_NODE is enabled, prioritize
+ * the current node, as it may optimize some waker->wakee
+ * workloads.
+ */
+ if (!cpumask_test_cpu(prev_cpu, allowed)) {
+ node = scx_cpu_node_if_enabled(smp_processor_id());
+ cpu = scx_pick_idle_cpu(allowed, node, flags);
+ goto out_enable;
+ }
+ }
+
/*
* This is necessary to protect llc_cpus.
*/
rcu_read_lock();
/*
- * Determine the scheduling domain only if the task is allowed to run
- * on all CPUs.
+ * Determine the subset of CPUs that the task can use in its
+ * current LLC and node.
*
- * This is done primarily for efficiency, as it avoids the overhead of
- * updating a cpumask every time we need to select an idle CPU (which
- * can be costly in large SMP systems), but it also aligns logically:
- * if a task's scheduling domain is restricted by user-space (through
- * CPU affinity), the task will simply use the flat scheduling domain
- * defined by user-space.
+ * If the task can run on all CPUs, use the node and LLC cpumasks
+ * directly.
*/
- if (p->nr_cpus_allowed >= num_possible_cpus()) {
- if (static_branch_maybe(CONFIG_NUMA, &scx_selcpu_topo_numa))
- numa_cpus = numa_span(prev_cpu);
+ if (static_branch_maybe(CONFIG_NUMA, &scx_selcpu_topo_numa)) {
+ struct cpumask *local_cpus = this_cpu_cpumask_var_ptr(local_numa_idle_cpumask);
+ const struct cpumask *cpus = numa_span(prev_cpu);
+
+ if (allowed == p->cpus_ptr && task_affinity_all(p))
+ numa_cpus = cpus;
+ else if (cpus && cpumask_and(local_cpus, allowed, cpus))
+ numa_cpus = local_cpus;
+ }
+
+ if (static_branch_maybe(CONFIG_SCHED_MC, &scx_selcpu_topo_llc)) {
+ struct cpumask *local_cpus = this_cpu_cpumask_var_ptr(local_llc_idle_cpumask);
+ const struct cpumask *cpus = llc_span(prev_cpu);
- if (static_branch_maybe(CONFIG_SCHED_MC, &scx_selcpu_topo_llc))
- llc_cpus = llc_span(prev_cpu);
+ if (allowed == p->cpus_ptr && task_affinity_all(p))
+ llc_cpus = cpus;
+ else if (cpus && cpumask_and(local_cpus, allowed, cpus))
+ llc_cpus = local_cpus;
}
/*
@@ -490,7 +549,7 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64
cpu_rq(cpu)->scx.local_dsq.nr == 0 &&
(!(flags & SCX_PICK_IDLE_IN_NODE) || (waker_node == node)) &&
!cpumask_empty(idle_cpumask(waker_node)->cpu)) {
- if (cpumask_test_cpu(cpu, p->cpus_ptr))
+ if (cpumask_test_cpu(cpu, allowed))
goto out_unlock;
}
}
@@ -535,7 +594,7 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64
* begin in prev_cpu's node and proceed to other nodes in
* order of increasing distance.
*/
- cpu = scx_pick_idle_cpu(p->cpus_ptr, node, flags | SCX_PICK_IDLE_CORE);
+ cpu = scx_pick_idle_cpu(allowed, node, flags | SCX_PICK_IDLE_CORE);
if (cpu >= 0)
goto out_unlock;
@@ -583,10 +642,12 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64
* in prev_cpu's node and proceed to other nodes in order of
* increasing distance.
*/
- cpu = scx_pick_idle_cpu(p->cpus_ptr, node, flags);
+ cpu = scx_pick_idle_cpu(allowed, node, flags);
out_unlock:
rcu_read_unlock();
+out_enable:
+ preempt_enable();
return cpu;
}
@@ -596,7 +657,7 @@ out_unlock:
*/
void scx_idle_init_masks(void)
{
- int node;
+ int i;
/* Allocate global idle cpumasks */
BUG_ON(!alloc_cpumask_var(&scx_idle_global_masks.cpu, GFP_KERNEL));
@@ -607,13 +668,23 @@ void scx_idle_init_masks(void)
sizeof(*scx_idle_node_masks), GFP_KERNEL);
BUG_ON(!scx_idle_node_masks);
- for_each_node(node) {
- scx_idle_node_masks[node] = kzalloc_node(sizeof(**scx_idle_node_masks),
- GFP_KERNEL, node);
- BUG_ON(!scx_idle_node_masks[node]);
+ for_each_node(i) {
+ scx_idle_node_masks[i] = kzalloc_node(sizeof(**scx_idle_node_masks),
+ GFP_KERNEL, i);
+ BUG_ON(!scx_idle_node_masks[i]);
+
+ BUG_ON(!alloc_cpumask_var_node(&scx_idle_node_masks[i]->cpu, GFP_KERNEL, i));
+ BUG_ON(!alloc_cpumask_var_node(&scx_idle_node_masks[i]->smt, GFP_KERNEL, i));
+ }
- BUG_ON(!alloc_cpumask_var_node(&scx_idle_node_masks[node]->cpu, GFP_KERNEL, node));
- BUG_ON(!alloc_cpumask_var_node(&scx_idle_node_masks[node]->smt, GFP_KERNEL, node));
+ /* Allocate local per-cpu idle cpumasks */
+ for_each_possible_cpu(i) {
+ BUG_ON(!alloc_cpumask_var_node(&per_cpu(local_idle_cpumask, i),
+ GFP_KERNEL, cpu_to_node(i)));
+ BUG_ON(!alloc_cpumask_var_node(&per_cpu(local_llc_idle_cpumask, i),
+ GFP_KERNEL, cpu_to_node(i)));
+ BUG_ON(!alloc_cpumask_var_node(&per_cpu(local_numa_idle_cpumask, i),
+ GFP_KERNEL, cpu_to_node(i)));
}
}
@@ -662,21 +733,12 @@ static void update_builtin_idle(int cpu, bool idle)
*/
void __scx_update_idle(struct rq *rq, bool idle, bool do_notify)
{
+ struct scx_sched *sch = scx_root;
int cpu = cpu_of(rq);
lockdep_assert_rq_held(rq);
/*
- * Trigger ops.update_idle() only when transitioning from a task to
- * the idle thread and vice versa.
- *
- * Idle transitions are indicated by do_notify being set to true,
- * managed by put_prev_task_idle()/set_next_task_idle().
- */
- if (SCX_HAS_OP(update_idle) && do_notify && !scx_rq_bypassing(rq))
- SCX_CALL_OP(SCX_KF_REST, update_idle, rq, cpu_of(rq), idle);
-
- /*
* Update the idle masks:
* - for real idle transitions (do_notify == true)
* - for idle-to-idle transitions (indicated by the previous task
@@ -693,6 +755,21 @@ void __scx_update_idle(struct rq *rq, bool idle, bool do_notify)
if (static_branch_likely(&scx_builtin_idle_enabled))
if (do_notify || is_idle_task(rq->curr))
update_builtin_idle(cpu, idle);
+
+ /*
+ * Trigger ops.update_idle() only when transitioning from a task to
+ * the idle thread and vice versa.
+ *
+ * Idle transitions are indicated by do_notify being set to true,
+ * managed by put_prev_task_idle()/set_next_task_idle().
+ *
+ * This must come after builtin idle update so that BPF schedulers can
+ * create interlocking between ops.update_idle() and ops.enqueue() -
+ * either enqueue() sees the idle bit or update_idle() sees the task
+ * that enqueue() queued.
+ */
+ if (SCX_HAS_OP(sch, update_idle) && do_notify && !scx_rq_bypassing(rq))
+ SCX_CALL_OP(sch, SCX_KF_REST, update_idle, rq, cpu_of(rq), idle);
}
static void reset_idle_masks(struct sched_ext_ops *ops)
@@ -748,7 +825,7 @@ void scx_idle_disable(void)
static int validate_node(int node)
{
if (!static_branch_likely(&scx_builtin_idle_per_node)) {
- scx_ops_error("per-node idle tracking is disabled");
+ scx_kf_error("per-node idle tracking is disabled");
return -EOPNOTSUPP;
}
@@ -758,13 +835,13 @@ static int validate_node(int node)
/* Make sure node is in a valid range */
if (node < 0 || node >= nr_node_ids) {
- scx_ops_error("invalid node %d", node);
+ scx_kf_error("invalid node %d", node);
return -EINVAL;
}
/* Make sure the node is part of the set of possible nodes */
if (!node_possible(node)) {
- scx_ops_error("unavailable node %d", node);
+ scx_kf_error("unavailable node %d", node);
return -EINVAL;
}
@@ -778,10 +855,72 @@ static bool check_builtin_idle_enabled(void)
if (static_branch_likely(&scx_builtin_idle_enabled))
return true;
- scx_ops_error("built-in idle tracking is disabled");
+ scx_kf_error("built-in idle tracking is disabled");
return false;
}
+s32 select_cpu_from_kfunc(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
+ const struct cpumask *allowed, u64 flags)
+{
+ struct rq *rq;
+ struct rq_flags rf;
+ s32 cpu;
+
+ if (!kf_cpu_valid(prev_cpu, NULL))
+ return -EINVAL;
+
+ if (!check_builtin_idle_enabled())
+ return -EBUSY;
+
+ /*
+ * If called from an unlocked context, acquire the task's rq lock,
+ * so that we can safely access p->cpus_ptr and p->nr_cpus_allowed.
+ *
+	 * Otherwise, allow this kfunc to be used only from ops.select_cpu()
+	 * and ops.enqueue().
+ */
+ if (scx_kf_allowed_if_unlocked()) {
+ rq = task_rq_lock(p, &rf);
+ } else {
+ if (!scx_kf_allowed(SCX_KF_SELECT_CPU | SCX_KF_ENQUEUE))
+ return -EPERM;
+ rq = scx_locked_rq();
+ }
+
+ /*
+ * Validate locking correctness to access p->cpus_ptr and
+ * p->nr_cpus_allowed: if we're holding an rq lock, we're safe;
+ * otherwise, assert that p->pi_lock is held.
+ */
+ if (!rq)
+ lockdep_assert_held(&p->pi_lock);
+
+#ifdef CONFIG_SMP
+ /*
+ * This may also be called from ops.enqueue(), so we need to handle
+ * per-CPU tasks as well. For these tasks, we can skip all idle CPU
+ * selection optimizations and simply check whether the previously
+ * used CPU is idle and within the allowed cpumask.
+ */
+ if (p->nr_cpus_allowed == 1) {
+ if (cpumask_test_cpu(prev_cpu, allowed ?: p->cpus_ptr) &&
+ scx_idle_test_and_clear_cpu(prev_cpu))
+ cpu = prev_cpu;
+ else
+ cpu = -EBUSY;
+ } else {
+ cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags,
+ allowed ?: p->cpus_ptr, flags);
+ }
+#else
+ cpu = -EBUSY;
+#endif
+ if (scx_kf_allowed_if_unlocked())
+ task_rq_unlock(rq, p, &rf);
+
+ return cpu;
+}
+
/**
* scx_bpf_cpu_node - Return the NUMA node the given @cpu belongs to, or
* trigger an error if @cpu is invalid
@@ -790,7 +929,7 @@ static bool check_builtin_idle_enabled(void)
__bpf_kfunc int scx_bpf_cpu_node(s32 cpu)
{
#ifdef CONFIG_NUMA
- if (!ops_cpu_valid(cpu, NULL))
+ if (!kf_cpu_valid(cpu, NULL))
return NUMA_NO_NODE;
return cpu_to_node(cpu);
@@ -806,9 +945,10 @@ __bpf_kfunc int scx_bpf_cpu_node(s32 cpu)
* @wake_flags: %SCX_WAKE_* flags
* @is_idle: out parameter indicating whether the returned CPU is idle
*
- * Can only be called from ops.select_cpu() if the built-in CPU selection is
- * enabled - ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE is set.
- * @p, @prev_cpu and @wake_flags match ops.select_cpu().
+ * Can be called from ops.select_cpu(), ops.enqueue(), or from an unlocked
+ * context such as a BPF test_run() call, as long as built-in CPU selection
+ * is enabled: ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE
+ * is set.
*
* Returns the picked CPU with *@is_idle indicating whether the picked CPU is
* currently idle and thus a good candidate for direct dispatching.
@@ -816,39 +956,52 @@ __bpf_kfunc int scx_bpf_cpu_node(s32 cpu)
__bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
u64 wake_flags, bool *is_idle)
{
-#ifdef CONFIG_SMP
s32 cpu;
-#endif
- if (!ops_cpu_valid(prev_cpu, NULL))
- goto prev_cpu;
- if (!check_builtin_idle_enabled())
- goto prev_cpu;
-
- if (!scx_kf_allowed(SCX_KF_SELECT_CPU))
- goto prev_cpu;
-
-#ifdef CONFIG_SMP
- cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, 0);
+ cpu = select_cpu_from_kfunc(p, prev_cpu, wake_flags, NULL, 0);
if (cpu >= 0) {
*is_idle = true;
return cpu;
}
-#endif
-
-prev_cpu:
*is_idle = false;
+
return prev_cpu;
}
/**
+ * scx_bpf_select_cpu_and - Pick an idle CPU usable by task @p,
+ * prioritizing those in @cpus_allowed
+ * @p: task_struct to select a CPU for
+ * @prev_cpu: CPU @p was on previously
+ * @wake_flags: %SCX_WAKE_* flags
+ * @cpus_allowed: cpumask of allowed CPUs
+ * @flags: %SCX_PICK_IDLE* flags
+ *
+ * Can be called from ops.select_cpu(), ops.enqueue(), or from an unlocked
+ * context such as a BPF test_run() call, as long as built-in CPU selection
+ * is enabled: ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE
+ * is set.
+ *
+ * @p, @prev_cpu and @wake_flags match ops.select_cpu().
+ *
+ * Returns the selected idle CPU, which will be automatically awakened upon
+ * returning from ops.select_cpu() and can be used for direct dispatch, or
+ * a negative value if no idle CPU is available.
+ */
+__bpf_kfunc s32 scx_bpf_select_cpu_and(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
+ const struct cpumask *cpus_allowed, u64 flags)
+{
+ return select_cpu_from_kfunc(p, prev_cpu, wake_flags, cpus_allowed, flags);
+}
+
+/**
* scx_bpf_get_idle_cpumask_node - Get a referenced kptr to the
* idle-tracking per-CPU cpumask of a target NUMA node.
* @node: target NUMA node
*
* Returns an empty cpumask if idle tracking is not enabled, if @node is
* not valid, or running on a UP kernel. In this case the actual error will
- * be reported to the BPF scheduler via scx_ops_error().
+ * be reported to the BPF scheduler via scx_error().
*/
__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask_node(int node)
{
@@ -873,7 +1026,7 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask_node(int node)
__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void)
{
if (static_branch_unlikely(&scx_builtin_idle_per_node)) {
- scx_ops_error("SCX_OPS_BUILTIN_IDLE_PER_NODE enabled");
+ scx_kf_error("SCX_OPS_BUILTIN_IDLE_PER_NODE enabled");
return cpu_none_mask;
}
@@ -895,7 +1048,7 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void)
*
* Returns an empty cpumask if idle tracking is not enabled, if @node is
* not valid, or running on a UP kernel. In this case the actual error will
- * be reported to the BPF scheduler via scx_ops_error().
+ * be reported to the BPF scheduler via scx_error().
*/
__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask_node(int node)
{
@@ -924,7 +1077,7 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask_node(int node)
__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(void)
{
if (static_branch_unlikely(&scx_builtin_idle_per_node)) {
- scx_ops_error("SCX_OPS_BUILTIN_IDLE_PER_NODE enabled");
+ scx_kf_error("SCX_OPS_BUILTIN_IDLE_PER_NODE enabled");
return cpu_none_mask;
}
@@ -971,7 +1124,7 @@ __bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu)
if (!check_builtin_idle_enabled())
return false;
- if (ops_cpu_valid(cpu, NULL))
+ if (kf_cpu_valid(cpu, NULL))
return scx_idle_test_and_clear_cpu(cpu);
else
return false;
@@ -1032,7 +1185,7 @@ __bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed,
u64 flags)
{
if (static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node)) {
- scx_ops_error("per-node idle tracking is enabled");
+ scx_kf_error("per-node idle tracking is enabled");
return -EBUSY;
}
@@ -1109,7 +1262,7 @@ __bpf_kfunc s32 scx_bpf_pick_any_cpu(const struct cpumask *cpus_allowed,
s32 cpu;
if (static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node)) {
- scx_ops_error("per-node idle tracking is enabled");
+ scx_kf_error("per-node idle tracking is enabled");
return -EBUSY;
}
@@ -1140,6 +1293,8 @@ BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu_node, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu_node, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_select_cpu_and, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_RCU)
BTF_KFUNCS_END(scx_kfunc_ids_idle)
static const struct btf_kfunc_id_set scx_kfunc_set_idle = {
@@ -1147,21 +1302,11 @@ static const struct btf_kfunc_id_set scx_kfunc_set_idle = {
.set = &scx_kfunc_ids_idle,
};
-BTF_KFUNCS_START(scx_kfunc_ids_select_cpu)
-BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_RCU)
-BTF_KFUNCS_END(scx_kfunc_ids_select_cpu)
-
-static const struct btf_kfunc_id_set scx_kfunc_set_select_cpu = {
- .owner = THIS_MODULE,
- .set = &scx_kfunc_ids_select_cpu,
-};
-
int scx_idle_init(void)
{
int ret;
- ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_select_cpu) ||
- register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_idle) ||
+ ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_idle) ||
register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &scx_kfunc_set_idle) ||
register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &scx_kfunc_set_idle);
diff --git a/kernel/sched/ext_idle.h b/kernel/sched/ext_idle.h
index 511cc2221f7a..37be78a7502b 100644
--- a/kernel/sched/ext_idle.h
+++ b/kernel/sched/ext_idle.h
@@ -27,7 +27,8 @@ static inline s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, int node
}
#endif /* CONFIG_SMP */
-s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64 flags);
+s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
+ const struct cpumask *cpus_allowed, u64 flags);
void scx_idle_enable(struct sched_ext_ops *ops);
void scx_idle_disable(void);
int scx_idle_init(void);
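
As a rough illustration of the new scx_bpf_select_cpu_and() interface (not part of this patch), an ops.select_cpu() implementation might look like the sketch below. The BPF_STRUCT_OPS() macro and the direct-dispatch pattern are assumptions borrowed from the usual sched_ext BPF scaffolding; only the kfunc signature comes from the patch.

s32 BPF_STRUCT_OPS(example_select_cpu, struct task_struct *p,
		   s32 prev_cpu, u64 wake_flags)
{
	s32 cpu;

	/*
	 * A real scheduler would typically pass its own cpumask as the
	 * fourth argument; p->cpus_ptr keeps the sketch self-contained.
	 * A negative return value means no idle CPU was found.
	 */
	cpu = scx_bpf_select_cpu_and(p, prev_cpu, wake_flags, p->cpus_ptr, 0);
	if (cpu >= 0) {
		/* idle CPU found: dispatch directly to its local DSQ */
		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
		return cpu;
	}

	return prev_cpu;
}
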
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c5a6a503eb6d..475bb5998295 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1736,10 +1736,10 @@ extern struct balance_callback balance_push_callback;
#ifdef CONFIG_SCHED_CLASS_EXT
extern const struct sched_class ext_sched_class;
-DECLARE_STATIC_KEY_FALSE(__scx_ops_enabled); /* SCX BPF scheduler loaded */
+DECLARE_STATIC_KEY_FALSE(__scx_enabled); /* SCX BPF scheduler loaded */
DECLARE_STATIC_KEY_FALSE(__scx_switched_all); /* all fair class tasks on SCX */
-#define scx_enabled() static_branch_unlikely(&__scx_ops_enabled)
+#define scx_enabled() static_branch_unlikely(&__scx_enabled)
#define scx_switched_all() static_branch_unlikely(&__scx_switched_all)
static inline void scx_rq_clock_update(struct rq *rq, u64 clock)
@@ -3535,8 +3535,6 @@ static inline bool sched_energy_enabled(void)
return static_branch_unlikely(&sched_energy_present);
}
-extern struct cpufreq_governor schedutil_gov;
-
#else /* ! (CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */
#define perf_domain_span(pd) NULL
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index a2a38e1b6f18..b958fe48e020 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -212,8 +212,6 @@ static bool sched_energy_update;
static bool sched_is_eas_possible(const struct cpumask *cpu_mask)
{
bool any_asym_capacity = false;
- struct cpufreq_policy *policy;
- struct cpufreq_governor *gov;
int i;
/* EAS is enabled for asymmetric CPU capacity topologies. */
@@ -248,25 +246,12 @@ static bool sched_is_eas_possible(const struct cpumask *cpu_mask)
return false;
}
- /* Do not attempt EAS if schedutil is not being used. */
- for_each_cpu(i, cpu_mask) {
- policy = cpufreq_cpu_get(i);
- if (!policy) {
- if (sched_debug()) {
- pr_info("rd %*pbl: Checking EAS, cpufreq policy not set for CPU: %d",
- cpumask_pr_args(cpu_mask), i);
- }
- return false;
- }
- gov = policy->governor;
- cpufreq_cpu_put(policy);
- if (gov != &schedutil_gov) {
- if (sched_debug()) {
- pr_info("rd %*pbl: Checking EAS, schedutil is mandatory\n",
- cpumask_pr_args(cpu_mask));
- }
- return false;
+ if (!cpufreq_ready_for_eas(cpu_mask)) {
+ if (sched_debug()) {
+ pr_info("rd %*pbl: Checking EAS: cpufreq is not ready\n",
+ cpumask_pr_args(cpu_mask));
}
+ return false;
}
return true;
diff --git a/kernel/signal.c b/kernel/signal.c
index f8859faa26c5..148082db9a55 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -4981,9 +4981,20 @@ static const struct ctl_table signal_debug_table[] = {
#endif
};
+static const struct ctl_table signal_table[] = {
+ {
+ .procname = "print-fatal-signals",
+ .data = &print_fatal_signals,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+};
+
static int __init init_signal_sysctls(void)
{
register_sysctl_init("debug", signal_debug_table);
+ register_sysctl_init("kernel", signal_table);
return 0;
}
early_initcall(init_signal_sysctls);
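
The hunk above follows the usual pattern of moving a knob out of the central kern_table into a small subsystem-local table registered with register_sysctl_init(). A minimal sketch of the same pattern for a hypothetical knob (all names illustrative):

#include <linux/sysctl.h>

static int example_knob;

static const struct ctl_table example_table[] = {
	{
		.procname	= "example-knob",
		.data		= &example_knob,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
};

static int __init init_example_sysctls(void)
{
	/* appears as /proc/sys/kernel/example-knob */
	register_sysctl_init("kernel", example_table);
	return 0;
}
early_initcall(init_example_sysctls);
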
diff --git a/kernel/sysctl-test.c b/kernel/sysctl-test.c
index eb2842bd0557..92f94ea28957 100644
--- a/kernel/sysctl-test.c
+++ b/kernel/sysctl-test.c
@@ -367,54 +367,6 @@ static void sysctl_test_api_dointvec_write_single_greater_int_max(
KUNIT_EXPECT_EQ(test, 0, *((int *)table.data));
}
-/*
- * Test that registering an invalid extra value is not allowed.
- */
-static void sysctl_test_register_sysctl_sz_invalid_extra_value(
- struct kunit *test)
-{
- unsigned char data = 0;
- const struct ctl_table table_foo[] = {
- {
- .procname = "foo",
- .data = &data,
- .maxlen = sizeof(u8),
- .mode = 0644,
- .proc_handler = proc_dou8vec_minmax,
- .extra1 = SYSCTL_FOUR,
- .extra2 = SYSCTL_ONE_THOUSAND,
- },
- };
-
- const struct ctl_table table_bar[] = {
- {
- .procname = "bar",
- .data = &data,
- .maxlen = sizeof(u8),
- .mode = 0644,
- .proc_handler = proc_dou8vec_minmax,
- .extra1 = SYSCTL_NEG_ONE,
- .extra2 = SYSCTL_ONE_HUNDRED,
- },
- };
-
- const struct ctl_table table_qux[] = {
- {
- .procname = "qux",
- .data = &data,
- .maxlen = sizeof(u8),
- .mode = 0644,
- .proc_handler = proc_dou8vec_minmax,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_TWO_HUNDRED,
- },
- };
-
- KUNIT_EXPECT_NULL(test, register_sysctl("foo", table_foo));
- KUNIT_EXPECT_NULL(test, register_sysctl("foo", table_bar));
- KUNIT_EXPECT_NOT_NULL(test, register_sysctl("foo", table_qux));
-}
-
static struct kunit_case sysctl_test_cases[] = {
KUNIT_CASE(sysctl_test_api_dointvec_null_tbl_data),
KUNIT_CASE(sysctl_test_api_dointvec_table_maxlen_unset),
@@ -426,7 +378,6 @@ static struct kunit_case sysctl_test_cases[] = {
KUNIT_CASE(sysctl_test_dointvec_write_happy_single_negative),
KUNIT_CASE(sysctl_test_api_dointvec_write_single_less_int_min),
KUNIT_CASE(sysctl_test_api_dointvec_write_single_greater_int_max),
- KUNIT_CASE(sysctl_test_register_sysctl_sz_invalid_extra_value),
{}
};
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 3b7a7308e35b..9b4f0cff76ea 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -22,8 +22,6 @@
#include <linux/module.h>
#include <linux/sysctl.h>
#include <linux/bitmap.h>
-#include <linux/signal.h>
-#include <linux/panic.h>
#include <linux/printk.h>
#include <linux/proc_fs.h>
#include <linux/security.h>
@@ -46,7 +44,6 @@
#include <linux/nfs_fs.h>
#include <linux/acpi.h>
#include <linux/reboot.h>
-#include <linux/ftrace.h>
#include <linux/kmod.h>
#include <linux/capability.h>
#include <linux/binfmts.h>
@@ -61,12 +58,8 @@
#ifdef CONFIG_X86
#include <asm/nmi.h>
-#include <asm/stacktrace.h>
#include <asm/io.h>
#endif
-#ifdef CONFIG_SPARC
-#include <asm/setup.h>
-#endif
#ifdef CONFIG_RT_MUTEXES
#include <linux/rtmutex.h>
#endif
@@ -1588,13 +1581,6 @@ int proc_do_static_key(const struct ctl_table *table, int write,
}
static const struct ctl_table kern_table[] = {
- {
- .procname = "panic",
- .data = &panic_timeout,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
#ifdef CONFIG_PROC_SYSCTL
{
.procname = "tainted",
@@ -1612,45 +1598,6 @@ static const struct ctl_table kern_table[] = {
.extra2 = SYSCTL_ONE,
},
#endif
- {
- .procname = "print-fatal-signals",
- .data = &print_fatal_signals,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
-#ifdef CONFIG_SPARC
- {
- .procname = "reboot-cmd",
- .data = reboot_command,
- .maxlen = 256,
- .mode = 0644,
- .proc_handler = proc_dostring,
- },
- {
- .procname = "stop-a",
- .data = &stop_a_enabled,
- .maxlen = sizeof (int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "scons-poweroff",
- .data = &scons_pwroff,
- .maxlen = sizeof (int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
-#endif
-#ifdef CONFIG_SPARC64
- {
- .procname = "tsb-ratio",
- .data = &sysctl_tsb_ratio,
- .maxlen = sizeof (int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
-#endif
#ifdef CONFIG_PARISC
{
.procname = "soft-power",
@@ -1669,38 +1616,6 @@ static const struct ctl_table kern_table[] = {
.proc_handler = proc_dointvec,
},
#endif
-#ifdef CONFIG_STACK_TRACER
- {
- .procname = "stack_tracer_enabled",
- .data = &stack_tracer_enabled,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = stack_trace_sysctl,
- },
-#endif
-#ifdef CONFIG_TRACING
- {
- .procname = "ftrace_dump_on_oops",
- .data = &ftrace_dump_on_oops,
- .maxlen = MAX_TRACER_SIZE,
- .mode = 0644,
- .proc_handler = proc_dostring,
- },
- {
- .procname = "traceoff_on_warning",
- .data = &__disable_trace_on_warning,
- .maxlen = sizeof(__disable_trace_on_warning),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "tracepoint_printk",
- .data = &tracepoint_printk,
- .maxlen = sizeof(tracepoint_printk),
- .mode = 0644,
- .proc_handler = tracepoint_printk_sysctl,
- },
-#endif
#ifdef CONFIG_MODULES
{
.procname = "modprobe",
@@ -1773,20 +1688,6 @@ static const struct ctl_table kern_table[] = {
.extra2 = SYSCTL_MAXOLDUID,
},
{
- .procname = "panic_on_oops",
- .data = &panic_on_oops,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "panic_print",
- .data = &panic_print,
- .maxlen = sizeof(unsigned long),
- .mode = 0644,
- .proc_handler = proc_doulongvec_minmax,
- },
- {
.procname = "ngroups_max",
.data = (void *)&ngroups_max,
.maxlen = sizeof (int),
@@ -1837,15 +1738,6 @@ static const struct ctl_table kern_table[] = {
.proc_handler = proc_dointvec,
},
#endif
- {
- .procname = "panic_on_warn",
- .data = &panic_on_warn,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
- },
#ifdef CONFIG_TREE_RCU
{
.procname = "panic_on_rcu_stall",
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 0ddccdff119a..577f0e6842d4 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -70,12 +70,10 @@ static DEFINE_SPINLOCK(rtcdev_lock);
*/
struct rtc_device *alarmtimer_get_rtcdev(void)
{
- unsigned long flags;
struct rtc_device *ret;
- spin_lock_irqsave(&rtcdev_lock, flags);
+ guard(spinlock_irqsave)(&rtcdev_lock);
ret = rtcdev;
- spin_unlock_irqrestore(&rtcdev_lock, flags);
return ret;
}
@@ -83,7 +81,6 @@ EXPORT_SYMBOL_GPL(alarmtimer_get_rtcdev);
static int alarmtimer_rtc_add_device(struct device *dev)
{
- unsigned long flags;
struct rtc_device *rtc = to_rtc_device(dev);
struct platform_device *pdev;
int ret = 0;
@@ -101,25 +98,18 @@ static int alarmtimer_rtc_add_device(struct device *dev)
if (!IS_ERR(pdev))
device_init_wakeup(&pdev->dev, true);
- spin_lock_irqsave(&rtcdev_lock, flags);
- if (!IS_ERR(pdev) && !rtcdev) {
- if (!try_module_get(rtc->owner)) {
+ scoped_guard(spinlock_irqsave, &rtcdev_lock) {
+ if (!IS_ERR(pdev) && !rtcdev && try_module_get(rtc->owner)) {
+ rtcdev = rtc;
+ /* hold a reference so it doesn't go away */
+ get_device(dev);
+ pdev = NULL;
+ } else {
ret = -1;
- goto unlock;
}
-
- rtcdev = rtc;
- /* hold a reference so it doesn't go away */
- get_device(dev);
- pdev = NULL;
- } else {
- ret = -1;
}
-unlock:
- spin_unlock_irqrestore(&rtcdev_lock, flags);
platform_device_unregister(pdev);
-
return ret;
}
@@ -198,7 +188,7 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer)
struct alarm *alarm = container_of(timer, struct alarm, timer);
struct alarm_base *base = &alarm_bases[alarm->type];
- scoped_guard (spinlock_irqsave, &base->lock)
+ scoped_guard(spinlock_irqsave, &base->lock)
alarmtimer_dequeue(base, alarm);
if (alarm->function)
@@ -228,17 +218,16 @@ EXPORT_SYMBOL_GPL(alarm_expires_remaining);
static int alarmtimer_suspend(struct device *dev)
{
ktime_t min, now, expires;
- int i, ret, type;
struct rtc_device *rtc;
- unsigned long flags;
struct rtc_time tm;
+ int i, ret, type;
- spin_lock_irqsave(&freezer_delta_lock, flags);
- min = freezer_delta;
- expires = freezer_expires;
- type = freezer_alarmtype;
- freezer_delta = 0;
- spin_unlock_irqrestore(&freezer_delta_lock, flags);
+ scoped_guard(spinlock_irqsave, &freezer_delta_lock) {
+ min = freezer_delta;
+ expires = freezer_expires;
+ type = freezer_alarmtype;
+ freezer_delta = 0;
+ }
rtc = alarmtimer_get_rtcdev();
/* If we have no rtcdev, just return */
@@ -251,9 +240,8 @@ static int alarmtimer_suspend(struct device *dev)
struct timerqueue_node *next;
ktime_t delta;
- spin_lock_irqsave(&base->lock, flags);
- next = timerqueue_getnext(&base->timerqueue);
- spin_unlock_irqrestore(&base->lock, flags);
+ scoped_guard(spinlock_irqsave, &base->lock)
+ next = timerqueue_getnext(&base->timerqueue);
if (!next)
continue;
delta = ktime_sub(next->expires, base->get_ktime());
@@ -352,13 +340,12 @@ EXPORT_SYMBOL_GPL(alarm_init);
void alarm_start(struct alarm *alarm, ktime_t start)
{
struct alarm_base *base = &alarm_bases[alarm->type];
- unsigned long flags;
- spin_lock_irqsave(&base->lock, flags);
- alarm->node.expires = start;
- alarmtimer_enqueue(base, alarm);
- hrtimer_start(&alarm->timer, alarm->node.expires, HRTIMER_MODE_ABS);
- spin_unlock_irqrestore(&base->lock, flags);
+ scoped_guard(spinlock_irqsave, &base->lock) {
+ alarm->node.expires = start;
+ alarmtimer_enqueue(base, alarm);
+ hrtimer_start(&alarm->timer, alarm->node.expires, HRTIMER_MODE_ABS);
+ }
trace_alarmtimer_start(alarm, base->get_ktime());
}
@@ -381,13 +368,11 @@ EXPORT_SYMBOL_GPL(alarm_start_relative);
void alarm_restart(struct alarm *alarm)
{
struct alarm_base *base = &alarm_bases[alarm->type];
- unsigned long flags;
- spin_lock_irqsave(&base->lock, flags);
+ guard(spinlock_irqsave)(&base->lock);
hrtimer_set_expires(&alarm->timer, alarm->node.expires);
hrtimer_restart(&alarm->timer);
alarmtimer_enqueue(base, alarm);
- spin_unlock_irqrestore(&base->lock, flags);
}
EXPORT_SYMBOL_GPL(alarm_restart);
@@ -401,14 +386,13 @@ EXPORT_SYMBOL_GPL(alarm_restart);
int alarm_try_to_cancel(struct alarm *alarm)
{
struct alarm_base *base = &alarm_bases[alarm->type];
- unsigned long flags;
int ret;
- spin_lock_irqsave(&base->lock, flags);
- ret = hrtimer_try_to_cancel(&alarm->timer);
- if (ret >= 0)
- alarmtimer_dequeue(base, alarm);
- spin_unlock_irqrestore(&base->lock, flags);
+ scoped_guard(spinlock_irqsave, &base->lock) {
+ ret = hrtimer_try_to_cancel(&alarm->timer);
+ if (ret >= 0)
+ alarmtimer_dequeue(base, alarm);
+ }
trace_alarmtimer_cancel(alarm, base->get_ktime());
return ret;
@@ -479,7 +463,6 @@ EXPORT_SYMBOL_GPL(alarm_forward_now);
static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type)
{
struct alarm_base *base;
- unsigned long flags;
ktime_t delta;
switch(type) {
@@ -498,13 +481,12 @@ static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type)
delta = ktime_sub(absexp, base->get_ktime());
- spin_lock_irqsave(&freezer_delta_lock, flags);
+ guard(spinlock_irqsave)(&freezer_delta_lock);
if (!freezer_delta || (delta < freezer_delta)) {
freezer_delta = delta;
freezer_expires = absexp;
freezer_alarmtype = type;
}
- spin_unlock_irqrestore(&freezer_delta_lock, flags);
}
/**
@@ -515,9 +497,9 @@ static enum alarmtimer_type clock2alarm(clockid_t clockid)
{
if (clockid == CLOCK_REALTIME_ALARM)
return ALARM_REALTIME;
- if (clockid == CLOCK_BOOTTIME_ALARM)
- return ALARM_BOOTTIME;
- return -1;
+
+ WARN_ON_ONCE(clockid != CLOCK_BOOTTIME_ALARM);
+ return ALARM_BOOTTIME;
}
/**
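
The conversion above relies on the lock guards from <linux/cleanup.h>. A minimal standalone sketch of the two forms used here (illustrative names only, not part of the patch):

#include <linux/cleanup.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(example_lock);
static int example_count;

static int example_read(void)
{
	/* lock is dropped automatically when the enclosing scope ends */
	guard(spinlock_irqsave)(&example_lock);
	return example_count;
}

static void example_update(int v)
{
	/* lock is held only for the duration of the guarded statement */
	scoped_guard(spinlock_irqsave, &example_lock)
		example_count = v;
}
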
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index bb48498ebb5a..6a8bc7da9062 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -310,7 +310,7 @@ static void clocksource_verify_choose_cpus(void)
{
int cpu, i, n = verify_n_cpus;
- if (n < 0) {
+ if (n < 0 || n >= num_online_cpus()) {
/* Check all of the CPUs. */
cpumask_copy(&cpus_chosen, cpu_online_mask);
cpumask_clear_cpu(smp_processor_id(), &cpus_chosen);
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index bc4db9e5ab70..34eeacac2253 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -75,13 +75,11 @@ struct clocksource * __init __weak clocksource_default_clock(void)
static struct clocksource refined_jiffies;
-int register_refined_jiffies(long cycles_per_second)
+void __init register_refined_jiffies(long cycles_per_second)
{
u64 nsec_per_tick, shift_hz;
long cycles_per_tick;
-
-
refined_jiffies = clocksource_jiffies;
refined_jiffies.name = "refined-jiffies";
refined_jiffies.rating++;
@@ -100,5 +98,4 @@ int register_refined_jiffies(long cycles_per_second)
refined_jiffies.mult = ((u32)nsec_per_tick) << JIFFIES_SHIFT;
__clocksource_register(&refined_jiffies);
- return 0;
}
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 6222112533a7..2053b1a4c9e4 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -30,8 +30,6 @@
#include "timekeeping.h"
#include "posix-timers.h"
-static struct kmem_cache *posix_timers_cache;
-
/*
* Timers are managed in a hash table for lockless lookup. The hash key is
* constructed from current::signal and the timer ID and the timer is
@@ -49,10 +47,12 @@ struct timer_hash_bucket {
static struct {
struct timer_hash_bucket *buckets;
unsigned long mask;
-} __timer_data __ro_after_init __aligned(2*sizeof(long));
+ struct kmem_cache *cache;
+} __timer_data __ro_after_init __aligned(4*sizeof(long));
-#define timer_buckets (__timer_data.buckets)
-#define timer_hashmask (__timer_data.mask)
+#define timer_buckets (__timer_data.buckets)
+#define timer_hashmask (__timer_data.mask)
+#define posix_timers_cache (__timer_data.cache)
static const struct k_clock * const posix_clocks[];
static const struct k_clock *clockid_to_kclock(const clockid_t id);
@@ -283,14 +283,6 @@ static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec64 *tp)
return 0;
}
-static __init int init_posix_timers(void)
-{
- posix_timers_cache = kmem_cache_create("posix_timers_cache", sizeof(struct k_itimer),
- __alignof__(struct k_itimer), SLAB_ACCOUNT, NULL);
- return 0;
-}
-__initcall(init_posix_timers);
-
/*
* The siginfo si_overrun field and the return value of timer_getoverrun(2)
* are of type int. Clamp the overrun value to INT_MAX
@@ -1556,6 +1548,11 @@ static int __init posixtimer_init(void)
unsigned long i, size;
unsigned int shift;
+ posix_timers_cache = kmem_cache_create("posix_timers_cache",
+ sizeof(struct k_itimer),
+ __alignof__(struct k_itimer),
+ SLAB_ACCOUNT, NULL);
+
if (IS_ENABLED(CONFIG_BASE_SMALL))
size = 512;
else
diff --git a/kernel/time/sleep_timeout.c b/kernel/time/sleep_timeout.c
index c0e960a5de39..5aa38b2cf40a 100644
--- a/kernel/time/sleep_timeout.c
+++ b/kernel/time/sleep_timeout.c
@@ -100,7 +100,7 @@ signed long __sched schedule_timeout(signed long timeout)
timer_delete_sync(&timer.timer);
/* Remove the timer from the object tracker */
- destroy_timer_on_stack(&timer.timer);
+ timer_destroy_on_stack(&timer.timer);
timeout = expire - jiffies;
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 4d915c0a263c..553fa469d7cc 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -386,32 +386,6 @@ static unsigned long round_jiffies_common(unsigned long j, int cpu,
}
/**
- * __round_jiffies - function to round jiffies to a full second
- * @j: the time in (absolute) jiffies that should be rounded
- * @cpu: the processor number on which the timeout will happen
- *
- * __round_jiffies() rounds an absolute time in the future (in jiffies)
- * up or down to (approximately) full seconds. This is useful for timers
- * for which the exact time they fire does not matter too much, as long as
- * they fire approximately every X seconds.
- *
- * By rounding these timers to whole seconds, all such timers will fire
- * at the same time, rather than at various times spread out. The goal
- * of this is to have the CPU wake up less, which saves power.
- *
- * The exact rounding is skewed for each processor to avoid all
- * processors firing at the exact same time, which could lead
- * to lock contention or spurious cache line bouncing.
- *
- * The return value is the rounded version of the @j parameter.
- */
-unsigned long __round_jiffies(unsigned long j, int cpu)
-{
- return round_jiffies_common(j, cpu, false);
-}
-EXPORT_SYMBOL_GPL(__round_jiffies);
-
-/**
* __round_jiffies_relative - function to round jiffies to a full second
* @j: the time in (relative) jiffies that should be rounded
* @cpu: the processor number on which the timeout will happen
@@ -483,22 +457,6 @@ unsigned long round_jiffies_relative(unsigned long j)
EXPORT_SYMBOL_GPL(round_jiffies_relative);
/**
- * __round_jiffies_up - function to round jiffies up to a full second
- * @j: the time in (absolute) jiffies that should be rounded
- * @cpu: the processor number on which the timeout will happen
- *
- * This is the same as __round_jiffies() except that it will never
- * round down. This is useful for timeouts for which the exact time
- * of firing does not matter too much, as long as they don't fire too
- * early.
- */
-unsigned long __round_jiffies_up(unsigned long j, int cpu)
-{
- return round_jiffies_common(j, cpu, true);
-}
-EXPORT_SYMBOL_GPL(__round_jiffies_up);
-
-/**
* __round_jiffies_up_relative - function to round jiffies up to a full second
* @j: the time in (relative) jiffies that should be rounded
* @cpu: the processor number on which the timeout will happen
@@ -850,7 +808,7 @@ static void do_init_timer(struct timer_list *timer,
unsigned int flags,
const char *name, struct lock_class_key *key);
-void init_timer_on_stack_key(struct timer_list *timer,
+void timer_init_key_on_stack(struct timer_list *timer,
void (*func)(struct timer_list *),
unsigned int flags,
const char *name, struct lock_class_key *key)
@@ -858,13 +816,13 @@ void init_timer_on_stack_key(struct timer_list *timer,
debug_object_init_on_stack(timer, &timer_debug_descr);
do_init_timer(timer, func, flags, name, key);
}
-EXPORT_SYMBOL_GPL(init_timer_on_stack_key);
+EXPORT_SYMBOL_GPL(timer_init_key_on_stack);
-void destroy_timer_on_stack(struct timer_list *timer)
+void timer_destroy_on_stack(struct timer_list *timer)
{
debug_object_free(timer, &timer_debug_descr);
}
-EXPORT_SYMBOL_GPL(destroy_timer_on_stack);
+EXPORT_SYMBOL_GPL(timer_destroy_on_stack);
#else
static inline void debug_timer_init(struct timer_list *timer) { }
@@ -904,7 +862,7 @@ static void do_init_timer(struct timer_list *timer,
}
/**
- * init_timer_key - initialize a timer
+ * timer_init_key - initialize a timer
* @timer: the timer to be initialized
* @func: timer callback function
* @flags: timer flags
@@ -912,17 +870,17 @@ static void do_init_timer(struct timer_list *timer,
* @key: lockdep class key of the fake lock used for tracking timer
* sync lock dependencies
*
- * init_timer_key() must be done to a timer prior to calling *any* of the
+ * timer_init_key() must be done to a timer prior to calling *any* of the
* other timer functions.
*/
-void init_timer_key(struct timer_list *timer,
+void timer_init_key(struct timer_list *timer,
void (*func)(struct timer_list *), unsigned int flags,
const char *name, struct lock_class_key *key)
{
debug_init(timer);
do_init_timer(timer, func, flags, name, key);
}
-EXPORT_SYMBOL(init_timer_key);
+EXPORT_SYMBOL(timer_init_key);
static inline void detach_timer(struct timer_list *timer, bool clear_pending)
{
@@ -1511,7 +1469,7 @@ static int __try_to_del_timer_sync(struct timer_list *timer, bool shutdown)
}
/**
- * try_to_del_timer_sync - Try to deactivate a timer
+ * timer_delete_sync_try - Try to deactivate a timer
* @timer: Timer to deactivate
*
* This function tries to deactivate a timer. On success the timer is not
@@ -1526,11 +1484,11 @@ static int __try_to_del_timer_sync(struct timer_list *timer, bool shutdown)
* * %1 - The timer was pending and deactivated
* * %-1 - The timer callback function is running on a different CPU
*/
-int try_to_del_timer_sync(struct timer_list *timer)
+int timer_delete_sync_try(struct timer_list *timer)
{
return __try_to_del_timer_sync(timer, false);
}
-EXPORT_SYMBOL(try_to_del_timer_sync);
+EXPORT_SYMBOL(timer_delete_sync_try);
#ifdef CONFIG_PREEMPT_RT
static __init void timer_base_init_expiry_lock(struct timer_base *base)
@@ -1900,7 +1858,7 @@ static void timer_recalc_next_expiry(struct timer_base *base)
unsigned long clk, next, adj;
unsigned lvl, offset = 0;
- next = base->clk + NEXT_TIMER_MAX_DELTA;
+ next = base->clk + TIMER_NEXT_MAX_DELTA;
clk = base->clk;
for (lvl = 0; lvl < LVL_DEPTH; lvl++, offset += LVL_SIZE) {
int pos = next_pending_bucket(base, offset, clk & LVL_MASK);
@@ -1963,7 +1921,7 @@ static void timer_recalc_next_expiry(struct timer_base *base)
WRITE_ONCE(base->next_expiry, next);
base->next_expiry_recalc = false;
- base->timers_pending = !(next == base->clk + NEXT_TIMER_MAX_DELTA);
+ base->timers_pending = !(next == base->clk + TIMER_NEXT_MAX_DELTA);
}
#ifdef CONFIG_NO_HZ_COMMON
@@ -2015,7 +1973,7 @@ static unsigned long next_timer_interrupt(struct timer_base *base,
* easy comparable to find out which base holds the first pending timer.
*/
if (!base->timers_pending)
- WRITE_ONCE(base->next_expiry, basej + NEXT_TIMER_MAX_DELTA);
+ WRITE_ONCE(base->next_expiry, basej + TIMER_NEXT_MAX_DELTA);
return base->next_expiry;
}
@@ -2399,7 +2357,7 @@ static inline void __run_timers(struct timer_base *base)
* timer at this clk are that all matching timers have been
* dequeued or no timer has been queued since
* base::next_expiry was set to base::clk +
- * NEXT_TIMER_MAX_DELTA.
+ * TIMER_NEXT_MAX_DELTA.
*/
WARN_ON_ONCE(!levels && !base->next_expiry_recalc
&& base->timers_pending);
@@ -2544,7 +2502,7 @@ int timers_prepare_cpu(unsigned int cpu)
for (b = 0; b < NR_BASES; b++) {
base = per_cpu_ptr(&timer_bases[b], cpu);
base->clk = jiffies;
- base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA;
+ base->next_expiry = base->clk + TIMER_NEXT_MAX_DELTA;
base->next_expiry_recalc = false;
base->timers_pending = false;
base->is_idle = false;
@@ -2599,7 +2557,7 @@ static void __init init_timer_cpu(int cpu)
base->cpu = cpu;
raw_spin_lock_init(&base->lock);
base->clk = jiffies;
- base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA;
+ base->next_expiry = base->clk + TIMER_NEXT_MAX_DELTA;
timer_base_init_expiry_lock(base);
}
}
@@ -2612,7 +2570,7 @@ static void __init init_timer_cpus(void)
init_timer_cpu(cpu);
}
-void __init init_timers(void)
+void __init timers_init(void)
{
init_timer_cpus();
posix_cputimers_init_work();
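
The timer.c hunks above are pure renames (timer_init_key(), timer_delete_sync_try(), timer_destroy_on_stack(), timers_init(), TIMER_NEXT_MAX_DELTA); behaviour is unchanged. Below is a minimal caller-side sketch of the renamed API; it is not part of the patch, the my_* names are hypothetical, and it assumes callers still initialize timers through the timer_setup() wrapper and may sleep on the teardown path.

#include <linux/container_of.h>
#include <linux/jiffies.h>
#include <linux/timer.h>

struct my_ctx {
	struct timer_list timer;
};

static void my_timer_fn(struct timer_list *t)
{
	struct my_ctx *ctx = container_of(t, struct my_ctx, timer);

	/* ... periodic work on ctx ... */
	(void)ctx;
}

static void my_ctx_start(struct my_ctx *ctx)
{
	/* timer_setup() performs the timer_init_key() initialization that
	 * must precede any other timer function. */
	timer_setup(&ctx->timer, my_timer_fn, 0);
	mod_timer(&ctx->timer, jiffies + msecs_to_jiffies(100));
}

static void my_ctx_stop(struct my_ctx *ctx)
{
	/* timer_delete_sync_try() returns -1 while the callback is running
	 * on another CPU; here we simply fall back to timer_delete_sync(),
	 * which waits for the callback to finish. */
	if (timer_delete_sync_try(&ctx->timer) < 0)
		timer_delete_sync(&ctx->timer);
}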
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 187dc37d61d4..132c8be6f635 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -572,7 +572,7 @@ BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags)
return value;
}
-static const struct bpf_func_proto bpf_perf_event_read_proto = {
+const struct bpf_func_proto bpf_perf_event_read_proto = {
.func = bpf_perf_event_read,
.gpl_only = true,
.ret_type = RET_INTEGER,
@@ -882,7 +882,7 @@ BPF_CALL_1(bpf_send_signal, u32, sig)
return bpf_send_signal_common(sig, PIDTYPE_TGID, NULL, 0);
}
-static const struct bpf_func_proto bpf_send_signal_proto = {
+const struct bpf_func_proto bpf_send_signal_proto = {
.func = bpf_send_signal,
.gpl_only = false,
.ret_type = RET_INTEGER,
@@ -894,7 +894,7 @@ BPF_CALL_1(bpf_send_signal_thread, u32, sig)
return bpf_send_signal_common(sig, PIDTYPE_PID, NULL, 0);
}
-static const struct bpf_func_proto bpf_send_signal_thread_proto = {
+const struct bpf_func_proto bpf_send_signal_thread_proto = {
.func = bpf_send_signal_thread,
.gpl_only = false,
.ret_type = RET_INTEGER,
@@ -1185,7 +1185,7 @@ BPF_CALL_3(bpf_get_branch_snapshot, void *, buf, u32, size, u64, flags)
return entry_cnt * br_entry_size;
}
-static const struct bpf_func_proto bpf_get_branch_snapshot_proto = {
+const struct bpf_func_proto bpf_get_branch_snapshot_proto = {
.func = bpf_get_branch_snapshot,
.gpl_only = true,
.ret_type = RET_INTEGER,
@@ -1430,56 +1430,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
const struct bpf_func_proto *func_proto;
switch (func_id) {
- case BPF_FUNC_map_lookup_elem:
- return &bpf_map_lookup_elem_proto;
- case BPF_FUNC_map_update_elem:
- return &bpf_map_update_elem_proto;
- case BPF_FUNC_map_delete_elem:
- return &bpf_map_delete_elem_proto;
- case BPF_FUNC_map_push_elem:
- return &bpf_map_push_elem_proto;
- case BPF_FUNC_map_pop_elem:
- return &bpf_map_pop_elem_proto;
- case BPF_FUNC_map_peek_elem:
- return &bpf_map_peek_elem_proto;
- case BPF_FUNC_map_lookup_percpu_elem:
- return &bpf_map_lookup_percpu_elem_proto;
- case BPF_FUNC_ktime_get_ns:
- return &bpf_ktime_get_ns_proto;
- case BPF_FUNC_ktime_get_boot_ns:
- return &bpf_ktime_get_boot_ns_proto;
- case BPF_FUNC_tail_call:
- return &bpf_tail_call_proto;
- case BPF_FUNC_get_current_task:
- return &bpf_get_current_task_proto;
- case BPF_FUNC_get_current_task_btf:
- return &bpf_get_current_task_btf_proto;
- case BPF_FUNC_task_pt_regs:
- return &bpf_task_pt_regs_proto;
- case BPF_FUNC_get_current_uid_gid:
- return &bpf_get_current_uid_gid_proto;
- case BPF_FUNC_get_current_comm:
- return &bpf_get_current_comm_proto;
- case BPF_FUNC_trace_printk:
- return bpf_get_trace_printk_proto();
case BPF_FUNC_get_smp_processor_id:
return &bpf_get_smp_processor_id_proto;
- case BPF_FUNC_get_numa_node_id:
- return &bpf_get_numa_node_id_proto;
- case BPF_FUNC_perf_event_read:
- return &bpf_perf_event_read_proto;
- case BPF_FUNC_get_prandom_u32:
- return &bpf_get_prandom_u32_proto;
- case BPF_FUNC_probe_read_user:
- return &bpf_probe_read_user_proto;
- case BPF_FUNC_probe_read_kernel:
- return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
- NULL : &bpf_probe_read_kernel_proto;
- case BPF_FUNC_probe_read_user_str:
- return &bpf_probe_read_user_str_proto;
- case BPF_FUNC_probe_read_kernel_str:
- return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
- NULL : &bpf_probe_read_kernel_str_proto;
#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
case BPF_FUNC_probe_read:
return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
@@ -1488,65 +1440,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
NULL : &bpf_probe_read_compat_str_proto;
#endif
-#ifdef CONFIG_CGROUPS
- case BPF_FUNC_cgrp_storage_get:
- return &bpf_cgrp_storage_get_proto;
- case BPF_FUNC_cgrp_storage_delete:
- return &bpf_cgrp_storage_delete_proto;
- case BPF_FUNC_current_task_under_cgroup:
- return &bpf_current_task_under_cgroup_proto;
-#endif
- case BPF_FUNC_send_signal:
- return &bpf_send_signal_proto;
- case BPF_FUNC_send_signal_thread:
- return &bpf_send_signal_thread_proto;
- case BPF_FUNC_perf_event_read_value:
- return &bpf_perf_event_read_value_proto;
- case BPF_FUNC_ringbuf_output:
- return &bpf_ringbuf_output_proto;
- case BPF_FUNC_ringbuf_reserve:
- return &bpf_ringbuf_reserve_proto;
- case BPF_FUNC_ringbuf_submit:
- return &bpf_ringbuf_submit_proto;
- case BPF_FUNC_ringbuf_discard:
- return &bpf_ringbuf_discard_proto;
- case BPF_FUNC_ringbuf_query:
- return &bpf_ringbuf_query_proto;
- case BPF_FUNC_jiffies64:
- return &bpf_jiffies64_proto;
- case BPF_FUNC_get_task_stack:
- return prog->sleepable ? &bpf_get_task_stack_sleepable_proto
- : &bpf_get_task_stack_proto;
- case BPF_FUNC_copy_from_user:
- return &bpf_copy_from_user_proto;
- case BPF_FUNC_copy_from_user_task:
- return &bpf_copy_from_user_task_proto;
- case BPF_FUNC_snprintf_btf:
- return &bpf_snprintf_btf_proto;
- case BPF_FUNC_per_cpu_ptr:
- return &bpf_per_cpu_ptr_proto;
- case BPF_FUNC_this_cpu_ptr:
- return &bpf_this_cpu_ptr_proto;
- case BPF_FUNC_task_storage_get:
- if (bpf_prog_check_recur(prog))
- return &bpf_task_storage_get_recur_proto;
- return &bpf_task_storage_get_proto;
- case BPF_FUNC_task_storage_delete:
- if (bpf_prog_check_recur(prog))
- return &bpf_task_storage_delete_recur_proto;
- return &bpf_task_storage_delete_proto;
- case BPF_FUNC_for_each_map_elem:
- return &bpf_for_each_map_elem_proto;
- case BPF_FUNC_snprintf:
- return &bpf_snprintf_proto;
case BPF_FUNC_get_func_ip:
return &bpf_get_func_ip_proto_tracing;
- case BPF_FUNC_get_branch_snapshot:
- return &bpf_get_branch_snapshot_proto;
- case BPF_FUNC_find_vma:
- return &bpf_find_vma_proto;
- case BPF_FUNC_trace_vprintk:
- return bpf_get_trace_vprintk_proto();
default:
break;
}
@@ -1858,7 +1753,7 @@ static struct pt_regs *get_bpf_raw_tp_regs(void)
struct bpf_raw_tp_regs *tp_regs = this_cpu_ptr(&bpf_raw_tp_regs);
int nest_level = this_cpu_inc_return(bpf_raw_tp_nest_level);
- if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(tp_regs->regs))) {
+ if (nest_level > ARRAY_SIZE(tp_regs->regs)) {
this_cpu_dec(bpf_raw_tp_nest_level);
return ERR_PTR(-EBUSY);
}
@@ -2987,6 +2882,9 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
if (sizeof(u64) != sizeof(void *))
return -EOPNOTSUPP;
+ if (attr->link_create.flags)
+ return -EINVAL;
+
if (!is_kprobe_multi(prog))
return -EINVAL;
@@ -3376,6 +3274,9 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
if (sizeof(u64) != sizeof(void *))
return -EOPNOTSUPP;
+ if (attr->link_create.flags)
+ return -EINVAL;
+
if (!is_uprobe_multi(prog))
return -EINVAL;
@@ -3417,7 +3318,9 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
}
if (pid) {
+ rcu_read_lock();
task = get_pid_task(find_vpid(pid), PIDTYPE_TGID);
+ rcu_read_unlock();
if (!task) {
err = -ESRCH;
goto error_path_put;
@@ -3565,6 +3468,146 @@ static int __init bpf_kprobe_multi_kfuncs_init(void)
late_initcall(bpf_kprobe_multi_kfuncs_init);
+typedef int (*copy_fn_t)(void *dst, const void *src, u32 size, struct task_struct *tsk);
+
+/*
+ * The __always_inline is to make sure the compiler doesn't
+ * generate indirect calls into callbacks, which is expensive
+ * on some kernel configurations. This allows the compiler to put
+ * direct calls into all the specific callback implementations
+ * (copy_user_data_sleepable, copy_user_data_nofault, and so on).
+ */
+static __always_inline int __bpf_dynptr_copy_str(struct bpf_dynptr *dptr, u32 doff, u32 size,
+ const void *unsafe_src,
+ copy_fn_t str_copy_fn,
+ struct task_struct *tsk)
+{
+ struct bpf_dynptr_kern *dst;
+ u32 chunk_sz, off;
+ void *dst_slice;
+ int cnt, err;
+ char buf[256];
+
+ dst_slice = bpf_dynptr_slice_rdwr(dptr, doff, NULL, size);
+ if (likely(dst_slice))
+ return str_copy_fn(dst_slice, unsafe_src, size, tsk);
+
+ dst = (struct bpf_dynptr_kern *)dptr;
+ if (bpf_dynptr_check_off_len(dst, doff, size))
+ return -E2BIG;
+
+ for (off = 0; off < size; off += chunk_sz - 1) {
+ chunk_sz = min_t(u32, sizeof(buf), size - off);
+		/* Expect str_copy_fn to return the count of copied bytes,
+		 * including the zero terminator. The next iteration then
+		 * increments off by chunk_sz - 1 to overwrite that NUL.
+		 */
+ cnt = str_copy_fn(buf, unsafe_src + off, chunk_sz, tsk);
+ if (cnt < 0)
+ return cnt;
+ err = __bpf_dynptr_write(dst, doff + off, buf, cnt, 0);
+ if (err)
+ return err;
+ if (cnt < chunk_sz || chunk_sz == 1) /* we are done */
+ return off + cnt;
+ }
+ return off;
+}
+
+static __always_inline int __bpf_dynptr_copy(const struct bpf_dynptr *dptr, u32 doff,
+ u32 size, const void *unsafe_src,
+ copy_fn_t copy_fn, struct task_struct *tsk)
+{
+ struct bpf_dynptr_kern *dst;
+ void *dst_slice;
+ char buf[256];
+ u32 off, chunk_sz;
+ int err;
+
+ dst_slice = bpf_dynptr_slice_rdwr(dptr, doff, NULL, size);
+ if (likely(dst_slice))
+ return copy_fn(dst_slice, unsafe_src, size, tsk);
+
+ dst = (struct bpf_dynptr_kern *)dptr;
+ if (bpf_dynptr_check_off_len(dst, doff, size))
+ return -E2BIG;
+
+ for (off = 0; off < size; off += chunk_sz) {
+ chunk_sz = min_t(u32, sizeof(buf), size - off);
+ err = copy_fn(buf, unsafe_src + off, chunk_sz, tsk);
+ if (err)
+ return err;
+ err = __bpf_dynptr_write(dst, doff + off, buf, chunk_sz, 0);
+ if (err)
+ return err;
+ }
+ return 0;
+}
+
+static __always_inline int copy_user_data_nofault(void *dst, const void *unsafe_src,
+ u32 size, struct task_struct *tsk)
+{
+ return copy_from_user_nofault(dst, (const void __user *)unsafe_src, size);
+}
+
+static __always_inline int copy_user_data_sleepable(void *dst, const void *unsafe_src,
+ u32 size, struct task_struct *tsk)
+{
+ int ret;
+
+ if (!tsk) { /* Read from the current task */
+ ret = copy_from_user(dst, (const void __user *)unsafe_src, size);
+ if (ret)
+ return -EFAULT;
+ return 0;
+ }
+
+ ret = access_process_vm(tsk, (unsigned long)unsafe_src, dst, size, 0);
+ if (ret != size)
+ return -EFAULT;
+ return 0;
+}
+
+static __always_inline int copy_kernel_data_nofault(void *dst, const void *unsafe_src,
+ u32 size, struct task_struct *tsk)
+{
+ return copy_from_kernel_nofault(dst, unsafe_src, size);
+}
+
+static __always_inline int copy_user_str_nofault(void *dst, const void *unsafe_src,
+ u32 size, struct task_struct *tsk)
+{
+ return strncpy_from_user_nofault(dst, (const void __user *)unsafe_src, size);
+}
+
+static __always_inline int copy_user_str_sleepable(void *dst, const void *unsafe_src,
+ u32 size, struct task_struct *tsk)
+{
+ int ret;
+
+ if (unlikely(size == 0))
+ return 0;
+
+ if (tsk) {
+ ret = copy_remote_vm_str(tsk, (unsigned long)unsafe_src, dst, size, 0);
+ } else {
+ ret = strncpy_from_user(dst, (const void __user *)unsafe_src, size - 1);
+ /* strncpy_from_user does not guarantee NUL termination */
+ if (ret >= 0)
+ ((char *)dst)[ret] = '\0';
+ }
+
+ if (ret < 0)
+ return ret;
+ return ret + 1;
+}
+
+static __always_inline int copy_kernel_str_nofault(void *dst, const void *unsafe_src,
+ u32 size, struct task_struct *tsk)
+{
+ return strncpy_from_kernel_nofault(dst, unsafe_src, size);
+}
+
__bpf_kfunc_start_defs();
__bpf_kfunc int bpf_send_signal_task(struct task_struct *task, int sig, enum pid_type type,
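
The chunked copy above is the subtle part of __bpf_dynptr_copy_str(): when the destination dynptr cannot be sliced, data is staged through a 256-byte stack buffer, and each full chunk's trailing NUL is overwritten by the first byte of the next chunk because the offset only advances by chunk_sz - 1. The standalone userspace model below (not kernel code; model_str_copy() is a hypothetical stand-in for the strncpy_from_*_nofault()-style callbacks) walks through exactly that arithmetic.

#include <stdio.h>
#include <string.h>

/* Stand-in for a strncpy_from_*()-style callback: copies at most "size"
 * bytes including the trailing NUL and returns the number of bytes
 * copied, NUL included. */
static int model_str_copy(char *dst, const char *src, unsigned int size)
{
	unsigned int n = strnlen(src, size - 1);

	memcpy(dst, src, n);
	dst[n] = '\0';
	return n + 1;
}

int main(void)
{
	const char *src = "0123456789";	/* 10 characters + NUL */
	char dst[32] = { 0 };		/* stand-in for the dynptr memory */
	char buf[4];			/* stand-in for the 256-byte stack buffer */
	unsigned int off, chunk_sz, size = sizeof(dst);
	int cnt;

	for (off = 0; off < size; off += chunk_sz - 1) {
		chunk_sz = sizeof(buf) < size - off ? sizeof(buf) : size - off;
		cnt = model_str_copy(buf, src + off, chunk_sz);
		memcpy(dst + off, buf, cnt);	/* __bpf_dynptr_write() stand-in */
		if (cnt < (int)chunk_sz || chunk_sz == 1)
			break;			/* string ended inside this chunk */
	}
	printf("%s\n", dst);			/* prints 0123456789 */
	return 0;
}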
@@ -3576,4 +3619,62 @@ __bpf_kfunc int bpf_send_signal_task(struct task_struct *task, int sig, enum pid
return bpf_send_signal_common(sig, type, task, value);
}
+__bpf_kfunc int bpf_probe_read_user_dynptr(struct bpf_dynptr *dptr, u32 off,
+ u32 size, const void __user *unsafe_ptr__ign)
+{
+ return __bpf_dynptr_copy(dptr, off, size, (const void *)unsafe_ptr__ign,
+ copy_user_data_nofault, NULL);
+}
+
+__bpf_kfunc int bpf_probe_read_kernel_dynptr(struct bpf_dynptr *dptr, u32 off,
+ u32 size, const void *unsafe_ptr__ign)
+{
+ return __bpf_dynptr_copy(dptr, off, size, unsafe_ptr__ign,
+ copy_kernel_data_nofault, NULL);
+}
+
+__bpf_kfunc int bpf_probe_read_user_str_dynptr(struct bpf_dynptr *dptr, u32 off,
+ u32 size, const void __user *unsafe_ptr__ign)
+{
+ return __bpf_dynptr_copy_str(dptr, off, size, (const void *)unsafe_ptr__ign,
+ copy_user_str_nofault, NULL);
+}
+
+__bpf_kfunc int bpf_probe_read_kernel_str_dynptr(struct bpf_dynptr *dptr, u32 off,
+ u32 size, const void *unsafe_ptr__ign)
+{
+ return __bpf_dynptr_copy_str(dptr, off, size, unsafe_ptr__ign,
+ copy_kernel_str_nofault, NULL);
+}
+
+__bpf_kfunc int bpf_copy_from_user_dynptr(struct bpf_dynptr *dptr, u32 off,
+ u32 size, const void __user *unsafe_ptr__ign)
+{
+ return __bpf_dynptr_copy(dptr, off, size, (const void *)unsafe_ptr__ign,
+ copy_user_data_sleepable, NULL);
+}
+
+__bpf_kfunc int bpf_copy_from_user_str_dynptr(struct bpf_dynptr *dptr, u32 off,
+ u32 size, const void __user *unsafe_ptr__ign)
+{
+ return __bpf_dynptr_copy_str(dptr, off, size, (const void *)unsafe_ptr__ign,
+ copy_user_str_sleepable, NULL);
+}
+
+__bpf_kfunc int bpf_copy_from_user_task_dynptr(struct bpf_dynptr *dptr, u32 off,
+ u32 size, const void __user *unsafe_ptr__ign,
+ struct task_struct *tsk)
+{
+ return __bpf_dynptr_copy(dptr, off, size, (const void *)unsafe_ptr__ign,
+ copy_user_data_sleepable, tsk);
+}
+
+__bpf_kfunc int bpf_copy_from_user_task_str_dynptr(struct bpf_dynptr *dptr, u32 off,
+ u32 size, const void __user *unsafe_ptr__ign,
+ struct task_struct *tsk)
+{
+ return __bpf_dynptr_copy_str(dptr, off, size, (const void *)unsafe_ptr__ign,
+ copy_user_str_sleepable, tsk);
+}
+
__bpf_kfunc_end_defs();
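
A sketch of how a BPF program might consume the new dynptr kfuncs added above; it is not part of the patch, the map/program names are hypothetical, the kfunc is declared by hand from the signature in the hunk, and the usual libbpf conventions (vmlinux.h, __ksym, SEC()) are assumed.

#include <vmlinux.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

/* New kfunc from the hunk above, declared by hand for this sketch. */
extern int bpf_probe_read_user_str_dynptr(struct bpf_dynptr *dptr, u32 off,
					   u32 size, const void *unsafe_ptr__ign) __ksym;

struct {
	__uint(type, BPF_MAP_TYPE_RINGBUF);
	__uint(max_entries, 4096);
} rb SEC(".maps");

char LICENSE[] SEC("license") = "GPL";

SEC("uprobe")
int BPF_UPROBE(copy_first_arg, const char *user_str)
{
	struct bpf_dynptr dptr;

	/* Reserve ringbuf space as a dynptr ... */
	if (bpf_ringbuf_reserve_dynptr(&rb, 256, 0, &dptr)) {
		bpf_ringbuf_discard_dynptr(&dptr, 0);
		return 0;
	}

	/* ... and copy the user string straight into it, with no
	 * intermediate map value or BPF stack buffer. */
	bpf_probe_read_user_str_dynptr(&dptr, 0, 256, user_str);

	bpf_ringbuf_submit_dynptr(&dptr, 0);
	return 0;
}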
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 5b8db27fb6ef..d3459e715fbc 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -120,6 +120,7 @@ static int tracing_disabled = 1;
cpumask_var_t __read_mostly tracing_buffer_mask;
+#define MAX_TRACER_SIZE 100
/*
* ftrace_dump_on_oops - variable to dump ftrace buffer on oops
*
@@ -142,7 +143,40 @@ cpumask_var_t __read_mostly tracing_buffer_mask;
char ftrace_dump_on_oops[MAX_TRACER_SIZE] = "0";
/* When set, tracing will stop when a WARN*() is hit */
-int __disable_trace_on_warning;
+static int __disable_trace_on_warning;
+
+int tracepoint_printk_sysctl(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos);
+static const struct ctl_table trace_sysctl_table[] = {
+ {
+ .procname = "ftrace_dump_on_oops",
+ .data = &ftrace_dump_on_oops,
+ .maxlen = MAX_TRACER_SIZE,
+ .mode = 0644,
+ .proc_handler = proc_dostring,
+ },
+ {
+ .procname = "traceoff_on_warning",
+ .data = &__disable_trace_on_warning,
+ .maxlen = sizeof(__disable_trace_on_warning),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "tracepoint_printk",
+ .data = &tracepoint_printk,
+ .maxlen = sizeof(tracepoint_printk),
+ .mode = 0644,
+ .proc_handler = tracepoint_printk_sysctl,
+ },
+};
+
+static int __init init_trace_sysctls(void)
+{
+ register_sysctl_init("kernel", trace_sysctl_table);
+ return 0;
+}
+subsys_initcall(init_trace_sysctls);
#ifdef CONFIG_TRACE_EVAL_MAP_FILE
/* Map of enums to their values, for "eval_map" file */
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 14c6f272c4d8..e34223c8065d 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -32,7 +32,7 @@ static arch_spinlock_t stack_trace_max_lock =
DEFINE_PER_CPU(int, disable_stack_tracer);
static DEFINE_MUTEX(stack_sysctl_mutex);
-int stack_tracer_enabled;
+static int stack_tracer_enabled;
static void print_max_stack(void)
{
@@ -578,3 +578,23 @@ static __init int stack_trace_init(void)
}
device_initcall(stack_trace_init);
+
+
+static const struct ctl_table trace_stack_sysctl_table[] = {
+ {
+ .procname = "stack_tracer_enabled",
+ .data = &stack_tracer_enabled,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = stack_trace_sysctl,
+ },
+};
+
+static int __init init_trace_stack_sysctls(void)
+{
+ register_sysctl_init("kernel", trace_stack_sysctl_table);
+ return 0;
+}
+subsys_initcall(init_trace_stack_sysctls);
+
+
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 35cf76c75dd7..f95a2c3d5b1b 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -1489,7 +1489,7 @@ int bpf_get_uprobe_info(const struct perf_event *event, u32 *fd_type,
: BPF_FD_TYPE_UPROBE;
*filename = tu->filename;
*probe_offset = tu->offset;
- *probe_addr = 0;
+ *probe_addr = tu->ref_ctr_offset;
return 0;
}
#endif /* CONFIG_PERF_EVENTS */
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index cf6203282737..3bef0754cf73 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -686,7 +686,7 @@ EXPORT_SYMBOL_GPL(destroy_work_on_stack);
void destroy_delayed_work_on_stack(struct delayed_work *work)
{
- destroy_timer_on_stack(&work->timer);
+ timer_destroy_on_stack(&work->timer);
debug_object_free(&work->work, &work_debug_descr);
}
EXPORT_SYMBOL_GPL(destroy_delayed_work_on_stack);
@@ -3241,7 +3241,7 @@ __acquires(&pool->lock)
* point will only record its address.
*/
trace_workqueue_execute_end(work, worker->current_func);
- pwq->stats[PWQ_STAT_COMPLETED]++;
+
lock_map_release(&lockdep_map);
if (!bh_draining)
lock_map_release(pwq->wq->lockdep_map);
@@ -3272,6 +3272,8 @@ __acquires(&pool->lock)
raw_spin_lock_irq(&pool->lock);
+ pwq->stats[PWQ_STAT_COMPLETED]++;
+
/*
* In addition to %WQ_CPU_INTENSIVE, @worker may also have been marked
* CPU intensive by wq_worker_tick() if @work hogged CPU longer than
@@ -5837,6 +5839,17 @@ static bool pwq_busy(struct pool_workqueue *pwq)
* @wq: target workqueue
*
* Safely destroy a workqueue. All work currently pending will be done first.
+ *
+ * This function does NOT guarantee that non-pending work that has been
+ * submitted with queue_delayed_work() and similar functions will be done
+ * before destroying the workqueue. The fundamental problem is that, currently,
+ * the workqueue has no way of accessing non-pending delayed_work: a
+ * delayed_work is only linked from the timer side. All delayed_work must,
+ * therefore, be canceled before calling this function.
+ *
+ * TODO: It would be better if the problem described above did not exist and
+ * destroy_workqueue() could cleanly cancel all pending and non-pending
+ * delayed_work.
*/
void destroy_workqueue(struct workqueue_struct *wq)
{
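
A minimal caller-side sketch of the teardown order the new destroy_workqueue() comment asks for; it is not part of the patch and the my_* names are hypothetical.

#include <linux/workqueue.h>

static struct workqueue_struct *my_wq;
static struct delayed_work my_dwork;

static void my_teardown(void)
{
	/* destroy_workqueue() cannot reach a delayed_work whose timer has
	 * not fired yet, so cancel it (waiting for a running callback to
	 * finish) before tearing the workqueue down. */
	cancel_delayed_work_sync(&my_dwork);
	destroy_workqueue(my_wq);
}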