From 2cd008522707a59bf38c1f45d5c654eddbb86c20 Mon Sep 17 00:00:00 2001 From: Pu Lehui Date: Mon, 30 May 2022 17:28:10 +0800 Subject: bpf: Unify data extension operation of jited_ksyms and jited_linfo We found that 32-bit environment can not print BPF line info due to a data inconsistency between jited_ksyms[0] and jited_linfo[0]. For example: jited_kyms[0] = 0xb800067c, jited_linfo[0] = 0xffffffffb800067c We know that both of them store BPF func address, but due to the different data extension operations when extended to u64, they may not be the same. We need to unify the data extension operations of them. Signed-off-by: Pu Lehui Signed-off-by: Daniel Borkmann Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/CAEf4BzZ-eDcdJZgJ+Np7Y=V-TVjDDvOMqPwzKjyWrh=i5juv4w@mail.gmail.com Link: https://lore.kernel.org/bpf/20220530092815.1112406-2-pulehui@huawei.com --- kernel/bpf/syscall.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 2b69306d3c6e..aeb31137b2ed 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -4090,14 +4090,15 @@ static int bpf_prog_get_info_by_fd(struct file *file, info.nr_jited_line_info = 0; if (info.nr_jited_line_info && ulen) { if (bpf_dump_raw_ok(file->f_cred)) { + unsigned long line_addr; __u64 __user *user_linfo; u32 i; user_linfo = u64_to_user_ptr(info.jited_line_info); ulen = min_t(u32, info.nr_jited_line_info, ulen); for (i = 0; i < ulen; i++) { - if (put_user((__u64)(long)prog->aux->jited_linfo[i], - &user_linfo[i])) + line_addr = (unsigned long)prog->aux->jited_linfo[i]; + if (put_user((__u64)line_addr, &user_linfo[i])) return -EFAULT; } } else { -- cgit From cc1685546df87d9872e1ccef5bf56ac5262be0b1 Mon Sep 17 00:00:00 2001 From: Pu Lehui Date: Mon, 30 May 2022 17:28:12 +0800 Subject: bpf: Correct the comment about insn_to_jit_off The insn_to_jit_off passed to bpf_prog_fill_jited_linfo should be the first byte of the next instruction, or the byte off to the end of the current instruction. Signed-off-by: Pu Lehui Signed-off-by: Daniel Borkmann Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220530092815.1112406-4-pulehui@huawei.com --- kernel/bpf/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 5f6f3f829b36..e78cc5eea4a5 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -176,7 +176,7 @@ void bpf_prog_jit_attempt_done(struct bpf_prog *prog) * here is relative to the prog itself instead of the main prog. * This array has one entry for each xlated bpf insn. * - * jited_off is the byte off to the last byte of the jited insn. + * jited_off is the byte off to the end of the jited insn. * * Hence, with * insn_start: -- cgit From 4c46091ee985ae84c60c5e95055d779fcd291d87 Mon Sep 17 00:00:00 2001 From: Tadeusz Struk Date: Tue, 17 May 2022 11:04:20 -0700 Subject: bpf: Fix KASAN use-after-free Read in compute_effective_progs Syzbot found a Use After Free bug in compute_effective_progs(). The reproducer creates a number of BPF links, and causes a fault injected alloc to fail, while calling bpf_link_detach on them. Link detach triggers the link to be freed by bpf_link_free(), which calls __cgroup_bpf_detach() and update_effective_progs(). If the memory allocation in this function fails, the function restores the pointer to the bpf_cgroup_link on the cgroup list, but the memory gets freed just after it returns. 
After this, every subsequent call to update_effective_progs() causes this already deallocated pointer to be dereferenced in prog_list_length(), and triggers KASAN UAF error. To fix this issue don't preserve the pointer to the prog or link in the list, but remove it and replace it with a dummy prog without shrinking the table. The subsequent call to __cgroup_bpf_detach() or __cgroup_bpf_detach() will correct it. Fixes: af6eea57437a ("bpf: Implement bpf_link-based cgroup BPF program attachment") Reported-by: Signed-off-by: Tadeusz Struk Signed-off-by: Andrii Nakryiko Cc: Link: https://syzkaller.appspot.com/bug?id=8ebf179a95c2a2670f7cf1ba62429ec044369db4 Link: https://lore.kernel.org/bpf/20220517180420.87954-1-tadeusz.struk@linaro.org --- kernel/bpf/cgroup.c | 70 +++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 60 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index afb414b26d01..7a394f7c205c 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -720,6 +720,60 @@ static struct bpf_prog_list *find_detach_entry(struct list_head *progs, return ERR_PTR(-ENOENT); } +/** + * purge_effective_progs() - After compute_effective_progs fails to alloc new + * cgrp->bpf.inactive table we can recover by + * recomputing the array in place. + * + * @cgrp: The cgroup which descendants to travers + * @prog: A program to detach or NULL + * @link: A link to detach or NULL + * @atype: Type of detach operation + */ +static void purge_effective_progs(struct cgroup *cgrp, struct bpf_prog *prog, + struct bpf_cgroup_link *link, + enum cgroup_bpf_attach_type atype) +{ + struct cgroup_subsys_state *css; + struct bpf_prog_array *progs; + struct bpf_prog_list *pl; + struct list_head *head; + struct cgroup *cg; + int pos; + + /* recompute effective prog array in place */ + css_for_each_descendant_pre(css, &cgrp->self) { + struct cgroup *desc = container_of(css, struct cgroup, self); + + if (percpu_ref_is_zero(&desc->bpf.refcnt)) + continue; + + /* find position of link or prog in effective progs array */ + for (pos = 0, cg = desc; cg; cg = cgroup_parent(cg)) { + if (pos && !(cg->bpf.flags[atype] & BPF_F_ALLOW_MULTI)) + continue; + + head = &cg->bpf.progs[atype]; + list_for_each_entry(pl, head, node) { + if (!prog_list_prog(pl)) + continue; + if (pl->prog == prog && pl->link == link) + goto found; + pos++; + } + } +found: + BUG_ON(!cg); + progs = rcu_dereference_protected( + desc->bpf.effective[atype], + lockdep_is_held(&cgroup_mutex)); + + /* Remove the program from the array */ + WARN_ONCE(bpf_prog_array_delete_safe_at(progs, pos), + "Failed to purge a prog from array at index %d", pos); + } +} + /** * __cgroup_bpf_detach() - Detach the program or link from a cgroup, and * propagate the change to descendants @@ -739,7 +793,6 @@ static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, struct bpf_prog_list *pl; struct list_head *progs; u32 flags; - int err; atype = to_cgroup_bpf_attach_type(type); if (atype < 0) @@ -761,9 +814,12 @@ static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, pl->prog = NULL; pl->link = NULL; - err = update_effective_progs(cgrp, atype); - if (err) - goto cleanup; + if (update_effective_progs(cgrp, atype)) { + /* if update effective array failed replace the prog with a dummy prog*/ + pl->prog = old_prog; + pl->link = link; + purge_effective_progs(cgrp, old_prog, link, atype); + } /* now can actually delete it from this cgroup list */ list_del(&pl->node); @@ -775,12 +831,6 
@@ static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, bpf_prog_put(old_prog); static_branch_dec(&cgroup_bpf_enabled_key[atype]); return 0; - -cleanup: - /* restore back prog or link */ - pl->prog = old_prog; - pl->link = link; - return err; } static int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, -- cgit From 5f69a6577bc33d8f6d6bbe02bccdeb357b287f56 Mon Sep 17 00:00:00 2001 From: Chen Wandun Date: Thu, 26 May 2022 20:26:56 +0800 Subject: psi: dont alloc memory for psi by default Memory about struct psi_group is allocated by default for each cgroup even if psi_disabled is true, in this case, these allocated memory is waste, so alloc memory for struct psi_group only when psi_disabled is false. Signed-off-by: Chen Wandun Acked-by: Johannes Weiner Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 8 ++++---- kernel/sched/psi.c | 19 +++++++++++++------ 2 files changed, 17 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 1779ccddb734..90a654cb8a1e 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3609,21 +3609,21 @@ static int cpu_stat_show(struct seq_file *seq, void *v) static int cgroup_io_pressure_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; - struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi; + struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi; return psi_show(seq, psi, PSI_IO); } static int cgroup_memory_pressure_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; - struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi; + struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi; return psi_show(seq, psi, PSI_MEM); } static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; - struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi; + struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi; return psi_show(seq, psi, PSI_CPU); } @@ -3649,7 +3649,7 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf, return -EBUSY; } - psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi; + psi = cgroup_ino(cgrp) == 1 ? 
&psi_system : cgrp->psi; new = psi_trigger_create(psi, buf, nbytes, res); if (IS_ERR(new)) { cgroup_put(cgrp); diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index a337f3e35997..ec66b40bdd40 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -957,10 +957,16 @@ int psi_cgroup_alloc(struct cgroup *cgroup) if (static_branch_likely(&psi_disabled)) return 0; - cgroup->psi.pcpu = alloc_percpu(struct psi_group_cpu); - if (!cgroup->psi.pcpu) + cgroup->psi = kmalloc(sizeof(struct psi_group), GFP_KERNEL); + if (!cgroup->psi) return -ENOMEM; - group_init(&cgroup->psi); + + cgroup->psi->pcpu = alloc_percpu(struct psi_group_cpu); + if (!cgroup->psi->pcpu) { + kfree(cgroup->psi); + return -ENOMEM; + } + group_init(cgroup->psi); return 0; } @@ -969,10 +975,11 @@ void psi_cgroup_free(struct cgroup *cgroup) if (static_branch_likely(&psi_disabled)) return; - cancel_delayed_work_sync(&cgroup->psi.avgs_work); - free_percpu(cgroup->psi.pcpu); + cancel_delayed_work_sync(&cgroup->psi->avgs_work); + free_percpu(cgroup->psi->pcpu); /* All triggers must be removed by now */ - WARN_ONCE(cgroup->psi.poll_states, "psi: trigger leak\n"); + WARN_ONCE(cgroup->psi->poll_states, "psi: trigger leak\n"); + kfree(cgroup->psi); } /** -- cgit From 6089fb325cf737eeb2c4d236c94697112ca860da Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 6 Jun 2022 23:26:00 -0700 Subject: bpf: Add btf enum64 support Currently, BTF only supports upto 32bit enum value with BTF_KIND_ENUM. But in kernel, some enum indeed has 64bit values, e.g., in uapi bpf.h, we have enum { BPF_F_INDEX_MASK = 0xffffffffULL, BPF_F_CURRENT_CPU = BPF_F_INDEX_MASK, BPF_F_CTXLEN_MASK = (0xfffffULL << 32), }; In this case, BTF_KIND_ENUM will encode the value of BPF_F_CTXLEN_MASK as 0, which certainly is incorrect. This patch added a new btf kind, BTF_KIND_ENUM64, which permits 64bit value to cover the above use case. The BTF_KIND_ENUM64 has the following three fields followed by the common type: struct bpf_enum64 { __u32 nume_off; __u32 val_lo32; __u32 val_hi32; }; Currently, btf type section has an alignment of 4 as all element types are u32. Representing the value with __u64 will introduce a pad for bpf_enum64 and may also introduce misalignment for the 64bit value. Hence, two members of val_hi32 and val_lo32 are chosen to avoid these issues. The kflag is also introduced for BTF_KIND_ENUM and BTF_KIND_ENUM64 to indicate whether the value is signed or unsigned. The kflag intends to provide consistent output of BTF C fortmat with the original source code. For example, the original BTF_KIND_ENUM bit value is 0xffffffff. The format C has two choices, printing out 0xffffffff or -1 and current libbpf prints out as unsigned value. But if the signedness is preserved in btf, the value can be printed the same as the original source code. The kflag value 0 means unsigned values, which is consistent to the default by libbpf and should also cover most cases as well. The new BTF_KIND_ENUM64 is intended to support the enum value represented as 64bit value. But it can represent all BTF_KIND_ENUM values as well. The compiler ([1]) and pahole will generate BTF_KIND_ENUM64 only if the value has to be represented with 64 bits. In addition, a static inline function btf_kind_core_compat() is introduced which will be used later when libbpf relo_core.c changed. Here the kernel shares the same relo_core.c with libbpf. 
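For clarity, a minimal sketch of how the split value is put back together (the helper name below is illustrative; in-tree this is what btf_enum64_value() does with the two 32-bit halves):

  /* Reassemble the 64-bit enum value from the halves carried by
   * struct btf_enum64 (uapi btf header). Illustrative helper only.
   */
  static inline __u64 example_enum64_value(const struct btf_enum64 *e)
  {
          return ((__u64)e->val_hi32 << 32) | e->val_lo32;
  }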
[1] https://reviews.llvm.org/D124641 Acked-by: Andrii Nakryiko Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20220607062600.3716578-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 142 +++++++++++++++++++++++++++++++++++++++++++++----- kernel/bpf/verifier.c | 2 +- 2 files changed, 129 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 7bccaa4646e5..6c0d8480e15c 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -309,6 +309,7 @@ static const char * const btf_kind_str[NR_BTF_KINDS] = { [BTF_KIND_FLOAT] = "FLOAT", [BTF_KIND_DECL_TAG] = "DECL_TAG", [BTF_KIND_TYPE_TAG] = "TYPE_TAG", + [BTF_KIND_ENUM64] = "ENUM64", }; const char *btf_type_str(const struct btf_type *t) @@ -666,6 +667,7 @@ static bool btf_type_has_size(const struct btf_type *t) case BTF_KIND_ENUM: case BTF_KIND_DATASEC: case BTF_KIND_FLOAT: + case BTF_KIND_ENUM64: return true; } @@ -711,6 +713,11 @@ static const struct btf_decl_tag *btf_type_decl_tag(const struct btf_type *t) return (const struct btf_decl_tag *)(t + 1); } +static const struct btf_enum64 *btf_type_enum64(const struct btf_type *t) +{ + return (const struct btf_enum64 *)(t + 1); +} + static const struct btf_kind_operations *btf_type_ops(const struct btf_type *t) { return kind_ops[BTF_INFO_KIND(t->info)]; @@ -1019,6 +1026,7 @@ static const char *btf_show_name(struct btf_show *show) parens = "{"; break; case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: prefix = "enum"; break; default: @@ -1834,6 +1842,7 @@ __btf_resolve_size(const struct btf *btf, const struct btf_type *type, case BTF_KIND_UNION: case BTF_KIND_ENUM: case BTF_KIND_FLOAT: + case BTF_KIND_ENUM64: size = type->size; goto resolved; @@ -3670,6 +3679,7 @@ static s32 btf_enum_check_meta(struct btf_verifier_env *env, { const struct btf_enum *enums = btf_type_enum(t); struct btf *btf = env->btf; + const char *fmt_str; u16 i, nr_enums; u32 meta_needed; @@ -3683,11 +3693,6 @@ static s32 btf_enum_check_meta(struct btf_verifier_env *env, return -EINVAL; } - if (btf_type_kflag(t)) { - btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); - return -EINVAL; - } - if (t->size > 8 || !is_power_of_2(t->size)) { btf_verifier_log_type(env, t, "Unexpected size"); return -EINVAL; @@ -3718,7 +3723,8 @@ static s32 btf_enum_check_meta(struct btf_verifier_env *env, if (env->log.level == BPF_LOG_KERNEL) continue; - btf_verifier_log(env, "\t%s val=%d\n", + fmt_str = btf_type_kflag(t) ? 
"\t%s val=%d\n" : "\t%s val=%u\n"; + btf_verifier_log(env, fmt_str, __btf_name_by_offset(btf, enums[i].name_off), enums[i].val); } @@ -3759,7 +3765,10 @@ static void btf_enum_show(const struct btf *btf, const struct btf_type *t, return; } - btf_show_type_value(show, "%d", v); + if (btf_type_kflag(t)) + btf_show_type_value(show, "%d", v); + else + btf_show_type_value(show, "%u", v); btf_show_end_type(show); } @@ -3772,6 +3781,109 @@ static struct btf_kind_operations enum_ops = { .show = btf_enum_show, }; +static s32 btf_enum64_check_meta(struct btf_verifier_env *env, + const struct btf_type *t, + u32 meta_left) +{ + const struct btf_enum64 *enums = btf_type_enum64(t); + struct btf *btf = env->btf; + const char *fmt_str; + u16 i, nr_enums; + u32 meta_needed; + + nr_enums = btf_type_vlen(t); + meta_needed = nr_enums * sizeof(*enums); + + if (meta_left < meta_needed) { + btf_verifier_log_basic(env, t, + "meta_left:%u meta_needed:%u", + meta_left, meta_needed); + return -EINVAL; + } + + if (t->size > 8 || !is_power_of_2(t->size)) { + btf_verifier_log_type(env, t, "Unexpected size"); + return -EINVAL; + } + + /* enum type either no name or a valid one */ + if (t->name_off && + !btf_name_valid_identifier(env->btf, t->name_off)) { + btf_verifier_log_type(env, t, "Invalid name"); + return -EINVAL; + } + + btf_verifier_log_type(env, t, NULL); + + for (i = 0; i < nr_enums; i++) { + if (!btf_name_offset_valid(btf, enums[i].name_off)) { + btf_verifier_log(env, "\tInvalid name_offset:%u", + enums[i].name_off); + return -EINVAL; + } + + /* enum member must have a valid name */ + if (!enums[i].name_off || + !btf_name_valid_identifier(btf, enums[i].name_off)) { + btf_verifier_log_type(env, t, "Invalid name"); + return -EINVAL; + } + + if (env->log.level == BPF_LOG_KERNEL) + continue; + + fmt_str = btf_type_kflag(t) ? 
"\t%s val=%lld\n" : "\t%s val=%llu\n"; + btf_verifier_log(env, fmt_str, + __btf_name_by_offset(btf, enums[i].name_off), + btf_enum64_value(enums + i)); + } + + return meta_needed; +} + +static void btf_enum64_show(const struct btf *btf, const struct btf_type *t, + u32 type_id, void *data, u8 bits_offset, + struct btf_show *show) +{ + const struct btf_enum64 *enums = btf_type_enum64(t); + u32 i, nr_enums = btf_type_vlen(t); + void *safe_data; + s64 v; + + safe_data = btf_show_start_type(show, t, type_id, data); + if (!safe_data) + return; + + v = *(u64 *)safe_data; + + for (i = 0; i < nr_enums; i++) { + if (v != btf_enum64_value(enums + i)) + continue; + + btf_show_type_value(show, "%s", + __btf_name_by_offset(btf, + enums[i].name_off)); + + btf_show_end_type(show); + return; + } + + if (btf_type_kflag(t)) + btf_show_type_value(show, "%lld", v); + else + btf_show_type_value(show, "%llu", v); + btf_show_end_type(show); +} + +static struct btf_kind_operations enum64_ops = { + .check_meta = btf_enum64_check_meta, + .resolve = btf_df_resolve, + .check_member = btf_enum_check_member, + .check_kflag_member = btf_enum_check_kflag_member, + .log_details = btf_enum_log, + .show = btf_enum64_show, +}; + static s32 btf_func_proto_check_meta(struct btf_verifier_env *env, const struct btf_type *t, u32 meta_left) @@ -4438,6 +4550,7 @@ static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS] = { [BTF_KIND_FLOAT] = &float_ops, [BTF_KIND_DECL_TAG] = &decl_tag_ops, [BTF_KIND_TYPE_TAG] = &modifier_ops, + [BTF_KIND_ENUM64] = &enum64_ops, }; static s32 btf_check_meta(struct btf_verifier_env *env, @@ -5299,7 +5412,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, /* skip modifiers */ while (btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); - if (btf_type_is_small_int(t) || btf_type_is_enum(t)) + if (btf_type_is_small_int(t) || btf_is_any_enum(t)) /* accessing a scalar */ return true; if (!btf_type_is_ptr(t)) { @@ -5763,7 +5876,7 @@ static int __get_type_size(struct btf *btf, u32 btf_id, if (btf_type_is_ptr(t)) /* kernel size of pointer. Not BPF's size of pointer*/ return sizeof(void *); - if (btf_type_is_int(t) || btf_type_is_enum(t)) + if (btf_type_is_int(t) || btf_is_any_enum(t)) return t->size; *bad_type = t; return -EINVAL; @@ -5911,7 +6024,7 @@ static int btf_check_func_type_match(struct bpf_verifier_log *log, * to context only. And only global functions can be replaced. * Hence type check only those types. */ - if (btf_type_is_int(t1) || btf_type_is_enum(t1)) + if (btf_type_is_int(t1) || btf_is_any_enum(t1)) continue; if (!btf_type_is_ptr(t1)) { bpf_log(log, @@ -6408,7 +6521,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, t = btf_type_by_id(btf, t->type); while (btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); - if (!btf_type_is_int(t) && !btf_type_is_enum(t)) { + if (!btf_type_is_int(t) && !btf_is_any_enum(t)) { bpf_log(log, "Global function %s() doesn't return scalar. 
Only those are supported.\n", tname); @@ -6423,7 +6536,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, t = btf_type_by_id(btf, args[i].type); while (btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); - if (btf_type_is_int(t) || btf_type_is_enum(t)) { + if (btf_type_is_int(t) || btf_is_any_enum(t)) { reg->type = SCALAR_VALUE; continue; } @@ -7335,6 +7448,7 @@ recur: case BTF_KIND_UNION: case BTF_KIND_ENUM: case BTF_KIND_FWD: + case BTF_KIND_ENUM64: return 1; case BTF_KIND_INT: /* just reject deprecated bitfield-like integers; all other @@ -7387,10 +7501,10 @@ recur: * field-based relocations. This function assumes that root types were already * checked for name match. Beyond that initial root-level name check, names * are completely ignored. Compatibility rules are as follows: - * - any two STRUCTs/UNIONs/FWDs/ENUMs/INTs are considered compatible, but + * - any two STRUCTs/UNIONs/FWDs/ENUMs/INTs/ENUM64s are considered compatible, but * kind should match for local and target types (i.e., STRUCT is not * compatible with UNION); - * - for ENUMs, the size is ignored; + * - for ENUMs/ENUM64s, the size is ignored; * - for INT, size and signedness are ignored; * - for ARRAY, dimensionality is ignored, element types are checked for * compatibility recursively; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index aedac2ac02b9..2d2872682278 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -10901,7 +10901,7 @@ static int check_btf_func(struct bpf_verifier_env *env, goto err_free; ret_type = btf_type_skip_modifiers(btf, func_proto->type, NULL); scalar_return = - btf_type_is_small_int(ret_type) || btf_type_is_enum(ret_type); + btf_type_is_small_int(ret_type) || btf_is_any_enum(ret_type); if (i && !scalar_return && env->subprog_info[i].has_ld_abs) { verbose(env, "LD_ABS is only allowed in functions that return 'int'.\n"); goto err_free; -- cgit From 5ee6cfdd11aaf5aee5cd48baf35b1710caa68a5c Mon Sep 17 00:00:00 2001 From: Shreenidhi Shedi Date: Sun, 15 May 2022 13:01:09 +0530 Subject: audit: remove redundant data_len check data_len is already getting checked if it's less than 2 earlier in this function. Signed-off-by: Shreenidhi Shedi Signed-off-by: Paul Moore --- kernel/audit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 7690c29d4ee4..0749211d5552 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1390,7 +1390,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) str); } else { audit_log_format(ab, " data="); - if (data_len > 0 && str[data_len - 1] == '\0') + if (str[data_len - 1] == '\0') data_len--; audit_log_n_untrustedstring(ab, str, data_len); } -- cgit From 73b4b53276a1d6290cd4f47dbbc885b6e6e59ac6 Mon Sep 17 00:00:00 2001 From: Andrey Grodzovsky Date: Thu, 19 May 2022 09:47:28 -0400 Subject: Revert "workqueue: remove unused cancel_work()" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 6417250d3f894e66a68ba1cd93676143f2376a6f. amdpgu need this function in order to prematurly stop pending reset works when another reset work already in progress. 
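A usage sketch of the re-exported helper (the driver structure and field names below are made up for illustration): unlike cancel_work_sync(), cancel_work() only drops a work item that is still pending and does not wait for a running instance, which is the behaviour wanted when a newer reset supersedes a queued one.

  #include <linux/workqueue.h>

  /* Hypothetical device, for illustration only. */
  struct my_dev {
          struct work_struct reset_work;
  };

  static void my_dev_restart_reset(struct my_dev *dev)
  {
          /* Drop a reset that is queued but not yet executing. */
          cancel_work(&dev->reset_work);
          /* ...and queue a fresh one. */
          queue_work(system_wq, &dev->reset_work);
  }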
Acked-by: Tejun Heo Signed-off-by: Andrey Grodzovsky Reviewed-by: Lai Jiangshan Reviewed-by: Christian König Signed-off-by: Alex Deucher --- kernel/workqueue.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 0d2514b4ff0d..20d226d5bbc2 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3258,6 +3258,15 @@ static bool __cancel_work(struct work_struct *work, bool is_dwork) return ret; } +/* + * See cancel_delayed_work() + */ +bool cancel_work(struct work_struct *work) +{ + return __cancel_work(work, false); +} +EXPORT_SYMBOL(cancel_work); + /** * cancel_delayed_work - cancel a delayed work * @dwork: delayed_work to cancel -- cgit From 54a9c3a42d92d2b0d4e0f64214ebbbfcf7fbfda8 Mon Sep 17 00:00:00 2001 From: Feng Zhou Date: Fri, 10 Jun 2022 10:33:07 +0800 Subject: bpf: avoid grabbing spin_locks of all cpus when no free elems This patch use head->first in pcpu_freelist_head to check freelist having free or not. If having, grab spin_lock, or check next cpu's freelist. Before patch: hash_map performance ./map_perf_test 1 0:hash_map_perf pre-alloc 1043397 events per sec ... The average of the test results is around 1050000 events per sec. hash_map the worst: no free ./run_bench_bpf_hashmap_full_update.sh Setting up benchmark 'bpf-hashmap-ful-update'... Benchmark 'bpf-hashmap-ful-update' started. 1:hash_map_full_perf 15687 events per sec ... The average of the test results is around 16000 events per sec. ftrace trace: 0) | htab_map_update_elem() { 0) | __pcpu_freelist_pop() { 0) | _raw_spin_lock() 0) | _raw_spin_unlock() 0) | ... 0) + 25.188 us | } 0) + 28.439 us | } The test machine is 16C, trying to get spin_lock 17 times, in addition to 16c, there is an extralist. after patch: hash_map performance ./map_perf_test 1 0:hash_map_perf pre-alloc 1053298 events per sec ... The average of the test results is around 1050000 events per sec. hash_map worst: no free ./run_bench_bpf_hashmap_full_update.sh Setting up benchmark 'bpf-hashmap-ful-update'... Benchmark 'bpf-hashmap-ful-update' started. 1:hash_map_full_perf 555830 events per sec ... The average of the test results is around 550000 events per sec. ftrace trace: 0) | htab_map_update_elem() { 0) | alloc_htab_elem() { 0) 0.586 us | __pcpu_freelist_pop(); 0) 0.945 us | } 0) 8.669 us | } It can be seen that after adding this patch, the map performance is almost not degraded, and when free=0, first check head->first instead of directly acquiring spin_lock. 
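The shape of the change is a peek-then-lock fast path: head->first is read locklessly, and the lock is only taken when the list looks non-empty. Because head->first is now also read outside the lock, the locked updates are converted to WRITE_ONCE() to pair with the lockless READ_ONCE(). Condensed sketch of the pattern (the real hunks below also handle the per-cpu and extralist iteration):

  /* Condensed sketch; types as in kernel/bpf/percpu_freelist.h. */
  static struct pcpu_freelist_node *
  pop_head_sketch(struct pcpu_freelist_head *head)
  {
          struct pcpu_freelist_node *node;

          if (!READ_ONCE(head->first))    /* lockless peek */
                  return NULL;
          raw_spin_lock(&head->lock);
          node = head->first;
          if (node)
                  WRITE_ONCE(head->first, node->next);
          raw_spin_unlock(&head->lock);
          return node;
  }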
Co-developed-by: Chengming Zhou Signed-off-by: Chengming Zhou Signed-off-by: Feng Zhou Link: https://lore.kernel.org/r/20220610023308.93798-2-zhoufeng.zf@bytedance.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/percpu_freelist.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c index 3d897de89061..00b874c8e889 100644 --- a/kernel/bpf/percpu_freelist.c +++ b/kernel/bpf/percpu_freelist.c @@ -31,7 +31,7 @@ static inline void pcpu_freelist_push_node(struct pcpu_freelist_head *head, struct pcpu_freelist_node *node) { node->next = head->first; - head->first = node; + WRITE_ONCE(head->first, node); } static inline void ___pcpu_freelist_push(struct pcpu_freelist_head *head, @@ -130,14 +130,17 @@ static struct pcpu_freelist_node *___pcpu_freelist_pop(struct pcpu_freelist *s) orig_cpu = cpu = raw_smp_processor_id(); while (1) { head = per_cpu_ptr(s->freelist, cpu); + if (!READ_ONCE(head->first)) + goto next_cpu; raw_spin_lock(&head->lock); node = head->first; if (node) { - head->first = node->next; + WRITE_ONCE(head->first, node->next); raw_spin_unlock(&head->lock); return node; } raw_spin_unlock(&head->lock); +next_cpu: cpu = cpumask_next(cpu, cpu_possible_mask); if (cpu >= nr_cpu_ids) cpu = 0; @@ -146,10 +149,12 @@ static struct pcpu_freelist_node *___pcpu_freelist_pop(struct pcpu_freelist *s) } /* per cpu lists are all empty, try extralist */ + if (!READ_ONCE(s->extralist.first)) + return NULL; raw_spin_lock(&s->extralist.lock); node = s->extralist.first; if (node) - s->extralist.first = node->next; + WRITE_ONCE(s->extralist.first, node->next); raw_spin_unlock(&s->extralist.lock); return node; } @@ -164,15 +169,18 @@ ___pcpu_freelist_pop_nmi(struct pcpu_freelist *s) orig_cpu = cpu = raw_smp_processor_id(); while (1) { head = per_cpu_ptr(s->freelist, cpu); + if (!READ_ONCE(head->first)) + goto next_cpu; if (raw_spin_trylock(&head->lock)) { node = head->first; if (node) { - head->first = node->next; + WRITE_ONCE(head->first, node->next); raw_spin_unlock(&head->lock); return node; } raw_spin_unlock(&head->lock); } +next_cpu: cpu = cpumask_next(cpu, cpu_possible_mask); if (cpu >= nr_cpu_ids) cpu = 0; @@ -181,11 +189,11 @@ ___pcpu_freelist_pop_nmi(struct pcpu_freelist *s) } /* cannot pop from per cpu lists, try extralist */ - if (!raw_spin_trylock(&s->extralist.lock)) + if (!READ_ONCE(s->extralist.first) || !raw_spin_trylock(&s->extralist.lock)) return NULL; node = s->extralist.first; if (node) - s->extralist.first = node->next; + WRITE_ONCE(s->extralist.first, node->next); raw_spin_unlock(&s->extralist.lock); return node; } -- cgit From 546093206ba16623c18e344630dbfdd71a4327e0 Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Sat, 11 Jun 2022 17:23:04 +0800 Subject: audit: make is_audit_feature_set() static Currently nobody use is_audit_feature_set() outside this file, so make it static. 
Signed-off-by: Xiu Jianfeng Signed-off-by: Paul Moore --- kernel/audit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 0749211d5552..a75978ae38ad 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1100,7 +1100,7 @@ static inline void audit_log_user_recv_msg(struct audit_buffer **ab, audit_log_common_recv_msg(NULL, ab, msg_type); } -int is_audit_feature_set(int i) +static int is_audit_feature_set(int i) { return af.features & AUDIT_FEATURE_TO_MASK(i); } -- cgit From 133e2d3e81de5d9706cab2dd1d52d231c27382e5 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Sun, 12 Jun 2022 23:07:22 -0700 Subject: fs/exec: allow to unshare a time namespace on vfork+exec Right now, a new process can't be forked in another time namespace if it shares mm with its parent. It is prohibited, because each time namespace has its own vvar page that is mapped into a process address space. When a process calls exec, it gets a new mm and so it could be "legal" to switch time namespace in that case. This was not implemented and now if we want to do this, we need to add another clone flag to not break backward compatibility. We don't have any user requests to switch times on exec except the vfork+exec combination, so there is no reason to add a new clone flag. As for vfork+exec, this should be safe to allow switching timens with the current clone flag. Right now, vfork (CLONE_VFORK | CLONE_VM) fails if a child is forked into another time namespace. With this change, vfork creates a new process in parent's timens, and the following exec does the actual switch to the target time namespace. Suggested-by: Florian Weimer Signed-off-by: Andrei Vagin Acked-by: Christian Brauner (Microsoft) Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20220613060723.197407-1-avagin@gmail.com --- kernel/fork.c | 5 ++++- kernel/nsproxy.c | 3 ++- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 9d44f2d46c69..9174146f6812 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2033,8 +2033,11 @@ static __latent_entropy struct task_struct *copy_process( /* * If the new process will be in a different time namespace * do not allow it to share VM or a thread group with the forking task. + * + * On vfork, the child process enters the target time namespace only + * after exec. */ - if (clone_flags & (CLONE_THREAD | CLONE_VM)) { + if ((clone_flags & (CLONE_VM | CLONE_VFORK)) == CLONE_VM) { if (nsp->time_ns != nsp->time_ns_for_children) return ERR_PTR(-EINVAL); } diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index eec72ca962e2..b4cbb406bc28 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -179,7 +179,8 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) if (IS_ERR(new_ns)) return PTR_ERR(new_ns); - timens_on_fork(new_ns, tsk); + if ((flags & CLONE_VM) == 0) + timens_on_fork(new_ns, tsk); tsk->nsproxy = new_ns; return 0; -- cgit From e210a89f5b07680fe21d21e846e6817346c5ba3b Mon Sep 17 00:00:00 2001 From: Lin Feng Date: Thu, 16 Jun 2022 18:38:30 +0800 Subject: cgroup.c: add helper __cset_cgroup_from_root to cleanup duplicated codes No funtionality change, but save us some lines. 
Signed-off-by: Lin Feng Acked-by: Mukesh Ojha Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 58 +++++++++++++++++++++++--------------------------- 1 file changed, 27 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 90a654cb8a1e..4b67e6da6bf2 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1376,6 +1376,31 @@ static void cgroup_destroy_root(struct cgroup_root *root) cgroup_free_root(root); } +static inline struct cgroup *__cset_cgroup_from_root(struct css_set *cset, + struct cgroup_root *root) +{ + struct cgroup *res_cgroup = NULL; + + if (cset == &init_css_set) { + res_cgroup = &root->cgrp; + } else if (root == &cgrp_dfl_root) { + res_cgroup = cset->dfl_cgrp; + } else { + struct cgrp_cset_link *link; + + list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { + struct cgroup *c = link->cgrp; + + if (c->root == root) { + res_cgroup = c; + break; + } + } + } + + return res_cgroup; +} + /* * look up cgroup associated with current task's cgroup namespace on the * specified hierarchy @@ -1391,22 +1416,8 @@ current_cgns_cgroup_from_root(struct cgroup_root *root) rcu_read_lock(); cset = current->nsproxy->cgroup_ns->root_cset; - if (cset == &init_css_set) { - res = &root->cgrp; - } else if (root == &cgrp_dfl_root) { - res = cset->dfl_cgrp; - } else { - struct cgrp_cset_link *link; - - list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { - struct cgroup *c = link->cgrp; + res = __cset_cgroup_from_root(cset, root); - if (c->root == root) { - res = c; - break; - } - } - } rcu_read_unlock(); BUG_ON(!res); @@ -1422,22 +1433,7 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset, lockdep_assert_held(&cgroup_mutex); lockdep_assert_held(&css_set_lock); - if (cset == &init_css_set) { - res = &root->cgrp; - } else if (root == &cgrp_dfl_root) { - res = cset->dfl_cgrp; - } else { - struct cgrp_cset_link *link; - - list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { - struct cgroup *c = link->cgrp; - - if (c->root == root) { - res = c; - break; - } - } - } + res = __cset_cgroup_from_root(cset, root); BUG_ON(!res); return res; -- cgit From 8c7dcb84e3b744b2b70baa7a44a9b1881c33a9c9 Mon Sep 17 00:00:00 2001 From: Delyan Kratunov Date: Tue, 14 Jun 2022 23:10:46 +0000 Subject: bpf: implement sleepable uprobes by chaining gps uprobes work by raising a trap, setting a task flag from within the interrupt handler, and processing the actual work for the uprobe on the way back to userspace. As a result, uprobe handlers already execute in a might_fault/_sleep context. The primary obstacle to sleepable bpf uprobe programs is therefore on the bpf side. Namely, the bpf_prog_array attached to the uprobe is protected by normal rcu. In order for uprobe bpf programs to become sleepable, it has to be protected by the tasks_trace rcu flavor instead (and kfree() called after a corresponding grace period). Therefore, the free path for bpf_prog_array now chains a tasks_trace and normal grace periods one after the other. Users who iterate under tasks_trace read section would be safe, as would users who iterate under normal read sections (from non-sleepable locations). The downside is that the tasks_trace latency affects all perf_event-attached bpf programs (and not just uprobe ones). This is deemed safe given the possible attach rates for kprobe/uprobe/tp programs. 
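Roughly, a may-sleep reader of the array looks like the sketch below (illustrative fragment, not the exact helper added by this series), which is why the free path has to wait for a tasks-trace grace period in addition to a normal one:

  /* Sketch of a sleepable reader of a perf_event's prog_array. */
  rcu_read_lock_trace();
  array = rcu_dereference_check(event->tp_event->prog_array,
                                rcu_read_lock_trace_held());
  /* ... run the programs; sleeping/faulting is allowed here ... */
  rcu_read_unlock_trace();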
Separately, non-sleepable programs need access to dynamically sized rcu-protected maps, so bpf_run_prog_array_sleepables now conditionally takes an rcu read section, in addition to the overarching tasks_trace section. Signed-off-by: Delyan Kratunov Link: https://lore.kernel.org/r/ce844d62a2fd0443b08c5ab02e95bc7149f9aeb1.1655248076.git.delyank@fb.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/core.c | 15 +++++++++++++++ kernel/trace/bpf_trace.c | 4 ++-- kernel/trace/trace_uprobe.c | 5 ++--- 3 files changed, 19 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index e78cc5eea4a5..b5ffebcce6cc 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2279,6 +2279,21 @@ void bpf_prog_array_free(struct bpf_prog_array *progs) kfree_rcu(progs, rcu); } +static void __bpf_prog_array_free_sleepable_cb(struct rcu_head *rcu) +{ + struct bpf_prog_array *progs; + + progs = container_of(rcu, struct bpf_prog_array, rcu); + kfree_rcu(progs, rcu); +} + +void bpf_prog_array_free_sleepable(struct bpf_prog_array *progs) +{ + if (!progs || progs == &bpf_empty_prog_array.hdr) + return; + call_rcu_tasks_trace(&progs->rcu, __bpf_prog_array_free_sleepable_cb); +} + int bpf_prog_array_length(struct bpf_prog_array *array) { struct bpf_prog_array_item *item; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 10b157a6d73e..d1c22594dbf9 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1936,7 +1936,7 @@ int perf_event_attach_bpf_prog(struct perf_event *event, event->prog = prog; event->bpf_cookie = bpf_cookie; rcu_assign_pointer(event->tp_event->prog_array, new_array); - bpf_prog_array_free(old_array); + bpf_prog_array_free_sleepable(old_array); unlock: mutex_unlock(&bpf_event_mutex); @@ -1962,7 +1962,7 @@ void perf_event_detach_bpf_prog(struct perf_event *event) bpf_prog_array_delete_safe(old_array, event->prog); } else { rcu_assign_pointer(event->tp_event->prog_array, new_array); - bpf_prog_array_free(old_array); + bpf_prog_array_free_sleepable(old_array); } bpf_prog_put(event->prog); diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 9711589273cd..0282c119b1b2 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -16,6 +16,7 @@ #include #include #include +#include #include "trace_dynevent.h" #include "trace_probe.h" @@ -1346,9 +1347,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, if (bpf_prog_array_valid(call)) { u32 ret; - preempt_disable(); - ret = trace_call_bpf(call, regs); - preempt_enable(); + ret = bpf_prog_run_array_sleepable(call->prog_array, regs, bpf_prog_run); if (!ret) return; } -- cgit From 64ad7556c75ea102eec2f5bcd60fe2d66ce70308 Mon Sep 17 00:00:00 2001 From: Delyan Kratunov Date: Tue, 14 Jun 2022 23:10:43 +0000 Subject: bpf: allow sleepable uprobe programs to attach uprobe and kprobe programs have the same program type, KPROBE, which is currently not allowed to load sleepable programs. To avoid adding a new UPROBE type, instead allow sleepable KPROBE programs to load and defer the is-it-actually-a-uprobe-program check to attachment time, where there's already validation of the corresponding perf_event. A corollary of this patch is that you can now load a sleepable kprobe program but cannot attach it. 
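For context, a minimal sleepable uprobe program could look like the sketch below. This assumes the libbpf side of the same series, which accepts the "uprobe.s" section prefix for sleepable uprobe programs; the target binary and function are supplied at attach time from user space.

  #include "vmlinux.h"
  #include <bpf/bpf_helpers.h>

  char LICENSE[] SEC("license") = "GPL";

  /* Loaded as a sleepable BPF_PROG_TYPE_KPROBE program; per this patch
   * it may only be attached to a uprobe perf event.
   */
  SEC("uprobe.s")
  int handle_target_func(struct pt_regs *ctx)
  {
          bpf_printk("sleepable uprobe fired");
          return 0;
  }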
Acked-by: Andrii Nakryiko Signed-off-by: Delyan Kratunov Link: https://lore.kernel.org/r/fcd44a7cd204f372f6bb03ef794e829adeaef299.1655248076.git.delyank@fb.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 4 ++-- kernel/events/core.c | 16 ++++++++++------ 2 files changed, 12 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2d2872682278..eadc23a8452c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -14829,8 +14829,8 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) } if (prog->aux->sleepable && prog->type != BPF_PROG_TYPE_TRACING && - prog->type != BPF_PROG_TYPE_LSM) { - verbose(env, "Only fentry/fexit/fmod_ret and lsm programs can be sleepable\n"); + prog->type != BPF_PROG_TYPE_LSM && prog->type != BPF_PROG_TYPE_KPROBE) { + verbose(env, "Only fentry/fexit/fmod_ret, lsm, and kprobe/uprobe programs can be sleepable\n"); return -EINVAL; } diff --git a/kernel/events/core.c b/kernel/events/core.c index 950b25c3f210..deee6815bdd3 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -10069,26 +10069,30 @@ static inline bool perf_event_is_tracing(struct perf_event *event) int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie) { - bool is_kprobe, is_tracepoint, is_syscall_tp; + bool is_kprobe, is_uprobe, is_tracepoint, is_syscall_tp; if (!perf_event_is_tracing(event)) return perf_event_set_bpf_handler(event, prog, bpf_cookie); - is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE; + is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_KPROBE; + is_uprobe = event->tp_event->flags & TRACE_EVENT_FL_UPROBE; is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT; is_syscall_tp = is_syscall_trace_event(event->tp_event); - if (!is_kprobe && !is_tracepoint && !is_syscall_tp) + if (!is_kprobe && !is_uprobe && !is_tracepoint && !is_syscall_tp) /* bpf programs can only be attached to u/kprobe or tracepoint */ return -EINVAL; - if ((is_kprobe && prog->type != BPF_PROG_TYPE_KPROBE) || + if (((is_kprobe || is_uprobe) && prog->type != BPF_PROG_TYPE_KPROBE) || (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT) || (is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT)) return -EINVAL; + if (prog->type == BPF_PROG_TYPE_KPROBE && prog->aux->sleepable && !is_uprobe) + /* only uprobe programs are allowed to be sleepable */ + return -EINVAL; + /* Kprobe override only works for kprobes, not uprobes. */ - if (prog->kprobe_override && - !(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) + if (prog->kprobe_override && !is_kprobe) return -EINVAL; if (is_tracepoint || is_syscall_tp) { -- cgit From 0fe6ee8f123a4dfb529a5aff07536bb481f34043 Mon Sep 17 00:00:00 2001 From: Chen Zhongjin Date: Tue, 31 May 2022 09:28:54 +0800 Subject: profiling: fix shift too large makes kernel panic 2d186afd04d6 ("profiling: fix shift-out-of-bounds bugs") limits shift value by [0, BITS_PER_LONG -1], which means [0, 63]. However, syzbot found that the max shift value should be the bit number of (_etext - _stext). If shift is outside of this, the "buffer_bytes" will be zero and will cause kzalloc(0). Then the kernel panics due to dereferencing the returned pointer 16. This can be easily reproduced by passing a large number like 60 to enable profiling and then run readprofile. 
LOGS: BUG: kernel NULL pointer dereference, address: 0000000000000010 #PF: supervisor write access in kernel mode #PF: error_code(0x0002) - not-present page PGD 6148067 P4D 6148067 PUD 6142067 PMD 0 PREEMPT SMP CPU: 4 PID: 184 Comm: readprofile Not tainted 5.18.0+ #162 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.15.0-0-g2dd4b9b3f840-prebuilt.qemu.org 04/01/2014 RIP: 0010:read_profile+0x104/0x220 RSP: 0018:ffffc900006fbe80 EFLAGS: 00000202 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000 RDX: ffff888006150000 RSI: 0000000000000001 RDI: ffffffff82aba4a0 RBP: 000000000188bb60 R08: 0000000000000010 R09: ffff888006151000 R10: 0000000000000000 R11: 0000000000000000 R12: ffffffff82aba4a0 R13: 0000000000000000 R14: ffffc900006fbf08 R15: 0000000000020c30 FS: 000000000188a8c0(0000) GS:ffff88803ed00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000010 CR3: 0000000006144000 CR4: 00000000000006e0 Call Trace: proc_reg_read+0x56/0x70 vfs_read+0x9a/0x1b0 ksys_read+0xa1/0xe0 ? fpregs_assert_state_consistent+0x1e/0x40 do_syscall_64+0x3a/0x80 entry_SYSCALL_64_after_hwframe+0x46/0xb0 RIP: 0033:0x4d4b4e RSP: 002b:00007ffebb668d58 EFLAGS: 00000246 ORIG_RAX: 0000000000000000 RAX: ffffffffffffffda RBX: 000000000188a8a0 RCX: 00000000004d4b4e RDX: 0000000000000400 RSI: 000000000188bb60 RDI: 0000000000000003 RBP: 0000000000000003 R08: 000000000000006e R09: 0000000000000000 R10: 0000000000000041 R11: 0000000000000246 R12: 000000000188bb60 R13: 0000000000000400 R14: 0000000000000000 R15: 000000000188bb60 Modules linked in: CR2: 0000000000000010 Killed ---[ end trace 0000000000000000 ]--- Check prof_len in profile_init() to prevent it be zero. Link: https://lkml.kernel.org/r/20220531012854.229439-1-chenzhongjin@huawei.com Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Chen Zhongjin Signed-off-by: Andrew Morton --- kernel/profile.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/profile.c b/kernel/profile.c index 37640a0bd8a3..ae82ddfc6a68 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -109,6 +109,13 @@ int __ref profile_init(void) /* only text is profiled */ prof_len = (_etext - _stext) >> prof_shift; + + if (!prof_len) { + pr_warn("profiling shift: %u too large\n", prof_shift); + prof_on = 0; + return -EINVAL; + } + buffer_bytes = prof_len*sizeof(atomic_t); if (!alloc_cpumask_var(&prof_cpu_mask, GFP_KERNEL)) -- cgit From f4da7afe07523ff8930c4466b09a15db18508cd4 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Fri, 27 May 2022 02:55:35 +0000 Subject: kexec_file: increase maximum file size to 4G In some case initrd can be large. For example, it could be a netboot image loaded by u-root, that is kexec'ing into it. The maximum size of initrd is arbitrary set to 2G. Also, the limit is not very obvious because it is hidden behind a generic INT_MAX macro. Theoretically, we could make it LONG_MAX, but it is safer to keep it sane, and just increase it to 4G. Increase the size to 4G, and make it obvious by having a new macro that specifies the maximum file size supported by kexec_file_load() syscall: KEXEC_FILE_SIZE_MAX. Link: https://lkml.kernel.org/r/20220527025535.3953665-3-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Cc: Sasha Levin Cc: "Eric W. 
Biederman" Cc: Greg Thelen Cc: Al Viro Cc: Baoquan He Signed-off-by: Andrew Morton --- kernel/kexec_file.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 145321a5e798..9b2839775c83 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -31,6 +31,9 @@ static int kexec_calculate_store_digests(struct kimage *image); +/* Maximum size in bytes for kernel/initrd files. */ +#define KEXEC_FILE_SIZE_MAX min_t(s64, 4LL << 30, SSIZE_MAX) + /* * Currently this is the only default function that is exported as some * architectures need it to do additional handlings. @@ -189,11 +192,12 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, const char __user *cmdline_ptr, unsigned long cmdline_len, unsigned flags) { - int ret; + ssize_t ret; void *ldata; ret = kernel_read_file_from_fd(kernel_fd, 0, &image->kernel_buf, - INT_MAX, NULL, READING_KEXEC_IMAGE); + KEXEC_FILE_SIZE_MAX, NULL, + READING_KEXEC_IMAGE); if (ret < 0) return ret; image->kernel_buf_len = ret; @@ -213,7 +217,7 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, /* It is possible that there no initramfs is being loaded */ if (!(flags & KEXEC_FILE_NO_INITRAMFS)) { ret = kernel_read_file_from_fd(initrd_fd, 0, &image->initrd_buf, - INT_MAX, NULL, + KEXEC_FILE_SIZE_MAX, NULL, READING_KEXEC_INITRAMFS); if (ret < 0) goto out; -- cgit From 508362ac66b0478affb4e52cb8da98478312d72d Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Wed, 15 Jun 2022 16:48:43 +0300 Subject: bpf: Allow helpers to accept pointers with a fixed size Before this commit, the BPF verifier required ARG_PTR_TO_MEM arguments to be followed by ARG_CONST_SIZE holding the size of the memory region. The helpers had to check that size in runtime. There are cases where the size expected by a helper is a compile-time constant. Checking it in runtime is an unnecessary overhead and waste of BPF registers. This commit allows helpers to accept pointers to memory without the corresponding ARG_CONST_SIZE, given that they define the memory region size in struct bpf_func_proto and use ARG_PTR_TO_FIXED_SIZE_MEM type. arg_size is unionized with arg_btf_id to reduce the kernel image size, and it's valid because they are used by different argument types. Signed-off-by: Maxim Mikityanskiy Reviewed-by: Tariq Toukan Link: https://lore.kernel.org/r/20220615134847.3753567-3-maximmi@nvidia.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 43 ++++++++++++++++++++++++++++++++----------- 1 file changed, 32 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index eadc23a8452c..2859901ffbe3 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5848,6 +5848,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; enum bpf_arg_type arg_type = fn->arg_type[arg]; enum bpf_reg_type type = reg->type; + u32 *arg_btf_id = NULL; int err = 0; if (arg_type == ARG_DONTCARE) @@ -5884,7 +5885,11 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, */ goto skip_type_check; - err = check_reg_type(env, regno, arg_type, fn->arg_btf_id[arg], meta); + /* arg_btf_id and arg_size are in a union. 
*/ + if (base_type(arg_type) == ARG_PTR_TO_BTF_ID) + arg_btf_id = fn->arg_btf_id[arg]; + + err = check_reg_type(env, regno, arg_type, arg_btf_id, meta); if (err) return err; @@ -6011,6 +6016,11 @@ skip_type_check: * next is_mem_size argument below. */ meta->raw_mode = arg_type & MEM_UNINIT; + if (arg_type & MEM_FIXED_SIZE) { + err = check_helper_mem_access(env, regno, + fn->arg_size[arg], false, + meta); + } } else if (arg_type_is_mem_size(arg_type)) { bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO); @@ -6400,11 +6410,19 @@ static bool check_raw_mode_ok(const struct bpf_func_proto *fn) return count <= 1; } -static bool check_args_pair_invalid(enum bpf_arg_type arg_curr, - enum bpf_arg_type arg_next) +static bool check_args_pair_invalid(const struct bpf_func_proto *fn, int arg) { - return (base_type(arg_curr) == ARG_PTR_TO_MEM) != - arg_type_is_mem_size(arg_next); + bool is_fixed = fn->arg_type[arg] & MEM_FIXED_SIZE; + bool has_size = fn->arg_size[arg] != 0; + bool is_next_size = false; + + if (arg + 1 < ARRAY_SIZE(fn->arg_type)) + is_next_size = arg_type_is_mem_size(fn->arg_type[arg + 1]); + + if (base_type(fn->arg_type[arg]) != ARG_PTR_TO_MEM) + return is_next_size; + + return has_size == is_next_size || is_next_size == is_fixed; } static bool check_arg_pair_ok(const struct bpf_func_proto *fn) @@ -6415,11 +6433,11 @@ static bool check_arg_pair_ok(const struct bpf_func_proto *fn) * helper function specification. */ if (arg_type_is_mem_size(fn->arg1_type) || - base_type(fn->arg5_type) == ARG_PTR_TO_MEM || - check_args_pair_invalid(fn->arg1_type, fn->arg2_type) || - check_args_pair_invalid(fn->arg2_type, fn->arg3_type) || - check_args_pair_invalid(fn->arg3_type, fn->arg4_type) || - check_args_pair_invalid(fn->arg4_type, fn->arg5_type)) + check_args_pair_invalid(fn, 0) || + check_args_pair_invalid(fn, 1) || + check_args_pair_invalid(fn, 2) || + check_args_pair_invalid(fn, 3) || + check_args_pair_invalid(fn, 4)) return false; return true; @@ -6460,7 +6478,10 @@ static bool check_btf_id_ok(const struct bpf_func_proto *fn) if (base_type(fn->arg_type[i]) == ARG_PTR_TO_BTF_ID && !fn->arg_btf_id[i]) return false; - if (base_type(fn->arg_type[i]) != ARG_PTR_TO_BTF_ID && fn->arg_btf_id[i]) + if (base_type(fn->arg_type[i]) != ARG_PTR_TO_BTF_ID && fn->arg_btf_id[i] && + /* arg_btf_id and arg_size are in a union. */ + (base_type(fn->arg_type[i]) != ARG_PTR_TO_MEM || + !(fn->arg_type[i] & MEM_FIXED_SIZE))) return false; } -- cgit From dc368e1c658e4f478a45e8d1d5b0c8392ca87506 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Thu, 16 Jun 2022 15:54:07 -0700 Subject: bpf: Fix non-static bpf_func_proto struct definitions This patch does two things: 1) Marks the dynptr bpf_func_proto structs that were added in [1] as static, as pointed out by the kernel test robot in [2]. 2) There are some bpf_func_proto structs marked as extern which can instead be statically defined. 
[1] https://lore.kernel.org/bpf/20220523210712.3641569-1-joannelkoong@gmail.com/ [2] https://lore.kernel.org/bpf/62ab89f2.Pko7sI08RAKdF8R6%25lkp@intel.com/ Reported-by: kernel test robot Signed-off-by: Joanne Koong Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220616225407.1878436-1-joannelkoong@gmail.com --- kernel/bpf/helpers.c | 12 ++++++------ kernel/bpf/syscall.c | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 225806a02efb..a1c84d256f83 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -584,7 +584,7 @@ BPF_CALL_3(bpf_strncmp, const char *, s1, u32, s1_sz, const char *, s2) return strncmp(s1, s2, s1_sz); } -const struct bpf_func_proto bpf_strncmp_proto = { +static const struct bpf_func_proto bpf_strncmp_proto = { .func = bpf_strncmp, .gpl_only = false, .ret_type = RET_INTEGER, @@ -1402,7 +1402,7 @@ BPF_CALL_2(bpf_kptr_xchg, void *, map_value, void *, ptr) */ #define BPF_PTR_POISON ((void *)((0xeB9FUL << 2) + POISON_POINTER_DELTA)) -const struct bpf_func_proto bpf_kptr_xchg_proto = { +static const struct bpf_func_proto bpf_kptr_xchg_proto = { .func = bpf_kptr_xchg, .gpl_only = false, .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, @@ -1487,7 +1487,7 @@ error: return err; } -const struct bpf_func_proto bpf_dynptr_from_mem_proto = { +static const struct bpf_func_proto bpf_dynptr_from_mem_proto = { .func = bpf_dynptr_from_mem, .gpl_only = false, .ret_type = RET_INTEGER, @@ -1513,7 +1513,7 @@ BPF_CALL_4(bpf_dynptr_read, void *, dst, u32, len, struct bpf_dynptr_kern *, src return 0; } -const struct bpf_func_proto bpf_dynptr_read_proto = { +static const struct bpf_func_proto bpf_dynptr_read_proto = { .func = bpf_dynptr_read, .gpl_only = false, .ret_type = RET_INTEGER, @@ -1539,7 +1539,7 @@ BPF_CALL_4(bpf_dynptr_write, struct bpf_dynptr_kern *, dst, u32, offset, void *, return 0; } -const struct bpf_func_proto bpf_dynptr_write_proto = { +static const struct bpf_func_proto bpf_dynptr_write_proto = { .func = bpf_dynptr_write, .gpl_only = false, .ret_type = RET_INTEGER, @@ -1566,7 +1566,7 @@ BPF_CALL_3(bpf_dynptr_data, struct bpf_dynptr_kern *, ptr, u32, offset, u32, len return (unsigned long)(ptr->data + ptr->offset + offset); } -const struct bpf_func_proto bpf_dynptr_data_proto = { +static const struct bpf_func_proto bpf_dynptr_data_proto = { .func = bpf_dynptr_data, .gpl_only = false, .ret_type = RET_PTR_TO_DYNPTR_MEM_OR_NULL, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index aeb31137b2ed..7d5af5b99f0d 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -5131,7 +5131,7 @@ BPF_CALL_4(bpf_kallsyms_lookup_name, const char *, name, int, name_sz, int, flag return *res ? 0 : -ENOENT; } -const struct bpf_func_proto bpf_kallsyms_lookup_name_proto = { +static const struct bpf_func_proto bpf_kallsyms_lookup_name_proto = { .func = bpf_kallsyms_lookup_name, .gpl_only = false, .ret_type = RET_INTEGER, -- cgit From 2403e8044f222e7c816fb2416661f5f469662973 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 21 Mar 2022 18:41:46 -0700 Subject: rcu: Make normal polling GP be more precise about sequence numbers Currently, poll_state_synchronize_rcu() uses rcu_seq_done() to check whether the specified grace period has completed. However, rcu_seq_done() does a simple comparison that reserves have of the sequence-number space for uncompleted grace periods. This has the unfortunate side-effect of not handling sequence-number wrap gracefully. 
Of course, one can argue that if someone has already waited for half of the full range of grace periods, they can wait for the other half, but why wait at all in this case? This commit therefore creates a rcu_seq_done_exact() that counts as uncompleted only the two grace periods during which the sequence number might have been handed out, while still being uncompleted. This way, if sequence-number wrap happens to hit that range, at most two additional grace periods need be waited for. This commit is in preparation for polled expedited grace periods. Link: https://lore.kernel.org/all/20220121142454.1994916-1-bfoster@redhat.com/ Link: https://docs.google.com/document/d/1RNKWW9jQyfjxw2E8dsXVTdvZYh0HnYeSHDKog9jhdN8/edit?usp=sharing Cc: Brian Foster Cc: Dave Chinner Cc: Al Viro Cc: Ian Kent Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 12 ++++++++++++ kernel/rcu/tree.c | 4 ++-- 2 files changed, 14 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 4916077119f3..0adb55941aeb 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -119,6 +119,18 @@ static inline bool rcu_seq_done(unsigned long *sp, unsigned long s) return ULONG_CMP_GE(READ_ONCE(*sp), s); } +/* + * Given a snapshot from rcu_seq_snap(), determine whether or not a + * full update-side operation has occurred, but do not allow the + * (ULONG_MAX / 2) safety-factor/guard-band. + */ +static inline bool rcu_seq_done_exact(unsigned long *sp, unsigned long s) +{ + unsigned long cur_s = READ_ONCE(*sp); + + return ULONG_CMP_GE(cur_s, s) || ULONG_CMP_LT(cur_s, s - (2 * RCU_SEQ_STATE_MASK + 1)); +} + /* * Has a grace period completed since the time the old gp_seq was collected? */ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index c25ba442044a..ec28e259774e 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3911,7 +3911,7 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu); * * Yes, this function does not take counter wrap into account. * But counter wrap is harmless. If the counter wraps, we have waited for - * more than 2 billion grace periods (and way more on a 64-bit system!). + * more than a billion grace periods (and way more on a 64-bit system!). * Those needing to keep oldstate values for very long time periods * (many hours even on 32-bit systems) should check them occasionally * and either refresh them or set a flag indicating that the grace period @@ -3924,7 +3924,7 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu); */ bool poll_state_synchronize_rcu(unsigned long oldstate) { - if (rcu_seq_done(&rcu_state.gp_seq, oldstate)) { + if (rcu_seq_done_exact(&rcu_state.gp_seq, oldstate)) { smp_mb(); /* Ensure GP ends before subsequent accesses. */ return true; } -- cgit From 414c12385d4741e35d88670c6cc2f40a77809734 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 13 Apr 2022 15:17:25 -0700 Subject: rcu: Provide a get_completed_synchronize_rcu() function It is currently up to the caller to handle stale return values from get_state_synchronize_rcu(). If poll_state_synchronize_rcu() returned true once, a grace period has elapsed, regardless of the fact that counter wrap might cause some future poll_state_synchronize_rcu() invocation to return false. For example, the caller might store a separate flag that indicates whether some previous call to poll_state_synchronize_rcu() determined that the relevant grace period had already ended. This approach works, but it requires extra storage and is easy to get wrong. 
This commit therefore introduces a get_completed_synchronize_rcu() that returns a cookie that causes poll_state_synchronize_rcu() to always return true. This already-completed cookie can be stored in place of the cookie that previously caused poll_state_synchronize_rcu() to return true. It can also be used to flag a given structure as not having been exposed to readers, and thus not requiring a grace period to elapse. This commit is in preparation for polled expedited grace periods. Link: https://lore.kernel.org/all/20220121142454.1994916-1-bfoster@redhat.com/ Link: https://docs.google.com/document/d/1RNKWW9jQyfjxw2E8dsXVTdvZYh0HnYeSHDKog9jhdN8/edit?usp=sharing Cc: Brian Foster Cc: Dave Chinner Cc: Al Viro Cc: Ian Kent Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 3 +++ kernel/rcu/tiny.c | 4 ++-- kernel/rcu/tree.c | 3 ++- kernel/rcu/update.c | 13 +++++++++++++ 4 files changed, 20 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 0adb55941aeb..32291f4eefde 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -23,6 +23,9 @@ #define RCU_SEQ_CTR_SHIFT 2 #define RCU_SEQ_STATE_MASK ((1 << RCU_SEQ_CTR_SHIFT) - 1) +/* Low-order bit definition for polled grace-period APIs. */ +#define RCU_GET_STATE_COMPLETED 0x1 + extern int sysctl_sched_rt_runtime; /* diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 340b3f8b090d..dbee6bea6726 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -58,7 +58,7 @@ void rcu_qs(void) rcu_ctrlblk.donetail = rcu_ctrlblk.curtail; raise_softirq_irqoff(RCU_SOFTIRQ); } - WRITE_ONCE(rcu_ctrlblk.gp_seq, rcu_ctrlblk.gp_seq + 1); + WRITE_ONCE(rcu_ctrlblk.gp_seq, rcu_ctrlblk.gp_seq + 2); local_irq_restore(flags); } @@ -213,7 +213,7 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu); */ bool poll_state_synchronize_rcu(unsigned long oldstate) { - return READ_ONCE(rcu_ctrlblk.gp_seq) != oldstate; + return oldstate == RCU_GET_STATE_COMPLETED || READ_ONCE(rcu_ctrlblk.gp_seq) != oldstate; } EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index ec28e259774e..46cfceea8784 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3924,7 +3924,8 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu); */ bool poll_state_synchronize_rcu(unsigned long oldstate) { - if (rcu_seq_done_exact(&rcu_state.gp_seq, oldstate)) { + if (oldstate == RCU_GET_STATE_COMPLETED || + rcu_seq_done_exact(&rcu_state.gp_seq, oldstate)) { smp_mb(); /* Ensure GP ends before subsequent accesses. */ return true; } diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index fc7fef575606..2e93acad1e31 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -516,6 +516,19 @@ int rcu_cpu_stall_suppress_at_boot __read_mostly; // !0 = suppress boot stalls. EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress_at_boot); module_param(rcu_cpu_stall_suppress_at_boot, int, 0444); +/** + * get_completed_synchronize_rcu - Return a pre-completed polled state cookie + * + * Returns a value that will always be treated by functions like + * poll_state_synchronize_rcu() as a cookie whose grace period has already + * completed. + */ +unsigned long get_completed_synchronize_rcu(void) +{ + return RCU_GET_STATE_COMPLETED; +} +EXPORT_SYMBOL_GPL(get_completed_synchronize_rcu); + #ifdef CONFIG_PROVE_RCU /* -- cgit From d0eac20f9909de044fbeb958960b08ff235cf8a2 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Wed, 13 Apr 2022 16:14:02 -0700 Subject: rcutorture: Validate get_completed_synchronize_rcu() This commit verifies that the RCU grace-period state cookie returned from get_completed_synchronize_rcu() causes poll_state_synchronize_rcu() to return true, as required. This commit is in preparation for polled expedited grace periods. Link: https://lore.kernel.org/all/20220121142454.1994916-1-bfoster@redhat.com/ Link: https://docs.google.com/document/d/1RNKWW9jQyfjxw2E8dsXVTdvZYh0HnYeSHDKog9jhdN8/edit?usp=sharing Cc: Brian Foster Cc: Dave Chinner Cc: Al Viro Cc: Ian Kent Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 7120165a9342..4ceec9f4169c 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -338,6 +338,7 @@ struct rcu_torture_ops { void (*sync)(void); void (*exp_sync)(void); unsigned long (*get_gp_state)(void); + unsigned long (*get_gp_completed)(void); unsigned long (*start_gp_poll)(void); bool (*poll_gp_state)(unsigned long oldstate); void (*cond_sync)(unsigned long oldstate); @@ -504,6 +505,7 @@ static struct rcu_torture_ops rcu_ops = { .sync = synchronize_rcu, .exp_sync = synchronize_rcu_expedited, .get_gp_state = get_state_synchronize_rcu, + .get_gp_completed = get_completed_synchronize_rcu, .start_gp_poll = start_poll_synchronize_rcu, .poll_gp_state = poll_state_synchronize_rcu, .cond_sync = cond_synchronize_rcu, @@ -1254,6 +1256,10 @@ rcu_torture_writer(void *arg) rcu_torture_writer_state_getname(), rcu_torture_writer_state, cookie, cur_ops->get_gp_state()); + if (cur_ops->get_gp_completed) { + cookie = cur_ops->get_gp_completed(); + WARN_ON_ONCE(!cur_ops->poll_gp_state(cookie)); + } cur_ops->readunlock(idx); } switch (synctype[torture_random(&rand) % nsynctypes]) { -- cgit From 4cf0585c4d663654fefa0b359f1908b5cd72802b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 6 Dec 2021 16:19:40 -0800 Subject: rcu-tasks: Check for abandoned callbacks This commit adds a debugging scan for callbacks that got lost during a callback-queueing transition. Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 3925e32159b5..b8690a412c5b 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -439,6 +439,11 @@ static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp) WRITE_ONCE(rtp->percpu_dequeue_lim, 1); pr_info("Completing switch %s to CPU-0 callback queuing.\n", rtp->name); } + for (cpu = rtp->percpu_dequeue_lim; cpu < nr_cpu_ids; cpu++) { + struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu); + + WARN_ON_ONCE(rcu_segcblist_n_cbs(&rtpcp->cblist)); + } raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags); } -- cgit From d96225fd09ffb3a6424cebe01c8b1e1bddc30f07 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 19 Apr 2022 10:47:28 -0700 Subject: rcu-tasks: Split rcu_tasks_one_gp() from rcu_tasks_kthread() This commit abstracts most of the rcu_tasks_kthread() function's loop body into a new rcu_tasks_one_gp() function. It also introduces a new ->tasks_gp_mutex to synchronize concurrent calls to this new rcu_tasks_one_gp() function. This commit is preparation for allowing RCU tasks grace periods to be driven by the calling task during the mid-boot dead zone. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tasks.h | 58 +++++++++++++++++++++++++++++++++--------------------- 1 file changed, 36 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index b8690a412c5b..d7b12f524e81 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -48,6 +48,7 @@ struct rcu_tasks_percpu { * struct rcu_tasks - Definition for a Tasks-RCU-like mechanism. * @cbs_wait: RCU wait allowing a new callback to get kthread's attention. * @cbs_gbl_lock: Lock protecting callback list. + * @tasks_gp_mutex: Mutex protecting grace period, needed during mid-boot dead zone. * @kthread_ptr: This flavor's grace-period/callback-invocation kthread. * @gp_func: This flavor's grace-period-wait function. * @gp_state: Grace period's most recent state transition (debugging). @@ -79,6 +80,7 @@ struct rcu_tasks_percpu { struct rcu_tasks { struct rcuwait cbs_wait; raw_spinlock_t cbs_gbl_lock; + struct mutex tasks_gp_mutex; int gp_state; int gp_sleep; int init_fract; @@ -119,6 +121,7 @@ static struct rcu_tasks rt_name = \ { \ .cbs_wait = __RCUWAIT_INITIALIZER(rt_name.wait), \ .cbs_gbl_lock = __RAW_SPIN_LOCK_UNLOCKED(rt_name.cbs_gbl_lock), \ + .tasks_gp_mutex = __MUTEX_INITIALIZER(rt_name.tasks_gp_mutex), \ .gp_func = gp, \ .call_func = call, \ .rtpcpu = &rt_name ## __percpu, \ @@ -502,10 +505,37 @@ static void rcu_tasks_invoke_cbs_wq(struct work_struct *wp) rcu_tasks_invoke_cbs(rtp, rtpcp); } -/* RCU-tasks kthread that detects grace periods and invokes callbacks. */ -static int __noreturn rcu_tasks_kthread(void *arg) +// Wait for one grace period. +static void rcu_tasks_one_gp(struct rcu_tasks *rtp) { int needgpcb; + + mutex_lock(&rtp->tasks_gp_mutex); + set_tasks_gp_state(rtp, RTGS_WAIT_CBS); + + // If there were none, wait a bit and start over. + rcuwait_wait_event(&rtp->cbs_wait, + (needgpcb = rcu_tasks_need_gpcb(rtp)), + TASK_IDLE); + + if (needgpcb & 0x2) { + // Wait for one grace period. + set_tasks_gp_state(rtp, RTGS_WAIT_GP); + rtp->gp_start = jiffies; + rcu_seq_start(&rtp->tasks_gp_seq); + rtp->gp_func(rtp); + rcu_seq_end(&rtp->tasks_gp_seq); + } + + // Invoke callbacks. + set_tasks_gp_state(rtp, RTGS_INVOKE_CBS); + rcu_tasks_invoke_cbs(rtp, per_cpu_ptr(rtp->rtpcpu, 0)); + mutex_unlock(&rtp->tasks_gp_mutex); +} + +// RCU-tasks kthread that detects grace periods and invokes callbacks. +static int __noreturn rcu_tasks_kthread(void *arg) +{ struct rcu_tasks *rtp = arg; /* Run on housekeeping CPUs by default. Sysadm can move if desired. */ @@ -519,27 +549,11 @@ static int __noreturn rcu_tasks_kthread(void *arg) * This loop is terminated by the system going down. ;-) */ for (;;) { - set_tasks_gp_state(rtp, RTGS_WAIT_CBS); - - /* If there were none, wait a bit and start over. */ - rcuwait_wait_event(&rtp->cbs_wait, - (needgpcb = rcu_tasks_need_gpcb(rtp)), - TASK_IDLE); - - if (needgpcb & 0x2) { - // Wait for one grace period. - set_tasks_gp_state(rtp, RTGS_WAIT_GP); - rtp->gp_start = jiffies; - rcu_seq_start(&rtp->tasks_gp_seq); - rtp->gp_func(rtp); - rcu_seq_end(&rtp->tasks_gp_seq); - } - - /* Invoke callbacks. */ - set_tasks_gp_state(rtp, RTGS_INVOKE_CBS); - rcu_tasks_invoke_cbs(rtp, per_cpu_ptr(rtp->rtpcpu, 0)); + // Wait for one grace period and invoke any callbacks + // that are ready. + rcu_tasks_one_gp(rtp); - /* Paranoid sleep to keep this from entering a tight loop */ + // Paranoid sleep to keep this from entering a tight loop. 
schedule_timeout_idle(rtp->gp_sleep); } } -- cgit From 68cb47204db478302a37adb2aaa26a05882dbaa1 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 19 Apr 2022 11:06:03 -0700 Subject: rcu-tasks: Move synchronize_rcu_tasks_generic() down This is strictly a code-motion commit that moves the synchronize_rcu_tasks_generic() down to where it can invoke rcu_tasks_one_gp() without the need for a forward declaration. Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index d7b12f524e81..ad993c4ed924 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -326,17 +326,6 @@ static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func, irq_work_queue(&rtpcp->rtp_irq_work); } -// Wait for a grace period for the specified flavor of Tasks RCU. -static void synchronize_rcu_tasks_generic(struct rcu_tasks *rtp) -{ - /* Complain if the scheduler has not started. */ - RCU_LOCKDEP_WARN(rcu_scheduler_active == RCU_SCHEDULER_INACTIVE, - "synchronize_rcu_tasks called too soon"); - - /* Wait for the grace period. */ - wait_rcu_gp(rtp->call_func); -} - // RCU callback function for rcu_barrier_tasks_generic(). static void rcu_barrier_tasks_generic_cb(struct rcu_head *rhp) { @@ -558,6 +547,17 @@ static int __noreturn rcu_tasks_kthread(void *arg) } } +// Wait for a grace period for the specified flavor of Tasks RCU. +static void synchronize_rcu_tasks_generic(struct rcu_tasks *rtp) +{ + /* Complain if the scheduler has not started. */ + RCU_LOCKDEP_WARN(rcu_scheduler_active == RCU_SCHEDULER_INACTIVE, + "synchronize_rcu_tasks called too soon"); + + /* Wait for the grace period. */ + wait_rcu_gp(rtp->call_func); +} + /* Spawn RCU-tasks grace-period kthread. */ static void __init rcu_spawn_tasks_kthread_generic(struct rcu_tasks *rtp) { -- cgit From 4a8cc433b8bf3106cf7b1173a936c62d77b40b40 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 19 Apr 2022 15:41:38 -0700 Subject: rcu-tasks: Drive synchronous grace periods from calling task This commit causes synchronous grace periods to be driven from the task invoking synchronize_rcu_*(), allowing these functions to be invoked from the mid-boot dead zone extending from when the scheduler was initialized to to point that the various RCU tasks grace-period kthreads are spawned. This change will allow the self-tests to run in a consistent manner. Reported-by: Matthew Wilcox Reported-by: Zhouyi Zhou Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index ad993c4ed924..bd9f2e24f5c7 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -495,17 +495,21 @@ static void rcu_tasks_invoke_cbs_wq(struct work_struct *wp) } // Wait for one grace period. -static void rcu_tasks_one_gp(struct rcu_tasks *rtp) +static void rcu_tasks_one_gp(struct rcu_tasks *rtp, bool midboot) { int needgpcb; mutex_lock(&rtp->tasks_gp_mutex); - set_tasks_gp_state(rtp, RTGS_WAIT_CBS); // If there were none, wait a bit and start over. 
- rcuwait_wait_event(&rtp->cbs_wait, - (needgpcb = rcu_tasks_need_gpcb(rtp)), - TASK_IDLE); + if (unlikely(midboot)) { + needgpcb = 0x2; + } else { + set_tasks_gp_state(rtp, RTGS_WAIT_CBS); + rcuwait_wait_event(&rtp->cbs_wait, + (needgpcb = rcu_tasks_need_gpcb(rtp)), + TASK_IDLE); + } if (needgpcb & 0x2) { // Wait for one grace period. @@ -540,7 +544,7 @@ static int __noreturn rcu_tasks_kthread(void *arg) for (;;) { // Wait for one grace period and invoke any callbacks // that are ready. - rcu_tasks_one_gp(rtp); + rcu_tasks_one_gp(rtp, false); // Paranoid sleep to keep this from entering a tight loop. schedule_timeout_idle(rtp->gp_sleep); @@ -554,8 +558,12 @@ static void synchronize_rcu_tasks_generic(struct rcu_tasks *rtp) RCU_LOCKDEP_WARN(rcu_scheduler_active == RCU_SCHEDULER_INACTIVE, "synchronize_rcu_tasks called too soon"); - /* Wait for the grace period. */ - wait_rcu_gp(rtp->call_func); + // If the grace-period kthread is running, use it. + if (READ_ONCE(rtp->kthread_ptr)) { + wait_rcu_gp(rtp->call_func); + return; + } + rcu_tasks_one_gp(rtp, true); } /* Spawn RCU-tasks grace-period kthread. */ -- cgit From 3847b64570b1753e9863a4106aec03f4d68e7b17 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 23 May 2022 20:50:11 -0700 Subject: rcu-tasks: Merge state into .b.need_qs and atomically update This commit gets rid of the task_struct structure's ->trc_reader_checked field, making it instead be a bit within the task_struct structure's existing ->trc_reader_special.b.need_qs field. This commit also atomically loads, stores, and checks the resulting combination of the reader-checked and need-quiescent state flags. This will in turn allow significant simplification of the rcu_tasks_trace_postgp() function as well as elimination of the trc_n_readers_need_end counter in later commits. These changes will in turn simplify later elimination of the RCU Tasks Trace scan of the task list, which will make RCU Tasks Trace grace periods less CPU-intensive. [ paulmck: Apply kernel test robot feedback. ] Signed-off-by: Paul E. McKenney Cc: Neeraj Upadhyay Cc: Eric Dumazet Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Martin KaFai Lau Cc: KP Singh --- kernel/rcu/tasks.h | 103 ++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 71 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index bd9f2e24f5c7..7bdc62606816 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1208,6 +1208,39 @@ void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func); DEFINE_RCU_TASKS(rcu_tasks_trace, rcu_tasks_wait_gp, call_rcu_tasks_trace, "RCU Tasks Trace"); +/* Load from ->trc_reader_special.b.need_qs with proper ordering. */ +static u8 rcu_ld_need_qs(struct task_struct *t) +{ + smp_mb(); // Enforce full grace-period ordering. + return smp_load_acquire(&t->trc_reader_special.b.need_qs); +} + +/* Store to ->trc_reader_special.b.need_qs with proper ordering. */ +static void rcu_st_need_qs(struct task_struct *t, u8 v) +{ + smp_store_release(&t->trc_reader_special.b.need_qs, v); + smp_mb(); // Enforce full grace-period ordering. +} + +/* + * Do a cmpxchg() on ->trc_reader_special.b.need_qs, allowing for + * the four-byte operand-size restriction of some platforms. + * Returns the old value, which is often ignored. 
+ */ +u8 rcu_trc_cmpxchg_need_qs(struct task_struct *t, u8 old, u8 new) +{ + union rcu_special ret; + union rcu_special trs_old = READ_ONCE(t->trc_reader_special); + union rcu_special trs_new = trs_old; + + if (trs_old.b.need_qs != old) + return trs_old.b.need_qs; + trs_new.b.need_qs = new; + ret.s = cmpxchg(&t->trc_reader_special.s, trs_old.s, trs_new.s); + return ret.b.need_qs; +} +EXPORT_SYMBOL_GPL(rcu_trc_cmpxchg_need_qs); + /* * This irq_work handler allows rcu_read_unlock_trace() to be invoked * while the scheduler locks are held. @@ -1221,16 +1254,20 @@ static DEFINE_IRQ_WORK(rcu_tasks_trace_iw, rcu_read_unlock_iw); /* If we are the last reader, wake up the grace-period kthread. */ void rcu_read_unlock_trace_special(struct task_struct *t) { - int nq = READ_ONCE(t->trc_reader_special.b.need_qs); + int nqs = (rcu_ld_need_qs(t) == (TRC_NEED_QS_CHECKED | TRC_NEED_QS)); - if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) && - t->trc_reader_special.b.need_mb) + if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) && t->trc_reader_special.b.need_mb) smp_mb(); // Pairs with update-side barriers. // Update .need_qs before ->trc_reader_nesting for irq/NMI handlers. - if (nq) - WRITE_ONCE(t->trc_reader_special.b.need_qs, false); + if (nqs) { + u8 result = rcu_trc_cmpxchg_need_qs(t, TRC_NEED_QS_CHECKED | TRC_NEED_QS, + TRC_NEED_QS_CHECKED); + + WARN_ONCE(result != (TRC_NEED_QS_CHECKED | TRC_NEED_QS), + "%s: result = %d", __func__, result); + } WRITE_ONCE(t->trc_reader_nesting, 0); - if (nq && atomic_dec_and_test(&trc_n_readers_need_end)) + if (nqs && atomic_dec_and_test(&trc_n_readers_need_end)) irq_work_queue(&rcu_tasks_trace_iw); } EXPORT_SYMBOL_GPL(rcu_read_unlock_trace_special); @@ -1260,27 +1297,24 @@ static void trc_read_check_handler(void *t_in) struct task_struct *texp = t_in; // If the task is no longer running on this CPU, leave. - if (unlikely(texp != t)) { + if (unlikely(texp != t)) goto reset_ipi; // Already on holdout list, so will check later. - } // If the task is not in a read-side critical section, and // if this is the last reader, awaken the grace-period kthread. if (likely(!READ_ONCE(t->trc_reader_nesting))) { - WRITE_ONCE(t->trc_reader_checked, true); + rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS_CHECKED); goto reset_ipi; } // If we are racing with an rcu_read_unlock_trace(), try again later. if (unlikely(READ_ONCE(t->trc_reader_nesting) < 0)) goto reset_ipi; - WRITE_ONCE(t->trc_reader_checked, true); // Get here if the task is in a read-side critical section. Set // its state so that it will awaken the grace-period kthread upon // exit from that critical section. - atomic_inc(&trc_n_readers_need_end); // One more to wait on. - WARN_ON_ONCE(READ_ONCE(t->trc_reader_special.b.need_qs)); - WRITE_ONCE(t->trc_reader_special.b.need_qs, true); + if (!rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS | TRC_NEED_QS_CHECKED)) + atomic_inc(&trc_n_readers_need_end); // One more to wait on. reset_ipi: // Allow future IPIs to be sent on CPU and for task. @@ -1291,8 +1325,9 @@ reset_ipi: } /* Callback function for scheduler to check locked-down task. 
*/ -static int trc_inspect_reader(struct task_struct *t, void *arg) +static int trc_inspect_reader(struct task_struct *t, void *bhp_in) { + struct list_head *bhp = bhp_in; int cpu = task_cpu(t); int nesting; bool ofl = cpu_is_offline(cpu); @@ -1323,16 +1358,19 @@ static int trc_inspect_reader(struct task_struct *t, void *arg) // If not exiting a read-side critical section, mark as checked // so that the grace-period kthread will remove it from the // holdout list. - t->trc_reader_checked = nesting >= 0; - if (nesting <= 0) + if (nesting <= 0) { + if (!nesting) + rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS_CHECKED); return nesting ? -EINVAL : 0; // If in QS, done, otherwise try again later. + } // The task is in a read-side critical section, so set up its // state so that it will awaken the grace-period kthread upon exit // from that critical section. - atomic_inc(&trc_n_readers_need_end); // One more to wait on. - WARN_ON_ONCE(READ_ONCE(t->trc_reader_special.b.need_qs)); - WRITE_ONCE(t->trc_reader_special.b.need_qs, true); + if (!rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS | TRC_NEED_QS_CHECKED)) { + atomic_inc(&trc_n_readers_need_end); // One more to wait on. + trc_add_holdout(t, bhp); + } return 0; } @@ -1348,14 +1386,14 @@ static void trc_wait_for_one_reader(struct task_struct *t, // The current task had better be in a quiescent state. if (t == current) { - t->trc_reader_checked = true; + rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS_CHECKED); WARN_ON_ONCE(READ_ONCE(t->trc_reader_nesting)); return; } // Attempt to nail down the task for inspection. get_task_struct(t); - if (!task_call_func(t, trc_inspect_reader, NULL)) { + if (!task_call_func(t, trc_inspect_reader, bhp)) { put_task_struct(t); return; } @@ -1419,8 +1457,7 @@ static void rcu_tasks_trace_pertask(struct task_struct *t, if (unlikely(t == NULL)) return; - WRITE_ONCE(t->trc_reader_special.b.need_qs, false); - WRITE_ONCE(t->trc_reader_checked, false); + rcu_st_need_qs(t, 0); t->trc_ipi_to_cpu = -1; trc_wait_for_one_reader(t, hop); } @@ -1442,7 +1479,8 @@ static void rcu_tasks_trace_postscan(struct list_head *hop) // Wait for late-stage exiting tasks to finish exiting. // These might have passed the call to exit_tasks_rcu_finish(). synchronize_rcu(); - // Any tasks that exit after this point will set ->trc_reader_checked. + // Any tasks that exit after this point will set + // TRC_NEED_QS_CHECKED in ->trc_reader_special.b.need_qs. } /* Communicate task state back to the RCU tasks trace stall warning request. */ @@ -1460,7 +1498,7 @@ static int trc_check_slow_task(struct task_struct *t, void *arg) return false; // It is running, so decline to inspect it. trc_rdrp->nesting = READ_ONCE(t->trc_reader_nesting); trc_rdrp->ipi_to_cpu = READ_ONCE(t->trc_ipi_to_cpu); - trc_rdrp->needqs = READ_ONCE(t->trc_reader_special.b.need_qs); + trc_rdrp->needqs = rcu_ld_need_qs(t); return true; } @@ -1514,12 +1552,12 @@ static void check_all_holdout_tasks_trace(struct list_head *hop, list_for_each_entry_safe(t, g, hop, trc_holdout_list) { // If safe and needed, try to check the current task. if (READ_ONCE(t->trc_ipi_to_cpu) == -1 && - !READ_ONCE(t->trc_reader_checked)) + !(rcu_ld_need_qs(t) & TRC_NEED_QS_CHECKED)) trc_wait_for_one_reader(t, hop); // If check succeeded, remove this task from the list. 
if (smp_load_acquire(&t->trc_ipi_to_cpu) == -1 && - READ_ONCE(t->trc_reader_checked)) + rcu_ld_need_qs(t) == TRC_NEED_QS_CHECKED) trc_del_holdout(t); else if (needreport) show_stalled_task_trace(t, firstreport); @@ -1574,12 +1612,12 @@ static void rcu_tasks_trace_postgp(struct rcu_tasks *rtp) // Stall warning time, so make a list of the offenders. rcu_read_lock(); for_each_process_thread(g, t) - if (READ_ONCE(t->trc_reader_special.b.need_qs)) + if (rcu_ld_need_qs(t) & TRC_NEED_QS) trc_add_holdout(t, &holdouts); rcu_read_unlock(); firstreport = true; list_for_each_entry_safe(t, g, &holdouts, trc_holdout_list) { - if (READ_ONCE(t->trc_reader_special.b.need_qs)) + if (rcu_ld_need_qs(t) & TRC_NEED_QS) show_stalled_task_trace(t, &firstreport); trc_del_holdout(t); // Release task_struct reference. } @@ -1595,11 +1633,12 @@ static void rcu_tasks_trace_postgp(struct rcu_tasks *rtp) /* Report any needed quiescent state for this exiting task. */ static void exit_tasks_rcu_finish_trace(struct task_struct *t) { - WRITE_ONCE(t->trc_reader_checked, true); + rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS_CHECKED); WARN_ON_ONCE(READ_ONCE(t->trc_reader_nesting)); - WRITE_ONCE(t->trc_reader_nesting, 0); - if (WARN_ON_ONCE(READ_ONCE(t->trc_reader_special.b.need_qs))) + if (WARN_ON_ONCE(rcu_ld_need_qs(t) & TRC_NEED_QS)) rcu_read_unlock_trace_special(t); + else + WRITE_ONCE(t->trc_reader_nesting, 0); } /** -- cgit From 550611269b153dc17b44fa2d692c30f628d1c65c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 25 May 2022 13:37:36 -0700 Subject: rcu-tasks: Remove rcu_tasks_trace_postgp() wait for counter Now that tasks are not removed from the list until they have responded to any needed request for a quiescent state, it is no longer necessary to wait for the trc_n_readers_need_end counter to go to zero. This commit therefore removes that waiting code. It is therefore also no longer necessary for rcu_tasks_trace_postgp() to do the final decrement of this counter, so that code is also removed. This in turn means that trc_n_readers_need_end counter itself can be removed, as can the rcu_tasks_trace_iw irq_work structure and the rcu_read_unlock_iw() function. [ paulmck: Apply feedback from Zqiang. ] Signed-off-by: Paul E. McKenney Cc: Neeraj Upadhyay Cc: Eric Dumazet Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Martin KaFai Lau Cc: KP Singh --- kernel/rcu/tasks.h | 62 +++--------------------------------------------------- 1 file changed, 3 insertions(+), 59 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 7bdc62606816..561d24f7f73c 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1192,9 +1192,6 @@ EXPORT_SYMBOL_GPL(rcu_trace_lock_map); #ifdef CONFIG_TASKS_TRACE_RCU -static atomic_t trc_n_readers_need_end; // Number of waited-for readers. -static DECLARE_WAIT_QUEUE_HEAD(trc_wait); // List of holdout tasks. - // Record outstanding IPIs to each CPU. No point in sending two... static DEFINE_PER_CPU(bool, trc_ipi_to_cpu); @@ -1241,16 +1238,6 @@ u8 rcu_trc_cmpxchg_need_qs(struct task_struct *t, u8 old, u8 new) } EXPORT_SYMBOL_GPL(rcu_trc_cmpxchg_need_qs); -/* - * This irq_work handler allows rcu_read_unlock_trace() to be invoked - * while the scheduler locks are held. - */ -static void rcu_read_unlock_iw(struct irq_work *iwp) -{ - wake_up(&trc_wait); -} -static DEFINE_IRQ_WORK(rcu_tasks_trace_iw, rcu_read_unlock_iw); - /* If we are the last reader, wake up the grace-period kthread. 
*/ void rcu_read_unlock_trace_special(struct task_struct *t) { @@ -1267,8 +1254,6 @@ void rcu_read_unlock_trace_special(struct task_struct *t) "%s: result = %d", __func__, result); } WRITE_ONCE(t->trc_reader_nesting, 0); - if (nqs && atomic_dec_and_test(&trc_n_readers_need_end)) - irq_work_queue(&rcu_tasks_trace_iw); } EXPORT_SYMBOL_GPL(rcu_read_unlock_trace_special); @@ -1313,8 +1298,7 @@ static void trc_read_check_handler(void *t_in) // Get here if the task is in a read-side critical section. Set // its state so that it will awaken the grace-period kthread upon // exit from that critical section. - if (!rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS | TRC_NEED_QS_CHECKED)) - atomic_inc(&trc_n_readers_need_end); // One more to wait on. + rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS | TRC_NEED_QS_CHECKED); reset_ipi: // Allow future IPIs to be sent on CPU and for task. @@ -1367,10 +1351,8 @@ static int trc_inspect_reader(struct task_struct *t, void *bhp_in) // The task is in a read-side critical section, so set up its // state so that it will awaken the grace-period kthread upon exit // from that critical section. - if (!rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS | TRC_NEED_QS_CHECKED)) { - atomic_inc(&trc_n_readers_need_end); // One more to wait on. + if (!rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS | TRC_NEED_QS_CHECKED)) trc_add_holdout(t, bhp); - } return 0; } @@ -1436,9 +1418,6 @@ static void rcu_tasks_trace_pregp_step(void) { int cpu; - // Allow for fast-acting IPIs. - atomic_set(&trc_n_readers_need_end, 1); - // There shouldn't be any old IPIs, but... for_each_possible_cpu(cpu) WARN_ON_ONCE(per_cpu(trc_ipi_to_cpu, cpu)); @@ -1581,10 +1560,6 @@ static void rcu_tasks_trace_empty_fn(void *unused) static void rcu_tasks_trace_postgp(struct rcu_tasks *rtp) { int cpu; - bool firstreport; - struct task_struct *g, *t; - LIST_HEAD(holdouts); - long ret; // Wait for any lingering IPI handlers to complete. Note that // if a CPU has gone offline or transitioned to userspace in the @@ -1595,37 +1570,6 @@ static void rcu_tasks_trace_postgp(struct rcu_tasks *rtp) if (WARN_ON_ONCE(smp_load_acquire(per_cpu_ptr(&trc_ipi_to_cpu, cpu)))) smp_call_function_single(cpu, rcu_tasks_trace_empty_fn, NULL, 1); - // Remove the safety count. - smp_mb__before_atomic(); // Order vs. earlier atomics - atomic_dec(&trc_n_readers_need_end); - smp_mb__after_atomic(); // Order vs. later atomics - - // Wait for readers. - set_tasks_gp_state(rtp, RTGS_WAIT_READERS); - for (;;) { - ret = wait_event_idle_exclusive_timeout( - trc_wait, - atomic_read(&trc_n_readers_need_end) == 0, - READ_ONCE(rcu_task_stall_timeout)); - if (ret) - break; // Count reached zero. - // Stall warning time, so make a list of the offenders. - rcu_read_lock(); - for_each_process_thread(g, t) - if (rcu_ld_need_qs(t) & TRC_NEED_QS) - trc_add_holdout(t, &holdouts); - rcu_read_unlock(); - firstreport = true; - list_for_each_entry_safe(t, g, &holdouts, trc_holdout_list) { - if (rcu_ld_need_qs(t) & TRC_NEED_QS) - show_stalled_task_trace(t, &firstreport); - trc_del_holdout(t); // Release task_struct reference. - } - if (firstreport) - pr_err("INFO: rcu_tasks_trace detected stalls? (Counter/taskslist mismatch?)\n"); - show_stalled_ipi_trace(); - pr_err("\t%d holdouts\n", atomic_read(&trc_n_readers_need_end)); - } smp_mb(); // Caller's code must be ordered after wakeup. // Pairs with pretty much every ordering primitive. 
} @@ -1725,7 +1669,7 @@ void show_rcu_tasks_trace_gp_kthread(void) { char buf[64]; - sprintf(buf, "N%d h:%lu/%lu/%lu", atomic_read(&trc_n_readers_need_end), + sprintf(buf, "h:%lu/%lu/%lu", data_race(n_heavy_reader_ofl_updates), data_race(n_heavy_reader_updates), data_race(n_heavy_reader_attempts)); -- cgit From 9ff86b4c443cc93bf3a9b624e3385e4114388e85 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 26 May 2022 16:12:51 -0700 Subject: rcu-tasks: Make trc_read_check_handler() fetch ->trc_reader_nesting only once This commit replaces the pair of READ_ONCE(t->trc_reader_nesting) calls with a single such call and a local variable. This makes the code's intent more clear. Signed-off-by: Paul E. McKenney Cc: Neeraj Upadhyay Cc: Eric Dumazet Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Martin KaFai Lau Cc: KP Singh --- kernel/rcu/tasks.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 561d24f7f73c..8fe78a7fecaf 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1278,6 +1278,7 @@ static void trc_del_holdout(struct task_struct *t) /* IPI handler to check task state. */ static void trc_read_check_handler(void *t_in) { + int nesting; struct task_struct *t = current; struct task_struct *texp = t_in; @@ -1287,12 +1288,13 @@ static void trc_read_check_handler(void *t_in) // If the task is not in a read-side critical section, and // if this is the last reader, awaken the grace-period kthread. - if (likely(!READ_ONCE(t->trc_reader_nesting))) { + nesting = READ_ONCE(t->trc_reader_nesting); + if (likely(!nesting)) { rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS_CHECKED); goto reset_ipi; } // If we are racing with an rcu_read_unlock_trace(), try again later. - if (unlikely(READ_ONCE(t->trc_reader_nesting) < 0)) + if (unlikely(nesting < 0)) goto reset_ipi; // Get here if the task is in a read-side critical section. Set -- cgit From 5c9a9ca44fda41c5e82f50efced5297a9c19760d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 24 May 2022 16:59:52 -0700 Subject: rcu-tasks: Idle tasks on offline CPUs are in quiescent states Any idle task corresponding to an offline CPU is in an RCU Tasks Trace quiescent state. This commit causes rcu_tasks_trace_postscan() to ignore idle tasks for offline CPUs, which it can do safely due to CPU-hotplug operations being disabled. Signed-off-by: Paul E. McKenney Cc: Neeraj Upadhyay Cc: Eric Dumazet Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Martin KaFai Lau Cc: KP Singh --- kernel/rcu/tasks.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 8fe78a7fecaf..ec68bfe98c95 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1451,7 +1451,7 @@ static void rcu_tasks_trace_postscan(struct list_head *hop) { int cpu; - for_each_possible_cpu(cpu) + for_each_online_cpu(cpu) rcu_tasks_trace_pertask(idle_task(cpu), hop); // Re-enable CPU hotplug now that the tasklist scan has completed. -- cgit From 897ba84dc5aa7a5518e19da180d2985790723d30 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 24 May 2022 18:02:40 -0700 Subject: rcu-tasks: Handle idle tasks for recently offlined CPUs This commit identifies idle tasks for recently offlined CPUs as residing in a quiescent state. This is safe only because CPU-hotplug operations are excluded during these checks. Signed-off-by: Paul E. 
McKenney Cc: Neeraj Upadhyay Cc: Eric Dumazet Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Martin KaFai Lau Cc: KP Singh --- kernel/rcu/tasks.h | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index ec68bfe98c95..414861d65196 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1318,27 +1318,26 @@ static int trc_inspect_reader(struct task_struct *t, void *bhp_in) int nesting; bool ofl = cpu_is_offline(cpu); - if (task_curr(t)) { - WARN_ON_ONCE(ofl && !is_idle_task(t)); - + if (task_curr(t) && !ofl) { // If no chance of heavyweight readers, do it the hard way. - if (!ofl && !IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) + if (!IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) return -EINVAL; // If heavyweight readers are enabled on the remote task, // we can inspect its state despite its currently running. // However, we cannot safely change its state. n_heavy_reader_attempts++; - if (!ofl && // Check for "running" idle tasks on offline CPUs. - !rcu_dynticks_zero_in_eqs(cpu, &t->trc_reader_nesting)) + // Check for "running" idle tasks on offline CPUs. + if (!rcu_dynticks_zero_in_eqs(cpu, &t->trc_reader_nesting)) return -EINVAL; // No quiescent state, do it the hard way. n_heavy_reader_updates++; - if (ofl) - n_heavy_reader_ofl_updates++; nesting = 0; } else { // The task is not running, so C-language access is safe. nesting = t->trc_reader_nesting; + WARN_ON_ONCE(ofl && task_curr(t) && !is_idle_task(t)); + if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) && ofl) + n_heavy_reader_ofl_updates++; } // If not exiting a read-side critical section, mark as checked -- cgit From 5d4c90d755d5703d5a74fac0870c0a697250ec33 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 24 May 2022 18:19:17 -0700 Subject: rcu-tasks: RCU Tasks Trace grace-period kthread has implicit QS Because the task driving the grace-period kthread is in quiescent state throughout, this commit excludes it from the list of tasks from which a quiescent state is needed. This does mean that attaching a sleepable BPF program to function in kernel/rcu/tasks.h is a bad idea, by the way. Signed-off-by: Paul E. McKenney Cc: Neeraj Upadhyay Cc: Eric Dumazet Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Martin KaFai Lau Cc: KP Singh --- kernel/rcu/tasks.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 414861d65196..554b2e59a1d5 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1433,8 +1433,9 @@ static void rcu_tasks_trace_pertask(struct task_struct *t, struct list_head *hop) { // During early boot when there is only the one boot CPU, there - // is no idle task for the other CPUs. Just return. - if (unlikely(t == NULL)) + // is no idle task for the other CPUs. Also, the grace-period + // kthread is always in a quiescent state. Either way, just return. + if (unlikely(t == NULL) || t == current) return; rcu_st_need_qs(t, 0); -- cgit From 6a694411977a6d57ff76a896a745c2f717372dac Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 24 May 2022 20:33:17 -0700 Subject: rcu-tasks: Make rcu_note_context_switch() unconditionally call rcu_tasks_qs() This commit makes rcu_note_context_switch() unconditionally invoke the rcu_tasks_qs() function, as opposed to doing so only when RCU (as opposed to RCU Tasks Trace) urgently needs a grace period to end. Signed-off-by: Paul E. 
McKenney Cc: Neeraj Upadhyay Cc: Eric Dumazet Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Martin KaFai Lau Cc: KP Singh --- kernel/rcu/tree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index c8ba0fe17267..c966d680b789 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -899,8 +899,8 @@ void rcu_note_context_switch(bool preempt) this_cpu_write(rcu_data.rcu_urgent_qs, false); if (unlikely(raw_cpu_read(rcu_data.rcu_need_heavy_qs))) rcu_momentary_dyntick_idle(); - rcu_tasks_qs(current, preempt); out: + rcu_tasks_qs(current, preempt); trace_rcu_utilization(TPS("End context switch")); } EXPORT_SYMBOL_GPL(rcu_note_context_switch); -- cgit From 0968e8920b5b11b7a33982890ad9150e09e1cb1f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 31 May 2022 21:38:15 -0700 Subject: rcu-tasks: Simplify trc_inspect_reader() QS logic Currently, trc_inspect_reader() does one check for nesting less than or equal to zero, then sorts out the distinctions within this single "if" statement. This commit simplifies the logic by providing one "if" statement for quiescent states (nesting of zero) and another "if" statement for transitioning from one nesting level to another or the outermost rcu_read_unlock_trace() (negative nesting). Signed-off-by: Paul E. McKenney Cc: Neeraj Upadhyay Cc: Eric Dumazet Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Martin KaFai Lau Cc: KP Singh --- kernel/rcu/tasks.h | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 554b2e59a1d5..6b44c69eeca8 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1343,15 +1343,16 @@ static int trc_inspect_reader(struct task_struct *t, void *bhp_in) // If not exiting a read-side critical section, mark as checked // so that the grace-period kthread will remove it from the // holdout list. - if (nesting <= 0) { - if (!nesting) - rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS_CHECKED); - return nesting ? -EINVAL : 0; // If in QS, done, otherwise try again later. + if (!nesting) { + rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS_CHECKED); + return 0; // In QS, so done. } + if (nesting < 0) + return -EINVAL; // QS transitioning, try again later. // The task is in a read-side critical section, so set up its - // state so that it will awaken the grace-period kthread upon exit - // from that critical section. + // state so that it will update state upon exit from that critical + // section. if (!rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS | TRC_NEED_QS_CHECKED)) trc_add_holdout(t, bhp); return 0; -- cgit From 9f3eb5fb8e468d3c3a6073d0c816405fa73c8038 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 24 May 2022 16:05:15 -0700 Subject: rcu-tasks: Add slow-IPI indicator to RCU Tasks Trace stall warnings This commit adds a "I" indicator to the RCU Tasks Trace CPU stall warning when an IPI directed to a task has thus far failed to arrive. This serves as a debugging aid. Signed-off-by: Paul E. 
McKenney Cc: Neeraj Upadhyay Cc: Eric Dumazet Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Martin KaFai Lau Cc: KP Singh --- kernel/rcu/tasks.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 6b44c69eeca8..1cfbebf2b597 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1497,8 +1497,9 @@ static void show_stalled_task_trace(struct task_struct *t, bool *firstreport) } cpu = task_cpu(t); if (!task_call_func(t, trc_check_slow_task, &trc_rdr)) - pr_alert("P%d: %c\n", + pr_alert("P%d: %c%c\n", t->pid, + ".I"[t->trc_ipi_to_cpu >= 0], ".i"[is_idle_tsk]); else pr_alert("P%d: %c%c%c nesting: %d%c cpu: %d\n", -- cgit From c8c03ad9d7cd694b88ab8db4199b4e4d3c6cc5aa Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 24 May 2022 20:36:08 -0700 Subject: rcu-tasks: Flag offline CPUs in RCU Tasks Trace stall warnings This commit tags offline CPUs with "(offline)" in RCU Tasks Trace CPU stall warnings. This is a debugging aid. Signed-off-by: Paul E. McKenney Cc: Neeraj Upadhyay Cc: Eric Dumazet Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Martin KaFai Lau Cc: KP Singh --- kernel/rcu/tasks.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 1cfbebf2b597..93096188d363 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1502,14 +1502,14 @@ static void show_stalled_task_trace(struct task_struct *t, bool *firstreport) ".I"[t->trc_ipi_to_cpu >= 0], ".i"[is_idle_tsk]); else - pr_alert("P%d: %c%c%c nesting: %d%c cpu: %d\n", + pr_alert("P%d: %c%c%c nesting: %d%c cpu: %d%s\n", t->pid, ".I"[trc_rdr.ipi_to_cpu >= 0], ".i"[is_idle_tsk], ".N"[cpu >= 0 && tick_nohz_full_cpu(cpu)], trc_rdr.nesting, " N"[!!trc_rdr.needqs], - cpu); + cpu, cpu_online(cpu) ? "" : "(offline)"); sched_show_task(t); } -- cgit From be15a16486dd6513ad801ea320eb21e10eec2b55 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 25 May 2022 09:20:59 -0700 Subject: rcu-tasks: Make RCU Tasks Trace stall warnings print full .b.need_qs field Currently, the RCU Tasks Trace CPU stall warning simply indicates whether or not the .b.need_qs field is zero. This commit shows the three permitted values and flags other values with either "!" or "?". This is a debugging aid. Signed-off-by: Paul E. McKenney Cc: Neeraj Upadhyay Cc: Eric Dumazet Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Martin KaFai Lau Cc: KP Singh --- kernel/rcu/tasks.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 93096188d363..5eefbab7f2ed 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1502,13 +1502,14 @@ static void show_stalled_task_trace(struct task_struct *t, bool *firstreport) ".I"[t->trc_ipi_to_cpu >= 0], ".i"[is_idle_tsk]); else - pr_alert("P%d: %c%c%c nesting: %d%c cpu: %d%s\n", + pr_alert("P%d: %c%c%c nesting: %d%c%c cpu: %d%s\n", t->pid, ".I"[trc_rdr.ipi_to_cpu >= 0], ".i"[is_idle_tsk], ".N"[cpu >= 0 && tick_nohz_full_cpu(cpu)], trc_rdr.nesting, - " N"[!!trc_rdr.needqs], + " !CN"[trc_rdr.needqs & 0x3], + " ?"[trc_rdr.needqs > 0x3], cpu, cpu_online(cpu) ? "" : "(offline)"); sched_show_task(t); } -- cgit From f90f19da88bfe32dd1fdfd104de4c0526a3be701 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Wed, 25 May 2022 09:49:26 -0700 Subject: rcu-tasks: Make RCU Tasks Trace stall warning handle idle offline tasks When a CPU is offline, its idle task can appear to be running, but it cannot be doing anything while CPU-hotplug operations are excluded. This commit takes advantage of that fact by making trc_check_slow_task() check for task_curr(t) && cpu_online(task_cpu(t)), and recording full information in that case. Signed-off-by: Paul E. McKenney Cc: Peter Zijlstra Cc: Neeraj Upadhyay Cc: Eric Dumazet Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Martin KaFai Lau Cc: KP Singh --- kernel/rcu/tasks.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 5eefbab7f2ed..64eb4d7b142e 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1476,7 +1476,7 @@ static int trc_check_slow_task(struct task_struct *t, void *arg) { struct trc_stall_chk_rdr *trc_rdrp = arg; - if (task_curr(t)) + if (task_curr(t) && cpu_online(task_cpu(t))) return false; // It is running, so decline to inspect it. trc_rdrp->nesting = READ_ONCE(t->trc_reader_nesting); trc_rdrp->ipi_to_cpu = READ_ONCE(t->trc_ipi_to_cpu); -- cgit From 434c9eefb959c36331a93617ea95df903469b99f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 16 May 2022 17:56:16 -0700 Subject: rcu-tasks: Add data structures for lightweight grace periods This commit adds fields to task_struct and to rcu_tasks_percpu that will be used to avoid the task-list scan for RCU Tasks Trace grace periods, and also initializes these fields. Signed-off-by: Paul E. McKenney Cc: Neeraj Upadhyay Cc: Eric Dumazet Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Martin KaFai Lau Cc: KP Singh --- kernel/fork.c | 1 + kernel/rcu/tasks.h | 4 ++++ 2 files changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 9d44f2d46c69..1950eb870244 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1814,6 +1814,7 @@ static inline void rcu_copy_process(struct task_struct *p) p->trc_reader_nesting = 0; p->trc_reader_special.s = 0; INIT_LIST_HEAD(&p->trc_holdout_list); + INIT_LIST_HEAD(&p->trc_blkd_node); #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ } diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 64eb4d7b142e..fd4508af055e 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -29,6 +29,7 @@ typedef void (*postgp_func_t)(struct rcu_tasks *rtp); * @rtp_work: Work queue for invoking callbacks. * @rtp_irq_work: IRQ work queue for deferred wakeups. * @barrier_q_head: RCU callback for barrier operation. + * @rtp_blkd_tasks: List of tasks blocked as readers. * @cpu: CPU number corresponding to this entry. * @rtpp: Pointer to the rcu_tasks structure. */ @@ -40,6 +41,7 @@ struct rcu_tasks_percpu { struct work_struct rtp_work; struct irq_work rtp_irq_work; struct rcu_head barrier_q_head; + struct list_head rtp_blkd_tasks; int cpu; struct rcu_tasks *rtpp; }; @@ -256,6 +258,8 @@ static void cblist_init_generic(struct rcu_tasks *rtp) INIT_WORK(&rtpcp->rtp_work, rcu_tasks_invoke_cbs_wq); rtpcp->cpu = cpu; rtpcp->rtpp = rtp; + if (!rtpcp->rtp_blkd_tasks.next) + INIT_LIST_HEAD(&rtpcp->rtp_blkd_tasks); raw_spin_unlock_rcu_node(rtpcp); // irqs remain disabled. } raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags); -- cgit From 0356d4e66214569de674ab2684f2e0b440a466ab Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Tue, 17 May 2022 11:30:32 -0700 Subject: rcu-tasks: Track blocked RCU Tasks Trace readers This commit places any task that has ever blocked within its current RCU Tasks Trace read-side critical section on a per-CPU list within the rcu_tasks_percpu structure. Tasks are removed from this list when they exit by the exit_tasks_rcu_finish_trace() function. The purpose of this commit is to provide the information needed to eliminate the current scan of the full task list. This commit offsets the INT_MIN value for ->trc_reader_nesting with the new nesting level in order to avoid queueing tasks that are exiting their read-side critical sections. [ paulmck: Apply kernel test robot feedback. ] [ paulmck: Apply feedback from syzbot+9bb26e7c5e8e4fa7e641@syzkaller.appspotmail.com ] Signed-off-by: Paul E. McKenney Tested-by: syzbot Tested-by: "Zhang, Qiang1" Cc: Peter Zijlstra Cc: Neeraj Upadhyay Cc: Eric Dumazet Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Martin KaFai Lau Cc: KP Singh --- kernel/rcu/tasks.h | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index fd4508af055e..bab75ec26bdb 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1261,6 +1261,24 @@ void rcu_read_unlock_trace_special(struct task_struct *t) } EXPORT_SYMBOL_GPL(rcu_read_unlock_trace_special); +/* Add a newly blocked reader task to its CPU's list. */ +void rcu_tasks_trace_qs_blkd(struct task_struct *t) +{ + unsigned long flags; + struct rcu_tasks_percpu *rtpcp; + + local_irq_save(flags); + rtpcp = this_cpu_ptr(rcu_tasks_trace.rtpcpu); + raw_spin_lock_rcu_node(rtpcp); // irqs already disabled + t->trc_blkd_cpu = smp_processor_id(); + if (!rtpcp->rtp_blkd_tasks.next) + INIT_LIST_HEAD(&rtpcp->rtp_blkd_tasks); + list_add(&t->trc_blkd_node, &rtpcp->rtp_blkd_tasks); + t->trc_reader_special.b.blocked = true; + raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); +} +EXPORT_SYMBOL_GPL(rcu_tasks_trace_qs_blkd); + /* Add a task to the holdout list, if it is not already on the list. */ static void trc_add_holdout(struct task_struct *t, struct list_head *bhp) { @@ -1586,9 +1604,11 @@ static void rcu_tasks_trace_postgp(struct rcu_tasks *rtp) /* Report any needed quiescent state for this exiting task. */ static void exit_tasks_rcu_finish_trace(struct task_struct *t) { + union rcu_special trs = READ_ONCE(t->trc_reader_special); + rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS_CHECKED); WARN_ON_ONCE(READ_ONCE(t->trc_reader_nesting)); - if (WARN_ON_ONCE(rcu_ld_need_qs(t) & TRC_NEED_QS)) + if (WARN_ON_ONCE(rcu_ld_need_qs(t) & TRC_NEED_QS) || trs.b.blocked) rcu_read_unlock_trace_special(t); else WRITE_ONCE(t->trc_reader_nesting, 0); -- cgit From 0bcb386857376224b5fd3f38b6e3173ec74d8d36 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 17 May 2022 15:01:14 -0700 Subject: rcu-tasks: Untrack blocked RCU Tasks Trace at reader end This commit causes rcu_read_unlock_trace() to check for the current task being on a per-CPU list within the rcu_tasks_percpu structure, and removes it from that list if so. This has the effect of curtailing tracking of a task that blocked within an RCU Tasks Trace read-side critical section once it exits that critical section. Signed-off-by: Paul E. 
McKenney Cc: Neeraj Upadhyay Cc: Eric Dumazet Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Martin KaFai Lau Cc: KP Singh --- kernel/rcu/tasks.h | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index bab75ec26bdb..eb87a759ef0b 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1245,17 +1245,29 @@ EXPORT_SYMBOL_GPL(rcu_trc_cmpxchg_need_qs); /* If we are the last reader, wake up the grace-period kthread. */ void rcu_read_unlock_trace_special(struct task_struct *t) { - int nqs = (rcu_ld_need_qs(t) == (TRC_NEED_QS_CHECKED | TRC_NEED_QS)); + unsigned long flags; + struct rcu_tasks_percpu *rtpcp; + union rcu_special trs; + + // Open-coded full-word version of rcu_ld_need_qs(). + smp_mb(); // Enforce full grace-period ordering. + trs = smp_load_acquire(&t->trc_reader_special); if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) && t->trc_reader_special.b.need_mb) smp_mb(); // Pairs with update-side barriers. // Update .need_qs before ->trc_reader_nesting for irq/NMI handlers. - if (nqs) { + if (trs.b.need_qs == (TRC_NEED_QS_CHECKED | TRC_NEED_QS)) { u8 result = rcu_trc_cmpxchg_need_qs(t, TRC_NEED_QS_CHECKED | TRC_NEED_QS, TRC_NEED_QS_CHECKED); - WARN_ONCE(result != (TRC_NEED_QS_CHECKED | TRC_NEED_QS), - "%s: result = %d", __func__, result); + WARN_ONCE(result != trs.b.need_qs, "%s: result = %d", __func__, result); + } + if (trs.b.blocked) { + rtpcp = per_cpu_ptr(rcu_tasks_trace.rtpcpu, t->trc_blkd_cpu); + raw_spin_lock_irqsave_rcu_node(rtpcp, flags); + list_del_init(&t->trc_blkd_node); + WRITE_ONCE(t->trc_reader_special.b.blocked, false); + raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); } WRITE_ONCE(t->trc_reader_nesting, 0); } @@ -1274,7 +1286,7 @@ void rcu_tasks_trace_qs_blkd(struct task_struct *t) if (!rtpcp->rtp_blkd_tasks.next) INIT_LIST_HEAD(&rtpcp->rtp_blkd_tasks); list_add(&t->trc_blkd_node, &rtpcp->rtp_blkd_tasks); - t->trc_reader_special.b.blocked = true; + WRITE_ONCE(t->trc_reader_special.b.blocked, true); raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); } EXPORT_SYMBOL_GPL(rcu_tasks_trace_qs_blkd); @@ -1608,7 +1620,7 @@ static void exit_tasks_rcu_finish_trace(struct task_struct *t) rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS_CHECKED); WARN_ON_ONCE(READ_ONCE(t->trc_reader_nesting)); - if (WARN_ON_ONCE(rcu_ld_need_qs(t) & TRC_NEED_QS) || trs.b.blocked) + if (WARN_ON_ONCE(rcu_ld_need_qs(t) & TRC_NEED_QS || trs.b.blocked)) rcu_read_unlock_trace_special(t); else WRITE_ONCE(t->trc_reader_nesting, 0); -- cgit From 387c0ad702296c4eb7b95322c8af0e4391bf146f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 17 May 2022 16:47:40 -0700 Subject: rcu-tasks: Add blocked-task indicator to RCU Tasks Trace stall warnings This commit adds a "B" indicator to the RCU Tasks Trace CPU stall warning when the task has blocked within its current read-side critical section. This serves as a debugging aid. Signed-off-by: Paul E. 
McKenney Cc: Neeraj Upadhyay Cc: Eric Dumazet Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Martin KaFai Lau Cc: KP Singh --- kernel/rcu/tasks.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index eb87a759ef0b..6f4b89f9517e 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1536,11 +1536,12 @@ static void show_stalled_task_trace(struct task_struct *t, bool *firstreport) ".I"[t->trc_ipi_to_cpu >= 0], ".i"[is_idle_tsk]); else - pr_alert("P%d: %c%c%c nesting: %d%c%c cpu: %d%s\n", + pr_alert("P%d: %c%c%c%c nesting: %d%c%c cpu: %d%s\n", t->pid, ".I"[trc_rdr.ipi_to_cpu >= 0], ".i"[is_idle_tsk], ".N"[cpu >= 0 && tick_nohz_full_cpu(cpu)], + ".B"[!!data_race(t->trc_reader_special.b.blocked)], trc_rdr.nesting, " !CN"[trc_rdr.needqs & 0x3], " ?"[trc_rdr.needqs > 0x3], -- cgit From 1fa98e2e40e5a46fa528b6ab256ba00dfb319dde Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 17 May 2022 17:11:37 -0700 Subject: rcu-tasks: Move rcu_tasks_trace_pertask() before rcu_tasks_trace_pregp_step() This is a code-motion-only commit that moves rcu_tasks_trace_pertask() to precede rcu_tasks_trace_pregp_step(), so that the latter will be able to invoke the other without forward references. Signed-off-by: Paul E. McKenney Cc: Neeraj Upadhyay Cc: Eric Dumazet Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Martin KaFai Lau Cc: KP Singh --- kernel/rcu/tasks.h | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 6f4b89f9517e..66d8473f1bda 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1449,20 +1449,6 @@ static void trc_wait_for_one_reader(struct task_struct *t, } } -/* Initialize for a new RCU-tasks-trace grace period. */ -static void rcu_tasks_trace_pregp_step(void) -{ - int cpu; - - // There shouldn't be any old IPIs, but... - for_each_possible_cpu(cpu) - WARN_ON_ONCE(per_cpu(trc_ipi_to_cpu, cpu)); - - // Disable CPU hotplug across the tasklist scan. - // This also waits for all readers in CPU-hotplug code paths. - cpus_read_lock(); -} - /* Do first-round processing for the specified task. */ static void rcu_tasks_trace_pertask(struct task_struct *t, struct list_head *hop) @@ -1478,6 +1464,20 @@ static void rcu_tasks_trace_pertask(struct task_struct *t, trc_wait_for_one_reader(t, hop); } +/* Initialize for a new RCU-tasks-trace grace period. */ +static void rcu_tasks_trace_pregp_step(void) +{ + int cpu; + + // There shouldn't be any old IPIs, but... + for_each_possible_cpu(cpu) + WARN_ON_ONCE(per_cpu(trc_ipi_to_cpu, cpu)); + + // Disable CPU hotplug across the tasklist scan. + // This also waits for all readers in CPU-hotplug code paths. + cpus_read_lock(); +} + /* * Do intermediate processing between task and holdout scans and * pick up the idle tasks. -- cgit From 19415004d5221ba59be6ef566fdbb52c44808f7e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 17 May 2022 17:38:02 -0700 Subject: rcu-tasks: Avoid rcu_tasks_trace_pertask() duplicate list additions This commit adds checks within rcu_tasks_trace_pertask() to avoid duplicate (and destructive) additions to the holdouts list. These checks will be required later due to the possibility of a given task having blocked while in an RCU Tasks Trace read-side critical section, but now running on a CPU. Signed-off-by: Paul E. 
McKenney Cc: Neeraj Upadhyay Cc: Eric Dumazet Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Martin KaFai Lau Cc: KP Singh --- kernel/rcu/tasks.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 66d8473f1bda..1aa6a24a9bc2 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1454,9 +1454,10 @@ static void rcu_tasks_trace_pertask(struct task_struct *t, struct list_head *hop) { // During early boot when there is only the one boot CPU, there - // is no idle task for the other CPUs. Also, the grace-period - // kthread is always in a quiescent state. Either way, just return. - if (unlikely(t == NULL) || t == current) + // is no idle task for the other CPUs. Also, the grace-period + // kthread is always in a quiescent state. In addition, just return + // if this task is already on the list. + if (unlikely(t == NULL) || t == current || !list_empty(&t->trc_holdout_list)) return; rcu_st_need_qs(t, 0); -- cgit From 7460ade1fc6e3f62d5c0006c972755f0aefd41b2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 18 May 2022 16:06:55 -0700 Subject: rcu-tasks: Scan running tasks for RCU Tasks Trace readers A running task might be within an RCU Tasks Trace read-side critical section for any length of time, but will not be placed on any of the per-CPU rcu_tasks_percpu structure's ->rtp_blkd_tasks lists. Therefore any RCU Tasks Trace grace-period processing that does not scan the full task list must interact with the running tasks. This commit therefore causes the rcu_tasks_trace_pregp_step() function to IPI each CPU in order to place the corresponding task on the holdouts list and to record whether or not it was in an RCU Tasks Trace read-side critical section. Yes, it is possible to avoid adding it to that list if it is not a reader, but that would prevent the system from remembering that this task was in a quiescent state. Which is why the running tasks are unconditionally added to the holdout list. Signed-off-by: Paul E. McKenney Cc: Neeraj Upadhyay Cc: Eric Dumazet Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Martin KaFai Lau Cc: KP Singh --- kernel/rcu/tasks.h | 51 ++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 40 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 1aa6a24a9bc2..a8f95864c921 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -14,7 +14,7 @@ struct rcu_tasks; typedef void (*rcu_tasks_gp_func_t)(struct rcu_tasks *rtp); -typedef void (*pregp_func_t)(void); +typedef void (*pregp_func_t)(struct list_head *hop); typedef void (*pertask_func_t)(struct task_struct *t, struct list_head *hop); typedef void (*postscan_func_t)(struct list_head *hop); typedef void (*holdouts_func_t)(struct list_head *hop, bool ndrpt, bool *frptp); @@ -661,7 +661,7 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) struct task_struct *t; set_tasks_gp_state(rtp, RTGS_PRE_WAIT_GP); - rtp->pregp_func(); + rtp->pregp_func(&holdouts); /* * There were callbacks, so we need to wait for an RCU-tasks @@ -791,7 +791,7 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) // disabling. /* Pre-grace-period preparation. */ -static void rcu_tasks_pregp_step(void) +static void rcu_tasks_pregp_step(struct list_head *hop) { /* * Wait for all pre-existing t->on_rq and t->nvcsw transitions @@ -1449,24 +1449,48 @@ static void trc_wait_for_one_reader(struct task_struct *t, } } -/* Do first-round processing for the specified task. 
*/ -static void rcu_tasks_trace_pertask(struct task_struct *t, - struct list_head *hop) +/* + * Initialize for first-round processing for the specified task. + * Return false if task is NULL or already taken care of, true otherwise. + */ +static bool rcu_tasks_trace_pertask_prep(struct task_struct *t, bool notself) { // During early boot when there is only the one boot CPU, there // is no idle task for the other CPUs. Also, the grace-period // kthread is always in a quiescent state. In addition, just return // if this task is already on the list. - if (unlikely(t == NULL) || t == current || !list_empty(&t->trc_holdout_list)) - return; + if (unlikely(t == NULL) || (t == current && notself) || !list_empty(&t->trc_holdout_list)) + return false; rcu_st_need_qs(t, 0); t->trc_ipi_to_cpu = -1; - trc_wait_for_one_reader(t, hop); + return true; +} + +/* Do first-round processing for the specified task. */ +static void rcu_tasks_trace_pertask(struct task_struct *t, struct list_head *hop) +{ + if (rcu_tasks_trace_pertask_prep(t, true)) + trc_wait_for_one_reader(t, hop); +} + +/* + * Get the current CPU's current task on the holdout list. + * Calls to this function must be serialized. + */ +static void rcu_tasks_trace_pertask_handler(void *hop_in) +{ + struct list_head *hop = hop_in; + struct task_struct *t = current; + + // Pull in the currently running task, but only if it is currently + // in an RCU tasks trace read-side critical section. + if (rcu_tasks_trace_pertask_prep(t, false)) + trc_add_holdout(t, hop); } /* Initialize for a new RCU-tasks-trace grace period. */ -static void rcu_tasks_trace_pregp_step(void) +static void rcu_tasks_trace_pregp_step(struct list_head *hop) { int cpu; @@ -1474,9 +1498,14 @@ static void rcu_tasks_trace_pregp_step(void) for_each_possible_cpu(cpu) WARN_ON_ONCE(per_cpu(trc_ipi_to_cpu, cpu)); - // Disable CPU hotplug across the tasklist scan. + // Disable CPU hotplug across the CPU scan. // This also waits for all readers in CPU-hotplug code paths. cpus_read_lock(); + + // These smp_call_function_single() calls are serialized to + // allow safe access to the hop list. + for_each_online_cpu(cpu) + smp_call_function_single(cpu, rcu_tasks_trace_pertask_handler, hop, 1); } /* -- cgit From dc7d54b45170e1e3ced9f86718aa4274fd727790 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 18 May 2022 17:19:27 -0700 Subject: rcu-tasks: Pull in tasks blocked within RCU Tasks Trace readers This commit scans each CPU's ->rtp_blkd_tasks list, adding them to the list of holdout tasks. This will cause the current RCU Tasks Trace grace period to wait until these tasks exit their RCU Tasks Trace read-side critical sections. This commit will enable later work omitting the scan of the full task list. Signed-off-by: Paul E. McKenney Cc: Neeraj Upadhyay Cc: Eric Dumazet Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Martin KaFai Lau Cc: KP Singh --- kernel/rcu/tasks.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index a8f95864c921..d318cdfd2309 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1492,7 +1492,11 @@ static void rcu_tasks_trace_pertask_handler(void *hop_in) /* Initialize for a new RCU-tasks-trace grace period. */ static void rcu_tasks_trace_pregp_step(struct list_head *hop) { + LIST_HEAD(blkd_tasks); int cpu; + unsigned long flags; + struct rcu_tasks_percpu *rtpcp; + struct task_struct *t; // There shouldn't be any old IPIs, but... 
for_each_possible_cpu(cpu) @@ -1506,6 +1510,26 @@ static void rcu_tasks_trace_pregp_step(struct list_head *hop) // allow safe access to the hop list. for_each_online_cpu(cpu) smp_call_function_single(cpu, rcu_tasks_trace_pertask_handler, hop, 1); + + // Only after all running tasks have been accounted for is it + // safe to take care of the tasks that have blocked within their + // current RCU tasks trace read-side critical section. + for_each_possible_cpu(cpu) { + rtpcp = per_cpu_ptr(rcu_tasks_trace.rtpcpu, cpu); + raw_spin_lock_irqsave_rcu_node(rtpcp, flags); + list_splice_init(&rtpcp->rtp_blkd_tasks, &blkd_tasks); + while (!list_empty(&blkd_tasks)) { + rcu_read_lock(); + t = list_first_entry(&blkd_tasks, struct task_struct, trc_blkd_node); + list_del_init(&t->trc_blkd_node); + list_add(&t->trc_blkd_node, &rtpcp->rtp_blkd_tasks); + raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); + rcu_tasks_trace_pertask(t, hop); + rcu_read_unlock(); + raw_spin_lock_irqsave_rcu_node(rtpcp, flags); + } + raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); + } } /* -- cgit From 955a0192082023bf08f1be279182090264cb2557 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 18 May 2022 17:19:27 -0700 Subject: rcu-tasks: Stop RCU Tasks Trace from scanning idle tasks Now that RCU scans both running tasks and tasks that have blocked within their current RCU Tasks Trace read-side critical section, there is no need for it to scan the idle tasks. After all, an idle loop should not be remain within an RCU Tasks Trace read-side critical section across exit from idle, and from a BPF viewpoint, functions invoked from the idle loop should not sleep. So only running idle tasks can be within RCU Tasks Trace read-side critical sections. This commit therefore removes the scan of the idle tasks from the rcu_tasks_trace_postscan() function. Signed-off-by: Paul E. McKenney Cc: Neeraj Upadhyay Cc: Eric Dumazet Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Martin KaFai Lau Cc: KP Singh --- kernel/rcu/tasks.h | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index d318cdfd2309..272c905995e5 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1533,16 +1533,10 @@ static void rcu_tasks_trace_pregp_step(struct list_head *hop) } /* - * Do intermediate processing between task and holdout scans and - * pick up the idle tasks. + * Do intermediate processing between task and holdout scans. */ static void rcu_tasks_trace_postscan(struct list_head *hop) { - int cpu; - - for_each_online_cpu(cpu) - rcu_tasks_trace_pertask(idle_task(cpu), hop); - // Re-enable CPU hotplug now that the tasklist scan has completed. cpus_read_unlock(); -- cgit From 245a62982502255314b63dd2c4daaedd1cd595a6 Mon Sep 17 00:00:00 2001 From: Zqiang Date: Mon, 25 Apr 2022 09:04:04 +0800 Subject: rcu: Dump rcuc kthread status for CPUs not reporting quiescent state If the rcutree.use_softirq kernel boot parameter is disabled, then it is possible that a RCU CPU stall is due to the rcuc kthreads being starved of CPU time. There is currently no easy way to infer this from the RCU CPU stall warning output. This commit therefore adds a string of the form " rcuc=%ld jiffies(starved)" to a given CPU's output if the corresponding rcuc kthread has been starved for more than two seconds. [ paulmck: Eliminate extraneous space characters. ] Signed-off-by: Zqiang Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree_stall.h | 49 +++++++++++++++++++++---------------------------- 1 file changed, 21 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 4995c078cff9..3556637768fd 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -409,7 +409,19 @@ static bool rcu_is_gp_kthread_starving(unsigned long *jp) static bool rcu_is_rcuc_kthread_starving(struct rcu_data *rdp, unsigned long *jp) { - unsigned long j = jiffies - READ_ONCE(rdp->rcuc_activity); + int cpu; + struct task_struct *rcuc; + unsigned long j; + + rcuc = rdp->rcu_cpu_kthread_task; + if (!rcuc) + return false; + + cpu = task_cpu(rcuc); + if (cpu_is_offline(cpu) || idle_cpu(cpu)) + return false; + + j = jiffies - READ_ONCE(rdp->rcuc_activity); if (jp) *jp = j; @@ -434,6 +446,9 @@ static void print_cpu_stall_info(int cpu) struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); char *ticks_title; unsigned long ticks_value; + bool rcuc_starved; + unsigned long j; + char buf[32]; /* * We could be printing a lot while holding a spinlock. Avoid @@ -451,7 +466,10 @@ static void print_cpu_stall_info(int cpu) delta = rcu_seq_ctr(rdp->mynode->gp_seq - rdp->rcu_iw_gp_seq); falsepositive = rcu_is_gp_kthread_starving(NULL) && rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp)); - pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%ld/%#lx softirq=%u/%u fqs=%ld %s\n", + rcuc_starved = rcu_is_rcuc_kthread_starving(rdp, &j); + if (rcuc_starved) + sprintf(buf, " rcuc=%ld jiffies(starved)", j); + pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%ld/%#lx softirq=%u/%u fqs=%ld%s%s\n", cpu, "O."[!!cpu_online(cpu)], "o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)], @@ -464,32 +482,10 @@ static void print_cpu_stall_info(int cpu) rdp->dynticks_nesting, rdp->dynticks_nmi_nesting, rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), data_race(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart, + rcuc_starved ? buf : "", falsepositive ? " (false positive?)" : ""); } -static void rcuc_kthread_dump(struct rcu_data *rdp) -{ - int cpu; - unsigned long j; - struct task_struct *rcuc; - - rcuc = rdp->rcu_cpu_kthread_task; - if (!rcuc) - return; - - cpu = task_cpu(rcuc); - if (cpu_is_offline(cpu) || idle_cpu(cpu)) - return; - - if (!rcu_is_rcuc_kthread_starving(rdp, &j)) - return; - - pr_err("%s kthread starved for %ld jiffies\n", rcuc->comm, j); - sched_show_task(rcuc); - if (!trigger_single_cpu_backtrace(cpu)) - dump_cpu_task(cpu); -} - /* Complain about starvation of grace-period kthread. */ static void rcu_check_gp_kthread_starvation(void) { @@ -662,9 +658,6 @@ static void print_cpu_stall(unsigned long gps) rcu_check_gp_kthread_expired_fqs_timer(); rcu_check_gp_kthread_starvation(); - if (!use_softirq) - rcuc_kthread_dump(rdp); - rcu_dump_cpu_stacks(); raw_spin_lock_irqsave_rcu_node(rnp, flags); -- cgit From ed4ae5eff4b38797607cbdd80da394149110fb37 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 17 May 2022 21:00:04 -0700 Subject: rcu: Apply noinstr to rcu_idle_enter() and rcu_idle_exit() This commit applies the "noinstr" tag to the rcu_idle_enter() and rcu_idle_exit() functions, which are invoked from portions of the idle loop that cannot be instrumented. These tags require reworking the rcu_eqs_enter() and rcu_eqs_exit() functions that these two functions invoke in order to cause them to use normal assertions rather than lockdep. 
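For illustration, a minimal sketch of that rework, using a made-up function name (the exact changes appear in the diff below): inside a noinstr path, a lockdep-based lockdep_assert_irqs_disabled() becomes a config-gated check on the raw IRQ state, which involves no lockdep or tracing code.

	/* Illustrative sketch only; example_eqs_helper() is a generic
	 * stand-in, not a function from this patch.
	 */
	static void noinstr example_eqs_helper(bool user)
	{
		/* Assert IRQs are off without calling into lockdep. */
		WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !raw_irqs_disabled());
		/* ... remainder of the enter/exit work ... */
	}
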
In addition, within rcu_idle_exit(), the raw versions of local_irq_save() and local_irq_restore() are used, again to avoid issues with lockdep in uninstrumented code. This patch is based in part on an earlier patch by Jiri Olsa, discussions with Peter Zijlstra and Frederic Weisbecker, earlier changes by Thomas Gleixner, and off-list discussions with Yonghong Song. Link: https://lore.kernel.org/lkml/20220515203653.4039075-1-jolsa@kernel.org/ Reported-by: Jiri Olsa Reported-by: Alexei Starovoitov Reported-by: Andrii Nakryiko Signed-off-by: Paul E. McKenney Reviewed-by: Yonghong Song --- kernel/rcu/tree.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index c25ba442044a..9a5edab5558c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -631,8 +631,8 @@ static noinstr void rcu_eqs_enter(bool user) return; } - lockdep_assert_irqs_disabled(); instrumentation_begin(); + lockdep_assert_irqs_disabled(); trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, atomic_read(&rdp->dynticks)); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); rcu_preempt_deferred_qs(current); @@ -659,9 +659,9 @@ static noinstr void rcu_eqs_enter(bool user) * If you add or remove a call to rcu_idle_enter(), be sure to test with * CONFIG_RCU_EQS_DEBUG=y. */ -void rcu_idle_enter(void) +void noinstr rcu_idle_enter(void) { - lockdep_assert_irqs_disabled(); + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !raw_irqs_disabled()); rcu_eqs_enter(false); } EXPORT_SYMBOL_GPL(rcu_idle_enter); @@ -861,7 +861,7 @@ static void noinstr rcu_eqs_exit(bool user) struct rcu_data *rdp; long oldval; - lockdep_assert_irqs_disabled(); + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !raw_irqs_disabled()); rdp = this_cpu_ptr(&rcu_data); oldval = rdp->dynticks_nesting; WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0); @@ -896,13 +896,13 @@ static void noinstr rcu_eqs_exit(bool user) * If you add or remove a call to rcu_idle_exit(), be sure to test with * CONFIG_RCU_EQS_DEBUG=y. */ -void rcu_idle_exit(void) +void noinstr rcu_idle_exit(void) { unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); rcu_eqs_exit(false); - local_irq_restore(flags); + raw_local_irq_restore(flags); } EXPORT_SYMBOL_GPL(rcu_idle_exit); -- cgit From f67671baadf6dbae8eca46cb95e85b762e462b2b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 8 Jun 2022 16:40:19 +0200 Subject: context_tracking: Add a note about noinstr VS unsafe context tracking functions Some context tracking functions enter or exit into/from RCU idle mode while using trace-able and lockdep-aware IRQs (un-)masking. As a result those functions can't get tagged as noinstr. This is unlikely to be fixed since these are obsolete APIs. Drop a note about this matter. [ paulmck: Apply Peter Zijlstra feedback. ] Reported-by: Peter Zijlstra Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Neeraj Upadhyay Cc: Uladzislau Rezki Cc: Joel Fernandes Cc: Boqun Feng Cc: Nicolas Saenz Julienne Cc: Marcelo Tosatti Cc: Xiongfeng Wang Cc: Yu Liao Cc: Phil Auld Cc: Paul Gortmaker Cc: Alex Belits Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. 
McKenney Reviewed-by: Nicolas Saenz Julienne Tested-by: Nicolas Saenz Julienne --- kernel/context_tracking.c | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) (limited to 'kernel') diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 36a98c48aedc..3082332f6476 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -103,6 +103,15 @@ void noinstr __context_tracking_enter(enum ctx_state state) } EXPORT_SYMBOL_GPL(__context_tracking_enter); +/* + * OBSOLETE: + * This function should be noinstr but the below local_irq_restore() is + * unsafe because it involves illegal RCU uses through tracing and lockdep. + * This is unlikely to be fixed as this function is obsolete. The preferred + * way is to call __context_tracking_enter() through user_enter_irqoff() + * or context_tracking_guest_enter(). It should be the arch entry code + * responsibility to call into context tracking with IRQs disabled. + */ void context_tracking_enter(enum ctx_state state) { unsigned long flags; @@ -125,6 +134,14 @@ void context_tracking_enter(enum ctx_state state) NOKPROBE_SYMBOL(context_tracking_enter); EXPORT_SYMBOL_GPL(context_tracking_enter); +/* + * OBSOLETE: + * This function should be noinstr but it unsafely calls local_irq_restore(), + * involving illegal RCU uses through tracing and lockdep. + * This is unlikely to be fixed as this function is obsolete. The preferred + * way is to call user_enter_irqoff(). It should be the arch entry code + * responsibility to call into context tracking with IRQs disabled. + */ void context_tracking_user_enter(void) { user_enter(); @@ -168,6 +185,15 @@ void noinstr __context_tracking_exit(enum ctx_state state) } EXPORT_SYMBOL_GPL(__context_tracking_exit); +/* + * OBSOLETE: + * This function should be noinstr but the below local_irq_save() is + * unsafe because it involves illegal RCU uses through tracing and lockdep. + * This is unlikely to be fixed as this function is obsolete. The preferred + * way is to call __context_tracking_exit() through user_exit_irqoff() + * or context_tracking_guest_exit(). It should be the arch entry code + * responsibility to call into context tracking with IRQs disabled. + */ void context_tracking_exit(enum ctx_state state) { unsigned long flags; @@ -182,6 +208,14 @@ void context_tracking_exit(enum ctx_state state) NOKPROBE_SYMBOL(context_tracking_exit); EXPORT_SYMBOL_GPL(context_tracking_exit); +/* + * OBSOLETE: + * This function should be noinstr but it unsafely calls local_irq_save(), + * involving illegal RCU uses through tracing and lockdep. This is unlikely + * to be fixed as this function is obsolete. The preferred way is to call + * user_exit_irqoff(). It should be the arch entry code responsibility to + * call into context tracking with IRQs disabled. + */ void context_tracking_user_exit(void) { user_exit(); -- cgit From aca80dd95e20f1fa0daa212afc83c9fa0ad239e5 Mon Sep 17 00:00:00 2001 From: Delyan Kratunov Date: Mon, 20 Jun 2022 21:47:55 +0000 Subject: uprobe: gate bpf call behind BPF_EVENTS The call into bpf from uprobes needs to be gated now that it doesn't use the trace_events.h helpers. Randy found this as a randconfig build failure on linux-next [1]. 
[1]: https://lore.kernel.org/linux-next/2de99180-7d55-2fdf-134d-33198c27cc58@infradead.org/ Reported-by: Randy Dunlap Signed-off-by: Delyan Kratunov Tested-by: Randy Dunlap Acked-by: Randy Dunlap Link: https://lore.kernel.org/r/cb8bfbbcde87ed5d811227a393ef4925f2aadb7b.camel@fb.com Signed-off-by: Alexei Starovoitov --- kernel/trace/trace_uprobe.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 0282c119b1b2..326235fd2346 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1344,6 +1344,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, int size, esize; int rctx; +#ifdef CONFIG_BPF_EVENTS if (bpf_prog_array_valid(call)) { u32 ret; @@ -1351,6 +1352,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, if (!ret) return; } +#endif /* CONFIG_BPF_EVENTS */ esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); -- cgit From 1ade23711971b0eececf0d7fedc29d3c1d2fce01 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 21 Jun 2022 02:53:42 +0300 Subject: bpf: Inline calls to bpf_loop when callback is known Calls to `bpf_loop` are replaced with direct loops to avoid indirection. E.g. the following: bpf_loop(10, foo, NULL, 0); Is replaced by equivalent of the following: for (int i = 0; i < 10; ++i) foo(i, NULL); This transformation could be applied when: - callback is known and does not change during program execution; - flags passed to `bpf_loop` are always zero. Inlining logic works as follows: - During execution simulation function `update_loop_inline_state` tracks the following information for each `bpf_loop` call instruction: - is callback known and constant? - are flags constant and zero? - Function `optimize_bpf_loop` increases stack depth for functions where `bpf_loop` calls can be inlined and invokes `inline_bpf_loop` to apply the inlining. The additional stack space is used to spill registers R6, R7 and R8. These registers are used as loop counter, loop maximal bound and callback context parameter; Measurements using `benchs/run_bench_bpf_loop.sh` inside QEMU / KVM on i7-4710HQ CPU show a drop in latency from 14 ns/op to 2 ns/op. Signed-off-by: Eduard Zingerman Acked-by: Song Liu Link: https://lore.kernel.org/r/20220620235344.569325-4-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/bpf_iter.c | 9 +-- kernel/bpf/verifier.c | 180 ++++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 180 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index d5d96ceca105..7e8fd49406f6 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -723,9 +723,6 @@ const struct bpf_func_proto bpf_for_each_map_elem_proto = { .arg4_type = ARG_ANYTHING, }; -/* maximum number of loops */ -#define MAX_LOOPS BIT(23) - BPF_CALL_4(bpf_loop, u32, nr_loops, void *, callback_fn, void *, callback_ctx, u64, flags) { @@ -733,9 +730,13 @@ BPF_CALL_4(bpf_loop, u32, nr_loops, void *, callback_fn, void *, callback_ctx, u64 ret; u32 i; + /* Note: these safety checks are also verified when bpf_loop + * is inlined, be careful to modify this code in sync. See + * function verifier.c:inline_bpf_loop. 
+ */ if (flags) return -EINVAL; - if (nr_loops > MAX_LOOPS) + if (nr_loops > BPF_MAX_LOOPS) return -E2BIG; for (i = 0; i < nr_loops; i++) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2859901ffbe3..bf72dc511df6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7124,6 +7124,41 @@ static int check_get_func_ip(struct bpf_verifier_env *env) return -ENOTSUPP; } +static struct bpf_insn_aux_data *cur_aux(struct bpf_verifier_env *env) +{ + return &env->insn_aux_data[env->insn_idx]; +} + +static bool loop_flag_is_zero(struct bpf_verifier_env *env) +{ + struct bpf_reg_state *regs = cur_regs(env); + struct bpf_reg_state *reg = ®s[BPF_REG_4]; + bool reg_is_null = register_is_null(reg); + + if (reg_is_null) + mark_chain_precision(env, BPF_REG_4); + + return reg_is_null; +} + +static void update_loop_inline_state(struct bpf_verifier_env *env, u32 subprogno) +{ + struct bpf_loop_inline_state *state = &cur_aux(env)->loop_inline_state; + + if (!state->initialized) { + state->initialized = 1; + state->fit_for_inline = loop_flag_is_zero(env); + state->callback_subprogno = subprogno; + return; + } + + if (!state->fit_for_inline) + return; + + state->fit_for_inline = (loop_flag_is_zero(env) && + state->callback_subprogno == subprogno); +} + static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx_p) { @@ -7276,6 +7311,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn err = check_bpf_snprintf_call(env, regs); break; case BPF_FUNC_loop: + update_loop_inline_state(env, meta.subprogno); err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, set_loop_callback_state); break; @@ -7682,11 +7718,6 @@ static bool check_reg_sane_offset(struct bpf_verifier_env *env, return true; } -static struct bpf_insn_aux_data *cur_aux(struct bpf_verifier_env *env) -{ - return &env->insn_aux_data[env->insn_idx]; -} - enum { REASON_BOUNDS = -1, REASON_TYPE = -2, @@ -14315,6 +14346,142 @@ patch_call_imm: return 0; } +static struct bpf_prog *inline_bpf_loop(struct bpf_verifier_env *env, + int position, + s32 stack_base, + u32 callback_subprogno, + u32 *cnt) +{ + s32 r6_offset = stack_base + 0 * BPF_REG_SIZE; + s32 r7_offset = stack_base + 1 * BPF_REG_SIZE; + s32 r8_offset = stack_base + 2 * BPF_REG_SIZE; + int reg_loop_max = BPF_REG_6; + int reg_loop_cnt = BPF_REG_7; + int reg_loop_ctx = BPF_REG_8; + + struct bpf_prog *new_prog; + u32 callback_start; + u32 call_insn_offset; + s32 callback_offset; + + /* This represents an inlined version of bpf_iter.c:bpf_loop, + * be careful to modify this code in sync. + */ + struct bpf_insn insn_buf[] = { + /* Return error and jump to the end of the patch if + * expected number of iterations is too big. 
+ */ + BPF_JMP_IMM(BPF_JLE, BPF_REG_1, BPF_MAX_LOOPS, 2), + BPF_MOV32_IMM(BPF_REG_0, -E2BIG), + BPF_JMP_IMM(BPF_JA, 0, 0, 16), + /* spill R6, R7, R8 to use these as loop vars */ + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_6, r6_offset), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_7, r7_offset), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_8, r8_offset), + /* initialize loop vars */ + BPF_MOV64_REG(reg_loop_max, BPF_REG_1), + BPF_MOV32_IMM(reg_loop_cnt, 0), + BPF_MOV64_REG(reg_loop_ctx, BPF_REG_3), + /* loop header, + * if reg_loop_cnt >= reg_loop_max skip the loop body + */ + BPF_JMP_REG(BPF_JGE, reg_loop_cnt, reg_loop_max, 5), + /* callback call, + * correct callback offset would be set after patching + */ + BPF_MOV64_REG(BPF_REG_1, reg_loop_cnt), + BPF_MOV64_REG(BPF_REG_2, reg_loop_ctx), + BPF_CALL_REL(0), + /* increment loop counter */ + BPF_ALU64_IMM(BPF_ADD, reg_loop_cnt, 1), + /* jump to loop header if callback returned 0 */ + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, -6), + /* return value of bpf_loop, + * set R0 to the number of iterations + */ + BPF_MOV64_REG(BPF_REG_0, reg_loop_cnt), + /* restore original values of R6, R7, R8 */ + BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_10, r6_offset), + BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_10, r7_offset), + BPF_LDX_MEM(BPF_DW, BPF_REG_8, BPF_REG_10, r8_offset), + }; + + *cnt = ARRAY_SIZE(insn_buf); + new_prog = bpf_patch_insn_data(env, position, insn_buf, *cnt); + if (!new_prog) + return new_prog; + + /* callback start is known only after patching */ + callback_start = env->subprog_info[callback_subprogno].start; + /* Note: insn_buf[12] is an offset of BPF_CALL_REL instruction */ + call_insn_offset = position + 12; + callback_offset = callback_start - call_insn_offset - 1; + env->prog->insnsi[call_insn_offset].imm = callback_offset; + + return new_prog; +} + +static bool is_bpf_loop_call(struct bpf_insn *insn) +{ + return insn->code == (BPF_JMP | BPF_CALL) && + insn->src_reg == 0 && + insn->imm == BPF_FUNC_loop; +} + +/* For all sub-programs in the program (including main) check + * insn_aux_data to see if there are bpf_loop calls that require + * inlining. If such calls are found the calls are replaced with a + * sequence of instructions produced by `inline_bpf_loop` function and + * subprog stack_depth is increased by the size of 3 registers. + * This stack space is used to spill values of the R6, R7, R8. These + * registers are used to store the loop bound, counter and context + * variables. 
+ */ +static int optimize_bpf_loop(struct bpf_verifier_env *env) +{ + struct bpf_subprog_info *subprogs = env->subprog_info; + int i, cur_subprog = 0, cnt, delta = 0; + struct bpf_insn *insn = env->prog->insnsi; + int insn_cnt = env->prog->len; + u16 stack_depth = subprogs[cur_subprog].stack_depth; + u16 stack_depth_roundup = round_up(stack_depth, 8) - stack_depth; + u16 stack_depth_extra = 0; + + for (i = 0; i < insn_cnt; i++, insn++) { + struct bpf_loop_inline_state *inline_state = + &env->insn_aux_data[i + delta].loop_inline_state; + + if (is_bpf_loop_call(insn) && inline_state->fit_for_inline) { + struct bpf_prog *new_prog; + + stack_depth_extra = BPF_REG_SIZE * 3 + stack_depth_roundup; + new_prog = inline_bpf_loop(env, + i + delta, + -(stack_depth + stack_depth_extra), + inline_state->callback_subprogno, + &cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = new_prog; + insn = new_prog->insnsi + i + delta; + } + + if (subprogs[cur_subprog + 1].start == i + delta + 1) { + subprogs[cur_subprog].stack_depth += stack_depth_extra; + cur_subprog++; + stack_depth = subprogs[cur_subprog].stack_depth; + stack_depth_roundup = round_up(stack_depth, 8) - stack_depth; + stack_depth_extra = 0; + } + } + + env->prog->aux->stack_depth = env->subprog_info[0].stack_depth; + + return 0; +} + static void free_states(struct bpf_verifier_env *env) { struct bpf_verifier_state_list *sl, *sln; @@ -15052,6 +15219,9 @@ skip_full_check: ret = check_max_stack_depth(env); /* instruction rewrites happen after this point */ + if (ret == 0) + ret = optimize_bpf_loop(env); + if (is_priv) { if (ret == 0) opt_hard_wire_dead_code_branches(env); -- cgit From 95acd8817e66d031d2e6ee7def3f1e1874819317 Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Fri, 17 Jun 2022 12:57:34 +0200 Subject: bpf, x64: Add predicate for bpf2bpf with tailcalls support in JIT The BPF core/verifier is hard-coded to permit mixing bpf2bpf and tail calls for only x86-64. Change the logic to instead rely on a new weak function 'bool bpf_jit_supports_subprog_tailcalls(void)', which a capable JIT backend can override. Update the x86-64 eBPF JIT to reflect this. Signed-off-by: Tony Ambardar [jakub: drop MIPS bits and tweak patch subject] Signed-off-by: Jakub Sitnicki Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220617105735.733938-2-jakub@cloudflare.com --- kernel/bpf/core.c | 6 ++++++ kernel/bpf/verifier.c | 3 ++- 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index b5ffebcce6cc..f023cb399e3f 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2729,6 +2729,12 @@ bool __weak bpf_jit_needs_zext(void) return false; } +/* Return TRUE if the JIT backend supports mixing bpf2bpf and tailcalls. 
*/ +bool __weak bpf_jit_supports_subprog_tailcalls(void) +{ + return false; +} + bool __weak bpf_jit_supports_kfunc_call(void) { return false; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index bf72dc511df6..a20d7736a5b2 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6154,7 +6154,8 @@ static bool may_update_sockmap(struct bpf_verifier_env *env, int func_id) static bool allow_tail_call_in_subprogs(struct bpf_verifier_env *env) { - return env->prog->jit_requested && IS_ENABLED(CONFIG_X86_64); + return env->prog->jit_requested && + bpf_jit_supports_subprog_tailcalls(); } static int check_map_func_compatibility(struct bpf_verifier_env *env, -- cgit From cb506e130e02da24d98103d83b8de859e26d1860 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 17 Apr 2022 20:43:13 -0700 Subject: rcutorture: Update rcutorture.fwd_progress help text This commit updates the rcutorture.fwd_progress help text to say that it is the number of forward-progress kthreads to spawn rather than the old enable/disable functionality. While in the area, make the list of torture-test parameters easier to read by taking advantage of 100 columns. Signed-off-by: Paul E. McKenney Reviewed-by: Neeraj Upadhyay --- kernel/rcu/rcutorture.c | 53 +++++++++++++++++-------------------------------- 1 file changed, 18 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 7120165a9342..6f47d1490c4f 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -75,62 +75,45 @@ MODULE_AUTHOR("Paul E. McKenney and Josh Triplett Date: Fri, 20 May 2022 10:21:00 -0700 Subject: rcu-tasks: Stop RCU Tasks Trace from scanning full tasks list This commit takes off the training wheels and relies only on scanning currently running tasks and tasks that have blocked or been preempted within their current RCU Tasks Trace read-side critical section. Before this commit, the time complexity of an RCU Tasks Trace grace period is O(T), where T is the number of tasks. After this commit, this time complexity is O(C+B), where C is the number of CPUs and B is the number of tasks that have blocked (or been preempted) at least once during their current RCU Tasks Trace read-side critical sections. Of course, if all tasks have blocked (or been preempted) at least once during their current RCU Tasks Trace read-side critical sections, this is still O(T), but current expectations are that RCU Tasks Trace read-side critical section will be short and that there will normally not be large numbers of tasks blocked within such a critical section. Dave Marchevsky kindly measured the effects of this commit on the RCU Tasks Trace grace-period latency and the rcu_tasks_trace_kthread task's CPU consumption per RCU Tasks Trace grace period over the course of a fixed test, all in milliseconds: Before After GP latency 22.3 ms stddev > 0.1 17.0 ms stddev < 0.1 GP CPU 2.3 ms stddev 0.3 1.1 ms stddev 0.2 This was on a system with 15,000 tasks, so it is reasonable to expect much larger savings on the systems on which this issue was first noted, given that they sport well in excess of 100,000 tasks. CPU consumption was measured using profiling techniques. Signed-off-by: Paul E. 
McKenney Cc: Neeraj Upadhyay Cc: Eric Dumazet Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Martin KaFai Lau Cc: KP Singh Tested-by: Dave Marchevsky --- kernel/rcu/tasks.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 272c905995e5..fe0552086ccf 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -670,10 +670,12 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) * and make a list of them in holdouts. */ set_tasks_gp_state(rtp, RTGS_SCAN_TASKLIST); - rcu_read_lock(); - for_each_process_thread(g, t) - rtp->pertask_func(t, &holdouts); - rcu_read_unlock(); + if (rtp->pertask_func) { + rcu_read_lock(); + for_each_process_thread(g, t) + rtp->pertask_func(t, &holdouts); + rcu_read_unlock(); + } set_tasks_gp_state(rtp, RTGS_POST_SCAN_TASKLIST); rtp->postscan_func(&holdouts); @@ -1746,7 +1748,6 @@ static int __init rcu_spawn_tasks_trace_kthread(void) rcu_tasks_trace.init_fract = 1; } rcu_tasks_trace.pregp_func = rcu_tasks_trace_pregp_step; - rcu_tasks_trace.pertask_func = rcu_tasks_trace_pertask; rcu_tasks_trace.postscan_func = rcu_tasks_trace_postscan; rcu_tasks_trace.holdouts_func = check_all_holdout_tasks_trace; rcu_tasks_trace.postgp_func = rcu_tasks_trace_postgp; -- cgit From ffcc21a315e1ebafad51b318e8ac0cb884df0cdc Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 1 Jun 2022 21:26:57 -0700 Subject: rcu-tasks: Maintain a count of tasks blocking RCU Tasks Trace grace period This commit maintains a new n_trc_holdouts counter that tracks the number of tasks blocking the RCU Tasks grace period. This counter is useful for debugging, and its value has been added to a diagostic message. Signed-off-by: Paul E. McKenney Cc: Neeraj Upadhyay Cc: Eric Dumazet Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Martin KaFai Lau Cc: KP Singh --- kernel/rcu/tasks.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index fe0552086ccf..9d7d6fd4b8a7 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1206,6 +1206,7 @@ static DEFINE_PER_CPU(bool, trc_ipi_to_cpu); static unsigned long n_heavy_reader_attempts; static unsigned long n_heavy_reader_updates; static unsigned long n_heavy_reader_ofl_updates; +static unsigned long n_trc_holdouts; void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func); DEFINE_RCU_TASKS(rcu_tasks_trace, rcu_tasks_wait_gp, call_rcu_tasks_trace, @@ -1299,6 +1300,7 @@ static void trc_add_holdout(struct task_struct *t, struct list_head *bhp) if (list_empty(&t->trc_holdout_list)) { get_task_struct(t); list_add(&t->trc_holdout_list, bhp); + n_trc_holdouts++; } } @@ -1308,6 +1310,7 @@ static void trc_del_holdout(struct task_struct *t) if (!list_empty(&t->trc_holdout_list)) { list_del_init(&t->trc_holdout_list); put_task_struct(t); + n_trc_holdouts--; } } @@ -1760,7 +1763,8 @@ void show_rcu_tasks_trace_gp_kthread(void) { char buf[64]; - sprintf(buf, "h:%lu/%lu/%lu", + sprintf(buf, "N%lu h:%lu/%lu/%lu", + data_race(n_trc_holdouts), data_race(n_heavy_reader_ofl_updates), data_race(n_heavy_reader_updates), data_race(n_heavy_reader_attempts)); -- cgit From e386b6725798eec07facedf4d4bb710c079fd25c Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Thu, 2 Jun 2022 17:30:01 -0700 Subject: rcu-tasks: Eliminate RCU Tasks Trace IPIs to online CPUs Currently, the RCU Tasks Trace grace-period kthread IPIs each online CPU using smp_call_function_single() in order to track any tasks currently in RCU Tasks Trace read-side critical sections during which the corresponding task has neither blocked nor been preempted. These IPIs are annoying and are also not strictly necessary because any task that blocks or is preempted within its current RCU Tasks Trace read-side critical section will be tracked on one of the per-CPU rcu_tasks_percpu structure's ->rtp_blkd_tasks list. So the only time that this is a problem is if one of the CPUs runs through a long-duration RCU Tasks Trace read-side critical section without a context switch. Note that the task_call_func() function cannot help here because there is no safe way to identify the target task. Of course, the task_call_func() function will be very useful later, when processing the list of tasks, but it needs to know the task. This commit therefore creates a cpu_curr_snapshot() function that returns a pointer the task_struct structure of some task that happened to be running on the specified CPU more or less during the time that the cpu_curr_snapshot() function was executing. If there was no context switch during this time, this function will return a pointer to the task_struct structure of the task that was running throughout. If there was a context switch, then the outgoing task will be taken care of by RCU's context-switch hook, and the incoming task was either already taken care during some previous context switch, or it is not currently within an RCU Tasks Trace read-side critical section. And in this latter case, the grace period already started, so there is no need to wait on this task. This new cpu_curr_snapshot() function is invoked on each CPU early in the RCU Tasks Trace grace-period processing, and the resulting tasks are queued for later quiescent-state inspection. Signed-off-by: Paul E. McKenney Cc: Peter Zijlstra Cc: Neeraj Upadhyay Cc: Eric Dumazet Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Martin KaFai Lau Cc: KP Singh --- kernel/rcu/tasks.h | 24 +++++++----------------- kernel/sched/core.c | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 9d7d6fd4b8a7..c2aae2643a0b 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1479,21 +1479,6 @@ static void rcu_tasks_trace_pertask(struct task_struct *t, struct list_head *hop trc_wait_for_one_reader(t, hop); } -/* - * Get the current CPU's current task on the holdout list. - * Calls to this function must be serialized. - */ -static void rcu_tasks_trace_pertask_handler(void *hop_in) -{ - struct list_head *hop = hop_in; - struct task_struct *t = current; - - // Pull in the currently running task, but only if it is currently - // in an RCU tasks trace read-side critical section. - if (rcu_tasks_trace_pertask_prep(t, false)) - trc_add_holdout(t, hop); -} - /* Initialize for a new RCU-tasks-trace grace period. */ static void rcu_tasks_trace_pregp_step(struct list_head *hop) { @@ -1513,8 +1498,13 @@ static void rcu_tasks_trace_pregp_step(struct list_head *hop) // These smp_call_function_single() calls are serialized to // allow safe access to the hop list. 
- for_each_online_cpu(cpu) - smp_call_function_single(cpu, rcu_tasks_trace_pertask_handler, hop, 1); + for_each_online_cpu(cpu) { + rcu_read_lock(); + t = cpu_curr_snapshot(cpu); + if (rcu_tasks_trace_pertask_prep(t, true)) + trc_add_holdout(t, hop); + rcu_read_unlock(); + } // Only after all running tasks have been accounted for is it // safe to take care of the tasks that have blocked within their diff --git a/kernel/sched/core.c b/kernel/sched/core.c index da0bf6fe9ecd..9568019be124 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4263,6 +4263,38 @@ int task_call_func(struct task_struct *p, task_call_f func, void *arg) return ret; } +/** + * cpu_curr_snapshot - Return a snapshot of the currently running task + * @cpu: The CPU on which to snapshot the task. + * + * Returns the task_struct pointer of the task "currently" running on + * the specified CPU. If the same task is running on that CPU throughout, + * the return value will be a pointer to that task's task_struct structure. + * If the CPU did any context switches even vaguely concurrently with the + * execution of this function, the return value will be a pointer to the + * task_struct structure of a randomly chosen task that was running on + * that CPU somewhere around the time that this function was executing. + * + * If the specified CPU was offline, the return value is whatever it + * is, perhaps a pointer to the task_struct structure of that CPU's idle + * task, but there is no guarantee. Callers wishing a useful return + * value must take some action to ensure that the specified CPU remains + * online throughout. + * + * This function executes full memory barriers before and after fetching + * the pointer, which permits the caller to confine this function's fetch + * with respect to the caller's accesses to other shared variables. + */ +struct task_struct *cpu_curr_snapshot(int cpu) +{ + struct task_struct *t; + + smp_mb(); /* Pairing determined by caller's synchronization design. */ + t = rcu_dereference(cpu_curr(cpu)); + smp_mb(); /* Pairing determined by caller's synchronization design. */ + return t; +} + /** * wake_up_process - Wake up a specific process * @p: The process to be woken up. -- cgit From 56096ecd5b04148b6d292e3847c23d4a2a454e94 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 7 Jun 2022 17:25:03 -0700 Subject: rcu-tasks: Disable and enable CPU hotplug in same function The rcu_tasks_trace_pregp_step() function invokes cpus_read_lock() to disable CPU hotplug, and a later call to the rcu_tasks_trace_postscan() function invokes cpus_read_unlock() to re-enable it. This was absolutely necessary in the past in order to protect the intervening scan of the full tasks list, but there is no longer such a scan. This commit therefore improves readability by moving the cpus_read_unlock() call to the end of the rcu_tasks_trace_pregp_step() function. This commit is a pure code-motion commit without any (intended) change in functionality. Signed-off-by: Paul E. 
McKenney Cc: Neeraj Upadhyay Cc: Eric Dumazet Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Martin KaFai Lau Cc: KP Singh --- kernel/rcu/tasks.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index c2aae2643a0b..bf9cc5bc4ae5 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1525,6 +1525,9 @@ static void rcu_tasks_trace_pregp_step(struct list_head *hop) } raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); } + + // Re-enable CPU hotplug now that the holdout list is populated. + cpus_read_unlock(); } /* @@ -1532,9 +1535,6 @@ static void rcu_tasks_trace_pregp_step(struct list_head *hop) */ static void rcu_tasks_trace_postscan(struct list_head *hop) { - // Re-enable CPU hotplug now that the tasklist scan has completed. - cpus_read_unlock(); - // Wait for late-stage exiting tasks to finish exiting. // These might have passed the call to exit_tasks_rcu_finish(). synchronize_rcu(); -- cgit From eea3423b162d5d5cdc08af23e8ee2c2d1134fd07 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 6 Jun 2022 21:30:38 -0700 Subject: rcu-tasks: Update comments This commit updates comments to reflect the changes in the series of commits that eliminated the full task-list scan. Signed-off-by: Paul E. McKenney Cc: Neeraj Upadhyay Cc: Eric Dumazet Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Martin KaFai Lau Cc: KP Singh --- kernel/rcu/tasks.h | 71 +++++++++++++++++++++++++----------------------------- 1 file changed, 33 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index bf9cc5bc4ae5..df6b2cb2f205 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1138,11 +1138,10 @@ EXPORT_SYMBOL_GPL(show_rcu_tasks_rude_gp_kthread); // 3. Avoids expensive read-side instructions, having overhead similar // to that of Preemptible RCU. // -// There are of course downsides. The grace-period code can send IPIs to -// CPUs, even when those CPUs are in the idle loop or in nohz_full userspace. -// It is necessary to scan the full tasklist, much as for Tasks RCU. There -// is a single callback queue guarded by a single lock, again, much as for -// Tasks RCU. If needed, these downsides can be at least partially remedied. +// There are of course downsides. For example, the grace-period code +// can send IPIs to CPUs, even when those CPUs are in the idle loop or +// in nohz_full userspace. If needed, these downsides can be at least +// partially remedied. // // Perhaps most important, this variant of RCU does not affect the vanilla // flavors, rcu_preempt and rcu_sched. The fact that RCU Tasks Trace @@ -1155,38 +1154,30 @@ EXPORT_SYMBOL_GPL(show_rcu_tasks_rude_gp_kthread); // invokes these functions in this order: // // rcu_tasks_trace_pregp_step(): -// Initialize the count of readers and block CPU-hotplug operations. -// rcu_tasks_trace_pertask(), invoked on every non-idle task: -// Initialize per-task state and attempt to identify an immediate -// quiescent state for that task, or, failing that, attempt to -// set that task's .need_qs flag so that task's next outermost -// rcu_read_unlock_trace() will report the quiescent state (in which -// case the count of readers is incremented). If both attempts fail, -// the task is added to a "holdout" list. Note that IPIs are used -// to invoke trc_read_check_handler() in the context of running tasks -// in order to avoid ordering overhead on common-case shared-variable -// accessses. 
+// Disables CPU hotplug, adds all currently executing tasks to the +// holdout list, then checks the state of all tasks that blocked +// or were preempted within their current RCU Tasks Trace read-side +// critical section, adding them to the holdout list if appropriate. +// Finally, this function re-enables CPU hotplug. +// The ->pertask_func() pointer is NULL, so there is no per-task processing. // rcu_tasks_trace_postscan(): -// Initialize state and attempt to identify an immediate quiescent -// state as above (but only for idle tasks), unblock CPU-hotplug -// operations, and wait for an RCU grace period to avoid races with -// tasks that are in the process of exiting. +// Invokes synchronize_rcu() to wait for late-stage exiting tasks +// to finish exiting. // check_all_holdout_tasks_trace(), repeatedly until holdout list is empty: // Scans the holdout list, attempting to identify a quiescent state // for each task on the list. If there is a quiescent state, the -// corresponding task is removed from the holdout list. +// corresponding task is removed from the holdout list. Once this +// list is empty, the grace period has completed. // rcu_tasks_trace_postgp(): -// Wait for the count of readers do drop to zero, reporting any stalls. -// Also execute full memory barriers to maintain ordering with code -// executing after the grace period. +// Provides the needed full memory barrier and does debug checks. // // The exit_tasks_rcu_finish_trace() synchronizes with exiting tasks. // -// Pre-grace-period update-side code is ordered before the grace -// period via the ->cbs_lock and barriers in rcu_tasks_kthread(). -// Pre-grace-period read-side code is ordered before the grace period by -// atomic_dec_and_test() of the count of readers (for IPIed readers) and by -// scheduler context-switch ordering (for locked-down non-running readers). +// Pre-grace-period update-side code is ordered before the grace period +// via the ->cbs_lock and barriers in rcu_tasks_kthread(). Pre-grace-period +// read-side code is ordered before the grace period by atomic operations +// on .b.need_qs flag of each task involved in this process, or by scheduler +// context-switch ordering (for locked-down non-running readers). // The lockdep state must be outside of #ifdef to be useful. #ifdef CONFIG_DEBUG_LOCK_ALLOC @@ -1245,7 +1236,10 @@ u8 rcu_trc_cmpxchg_need_qs(struct task_struct *t, u8 old, u8 new) } EXPORT_SYMBOL_GPL(rcu_trc_cmpxchg_need_qs); -/* If we are the last reader, wake up the grace-period kthread. */ +/* + * If we are the last reader, signal the grace-period kthread. + * Also remove from the per-CPU list of blocked tasks. + */ void rcu_read_unlock_trace_special(struct task_struct *t) { unsigned long flags; @@ -1336,9 +1330,9 @@ static void trc_read_check_handler(void *t_in) if (unlikely(nesting < 0)) goto reset_ipi; - // Get here if the task is in a read-side critical section. Set - // its state so that it will awaken the grace-period kthread upon - // exit from that critical section. + // Get here if the task is in a read-side critical section. + // Set its state so that it will update state for the grace-period + // kthread upon exit from that critical section. rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS | TRC_NEED_QS_CHECKED); reset_ipi: @@ -1387,7 +1381,7 @@ static int trc_inspect_reader(struct task_struct *t, void *bhp_in) return 0; // In QS, so done. } if (nesting < 0) - return -EINVAL; // QS transitioning, try again later. + return -EINVAL; // Reader transitioning, try again later. 
// The task is in a read-side critical section, so set up its // state so that it will update state upon exit from that critical @@ -1492,11 +1486,12 @@ static void rcu_tasks_trace_pregp_step(struct list_head *hop) for_each_possible_cpu(cpu) WARN_ON_ONCE(per_cpu(trc_ipi_to_cpu, cpu)); - // Disable CPU hotplug across the CPU scan. - // This also waits for all readers in CPU-hotplug code paths. + // Disable CPU hotplug across the CPU scan for the benefit of + // any IPIs that might be needed. This also waits for all readers + // in CPU-hotplug code paths. cpus_read_lock(); - // These smp_call_function_single() calls are serialized to + // These rcu_tasks_trace_pertask_prep() calls are serialized to // allow safe access to the hop list. for_each_online_cpu(cpu) { rcu_read_lock(); @@ -1608,7 +1603,7 @@ static void check_all_holdout_tasks_trace(struct list_head *hop, { struct task_struct *g, *t; - // Disable CPU hotplug across the holdout list scan. + // Disable CPU hotplug across the holdout list scan for IPIs. cpus_read_lock(); list_for_each_entry_safe(t, g, hop, trc_holdout_list) { -- cgit From 1cf1144e8473e8c3180ac8b91309e29b6acfd95f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 7 Jun 2022 15:23:52 -0700 Subject: rcu-tasks: Be more patient for RCU Tasks boot-time testing The RCU-Tasks family of grace-period primitives can take some time to complete, and the amount of time can depend on the exact hardware and software configuration. Some configurations boot up fast enough that the RCU-Tasks verification process gets false-positive failures. This commit therefore allows up to 30 seconds for the grace periods to complete, with this value adjustable downwards using the rcupdate.rcu_task_stall_timeout kernel boot parameter. Reported-by: Matthew Wilcox Reported-by: Zhouyi Zhou Signed-off-by: Paul E. McKenney Tested-by: Zhouyi Zhou Tested-by: Mark Rutland --- kernel/rcu/tasks.h | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index df6b2cb2f205..fcbd0ec33c86 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -145,6 +145,7 @@ static int rcu_task_ipi_delay __read_mostly = RCU_TASK_IPI_DELAY; module_param(rcu_task_ipi_delay, int, 0644); /* Control stall timeouts. Disable with <= 0, otherwise jiffies till stall. */ +#define RCU_TASK_BOOT_STALL_TIMEOUT (HZ * 30) #define RCU_TASK_STALL_TIMEOUT (HZ * 60 * 10) static int rcu_task_stall_timeout __read_mostly = RCU_TASK_STALL_TIMEOUT; module_param(rcu_task_stall_timeout, int, 0644); @@ -1776,23 +1777,24 @@ struct rcu_tasks_test_desc { struct rcu_head rh; const char *name; bool notrun; + unsigned long runstart; }; static struct rcu_tasks_test_desc tests[] = { { .name = "call_rcu_tasks()", /* If not defined, the test is skipped. */ - .notrun = !IS_ENABLED(CONFIG_TASKS_RCU), + .notrun = IS_ENABLED(CONFIG_TASKS_RCU), }, { .name = "call_rcu_tasks_rude()", /* If not defined, the test is skipped. */ - .notrun = !IS_ENABLED(CONFIG_TASKS_RUDE_RCU), + .notrun = IS_ENABLED(CONFIG_TASKS_RUDE_RCU), }, { .name = "call_rcu_tasks_trace()", /* If not defined, the test is skipped. 
*/ - .notrun = !IS_ENABLED(CONFIG_TASKS_TRACE_RCU) + .notrun = IS_ENABLED(CONFIG_TASKS_TRACE_RCU) } }; @@ -1803,23 +1805,28 @@ static void test_rcu_tasks_callback(struct rcu_head *rhp) pr_info("Callback from %s invoked.\n", rttd->name); - rttd->notrun = true; + rttd->notrun = false; } static void rcu_tasks_initiate_self_tests(void) { + unsigned long j = jiffies; + pr_info("Running RCU-tasks wait API self tests\n"); #ifdef CONFIG_TASKS_RCU + tests[0].runstart = j; synchronize_rcu_tasks(); call_rcu_tasks(&tests[0].rh, test_rcu_tasks_callback); #endif #ifdef CONFIG_TASKS_RUDE_RCU + tests[1].runstart = j; synchronize_rcu_tasks_rude(); call_rcu_tasks_rude(&tests[1].rh, test_rcu_tasks_callback); #endif #ifdef CONFIG_TASKS_TRACE_RCU + tests[2].runstart = j; synchronize_rcu_tasks_trace(); call_rcu_tasks_trace(&tests[2].rh, test_rcu_tasks_callback); #endif @@ -1829,11 +1836,18 @@ static int rcu_tasks_verify_self_tests(void) { int ret = 0; int i; + unsigned long bst = rcu_task_stall_timeout; + if (bst <= 0 || bst > RCU_TASK_BOOT_STALL_TIMEOUT) + bst = RCU_TASK_BOOT_STALL_TIMEOUT; for (i = 0; i < ARRAY_SIZE(tests); i++) { - if (!tests[i].notrun) { // still hanging. - pr_err("%s has been failed.\n", tests[i].name); - ret = -1; + while (tests[i].notrun) { // still hanging. + if (time_after(jiffies, tests[i].runstart + bst)) { + pr_err("%s has failed boot-time tests.\n", tests[i].name); + ret = -1; + break; + } + schedule_timeout_uninterruptible(1); } } -- cgit From e72ee5e1a866b85cb6c3d4c80a1125976020a7e8 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 14 Jun 2022 08:06:20 -0400 Subject: rcu-tasks: Use delayed_work to delay rcu_tasks_verify_self_tests() Commit 2585014188d5 ("rcu-tasks: Be more patient for RCU Tasks boot-time testing") fixes false positive rcu_tasks verification check failure by repeating the test once every second until timeout using schedule_timeout_uninterruptible(). Since rcu_tasks_verify_selft_tests() is called from do_initcalls() as a late_initcall, this has the undesirable side effect of delaying other late_initcall's queued after it by a second or more. Fix this by instead using delayed_work to repeat the verification check. Fixes: 2585014188d5 ("rcu-tasks: Be more patient for RCU Tasks boot-time testing") Signed-off-by: Waiman Long Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 37 ++++++++++++++++++++++++++++++++----- 1 file changed, 32 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index fcbd0ec33c86..83c7e6620d40 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1832,6 +1832,11 @@ static void rcu_tasks_initiate_self_tests(void) #endif } +/* + * Return: 0 - test passed + * 1 - test failed, but have not timed out yet + * -1 - test failed and timed out + */ static int rcu_tasks_verify_self_tests(void) { int ret = 0; @@ -1847,16 +1852,38 @@ static int rcu_tasks_verify_self_tests(void) ret = -1; break; } - schedule_timeout_uninterruptible(1); + ret = 1; + break; } } - - if (ret) - WARN_ON(1); + WARN_ON(ret < 0); return ret; } -late_initcall(rcu_tasks_verify_self_tests); + +/* + * Repeat the rcu_tasks_verify_self_tests() call once every second until the + * test passes or has timed out. 
+ */ +static struct delayed_work rcu_tasks_verify_work; +static void rcu_tasks_verify_work_fn(struct work_struct *work __maybe_unused) +{ + int ret = rcu_tasks_verify_self_tests(); + + if (ret <= 0) + return; + + /* Test fails but not timed out yet, reschedule another check */ + schedule_delayed_work(&rcu_tasks_verify_work, HZ); +} + +static int rcu_tasks_verify_schedule_work(void) +{ + INIT_DELAYED_WORK(&rcu_tasks_verify_work, rcu_tasks_verify_work_fn); + rcu_tasks_verify_work_fn(NULL); + return 0; +} +late_initcall(rcu_tasks_verify_schedule_work); #else /* #ifdef CONFIG_PROVE_RCU */ static void rcu_tasks_initiate_self_tests(void) { } #endif /* #else #ifdef CONFIG_PROVE_RCU */ -- cgit From 14c0017c19ead104511dbcf6a59b83b2456d09af Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Mon, 11 Apr 2022 17:19:03 +0200 Subject: rcu/torture: Change order of warning and trace dump Dumping a big ftrace buffer could lead to a RCU stall. So there is the ftrace buffer and the stall information which needs to be printed. When there is additionally a WARN_ON() which describes the reason for the ftrace buffer dump and the WARN_ON() is executed _after_ ftrace buffer dump, the information get lost in the middle of the RCU stall information. Therefore print WARN_ON() message before dumping the ftrace buffer in rcu_torture_writer(). [ paulmck: Add tracing_off() to avoid cruft from WARN(). ] Signed-off-by: Anna-Maria Behnsen Reviewed-by: Benedikt Spranger Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 7120165a9342..3032dd7c7ad3 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1321,8 +1321,9 @@ rcu_torture_writer(void *arg) if (list_empty(&rcu_tortures[i].rtort_free) && rcu_access_pointer(rcu_torture_current) != &rcu_tortures[i]) { - rcu_ftrace_dump(DUMP_ALL); + tracing_off(); WARN(1, "%s: rtort_pipe_count: %d\n", __func__, rcu_tortures[i].rtort_pipe_count); + rcu_ftrace_dump(DUMP_ALL); } if (stutter_waited) sched_set_normal(current, oldnice); -- cgit From d984114ec23818670c8873939eac81ba6e104ff5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 27 Apr 2022 11:46:02 -0700 Subject: rcutorture: Simplify rcu_torture_read_exit_child() loop The existing loop has an implicit manual loop that obscures the flow and requires an extra control variable. This commit makes this implicit loop explicit, thus saving several lines of code. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 47 ++++++++++++++++++++--------------------------- 1 file changed, 20 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 3032dd7c7ad3..eed32cea938b 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -2874,7 +2874,6 @@ static int rcu_torture_read_exit_child(void *trsp_in) // Parent kthread which creates and destroys read-exit child kthreads. static int rcu_torture_read_exit(void *unused) { - int count = 0; bool errexit = false; int i; struct task_struct *tsp; @@ -2886,34 +2885,28 @@ static int rcu_torture_read_exit(void *unused) // Each pass through this loop does one read-exit episode. do { - if (++count > read_exit_burst) { - VERBOSE_TOROUT_STRING("rcu_torture_read_exit: End of episode"); - rcu_barrier(); // Wait for task_struct free, avoid OOM. 
- for (i = 0; i < read_exit_delay; i++) { - schedule_timeout_uninterruptible(HZ); - if (READ_ONCE(read_exit_child_stop)) - break; + VERBOSE_TOROUT_STRING("rcu_torture_read_exit: Start of episode"); + for (i = 0; i < read_exit_burst; i++) { + if (READ_ONCE(read_exit_child_stop)) + break; + stutter_wait("rcu_torture_read_exit"); + // Spawn child. + tsp = kthread_run(rcu_torture_read_exit_child, + &trs, "%s", "rcu_torture_read_exit_child"); + if (IS_ERR(tsp)) { + TOROUT_ERRSTRING("out of memory"); + errexit = true; + break; } - if (!READ_ONCE(read_exit_child_stop)) - VERBOSE_TOROUT_STRING("rcu_torture_read_exit: Start of episode"); - count = 0; - } - if (READ_ONCE(read_exit_child_stop)) - break; - // Spawn child. - tsp = kthread_run(rcu_torture_read_exit_child, - &trs, "%s", - "rcu_torture_read_exit_child"); - if (IS_ERR(tsp)) { - TOROUT_ERRSTRING("out of memory"); - errexit = true; - tsp = NULL; - break; + cond_resched(); + kthread_stop(tsp); + n_read_exits++; } - cond_resched(); - kthread_stop(tsp); - n_read_exits ++; - stutter_wait("rcu_torture_read_exit"); + VERBOSE_TOROUT_STRING("rcu_torture_read_exit: End of episode"); + rcu_barrier(); // Wait for task_struct free, avoid OOM. + i = 0; + for (; !errexit && !READ_ONCE(read_exit_child_stop) && i < read_exit_delay; i++) + schedule_timeout_uninterruptible(HZ); } while (!errexit && !READ_ONCE(read_exit_child_stop)); // Clean up and exit. -- cgit From 98ea20328786372cbbc90c601be168f5fe1f8845 Mon Sep 17 00:00:00 2001 From: Zqiang Date: Wed, 27 Apr 2022 15:15:20 +0800 Subject: rcutorture: Fix memory leak in rcu_test_debug_objects() The kernel memory leak detector located the following: unreferenced object 0xffff95d941135b50 (size 16): comm "swapper/0", pid 1, jiffies 4294667610 (age 1367.451s) hex dump (first 16 bytes): f0 c6 c2 bd d9 95 ff ff 00 00 00 00 00 00 00 00 ................ backtrace: [<00000000bc81d9b1>] kmem_cache_alloc_trace+0x2f6/0x500 [<00000000d28be229>] rcu_torture_init+0x1235/0x1354 [<0000000032c3acd9>] do_one_initcall+0x51/0x210 [<000000003c117727>] kernel_init_freeable+0x205/0x259 [<000000003961f965>] kernel_init+0x1a/0x120 [<000000001998f890>] ret_from_fork+0x22/0x30 This is caused by the rcu_test_debug_objects() function allocating an rcu_head structure, then failing to free it. This commit therefore adds the needed kfree() after the last use of this structure. Signed-off-by: Zqiang Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index eed32cea938b..62841e9cd268 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -3116,6 +3116,7 @@ static void rcu_test_debug_objects(void) pr_alert("%s: WARN: Duplicate call_rcu() test complete.\n", KBUILD_MODNAME); destroy_rcu_head_on_stack(&rh1); destroy_rcu_head_on_stack(&rh2); + kfree(rhp); #else /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ pr_alert("%s: !CONFIG_DEBUG_OBJECTS_RCU_HEAD, not testing duplicate call_rcu()\n", KBUILD_MODNAME); #endif /* #else #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ -- cgit From 8c0666d320f2fff6bc7cf76422bfbe90c20f53cc Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Fri, 20 May 2022 13:18:16 -0700 Subject: rcutorture: Make failure indication note reader-batch overflow The loop scanning the pipesummary[] array currently skips the last element, which means that the diagnostics ignore those rarest of situations, namely where some readers persist across more than ten grace periods, but all other readers avoid spanning a full grace period. This commit therefore adjusts the scan to include the last element of this array. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 62841e9cd268..7e7c3518ab06 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1869,7 +1869,7 @@ rcu_torture_stats_print(void) batchsummary[i] += READ_ONCE(per_cpu(rcu_torture_batch, cpu)[i]); } } - for (i = RCU_TORTURE_PIPE_LEN - 1; i >= 0; i--) { + for (i = RCU_TORTURE_PIPE_LEN; i >= 0; i--) { if (pipesummary[i] != 0) break; } -- cgit From 92366810644d5675043c792abb70eaf974a77384 Mon Sep 17 00:00:00 2001 From: Zqiang Date: Sat, 21 May 2022 14:56:26 +0800 Subject: rcuscale: Fix smp_processor_id()-in-preemptible warnings Systems built with CONFIG_DEBUG_PREEMPT=y can trigger the following BUG while running the rcuscale performance test: BUG: using smp_processor_id() in preemptible [00000000] code: rcu_scale_write/69 CPU: 0 PID: 66 Comm: rcu_scale_write Not tainted 5.18.0-rc7-next-20220517-yoctodev-standard+ caller is debug_smp_processor_id+0x17/0x20 Call Trace: dump_stack_lvl+0x49/0x5e dump_stack+0x10/0x12 check_preemption_disabled+0xdf/0xf0 debug_smp_processor_id+0x17/0x20 rcu_scale_writer+0x2b5/0x580 kthread+0x177/0x1b0 ret_from_fork+0x22/0x30 Reproduction method: runqemu kvm slirp nographic qemuparams="-m 4096 -smp 8" bootparams="isolcpus=2,3 nohz_full=2,3 rcu_nocbs=2,3 rcutree.dump_tree=1 rcuscale.shutdown=false rcuscale.gp_async=true" -d The problem is that the rcu_scale_writer() kthreads fail to set the PF_NO_SETAFFINITY flags, which causes is_percpu_thread() to assume that the kthread's affinity might change at any time, thus the BUG noted above. This commit therefore causes rcu_scale_writer() to set PF_NO_SETAFFINITY in its kthread's ->flags field, thus preventing this BUG. Signed-off-by: Zqiang Signed-off-by: Paul E. McKenney --- kernel/rcu/rcuscale.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index 277a5bfb37d4..3ef02d4a8108 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -419,6 +419,7 @@ rcu_scale_writer(void *arg) VERBOSE_SCALEOUT_STRING("rcu_scale_writer task started"); WARN_ON(!wdpp); set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); + current->flags |= PF_NO_SETAFFINITY; sched_set_fifo_low(current); if (holdoff) -- cgit From 3002153a91a9732a6d1d0bb95138593c7da15743 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 10 Jun 2022 15:03:57 +0200 Subject: rcutorture: Fix ksoftirqd boosting timing and iteration The RCU priority boosting can fail in two situations: 1) If (nr_cpus= > maxcpus=), which means if the total number of CPUs is higher than those brought online at boot, then torture_onoff() may later bring up CPUs that weren't online on boot. Now since rcutorture initialization only boosts the ksoftirqds of the CPUs that have been set online on boot, the CPUs later set online by torture_onoff won't benefit from the boost, making RCU priority boosting fail. 
2) The ksoftirqd kthreads are boosted after the creation of rcu_torture_boost() kthreads, which opens a window large enough for these rcu_torture_boost() kthreads to wait (despite running at FIFO priority) for ksoftirqds that are still running at SCHED_NORMAL priority. The issues can trigger for example with: ./kvm.sh --configs TREE01 --kconfig "CONFIG_RCU_BOOST=y" [ 34.968561] rcu-torture: !!! [ 34.968627] ------------[ cut here ]------------ [ 35.014054] WARNING: CPU: 4 PID: 114 at kernel/rcu/rcutorture.c:1979 rcu_torture_stats_print+0x5ad/0x610 [ 35.052043] Modules linked in: [ 35.069138] CPU: 4 PID: 114 Comm: rcu_torture_sta Not tainted 5.18.0-rc1 #1 [ 35.096424] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.14.0-0-g155821a-rebuilt.opensuse.org 04/01/2014 [ 35.154570] RIP: 0010:rcu_torture_stats_print+0x5ad/0x610 [ 35.198527] Code: 63 1b 02 00 74 02 0f 0b 48 83 3d 35 63 1b 02 00 74 02 0f 0b 48 83 3d 21 63 1b 02 00 74 02 0f 0b 48 83 3d 0d 63 1b 02 00 74 02 <0f> 0b 83 eb 01 0f 8e ba fc ff ff 0f 0b e9 b3 fc ff f82 [ 37.251049] RSP: 0000:ffffa92a0050bdf8 EFLAGS: 00010202 [ 37.277320] rcu: De-offloading 8 [ 37.290367] RAX: 0000000000000000 RBX: 0000000000000001 RCX: 0000000000000001 [ 37.290387] RDX: 0000000000000000 RSI: 00000000ffffbfff RDI: 00000000ffffffff [ 37.290398] RBP: 000000000000007b R08: 0000000000000000 R09: c0000000ffffbfff [ 37.290407] R10: 000000000000002a R11: ffffa92a0050bc18 R12: ffffa92a0050be20 [ 37.290417] R13: ffffa92a0050be78 R14: 0000000000000000 R15: 000000000001bea0 [ 37.290427] FS: 0000000000000000(0000) GS:ffff96045eb00000(0000) knlGS:0000000000000000 [ 37.290448] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 37.290460] CR2: 0000000000000000 CR3: 000000001dc0c000 CR4: 00000000000006e0 [ 37.290470] Call Trace: [ 37.295049] [ 37.295065] ? preempt_count_add+0x63/0x90 [ 37.295095] ? _raw_spin_lock_irqsave+0x12/0x40 [ 37.295125] ? rcu_torture_stats_print+0x610/0x610 [ 37.295143] rcu_torture_stats+0x29/0x70 [ 37.295160] kthread+0xe3/0x110 [ 37.295176] ? kthread_complete_and_exit+0x20/0x20 [ 37.295193] ret_from_fork+0x22/0x30 [ 37.295218] Fix this with boosting the ksoftirqds kthreads from the boosting hotplug callback itself and before the boosting kthreads are created. Fixes: ea6d962e80b6 ("rcutorture: Judge RCU priority boosting on grace periods, not callbacks") Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 7e7c3518ab06..6ba1cf8b3b71 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -2076,6 +2076,19 @@ static int rcutorture_booster_init(unsigned int cpu) if (boost_tasks[cpu] != NULL) return 0; /* Already created, nothing more to do. */ + // Testing RCU priority boosting requires rcutorture do + // some serious abuse. Counter this by running ksoftirqd + // at higher priority. + if (IS_BUILTIN(CONFIG_RCU_TORTURE_TEST)) { + struct sched_param sp; + struct task_struct *t; + + t = per_cpu(ksoftirqd, cpu); + WARN_ON_ONCE(!t); + sp.sched_priority = 2; + sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); + } + /* Don't allow time recalculation while creating a new task. 
*/ mutex_lock(&boost_mutex); rcu_torture_disable_rt_throttle(); @@ -3324,21 +3337,6 @@ rcu_torture_init(void) rcutor_hp = firsterr; if (torture_init_error(firsterr)) goto unwind; - - // Testing RCU priority boosting requires rcutorture do - // some serious abuse. Counter this by running ksoftirqd - // at higher priority. - if (IS_BUILTIN(CONFIG_RCU_TORTURE_TEST)) { - for_each_online_cpu(cpu) { - struct sched_param sp; - struct task_struct *t; - - t = per_cpu(ksoftirqd, cpu); - WARN_ON_ONCE(!t); - sp.sched_priority = 2; - sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); - } - } } shutdown_jiffies = jiffies + shutdown_secs * HZ; firsterr = torture_shutdown_init(shutdown_secs, rcu_torture_cleanup); -- cgit From 1a5ca5e09811dec9472f50b98eeb3e3b2442d050 Mon Sep 17 00:00:00 2001 From: Li Qiong Date: Sun, 12 Jun 2022 14:48:25 +0800 Subject: rcutorture: Handle failure of memory allocation functions This commit adds warnings for allocation failure during the mem_dump_obj() tests. It also terminates these tests upon such failure. Signed-off-by: Li Qiong Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 6ba1cf8b3b71..2ba74498d36d 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1991,7 +1991,13 @@ static void rcu_torture_mem_dump_obj(void) static int z; kcp = kmem_cache_create("rcuscale", 136, 8, SLAB_STORE_USER, NULL); + if (WARN_ON_ONCE(!kcp)) + return; rhp = kmem_cache_alloc(kcp, GFP_KERNEL); + if (WARN_ON_ONCE(!rhp)) { + kmem_cache_destroy(kcp); + return; + } pr_alert("mem_dump_obj() slab test: rcu_torture_stats = %px, &rhp = %px, rhp = %px, &z = %px\n", stats_task, &rhp, rhp, &z); pr_alert("mem_dump_obj(ZERO_SIZE_PTR):"); mem_dump_obj(ZERO_SIZE_PTR); @@ -2008,6 +2014,8 @@ static void rcu_torture_mem_dump_obj(void) kmem_cache_free(kcp, rhp); kmem_cache_destroy(kcp); rhp = kmalloc(sizeof(*rhp), GFP_KERNEL); + if (WARN_ON_ONCE(!rhp)) + return; pr_alert("mem_dump_obj() kmalloc test: rcu_torture_stats = %px, &rhp = %px, rhp = %px\n", stats_task, &rhp, rhp); pr_alert("mem_dump_obj(kmalloc %px):", rhp); mem_dump_obj(rhp); @@ -2015,6 +2023,8 @@ static void rcu_torture_mem_dump_obj(void) mem_dump_obj(&rhp->func); kfree(rhp); rhp = vmalloc(4096); + if (WARN_ON_ONCE(!rhp)) + return; pr_alert("mem_dump_obj() vmalloc test: rcu_torture_stats = %px, &rhp = %px, rhp = %px\n", stats_task, &rhp, rhp); pr_alert("mem_dump_obj(vmalloc %px):", rhp); mem_dump_obj(rhp); -- cgit From 7bf336fb8dac718febb7bf4fe79e1be0c5e4a631 Mon Sep 17 00:00:00 2001 From: Zqiang Date: Sun, 12 Jun 2022 10:02:25 +0800 Subject: refscale: Convert test_lock spinlock to raw_spinlock In kernels built with CONFIG_PREEMPT_RT=y, spinlocks are replaced by rt_mutex, which can sleep. This means that acquiring a non-raw spinlock in a critical section where preemption is disabled can trigger the following BUG: BUG: scheduling while atomic: ref_scale_reade/76/0x00000002 Preemption disabled at: ref_lock_section+0x16/0x80 Call Trace: dump_stack_lvl+0x5b/0x82 dump_stack+0x10/0x12 __schedule_bug.cold+0x9c/0xad __schedule+0x839/0xc00 schedule_rtlock+0x22/0x40 rtlock_slowlock_locked+0x460/0x1350 rt_spin_lock+0x61/0xe0 ref_lock_section+0x29/0x80 rcu_scale_one_reader+0x52/0x60 ref_scale_reader+0x28d/0x490 kthread+0x128/0x150 ret_from_fork+0x22/0x30 This commit therefore converts spinlock to raw_spinlock. Signed-off-by: Zqiang Signed-off-by: Paul E. 
McKenney --- kernel/rcu/refscale.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index 909644abee67..435c884c02b5 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -385,7 +385,7 @@ static struct ref_scale_ops rwsem_ops = { }; // Definitions for global spinlock -static DEFINE_SPINLOCK(test_lock); +static DEFINE_RAW_SPINLOCK(test_lock); static void ref_lock_section(const int nloops) { @@ -393,8 +393,8 @@ static void ref_lock_section(const int nloops) preempt_disable(); for (i = nloops; i >= 0; i--) { - spin_lock(&test_lock); - spin_unlock(&test_lock); + raw_spin_lock(&test_lock); + raw_spin_unlock(&test_lock); } preempt_enable(); } @@ -405,9 +405,9 @@ static void ref_lock_delay_section(const int nloops, const int udl, const int nd preempt_disable(); for (i = nloops; i >= 0; i--) { - spin_lock(&test_lock); + raw_spin_lock(&test_lock); un_delay(udl, ndl); - spin_unlock(&test_lock); + raw_spin_unlock(&test_lock); } preempt_enable(); } @@ -427,8 +427,8 @@ static void ref_lock_irq_section(const int nloops) preempt_disable(); for (i = nloops; i >= 0; i--) { - spin_lock_irqsave(&test_lock, flags); - spin_unlock_irqrestore(&test_lock, flags); + raw_spin_lock_irqsave(&test_lock, flags); + raw_spin_unlock_irqrestore(&test_lock, flags); } preempt_enable(); } @@ -440,9 +440,9 @@ static void ref_lock_irq_delay_section(const int nloops, const int udl, const in preempt_disable(); for (i = nloops; i >= 0; i--) { - spin_lock_irqsave(&test_lock, flags); + raw_spin_lock_irqsave(&test_lock, flags); un_delay(udl, ndl); - spin_unlock_irqrestore(&test_lock, flags); + raw_spin_unlock_irqrestore(&test_lock, flags); } preempt_enable(); } -- cgit From 0ffc781a19ed3030c792ad1a0e44e6e047bb9adc Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 8 Jun 2022 16:40:20 +0200 Subject: context_tracking: Rename __context_tracking_enter/exit() to __ct_user_enter/exit() The context tracking namespace is going to expand and some new functions will require even longer names. Start shrinking the context_tracking prefix to "ct" as is already the case for some existing macros, this will make the introduction of new functions easier. Acked-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Neeraj Upadhyay Cc: Uladzislau Rezki Cc: Joel Fernandes Cc: Boqun Feng Cc: Nicolas Saenz Julienne Cc: Marcelo Tosatti Cc: Xiongfeng Wang Cc: Yu Liao Cc: Phil Auld Cc: Paul Gortmaker Cc: Alex Belits Signed-off-by: Paul E. McKenney Reviewed-by: Nicolas Saenz Julienne Tested-by: Nicolas Saenz Julienne --- kernel/context_tracking.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 3082332f6476..88c60ab39fbb 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -51,15 +51,15 @@ static __always_inline void context_tracking_recursion_exit(void) } /** - * context_tracking_enter - Inform the context tracking that the CPU is going - * enter user or guest space mode. + * __ct_user_enter - Inform the context tracking that the CPU is going + * to enter user or guest space mode. * * This function must be called right before we switch from the kernel * to user or guest space, when it's guaranteed the remaining kernel * instructions to execute won't use any RCU read side critical section * because this function sets RCU in extended quiescent state. 
*/ -void noinstr __context_tracking_enter(enum ctx_state state) +void noinstr __ct_user_enter(enum ctx_state state) { /* Kernel threads aren't supposed to go to userspace */ WARN_ON_ONCE(!current->mm); @@ -101,7 +101,7 @@ void noinstr __context_tracking_enter(enum ctx_state state) } context_tracking_recursion_exit(); } -EXPORT_SYMBOL_GPL(__context_tracking_enter); +EXPORT_SYMBOL_GPL(__ct_user_enter); /* * OBSOLETE: @@ -128,7 +128,7 @@ void context_tracking_enter(enum ctx_state state) return; local_irq_save(flags); - __context_tracking_enter(state); + __ct_user_enter(state); local_irq_restore(flags); } NOKPROBE_SYMBOL(context_tracking_enter); @@ -149,8 +149,8 @@ void context_tracking_user_enter(void) NOKPROBE_SYMBOL(context_tracking_user_enter); /** - * context_tracking_exit - Inform the context tracking that the CPU is - * exiting user or guest mode and entering the kernel. + * __ct_user_exit - Inform the context tracking that the CPU is + * exiting user or guest mode and entering the kernel. * * This function must be called after we entered the kernel from user or * guest space before any use of RCU read side critical section. This @@ -160,7 +160,7 @@ NOKPROBE_SYMBOL(context_tracking_user_enter); * This call supports re-entrancy. This way it can be called from any exception * handler without needing to know if we came from userspace or not. */ -void noinstr __context_tracking_exit(enum ctx_state state) +void noinstr __ct_user_exit(enum ctx_state state) { if (!context_tracking_recursion_enter()) return; @@ -183,7 +183,7 @@ void noinstr __context_tracking_exit(enum ctx_state state) } context_tracking_recursion_exit(); } -EXPORT_SYMBOL_GPL(__context_tracking_exit); +EXPORT_SYMBOL_GPL(__ct_user_exit); /* * OBSOLETE: @@ -202,7 +202,7 @@ void context_tracking_exit(enum ctx_state state) return; local_irq_save(flags); - __context_tracking_exit(state); + __ct_user_exit(state); local_irq_restore(flags); } NOKPROBE_SYMBOL(context_tracking_exit); -- cgit From 466298c65678e7a2bd7bd268fd3776de73e96a4b Mon Sep 17 00:00:00 2001 From: Dongli Zhang Date: Sat, 11 Jun 2022 01:25:12 -0700 Subject: swiotlb: remove a useless return in swiotlb_init Both swiotlb_init_remap() and swiotlb_init() have return type void. Signed-off-by: Dongli Zhang Signed-off-by: Christoph Hellwig --- kernel/dma/swiotlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index cb50f8d38360..fd21f4162f4b 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -282,7 +282,7 @@ retry: void __init swiotlb_init(bool addressing_limit, unsigned int flags) { - return swiotlb_init_remap(addressing_limit, flags, NULL); + swiotlb_init_remap(addressing_limit, flags, NULL); } /* -- cgit From 0bf28fc40d89b1a3e00d1b79473bad4e9ca20ad1 Mon Sep 17 00:00:00 2001 From: Dongli Zhang Date: Sat, 11 Jun 2022 01:25:14 -0700 Subject: swiotlb: panic if nslabs is too small Panic on purpose if nslabs is too small, in order to sync with the remap retry logic. In addition, print the number of bytes for tlb alloc failure. 
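For readers skimming the archive, here is a hedged sketch of the pattern the two hunks below introduce: validate the slab count before touching the allocator, since the remap retry path is (as I read it) never meant to shrink below IO_TLB_MIN_SLABS anyway, and report the requested byte count when the bounce-buffer allocation fails. Only the panic()/pr_warn() shape and the memblock_alloc_low() call mirror the patch; my_pool_init() and MIN_SLABS are made-up names standing in for the real swiotlb_init_remap() and IO_TLB_MIN_SLABS.

#define MIN_SLABS 512UL	/* arbitrary stand-in for IO_TLB_MIN_SLABS */

/* Illustrative only; the real check lives inside swiotlb_init_remap(). */
static void __init my_pool_init(unsigned long nslabs, size_t slab_size)
{
	size_t bytes = nslabs * slab_size;
	void *tlb;

	/* Fail loudly up front instead of limping along with a pool the
	 * retry logic would never have accepted.
	 */
	if (nslabs < MIN_SLABS)
		panic("%s: nslabs = %lu too small\n", __func__, nslabs);

	tlb = memblock_alloc_low(bytes, PAGE_SIZE);
	if (!tlb) {
		/* Say how much was asked for, not just that it failed. */
		pr_warn("%s: Failed to allocate %zu bytes tlb structure\n",
			__func__, bytes);
		return;
	}
	/* ... remap-and-retry and bookkeeping elided ... */
}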
Signed-off-by: Dongli Zhang Signed-off-by: Christoph Hellwig --- kernel/dma/swiotlb.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index fd21f4162f4b..1758b724c7a8 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -242,6 +242,9 @@ void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags, if (swiotlb_force_disable) return; + if (nslabs < IO_TLB_MIN_SLABS) + panic("%s: nslabs = %lu too small\n", __func__, nslabs); + /* * By default allocate the bounce buffer memory from low memory, but * allow to pick a location everywhere for hypervisors with guest @@ -254,7 +257,8 @@ retry: else tlb = memblock_alloc_low(bytes, PAGE_SIZE); if (!tlb) { - pr_warn("%s: failed to allocate tlb structure\n", __func__); + pr_warn("%s: Failed to allocate %zu bytes tlb structure\n", + __func__, bytes); return; } -- cgit From 9f0265e921dee14096943ee11f793fa076aa7a72 Mon Sep 17 00:00:00 2001 From: Jörn-Thorben Hinz Date: Wed, 22 Jun 2022 21:12:24 +0200 Subject: bpf: Require only one of cong_avoid() and cong_control() from a TCP CC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove the check for required and optional functions in a struct tcp_congestion_ops from bpf_tcp_ca.c. Rely on tcp_register_congestion_control() to reject a BPF CC that does not implement all required functions, as it will do for a non-BPF CC. When a CC implements tcp_congestion_ops.cong_control(), the alternate cong_avoid() is not in use in the TCP stack. Previously, a BPF CC was still forced to implement cong_avoid() as a no-op since it was non-optional in bpf_tcp_ca.c. Signed-off-by: Jörn-Thorben Hinz Reviewed-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20220622191227.898118-3-jthinz@mailbox.tu-berlin.de Signed-off-by: Alexei Starovoitov --- kernel/bpf/bpf_struct_ops.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index d9a3c9207240..7e0068c3399c 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -503,10 +503,9 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, goto unlock; } - /* Error during st_ops->reg(). It is very unlikely since - * the above init_member() should have caught it earlier - * before reg(). The only possibility is if there was a race - * in registering the struct_ops (under the same name) to + /* Error during st_ops->reg(). Can happen if this struct_ops needs to be + * verified as a whole, after all init_member() calls. Can also happen if + * there was a race in registering the struct_ops (under the same name) to * a sub-system through different struct_ops's maps. */ set_memory_nx((long)st_map->image, 1); -- cgit From 395e942d34a25824457da379baf434b5d6da4dcc Mon Sep 17 00:00:00 2001 From: Simon Wang Date: Tue, 21 Jun 2022 23:19:23 -0400 Subject: bpf: Replace hard-coded 0 with BPF_K in check_alu_op Enhance readability a bit. 
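Since the one-line change below reads as purely cosmetic, a brief reminder of the opcode layout it relies on. The macro values are quoted from memory from include/uapi/linux/bpf_common.h; verify against your headers before depending on the exact masks.

/* Source-operand selector bit inside insn->code (values from memory): */
#define BPF_SRC(code)	((code) & 0x08)
#define BPF_K		0x00	/* operand is the 32-bit immediate */
#define BPF_X		0x08	/* operand is the source register  */

/*
 * BPF_NEG takes no source operand, so the verifier insists the selector
 * stays in its immediate form.  Spelling the check as
 *
 *	if (BPF_SRC(insn->code) != BPF_K || ...)
 *
 * names the field being tested, whereas "!= 0" only happens to work
 * because BPF_K is defined as 0.
 */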
Signed-off-by: Simon Wang Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20220622031923.65692-1-wangchuanguo@inspur.com --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a20d7736a5b2..f228141c01c5 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9096,7 +9096,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) if (opcode == BPF_END || opcode == BPF_NEG) { if (opcode == BPF_NEG) { - if (BPF_SRC(insn->code) != 0 || + if (BPF_SRC(insn->code) != BPF_K || insn->src_reg != BPF_REG_0 || insn->off != 0 || insn->imm != 0) { verbose(env, "BPF_NEG uses reserved fields\n"); -- cgit From fb4e3b33e3e7f13befdf9ee232e34818c6cc5fb9 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Fri, 24 Jun 2022 05:06:12 +0300 Subject: bpf: Fix for use-after-free bug in inline_bpf_loop As reported by Dan Carpenter, the following statements in inline_bpf_loop() might cause a use-after-free bug: struct bpf_prog *new_prog; // ... new_prog = bpf_patch_insn_data(env, position, insn_buf, *cnt); // ... env->prog->insnsi[call_insn_offset].imm = callback_offset; The bpf_patch_insn_data() might free the memory used by env->prog. Fixes: 1ade23711971 ("bpf: Inline calls to bpf_loop when callback is known") Reported-by: Dan Carpenter Signed-off-by: Eduard Zingerman Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220624020613.548108-2-eddyz87@gmail.com --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f228141c01c5..4938477912cd 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -14417,7 +14417,7 @@ static struct bpf_prog *inline_bpf_loop(struct bpf_verifier_env *env, /* Note: insn_buf[12] is an offset of BPF_CALL_REL instruction */ call_insn_offset = position + 12; callback_offset = callback_start - call_insn_offset - 1; - env->prog->insnsi[call_insn_offset].imm = callback_offset; + new_prog->insnsi[call_insn_offset].imm = callback_offset; return new_prog; } -- cgit From e61c451476e61450f6771ce03bbc01210a09be16 Mon Sep 17 00:00:00 2001 From: Mark-PK Tsai Date: Fri, 22 Apr 2022 14:24:35 +0800 Subject: dma-mapping: Add dma_release_coherent_memory to DMA API Add dma_release_coherent_memory to DMA API to allow dma user call it to release dev->dma_mem when the device is removed. 
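Because the commit message only states the intent, here is a hedged sketch of the kind of caller this enables: a driver that declared a per-device coherent pool at probe time and wants dev->dma_mem dropped cleanly on removal. The driver, addresses, sizes, and header choice are illustrative; only dma_release_coherent_memory(dev) itself comes from this patch, and dma_declare_coherent_memory() is used with the signature visible further down in the diff.

/* Hypothetical platform driver; header may be <linux/dma-mapping.h> or
 * <linux/dma-map-ops.h> depending on the tree, so check before copying.
 */
#include <linux/platform_device.h>
#include <linux/dma-map-ops.h>
#include <linux/sizes.h>

static int my_probe(struct platform_device *pdev)
{
	int ret;

	/* Carve a device-private coherent pool out of a fixed SRAM window. */
	ret = dma_declare_coherent_memory(&pdev->dev, 0x38000000 /* phys */,
					  0x38000000 /* device address */,
					  SZ_1M);
	if (ret)
		return ret;

	/* ... dma_alloc_coherent() on &pdev->dev now draws from this pool ... */
	return 0;
}

static int my_remove(struct platform_device *pdev)
{
	/* ... free any outstanding coherent allocations first ... */

	/* New in this patch: release dev->dma_mem when the device goes away. */
	dma_release_coherent_memory(&pdev->dev);
	return 0;
}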
Signed-off-by: Mark-PK Tsai Acked-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220422062436.14384-2-mark-pk.tsai@mediatek.com Signed-off-by: Mathieu Poirier --- kernel/dma/coherent.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/coherent.c b/kernel/dma/coherent.c index 375fb3c9538d..c21abc77c53e 100644 --- a/kernel/dma/coherent.c +++ b/kernel/dma/coherent.c @@ -74,7 +74,7 @@ out_unmap_membase: return ERR_PTR(-ENOMEM); } -static void dma_release_coherent_memory(struct dma_coherent_mem *mem) +static void _dma_release_coherent_memory(struct dma_coherent_mem *mem) { if (!mem) return; @@ -126,10 +126,16 @@ int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr, ret = dma_assign_coherent_memory(dev, mem); if (ret) - dma_release_coherent_memory(mem); + _dma_release_coherent_memory(mem); return ret; } +void dma_release_coherent_memory(struct device *dev) +{ + if (dev) + _dma_release_coherent_memory(dev->dma_mem); +} + static void *__dma_alloc_from_coherent(struct device *dev, struct dma_coherent_mem *mem, ssize_t size, dma_addr_t *dma_handle) -- cgit From fd75733da2f376c0c8c6513c3cb2ac227082ec5c Mon Sep 17 00:00:00 2001 From: Daniel Müller Date: Thu, 23 Jun 2022 18:29:34 +0000 Subject: bpf: Merge "types_are_compat" logic into relo_core.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BPF type compatibility checks (bpf_core_types_are_compat()) are currently duplicated between kernel and user space. That's a historical artifact more than intentional doing and can lead to subtle bugs where one implementation is adjusted but another is forgotten. That happened with the enum64 work, for example, where the libbpf side was changed (commit 23b2a3a8f63a ("libbpf: Add enum64 relocation support")) to use the btf_kind_core_compat() helper function but the kernel side was not (commit 6089fb325cf7 ("bpf: Add btf enum64 support")). This patch addresses both the duplication issue, by merging both implementations and moving them into relo_core.c, and fixes the alluded to kind check (by giving preference to libbpf's already adjusted logic). 
For discussion of the topic, please refer to: https://lore.kernel.org/bpf/CAADnVQKbWR7oarBdewgOBZUPzryhRYvEbkhyPJQHHuxq=0K1gw@mail.gmail.com/T/#mcc99f4a33ad9a322afaf1b9276fb1f0b7add9665 Changelog: v1 -> v2: - limited libbpf recursion limit to 32 - changed name to __bpf_core_types_are_compat - included warning previously present in libbpf version - merged kernel and user space changes into a single patch Signed-off-by: Daniel Müller Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220623182934.2582827-1-deso@posteo.net --- kernel/bpf/btf.c | 84 +------------------------------------------------------- 1 file changed, 1 insertion(+), 83 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index f08037c31dd7..2e2066d6af94 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -7416,87 +7416,6 @@ EXPORT_SYMBOL_GPL(register_btf_id_dtor_kfuncs); #define MAX_TYPES_ARE_COMPAT_DEPTH 2 -static -int __bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id, - const struct btf *targ_btf, __u32 targ_id, - int level) -{ - const struct btf_type *local_type, *targ_type; - int depth = 32; /* max recursion depth */ - - /* caller made sure that names match (ignoring flavor suffix) */ - local_type = btf_type_by_id(local_btf, local_id); - targ_type = btf_type_by_id(targ_btf, targ_id); - if (btf_kind(local_type) != btf_kind(targ_type)) - return 0; - -recur: - depth--; - if (depth < 0) - return -EINVAL; - - local_type = btf_type_skip_modifiers(local_btf, local_id, &local_id); - targ_type = btf_type_skip_modifiers(targ_btf, targ_id, &targ_id); - if (!local_type || !targ_type) - return -EINVAL; - - if (btf_kind(local_type) != btf_kind(targ_type)) - return 0; - - switch (btf_kind(local_type)) { - case BTF_KIND_UNKN: - case BTF_KIND_STRUCT: - case BTF_KIND_UNION: - case BTF_KIND_ENUM: - case BTF_KIND_FWD: - case BTF_KIND_ENUM64: - return 1; - case BTF_KIND_INT: - /* just reject deprecated bitfield-like integers; all other - * integers are by default compatible between each other - */ - return btf_int_offset(local_type) == 0 && btf_int_offset(targ_type) == 0; - case BTF_KIND_PTR: - local_id = local_type->type; - targ_id = targ_type->type; - goto recur; - case BTF_KIND_ARRAY: - local_id = btf_array(local_type)->type; - targ_id = btf_array(targ_type)->type; - goto recur; - case BTF_KIND_FUNC_PROTO: { - struct btf_param *local_p = btf_params(local_type); - struct btf_param *targ_p = btf_params(targ_type); - __u16 local_vlen = btf_vlen(local_type); - __u16 targ_vlen = btf_vlen(targ_type); - int i, err; - - if (local_vlen != targ_vlen) - return 0; - - for (i = 0; i < local_vlen; i++, local_p++, targ_p++) { - if (level <= 0) - return -EINVAL; - - btf_type_skip_modifiers(local_btf, local_p->type, &local_id); - btf_type_skip_modifiers(targ_btf, targ_p->type, &targ_id); - err = __bpf_core_types_are_compat(local_btf, local_id, - targ_btf, targ_id, - level - 1); - if (err <= 0) - return err; - } - - /* tail recurse for return type check */ - btf_type_skip_modifiers(local_btf, local_type->type, &local_id); - btf_type_skip_modifiers(targ_btf, targ_type->type, &targ_id); - goto recur; - } - default: - return 0; - } -} - /* Check local and target types for compatibility. This check is used for * type-based CO-RE relocations and follow slightly different rules than * field-based relocations. 
This function assumes that root types were already @@ -7519,8 +7438,7 @@ recur: int bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id, const struct btf *targ_btf, __u32 targ_id) { - return __bpf_core_types_are_compat(local_btf, local_id, - targ_btf, targ_id, + return __bpf_core_types_are_compat(local_btf, local_id, targ_btf, targ_id, MAX_TYPES_ARE_COMPAT_DEPTH); } -- cgit From d75cd55ae2dedeee5382bb48832c322673b9781c Mon Sep 17 00:00:00 2001 From: Lin Feng Date: Tue, 21 Jun 2022 17:23:58 +0800 Subject: cgroup.c: remove redundant check for mixable cgroup in cgroup_migrate_vet_dst MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We have: int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp) { ... /* mixables don't care */ if (cgroup_is_mixable(dst_cgrp)) return 0; /* * If @dst_cgrp is already or can become a thread root or is * threaded, it doesn't matter. */ if (cgroup_can_be_thread_root(dst_cgrp) || cgroup_is_threaded(dst_cgrp)) return 0; ... } but in fact the entry of cgroup_can_be_thread_root() covers case that checking cgroup_is_mixable() as following: static bool cgroup_can_be_thread_root(struct cgroup *cgrp) { /* mixables don't care */ if (cgroup_is_mixable(cgrp)) return true; ... } so explicitly checking in cgroup_migrate_vet_dst is unnecessary. Signed-off-by: Lin Feng Reviewed-by: Michal Koutný Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 4b67e6da6bf2..64e0f644adfa 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -2566,10 +2566,6 @@ int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp) if (!cgroup_is_valid_domain(dst_cgrp->dom_cgrp)) return -EOPNOTSUPP; - /* mixables don't care */ - if (cgroup_is_mixable(dst_cgrp)) - return 0; - /* * If @dst_cgrp is already or can become a thread root or is * threaded, it doesn't matter. -- cgit From cc5c516df028b221d94c65c47c5ae8d20f61b6f9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 28 Jun 2022 19:18:45 +0200 Subject: block: simplify blktrace sysfs attribute creation Add the trace attributes to the default gendisk attributes, just like we already do for partitions. Signed-off-by: Christoph Hellwig Reviewed-by: Bart Van Assche Link: https://lore.kernel.org/r/20220628171850.1313069-2-hch@lst.de Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index fe04c6f96ca5..c584effe5fe9 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1867,17 +1867,6 @@ out_unlock_bdev: out: return ret ? ret : count; } - -int blk_trace_init_sysfs(struct device *dev) -{ - return sysfs_create_group(&dev->kobj, &blk_trace_attr_group); -} - -void blk_trace_remove_sysfs(struct device *dev) -{ - sysfs_remove_group(&dev->kobj, &blk_trace_attr_group); -} - #endif /* CONFIG_BLK_DEV_IO_TRACE */ #ifdef CONFIG_EVENT_TRACING -- cgit From af3f4134006bf3bf894179c08aaf98ed5938cf4e Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 28 Jun 2022 10:43:04 -0700 Subject: bpf: add bpf_func_t and trampoline helpers I'll be adding lsm cgroup specific helpers that grab trampoline mutex. No functional changes. 
Reviewed-by: Martin KaFai Lau Signed-off-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20220628174314.1216643-2-sdf@google.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/trampoline.c | 63 ++++++++++++++++++++++++++----------------------- 1 file changed, 33 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 93c7675f0c9e..5466e15be61f 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -410,7 +410,7 @@ static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(struct bpf_prog *prog) } } -int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr) +static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr) { enum bpf_tramp_prog_type kind; struct bpf_tramp_link *link_exiting; @@ -418,44 +418,33 @@ int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline int cnt = 0, i; kind = bpf_attach_type_to_tramp(link->link.prog); - mutex_lock(&tr->mutex); - if (tr->extension_prog) { + if (tr->extension_prog) /* cannot attach fentry/fexit if extension prog is attached. * cannot overwrite extension prog either. */ - err = -EBUSY; - goto out; - } + return -EBUSY; for (i = 0; i < BPF_TRAMP_MAX; i++) cnt += tr->progs_cnt[i]; if (kind == BPF_TRAMP_REPLACE) { /* Cannot attach extension if fentry/fexit are in use. */ - if (cnt) { - err = -EBUSY; - goto out; - } + if (cnt) + return -EBUSY; tr->extension_prog = link->link.prog; - err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, NULL, - link->link.prog->bpf_func); - goto out; - } - if (cnt >= BPF_MAX_TRAMP_LINKS) { - err = -E2BIG; - goto out; + return bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, NULL, + link->link.prog->bpf_func); } - if (!hlist_unhashed(&link->tramp_hlist)) { + if (cnt >= BPF_MAX_TRAMP_LINKS) + return -E2BIG; + if (!hlist_unhashed(&link->tramp_hlist)) /* prog already linked */ - err = -EBUSY; - goto out; - } + return -EBUSY; hlist_for_each_entry(link_exiting, &tr->progs_hlist[kind], tramp_hlist) { if (link_exiting->link.prog != link->link.prog) continue; /* prog already linked */ - err = -EBUSY; - goto out; + return -EBUSY; } hlist_add_head(&link->tramp_hlist, &tr->progs_hlist[kind]); @@ -465,30 +454,44 @@ int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline hlist_del_init(&link->tramp_hlist); tr->progs_cnt[kind]--; } -out: + return err; +} + +int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr) +{ + int err; + + mutex_lock(&tr->mutex); + err = __bpf_trampoline_link_prog(link, tr); mutex_unlock(&tr->mutex); return err; } -/* bpf_trampoline_unlink_prog() should never fail. */ -int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr) +static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr) { enum bpf_tramp_prog_type kind; int err; kind = bpf_attach_type_to_tramp(link->link.prog); - mutex_lock(&tr->mutex); if (kind == BPF_TRAMP_REPLACE) { WARN_ON_ONCE(!tr->extension_prog); err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, tr->extension_prog->bpf_func, NULL); tr->extension_prog = NULL; - goto out; + return err; } hlist_del_init(&link->tramp_hlist); tr->progs_cnt[kind]--; - err = bpf_trampoline_update(tr); -out: + return bpf_trampoline_update(tr); +} + +/* bpf_trampoline_unlink_prog() should never fail. 
*/ +int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr) +{ + int err; + + mutex_lock(&tr->mutex); + err = __bpf_trampoline_unlink_prog(link, tr); mutex_unlock(&tr->mutex); return err; } -- cgit From 00442143a2ab7f1da46fbf4d2a99c85df767d49a Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 28 Jun 2022 10:43:05 -0700 Subject: bpf: convert cgroup_bpf.progs to hlist This lets us reclaim some space to be used by new cgroup lsm slots. Before: struct cgroup_bpf { struct bpf_prog_array * effective[23]; /* 0 184 */ /* --- cacheline 2 boundary (128 bytes) was 56 bytes ago --- */ struct list_head progs[23]; /* 184 368 */ /* --- cacheline 8 boundary (512 bytes) was 40 bytes ago --- */ u32 flags[23]; /* 552 92 */ /* XXX 4 bytes hole, try to pack */ /* --- cacheline 10 boundary (640 bytes) was 8 bytes ago --- */ struct list_head storages; /* 648 16 */ struct bpf_prog_array * inactive; /* 664 8 */ struct percpu_ref refcnt; /* 672 16 */ struct work_struct release_work; /* 688 32 */ /* size: 720, cachelines: 12, members: 7 */ /* sum members: 716, holes: 1, sum holes: 4 */ /* last cacheline: 16 bytes */ }; After: struct cgroup_bpf { struct bpf_prog_array * effective[23]; /* 0 184 */ /* --- cacheline 2 boundary (128 bytes) was 56 bytes ago --- */ struct hlist_head progs[23]; /* 184 184 */ /* --- cacheline 5 boundary (320 bytes) was 48 bytes ago --- */ u8 flags[23]; /* 368 23 */ /* XXX 1 byte hole, try to pack */ /* --- cacheline 6 boundary (384 bytes) was 8 bytes ago --- */ struct list_head storages; /* 392 16 */ struct bpf_prog_array * inactive; /* 408 8 */ struct percpu_ref refcnt; /* 416 16 */ struct work_struct release_work; /* 432 72 */ /* size: 504, cachelines: 8, members: 7 */ /* sum members: 503, holes: 1, sum holes: 1 */ /* last cacheline: 56 bytes */ }; Suggested-by: Jakub Sitnicki Reviewed-by: Jakub Sitnicki Reviewed-by: Martin KaFai Lau Signed-off-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20220628174314.1216643-3-sdf@google.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/cgroup.c | 76 +++++++++++++++++++++++++++++++---------------------- 1 file changed, 44 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 7a394f7c205c..4adb4f3ecb7f 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -157,11 +157,12 @@ static void cgroup_bpf_release(struct work_struct *work) mutex_lock(&cgroup_mutex); for (atype = 0; atype < ARRAY_SIZE(cgrp->bpf.progs); atype++) { - struct list_head *progs = &cgrp->bpf.progs[atype]; - struct bpf_prog_list *pl, *pltmp; + struct hlist_head *progs = &cgrp->bpf.progs[atype]; + struct bpf_prog_list *pl; + struct hlist_node *pltmp; - list_for_each_entry_safe(pl, pltmp, progs, node) { - list_del(&pl->node); + hlist_for_each_entry_safe(pl, pltmp, progs, node) { + hlist_del(&pl->node); if (pl->prog) bpf_prog_put(pl->prog); if (pl->link) @@ -217,12 +218,12 @@ static struct bpf_prog *prog_list_prog(struct bpf_prog_list *pl) /* count number of elements in the list. 
* it's slow but the list cannot be long */ -static u32 prog_list_length(struct list_head *head) +static u32 prog_list_length(struct hlist_head *head) { struct bpf_prog_list *pl; u32 cnt = 0; - list_for_each_entry(pl, head, node) { + hlist_for_each_entry(pl, head, node) { if (!prog_list_prog(pl)) continue; cnt++; @@ -291,7 +292,7 @@ static int compute_effective_progs(struct cgroup *cgrp, if (cnt > 0 && !(p->bpf.flags[atype] & BPF_F_ALLOW_MULTI)) continue; - list_for_each_entry(pl, &p->bpf.progs[atype], node) { + hlist_for_each_entry(pl, &p->bpf.progs[atype], node) { if (!prog_list_prog(pl)) continue; @@ -342,7 +343,7 @@ int cgroup_bpf_inherit(struct cgroup *cgrp) cgroup_bpf_get(p); for (i = 0; i < NR; i++) - INIT_LIST_HEAD(&cgrp->bpf.progs[i]); + INIT_HLIST_HEAD(&cgrp->bpf.progs[i]); INIT_LIST_HEAD(&cgrp->bpf.storages); @@ -418,7 +419,7 @@ cleanup: #define BPF_CGROUP_MAX_PROGS 64 -static struct bpf_prog_list *find_attach_entry(struct list_head *progs, +static struct bpf_prog_list *find_attach_entry(struct hlist_head *progs, struct bpf_prog *prog, struct bpf_cgroup_link *link, struct bpf_prog *replace_prog, @@ -428,12 +429,12 @@ static struct bpf_prog_list *find_attach_entry(struct list_head *progs, /* single-attach case */ if (!allow_multi) { - if (list_empty(progs)) + if (hlist_empty(progs)) return NULL; - return list_first_entry(progs, typeof(*pl), node); + return hlist_entry(progs->first, typeof(*pl), node); } - list_for_each_entry(pl, progs, node) { + hlist_for_each_entry(pl, progs, node) { if (prog && pl->prog == prog && prog != replace_prog) /* disallow attaching the same prog twice */ return ERR_PTR(-EINVAL); @@ -444,7 +445,7 @@ static struct bpf_prog_list *find_attach_entry(struct list_head *progs, /* direct prog multi-attach w/ replacement case */ if (replace_prog) { - list_for_each_entry(pl, progs, node) { + hlist_for_each_entry(pl, progs, node) { if (pl->prog == replace_prog) /* a match found */ return pl; @@ -480,7 +481,7 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_cgroup_storage *new_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {}; enum cgroup_bpf_attach_type atype; struct bpf_prog_list *pl; - struct list_head *progs; + struct hlist_head *progs; int err; if (((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) || @@ -503,7 +504,7 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp, if (!hierarchy_allows_attach(cgrp, atype)) return -EPERM; - if (!list_empty(progs) && cgrp->bpf.flags[atype] != saved_flags) + if (!hlist_empty(progs) && cgrp->bpf.flags[atype] != saved_flags) /* Disallow attaching non-overridable on top * of existing overridable in this cgroup. 
* Disallow attaching multi-prog if overridable or none @@ -525,12 +526,22 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp, if (pl) { old_prog = pl->prog; } else { + struct hlist_node *last = NULL; + pl = kmalloc(sizeof(*pl), GFP_KERNEL); if (!pl) { bpf_cgroup_storages_free(new_storage); return -ENOMEM; } - list_add_tail(&pl->node, progs); + if (hlist_empty(progs)) + hlist_add_head(&pl->node, progs); + else + hlist_for_each(last, progs) { + if (last->next) + continue; + hlist_add_behind(&pl->node, last); + break; + } } pl->prog = prog; @@ -556,7 +567,7 @@ cleanup: } bpf_cgroup_storages_free(new_storage); if (!old_prog) { - list_del(&pl->node); + hlist_del(&pl->node); kfree(pl); } return err; @@ -587,7 +598,7 @@ static void replace_effective_prog(struct cgroup *cgrp, struct cgroup_subsys_state *css; struct bpf_prog_array *progs; struct bpf_prog_list *pl; - struct list_head *head; + struct hlist_head *head; struct cgroup *cg; int pos; @@ -603,7 +614,7 @@ static void replace_effective_prog(struct cgroup *cgrp, continue; head = &cg->bpf.progs[atype]; - list_for_each_entry(pl, head, node) { + hlist_for_each_entry(pl, head, node) { if (!prog_list_prog(pl)) continue; if (pl->link == link) @@ -637,7 +648,7 @@ static int __cgroup_bpf_replace(struct cgroup *cgrp, enum cgroup_bpf_attach_type atype; struct bpf_prog *old_prog; struct bpf_prog_list *pl; - struct list_head *progs; + struct hlist_head *progs; bool found = false; atype = to_cgroup_bpf_attach_type(link->type); @@ -649,7 +660,7 @@ static int __cgroup_bpf_replace(struct cgroup *cgrp, if (link->link.prog->type != new_prog->type) return -EINVAL; - list_for_each_entry(pl, progs, node) { + hlist_for_each_entry(pl, progs, node) { if (pl->link == link) { found = true; break; @@ -688,7 +699,7 @@ out_unlock: return ret; } -static struct bpf_prog_list *find_detach_entry(struct list_head *progs, +static struct bpf_prog_list *find_detach_entry(struct hlist_head *progs, struct bpf_prog *prog, struct bpf_cgroup_link *link, bool allow_multi) @@ -696,14 +707,14 @@ static struct bpf_prog_list *find_detach_entry(struct list_head *progs, struct bpf_prog_list *pl; if (!allow_multi) { - if (list_empty(progs)) + if (hlist_empty(progs)) /* report error when trying to detach and nothing is attached */ return ERR_PTR(-ENOENT); /* to maintain backward compatibility NONE and OVERRIDE cgroups * allow detaching with invalid FD (prog==NULL) in legacy mode */ - return list_first_entry(progs, typeof(*pl), node); + return hlist_entry(progs->first, typeof(*pl), node); } if (!prog && !link) @@ -713,7 +724,7 @@ static struct bpf_prog_list *find_detach_entry(struct list_head *progs, return ERR_PTR(-EINVAL); /* find the prog or link and detach it */ - list_for_each_entry(pl, progs, node) { + hlist_for_each_entry(pl, progs, node) { if (pl->prog == prog && pl->link == link) return pl; } @@ -737,7 +748,7 @@ static void purge_effective_progs(struct cgroup *cgrp, struct bpf_prog *prog, struct cgroup_subsys_state *css; struct bpf_prog_array *progs; struct bpf_prog_list *pl; - struct list_head *head; + struct hlist_head *head; struct cgroup *cg; int pos; @@ -754,7 +765,7 @@ static void purge_effective_progs(struct cgroup *cgrp, struct bpf_prog *prog, continue; head = &cg->bpf.progs[atype]; - list_for_each_entry(pl, head, node) { + hlist_for_each_entry(pl, head, node) { if (!prog_list_prog(pl)) continue; if (pl->prog == prog && pl->link == link) @@ -791,7 +802,7 @@ static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, enum cgroup_bpf_attach_type atype; struct 
bpf_prog *old_prog; struct bpf_prog_list *pl; - struct list_head *progs; + struct hlist_head *progs; u32 flags; atype = to_cgroup_bpf_attach_type(type); @@ -822,9 +833,10 @@ static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, } /* now can actually delete it from this cgroup list */ - list_del(&pl->node); + hlist_del(&pl->node); + kfree(pl); - if (list_empty(progs)) + if (hlist_empty(progs)) /* last program was detached, reset flags to zero */ cgrp->bpf.flags[atype] = 0; if (old_prog) @@ -852,7 +864,7 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, enum bpf_attach_type type = attr->query.attach_type; enum cgroup_bpf_attach_type atype; struct bpf_prog_array *effective; - struct list_head *progs; + struct hlist_head *progs; struct bpf_prog *prog; int cnt, ret = 0, i; u32 flags; @@ -891,7 +903,7 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, u32 id; i = 0; - list_for_each_entry(pl, progs, node) { + hlist_for_each_entry(pl, progs, node) { prog = prog_list_prog(pl); id = prog->aux->id; if (copy_to_user(prog_ids + i, &id, sizeof(id))) -- cgit From 69fd337a975c7e690dfe49d9cb4fe5ba1e6db44e Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 28 Jun 2022 10:43:06 -0700 Subject: bpf: per-cgroup lsm flavor Allow attaching to lsm hooks in the cgroup context. Attaching to per-cgroup LSM works exactly like attaching to other per-cgroup hooks. New BPF_LSM_CGROUP is added to trigger new mode; the actual lsm hook we attach to is signaled via existing attach_btf_id. For the hooks that have 'struct socket' or 'struct sock' as its first argument, we use the cgroup associated with that socket. For the rest, we use 'current' cgroup (this is all on default hierarchy == v2 only). Note that for some hooks that work on 'struct sock' we still take the cgroup from 'current' because some of them work on the socket that hasn't been properly initialized yet. Behind the scenes, we allocate a shim program that is attached to the trampoline and runs cgroup effective BPF programs array. This shim has some rudimentary ref counting and can be shared between several programs attaching to the same lsm hook from different cgroups. Note that this patch bloats cgroup size because we add 211 cgroup_bpf_attach_type(s) for simplicity sake. This will be addressed in the subsequent patch. Also note that we only add non-sleepable flavor for now. To enable sleepable use-cases, bpf_prog_run_array_cg has to grab trace rcu, shim programs have to be freed via trace rcu, cgroup_bpf.effective should be also trace-rcu-managed + maybe some other changes that I'm not aware of. Reviewed-by: Martin KaFai Lau Signed-off-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20220628174314.1216643-4-sdf@google.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/bpf_lsm.c | 48 ++++++++++++ kernel/bpf/btf.c | 11 +++ kernel/bpf/cgroup.c | 136 ++++++++++++++++++++++++++++++--- kernel/bpf/core.c | 2 + kernel/bpf/syscall.c | 10 +++ kernel/bpf/trampoline.c | 198 ++++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/verifier.c | 32 ++++++++ 7 files changed, 426 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index c1351df9f7ee..0f72020bfdcf 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -16,6 +16,7 @@ #include #include #include +#include /* For every LSM hook that allows attachment of BPF programs, declare a nop * function where a BPF program can be attached. 
@@ -35,6 +36,44 @@ BTF_SET_START(bpf_lsm_hooks) #undef LSM_HOOK BTF_SET_END(bpf_lsm_hooks) +/* List of LSM hooks that should operate on 'current' cgroup regardless + * of function signature. + */ +BTF_SET_START(bpf_lsm_current_hooks) +/* operate on freshly allocated sk without any cgroup association */ +BTF_ID(func, bpf_lsm_sk_alloc_security) +BTF_ID(func, bpf_lsm_sk_free_security) +BTF_SET_END(bpf_lsm_current_hooks) + +void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog, + bpf_func_t *bpf_func) +{ + const struct btf_param *args; + + if (btf_type_vlen(prog->aux->attach_func_proto) < 1 || + btf_id_set_contains(&bpf_lsm_current_hooks, + prog->aux->attach_btf_id)) { + *bpf_func = __cgroup_bpf_run_lsm_current; + return; + } + + args = btf_params(prog->aux->attach_func_proto); + +#ifdef CONFIG_NET + if (args[0].type == btf_sock_ids[BTF_SOCK_TYPE_SOCKET]) + *bpf_func = __cgroup_bpf_run_lsm_socket; + else if (args[0].type == btf_sock_ids[BTF_SOCK_TYPE_SOCK]) + *bpf_func = __cgroup_bpf_run_lsm_sock; + else +#endif + *bpf_func = __cgroup_bpf_run_lsm_current; +} + +int bpf_lsm_hook_idx(u32 btf_id) +{ + return btf_id_set_index(&bpf_lsm_hooks, btf_id); +} + int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, const struct bpf_prog *prog) { @@ -158,6 +197,15 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return prog->aux->sleepable ? &bpf_ima_file_hash_proto : NULL; case BPF_FUNC_get_attach_cookie: return bpf_prog_has_trampoline(prog) ? &bpf_get_attach_cookie_proto : NULL; + case BPF_FUNC_get_local_storage: + return prog->expected_attach_type == BPF_LSM_CGROUP ? + &bpf_get_local_storage_proto : NULL; + case BPF_FUNC_set_retval: + return prog->expected_attach_type == BPF_LSM_CGROUP ? + &bpf_set_retval_proto : NULL; + case BPF_FUNC_get_retval: + return prog->expected_attach_type == BPF_LSM_CGROUP ? 
+ &bpf_get_retval_proto : NULL; default: return tracing_prog_func_proto(func_id, prog); } diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 2e2066d6af94..7c1fe422ed3f 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -5363,6 +5363,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, if (arg == nr_args) { switch (prog->expected_attach_type) { + case BPF_LSM_CGROUP: case BPF_LSM_MAC: case BPF_TRACE_FEXIT: /* When LSM programs are attached to void LSM hooks @@ -6842,6 +6843,16 @@ static int btf_id_cmp_func(const void *a, const void *b) return *pa - *pb; } +int btf_id_set_index(const struct btf_id_set *set, u32 id) +{ + const u32 *p; + + p = bsearch(&id, set->ids, set->cnt, sizeof(u32), btf_id_cmp_func); + if (!p) + return -1; + return p - set->ids; +} + bool btf_id_set_contains(const struct btf_id_set *set, u32 id) { return bsearch(&id, set->ids, set->cnt, sizeof(u32), btf_id_cmp_func) != NULL; diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 4adb4f3ecb7f..9cf41dd4f96f 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -14,6 +14,8 @@ #include #include #include +#include +#include #include #include @@ -61,6 +63,87 @@ bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp, return run_ctx.retval; } +unsigned int __cgroup_bpf_run_lsm_sock(const void *ctx, + const struct bpf_insn *insn) +{ + const struct bpf_prog *shim_prog; + struct sock *sk; + struct cgroup *cgrp; + int ret = 0; + u64 *args; + + args = (u64 *)ctx; + sk = (void *)(unsigned long)args[0]; + /*shim_prog = container_of(insn, struct bpf_prog, insnsi);*/ + shim_prog = (const struct bpf_prog *)((void *)insn - offsetof(struct bpf_prog, insnsi)); + + cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + if (likely(cgrp)) + ret = bpf_prog_run_array_cg(&cgrp->bpf, + shim_prog->aux->cgroup_atype, + ctx, bpf_prog_run, 0, NULL); + return ret; +} + +unsigned int __cgroup_bpf_run_lsm_socket(const void *ctx, + const struct bpf_insn *insn) +{ + const struct bpf_prog *shim_prog; + struct socket *sock; + struct cgroup *cgrp; + int ret = 0; + u64 *args; + + args = (u64 *)ctx; + sock = (void *)(unsigned long)args[0]; + /*shim_prog = container_of(insn, struct bpf_prog, insnsi);*/ + shim_prog = (const struct bpf_prog *)((void *)insn - offsetof(struct bpf_prog, insnsi)); + + cgrp = sock_cgroup_ptr(&sock->sk->sk_cgrp_data); + if (likely(cgrp)) + ret = bpf_prog_run_array_cg(&cgrp->bpf, + shim_prog->aux->cgroup_atype, + ctx, bpf_prog_run, 0, NULL); + return ret; +} + +unsigned int __cgroup_bpf_run_lsm_current(const void *ctx, + const struct bpf_insn *insn) +{ + const struct bpf_prog *shim_prog; + struct cgroup *cgrp; + int ret = 0; + + /*shim_prog = container_of(insn, struct bpf_prog, insnsi);*/ + shim_prog = (const struct bpf_prog *)((void *)insn - offsetof(struct bpf_prog, insnsi)); + + /* We rely on trampoline's __bpf_prog_enter_lsm_cgroup to grab RCU read lock. 
*/ + cgrp = task_dfl_cgroup(current); + if (likely(cgrp)) + ret = bpf_prog_run_array_cg(&cgrp->bpf, + shim_prog->aux->cgroup_atype, + ctx, bpf_prog_run, 0, NULL); + return ret; +} + +#ifdef CONFIG_BPF_LSM +static enum cgroup_bpf_attach_type +bpf_cgroup_atype_find(enum bpf_attach_type attach_type, u32 attach_btf_id) +{ + if (attach_type != BPF_LSM_CGROUP) + return to_cgroup_bpf_attach_type(attach_type); + return CGROUP_LSM_START + bpf_lsm_hook_idx(attach_btf_id); +} +#else +static enum cgroup_bpf_attach_type +bpf_cgroup_atype_find(enum bpf_attach_type attach_type, u32 attach_btf_id) +{ + if (attach_type != BPF_LSM_CGROUP) + return to_cgroup_bpf_attach_type(attach_type); + return -EOPNOTSUPP; +} +#endif /* CONFIG_BPF_LSM */ + void cgroup_bpf_offline(struct cgroup *cgrp) { cgroup_get(cgrp); @@ -163,10 +246,16 @@ static void cgroup_bpf_release(struct work_struct *work) hlist_for_each_entry_safe(pl, pltmp, progs, node) { hlist_del(&pl->node); - if (pl->prog) + if (pl->prog) { + if (pl->prog->expected_attach_type == BPF_LSM_CGROUP) + bpf_trampoline_unlink_cgroup_shim(pl->prog); bpf_prog_put(pl->prog); - if (pl->link) + } + if (pl->link) { + if (pl->link->link.prog->expected_attach_type == BPF_LSM_CGROUP) + bpf_trampoline_unlink_cgroup_shim(pl->link->link.prog); bpf_cgroup_link_auto_detach(pl->link); + } kfree(pl); static_branch_dec(&cgroup_bpf_enabled_key[atype]); } @@ -479,6 +568,7 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *old_prog = NULL; struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {}; struct bpf_cgroup_storage *new_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {}; + struct bpf_prog *new_prog = prog ? : link->link.prog; enum cgroup_bpf_attach_type atype; struct bpf_prog_list *pl; struct hlist_head *progs; @@ -495,7 +585,7 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp, /* replace_prog implies BPF_F_REPLACE, and vice versa */ return -EINVAL; - atype = to_cgroup_bpf_attach_type(type); + atype = bpf_cgroup_atype_find(type, new_prog->aux->attach_btf_id); if (atype < 0) return -EINVAL; @@ -549,17 +639,30 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp, bpf_cgroup_storages_assign(pl->storage, storage); cgrp->bpf.flags[atype] = saved_flags; + if (type == BPF_LSM_CGROUP) { + err = bpf_trampoline_link_cgroup_shim(new_prog, atype); + if (err) + goto cleanup; + } + err = update_effective_progs(cgrp, atype); if (err) - goto cleanup; + goto cleanup_trampoline; - if (old_prog) + if (old_prog) { + if (type == BPF_LSM_CGROUP) + bpf_trampoline_unlink_cgroup_shim(old_prog); bpf_prog_put(old_prog); - else + } else { static_branch_inc(&cgroup_bpf_enabled_key[atype]); + } bpf_cgroup_storages_link(new_storage, cgrp, type); return 0; +cleanup_trampoline: + if (type == BPF_LSM_CGROUP) + bpf_trampoline_unlink_cgroup_shim(new_prog); + cleanup: if (old_prog) { pl->prog = old_prog; @@ -651,7 +754,7 @@ static int __cgroup_bpf_replace(struct cgroup *cgrp, struct hlist_head *progs; bool found = false; - atype = to_cgroup_bpf_attach_type(link->type); + atype = bpf_cgroup_atype_find(link->type, new_prog->aux->attach_btf_id); if (atype < 0) return -EINVAL; @@ -803,9 +906,15 @@ static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, struct bpf_prog *old_prog; struct bpf_prog_list *pl; struct hlist_head *progs; + u32 attach_btf_id = 0; u32 flags; - atype = to_cgroup_bpf_attach_type(type); + if (prog) + attach_btf_id = prog->aux->attach_btf_id; + if (link) + attach_btf_id = link->link.prog->aux->attach_btf_id; + + atype = bpf_cgroup_atype_find(type, 
attach_btf_id); if (atype < 0) return -EINVAL; @@ -839,8 +948,11 @@ static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, if (hlist_empty(progs)) /* last program was detached, reset flags to zero */ cgrp->bpf.flags[atype] = 0; - if (old_prog) + if (old_prog) { + if (type == BPF_LSM_CGROUP) + bpf_trampoline_unlink_cgroup_shim(old_prog); bpf_prog_put(old_prog); + } static_branch_dec(&cgroup_bpf_enabled_key[atype]); return 0; } @@ -999,6 +1111,8 @@ static void bpf_cgroup_link_release(struct bpf_link *link) WARN_ON(__cgroup_bpf_detach(cg_link->cgroup, NULL, cg_link, cg_link->type)); + if (cg_link->type == BPF_LSM_CGROUP) + bpf_trampoline_unlink_cgroup_shim(cg_link->link.prog); cg = cg_link->cgroup; cg_link->cgroup = NULL; @@ -1343,7 +1457,7 @@ BPF_CALL_0(bpf_get_retval) return ctx->retval; } -static const struct bpf_func_proto bpf_get_retval_proto = { +const struct bpf_func_proto bpf_get_retval_proto = { .func = bpf_get_retval, .gpl_only = false, .ret_type = RET_INTEGER, @@ -1358,7 +1472,7 @@ BPF_CALL_1(bpf_set_retval, int, retval) return 0; } -static const struct bpf_func_proto bpf_set_retval_proto = { +const struct bpf_func_proto bpf_set_retval_proto = { .func = bpf_set_retval, .gpl_only = false, .ret_type = RET_INTEGER, diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index f023cb399e3f..4cc10b942a3c 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2666,6 +2666,8 @@ const struct bpf_func_proto bpf_get_local_storage_proto __weak; const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto __weak; const struct bpf_func_proto bpf_snprintf_btf_proto __weak; const struct bpf_func_proto bpf_seq_printf_btf_proto __weak; +const struct bpf_func_proto bpf_set_retval_proto __weak; +const struct bpf_func_proto bpf_get_retval_proto __weak; const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void) { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 7d5af5b99f0d..626b8f7d237b 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3416,6 +3416,8 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) return BPF_PROG_TYPE_SK_LOOKUP; case BPF_XDP: return BPF_PROG_TYPE_XDP; + case BPF_LSM_CGROUP: + return BPF_PROG_TYPE_LSM; default: return BPF_PROG_TYPE_UNSPEC; } @@ -3469,6 +3471,11 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_PROG_TYPE_CGROUP_SOCKOPT: case BPF_PROG_TYPE_CGROUP_SYSCTL: case BPF_PROG_TYPE_SOCK_OPS: + case BPF_PROG_TYPE_LSM: + if (ptype == BPF_PROG_TYPE_LSM && + prog->expected_attach_type != BPF_LSM_CGROUP) + return -EINVAL; + ret = cgroup_bpf_prog_attach(attr, ptype, prog); break; default: @@ -3506,6 +3513,7 @@ static int bpf_prog_detach(const union bpf_attr *attr) case BPF_PROG_TYPE_CGROUP_SOCKOPT: case BPF_PROG_TYPE_CGROUP_SYSCTL: case BPF_PROG_TYPE_SOCK_OPS: + case BPF_PROG_TYPE_LSM: return cgroup_bpf_prog_detach(attr, ptype); default: return -EINVAL; @@ -4540,6 +4548,8 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) ret = bpf_raw_tp_link_attach(prog, NULL); else if (prog->expected_attach_type == BPF_TRACE_ITER) ret = bpf_iter_link_attach(attr, uattr, prog); + else if (prog->expected_attach_type == BPF_LSM_CGROUP) + ret = cgroup_bpf_link_attach(attr, prog); else ret = bpf_tracing_prog_attach(prog, attr->link_create.target_fd, diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 5466e15be61f..d7c251d7fbcd 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -11,6 +11,8 @@ #include #include #include +#include +#include /* dummy _ops. 
The verifier will operate on target program's ops. */ const struct bpf_verifier_ops bpf_extension_verifier_ops = { @@ -496,6 +498,177 @@ int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampolin return err; } +#if defined(CONFIG_BPF_JIT) && defined(CONFIG_BPF_SYSCALL) +static void bpf_shim_tramp_link_release(struct bpf_link *link) +{ + struct bpf_shim_tramp_link *shim_link = + container_of(link, struct bpf_shim_tramp_link, link.link); + + /* paired with 'shim_link->trampoline = tr' in bpf_trampoline_link_cgroup_shim */ + if (!shim_link->trampoline) + return; + + WARN_ON_ONCE(bpf_trampoline_unlink_prog(&shim_link->link, shim_link->trampoline)); + bpf_trampoline_put(shim_link->trampoline); +} + +static void bpf_shim_tramp_link_dealloc(struct bpf_link *link) +{ + struct bpf_shim_tramp_link *shim_link = + container_of(link, struct bpf_shim_tramp_link, link.link); + + kfree(shim_link); +} + +static const struct bpf_link_ops bpf_shim_tramp_link_lops = { + .release = bpf_shim_tramp_link_release, + .dealloc = bpf_shim_tramp_link_dealloc, +}; + +static struct bpf_shim_tramp_link *cgroup_shim_alloc(const struct bpf_prog *prog, + bpf_func_t bpf_func, + int cgroup_atype) +{ + struct bpf_shim_tramp_link *shim_link = NULL; + struct bpf_prog *p; + + shim_link = kzalloc(sizeof(*shim_link), GFP_USER); + if (!shim_link) + return NULL; + + p = bpf_prog_alloc(1, 0); + if (!p) { + kfree(shim_link); + return NULL; + } + + p->jited = false; + p->bpf_func = bpf_func; + + p->aux->cgroup_atype = cgroup_atype; + p->aux->attach_func_proto = prog->aux->attach_func_proto; + p->aux->attach_btf_id = prog->aux->attach_btf_id; + p->aux->attach_btf = prog->aux->attach_btf; + btf_get(p->aux->attach_btf); + p->type = BPF_PROG_TYPE_LSM; + p->expected_attach_type = BPF_LSM_MAC; + bpf_prog_inc(p); + bpf_link_init(&shim_link->link.link, BPF_LINK_TYPE_UNSPEC, + &bpf_shim_tramp_link_lops, p); + + return shim_link; +} + +static struct bpf_shim_tramp_link *cgroup_shim_find(struct bpf_trampoline *tr, + bpf_func_t bpf_func) +{ + struct bpf_tramp_link *link; + int kind; + + for (kind = 0; kind < BPF_TRAMP_MAX; kind++) { + hlist_for_each_entry(link, &tr->progs_hlist[kind], tramp_hlist) { + struct bpf_prog *p = link->link.prog; + + if (p->bpf_func == bpf_func) + return container_of(link, struct bpf_shim_tramp_link, link); + } + } + + return NULL; +} + +int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog, + int cgroup_atype) +{ + struct bpf_shim_tramp_link *shim_link = NULL; + struct bpf_attach_target_info tgt_info = {}; + struct bpf_trampoline *tr; + bpf_func_t bpf_func; + u64 key; + int err; + + err = bpf_check_attach_target(NULL, prog, NULL, + prog->aux->attach_btf_id, + &tgt_info); + if (err) + return err; + + key = bpf_trampoline_compute_key(NULL, prog->aux->attach_btf, + prog->aux->attach_btf_id); + + bpf_lsm_find_cgroup_shim(prog, &bpf_func); + tr = bpf_trampoline_get(key, &tgt_info); + if (!tr) + return -ENOMEM; + + mutex_lock(&tr->mutex); + + shim_link = cgroup_shim_find(tr, bpf_func); + if (shim_link) { + /* Reusing existing shim attached by the other program. */ + bpf_link_inc(&shim_link->link.link); + + mutex_unlock(&tr->mutex); + bpf_trampoline_put(tr); /* bpf_trampoline_get above */ + return 0; + } + + /* Allocate and install new shim. 
*/ + + shim_link = cgroup_shim_alloc(prog, bpf_func, cgroup_atype); + if (!shim_link) { + err = -ENOMEM; + goto err; + } + + err = __bpf_trampoline_link_prog(&shim_link->link, tr); + if (err) + goto err; + + shim_link->trampoline = tr; + /* note, we're still holding tr refcnt from above */ + + mutex_unlock(&tr->mutex); + + return 0; +err: + mutex_unlock(&tr->mutex); + + if (shim_link) + bpf_link_put(&shim_link->link.link); + + /* have to release tr while _not_ holding its mutex */ + bpf_trampoline_put(tr); /* bpf_trampoline_get above */ + + return err; +} + +void bpf_trampoline_unlink_cgroup_shim(struct bpf_prog *prog) +{ + struct bpf_shim_tramp_link *shim_link = NULL; + struct bpf_trampoline *tr; + bpf_func_t bpf_func; + u64 key; + + key = bpf_trampoline_compute_key(NULL, prog->aux->attach_btf, + prog->aux->attach_btf_id); + + bpf_lsm_find_cgroup_shim(prog, &bpf_func); + tr = bpf_trampoline_lookup(key); + if (WARN_ON_ONCE(!tr)) + return; + + mutex_lock(&tr->mutex); + shim_link = cgroup_shim_find(tr, bpf_func); + mutex_unlock(&tr->mutex); + + if (shim_link) + bpf_link_put(&shim_link->link.link); + + bpf_trampoline_put(tr); /* bpf_trampoline_lookup above */ +} +#endif + struct bpf_trampoline *bpf_trampoline_get(u64 key, struct bpf_attach_target_info *tgt_info) { @@ -628,6 +801,31 @@ void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start, struct bpf_tramp_ rcu_read_unlock(); } +u64 notrace __bpf_prog_enter_lsm_cgroup(struct bpf_prog *prog, + struct bpf_tramp_run_ctx *run_ctx) + __acquires(RCU) +{ + /* Runtime stats are exported via actual BPF_LSM_CGROUP + * programs, not the shims. + */ + rcu_read_lock(); + migrate_disable(); + + run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx); + + return NO_START_TIME; +} + +void notrace __bpf_prog_exit_lsm_cgroup(struct bpf_prog *prog, u64 start, + struct bpf_tramp_run_ctx *run_ctx) + __releases(RCU) +{ + bpf_reset_run_ctx(run_ctx->saved_run_ctx); + + migrate_enable(); + rcu_read_unlock(); +} + u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx) { rcu_read_lock_trace(); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 4938477912cd..df3ec6b05f05 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7322,6 +7322,18 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn reg_type_str(env, regs[BPF_REG_1].type)); return -EACCES; } + break; + case BPF_FUNC_set_retval: + if (env->prog->expected_attach_type == BPF_LSM_CGROUP) { + if (!env->prog->aux->attach_func_proto->type) { + /* Make sure programs that attach to void + * hooks don't try to modify return value. + */ + verbose(env, "BPF_LSM_CGROUP that attach to void LSM hooks can't modify return value!\n"); + return -EINVAL; + } + } + break; } if (err) @@ -10527,6 +10539,22 @@ static int check_return_code(struct bpf_verifier_env *env) case BPF_PROG_TYPE_SK_LOOKUP: range = tnum_range(SK_DROP, SK_PASS); break; + + case BPF_PROG_TYPE_LSM: + if (env->prog->expected_attach_type != BPF_LSM_CGROUP) { + /* Regular BPF_PROG_TYPE_LSM programs can return + * any value. + */ + return 0; + } + if (!env->prog->aux->attach_func_proto->type) { + /* Make sure programs that attach to void + * hooks don't try to modify return value. + */ + range = tnum_range(1, 1); + } + break; + case BPF_PROG_TYPE_EXT: /* freplace program can return anything as its return value * depends on the to-be-replaced kernel func or bpf program. 
@@ -10543,6 +10571,9 @@ static int check_return_code(struct bpf_verifier_env *env) if (!tnum_in(range, reg->var_off)) { verbose_invalid_scalar(env, reg, &range, "program exit", "R0"); + if (prog->expected_attach_type == BPF_LSM_CGROUP && + !prog->aux->attach_func_proto->type) + verbose(env, "Note, BPF_LSM_CGROUP that attach to void LSM hooks can't modify return value!\n"); return -EINVAL; } @@ -14902,6 +14933,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, fallthrough; case BPF_MODIFY_RETURN: case BPF_LSM_MAC: + case BPF_LSM_CGROUP: case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: if (!btf_type_is_func(t)) { -- cgit From c0e19f2c9a3edd38e4b1bdae98eb44555d02bc31 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 28 Jun 2022 10:43:07 -0700 Subject: bpf: minimize number of allocated lsm slots per program Previous patch adds 1:1 mapping between all 211 LSM hooks and bpf_cgroup program array. Instead of reserving a slot per possible hook, reserve 10 slots per cgroup for lsm programs. Those slots are dynamically allocated on demand and reclaimed. struct cgroup_bpf { struct bpf_prog_array * effective[33]; /* 0 264 */ /* --- cacheline 4 boundary (256 bytes) was 8 bytes ago --- */ struct hlist_head progs[33]; /* 264 264 */ /* --- cacheline 8 boundary (512 bytes) was 16 bytes ago --- */ u8 flags[33]; /* 528 33 */ /* XXX 7 bytes hole, try to pack */ struct list_head storages; /* 568 16 */ /* --- cacheline 9 boundary (576 bytes) was 8 bytes ago --- */ struct bpf_prog_array * inactive; /* 584 8 */ struct percpu_ref refcnt; /* 592 16 */ struct work_struct release_work; /* 608 72 */ /* size: 680, cachelines: 11, members: 7 */ /* sum members: 673, holes: 1, sum holes: 7 */ /* last cacheline: 40 bytes */ }; Reviewed-by: Martin KaFai Lau Signed-off-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20220628174314.1216643-5-sdf@google.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/bpf_lsm.c | 5 ----- kernel/bpf/btf.c | 10 ---------- kernel/bpf/cgroup.c | 47 ++++++++++++++++++++++++++++++++++++++++++++++- kernel/bpf/core.c | 7 +++++++ kernel/bpf/trampoline.c | 1 + 5 files changed, 54 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 0f72020bfdcf..83aa431dd52e 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -69,11 +69,6 @@ void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog, *bpf_func = __cgroup_bpf_run_lsm_current; } -int bpf_lsm_hook_idx(u32 btf_id) -{ - return btf_id_set_index(&bpf_lsm_hooks, btf_id); -} - int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, const struct bpf_prog *prog) { diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 7c1fe422ed3f..8d3c7ab8af46 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6843,16 +6843,6 @@ static int btf_id_cmp_func(const void *a, const void *b) return *pa - *pb; } -int btf_id_set_index(const struct btf_id_set *set, u32 id) -{ - const u32 *p; - - p = bsearch(&id, set->ids, set->cnt, sizeof(u32), btf_id_cmp_func); - if (!p) - return -1; - return p - set->ids; -} - bool btf_id_set_contains(const struct btf_id_set *set, u32 id) { return bsearch(&id, set->ids, set->cnt, sizeof(u32), btf_id_cmp_func) != NULL; diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 9cf41dd4f96f..169cbd0de797 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -127,12 +127,57 @@ unsigned int __cgroup_bpf_run_lsm_current(const void *ctx, } #ifdef CONFIG_BPF_LSM +struct cgroup_lsm_atype { + u32 attach_btf_id; + int refcnt; +}; + +static struct 
cgroup_lsm_atype cgroup_lsm_atype[CGROUP_LSM_NUM]; + static enum cgroup_bpf_attach_type bpf_cgroup_atype_find(enum bpf_attach_type attach_type, u32 attach_btf_id) { + int i; + + lockdep_assert_held(&cgroup_mutex); + if (attach_type != BPF_LSM_CGROUP) return to_cgroup_bpf_attach_type(attach_type); - return CGROUP_LSM_START + bpf_lsm_hook_idx(attach_btf_id); + + for (i = 0; i < ARRAY_SIZE(cgroup_lsm_atype); i++) + if (cgroup_lsm_atype[i].attach_btf_id == attach_btf_id) + return CGROUP_LSM_START + i; + + for (i = 0; i < ARRAY_SIZE(cgroup_lsm_atype); i++) + if (cgroup_lsm_atype[i].attach_btf_id == 0) + return CGROUP_LSM_START + i; + + return -E2BIG; + +} + +void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype) +{ + int i = cgroup_atype - CGROUP_LSM_START; + + lockdep_assert_held(&cgroup_mutex); + + WARN_ON_ONCE(cgroup_lsm_atype[i].attach_btf_id && + cgroup_lsm_atype[i].attach_btf_id != attach_btf_id); + + cgroup_lsm_atype[i].attach_btf_id = attach_btf_id; + cgroup_lsm_atype[i].refcnt++; +} + +void bpf_cgroup_atype_put(int cgroup_atype) +{ + int i = cgroup_atype - CGROUP_LSM_START; + + mutex_lock(&cgroup_mutex); + if (--cgroup_lsm_atype[i].refcnt <= 0) + cgroup_lsm_atype[i].attach_btf_id = 0; + WARN_ON_ONCE(cgroup_lsm_atype[i].refcnt < 0); + mutex_unlock(&cgroup_mutex); } #else static enum cgroup_bpf_attach_type diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 4cc10b942a3c..805c2ad5c793 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -107,6 +107,9 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag fp->aux->prog = fp; fp->jit_requested = ebpf_jit_enabled(); fp->blinding_requested = bpf_jit_blinding_enabled(fp); +#ifdef CONFIG_CGROUP_BPF + aux->cgroup_atype = CGROUP_BPF_ATTACH_TYPE_INVALID; +#endif INIT_LIST_HEAD_RCU(&fp->aux->ksym.lnode); mutex_init(&fp->aux->used_maps_mutex); @@ -2569,6 +2572,10 @@ static void bpf_prog_free_deferred(struct work_struct *work) aux = container_of(work, struct bpf_prog_aux, work); #ifdef CONFIG_BPF_SYSCALL bpf_free_kfunc_btf_tab(aux->kfunc_btf_tab); +#endif +#ifdef CONFIG_CGROUP_BPF + if (aux->cgroup_atype != CGROUP_BPF_ATTACH_TYPE_INVALID) + bpf_cgroup_atype_put(aux->cgroup_atype); #endif bpf_free_used_maps(aux); bpf_free_used_btfs(aux); diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index d7c251d7fbcd..6cd226584c33 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -555,6 +555,7 @@ static struct bpf_shim_tramp_link *cgroup_shim_alloc(const struct bpf_prog *prog bpf_prog_inc(p); bpf_link_init(&shim_link->link.link, BPF_LINK_TYPE_UNSPEC, &bpf_shim_tramp_link_lops, p); + bpf_cgroup_atype_get(p->aux->attach_btf_id, cgroup_atype); return shim_link; } -- cgit From b79c9fc9551b45953a94abf550b7bd3b00e3a0f9 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 28 Jun 2022 10:43:08 -0700 Subject: bpf: implement BPF_PROG_QUERY for BPF_LSM_CGROUP We have two options: 1. Treat all BPF_LSM_CGROUP the same, regardless of attach_btf_id 2. 
Treat BPF_LSM_CGROUP+attach_btf_id as a separate hook point I was doing (2) in the original patch, but switching to (1) here: * bpf_prog_query returns all attached BPF_LSM_CGROUP programs regardless of attach_btf_id * attach_btf_id is exported via bpf_prog_info Reviewed-by: Martin KaFai Lau Signed-off-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20220628174314.1216643-6-sdf@google.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/cgroup.c | 95 +++++++++++++++++++++++++++++++++++----------------- kernel/bpf/syscall.c | 8 ++++- 2 files changed, 71 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 169cbd0de797..59b7eb60d5b4 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1017,57 +1017,90 @@ static int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, union bpf_attr __user *uattr) { + __u32 __user *prog_attach_flags = u64_to_user_ptr(attr->query.prog_attach_flags); __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); enum bpf_attach_type type = attr->query.attach_type; + enum cgroup_bpf_attach_type from_atype, to_atype; enum cgroup_bpf_attach_type atype; struct bpf_prog_array *effective; - struct hlist_head *progs; - struct bpf_prog *prog; int cnt, ret = 0, i; + int total_cnt = 0; u32 flags; - atype = to_cgroup_bpf_attach_type(type); - if (atype < 0) - return -EINVAL; - - progs = &cgrp->bpf.progs[atype]; - flags = cgrp->bpf.flags[atype]; + if (type == BPF_LSM_CGROUP) { + if (attr->query.prog_cnt && prog_ids && !prog_attach_flags) + return -EINVAL; - effective = rcu_dereference_protected(cgrp->bpf.effective[atype], - lockdep_is_held(&cgroup_mutex)); + from_atype = CGROUP_LSM_START; + to_atype = CGROUP_LSM_END; + flags = 0; + } else { + from_atype = to_cgroup_bpf_attach_type(type); + if (from_atype < 0) + return -EINVAL; + to_atype = from_atype; + flags = cgrp->bpf.flags[from_atype]; + } - if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) - cnt = bpf_prog_array_length(effective); - else - cnt = prog_list_length(progs); + for (atype = from_atype; atype <= to_atype; atype++) { + if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) { + effective = rcu_dereference_protected(cgrp->bpf.effective[atype], + lockdep_is_held(&cgroup_mutex)); + total_cnt += bpf_prog_array_length(effective); + } else { + total_cnt += prog_list_length(&cgrp->bpf.progs[atype]); + } + } if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags))) return -EFAULT; - if (copy_to_user(&uattr->query.prog_cnt, &cnt, sizeof(cnt))) + if (copy_to_user(&uattr->query.prog_cnt, &total_cnt, sizeof(total_cnt))) return -EFAULT; - if (attr->query.prog_cnt == 0 || !prog_ids || !cnt) + if (attr->query.prog_cnt == 0 || !prog_ids || !total_cnt) /* return early if user requested only program count + flags */ return 0; - if (attr->query.prog_cnt < cnt) { - cnt = attr->query.prog_cnt; + + if (attr->query.prog_cnt < total_cnt) { + total_cnt = attr->query.prog_cnt; ret = -ENOSPC; } - if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) { - return bpf_prog_array_copy_to_user(effective, prog_ids, cnt); - } else { - struct bpf_prog_list *pl; - u32 id; + for (atype = from_atype; atype <= to_atype && total_cnt; atype++) { + if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) { + effective = rcu_dereference_protected(cgrp->bpf.effective[atype], + lockdep_is_held(&cgroup_mutex)); + cnt = min_t(int, bpf_prog_array_length(effective), total_cnt); + ret = 
bpf_prog_array_copy_to_user(effective, prog_ids, cnt); + } else { + struct hlist_head *progs; + struct bpf_prog_list *pl; + struct bpf_prog *prog; + u32 id; + + progs = &cgrp->bpf.progs[atype]; + cnt = min_t(int, prog_list_length(progs), total_cnt); + i = 0; + hlist_for_each_entry(pl, progs, node) { + prog = prog_list_prog(pl); + id = prog->aux->id; + if (copy_to_user(prog_ids + i, &id, sizeof(id))) + return -EFAULT; + if (++i == cnt) + break; + } + } - i = 0; - hlist_for_each_entry(pl, progs, node) { - prog = prog_list_prog(pl); - id = prog->aux->id; - if (copy_to_user(prog_ids + i, &id, sizeof(id))) - return -EFAULT; - if (++i == cnt) - break; + if (prog_attach_flags) { + flags = cgrp->bpf.flags[atype]; + + for (i = 0; i < cnt; i++) + if (copy_to_user(prog_attach_flags + i, &flags, sizeof(flags))) + return -EFAULT; + prog_attach_flags += cnt; } + + prog_ids += cnt; + total_cnt -= cnt; } return ret; } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 626b8f7d237b..ab688d85b2c6 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3520,7 +3520,7 @@ static int bpf_prog_detach(const union bpf_attr *attr) } } -#define BPF_PROG_QUERY_LAST_FIELD query.prog_cnt +#define BPF_PROG_QUERY_LAST_FIELD query.prog_attach_flags static int bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) @@ -3556,6 +3556,7 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_CGROUP_SYSCTL: case BPF_CGROUP_GETSOCKOPT: case BPF_CGROUP_SETSOCKOPT: + case BPF_LSM_CGROUP: return cgroup_bpf_prog_query(attr, uattr); case BPF_LIRC_MODE2: return lirc_prog_query(attr, uattr); @@ -4066,6 +4067,11 @@ static int bpf_prog_get_info_by_fd(struct file *file, if (prog->aux->btf) info.btf_id = btf_obj_id(prog->aux->btf); + info.attach_btf_id = prog->aux->attach_btf_id; + if (prog->aux->attach_btf) + info.attach_btf_obj_id = btf_obj_id(prog->aux->attach_btf); + else if (prog->aux->dst_prog) + info.attach_btf_obj_id = btf_obj_id(prog->aux->dst_prog->aux->attach_btf); ulen = info.nr_func_info; info.nr_func_info = prog->aux->func_info_cnt; -- cgit From 9113d7e48e9128522b9f5a54dfd30dff10509a92 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 28 Jun 2022 10:43:09 -0700 Subject: bpf: expose bpf_{g,s}etsockopt to lsm cgroup I don't see how to make it nice without introducing btf id lists for the hooks where these helpers are allowed. Some LSM hooks work on the locked sockets, some are triggering early and don't grab any locks, so have two lists for now: 1. LSM hooks which trigger under socket lock - minority of the hooks, but ideal case for us, we can expose existing BTF-based helpers 2. LSM hooks which trigger without socket lock, but they trigger early in the socket creation path where it should be safe to do setsockopt without any locks 3. The rest are prohibited. I'm thinking that this use-case might be a good gateway to sleeping lsm cgroup hooks in the future. We can either expose lock/unlock operations (and add tracking to the verifier) or have another set of bpf_setsockopt wrapper that grab the locks and might sleep. 
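As an illustrative sketch (not part of this patch), a BPF_LSM_CGROUP program could call the newly exposed helper from one of the "unlocked" hooks roughly as below. The SEC("lsm_cgroup/...") prefix and the BPF_PROG() macro assume the companion libbpf support for this attach type; the hook choice, program name and socket option are hypothetical.

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

#ifndef SOL_SOCKET
#define SOL_SOCKET 1	/* asm-generic value on most architectures */
#endif
#ifndef SO_KEEPALIVE
#define SO_KEEPALIVE 9
#endif

char _license[] SEC("license") = "GPL";

SEC("lsm_cgroup/socket_post_create")
int BPF_PROG(set_keepalive, struct socket *sock, int family, int type,
	     int protocol, int kern)
{
	int one = 1;

	/* socket_post_create is in bpf_lsm_unlocked_sockopt_hooks, so the
	 * verifier resolves bpf_setsockopt() to the unlocked variant here.
	 */
	if (sock->sk)
		bpf_setsockopt(sock->sk, SOL_SOCKET, SO_KEEPALIVE,
			       &one, sizeof(one));
	return 1;	/* allow */
}

A program attached to a hook that is in neither BTF ID set simply gets NULL back from bpf_lsm_func_proto() and fails verification if it tries to call the helper.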
Reviewed-by: Martin KaFai Lau Signed-off-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20220628174314.1216643-7-sdf@google.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/bpf_lsm.c | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 83aa431dd52e..d469b7f3deef 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -45,6 +45,24 @@ BTF_ID(func, bpf_lsm_sk_alloc_security) BTF_ID(func, bpf_lsm_sk_free_security) BTF_SET_END(bpf_lsm_current_hooks) +/* List of LSM hooks that trigger while the socket is properly locked. + */ +BTF_SET_START(bpf_lsm_locked_sockopt_hooks) +BTF_ID(func, bpf_lsm_socket_sock_rcv_skb) +BTF_ID(func, bpf_lsm_sock_graft) +BTF_ID(func, bpf_lsm_inet_csk_clone) +BTF_ID(func, bpf_lsm_inet_conn_established) +BTF_SET_END(bpf_lsm_locked_sockopt_hooks) + +/* List of LSM hooks that trigger while the socket is _not_ locked, + * but it's ok to call bpf_{g,s}etsockopt because the socket is still + * in the early init phase. + */ +BTF_SET_START(bpf_lsm_unlocked_sockopt_hooks) +BTF_ID(func, bpf_lsm_socket_post_create) +BTF_ID(func, bpf_lsm_socket_socketpair) +BTF_SET_END(bpf_lsm_unlocked_sockopt_hooks) + void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog, bpf_func_t *bpf_func) { @@ -201,6 +219,26 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_get_retval: return prog->expected_attach_type == BPF_LSM_CGROUP ? &bpf_get_retval_proto : NULL; + case BPF_FUNC_setsockopt: + if (prog->expected_attach_type != BPF_LSM_CGROUP) + return NULL; + if (btf_id_set_contains(&bpf_lsm_locked_sockopt_hooks, + prog->aux->attach_btf_id)) + return &bpf_sk_setsockopt_proto; + if (btf_id_set_contains(&bpf_lsm_unlocked_sockopt_hooks, + prog->aux->attach_btf_id)) + return &bpf_unlocked_sk_setsockopt_proto; + return NULL; + case BPF_FUNC_getsockopt: + if (prog->expected_attach_type != BPF_LSM_CGROUP) + return NULL; + if (btf_id_set_contains(&bpf_lsm_locked_sockopt_hooks, + prog->aux->attach_btf_id)) + return &bpf_sk_getsockopt_proto; + if (btf_id_set_contains(&bpf_lsm_unlocked_sockopt_hooks, + prog->aux->attach_btf_id)) + return &bpf_unlocked_sk_getsockopt_proto; + return NULL; default: return tracing_prog_func_proto(func_id, prog); } -- cgit From f163f0302ab69722c052519f4014814bf10026a9 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 8 Jun 2022 16:40:21 +0200 Subject: context_tracking: Rename context_tracking_user_enter/exit() to user_enter/exit_callable() context_tracking_user_enter() and context_tracking_user_exit() are ASM callable versions of user_enter() and user_exit() for architectures that didn't manage to check the context tracking static key from ASM. Change those function names to better reflect their purpose. [ frederic: Apply Max Filippov feedback. ] Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Neeraj Upadhyay Cc: Uladzislau Rezki Cc: Joel Fernandes Cc: Boqun Feng Cc: Nicolas Saenz Julienne Cc: Marcelo Tosatti Cc: Xiongfeng Wang Cc: Yu Liao Cc: Phil Auld Cc: Paul Gortmaker Cc: Alex Belits Signed-off-by: Paul E. 
McKenney Reviewed-by: Nicolas Saenz Julienne Tested-by: Nicolas Saenz Julienne --- kernel/context_tracking.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 88c60ab39fbb..8f7dd5799bda 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -134,19 +134,22 @@ void context_tracking_enter(enum ctx_state state) NOKPROBE_SYMBOL(context_tracking_enter); EXPORT_SYMBOL_GPL(context_tracking_enter); -/* - * OBSOLETE: - * This function should be noinstr but it unsafely calls local_irq_restore(), - * involving illegal RCU uses through tracing and lockdep. +/** + * user_enter_callable() - Unfortunate ASM callable version of user_enter() for + * archs that didn't manage to check the context tracking + * static key from low level code. + * + * This OBSOLETE function should be noinstr but it unsafely calls + * local_irq_restore(), involving illegal RCU uses through tracing and lockdep. * This is unlikely to be fixed as this function is obsolete. The preferred * way is to call user_enter_irqoff(). It should be the arch entry code * responsibility to call into context tracking with IRQs disabled. */ -void context_tracking_user_enter(void) +void user_enter_callable(void) { user_enter(); } -NOKPROBE_SYMBOL(context_tracking_user_enter); +NOKPROBE_SYMBOL(user_enter_callable); /** * __ct_user_exit - Inform the context tracking that the CPU is @@ -208,19 +211,22 @@ void context_tracking_exit(enum ctx_state state) NOKPROBE_SYMBOL(context_tracking_exit); EXPORT_SYMBOL_GPL(context_tracking_exit); -/* - * OBSOLETE: - * This function should be noinstr but it unsafely calls local_irq_save(), +/** + * user_exit_callable() - Unfortunate ASM callable version of user_exit() for + * archs that didn't manage to check the context tracking + * static key from low level code. + * + * This OBSOLETE function should be noinstr but it unsafely calls local_irq_save(), * involving illegal RCU uses through tracing and lockdep. This is unlikely * to be fixed as this function is obsolete. The preferred way is to call * user_exit_irqoff(). It should be the arch entry code responsibility to * call into context tracking with IRQs disabled. */ -void context_tracking_user_exit(void) +void user_exit_callable(void) { user_exit(); } -NOKPROBE_SYMBOL(context_tracking_user_exit); +NOKPROBE_SYMBOL(user_exit_callable); void __init context_tracking_cpu_set(int cpu) { -- cgit From fe98db1c6d1ad7349e61a5a2766ad64975bc9ae4 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 8 Jun 2022 16:40:22 +0200 Subject: context_tracking: Rename context_tracking_enter/exit() to ct_user_enter/exit() context_tracking_enter() and context_tracking_exit() have confusing names that don't explain the fact they are referring to user/guest state. Use more self-explanatory names and shrink to the new context tracking prefix instead. Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Neeraj Upadhyay Cc: Uladzislau Rezki Cc: Joel Fernandes Cc: Boqun Feng Cc: Nicolas Saenz Julienne Cc: Marcelo Tosatti Cc: Xiongfeng Wang Cc: Yu Liao Cc: Phil Auld Cc: Paul Gortmaker Cc: Alex Belits Signed-off-by: Paul E. 
McKenney Reviewed-by: Nicolas Saenz Julienne Tested-by: Nicolas Saenz Julienne --- kernel/context_tracking.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 8f7dd5799bda..590c920ad57f 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -112,7 +112,7 @@ EXPORT_SYMBOL_GPL(__ct_user_enter); * or context_tracking_guest_enter(). It should be the arch entry code * responsibility to call into context tracking with IRQs disabled. */ -void context_tracking_enter(enum ctx_state state) +void ct_user_enter(enum ctx_state state) { unsigned long flags; @@ -131,8 +131,8 @@ void context_tracking_enter(enum ctx_state state) __ct_user_enter(state); local_irq_restore(flags); } -NOKPROBE_SYMBOL(context_tracking_enter); -EXPORT_SYMBOL_GPL(context_tracking_enter); +NOKPROBE_SYMBOL(ct_user_enter); +EXPORT_SYMBOL_GPL(ct_user_enter); /** * user_enter_callable() - Unfortunate ASM callable version of user_enter() for @@ -197,7 +197,7 @@ EXPORT_SYMBOL_GPL(__ct_user_exit); * or context_tracking_guest_exit(). It should be the arch entry code * responsibility to call into context tracking with IRQs disabled. */ -void context_tracking_exit(enum ctx_state state) +void ct_user_exit(enum ctx_state state) { unsigned long flags; @@ -208,8 +208,8 @@ void context_tracking_exit(enum ctx_state state) __ct_user_exit(state); local_irq_restore(flags); } -NOKPROBE_SYMBOL(context_tracking_exit); -EXPORT_SYMBOL_GPL(context_tracking_exit); +NOKPROBE_SYMBOL(ct_user_exit); +EXPORT_SYMBOL_GPL(ct_user_exit); /** * user_exit_callable() - Unfortunate ASM callable version of user_exit() for -- cgit From 2a0aafce963dab996b843f6cdb49237b0ae4a118 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 8 Jun 2022 16:40:23 +0200 Subject: context_tracking: Rename context_tracking_cpu_set() to ct_cpu_track_user() context_tracking_cpu_set() is called in order to tell a CPU to track user/kernel transitions. Since context tracking is going to expand in to also track transitions from/to idle/IRQ/NMIs, the scope of this function name becomes too broad and needs to be made more specific. Also shorten the prefix to align with the new namespace. Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Neeraj Upadhyay Cc: Uladzislau Rezki Cc: Joel Fernandes Cc: Boqun Feng Cc: Nicolas Saenz Julienne Cc: Marcelo Tosatti Cc: Xiongfeng Wang Cc: Yu Liao Cc: Phil Auld Cc: Paul Gortmaker Cc: Alex Belits Signed-off-by: Paul E. 
McKenney Reviewed-by: Nicolas Saenz Julienne Tested-by: Nicolas Saenz Julienne --- kernel/context_tracking.c | 4 ++-- kernel/time/tick-sched.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 590c920ad57f..d361bd52e4e1 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -228,7 +228,7 @@ void user_exit_callable(void) } NOKPROBE_SYMBOL(user_exit_callable); -void __init context_tracking_cpu_set(int cpu) +void __init ct_cpu_track_user(int cpu) { static __initdata bool initialized = false; @@ -258,6 +258,6 @@ void __init context_tracking_init(void) int cpu; for_each_possible_cpu(cpu) - context_tracking_cpu_set(cpu); + ct_cpu_track_user(cpu); } #endif diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 58a11f859ac7..de192dcff828 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -571,7 +571,7 @@ void __init tick_nohz_init(void) } for_each_cpu(cpu, tick_nohz_full_mask) - context_tracking_cpu_set(cpu); + ct_cpu_track_user(cpu); ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "kernel/nohz:predown", NULL, -- cgit From 24a9c54182b3758801b8ca6c8c237cc2ff654732 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 8 Jun 2022 16:40:24 +0200 Subject: context_tracking: Split user tracking Kconfig Context tracking is going to be used not only to track user transitions but also idle/IRQs/NMIs. The user tracking part will then become a separate feature. Prepare Kconfig for that. [ frederic: Apply Max Filippov feedback. ] Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Neeraj Upadhyay Cc: Uladzislau Rezki Cc: Joel Fernandes Cc: Boqun Feng Cc: Nicolas Saenz Julienne Cc: Marcelo Tosatti Cc: Xiongfeng Wang Cc: Yu Liao Cc: Phil Auld Cc: Paul Gortmaker Cc: Alex Belits Signed-off-by: Paul E. McKenney Reviewed-by: Nicolas Saenz Julienne Tested-by: Nicolas Saenz Julienne --- kernel/context_tracking.c | 6 +++++- kernel/sched/core.c | 2 +- kernel/time/Kconfig | 31 ++++++++++++++++++++----------- 3 files changed, 26 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index d361bd52e4e1..f3dec1be2bf6 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -22,6 +22,8 @@ #include #include +#ifdef CONFIG_CONTEXT_TRACKING_USER + #define CREATE_TRACE_POINTS #include @@ -252,7 +254,7 @@ void __init ct_cpu_track_user(int cpu) initialized = true; } -#ifdef CONFIG_CONTEXT_TRACKING_FORCE +#ifdef CONFIG_CONTEXT_TRACKING_USER_FORCE void __init context_tracking_init(void) { int cpu; @@ -261,3 +263,5 @@ void __init context_tracking_init(void) ct_cpu_track_user(cpu); } #endif + +#endif /* #ifdef CONFIG_CONTEXT_TRACKING_USER */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index da0bf6fe9ecd..883167a57bf9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6559,7 +6559,7 @@ void __sched schedule_idle(void) } while (need_resched()); } -#if defined(CONFIG_CONTEXT_TRACKING) && !defined(CONFIG_HAVE_CONTEXT_TRACKING_OFFSTACK) +#if defined(CONFIG_CONTEXT_TRACKING_USER) && !defined(CONFIG_HAVE_CONTEXT_TRACKING_USER_OFFSTACK) asmlinkage __visible void __sched schedule_user(void) { /* diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 27b7868b5c30..41f99bcfe9e6 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -73,6 +73,9 @@ config TIME_KUNIT_TEST If unsure, say N. 
+config CONTEXT_TRACKING + bool + if GENERIC_CLOCKEVENTS menu "Timers subsystem" @@ -111,7 +114,7 @@ config NO_HZ_FULL # NO_HZ_COMMON dependency # We need at least one periodic CPU for timekeeping depends on SMP - depends on HAVE_CONTEXT_TRACKING + depends on HAVE_CONTEXT_TRACKING_USER # VIRT_CPU_ACCOUNTING_GEN dependency depends on HAVE_VIRT_CPU_ACCOUNTING_GEN select NO_HZ_COMMON @@ -137,31 +140,37 @@ config NO_HZ_FULL endchoice -config CONTEXT_TRACKING - bool +config CONTEXT_TRACKING_USER + bool + depends on HAVE_CONTEXT_TRACKING_USER + select CONTEXT_TRACKING + help + Track transitions between kernel and user on behalf of RCU and + tickless cputime accounting. The former case relies on context + tracking to enter/exit RCU extended quiescent states. -config CONTEXT_TRACKING_FORCE - bool "Force context tracking" - depends on CONTEXT_TRACKING +config CONTEXT_TRACKING_USER_FORCE + bool "Force user context tracking" + depends on CONTEXT_TRACKING_USER default y if !NO_HZ_FULL help The major pre-requirement for full dynticks to work is to - support the context tracking subsystem. But there are also + support the user context tracking subsystem. But there are also other dependencies to provide in order to make the full dynticks working. This option stands for testing when an arch implements the - context tracking backend but doesn't yet fulfill all the + user context tracking backend but doesn't yet fulfill all the requirements to make the full dynticks feature working. Without the full dynticks, there is no way to test the support - for context tracking and the subsystems that rely on it: RCU + for user context tracking and the subsystems that rely on it: RCU userspace extended quiescent state and tickless cputime accounting. This option copes with the absence of the full - dynticks subsystem by forcing the context tracking on all + dynticks subsystem by forcing the user context tracking on all CPUs in the system. Say Y only if you're working on the development of an - architecture backend for the context tracking. + architecture backend for the user context tracking. Say N otherwise, this option brings an overhead that you don't want in production. -- cgit From c381d02b2fd5f82d2207db1b9b25ff60d0d9c27c Mon Sep 17 00:00:00 2001 From: Yuwei Wang Date: Wed, 29 Jun 2022 08:48:31 +0000 Subject: sysctl: add proc_dointvec_ms_jiffies_minmax add proc_dointvec_ms_jiffies_minmax to fit read msecs value to jiffies with a limited range of values Signed-off-by: Yuwei Wang Signed-off-by: Paolo Abeni --- kernel/sysctl.c | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index e52b6e372c60..85c92e2c2570 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1237,6 +1237,30 @@ static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp, return 0; } +static int do_proc_dointvec_ms_jiffies_minmax_conv(bool *negp, unsigned long *lvalp, + int *valp, int write, void *data) +{ + int tmp, ret; + struct do_proc_dointvec_minmax_conv_param *param = data; + /* + * If writing, first do so via a temporary local int so we can + * bounds-check it before touching *valp. + */ + int *ip = write ? 
&tmp : valp; + + ret = do_proc_dointvec_ms_jiffies_conv(negp, lvalp, ip, write, data); + if (ret) + return ret; + + if (write) { + if ((param->min && *param->min > tmp) || + (param->max && *param->max < tmp)) + return -EINVAL; + *valp = tmp; + } + return 0; +} + /** * proc_dointvec_jiffies - read a vector of integers as seconds * @table: the sysctl table @@ -1259,6 +1283,17 @@ int proc_dointvec_jiffies(struct ctl_table *table, int write, do_proc_dointvec_jiffies_conv,NULL); } +int proc_dointvec_ms_jiffies_minmax(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct do_proc_dointvec_minmax_conv_param param = { + .min = (int *) table->extra1, + .max = (int *) table->extra2, + }; + return do_proc_dointvec(table, write, buffer, lenp, ppos, + do_proc_dointvec_ms_jiffies_minmax_conv, ¶m); +} + /** * proc_dointvec_userhz_jiffies - read a vector of integers as 1/USER_HZ seconds * @table: the sysctl table @@ -1523,6 +1558,12 @@ int proc_dointvec_jiffies(struct ctl_table *table, int write, return -ENOSYS; } +int proc_dointvec_ms_jiffies_minmax(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { -- cgit From 261e224d6a5c43e2bb8a07b7662f9b4ec425cfec Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Thu, 30 Jun 2022 19:12:29 +0000 Subject: pm/sleep: Add PM_USERSPACE_AUTOSLEEP Kconfig Systems that initiate frequent suspend/resume from userspace can make the kernel aware by enabling PM_USERSPACE_AUTOSLEEP config. This allows for certain sleep-sensitive code (wireguard/rng) to decide on what preparatory work should be performed (or not) in their pm_notification callbacks. This patch was prompted by the discussion at [1] which attempts to remove CONFIG_ANDROID that currently guards these code paths. [1] https://lore.kernel.org/r/20220629150102.1582425-1-hch@lst.de/ Suggested-by: Jason A. Donenfeld Acked-by: Jason A. Donenfeld Signed-off-by: Kalesh Singh Link: https://lore.kernel.org/r/20220630191230.235306-1-kaleshsingh@google.com Signed-off-by: Greg Kroah-Hartman --- kernel/power/Kconfig | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'kernel') diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index a12779650f15..60a1d3051cc7 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -143,6 +143,26 @@ config PM_AUTOSLEEP Allow the kernel to trigger a system transition into a global sleep state automatically whenever there are no active wakeup sources. +config PM_USERSPACE_AUTOSLEEP + bool "Userspace opportunistic sleep" + depends on PM_SLEEP + help + Notify kernel of aggressive userspace autosleep power management policy. + + This option changes the behavior of various sleep-sensitive code to deal + with frequent userspace-initiated transitions into a global sleep state. + + Saying Y here, disables code paths that most users really should keep + enabled. In particular, only enable this if it is very common to be + asleep/awake for very short periods of time (<= 2 seconds). + + Only platforms, such as Android, that implement opportunistic sleep from + a userspace power manager service should enable this option; and not + other machines. Therefore, you should say N here, unless you are + extremely certain that this is what you want. The option otherwise has + bad, undesirable effects, and should not be enabled just for fun. 
+ + config PM_WAKELOCKS bool "User space wakeup sources interface" depends on PM_SLEEP -- cgit From 1045a06724f322ed61f1ffb994427c7bdbe64647 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 29 Jun 2022 17:01:02 +0200 Subject: remove CONFIG_ANDROID The ANDROID config symbol is only used to guard the binder config symbol and to inject completely random config changes. Remove it as it is obviously a bad idea. Acked-by: Paul E. McKenney Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220629150102.1582425-2-hch@lst.de Signed-off-by: Greg Kroah-Hartman --- kernel/configs/android-base.config | 1 - kernel/rcu/Kconfig.debug | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/configs/android-base.config b/kernel/configs/android-base.config index eb701b2ac72f..44b0f0146a3f 100644 --- a/kernel/configs/android-base.config +++ b/kernel/configs/android-base.config @@ -7,7 +7,6 @@ # CONFIG_OABI_COMPAT is not set # CONFIG_SYSVIPC is not set # CONFIG_USELIB is not set -CONFIG_ANDROID=y CONFIG_ANDROID_BINDER_IPC=y CONFIG_ANDROID_BINDER_DEVICES=binder,hwbinder,vndbinder CONFIG_ANDROID_LOW_MEMORY_KILLER=y diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index 9b64e55d4f61..e875f4f88965 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -86,8 +86,7 @@ config RCU_EXP_CPU_STALL_TIMEOUT int "Expedited RCU CPU stall timeout in milliseconds" depends on RCU_STALL_COMMON range 0 21000 - default 20 if ANDROID - default 0 if !ANDROID + default 0 help If a given expedited RCU grace period extends more than the specified number of milliseconds, a CPU stall warning is printed. -- cgit From 2852ca7fba9f77b204f0fe953b31fadd0057c936 Mon Sep 17 00:00:00 2001 From: David Gow Date: Fri, 1 Jul 2022 16:47:41 +0800 Subject: panic: Taint kernel if tests are run Most in-kernel tests (such as KUnit tests) are not supposed to run on production systems: they may do deliberately illegal things to trigger errors, and have security implications (for example, KUnit assertions will often deliberately leak kernel addresses). Add a new taint type, TAINT_TEST to signal that a test has been run. This will be printed as 'N' (originally for kuNit, as every other sensible letter was taken.) This should discourage people from running these tests on production systems, and to make it easier to tell if tests have been run accidentally (by loading the wrong configuration, etc.) Acked-by: Luis Chamberlain Reviewed-by: Brendan Higgins Signed-off-by: David Gow Signed-off-by: Shuah Khan --- kernel/panic.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index a3c758dba15a..6b3369e21026 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -428,6 +428,7 @@ const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = { [ TAINT_LIVEPATCH ] = { 'K', ' ', true }, [ TAINT_AUX ] = { 'X', ' ', true }, [ TAINT_RANDSTRUCT ] = { 'T', ' ', true }, + [ TAINT_TEST ] = { 'N', ' ', true }, }; /** -- cgit From e33c267ab70de4249d22d7eab1cc7d68a889bac2 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 31 May 2022 20:22:24 -0700 Subject: mm: shrinkers: provide shrinkers with names Currently shrinkers are anonymous objects. For debugging purposes they can be identified by count/scan function names, but it's not always useful: e.g. for superblock's shrinkers it's nice to have at least an idea of to which superblock the shrinker belongs. This commit adds names to shrinkers. 
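As a sketch of the resulting API (illustrative only; the "my-cache" shrinker and its no-op callbacks are hypothetical), a caller now passes a printf-style name at registration time:

#include <linux/shrinker.h>
#include <linux/init.h>

static unsigned long my_cache_count(struct shrinker *s,
				    struct shrink_control *sc)
{
	return 0;		/* nothing to reclaim in this sketch */
}

static unsigned long my_cache_scan(struct shrinker *s,
				   struct shrink_control *sc)
{
	return SHRINK_STOP;
}

static struct shrinker my_cache_shrinker = {
	.count_objects	= my_cache_count,
	.scan_objects	= my_cache_scan,
	.seeks		= DEFAULT_SEEKS,
};

static int __init my_cache_init(void)
{
	/* with shrinker debugfs enabled, shows up as shrinker/my-cache-<id> */
	return register_shrinker(&my_cache_shrinker, "my-cache");
}
late_initcall(my_cache_init);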
register_shrinker() and prealloc_shrinker() functions are extended to take a format and arguments to master a name. In some cases it's not possible to determine a good name at the time when a shrinker is allocated. For such cases shrinker_debugfs_rename() is provided. The expected format is: -[:]- For some shrinkers an instance can be encoded as (MAJOR:MINOR) pair. After this change the shrinker debugfs directory looks like: $ cd /sys/kernel/debug/shrinker/ $ ls dquota-cache-16 sb-devpts-28 sb-proc-47 sb-tmpfs-42 mm-shadow-18 sb-devtmpfs-5 sb-proc-48 sb-tmpfs-43 mm-zspool:zram0-34 sb-hugetlbfs-17 sb-pstore-31 sb-tmpfs-44 rcu-kfree-0 sb-hugetlbfs-33 sb-rootfs-2 sb-tmpfs-49 sb-aio-20 sb-iomem-12 sb-securityfs-6 sb-tracefs-13 sb-anon_inodefs-15 sb-mqueue-21 sb-selinuxfs-22 sb-xfs:vda1-36 sb-bdev-3 sb-nsfs-4 sb-sockfs-8 sb-zsmalloc-19 sb-bpf-32 sb-pipefs-14 sb-sysfs-26 thp-deferred_split-10 sb-btrfs:vda2-24 sb-proc-25 sb-tmpfs-1 thp-zero-9 sb-cgroup2-30 sb-proc-39 sb-tmpfs-27 xfs-buf:vda1-37 sb-configfs-23 sb-proc-41 sb-tmpfs-29 xfs-inodegc:vda1-38 sb-dax-11 sb-proc-45 sb-tmpfs-35 sb-debugfs-7 sb-proc-46 sb-tmpfs-40 [roman.gushchin@linux.dev: fix build warnings] Link: https://lkml.kernel.org/r/Yr+ZTnLb9lJk6fJO@castle Reported-by: kernel test robot Link: https://lkml.kernel.org/r/20220601032227.4076670-4-roman.gushchin@linux.dev Signed-off-by: Roman Gushchin Cc: Christophe JAILLET Cc: Dave Chinner Cc: Hillf Danton Cc: Kent Overstreet Cc: Muchun Song Signed-off-by: Andrew Morton --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index c25ba442044a..4b3bf6ebb1eb 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4884,7 +4884,7 @@ static void __init kfree_rcu_batch_init(void) INIT_DELAYED_WORK(&krcp->page_cache_work, fill_page_cache_func); krcp->initialized = true; } - if (register_shrinker(&kfree_rcu_shrinker)) + if (register_shrinker(&kfree_rcu_shrinker, "rcu-kfree")) pr_err("Failed to register kfree_rcu() shrinker!\n"); } -- cgit From e67198cc05b8ecbb7b8e2d8ef9fb5c8d26821873 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 8 Jun 2022 16:40:25 +0200 Subject: context_tracking: Take idle eqs entrypoints over RCU The RCU dynticks counter is going to be merged into the context tracking subsystem. Start with moving the idle extended quiescent states entrypoints to context tracking. For now those are dumb redirections to existing RCU calls. [ paulmck: Apply kernel test robot feedback. ] Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Neeraj Upadhyay Cc: Uladzislau Rezki Cc: Joel Fernandes Cc: Boqun Feng Cc: Nicolas Saenz Julienne Cc: Marcelo Tosatti Cc: Xiongfeng Wang Cc: Yu Liao Cc: Phil Auld Cc: Paul Gortmaker Cc: Alex Belits Signed-off-by: Paul E. 
McKenney Reviewed-by: Nicolas Saenz Julienne Tested-by: Nicolas Saenz Julienne --- kernel/context_tracking.c | 15 +++++++++++++++ kernel/locking/lockdep.c | 2 +- kernel/rcu/Kconfig | 2 ++ kernel/rcu/tree.c | 2 -- kernel/rcu/update.c | 2 +- kernel/sched/idle.c | 10 +++++----- kernel/sched/sched.h | 1 + kernel/time/Kconfig | 6 ++++++ 8 files changed, 31 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index f3dec1be2bf6..c0b3798d4e94 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -22,6 +22,21 @@ #include #include + +#ifdef CONFIG_CONTEXT_TRACKING_IDLE +noinstr void ct_idle_enter(void) +{ + rcu_idle_enter(); +} +EXPORT_SYMBOL_GPL(ct_idle_enter); + +void ct_idle_exit(void) +{ + rcu_idle_exit(); +} +EXPORT_SYMBOL_GPL(ct_idle_exit); +#endif /* #ifdef CONFIG_CONTEXT_TRACKING_IDLE */ + #ifdef CONFIG_CONTEXT_TRACKING_USER #define CREATE_TRACE_POINTS diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index f06b91ca6482..5ea690cb4f7a 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -6570,7 +6570,7 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) /* * If a CPU is in the RCU-free window in idle (ie: in the section - * between rcu_idle_enter() and rcu_idle_exit(), then RCU + * between ct_idle_enter() and ct_idle_exit(), then RCU * considers that CPU to be in an "extended quiescent state", * which means that RCU will be completely ignoring that CPU. * Therefore, rcu_read_lock() and friends have absolutely no diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index 1c630e573548..3fa24e63d6f9 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -8,6 +8,8 @@ menu "RCU Subsystem" config TREE_RCU bool default y if SMP + # Dynticks-idle tracking + select CONTEXT_TRACKING_IDLE help This option selects the RCU implementation that is designed for very large SMP system with hundreds or diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 9a5edab5558c..051fed0844b6 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -664,7 +664,6 @@ void noinstr rcu_idle_enter(void) WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !raw_irqs_disabled()); rcu_eqs_enter(false); } -EXPORT_SYMBOL_GPL(rcu_idle_enter); #ifdef CONFIG_NO_HZ_FULL @@ -904,7 +903,6 @@ void noinstr rcu_idle_exit(void) rcu_eqs_exit(false); raw_local_irq_restore(flags); } -EXPORT_SYMBOL_GPL(rcu_idle_exit); #ifdef CONFIG_NO_HZ_FULL /** diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index fc7fef575606..147214b2cd68 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -85,7 +85,7 @@ module_param(rcu_normal_after_boot, int, 0444); * and while lockdep is disabled. * * Note that if the CPU is in the idle loop from an RCU point of view (ie: - * that we are in the section between rcu_idle_enter() and rcu_idle_exit()) + * that we are in the section between ct_idle_enter() and ct_idle_exit()) * then rcu_read_lock_held() sets ``*ret`` to false even if the CPU did an * rcu_read_lock(). 
The reason for this is that RCU ignores CPUs that are * in such a section, considering these as in extended quiescent state, diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 328cccbee444..f26ab2675f7d 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -53,14 +53,14 @@ static noinline int __cpuidle cpu_idle_poll(void) { trace_cpu_idle(0, smp_processor_id()); stop_critical_timings(); - rcu_idle_enter(); + ct_idle_enter(); local_irq_enable(); while (!tif_need_resched() && (cpu_idle_force_poll || tick_check_broadcast_expired())) cpu_relax(); - rcu_idle_exit(); + ct_idle_exit(); start_critical_timings(); trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id()); @@ -98,12 +98,12 @@ void __cpuidle default_idle_call(void) * * Trace IRQs enable here, then switch off RCU, and have * arch_cpu_idle() use raw_local_irq_enable(). Note that - * rcu_idle_enter() relies on lockdep IRQ state, so switch that + * ct_idle_enter() relies on lockdep IRQ state, so switch that * last -- this is very similar to the entry code. */ trace_hardirqs_on_prepare(); lockdep_hardirqs_on_prepare(); - rcu_idle_enter(); + ct_idle_enter(); lockdep_hardirqs_on(_THIS_IP_); arch_cpu_idle(); @@ -116,7 +116,7 @@ void __cpuidle default_idle_call(void) */ raw_local_irq_disable(); lockdep_hardirqs_off(_THIS_IP_); - rcu_idle_exit(); + ct_idle_exit(); lockdep_hardirqs_on(_THIS_IP_); raw_local_irq_enable(); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 47b89a0fc6e5..0cfe2d0af294 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 41f99bcfe9e6..a41753be1a2b 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -76,6 +76,12 @@ config TIME_KUNIT_TEST config CONTEXT_TRACKING bool +config CONTEXT_TRACKING_IDLE + bool + select CONTEXT_TRACKING + help + Tracks idle state on behalf of RCU. + if GENERIC_CLOCKEVENTS menu "Timers subsystem" -- cgit From 6f0e6c1598b1a3d19fc30db86b6e26d6f881b43d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 8 Jun 2022 16:40:26 +0200 Subject: context_tracking: Take IRQ eqs entrypoints over RCU The RCU dynticks counter is going to be merged into the context tracking subsystem. Prepare with moving the IRQ extended quiescent states entrypoints to context tracking. For now those are dumb redirection to existing RCU calls. [ paulmck: Apply Stephen Rothwell feedback from -next. ] [ paulmck: Apply Nathan Chancellor feedback. ] Acked-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Neeraj Upadhyay Cc: Uladzislau Rezki Cc: Joel Fernandes Cc: Boqun Feng Cc: Nicolas Saenz Julienne Cc: Marcelo Tosatti Cc: Xiongfeng Wang Cc: Yu Liao Cc: Phil Auld Cc: Paul Gortmaker Cc: Alex Belits Signed-off-by: Paul E. 
McKenney Reviewed-by: Nicolas Saenz Julienne Tested-by: Nicolas Saenz Julienne --- kernel/cfi.c | 4 ++-- kernel/context_tracking.c | 24 ++++++++++++++++++++++-- kernel/cpu_pm.c | 8 ++++---- kernel/entry/common.c | 12 ++++++------ kernel/softirq.c | 4 ++-- kernel/trace/trace.c | 6 +++--- 6 files changed, 39 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/cfi.c b/kernel/cfi.c index 08102d19ec15..2046276ee234 100644 --- a/kernel/cfi.c +++ b/kernel/cfi.c @@ -295,7 +295,7 @@ static inline cfi_check_fn find_check_fn(unsigned long ptr) rcu_idle = !rcu_is_watching(); if (rcu_idle) { local_irq_save(flags); - rcu_irq_enter(); + ct_irq_enter(); } if (IS_ENABLED(CONFIG_CFI_CLANG_SHADOW)) @@ -304,7 +304,7 @@ static inline cfi_check_fn find_check_fn(unsigned long ptr) fn = find_module_check_fn(ptr); if (rcu_idle) { - rcu_irq_exit(); + ct_irq_exit(); local_irq_restore(flags); } diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index c0b3798d4e94..72bd71a02c44 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -35,6 +35,26 @@ void ct_idle_exit(void) rcu_idle_exit(); } EXPORT_SYMBOL_GPL(ct_idle_exit); + +noinstr void ct_irq_enter(void) +{ + rcu_irq_enter(); +} + +noinstr void ct_irq_exit(void) +{ + rcu_irq_exit(); +} + +void ct_irq_enter_irqson(void) +{ + rcu_irq_enter_irqson(); +} + +void ct_irq_exit_irqson(void) +{ + rcu_irq_exit_irqson(); +} #endif /* #ifdef CONFIG_CONTEXT_TRACKING_IDLE */ #ifdef CONFIG_CONTEXT_TRACKING_USER @@ -90,7 +110,7 @@ void noinstr __ct_user_enter(enum ctx_state state) * At this stage, only low level arch entry code remains and * then we'll run in userspace. We can assume there won't be * any RCU read-side critical section until the next call to - * user_exit() or rcu_irq_enter(). Let's remove RCU's dependency + * user_exit() or ct_irq_enter(). Let's remove RCU's dependency * on the tick. */ if (state == CONTEXT_USER) { @@ -136,7 +156,7 @@ void ct_user_enter(enum ctx_state state) /* * Some contexts may involve an exception occuring in an irq, * leading to that nesting: - * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit() + * ct_irq_enter() rcu_user_exit() rcu_user_exit() ct_irq_exit() * This would mess up the dyntick_nesting count though. And rcu_irq_*() * helpers are enough to protect RCU uses inside the exception. So * just return immediately if we detect we are in an IRQ. diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c index 246efc74e3f3..ba4ba71facf9 100644 --- a/kernel/cpu_pm.c +++ b/kernel/cpu_pm.c @@ -35,11 +35,11 @@ static int cpu_pm_notify(enum cpu_pm_event event) * disfunctional in cpu idle. Copy RCU_NONIDLE code to let RCU know * this. 
*/ - rcu_irq_enter_irqson(); + ct_irq_enter_irqson(); rcu_read_lock(); ret = raw_notifier_call_chain(&cpu_pm_notifier.chain, event, NULL); rcu_read_unlock(); - rcu_irq_exit_irqson(); + ct_irq_exit_irqson(); return notifier_to_errno(ret); } @@ -49,11 +49,11 @@ static int cpu_pm_notify_robust(enum cpu_pm_event event_up, enum cpu_pm_event ev unsigned long flags; int ret; - rcu_irq_enter_irqson(); + ct_irq_enter_irqson(); raw_spin_lock_irqsave(&cpu_pm_notifier.lock, flags); ret = raw_notifier_call_chain_robust(&cpu_pm_notifier.chain, event_up, event_down, NULL); raw_spin_unlock_irqrestore(&cpu_pm_notifier.lock, flags); - rcu_irq_exit_irqson(); + ct_irq_exit_irqson(); return notifier_to_errno(ret); } diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 032f164abe7c..667ba5d581ff 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -321,7 +321,7 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) } /* - * If this entry hit the idle task invoke rcu_irq_enter() whether + * If this entry hit the idle task invoke ct_irq_enter() whether * RCU is watching or not. * * Interrupts can nest when the first interrupt invokes softirq @@ -332,12 +332,12 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) * not nested into another interrupt. * * Checking for rcu_is_watching() here would prevent the nesting - * interrupt to invoke rcu_irq_enter(). If that nested interrupt is + * interrupt to invoke ct_irq_enter(). If that nested interrupt is * the tick then rcu_flavor_sched_clock_irq() would wrongfully * assume that it is the first interrupt and eventually claim * quiescent state and end grace periods prematurely. * - * Unconditionally invoke rcu_irq_enter() so RCU state stays + * Unconditionally invoke ct_irq_enter() so RCU state stays * consistent. * * TINY_RCU does not support EQS, so let the compiler eliminate @@ -350,7 +350,7 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) * as in irqentry_enter_from_user_mode(). */ lockdep_hardirqs_off(CALLER_ADDR0); - rcu_irq_enter(); + ct_irq_enter(); instrumentation_begin(); trace_hardirqs_off_finish(); instrumentation_end(); @@ -418,7 +418,7 @@ noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state) trace_hardirqs_on_prepare(); lockdep_hardirqs_on_prepare(); instrumentation_end(); - rcu_irq_exit(); + ct_irq_exit(); lockdep_hardirqs_on(CALLER_ADDR0); return; } @@ -436,7 +436,7 @@ noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state) * was not watching on entry. */ if (state.exit_rcu) - rcu_irq_exit(); + ct_irq_exit(); } } diff --git a/kernel/softirq.c b/kernel/softirq.c index 9f0aef8aa9ff..c8a6913c067d 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -620,7 +620,7 @@ void irq_enter_rcu(void) */ void irq_enter(void) { - rcu_irq_enter(); + ct_irq_enter(); irq_enter_rcu(); } @@ -672,7 +672,7 @@ void irq_exit_rcu(void) void irq_exit(void) { __irq_exit_rcu(); - rcu_irq_exit(); + ct_irq_exit(); /* must be last! */ lockdep_hardirq_exit(); } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2c95992e2c71..fe78a6818126 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3107,15 +3107,15 @@ void __trace_stack(struct trace_array *tr, unsigned int trace_ctx, /* * When an NMI triggers, RCU is enabled via rcu_nmi_enter(), * but if the above rcu_is_watching() failed, then the NMI - * triggered someplace critical, and rcu_irq_enter() should + * triggered someplace critical, and ct_irq_enter() should * not be called from NMI. 
*/ if (unlikely(in_nmi())) return; - rcu_irq_enter_irqson(); + ct_irq_enter_irqson(); __ftrace_trace_stack(buffer, trace_ctx, skip, NULL); - rcu_irq_exit_irqson(); + ct_irq_exit_irqson(); } /** -- cgit From 493c1822825f00025d6754ec0632990a27edc6f8 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 8 Jun 2022 16:40:27 +0200 Subject: context_tracking: Take NMI eqs entrypoints over RCU The RCU dynticks counter is going to be merged into the context tracking subsystem. Prepare by moving the NMI extended quiescent states entrypoints to context tracking. For now those are dumb redirections to existing RCU calls. Acked-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Neeraj Upadhyay Cc: Uladzislau Rezki Cc: Joel Fernandes Cc: Boqun Feng Cc: Nicolas Saenz Julienne Cc: Marcelo Tosatti Cc: Xiongfeng Wang Cc: Yu Liao Cc: Phil Auld Cc: Paul Gortmaker Cc: Alex Belits Signed-off-by: Paul E. McKenney Reviewed-by: Nicolas Saenz Julienne Tested-by: Nicolas Saenz Julienne --- kernel/context_tracking.c | 10 ++++++++++ kernel/entry/common.c | 4 ++-- kernel/extable.c | 4 ++-- kernel/trace/trace.c | 2 +- 4 files changed, 15 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 72bd71a02c44..b8a731f20778 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -55,6 +55,16 @@ void ct_irq_exit_irqson(void) { rcu_irq_exit_irqson(); } + +noinstr void ct_nmi_enter(void) +{ + rcu_nmi_enter(); +} + +noinstr void ct_nmi_exit(void) +{ + rcu_nmi_exit(); +} #endif /* #ifdef CONFIG_CONTEXT_TRACKING_IDLE */ #ifdef CONFIG_CONTEXT_TRACKING_USER diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 667ba5d581ff..063068a9ea9b 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -449,7 +449,7 @@ irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs) __nmi_enter(); lockdep_hardirqs_off(CALLER_ADDR0); lockdep_hardirq_enter(); - rcu_nmi_enter(); + ct_nmi_enter(); instrumentation_begin(); trace_hardirqs_off_finish(); @@ -469,7 +469,7 @@ void noinstr irqentry_nmi_exit(struct pt_regs *regs, irqentry_state_t irq_state) } instrumentation_end(); - rcu_nmi_exit(); + ct_nmi_exit(); lockdep_hardirq_exit(); if (irq_state.lockdep) lockdep_hardirqs_on(CALLER_ADDR0); diff --git a/kernel/extable.c b/kernel/extable.c index bda5e9761541..71f482581cab 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -114,7 +114,7 @@ int kernel_text_address(unsigned long addr) /* Treat this like an NMI as it can happen anywhere */ if (no_rcu) - rcu_nmi_enter(); + ct_nmi_enter(); if (is_module_text_address(addr)) goto out; @@ -127,7 +127,7 @@ int kernel_text_address(unsigned long addr) ret = 0; out: if (no_rcu) - rcu_nmi_exit(); + ct_nmi_exit(); return ret; } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index fe78a6818126..5fc7f17f5ec7 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3105,7 +3105,7 @@ void __trace_stack(struct trace_array *tr, unsigned int trace_ctx, } /* - * When an NMI triggers, RCU is enabled via rcu_nmi_enter(), + * When an NMI triggers, RCU is enabled via ct_nmi_enter(), * but if the above rcu_is_watching() failed, then the NMI * triggered someplace critical, and ct_irq_enter() should * not be called from NMI.
-- cgit From 3864caafe7c66f01b188ffccb6a4215f3bf56292 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 8 Jun 2022 16:40:28 +0200 Subject: rcu/context-tracking: Remove rcu_irq_enter/exit() Now rcu_irq_enter/exit() is an unnecessary middle call between ct_irq_enter/exit() and rcu_nmi_enter/exit(). Take this opportunity to remove the former functions and move the comments above them to the new entrypoints. Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Neeraj Upadhyay Cc: Uladzislau Rezki Cc: Joel Fernandes Cc: Boqun Feng Cc: Nicolas Saenz Julienne Cc: Marcelo Tosatti Cc: Xiongfeng Wang Cc: Yu Liao Cc: Phil Auld Cc: Paul Gortmaker Cc: Alex Belits Signed-off-by: Paul E. McKenney Reviewed-by: Nicolas Saenz Julienne Tested-by: Nicolas Saenz Julienne --- kernel/context_tracking.c | 71 +++++++++++++++++++++++++++++++++++++--- kernel/rcu/tree.c | 83 ----------------------------------------------- 2 files changed, 67 insertions(+), 87 deletions(-) (limited to 'kernel') diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index b8a731f20778..c0d86dac98f1 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -36,24 +36,87 @@ void ct_idle_exit(void) } EXPORT_SYMBOL_GPL(ct_idle_exit); +/** + * ct_irq_enter - inform RCU that current CPU is entering irq away from idle + * + * Enter an interrupt handler, which might possibly result in exiting + * idle mode, in other words, entering the mode in which read-side critical + * sections can occur. The caller must have disabled interrupts. + * + * Note that the Linux kernel is fully capable of entering an interrupt + * handler that it never exits, for example when doing upcalls to user mode! + * This code assumes that the idle loop never does upcalls to user mode. + * If your architecture's idle loop does do upcalls to user mode (or does + * anything else that results in unbalanced calls to the irq_enter() and + * irq_exit() functions), RCU will give you what you deserve, good and hard. + * But very infrequently and irreproducibly. + * + * Use things like work queues to work around this limitation. + * + * You have been warned. + * + * If you add or remove a call to ct_irq_enter(), be sure to test with + * CONFIG_RCU_EQS_DEBUG=y. + */ noinstr void ct_irq_enter(void) { - rcu_irq_enter(); + lockdep_assert_irqs_disabled(); + ct_nmi_enter(); } +/** + * ct_irq_exit - inform RCU that current CPU is exiting irq towards idle + * + * Exit from an interrupt handler, which might possibly result in entering + * idle mode, in other words, leaving the mode in which read-side critical + * sections can occur. The caller must have disabled interrupts. + * + * This code assumes that the idle loop never does anything that might + * result in unbalanced calls to irq_enter() and irq_exit(). If your + * architecture's idle loop violates this assumption, RCU will give you what + * you deserve, good and hard. But very infrequently and irreproducibly. + * + * Use things like work queues to work around this limitation. + * + * You have been warned. + * + * If you add or remove a call to ct_irq_exit(), be sure to test with + * CONFIG_RCU_EQS_DEBUG=y. + */ noinstr void ct_irq_exit(void) { - rcu_irq_exit(); + lockdep_assert_irqs_disabled(); + ct_nmi_exit(); } +/* + * Wrapper for ct_irq_enter() where interrupts are enabled. + * + * If you add or remove a call to ct_irq_enter_irqson(), be sure to test + * with CONFIG_RCU_EQS_DEBUG=y.
+ */ void ct_irq_enter_irqson(void) { - rcu_irq_enter_irqson(); + unsigned long flags; + + local_irq_save(flags); + ct_irq_enter(); + local_irq_restore(flags); } +/* + * Wrapper for ct_irq_exit() where interrupts are enabled. + * + * If you add or remove a call to ct_irq_exit_irqson(), be sure to test + * with CONFIG_RCU_EQS_DEBUG=y. + */ void ct_irq_exit_irqson(void) { - rcu_irq_exit_irqson(); + unsigned long flags; + + local_irq_save(flags); + ct_irq_exit(); + local_irq_restore(flags); } noinstr void ct_nmi_enter(void) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 051fed0844b6..75b433dba427 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -789,31 +789,6 @@ noinstr void rcu_nmi_exit(void) rcu_dynticks_task_enter(); } -/** - * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle - * - * Exit from an interrupt handler, which might possibly result in entering - * idle mode, in other words, leaving the mode in which read-side critical - * sections can occur. The caller must have disabled interrupts. - * - * This code assumes that the idle loop never does anything that might - * result in unbalanced calls to irq_enter() and irq_exit(). If your - * architecture's idle loop violates this assumption, RCU will give you what - * you deserve, good and hard. But very infrequently and irreproducibly. - * - * Use things like work queues to work around this limitation. - * - * You have been warned. - * - * If you add or remove a call to rcu_irq_exit(), be sure to test with - * CONFIG_RCU_EQS_DEBUG=y. - */ -void noinstr rcu_irq_exit(void) -{ - lockdep_assert_irqs_disabled(); - rcu_nmi_exit(); -} - #ifdef CONFIG_PROVE_RCU /** * rcu_irq_exit_check_preempt - Validate that scheduling is possible @@ -832,21 +807,6 @@ void rcu_irq_exit_check_preempt(void) } #endif /* #ifdef CONFIG_PROVE_RCU */ -/* - * Wrapper for rcu_irq_exit() where interrupts are enabled. - * - * If you add or remove a call to rcu_irq_exit_irqson(), be sure to test - * with CONFIG_RCU_EQS_DEBUG=y. - */ -void rcu_irq_exit_irqson(void) -{ - unsigned long flags; - - local_irq_save(flags); - rcu_irq_exit(); - local_irq_restore(flags); -} - /* * Exit an RCU extended quiescent state, which can be either the * idle loop or adaptive-tickless usermode execution. @@ -1041,49 +1001,6 @@ noinstr void rcu_nmi_enter(void) barrier(); } -/** - * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle - * - * Enter an interrupt handler, which might possibly result in exiting - * idle mode, in other words, entering the mode in which read-side critical - * sections can occur. The caller must have disabled interrupts. - * - * Note that the Linux kernel is fully capable of entering an interrupt - * handler that it never exits, for example when doing upcalls to user mode! - * This code assumes that the idle loop never does upcalls to user mode. - * If your architecture's idle loop does do upcalls to user mode (or does - * anything else that results in unbalanced calls to the irq_enter() and - * irq_exit() functions), RCU will give you what you deserve, good and hard. - * But very infrequently and irreproducibly. - * - * Use things like work queues to work around this limitation. - * - * You have been warned. - * - * If you add or remove a call to rcu_irq_enter(), be sure to test with - * CONFIG_RCU_EQS_DEBUG=y. - */ -noinstr void rcu_irq_enter(void) -{ - lockdep_assert_irqs_disabled(); - rcu_nmi_enter(); -} - -/* - * Wrapper for rcu_irq_enter() where interrupts are enabled. 
- * - * If you add or remove a call to rcu_irq_enter_irqson(), be sure to test - * with CONFIG_RCU_EQS_DEBUG=y. - */ -void rcu_irq_enter_irqson(void) -{ - unsigned long flags; - - local_irq_save(flags); - rcu_irq_enter(); - local_irq_restore(flags); -} - /* * Check to see if any future non-offloaded RCU-related work will need * to be done by the current CPU, even if none need be done immediately, -- cgit From 62e2412df4b90ae6706ce1f1a9649b789b2e44ef Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 8 Jun 2022 16:40:29 +0200 Subject: rcu/context_tracking: Move dynticks counter to context tracking In order to prepare for merging RCU dynticks counter into the context tracking state, move the rcu_data's dynticks field to the context tracking structure. It will later be mixed within the context tracking state itself. [ paulmck: Move enum ctx_state into global scope. ] Acked-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Neeraj Upadhyay Cc: Uladzislau Rezki Cc: Joel Fernandes Cc: Boqun Feng Cc: Nicolas Saenz Julienne Cc: Marcelo Tosatti Cc: Xiongfeng Wang Cc: Yu Liao Cc: Phil Auld Cc: Paul Gortmaker Cc: Alex Belits Signed-off-by: Paul E. McKenney Reviewed-by: Nicolas Saenz Julienne Tested-by: Nicolas Saenz Julienne --- kernel/context_tracking.c | 10 ++++++--- kernel/rcu/tree.c | 56 +++++++++++++++++++++++------------------------ kernel/rcu/tree.h | 1 - kernel/rcu/tree_exp.h | 2 +- kernel/rcu/tree_stall.h | 4 ++-- 5 files changed, 37 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index c0d86dac98f1..01abbcec52f7 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -23,6 +23,13 @@ #include +DEFINE_PER_CPU(struct context_tracking, context_tracking) = { +#ifdef CONFIG_CONTEXT_TRACKING_IDLE + .dynticks = ATOMIC_INIT(1), +#endif +}; +EXPORT_SYMBOL_GPL(context_tracking); + #ifdef CONFIG_CONTEXT_TRACKING_IDLE noinstr void ct_idle_enter(void) { @@ -138,9 +145,6 @@ noinstr void ct_nmi_exit(void) DEFINE_STATIC_KEY_FALSE(context_tracking_key); EXPORT_SYMBOL_GPL(context_tracking_key); -DEFINE_PER_CPU(struct context_tracking, context_tracking); -EXPORT_SYMBOL_GPL(context_tracking); - static noinstr bool context_tracking_recursion_enter(void) { int recursion; diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 75b433dba427..a471edc3d893 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -77,7 +77,6 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = { .dynticks_nesting = 1, .dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE, - .dynticks = ATOMIC_INIT(1), #ifdef CONFIG_RCU_NOCB_CPU .cblist.flags = SEGCBLIST_RCU_CORE, #endif @@ -268,7 +267,7 @@ void rcu_softirq_qs(void) */ static noinline noinstr unsigned long rcu_dynticks_inc(int incby) { - return arch_atomic_add_return(incby, this_cpu_ptr(&rcu_data.dynticks)); + return arch_atomic_add_return(incby, this_cpu_ptr(&context_tracking.dynticks)); } /* @@ -324,9 +323,7 @@ static noinstr void rcu_dynticks_eqs_exit(void) */ static void rcu_dynticks_eqs_online(void) { - struct rcu_data *rdp = this_cpu_ptr(&rcu_data); - - if (atomic_read(&rdp->dynticks) & 0x1) + if (ct_dynticks() & 0x1) return; rcu_dynticks_inc(1); } @@ -338,17 +335,17 @@ static void rcu_dynticks_eqs_online(void) */ static __always_inline bool rcu_dynticks_curr_cpu_in_eqs(void) { - return !(arch_atomic_read(this_cpu_ptr(&rcu_data.dynticks)) & 0x1); + return !(arch_atomic_read(this_cpu_ptr(&context_tracking.dynticks)) & 0x1); } 
/* * Snapshot the ->dynticks counter with full ordering so as to allow * stable comparison of this counter with past and future snapshots. */ -static int rcu_dynticks_snap(struct rcu_data *rdp) +static int rcu_dynticks_snap(int cpu) { smp_mb(); // Fundamental RCU ordering guarantee. - return atomic_read_acquire(&rdp->dynticks); + return ct_dynticks_cpu_acquire(cpu); } /* @@ -363,9 +360,7 @@ static bool rcu_dynticks_in_eqs(int snap) /* Return true if the specified CPU is currently idle from an RCU viewpoint. */ bool rcu_is_idle_cpu(int cpu) { - struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); - - return rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp)); + return rcu_dynticks_in_eqs(rcu_dynticks_snap(cpu)); } /* @@ -375,7 +370,7 @@ bool rcu_is_idle_cpu(int cpu) */ static bool rcu_dynticks_in_eqs_since(struct rcu_data *rdp, int snap) { - return snap != rcu_dynticks_snap(rdp); + return snap != rcu_dynticks_snap(rdp->cpu); } /* @@ -384,11 +379,10 @@ static bool rcu_dynticks_in_eqs_since(struct rcu_data *rdp, int snap) */ bool rcu_dynticks_zero_in_eqs(int cpu, int *vp) { - struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); int snap; // If not quiescent, force back to earlier extended quiescent state. - snap = atomic_read(&rdp->dynticks) & ~0x1; + snap = ct_dynticks_cpu(cpu) & ~0x1; smp_rmb(); // Order ->dynticks and *vp reads. if (READ_ONCE(*vp)) @@ -396,7 +390,7 @@ bool rcu_dynticks_zero_in_eqs(int cpu, int *vp) smp_rmb(); // Order *vp read and ->dynticks re-read. // If still in the same extended quiescent state, we are good! - return snap == atomic_read(&rdp->dynticks); + return snap == ct_dynticks_cpu(cpu); } /* @@ -620,6 +614,7 @@ EXPORT_SYMBOL_GPL(rcutorture_get_gp_data); static noinstr void rcu_eqs_enter(bool user) { struct rcu_data *rdp = this_cpu_ptr(&rcu_data); + struct context_tracking *ct = this_cpu_ptr(&context_tracking); WARN_ON_ONCE(rdp->dynticks_nmi_nesting != DYNTICK_IRQ_NONIDLE); WRITE_ONCE(rdp->dynticks_nmi_nesting, 0); @@ -633,12 +628,12 @@ static noinstr void rcu_eqs_enter(bool user) instrumentation_begin(); lockdep_assert_irqs_disabled(); - trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, atomic_read(&rdp->dynticks)); + trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, ct_dynticks()); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); rcu_preempt_deferred_qs(current); // instrumentation for the noinstr rcu_dynticks_eqs_enter() - instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks)); + instrument_atomic_write(&ct->dynticks, sizeof(ct->dynticks)); instrumentation_end(); WRITE_ONCE(rdp->dynticks_nesting, 0); /* Avoid irq-access tearing. */ @@ -740,7 +735,7 @@ noinstr void rcu_user_enter(void) * rcu_nmi_exit - inform RCU of exit from NMI context * * If we are returning from the outermost NMI handler that interrupted an - * RCU-idle period, update rdp->dynticks and rdp->dynticks_nmi_nesting + * RCU-idle period, update ct->dynticks and rdp->dynticks_nmi_nesting * to let the RCU grace-period handling know that the CPU is back to * being RCU-idle. 
* @@ -749,6 +744,7 @@ noinstr void rcu_user_enter(void) */ noinstr void rcu_nmi_exit(void) { + struct context_tracking *ct = this_cpu_ptr(&context_tracking); struct rcu_data *rdp = this_cpu_ptr(&rcu_data); instrumentation_begin(); @@ -766,7 +762,7 @@ noinstr void rcu_nmi_exit(void) */ if (rdp->dynticks_nmi_nesting != 1) { trace_rcu_dyntick(TPS("--="), rdp->dynticks_nmi_nesting, rdp->dynticks_nmi_nesting - 2, - atomic_read(&rdp->dynticks)); + ct_dynticks()); WRITE_ONCE(rdp->dynticks_nmi_nesting, /* No store tearing. */ rdp->dynticks_nmi_nesting - 2); instrumentation_end(); @@ -774,11 +770,11 @@ noinstr void rcu_nmi_exit(void) } /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */ - trace_rcu_dyntick(TPS("Startirq"), rdp->dynticks_nmi_nesting, 0, atomic_read(&rdp->dynticks)); + trace_rcu_dyntick(TPS("Startirq"), rdp->dynticks_nmi_nesting, 0, ct_dynticks()); WRITE_ONCE(rdp->dynticks_nmi_nesting, 0); /* Avoid store tearing. */ // instrumentation for the noinstr rcu_dynticks_eqs_enter() - instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks)); + instrument_atomic_write(&ct->dynticks, sizeof(ct->dynticks)); instrumentation_end(); // RCU is watching here ... @@ -817,6 +813,7 @@ void rcu_irq_exit_check_preempt(void) */ static void noinstr rcu_eqs_exit(bool user) { + struct context_tracking *ct = this_cpu_ptr(&context_tracking); struct rcu_data *rdp; long oldval; @@ -836,9 +833,9 @@ static void noinstr rcu_eqs_exit(bool user) instrumentation_begin(); // instrumentation for the noinstr rcu_dynticks_eqs_exit() - instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks)); + instrument_atomic_write(&ct->dynticks, sizeof(ct->dynticks)); - trace_rcu_dyntick(TPS("End"), rdp->dynticks_nesting, 1, atomic_read(&rdp->dynticks)); + trace_rcu_dyntick(TPS("End"), rdp->dynticks_nesting, 1, ct_dynticks()); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); WRITE_ONCE(rdp->dynticks_nesting, 1); WARN_ON_ONCE(rdp->dynticks_nmi_nesting); @@ -944,7 +941,7 @@ void __rcu_irq_enter_check_tick(void) /** * rcu_nmi_enter - inform RCU of entry to NMI context * - * If the CPU was idle from RCU's viewpoint, update rdp->dynticks and + * If the CPU was idle from RCU's viewpoint, update ct->dynticks and * rdp->dynticks_nmi_nesting to let the RCU grace-period handling know * that the CPU is active. This implementation permits nested NMIs, as * long as the nesting level does not overflow an int. (You will probably @@ -957,6 +954,7 @@ noinstr void rcu_nmi_enter(void) { long incby = 2; struct rcu_data *rdp = this_cpu_ptr(&rcu_data); + struct context_tracking *ct = this_cpu_ptr(&context_tracking); /* Complain about underflow. */ WARN_ON_ONCE(rdp->dynticks_nmi_nesting < 0); @@ -980,9 +978,9 @@ noinstr void rcu_nmi_enter(void) instrumentation_begin(); // instrumentation for the noinstr rcu_dynticks_curr_cpu_in_eqs() - instrument_atomic_read(&rdp->dynticks, sizeof(rdp->dynticks)); + instrument_atomic_read(&ct->dynticks, sizeof(ct->dynticks)); // instrumentation for the noinstr rcu_dynticks_eqs_exit() - instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks)); + instrument_atomic_write(&ct->dynticks, sizeof(ct->dynticks)); incby = 1; } else if (!in_nmi()) { @@ -994,7 +992,7 @@ noinstr void rcu_nmi_enter(void) trace_rcu_dyntick(incby == 1 ? 
TPS("Endirq") : TPS("++="), rdp->dynticks_nmi_nesting, - rdp->dynticks_nmi_nesting + incby, atomic_read(&rdp->dynticks)); + rdp->dynticks_nmi_nesting + incby, ct_dynticks()); instrumentation_end(); WRITE_ONCE(rdp->dynticks_nmi_nesting, /* Prevent store tearing. */ rdp->dynticks_nmi_nesting + incby); @@ -1138,7 +1136,7 @@ static void rcu_gpnum_ovf(struct rcu_node *rnp, struct rcu_data *rdp) */ static int dyntick_save_progress_counter(struct rcu_data *rdp) { - rdp->dynticks_snap = rcu_dynticks_snap(rdp); + rdp->dynticks_snap = rcu_dynticks_snap(rdp->cpu); if (rcu_dynticks_in_eqs(rdp->dynticks_snap)) { trace_rcu_fqs(rcu_state.name, rdp->gp_seq, rdp->cpu, TPS("dti")); rcu_gpnum_ovf(rdp->mynode, rdp); @@ -4142,7 +4140,7 @@ rcu_boot_init_percpu_data(int cpu) rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu); INIT_WORK(&rdp->strict_work, strict_work_handler); WARN_ON_ONCE(rdp->dynticks_nesting != 1); - WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp))); + WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(cpu))); rdp->barrier_seq_snap = rcu_state.barrier_sequence; rdp->rcu_ofl_gp_seq = rcu_state.gp_seq; rdp->rcu_ofl_gp_flags = RCU_GP_CLEANED; diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 2ccf5845957d..ebb973f5b190 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -189,7 +189,6 @@ struct rcu_data { int dynticks_snap; /* Per-GP tracking for dynticks. */ long dynticks_nesting; /* Track process nesting level. */ long dynticks_nmi_nesting; /* Track irq/NMI nesting level. */ - atomic_t dynticks; /* Even value for idle, else odd. */ bool rcu_need_heavy_qs; /* GP old, so heavy quiescent state! */ bool rcu_urgent_qs; /* GP old need light quiescent state. */ bool rcu_forced_tick; /* Forced tick to provide QS. */ diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 0f70f62039a9..75c22d1034c1 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -356,7 +356,7 @@ static void __sync_rcu_exp_select_node_cpus(struct rcu_exp_work *rewp) !(rnp->qsmaskinitnext & mask)) { mask_ofl_test |= mask; } else { - snap = rcu_dynticks_snap(rdp); + snap = rcu_dynticks_snap(cpu); if (rcu_dynticks_in_eqs(snap)) mask_ofl_test |= mask; else diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 3556637768fd..250fbf2e8522 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -465,7 +465,7 @@ static void print_cpu_stall_info(int cpu) } delta = rcu_seq_ctr(rdp->mynode->gp_seq - rdp->rcu_iw_gp_seq); falsepositive = rcu_is_gp_kthread_starving(NULL) && - rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp)); + rcu_dynticks_in_eqs(rcu_dynticks_snap(cpu)); rcuc_starved = rcu_is_rcuc_kthread_starving(rdp, &j); if (rcuc_starved) sprintf(buf, " rcuc=%ld jiffies(starved)", j); @@ -478,7 +478,7 @@ static void print_cpu_stall_info(int cpu) rdp->rcu_iw_pending ? (int)min(delta, 9UL) + '0' : "!."[!delta], ticks_value, ticks_title, - rcu_dynticks_snap(rdp) & 0xfff, + rcu_dynticks_snap(cpu) & 0xfff, rdp->dynticks_nesting, rdp->dynticks_nmi_nesting, rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), data_race(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart, -- cgit From 904e600e60f46f92eb4bcfb95788b1fedf7e8237 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 8 Jun 2022 16:40:30 +0200 Subject: rcu/context_tracking: Move dynticks_nesting to context tracking The RCU eqs tracking is going to be performed by the context tracking subsystem. The related nesting counters thus need to be moved to the context tracking structure. Acked-by: Paul E. 
McKenney Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Neeraj Upadhyay Cc: Uladzislau Rezki Cc: Joel Fernandes Cc: Boqun Feng Cc: Nicolas Saenz Julienne Cc: Marcelo Tosatti Cc: Xiongfeng Wang Cc: Yu Liao Cc: Phil Auld Cc: Paul Gortmaker Cc: Alex Belits Signed-off-by: Paul E. McKenney Reviewed-by: Nicolas Saenz Julienne Tested-by: Nicolas Saenz Julienne --- kernel/context_tracking.c | 1 + kernel/rcu/tree.c | 31 ++++++++++++++++--------------- kernel/rcu/tree.h | 1 - kernel/rcu/tree_stall.h | 2 +- 4 files changed, 18 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 01abbcec52f7..dfefe04400f8 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -25,6 +25,7 @@ DEFINE_PER_CPU(struct context_tracking, context_tracking) = { #ifdef CONFIG_CONTEXT_TRACKING_IDLE + .dynticks_nesting = 1, .dynticks = ATOMIC_INIT(1), #endif }; diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a471edc3d893..f6bf328bb9cf 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -75,7 +75,6 @@ /* Data structures. */ static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = { - .dynticks_nesting = 1, .dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE, #ifdef CONFIG_RCU_NOCB_CPU .cblist.flags = SEGCBLIST_RCU_CORE, @@ -436,7 +435,7 @@ static int rcu_is_cpu_rrupt_from_idle(void) lockdep_assert_irqs_disabled(); /* Check for counter underflows */ - RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nesting) < 0, + RCU_LOCKDEP_WARN(ct_dynticks_nesting() < 0, "RCU dynticks_nesting counter underflow!"); RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nmi_nesting) <= 0, "RCU dynticks_nmi_nesting counter underflow/zero!"); @@ -452,7 +451,7 @@ static int rcu_is_cpu_rrupt_from_idle(void) WARN_ON_ONCE(!nesting && !is_idle_task(current)); /* Does CPU appear to be idle from an RCU standpoint? */ - return __this_cpu_read(rcu_data.dynticks_nesting) == 0; + return ct_dynticks_nesting() == 0; } #define DEFAULT_RCU_BLIMIT (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) ? 1000 : 10) @@ -619,16 +618,16 @@ static noinstr void rcu_eqs_enter(bool user) WARN_ON_ONCE(rdp->dynticks_nmi_nesting != DYNTICK_IRQ_NONIDLE); WRITE_ONCE(rdp->dynticks_nmi_nesting, 0); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && - rdp->dynticks_nesting == 0); - if (rdp->dynticks_nesting != 1) { + ct_dynticks_nesting() == 0); + if (ct_dynticks_nesting() != 1) { // RCU will still be watching, so just do accounting and leave. - rdp->dynticks_nesting--; + ct->dynticks_nesting--; return; } instrumentation_begin(); lockdep_assert_irqs_disabled(); - trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, ct_dynticks()); + trace_rcu_dyntick(TPS("Start"), ct_dynticks_nesting(), 0, ct_dynticks()); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); rcu_preempt_deferred_qs(current); @@ -636,7 +635,7 @@ static noinstr void rcu_eqs_enter(bool user) instrument_atomic_write(&ct->dynticks, sizeof(ct->dynticks)); instrumentation_end(); - WRITE_ONCE(rdp->dynticks_nesting, 0); /* Avoid irq-access tearing. */ + WRITE_ONCE(ct->dynticks_nesting, 0); /* Avoid irq-access tearing. */ // RCU is watching here ... rcu_dynticks_eqs_enter(); // ... but is no longer watching here. 
@@ -793,7 +792,7 @@ void rcu_irq_exit_check_preempt(void) { lockdep_assert_irqs_disabled(); - RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nesting) <= 0, + RCU_LOCKDEP_WARN(ct_dynticks_nesting() <= 0, "RCU dynticks_nesting counter underflow/zero!"); RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nmi_nesting) != DYNTICK_IRQ_NONIDLE, @@ -819,11 +818,11 @@ static void noinstr rcu_eqs_exit(bool user) WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !raw_irqs_disabled()); rdp = this_cpu_ptr(&rcu_data); - oldval = rdp->dynticks_nesting; + oldval = ct_dynticks_nesting(); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0); if (oldval) { // RCU was already watching, so just do accounting and leave. - rdp->dynticks_nesting++; + ct->dynticks_nesting++; return; } rcu_dynticks_task_exit(); @@ -835,9 +834,9 @@ static void noinstr rcu_eqs_exit(bool user) // instrumentation for the noinstr rcu_dynticks_eqs_exit() instrument_atomic_write(&ct->dynticks, sizeof(ct->dynticks)); - trace_rcu_dyntick(TPS("End"), rdp->dynticks_nesting, 1, ct_dynticks()); + trace_rcu_dyntick(TPS("End"), ct_dynticks_nesting(), 1, ct_dynticks()); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); - WRITE_ONCE(rdp->dynticks_nesting, 1); + WRITE_ONCE(ct->dynticks_nesting, 1); WARN_ON_ONCE(rdp->dynticks_nmi_nesting); WRITE_ONCE(rdp->dynticks_nmi_nesting, DYNTICK_IRQ_NONIDLE); instrumentation_end(); @@ -4134,12 +4133,13 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf) static void __init rcu_boot_init_percpu_data(int cpu) { + struct context_tracking *ct = this_cpu_ptr(&context_tracking); struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); /* Set up local state, ensuring consistent view of global state. */ rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu); INIT_WORK(&rdp->strict_work, strict_work_handler); - WARN_ON_ONCE(rdp->dynticks_nesting != 1); + WARN_ON_ONCE(ct->dynticks_nesting != 1); WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(cpu))); rdp->barrier_seq_snap = rcu_state.barrier_sequence; rdp->rcu_ofl_gp_seq = rcu_state.gp_seq; @@ -4164,6 +4164,7 @@ rcu_boot_init_percpu_data(int cpu) int rcutree_prepare_cpu(unsigned int cpu) { unsigned long flags; + struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu); struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); struct rcu_node *rnp = rcu_get_root(); @@ -4172,7 +4173,7 @@ int rcutree_prepare_cpu(unsigned int cpu) rdp->qlen_last_fqs_check = 0; rdp->n_force_qs_snap = READ_ONCE(rcu_state.n_force_qs); rdp->blimit = blimit; - rdp->dynticks_nesting = 1; /* CPU not up, no tearing. */ + ct->dynticks_nesting = 1; /* CPU not up, no tearing. */ raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ /* diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index ebb973f5b190..650ff3cf0121 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -187,7 +187,6 @@ struct rcu_data { /* 3) dynticks interface. */ int dynticks_snap; /* Per-GP tracking for dynticks. */ - long dynticks_nesting; /* Track process nesting level. */ long dynticks_nmi_nesting; /* Track irq/NMI nesting level. */ bool rcu_need_heavy_qs; /* GP old, so heavy quiescent state! */ bool rcu_urgent_qs; /* GP old need light quiescent state. 
*/ diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 250fbf2e8522..a9c82254b6c6 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -479,7 +479,7 @@ static void print_cpu_stall_info(int cpu) "!."[!delta], ticks_value, ticks_title, rcu_dynticks_snap(cpu) & 0xfff, - rdp->dynticks_nesting, rdp->dynticks_nmi_nesting, + ct_dynticks_nesting_cpu(cpu), rdp->dynticks_nmi_nesting, rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), data_race(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart, rcuc_starved ? buf : "", -- cgit From 95e04f48ec0a634e2f221081f5fa1a904755f326 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 8 Jun 2022 16:40:31 +0200 Subject: rcu/context_tracking: Move dynticks_nmi_nesting to context tracking The RCU eqs tracking is going to be performed by the context tracking subsystem. The related nesting counters thus need to be moved to the context tracking structure. Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Neeraj Upadhyay Cc: Uladzislau Rezki Cc: Joel Fernandes Cc: Boqun Feng Cc: Nicolas Saenz Julienne Cc: Marcelo Tosatti Cc: Xiongfeng Wang Cc: Yu Liao Cc: Phil Auld Cc: Paul Gortmaker Cc: Alex Belits Signed-off-by: Paul E. McKenney Reviewed-by: Nicolas Saenz Julienne Tested-by: Nicolas Saenz Julienne --- kernel/context_tracking.c | 1 + kernel/rcu/rcu.h | 4 ---- kernel/rcu/tree.c | 48 +++++++++++++++++++++-------------------------- kernel/rcu/tree.h | 1 - kernel/rcu/tree_stall.h | 2 +- 5 files changed, 23 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index dfefe04400f8..7c3033e9a518 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -26,6 +26,7 @@ DEFINE_PER_CPU(struct context_tracking, context_tracking) = { #ifdef CONFIG_CONTEXT_TRACKING_IDLE .dynticks_nesting = 1, + .dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE, .dynticks = ATOMIC_INIT(1), #endif }; diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 4916077119f3..7b4a88deff9a 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -12,10 +12,6 @@ #include -/* Offset to allow distinguishing irq vs. task-based idle entry/exit. */ -#define DYNTICK_IRQ_NONIDLE ((LONG_MAX / 2) + 1) - - /* * Grace-period counter management. */ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f6bf328bb9cf..006939b29e82 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -75,7 +75,6 @@ /* Data structures. */ static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = { - .dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE, #ifdef CONFIG_RCU_NOCB_CPU .cblist.flags = SEGCBLIST_RCU_CORE, #endif @@ -437,11 +436,11 @@ static int rcu_is_cpu_rrupt_from_idle(void) /* Check for counter underflows */ RCU_LOCKDEP_WARN(ct_dynticks_nesting() < 0, "RCU dynticks_nesting counter underflow!"); - RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nmi_nesting) <= 0, + RCU_LOCKDEP_WARN(ct_dynticks_nmi_nesting() <= 0, "RCU dynticks_nmi_nesting counter underflow/zero!"); /* Are we at first interrupt nesting level? 
*/ - nesting = __this_cpu_read(rcu_data.dynticks_nmi_nesting); + nesting = ct_dynticks_nmi_nesting(); if (nesting > 1) return false; @@ -612,11 +611,10 @@ EXPORT_SYMBOL_GPL(rcutorture_get_gp_data); */ static noinstr void rcu_eqs_enter(bool user) { - struct rcu_data *rdp = this_cpu_ptr(&rcu_data); struct context_tracking *ct = this_cpu_ptr(&context_tracking); - WARN_ON_ONCE(rdp->dynticks_nmi_nesting != DYNTICK_IRQ_NONIDLE); - WRITE_ONCE(rdp->dynticks_nmi_nesting, 0); + WARN_ON_ONCE(ct_dynticks_nmi_nesting() != DYNTICK_IRQ_NONIDLE); + WRITE_ONCE(ct->dynticks_nmi_nesting, 0); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && ct_dynticks_nesting() == 0); if (ct_dynticks_nesting() != 1) { @@ -734,7 +732,7 @@ noinstr void rcu_user_enter(void) * rcu_nmi_exit - inform RCU of exit from NMI context * * If we are returning from the outermost NMI handler that interrupted an - * RCU-idle period, update ct->dynticks and rdp->dynticks_nmi_nesting + * RCU-idle period, update ct->dynticks and ct->dynticks_nmi_nesting * to let the RCU grace-period handling know that the CPU is back to * being RCU-idle. * @@ -744,7 +742,6 @@ noinstr void rcu_user_enter(void) noinstr void rcu_nmi_exit(void) { struct context_tracking *ct = this_cpu_ptr(&context_tracking); - struct rcu_data *rdp = this_cpu_ptr(&rcu_data); instrumentation_begin(); /* @@ -752,25 +749,25 @@ noinstr void rcu_nmi_exit(void) * (We are exiting an NMI handler, so RCU better be paying attention * to us!) */ - WARN_ON_ONCE(rdp->dynticks_nmi_nesting <= 0); + WARN_ON_ONCE(ct_dynticks_nmi_nesting() <= 0); WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs()); /* * If the nesting level is not 1, the CPU wasn't RCU-idle, so * leave it in non-RCU-idle state. */ - if (rdp->dynticks_nmi_nesting != 1) { - trace_rcu_dyntick(TPS("--="), rdp->dynticks_nmi_nesting, rdp->dynticks_nmi_nesting - 2, + if (ct_dynticks_nmi_nesting() != 1) { + trace_rcu_dyntick(TPS("--="), ct_dynticks_nmi_nesting(), ct_dynticks_nmi_nesting() - 2, ct_dynticks()); - WRITE_ONCE(rdp->dynticks_nmi_nesting, /* No store tearing. */ - rdp->dynticks_nmi_nesting - 2); + WRITE_ONCE(ct->dynticks_nmi_nesting, /* No store tearing. */ + ct_dynticks_nmi_nesting() - 2); instrumentation_end(); return; } /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */ - trace_rcu_dyntick(TPS("Startirq"), rdp->dynticks_nmi_nesting, 0, ct_dynticks()); - WRITE_ONCE(rdp->dynticks_nmi_nesting, 0); /* Avoid store tearing. */ + trace_rcu_dyntick(TPS("Startirq"), ct_dynticks_nmi_nesting(), 0, ct_dynticks()); + WRITE_ONCE(ct->dynticks_nmi_nesting, 0); /* Avoid store tearing. 
*/ // instrumentation for the noinstr rcu_dynticks_eqs_enter() instrument_atomic_write(&ct->dynticks, sizeof(ct->dynticks)); @@ -794,7 +791,7 @@ void rcu_irq_exit_check_preempt(void) RCU_LOCKDEP_WARN(ct_dynticks_nesting() <= 0, "RCU dynticks_nesting counter underflow/zero!"); - RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nmi_nesting) != + RCU_LOCKDEP_WARN(ct_dynticks_nmi_nesting() != DYNTICK_IRQ_NONIDLE, "Bad RCU dynticks_nmi_nesting counter\n"); RCU_LOCKDEP_WARN(rcu_dynticks_curr_cpu_in_eqs(), @@ -813,11 +810,9 @@ void rcu_irq_exit_check_preempt(void) static void noinstr rcu_eqs_exit(bool user) { struct context_tracking *ct = this_cpu_ptr(&context_tracking); - struct rcu_data *rdp; long oldval; WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !raw_irqs_disabled()); - rdp = this_cpu_ptr(&rcu_data); oldval = ct_dynticks_nesting(); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0); if (oldval) { @@ -837,8 +832,8 @@ static void noinstr rcu_eqs_exit(bool user) trace_rcu_dyntick(TPS("End"), ct_dynticks_nesting(), 1, ct_dynticks()); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); WRITE_ONCE(ct->dynticks_nesting, 1); - WARN_ON_ONCE(rdp->dynticks_nmi_nesting); - WRITE_ONCE(rdp->dynticks_nmi_nesting, DYNTICK_IRQ_NONIDLE); + WARN_ON_ONCE(ct_dynticks_nmi_nesting()); + WRITE_ONCE(ct->dynticks_nmi_nesting, DYNTICK_IRQ_NONIDLE); instrumentation_end(); } @@ -941,7 +936,7 @@ void __rcu_irq_enter_check_tick(void) * rcu_nmi_enter - inform RCU of entry to NMI context * * If the CPU was idle from RCU's viewpoint, update ct->dynticks and - * rdp->dynticks_nmi_nesting to let the RCU grace-period handling know + * ct->dynticks_nmi_nesting to let the RCU grace-period handling know * that the CPU is active. This implementation permits nested NMIs, as * long as the nesting level does not overflow an int. (You will probably * run out of stack space first.) @@ -952,11 +947,10 @@ void __rcu_irq_enter_check_tick(void) noinstr void rcu_nmi_enter(void) { long incby = 2; - struct rcu_data *rdp = this_cpu_ptr(&rcu_data); struct context_tracking *ct = this_cpu_ptr(&context_tracking); /* Complain about underflow. */ - WARN_ON_ONCE(rdp->dynticks_nmi_nesting < 0); + WARN_ON_ONCE(ct_dynticks_nmi_nesting() < 0); /* * If idle from RCU viewpoint, atomically increment ->dynticks @@ -990,11 +984,11 @@ noinstr void rcu_nmi_enter(void) } trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="), - rdp->dynticks_nmi_nesting, - rdp->dynticks_nmi_nesting + incby, ct_dynticks()); + ct_dynticks_nmi_nesting(), + ct_dynticks_nmi_nesting() + incby, ct_dynticks()); instrumentation_end(); - WRITE_ONCE(rdp->dynticks_nmi_nesting, /* Prevent store tearing. */ - rdp->dynticks_nmi_nesting + incby); + WRITE_ONCE(ct->dynticks_nmi_nesting, /* Prevent store tearing. */ + ct_dynticks_nmi_nesting() + incby); barrier(); } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 650ff3cf0121..72dbf8512ce7 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -187,7 +187,6 @@ struct rcu_data { /* 3) dynticks interface. */ int dynticks_snap; /* Per-GP tracking for dynticks. */ - long dynticks_nmi_nesting; /* Track irq/NMI nesting level. */ bool rcu_need_heavy_qs; /* GP old, so heavy quiescent state! */ bool rcu_urgent_qs; /* GP old need light quiescent state. */ bool rcu_forced_tick; /* Forced tick to provide QS. 
*/ diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index a9c82254b6c6..2683ce0a7c72 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -479,7 +479,7 @@ static void print_cpu_stall_info(int cpu) "!."[!delta], ticks_value, ticks_title, rcu_dynticks_snap(cpu) & 0xfff, - ct_dynticks_nesting_cpu(cpu), rdp->dynticks_nmi_nesting, + ct_dynticks_nesting_cpu(cpu), ct_dynticks_nmi_nesting_cpu(cpu), rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), data_race(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart, rcuc_starved ? buf : "", -- cgit From 564506495ca96a6e66d077d3d5b9f02d4b9b0f45 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 8 Jun 2022 16:40:32 +0200 Subject: rcu/context-tracking: Move deferred nocb resched to context tracking To prepare for migrating the RCU eqs accounting code to context tracking, split the last-resort deferred nocb resched from rcu_user_enter() and move it into a separate call from context tracking. Acked-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Neeraj Upadhyay Cc: Uladzislau Rezki Cc: Joel Fernandes Cc: Boqun Feng Cc: Nicolas Saenz Julienne Cc: Marcelo Tosatti Cc: Xiongfeng Wang Cc: Yu Liao Cc: Phil Auld Cc: Paul Gortmaker Cc: Alex Belits Signed-off-by: Paul E. McKenney Reviewed-by: Nicolas Saenz Julienne Tested-by: Nicolas Saenz Julienne --- kernel/context_tracking.c | 8 ++++++++ kernel/rcu/tree.c | 15 ++------------- 2 files changed, 10 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 7c3033e9a518..8cf59d8a6af6 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -177,6 +177,8 @@ static __always_inline void context_tracking_recursion_exit(void) */ void noinstr __ct_user_enter(enum ctx_state state) { + lockdep_assert_irqs_disabled(); + /* Kernel threads aren't supposed to go to userspace */ WARN_ON_ONCE(!current->mm); @@ -198,6 +200,12 @@ void noinstr __ct_user_enter(enum ctx_state state) vtime_user_enter(current); instrumentation_end(); } + /* + * Other than generic entry implementation, we may be past the last + * rescheduling opportunity in the entry code. Trigger a self IPI + * that will fire and reschedule once we resume in user/guest mode. + */ + rcu_irq_work_resched(); rcu_user_enter(); } /* diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 006939b29e82..8c0c3490532e 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -681,7 +681,7 @@ static DEFINE_PER_CPU(struct irq_work, late_wakeup_work) = * last resort is to fire a local irq_work that will trigger a reschedule once IRQs * get re-enabled again. */ -noinstr static void rcu_irq_work_resched(void) +noinstr void rcu_irq_work_resched(void) { struct rcu_data *rdp = this_cpu_ptr(&rcu_data); @@ -697,10 +697,7 @@ noinstr static void rcu_irq_work_resched(void) } instrumentation_end(); } - -#else -static inline void rcu_irq_work_resched(void) { } -#endif +#endif /* #if !defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_KVM_XFER_TO_GUEST_WORK) */ /** * rcu_user_enter - inform RCU that we are resuming userspace. @@ -715,14 +712,6 @@ static inline void rcu_irq_work_resched(void) { } */ noinstr void rcu_user_enter(void) { - lockdep_assert_irqs_disabled(); - - /* - * Other than generic entry implementation, we may be past the last - * rescheduling opportunity in the entry code. Trigger a self IPI - * that will fire and reschedule once we resume in user/guest mode. 
- */ - rcu_irq_work_resched(); rcu_eqs_enter(true); } -- cgit From 172114552701b85d5c3b1a089a73ee85d0d7786b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 8 Jun 2022 16:40:33 +0200 Subject: rcu/context-tracking: Move RCU-dynticks internal functions to context_tracking Move the core RCU eqs/dynticks functions to context tracking so that we can later merge all that code within context tracking. Acked-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Neeraj Upadhyay Cc: Uladzislau Rezki Cc: Joel Fernandes Cc: Boqun Feng Cc: Nicolas Saenz Julienne Cc: Marcelo Tosatti Cc: Xiongfeng Wang Cc: Yu Liao Cc: Phil Auld Cc: Paul Gortmaker Cc: Alex Belits Signed-off-by: Paul E. McKenney Reviewed-by: Nicolas Saenz Julienne Tested-by: Nicolas Saenz Julienne --- kernel/context_tracking.c | 336 ++++++++++++++++++++++++++++++++++++++++++++++ kernel/rcu/tree.c | 324 +------------------------------------------- kernel/rcu/tree.h | 5 - kernel/rcu/tree_plugin.h | 38 +----- 4 files changed, 341 insertions(+), 362 deletions(-) (limited to 'kernel') diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 8cf59d8a6af6..072c4b6044b3 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -21,6 +21,7 @@ #include #include #include +#include DEFINE_PER_CPU(struct context_tracking, context_tracking) = { @@ -33,6 +34,309 @@ DEFINE_PER_CPU(struct context_tracking, context_tracking) = { EXPORT_SYMBOL_GPL(context_tracking); #ifdef CONFIG_CONTEXT_TRACKING_IDLE +#define TPS(x) tracepoint_string(x) + +/* Record the current task on dyntick-idle entry. */ +static __always_inline void rcu_dynticks_task_enter(void) +{ +#if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) + WRITE_ONCE(current->rcu_tasks_idle_cpu, smp_processor_id()); +#endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */ +} + +/* Record no current task on dyntick-idle exit. */ +static __always_inline void rcu_dynticks_task_exit(void) +{ +#if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) + WRITE_ONCE(current->rcu_tasks_idle_cpu, -1); +#endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */ +} + +/* Turn on heavyweight RCU tasks trace readers on idle/user entry. */ +static __always_inline void rcu_dynticks_task_trace_enter(void) +{ +#ifdef CONFIG_TASKS_TRACE_RCU + if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) + current->trc_reader_special.b.need_mb = true; +#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ +} + +/* Turn off heavyweight RCU tasks trace readers on idle/user exit. */ +static __always_inline void rcu_dynticks_task_trace_exit(void) +{ +#ifdef CONFIG_TASKS_TRACE_RCU + if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) + current->trc_reader_special.b.need_mb = false; +#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ +} + +/* + * Record entry into an extended quiescent state. This is only to be + * called when not already in an extended quiescent state, that is, + * RCU is watching prior to the call to this function and is no longer + * watching upon return. + */ +static noinstr void rcu_dynticks_eqs_enter(void) +{ + int seq; + + /* + * CPUs seeing atomic_add_return() must see prior RCU read-side + * critical sections, and we also must force ordering with the + * next idle sojourn. + */ + rcu_dynticks_task_trace_enter(); // Before ->dynticks update! + seq = rcu_dynticks_inc(1); + // RCU is no longer watching. Better be in extended quiescent state! 
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && (seq & 0x1)); +} + +/* + * Record exit from an extended quiescent state. This is only to be + * called from an extended quiescent state, that is, RCU is not watching + * prior to the call to this function and is watching upon return. + */ +static noinstr void rcu_dynticks_eqs_exit(void) +{ + int seq; + + /* + * CPUs seeing atomic_add_return() must see prior idle sojourns, + * and we also must force ordering with the next RCU read-side + * critical section. + */ + seq = rcu_dynticks_inc(1); + // RCU is now watching. Better not be in an extended quiescent state! + rcu_dynticks_task_trace_exit(); // After ->dynticks update! + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(seq & 0x1)); +} + +/* + * Enter an RCU extended quiescent state, which can be either the + * idle loop or adaptive-tickless usermode execution. + * + * We crowbar the ->dynticks_nmi_nesting field to zero to allow for + * the possibility of usermode upcalls having messed up our count + * of interrupt nesting level during the prior busy period. + */ +static void noinstr rcu_eqs_enter(bool user) +{ + struct context_tracking *ct = this_cpu_ptr(&context_tracking); + + WARN_ON_ONCE(ct_dynticks_nmi_nesting() != DYNTICK_IRQ_NONIDLE); + WRITE_ONCE(ct->dynticks_nmi_nesting, 0); + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && + ct_dynticks_nesting() == 0); + if (ct_dynticks_nesting() != 1) { + // RCU will still be watching, so just do accounting and leave. + ct->dynticks_nesting--; + return; + } + + instrumentation_begin(); + lockdep_assert_irqs_disabled(); + trace_rcu_dyntick(TPS("Start"), ct_dynticks_nesting(), 0, ct_dynticks()); + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); + rcu_preempt_deferred_qs(current); + + // instrumentation for the noinstr rcu_dynticks_eqs_enter() + instrument_atomic_write(&ct->dynticks, sizeof(ct->dynticks)); + + instrumentation_end(); + WRITE_ONCE(ct->dynticks_nesting, 0); /* Avoid irq-access tearing. */ + // RCU is watching here ... + rcu_dynticks_eqs_enter(); + // ... but is no longer watching here. + rcu_dynticks_task_enter(); +} + +/* + * Exit an RCU extended quiescent state, which can be either the + * idle loop or adaptive-tickless usermode execution. + * + * We crowbar the ->dynticks_nmi_nesting field to DYNTICK_IRQ_NONIDLE to + * allow for the possibility of usermode upcalls messing up our count of + * interrupt nesting level during the busy period that is just now starting. + */ +static void noinstr rcu_eqs_exit(bool user) +{ + struct context_tracking *ct = this_cpu_ptr(&context_tracking); + long oldval; + + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !raw_irqs_disabled()); + oldval = ct_dynticks_nesting(); + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0); + if (oldval) { + // RCU was already watching, so just do accounting and leave. + ct->dynticks_nesting++; + return; + } + rcu_dynticks_task_exit(); + // RCU is not watching here ... + rcu_dynticks_eqs_exit(); + // ... but is watching here. 
+ instrumentation_begin(); + + // instrumentation for the noinstr rcu_dynticks_eqs_exit() + instrument_atomic_write(&ct->dynticks, sizeof(ct->dynticks)); + + trace_rcu_dyntick(TPS("End"), ct_dynticks_nesting(), 1, ct_dynticks()); + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); + WRITE_ONCE(ct->dynticks_nesting, 1); + WARN_ON_ONCE(ct_dynticks_nmi_nesting()); + WRITE_ONCE(ct->dynticks_nmi_nesting, DYNTICK_IRQ_NONIDLE); + instrumentation_end(); +} + +/** + * rcu_nmi_exit - inform RCU of exit from NMI context + * + * If we are returning from the outermost NMI handler that interrupted an + * RCU-idle period, update ct->dynticks and ct->dynticks_nmi_nesting + * to let the RCU grace-period handling know that the CPU is back to + * being RCU-idle. + * + * If you add or remove a call to rcu_nmi_exit(), be sure to test + * with CONFIG_RCU_EQS_DEBUG=y. + */ +void noinstr rcu_nmi_exit(void) +{ + struct context_tracking *ct = this_cpu_ptr(&context_tracking); + + instrumentation_begin(); + /* + * Check for ->dynticks_nmi_nesting underflow and bad ->dynticks. + * (We are exiting an NMI handler, so RCU better be paying attention + * to us!) + */ + WARN_ON_ONCE(ct_dynticks_nmi_nesting() <= 0); + WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs()); + + /* + * If the nesting level is not 1, the CPU wasn't RCU-idle, so + * leave it in non-RCU-idle state. + */ + if (ct_dynticks_nmi_nesting() != 1) { + trace_rcu_dyntick(TPS("--="), ct_dynticks_nmi_nesting(), ct_dynticks_nmi_nesting() - 2, + ct_dynticks()); + WRITE_ONCE(ct->dynticks_nmi_nesting, /* No store tearing. */ + ct_dynticks_nmi_nesting() - 2); + instrumentation_end(); + return; + } + + /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */ + trace_rcu_dyntick(TPS("Startirq"), ct_dynticks_nmi_nesting(), 0, ct_dynticks()); + WRITE_ONCE(ct->dynticks_nmi_nesting, 0); /* Avoid store tearing. */ + + // instrumentation for the noinstr rcu_dynticks_eqs_enter() + instrument_atomic_write(&ct->dynticks, sizeof(ct->dynticks)); + instrumentation_end(); + + // RCU is watching here ... + rcu_dynticks_eqs_enter(); + // ... but is no longer watching here. + + if (!in_nmi()) + rcu_dynticks_task_enter(); +} + +/** + * rcu_nmi_enter - inform RCU of entry to NMI context + * + * If the CPU was idle from RCU's viewpoint, update ct->dynticks and + * ct->dynticks_nmi_nesting to let the RCU grace-period handling know + * that the CPU is active. This implementation permits nested NMIs, as + * long as the nesting level does not overflow an int. (You will probably + * run out of stack space first.) + * + * If you add or remove a call to rcu_nmi_enter(), be sure to test + * with CONFIG_RCU_EQS_DEBUG=y. + */ +void noinstr rcu_nmi_enter(void) +{ + long incby = 2; + struct context_tracking *ct = this_cpu_ptr(&context_tracking); + + /* Complain about underflow. */ + WARN_ON_ONCE(ct_dynticks_nmi_nesting() < 0); + + /* + * If idle from RCU viewpoint, atomically increment ->dynticks + * to mark non-idle and increment ->dynticks_nmi_nesting by one. + * Otherwise, increment ->dynticks_nmi_nesting by two. This means + * if ->dynticks_nmi_nesting is equal to one, we are guaranteed + * to be in the outermost NMI handler that interrupted an RCU-idle + * period (observation due to Andy Lutomirski). + */ + if (rcu_dynticks_curr_cpu_in_eqs()) { + + if (!in_nmi()) + rcu_dynticks_task_exit(); + + // RCU is not watching here ... + rcu_dynticks_eqs_exit(); + // ... but is watching here. 
+ + instrumentation_begin(); + // instrumentation for the noinstr rcu_dynticks_curr_cpu_in_eqs() + instrument_atomic_read(&ct->dynticks, sizeof(ct->dynticks)); + // instrumentation for the noinstr rcu_dynticks_eqs_exit() + instrument_atomic_write(&ct->dynticks, sizeof(ct->dynticks)); + + incby = 1; + } else if (!in_nmi()) { + instrumentation_begin(); + rcu_irq_enter_check_tick(); + } else { + instrumentation_begin(); + } + + trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="), + ct_dynticks_nmi_nesting(), + ct_dynticks_nmi_nesting() + incby, ct_dynticks()); + instrumentation_end(); + WRITE_ONCE(ct->dynticks_nmi_nesting, /* Prevent store tearing. */ + ct_dynticks_nmi_nesting() + incby); + barrier(); +} + +/** + * rcu_idle_enter - inform RCU that current CPU is entering idle + * + * Enter idle mode, in other words, -leave- the mode in which RCU + * read-side critical sections can occur. (Though RCU read-side + * critical sections can occur in irq handlers in idle, a possibility + * handled by irq_enter() and irq_exit().) + * + * If you add or remove a call to rcu_idle_enter(), be sure to test with + * CONFIG_RCU_EQS_DEBUG=y. + */ +void noinstr rcu_idle_enter(void) +{ + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !raw_irqs_disabled()); + rcu_eqs_enter(false); +} + +/** + * rcu_idle_exit - inform RCU that current CPU is leaving idle + * + * Exit idle mode, in other words, -enter- the mode in which RCU + * read-side critical sections can occur. + * + * If you add or remove a call to rcu_idle_exit(), be sure to test with + * CONFIG_RCU_EQS_DEBUG=y. + */ +void noinstr rcu_idle_exit(void) +{ + unsigned long flags; + + raw_local_irq_save(flags); + rcu_eqs_exit(false); + raw_local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(rcu_idle_exit); + noinstr void ct_idle_enter(void) { rcu_idle_enter(); @@ -139,6 +443,38 @@ noinstr void ct_nmi_exit(void) } #endif /* #ifdef CONFIG_CONTEXT_TRACKING_IDLE */ +#ifdef CONFIG_NO_HZ_FULL +/** + * rcu_user_enter - inform RCU that we are resuming userspace. + * + * Enter RCU idle mode right before resuming userspace. No use of RCU + * is permitted between this call and rcu_user_exit(). This way the + * CPU doesn't need to maintain the tick for RCU maintenance purposes + * when the CPU runs in userspace. + * + * If you add or remove a call to rcu_user_enter(), be sure to test with + * CONFIG_RCU_EQS_DEBUG=y. + */ +noinstr void rcu_user_enter(void) +{ + rcu_eqs_enter(true); +} + +/** + * rcu_user_exit - inform RCU that we are exiting userspace. + * + * Exit RCU idle mode while entering the kernel because it can + * run a RCU read side critical section anytime. + * + * If you add or remove a call to rcu_user_exit(), be sure to test with + * CONFIG_RCU_EQS_DEBUG=y. + */ +void noinstr rcu_user_exit(void) +{ + rcu_eqs_exit(true); +} +#endif /* #ifdef CONFIG_NO_HZ_FULL */ + #ifdef CONFIG_CONTEXT_TRACKING_USER #define CREATE_TRACE_POINTS diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 8c0c3490532e..e2a2083079a2 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -62,6 +62,7 @@ #include #include #include +#include #include "../time/tick-internal.h" #include "tree.h" @@ -259,56 +260,6 @@ void rcu_softirq_qs(void) rcu_tasks_qs(current, false); } -/* - * Increment the current CPU's rcu_data structure's ->dynticks field - * with ordering. Return the new value. 
- */ -static noinline noinstr unsigned long rcu_dynticks_inc(int incby) -{ - return arch_atomic_add_return(incby, this_cpu_ptr(&context_tracking.dynticks)); -} - -/* - * Record entry into an extended quiescent state. This is only to be - * called when not already in an extended quiescent state, that is, - * RCU is watching prior to the call to this function and is no longer - * watching upon return. - */ -static noinstr void rcu_dynticks_eqs_enter(void) -{ - int seq; - - /* - * CPUs seeing atomic_add_return() must see prior RCU read-side - * critical sections, and we also must force ordering with the - * next idle sojourn. - */ - rcu_dynticks_task_trace_enter(); // Before ->dynticks update! - seq = rcu_dynticks_inc(1); - // RCU is no longer watching. Better be in extended quiescent state! - WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && (seq & 0x1)); -} - -/* - * Record exit from an extended quiescent state. This is only to be - * called from an extended quiescent state, that is, RCU is not watching - * prior to the call to this function and is watching upon return. - */ -static noinstr void rcu_dynticks_eqs_exit(void) -{ - int seq; - - /* - * CPUs seeing atomic_add_return() must see prior idle sojourns, - * and we also must force ordering with the next RCU read-side - * critical section. - */ - seq = rcu_dynticks_inc(1); - // RCU is now watching. Better not be in an extended quiescent state! - rcu_dynticks_task_trace_exit(); // After ->dynticks update! - WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(seq & 0x1)); -} - /* * Reset the current CPU's ->dynticks counter to indicate that the * newly onlined CPU is no longer in an extended quiescent state. @@ -326,16 +277,6 @@ static void rcu_dynticks_eqs_online(void) rcu_dynticks_inc(1); } -/* - * Is the current CPU in an extended quiescent state? - * - * No ordering, as we are sampling CPU-local information. - */ -static __always_inline bool rcu_dynticks_curr_cpu_in_eqs(void) -{ - return !(arch_atomic_read(this_cpu_ptr(&context_tracking.dynticks)) & 0x1); -} - /* * Snapshot the ->dynticks counter with full ordering so as to allow * stable comparison of this counter with past and future snapshots. @@ -601,65 +542,7 @@ void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, } EXPORT_SYMBOL_GPL(rcutorture_get_gp_data); -/* - * Enter an RCU extended quiescent state, which can be either the - * idle loop or adaptive-tickless usermode execution. - * - * We crowbar the ->dynticks_nmi_nesting field to zero to allow for - * the possibility of usermode upcalls having messed up our count - * of interrupt nesting level during the prior busy period. - */ -static noinstr void rcu_eqs_enter(bool user) -{ - struct context_tracking *ct = this_cpu_ptr(&context_tracking); - - WARN_ON_ONCE(ct_dynticks_nmi_nesting() != DYNTICK_IRQ_NONIDLE); - WRITE_ONCE(ct->dynticks_nmi_nesting, 0); - WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && - ct_dynticks_nesting() == 0); - if (ct_dynticks_nesting() != 1) { - // RCU will still be watching, so just do accounting and leave. 
- ct->dynticks_nesting--; - return; - } - - instrumentation_begin(); - lockdep_assert_irqs_disabled(); - trace_rcu_dyntick(TPS("Start"), ct_dynticks_nesting(), 0, ct_dynticks()); - WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); - rcu_preempt_deferred_qs(current); - - // instrumentation for the noinstr rcu_dynticks_eqs_enter() - instrument_atomic_write(&ct->dynticks, sizeof(ct->dynticks)); - - instrumentation_end(); - WRITE_ONCE(ct->dynticks_nesting, 0); /* Avoid irq-access tearing. */ - // RCU is watching here ... - rcu_dynticks_eqs_enter(); - // ... but is no longer watching here. - rcu_dynticks_task_enter(); -} - -/** - * rcu_idle_enter - inform RCU that current CPU is entering idle - * - * Enter idle mode, in other words, -leave- the mode in which RCU - * read-side critical sections can occur. (Though RCU read-side - * critical sections can occur in irq handlers in idle, a possibility - * handled by irq_enter() and irq_exit().) - * - * If you add or remove a call to rcu_idle_enter(), be sure to test with - * CONFIG_RCU_EQS_DEBUG=y. - */ -void noinstr rcu_idle_enter(void) -{ - WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !raw_irqs_disabled()); - rcu_eqs_enter(false); -} - -#ifdef CONFIG_NO_HZ_FULL - -#if !defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_KVM_XFER_TO_GUEST_WORK) +#if defined(CONFIG_NO_HZ_FULL) && (!defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_KVM_XFER_TO_GUEST_WORK)) /* * An empty function that will trigger a reschedule on * IRQ tail once IRQs get re-enabled on userspace/guest resume. @@ -697,78 +580,7 @@ noinstr void rcu_irq_work_resched(void) } instrumentation_end(); } -#endif /* #if !defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_KVM_XFER_TO_GUEST_WORK) */ - -/** - * rcu_user_enter - inform RCU that we are resuming userspace. - * - * Enter RCU idle mode right before resuming userspace. No use of RCU - * is permitted between this call and rcu_user_exit(). This way the - * CPU doesn't need to maintain the tick for RCU maintenance purposes - * when the CPU runs in userspace. - * - * If you add or remove a call to rcu_user_enter(), be sure to test with - * CONFIG_RCU_EQS_DEBUG=y. - */ -noinstr void rcu_user_enter(void) -{ - rcu_eqs_enter(true); -} - -#endif /* CONFIG_NO_HZ_FULL */ - -/** - * rcu_nmi_exit - inform RCU of exit from NMI context - * - * If we are returning from the outermost NMI handler that interrupted an - * RCU-idle period, update ct->dynticks and ct->dynticks_nmi_nesting - * to let the RCU grace-period handling know that the CPU is back to - * being RCU-idle. - * - * If you add or remove a call to rcu_nmi_exit(), be sure to test - * with CONFIG_RCU_EQS_DEBUG=y. - */ -noinstr void rcu_nmi_exit(void) -{ - struct context_tracking *ct = this_cpu_ptr(&context_tracking); - - instrumentation_begin(); - /* - * Check for ->dynticks_nmi_nesting underflow and bad ->dynticks. - * (We are exiting an NMI handler, so RCU better be paying attention - * to us!) - */ - WARN_ON_ONCE(ct_dynticks_nmi_nesting() <= 0); - WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs()); - - /* - * If the nesting level is not 1, the CPU wasn't RCU-idle, so - * leave it in non-RCU-idle state. - */ - if (ct_dynticks_nmi_nesting() != 1) { - trace_rcu_dyntick(TPS("--="), ct_dynticks_nmi_nesting(), ct_dynticks_nmi_nesting() - 2, - ct_dynticks()); - WRITE_ONCE(ct->dynticks_nmi_nesting, /* No store tearing. */ - ct_dynticks_nmi_nesting() - 2); - instrumentation_end(); - return; - } - - /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. 
*/ - trace_rcu_dyntick(TPS("Startirq"), ct_dynticks_nmi_nesting(), 0, ct_dynticks()); - WRITE_ONCE(ct->dynticks_nmi_nesting, 0); /* Avoid store tearing. */ - - // instrumentation for the noinstr rcu_dynticks_eqs_enter() - instrument_atomic_write(&ct->dynticks, sizeof(ct->dynticks)); - instrumentation_end(); - - // RCU is watching here ... - rcu_dynticks_eqs_enter(); - // ... but is no longer watching here. - - if (!in_nmi()) - rcu_dynticks_task_enter(); -} +#endif /* #if defined(CONFIG_NO_HZ_FULL) && (!defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_KVM_XFER_TO_GUEST_WORK)) */ #ifdef CONFIG_PROVE_RCU /** @@ -788,77 +600,7 @@ void rcu_irq_exit_check_preempt(void) } #endif /* #ifdef CONFIG_PROVE_RCU */ -/* - * Exit an RCU extended quiescent state, which can be either the - * idle loop or adaptive-tickless usermode execution. - * - * We crowbar the ->dynticks_nmi_nesting field to DYNTICK_IRQ_NONIDLE to - * allow for the possibility of usermode upcalls messing up our count of - * interrupt nesting level during the busy period that is just now starting. - */ -static void noinstr rcu_eqs_exit(bool user) -{ - struct context_tracking *ct = this_cpu_ptr(&context_tracking); - long oldval; - - WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !raw_irqs_disabled()); - oldval = ct_dynticks_nesting(); - WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0); - if (oldval) { - // RCU was already watching, so just do accounting and leave. - ct->dynticks_nesting++; - return; - } - rcu_dynticks_task_exit(); - // RCU is not watching here ... - rcu_dynticks_eqs_exit(); - // ... but is watching here. - instrumentation_begin(); - - // instrumentation for the noinstr rcu_dynticks_eqs_exit() - instrument_atomic_write(&ct->dynticks, sizeof(ct->dynticks)); - - trace_rcu_dyntick(TPS("End"), ct_dynticks_nesting(), 1, ct_dynticks()); - WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); - WRITE_ONCE(ct->dynticks_nesting, 1); - WARN_ON_ONCE(ct_dynticks_nmi_nesting()); - WRITE_ONCE(ct->dynticks_nmi_nesting, DYNTICK_IRQ_NONIDLE); - instrumentation_end(); -} - -/** - * rcu_idle_exit - inform RCU that current CPU is leaving idle - * - * Exit idle mode, in other words, -enter- the mode in which RCU - * read-side critical sections can occur. - * - * If you add or remove a call to rcu_idle_exit(), be sure to test with - * CONFIG_RCU_EQS_DEBUG=y. - */ -void noinstr rcu_idle_exit(void) -{ - unsigned long flags; - - raw_local_irq_save(flags); - rcu_eqs_exit(false); - raw_local_irq_restore(flags); -} - #ifdef CONFIG_NO_HZ_FULL -/** - * rcu_user_exit - inform RCU that we are exiting userspace. - * - * Exit RCU idle mode while entering the kernel because it can - * run a RCU read side critical section anytime. - * - * If you add or remove a call to rcu_user_exit(), be sure to test with - * CONFIG_RCU_EQS_DEBUG=y. - */ -void noinstr rcu_user_exit(void) -{ - rcu_eqs_exit(true); -} - /** * __rcu_irq_enter_check_tick - Enable scheduler tick on CPU if RCU needs it. * @@ -921,66 +663,6 @@ void __rcu_irq_enter_check_tick(void) } #endif /* CONFIG_NO_HZ_FULL */ -/** - * rcu_nmi_enter - inform RCU of entry to NMI context - * - * If the CPU was idle from RCU's viewpoint, update ct->dynticks and - * ct->dynticks_nmi_nesting to let the RCU grace-period handling know - * that the CPU is active. This implementation permits nested NMIs, as - * long as the nesting level does not overflow an int. (You will probably - * run out of stack space first.) 
- * - * If you add or remove a call to rcu_nmi_enter(), be sure to test - * with CONFIG_RCU_EQS_DEBUG=y. - */ -noinstr void rcu_nmi_enter(void) -{ - long incby = 2; - struct context_tracking *ct = this_cpu_ptr(&context_tracking); - - /* Complain about underflow. */ - WARN_ON_ONCE(ct_dynticks_nmi_nesting() < 0); - - /* - * If idle from RCU viewpoint, atomically increment ->dynticks - * to mark non-idle and increment ->dynticks_nmi_nesting by one. - * Otherwise, increment ->dynticks_nmi_nesting by two. This means - * if ->dynticks_nmi_nesting is equal to one, we are guaranteed - * to be in the outermost NMI handler that interrupted an RCU-idle - * period (observation due to Andy Lutomirski). - */ - if (rcu_dynticks_curr_cpu_in_eqs()) { - - if (!in_nmi()) - rcu_dynticks_task_exit(); - - // RCU is not watching here ... - rcu_dynticks_eqs_exit(); - // ... but is watching here. - - instrumentation_begin(); - // instrumentation for the noinstr rcu_dynticks_curr_cpu_in_eqs() - instrument_atomic_read(&ct->dynticks, sizeof(ct->dynticks)); - // instrumentation for the noinstr rcu_dynticks_eqs_exit() - instrument_atomic_write(&ct->dynticks, sizeof(ct->dynticks)); - - incby = 1; - } else if (!in_nmi()) { - instrumentation_begin(); - rcu_irq_enter_check_tick(); - } else { - instrumentation_begin(); - } - - trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="), - ct_dynticks_nmi_nesting(), - ct_dynticks_nmi_nesting() + incby, ct_dynticks()); - instrumentation_end(); - WRITE_ONCE(ct->dynticks_nmi_nesting, /* Prevent store tearing. */ - ct_dynticks_nmi_nesting() + incby); - barrier(); -} - /* * Check to see if any future non-offloaded RCU-related work will need * to be done by the current CPU, even if none need be done immediately, diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 72dbf8512ce7..0d5d1de327e4 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -427,7 +427,6 @@ static void rcu_cpu_kthread_setup(unsigned int cpu); static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp); static bool rcu_preempt_has_tasks(struct rcu_node *rnp); static bool rcu_preempt_need_deferred_qs(struct task_struct *t); -static void rcu_preempt_deferred_qs(struct task_struct *t); static void zero_cpu_stall_ticks(struct rcu_data *rdp); static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp); static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq); @@ -467,10 +466,6 @@ do { \ static void rcu_bind_gp_kthread(void); static bool rcu_nohz_full_cpu(void); -static void rcu_dynticks_task_enter(void); -static void rcu_dynticks_task_exit(void); -static void rcu_dynticks_task_trace_enter(void); -static void rcu_dynticks_task_trace_exit(void); /* Forward declarations for tree_stall.h */ static void record_gp_stall_check_time(void); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index c8ba0fe17267..4a53aa013f82 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -595,7 +595,7 @@ static bool rcu_preempt_need_deferred_qs(struct task_struct *t) * evaluate safety in terms of interrupt, softirq, and preemption * disabling. */ -static void rcu_preempt_deferred_qs(struct task_struct *t) +void rcu_preempt_deferred_qs(struct task_struct *t) { unsigned long flags; @@ -935,7 +935,7 @@ static bool rcu_preempt_need_deferred_qs(struct task_struct *t) // period for a quiescent state from this CPU. Note that requests from // tasks are handled when removing the task from the blocked-tasks list // below. 
-static void rcu_preempt_deferred_qs(struct task_struct *t) +void rcu_preempt_deferred_qs(struct task_struct *t) { struct rcu_data *rdp = this_cpu_ptr(&rcu_data); @@ -1290,37 +1290,3 @@ static void rcu_bind_gp_kthread(void) return; housekeeping_affine(current, HK_TYPE_RCU); } - -/* Record the current task on dyntick-idle entry. */ -static __always_inline void rcu_dynticks_task_enter(void) -{ -#if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) - WRITE_ONCE(current->rcu_tasks_idle_cpu, smp_processor_id()); -#endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */ -} - -/* Record no current task on dyntick-idle exit. */ -static __always_inline void rcu_dynticks_task_exit(void) -{ -#if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) - WRITE_ONCE(current->rcu_tasks_idle_cpu, -1); -#endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */ -} - -/* Turn on heavyweight RCU tasks trace readers on idle/user entry. */ -static __always_inline void rcu_dynticks_task_trace_enter(void) -{ -#ifdef CONFIG_TASKS_TRACE_RCU - if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) - current->trc_reader_special.b.need_mb = true; -#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ -} - -/* Turn off heavyweight RCU tasks trace readers on idle/user exit. */ -static __always_inline void rcu_dynticks_task_trace_exit(void) -{ -#ifdef CONFIG_TASKS_TRACE_RCU - if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) - current->trc_reader_special.b.need_mb = false; -#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ -} -- cgit From c33ef43a359001415032665dfcd433979c462b71 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 8 Jun 2022 16:40:34 +0200 Subject: rcu/context-tracking: Remove unused and/or unnecessary middle functions Some eqs functions are now only used internally by context tracking, so their public declarations can be removed. Also middle functions such as rcu_user_*() and rcu_idle_*() which now directly call to rcu_eqs_enter() and rcu_eqs_exit() can be wiped out as well. Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Neeraj Upadhyay Cc: Uladzislau Rezki Cc: Joel Fernandes Cc: Boqun Feng Cc: Nicolas Saenz Julienne Cc: Marcelo Tosatti Cc: Xiongfeng Wang Cc: Yu Liao Cc: Phil Auld Cc: Paul Gortmaker Cc: Alex Belits Signed-off-by: Paul E. McKenney Reviewed-by: Nicolas Saenz Julienne Tested-by: Nicolas Saenz Julienne --- kernel/context_tracking.c | 98 +++++++++++++---------------------------------- 1 file changed, 27 insertions(+), 71 deletions(-) (limited to 'kernel') diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 072c4b6044b3..e485b6b01537 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -189,17 +189,17 @@ static void noinstr rcu_eqs_exit(bool user) } /** - * rcu_nmi_exit - inform RCU of exit from NMI context + * ct_nmi_exit - inform RCU of exit from NMI context * * If we are returning from the outermost NMI handler that interrupted an * RCU-idle period, update ct->dynticks and ct->dynticks_nmi_nesting * to let the RCU grace-period handling know that the CPU is back to * being RCU-idle. * - * If you add or remove a call to rcu_nmi_exit(), be sure to test + * If you add or remove a call to ct_nmi_exit(), be sure to test * with CONFIG_RCU_EQS_DEBUG=y.
*/ -void noinstr rcu_nmi_exit(void) +void noinstr ct_nmi_exit(void) { struct context_tracking *ct = this_cpu_ptr(&context_tracking); @@ -242,7 +242,7 @@ void noinstr rcu_nmi_exit(void) } /** - * rcu_nmi_enter - inform RCU of entry to NMI context + * ct_nmi_enter - inform RCU of entry to NMI context * * If the CPU was idle from RCU's viewpoint, update ct->dynticks and * ct->dynticks_nmi_nesting to let the RCU grace-period handling know @@ -250,10 +250,10 @@ void noinstr rcu_nmi_exit(void) * long as the nesting level does not overflow an int. (You will probably * run out of stack space first.) * - * If you add or remove a call to rcu_nmi_enter(), be sure to test + * If you add or remove a call to ct_nmi_enter(), be sure to test * with CONFIG_RCU_EQS_DEBUG=y. */ -void noinstr rcu_nmi_enter(void) +void noinstr ct_nmi_enter(void) { long incby = 2; struct context_tracking *ct = this_cpu_ptr(&context_tracking); @@ -302,32 +302,33 @@ void noinstr rcu_nmi_enter(void) } /** - * rcu_idle_enter - inform RCU that current CPU is entering idle + * ct_idle_enter - inform RCU that current CPU is entering idle * * Enter idle mode, in other words, -leave- the mode in which RCU * read-side critical sections can occur. (Though RCU read-side * critical sections can occur in irq handlers in idle, a possibility * handled by irq_enter() and irq_exit().) * - * If you add or remove a call to rcu_idle_enter(), be sure to test with + * If you add or remove a call to ct_idle_enter(), be sure to test with * CONFIG_RCU_EQS_DEBUG=y. */ -void noinstr rcu_idle_enter(void) +void noinstr ct_idle_enter(void) { WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !raw_irqs_disabled()); rcu_eqs_enter(false); } +EXPORT_SYMBOL_GPL(ct_idle_enter); /** - * rcu_idle_exit - inform RCU that current CPU is leaving idle + * ct_idle_exit - inform RCU that current CPU is leaving idle * * Exit idle mode, in other words, -enter- the mode in which RCU * read-side critical sections can occur. * - * If you add or remove a call to rcu_idle_exit(), be sure to test with + * If you add or remove a call to ct_idle_exit(), be sure to test with * CONFIG_RCU_EQS_DEBUG=y. */ -void noinstr rcu_idle_exit(void) +void noinstr ct_idle_exit(void) { unsigned long flags; @@ -335,18 +336,6 @@ void noinstr rcu_idle_exit(void) rcu_eqs_exit(false); raw_local_irq_restore(flags); } -EXPORT_SYMBOL_GPL(rcu_idle_exit); - -noinstr void ct_idle_enter(void) -{ - rcu_idle_enter(); -} -EXPORT_SYMBOL_GPL(ct_idle_enter); - -void ct_idle_exit(void) -{ - rcu_idle_exit(); -} EXPORT_SYMBOL_GPL(ct_idle_exit); /** @@ -431,50 +420,11 @@ void ct_irq_exit_irqson(void) ct_irq_exit(); local_irq_restore(flags); } - -noinstr void ct_nmi_enter(void) -{ - rcu_nmi_enter(); -} - -noinstr void ct_nmi_exit(void) -{ - rcu_nmi_exit(); -} +#else +static __always_inline void rcu_eqs_enter(bool user) { } +static __always_inline void rcu_eqs_exit(bool user) { } #endif /* #ifdef CONFIG_CONTEXT_TRACKING_IDLE */ -#ifdef CONFIG_NO_HZ_FULL -/** - * rcu_user_enter - inform RCU that we are resuming userspace. - * - * Enter RCU idle mode right before resuming userspace. No use of RCU - * is permitted between this call and rcu_user_exit(). This way the - * CPU doesn't need to maintain the tick for RCU maintenance purposes - * when the CPU runs in userspace. - * - * If you add or remove a call to rcu_user_enter(), be sure to test with - * CONFIG_RCU_EQS_DEBUG=y. - */ -noinstr void rcu_user_enter(void) -{ - rcu_eqs_enter(true); -} - -/** - * rcu_user_exit - inform RCU that we are exiting userspace. 
- * - * Exit RCU idle mode while entering the kernel because it can - * run a RCU read side critical section anytime. - * - * If you add or remove a call to rcu_user_exit(), be sure to test with - * CONFIG_RCU_EQS_DEBUG=y. - */ -void noinstr rcu_user_exit(void) -{ - rcu_eqs_exit(true); -} -#endif /* #ifdef CONFIG_NO_HZ_FULL */ - #ifdef CONFIG_CONTEXT_TRACKING_USER #define CREATE_TRACE_POINTS @@ -542,7 +492,13 @@ void noinstr __ct_user_enter(enum ctx_state state) * that will fire and reschedule once we resume in user/guest mode. */ rcu_irq_work_resched(); - rcu_user_enter(); + /* + * Enter RCU idle mode right before resuming userspace. No use of RCU + * is permitted between this call and rcu_eqs_exit(). This way the + * CPU doesn't need to maintain the tick for RCU maintenance purposes + * when the CPU runs in userspace. + */ + rcu_eqs_enter(true); } /* * Even if context tracking is disabled on this CPU, because it's outside @@ -579,7 +535,7 @@ void ct_user_enter(enum ctx_state state) /* * Some contexts may involve an exception occuring in an irq, * leading to that nesting: - * ct_irq_enter() rcu_user_exit() rcu_user_exit() ct_irq_exit() + * ct_irq_enter() rcu_eqs_exit(true) rcu_eqs_enter(true) ct_irq_exit() * This would mess up the dyntick_nesting count though. And rcu_irq_*() * helpers are enough to protect RCU uses inside the exception. So * just return immediately if we detect we are in an IRQ. @@ -631,10 +587,10 @@ void noinstr __ct_user_exit(enum ctx_state state) if (__this_cpu_read(context_tracking.state) == state) { if (__this_cpu_read(context_tracking.active)) { /* - * We are going to run code that may use RCU. Inform - * RCU core about that (ie: we may need the tick again). + * Exit RCU idle mode while entering the kernel because it can + * run a RCU read side critical section anytime. */ - rcu_user_exit(); + rcu_eqs_exit(true); if (state == CONTEXT_USER) { instrumentation_begin(); vtime_user_exit(current); -- cgit From 171476775d32a40bfebf83250136c19b2e842672 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 8 Jun 2022 16:40:35 +0200 Subject: context_tracking: Convert state to atomic_t Context tracking's state and dynticks counter are going to be merged in a single field so that both updates can happen atomically and at the same time. Prepare for that with converting the state into an atomic_t. [ paulmck: Apply kernel test robot feedback. ] Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Neeraj Upadhyay Cc: Uladzislau Rezki Cc: Joel Fernandes Cc: Boqun Feng Cc: Nicolas Saenz Julienne Cc: Marcelo Tosatti Cc: Xiongfeng Wang Cc: Yu Liao Cc: Phil Auld Cc: Paul Gortmaker Cc: Alex Belits Signed-off-by: Paul E. 
McKenney Reviewed-by: Nicolas Saenz Julienne Tested-by: Nicolas Saenz Julienne --- kernel/context_tracking.c | 143 ++++++++++++++++++++++++++++++---------------- kernel/rcu/tree.c | 13 ++--- kernel/rcu/tree_stall.h | 4 +- 3 files changed, 102 insertions(+), 58 deletions(-) (limited to 'kernel') diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index e485b6b01537..ca78ff27dc53 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -28,8 +28,8 @@ DEFINE_PER_CPU(struct context_tracking, context_tracking) = { #ifdef CONFIG_CONTEXT_TRACKING_IDLE .dynticks_nesting = 1, .dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE, - .dynticks = ATOMIC_INIT(1), #endif + .state = ATOMIC_INIT(RCU_DYNTICKS_IDX), }; EXPORT_SYMBOL_GPL(context_tracking); @@ -76,7 +76,7 @@ static __always_inline void rcu_dynticks_task_trace_exit(void) * RCU is watching prior to the call to this function and is no longer * watching upon return. */ -static noinstr void rcu_dynticks_eqs_enter(void) +static noinstr void ct_kernel_exit_state(int offset) { int seq; @@ -86,9 +86,9 @@ static noinstr void rcu_dynticks_eqs_enter(void) * next idle sojourn. */ rcu_dynticks_task_trace_enter(); // Before ->dynticks update! - seq = rcu_dynticks_inc(1); + seq = ct_state_inc(offset); // RCU is no longer watching. Better be in extended quiescent state! - WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && (seq & 0x1)); + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && (seq & RCU_DYNTICKS_IDX)); } /* @@ -96,7 +96,7 @@ static noinstr void rcu_dynticks_eqs_enter(void) * called from an extended quiescent state, that is, RCU is not watching * prior to the call to this function and is watching upon return. */ -static noinstr void rcu_dynticks_eqs_exit(void) +static noinstr void ct_kernel_enter_state(int offset) { int seq; @@ -105,10 +105,10 @@ static noinstr void rcu_dynticks_eqs_exit(void) * and we also must force ordering with the next RCU read-side * critical section. */ - seq = rcu_dynticks_inc(1); + seq = ct_state_inc(offset); // RCU is now watching. Better not be in an extended quiescent state! rcu_dynticks_task_trace_exit(); // After ->dynticks update! - WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(seq & 0x1)); + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(seq & RCU_DYNTICKS_IDX)); } /* @@ -119,7 +119,7 @@ static noinstr void rcu_dynticks_eqs_exit(void) * the possibility of usermode upcalls having messed up our count * of interrupt nesting level during the prior busy period. */ -static void noinstr rcu_eqs_enter(bool user) +static void noinstr ct_kernel_exit(bool user, int offset) { struct context_tracking *ct = this_cpu_ptr(&context_tracking); @@ -139,13 +139,13 @@ static void noinstr rcu_eqs_enter(bool user) WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); rcu_preempt_deferred_qs(current); - // instrumentation for the noinstr rcu_dynticks_eqs_enter() - instrument_atomic_write(&ct->dynticks, sizeof(ct->dynticks)); + // instrumentation for the noinstr ct_kernel_exit_state() + instrument_atomic_write(&ct->state, sizeof(ct->state)); instrumentation_end(); WRITE_ONCE(ct->dynticks_nesting, 0); /* Avoid irq-access tearing. */ // RCU is watching here ... - rcu_dynticks_eqs_enter(); + ct_kernel_exit_state(offset); // ... but is no longer watching here. 
rcu_dynticks_task_enter(); } @@ -158,7 +158,7 @@ static void noinstr rcu_eqs_enter(bool user) * allow for the possibility of usermode upcalls messing up our count of * interrupt nesting level during the busy period that is just now starting. */ -static void noinstr rcu_eqs_exit(bool user) +static void noinstr ct_kernel_enter(bool user, int offset) { struct context_tracking *ct = this_cpu_ptr(&context_tracking); long oldval; @@ -173,12 +173,12 @@ static void noinstr rcu_eqs_exit(bool user) } rcu_dynticks_task_exit(); // RCU is not watching here ... - rcu_dynticks_eqs_exit(); + ct_kernel_enter_state(offset); // ... but is watching here. instrumentation_begin(); - // instrumentation for the noinstr rcu_dynticks_eqs_exit() - instrument_atomic_write(&ct->dynticks, sizeof(ct->dynticks)); + // instrumentation for the noinstr ct_kernel_enter_state() + instrument_atomic_write(&ct->state, sizeof(ct->state)); trace_rcu_dyntick(TPS("End"), ct_dynticks_nesting(), 1, ct_dynticks()); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); @@ -192,7 +192,7 @@ static void noinstr rcu_eqs_exit(bool user) * ct_nmi_exit - inform RCU of exit from NMI context * * If we are returning from the outermost NMI handler that interrupted an - * RCU-idle period, update ct->dynticks and ct->dynticks_nmi_nesting + * RCU-idle period, update ct->state and ct->dynticks_nmi_nesting * to let the RCU grace-period handling know that the CPU is back to * being RCU-idle. * @@ -229,12 +229,12 @@ void noinstr ct_nmi_exit(void) trace_rcu_dyntick(TPS("Startirq"), ct_dynticks_nmi_nesting(), 0, ct_dynticks()); WRITE_ONCE(ct->dynticks_nmi_nesting, 0); /* Avoid store tearing. */ - // instrumentation for the noinstr rcu_dynticks_eqs_enter() - instrument_atomic_write(&ct->dynticks, sizeof(ct->dynticks)); + // instrumentation for the noinstr ct_kernel_exit_state() + instrument_atomic_write(&ct->state, sizeof(ct->state)); instrumentation_end(); // RCU is watching here ... - rcu_dynticks_eqs_enter(); + ct_kernel_exit_state(RCU_DYNTICKS_IDX); // ... but is no longer watching here. if (!in_nmi()) @@ -244,7 +244,7 @@ void noinstr ct_nmi_exit(void) /** * ct_nmi_enter - inform RCU of entry to NMI context * - * If the CPU was idle from RCU's viewpoint, update ct->dynticks and + * If the CPU was idle from RCU's viewpoint, update ct->state and * ct->dynticks_nmi_nesting to let the RCU grace-period handling know * that the CPU is active. This implementation permits nested NMIs, as * long as the nesting level does not overflow an int. (You will probably @@ -275,14 +275,14 @@ void noinstr ct_nmi_enter(void) rcu_dynticks_task_exit(); // RCU is not watching here ... - rcu_dynticks_eqs_exit(); + ct_kernel_enter_state(RCU_DYNTICKS_IDX); // ... but is watching here. 
instrumentation_begin(); // instrumentation for the noinstr rcu_dynticks_curr_cpu_in_eqs() - instrument_atomic_read(&ct->dynticks, sizeof(ct->dynticks)); - // instrumentation for the noinstr rcu_dynticks_eqs_exit() - instrument_atomic_write(&ct->dynticks, sizeof(ct->dynticks)); + instrument_atomic_read(&ct->state, sizeof(ct->state)); + // instrumentation for the noinstr ct_kernel_enter_state() + instrument_atomic_write(&ct->state, sizeof(ct->state)); incby = 1; } else if (!in_nmi()) { @@ -315,7 +315,7 @@ void noinstr ct_nmi_enter(void) void noinstr ct_idle_enter(void) { WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !raw_irqs_disabled()); - rcu_eqs_enter(false); + ct_kernel_exit(false, RCU_DYNTICKS_IDX + CONTEXT_IDLE); } EXPORT_SYMBOL_GPL(ct_idle_enter); @@ -333,7 +333,7 @@ void noinstr ct_idle_exit(void) unsigned long flags; raw_local_irq_save(flags); - rcu_eqs_exit(false); + ct_kernel_enter(false, RCU_DYNTICKS_IDX - CONTEXT_IDLE); raw_local_irq_restore(flags); } EXPORT_SYMBOL_GPL(ct_idle_exit); @@ -421,8 +421,8 @@ void ct_irq_exit_irqson(void) local_irq_restore(flags); } #else -static __always_inline void rcu_eqs_enter(bool user) { } -static __always_inline void rcu_eqs_exit(bool user) { } +static __always_inline void ct_kernel_exit(bool user, int offset) { } +static __always_inline void ct_kernel_enter(bool user, int offset) { } #endif /* #ifdef CONFIG_CONTEXT_TRACKING_IDLE */ #ifdef CONFIG_CONTEXT_TRACKING_USER @@ -463,6 +463,7 @@ static __always_inline void context_tracking_recursion_exit(void) */ void noinstr __ct_user_enter(enum ctx_state state) { + struct context_tracking *ct = this_cpu_ptr(&context_tracking); lockdep_assert_irqs_disabled(); /* Kernel threads aren't supposed to go to userspace */ @@ -471,8 +472,8 @@ void noinstr __ct_user_enter(enum ctx_state state) if (!context_tracking_recursion_enter()) return; - if ( __this_cpu_read(context_tracking.state) != state) { - if (__this_cpu_read(context_tracking.active)) { + if (__ct_state() != state) { + if (ct->active) { /* * At this stage, only low level arch entry code remains and * then we'll run in userspace. We can assume there won't be @@ -492,28 +493,49 @@ void noinstr __ct_user_enter(enum ctx_state state) * that will fire and reschedule once we resume in user/guest mode. */ rcu_irq_work_resched(); + /* * Enter RCU idle mode right before resuming userspace. No use of RCU * is permitted between this call and rcu_eqs_exit(). This way the * CPU doesn't need to maintain the tick for RCU maintenance purposes * when the CPU runs in userspace. */ - rcu_eqs_enter(true); + ct_kernel_exit(true, RCU_DYNTICKS_IDX + state); + + /* + * Special case if we only track user <-> kernel transitions for tickless + * cputime accounting but we don't support RCU extended quiescent state. + * In this we case we don't care about any concurrency/ordering. + */ + if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE)) + atomic_set(&ct->state, state); + } else { + /* + * Even if context tracking is disabled on this CPU, because it's outside + * the full dynticks mask for example, we still have to keep track of the + * context transitions and states to prevent inconsistency on those of + * other CPUs. + * If a task triggers an exception in userspace, sleep on the exception + * handler and then migrate to another CPU, that new CPU must know where + * the exception returns by the time we call exception_exit(). + * This information can only be provided by the previous CPU when it called + * exception_enter(). 
+ * OTOH we can spare the calls to vtime and RCU when context_tracking.active + * is false because we know that CPU is not tickless. + */ + if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE)) { + /* Tracking for vtime only, no concurrent RCU EQS accounting */ + atomic_set(&ct->state, state); + } else { + /* + * Tracking for vtime and RCU EQS. Make sure we don't race + * with NMIs. OTOH we don't care about ordering here since + * RCU only requires RCU_DYNTICKS_IDX increments to be fully + * ordered. + */ + atomic_add(state, &ct->state); + } } - /* - * Even if context tracking is disabled on this CPU, because it's outside - * the full dynticks mask for example, we still have to keep track of the - * context transitions and states to prevent inconsistency on those of - * other CPUs. - * If a task triggers an exception in userspace, sleep on the exception - * handler and then migrate to another CPU, that new CPU must know where - * the exception returns by the time we call exception_exit(). - * This information can only be provided by the previous CPU when it called - * exception_enter(). - * OTOH we can spare the calls to vtime and RCU when context_tracking.active - * is false because we know that CPU is not tickless. - */ - __this_cpu_write(context_tracking.state, state); } context_tracking_recursion_exit(); } @@ -581,24 +603,47 @@ NOKPROBE_SYMBOL(user_enter_callable); */ void noinstr __ct_user_exit(enum ctx_state state) { + struct context_tracking *ct = this_cpu_ptr(&context_tracking); + if (!context_tracking_recursion_enter()) return; - if (__this_cpu_read(context_tracking.state) == state) { - if (__this_cpu_read(context_tracking.active)) { + if (__ct_state() == state) { + if (ct->active) { /* * Exit RCU idle mode while entering the kernel because it can * run a RCU read side critical section anytime. */ - rcu_eqs_exit(true); + ct_kernel_enter(true, RCU_DYNTICKS_IDX - state); if (state == CONTEXT_USER) { instrumentation_begin(); vtime_user_exit(current); trace_user_exit(0); instrumentation_end(); } + + /* + * Special case if we only track user <-> kernel transitions for tickless + * cputime accounting but we don't support RCU extended quiescent state. + * In this we case we don't care about any concurrency/ordering. + */ + if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE)) + atomic_set(&ct->state, CONTEXT_KERNEL); + + } else { + if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE)) { + /* Tracking for vtime only, no concurrent RCU EQS accounting */ + atomic_set(&ct->state, CONTEXT_KERNEL); + } else { + /* + * Tracking for vtime and RCU EQS. Make sure we don't race + * with NMIs. OTOH we don't care about ordering here since + * RCU only requires RCU_DYNTICKS_IDX increments to be fully + * ordered. + */ + atomic_sub(state, &ct->state); + } } - __this_cpu_write(context_tracking.state, CONTEXT_KERNEL); } context_tracking_recursion_exit(); } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e2a2083079a2..f9d20b40071f 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -272,9 +272,9 @@ void rcu_softirq_qs(void) */ static void rcu_dynticks_eqs_online(void) { - if (ct_dynticks() & 0x1) + if (ct_dynticks() & RCU_DYNTICKS_IDX) return; - rcu_dynticks_inc(1); + ct_state_inc(RCU_DYNTICKS_IDX); } /* @@ -293,7 +293,7 @@ static int rcu_dynticks_snap(int cpu) */ static bool rcu_dynticks_in_eqs(int snap) { - return !(snap & 0x1); + return !(snap & RCU_DYNTICKS_IDX); } /* Return true if the specified CPU is currently idle from an RCU viewpoint. 
*/ @@ -321,8 +321,7 @@ bool rcu_dynticks_zero_in_eqs(int cpu, int *vp) int snap; // If not quiescent, force back to earlier extended quiescent state. - snap = ct_dynticks_cpu(cpu) & ~0x1; - + snap = ct_dynticks_cpu(cpu) & ~RCU_DYNTICKS_IDX; smp_rmb(); // Order ->dynticks and *vp reads. if (READ_ONCE(*vp)) return false; // Non-zero, so report failure; @@ -348,9 +347,9 @@ notrace void rcu_momentary_dyntick_idle(void) int seq; raw_cpu_write(rcu_data.rcu_need_heavy_qs, false); - seq = rcu_dynticks_inc(2); + seq = ct_state_inc(2 * RCU_DYNTICKS_IDX); /* It is illegal to call this from idle state. */ - WARN_ON_ONCE(!(seq & 0x1)); + WARN_ON_ONCE(!(seq & RCU_DYNTICKS_IDX)); rcu_preempt_deferred_qs(current); } EXPORT_SYMBOL_GPL(rcu_momentary_dyntick_idle); diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 2683ce0a7c72..195cad14742d 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -469,7 +469,7 @@ static void print_cpu_stall_info(int cpu) rcuc_starved = rcu_is_rcuc_kthread_starving(rdp, &j); if (rcuc_starved) sprintf(buf, " rcuc=%ld jiffies(starved)", j); - pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%ld/%#lx softirq=%u/%u fqs=%ld%s%s\n", + pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%04x/%ld/%#lx softirq=%u/%u fqs=%ld%s%s\n", cpu, "O."[!!cpu_online(cpu)], "o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)], @@ -478,7 +478,7 @@ static void print_cpu_stall_info(int cpu) rdp->rcu_iw_pending ? (int)min(delta, 9UL) + '0' : "!."[!delta], ticks_value, ticks_title, - rcu_dynticks_snap(cpu) & 0xfff, + rcu_dynticks_snap(cpu) & 0xffff, ct_dynticks_nesting_cpu(cpu), ct_dynticks_nmi_nesting_cpu(cpu), rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), data_race(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart, -- cgit From 08ab707dfc83d6ab7829c1c0f39b0d4530fa42a8 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 8 Jun 2022 16:40:37 +0200 Subject: MAINTAINERS: Add Paul as context tracking maintainer Since most of the bits have been imported from kernel/rcu/tree.c and now that the context tracking code is tightly linked to RCU, add Paul as a context tracking maintainer. Also update the context tracking file header accordingly. Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Neeraj Upadhyay Cc: Uladzislau Rezki Cc: Joel Fernandes Cc: Boqun Feng Cc: Nicolas Saenz Julienne Cc: Marcelo Tosatti Cc: Xiongfeng Wang Cc: Yu Liao Cc: Phil Auld Cc: Paul Gortmaker Cc: Alex Belits Signed-off-by: Paul E. McKenney Reviewed-by: Nicolas Saenz Julienne Tested-by: Nicolas Saenz Julienne --- kernel/context_tracking.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index ca78ff27dc53..77978e372377 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -1,18 +1,20 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Context tracking: Probe on high level context boundaries such as kernel - * and userspace. This includes syscalls and exceptions entry/exit. + * Context tracking: Probe on high level context boundaries such as kernel, + * userspace, guest or idle. * * This is used by RCU to remove its dependency on the timer tick while a CPU - * runs in userspace. + * runs in idle, userspace or guest mode. 
* - * Started by Frederic Weisbecker: + * User/guest tracking started by Frederic Weisbecker: * - * Copyright (C) 2012 Red Hat, Inc., Frederic Weisbecker + * Copyright (C) 2012 Red Hat, Inc., Frederic Weisbecker * * Many thanks to Gilad Ben-Yossef, Paul McKenney, Ingo Molnar, Andrew Morton, * Steven Rostedt, Peter Zijlstra for suggestions and improvements. * + * RCU extended quiescent state bits imported from kernel/rcu/tree.c + * where the relevant authorship may be found. */ #include -- cgit From ec6209c8d42f815bc3bef10934637ca92114cd1b Mon Sep 17 00:00:00 2001 From: Daniel Müller Date: Tue, 28 Jun 2022 16:01:21 +0000 Subject: bpf, libbpf: Add type match support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch adds support for the proposed type match relation to relo_core where it is shared between userspace and kernel. It plumbs through both kernel-side and libbpf-side support. The matching relation is defined as follows (copy from source): - modifiers and typedefs are stripped (and, hence, effectively ignored) - generally speaking types need to be of same kind (struct vs. struct, union vs. union, etc.) - exceptions are struct/union behind a pointer which could also match a forward declaration of a struct or union, respectively, and enum vs. enum64 (see below) Then, depending on type: - integers: - match if size and signedness match - arrays & pointers: - target types are recursively matched - structs & unions: - local members need to exist in target with the same name - for each member we recursively check match unless it is already behind a pointer, in which case we only check matching names and compatible kind - enums: - local variants have to have a match in target by symbolic name (but not numeric value) - size has to match (but enum may match enum64 and vice versa) - function pointers: - number and position of arguments in local type has to match target - for each argument and the return value we recursively check match Signed-off-by: Daniel Müller Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220628160127.607834-5-deso@posteo.net --- kernel/bpf/btf.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 8d3c7ab8af46..4f2408a4df08 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -7443,6 +7443,15 @@ int bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id, MAX_TYPES_ARE_COMPAT_DEPTH); } +#define MAX_TYPES_MATCH_DEPTH 2 + +int bpf_core_types_match(const struct btf *local_btf, u32 local_id, + const struct btf *targ_btf, u32 targ_id) +{ + return __bpf_core_types_match(local_btf, local_id, targ_btf, targ_id, false, + MAX_TYPES_MATCH_DEPTH); +} + static bool bpf_core_is_flavor_sep(const char *s) { /* check X___Y name pattern, where X and Y are not underscores */ -- cgit From c02b872a7ca7842e4cdbbf621f77607d0a655f83 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Sun, 26 Jun 2022 10:10:56 +0100 Subject: Documentation: update watch_queue.rst references Changeset f5461124d59b ("Documentation: move watch_queue to core-api") renamed: Documentation/watch_queue.rst to: Documentation/core-api/watch_queue.rst. Update the cross-references accordingly. 
Fixes: f5461124d59b ("Documentation: move watch_queue to core-api") Reviewed-by: Randy Dunlap Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/1c220de9c58f35e815a3df9458ac2bea323c8bfb.1656234456.git.mchehab@kernel.org Signed-off-by: Jonathan Corbet --- kernel/watch_queue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index 230038d4f908..869fea4fe26b 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c @@ -4,7 +4,7 @@ * Copyright (C) 2020 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * - * See Documentation/watch_queue.rst + * See Documentation/core-api/watch_queue.rst */ #define pr_fmt(fmt) "watchq: " fmt -- cgit From d1a6edecc1fddfb6ef92c8f720631d2c02bf2744 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Fri, 8 Jul 2022 10:50:00 -0700 Subject: bpf: Check attach_func_proto more carefully in check_return_code Syzkaller reports the following crash: RIP: 0010:check_return_code kernel/bpf/verifier.c:10575 [inline] RIP: 0010:do_check kernel/bpf/verifier.c:12346 [inline] RIP: 0010:do_check_common+0xb3d2/0xd250 kernel/bpf/verifier.c:14610 With the following reproducer: bpf$PROG_LOAD_XDP(0x5, &(0x7f00000004c0)={0xd, 0x3, &(0x7f0000000000)=ANY=[@ANYBLOB="1800000000000019000000000000000095"], &(0x7f0000000300)='GPL\x00', 0x0, 0x0, 0x0, 0x0, 0x0, '\x00', 0x0, 0x2b, 0xffffffffffffffff, 0x8, 0x0, 0x0, 0x10, 0x0}, 0x80) Because we don't enforce expected_attach_type for XDP programs, we end up in hitting 'if (prog->expected_attach_type == BPF_LSM_CGROUP' part in check_return_code and follow up with testing `prog->aux->attach_func_proto->type`, but `prog->aux->attach_func_proto` is NULL. Add explicit prog_type check for the "Note, BPF_LSM_CGROUP that attach ..." condition. Also, don't skip return code check for LSM/STRUCT_OPS. The above actually brings an issue with existing selftest which tries to return EPERM from void inet_csk_clone. Fix the test (and move called_socket_clone to make sure it's not incremented in case of an error) and add a new one to explicitly verify this condition. Fixes: 69fd337a975c ("bpf: per-cgroup lsm flavor") Reported-by: syzbot+5cc0730bd4b4d2c5f152@syzkaller.appspotmail.com Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20220708175000.2603078-1-sdf@google.com --- kernel/bpf/verifier.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index df3ec6b05f05..e3cf6194c24f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -10444,11 +10444,21 @@ static int check_return_code(struct bpf_verifier_env *env) const bool is_subprog = frame->subprogno; /* LSM and struct_ops func-ptr's return type could be "void" */ - if (!is_subprog && - (prog_type == BPF_PROG_TYPE_STRUCT_OPS || - prog_type == BPF_PROG_TYPE_LSM) && - !prog->aux->attach_func_proto->type) - return 0; + if (!is_subprog) { + switch (prog_type) { + case BPF_PROG_TYPE_LSM: + if (prog->expected_attach_type == BPF_LSM_CGROUP) + /* See below, can be 0 or 0-1 depending on hook. */ + break; + fallthrough; + case BPF_PROG_TYPE_STRUCT_OPS: + if (!prog->aux->attach_func_proto->type) + return 0; + break; + default: + break; + } + } /* eBPF calling convention is such that R0 is used * to return the value from eBPF program. 
@@ -10572,6 +10582,7 @@ static int check_return_code(struct bpf_verifier_env *env) if (!tnum_in(range, reg->var_off)) { verbose_invalid_scalar(env, reg, &range, "program exit", "R0"); if (prog->expected_attach_type == BPF_LSM_CGROUP && + prog_type == BPF_PROG_TYPE_LSM && !prog->aux->attach_func_proto->type) verbose(env, "Note, BPF_LSM_CGROUP that attach to void LSM hooks can't modify return value!\n"); return -EINVAL; -- cgit From ae39e9ed964f8e450d0de410b5a757e19581dfc5 Mon Sep 17 00:00:00 2001 From: Saravana Kannan Date: Fri, 3 Jun 2022 18:01:00 -0700 Subject: module: Add support for default value for module async_probe Add a module.async_probe kernel command line option that allows enabling async probing for all modules. When this command line option is used, there might still be some modules for which we want to explicitly force synchronous probing, so extend .async_probe to take an optional bool input so that async probing can be disabled for a specific module. Signed-off-by: Saravana Kannan Reviewed-by: Aaron Tomlin Signed-off-by: Luis Chamberlain --- kernel/module/main.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module/main.c b/kernel/module/main.c index 0548151dd933..07dd9c293ab9 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -2410,6 +2410,12 @@ static void do_free_init(struct work_struct *w) } } +#undef MODULE_PARAM_PREFIX +#define MODULE_PARAM_PREFIX "module." +/* Default value for module->async_probe_requested */ +static bool async_probe; +module_param(async_probe, bool, 0644); + /* * This is where the real work happens. * @@ -2630,7 +2636,8 @@ static int unknown_module_param_cb(char *param, char *val, const char *modname, int ret; if (strcmp(param, "async_probe") == 0) { - mod->async_probe_requested = true; + if (strtobool(val, &mod->async_probe_requested)) + mod->async_probe_requested = true; return 0; } @@ -2797,6 +2804,8 @@ static int load_module(struct load_info *info, const char __user *uargs, if (err) goto bug_cleanup; + mod->async_probe_requested = async_probe; + /* Module is ready to execute: parsing args may do that. */ after_dashes = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, -32768, 32767, mod, -- cgit From ecc726f1458e7aa50e120ff334f0a3be5cccd94c Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 13 Jun 2022 08:02:01 +0200 Subject: module: Fix ERRORs reported by checkpatch.pl Checkpatch reports following errors: ERROR: do not use assignment in if condition + if ((colon = strnchr(name, MODULE_NAME_LEN, ':')) != NULL) { ERROR: do not use assignment in if condition + if ((mod = find_module_all(name, colon - name, false)) != NULL) ERROR: do not use assignment in if condition + if ((ret = find_kallsyms_symbol_value(mod, name)) != 0) ERROR: do not initialise globals to 0 +int modules_disabled = 0; Fix them. The following one has to remain, because the condition has to be evaluated multiple times by the macro wait_event_interruptible_timeout(). 
ERROR: do not use assignment in if condition + if (wait_event_interruptible_timeout(module_wq, Signed-off-by: Christophe Leroy Signed-off-by: Luis Chamberlain --- kernel/module/kallsyms.c | 9 ++++++--- kernel/module/main.c | 2 +- 2 files changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/module/kallsyms.c b/kernel/module/kallsyms.c index 77e75bead569..6a74545fc8a1 100644 --- a/kernel/module/kallsyms.c +++ b/kernel/module/kallsyms.c @@ -466,14 +466,17 @@ unsigned long module_kallsyms_lookup_name(const char *name) /* Don't lock: we're in enough trouble already. */ preempt_disable(); - if ((colon = strnchr(name, MODULE_NAME_LEN, ':')) != NULL) { - if ((mod = find_module_all(name, colon - name, false)) != NULL) + colon = strnchr(name, MODULE_NAME_LEN, ':'); + if (colon) { + mod = find_module_all(name, colon - name, false); + if (mod) ret = find_kallsyms_symbol_value(mod, colon + 1); } else { list_for_each_entry_rcu(mod, &modules, list) { if (mod->state == MODULE_STATE_UNFORMED) continue; - if ((ret = find_kallsyms_symbol_value(mod, name)) != 0) + ret = find_kallsyms_symbol_value(mod, name); + if (ret) break; } } diff --git a/kernel/module/main.c b/kernel/module/main.c index 07dd9c293ab9..b2de00e09abc 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -119,7 +119,7 @@ static void mod_update_bounds(struct module *mod) } /* Block module loading/unloading? */ -int modules_disabled = 0; +int modules_disabled; core_param(nomodule, modules_disabled, bint, 0); /* Waiting for a module to finish initializing? */ -- cgit From 07ade45a765bb7d7571f1a89ab8edfa4ce5e7268 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 13 Jun 2022 08:02:02 +0200 Subject: module: Increase readability of module_kallsyms_lookup_name() module_kallsyms_lookup_name() has several exit conditions but can't return immediately due to preempt_disable(). Refactor module_kallsyms_lookup_name() to allow returning from anywhere, and reduce depth. Signed-off-by: Christophe Leroy Signed-off-by: Luis Chamberlain --- kernel/module/kallsyms.c | 38 ++++++++++++++++++++++++-------------- 1 file changed, 24 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/module/kallsyms.c b/kernel/module/kallsyms.c index 6a74545fc8a1..f5c5c9175333 100644 --- a/kernel/module/kallsyms.c +++ b/kernel/module/kallsyms.c @@ -457,29 +457,39 @@ unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name) return 0; } -/* Look for this name: can be of form module:name. */ -unsigned long module_kallsyms_lookup_name(const char *name) +static unsigned long __module_kallsyms_lookup_name(const char *name) { struct module *mod; char *colon; - unsigned long ret = 0; - /* Don't lock: we're in enough trouble already. */ - preempt_disable(); colon = strnchr(name, MODULE_NAME_LEN, ':'); if (colon) { mod = find_module_all(name, colon - name, false); if (mod) - ret = find_kallsyms_symbol_value(mod, colon + 1); - } else { - list_for_each_entry_rcu(mod, &modules, list) { - if (mod->state == MODULE_STATE_UNFORMED) - continue; - ret = find_kallsyms_symbol_value(mod, name); - if (ret) - break; - } + return find_kallsyms_symbol_value(mod, colon + 1); + return 0; + } + + list_for_each_entry_rcu(mod, &modules, list) { + unsigned long ret; + + if (mod->state == MODULE_STATE_UNFORMED) + continue; + ret = find_kallsyms_symbol_value(mod, name); + if (ret) + return ret; } + return 0; +} + +/* Look for this name: can be of form module:name. 
*/ +unsigned long module_kallsyms_lookup_name(const char *name) +{ + unsigned long ret; + + /* Don't lock: we're in enough trouble already. */ + preempt_disable(); + ret = __module_kallsyms_lookup_name(name); preempt_enable(); return ret; } -- cgit From 2b9401e90d369b5fbb8a62e9034ad97297594475 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Mon, 4 Jul 2022 20:03:37 +0800 Subject: module: Use vzalloc() instead of vmalloc()/memset(0) Use vzalloc() instead of vmalloc() and memset(0) to simplify the code. Signed-off-by: Yang Yingliang Reviewed-by: Aaron Tomlin Signed-off-by: Luis Chamberlain --- kernel/module/main.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/module/main.c b/kernel/module/main.c index b2de00e09abc..d34227ca3932 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -2138,7 +2138,7 @@ static int move_module(struct module *mod, struct load_info *info) #ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC /* Do the allocs. */ - ptr = vmalloc(mod->data_layout.size); + ptr = vzalloc(mod->data_layout.size); /* * The pointer to this block is stored in the module structure * which is inside the block. Just mark it as not being a @@ -2151,7 +2151,6 @@ static int move_module(struct module *mod, struct load_info *info) return -ENOMEM; } - memset(ptr, 0, mod->data_layout.size); mod->data_layout.base = ptr; #endif /* Transfer each section which specifies SHF_ALLOC */ -- cgit From 535a57a7ffc04932ad83c1a5649b09ba6c93ce83 Mon Sep 17 00:00:00 2001 From: Xu Kuohai Date: Mon, 11 Jul 2022 11:08:20 -0400 Subject: bpf: Remove is_valid_bpf_tramp_flags() Before generating bpf trampoline, x86 calls is_valid_bpf_tramp_flags() to check the input flags. This check is architecture independent. So, to be consistent with x86, arm64 should also do this check before generating bpf trampoline. However, the BPF_TRAMP_F_XXX flags are not used by user code and the flags argument is almost constant at compile time, so this run time check is a bit redundant. Remove is_valid_bpf_tramp_flags() and add some comments to the usage of BPF_TRAMP_F_XXX flags, as suggested by Alexei. Signed-off-by: Xu Kuohai Signed-off-by: Daniel Borkmann Reviewed-by: Jean-Philippe Brucker Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20220711150823.2128542-2-xukuohai@huawei.com --- kernel/bpf/bpf_struct_ops.c | 3 +++ kernel/bpf/trampoline.c | 3 +++ 2 files changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 7e0068c3399c..84b2d9dba79a 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -341,6 +341,9 @@ int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, tlinks[BPF_TRAMP_FENTRY].links[0] = link; tlinks[BPF_TRAMP_FENTRY].nr_links = 1; + /* BPF_TRAMP_F_RET_FENTRY_RET is only used by bpf_struct_ops, + * and it must be used alone. + */ flags = model->ret_size > 0 ? BPF_TRAMP_F_RET_FENTRY_RET : 0; return arch_prepare_bpf_trampoline(NULL, image, image_end, model, flags, tlinks, NULL); diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 6cd226584c33..fd69812412ca 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -360,6 +360,9 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr) if (tlinks[BPF_TRAMP_FEXIT].nr_links || tlinks[BPF_TRAMP_MODIFY_RETURN].nr_links) + /* NOTE: BPF_TRAMP_F_RESTORE_REGS and BPF_TRAMP_F_SKIP_FRAME + * should not be set together.
+ */ flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME; if (ip_arg) -- cgit From 74829ddf5977567d77440150d72d4c0c5c427446 Mon Sep 17 00:00:00 2001 From: David Gow Date: Fri, 8 Jul 2022 12:48:45 +0800 Subject: module: panic: Taint the kernel when selftest modules load Taint the kernel with TAINT_TEST whenever a test module loads, by adding a new "TEST" module property, and setting it for all modules in the tools/testing directory. This property can also be set manually, for tests which live outside the tools/testing directory with: MODULE_INFO(test, "Y"); Reviewed-by: Luis Chamberlain Reviewed-by: Aaron Tomlin Acked-by: Brendan Higgins Signed-off-by: David Gow Signed-off-by: Shuah Khan --- kernel/module/main.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/module/main.c b/kernel/module/main.c index fed58d30725d..4723f1316709 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -1988,6 +1988,13 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags) /* Set up license info based on the info section */ set_license(mod, get_modinfo(info, "license")); + if (get_modinfo(info, "test")) { + if (!test_taint(TAINT_TEST)) + pr_warn("%s: loading test module taints kernel.\n", + mod->name); + add_taint_module(mod, TAINT_TEST, LOCKDEP_STILL_OK); + } + return 0; } -- cgit From 3d6e44623841c8b82c2157f2f749019803fb238a Mon Sep 17 00:00:00 2001 From: Jeremy Kerr Date: Sat, 9 Jul 2022 11:19:57 +0800 Subject: kunit: unify module and builtin suite definitions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, KUnit runs built-in tests and tests loaded from modules differently. For built-in tests, the kunit_test_suite{,s}() macro adds a list of suites in the .kunit_test_suites linker section. However, for kernel modules, a module_init() function is used to run the test suites. This causes problems if tests are included in a module which already defines module_init/exit_module functions, as they'll conflict with the kunit-provided ones. This change removes the kunit-defined module inits, and instead parses the kunit tests from their own section in the module. After module init, we call __kunit_test_suites_init() on the contents of that section, which prepares and runs the suite. This essentially unifies the module- and non-module kunit init formats. 
Tested-by: Maíra Canal Reviewed-by: Brendan Higgins Signed-off-by: Jeremy Kerr Signed-off-by: Daniel Latypov Signed-off-by: David Gow Signed-off-by: Shuah Khan --- kernel/module/main.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/module/main.c b/kernel/module/main.c index 4723f1316709..324a770f789c 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -2094,6 +2094,12 @@ static int find_module_sections(struct module *mod, struct load_info *info) sizeof(*mod->static_call_sites), &mod->num_static_call_sites); #endif +#ifdef CONFIG_KUNIT + mod->kunit_suites = section_objs(info, ".kunit_test_suites", + sizeof(*mod->kunit_suites), + &mod->num_kunit_suites); +#endif + mod->extable = section_objs(info, "__ex_table", sizeof(*mod->extable), &mod->num_exentries); -- cgit From c51ba246cb172c9e947dc6fb8868a1eaf0b2a913 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Tue, 12 Jul 2022 08:46:45 +0200 Subject: swiotlb: fail map correctly with failed io_tlb_default_mem In the failure case of trying to use a buffer which we'd previously failed to allocate, the "!mem" condition is no longer sufficient since io_tlb_default_mem became static and assigned by default. Update the condition to work as intended per the rest of that conversion. Fixes: 463e862ac63e ("swiotlb: Convert io_default_tlb_mem to static allocation") Signed-off-by: Robin Murphy Signed-off-by: Christoph Hellwig --- kernel/dma/swiotlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 1758b724c7a8..909b43445574 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -584,7 +584,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, int index; phys_addr_t tlb_addr; - if (!mem) + if (!mem || !mem->nslabs) panic("Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer"); if (cc_platform_has(CC_ATTR_MEM_ENCRYPT)) -- cgit From 73b4fc92f97d775da26d86d2732497be6c610ec6 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 12 Jul 2022 07:52:33 +0200 Subject: module: Move module's Kconfig items in kernel/module/ In init/Kconfig, the part dedicated to modules is quite large. Move it into a dedicated Kconfig in kernel/module/ MODULES_TREE_LOOKUP was outside of the 'if MODULES', but as it is only used when MODULES are set, move it in with everything else to avoid confusion. MODULE_SIG_FORMAT is left in init/Kconfig because this configuration item is not used in kernel/modules/ but in kernel/ and can be selected independently from CONFIG_MODULES. It is for instance selected from security/integrity/ima/Kconfig. Signed-off-by: Christophe Leroy Signed-off-by: Luis Chamberlain --- kernel/module/Kconfig | 293 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 293 insertions(+) create mode 100644 kernel/module/Kconfig (limited to 'kernel') diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig new file mode 100644 index 000000000000..26ea5d04f56c --- /dev/null +++ b/kernel/module/Kconfig @@ -0,0 +1,293 @@ +# SPDX-License-Identifier: GPL-2.0-only +menuconfig MODULES + bool "Enable loadable module support" + modules + help + Kernel modules are small pieces of compiled code which can + be inserted in the running kernel, rather than being + permanently built into the kernel. You use the "modprobe" + tool to add (and sometimes remove) them. 
If you say Y here, + many parts of the kernel can be built as modules (by + answering M instead of Y where indicated): this is most + useful for infrequently used options which are not required + for booting. For more information, see the man pages for + modprobe, lsmod, modinfo, insmod and rmmod. + + If you say Y here, you will need to run "make + modules_install" to put the modules under /lib/modules/ + where modprobe can find them (you may need to be root to do + this). + + If unsure, say Y. + +if MODULES + +config MODULE_FORCE_LOAD + bool "Forced module loading" + default n + help + Allow loading of modules without version information (ie. modprobe + --force). Forced module loading sets the 'F' (forced) taint flag and + is usually a really bad idea. + +config MODULE_UNLOAD + bool "Module unloading" + help + Without this option you will not be able to unload any + modules (note that some modules may not be unloadable + anyway), which makes your kernel smaller, faster + and simpler. If unsure, say Y. + +config MODULE_FORCE_UNLOAD + bool "Forced module unloading" + depends on MODULE_UNLOAD + help + This option allows you to force a module to unload, even if the + kernel believes it is unsafe: the kernel will remove the module + without waiting for anyone to stop using it (using the -f option to + rmmod). This is mainly for kernel developers and desperate users. + If unsure, say N. + +config MODULE_UNLOAD_TAINT_TRACKING + bool "Tainted module unload tracking" + depends on MODULE_UNLOAD + default n + help + This option allows you to maintain a record of each unloaded + module that tainted the kernel. In addition to displaying a + list of linked (or loaded) modules e.g. on detection of a bad + page (see bad_page()), the aforementioned details are also + shown. If unsure, say N. + +config MODVERSIONS + bool "Module versioning support" + help + Usually, you have to use modules compiled with your kernel. + Saying Y here makes it sometimes possible to use modules + compiled for different kernels, by adding enough information + to the modules to (hopefully) spot any changes which would + make them incompatible with the kernel you are running. If + unsure, say N. + +config ASM_MODVERSIONS + bool + default HAVE_ASM_MODVERSIONS && MODVERSIONS + help + This enables module versioning for exported symbols also from + assembly. This can be enabled only when the target architecture + supports it. + +config MODULE_SRCVERSION_ALL + bool "Source checksum for all modules" + help + Modules which contain a MODULE_VERSION get an extra "srcversion" + field inserted into their modinfo section, which contains a + sum of the source files which made it. This helps maintainers + see exactly which source was used to build a module (since + others sometimes change the module source without updating + the version). With this option, such a "srcversion" field + will be created for all modules. If unsure, say N. + +config MODULE_SIG + bool "Module signature verification" + select MODULE_SIG_FORMAT + help + Check modules for valid signatures upon load: the signature + is simply appended to the module. For more information see + . + + Note that this option adds the OpenSSL development packages as a + kernel build dependency so that the signing tool can use its crypto + library. + + You should enable this option if you wish to use either + CONFIG_SECURITY_LOCKDOWN_LSM or lockdown functionality imposed via + another LSM - otherwise unsigned modules will be loadable regardless + of the lockdown policy. + + !!!WARNING!!! 
If you enable this option, you MUST make sure that the + module DOES NOT get stripped after being signed. This includes the + debuginfo strip done by some packagers (such as rpmbuild) and + inclusion into an initramfs that wants the module size reduced. + +config MODULE_SIG_FORCE + bool "Require modules to be validly signed" + depends on MODULE_SIG + help + Reject unsigned modules or signed modules for which we don't have a + key. Without this, such modules will simply taint the kernel. + +config MODULE_SIG_ALL + bool "Automatically sign all modules" + default y + depends on MODULE_SIG || IMA_APPRAISE_MODSIG + help + Sign all modules during make modules_install. Without this option, + modules must be signed manually, using the scripts/sign-file tool. + +comment "Do not forget to sign required modules with scripts/sign-file" + depends on MODULE_SIG_FORCE && !MODULE_SIG_ALL + +choice + prompt "Which hash algorithm should modules be signed with?" + depends on MODULE_SIG || IMA_APPRAISE_MODSIG + help + This determines which sort of hashing algorithm will be used during + signature generation. This algorithm _must_ be built into the kernel + directly so that signature verification can take place. It is not + possible to load a signed module containing the algorithm to check + the signature on that module. + +config MODULE_SIG_SHA1 + bool "Sign modules with SHA-1" + select CRYPTO_SHA1 + +config MODULE_SIG_SHA224 + bool "Sign modules with SHA-224" + select CRYPTO_SHA256 + +config MODULE_SIG_SHA256 + bool "Sign modules with SHA-256" + select CRYPTO_SHA256 + +config MODULE_SIG_SHA384 + bool "Sign modules with SHA-384" + select CRYPTO_SHA512 + +config MODULE_SIG_SHA512 + bool "Sign modules with SHA-512" + select CRYPTO_SHA512 + +endchoice + +config MODULE_SIG_HASH + string + depends on MODULE_SIG || IMA_APPRAISE_MODSIG + default "sha1" if MODULE_SIG_SHA1 + default "sha224" if MODULE_SIG_SHA224 + default "sha256" if MODULE_SIG_SHA256 + default "sha384" if MODULE_SIG_SHA384 + default "sha512" if MODULE_SIG_SHA512 + +choice + prompt "Module compression mode" + help + This option allows you to choose the algorithm which will be used to + compress modules when 'make modules_install' is run. (or, you can + choose to not compress modules at all.) + + External modules will also be compressed in the same way during the + installation. + + For modules inside an initrd or initramfs, it's more efficient to + compress the whole initrd or initramfs instead. + + This is fully compatible with signed modules. + + Please note that the tool used to load modules needs to support the + corresponding algorithm. module-init-tools MAY support gzip, and kmod + MAY support gzip, xz and zstd. + + Your build system needs to provide the appropriate compression tool + to compress the modules. + + If in doubt, select 'None'. + +config MODULE_COMPRESS_NONE + bool "None" + help + Do not compress modules. The installed modules are suffixed + with .ko. + +config MODULE_COMPRESS_GZIP + bool "GZIP" + help + Compress modules with GZIP. The installed modules are suffixed + with .ko.gz. + +config MODULE_COMPRESS_XZ + bool "XZ" + help + Compress modules with XZ. The installed modules are suffixed + with .ko.xz. + +config MODULE_COMPRESS_ZSTD + bool "ZSTD" + help + Compress modules with ZSTD. The installed modules are suffixed + with .ko.zst. 
+ +endchoice + +config MODULE_DECOMPRESS + bool "Support in-kernel module decompression" + depends on MODULE_COMPRESS_GZIP || MODULE_COMPRESS_XZ + select ZLIB_INFLATE if MODULE_COMPRESS_GZIP + select XZ_DEC if MODULE_COMPRESS_XZ + help + + Support for decompressing kernel modules by the kernel itself + instead of relying on userspace to perform this task. Useful when + load pinning security policy is enabled. + + If unsure, say N. + +config MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS + bool "Allow loading of modules with missing namespace imports" + help + Symbols exported with EXPORT_SYMBOL_NS*() are considered exported in + a namespace. A module that makes use of a symbol exported with such a + namespace is required to import the namespace via MODULE_IMPORT_NS(). + There is no technical reason to enforce correct namespace imports, + but it creates consistency between symbols defining namespaces and + users importing namespaces they make use of. This option relaxes this + requirement and lifts the enforcement when loading a module. + + If unsure, say N. + +config MODPROBE_PATH + string "Path to modprobe binary" + default "/sbin/modprobe" + help + When kernel code requests a module, it does so by calling + the "modprobe" userspace utility. This option allows you to + set the path where that binary is found. This can be changed + at runtime via the sysctl file + /proc/sys/kernel/modprobe. Setting this to the empty string + removes the kernel's ability to request modules (but + userspace can still load modules explicitly). + +config TRIM_UNUSED_KSYMS + bool "Trim unused exported kernel symbols" if EXPERT + depends on !COMPILE_TEST + help + The kernel and some modules make many symbols available for + other modules to use via EXPORT_SYMBOL() and variants. Depending + on the set of modules being selected in your kernel configuration, + many of those exported symbols might never be used. + + This option allows for unused exported symbols to be dropped from + the build. In turn, this provides the compiler more opportunities + (especially when using LTO) for optimizing the code and reducing + binary size. This might have some security advantages as well. + + If unsure, or if you need to build out-of-tree modules, say N. + +config UNUSED_KSYMS_WHITELIST + string "Whitelist of symbols to keep in ksymtab" + depends on TRIM_UNUSED_KSYMS + help + By default, all unused exported symbols will be un-exported from the + build when TRIM_UNUSED_KSYMS is selected. + + UNUSED_KSYMS_WHITELIST allows to whitelist symbols that must be kept + exported at all times, even in absence of in-tree users. The value to + set here is the path to a text file containing the list of symbols, + one per line. The path can be absolute, or relative to the kernel + source tree. + +config MODULES_TREE_LOOKUP + def_bool y + depends on PERF_EVENTS || TRACING || CFI_CLANG + +endif # MODULES -- cgit From 94c255ac676fa922df3498608ead42b0a0c85122 Mon Sep 17 00:00:00 2001 From: Xiang wangx Date: Mon, 6 Jun 2022 10:30:07 +0800 Subject: tracing/user_events: Fix syntax errors in comments Delete the redundant word 'have'. 
Link: https://lkml.kernel.org/r/20220606023007.23377-1-wangxiang@cdjrlc.com Signed-off-by: Xiang wangx Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_user.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 706e1686b5eb..a6621c52ce45 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -567,7 +567,7 @@ static int user_event_set_call_visible(struct user_event *user, bool visible) * to allow user_event files to be less locked down. The extreme case * being "other" has read/write access to user_events_data/status. * - * When not locked down, processes may not have have permissions to + * When not locked down, processes may not have permissions to * add/remove calls themselves to tracefs. We need to temporarily * switch to root file permission to allow for this scenario. */ -- cgit From fb991f1942334b0cbf6aa6a88faa586ba22d3550 Mon Sep 17 00:00:00 2001 From: Zheng Yejian Date: Thu, 30 Jun 2022 09:31:52 +0800 Subject: tracing/histograms: Simplify create_hist_fields() When I look into implements of create_hist_fields(), I think there can be following two simplifications: 1. If something wrong happened in parse_var_defs(), free_var_defs() would have been called in it, so no need goto free again after calling it; 2. After calling create_key_fields(), regardless of the value of 'ret', it then always runs into 'out: ', so the judge of 'ret' is redundant. Link: https://lkml.kernel.org/r/20220630013152.164871-1-zhengyejian1@huawei.com Signed-off-by: Zheng Yejian Reviewed-by: Tom Rix Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_hist.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index e87a46794079..fdf784620c28 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -4455,7 +4455,7 @@ static int create_hist_fields(struct hist_trigger_data *hist_data, ret = parse_var_defs(hist_data); if (ret) - goto out; + return ret; ret = create_val_fields(hist_data, file); if (ret) @@ -4466,8 +4466,7 @@ static int create_hist_fields(struct hist_trigger_data *hist_data, goto out; ret = create_key_fields(hist_data, file); - if (ret) - goto out; + out: free_var_defs(hist_data); -- cgit From 647cafa22349026a8435030e9157074ab7fe5710 Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Tue, 12 Jul 2022 13:31:44 +0100 Subject: bpf: add a ksym BPF iterator add a "ksym" iterator which provides access to a "struct kallsym_iter" for each symbol. Intent is to support more flexible symbol parsing as discussed in [1]. 
[1] https://lore.kernel.org/all/YjRPZj6Z8vuLeEZo@krava/ Suggested-by: Alexei Starovoitov Signed-off-by: Alan Maguire Acked-by: Yonghong Song Link: https://lore.kernel.org/r/1657629105-7812-2-git-send-email-alan.maguire@oracle.com Signed-off-by: Alexei Starovoitov --- kernel/kallsyms.c | 91 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 91 insertions(+) (limited to 'kernel') diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index fbdf8d3279ac..79a85834ce9d 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -30,6 +30,7 @@ #include #include #include +#include /* * These will be re-linked against their real values @@ -799,6 +800,96 @@ static const struct seq_operations kallsyms_op = { .show = s_show }; +#ifdef CONFIG_BPF_SYSCALL + +struct bpf_iter__ksym { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct kallsym_iter *, ksym); +}; + +static int ksym_prog_seq_show(struct seq_file *m, bool in_stop) +{ + struct bpf_iter__ksym ctx; + struct bpf_iter_meta meta; + struct bpf_prog *prog; + + meta.seq = m; + prog = bpf_iter_get_info(&meta, in_stop); + if (!prog) + return 0; + + ctx.meta = &meta; + ctx.ksym = m ? m->private : NULL; + return bpf_iter_run_prog(prog, &ctx); +} + +static int bpf_iter_ksym_seq_show(struct seq_file *m, void *p) +{ + return ksym_prog_seq_show(m, false); +} + +static void bpf_iter_ksym_seq_stop(struct seq_file *m, void *p) +{ + if (!p) + (void) ksym_prog_seq_show(m, true); + else + s_stop(m, p); +} + +static const struct seq_operations bpf_iter_ksym_ops = { + .start = s_start, + .next = s_next, + .stop = bpf_iter_ksym_seq_stop, + .show = bpf_iter_ksym_seq_show, +}; + +static int bpf_iter_ksym_init(void *priv_data, struct bpf_iter_aux_info *aux) +{ + struct kallsym_iter *iter = priv_data; + + reset_iter(iter, 0); + + /* cache here as in kallsyms_open() case; use current process + * credentials to tell BPF iterators if values should be shown. + */ + iter->show_value = kallsyms_show_value(current_cred()); + + return 0; +} + +DEFINE_BPF_ITER_FUNC(ksym, struct bpf_iter_meta *meta, struct kallsym_iter *ksym) + +static const struct bpf_iter_seq_info ksym_iter_seq_info = { + .seq_ops = &bpf_iter_ksym_ops, + .init_seq_private = bpf_iter_ksym_init, + .fini_seq_private = NULL, + .seq_priv_size = sizeof(struct kallsym_iter), +}; + +static struct bpf_iter_reg ksym_iter_reg_info = { + .target = "ksym", + .feature = BPF_ITER_RESCHED, + .ctx_arg_info_size = 1, + .ctx_arg_info = { + { offsetof(struct bpf_iter__ksym, ksym), + PTR_TO_BTF_ID_OR_NULL }, + }, + .seq_info = &ksym_iter_seq_info, +}; + +BTF_ID_LIST(btf_ksym_iter_id) +BTF_ID(struct, kallsym_iter) + +static int __init bpf_ksym_iter_register(void) +{ + ksym_iter_reg_info.ctx_arg_info[0].btf_id = *btf_ksym_iter_id; + return bpf_iter_reg_target(&ksym_iter_reg_info); +} + +late_initcall(bpf_ksym_iter_register); + +#endif /* CONFIG_BPF_SYSCALL */ + static inline int kallsyms_for_perf(void) { #ifdef CONFIG_PERF_EVENTS -- cgit From 4201d9ab3e42d9e2a20320b751a931e6239c0df2 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 11 Jul 2022 09:28:27 -0700 Subject: bpf: reparent bpf maps on memcg offlining The memory consumed by a bpf map is always accounted to the memory cgroup of the process which created the map. The map can outlive the memory cgroup if it's used by processes in other cgroups or is pinned on bpffs. In this case the map pins the original cgroup in the dying state. 
For other types of objects (slab objects, non-slab kernel allocations, percpu objects and recently LRU pages) there is a reparenting process implemented: on cgroup offlining charged objects are getting reassigned to the parent cgroup. Because all charges and statistics are fully recursive it's a fairly cheap operation. For efficiency and consistency with other types of objects, let's do the same for bpf maps. Fortunately thanks to the objcg API, the required changes are minimal. Please, note that individual allocations (slabs, percpu and large kmallocs) already have the reparenting mechanism. This commit adds it to the saved map->memcg pointer by replacing it to map->objcg. Because dying cgroups are not visible for a user and all charges are recursive, this commit doesn't bring any behavior changes for a user. v2: added a missing const qualifier Signed-off-by: Roman Gushchin Reviewed-by: Shakeel Butt Link: https://lore.kernel.org/r/20220711162827.184743-1-roman.gushchin@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 36 ++++++++++++++++++++++++++++-------- 1 file changed, 28 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index ab688d85b2c6..83c7136c5788 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -419,35 +419,53 @@ void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock) #ifdef CONFIG_MEMCG_KMEM static void bpf_map_save_memcg(struct bpf_map *map) { - map->memcg = get_mem_cgroup_from_mm(current->mm); + /* Currently if a map is created by a process belonging to the root + * memory cgroup, get_obj_cgroup_from_current() will return NULL. + * So we have to check map->objcg for being NULL each time it's + * being used. + */ + map->objcg = get_obj_cgroup_from_current(); } static void bpf_map_release_memcg(struct bpf_map *map) { - mem_cgroup_put(map->memcg); + if (map->objcg) + obj_cgroup_put(map->objcg); +} + +static struct mem_cgroup *bpf_map_get_memcg(const struct bpf_map *map) +{ + if (map->objcg) + return get_mem_cgroup_from_objcg(map->objcg); + + return root_mem_cgroup; } void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags, int node) { - struct mem_cgroup *old_memcg; + struct mem_cgroup *memcg, *old_memcg; void *ptr; - old_memcg = set_active_memcg(map->memcg); + memcg = bpf_map_get_memcg(map); + old_memcg = set_active_memcg(memcg); ptr = kmalloc_node(size, flags | __GFP_ACCOUNT, node); set_active_memcg(old_memcg); + mem_cgroup_put(memcg); return ptr; } void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags) { - struct mem_cgroup *old_memcg; + struct mem_cgroup *memcg, *old_memcg; void *ptr; - old_memcg = set_active_memcg(map->memcg); + memcg = bpf_map_get_memcg(map); + old_memcg = set_active_memcg(memcg); ptr = kzalloc(size, flags | __GFP_ACCOUNT); set_active_memcg(old_memcg); + mem_cgroup_put(memcg); return ptr; } @@ -455,12 +473,14 @@ void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags) void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size, size_t align, gfp_t flags) { - struct mem_cgroup *old_memcg; + struct mem_cgroup *memcg, *old_memcg; void __percpu *ptr; - old_memcg = set_active_memcg(map->memcg); + memcg = bpf_map_get_memcg(map); + old_memcg = set_active_memcg(memcg); ptr = __alloc_percpu_gfp(size, align, flags | __GFP_ACCOUNT); set_active_memcg(old_memcg); + mem_cgroup_put(memcg); return ptr; } -- cgit From 1d5f82d9dd477d5c66e0214a68c3e4f308eadd6d Mon Sep 17 00:00:00 2001 From: Song Liu 
Date: Tue, 5 Jul 2022 17:26:12 -0700 Subject: bpf, x86: fix freeing of not-finalized bpf_prog_pack syzbot reported a few issues with bpf_prog_pack [1], [2]. This only happens with multiple subprogs. In jit_subprogs(), we first call bpf_int_jit_compile() on each sub program. And then, we call it on each sub program again. jit_data is not freed in the first call of bpf_int_jit_compile(). Similarly we don't call bpf_jit_binary_pack_finalize() in the first call of bpf_int_jit_compile(). If bpf_int_jit_compile() failed for one sub program, we will call bpf_jit_binary_pack_finalize() for this sub program. However, we don't have a chance to call it for other sub programs. Then we will hit "goto out_free" in jit_subprogs(), and call bpf_jit_free on some subprograms that haven't got bpf_jit_binary_pack_finalize() yet. At this point, bpf_jit_binary_pack_free() is called and the whole 2MB page is freed erroneously. Fix this with a custom bpf_jit_free() for x86_64, which calls bpf_jit_binary_pack_finalize() if necessary. Also, with custom bpf_jit_free(), bpf_prog_aux->use_bpf_prog_pack is not needed any more, remove it. Fixes: 1022a5498f6f ("bpf, x86_64: Use bpf_jit_binary_pack_alloc") [1] https://syzkaller.appspot.com/bug?extid=2f649ec6d2eea1495a8f [2] https://syzkaller.appspot.com/bug?extid=87f65c75f4a72db05445 Reported-by: syzbot+2f649ec6d2eea1495a8f@syzkaller.appspotmail.com Reported-by: syzbot+87f65c75f4a72db05445@syzkaller.appspotmail.com Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20220706002612.4013790-1-song@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/core.c | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 805c2ad5c793..cfb8a50a9f12 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -650,12 +650,6 @@ static bool bpf_prog_kallsyms_candidate(const struct bpf_prog *fp) return fp->jited && !bpf_prog_was_classic(fp); } -static bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp) -{ - return list_empty(&fp->aux->ksym.lnode) || - fp->aux->ksym.lnode.prev == LIST_POISON2; -} - void bpf_prog_kallsyms_add(struct bpf_prog *fp) { if (!bpf_prog_kallsyms_candidate(fp) || @@ -1153,7 +1147,6 @@ int bpf_jit_binary_pack_finalize(struct bpf_prog *prog, bpf_prog_pack_free(ro_header); return PTR_ERR(ptr); } - prog->aux->use_bpf_prog_pack = true; return 0; } @@ -1177,17 +1170,23 @@ void bpf_jit_binary_pack_free(struct bpf_binary_header *ro_header, bpf_jit_uncharge_modmem(size); } +struct bpf_binary_header * +bpf_jit_binary_pack_hdr(const struct bpf_prog *fp) +{ + unsigned long real_start = (unsigned long)fp->bpf_func; + unsigned long addr; + + addr = real_start & BPF_PROG_CHUNK_MASK; + return (void *)addr; +} + static inline struct bpf_binary_header * bpf_jit_binary_hdr(const struct bpf_prog *fp) { unsigned long real_start = (unsigned long)fp->bpf_func; unsigned long addr; - if (fp->aux->use_bpf_prog_pack) - addr = real_start & BPF_PROG_CHUNK_MASK; - else - addr = real_start & PAGE_MASK; - + addr = real_start & PAGE_MASK; return (void *)addr; } @@ -1200,11 +1199,7 @@ void __weak bpf_jit_free(struct bpf_prog *fp) if (fp->jited) { struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp); - if (fp->aux->use_bpf_prog_pack) - bpf_jit_binary_pack_free(hdr, NULL /* rw_buffer */); - else - bpf_jit_binary_free(hdr); - + bpf_jit_binary_free(hdr); WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(fp)); } -- cgit From ace2bee839e08df324cb320763258dfd72e6120e Mon Sep 17 00:00:00 
2001 From: Yafang Shao Date: Sat, 9 Jul 2022 15:44:56 +0000 Subject: bpf: Make non-preallocated allocation low priority GFP_ATOMIC doesn't cooperate well with memcg pressure so far, especially if we allocate too much GFP_ATOMIC memory. For example, when we set the memcg limit to limit a non-preallocated bpf memory, the GFP_ATOMIC can easily break the memcg limit by force charge. So it is very dangerous to use GFP_ATOMIC in non-preallocated case. One way to make it safe is to remove __GFP_HIGH from GFP_ATOMIC, IOW, use (__GFP_ATOMIC | __GFP_KSWAPD_RECLAIM) instead, then it will be limited if we allocate too much memory. There's a plan to completely remove __GFP_ATOMIC in the mm side[1], so let's use GFP_NOWAIT instead. We introduced BPF_F_NO_PREALLOC is because full map pre-allocation is too memory expensive for some cases. That means removing __GFP_HIGH doesn't break the rule of BPF_F_NO_PREALLOC, but has the same goal with it-avoiding issues caused by too much memory. So let's remove it. This fix can also apply to other run-time allocations, for example, the allocation in lpm trie, local storage and devmap. So let fix it consistently over the bpf code It also fixes a typo in the comment. [1]. https://lore.kernel.org/linux-mm/163712397076.13692.4727608274002939094@noble.neil.brown.name/ Cc: Roman Gushchin Cc: Shakeel Butt Cc: NeilBrown Signed-off-by: Yafang Shao Reviewed-by: Shakeel Butt Link: https://lore.kernel.org/r/20220709154457.57379-2-laoar.shao@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/devmap.c | 2 +- kernel/bpf/hashtab.c | 6 +++--- kernel/bpf/local_storage.c | 2 +- kernel/bpf/lpm_trie.c | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index c2867068e5bd..1400561efb15 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -845,7 +845,7 @@ static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net, struct bpf_dtab_netdev *dev; dev = bpf_map_kmalloc_node(&dtab->map, sizeof(*dev), - GFP_ATOMIC | __GFP_NOWARN, + GFP_NOWAIT | __GFP_NOWARN, dtab->map.numa_node); if (!dev) return ERR_PTR(-ENOMEM); diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 17fb69c0e0dc..da7578426a46 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -61,7 +61,7 @@ * * As regular device interrupt handlers and soft interrupts are forced into * thread context, the existing code which does - * spin_lock*(); alloc(GPF_ATOMIC); spin_unlock*(); + * spin_lock*(); alloc(GFP_ATOMIC); spin_unlock*(); * just works. 
* * In theory the BPF locks could be converted to regular spinlocks as well, @@ -978,7 +978,7 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, goto dec_count; } l_new = bpf_map_kmalloc_node(&htab->map, htab->elem_size, - GFP_ATOMIC | __GFP_NOWARN, + GFP_NOWAIT | __GFP_NOWARN, htab->map.numa_node); if (!l_new) { l_new = ERR_PTR(-ENOMEM); @@ -996,7 +996,7 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, } else { /* alloc_percpu zero-fills */ pptr = bpf_map_alloc_percpu(&htab->map, size, 8, - GFP_ATOMIC | __GFP_NOWARN); + GFP_NOWAIT | __GFP_NOWARN); if (!pptr) { kfree(l_new); l_new = ERR_PTR(-ENOMEM); diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 8654fc97f5fe..49ef0ce040c7 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -165,7 +165,7 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *key, } new = bpf_map_kmalloc_node(map, struct_size(new, data, map->value_size), - __GFP_ZERO | GFP_ATOMIC | __GFP_NOWARN, + __GFP_ZERO | GFP_NOWAIT | __GFP_NOWARN, map->numa_node); if (!new) return -ENOMEM; diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index f0d05a3cc4b9..d789e3b831ad 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -285,7 +285,7 @@ static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie, if (value) size += trie->map.value_size; - node = bpf_map_kmalloc_node(&trie->map, size, GFP_ATOMIC | __GFP_NOWARN, + node = bpf_map_kmalloc_node(&trie->map, size, GFP_NOWAIT | __GFP_NOWARN, trie->map.numa_node); if (!node) return NULL; -- cgit From 20347fca71a387a3751f7bb270062616ddc5317a Mon Sep 17 00:00:00 2001 From: Tianyu Lan Date: Fri, 8 Jul 2022 12:15:44 -0400 Subject: swiotlb: split up the global swiotlb lock Traditionally swiotlb was not performance critical because it was only used for slow devices. But in some setups, like TDX/SEV confidential guests, all IO has to go through swiotlb. Currently swiotlb only has a single lock. Under high IO load with multiple CPUs this can lead to significat lock contention on the swiotlb lock. This patch splits the swiotlb bounce buffer pool into individual areas which have their own lock. Each CPU tries to allocate in its own area first. Only if that fails does it search other areas. On freeing the allocation is freed into the area where the memory was originally allocated from. Area number can be set via swiotlb kernel parameter and is default to be possible cpu number. If possible cpu number is not power of 2, area number will be round up to the next power of 2. This idea from Andi Kleen patch(https://github.com/intel/tdx/commit/ 4529b5784c141782c72ec9bd9a92df2b68cb7d45). Based-on-idea-by: Andi Kleen Signed-off-by: Tianyu Lan Signed-off-by: Christoph Hellwig --- kernel/dma/swiotlb.c | 229 ++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 189 insertions(+), 40 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 909b43445574..dcf1459ce723 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -70,6 +70,43 @@ struct io_tlb_mem io_tlb_default_mem; phys_addr_t swiotlb_unencrypted_base; static unsigned long default_nslabs = IO_TLB_DEFAULT_SIZE >> IO_TLB_SHIFT; +static unsigned long default_nareas; + +/** + * struct io_tlb_area - IO TLB memory area descriptor + * + * This is a single area with a single lock. + * + * @used: The number of used IO TLB block. 
+ * @index: The slot index to start searching in this area for next round. + * @lock: The lock to protect the above data structures in the map and + * unmap calls. + */ +struct io_tlb_area { + unsigned long used; + unsigned int index; + spinlock_t lock; +}; + +static void swiotlb_adjust_nareas(unsigned int nareas) +{ + if (!is_power_of_2(nareas)) + nareas = roundup_pow_of_two(nareas); + + default_nareas = nareas; + + pr_info("area num %d.\n", nareas); + /* + * Round up number of slabs to the next power of 2. + * The last area is going be smaller than the rest if + * default_nslabs is not power of two. + */ + if (nareas && !is_power_of_2(default_nslabs)) { + default_nslabs = roundup_pow_of_two(default_nslabs); + pr_info("SWIOTLB bounce buffer size roundup to %luMB", + (default_nslabs << IO_TLB_SHIFT) >> 20); + } +} static int __init setup_io_tlb_npages(char *str) @@ -79,6 +116,10 @@ setup_io_tlb_npages(char *str) default_nslabs = ALIGN(simple_strtoul(str, &str, 0), IO_TLB_SEGSIZE); } + if (*str == ',') + ++str; + if (isdigit(*str)) + swiotlb_adjust_nareas(simple_strtoul(str, &str, 0)); if (*str == ',') ++str; if (!strcmp(str, "force")) @@ -112,8 +153,19 @@ void __init swiotlb_adjust_size(unsigned long size) */ if (default_nslabs != IO_TLB_DEFAULT_SIZE >> IO_TLB_SHIFT) return; + + /* + * Round up number of slabs to the next power of 2. + * The last area is going be smaller than the rest if + * default_nslabs is not power of two. + */ size = ALIGN(size, IO_TLB_SIZE); default_nslabs = ALIGN(size >> IO_TLB_SHIFT, IO_TLB_SEGSIZE); + if (default_nareas) { + default_nslabs = roundup_pow_of_two(default_nslabs); + size = default_nslabs << IO_TLB_SHIFT; + } + pr_info("SWIOTLB bounce buffer size adjusted to %luMB", size >> 20); } @@ -192,7 +244,8 @@ void __init swiotlb_update_mem_attributes(void) } static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start, - unsigned long nslabs, unsigned int flags, bool late_alloc) + unsigned long nslabs, unsigned int flags, + bool late_alloc, unsigned int nareas) { void *vaddr = phys_to_virt(start); unsigned long bytes = nslabs << IO_TLB_SHIFT, i; @@ -202,10 +255,17 @@ static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start, mem->end = mem->start + bytes; mem->index = 0; mem->late_alloc = late_alloc; + mem->nareas = nareas; + mem->area_nslabs = nslabs / mem->nareas; mem->force_bounce = swiotlb_force_bounce || (flags & SWIOTLB_FORCE); spin_lock_init(&mem->lock); + for (i = 0; i < mem->nareas; i++) { + spin_lock_init(&mem->areas[i].lock); + mem->areas[i].index = 0; + } + for (i = 0; i < mem->nslabs; i++) { mem->slots[i].list = IO_TLB_SEGSIZE - io_tlb_offset(i); mem->slots[i].orig_addr = INVALID_PHYS_ADDR; @@ -232,7 +292,7 @@ void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags, int (*remap)(void *tlb, unsigned long nslabs)) { struct io_tlb_mem *mem = &io_tlb_default_mem; - unsigned long nslabs = default_nslabs; + unsigned long nslabs; size_t alloc_size; size_t bytes; void *tlb; @@ -242,6 +302,14 @@ void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags, if (swiotlb_force_disable) return; + /* + * default_nslabs maybe changed when adjust area number. + * So allocate bounce buffer after adjusting area number. 
+ */ + if (!default_nareas) + swiotlb_adjust_nareas(num_possible_cpus()); + + nslabs = default_nslabs; if (nslabs < IO_TLB_MIN_SLABS) panic("%s: nslabs = %lu too small\n", __func__, nslabs); @@ -278,7 +346,13 @@ retry: panic("%s: Failed to allocate %zu bytes align=0x%lx\n", __func__, alloc_size, PAGE_SIZE); - swiotlb_init_io_tlb_mem(mem, __pa(tlb), nslabs, flags, false); + mem->areas = memblock_alloc(sizeof(struct io_tlb_area) * + default_nareas, SMP_CACHE_BYTES); + if (!mem->areas) + panic("%s: Failed to allocate mem->areas.\n", __func__); + + swiotlb_init_io_tlb_mem(mem, __pa(tlb), nslabs, flags, false, + default_nareas); if (flags & SWIOTLB_VERBOSE) swiotlb_print_info(); @@ -300,7 +374,7 @@ int swiotlb_init_late(size_t size, gfp_t gfp_mask, struct io_tlb_mem *mem = &io_tlb_default_mem; unsigned long nslabs = ALIGN(size >> IO_TLB_SHIFT, IO_TLB_SEGSIZE); unsigned char *vstart = NULL; - unsigned int order; + unsigned int order, area_order; bool retried = false; int rc = 0; @@ -341,19 +415,34 @@ retry: (PAGE_SIZE << order) >> 20); } + if (!default_nareas) + swiotlb_adjust_nareas(num_possible_cpus()); + + area_order = get_order(array_size(sizeof(*mem->areas), + default_nareas)); + mem->areas = (struct io_tlb_area *) + __get_free_pages(GFP_KERNEL | __GFP_ZERO, area_order); + if (!mem->areas) + goto error_area; + mem->slots = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(array_size(sizeof(*mem->slots), nslabs))); - if (!mem->slots) { - free_pages((unsigned long)vstart, order); - return -ENOMEM; - } + if (!mem->slots) + goto error_slots; set_memory_decrypted((unsigned long)vstart, (nslabs << IO_TLB_SHIFT) >> PAGE_SHIFT); - swiotlb_init_io_tlb_mem(mem, virt_to_phys(vstart), nslabs, 0, true); + swiotlb_init_io_tlb_mem(mem, virt_to_phys(vstart), nslabs, 0, true, + default_nareas); swiotlb_print_info(); return 0; + +error_slots: + free_pages((unsigned long)mem->areas, area_order); +error_area: + free_pages((unsigned long)vstart, order); + return -ENOMEM; } void __init swiotlb_exit(void) @@ -361,6 +450,7 @@ void __init swiotlb_exit(void) struct io_tlb_mem *mem = &io_tlb_default_mem; unsigned long tbl_vaddr; size_t tbl_size, slots_size; + unsigned int area_order; if (swiotlb_force_bounce) return; @@ -375,9 +465,14 @@ void __init swiotlb_exit(void) set_memory_encrypted(tbl_vaddr, tbl_size >> PAGE_SHIFT); if (mem->late_alloc) { + area_order = get_order(array_size(sizeof(*mem->areas), + mem->nareas)); + free_pages((unsigned long)mem->areas, area_order); free_pages(tbl_vaddr, get_order(tbl_size)); free_pages((unsigned long)mem->slots, get_order(slots_size)); } else { + memblock_free_late(__pa(mem->areas), + mem->nareas * sizeof(struct io_tlb_area)); memblock_free_late(mem->start, tbl_size); memblock_free_late(__pa(mem->slots), slots_size); } @@ -480,9 +575,9 @@ static inline unsigned long get_max_slots(unsigned long boundary_mask) return nr_slots(boundary_mask + 1); } -static unsigned int wrap_index(struct io_tlb_mem *mem, unsigned int index) +static unsigned int wrap_area_index(struct io_tlb_mem *mem, unsigned int index) { - if (index >= mem->nslabs) + if (index >= mem->area_nslabs) return 0; return index; } @@ -491,10 +586,11 @@ static unsigned int wrap_index(struct io_tlb_mem *mem, unsigned int index) * Find a suitable number of IO TLB entries size that will fit this request and * allocate a buffer from that IO TLB pool. 
*/ -static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr, - size_t alloc_size, unsigned int alloc_align_mask) +static int swiotlb_do_find_slots(struct io_tlb_mem *mem, + struct io_tlb_area *area, int area_index, + struct device *dev, phys_addr_t orig_addr, + size_t alloc_size, unsigned int alloc_align_mask) { - struct io_tlb_mem *mem = dev->dma_io_tlb_mem; unsigned long boundary_mask = dma_get_seg_boundary(dev); dma_addr_t tbl_dma_addr = phys_to_dma_unencrypted(dev, mem->start) & boundary_mask; @@ -505,8 +601,11 @@ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr, unsigned int index, wrap, count = 0, i; unsigned int offset = swiotlb_align_offset(dev, orig_addr); unsigned long flags; + unsigned int slot_base; + unsigned int slot_index; BUG_ON(!nslots); + BUG_ON(area_index >= mem->nareas); /* * For mappings with an alignment requirement don't bother looping to @@ -518,16 +617,20 @@ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr, stride = max(stride, stride << (PAGE_SHIFT - IO_TLB_SHIFT)); stride = max(stride, (alloc_align_mask >> IO_TLB_SHIFT) + 1); - spin_lock_irqsave(&mem->lock, flags); - if (unlikely(nslots > mem->nslabs - mem->used)) + spin_lock_irqsave(&area->lock, flags); + if (unlikely(nslots > mem->area_nslabs - area->used)) goto not_found; - index = wrap = wrap_index(mem, ALIGN(mem->index, stride)); + slot_base = area_index * mem->area_nslabs; + index = wrap = wrap_area_index(mem, ALIGN(area->index, stride)); + do { + slot_index = slot_base + index; + if (orig_addr && - (slot_addr(tbl_dma_addr, index) & iotlb_align_mask) != - (orig_addr & iotlb_align_mask)) { - index = wrap_index(mem, index + 1); + (slot_addr(tbl_dma_addr, slot_index) & + iotlb_align_mask) != (orig_addr & iotlb_align_mask)) { + index = wrap_area_index(mem, index + 1); continue; } @@ -536,26 +639,26 @@ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr, * contiguous buffers, we allocate the buffers from that slot * and mark the entries as '0' indicating unavailable. */ - if (!iommu_is_span_boundary(index, nslots, + if (!iommu_is_span_boundary(slot_index, nslots, nr_slots(tbl_dma_addr), max_slots)) { - if (mem->slots[index].list >= nslots) + if (mem->slots[slot_index].list >= nslots) goto found; } - index = wrap_index(mem, index + stride); + index = wrap_area_index(mem, index + stride); } while (index != wrap); not_found: - spin_unlock_irqrestore(&mem->lock, flags); + spin_unlock_irqrestore(&area->lock, flags); return -1; found: - for (i = index; i < index + nslots; i++) { + for (i = slot_index; i < slot_index + nslots; i++) { mem->slots[i].list = 0; - mem->slots[i].alloc_size = - alloc_size - (offset + ((i - index) << IO_TLB_SHIFT)); + mem->slots[i].alloc_size = alloc_size - (offset + + ((i - slot_index) << IO_TLB_SHIFT)); } - for (i = index - 1; + for (i = slot_index - 1; io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 && mem->slots[i].list; i--) mem->slots[i].list = ++count; @@ -563,14 +666,43 @@ found: /* * Update the indices to avoid searching in the next round. 
*/ - if (index + nslots < mem->nslabs) - mem->index = index + nslots; + if (index + nslots < mem->area_nslabs) + area->index = index + nslots; else - mem->index = 0; - mem->used += nslots; + area->index = 0; + area->used += nslots; + spin_unlock_irqrestore(&area->lock, flags); + return slot_index; +} - spin_unlock_irqrestore(&mem->lock, flags); - return index; +static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr, + size_t alloc_size, unsigned int alloc_align_mask) +{ + struct io_tlb_mem *mem = dev->dma_io_tlb_mem; + int start = raw_smp_processor_id() & ((1U << __fls(mem->nareas)) - 1); + int i = start, index; + + do { + index = swiotlb_do_find_slots(mem, mem->areas + i, i, + dev, orig_addr, alloc_size, + alloc_align_mask); + if (index >= 0) + return index; + if (++i >= mem->nareas) + i = 0; + } while (i != start); + + return -1; +} + +static unsigned long mem_used(struct io_tlb_mem *mem) +{ + int i; + unsigned long used = 0; + + for (i = 0; i < mem->nareas; i++) + used += mem->areas[i].used; + return used; } phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, @@ -602,7 +734,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, if (!(attrs & DMA_ATTR_NO_WARN)) dev_warn_ratelimited(dev, "swiotlb buffer is full (sz: %zd bytes), total %lu (slots), used %lu (slots)\n", - alloc_size, mem->nslabs, mem->used); + alloc_size, mem->nslabs, mem_used(mem)); return (phys_addr_t)DMA_MAPPING_ERROR; } @@ -632,6 +764,8 @@ static void swiotlb_release_slots(struct device *dev, phys_addr_t tlb_addr) unsigned int offset = swiotlb_align_offset(dev, tlb_addr); int index = (tlb_addr - offset - mem->start) >> IO_TLB_SHIFT; int nslots = nr_slots(mem->slots[index].alloc_size + offset); + int aindex = index / mem->area_nslabs; + struct io_tlb_area *area = &mem->areas[aindex]; int count, i; /* @@ -640,7 +774,9 @@ static void swiotlb_release_slots(struct device *dev, phys_addr_t tlb_addr) * While returning the entries to the free list, we merge the entries * with slots below and above the pool being returned. 
*/ - spin_lock_irqsave(&mem->lock, flags); + BUG_ON(aindex >= mem->nareas); + + spin_lock_irqsave(&area->lock, flags); if (index + nslots < ALIGN(index + 1, IO_TLB_SEGSIZE)) count = mem->slots[index + nslots].list; else @@ -664,8 +800,8 @@ static void swiotlb_release_slots(struct device *dev, phys_addr_t tlb_addr) io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 && mem->slots[i].list; i--) mem->slots[i].list = ++count; - mem->used -= nslots; - spin_unlock_irqrestore(&mem->lock, flags); + area->used -= nslots; + spin_unlock_irqrestore(&area->lock, flags); } /* @@ -763,12 +899,14 @@ EXPORT_SYMBOL_GPL(is_swiotlb_active); static void swiotlb_create_debugfs_files(struct io_tlb_mem *mem, const char *dirname) { + unsigned long used = mem_used(mem); + mem->debugfs = debugfs_create_dir(dirname, io_tlb_default_mem.debugfs); if (!mem->nslabs) return; debugfs_create_ulong("io_tlb_nslabs", 0400, mem->debugfs, &mem->nslabs); - debugfs_create_ulong("io_tlb_used", 0400, mem->debugfs, &mem->used); + debugfs_create_ulong("io_tlb_used", 0400, mem->debugfs, &used); } static int __init __maybe_unused swiotlb_create_default_debugfs(void) @@ -819,6 +957,9 @@ static int rmem_swiotlb_device_init(struct reserved_mem *rmem, struct io_tlb_mem *mem = rmem->priv; unsigned long nslabs = rmem->size >> IO_TLB_SHIFT; + /* Set Per-device io tlb area to one */ + unsigned int nareas = 1; + /* * Since multiple devices can share the same pool, the private data, * io_tlb_mem struct, will be initialized by the first device attached @@ -835,10 +976,18 @@ static int rmem_swiotlb_device_init(struct reserved_mem *rmem, return -ENOMEM; } + mem->areas = kcalloc(nareas, sizeof(*mem->areas), + GFP_KERNEL); + if (!mem->areas) { + kfree(mem); + kfree(mem->slots); + return -ENOMEM; + } + set_memory_decrypted((unsigned long)phys_to_virt(rmem->base), rmem->size >> PAGE_SHIFT); swiotlb_init_io_tlb_mem(mem, rmem->base, nslabs, SWIOTLB_FORCE, - false); + false, nareas); mem->for_alloc = true; rmem->priv = mem; -- cgit From 8ab4cdcf03d0b060fbf73f76460f199bbd759ff7 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Tue, 12 Jul 2022 14:06:03 -0700 Subject: bpf: Tidy up verifier check_func_arg() This patch does two things: 1. For matching against the arg type, the match should be against the base type of the arg type, since the arg type can have different bpf_type_flags set on it. 2. Uses switch casing to improve readability + efficiency. 
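To make point 1 above concrete, here is a small standalone sketch of the base-type idea; the names and bit layout are illustrative assumptions, not the verifier's actual definitions:

/*
 * An argument type is a base type with optional modifier flags OR'd into
 * the upper bits, so equality checks have to compare base types only.
 */
#define EX_BASE_TYPE_BITS       8
#define EX_BASE_TYPE_MASK       ((1U << EX_BASE_TYPE_BITS) - 1)
#define EX_FLAG_MAYBE_NULL      (1U << EX_BASE_TYPE_BITS)

enum ex_arg_type {
        EX_ARG_PTR_TO_MAP_VALUE = 1,
        EX_ARG_PTR_TO_MEM       = 2,
};

static inline unsigned int ex_base_type(unsigned int arg_type)
{
        return arg_type & EX_BASE_TYPE_MASK;
}

/*
 * ex_base_type(EX_ARG_PTR_TO_MAP_VALUE | EX_FLAG_MAYBE_NULL) still returns
 * EX_ARG_PTR_TO_MAP_VALUE, so a switch on the base type matches no matter
 * which modifier flags are set on the argument.
 */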
Signed-off-by: Joanne Koong Acked-by: Hao Luo Link: https://lore.kernel.org/r/20220712210603.123791-1-joannelkoong@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 66 +++++++++++++++++++++++++++++---------------------- 1 file changed, 38 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 328cfab3af60..26e7e787c20a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5533,17 +5533,6 @@ static bool arg_type_is_mem_size(enum bpf_arg_type type) type == ARG_CONST_SIZE_OR_ZERO; } -static bool arg_type_is_alloc_size(enum bpf_arg_type type) -{ - return type == ARG_CONST_ALLOC_SIZE_OR_ZERO; -} - -static bool arg_type_is_int_ptr(enum bpf_arg_type type) -{ - return type == ARG_PTR_TO_INT || - type == ARG_PTR_TO_LONG; -} - static bool arg_type_is_release(enum bpf_arg_type type) { return type & OBJ_RELEASE; @@ -5929,7 +5918,8 @@ skip_type_check: meta->ref_obj_id = reg->ref_obj_id; } - if (arg_type == ARG_CONST_MAP_PTR) { + switch (base_type(arg_type)) { + case ARG_CONST_MAP_PTR: /* bpf_map_xxx(map_ptr) call: remember that map_ptr */ if (meta->map_ptr) { /* Use map_uid (which is unique id of inner map) to reject: @@ -5954,7 +5944,8 @@ skip_type_check: } meta->map_ptr = reg->map_ptr; meta->map_uid = reg->map_uid; - } else if (arg_type == ARG_PTR_TO_MAP_KEY) { + break; + case ARG_PTR_TO_MAP_KEY: /* bpf_map_xxx(..., map_ptr, ..., key) call: * check that [key, key + map->key_size) are within * stack limits and initialized @@ -5971,7 +5962,8 @@ skip_type_check: err = check_helper_mem_access(env, regno, meta->map_ptr->key_size, false, NULL); - } else if (base_type(arg_type) == ARG_PTR_TO_MAP_VALUE) { + break; + case ARG_PTR_TO_MAP_VALUE: if (type_may_be_null(arg_type) && register_is_null(reg)) return 0; @@ -5987,14 +5979,16 @@ skip_type_check: err = check_helper_mem_access(env, regno, meta->map_ptr->value_size, false, meta); - } else if (arg_type == ARG_PTR_TO_PERCPU_BTF_ID) { + break; + case ARG_PTR_TO_PERCPU_BTF_ID: if (!reg->btf_id) { verbose(env, "Helper has invalid btf_id in R%d\n", regno); return -EACCES; } meta->ret_btf = reg->btf; meta->ret_btf_id = reg->btf_id; - } else if (arg_type == ARG_PTR_TO_SPIN_LOCK) { + break; + case ARG_PTR_TO_SPIN_LOCK: if (meta->func_id == BPF_FUNC_spin_lock) { if (process_spin_lock(env, regno, true)) return -EACCES; @@ -6005,12 +5999,15 @@ skip_type_check: verbose(env, "verifier internal error\n"); return -EFAULT; } - } else if (arg_type == ARG_PTR_TO_TIMER) { + break; + case ARG_PTR_TO_TIMER: if (process_timer_func(env, regno, meta)) return -EACCES; - } else if (arg_type == ARG_PTR_TO_FUNC) { + break; + case ARG_PTR_TO_FUNC: meta->subprogno = reg->subprogno; - } else if (base_type(arg_type) == ARG_PTR_TO_MEM) { + break; + case ARG_PTR_TO_MEM: /* The access to this pointer is only checked when we hit the * next is_mem_size argument below. 
*/ @@ -6020,11 +6017,14 @@ skip_type_check: fn->arg_size[arg], false, meta); } - } else if (arg_type_is_mem_size(arg_type)) { - bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO); - - err = check_mem_size_reg(env, reg, regno, zero_size_allowed, meta); - } else if (arg_type_is_dynptr(arg_type)) { + break; + case ARG_CONST_SIZE: + err = check_mem_size_reg(env, reg, regno, false, meta); + break; + case ARG_CONST_SIZE_OR_ZERO: + err = check_mem_size_reg(env, reg, regno, true, meta); + break; + case ARG_PTR_TO_DYNPTR: if (arg_type & MEM_UNINIT) { if (!is_dynptr_reg_valid_uninit(env, reg)) { verbose(env, "Dynptr has to be an uninitialized dynptr\n"); @@ -6058,21 +6058,28 @@ skip_type_check: err_extra, arg + 1); return -EINVAL; } - } else if (arg_type_is_alloc_size(arg_type)) { + break; + case ARG_CONST_ALLOC_SIZE_OR_ZERO: if (!tnum_is_const(reg->var_off)) { verbose(env, "R%d is not a known constant'\n", regno); return -EACCES; } meta->mem_size = reg->var_off.value; - } else if (arg_type_is_int_ptr(arg_type)) { + break; + case ARG_PTR_TO_INT: + case ARG_PTR_TO_LONG: + { int size = int_ptr_type_to_size(arg_type); err = check_helper_mem_access(env, regno, size, false, meta); if (err) return err; err = check_ptr_alignment(env, reg, 0, size, true); - } else if (arg_type == ARG_PTR_TO_CONST_STR) { + break; + } + case ARG_PTR_TO_CONST_STR: + { struct bpf_map *map = reg->map_ptr; int map_off; u64 map_addr; @@ -6111,9 +6118,12 @@ skip_type_check: verbose(env, "string is not zero-terminated\n"); return -EINVAL; } - } else if (arg_type == ARG_PTR_TO_KPTR) { + break; + } + case ARG_PTR_TO_KPTR: if (process_kptr_func(env, regno, meta)) return -EACCES; + break; } return err; -- cgit From 900d156bac2bc474cf7c7bee4efbc6c83ec5ae58 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 13 Jul 2022 07:53:17 +0200 Subject: block: remove bdevname Replace the remaining calls of bdevname with snprintf using the %pg format specifier. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Chaitanya Kulkarni Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20220713055317.1888500-10-hch@lst.de Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index c584effe5fe9..4752bda1b1a0 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -736,12 +736,12 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) switch (cmd) { case BLKTRACESETUP: - bdevname(bdev, b); + snprintf(b, sizeof(b), "%pg", bdev); ret = __blk_trace_setup(q, b, bdev->bd_dev, bdev, arg); break; #if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64) case BLKTRACESETUP32: - bdevname(bdev, b); + snprintf(b, sizeof(b), "%pg", bdev); ret = compat_blk_trace_setup(q, b, bdev->bd_dev, bdev, arg); break; #endif -- cgit From 22c80aac882f712897b88b7ea8f5a74ea19019df Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 14 Jul 2022 11:06:36 -0700 Subject: blktrace: Trace remapped requests correctly Trace the remapped operation and its flags instead of only the data direction of remapped operations. This issue was detected by analyzing the warnings reported by sparse related to the new blk_opf_t type. 
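For context, rq_data_dir() reduces a request to a plain read/write direction, roughly as in the sketch below (a simplification, not the exact in-tree macro), which is why tracing only that value loses the operation type and flags:

#include <linux/blk-mq.h>

/*
 * Rough sketch of what rq_data_dir() boils down to: every operation,
 * including discards and flushes, is reported as either READ or WRITE.
 */
static inline int sketch_rq_data_dir(const struct request *rq)
{
        return op_is_write(req_op(rq)) ? WRITE : READ;
}

/*
 * Passing req_op(rq) and rq->cmd_flags through instead keeps e.g.
 * REQ_OP_DISCARD and REQ_SYNC/REQ_FUA visible to the remap trace event.
 */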
Reviewed-by: Jun'ichi Nomura Cc: Mike Snitzer Cc: Mike Christie Cc: Li Zefan Cc: Chaitanya Kulkarni Fixes: 1b9a9ab78b0a ("blktrace: use op accessors") Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20220714180729.1065367-11-bvanassche@acm.org Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 4752bda1b1a0..4327b51da403 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1058,7 +1058,7 @@ static void blk_add_trace_rq_remap(void *ignore, struct request *rq, dev_t dev, r.sector_from = cpu_to_be64(from); __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), - rq_data_dir(rq), 0, BLK_TA_REMAP, 0, + req_op(rq), rq->cmd_flags, BLK_TA_REMAP, 0, sizeof(r), &r, blk_trace_request_get_cgid(rq)); rcu_read_unlock(); } -- cgit From 919dbca8670d0f7828dfbb2f9b434ac22dca8d2e Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 14 Jul 2022 11:06:37 -0700 Subject: blktrace: Use the new blk_opf_t type Improve static type checking by using the new blk_opf_t type for a function argument that represents a combination of a request operation and request flags. Rename that argument from 'op' into 'opf' to make its role more clear. Cc: Christoph Hellwig Cc: Steven Rostedt Cc: Li Zefan Cc: Chaitanya Kulkarni Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20220714180729.1065367-12-bvanassche@acm.org Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 51 ++++++++++++++++++++++++------------------------- 1 file changed, 25 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 4327b51da403..150058f5daa9 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -205,7 +205,7 @@ static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), #define BLK_TC_PREFLUSH BLK_TC_FLUSH /* The ilog2() calls fall out because they're constant */ -#define MASK_TC_BIT(rw, __name) ((rw & REQ_ ## __name) << \ +#define MASK_TC_BIT(rw, __name) ((__force u32)(rw & REQ_ ## __name) << \ (ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - __REQ_ ## __name)) /* @@ -213,8 +213,8 @@ static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), * blk_io_trace structure and places it in a per-cpu subbuffer. */ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, - int op, int op_flags, u32 what, int error, int pdu_len, - void *pdu_data, u64 cgid) + const blk_opf_t opf, u32 what, int error, + int pdu_len, void *pdu_data, u64 cgid) { struct task_struct *tsk = current; struct ring_buffer_event *event = NULL; @@ -227,16 +227,17 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, int cpu; bool blk_tracer = blk_tracer_enabled; ssize_t cgid_len = cgid ? sizeof(cgid) : 0; + const enum req_op op = opf & REQ_OP_MASK; if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer)) return; what |= ddir_act[op_is_write(op) ? 
WRITE : READ]; - what |= MASK_TC_BIT(op_flags, SYNC); - what |= MASK_TC_BIT(op_flags, RAHEAD); - what |= MASK_TC_BIT(op_flags, META); - what |= MASK_TC_BIT(op_flags, PREFLUSH); - what |= MASK_TC_BIT(op_flags, FUA); + what |= MASK_TC_BIT(opf, SYNC); + what |= MASK_TC_BIT(opf, RAHEAD); + what |= MASK_TC_BIT(opf, META); + what |= MASK_TC_BIT(opf, PREFLUSH); + what |= MASK_TC_BIT(opf, FUA); if (op == REQ_OP_DISCARD || op == REQ_OP_SECURE_ERASE) what |= BLK_TC_ACT(BLK_TC_DISCARD); if (op == REQ_OP_FLUSH) @@ -842,9 +843,8 @@ static void blk_add_trace_rq(struct request *rq, blk_status_t error, else what |= BLK_TC_ACT(BLK_TC_FS); - __blk_add_trace(bt, blk_rq_trace_sector(rq), nr_bytes, req_op(rq), - rq->cmd_flags, what, blk_status_to_errno(error), 0, - NULL, cgid); + __blk_add_trace(bt, blk_rq_trace_sector(rq), nr_bytes, rq->cmd_flags, + what, blk_status_to_errno(error), 0, NULL, cgid); rcu_read_unlock(); } @@ -903,7 +903,7 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, } __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, - bio_op(bio), bio->bi_opf, what, error, 0, NULL, + bio->bi_opf, what, error, 0, NULL, blk_trace_bio_get_cgid(q, bio)); rcu_read_unlock(); } @@ -949,7 +949,7 @@ static void blk_add_trace_plug(void *ignore, struct request_queue *q) rcu_read_lock(); bt = rcu_dereference(q->blk_trace); if (bt) - __blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL, 0); + __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL, 0); rcu_read_unlock(); } @@ -969,7 +969,7 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q, else what = BLK_TA_UNPLUG_TIMER; - __blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu, 0); + __blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu, 0); } rcu_read_unlock(); } @@ -985,8 +985,7 @@ static void blk_add_trace_split(void *ignore, struct bio *bio, unsigned int pdu) __be64 rpdu = cpu_to_be64(pdu); __blk_add_trace(bt, bio->bi_iter.bi_sector, - bio->bi_iter.bi_size, bio_op(bio), bio->bi_opf, - BLK_TA_SPLIT, + bio->bi_iter.bi_size, bio->bi_opf, BLK_TA_SPLIT, blk_status_to_errno(bio->bi_status), sizeof(rpdu), &rpdu, blk_trace_bio_get_cgid(q, bio)); @@ -1022,7 +1021,7 @@ static void blk_add_trace_bio_remap(void *ignore, struct bio *bio, dev_t dev, r.sector_from = cpu_to_be64(from); __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, - bio_op(bio), bio->bi_opf, BLK_TA_REMAP, + bio->bi_opf, BLK_TA_REMAP, blk_status_to_errno(bio->bi_status), sizeof(r), &r, blk_trace_bio_get_cgid(q, bio)); rcu_read_unlock(); @@ -1058,7 +1057,7 @@ static void blk_add_trace_rq_remap(void *ignore, struct request *rq, dev_t dev, r.sector_from = cpu_to_be64(from); __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), - req_op(rq), rq->cmd_flags, BLK_TA_REMAP, 0, + rq->cmd_flags, BLK_TA_REMAP, 0, sizeof(r), &r, blk_trace_request_get_cgid(rq)); rcu_read_unlock(); } @@ -1084,7 +1083,7 @@ void blk_add_driver_data(struct request *rq, void *data, size_t len) return; } - __blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0, 0, + __blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0, BLK_TA_DRV_DATA, 0, len, data, blk_trace_request_get_cgid(rq)); rcu_read_unlock(); @@ -1881,14 +1880,14 @@ out: * caller with resulting string. 
* **/ -void blk_fill_rwbs(char *rwbs, unsigned int op) +void blk_fill_rwbs(char *rwbs, blk_opf_t opf) { int i = 0; - if (op & REQ_PREFLUSH) + if (opf & REQ_PREFLUSH) rwbs[i++] = 'F'; - switch (op & REQ_OP_MASK) { + switch (opf & REQ_OP_MASK) { case REQ_OP_WRITE: rwbs[i++] = 'W'; break; @@ -1909,13 +1908,13 @@ void blk_fill_rwbs(char *rwbs, unsigned int op) rwbs[i++] = 'N'; } - if (op & REQ_FUA) + if (opf & REQ_FUA) rwbs[i++] = 'F'; - if (op & REQ_RAHEAD) + if (opf & REQ_RAHEAD) rwbs[i++] = 'A'; - if (op & REQ_SYNC) + if (opf & REQ_SYNC) rwbs[i++] = 'S'; - if (op & REQ_META) + if (opf & REQ_META) rwbs[i++] = 'M'; rwbs[i] = '\0'; -- cgit From 568e34ed7339e357f73c8e1ae5cc4f4595805357 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 14 Jul 2022 11:07:27 -0700 Subject: PM: Use the enum req_op and blk_opf_t types Improve static type checking by using the enum req_op type for variables that represent a request operation and the new blk_opf_t type for variables that represent request flags. Combine the first two hib_submit_io() arguments into a single argument. Acked-by: Rafael J. Wysocki Cc: Christoph Hellwig Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20220714180729.1065367-62-bvanassche@acm.org Signed-off-by: Jens Axboe --- kernel/power/swap.c | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 91fffdd2c7fb..277434b6c0bf 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -269,15 +269,14 @@ static void hib_end_io(struct bio *bio) bio_put(bio); } -static int hib_submit_io(int op, int op_flags, pgoff_t page_off, void *addr, - struct hib_bio_batch *hb) +static int hib_submit_io(blk_opf_t opf, pgoff_t page_off, void *addr, + struct hib_bio_batch *hb) { struct page *page = virt_to_page(addr); struct bio *bio; int error = 0; - bio = bio_alloc(hib_resume_bdev, 1, op | op_flags, - GFP_NOIO | __GFP_HIGH); + bio = bio_alloc(hib_resume_bdev, 1, opf, GFP_NOIO | __GFP_HIGH); bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9); if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { @@ -317,8 +316,7 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags) { int error; - hib_submit_io(REQ_OP_READ, 0, swsusp_resume_block, - swsusp_header, NULL); + hib_submit_io(REQ_OP_READ, swsusp_resume_block, swsusp_header, NULL); if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) || !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) { memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10); @@ -331,7 +329,7 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags) swsusp_header->flags = flags; if (flags & SF_CRC32_MODE) swsusp_header->crc32 = handle->crc32; - error = hib_submit_io(REQ_OP_WRITE, REQ_SYNC, + error = hib_submit_io(REQ_OP_WRITE | REQ_SYNC, swsusp_resume_block, swsusp_header, NULL); } else { pr_err("Swap header not found!\n"); @@ -408,7 +406,7 @@ static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb) } else { src = buf; } - return hib_submit_io(REQ_OP_WRITE, REQ_SYNC, offset, src, hb); + return hib_submit_io(REQ_OP_WRITE | REQ_SYNC, offset, src, hb); } static void release_swap_writer(struct swap_map_handle *handle) @@ -1003,7 +1001,7 @@ static int get_swap_reader(struct swap_map_handle *handle, return -ENOMEM; } - error = hib_submit_io(REQ_OP_READ, 0, offset, tmp->map, NULL); + error = hib_submit_io(REQ_OP_READ, offset, tmp->map, NULL); if (error) { release_swap_reader(handle); return error; @@ 
-1027,7 +1025,7 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf, offset = handle->cur->entries[handle->k]; if (!offset) return -EFAULT; - error = hib_submit_io(REQ_OP_READ, 0, offset, buf, hb); + error = hib_submit_io(REQ_OP_READ, offset, buf, hb); if (error) return error; if (++handle->k >= MAP_PAGE_ENTRIES) { @@ -1526,8 +1524,7 @@ int swsusp_check(void) if (!IS_ERR(hib_resume_bdev)) { set_blocksize(hib_resume_bdev, PAGE_SIZE); clear_page(swsusp_header); - error = hib_submit_io(REQ_OP_READ, 0, - swsusp_resume_block, + error = hib_submit_io(REQ_OP_READ, swsusp_resume_block, swsusp_header, NULL); if (error) goto put; @@ -1535,7 +1532,7 @@ int swsusp_check(void) if (!memcmp(HIBERNATE_SIG, swsusp_header->sig, 10)) { memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10); /* Reset swap signature now */ - error = hib_submit_io(REQ_OP_WRITE, REQ_SYNC, + error = hib_submit_io(REQ_OP_WRITE | REQ_SYNC, swsusp_resume_block, swsusp_header, NULL); } else { @@ -1586,11 +1583,11 @@ int swsusp_unmark(void) { int error; - hib_submit_io(REQ_OP_READ, 0, swsusp_resume_block, - swsusp_header, NULL); + hib_submit_io(REQ_OP_READ, swsusp_resume_block, + swsusp_header, NULL); if (!memcmp(HIBERNATE_SIG,swsusp_header->sig, 10)) { memcpy(swsusp_header->sig,swsusp_header->orig_sig, 10); - error = hib_submit_io(REQ_OP_WRITE, REQ_SYNC, + error = hib_submit_io(REQ_OP_WRITE | REQ_SYNC, swsusp_resume_block, swsusp_header, NULL); } else { -- cgit From 5002615a37b1e23a4b51c386ee22c8f90a70b4dd Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Wed, 13 Jul 2022 16:09:36 +0000 Subject: bpf: Warn on non-preallocated case for BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE is also tracing type, which may cause unexpected memory allocation if we set BPF_F_NO_PREALLOC. Let's also warn on it similar as we do in case of BPF_PROG_TYPE_RAW_TRACEPOINT. Signed-off-by: Yafang Shao Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20220713160936.57488-1-laoar.shao@gmail.com --- kernel/bpf/verifier.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 26e7e787c20a..47fd6efa102a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -12572,6 +12572,7 @@ static bool is_tracing_prog_type(enum bpf_prog_type type) case BPF_PROG_TYPE_TRACEPOINT: case BPF_PROG_TYPE_PERF_EVENT: case BPF_PROG_TYPE_RAW_TRACEPOINT: + case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE: return true; default: return false; -- cgit From 17dd25c29cda98c370f8d5a4ae3daee33fac1669 Mon Sep 17 00:00:00 2001 From: Aaron Tomlin Date: Thu, 14 Jul 2022 16:39:31 +0100 Subject: module: Modify module_flags() to accept show_state argument No functional change. With this patch a given module's state information (i.e. 'mod->state') can be omitted from the specified buffer. Please note that this is in preparation to include the last unloaded module's taint flag(s), if available. 
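For illustration, a minimal sketch of how callers could use the new argument (the 'taints' destination buffer below is hypothetical and not part of this patch):

	char buf[MODULE_FLAGS_BUF_SIZE];

	/* state markers ('+'/'-') still included, as before */
	pr_cont(" %s%s", mod->name, module_flags(mod, buf, true));

	/* taint flags only, e.g. "(OE)", with state markers omitted */
	strscpy(taints, module_flags(mod, buf, false), sizeof(taints));
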
Signed-off-by: Aaron Tomlin Signed-off-by: Luis Chamberlain --- kernel/module/internal.h | 2 +- kernel/module/main.c | 11 +++++++---- kernel/module/procfs.c | 2 +- 3 files changed, 9 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/module/internal.h b/kernel/module/internal.h index ec104c2950c3..680d980a4fb2 100644 --- a/kernel/module/internal.h +++ b/kernel/module/internal.h @@ -103,7 +103,7 @@ struct module *find_module_all(const char *name, size_t len, bool even_unformed) int cmp_name(const void *name, const void *sym); long module_get_offset(struct module *mod, unsigned int *size, Elf_Shdr *sechdr, unsigned int section); -char *module_flags(struct module *mod, char *buf); +char *module_flags(struct module *mod, char *buf, bool show_state); size_t module_flags_taint(unsigned long taints, char *buf); static inline void module_assert_mutex_or_preempt(void) diff --git a/kernel/module/main.c b/kernel/module/main.c index d34227ca3932..b6e3dfd2068c 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -2979,24 +2979,27 @@ static void cfi_cleanup(struct module *mod) } /* Keep in sync with MODULE_FLAGS_BUF_SIZE !!! */ -char *module_flags(struct module *mod, char *buf) +char *module_flags(struct module *mod, char *buf, bool show_state) { int bx = 0; BUG_ON(mod->state == MODULE_STATE_UNFORMED); + if (!mod->taints && !show_state) + goto out; if (mod->taints || mod->state == MODULE_STATE_GOING || mod->state == MODULE_STATE_COMING) { buf[bx++] = '('; bx += module_flags_taint(mod->taints, buf + bx); /* Show a - for module-is-being-unloaded */ - if (mod->state == MODULE_STATE_GOING) + if (mod->state == MODULE_STATE_GOING && show_state) buf[bx++] = '-'; /* Show a + for module-is-being-loaded */ - if (mod->state == MODULE_STATE_COMING) + if (mod->state == MODULE_STATE_COMING && show_state) buf[bx++] = '+'; buf[bx++] = ')'; } +out: buf[bx] = '\0'; return buf; @@ -3129,7 +3132,7 @@ void print_modules(void) list_for_each_entry_rcu(mod, &modules, list) { if (mod->state == MODULE_STATE_UNFORMED) continue; - pr_cont(" %s%s", mod->name, module_flags(mod, buf)); + pr_cont(" %s%s", mod->name, module_flags(mod, buf, true)); } print_unloaded_tainted_modules(); diff --git a/kernel/module/procfs.c b/kernel/module/procfs.c index 9a8f4f0f6329..cf5b9f1e6ec4 100644 --- a/kernel/module/procfs.c +++ b/kernel/module/procfs.c @@ -91,7 +91,7 @@ static int m_show(struct seq_file *m, void *p) /* Taints info */ if (mod->taints) - seq_printf(m, " %s", module_flags(mod, buf)); + seq_printf(m, " %s", module_flags(mod, buf, true)); seq_puts(m, "\n"); return 0; -- cgit From dbf0ae65bce48bf4c2b6d114cac10193ef050294 Mon Sep 17 00:00:00 2001 From: Aaron Tomlin Date: Thu, 14 Jul 2022 16:39:32 +0100 Subject: module: Use strscpy() for last_unloaded_module The use of strlcpy() is considered deprecated [1]. In this particular context, there is no need to remain with strlcpy(). Therefore we transition to strscpy(). 
[1]: https://www.kernel.org/doc/html/latest/process/deprecated.html#strlcpy Signed-off-by: Aaron Tomlin Signed-off-by: Luis Chamberlain --- kernel/module/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module/main.c b/kernel/module/main.c index b6e3dfd2068c..a776fdaf021d 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -754,7 +754,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, async_synchronize_full(); /* Store the name of the last unloaded module for diagnostic purposes */ - strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module)); + strscpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module)); free_module(mod); /* someone could wait for the module in add_unformed_module() */ -- cgit From 6f1dae1d84b6d08541d8e12edd1c8677ab279dea Mon Sep 17 00:00:00 2001 From: Aaron Tomlin Date: Thu, 14 Jul 2022 16:39:33 +0100 Subject: module: Show the last unloaded module's taint flag(s) For diagnostic purposes, this patch, in addition to keeping a record/or track of the last known unloaded module, we now will include the module's taint flag(s) too e.g: " [last unloaded: fpga_mgr_mod(OE)]" Signed-off-by: Aaron Tomlin Signed-off-by: Luis Chamberlain --- kernel/module/main.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/module/main.c b/kernel/module/main.c index a776fdaf021d..e696f5624377 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -524,7 +524,10 @@ static struct module_attribute modinfo_##field = { \ MODINFO_ATTR(version); MODINFO_ATTR(srcversion); -static char last_unloaded_module[MODULE_NAME_LEN+1]; +static struct { + char name[MODULE_NAME_LEN + 1]; + char taints[MODULE_FLAGS_BUF_SIZE]; +} last_unloaded_module; #ifdef CONFIG_MODULE_UNLOAD @@ -694,6 +697,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, { struct module *mod; char name[MODULE_NAME_LEN]; + char buf[MODULE_FLAGS_BUF_SIZE]; int ret, forced = 0; if (!capable(CAP_SYS_MODULE) || modules_disabled) @@ -753,8 +757,9 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, async_synchronize_full(); - /* Store the name of the last unloaded module for diagnostic purposes */ - strscpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module)); + /* Store the name and taints of the last unloaded module for diagnostic purposes */ + strscpy(last_unloaded_module.name, mod->name, sizeof(last_unloaded_module.name)); + strscpy(last_unloaded_module.taints, module_flags(mod, buf, false), sizeof(last_unloaded_module.taints)); free_module(mod); /* someone could wait for the module in add_unformed_module() */ @@ -3137,7 +3142,8 @@ void print_modules(void) print_unloaded_tainted_modules(); preempt_enable(); - if (last_unloaded_module[0]) - pr_cont(" [last unloaded: %s]", last_unloaded_module); + if (last_unloaded_module.name[0]) + pr_cont(" [last unloaded: %s%s]", last_unloaded_module.name, + last_unloaded_module.taints); pr_cont("\n"); } -- cgit From 9c7c48d6a1e2eb5192ad5294c1c4dbd42a88e88b Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 14 Jul 2022 14:16:37 -0700 Subject: bpf: Fix subprog names in stack traces. The commit 7337224fc150 ("bpf: Improve the info.func_info and info.func_info_rec_size behavior") accidently made bpf_prog_ksym_set_name() conservative for bpf subprograms. Fixed it so instead of "bpf_prog_tag_F" the stack traces print "bpf_prog_tag_full_subprog_name". 
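To illustrate the resulting ksym format (the tag and subprogram name below are made up):

	before:  bpf_prog_6deef7357e7b4530_F
	after:   bpf_prog_6deef7357e7b4530_dump_bpf_map
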
Fixes: 7337224fc150 ("bpf: Improve the info.func_info and info.func_info_rec_size behavior") Reported-by: Tejun Heo Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20220714211637.17150-1-alexei.starovoitov@gmail.com --- kernel/bpf/verifier.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 47fd6efa102a..c59c3df0fea6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -13631,6 +13631,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) /* Below members will be freed only at prog->aux */ func[i]->aux->btf = prog->aux->btf; func[i]->aux->func_info = prog->aux->func_info; + func[i]->aux->func_info_cnt = prog->aux->func_info_cnt; func[i]->aux->poke_tab = prog->aux->poke_tab; func[i]->aux->size_poke_tab = prog->aux->size_poke_tab; @@ -13643,9 +13644,6 @@ static int jit_subprogs(struct bpf_verifier_env *env) poke->aux = func[i]->aux; } - /* Use bpf_prog_F_tag to indicate functions in stack traces. - * Long term would need debug info to populate names - */ func[i]->aux->name[0] = 'F'; func[i]->aux->stack_depth = env->subprog_info[i].stack_depth; func[i]->jit_requested = 1; -- cgit From 65d9a9a60fd71be964effb2e94747a6acb6e7015 Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Fri, 1 Jul 2022 13:04:04 +0530 Subject: kexec_file: drop weak attribute from functions As requested (http://lkml.kernel.org/r/87ee0q7b92.fsf@email.froward.int.ebiederm.org), this series converts weak functions in kexec to use the #ifdef approach. Quoting the 3e35142ef99fe ("kexec_file: drop weak attribute from arch_kexec_apply_relocations[_add]") changelog: : Since commit d1bcae833b32f1 ("ELF: Don't generate unused section symbols") : [1], binutils (v2.36+) started dropping section symbols that it thought : were unused. This isn't an issue in general, but with kexec_file.c, gcc : is placing kexec_arch_apply_relocations[_add] into a separate : .text.unlikely section and the section symbol ".text.unlikely" is being : dropped. Due to this, recordmcount is unable to find a non-weak symbol in : .text.unlikely to generate a relocation record against. This patch (of 2); Drop __weak attribute from functions in kexec_file.c: - arch_kexec_kernel_image_probe() - arch_kimage_file_post_load_cleanup() - arch_kexec_kernel_image_load() - arch_kexec_locate_mem_hole() - arch_kexec_kernel_verify_sig() arch_kexec_kernel_image_load() calls into kexec_image_load_default(), so drop the static attribute for the latter. arch_kexec_kernel_verify_sig() is not overridden by any architecture, so drop the __weak attribute. Link: https://lkml.kernel.org/r/cover.1656659357.git.naveen.n.rao@linux.vnet.ibm.com Link: https://lkml.kernel.org/r/2cd7ca1fe4d6bb6ca38e3283c717878388ed6788.1656659357.git.naveen.n.rao@linux.vnet.ibm.com Signed-off-by: Naveen N. 
Rao Suggested-by: Eric Biederman Signed-off-by: Andrew Morton Signed-off-by: Mimi Zohar --- kernel/kexec_file.c | 35 ++--------------------------------- 1 file changed, 2 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index f9261c07b048..0c27c81351ee 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -62,14 +62,7 @@ int kexec_image_probe_default(struct kimage *image, void *buf, return ret; } -/* Architectures can provide this probe function */ -int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf, - unsigned long buf_len) -{ - return kexec_image_probe_default(image, buf, buf_len); -} - -static void *kexec_image_load_default(struct kimage *image) +void *kexec_image_load_default(struct kimage *image) { if (!image->fops || !image->fops->load) return ERR_PTR(-ENOEXEC); @@ -80,11 +73,6 @@ static void *kexec_image_load_default(struct kimage *image) image->cmdline_buf_len); } -void * __weak arch_kexec_kernel_image_load(struct kimage *image) -{ - return kexec_image_load_default(image); -} - int kexec_image_post_load_cleanup_default(struct kimage *image) { if (!image->fops || !image->fops->cleanup) @@ -93,11 +81,6 @@ int kexec_image_post_load_cleanup_default(struct kimage *image) return image->fops->cleanup(image->image_loader_data); } -int __weak arch_kimage_file_post_load_cleanup(struct kimage *image) -{ - return kexec_image_post_load_cleanup_default(image); -} - #ifdef CONFIG_KEXEC_SIG static int kexec_image_verify_sig_default(struct kimage *image, void *buf, unsigned long buf_len) @@ -110,8 +93,7 @@ static int kexec_image_verify_sig_default(struct kimage *image, void *buf, return image->fops->verify_sig(buf, buf_len); } -int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf, - unsigned long buf_len) +int arch_kexec_kernel_verify_sig(struct kimage *image, void *buf, unsigned long buf_len) { return kexec_image_verify_sig_default(image, buf, buf_len); } @@ -621,19 +603,6 @@ int kexec_locate_mem_hole(struct kexec_buf *kbuf) return ret == 1 ? 0 : -EADDRNOTAVAIL; } -/** - * arch_kexec_locate_mem_hole - Find free memory to place the segments. - * @kbuf: Parameters for the memory search. - * - * On success, kbuf->mem will have the start address of the memory region found. - * - * Return: 0 on success, negative errno on error. - */ -int __weak arch_kexec_locate_mem_hole(struct kexec_buf *kbuf) -{ - return kexec_locate_mem_hole(kbuf); -} - /** * kexec_add_buffer - place a buffer in a kexec segment * @kbuf: Buffer contents and memory parameters. -- cgit From 0738eceb6201691534df07e0928d0a6168a35787 Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Fri, 1 Jul 2022 13:04:05 +0530 Subject: kexec: drop weak attribute from functions Drop __weak attribute from functions in kexec_core.c: - machine_kexec_post_load() - arch_kexec_protect_crashkres() - arch_kexec_unprotect_crashkres() - crash_free_reserved_phys_range() Link: https://lkml.kernel.org/r/c0f6219e03cb399d166d518ab505095218a902dd.1656659357.git.naveen.n.rao@linux.vnet.ibm.com Signed-off-by: Naveen N. 
Rao Suggested-by: Eric Biederman Signed-off-by: Andrew Morton Signed-off-by: Mimi Zohar --- kernel/kexec_core.c | 27 --------------------------- 1 file changed, 27 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 4d34c78334ce..acd029b307e4 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -591,11 +591,6 @@ static void kimage_free_extra_pages(struct kimage *image) } -int __weak machine_kexec_post_load(struct kimage *image) -{ - return 0; -} - void kimage_terminate(struct kimage *image) { if (*image->entry != 0) @@ -1020,15 +1015,6 @@ size_t crash_get_memory_size(void) return size; } -void __weak crash_free_reserved_phys_range(unsigned long begin, - unsigned long end) -{ - unsigned long addr; - - for (addr = begin; addr < end; addr += PAGE_SIZE) - free_reserved_page(boot_pfn_to_page(addr >> PAGE_SHIFT)); -} - int crash_shrink_memory(unsigned long new_size) { int ret = 0; @@ -1225,16 +1211,3 @@ int kernel_kexec(void) mutex_unlock(&kexec_mutex); return error; } - -/* - * Protection mechanism for crashkernel reserved memory after - * the kdump kernel is loaded. - * - * Provide an empty default implementation here -- architecture - * code may override this - */ -void __weak arch_kexec_protect_crashkres(void) -{} - -void __weak arch_kexec_unprotect_crashkres(void) -{} -- cgit From 689a71493bd2f31c024f8c0395f85a1fd4b2138e Mon Sep 17 00:00:00 2001 From: Coiby Xu Date: Thu, 14 Jul 2022 21:40:24 +0800 Subject: kexec: clean up arch_kexec_kernel_verify_sig Before commit 105e10e2cf1c ("kexec_file: drop weak attribute from functions"), there was already no arch-specific implementation of arch_kexec_kernel_verify_sig. With weak attribute dropped by that commit, arch_kexec_kernel_verify_sig is completely useless. So clean it up. Note later patches are dependent on this patch so it should be backported to the stable tree as well. Cc: stable@vger.kernel.org Suggested-by: Eric W. Biederman Reviewed-by: Michal Suchanek Acked-by: Baoquan He Signed-off-by: Coiby Xu [zohar@linux.ibm.com: reworded patch description "Note"] Link: https://lore.kernel.org/linux-integrity/20220714134027.394370-1-coxu@redhat.com/ Signed-off-by: Mimi Zohar --- kernel/kexec_file.c | 33 +++++++++++++-------------------- 1 file changed, 13 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 0c27c81351ee..6dc1294c90fc 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -81,24 +81,6 @@ int kexec_image_post_load_cleanup_default(struct kimage *image) return image->fops->cleanup(image->image_loader_data); } -#ifdef CONFIG_KEXEC_SIG -static int kexec_image_verify_sig_default(struct kimage *image, void *buf, - unsigned long buf_len) -{ - if (!image->fops || !image->fops->verify_sig) { - pr_debug("kernel loader does not support signature verification.\n"); - return -EKEYREJECTED; - } - - return image->fops->verify_sig(buf, buf_len); -} - -int arch_kexec_kernel_verify_sig(struct kimage *image, void *buf, unsigned long buf_len) -{ - return kexec_image_verify_sig_default(image, buf, buf_len); -} -#endif - /* * Free up memory used by kernel, initrd, and command line. 
This is temporary * memory allocation which is not needed any more after these buffers have @@ -141,13 +123,24 @@ void kimage_file_post_load_cleanup(struct kimage *image) } #ifdef CONFIG_KEXEC_SIG +static int kexec_image_verify_sig(struct kimage *image, void *buf, + unsigned long buf_len) +{ + if (!image->fops || !image->fops->verify_sig) { + pr_debug("kernel loader does not support signature verification.\n"); + return -EKEYREJECTED; + } + + return image->fops->verify_sig(buf, buf_len); +} + static int kimage_validate_signature(struct kimage *image) { int ret; - ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf, - image->kernel_buf_len); + ret = kexec_image_verify_sig(image, image->kernel_buf, + image->kernel_buf_len); if (ret) { if (sig_enforce) { -- cgit From c903dae8941deb55043ee46ded29e84e97cd84bb Mon Sep 17 00:00:00 2001 From: Coiby Xu Date: Thu, 14 Jul 2022 21:40:25 +0800 Subject: kexec, KEYS: make the code in bzImage64_verify_sig generic commit 278311e417be ("kexec, KEYS: Make use of platform keyring for signature verify") adds platform keyring support on x86 kexec but not arm64. The code in bzImage64_verify_sig uses the keys on the .builtin_trusted_keys, .machine, if configured and enabled, .secondary_trusted_keys, also if configured, and .platform keyrings to verify the signed kernel image as PE file. Cc: kexec@lists.infradead.org Cc: keyrings@vger.kernel.org Cc: linux-security-module@vger.kernel.org Reviewed-by: Michal Suchanek Signed-off-by: Coiby Xu Signed-off-by: Mimi Zohar --- kernel/kexec_file.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 6dc1294c90fc..a7b411c22f19 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -123,6 +123,23 @@ void kimage_file_post_load_cleanup(struct kimage *image) } #ifdef CONFIG_KEXEC_SIG +#ifdef CONFIG_SIGNED_PE_FILE_VERIFICATION +int kexec_kernel_verify_pe_sig(const char *kernel, unsigned long kernel_len) +{ + int ret; + + ret = verify_pefile_signature(kernel, kernel_len, + VERIFY_USE_SECONDARY_KEYRING, + VERIFYING_KEXEC_PE_SIGNATURE); + if (ret == -ENOKEY && IS_ENABLED(CONFIG_INTEGRITY_PLATFORM_KEYRING)) { + ret = verify_pefile_signature(kernel, kernel_len, + VERIFY_USE_PLATFORM_KEYRING, + VERIFYING_KEXEC_PE_SIGNATURE); + } + return ret; +} +#endif + static int kexec_image_verify_sig(struct kimage *image, void *buf, unsigned long buf_len) { -- cgit From a2a5580fcbf808e7c2310e4959b62f9d2157fdb6 Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Thu, 14 Jul 2022 11:03:22 +0100 Subject: bpf: Fix check against plain integer v 'NULL' When checking with sparse, btf_show_type_value() is causing a warning about checking integer vs NULL when the macro is passed a pointer, due to the 'value != 0' check. Stop sparse complaining about any type-casting by adding a cast to the typeof(value). 
This fixes the following sparse warnings: kernel/bpf/btf.c:2579:17: warning: Using plain integer as NULL pointer kernel/bpf/btf.c:2581:17: warning: Using plain integer as NULL pointer kernel/bpf/btf.c:3407:17: warning: Using plain integer as NULL pointer kernel/bpf/btf.c:3758:9: warning: Using plain integer as NULL pointer Signed-off-by: Ben Dooks Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20220714100322.260467-1-ben.dooks@sifive.com --- kernel/bpf/btf.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 4423045b8ff3..5869f03bcb6e 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -1116,7 +1116,8 @@ __printf(2, 3) static void btf_show(struct btf_show *show, const char *fmt, ...) */ #define btf_show_type_value(show, fmt, value) \ do { \ - if ((value) != 0 || (show->flags & BTF_SHOW_ZERO) || \ + if ((value) != (__typeof__(value))0 || \ + (show->flags & BTF_SHOW_ZERO) || \ show->state.depth == 0) { \ btf_show(show, "%s%s" fmt "%s%s", \ btf_show_indent(show), \ -- cgit From ae6ccaa650380d243cf43d31c864c5ced2fd4612 Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Thu, 7 Jul 2022 08:15:52 +0100 Subject: PM: EM: convert power field to micro-Watts precision and align drivers The milli-Watts precision causes rounding errors while calculating efficiency cost for each OPP. This is especially visible in the 'simple' Energy Model (EM), where the power for each OPP is provided from OPP framework. This can cause some OPPs to be marked inefficient, while using micro-Watts precision that might not happen. Update all EM users which access 'power' field and assume the value is in milli-Watts. Solve also an issue with potential overflow in calculation of energy estimation on 32bit machine. It's needed now since the power value (thus the 'cost' as well) are higher. Example calculation which shows the rounding error and impact: power = 'dyn-power-coeff' * volt_mV * volt_mV * freq_MHz power_a_uW = (100 * 600mW * 600mW * 500MHz) / 10^6 = 18000 power_a_mW = (100 * 600mW * 600mW * 500MHz) / 10^9 = 18 power_b_uW = (100 * 605mW * 605mW * 600MHz) / 10^6 = 21961 power_b_mW = (100 * 605mW * 605mW * 600MHz) / 10^9 = 21 max_freq = 2000MHz cost_a_mW = 18 * 2000MHz/500MHz = 72 cost_a_uW = 18000 * 2000MHz/500MHz = 72000 cost_b_mW = 21 * 2000MHz/600MHz = 70 // <- artificially better cost_b_uW = 21961 * 2000MHz/600MHz = 73203 The 'cost_b_mW' (which is based on old milli-Watts) is misleadingly better that the 'cost_b_uW' (this patch uses micro-Watts) and such would have impact on the 'inefficient OPPs' information in the Cpufreq framework. This patch set removes the rounding issue. Signed-off-by: Lukasz Luba Acked-by: Daniel Lezcano Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- kernel/power/energy_model.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index 6c373f2960e7..f82111837b8d 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -145,7 +145,7 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd, /* * The power returned by active_state() is expected to be - * positive and to fit into 16 bits. + * positive and be in range. 
*/ if (!power || power > EM_MAX_POWER) { dev_err(dev, "EM: invalid power: %lu\n", @@ -170,7 +170,7 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd, goto free_ps_table; } } else { - power_res = em_scale_power(table[i].power); + power_res = table[i].power; cost = div64_u64(fmax * power_res, table[i].frequency); } @@ -201,9 +201,17 @@ static int em_create_pd(struct device *dev, int nr_states, { struct em_perf_domain *pd; struct device *cpu_dev; - int cpu, ret; + int cpu, ret, num_cpus; if (_is_cpu_device(dev)) { + num_cpus = cpumask_weight(cpus); + + /* Prevent max possible energy calculation to not overflow */ + if (num_cpus > EM_MAX_NUM_CPUS) { + dev_err(dev, "EM: too many CPUs, overflow possible\n"); + return -EINVAL; + } + pd = kzalloc(sizeof(*pd) + cpumask_size(), GFP_KERNEL); if (!pd) return -ENOMEM; @@ -314,13 +322,13 @@ EXPORT_SYMBOL_GPL(em_cpu_get); * @cpus : Pointer to cpumask_t, which in case of a CPU device is * obligatory. It can be taken from i.e. 'policy->cpus'. For other * type of devices this should be set to NULL. - * @milliwatts : Flag indicating that the power values are in milliWatts or + * @microwatts : Flag indicating that the power values are in micro-Watts or * in some other scale. It must be set properly. * * Create Energy Model tables for a performance domain using the callbacks * defined in cb. * - * The @milliwatts is important to set with correct value. Some kernel + * The @microwatts is important to set with correct value. Some kernel * sub-systems might rely on this flag and check if all devices in the EM are * using the same scale. * @@ -331,7 +339,7 @@ EXPORT_SYMBOL_GPL(em_cpu_get); */ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, struct em_data_callback *cb, cpumask_t *cpus, - bool milliwatts) + bool microwatts) { unsigned long cap, prev_cap = 0; unsigned long flags = 0; @@ -381,8 +389,8 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, } } - if (milliwatts) - flags |= EM_PERF_DOMAIN_MILLIWATTS; + if (microwatts) + flags |= EM_PERF_DOMAIN_MICROWATTS; else if (cb->get_cost) flags |= EM_PERF_DOMAIN_ARTIFICIAL; -- cgit From fcfe0ac2fcfae7d5fcad3d0375cb8ff38caf8aba Mon Sep 17 00:00:00 2001 From: Micah Morton Date: Wed, 8 Jun 2022 20:57:11 +0000 Subject: security: Add LSM hook to setgroups() syscall Give the LSM framework the ability to filter setgroups() syscalls. There are already analagous hooks for the set*uid() and set*gid() syscalls. The SafeSetID LSM will use this new hook to ensure setgroups() calls are allowed by the installed security policy. Tested by putting print statement in security_task_fix_setgroups() hook and confirming that it gets hit when userspace does a setgroups() syscall. 
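A minimal sketch of how a security module might wire up the new hook (function and policy-check names are illustrative, not the actual SafeSetID implementation):

	static int example_task_fix_setgroups(struct cred *new,
					      const struct cred *old)
	{
		/* hypothetical policy check on the proposed group list */
		if (!example_policy_allows(old, new->group_info))
			return -EPERM;
		return 0;
	}

	static struct security_hook_list example_hooks[] __lsm_ro_after_init = {
		LSM_HOOK_INIT(task_fix_setgroups, example_task_fix_setgroups),
	};
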
Acked-by: Casey Schaufler Reviewed-by: Serge Hallyn Signed-off-by: Micah Morton --- kernel/groups.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'kernel') diff --git a/kernel/groups.c b/kernel/groups.c index 787b381c7c00..9aaed2a31073 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -134,13 +134,26 @@ EXPORT_SYMBOL(set_groups); int set_current_groups(struct group_info *group_info) { struct cred *new; + const struct cred *old; + int retval; new = prepare_creds(); if (!new) return -ENOMEM; + old = current_cred(); + set_groups(new, group_info); + + retval = security_task_fix_setgroups(new, old); + if (retval < 0) + goto error; + return commit_creds(new); + +error: + abort_creds(new); + return retval; } EXPORT_SYMBOL(set_current_groups); -- cgit From 3848636b4a88f0706f9ce48d532163244abadd43 Mon Sep 17 00:00:00 2001 From: Pu Lehui Date: Thu, 14 Jul 2022 10:46:12 +0800 Subject: bpf: iterators: Build and use lightweight bootstrap version of bpftool kernel/bpf/preload/iterators use bpftool for vmlinux.h, skeleton, and static linking only. So we can use lightweight bootstrap version of bpftool to handle these, and it will be faster. Suggested-by: Andrii Nakryiko Signed-off-by: Pu Lehui Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220714024612.944071-4-pulehui@huawei.com --- kernel/bpf/preload/iterators/Makefile | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/preload/iterators/Makefile b/kernel/bpf/preload/iterators/Makefile index bfe24f8c5a20..6762b1260f2f 100644 --- a/kernel/bpf/preload/iterators/Makefile +++ b/kernel/bpf/preload/iterators/Makefile @@ -9,7 +9,7 @@ LLVM_STRIP ?= llvm-strip TOOLS_PATH := $(abspath ../../../../tools) BPFTOOL_SRC := $(TOOLS_PATH)/bpf/bpftool BPFTOOL_OUTPUT := $(abs_out)/bpftool -DEFAULT_BPFTOOL := $(OUTPUT)/sbin/bpftool +DEFAULT_BPFTOOL := $(BPFTOOL_OUTPUT)/bootstrap/bpftool BPFTOOL ?= $(DEFAULT_BPFTOOL) LIBBPF_SRC := $(TOOLS_PATH)/lib/bpf @@ -61,9 +61,5 @@ $(BPFOBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(LIBBPF_OU OUTPUT=$(abspath $(dir $@))/ prefix= \ DESTDIR=$(LIBBPF_DESTDIR) $(abspath $@) install_headers -$(DEFAULT_BPFTOOL): $(BPFOBJ) | $(BPFTOOL_OUTPUT) - $(Q)$(MAKE) $(submake_extras) -C $(BPFTOOL_SRC) \ - OUTPUT=$(BPFTOOL_OUTPUT)/ \ - LIBBPF_OUTPUT=$(LIBBPF_OUTPUT)/ \ - LIBBPF_DESTDIR=$(LIBBPF_DESTDIR)/ \ - prefix= DESTDIR=$(abs_out)/ install-bin +$(DEFAULT_BPFTOOL): | $(BPFTOOL_OUTPUT) + $(Q)$(MAKE) $(submake_extras) -C $(BPFTOOL_SRC) OUTPUT=$(BPFTOOL_OUTPUT)/ bootstrap -- cgit From 020e3618cc81abf11fe6bffaac27861ff94707ce Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 15 Jul 2022 11:47:35 -0700 Subject: blktrace: Fix the blk_fill_rwbs() kernel-doc header Reflect recent changes in the blk_fill_rwbs() kernel-doc header. Reported-by: Stephen Rothwell Cc: Christoph Hellwig Cc: Steven Rostedt Cc: Li Zefan Cc: Chaitanya Kulkarni Cc: Stephen Rothwell Fixes: 919dbca8670d ("blktrace: Use the new blk_opf_t type") Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20220715184735.2326034-3-bvanassche@acm.org Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 150058f5daa9..7f5eb295fe19 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1873,11 +1873,11 @@ out: /** * blk_fill_rwbs - Fill the buffer rwbs by mapping op to character string. 
* @rwbs: buffer to be filled - * @op: REQ_OP_XXX for the tracepoint + * @opf: request operation type (REQ_OP_XXX) and flags for the tracepoint * * Description: - * Maps the REQ_OP_XXX to character and fills the buffer provided by the - * caller with resulting string. + * Maps each request operation and flag to a single character and fills the + * buffer provided by the caller with resulting string. * **/ void blk_fill_rwbs(char *rwbs, blk_opf_t opf) -- cgit From 868941b14441282ba08761b770fc6cad69d5bdb7 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Wed, 29 Jun 2022 15:07:00 +0200 Subject: fs: remove no_llseek Now that all callers of ->llseek are going through vfs_llseek(), we don't gain anything by keeping no_llseek around. Nothing actually calls it and setting ->llseek to no_lseek is completely equivalent to leaving it NULL. Longer term (== by the end of merge window) we want to remove all such intializations. To simplify the merge window this commit does *not* touch initializers - it only defines no_llseek as NULL (and simplifies the tests on file opening). At -rc1 we'll need do a mechanical removal of no_llseek - git grep -l -w no_llseek | grep -v porting.rst | while read i; do sed -i '/\/d' $i done would do it. Signed-off-by: Jason A. Donenfeld Signed-off-by: Al Viro --- kernel/bpf/bpf_iter.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index d5d96ceca105..8af0cbf9c0cd 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -81,10 +81,9 @@ static bool bpf_iter_support_resched(struct seq_file *seq) #define MAX_ITER_OBJECTS 1000000 /* bpf_seq_read, a customized and simpler version for bpf iterator. - * no_llseek is assumed for this file. * The following are differences from seq_read(): * . fixed buffer size (PAGE_SIZE) - * . assuming no_llseek + * . assuming NULL ->llseek() * . stop() may call bpf program, handling potential overflow there */ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size, -- cgit From 71f8c15565d0f3d2f5b3339845e05cf4f03725cd Mon Sep 17 00:00:00 2001 From: Stephen Brennan Date: Mon, 16 May 2022 17:05:07 -0700 Subject: kallsyms: move declarations to internal header Patch series "Expose kallsyms data in vmcoreinfo note". The kernel can be configured to contain a lot of introspection or debugging information built-in, such as ORC for unwinding stack traces, BTF for type information, and of course kallsyms. Debuggers could use this information to navigate a core dump or live system, but they need to be able to find it. This patch series adds the necessary symbols into vmcoreinfo, which would allow a debugger to find and interpret the kallsyms table. Using the kallsyms data, the debugger can then lookup any symbol, allowing it to find ORC, BTF, or any other useful data. This would allow a live kernel, or core dump, to be debugged without any DWARF debuginfo. This is useful for many cases: the debuginfo may not have been generated, or you may not want to deploy the large files everywhere you need them. I've demonstrated a proof of concept for this at LSF/MM+BPF during a lighting talk. Using a work-in-progress branch of the drgn debugger, and an extended set of BTF generated by a patched version of dwarves, I've been able to open a core dump without any DWARF info and do basic tasks such as enumerating slab caches, block devices, tasks, and doing backtraces. 
I hope this series can be a first step toward a new possibility of "DWARFless debugging". Related discussion around the BTF side of this: https://lore.kernel.org/bpf/586a6288-704a-f7a7-b256-e18a675927df@oracle.com/T/#u Some work-in-progress branches using this feature: https://github.com/brenns10/dwarves/tree/remove_percpu_restriction_1 https://github.com/brenns10/drgn/tree/kallsyms_plus_btf This patch (of 2): To include kallsyms data in the vmcoreinfo note, we must make the symbol declarations visible outside of kallsyms.c. Move these to a new internal header file. Link: https://lkml.kernel.org/r/20220517000508.777145-1-stephen.s.brennan@oracle.com Link: https://lkml.kernel.org/r/20220517000508.777145-2-stephen.s.brennan@oracle.com Signed-off-by: Stephen Brennan Acked-by: Baoquan He Cc: Nick Desaulniers Cc: Dave Young Cc: Kees Cook Cc: Jiri Olsa Cc: Stephen Boyd Cc: Bixuan Cui Cc: David Vernet Cc: Vivek Goyal Cc: Sami Tolvanen Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton --- kernel/kallsyms.c | 23 +---------------------- kernel/kallsyms_internal.h | 30 ++++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 22 deletions(-) create mode 100644 kernel/kallsyms_internal.h (limited to 'kernel') diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index fbdf8d3279ac..510fba0ba5b4 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -31,28 +31,7 @@ #include #include -/* - * These will be re-linked against their real values - * during the second link stage. - */ -extern const unsigned long kallsyms_addresses[] __weak; -extern const int kallsyms_offsets[] __weak; -extern const u8 kallsyms_names[] __weak; - -/* - * Tell the compiler that the count isn't in the small data section if the arch - * has one (eg: FRV). - */ -extern const unsigned int kallsyms_num_syms -__section(".rodata") __attribute__((weak)); - -extern const unsigned long kallsyms_relative_base -__section(".rodata") __attribute__((weak)); - -extern const char kallsyms_token_table[] __weak; -extern const u16 kallsyms_token_index[] __weak; - -extern const unsigned int kallsyms_markers[] __weak; +#include "kallsyms_internal.h" /* * Expand a compressed symbol data into the resulting uncompressed string, diff --git a/kernel/kallsyms_internal.h b/kernel/kallsyms_internal.h new file mode 100644 index 000000000000..2d0c6f2f0243 --- /dev/null +++ b/kernel/kallsyms_internal.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef LINUX_KALLSYMS_INTERNAL_H_ +#define LINUX_KALLSYMS_INTERNAL_H_ + +#include + +/* + * These will be re-linked against their real values + * during the second link stage. + */ +extern const unsigned long kallsyms_addresses[] __weak; +extern const int kallsyms_offsets[] __weak; +extern const u8 kallsyms_names[] __weak; + +/* + * Tell the compiler that the count isn't in the small data section if the arch + * has one (eg: FRV). 
+ */ +extern const unsigned int kallsyms_num_syms +__section(".rodata") __attribute__((weak)); + +extern const unsigned long kallsyms_relative_base +__section(".rodata") __attribute__((weak)); + +extern const char kallsyms_token_table[] __weak; +extern const u16 kallsyms_token_index[] __weak; + +extern const unsigned int kallsyms_markers[] __weak; + +#endif // LINUX_KALLSYMS_INTERNAL_H_ -- cgit From 5fd8fea935a1091083506d0b982fcc5d35062f06 Mon Sep 17 00:00:00 2001 From: Stephen Brennan Date: Mon, 16 May 2022 17:05:08 -0700 Subject: vmcoreinfo: include kallsyms symbols The internal kallsyms tables contain information which could be quite useful to a debugging tool in the absence of other debuginfo. If kallsyms is enabled, then a debugging tool could parse it and use it as a fallback symbol table. Combined with BTF data, live & post-mortem debuggers can support basic operations without needing a large DWARF debuginfo file available. As many as five symbols are necessary to properly parse kallsyms names and addresses. Add these to the vmcoreinfo note. CONFIG_KALLSYMS_ABSOLUTE_PERCPU does impact the computation of symbol addresses. However, a debugger can infer this configuration value by comparing the address of _stext in the vmcoreinfo with the address computed via kallsyms. So there's no need to include information about this config value in the vmcoreinfo note. To verify that we're still well below the maximum of 4096 bytes, I created a script[1] to compute a rough upper bound on the possible size of vmcoreinfo. On v5.18-rc7, the script reports 3106 bytes, and with this patch, the maximum become 3370 bytes. [1]: https://github.com/brenns10/kernel_stuff/blob/master/vmcoreinfosize/ Link: https://lkml.kernel.org/r/20220517000508.777145-3-stephen.s.brennan@oracle.com Signed-off-by: Stephen Brennan Acked-by: Baoquan He Cc: Bixuan Cui Cc: Dave Young Cc: David Vernet Cc: "Eric W. Biederman" Cc: Jiri Olsa Cc: Kees Cook Cc: Nick Desaulniers Cc: Sami Tolvanen Cc: Stephen Boyd Cc: Vivek Goyal Signed-off-by: Andrew Morton --- kernel/crash_core.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'kernel') diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 71122e01623c..f64d35e28411 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -15,6 +15,8 @@ #include +#include "kallsyms_internal.h" + /* vmcoreinfo stuff */ unsigned char *vmcoreinfo_data; size_t vmcoreinfo_size; @@ -480,6 +482,18 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_NUMBER(PAGE_OFFLINE_MAPCOUNT_VALUE); #endif +#ifdef CONFIG_KALLSYMS + VMCOREINFO_SYMBOL(kallsyms_names); + VMCOREINFO_SYMBOL(kallsyms_token_table); + VMCOREINFO_SYMBOL(kallsyms_token_index); +#ifdef CONFIG_KALLSYMS_BASE_RELATIVE + VMCOREINFO_SYMBOL(kallsyms_offsets); + VMCOREINFO_SYMBOL(kallsyms_relative_base); +#else + VMCOREINFO_SYMBOL(kallsyms_addresses); +#endif /* CONFIG_KALLSYMS_BASE_RELATIVE */ +#endif /* CONFIG_KALLSYMS */ + arch_crash_save_vmcoreinfo(); update_vmcoreinfo_note(); -- cgit From 46d36b1be18b745fc9f6be2087633ba2f9895ffe Mon Sep 17 00:00:00 2001 From: Tao Liu Date: Mon, 27 Jun 2022 15:44:41 +0800 Subject: kdump: round up the total memory size to 128M for crashkernel reservation The total memory size we get in kernel is usually slightly less than the actual memory size because BIOS/firmware will reserve some memory region. So it won't export all memory as usable. 
E.g, on my x86_64 kvm guest with 1G memory, the total_mem value shows: UEFI boot with ovmf: 0x3faef000 Legacy boot kvm guest: 0x3ff7ec00 When specifying crashkernel=1G-2G:128M, if we have a 1G memory machine, we get total size 1023M from firmware. Then it will not fall into 1G-2G, thus no memory reserved. User will never know this, it is hard to let user know the exact total value in kernel. One way is to use dmi/smbios to get physical memory size, but it's not reliable as well. According to Prarit hardware vendors sometimes screw this up. Thus round up total size to 128M to work around this problem. This patch is a resend of [1] and rebased onto v5.19-rc2, and the original credit goes to Dave Young. [1]: http://lists.infradead.org/pipermail/kexec/2018-April/020568.html Link: https://lkml.kernel.org/r/20220627074440.187222-1-ltao@redhat.com Signed-off-by: Tao Liu Acked-by: Baoquan He Cc: Dave Young Signed-off-by: Andrew Morton --- kernel/crash_core.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/crash_core.c b/kernel/crash_core.c index f64d35e28411..07b26df453a9 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -45,6 +46,15 @@ static int __init parse_crashkernel_mem(char *cmdline, unsigned long long *crash_base) { char *cur = cmdline, *tmp; + unsigned long long total_mem = system_ram; + + /* + * Firmware sometimes reserves some memory regions for its own use, + * so the system memory size is less than the actual physical memory + * size. Work around this by rounding up the total size to 128M, + * which is enough for most test cases. + */ + total_mem = roundup(total_mem, SZ_128M); /* for each entry of the comma-separated list */ do { @@ -89,13 +99,13 @@ static int __init parse_crashkernel_mem(char *cmdline, return -EINVAL; } cur = tmp; - if (size >= system_ram) { + if (size >= total_mem) { pr_warn("crashkernel: invalid size\n"); return -EINVAL; } /* match ? */ - if (system_ram >= start && system_ram < end) { + if (total_mem >= start && total_mem < end) { *crash_size = size; break; } -- cgit From 4a97739474c402e0a14cf6a432f1920262f6811c Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 15 Jul 2022 11:19:50 +0300 Subject: swiotlb: fix use after free on error handling path Don't dereference "mem" after it has been freed. Flip the two kfree()s around to address this bug. Fixes: 26ffb91fa5e0 ("swiotlb: split up the global swiotlb lock") Signed-off-by: Dan Carpenter Signed-off-by: Christoph Hellwig --- kernel/dma/swiotlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index dcf1459ce723..c50e6fe20f37 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -979,8 +979,8 @@ static int rmem_swiotlb_device_init(struct reserved_mem *rmem, mem->areas = kcalloc(nareas, sizeof(*mem->areas), GFP_KERNEL); if (!mem->areas) { - kfree(mem); kfree(mem->slots); + kfree(mem); return -ENOMEM; } -- cgit From 91561d4ecb755f056f8ff04f9dcaec210140e55c Mon Sep 17 00:00:00 2001 From: Chao Gao Date: Fri, 15 Jul 2022 18:45:33 +0800 Subject: swiotlb: remove unused fields in io_tlb_mem Commit 20347fca71a3 ("swiotlb: split up the global swiotlb lock") splits io_tlb_mem into multiple areas. Each area has its own lock and index. The global ones are not used so remove them. 
Signed-off-by: Chao Gao Signed-off-by: Christoph Hellwig --- kernel/dma/swiotlb.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index c50e6fe20f37..cbffa0b1ace5 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -253,14 +253,12 @@ static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start, mem->nslabs = nslabs; mem->start = start; mem->end = mem->start + bytes; - mem->index = 0; mem->late_alloc = late_alloc; mem->nareas = nareas; mem->area_nslabs = nslabs / mem->nareas; mem->force_bounce = swiotlb_force_bounce || (flags & SWIOTLB_FORCE); - spin_lock_init(&mem->lock); for (i = 0; i < mem->nareas; i++) { spin_lock_init(&mem->areas[i].lock); mem->areas[i].index = 0; -- cgit From 44335487bab05e06902f9184179857aae764bfe6 Mon Sep 17 00:00:00 2001 From: Chao Gao Date: Fri, 15 Jul 2022 18:45:34 +0800 Subject: swiotlb: consolidate rounding up default_nslabs default_nslabs are rounded up in two cases with exactly same comments. Add a simple wrapper to reduce duplicate code/comments. It is preparatory to adding more logics into the round-up. No functional change intended. Signed-off-by: Chao Gao Signed-off-by: Christoph Hellwig --- kernel/dma/swiotlb.c | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index cbffa0b1ace5..604f2469ac0e 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -88,6 +88,20 @@ struct io_tlb_area { spinlock_t lock; }; +/* + * Round up number of slabs to the next power of 2. The last area is going + * be smaller than the rest if default_nslabs is not power of two. + * + * Return true if default_nslabs is rounded up. + */ +static bool round_up_default_nslabs(void) +{ + if (!default_nareas || is_power_of_2(default_nslabs)) + return false; + default_nslabs = roundup_pow_of_two(default_nslabs); + return true; +} + static void swiotlb_adjust_nareas(unsigned int nareas) { if (!is_power_of_2(nareas)) @@ -96,16 +110,9 @@ static void swiotlb_adjust_nareas(unsigned int nareas) default_nareas = nareas; pr_info("area num %d.\n", nareas); - /* - * Round up number of slabs to the next power of 2. - * The last area is going be smaller than the rest if - * default_nslabs is not power of two. - */ - if (nareas && !is_power_of_2(default_nslabs)) { - default_nslabs = roundup_pow_of_two(default_nslabs); + if (round_up_default_nslabs()) pr_info("SWIOTLB bounce buffer size roundup to %luMB", (default_nslabs << IO_TLB_SHIFT) >> 20); - } } static int __init @@ -154,18 +161,10 @@ void __init swiotlb_adjust_size(unsigned long size) if (default_nslabs != IO_TLB_DEFAULT_SIZE >> IO_TLB_SHIFT) return; - /* - * Round up number of slabs to the next power of 2. - * The last area is going be smaller than the rest if - * default_nslabs is not power of two. - */ size = ALIGN(size, IO_TLB_SIZE); default_nslabs = ALIGN(size >> IO_TLB_SHIFT, IO_TLB_SEGSIZE); - if (default_nareas) { - default_nslabs = roundup_pow_of_two(default_nslabs); + if (round_up_default_nslabs()) size = default_nslabs << IO_TLB_SHIFT; - } - pr_info("SWIOTLB bounce buffer size adjusted to %luMB", size >> 20); } -- cgit From 57e6840cf79a4af84f44af3f8cfeacd8a14a6c6f Mon Sep 17 00:00:00 2001 From: Chao Gao Date: Fri, 15 Jul 2022 18:45:35 +0800 Subject: swiotlb: ensure a segment doesn't cross the area boundary Free slots tracking assumes that slots in a segment can be allocated to fulfill a request. 
This implies that slots in a segment should belong to the same area. Although the possibility of a violation is low, it is better to explicitly enforce segments won't span multiple areas by adjusting the number of slabs when configuring areas. Signed-off-by: Chao Gao Signed-off-by: Christoph Hellwig --- kernel/dma/swiotlb.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 604f2469ac0e..608923e8dab1 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -91,12 +91,21 @@ struct io_tlb_area { /* * Round up number of slabs to the next power of 2. The last area is going * be smaller than the rest if default_nslabs is not power of two. + * The number of slot in an area should be a multiple of IO_TLB_SEGSIZE, + * otherwise a segment may span two or more areas. It conflicts with free + * contiguous slots tracking: free slots are treated contiguous no matter + * whether they cross an area boundary. * * Return true if default_nslabs is rounded up. */ static bool round_up_default_nslabs(void) { - if (!default_nareas || is_power_of_2(default_nslabs)) + if (!default_nareas) + return false; + + if (default_nslabs < IO_TLB_SEGSIZE * default_nareas) + default_nslabs = IO_TLB_SEGSIZE * default_nareas; + else if (is_power_of_2(default_nslabs)) return false; default_nslabs = roundup_pow_of_two(default_nslabs); return true; -- cgit From 942a8186eb4451700dadd1d60a2e43ce67605991 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 12 Jul 2022 08:43:07 +0200 Subject: swiotlb: move struct io_tlb_slot to swiotlb.c No need to expose this structure definition in the header. Signed-off-by: Christoph Hellwig --- kernel/dma/swiotlb.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 608923e8dab1..39dee4004439 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -62,6 +62,12 @@ #define INVALID_PHYS_ADDR (~(phys_addr_t)0) +struct io_tlb_slot { + phys_addr_t orig_addr; + size_t alloc_size; + unsigned int list; +}; + static bool swiotlb_force_bounce; static bool swiotlb_force_disable; -- cgit From b8ac29b40183a6038919768b5d189c9bd91ce9b4 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Sun, 17 Jul 2022 23:53:34 +0200 Subject: timekeeping: contribute wall clock to rng on time change The rng's random_init() function contributes the real time to the rng at boot time, so that events can at least start in relation to something particular in the real world. But this clock might not yet be set that point in boot, so nothing is contributed. In addition, the relation between minor clock changes from, say, NTP, and the cycle counter is potentially useful entropic data. This commit addresses this by mixing in a time stamp on calls to settimeofday and adjtimex. No entropy is credited in doing so, so it doesn't make initialization faster, but it is still useful input to have. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Cc: stable@vger.kernel.org Reviewed-by: Thomas Gleixner Reviewed-by: Eric Biggers Signed-off-by: Jason A. 
Donenfeld --- kernel/time/timekeeping.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 8e4b3c32fcf9..f72b9f1de178 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -23,6 +23,7 @@ #include #include #include +#include #include "tick-internal.h" #include "ntp_internal.h" @@ -1343,8 +1344,10 @@ out: /* Signal hrtimers about time change */ clock_was_set(CLOCK_SET_WALL); - if (!ret) + if (!ret) { audit_tk_injoffset(ts_delta); + add_device_randomness(ts, sizeof(*ts)); + } return ret; } @@ -2430,6 +2433,7 @@ int do_adjtimex(struct __kernel_timex *txc) ret = timekeeping_validate_timex(txc); if (ret) return ret; + add_device_randomness(txc, sizeof(*txc)); if (txc->modes & ADJ_SETOFFSET) { struct timespec64 delta; @@ -2447,6 +2451,7 @@ int do_adjtimex(struct __kernel_timex *txc) audit_ntp_init(&ad); ktime_get_real_ts64(&ts); + add_device_randomness(&ts, sizeof(ts)); raw_spin_lock_irqsave(&timekeeper_lock, flags); write_seqcount_begin(&tk_core.seq); -- cgit From a229cc14f3395311b899e5e582b71efa8dd01df0 Mon Sep 17 00:00:00 2001 From: John Garry Date: Thu, 14 Jul 2022 19:15:24 +0800 Subject: dma-mapping: add dma_opt_mapping_size() Streaming DMA mapping involving an IOMMU may be much slower for larger total mapping size. This is because every IOMMU DMA mapping requires an IOVA to be allocated and freed. IOVA sizes above a certain limit are not cached, which can have a big impact on DMA mapping performance. Provide an API for device drivers to know this "optimal" limit, such that they may try to produce mapping which don't exceed it. Signed-off-by: John Garry Reviewed-by: Damien Le Moal Acked-by: Martin K. Petersen Signed-off-by: Christoph Hellwig --- kernel/dma/mapping.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index db7244291b74..1bfe11b1edb6 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -773,6 +773,18 @@ size_t dma_max_mapping_size(struct device *dev) } EXPORT_SYMBOL_GPL(dma_max_mapping_size); +size_t dma_opt_mapping_size(struct device *dev) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + size_t size = SIZE_MAX; + + if (ops && ops->opt_mapping_size) + size = ops->opt_mapping_size(); + + return min(dma_max_mapping_size(dev), size); +} +EXPORT_SYMBOL_GPL(dma_opt_mapping_size); + bool dma_need_sync(struct device *dev, dma_addr_t dma_addr) { const struct dma_map_ops *ops = get_dma_ops(dev); -- cgit From 3908fcddc65d04e069b03be49b33fae90e424631 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Thu, 14 Jul 2022 11:54:04 -0700 Subject: bpf: fix lsm_cgroup build errors on esoteric configs This particular ones is about having the following: CONFIG_BPF_LSM=y # CONFIG_CGROUP_BPF is not set Also, add __maybe_unused to the args for the !CONFIG_NET cases. 
Reported-by: kernel test robot Signed-off-by: Stanislav Fomichev Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20220714185404.3647772-1-sdf@google.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/bpf_lsm.c | 8 ++++++-- kernel/bpf/trampoline.c | 2 +- 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index d469b7f3deef..fa71d58b7ded 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -63,10 +63,11 @@ BTF_ID(func, bpf_lsm_socket_post_create) BTF_ID(func, bpf_lsm_socket_socketpair) BTF_SET_END(bpf_lsm_unlocked_sockopt_hooks) +#ifdef CONFIG_CGROUP_BPF void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog, bpf_func_t *bpf_func) { - const struct btf_param *args; + const struct btf_param *args __maybe_unused; if (btf_type_vlen(prog->aux->attach_func_proto) < 1 || btf_id_set_contains(&bpf_lsm_current_hooks, @@ -75,9 +76,9 @@ void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog, return; } +#ifdef CONFIG_NET args = btf_params(prog->aux->attach_func_proto); -#ifdef CONFIG_NET if (args[0].type == btf_sock_ids[BTF_SOCK_TYPE_SOCKET]) *bpf_func = __cgroup_bpf_run_lsm_socket; else if (args[0].type == btf_sock_ids[BTF_SOCK_TYPE_SOCK]) @@ -86,6 +87,7 @@ void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog, #endif *bpf_func = __cgroup_bpf_run_lsm_current; } +#endif int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, const struct bpf_prog *prog) @@ -219,6 +221,7 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_get_retval: return prog->expected_attach_type == BPF_LSM_CGROUP ? &bpf_get_retval_proto : NULL; +#ifdef CONFIG_NET case BPF_FUNC_setsockopt: if (prog->expected_attach_type != BPF_LSM_CGROUP) return NULL; @@ -239,6 +242,7 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) prog->aux->attach_btf_id)) return &bpf_unlocked_sk_getsockopt_proto; return NULL; +#endif default: return tracing_prog_func_proto(func_id, prog); } diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index fd69812412ca..6691dbf9e467 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -501,7 +501,7 @@ int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampolin return err; } -#if defined(CONFIG_BPF_JIT) && defined(CONFIG_BPF_SYSCALL) +#if defined(CONFIG_CGROUP_BPF) && defined(CONFIG_BPF_LSM) static void bpf_shim_tramp_link_release(struct bpf_link *link) { struct bpf_shim_tramp_link *shim_link = -- cgit From 87ac0d600943994444e24382a87aa19acc4cd3d4 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 14 Jul 2022 22:31:43 -0700 Subject: bpf: fix potential 32-bit overflow when accessing ARRAY map element If BPF array map is bigger than 4GB, element pointer calculation can overflow because both index and elem_size are u32. Fix this everywhere by forcing 64-bit multiplication. Extract this formula into separate small helper and use it consistently in various places. Speculative-preventing formula utilizing index_mask trick is left as is, but explicit u64 casts are added in both places. 
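As a quick illustration of the failure mode being fixed here, a standalone userspace sketch (not code from the patch; it assumes the usual 32-bit unsigned int):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t elem_size = 1U << 20;	/* 1 MiB per element */
	uint32_t index = 1U << 12;	/* element 4096: offset should be 4 GiB */

	uint64_t wrapped = elem_size * index;		/* 32-bit multiply, wraps to 0 */
	uint64_t widened = (uint64_t)elem_size * index;	/* 64-bit multiply, correct */

	printf("wrapped=%llu widened=%llu\n",
	       (unsigned long long)wrapped, (unsigned long long)widened);
	return 0;
}

The widened form is what the new array_map_elem_ptr() helper and the index_mask lookups in the diff below rely on: casting elem_size to u64 before the multiply keeps the element offset from wrapping once max_entries * elem_size crosses 4GB.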
Fixes: c85d69135a91 ("bpf: move memory size checks to bpf_map_charge_init()") Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20220715053146.1291891-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/arraymap.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index fe40d3b9458f..1d05d63e6fa5 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -156,6 +156,11 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) return &array->map; } +static void *array_map_elem_ptr(struct bpf_array* array, u32 index) +{ + return array->value + (u64)array->elem_size * index; +} + /* Called from syscall or from eBPF program */ static void *array_map_lookup_elem(struct bpf_map *map, void *key) { @@ -165,7 +170,7 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key) if (unlikely(index >= array->map.max_entries)) return NULL; - return array->value + array->elem_size * (index & array->index_mask); + return array->value + (u64)array->elem_size * (index & array->index_mask); } static int array_map_direct_value_addr(const struct bpf_map *map, u64 *imm, @@ -339,7 +344,7 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value, value, map->value_size); } else { val = array->value + - array->elem_size * (index & array->index_mask); + (u64)array->elem_size * (index & array->index_mask); if (map_flags & BPF_F_LOCK) copy_map_value_locked(map, val, value, false); else @@ -408,8 +413,7 @@ static void array_map_free_timers(struct bpf_map *map) return; for (i = 0; i < array->map.max_entries; i++) - bpf_timer_cancel_and_free(array->value + array->elem_size * i + - map->timer_off); + bpf_timer_cancel_and_free(array_map_elem_ptr(array, i) + map->timer_off); } /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ @@ -420,7 +424,7 @@ static void array_map_free(struct bpf_map *map) if (map_value_has_kptrs(map)) { for (i = 0; i < array->map.max_entries; i++) - bpf_map_free_kptrs(map, array->value + array->elem_size * i); + bpf_map_free_kptrs(map, array_map_elem_ptr(array, i)); bpf_map_free_kptr_off_tab(map); } @@ -556,7 +560,7 @@ static void *bpf_array_map_seq_start(struct seq_file *seq, loff_t *pos) index = info->index & array->index_mask; if (info->percpu_value_buf) return array->pptrs[index]; - return array->value + array->elem_size * index; + return array_map_elem_ptr(array, index); } static void *bpf_array_map_seq_next(struct seq_file *seq, void *v, loff_t *pos) @@ -575,7 +579,7 @@ static void *bpf_array_map_seq_next(struct seq_file *seq, void *v, loff_t *pos) index = info->index & array->index_mask; if (info->percpu_value_buf) return array->pptrs[index]; - return array->value + array->elem_size * index; + return array_map_elem_ptr(array, index); } static int __bpf_array_map_seq_show(struct seq_file *seq, void *v) @@ -690,7 +694,7 @@ static int bpf_for_each_array_elem(struct bpf_map *map, bpf_callback_t callback_ if (is_percpu) val = this_cpu_ptr(array->pptrs[i]); else - val = array->value + array->elem_size * i; + val = array_map_elem_ptr(array, i); num_elems++; key = i; ret = callback_fn((u64)(long)map, (u64)(long)&key, -- cgit From d937bc3449fa868cbeaf5c87576f9929b765c1e0 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 14 Jul 2022 22:31:44 -0700 Subject: bpf: make uniform use of array->elem_size everywhere in arraymap.c BPF_MAP_TYPE_ARRAY is rounding value_size to closest multiple of 8 
and stores that as array->elem_size for various memory allocations and accesses. But the code tends to re-calculate round_up(map->value_size, 8) in multiple places instead of using array->elem_size. Cleaning this up and making sure we always use array->size to avoid duplication of this (admittedly simple) logic for consistency. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20220715053146.1291891-3-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/arraymap.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 1d05d63e6fa5..98ee09155151 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -208,7 +208,7 @@ static int array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { struct bpf_array *array = container_of(map, struct bpf_array, map); struct bpf_insn *insn = insn_buf; - u32 elem_size = round_up(map->value_size, 8); + u32 elem_size = array->elem_size; const int ret = BPF_REG_0; const int map_ptr = BPF_REG_1; const int index = BPF_REG_2; @@ -277,7 +277,7 @@ int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value) * access 'value_size' of them, so copying rounded areas * will not leak any kernel data */ - size = round_up(map->value_size, 8); + size = array->elem_size; rcu_read_lock(); pptr = array->pptrs[index & array->index_mask]; for_each_possible_cpu(cpu) { @@ -381,7 +381,7 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value, * returned or zeros which were zero-filled by percpu_alloc, * so no kernel data leaks possible */ - size = round_up(map->value_size, 8); + size = array->elem_size; rcu_read_lock(); pptr = array->pptrs[index & array->index_mask]; for_each_possible_cpu(cpu) { @@ -587,6 +587,7 @@ static int __bpf_array_map_seq_show(struct seq_file *seq, void *v) struct bpf_iter_seq_array_map_info *info = seq->private; struct bpf_iter__bpf_map_elem ctx = {}; struct bpf_map *map = info->map; + struct bpf_array *array = container_of(map, struct bpf_array, map); struct bpf_iter_meta meta; struct bpf_prog *prog; int off = 0, cpu = 0; @@ -607,7 +608,7 @@ static int __bpf_array_map_seq_show(struct seq_file *seq, void *v) ctx.value = v; } else { pptr = v; - size = round_up(map->value_size, 8); + size = array->elem_size; for_each_possible_cpu(cpu) { bpf_long_memcpy(info->percpu_value_buf + off, per_cpu_ptr(pptr, cpu), @@ -637,11 +638,12 @@ static int bpf_iter_init_array_map(void *priv_data, { struct bpf_iter_seq_array_map_info *seq_info = priv_data; struct bpf_map *map = aux->map; + struct bpf_array *array = container_of(map, struct bpf_array, map); void *value_buf; u32 buf_size; if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { - buf_size = round_up(map->value_size, 8) * num_possible_cpus(); + buf_size = array->elem_size * num_possible_cpus(); value_buf = kmalloc(buf_size, GFP_USER | __GFP_NOWARN); if (!value_buf) return -ENOMEM; @@ -1326,7 +1328,7 @@ static int array_of_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { struct bpf_array *array = container_of(map, struct bpf_array, map); - u32 elem_size = round_up(map->value_size, 8); + u32 elem_size = array->elem_size; struct bpf_insn *insn = insn_buf; const int ret = BPF_REG_0; const int map_ptr = BPF_REG_1; -- cgit From 63b8ce77b15ebf69c4b0ef4b87451e2626aa3c43 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 14 Jul 2022 22:31:45 -0700 Subject: bpf: remove obsolete KMALLOC_MAX_SIZE restriction on array map value size 
Syscall-side map_lookup_elem() and map_update_elem() used to use kmalloc() to allocate temporary buffers of value_size, so KMALLOC_MAX_SIZE limit on value_size made sense to prevent creation of array map that won't be accessible through syscall interface. But this limitation since has been lifted by relying on kvmalloc() in syscall handling code. So remove KMALLOC_MAX_SIZE, which among other things means that it's possible to have BPF global variable sections (.bss, .data, .rodata) bigger than 8MB now. Keep the sanity check to prevent trivial overflows like round_up(map->value_size, 8) and restrict value size to <= INT_MAX (2GB). Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20220715053146.1291891-4-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/arraymap.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 98ee09155151..d3e734bf8056 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -70,10 +70,8 @@ int array_map_alloc_check(union bpf_attr *attr) attr->map_flags & BPF_F_PRESERVE_ELEMS) return -EINVAL; - if (attr->value_size > KMALLOC_MAX_SIZE) - /* if value_size is bigger, the user space won't be able to - * access the elements. - */ + /* avoid overflow on round_up(map->value_size) */ + if (attr->value_size > INT_MAX) return -E2BIG; return 0; -- cgit From fb77dccfc701b6ebcc232574c828bc69146cf90a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 12 Apr 2022 15:08:14 -0700 Subject: rcu: Decrease FQS scan wait time in case of callback overloading The force-quiesce-state loop function rcu_gp_fqs_loop() checks for callback overloading and does an immediate initial scan for idle CPUs if so. However, subsequent rescans will be carried out at as leisurely a rate as they always are, as specified by the rcutree.jiffies_till_next_fqs module parameter. It might be tempting to just continue immediately rescanning, but this turns the RCU grace-period kthread into a CPU hog. It might also be tempting to reduce the time between rescans to a single jiffy, but this can be problematic on larger systems. This commit therefore divides the normal time between rescans by three, rounding up. Thus a small system running at HZ=1000 that is suffering from callback overload will wait only one jiffy instead of the normal three between rescans. [ paulmck: Apply Neeraj Upadhyay feedback. ] Signed-off-by: Paul E. McKenney Reviewed-by: Neeraj Upadhyay --- kernel/rcu/tree.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index c25ba442044a..52094e72866e 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1983,7 +1983,12 @@ static noinline_for_stack void rcu_gp_fqs_loop(void) gf = RCU_GP_FLAG_OVLD; ret = 0; for (;;) { - if (!ret) { + if (rcu_state.cbovld) { + j = (j + 2) / 3; + if (j <= 0) + j = 1; + } + if (!ret || time_before(jiffies + j, rcu_state.jiffies_force_qs)) { WRITE_ONCE(rcu_state.jiffies_force_qs, jiffies + j); /* * jiffies_force_qs before RCU_GP_WAIT_FQS state -- cgit From 48f8070f5dd8e13148ae4647780a452d53c457a2 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Tue, 26 Apr 2022 18:45:02 +0800 Subject: rcu: Avoid tracing a few functions executed in stop machine Stop-machine recently started calling additional functions while waiting: ---------------------------------------------------------------- Former stop machine wait loop: do { cpu_relax(); => macro ... 
} while (curstate != STOPMACHINE_EXIT); ----------------------------------------------------------------- Current stop machine wait loop: do { stop_machine_yield(cpumask); => function (notraced) ... touch_nmi_watchdog(); => function (notraced, inside calls also notraced) ... rcu_momentary_dyntick_idle(); => function (notraced, inside calls traced) } while (curstate != MULTI_STOP_EXIT); ------------------------------------------------------------------ These functions (and the functions that they call) must be marked notrace to prevent them from being updated while they are executing. The consequences of failing to mark these functions can be severe: rcu: INFO: rcu_preempt detected stalls on CPUs/tasks: rcu: 1-...!: (0 ticks this GP) idle=14f/1/0x4000000000000000 softirq=3397/3397 fqs=0 rcu: 3-...!: (0 ticks this GP) idle=ee9/1/0x4000000000000000 softirq=5168/5168 fqs=0 (detected by 0, t=8137 jiffies, g=5889, q=2 ncpus=4) Task dump for CPU 1: task:migration/1 state:R running task stack: 0 pid: 19 ppid: 2 flags:0x00000000 Stopper: multi_cpu_stop+0x0/0x18c <- stop_machine_cpuslocked+0x128/0x174 Call Trace: Task dump for CPU 3: task:migration/3 state:R running task stack: 0 pid: 29 ppid: 2 flags:0x00000000 Stopper: multi_cpu_stop+0x0/0x18c <- stop_machine_cpuslocked+0x128/0x174 Call Trace: rcu: rcu_preempt kthread timer wakeup didn't happen for 8136 jiffies! g5889 f0x0 RCU_GP_WAIT_FQS(5) ->state=0x402 rcu: Possible timer handling issue on cpu=2 timer-softirq=594 rcu: rcu_preempt kthread starved for 8137 jiffies! g5889 f0x0 RCU_GP_WAIT_FQS(5) ->state=0x402 ->cpu=2 rcu: Unless rcu_preempt kthread gets sufficient CPU time, OOM is now expected behavior. rcu: RCU grace-period kthread stack dump: task:rcu_preempt state:I stack: 0 pid: 14 ppid: 2 flags:0x00000000 Call Trace: schedule+0x56/0xc2 schedule_timeout+0x82/0x184 rcu_gp_fqs_loop+0x19a/0x318 rcu_gp_kthread+0x11a/0x140 kthread+0xee/0x118 ret_from_exception+0x0/0x14 rcu: Stack dump where RCU GP kthread last ran: Task dump for CPU 2: task:migration/2 state:R running task stack: 0 pid: 24 ppid: 2 flags:0x00000000 Stopper: multi_cpu_stop+0x0/0x18c <- stop_machine_cpuslocked+0x128/0x174 Call Trace: This commit therefore marks these functions notrace: rcu_preempt_deferred_qs() rcu_preempt_need_deferred_qs() rcu_preempt_deferred_qs_irqrestore() [ paulmck: Apply feedback from Neeraj Upadhyay. ] Signed-off-by: Patrick Wang Acked-by: Steven Rostedt (Google) Signed-off-by: Paul E. McKenney Reviewed-by: Neeraj Upadhyay --- kernel/rcu/tree_plugin.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index c8ba0fe17267..7a07f2ca153e 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -460,7 +460,7 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp) * be quite short, for example, in the case of the call from * rcu_read_unlock_special(). */ -static void +static notrace void rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) { bool empty_exp; @@ -581,7 +581,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) * is disabled. This function cannot be expected to understand these * nuances, so the caller must handle them. 
*/ -static bool rcu_preempt_need_deferred_qs(struct task_struct *t) +static notrace bool rcu_preempt_need_deferred_qs(struct task_struct *t) { return (__this_cpu_read(rcu_data.cpu_no_qs.b.exp) || READ_ONCE(t->rcu_read_unlock_special.s)) && @@ -595,7 +595,7 @@ static bool rcu_preempt_need_deferred_qs(struct task_struct *t) * evaluate safety in terms of interrupt, softirq, and preemption * disabling. */ -static void rcu_preempt_deferred_qs(struct task_struct *t) +static notrace void rcu_preempt_deferred_qs(struct task_struct *t) { unsigned long flags; @@ -926,7 +926,7 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp) * Because there is no preemptible RCU, there can be no deferred quiescent * states. */ -static bool rcu_preempt_need_deferred_qs(struct task_struct *t) +static notrace bool rcu_preempt_need_deferred_qs(struct task_struct *t) { return false; } @@ -935,7 +935,7 @@ static bool rcu_preempt_need_deferred_qs(struct task_struct *t) // period for a quiescent state from this CPU. Note that requests from // tasks are handled when removing the task from the blocked-tasks list // below. -static void rcu_preempt_deferred_qs(struct task_struct *t) +static notrace void rcu_preempt_deferred_qs(struct task_struct *t) { struct rcu_data *rdp = this_cpu_ptr(&rcu_data); -- cgit From 52c1d81ee2911ef592048582c6d07975b7399726 Mon Sep 17 00:00:00 2001 From: Zqiang Date: Thu, 5 May 2022 23:52:36 +0800 Subject: rcu: Add rnp->cbovldmask check in rcutree_migrate_callbacks() Currently, the rcu_node structure's ->cbovlmask field is set in call_rcu() when a given CPU is suffering from callback overload. But if that CPU goes offline, the outgoing CPU's callbacks is migrated to the running CPU, which is likely to overload the running CPU. However, that CPU's bit in its leaf rcu_node structure's ->cbovlmask field remains zero. Initially, this is OK because the outgoing CPU's bit remains set. However, that bit will be cleared at the next end of a grace period, at which time it is quite possible that the running CPU will still be overloaded. If the running CPU invokes call_rcu(), then overload will be checked for and the bit will be set. Except that there is no guarantee that the running CPU will invoke call_rcu(), in which case the next grace period will fail to take the running CPU's overload condition into account. Plus, because the bit is not set, the end of the grace period won't check for overload on this CPU. This commit therefore adds a call to check_cb_ovld_locked() in rcutree_migrate_callbacks() to set the running CPU's ->cbovlmask bit appropriately. Signed-off-by: Zqiang Signed-off-by: Paul E. McKenney Reviewed-by: Neeraj Upadhyay --- kernel/rcu/tree.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 52094e72866e..7c62af481981 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4491,6 +4491,7 @@ void rcutree_migrate_callbacks(int cpu) needwake = needwake || rcu_advance_cbs(my_rnp, my_rdp); rcu_segcblist_disable(&rdp->cblist); WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) != !rcu_segcblist_n_cbs(&my_rdp->cblist)); + check_cb_ovld_locked(my_rdp, my_rnp); if (rcu_rdp_is_offloaded(my_rdp)) { raw_spin_unlock_rcu_node(my_rnp); /* irqs remain disabled. 
*/ __call_rcu_nocb_wake(my_rdp, true, flags); -- cgit From 70a82c3c55c8665d3996dcb9968adcf24d52bbc4 Mon Sep 17 00:00:00 2001 From: Zqiang Date: Fri, 13 May 2022 08:42:55 +0800 Subject: rcu: Immediately boost preempted readers for strict grace periods The intent of the CONFIG_RCU_STRICT_GRACE_PERIOD Kconfig option is to cause normal grace periods to complete quickly in order to better catch errors resulting from improperly leaking pointers from RCU read-side critical sections. However, kernels built with this option enabled still wait for some hundreds of milliseconds before boosting RCU readers that have been preempted within their current critical section. The value of this delay is set by the CONFIG_RCU_BOOST_DELAY Kconfig option, which defaults to 500 milliseconds. This commit therefore causes kernels built with strict grace periods to ignore CONFIG_RCU_BOOST_DELAY. This causes rcu_initiate_boost() to start boosting immediately after all CPUs on a given leaf rcu_node structure have passed through their quiescent states. Signed-off-by: Zqiang Signed-off-by: Paul E. McKenney Reviewed-by: Neeraj Upadhyay --- kernel/rcu/tree_plugin.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 7a07f2ca153e..4c92034e2670 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1140,7 +1140,8 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) (rnp->gp_tasks != NULL && rnp->boost_tasks == NULL && rnp->qsmask == 0 && - (!time_after(rnp->boost_time, jiffies) || rcu_state.cbovld))) { + (!time_after(rnp->boost_time, jiffies) || rcu_state.cbovld || + IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)))) { if (rnp->exp_tasks == NULL) WRITE_ONCE(rnp->boost_tasks, rnp->gp_tasks); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); -- cgit From b3ade95b8ee507d650d9e163abfdf645a9f3886d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 17 May 2022 18:03:36 -0700 Subject: rcu: Forbid RCU_STRICT_GRACE_PERIOD in TINY_RCU kernels The RCU_STRICT_GRACE_PERIOD Kconfig option does nothing in kernels built with CONFIG_TINY_RCU=y, so this commit adjusts the dependencies to disallow this combination. Signed-off-by: Paul E. McKenney Reviewed-by: Neeraj Upadhyay --- kernel/rcu/Kconfig.debug | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index 9b64e55d4f61..4da05beb13d7 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -121,7 +121,7 @@ config RCU_EQS_DEBUG config RCU_STRICT_GRACE_PERIOD bool "Provide debug RCU implementation with short grace periods" - depends on DEBUG_KERNEL && RCU_EXPERT && NR_CPUS <= 4 + depends on DEBUG_KERNEL && RCU_EXPERT && NR_CPUS <= 4 && !TINY_RCU default n select PREEMPT_COUNT if PREEMPT=n help -- cgit From 9c9b26b0df270d4f9246e483a44686fca951a29c Mon Sep 17 00:00:00 2001 From: Chen Zhongjin Date: Tue, 10 May 2022 17:46:39 +0800 Subject: locking/csd_lock: Change csdlock_debug from early_param to __setup The csdlock_debug kernel-boot parameter is parsed by the early_param() function csdlock_debug(). If set, csdlock_debug() invokes static_branch_enable() to enable the csd_lock_wait feature, which triggers a panic on arm64 for kernels built with CONFIG_SPARSEMEM=y and CONFIG_SPARSEMEM_VMEMMAP=n. 
With CONFIG_SPARSEMEM_VMEMMAP=n, __nr_to_section is called in static_key_enable() and returns NULL, resulting in a NULL dereference because mem_section is initialized only later in sparse_init(). This is also a problem for powerpc because early_param() functions are invoked earlier than jump_label_init(), also resulting in static_key_enable() failures. These failures cause the warning "static key 'xxx' used before call to jump_label_init()". Thus, early_param is too early for csdlock_debug() to run static_branch_enable(), so this commit changes it to __setup to fix these failures. Fixes: 8d0968cc6b8f ("locking/csd_lock: Add boot parameter for controlling CSD lock debugging") Cc: stable@vger.kernel.org Reported-by: Chen jingwen Signed-off-by: Chen Zhongjin Signed-off-by: Paul E. McKenney --- kernel/smp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index dd215f439426..650810a6f29b 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -174,9 +174,9 @@ static int __init csdlock_debug(char *str) if (val) static_branch_enable(&csdlock_debug_enabled); - return 0; + return 1; } -early_param("csdlock_debug", csdlock_debug); +__setup("csdlock_debug=", csdlock_debug); static DEFINE_PER_CPU(call_single_data_t *, cur_csd); static DEFINE_PER_CPU(smp_call_func_t, cur_csd_func); -- cgit From 800d6acf40e5ce676b53a1259fb4e93e56279367 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 27 May 2022 17:07:45 +0200 Subject: rcu: tiny: Record kvfree_call_rcu() call stack for KASAN When running KASAN with Tiny RCU (e.g. under ARCH=um, where a working KASAN patch is now available), we don't get any information on the original kfree_rcu() (or similar) caller when a problem is reported, as Tiny RCU doesn't record this. Add the recording, which required pulling kvfree_call_rcu() out of line for the KASAN case since the recording function (kasan_record_aux_stack_noalloc) is neither exported, nor can we include kasan.h into rcutiny.h. Without KASAN, the patch has no size impact (ARCH=um kernel): text data bss dec hex filename 6151515 4423154 33148520 43723189 29b29b5 linux 6151515 4423154 33148520 43723189 29b29b5 linux + patch. With KASAN, the impact on my build was minimal: text data bss dec hex filename 13915539 7388050 33282304 54585893 340ea25 linux 13911266 7392114 33282304 54585684 340e954 linux + patch -4273 +4064 +-0 -209 Acked-by: Dmitry Vyukov Signed-off-by: Johannes Berg Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tiny.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 340b3f8b090d..58ff3721d975 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -217,6 +217,20 @@ bool poll_state_synchronize_rcu(unsigned long oldstate) } EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu); +#ifdef CONFIG_KASAN_GENERIC +void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func) +{ + if (head) { + void *ptr = (void *) head - (unsigned long) func; + + kasan_record_aux_stack_noalloc(ptr); + } + + __kvfree_call_rcu(head, func); +} +EXPORT_SYMBOL_GPL(kvfree_call_rcu); +#endif + void __init rcu_init(void) { open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); -- cgit From e2bb1288a381e9239aaf606ae8c1e20ea71c20bd Mon Sep 17 00:00:00 2001 From: Zqiang Date: Thu, 26 May 2022 09:55:12 +0800 Subject: rcu: Cleanup RCU urgency state for offline CPU When a CPU is slow to provide a quiescent state for a given grace period, RCU takes steps to encourage that CPU to get with the quiescent-state program in a more timely fashion. These steps include these flags in the rcu_data structure: 1. ->rcu_urgent_qs, which causes the scheduling-clock interrupt to request an otherwise pointless context switch from the scheduler. 2. ->rcu_need_heavy_qs, which causes both cond_resched() and RCU's context-switch hook to do an immediate momentary quiscent state. 3. ->rcu_need_heavy_qs, which causes the scheduler-clock tick to be enabled even on nohz_full CPUs with only one runnable task. These flags are of course cleared once the corresponding CPU has passed through a quiescent state. Unless that quiescent state is the CPU going offline, which means that when the CPU comes back online, it will needlessly consume additional CPU time and incur additional latency, which constitutes a minor but very real performance bug. This commit therefore adds the call to rcu_disable_urgency_upon_qs() that clears these flags to the CPU-hotplug offlining code path. Signed-off-by: Zqiang Signed-off-by: Paul E. McKenney Reviewed-by: Neeraj Upadhyay --- kernel/rcu/tree.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 7c62af481981..9d9a2a657823 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4446,6 +4446,7 @@ void rcu_report_dead(unsigned int cpu) rdp->rcu_ofl_gp_flags = READ_ONCE(rcu_state.gp_flags); if (rnp->qsmask & mask) { /* RCU waiting on outgoing CPU? */ /* Report quiescent state -before- changing ->qsmaskinitnext! */ + rcu_disable_urgency_upon_qs(rdp); rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags); raw_spin_lock_irqsave_rcu_node(rnp, flags); } -- cgit From 82d26c36cc68e781400eb4e541f943008208f2d6 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Thu, 2 Jun 2022 10:06:43 +0200 Subject: rcu/kvfree: Remove useless monitor_todo flag monitor_todo is not needed as the work struct already tracks if work is pending. Just use that to know if work is pending using schedule_delayed_work() helper. Signed-off-by: Joel Fernandes (Google) Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Paul E. 
McKenney Reviewed-by: Neeraj Upadhyay --- kernel/rcu/tree.c | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 9d9a2a657823..6f4656aed962 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3216,7 +3216,6 @@ struct kfree_rcu_cpu_work { * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period * @lock: Synchronize access to this structure * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES - * @monitor_todo: Tracks whether a @monitor_work delayed work is pending * @initialized: The @rcu_work fields have been initialized * @count: Number of objects for which GP not started * @bkvcache: @@ -3241,7 +3240,6 @@ struct kfree_rcu_cpu { struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES]; raw_spinlock_t lock; struct delayed_work monitor_work; - bool monitor_todo; bool initialized; int count; @@ -3421,6 +3419,18 @@ static void kfree_rcu_work(struct work_struct *work) } } +static bool +need_offload_krc(struct kfree_rcu_cpu *krcp) +{ + int i; + + for (i = 0; i < FREE_N_CHANNELS; i++) + if (krcp->bkvhead[i]) + return true; + + return !!krcp->head; +} + /* * This function is invoked after the KFREE_DRAIN_JIFFIES timeout. */ @@ -3477,9 +3487,7 @@ static void kfree_rcu_monitor(struct work_struct *work) // of the channels that is still busy we should rearm the // work to repeat an attempt. Because previous batches are // still in progress. - if (!krcp->bkvhead[0] && !krcp->bkvhead[1] && !krcp->head) - krcp->monitor_todo = false; - else + if (need_offload_krc(krcp)) schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES); raw_spin_unlock_irqrestore(&krcp->lock, flags); @@ -3667,11 +3675,8 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func) WRITE_ONCE(krcp->count, krcp->count + 1); // Set timer to drain after KFREE_DRAIN_JIFFIES. - if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING && - !krcp->monitor_todo) { - krcp->monitor_todo = true; + if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING) schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES); - } unlock_return: krc_this_cpu_unlock(krcp, flags); @@ -3746,14 +3751,8 @@ void __init kfree_rcu_scheduler_running(void) struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); raw_spin_lock_irqsave(&krcp->lock, flags); - if ((!krcp->bkvhead[0] && !krcp->bkvhead[1] && !krcp->head) || - krcp->monitor_todo) { - raw_spin_unlock_irqrestore(&krcp->lock, flags); - continue; - } - krcp->monitor_todo = true; - schedule_delayed_work_on(cpu, &krcp->monitor_work, - KFREE_DRAIN_JIFFIES); + if (need_offload_krc(krcp)) + schedule_delayed_work_on(cpu, &krcp->monitor_work, KFREE_DRAIN_JIFFIES); raw_spin_unlock_irqrestore(&krcp->lock, flags); } } -- cgit From 9bdb5b3a8d8ad1c92db309219859fe1c87c95351 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 8 Jun 2022 09:34:10 -0700 Subject: rcu: Initialize first_gp_fqs at declaration in rcu_gp_fqs() This commit saves a line of code by initializing the rcu_gp_fqs() function's first_gp_fqs local variable in its declaration. Reported-by: Frederic Weisbecker Reported-by: Neeraj Upadhyay Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 6f4656aed962..a23877a773be 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1971,13 +1971,12 @@ static void rcu_gp_fqs(bool first_time) */ static noinline_for_stack void rcu_gp_fqs_loop(void) { - bool first_gp_fqs; + bool first_gp_fqs = true; int gf = 0; unsigned long j; int ret; struct rcu_node *rnp = rcu_get_root(); - first_gp_fqs = true; j = READ_ONCE(jiffies_till_first_fqs); if (rcu_state.cbovld) gf = RCU_GP_FLAG_OVLD; -- cgit From a03ae49c4785c1bc7b940e38bbdf2e63d79d1470 Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Thu, 9 Jun 2022 12:43:40 +0530 Subject: rcu/tree: Add comment to describe GP-done condition in fqs loop Add a comment to explain why !rcu_preempt_blocked_readers_cgp() condition is required on root rnp node, for GP completion check in rcu_gp_fqs_loop(). Reviewed-by: Joel Fernandes (Google) Signed-off-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a23877a773be..7c2231823e84 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2005,7 +2005,15 @@ static noinline_for_stack void rcu_gp_fqs_loop(void) rcu_gp_torture_wait(); WRITE_ONCE(rcu_state.gp_state, RCU_GP_DOING_FQS); /* Locking provides needed memory barriers. */ - /* If grace period done, leave loop. */ + /* + * Exit the loop if the root rcu_node structure indicates that the grace period + * has ended, leave the loop. The rcu_preempt_blocked_readers_cgp(rnp) check + * is required only for single-node rcu_node trees because readers blocking + * the current grace period are queued only on leaf rcu_node structures. + * For multi-node trees, checking the root node's ->qsmask suffices, because a + * given root node's ->qsmask bit is cleared only when all CPUs and tasks from + * the corresponding leaf nodes have passed through their quiescent state. + */ if (!READ_ONCE(rnp->qsmask) && !rcu_preempt_blocked_readers_cgp(rnp)) break; -- cgit From 1598f4a4762be0ea6a1bcd229c2c9ff1ebb212bb Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 19 Apr 2022 14:23:18 +0200 Subject: rcu/nocb: Add/del rdp to iterate from rcuog itself NOCB rdp's are part of a group whose list is iterated by the corresponding rdp leader. This list is RCU traversed because an rdp can be either added or deleted concurrently. Upon addition, a new iteration to the list after a synchronization point (a pair of LOCK/UNLOCK ->nocb_gp_lock) is forced to make sure: 1) we didn't miss a new element added in the middle of an iteration 2) we didn't ignore a whole subset of the list due to an element being quickly deleted and then re-added. 3) we prevent from probably other surprises... Although this layout is expected to be safe, it doesn't help anybody to sleep well. Simplify instead the nocb state toggling with moving the list modification from the nocb (de-)offloading workqueue to the rcuog kthreads instead. Whenever the rdp leader is expected to (re-)set the SEGCBLIST_KTHREAD_GP flag of a target rdp, the latter is queued so that the leader handles the flag flip along with adding or deleting the target rdp to the list to iterate. This way the list modification and iteration happen from the same kthread and those operations can't race altogether. 
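To make the ownership hand-off concrete, here is a self-contained userspace analogy of the pattern just described; it is a sketch with hypothetical names (owner, request_toggle, pending) built on pthreads, whereas the actual kernel code in the diff below uses ->nocb_gp_lock, ->nocb_toggling_rdp and the nocb_state_wq swait queue. The requester only records which node needs its state toggled and wakes the owner thread; the owner is the only thread that ever flips the flag or touches the list it iterates.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct node {
	struct node *next;
	int id;
	bool offloaded;
};

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t wake = PTHREAD_COND_INITIALIZER;
static struct node *pending;	/* node queued for a state toggle */
static struct node *head;	/* list walked and modified only by owner() */
static bool stop;

static void *owner(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock);
	for (;;) {
		while (!pending && !stop)
			pthread_cond_wait(&wake, &lock);
		if (pending) {
			struct node *n = pending;

			pending = NULL;
			n->offloaded = true;	/* flag flip... */
			n->next = head;		/* ...and list insertion, both done here */
			head = n;
		} else if (stop) {
			break;
		}
	}
	pthread_mutex_unlock(&lock);
	return NULL;
}

static void request_toggle(struct node *n)
{
	pthread_mutex_lock(&lock);
	pending = n;			/* hand the node over... */
	pthread_cond_signal(&wake);	/* ...and wake the owner */
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	pthread_t tid;
	struct node n = { .id = 1 };

	pthread_create(&tid, NULL, owner, NULL);
	request_toggle(&n);

	pthread_mutex_lock(&lock);
	stop = true;
	pthread_cond_signal(&wake);
	pthread_mutex_unlock(&lock);

	pthread_join(tid, NULL);
	printf("node %d offloaded=%d\n", n.id, (int)n.offloaded);
	return 0;
}

Because list mutation and iteration are serialized on one thread, no RCU list traversal or lockless flag check is needed, which is the property the patch below establishes for the rcuog kthread.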
As a bonus, the flags for each rdp don't need to be checked locklessly before each iteration, which is one less opportunity to produce nightmares. Signed-off-by: Frederic Weisbecker Cc: Neeraj Upadhyay Cc: Boqun Feng Cc: Uladzislau Rezki Cc: Joel Fernandes Cc: Zqiang Signed-off-by: Paul E. McKenney Reviewed-by: Neeraj Upadhyay --- kernel/rcu/tree.h | 1 + kernel/rcu/tree_nocb.h | 138 +++++++++++++++++++++++++------------------------ 2 files changed, 71 insertions(+), 68 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 2ccf5845957d..4f8532c33558 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -235,6 +235,7 @@ struct rcu_data { * if rdp_gp. */ struct list_head nocb_entry_rdp; /* rcu_data node in wakeup chain. */ + struct rcu_data *nocb_toggling_rdp; /* rdp queued for (de-)offloading */ /* The following fields are used by CB kthread, hence new cacheline. */ struct rcu_data *nocb_gp_rdp ____cacheline_internodealigned_in_smp; diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 46694e13398a..dac74952e1d1 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -546,52 +546,43 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone, } } -/* - * Check if we ignore this rdp. - * - * We check that without holding the nocb lock but - * we make sure not to miss a freshly offloaded rdp - * with the current ordering: - * - * rdp_offload_toggle() nocb_gp_enabled_cb() - * ------------------------- ---------------------------- - * WRITE flags LOCK nocb_gp_lock - * LOCK nocb_gp_lock READ/WRITE nocb_gp_sleep - * READ/WRITE nocb_gp_sleep UNLOCK nocb_gp_lock - * UNLOCK nocb_gp_lock READ flags - */ -static inline bool nocb_gp_enabled_cb(struct rcu_data *rdp) -{ - u8 flags = SEGCBLIST_OFFLOADED | SEGCBLIST_KTHREAD_GP; - - return rcu_segcblist_test_flags(&rdp->cblist, flags); -} - -static inline bool nocb_gp_update_state_deoffloading(struct rcu_data *rdp, - bool *needwake_state) +static int nocb_gp_toggle_rdp(struct rcu_data *rdp, + bool *wake_state) { struct rcu_segcblist *cblist = &rdp->cblist; + unsigned long flags; + int ret; - if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) { - if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) { - rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_GP); - if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) - *needwake_state = true; - } - return false; + rcu_nocb_lock_irqsave(rdp, flags); + if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED) && + !rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) { + /* + * Offloading. Set our flag and notify the offload worker. + * We will handle this rdp until it ever gets de-offloaded. + */ + rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_GP); + if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) + *wake_state = true; + ret = 1; + } else if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED) && + rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) { + /* + * De-offloading. Clear our flag and notify the de-offload worker. + * We will ignore this rdp until it ever gets re-offloaded. + */ + rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_GP); + if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) + *wake_state = true; + ret = 0; + } else { + WARN_ON_ONCE(1); + ret = -1; } - /* - * De-offloading. Clear our flag and notify the de-offload worker. - * We will ignore this rdp until it ever gets re-offloaded. 
- */ - WARN_ON_ONCE(!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)); - rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_GP); - if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) - *needwake_state = true; - return true; -} + rcu_nocb_unlock_irqrestore(rdp, flags); + return ret; +} /* * No-CBs GP kthreads come here to wait for additional callbacks to show up @@ -609,7 +600,7 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) bool needwait_gp = false; // This prevents actual uninitialized use. bool needwake; bool needwake_gp; - struct rcu_data *rdp; + struct rcu_data *rdp, *rdp_toggling = NULL; struct rcu_node *rnp; unsigned long wait_gp_seq = 0; // Suppress "use uninitialized" warning. bool wasempty = false; @@ -634,19 +625,10 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) * is added to the list, so the skipped-over rcu_data structures * won't be ignored for long. */ - list_for_each_entry_rcu(rdp, &my_rdp->nocb_head_rdp, nocb_entry_rdp, 1) { - bool needwake_state = false; - - if (!nocb_gp_enabled_cb(rdp)) - continue; + list_for_each_entry(rdp, &my_rdp->nocb_head_rdp, nocb_entry_rdp) { trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Check")); rcu_nocb_lock_irqsave(rdp, flags); - if (nocb_gp_update_state_deoffloading(rdp, &needwake_state)) { - rcu_nocb_unlock_irqrestore(rdp, flags); - if (needwake_state) - swake_up_one(&rdp->nocb_state_wq); - continue; - } + lockdep_assert_held(&rdp->nocb_lock); bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass); if (bypass_ncbs && (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + 1) || @@ -656,8 +638,6 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass); } else if (!bypass_ncbs && rcu_segcblist_empty(&rdp->cblist)) { rcu_nocb_unlock_irqrestore(rdp, flags); - if (needwake_state) - swake_up_one(&rdp->nocb_state_wq); continue; /* No callbacks here, try next. */ } if (bypass_ncbs) { @@ -705,8 +685,6 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) } if (needwake_gp) rcu_gp_kthread_wake(); - if (needwake_state) - swake_up_one(&rdp->nocb_state_wq); } my_rdp->nocb_gp_bypass = bypass; @@ -739,15 +717,49 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) !READ_ONCE(my_rdp->nocb_gp_sleep)); trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("EndWait")); } + if (!rcu_nocb_poll) { raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags); + // (De-)queue an rdp to/from the group if its nocb state is changing + rdp_toggling = my_rdp->nocb_toggling_rdp; + if (rdp_toggling) + my_rdp->nocb_toggling_rdp = NULL; + if (my_rdp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) { WRITE_ONCE(my_rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); del_timer(&my_rdp->nocb_timer); } WRITE_ONCE(my_rdp->nocb_gp_sleep, true); raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags); + } else { + rdp_toggling = READ_ONCE(my_rdp->nocb_toggling_rdp); + if (rdp_toggling) { + /* + * Paranoid locking to make sure nocb_toggling_rdp is well + * reset *before* we (re)set SEGCBLIST_KTHREAD_GP or we could + * race with another round of nocb toggling for this rdp. + * Nocb locking should prevent from that already but we stick + * to paranoia, especially in rare path. 
+ */ + raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags); + my_rdp->nocb_toggling_rdp = NULL; + raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags); + } + } + + if (rdp_toggling) { + bool wake_state = false; + int ret; + + ret = nocb_gp_toggle_rdp(rdp_toggling, &wake_state); + if (ret == 1) + list_add_tail(&rdp_toggling->nocb_entry_rdp, &my_rdp->nocb_head_rdp); + else if (ret == 0) + list_del(&rdp_toggling->nocb_entry_rdp); + if (wake_state) + swake_up_one(&rdp_toggling->nocb_state_wq); } + my_rdp->nocb_gp_seq = -1; WARN_ON(signal_pending(current)); } @@ -966,6 +978,8 @@ static int rdp_offload_toggle(struct rcu_data *rdp, swake_up_one(&rdp->nocb_cb_wq); raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags); + // Queue this rdp for add/del to/from the list to iterate on rcuog + WRITE_ONCE(rdp_gp->nocb_toggling_rdp, rdp); if (rdp_gp->nocb_gp_sleep) { rdp_gp->nocb_gp_sleep = false; wake_gp = true; @@ -1013,8 +1027,6 @@ static long rcu_nocb_rdp_deoffload(void *arg) swait_event_exclusive(rdp->nocb_state_wq, !rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP)); - /* Stop nocb_gp_wait() from iterating over this structure. */ - list_del_rcu(&rdp->nocb_entry_rdp); /* * Lock one last time to acquire latest callback updates from kthreads * so we can later handle callbacks locally without locking. @@ -1079,16 +1091,6 @@ static long rcu_nocb_rdp_offload(void *arg) pr_info("Offloading %d\n", rdp->cpu); - /* - * Cause future nocb_gp_wait() invocations to iterate over - * structure, resetting ->nocb_gp_sleep and waking up the related - * "rcuog". Since nocb_gp_wait() in turn locks ->nocb_gp_lock - * before setting ->nocb_gp_sleep again, we are guaranteed to - * iterate this newly added structure before "rcuog" goes to - * sleep again. - */ - list_add_tail_rcu(&rdp->nocb_entry_rdp, &rdp->nocb_gp_rdp->nocb_head_rdp); - /* * Can't use rcu_nocb_lock_irqsave() before SEGCBLIST_LOCKING * is set. -- cgit From 24a57affd242566fef2935f04f062350b8275187 Mon Sep 17 00:00:00 2001 From: Zqiang Date: Tue, 19 Apr 2022 14:23:19 +0200 Subject: rcu/nocb: Invert rcu_state.barrier_mutex VS hotplug lock locking order In case of failure to spawn either rcuog or rcuo[p] kthreads for a given rdp, rcu_nocb_rdp_deoffload() needs to be called with the hotplug lock and the barrier_mutex held. However cpus write lock is already held while calling rcutree_prepare_cpu(). It's not possible to call rcu_nocb_rdp_deoffload() from there with just locking the barrier_mutex or this would result in a locking inversion against rcu_nocb_cpu_deoffload() which holds both locks in the reverse order. Simply solve this with inverting the locking order inside rcu_nocb_cpu_[de]offload(). This will be a pre-requisite to toggle NOCB states toward cpusets anyway. Signed-off-by: Zqiang Cc: Neeraj Upadhyay Cc: Boqun Feng Cc: Uladzislau Rezki Cc: Joel Fernandes Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. 
McKenney Reviewed-by: Neeraj Upadhyay --- kernel/rcu/tree_nocb.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index dac74952e1d1..f2f2cab6285a 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -1055,8 +1055,8 @@ int rcu_nocb_cpu_deoffload(int cpu) struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); int ret = 0; - mutex_lock(&rcu_state.barrier_mutex); cpus_read_lock(); + mutex_lock(&rcu_state.barrier_mutex); if (rcu_rdp_is_offloaded(rdp)) { if (cpu_online(cpu)) { ret = work_on_cpu(cpu, rcu_nocb_rdp_deoffload, rdp); @@ -1067,8 +1067,8 @@ int rcu_nocb_cpu_deoffload(int cpu) ret = -EINVAL; } } - cpus_read_unlock(); mutex_unlock(&rcu_state.barrier_mutex); + cpus_read_unlock(); return ret; } @@ -1134,8 +1134,8 @@ int rcu_nocb_cpu_offload(int cpu) struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); int ret = 0; - mutex_lock(&rcu_state.barrier_mutex); cpus_read_lock(); + mutex_lock(&rcu_state.barrier_mutex); if (!rcu_rdp_is_offloaded(rdp)) { if (cpu_online(cpu)) { ret = work_on_cpu(cpu, rcu_nocb_rdp_offload, rdp); @@ -1146,8 +1146,8 @@ int rcu_nocb_cpu_offload(int cpu) ret = -EINVAL; } } - cpus_read_unlock(); mutex_unlock(&rcu_state.barrier_mutex); + cpus_read_unlock(); return ret; } -- cgit From 3a5761dc025da47960755ac64d9fbf1c32e8cd80 Mon Sep 17 00:00:00 2001 From: Zqiang Date: Tue, 19 Apr 2022 14:23:20 +0200 Subject: rcu/nocb: Fix NOCB kthreads spawn failure with rcu_nocb_rdp_deoffload() direct call If the rcuog/o[p] kthreads spawn failed, the offloaded rdp needs to be explicitly deoffloaded, otherwise the target rdp is still considered offloaded even though nothing actually handles the callbacks. Signed-off-by: Zqiang Cc: Neeraj Upadhyay Cc: Boqun Feng Cc: Uladzislau Rezki Cc: Joel Fernandes Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney Reviewed-by: Neeraj Upadhyay --- kernel/rcu/tree_nocb.h | 80 ++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 64 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index f2f2cab6285a..4cf9a29bba79 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -986,10 +986,7 @@ static int rdp_offload_toggle(struct rcu_data *rdp, } raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags); - if (wake_gp) - wake_up_process(rdp_gp->nocb_gp_kthread); - - return 0; + return wake_gp; } static long rcu_nocb_rdp_deoffload(void *arg) @@ -997,9 +994,15 @@ static long rcu_nocb_rdp_deoffload(void *arg) struct rcu_data *rdp = arg; struct rcu_segcblist *cblist = &rdp->cblist; unsigned long flags; - int ret; + int wake_gp; + struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; - WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id()); + /* + * rcu_nocb_rdp_deoffload() may be called directly if + * rcuog/o[p] spawn failed, because at this time the rdp->cpu + * is not online yet. 
+ */ + WARN_ON_ONCE((rdp->cpu != raw_smp_processor_id()) && cpu_online(rdp->cpu)); pr_info("De-offloading %d\n", rdp->cpu); @@ -1023,10 +1026,41 @@ static long rcu_nocb_rdp_deoffload(void *arg) */ rcu_segcblist_set_flags(cblist, SEGCBLIST_RCU_CORE); invoke_rcu_core(); - ret = rdp_offload_toggle(rdp, false, flags); - swait_event_exclusive(rdp->nocb_state_wq, - !rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB | - SEGCBLIST_KTHREAD_GP)); + wake_gp = rdp_offload_toggle(rdp, false, flags); + + mutex_lock(&rdp_gp->nocb_gp_kthread_mutex); + if (rdp_gp->nocb_gp_kthread) { + if (wake_gp) + wake_up_process(rdp_gp->nocb_gp_kthread); + + /* + * If rcuo[p] kthread spawn failed, directly remove SEGCBLIST_KTHREAD_CB. + * Just wait SEGCBLIST_KTHREAD_GP to be cleared by rcuog. + */ + if (!rdp->nocb_cb_kthread) { + rcu_nocb_lock_irqsave(rdp, flags); + rcu_segcblist_clear_flags(&rdp->cblist, SEGCBLIST_KTHREAD_CB); + rcu_nocb_unlock_irqrestore(rdp, flags); + } + + swait_event_exclusive(rdp->nocb_state_wq, + !rcu_segcblist_test_flags(cblist, + SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP)); + } else { + /* + * No kthread to clear the flags for us or remove the rdp from the nocb list + * to iterate. Do it here instead. Locking doesn't look stricly necessary + * but we stick to paranoia in this rare path. + */ + rcu_nocb_lock_irqsave(rdp, flags); + rcu_segcblist_clear_flags(&rdp->cblist, + SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP); + rcu_nocb_unlock_irqrestore(rdp, flags); + + list_del(&rdp->nocb_entry_rdp); + } + mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex); + /* * Lock one last time to acquire latest callback updates from kthreads * so we can later handle callbacks locally without locking. @@ -1047,7 +1081,7 @@ static long rcu_nocb_rdp_deoffload(void *arg) WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass)); - return ret; + return 0; } int rcu_nocb_cpu_deoffload(int cpu) @@ -1079,7 +1113,8 @@ static long rcu_nocb_rdp_offload(void *arg) struct rcu_data *rdp = arg; struct rcu_segcblist *cblist = &rdp->cblist; unsigned long flags; - int ret; + int wake_gp; + struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id()); /* @@ -1089,6 +1124,9 @@ static long rcu_nocb_rdp_offload(void *arg) if (!rdp->nocb_gp_rdp) return -EINVAL; + if (WARN_ON_ONCE(!rdp_gp->nocb_gp_kthread)) + return -EINVAL; + pr_info("Offloading %d\n", rdp->cpu); /* @@ -1113,7 +1151,9 @@ static long rcu_nocb_rdp_offload(void *arg) * WRITE flags READ callbacks * rcu_nocb_unlock() rcu_nocb_unlock() */ - ret = rdp_offload_toggle(rdp, true, flags); + wake_gp = rdp_offload_toggle(rdp, true, flags); + if (wake_gp) + wake_up_process(rdp_gp->nocb_gp_kthread); swait_event_exclusive(rdp->nocb_state_wq, rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB) && rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)); @@ -1126,7 +1166,7 @@ static long rcu_nocb_rdp_offload(void *arg) rcu_segcblist_clear_flags(cblist, SEGCBLIST_RCU_CORE); rcu_nocb_unlock_irqrestore(rdp, flags); - return ret; + return 0; } int rcu_nocb_cpu_offload(int cpu) @@ -1248,7 +1288,7 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu) "rcuog/%d", rdp_gp->cpu); if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo GP kthread, OOM is now expected behavior\n", __func__)) { mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex); - return; + goto end; } WRITE_ONCE(rdp_gp->nocb_gp_kthread, t); if (kthread_prio) @@ -1260,12 +1300,20 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu) t = kthread_run(rcu_nocb_cb_kthread, rdp, "rcuo%c/%d", rcu_state.abbr, cpu); if 
(WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo CB kthread, OOM is now expected behavior\n", __func__)) - return; + goto end; if (kthread_prio) sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); WRITE_ONCE(rdp->nocb_cb_kthread, t); WRITE_ONCE(rdp->nocb_gp_kthread, rdp_gp->nocb_gp_kthread); + return; +end: + mutex_lock(&rcu_state.barrier_mutex); + if (rcu_rdp_is_offloaded(rdp)) { + rcu_nocb_rdp_deoffload(rdp); + cpumask_clear_cpu(cpu, rcu_nocb_mask); + } + mutex_unlock(&rcu_state.barrier_mutex); } /* How many CB CPU IDs per GP kthread? Default of -1 for sqrt(nr_cpu_ids). */ -- cgit From b37a667c62421b34e96b05613457b9fb0ed66ea1 Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Fri, 22 Apr 2022 17:52:47 +0000 Subject: rcu/nocb: Add an option to offload all CPUs on boot Systems built with CONFIG_RCU_NOCB_CPU=y but booted without either the rcu_nocbs= or rcu_nohz_full= kernel-boot parameters will not have callback offloading on any of the CPUs, nor can any of the CPUs be switched to enable callback offloading at runtime. Although this is intentional, it would be nice to have a way to offload all the CPUs without having to make random bootloaders specify either the rcu_nocbs= or the rcu_nohz_full= kernel-boot parameters. This commit therefore provides a new CONFIG_RCU_NOCB_CPU_DEFAULT_ALL Kconfig option that switches the default so as to offload callback processing on all of the CPUs. This default can still be overridden using the rcu_nocbs= and rcu_nohz_full= kernel-boot parameters. Reviewed-by: Kalesh Singh Reviewed-by: Uladzislau Rezki (In v4.1, fixed issues with CONFIG maze reported by kernel test robot). Reported-by: kernel test robot Signed-off-by: Joel Fernandes Signed-off-by: Paul E. McKenney Reviewed-by: Neeraj Upadhyay --- kernel/rcu/Kconfig | 13 +++++++++++++ kernel/rcu/tree_nocb.h | 15 ++++++++++++++- 2 files changed, 27 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index 1c630e573548..27aab870ae4c 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -262,6 +262,19 @@ config RCU_NOCB_CPU Say Y here if you need reduced OS jitter, despite added overhead. Say N here if you are unsure. +config RCU_NOCB_CPU_DEFAULT_ALL + bool "Offload RCU callback processing from all CPUs by default" + depends on RCU_NOCB_CPU + default n + help + Use this option to offload callback processing from all CPUs + by default, in the absence of the rcu_nocbs or nohz_full boot + parameter. This also avoids the need to use any boot parameters + to achieve the effect of offloading all CPUs on boot. + + Say Y here if you want offload all CPUs by default on boot. + Say N here if you are unsure. 
+ config TASKS_TRACE_RCU_READ_MB bool "Tasks Trace RCU readers use memory barriers in user and idle" depends on RCU_EXPERT && TASKS_TRACE_RCU diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 4cf9a29bba79..60cc92cc6655 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -1197,11 +1197,21 @@ void __init rcu_init_nohz(void) { int cpu; bool need_rcu_nocb_mask = false; + bool offload_all = false; struct rcu_data *rdp; +#if defined(CONFIG_RCU_NOCB_CPU_DEFAULT_ALL) + if (!rcu_state.nocb_is_setup) { + need_rcu_nocb_mask = true; + offload_all = true; + } +#endif /* #if defined(CONFIG_RCU_NOCB_CPU_DEFAULT_ALL) */ + #if defined(CONFIG_NO_HZ_FULL) - if (tick_nohz_full_running && !cpumask_empty(tick_nohz_full_mask)) + if (tick_nohz_full_running && !cpumask_empty(tick_nohz_full_mask)) { need_rcu_nocb_mask = true; + offload_all = false; /* NO_HZ_FULL has its own mask. */ + } #endif /* #if defined(CONFIG_NO_HZ_FULL) */ if (need_rcu_nocb_mask) { @@ -1222,6 +1232,9 @@ void __init rcu_init_nohz(void) cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask); #endif /* #if defined(CONFIG_NO_HZ_FULL) */ + if (offload_all) + cpumask_setall(rcu_nocb_mask); + if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) { pr_info("\tNote: kernel parameter 'rcu_nocbs=', 'nohz_full', or 'isolcpus=' contains nonexistent CPUs.\n"); cpumask_and(rcu_nocb_mask, cpu_possible_mask, -- cgit From 5103850654fdc651f0a7076ac753b958f018bb85 Mon Sep 17 00:00:00 2001 From: Zqiang Date: Fri, 29 Apr 2022 20:42:22 +0800 Subject: rcu: Add nocb_cb_kthread check to rcu_is_callbacks_kthread() Callbacks are invoked in RCU kthreads when calbacks are offloaded (rcu_nocbs boot parameter) or when RCU's softirq handler has been offloaded to rcuc kthreads (use_softirq==0). The current code allows for the rcu_nocbs case but not the use_softirq case. This commit adds support for the use_softirq case. Reported-by: kernel test robot Signed-off-by: Zqiang Signed-off-by: Paul E. McKenney Reviewed-by: Neeraj Upadhyay --- kernel/rcu/tree.c | 4 ++-- kernel/rcu/tree.h | 2 +- kernel/rcu/tree_plugin.h | 33 +++++++++++++++++++-------------- 3 files changed, 22 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index c25ba442044a..74455671e6cf 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2530,7 +2530,7 @@ static void rcu_do_batch(struct rcu_data *rdp) trace_rcu_batch_end(rcu_state.name, 0, !rcu_segcblist_empty(&rdp->cblist), need_resched(), is_idle_task(current), - rcu_is_callbacks_kthread()); + rcu_is_callbacks_kthread(rdp)); return; } @@ -2608,7 +2608,7 @@ static void rcu_do_batch(struct rcu_data *rdp) rcu_nocb_lock_irqsave(rdp, flags); rdp->n_cbs_invoked += count; trace_rcu_batch_end(rcu_state.name, count, !!rcl.head, need_resched(), - is_idle_task(current), rcu_is_callbacks_kthread()); + is_idle_task(current), rcu_is_callbacks_kthread(rdp)); /* Update counts and requeue any remaining callbacks. 
*/ rcu_segcblist_insert_done_cbs(&rdp->cblist, &rcl); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 4f8532c33558..649ad4f0129b 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -426,7 +426,7 @@ static void rcu_flavor_sched_clock_irq(int user); static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck); static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); -static bool rcu_is_callbacks_kthread(void); +static bool rcu_is_callbacks_kthread(struct rcu_data *rdp); static void rcu_cpu_kthread_setup(unsigned int cpu); static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp); static bool rcu_preempt_has_tasks(struct rcu_node *rnp); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index c8ba0fe17267..0483e1338c41 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1012,6 +1012,25 @@ static void rcu_cpu_kthread_setup(unsigned int cpu) WRITE_ONCE(rdp->rcuc_activity, jiffies); } +static bool rcu_is_callbacks_nocb_kthread(struct rcu_data *rdp) +{ +#ifdef CONFIG_RCU_NOCB_CPU + return rdp->nocb_cb_kthread == current; +#else + return false; +#endif +} + +/* + * Is the current CPU running the RCU-callbacks kthread? + * Caller must have preemption disabled. + */ +static bool rcu_is_callbacks_kthread(struct rcu_data *rdp) +{ + return rdp->rcu_cpu_kthread_task == current || + rcu_is_callbacks_nocb_kthread(rdp); +} + #ifdef CONFIG_RCU_BOOST /* @@ -1151,15 +1170,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) } } -/* - * Is the current CPU running the RCU-callbacks kthread? - * Caller must have preemption disabled. - */ -static bool rcu_is_callbacks_kthread(void) -{ - return __this_cpu_read(rcu_data.rcu_cpu_kthread_task) == current; -} - #define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000) /* @@ -1242,11 +1252,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } -static bool rcu_is_callbacks_kthread(void) -{ - return false; -} - static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) { } -- cgit From 8f489b4da5278fc6e5fc8f0029ae7fb51c060215 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Wed, 11 May 2022 10:57:03 +0200 Subject: rcu/nocb: Add option to opt rcuo kthreads out of RT priority This commit introduces a RCU_NOCB_CPU_CB_BOOST Kconfig option that prevents rcuo kthreads from running at real-time priority, even in kernels built with RCU_BOOST. This capability is important to devices needing low-latency (as in a few milliseconds) response from expedited RCU grace periods, but which are not running a classic real-time workload. On such devices, permitting the rcuo kthreads to run at real-time priority results in unacceptable latencies imposed on the application tasks, which run as SCHED_OTHER. See for example the following trace output: <...>-60 [006] d..1 2979.028717: rcu_batch_start: rcu_preempt CBs=34619 bl=270 If that rcuop kthread were permitted to run at real-time SCHED_FIFO priority, it would monopolize its CPU for hundreds of milliseconds while invoking those 34619 RCU callback functions, which would cause an unacceptably long latency spike for many application stacks on Android platforms. However, some existing real-time workloads require that callback invocation run at SCHED_FIFO priority, for example, those running on systems with heavy SCHED_OTHER background loads. 
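As a rough sketch of how that trade-off is wired up, the spawn path gates the priority bump on the new option (condensed from the tree_nocb.h hunk further down; the surrounding kthread-spawn context is assumed rather than shown):

	struct sched_param sp = { .sched_priority = kthread_prio };

	/* Elevate the rcuo CB kthread to SCHED_FIFO only when the new
	 * Kconfig option asks for it; otherwise it stays SCHED_OTHER,
	 * even on RCU_BOOST kernels with kthread_prio set. */
	if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_CB_BOOST) && kthread_prio)
		sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);

Here t is the task_struct of the freshly created rcuo CB kthread.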
(It is the real-time system's administrator's responsibility to make sure that important real-time tasks run at a higher priority than do RCU's kthreads.) Therefore, this new RCU_NOCB_CPU_CB_BOOST Kconfig option defaults to "y" on kernels built with PREEMPT_RT and defaults to "n" otherwise. The effect is to preserve current behavior for real-time systems, but for other systems to allow expedited RCU grace periods to run with real-time priority while continuing to invoke RCU callbacks as SCHED_OTHER. As you would expect, this RCU_NOCB_CPU_CB_BOOST Kconfig option has no effect except on CPUs with offloaded RCU callbacks. Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Paul E. McKenney Acked-by: Joel Fernandes (Google) Reviewed-by: Neeraj Upadhyay --- kernel/rcu/Kconfig | 16 ++++++++++++++++ kernel/rcu/tree.c | 6 +++++- kernel/rcu/tree_nocb.h | 3 ++- 3 files changed, 23 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index 27aab870ae4c..c05ca52cdf64 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -275,6 +275,22 @@ config RCU_NOCB_CPU_DEFAULT_ALL Say Y here if you want offload all CPUs by default on boot. Say N here if you are unsure. +config RCU_NOCB_CPU_CB_BOOST + bool "Offload RCU callback from real-time kthread" + depends on RCU_NOCB_CPU && RCU_BOOST + default y if PREEMPT_RT + help + Use this option to invoke offloaded callbacks as SCHED_FIFO + to avoid starvation by heavy SCHED_OTHER background load. + Of course, running as SCHED_FIFO during callback floods will + cause the rcuo[ps] kthreads to monopolize the CPU for hundreds + of milliseconds or more. Therefore, when enabling this option, + it is your responsibility to ensure that latency-sensitive + tasks either run with higher priority or run on some other CPU. + + Say Y here if you want to set RT priority for offloading kthreads. + Say N here if you are building a !PREEMPT_RT kernel and are unsure. + config TASKS_TRACE_RCU_READ_MB bool "Tasks Trace RCU readers use memory barriers in user and idle" depends on RCU_EXPERT && TASKS_TRACE_RCU diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 74455671e6cf..3b9f45ebb499 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -154,7 +154,11 @@ static void sync_sched_exp_online_cleanup(int cpu); static void check_cb_ovld_locked(struct rcu_data *rdp, struct rcu_node *rnp); static bool rcu_rdp_is_offloaded(struct rcu_data *rdp); -/* rcuc/rcub/rcuop kthread realtime priority */ +/* + * rcuc/rcub/rcuop kthread realtime priority. The "rcuop" + * real-time priority(enabling/disabling) is controlled by + * the extra CONFIG_RCU_NOCB_CPU_CB_BOOST configuration. + */ static int kthread_prio = IS_ENABLED(CONFIG_RCU_BOOST) ? 
1 : 0; module_param(kthread_prio, int, 0444); diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 60cc92cc6655..fa8e4f82e60c 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -1315,8 +1315,9 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu) if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo CB kthread, OOM is now expected behavior\n", __func__)) goto end; - if (kthread_prio) + if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_CB_BOOST) && kthread_prio) sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); + WRITE_ONCE(rdp->nocb_cb_kthread, t); WRITE_ONCE(rdp->nocb_gp_kthread, rdp_gp->nocb_gp_kthread); return; -- cgit From 0578e14c945b1739e15c0e993280151fa5b99ca2 Mon Sep 17 00:00:00 2001 From: Zqiang Date: Sat, 11 Jun 2022 19:00:44 +0800 Subject: rcu/nocb: Avoid polling when my_rdp->nocb_head_rdp list is empty Currently, if the 'rcu_nocb_poll' kernel boot parameter is enabled, all rcuog kthreads enter polling mode. However, if all of a given group of rcuo kthreads correspond to CPUs that have been de-offloaded, the corresponding rcuog kthread will nonetheless still wake up periodically, unnecessarily consuming power and perturbing workloads. Fortunately, this situation is easily detected by the fact that the rcuog kthread's CPU's rcu_data structure's ->nocb_head_rdp list is empty. This commit saves power and avoids unnecessarily perturbing workloads by putting an rcuog kthread to sleep during any time period when all of its rcuo kthreads' CPUs are de-offloaded. Co-developed-by: Frederic Weisbecker Signed-off-by: Frederic Weisbecker Signed-off-by: Zqiang Signed-off-by: Paul E. McKenney Reviewed-by: Neeraj Upadhyay --- kernel/rcu/tree_nocb.h | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index fa8e4f82e60c..a8f574d8850d 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -584,6 +584,14 @@ static int nocb_gp_toggle_rdp(struct rcu_data *rdp, return ret; } +static void nocb_gp_sleep(struct rcu_data *my_rdp, int cpu) +{ + trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Sleep")); + swait_event_interruptible_exclusive(my_rdp->nocb_gp_wq, + !READ_ONCE(my_rdp->nocb_gp_sleep)); + trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("EndSleep")); +} + /* * No-CBs GP kthreads come here to wait for additional callbacks to show up * or for grace periods to end. @@ -701,13 +709,19 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) /* Polling, so trace if first poll in the series. */ if (gotcbs) trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Poll")); - schedule_timeout_idle(1); + if (list_empty(&my_rdp->nocb_head_rdp)) { + raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags); + if (!my_rdp->nocb_toggling_rdp) + WRITE_ONCE(my_rdp->nocb_gp_sleep, true); + raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags); + /* Wait for any offloading rdp */ + nocb_gp_sleep(my_rdp, cpu); + } else { + schedule_timeout_idle(1); + } } else if (!needwait_gp) { /* Wait for callbacks to appear. */ - trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Sleep")); - swait_event_interruptible_exclusive(my_rdp->nocb_gp_wq, - !READ_ONCE(my_rdp->nocb_gp_sleep)); - trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("EndSleep")); + nocb_gp_sleep(my_rdp, cpu); } else { rnp = my_rdp->mynode; trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("StartWait")); -- cgit From 554694ba120b87e39cf732ed632e6a0c52fafb7c Mon Sep 17 00:00:00 2001 From: "Fabio M. 
De Francesco" Date: Wed, 20 Jul 2022 18:19:32 +0200 Subject: module: Replace kmap() with kmap_local_page() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit kmap() is being deprecated in favor of kmap_local_page(). Two main problems with kmap(): (1) It comes with an overhead as mapping space is restricted and protected by a global lock for synchronization and (2) it also requires global TLB invalidation when the kmap’s pool wraps and it might block when the mapping space is fully utilized until a slot becomes available. With kmap_local_page() the mappings are per thread, CPU local, can take page faults, and can be called from any context (including interrupts). Tasks can be preempted and, when scheduled to run again, the kernel virtual addresses are restored and still valid. kmap_local_page() is faster than kmap() in kernels with HIGHMEM enabled. Since the use of kmap_local_page() in module_gzip_decompress() and in module_xz_decompress() is safe (i.e., it does not break the strict rules of use), it should be preferred over kmap(). Therefore, replace kmap() with kmap_local_page(). Tested on a QEMU/KVM x86_32 VM with 4GB RAM, booting kernels with HIGHMEM64GB enabled. Modules compressed with XZ or GZIP decompress properly. Cc: Matthew Wilcox Suggested-by: Ira Weiny Signed-off-by: Fabio M. De Francesco Signed-off-by: Luis Chamberlain --- kernel/module/decompress.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/module/decompress.c b/kernel/module/decompress.c index 2fc7081dd7c1..4d0bcb3d9e44 100644 --- a/kernel/module/decompress.c +++ b/kernel/module/decompress.c @@ -119,10 +119,10 @@ static ssize_t module_gzip_decompress(struct load_info *info, goto out_inflate_end; } - s.next_out = kmap(page); + s.next_out = kmap_local_page(page); s.avail_out = PAGE_SIZE; rc = zlib_inflate(&s, 0); - kunmap(page); + kunmap_local(s.next_out); new_size += PAGE_SIZE - s.avail_out; } while (rc == Z_OK); @@ -178,11 +178,11 @@ static ssize_t module_xz_decompress(struct load_info *info, goto out; } - xz_buf.out = kmap(page); + xz_buf.out = kmap_local_page(page); xz_buf.out_pos = 0; xz_buf.out_size = PAGE_SIZE; xz_ret = xz_dec_run(xz_dec, &xz_buf); - kunmap(page); + kunmap_local(xz_buf.out); new_size += xz_buf.out_pos; } while (xz_buf.out_pos == PAGE_SIZE && xz_ret == XZ_OK); -- cgit From aef9d4a34a51f0a50b4cc04c635955b37972fc90 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Wed, 20 Jul 2022 09:47:29 -0700 Subject: bpf: Check attach_func_proto more carefully in check_helper_call Syzkaller found a problem similar to d1a6edecc1fd ("bpf: Check attach_func_proto more carefully in check_return_code") where attach_func_proto might be NULL: RIP: 0010:check_helper_call+0x3dcb/0x8d50 kernel/bpf/verifier.c:7330 do_check kernel/bpf/verifier.c:12302 [inline] do_check_common+0x6e1e/0xb980 kernel/bpf/verifier.c:14610 do_check_main kernel/bpf/verifier.c:14673 [inline] bpf_check+0x661e/0xc520 kernel/bpf/verifier.c:15243 bpf_prog_load+0x11ae/0x1f80 kernel/bpf/syscall.c:2620 With the following reproducer: bpf$BPF_PROG_RAW_TRACEPOINT_LOAD(0x5, &(0x7f0000000780)={0xf, 0x4, &(0x7f0000000040)=@framed={{}, [@call={0x85, 0x0, 0x0, 0xbb}]}, &(0x7f0000000000)='GPL\x00', 0x0, 0x0, 0x0, 0x0, 0x0, '\x00', 0x0, 0x2b, 0xffffffffffffffff, 0x8, 0x0, 0x0, 0x10, 0x0}, 0x80) Let's do the same here, only check attach_func_proto for the prog types where we are certain that attach_func_proto is defined. 
Fixes: 69fd337a975c ("bpf: per-cgroup lsm flavor") Reported-by: syzbot+0f8d989b1fba1addc5e0@syzkaller.appspotmail.com Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20220720164729.147544-1-sdf@google.com --- kernel/bpf/verifier.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c59c3df0fea6..7c1e056624f9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7170,6 +7170,7 @@ static void update_loop_inline_state(struct bpf_verifier_env *env, u32 subprogno static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx_p) { + enum bpf_prog_type prog_type = resolve_prog_type(env->prog); const struct bpf_func_proto *fn = NULL; enum bpf_return_type ret_type; enum bpf_type_flag ret_flag; @@ -7331,7 +7332,8 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn } break; case BPF_FUNC_set_retval: - if (env->prog->expected_attach_type == BPF_LSM_CGROUP) { + if (prog_type == BPF_PROG_TYPE_LSM && + env->prog->expected_attach_type == BPF_LSM_CGROUP) { if (!env->prog->aux->attach_func_proto->type) { /* Make sure programs that attach to void * hooks don't try to modify return value. -- cgit From 974854ab0728532600c72e41a44d6ce1cf8f20a4 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 12 Jul 2022 18:37:54 -0700 Subject: cxl/acpi: Track CXL resources in iomem_resource Recall that CXL capable address ranges, on ACPI platforms, are published in the CEDT.CFMWS (CXL Early Discovery Table: CXL Fixed Memory Window Structures). These windows represent both the actively mapped capacity and the potential address space that can be dynamically assigned to a new CXL decode configuration (region / interleave-set). CXL endpoints like DDR DIMMs can be mapped at any physical address including 0 and legacy ranges. There is an expectation and requirement that the /proc/iomem interface and the iomem_resource tree in the kernel reflect the full set of platform address ranges. I.e. that every address range that platform firmware and bus drivers enumerate be reflected as an iomem_resource entry. The hard requirement to do this for CXL arises from the fact that facilities like CONFIG_DEVICE_PRIVATE expect to be able to treat empty iomem_resource ranges as free for software to use as proxy address space. Without CXL publishing its potential address ranges in iomem_resource, the CONFIG_DEVICE_PRIVATE mechanism may inadvertently steal capacity reserved for runtime provisioning of new CXL regions. So, iomem_resource needs to know about both active and potential CXL resource ranges. The active CXL resources might already be reflected in iomem_resource as "System RAM". insert_resource_expand_to_fit() handles re-parenting "System RAM" underneath a CXL window. The "_expand_to_fit()" behavior handles cases where a CXL window is not a strict superset of an existing entry in the iomem_resource tree. The "_expand_to_fit()" behavior is acceptable from the perspective of resource allocation. The expansion happens because a conflicting resource range is already populated, which means the resource boundary expansion does not result in any additional free CXL address space being made available. CXL address space allocation is always bounded by the orginal unexpanded address range. However, the potential for expansion does mean that something like walk_iomem_res_desc(IORES_DESC_CXL...) 
can only return fuzzy answers on corner case platforms that cause the resource tree to expand a CXL window resource over a range that is not decoded by CXL. This would be an odd platform configuration, but if it becomes a problem in practice the CXL subsytem could just publish an API that returns definitive answers. Cc: Andrew Morton Cc: David Hildenbrand Cc: Jason Gunthorpe Cc: Tony Luck Cc: Christoph Hellwig Reviewed-by: Jonathan Cameron Acked-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/165784325943.1758207.5310344844375305118.stgit@dwillia2-xfh.jf.intel.com Signed-off-by: Dan Williams --- kernel/resource.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 34eaee179689..53a534db350e 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -891,6 +891,13 @@ void insert_resource_expand_to_fit(struct resource *root, struct resource *new) } write_unlock(&resource_lock); } +/* + * Not for general consumption, only early boot memory map parsing, PCI + * resource discovery, and late discovery of CXL resources are expected + * to use this interface. The former are built-in and only the latter, + * CXL, is a module. + */ +EXPORT_SYMBOL_NS_GPL(insert_resource_expand_to_fit, CXL); /** * remove_resource - Remove a resource in the resource tree -- cgit From 14b80582c43e4f550acfd93c2b2cadbe36ea0874 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 20 May 2022 13:41:24 -0700 Subject: resource: Introduce alloc_free_mem_region() The core of devm_request_free_mem_region() is a helper that searches for free space in iomem_resource and performs __request_region_locked() on the result of that search. The policy choices of the implementation conform to what CONFIG_DEVICE_PRIVATE users want which is memory that is immediately marked busy, and a preference to search for the first-fit free range in descending order from the top of the physical address space. CXL has a need for a similar allocator, but with the following tweaks: 1/ Search for free space in ascending order 2/ Search for free space relative to a given CXL window 3/ 'insert' rather than 'request' the new resource given downstream drivers from the CXL Region driver (like the pmem or dax drivers) are responsible for request_mem_region() when they activate the memory range. Rework __request_free_mem_region() into get_free_mem_region() which takes a set of GFR_* (Get Free Region) flags to control the allocation policy (ascending vs descending), and "busy" policy (insert_resource() vs request_region()). As part of the consolidation of the legacy GFR_REQUEST_REGION case with the new default of just inserting a new resource into the free space some minor cleanups like not checking for NULL before calling devres_free() (which does its own check) is included. 
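For orientation, a minimal sketch of how a CXL-style caller might use the new helper. The cxl_window_res variable, the SZ_256M size/alignment, and the kfree()-based teardown are illustrative assumptions, not taken from this patch:

	struct resource *res;

	/* Carve a free, aligned range out of a CXL window resource. The
	 * result is inserted rather than requested, so a downstream driver
	 * still calls request_mem_region() when it activates the range. */
	res = alloc_free_mem_region(cxl_window_res, SZ_256M, SZ_256M,
				    "example-region");
	if (IS_ERR(res))
		return PTR_ERR(res);

	/* ... program decoders, hand the range to a region driver ... */

	/* Assumed teardown: drop the resource when the region goes away. */
	remove_resource(res);
	kfree(res);

A module consumer would also need MODULE_IMPORT_NS(CXL), since the symbol is exported into the CXL namespace.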
Suggested-by: Jason Gunthorpe Link: https://lore.kernel.org/linux-cxl/20220420143406.GY2120790@nvidia.com/ Cc: Matthew Wilcox Cc: Christoph Hellwig Reviewed-by: Jonathan Cameron Link: https://lore.kernel.org/r/165784333333.1758207.13703329337805274043.stgit@dwillia2-xfh.jf.intel.com Signed-off-by: Dan Williams --- kernel/resource.c | 178 +++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 143 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 53a534db350e..4c5e80b92f2f 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -489,8 +489,9 @@ int __weak page_is_ram(unsigned long pfn) } EXPORT_SYMBOL_GPL(page_is_ram); -static int __region_intersects(resource_size_t start, size_t size, - unsigned long flags, unsigned long desc) +static int __region_intersects(struct resource *parent, resource_size_t start, + size_t size, unsigned long flags, + unsigned long desc) { struct resource res; int type = 0; int other = 0; @@ -499,7 +500,7 @@ static int __region_intersects(resource_size_t start, size_t size, res.start = start; res.end = start + size - 1; - for (p = iomem_resource.child; p ; p = p->sibling) { + for (p = parent->child; p ; p = p->sibling) { bool is_type = (((p->flags & flags) == flags) && ((desc == IORES_DESC_NONE) || (desc == p->desc))); @@ -543,7 +544,7 @@ int region_intersects(resource_size_t start, size_t size, unsigned long flags, int ret; read_lock(&resource_lock); - ret = __region_intersects(start, size, flags, desc); + ret = __region_intersects(&iomem_resource, start, size, flags, desc); read_unlock(&resource_lock); return ret; @@ -1780,62 +1781,139 @@ void resource_list_free(struct list_head *head) } EXPORT_SYMBOL(resource_list_free); -#ifdef CONFIG_DEVICE_PRIVATE -static struct resource *__request_free_mem_region(struct device *dev, - struct resource *base, unsigned long size, const char *name) +#ifdef CONFIG_GET_FREE_REGION +#define GFR_DESCENDING (1UL << 0) +#define GFR_REQUEST_REGION (1UL << 1) +#define GFR_DEFAULT_ALIGN (1UL << PA_SECTION_SHIFT) + +static resource_size_t gfr_start(struct resource *base, resource_size_t size, + resource_size_t align, unsigned long flags) +{ + if (flags & GFR_DESCENDING) { + resource_size_t end; + + end = min_t(resource_size_t, base->end, + (1ULL << MAX_PHYSMEM_BITS) - 1); + return end - size + 1; + } + + return ALIGN(base->start, align); +} + +static bool gfr_continue(struct resource *base, resource_size_t addr, + resource_size_t size, unsigned long flags) +{ + if (flags & GFR_DESCENDING) + return addr > size && addr >= base->start; + /* + * In the ascend case be careful that the last increment by + * @size did not wrap 0. 
+ */ + return addr > addr - size && + addr <= min_t(resource_size_t, base->end, + (1ULL << MAX_PHYSMEM_BITS) - 1); +} + +static resource_size_t gfr_next(resource_size_t addr, resource_size_t size, + unsigned long flags) +{ + if (flags & GFR_DESCENDING) + return addr - size; + return addr + size; +} + +static void remove_free_mem_region(void *_res) +{ + struct resource *res = _res; + + if (res->parent) + remove_resource(res); + free_resource(res); +} + +static struct resource * +get_free_mem_region(struct device *dev, struct resource *base, + resource_size_t size, const unsigned long align, + const char *name, const unsigned long desc, + const unsigned long flags) { - resource_size_t end, addr; + resource_size_t addr; struct resource *res; struct region_devres *dr = NULL; - size = ALIGN(size, 1UL << PA_SECTION_SHIFT); - end = min_t(unsigned long, base->end, (1UL << MAX_PHYSMEM_BITS) - 1); - addr = end - size + 1UL; + size = ALIGN(size, align); res = alloc_resource(GFP_KERNEL); if (!res) return ERR_PTR(-ENOMEM); - if (dev) { + if (dev && (flags & GFR_REQUEST_REGION)) { dr = devres_alloc(devm_region_release, sizeof(struct region_devres), GFP_KERNEL); if (!dr) { free_resource(res); return ERR_PTR(-ENOMEM); } + } else if (dev) { + if (devm_add_action_or_reset(dev, remove_free_mem_region, res)) + return ERR_PTR(-ENOMEM); } write_lock(&resource_lock); - for (; addr > size && addr >= base->start; addr -= size) { - if (__region_intersects(addr, size, 0, IORES_DESC_NONE) != - REGION_DISJOINT) + for (addr = gfr_start(base, size, align, flags); + gfr_continue(base, addr, size, flags); + addr = gfr_next(addr, size, flags)) { + if (__region_intersects(base, addr, size, 0, IORES_DESC_NONE) != + REGION_DISJOINT) continue; - if (__request_region_locked(res, &iomem_resource, addr, size, - name, 0)) - break; + if (flags & GFR_REQUEST_REGION) { + if (__request_region_locked(res, &iomem_resource, addr, + size, name, 0)) + break; - if (dev) { - dr->parent = &iomem_resource; - dr->start = addr; - dr->n = size; - devres_add(dev, dr); - } + if (dev) { + dr->parent = &iomem_resource; + dr->start = addr; + dr->n = size; + devres_add(dev, dr); + } - res->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY; - write_unlock(&resource_lock); + res->desc = desc; + write_unlock(&resource_lock); + + + /* + * A driver is claiming this region so revoke any + * mappings. + */ + revoke_iomem(res); + } else { + res->start = addr; + res->end = addr + size - 1; + res->name = name; + res->desc = desc; + res->flags = IORESOURCE_MEM; + + /* + * Only succeed if the resource hosts an exclusive + * range after the insert + */ + if (__insert_resource(base, res) || res->child) + break; + + write_unlock(&resource_lock); + } - /* - * A driver is claiming this region so revoke any mappings. 
- */ - revoke_iomem(res); return res; } write_unlock(&resource_lock); - free_resource(res); - if (dr) + if (flags & GFR_REQUEST_REGION) { + free_resource(res); devres_free(dr); + } else if (dev) + devm_release_action(dev, remove_free_mem_region, res); return ERR_PTR(-ERANGE); } @@ -1854,18 +1932,48 @@ static struct resource *__request_free_mem_region(struct device *dev, struct resource *devm_request_free_mem_region(struct device *dev, struct resource *base, unsigned long size) { - return __request_free_mem_region(dev, base, size, dev_name(dev)); + unsigned long flags = GFR_DESCENDING | GFR_REQUEST_REGION; + + return get_free_mem_region(dev, base, size, GFR_DEFAULT_ALIGN, + dev_name(dev), + IORES_DESC_DEVICE_PRIVATE_MEMORY, flags); } EXPORT_SYMBOL_GPL(devm_request_free_mem_region); struct resource *request_free_mem_region(struct resource *base, unsigned long size, const char *name) { - return __request_free_mem_region(NULL, base, size, name); + unsigned long flags = GFR_DESCENDING | GFR_REQUEST_REGION; + + return get_free_mem_region(NULL, base, size, GFR_DEFAULT_ALIGN, name, + IORES_DESC_DEVICE_PRIVATE_MEMORY, flags); } EXPORT_SYMBOL_GPL(request_free_mem_region); -#endif /* CONFIG_DEVICE_PRIVATE */ +/** + * alloc_free_mem_region - find a free region relative to @base + * @base: resource that will parent the new resource + * @size: size in bytes of memory to allocate from @base + * @align: alignment requirements for the allocation + * @name: resource name + * + * Buses like CXL, that can dynamically instantiate new memory regions, + * need a method to allocate physical address space for those regions. + * Allocate and insert a new resource to cover a free, unclaimed by a + * descendant of @base, range in the span of @base. + */ +struct resource *alloc_free_mem_region(struct resource *base, + unsigned long size, unsigned long align, + const char *name) +{ + /* Default of ascending direction and insert resource */ + unsigned long flags = 0; + + return get_free_mem_region(NULL, base, size, align, name, + IORES_DESC_NONE, flags); +} +EXPORT_SYMBOL_NS_GPL(alloc_free_mem_region, CXL); +#endif /* CONFIG_GET_FREE_REGION */ static int __init strict_iomem(char *str) { -- cgit From bf95b2bc3e42f11f4d7a5e8a98376c2b4a2aa82f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 13 Apr 2022 17:46:15 -0700 Subject: rcu: Switch polled grace-period APIs to ->gp_seq_polled This commit switches the existing polled grace-period APIs to use a new ->gp_seq_polled counter in the rcu_state structure. An additional ->gp_seq_polled_snap counter in that same structure allows the normal grace period kthread to interact properly with the !SMP !PREEMPT fastpath through synchronize_rcu(). The first of the two to note the end of a given grace period will make knowledge of this transition available to the polled API. This commit is in preparation for polled expedited grace periods. [ paulmck: Fix use of rcu_state.gp_seq_polled to start normal grace period. ] Link: https://lore.kernel.org/all/20220121142454.1994916-1-bfoster@redhat.com/ Link: https://docs.google.com/document/d/1RNKWW9jQyfjxw2E8dsXVTdvZYh0HnYeSHDKog9jhdN8/edit?usp=sharing Cc: Brian Foster Cc: Dave Chinner Cc: Al Viro Cc: Ian Kent Co-developed-by: Boqun Feng Signed-off-by: Boqun Feng Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 98 ++++++++++++++++++++++++++++++++++++++++++++++++++++--- kernel/rcu/tree.h | 2 ++ 2 files changed, 96 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 46cfceea8784..b40a5a19ddd2 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1775,6 +1775,78 @@ static void rcu_strict_gp_boundary(void *unused) invoke_rcu_core(); } +// Has rcu_init() been invoked? This is used (for example) to determine +// whether spinlocks may be acquired safely. +static bool rcu_init_invoked(void) +{ + return !!rcu_state.n_online_cpus; +} + +// Make the polled API aware of the beginning of a grace period. +static void rcu_poll_gp_seq_start(unsigned long *snap) +{ + struct rcu_node *rnp = rcu_get_root(); + + if (rcu_init_invoked()) + raw_lockdep_assert_held_rcu_node(rnp); + + // If RCU was idle, note beginning of GP. + if (!rcu_seq_state(rcu_state.gp_seq_polled)) + rcu_seq_start(&rcu_state.gp_seq_polled); + + // Either way, record current state. + *snap = rcu_state.gp_seq_polled; +} + +// Make the polled API aware of the end of a grace period. +static void rcu_poll_gp_seq_end(unsigned long *snap) +{ + struct rcu_node *rnp = rcu_get_root(); + + if (rcu_init_invoked()) + raw_lockdep_assert_held_rcu_node(rnp); + + // If the previously noted GP is still in effect, record the + // end of that GP. Either way, zero counter to avoid counter-wrap + // problems. + if (*snap && *snap == rcu_state.gp_seq_polled) { + rcu_seq_end(&rcu_state.gp_seq_polled); + rcu_state.gp_seq_polled_snap = 0; + } else { + *snap = 0; + } +} + +// Make the polled API aware of the beginning of a grace period, but +// where caller does not hold the root rcu_node structure's lock. +static void rcu_poll_gp_seq_start_unlocked(unsigned long *snap) +{ + struct rcu_node *rnp = rcu_get_root(); + + if (rcu_init_invoked()) { + lockdep_assert_irqs_enabled(); + raw_spin_lock_irq_rcu_node(rnp); + } + rcu_poll_gp_seq_start(snap); + if (rcu_init_invoked()) + raw_spin_unlock_irq_rcu_node(rnp); +} + +// Make the polled API aware of the end of a grace period, but where +// caller does not hold the root rcu_node structure's lock. +static void rcu_poll_gp_seq_end_unlocked(unsigned long *snap) +{ + struct rcu_node *rnp = rcu_get_root(); + + if (rcu_init_invoked()) { + lockdep_assert_irqs_enabled(); + raw_spin_lock_irq_rcu_node(rnp); + } + rcu_poll_gp_seq_end(snap); + if (rcu_init_invoked()) + raw_spin_unlock_irq_rcu_node(rnp); +} + /* * Initialize a new grace period. Return false if no grace period required. */ @@ -1810,6 +1882,7 @@ static noinline_for_stack bool rcu_gp_init(void) rcu_seq_start(&rcu_state.gp_seq); ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq); trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("start")); + rcu_poll_gp_seq_start(&rcu_state.gp_seq_polled_snap); raw_spin_unlock_irq_rcu_node(rnp); /* @@ -2069,6 +2142,7 @@ static noinline void rcu_gp_cleanup(void) * safe for us to drop the lock in order to mark the grace * period as completed in all of the rcu_node structures. */ + rcu_poll_gp_seq_end(&rcu_state.gp_seq_polled_snap); raw_spin_unlock_irq_rcu_node(rnp); /* @@ -3837,8 +3911,18 @@ void synchronize_rcu(void) lock_is_held(&rcu_lock_map) || lock_is_held(&rcu_sched_lock_map), "Illegal synchronize_rcu() in RCU read-side critical section"); - if (rcu_blocking_is_gp()) + if (rcu_blocking_is_gp()) { + // Note well that this code runs with !PREEMPT && !SMP. + // In addition, all code that advances grace periods runs + // at process level. 
Therefore, this GP overlaps with other + // GPs only by being fully nested within them, which allows + // reuse of ->gp_seq_polled_snap. + rcu_poll_gp_seq_start_unlocked(&rcu_state.gp_seq_polled_snap); + rcu_poll_gp_seq_end_unlocked(&rcu_state.gp_seq_polled_snap); + if (rcu_init_invoked()) + cond_resched_tasks_rcu_qs(); return; // Context allows vacuous grace periods. + } if (rcu_gp_is_expedited()) synchronize_rcu_expedited(); else @@ -3860,7 +3944,7 @@ unsigned long get_state_synchronize_rcu(void) * before the load from ->gp_seq. */ smp_mb(); /* ^^^ */ - return rcu_seq_snap(&rcu_state.gp_seq); + return rcu_seq_snap(&rcu_state.gp_seq_polled); } EXPORT_SYMBOL_GPL(get_state_synchronize_rcu); @@ -3889,7 +3973,13 @@ unsigned long start_poll_synchronize_rcu(void) rdp = this_cpu_ptr(&rcu_data); rnp = rdp->mynode; raw_spin_lock_rcu_node(rnp); // irqs already disabled. - needwake = rcu_start_this_gp(rnp, rdp, gp_seq); + // Note it is possible for a grace period to have elapsed between + // the above call to get_state_synchronize_rcu() and the below call + // to rcu_seq_snap. This is OK, the worst that happens is that we + // get a grace period that no one needed. These accesses are ordered + // by smp_mb(), and we are accessing them in the opposite order + // from which they are updated at grace-period start, as required. + needwake = rcu_start_this_gp(rnp, rdp, rcu_seq_snap(&rcu_state.gp_seq)); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); if (needwake) rcu_gp_kthread_wake(); @@ -3925,7 +4015,7 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu); bool poll_state_synchronize_rcu(unsigned long oldstate) { if (oldstate == RCU_GET_STATE_COMPLETED || - rcu_seq_done_exact(&rcu_state.gp_seq, oldstate)) { + rcu_seq_done_exact(&rcu_state.gp_seq_polled, oldstate)) { smp_mb(); /* Ensure GP ends before subsequent accesses. */ return true; } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 2ccf5845957d..9c853033f159 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -323,6 +323,8 @@ struct rcu_state { short gp_state; /* GP kthread sleep state. */ unsigned long gp_wake_time; /* Last GP kthread wake. */ unsigned long gp_wake_seq; /* ->gp_seq at ^^^. */ + unsigned long gp_seq_polled; /* GP seq for polled API. */ + unsigned long gp_seq_polled_snap; /* ->gp_seq_polled at normal GP start. */ /* End of fields guarded by root rcu_node's lock. */ -- cgit From dd04140531b5d38b77ad9ff7b18117654be5bf5c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 14 Apr 2022 06:56:35 -0700 Subject: rcu: Make polled grace-period API account for expedited grace periods Currently, this code could splat: oldstate = get_state_synchronize_rcu(); synchronize_rcu_expedited(); WARN_ON_ONCE(!poll_state_synchronize_rcu(oldstate)); This situation is counter-intuitive and user-unfriendly. After all, there really was a perfectly valid full grace period right after the call to get_state_synchronize_rcu(), so why shouldn't poll_state_synchronize_rcu() know about it? This commit therefore makes the polled grace-period API aware of expedited grace periods in addition to the normal grace periods that it is already aware of. With this change, the above code is guaranteed not to splat. Please note that the above code can still splat due to counter wrap on the one hand and situations involving partially overlapping normal/expedited grace periods on the other. On 64-bit systems, the second is of course much more likely than the first. 
It is possible to modify this approach to prevent overlapping grace periods from causing splats, but only at the expense of greatly increasing the probability of counter wrap, as in within milliseconds on 32-bit systems and within minutes on 64-bit systems. This commit is in preparation for polled expedited grace periods. Link: https://lore.kernel.org/all/20220121142454.1994916-1-bfoster@redhat.com/ Link: https://docs.google.com/document/d/1RNKWW9jQyfjxw2E8dsXVTdvZYh0HnYeSHDKog9jhdN8/edit?usp=sharing Cc: Brian Foster Cc: Dave Chinner Cc: Al Viro Cc: Ian Kent Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 9 +++++---- kernel/rcu/tree.h | 1 + kernel/rcu/tree_exp.h | 16 ++++++++++++++-- 3 files changed, 20 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index b40a5a19ddd2..1505b02b4e53 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1812,6 +1812,7 @@ static void rcu_poll_gp_seq_end(unsigned long *snap) if (*snap && *snap == rcu_state.gp_seq_polled) { rcu_seq_end(&rcu_state.gp_seq_polled); rcu_state.gp_seq_polled_snap = 0; + rcu_state.gp_seq_polled_exp_snap = 0; } else { *snap = 0; } @@ -3913,10 +3914,10 @@ void synchronize_rcu(void) "Illegal synchronize_rcu() in RCU read-side critical section"); if (rcu_blocking_is_gp()) { // Note well that this code runs with !PREEMPT && !SMP. - // In addition, all code that advances grace periods runs - // at process level. Therefore, this GP overlaps with other - // GPs only by being fully nested within them, which allows - // reuse of ->gp_seq_polled_snap. + // In addition, all code that advances grace periods runs at + // process level. Therefore, this normal GP overlaps with + // other normal GPs only by being fully nested within them, + // which allows reuse of ->gp_seq_polled_snap. rcu_poll_gp_seq_start_unlocked(&rcu_state.gp_seq_polled_snap); rcu_poll_gp_seq_end_unlocked(&rcu_state.gp_seq_polled_snap); if (rcu_init_invoked()) diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 9c853033f159..5634e76106c4 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -325,6 +325,7 @@ struct rcu_state { unsigned long gp_wake_seq; /* ->gp_seq at ^^^. */ unsigned long gp_seq_polled; /* GP seq for polled API. */ unsigned long gp_seq_polled_snap; /* ->gp_seq_polled at normal GP start. */ + unsigned long gp_seq_polled_exp_snap; /* ->gp_seq_polled at expedited GP start. */ /* End of fields guarded by root rcu_node's lock. */ diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 0f70f62039a9..e0258066b881 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -18,6 +18,7 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp); static void rcu_exp_gp_seq_start(void) { rcu_seq_start(&rcu_state.expedited_sequence); + rcu_poll_gp_seq_start_unlocked(&rcu_state.gp_seq_polled_exp_snap); } /* @@ -34,6 +35,7 @@ static __maybe_unused unsigned long rcu_exp_gp_seq_endval(void) */ static void rcu_exp_gp_seq_end(void) { + rcu_poll_gp_seq_end_unlocked(&rcu_state.gp_seq_polled_exp_snap); rcu_seq_end(&rcu_state.expedited_sequence); smp_mb(); /* Ensure that consecutive grace periods serialize. */ } @@ -913,8 +915,18 @@ void synchronize_rcu_expedited(void) "Illegal synchronize_rcu_expedited() in RCU read-side critical section"); /* Is the state is such that the call is a grace period? */ - if (rcu_blocking_is_gp()) - return; + if (rcu_blocking_is_gp()) { + // Note well that this code runs with !PREEMPT && !SMP. 
+ // In addition, all code that advances grace periods runs + // at process level. Therefore, this expedited GP overlaps + // with other expedited GPs only by being fully nested within + // them, which allows reuse of ->gp_seq_polled_exp_snap. + rcu_poll_gp_seq_start_unlocked(&rcu_state.gp_seq_polled_exp_snap); + rcu_poll_gp_seq_end_unlocked(&rcu_state.gp_seq_polled_exp_snap); + if (rcu_init_invoked()) + cond_resched(); + return; // Context allows vacuous grace periods. + } /* If expedited grace periods are prohibited, fall back to normal. */ if (rcu_gp_is_normal()) { -- cgit From 7f4535366f8f77b3ddbc79d4ba82df966c5c2aab Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 14 Apr 2022 11:49:58 -0700 Subject: rcu: Make Tiny RCU grace periods visible to polled APIs This commit makes the Tiny RCU implementation of synchronize_rcu() increment the rcu_ctrlblk.gp_seq counter, thus making both synchronize_rcu() and synchronize_rcu_expedited() visible to get_state_synchronize_rcu() and friends. Link: https://lore.kernel.org/all/20220121142454.1994916-1-bfoster@redhat.com/ Link: https://docs.google.com/document/d/1RNKWW9jQyfjxw2E8dsXVTdvZYh0HnYeSHDKog9jhdN8/edit?usp=sharing Cc: Brian Foster Cc: Dave Chinner Cc: Al Viro Cc: Ian Kent Signed-off-by: Paul E. McKenney --- kernel/rcu/tiny.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index dbee6bea6726..60071817d939 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -139,8 +139,10 @@ static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused /* * Wait for a grace period to elapse. But it is illegal to invoke * synchronize_rcu() from within an RCU read-side critical section. - * Therefore, any legal call to synchronize_rcu() is a quiescent - * state, and so on a UP system, synchronize_rcu() need do nothing. + * Therefore, any legal call to synchronize_rcu() is a quiescent state, + * and so on a UP system, synchronize_rcu() need do nothing, other than + * let the polled APIs know that another grace period elapsed. + * * (But Lai Jiangshan points out the benefits of doing might_sleep() * to reduce latency.) * @@ -152,6 +154,7 @@ void synchronize_rcu(void) lock_is_held(&rcu_lock_map) || lock_is_held(&rcu_sched_lock_map), "Illegal synchronize_rcu() in RCU read-side critical section"); + WRITE_ONCE(rcu_ctrlblk.gp_seq, rcu_ctrlblk.gp_seq + 2); } EXPORT_SYMBOL_GPL(synchronize_rcu); -- cgit From e4333cb20f047d96485a9416a93ae4b2ec3b27dd Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 14 Apr 2022 09:09:11 -0700 Subject: rcutorture: Verify that polled GP API sees synchronous grace periods This commit causes rcu_torture_writer() to use WARN_ON_ONCE() to check that the cookie returned by the current RCU flavor's ->get_gp_state() function (get_state_synchronize_rcu() for vanilla RCU) causes that flavor's ->poll_gp_state function (poll_state_synchronize_rcu() for vanilla RCU) to unconditionally return true. Note that a pair calls to synchronous grace-period-wait functions are used. This is necessary to account for partially overlapping normal and expedited grace periods aligning in just the wrong way with polled API invocations, which can cause those polled API invocations to ignore one or the other of those partially overlapping grace periods. It is unlikely that this sort of ignored grace period will be a problem in production, but rcutorture can make it happen quite within a few tens of seconds. 
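Reduced to the vanilla-RCU flavor, the resulting check is a sketch like the following (rcutorture itself goes through cur_ops, as the hunk below shows):

	unsigned long cookie = get_state_synchronize_rcu();

	/* Two back-to-back synchronous grace periods: a single one can
	 * partially overlap an expedited GP in a way the polled API is
	 * allowed to ignore, but two full grace periods cannot. */
	synchronize_rcu();
	synchronize_rcu();
	WARN_ON_ONCE(!poll_state_synchronize_rcu(cookie));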
This commit is in preparation for polled expedited grace periods. [ paulmck: Apply feedback from Frederic Weisbecker. ] Link: https://lore.kernel.org/all/20220121142454.1994916-1-bfoster@redhat.com/ Link: https://docs.google.com/document/d/1RNKWW9jQyfjxw2E8dsXVTdvZYh0HnYeSHDKog9jhdN8/edit?usp=sharing Cc: Brian Foster Cc: Dave Chinner Cc: Al Viro Cc: Ian Kent Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 4ceec9f4169c..d2edc763bb92 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1269,7 +1269,12 @@ rcu_torture_writer(void *arg) break; case RTWS_EXP_SYNC: rcu_torture_writer_state = RTWS_EXP_SYNC; + if (cur_ops->get_gp_state && cur_ops->poll_gp_state) + cookie = cur_ops->get_gp_state(); cur_ops->exp_sync(); + cur_ops->exp_sync(); + if (cur_ops->get_gp_state && cur_ops->poll_gp_state) + WARN_ON_ONCE(!cur_ops->poll_gp_state(cookie)); rcu_torture_pipe_update(old_rp); break; case RTWS_COND_GET: @@ -1291,7 +1296,12 @@ rcu_torture_writer(void *arg) break; case RTWS_SYNC: rcu_torture_writer_state = RTWS_SYNC; + if (cur_ops->get_gp_state && cur_ops->poll_gp_state) + cookie = cur_ops->get_gp_state(); cur_ops->sync(); + cur_ops->sync(); + if (cur_ops->get_gp_state && cur_ops->poll_gp_state) + WARN_ON_ONCE(!cur_ops->poll_gp_state(cookie)); rcu_torture_pipe_update(old_rp); break; default: -- cgit From d96c52fe4907c68adc5e61a0bef7aec0933223d5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 15 Apr 2022 10:55:42 -0700 Subject: rcu: Add polled expedited grace-period primitives This commit adds expedited grace-period functionality to RCU's polled grace-period API, adding start_poll_synchronize_rcu_expedited() and cond_synchronize_rcu_expedited(), which are similar to the existing start_poll_synchronize_rcu() and cond_synchronize_rcu() functions, respectively. Note that although start_poll_synchronize_rcu_expedited() can be invoked very early, the resulting expedited grace periods are not guaranteed to start until after workqueues are fully initialized. On the other hand, both synchronize_rcu() and synchronize_rcu_expedited() can also be invoked very early, and the resulting grace periods will be taken into account as they occur. [ paulmck: Apply feedback from Neeraj Upadhyay. ] Link: https://lore.kernel.org/all/20220121142454.1994916-1-bfoster@redhat.com/ Link: https://docs.google.com/document/d/1RNKWW9jQyfjxw2E8dsXVTdvZYh0HnYeSHDKog9jhdN8/edit?usp=sharing Cc: Brian Foster Cc: Dave Chinner Cc: Al Viro Cc: Ian Kent Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 17 ++++++++--- kernel/rcu/tree.h | 7 +++++ kernel/rcu/tree_exp.h | 85 +++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 104 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 1505b02b4e53..6cf5b51622cd 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4027,20 +4027,20 @@ EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu); /** * cond_synchronize_rcu - Conditionally wait for an RCU grace period * - * @oldstate: value from get_state_synchronize_rcu() or start_poll_synchronize_rcu() + * @oldstate: value from get_state_synchronize_rcu(), start_poll_synchronize_rcu(), or start_poll_synchronize_rcu_expedited() * * If a full RCU grace period has elapsed since the earlier call to * get_state_synchronize_rcu() or start_poll_synchronize_rcu(), just return. 
* Otherwise, invoke synchronize_rcu() to wait for a full grace period. * - * Yes, this function does not take counter wrap into account. But - * counter wrap is harmless. If the counter wraps, we have waited for + * Yes, this function does not take counter wrap into account. + * But counter wrap is harmless. If the counter wraps, we have waited for * more than 2 billion grace periods (and way more on a 64-bit system!), - * so waiting for one additional grace period should be just fine. + * so waiting for a couple of additional grace periods should be just fine. * * This function provides the same memory-ordering guarantees that * would be provided by a synchronize_rcu() that was invoked at the call - * to the function that provided @oldstate, and that returned at the end + * to the function that provided @oldstate and that returned at the end * of this function. */ void cond_synchronize_rcu(unsigned long oldstate) @@ -4793,6 +4793,9 @@ static void __init rcu_init_one(void) init_waitqueue_head(&rnp->exp_wq[3]); spin_lock_init(&rnp->exp_lock); mutex_init(&rnp->boost_kthread_mutex); + raw_spin_lock_init(&rnp->exp_poll_lock); + rnp->exp_seq_poll_rq = RCU_GET_STATE_COMPLETED; + INIT_WORK(&rnp->exp_poll_wq, sync_rcu_do_polled_gp); } } @@ -5018,6 +5021,10 @@ void __init rcu_init(void) qovld_calc = DEFAULT_RCU_QOVLD_MULT * qhimark; else qovld_calc = qovld; + + // Kick-start any polled grace periods that started early. + if (!(per_cpu_ptr(&rcu_data, cpu)->mynode->exp_seq_poll_rq & 0x1)) + (void)start_poll_synchronize_rcu_expedited(); } #include "tree_stall.h" diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 5634e76106c4..fb77deca5f5c 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -133,6 +133,10 @@ struct rcu_node { wait_queue_head_t exp_wq[4]; struct rcu_exp_work rew; bool exp_need_flush; /* Need to flush workitem? */ + raw_spinlock_t exp_poll_lock; + /* Lock and data for polled expedited grace periods. */ + unsigned long exp_seq_poll_rq; + struct work_struct exp_poll_wq; } ____cacheline_internodealigned_in_smp; /* @@ -484,3 +488,6 @@ static void rcu_iw_handler(struct irq_work *iwp); static void check_cpu_stall(struct rcu_data *rdp); static void rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp, const unsigned long gpssdelay); + +/* Forward declarations for tree_exp.h. */ +static void sync_rcu_do_polled_gp(struct work_struct *wp); diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index e0258066b881..571b0a700cce 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -962,3 +962,88 @@ void synchronize_rcu_expedited(void) synchronize_rcu_expedited_destroy_work(&rew); } EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); + +/* + * Ensure that start_poll_synchronize_rcu_expedited() has the expedited + * RCU grace periods that it needs. 
+ */ +static void sync_rcu_do_polled_gp(struct work_struct *wp) +{ + unsigned long flags; + struct rcu_node *rnp = container_of(wp, struct rcu_node, exp_poll_wq); + unsigned long s; + + raw_spin_lock_irqsave(&rnp->exp_poll_lock, flags); + s = rnp->exp_seq_poll_rq; + rnp->exp_seq_poll_rq = RCU_GET_STATE_COMPLETED; + raw_spin_unlock_irqrestore(&rnp->exp_poll_lock, flags); + if (s == RCU_GET_STATE_COMPLETED) + return; + while (!poll_state_synchronize_rcu(s)) + synchronize_rcu_expedited(); + raw_spin_lock_irqsave(&rnp->exp_poll_lock, flags); + s = rnp->exp_seq_poll_rq; + if (poll_state_synchronize_rcu(s)) + rnp->exp_seq_poll_rq = RCU_GET_STATE_COMPLETED; + raw_spin_unlock_irqrestore(&rnp->exp_poll_lock, flags); +} + +/** + * start_poll_synchronize_rcu_expedited - Snapshot current RCU state and start expedited grace period + * + * Returns a cookie to pass to a call to cond_synchronize_rcu(), + * cond_synchronize_rcu_expedited(), or poll_state_synchronize_rcu(), + * allowing them to determine whether or not any sort of grace period has + * elapsed in the meantime. If the needed expedited grace period is not + * already slated to start, initiates that grace period. + */ +unsigned long start_poll_synchronize_rcu_expedited(void) +{ + unsigned long flags; + struct rcu_data *rdp; + struct rcu_node *rnp; + unsigned long s; + + s = get_state_synchronize_rcu(); + rdp = per_cpu_ptr(&rcu_data, raw_smp_processor_id()); + rnp = rdp->mynode; + if (rcu_init_invoked()) + raw_spin_lock_irqsave(&rnp->exp_poll_lock, flags); + if (!poll_state_synchronize_rcu(s)) { + rnp->exp_seq_poll_rq = s; + if (rcu_init_invoked()) + queue_work(rcu_gp_wq, &rnp->exp_poll_wq); + } + if (rcu_init_invoked()) + raw_spin_unlock_irqrestore(&rnp->exp_poll_lock, flags); + + return s; +} +EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu_expedited); + +/** + * cond_synchronize_rcu_expedited - Conditionally wait for an expedited RCU grace period + * + * @oldstate: value from get_state_synchronize_rcu(), start_poll_synchronize_rcu(), or start_poll_synchronize_rcu_expedited() + * + * If any type of full RCU grace period has elapsed since the earlier + * call to get_state_synchronize_rcu(), start_poll_synchronize_rcu(), + * or start_poll_synchronize_rcu_expedited(), just return. Otherwise, + * invoke synchronize_rcu_expedited() to wait for a full grace period. + * + * Yes, this function does not take counter wrap into account. + * But counter wrap is harmless. If the counter wraps, we have waited for + * more than 2 billion grace periods (and way more on a 64-bit system!), + * so waiting for a couple of additional grace periods should be just fine. + * + * This function provides the same memory-ordering guarantees that + * would be provided by a synchronize_rcu() that was invoked at the call + * to the function that provided @oldstate and that returned at the end + * of this function. + */ +void cond_synchronize_rcu_expedited(unsigned long oldstate) +{ + if (!poll_state_synchronize_rcu(oldstate)) + synchronize_rcu_expedited(); +} +EXPORT_SYMBOL_GPL(cond_synchronize_rcu_expedited); -- cgit From 11d62f0f43a35d7c62aabc06a99cd4691a47ccb4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 1 Feb 2022 07:01:20 -0800 Subject: rcutorture: Test polled expedited grace-period primitives This commit adds tests of start_poll_synchronize_rcu_expedited() and poll_state_synchronize_rcu_expedited(). 
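In vanilla-RCU terms, the new writer states exercise a pattern like this sketch (rcutorture reaches it through the new cur_ops hooks added below):

	unsigned long cookie;

	/* Start an expedited grace period without waiting for it. */
	cookie = start_poll_synchronize_rcu_expedited();

	/* Later, either poll for completion... */
	while (!poll_state_synchronize_rcu(cookie))
		schedule_timeout_uninterruptible(1);

	/* ...or, equivalently, wait only if needed:
	 * cond_synchronize_rcu_expedited(cookie); */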
Link: https://lore.kernel.org/all/20220121142454.1994916-1-bfoster@redhat.com/ Link: https://docs.google.com/document/d/1RNKWW9jQyfjxw2E8dsXVTdvZYh0HnYeSHDKog9jhdN8/edit?usp=sharing Cc: Brian Foster Cc: Dave Chinner Cc: Al Viro Cc: Ian Kent Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 87 +++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 74 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index d2edc763bb92..0788ef2a4491 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -86,10 +86,12 @@ torture_param(int, fwd_progress_holdoff, 60, torture_param(bool, fwd_progress_need_resched, 1, "Hide cond_resched() behind need_resched()"); torture_param(bool, gp_cond, false, "Use conditional/async GP wait primitives"); +torture_param(bool, gp_cond_exp, false, "Use conditional/async expedited GP wait primitives"); torture_param(bool, gp_exp, false, "Use expedited GP wait primitives"); torture_param(bool, gp_normal, false, "Use normal (non-expedited) GP wait primitives"); torture_param(bool, gp_poll, false, "Use polling GP wait primitives"); +torture_param(bool, gp_poll_exp, false, "Use polling expedited GP wait primitives"); torture_param(bool, gp_sync, false, "Use synchronous GP wait primitives"); torture_param(int, irqreader, 1, "Allow RCU readers from irq handlers"); torture_param(int, leakpointer, 0, "Leak pointer dereferences from readers"); @@ -209,12 +211,16 @@ static int rcu_torture_writer_state; #define RTWS_DEF_FREE 3 #define RTWS_EXP_SYNC 4 #define RTWS_COND_GET 5 -#define RTWS_COND_SYNC 6 -#define RTWS_POLL_GET 7 -#define RTWS_POLL_WAIT 8 -#define RTWS_SYNC 9 -#define RTWS_STUTTER 10 -#define RTWS_STOPPING 11 +#define RTWS_COND_GET_EXP 6 +#define RTWS_COND_SYNC 7 +#define RTWS_COND_SYNC_EXP 8 +#define RTWS_POLL_GET 9 +#define RTWS_POLL_GET_EXP 10 +#define RTWS_POLL_WAIT 11 +#define RTWS_POLL_WAIT_EXP 12 +#define RTWS_SYNC 13 +#define RTWS_STUTTER 14 +#define RTWS_STOPPING 15 static const char * const rcu_torture_writer_state_names[] = { "RTWS_FIXED_DELAY", "RTWS_DELAY", @@ -222,9 +228,13 @@ static const char * const rcu_torture_writer_state_names[] = { "RTWS_DEF_FREE", "RTWS_EXP_SYNC", "RTWS_COND_GET", + "RTWS_COND_GET_EXP", "RTWS_COND_SYNC", + "RTWS_COND_SYNC_EXP", "RTWS_POLL_GET", + "RTWS_POLL_GET_EXP", "RTWS_POLL_WAIT", + "RTWS_POLL_WAIT_EXP", "RTWS_SYNC", "RTWS_STUTTER", "RTWS_STOPPING", @@ -337,6 +347,10 @@ struct rcu_torture_ops { void (*deferred_free)(struct rcu_torture *p); void (*sync)(void); void (*exp_sync)(void); + unsigned long (*get_gp_state_exp)(void); + unsigned long (*start_gp_poll_exp)(void); + bool (*poll_gp_state_exp)(unsigned long oldstate); + void (*cond_sync_exp)(unsigned long oldstate); unsigned long (*get_gp_state)(void); unsigned long (*get_gp_completed)(void); unsigned long (*start_gp_poll)(void); @@ -509,6 +523,10 @@ static struct rcu_torture_ops rcu_ops = { .start_gp_poll = start_poll_synchronize_rcu, .poll_gp_state = poll_state_synchronize_rcu, .cond_sync = cond_synchronize_rcu, + .get_gp_state_exp = get_state_synchronize_rcu, + .start_gp_poll_exp = start_poll_synchronize_rcu_expedited, + .poll_gp_state_exp = poll_state_synchronize_rcu, + .cond_sync_exp = cond_synchronize_rcu_expedited, .call = call_rcu, .cb_barrier = rcu_barrier, .fqs = rcu_force_quiescent_state, @@ -1138,9 +1156,8 @@ rcu_torture_fqs(void *arg) return 0; } -// Used by writers to randomly choose from the available grace-period -// primitives. 
The only purpose of the initialization is to size the array. -static int synctype[] = { RTWS_DEF_FREE, RTWS_EXP_SYNC, RTWS_COND_GET, RTWS_POLL_GET, RTWS_SYNC }; +// Used by writers to randomly choose from the available grace-period primitives. +static int synctype[ARRAY_SIZE(rcu_torture_writer_state_names)] = { }; static int nsynctypes; /* @@ -1148,18 +1165,27 @@ static int nsynctypes; */ static void rcu_torture_write_types(void) { - bool gp_cond1 = gp_cond, gp_exp1 = gp_exp, gp_normal1 = gp_normal; - bool gp_poll1 = gp_poll, gp_sync1 = gp_sync; + bool gp_cond1 = gp_cond, gp_cond_exp1 = gp_cond_exp, gp_exp1 = gp_exp; + bool gp_poll_exp1 = gp_poll_exp, gp_normal1 = gp_normal, gp_poll1 = gp_poll; + bool gp_sync1 = gp_sync; /* Initialize synctype[] array. If none set, take default. */ - if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_poll1 && !gp_sync1) - gp_cond1 = gp_exp1 = gp_normal1 = gp_poll1 = gp_sync1 = true; + if (!gp_cond1 && !gp_cond_exp1 && !gp_exp1 && !gp_poll_exp && + !gp_normal1 && !gp_poll1 && !gp_sync1) + gp_cond1 = gp_cond_exp1 = gp_exp1 = gp_poll_exp1 = + gp_normal1 = gp_poll1 = gp_sync1 = true; if (gp_cond1 && cur_ops->get_gp_state && cur_ops->cond_sync) { synctype[nsynctypes++] = RTWS_COND_GET; pr_info("%s: Testing conditional GPs.\n", __func__); } else if (gp_cond && (!cur_ops->get_gp_state || !cur_ops->cond_sync)) { pr_alert("%s: gp_cond without primitives.\n", __func__); } + if (gp_cond_exp1 && cur_ops->get_gp_state_exp && cur_ops->cond_sync_exp) { + synctype[nsynctypes++] = RTWS_COND_GET_EXP; + pr_info("%s: Testing conditional expedited GPs.\n", __func__); + } else if (gp_cond_exp && (!cur_ops->get_gp_state_exp || !cur_ops->cond_sync_exp)) { + pr_alert("%s: gp_cond_exp without primitives.\n", __func__); + } if (gp_exp1 && cur_ops->exp_sync) { synctype[nsynctypes++] = RTWS_EXP_SYNC; pr_info("%s: Testing expedited GPs.\n", __func__); @@ -1178,6 +1204,12 @@ static void rcu_torture_write_types(void) } else if (gp_poll && (!cur_ops->start_gp_poll || !cur_ops->poll_gp_state)) { pr_alert("%s: gp_poll without primitives.\n", __func__); } + if (gp_poll_exp1 && cur_ops->start_gp_poll_exp && cur_ops->poll_gp_state_exp) { + synctype[nsynctypes++] = RTWS_POLL_GET_EXP; + pr_info("%s: Testing polling expedited GPs.\n", __func__); + } else if (gp_poll_exp && (!cur_ops->start_gp_poll_exp || !cur_ops->poll_gp_state_exp)) { + pr_alert("%s: gp_poll_exp without primitives.\n", __func__); + } if (gp_sync1 && cur_ops->sync) { synctype[nsynctypes++] = RTWS_SYNC; pr_info("%s: Testing normal GPs.\n", __func__); @@ -1285,6 +1317,14 @@ rcu_torture_writer(void *arg) cur_ops->cond_sync(gp_snap); rcu_torture_pipe_update(old_rp); break; + case RTWS_COND_GET_EXP: + rcu_torture_writer_state = RTWS_COND_GET_EXP; + gp_snap = cur_ops->get_gp_state_exp(); + torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand); + rcu_torture_writer_state = RTWS_COND_SYNC_EXP; + cur_ops->cond_sync_exp(gp_snap); + rcu_torture_pipe_update(old_rp); + break; case RTWS_POLL_GET: rcu_torture_writer_state = RTWS_POLL_GET; gp_snap = cur_ops->start_gp_poll(); @@ -1294,6 +1334,15 @@ rcu_torture_writer(void *arg) &rand); rcu_torture_pipe_update(old_rp); break; + case RTWS_POLL_GET_EXP: + rcu_torture_writer_state = RTWS_POLL_GET_EXP; + gp_snap = cur_ops->start_gp_poll_exp(); + rcu_torture_writer_state = RTWS_POLL_WAIT_EXP; + while (!cur_ops->poll_gp_state_exp(gp_snap)) + torture_hrtimeout_jiffies(torture_random(&rand) % 16, + &rand); + rcu_torture_pipe_update(old_rp); + break; case RTWS_SYNC: rcu_torture_writer_state = 
RTWS_SYNC; if (cur_ops->get_gp_state && cur_ops->poll_gp_state) @@ -1400,6 +1449,11 @@ rcu_torture_fakewriter(void *arg) torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand); cur_ops->cond_sync(gp_snap); break; + case RTWS_COND_GET_EXP: + gp_snap = cur_ops->get_gp_state_exp(); + torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand); + cur_ops->cond_sync_exp(gp_snap); + break; case RTWS_POLL_GET: gp_snap = cur_ops->start_gp_poll(); while (!cur_ops->poll_gp_state(gp_snap)) { @@ -1407,6 +1461,13 @@ rcu_torture_fakewriter(void *arg) &rand); } break; + case RTWS_POLL_GET_EXP: + gp_snap = cur_ops->start_gp_poll_exp(); + while (!cur_ops->poll_gp_state_exp(gp_snap)) { + torture_hrtimeout_jiffies(torture_random(&rand) % 16, + &rand); + } + break; case RTWS_SYNC: cur_ops->sync(); break; -- cgit From ef4f9d9b9230fcf4fca9801f03c31d99ed06a716 Mon Sep 17 00:00:00 2001 From: Zqiang Date: Fri, 22 Apr 2022 21:15:18 +0800 Subject: rcu: Put panic_on_rcu_stall() after expedited RCU CPU stall warnings When a normal RCU CPU stall warning is encountered with the panic_on_rcu_stall sysfs variable is set, the system panics only after the stall warning is printed. But when an expedited RCU CPU stall warning is encountered with the panic_on_rcu_stall sysfs variable is set, the system panics first, thus never printing the stall warning. This commit therefore brings the expedited stall warning into line with the normal stall warning by printing first and panicking afterwards. Signed-off-by: Zqiang Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_exp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 571b0a700cce..f05a15b11fa0 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -623,7 +623,6 @@ static void synchronize_rcu_expedited_wait(void) return; if (rcu_stall_is_suppressed()) continue; - panic_on_rcu_stall(); trace_rcu_stall_warning(rcu_state.name, TPS("ExpeditedStall")); pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", rcu_state.name); @@ -671,6 +670,7 @@ static void synchronize_rcu_expedited_wait(void) } } jiffies_stall = 3 * rcu_exp_jiffies_till_stall_check() + 3; + panic_on_rcu_stall(); } } -- cgit From 82e445697d6a14d6b7462c13c613ebdd96468818 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 9 May 2022 09:49:05 -0700 Subject: rcu: Diagnose extended sync_rcu_do_polled_gp() loops This commit dumps out state when the sync_rcu_do_polled_gp() function loops more than expected. This is a debugging aid. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree_exp.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index f05a15b11fa0..4c7037b50703 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -970,6 +970,7 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); static void sync_rcu_do_polled_gp(struct work_struct *wp) { unsigned long flags; + int i = 0; struct rcu_node *rnp = container_of(wp, struct rcu_node, exp_poll_wq); unsigned long s; @@ -979,8 +980,12 @@ static void sync_rcu_do_polled_gp(struct work_struct *wp) raw_spin_unlock_irqrestore(&rnp->exp_poll_lock, flags); if (s == RCU_GET_STATE_COMPLETED) return; - while (!poll_state_synchronize_rcu(s)) + while (!poll_state_synchronize_rcu(s)) { synchronize_rcu_expedited(); + if (i == 10 || i == 20) + pr_info("%s: i = %d s = %lx gp_seq_polled = %lx\n", __func__, i, s, READ_ONCE(rcu_state.gp_seq_polled)); + i++; + } raw_spin_lock_irqsave(&rnp->exp_poll_lock, flags); s = rnp->exp_seq_poll_rq; if (poll_state_synchronize_rcu(s)) -- cgit From 28787e04fb67963673cbe6f77fb27137eba42718 Mon Sep 17 00:00:00 2001 From: Zqiang Date: Wed, 18 May 2022 19:43:10 +0800 Subject: rcu: Add irqs-disabled indicator to expedited RCU CPU stall warnings If a CPU has interrupts disabled continuously starting before the beginning of a given expedited RCU grace period, that CPU will not execute that grace period's IPI handler. This will in turn mean that the ->cpu_no_qs.b.exp field in that CPU's rcu_data structure will continue to contain the boolean value false. Knowing whether or not a CPU has had interrupts disabled can be helpful when debugging an expedited RCU CPU stall warning, so this commit adds a "D" indicator expedited RCU CPU stall warnings that signifies that the corresponding CPU has had interrupts disabled throughout. This capability was tested as follows: runqemu kvm slirp nographic qemuparams="-m 4096 -smp 4" bootparams= "isolcpus=2,3 nohz_full=2,3 rcu_nocbs=2,3 rcutree.dump_tree=1 rcutorture.stall_cpu_holdoff=30 rcutorture.stall_cpu=40 rcutorture.stall_cpu_irqsoff=1 rcutorture.stall_cpu_block=0 rcutorture.stall_no_softlockup=1" -d The rcu_torture_stall() function ran on CPU 1, which displays the "D" as expected given the rcutorture.stall_cpu_irqsoff=1 module parameter: ............ rcu: INFO: rcu_preempt detected expedited stalls on CPUs/tasks: { 1-...D } 26467 jiffies s: 13317 root: 0x1/. rcu: blocking rcu_node structures (internal RCU debug): l=1:0-1:0x2/. Task dump for CPU 1: task:rcu_torture_sta state:R running task stack: 0 pid: 76 ppid: 2 flags:0x00004008 Signed-off-by: Zqiang Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree_exp.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 4c7037b50703..f092c7f18a5f 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -637,10 +637,11 @@ static void synchronize_rcu_expedited_wait(void) continue; ndetected++; rdp = per_cpu_ptr(&rcu_data, cpu); - pr_cont(" %d-%c%c%c", cpu, + pr_cont(" %d-%c%c%c%c", cpu, "O."[!!cpu_online(cpu)], "o."[!!(rdp->grpmask & rnp->expmaskinit)], - "N."[!!(rdp->grpmask & rnp->expmaskinitnext)]); + "N."[!!(rdp->grpmask & rnp->expmaskinitnext)], + "D."[!!(rdp->cpu_no_qs.b.exp)]); } } pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n", -- cgit From a4703e3184320d6e15e2bc81d2ccf1c8c883f9d1 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Thu, 21 Jul 2022 15:42:35 +0200 Subject: bpf: Switch to new kfunc flags infrastructure Instead of populating multiple sets to indicate some attribute and then researching the same BTF ID in them, prepare a single unified BTF set which indicates whether a kfunc is allowed to be called, and also its attributes if any at the same time. Now, only one call is needed to perform the lookup for both kfunc availability and its attributes. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220721134245.2450-4-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 106 +++++++++++++++++++++++--------------------------- kernel/bpf/verifier.c | 14 +++---- 2 files changed, 54 insertions(+), 66 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 5869f03bcb6e..4d9c2d88720f 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -213,7 +213,7 @@ enum { }; struct btf_kfunc_set_tab { - struct btf_id_set *sets[BTF_KFUNC_HOOK_MAX][BTF_KFUNC_TYPE_MAX]; + struct btf_id_set8 *sets[BTF_KFUNC_HOOK_MAX]; }; struct btf_id_dtor_kfunc_tab { @@ -1616,7 +1616,7 @@ static void btf_free_id(struct btf *btf) static void btf_free_kfunc_set_tab(struct btf *btf) { struct btf_kfunc_set_tab *tab = btf->kfunc_set_tab; - int hook, type; + int hook; if (!tab) return; @@ -1625,10 +1625,8 @@ static void btf_free_kfunc_set_tab(struct btf *btf) */ if (btf_is_module(btf)) goto free_tab; - for (hook = 0; hook < ARRAY_SIZE(tab->sets); hook++) { - for (type = 0; type < ARRAY_SIZE(tab->sets[0]); type++) - kfree(tab->sets[hook][type]); - } + for (hook = 0; hook < ARRAY_SIZE(tab->sets); hook++) + kfree(tab->sets[hook]); free_tab: kfree(tab); btf->kfunc_set_tab = NULL; @@ -6172,7 +6170,8 @@ static bool is_kfunc_arg_mem_size(const struct btf *btf, static int btf_check_func_arg_match(struct bpf_verifier_env *env, const struct btf *btf, u32 func_id, struct bpf_reg_state *regs, - bool ptr_to_mem_ok) + bool ptr_to_mem_ok, + u32 kfunc_flags) { enum bpf_prog_type prog_type = resolve_prog_type(env->prog); struct bpf_verifier_log *log = &env->log; @@ -6210,10 +6209,8 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, if (is_kfunc) { /* Only kfunc can be release func */ - rel = btf_kfunc_id_set_contains(btf, resolve_prog_type(env->prog), - BTF_KFUNC_TYPE_RELEASE, func_id); - kptr_get = btf_kfunc_id_set_contains(btf, resolve_prog_type(env->prog), - BTF_KFUNC_TYPE_KPTR_ACQUIRE, func_id); + rel = kfunc_flags & KF_RELEASE; + kptr_get = kfunc_flags & KF_KPTR_GET; } /* check that BTF function arguments match actual types that the @@ -6442,7 +6439,7 @@ int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog, return -EINVAL; is_global = 
prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL; - err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global); + err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global, 0); /* Compiler optimizations can remove arguments from static functions * or mismatched type can be passed into a global function. @@ -6455,9 +6452,10 @@ int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog, int btf_check_kfunc_arg_match(struct bpf_verifier_env *env, const struct btf *btf, u32 func_id, - struct bpf_reg_state *regs) + struct bpf_reg_state *regs, + u32 kfunc_flags) { - return btf_check_func_arg_match(env, btf, func_id, regs, true); + return btf_check_func_arg_match(env, btf, func_id, regs, true, kfunc_flags); } /* Convert BTF of a function into bpf_reg_state if possible @@ -6854,6 +6852,11 @@ bool btf_id_set_contains(const struct btf_id_set *set, u32 id) return bsearch(&id, set->ids, set->cnt, sizeof(u32), btf_id_cmp_func) != NULL; } +static void *btf_id_set8_contains(const struct btf_id_set8 *set, u32 id) +{ + return bsearch(&id, set->pairs, set->cnt, sizeof(set->pairs[0]), btf_id_cmp_func); +} + enum { BTF_MODULE_F_LIVE = (1 << 0), }; @@ -7102,16 +7105,16 @@ BTF_TRACING_TYPE_xxx /* Kernel Function (kfunc) BTF ID set registration API */ -static int __btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook, - enum btf_kfunc_type type, - struct btf_id_set *add_set, bool vmlinux_set) +static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook, + struct btf_id_set8 *add_set) { + bool vmlinux_set = !btf_is_module(btf); struct btf_kfunc_set_tab *tab; - struct btf_id_set *set; + struct btf_id_set8 *set; u32 set_cnt; int ret; - if (hook >= BTF_KFUNC_HOOK_MAX || type >= BTF_KFUNC_TYPE_MAX) { + if (hook >= BTF_KFUNC_HOOK_MAX) { ret = -EINVAL; goto end; } @@ -7127,7 +7130,7 @@ static int __btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook, btf->kfunc_set_tab = tab; } - set = tab->sets[hook][type]; + set = tab->sets[hook]; /* Warn when register_btf_kfunc_id_set is called twice for the same hook * for module sets. */ @@ -7141,7 +7144,7 @@ static int __btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook, * pointer and return. */ if (!vmlinux_set) { - tab->sets[hook][type] = add_set; + tab->sets[hook] = add_set; return 0; } @@ -7150,7 +7153,7 @@ static int __btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook, * and concatenate all individual sets being registered. While each set * is individually sorted, they may become unsorted when concatenated, * hence re-sorting the final set again is required to make binary - * searching the set using btf_id_set_contains function work. + * searching the set using btf_id_set8_contains function work. */ set_cnt = set ? 
set->cnt : 0; @@ -7165,8 +7168,8 @@ static int __btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook, } /* Grow set */ - set = krealloc(tab->sets[hook][type], - offsetof(struct btf_id_set, ids[set_cnt + add_set->cnt]), + set = krealloc(tab->sets[hook], + offsetof(struct btf_id_set8, pairs[set_cnt + add_set->cnt]), GFP_KERNEL | __GFP_NOWARN); if (!set) { ret = -ENOMEM; @@ -7174,15 +7177,15 @@ static int __btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook, } /* For newly allocated set, initialize set->cnt to 0 */ - if (!tab->sets[hook][type]) + if (!tab->sets[hook]) set->cnt = 0; - tab->sets[hook][type] = set; + tab->sets[hook] = set; /* Concatenate the two sets */ - memcpy(set->ids + set->cnt, add_set->ids, add_set->cnt * sizeof(set->ids[0])); + memcpy(set->pairs + set->cnt, add_set->pairs, add_set->cnt * sizeof(set->pairs[0])); set->cnt += add_set->cnt; - sort(set->ids, set->cnt, sizeof(set->ids[0]), btf_id_cmp_func, NULL); + sort(set->pairs, set->cnt, sizeof(set->pairs[0]), btf_id_cmp_func, NULL); return 0; end: @@ -7190,38 +7193,25 @@ end: return ret; } -static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook, - const struct btf_kfunc_id_set *kset) -{ - bool vmlinux_set = !btf_is_module(btf); - int type, ret = 0; - - for (type = 0; type < ARRAY_SIZE(kset->sets); type++) { - if (!kset->sets[type]) - continue; - - ret = __btf_populate_kfunc_set(btf, hook, type, kset->sets[type], vmlinux_set); - if (ret) - break; - } - return ret; -} - -static bool __btf_kfunc_id_set_contains(const struct btf *btf, +static u32 *__btf_kfunc_id_set_contains(const struct btf *btf, enum btf_kfunc_hook hook, - enum btf_kfunc_type type, u32 kfunc_btf_id) { - struct btf_id_set *set; + struct btf_id_set8 *set; + u32 *id; - if (hook >= BTF_KFUNC_HOOK_MAX || type >= BTF_KFUNC_TYPE_MAX) - return false; + if (hook >= BTF_KFUNC_HOOK_MAX) + return NULL; if (!btf->kfunc_set_tab) - return false; - set = btf->kfunc_set_tab->sets[hook][type]; + return NULL; + set = btf->kfunc_set_tab->sets[hook]; if (!set) - return false; - return btf_id_set_contains(set, kfunc_btf_id); + return NULL; + id = btf_id_set8_contains(set, kfunc_btf_id); + if (!id) + return NULL; + /* The flags for BTF ID are located next to it */ + return id + 1; } static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type) @@ -7249,14 +7239,14 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type) * keeping the reference for the duration of the call provides the necessary * protection for looking up a well-formed btf->kfunc_set_tab. 
*/ -bool btf_kfunc_id_set_contains(const struct btf *btf, +u32 *btf_kfunc_id_set_contains(const struct btf *btf, enum bpf_prog_type prog_type, - enum btf_kfunc_type type, u32 kfunc_btf_id) + u32 kfunc_btf_id) { enum btf_kfunc_hook hook; hook = bpf_prog_type_to_kfunc_hook(prog_type); - return __btf_kfunc_id_set_contains(btf, hook, type, kfunc_btf_id); + return __btf_kfunc_id_set_contains(btf, hook, kfunc_btf_id); } /* This function must be invoked only from initcalls/module init functions */ @@ -7283,7 +7273,7 @@ int register_btf_kfunc_id_set(enum bpf_prog_type prog_type, return PTR_ERR(btf); hook = bpf_prog_type_to_kfunc_hook(prog_type); - ret = btf_populate_kfunc_set(btf, hook, kset); + ret = btf_populate_kfunc_set(btf, hook, kset->set); btf_put(btf); return ret; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7c1e056624f9..096fdac70165 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7562,6 +7562,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int err, insn_idx = *insn_idx_p; const struct btf_param *args; struct btf *desc_btf; + u32 *kfunc_flags; bool acq; /* skip for now, but return error when we find this in fixup_kfunc_call */ @@ -7577,18 +7578,16 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, func_name = btf_name_by_offset(desc_btf, func->name_off); func_proto = btf_type_by_id(desc_btf, func->type); - if (!btf_kfunc_id_set_contains(desc_btf, resolve_prog_type(env->prog), - BTF_KFUNC_TYPE_CHECK, func_id)) { + kfunc_flags = btf_kfunc_id_set_contains(desc_btf, resolve_prog_type(env->prog), func_id); + if (!kfunc_flags) { verbose(env, "calling kernel function %s is not allowed\n", func_name); return -EACCES; } - - acq = btf_kfunc_id_set_contains(desc_btf, resolve_prog_type(env->prog), - BTF_KFUNC_TYPE_ACQUIRE, func_id); + acq = *kfunc_flags & KF_ACQUIRE; /* Check the arguments */ - err = btf_check_kfunc_arg_match(env, desc_btf, func_id, regs); + err = btf_check_kfunc_arg_match(env, desc_btf, func_id, regs, *kfunc_flags); if (err < 0) return err; /* In case of release function, we get register number of refcounted @@ -7632,8 +7631,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, regs[BPF_REG_0].btf = desc_btf; regs[BPF_REG_0].type = PTR_TO_BTF_ID; regs[BPF_REG_0].btf_id = ptr_type_id; - if (btf_kfunc_id_set_contains(desc_btf, resolve_prog_type(env->prog), - BTF_KFUNC_TYPE_RET_NULL, func_id)) { + if (*kfunc_flags & KF_RET_NULL) { regs[BPF_REG_0].type |= PTR_MAYBE_NULL; /* For mark_ptr_or_null_reg, see 93c230e3f5bd6 */ regs[BPF_REG_0].id = ++env->id_gen; -- cgit From 56e948ffc098a780fefb6c1784a3a2c7b81100a1 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Thu, 21 Jul 2022 15:42:36 +0200 Subject: bpf: Add support for forcing kfunc args to be trusted Teach the verifier to detect a new KF_TRUSTED_ARGS kfunc flag, which means each pointer argument must be trusted, which we define as a pointer that is referenced (has non-zero ref_obj_id) and also needs to have its offset unchanged, similar to how release functions expect their argument. This allows a kfunc to receive pointer arguments unchanged from the result of the acquire kfunc. This is required to ensure that kfunc that operate on some object only work on acquired pointers and not normal PTR_TO_BTF_ID with same type which can be obtained by pointer walking. The restrictions applied to release arguments also apply to trusted arguments. 
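For reference, a minimal sketch of how a kfunc id set carrying such flags is declared and registered on the unified set8 infrastructure introduced by the previous patch; the kfunc names here are purely illustrative, not existing kernel symbols:

    #include <linux/bpf.h>
    #include <linux/btf.h>
    #include <linux/btf_ids.h>
    #include <linux/init.h>

    BTF_SET8_START(illustrative_kfunc_ids)
    BTF_ID_FLAGS(func, bpf_illustrative_obj_new, KF_ACQUIRE | KF_RET_NULL)
    BTF_ID_FLAGS(func, bpf_illustrative_obj_process, KF_TRUSTED_ARGS)
    BTF_ID_FLAGS(func, bpf_illustrative_obj_put, KF_RELEASE)
    BTF_SET8_END(illustrative_kfunc_ids)

    static const struct btf_kfunc_id_set illustrative_kfunc_set = {
            .owner = THIS_MODULE,
            .set   = &illustrative_kfunc_ids,
    };

    static int __init illustrative_kfunc_init(void)
    {
            /* make the set visible to the verifier for tracing programs */
            return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING,
                                             &illustrative_kfunc_set);
    }
    late_initcall(illustrative_kfunc_init);

With the flags stored next to each BTF ID, a single btf_kfunc_id_set_contains() lookup tells the verifier both that the kfunc may be called and how its arguments and return value must be treated.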
This implies that strict type matching (not deducing type by recursively following members at offset) and OBJ_RELEASE offset checks (ensuring they are zero) are used for trusted pointer arguments. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220721134245.2450-5-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 4d9c2d88720f..7ac971ea98d1 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6174,10 +6174,10 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, u32 kfunc_flags) { enum bpf_prog_type prog_type = resolve_prog_type(env->prog); + bool rel = false, kptr_get = false, trusted_arg = false; struct bpf_verifier_log *log = &env->log; u32 i, nargs, ref_id, ref_obj_id = 0; bool is_kfunc = btf_is_kernel(btf); - bool rel = false, kptr_get = false; const char *func_name, *ref_tname; const struct btf_type *t, *ref_t; const struct btf_param *args; @@ -6211,6 +6211,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, /* Only kfunc can be release func */ rel = kfunc_flags & KF_RELEASE; kptr_get = kfunc_flags & KF_KPTR_GET; + trusted_arg = kfunc_flags & KF_TRUSTED_ARGS; } /* check that BTF function arguments match actual types that the @@ -6235,10 +6236,19 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, return -EINVAL; } + /* Check if argument must be a referenced pointer, args + i has + * been verified to be a pointer (after skipping modifiers). + */ + if (is_kfunc && trusted_arg && !reg->ref_obj_id) { + bpf_log(log, "R%d must be referenced\n", regno); + return -EINVAL; + } + ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id); ref_tname = btf_name_by_offset(btf, ref_t->name_off); - if (rel && reg->ref_obj_id) + /* Trusted args have the same offset checks as release arguments */ + if (trusted_arg || (rel && reg->ref_obj_id)) arg_type |= OBJ_RELEASE; ret = check_func_arg_reg_off(env, reg, regno, arg_type); if (ret < 0) @@ -6336,7 +6346,8 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, reg_ref_tname = btf_name_by_offset(reg_btf, reg_ref_t->name_off); if (!btf_struct_ids_match(log, reg_btf, reg_ref_id, - reg->off, btf, ref_id, rel && reg->ref_obj_id)) { + reg->off, btf, ref_id, + trusted_arg || (rel && reg->ref_obj_id))) { bpf_log(log, "kernel function %s args#%d expected pointer to %s %s but R%d has a pointer to %s %s\n", func_name, i, btf_type_str(ref_t), ref_tname, -- cgit From 72311809031217714e635b24f8478e6ecb0d93d9 Mon Sep 17 00:00:00 2001 From: Tianyu Lan Date: Thu, 21 Jul 2022 23:38:46 -0400 Subject: swiotlb: clean up some coding style and minor issues - Fix the used field of struct io_tlb_area wasn't initialized - Set area number to be 0 if input area number parameter is 0 - Use array_size() to calculate io_tlb_area array size - Make parameters of swiotlb_do_find_slots() more reasonable Fixes: 26ffb91fa5e0 ("swiotlb: split up the global swiotlb lock") Signed-off-by: Tianyu Lan Reviewed-by: Michael Kelley Signed-off-by: Christoph Hellwig --- kernel/dma/swiotlb.c | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 39dee4004439..cc50f1fb127f 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -119,7 +119,10 @@ static bool round_up_default_nslabs(void) static void swiotlb_adjust_nareas(unsigned 
int nareas) { - if (!is_power_of_2(nareas)) + /* use a single area when non is specified */ + if (!nareas) + nareas = 1; + else if (!is_power_of_2(nareas)) nareas = roundup_pow_of_two(nareas); default_nareas = nareas; @@ -276,6 +279,7 @@ static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start, for (i = 0; i < mem->nareas; i++) { spin_lock_init(&mem->areas[i].lock); mem->areas[i].index = 0; + mem->areas[i].used = 0; } for (i = 0; i < mem->nslabs; i++) { @@ -358,8 +362,8 @@ retry: panic("%s: Failed to allocate %zu bytes align=0x%lx\n", __func__, alloc_size, PAGE_SIZE); - mem->areas = memblock_alloc(sizeof(struct io_tlb_area) * - default_nareas, SMP_CACHE_BYTES); + mem->areas = memblock_alloc(array_size(sizeof(struct io_tlb_area), + default_nareas), SMP_CACHE_BYTES); if (!mem->areas) panic("%s: Failed to allocate mem->areas.\n", __func__); @@ -484,7 +488,7 @@ void __init swiotlb_exit(void) free_pages((unsigned long)mem->slots, get_order(slots_size)); } else { memblock_free_late(__pa(mem->areas), - mem->nareas * sizeof(struct io_tlb_area)); + array_size(sizeof(*mem->areas), mem->nareas)); memblock_free_late(mem->start, tbl_size); memblock_free_late(__pa(mem->slots), slots_size); } @@ -598,11 +602,12 @@ static unsigned int wrap_area_index(struct io_tlb_mem *mem, unsigned int index) * Find a suitable number of IO TLB entries size that will fit this request and * allocate a buffer from that IO TLB pool. */ -static int swiotlb_do_find_slots(struct io_tlb_mem *mem, - struct io_tlb_area *area, int area_index, - struct device *dev, phys_addr_t orig_addr, - size_t alloc_size, unsigned int alloc_align_mask) +static int swiotlb_do_find_slots(struct device *dev, int area_index, + phys_addr_t orig_addr, size_t alloc_size, + unsigned int alloc_align_mask) { + struct io_tlb_mem *mem = dev->dma_io_tlb_mem; + struct io_tlb_area *area = mem->areas + area_index; unsigned long boundary_mask = dma_get_seg_boundary(dev); dma_addr_t tbl_dma_addr = phys_to_dma_unencrypted(dev, mem->start) & boundary_mask; @@ -691,12 +696,11 @@ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr, size_t alloc_size, unsigned int alloc_align_mask) { struct io_tlb_mem *mem = dev->dma_io_tlb_mem; - int start = raw_smp_processor_id() & ((1U << __fls(mem->nareas)) - 1); + int start = raw_smp_processor_id() & (mem->nareas - 1); int i = start, index; do { - index = swiotlb_do_find_slots(mem, mem->areas + i, i, - dev, orig_addr, alloc_size, + index = swiotlb_do_find_slots(dev, i, orig_addr, alloc_size, alloc_align_mask); if (index >= 0) return index; -- cgit From 426752b25377487807455bb2f55c09b01bb6aaf0 Mon Sep 17 00:00:00 2001 From: David Gow Date: Fri, 15 Jul 2022 14:40:52 +0800 Subject: kcsan: test: Add a .kunitconfig to run KCSAN tests Add a .kunitconfig file, which provides a default, working config for running the KCSAN tests. Note that it needs to run on an SMP machine, so to run under kunit_tool, the --qemu_args option should be used (on a supported architecture, like x86_64). 
For example: ./tools/testing/kunit/kunit.py run --arch=x86_64 --qemu_args='-smp 8' --kunitconfig=kernel/kcsan Signed-off-by: David Gow Reviewed-by: Marco Elver Acked-by: Brendan Higgins Tested-by: Daniel Latypov Signed-off-by: Shuah Khan --- kernel/kcsan/.kunitconfig | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 kernel/kcsan/.kunitconfig (limited to 'kernel') diff --git a/kernel/kcsan/.kunitconfig b/kernel/kcsan/.kunitconfig new file mode 100644 index 000000000000..e82f0f52ab0a --- /dev/null +++ b/kernel/kcsan/.kunitconfig @@ -0,0 +1,24 @@ +# Note that the KCSAN tests need to run on an SMP setup. +# Under kunit_tool, this can be done by using the --qemu_args +# option to configure a machine with several cores. For example: +# ./tools/testing/kunit/kunit.py run --kunitconfig=kernel/kcsan \ +# --arch=x86_64 --qemu_args="-smp 8" + +CONFIG_KUNIT=y + +CONFIG_DEBUG_KERNEL=y + +# Need some level of concurrency to test a concurrency sanitizer. +CONFIG_SMP=y + +CONFIG_KCSAN=y +CONFIG_KCSAN_KUNIT_TEST=y + +# Set these if you want to run test_barrier_nothreads +#CONFIG_KCSAN_STRICT=y +#CONFIG_KCSAN_WEAK_MEMORY=y + +# This prevents the test from timing out on many setups. Feel free to remove +# (or alter) this, in conjunction with setting a different test timeout with, +# for example, the --timeout kunit_tool option. +CONFIG_KCSAN_REPORT_ONCE_IN_MS=100 -- cgit From f96f644ab97abeed3f7007c953836a574ce928cc Mon Sep 17 00:00:00 2001 From: Song Liu Date: Tue, 19 Jul 2022 17:21:23 -0700 Subject: ftrace: Add modify_ftrace_direct_multi_nolock This is similar to modify_ftrace_direct_multi, but does not acquire direct_mutex. This is useful when direct_mutex is already locked by the user. Signed-off-by: Song Liu Signed-off-by: Daniel Borkmann Reviewed-by: Steven Rostedt (Google) Link: https://lore.kernel.org/bpf/20220720002126.803253-2-song@kernel.org --- kernel/trace/ftrace.c | 86 +++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 62 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 601ccf1b2f09..5d67dc12231d 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5691,22 +5691,8 @@ int unregister_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) } EXPORT_SYMBOL_GPL(unregister_ftrace_direct_multi); -/** - * modify_ftrace_direct_multi - Modify an existing direct 'multi' call - * to call something else - * @ops: The address of the struct ftrace_ops object - * @addr: The address of the new trampoline to call at @ops functions - * - * This is used to unregister currently registered direct caller and - * register new one @addr on functions registered in @ops object. - * - * Note there's window between ftrace_shutdown and ftrace_startup calls - * where there will be no callbacks called. - * - * Returns: zero on success. Non zero on error, which includes: - * -EINVAL - The @ops object was not properly registered. 
- */ -int modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) +static int +__modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) { struct ftrace_hash *hash; struct ftrace_func_entry *entry, *iter; @@ -5717,12 +5703,7 @@ int modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) int i, size; int err; - if (check_direct_multi(ops)) - return -EINVAL; - if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) - return -EINVAL; - - mutex_lock(&direct_mutex); + lockdep_assert_held_once(&direct_mutex); /* Enable the tmp_ops to have the same functions as the direct ops */ ftrace_ops_init(&tmp_ops); @@ -5730,7 +5711,7 @@ int modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) err = register_ftrace_function(&tmp_ops); if (err) - goto out_direct; + return err; /* * Now the ftrace_ops_list_func() is called to do the direct callers. @@ -5754,7 +5735,64 @@ int modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) /* Removing the tmp_ops will add the updated direct callers to the functions */ unregister_ftrace_function(&tmp_ops); - out_direct: + return err; +} + +/** + * modify_ftrace_direct_multi_nolock - Modify an existing direct 'multi' call + * to call something else + * @ops: The address of the struct ftrace_ops object + * @addr: The address of the new trampoline to call at @ops functions + * + * This is used to unregister currently registered direct caller and + * register new one @addr on functions registered in @ops object. + * + * Note there's window between ftrace_shutdown and ftrace_startup calls + * where there will be no callbacks called. + * + * Caller should already have direct_mutex locked, so we don't lock + * direct_mutex here. + * + * Returns: zero on success. Non zero on error, which includes: + * -EINVAL - The @ops object was not properly registered. + */ +int modify_ftrace_direct_multi_nolock(struct ftrace_ops *ops, unsigned long addr) +{ + if (check_direct_multi(ops)) + return -EINVAL; + if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) + return -EINVAL; + + return __modify_ftrace_direct_multi(ops, addr); +} +EXPORT_SYMBOL_GPL(modify_ftrace_direct_multi_nolock); + +/** + * modify_ftrace_direct_multi - Modify an existing direct 'multi' call + * to call something else + * @ops: The address of the struct ftrace_ops object + * @addr: The address of the new trampoline to call at @ops functions + * + * This is used to unregister currently registered direct caller and + * register new one @addr on functions registered in @ops object. + * + * Note there's window between ftrace_shutdown and ftrace_startup calls + * where there will be no callbacks called. + * + * Returns: zero on success. Non zero on error, which includes: + * -EINVAL - The @ops object was not properly registered. + */ +int modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) +{ + int err; + + if (check_direct_multi(ops)) + return -EINVAL; + if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) + return -EINVAL; + + mutex_lock(&direct_mutex); + err = __modify_ftrace_direct_multi(ops, addr); mutex_unlock(&direct_mutex); return err; } -- cgit From 53cd885bc5c3ea283cc9c00ca6446c778f00bfba Mon Sep 17 00:00:00 2001 From: Song Liu Date: Tue, 19 Jul 2022 17:21:24 -0700 Subject: ftrace: Allow IPMODIFY and DIRECT ops on the same function IPMODIFY (livepatch) and DIRECT (bpf trampoline) ops are both important users of ftrace. It is necessary to allow them work on the same function at the same time. First, DIRECT ops no longer specify IPMODIFY flag. 
Instead, DIRECT flag is handled together with IPMODIFY flag in __ftrace_hash_update_ipmodify(). Then, a callback function, ops_func, is added to ftrace_ops. This is used by ftrace core code to understand whether the DIRECT ops can share with an IPMODIFY ops. To share with IPMODIFY ops, the DIRECT ops need to implement the callback function and adjust the direct trampoline accordingly. If DIRECT ops is attached before the IPMODIFY ops, ftrace core code calls ENABLE_SHARE_IPMODIFY_PEER on the DIRECT ops before registering the IPMODIFY ops. If IPMODIFY ops is attached before the DIRECT ops, ftrace core code calls ENABLE_SHARE_IPMODIFY_SELF in __ftrace_hash_update_ipmodify. Owner of the DIRECT ops may return 0 if the DIRECT trampoline can share with IPMODIFY, so error code otherwise. The error code is propagated to register_ftrace_direct_multi so that onwer of the DIRECT trampoline can handle it properly. For more details, please refer to comment before enum ftrace_ops_cmd. Signed-off-by: Song Liu Signed-off-by: Daniel Borkmann Reviewed-by: Steven Rostedt (Google) Link: https://lore.kernel.org/all/20220602193706.2607681-2-song@kernel.org/ Link: https://lore.kernel.org/all/20220718055449.3960512-1-song@kernel.org/ Link: https://lore.kernel.org/bpf/20220720002126.803253-3-song@kernel.org --- kernel/trace/ftrace.c | 242 ++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 216 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 5d67dc12231d..bc921a3f7ea8 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1861,6 +1861,8 @@ static void ftrace_hash_rec_enable_modify(struct ftrace_ops *ops, ftrace_hash_rec_update_modify(ops, filter_hash, 1); } +static bool ops_references_ip(struct ftrace_ops *ops, unsigned long ip); + /* * Try to update IPMODIFY flag on each ftrace_rec. Return 0 if it is OK * or no-needed to update, -EBUSY if it detects a conflict of the flag @@ -1869,6 +1871,13 @@ static void ftrace_hash_rec_enable_modify(struct ftrace_ops *ops, * - If the hash is NULL, it hits all recs (if IPMODIFY is set, this is rejected) * - If the hash is EMPTY_HASH, it hits nothing * - Anything else hits the recs which match the hash entries. + * + * DIRECT ops does not have IPMODIFY flag, but we still need to check it + * against functions with FTRACE_FL_IPMODIFY. If there is any overlap, call + * ops_func(SHARE_IPMODIFY_SELF) to make sure current ops can share with + * IPMODIFY. If ops_func(SHARE_IPMODIFY_SELF) returns non-zero, propagate + * the return value to the caller and eventually to the owner of the DIRECT + * ops. */ static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops, struct ftrace_hash *old_hash, @@ -1877,17 +1886,26 @@ static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops, struct ftrace_page *pg; struct dyn_ftrace *rec, *end = NULL; int in_old, in_new; + bool is_ipmodify, is_direct; /* Only update if the ops has been registered */ if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) return 0; - if (!(ops->flags & FTRACE_OPS_FL_IPMODIFY)) + is_ipmodify = ops->flags & FTRACE_OPS_FL_IPMODIFY; + is_direct = ops->flags & FTRACE_OPS_FL_DIRECT; + + /* neither IPMODIFY nor DIRECT, skip */ + if (!is_ipmodify && !is_direct) + return 0; + + if (WARN_ON_ONCE(is_ipmodify && is_direct)) return 0; /* - * Since the IPMODIFY is a very address sensitive action, we do not - * allow ftrace_ops to set all functions to new hash. 
+ * Since the IPMODIFY and DIRECT are very address sensitive + * actions, we do not allow ftrace_ops to set all functions to new + * hash. */ if (!new_hash || !old_hash) return -EINVAL; @@ -1905,12 +1923,32 @@ static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops, continue; if (in_new) { - /* New entries must ensure no others are using it */ - if (rec->flags & FTRACE_FL_IPMODIFY) - goto rollback; - rec->flags |= FTRACE_FL_IPMODIFY; - } else /* Removed entry */ + if (rec->flags & FTRACE_FL_IPMODIFY) { + int ret; + + /* Cannot have two ipmodify on same rec */ + if (is_ipmodify) + goto rollback; + + FTRACE_WARN_ON(rec->flags & FTRACE_FL_DIRECT); + + /* + * Another ops with IPMODIFY is already + * attached. We are now attaching a direct + * ops. Run SHARE_IPMODIFY_SELF, to check + * whether sharing is supported. + */ + if (!ops->ops_func) + return -EBUSY; + ret = ops->ops_func(ops, FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_SELF); + if (ret) + return ret; + } else if (is_ipmodify) { + rec->flags |= FTRACE_FL_IPMODIFY; + } + } else if (is_ipmodify) { rec->flags &= ~FTRACE_FL_IPMODIFY; + } } while_for_each_ftrace_rec(); return 0; @@ -2454,8 +2492,7 @@ static void call_direct_funcs(unsigned long ip, unsigned long pip, struct ftrace_ops direct_ops = { .func = call_direct_funcs, - .flags = FTRACE_OPS_FL_IPMODIFY - | FTRACE_OPS_FL_DIRECT | FTRACE_OPS_FL_SAVE_REGS + .flags = FTRACE_OPS_FL_DIRECT | FTRACE_OPS_FL_SAVE_REGS | FTRACE_OPS_FL_PERMANENT, /* * By declaring the main trampoline as this trampoline @@ -3072,14 +3109,14 @@ static inline int ops_traces_mod(struct ftrace_ops *ops) } /* - * Check if the current ops references the record. + * Check if the current ops references the given ip. * * If the ops traces all functions, then it was already accounted for. * If the ops does not trace the current record function, skip it. * If the ops ignores the function via notrace filter, skip it. */ -static inline bool -ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec) +static bool +ops_references_ip(struct ftrace_ops *ops, unsigned long ip) { /* If ops isn't enabled, ignore it */ if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) @@ -3091,16 +3128,29 @@ ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec) /* The function must be in the filter */ if (!ftrace_hash_empty(ops->func_hash->filter_hash) && - !__ftrace_lookup_ip(ops->func_hash->filter_hash, rec->ip)) + !__ftrace_lookup_ip(ops->func_hash->filter_hash, ip)) return false; /* If in notrace hash, we ignore it too */ - if (ftrace_lookup_ip(ops->func_hash->notrace_hash, rec->ip)) + if (ftrace_lookup_ip(ops->func_hash->notrace_hash, ip)) return false; return true; } +/* + * Check if the current ops references the record. + * + * If the ops traces all functions, then it was already accounted for. + * If the ops does not trace the current record function, skip it. + * If the ops ignores the function via notrace filter, skip it. 
+ */ +static bool +ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec) +{ + return ops_references_ip(ops, rec->ip); +} + static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs) { bool init_nop = ftrace_need_init_nop(); @@ -5215,6 +5265,8 @@ static struct ftrace_direct_func *ftrace_alloc_direct_func(unsigned long addr) return direct; } +static int register_ftrace_function_nolock(struct ftrace_ops *ops); + /** * register_ftrace_direct - Call a custom trampoline directly * @ip: The address of the nop at the beginning of a function @@ -5286,7 +5338,7 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr) ret = ftrace_set_filter_ip(&direct_ops, ip, 0, 0); if (!ret && !(direct_ops.flags & FTRACE_OPS_FL_ENABLED)) { - ret = register_ftrace_function(&direct_ops); + ret = register_ftrace_function_nolock(&direct_ops); if (ret) ftrace_set_filter_ip(&direct_ops, ip, 1, 0); } @@ -5545,8 +5597,7 @@ int modify_ftrace_direct(unsigned long ip, } EXPORT_SYMBOL_GPL(modify_ftrace_direct); -#define MULTI_FLAGS (FTRACE_OPS_FL_IPMODIFY | FTRACE_OPS_FL_DIRECT | \ - FTRACE_OPS_FL_SAVE_REGS) +#define MULTI_FLAGS (FTRACE_OPS_FL_DIRECT | FTRACE_OPS_FL_SAVE_REGS) static int check_direct_multi(struct ftrace_ops *ops) { @@ -5639,7 +5690,7 @@ int register_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) ops->flags = MULTI_FLAGS; ops->trampoline = FTRACE_REGS_ADDR; - err = register_ftrace_function(ops); + err = register_ftrace_function_nolock(ops); out_remove: if (err) @@ -5709,7 +5760,7 @@ __modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) ftrace_ops_init(&tmp_ops); tmp_ops.func_hash = ops->func_hash; - err = register_ftrace_function(&tmp_ops); + err = register_ftrace_function_nolock(&tmp_ops); if (err) return err; @@ -8003,6 +8054,143 @@ int ftrace_is_dead(void) return ftrace_disabled; } +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS +/* + * When registering ftrace_ops with IPMODIFY, it is necessary to make sure + * it doesn't conflict with any direct ftrace_ops. If there is existing + * direct ftrace_ops on a kernel function being patched, call + * FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_PEER on it to enable sharing. + * + * @ops: ftrace_ops being registered. + * + * Returns: + * 0 on success; + * Negative on failure. + */ +static int prepare_direct_functions_for_ipmodify(struct ftrace_ops *ops) +{ + struct ftrace_func_entry *entry; + struct ftrace_hash *hash; + struct ftrace_ops *op; + int size, i, ret; + + lockdep_assert_held_once(&direct_mutex); + + if (!(ops->flags & FTRACE_OPS_FL_IPMODIFY)) + return 0; + + hash = ops->func_hash->filter_hash; + size = 1 << hash->size_bits; + for (i = 0; i < size; i++) { + hlist_for_each_entry(entry, &hash->buckets[i], hlist) { + unsigned long ip = entry->ip; + bool found_op = false; + + mutex_lock(&ftrace_lock); + do_for_each_ftrace_op(op, ftrace_ops_list) { + if (!(op->flags & FTRACE_OPS_FL_DIRECT)) + continue; + if (ops_references_ip(op, ip)) { + found_op = true; + break; + } + } while_for_each_ftrace_op(op); + mutex_unlock(&ftrace_lock); + + if (found_op) { + if (!op->ops_func) + return -EBUSY; + + ret = op->ops_func(op, FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_PEER); + if (ret) + return ret; + } + } + } + + return 0; +} + +/* + * Similar to prepare_direct_functions_for_ipmodify, clean up after ops + * with IPMODIFY is unregistered. The cleanup is optional for most DIRECT + * ops. 
+ */ +static void cleanup_direct_functions_after_ipmodify(struct ftrace_ops *ops) +{ + struct ftrace_func_entry *entry; + struct ftrace_hash *hash; + struct ftrace_ops *op; + int size, i; + + if (!(ops->flags & FTRACE_OPS_FL_IPMODIFY)) + return; + + mutex_lock(&direct_mutex); + + hash = ops->func_hash->filter_hash; + size = 1 << hash->size_bits; + for (i = 0; i < size; i++) { + hlist_for_each_entry(entry, &hash->buckets[i], hlist) { + unsigned long ip = entry->ip; + bool found_op = false; + + mutex_lock(&ftrace_lock); + do_for_each_ftrace_op(op, ftrace_ops_list) { + if (!(op->flags & FTRACE_OPS_FL_DIRECT)) + continue; + if (ops_references_ip(op, ip)) { + found_op = true; + break; + } + } while_for_each_ftrace_op(op); + mutex_unlock(&ftrace_lock); + + /* The cleanup is optional, ignore any errors */ + if (found_op && op->ops_func) + op->ops_func(op, FTRACE_OPS_CMD_DISABLE_SHARE_IPMODIFY_PEER); + } + } + mutex_unlock(&direct_mutex); +} + +#define lock_direct_mutex() mutex_lock(&direct_mutex) +#define unlock_direct_mutex() mutex_unlock(&direct_mutex) + +#else /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ + +static int prepare_direct_functions_for_ipmodify(struct ftrace_ops *ops) +{ + return 0; +} + +static void cleanup_direct_functions_after_ipmodify(struct ftrace_ops *ops) +{ +} + +#define lock_direct_mutex() do { } while (0) +#define unlock_direct_mutex() do { } while (0) + +#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ + +/* + * Similar to register_ftrace_function, except we don't lock direct_mutex. + */ +static int register_ftrace_function_nolock(struct ftrace_ops *ops) +{ + int ret; + + ftrace_ops_init(ops); + + mutex_lock(&ftrace_lock); + + ret = ftrace_startup(ops, 0); + + mutex_unlock(&ftrace_lock); + + return ret; +} + /** * register_ftrace_function - register a function for profiling * @ops: ops structure that holds the function for profiling. @@ -8018,14 +8206,15 @@ int register_ftrace_function(struct ftrace_ops *ops) { int ret; - ftrace_ops_init(ops); - - mutex_lock(&ftrace_lock); - - ret = ftrace_startup(ops, 0); + lock_direct_mutex(); + ret = prepare_direct_functions_for_ipmodify(ops); + if (ret < 0) + goto out_unlock; - mutex_unlock(&ftrace_lock); + ret = register_ftrace_function_nolock(ops); +out_unlock: + unlock_direct_mutex(); return ret; } EXPORT_SYMBOL_GPL(register_ftrace_function); @@ -8044,6 +8233,7 @@ int unregister_ftrace_function(struct ftrace_ops *ops) ret = ftrace_shutdown(ops, 0); mutex_unlock(&ftrace_lock); + cleanup_direct_functions_after_ipmodify(ops); return ret; } EXPORT_SYMBOL_GPL(unregister_ftrace_function); -- cgit From 00963a2e75a872e5fce4d0115ac2786ec86b57a6 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Tue, 19 Jul 2022 17:21:26 -0700 Subject: bpf: Support bpf_trampoline on functions with IPMODIFY (e.g. livepatch) When tracing a function with IPMODIFY ftrace_ops (livepatch), the bpf trampoline must follow the instruction pointer saved on stack. This needs extra handling for bpf trampolines with BPF_TRAMP_F_CALL_ORIG flag. Implement bpf_tramp_ftrace_ops_func and use it for the ftrace_ops used by BPF trampoline. This enables tracing functions with livepatch. This also requires moving bpf trampoline to *_ftrace_direct_mult APIs. 
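For orientation, the general shape of the callback a DIRECT ops owner supplies is roughly as follows; this is a simplified, hypothetical sketch of the contract, while the real implementation is the bpf_tramp_ftrace_ops_func added in the diff below:

    #include <linux/ftrace.h>

    /* Hypothetical sketch of a direct-ops owner opting into IPMODIFY sharing. */
    static int sample_ops_func(struct ftrace_ops *ops, enum ftrace_ops_cmd cmd)
    {
            switch (cmd) {
            case FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_SELF:
            case FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_PEER:
                    /*
                     * Regenerate the direct trampoline so that it follows
                     * regs->ip, or return an error if sharing is impossible.
                     */
                    return 0;
            case FTRACE_OPS_CMD_DISABLE_SHARE_IPMODIFY_PEER:
                    /* optional: switch back to the non-sharing trampoline */
                    return 0;
            }
            return -EINVAL;
    }

    static struct ftrace_ops sample_direct_ops = {
            .flags    = FTRACE_OPS_FL_DIRECT | FTRACE_OPS_FL_SAVE_REGS,
            .ops_func = sample_ops_func,
    };

ftrace issues the _SELF command from __ftrace_hash_update_ipmodify() when the direct ops itself is attached over an existing IPMODIFY user, and the _PEER commands from prepare_direct_functions_for_ipmodify() and cleanup_direct_functions_after_ipmodify() when an IPMODIFY ops is registered or unregistered afterwards.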
Signed-off-by: Song Liu Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/all/20220602193706.2607681-2-song@kernel.org/ Link: https://lore.kernel.org/bpf/20220720002126.803253-5-song@kernel.org --- kernel/bpf/trampoline.c | 158 ++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 141 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 6691dbf9e467..42e387a12694 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -13,6 +13,7 @@ #include #include #include +#include /* dummy _ops. The verifier will operate on target program's ops. */ const struct bpf_verifier_ops bpf_extension_verifier_ops = { @@ -29,6 +30,81 @@ static struct hlist_head trampoline_table[TRAMPOLINE_TABLE_SIZE]; /* serializes access to trampoline_table */ static DEFINE_MUTEX(trampoline_mutex); +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS +static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mutex); + +static int bpf_tramp_ftrace_ops_func(struct ftrace_ops *ops, enum ftrace_ops_cmd cmd) +{ + struct bpf_trampoline *tr = ops->private; + int ret = 0; + + if (cmd == FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_SELF) { + /* This is called inside register_ftrace_direct_multi(), so + * tr->mutex is already locked. + */ + lockdep_assert_held_once(&tr->mutex); + + /* Instead of updating the trampoline here, we propagate + * -EAGAIN to register_ftrace_direct_multi(). Then we can + * retry register_ftrace_direct_multi() after updating the + * trampoline. + */ + if ((tr->flags & BPF_TRAMP_F_CALL_ORIG) && + !(tr->flags & BPF_TRAMP_F_ORIG_STACK)) { + if (WARN_ON_ONCE(tr->flags & BPF_TRAMP_F_SHARE_IPMODIFY)) + return -EBUSY; + + tr->flags |= BPF_TRAMP_F_SHARE_IPMODIFY; + return -EAGAIN; + } + + return 0; + } + + /* The normal locking order is + * tr->mutex => direct_mutex (ftrace.c) => ftrace_lock (ftrace.c) + * + * The following two commands are called from + * + * prepare_direct_functions_for_ipmodify + * cleanup_direct_functions_after_ipmodify + * + * In both cases, direct_mutex is already locked. Use + * mutex_trylock(&tr->mutex) to avoid deadlock in race condition + * (something else is making changes to this same trampoline). + */ + if (!mutex_trylock(&tr->mutex)) { + /* sleep 1 ms to make sure whatever holding tr->mutex makes + * some progress. 
+ */ + msleep(1); + return -EAGAIN; + } + + switch (cmd) { + case FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_PEER: + tr->flags |= BPF_TRAMP_F_SHARE_IPMODIFY; + + if ((tr->flags & BPF_TRAMP_F_CALL_ORIG) && + !(tr->flags & BPF_TRAMP_F_ORIG_STACK)) + ret = bpf_trampoline_update(tr, false /* lock_direct_mutex */); + break; + case FTRACE_OPS_CMD_DISABLE_SHARE_IPMODIFY_PEER: + tr->flags &= ~BPF_TRAMP_F_SHARE_IPMODIFY; + + if (tr->flags & BPF_TRAMP_F_ORIG_STACK) + ret = bpf_trampoline_update(tr, false /* lock_direct_mutex */); + break; + default: + ret = -EINVAL; + break; + }; + + mutex_unlock(&tr->mutex); + return ret; +} +#endif + bool bpf_prog_has_trampoline(const struct bpf_prog *prog) { enum bpf_attach_type eatype = prog->expected_attach_type; @@ -89,6 +165,16 @@ static struct bpf_trampoline *bpf_trampoline_lookup(u64 key) tr = kzalloc(sizeof(*tr), GFP_KERNEL); if (!tr) goto out; +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS + tr->fops = kzalloc(sizeof(struct ftrace_ops), GFP_KERNEL); + if (!tr->fops) { + kfree(tr); + tr = NULL; + goto out; + } + tr->fops->private = tr; + tr->fops->ops_func = bpf_tramp_ftrace_ops_func; +#endif tr->key = key; INIT_HLIST_NODE(&tr->hlist); @@ -128,7 +214,7 @@ static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr) int ret; if (tr->func.ftrace_managed) - ret = unregister_ftrace_direct((long)ip, (long)old_addr); + ret = unregister_ftrace_direct_multi(tr->fops, (long)old_addr); else ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, NULL); @@ -137,15 +223,20 @@ static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr) return ret; } -static int modify_fentry(struct bpf_trampoline *tr, void *old_addr, void *new_addr) +static int modify_fentry(struct bpf_trampoline *tr, void *old_addr, void *new_addr, + bool lock_direct_mutex) { void *ip = tr->func.addr; int ret; - if (tr->func.ftrace_managed) - ret = modify_ftrace_direct((long)ip, (long)old_addr, (long)new_addr); - else + if (tr->func.ftrace_managed) { + if (lock_direct_mutex) + ret = modify_ftrace_direct_multi(tr->fops, (long)new_addr); + else + ret = modify_ftrace_direct_multi_nolock(tr->fops, (long)new_addr); + } else { ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, new_addr); + } return ret; } @@ -163,10 +254,12 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr) if (bpf_trampoline_module_get(tr)) return -ENOENT; - if (tr->func.ftrace_managed) - ret = register_ftrace_direct((long)ip, (long)new_addr); - else + if (tr->func.ftrace_managed) { + ftrace_set_filter_ip(tr->fops, (unsigned long)ip, 0, 0); + ret = register_ftrace_direct_multi(tr->fops, (long)new_addr); + } else { ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, NULL, new_addr); + } if (ret) bpf_trampoline_module_put(tr); @@ -332,11 +425,11 @@ out: return ERR_PTR(err); } -static int bpf_trampoline_update(struct bpf_trampoline *tr) +static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mutex) { struct bpf_tramp_image *im; struct bpf_tramp_links *tlinks; - u32 flags = BPF_TRAMP_F_RESTORE_REGS; + u32 orig_flags = tr->flags; bool ip_arg = false; int err, total; @@ -358,18 +451,31 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr) goto out; } + /* clear all bits except SHARE_IPMODIFY */ + tr->flags &= BPF_TRAMP_F_SHARE_IPMODIFY; + if (tlinks[BPF_TRAMP_FEXIT].nr_links || - tlinks[BPF_TRAMP_MODIFY_RETURN].nr_links) + tlinks[BPF_TRAMP_MODIFY_RETURN].nr_links) { /* NOTE: BPF_TRAMP_F_RESTORE_REGS and BPF_TRAMP_F_SKIP_FRAME * should not be set together. 
*/ - flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME; + tr->flags |= BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME; + } else { + tr->flags |= BPF_TRAMP_F_RESTORE_REGS; + } if (ip_arg) - flags |= BPF_TRAMP_F_IP_ARG; + tr->flags |= BPF_TRAMP_F_IP_ARG; + +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS +again: + if ((tr->flags & BPF_TRAMP_F_SHARE_IPMODIFY) && + (tr->flags & BPF_TRAMP_F_CALL_ORIG)) + tr->flags |= BPF_TRAMP_F_ORIG_STACK; +#endif err = arch_prepare_bpf_trampoline(im, im->image, im->image + PAGE_SIZE, - &tr->func.model, flags, tlinks, + &tr->func.model, tr->flags, tlinks, tr->func.addr); if (err < 0) goto out; @@ -378,17 +484,34 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr) WARN_ON(!tr->cur_image && tr->selector); if (tr->cur_image) /* progs already running at this address */ - err = modify_fentry(tr, tr->cur_image->image, im->image); + err = modify_fentry(tr, tr->cur_image->image, im->image, lock_direct_mutex); else /* first time registering */ err = register_fentry(tr, im->image); + +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS + if (err == -EAGAIN) { + /* -EAGAIN from bpf_tramp_ftrace_ops_func. Now + * BPF_TRAMP_F_SHARE_IPMODIFY is set, we can generate the + * trampoline again, and retry register. + */ + /* reset fops->func and fops->trampoline for re-register */ + tr->fops->func = NULL; + tr->fops->trampoline = 0; + goto again; + } +#endif if (err) goto out; + if (tr->cur_image) bpf_tramp_image_put(tr->cur_image); tr->cur_image = im; tr->selector++; out: + /* If any error happens, restore previous flags */ + if (err) + tr->flags = orig_flags; kfree(tlinks); return err; } @@ -454,7 +577,7 @@ static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_tr hlist_add_head(&link->tramp_hlist, &tr->progs_hlist[kind]); tr->progs_cnt[kind]++; - err = bpf_trampoline_update(tr); + err = bpf_trampoline_update(tr, true /* lock_direct_mutex */); if (err) { hlist_del_init(&link->tramp_hlist); tr->progs_cnt[kind]--; @@ -487,7 +610,7 @@ static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_ } hlist_del_init(&link->tramp_hlist); tr->progs_cnt[kind]--; - return bpf_trampoline_update(tr); + return bpf_trampoline_update(tr, true /* lock_direct_mutex */); } /* bpf_trampoline_unlink_prog() should never fail. */ @@ -715,6 +838,7 @@ void bpf_trampoline_put(struct bpf_trampoline *tr) * multiple rcu callbacks. */ hlist_del(&tr->hlist); + kfree(tr->fops); kfree(tr); out: mutex_unlock(&trampoline_mutex); -- cgit From ea2babac63d40e59926dc5de4550dac94cc3c6d2 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Wed, 13 Jul 2022 13:49:50 -0700 Subject: bpf: Simplify bpf_prog_pack_[size|mask] Simplify the logic that selects bpf_prog_pack_size, and always use (PMD_SIZE * num_possible_nodes()). This is a good tradeoff, as most of the performance benefit observed is from less direct map fragmentation [0]. Also, module_alloc(4MB) may not allocate 4MB aligned memory. Therefore, we cannot use (ptr & bpf_prog_pack_mask) to find the correct address of bpf_prog_pack. Fix this by checking the header address falls in the range of pack->ptr and (pack->ptr + bpf_prog_pack_size). 
[0] https://lore.kernel.org/bpf/20220707223546.4124919-1-song@kernel.org/ Signed-off-by: Song Liu Signed-off-by: Daniel Borkmann Reviewed-by: Stanislav Fomichev Link: https://lore.kernel.org/bpf/20220713204950.3015201-1-song@kernel.org --- kernel/bpf/core.c | 71 +++++++++++++------------------------------------------ 1 file changed, 17 insertions(+), 54 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index cfb8a50a9f12..72d0721318e1 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -825,15 +825,6 @@ struct bpf_prog_pack { #define BPF_PROG_SIZE_TO_NBITS(size) (round_up(size, BPF_PROG_CHUNK_SIZE) / BPF_PROG_CHUNK_SIZE) -static size_t bpf_prog_pack_size = -1; -static size_t bpf_prog_pack_mask = -1; - -static int bpf_prog_chunk_count(void) -{ - WARN_ON_ONCE(bpf_prog_pack_size == -1); - return bpf_prog_pack_size / BPF_PROG_CHUNK_SIZE; -} - static DEFINE_MUTEX(pack_mutex); static LIST_HEAD(pack_list); @@ -841,55 +832,33 @@ static LIST_HEAD(pack_list); * CONFIG_MMU=n. Use PAGE_SIZE in these cases. */ #ifdef PMD_SIZE -#define BPF_HPAGE_SIZE PMD_SIZE -#define BPF_HPAGE_MASK PMD_MASK +#define BPF_PROG_PACK_SIZE (PMD_SIZE * num_possible_nodes()) #else -#define BPF_HPAGE_SIZE PAGE_SIZE -#define BPF_HPAGE_MASK PAGE_MASK +#define BPF_PROG_PACK_SIZE PAGE_SIZE #endif -static size_t select_bpf_prog_pack_size(void) -{ - size_t size; - void *ptr; - - size = BPF_HPAGE_SIZE * num_online_nodes(); - ptr = module_alloc(size); - - /* Test whether we can get huge pages. If not just use PAGE_SIZE - * packs. - */ - if (!ptr || !is_vm_area_hugepages(ptr)) { - size = PAGE_SIZE; - bpf_prog_pack_mask = PAGE_MASK; - } else { - bpf_prog_pack_mask = BPF_HPAGE_MASK; - } - - vfree(ptr); - return size; -} +#define BPF_PROG_CHUNK_COUNT (BPF_PROG_PACK_SIZE / BPF_PROG_CHUNK_SIZE) static struct bpf_prog_pack *alloc_new_pack(bpf_jit_fill_hole_t bpf_fill_ill_insns) { struct bpf_prog_pack *pack; - pack = kzalloc(struct_size(pack, bitmap, BITS_TO_LONGS(bpf_prog_chunk_count())), + pack = kzalloc(struct_size(pack, bitmap, BITS_TO_LONGS(BPF_PROG_CHUNK_COUNT)), GFP_KERNEL); if (!pack) return NULL; - pack->ptr = module_alloc(bpf_prog_pack_size); + pack->ptr = module_alloc(BPF_PROG_PACK_SIZE); if (!pack->ptr) { kfree(pack); return NULL; } - bpf_fill_ill_insns(pack->ptr, bpf_prog_pack_size); - bitmap_zero(pack->bitmap, bpf_prog_pack_size / BPF_PROG_CHUNK_SIZE); + bpf_fill_ill_insns(pack->ptr, BPF_PROG_PACK_SIZE); + bitmap_zero(pack->bitmap, BPF_PROG_PACK_SIZE / BPF_PROG_CHUNK_SIZE); list_add_tail(&pack->list, &pack_list); set_vm_flush_reset_perms(pack->ptr); - set_memory_ro((unsigned long)pack->ptr, bpf_prog_pack_size / PAGE_SIZE); - set_memory_x((unsigned long)pack->ptr, bpf_prog_pack_size / PAGE_SIZE); + set_memory_ro((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE); + set_memory_x((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE); return pack; } @@ -901,10 +870,7 @@ static void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insn void *ptr = NULL; mutex_lock(&pack_mutex); - if (bpf_prog_pack_size == -1) - bpf_prog_pack_size = select_bpf_prog_pack_size(); - - if (size > bpf_prog_pack_size) { + if (size > BPF_PROG_PACK_SIZE) { size = round_up(size, PAGE_SIZE); ptr = module_alloc(size); if (ptr) { @@ -916,9 +882,9 @@ static void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insn goto out; } list_for_each_entry(pack, &pack_list, list) { - pos = bitmap_find_next_zero_area(pack->bitmap, bpf_prog_chunk_count(), 0, + pos = 
bitmap_find_next_zero_area(pack->bitmap, BPF_PROG_CHUNK_COUNT, 0, nbits, 0); - if (pos < bpf_prog_chunk_count()) + if (pos < BPF_PROG_CHUNK_COUNT) goto found_free_area; } @@ -942,18 +908,15 @@ static void bpf_prog_pack_free(struct bpf_binary_header *hdr) struct bpf_prog_pack *pack = NULL, *tmp; unsigned int nbits; unsigned long pos; - void *pack_ptr; mutex_lock(&pack_mutex); - if (hdr->size > bpf_prog_pack_size) { + if (hdr->size > BPF_PROG_PACK_SIZE) { module_memfree(hdr); goto out; } - pack_ptr = (void *)((unsigned long)hdr & bpf_prog_pack_mask); - list_for_each_entry(tmp, &pack_list, list) { - if (tmp->ptr == pack_ptr) { + if ((void *)hdr >= tmp->ptr && (tmp->ptr + BPF_PROG_PACK_SIZE) > (void *)hdr) { pack = tmp; break; } @@ -963,14 +926,14 @@ static void bpf_prog_pack_free(struct bpf_binary_header *hdr) goto out; nbits = BPF_PROG_SIZE_TO_NBITS(hdr->size); - pos = ((unsigned long)hdr - (unsigned long)pack_ptr) >> BPF_PROG_CHUNK_SHIFT; + pos = ((unsigned long)hdr - (unsigned long)pack->ptr) >> BPF_PROG_CHUNK_SHIFT; WARN_ONCE(bpf_arch_text_invalidate(hdr, hdr->size), "bpf_prog_pack bug: missing bpf_arch_text_invalidate?\n"); bitmap_clear(pack->bitmap, pos, nbits); - if (bitmap_find_next_zero_area(pack->bitmap, bpf_prog_chunk_count(), 0, - bpf_prog_chunk_count(), 0) == 0) { + if (bitmap_find_next_zero_area(pack->bitmap, BPF_PROG_CHUNK_COUNT, 0, + BPF_PROG_CHUNK_COUNT, 0) == 0) { list_del(&pack->list); module_memfree(pack->ptr); kfree(pack); -- cgit From 671c11f0619e5ccb380bcf0f062f69ba95fc974a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 14 Jul 2022 18:38:15 -1000 Subject: cgroup: Elide write-locking threadgroup_rwsem when updating csses on an empty subtree MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cgroup_update_dfl_csses() write-lock the threadgroup_rwsem as updating the csses can trigger process migrations. However, if the subtree doesn't contain any tasks, there aren't gonna be any cgroup migrations. This condition can be trivially detected by testing whether mgctx.preloaded_src_csets is empty. Elide write-locking threadgroup_rwsem if the subtree is empty. After this optimization, the usage pattern of creating a cgroup, enabling the necessary controllers, and then seeding it with CLONE_INTO_CGROUP and then removing the cgroup after it becomes empty doesn't need to write-lock threadgroup_rwsem at all. Signed-off-by: Tejun Heo Cc: Christian Brauner Cc: Michal Koutný --- kernel/cgroup/cgroup.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 64e0f644adfa..f8e00affe007 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -2933,12 +2933,11 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) struct cgroup_subsys_state *d_css; struct cgroup *dsct; struct css_set *src_cset; + bool has_tasks; int ret; lockdep_assert_held(&cgroup_mutex); - percpu_down_write(&cgroup_threadgroup_rwsem); - /* look up all csses currently attached to @cgrp's subtree */ spin_lock_irq(&css_set_lock); cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { @@ -2949,6 +2948,16 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) } spin_unlock_irq(&css_set_lock); + /* + * We need to write-lock threadgroup_rwsem while migrating tasks. + * However, if there are no source csets for @cgrp, changing its + * controllers isn't gonna produce any task migrations and the + * write-locking can be skipped safely. 
+ */ + has_tasks = !list_empty(&mgctx.preloaded_src_csets); + if (has_tasks) + percpu_down_write(&cgroup_threadgroup_rwsem); + /* NULL dst indicates self on default hierarchy */ ret = cgroup_migrate_prepare_dst(&mgctx); if (ret) @@ -2967,7 +2976,8 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) ret = cgroup_migrate_execute(&mgctx); out_finish: cgroup_migrate_finish(&mgctx); - percpu_up_write(&cgroup_threadgroup_rwsem); + if (has_tasks) + percpu_up_write(&cgroup_threadgroup_rwsem); return ret; } -- cgit From 30312730bd029f567045c38098d7e5a62e9aa658 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 14 Jul 2022 18:38:43 -1000 Subject: cgroup: Add "no" prefixed mount options MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We allow modifying these mount options via remount. Let's add "no" prefixed variants so that they can be turned off too. Signed-off-by: Tejun Heo Cc: Christian Brauner Cc: Michal Koutný --- kernel/cgroup/cgroup.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index f8e00affe007..9ce24d5cf2d5 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -279,8 +279,6 @@ bool cgroup_ssid_enabled(int ssid) * * - When mounting an existing superblock, mount options should match. * - * - Remount is disallowed. - * * - rename(2) is disallowed. * * - "tasks" is removed. Everything should be at process granularity. Use @@ -1859,16 +1857,19 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, } enum cgroup2_param { - Opt_nsdelegate, - Opt_memory_localevents, - Opt_memory_recursiveprot, + Opt_nsdelegate, Opt_nonsdelegate, + Opt_memory_localevents, Opt_memory_nolocalevents, + Opt_memory_recursiveprot, Opt_memory_norecursiveprot, nr__cgroup2_params }; static const struct fs_parameter_spec cgroup2_fs_parameters[] = { fsparam_flag("nsdelegate", Opt_nsdelegate), + fsparam_flag("nonsdelegate", Opt_nonsdelegate), fsparam_flag("memory_localevents", Opt_memory_localevents), + fsparam_flag("memory_nolocalevents", Opt_memory_nolocalevents), fsparam_flag("memory_recursiveprot", Opt_memory_recursiveprot), + fsparam_flag("memory_norecursiveprot", Opt_memory_norecursiveprot), {} }; @@ -1886,12 +1887,21 @@ static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param case Opt_nsdelegate: ctx->flags |= CGRP_ROOT_NS_DELEGATE; return 0; + case Opt_nonsdelegate: + ctx->flags &= ~CGRP_ROOT_NS_DELEGATE; + return 0; case Opt_memory_localevents: ctx->flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS; return 0; + case Opt_memory_nolocalevents: + ctx->flags &= ~CGRP_ROOT_MEMORY_LOCAL_EVENTS; + return 0; case Opt_memory_recursiveprot: ctx->flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT; return 0; + case Opt_memory_norecursiveprot: + ctx->flags &= ~CGRP_ROOT_MEMORY_RECURSIVE_PROT; + return 0; } return -EINVAL; } -- cgit From 6a010a49b63ac8465851a79185d8deff966f8e1a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 23 Jul 2022 04:28:28 -1000 Subject: cgroup: Make !percpu threadgroup_rwsem operations optional MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 3942a9bd7b58 ("locking, rcu, cgroup: Avoid synchronize_sched() in __cgroup_procs_write()") disabled percpu operations on threadgroup_rwsem because the impiled synchronize_rcu() on write locking was pushing up the latencies too much for android which constantly moves processes between cgroups. 
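A minimal kernel-style sketch of the machinery behind that trade-off is shown below: an rcu_sync toggle decides whether readers of a percpu rw_semaphore take the fast path. The names are illustrative and this is not the cgroup code itself.

    #include <linux/percpu-rwsem.h>
    #include <linux/rcu_sync.h>

    /* Illustrative sketch only: while the rcu_sync is entered, readers are
     * forced onto the slow path, but writers no longer pay for a fresh
     * grace period on every percpu_down_write().
     */
    static DEFINE_STATIC_PERCPU_RWSEM(demo_rwsem);

    static void demo_hot_path(void)
    {
        percpu_down_read(&demo_rwsem);      /* cheap while rcu_sync is idle */
        /* fork/exit style work */
        percpu_up_read(&demo_rwsem);
    }

    static void demo_favor_writers(bool favor)
    {
        if (favor)
            rcu_sync_enter(&demo_rwsem.rss);    /* readers take the slow path */
        else
            rcu_sync_exit(&demo_rwsem.rss);     /* reader fast path restored */
    }

Keeping the rcu_sync entered permanently, which is what 3942a9bd7b58 effectively did for cgroup_threadgroup_rwsem, means every reader always pays the slow-path cost.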
This makes the hotter paths - fork and exit - slower as they're always forced into the slow path. There is no reason to force this on everyone especially given that more common static usage pattern can now completely avoid write-locking the rwsem. Write-locking is elided when turning on and off controllers on empty sub-trees and CLONE_INTO_CGROUP enables seeding a cgroup without grabbing the rwsem. Restore the default percpu operations and introduce the mount option "favordynmods" and config option CGROUP_FAVOR_DYNMODS for users who need lower latencies for the dynamic operations. Signed-off-by: Tejun Heo Cc: Christian Brauner Cc: Michal Koutn� Cc: Peter Zijlstra Cc: John Stultz Cc: Dmitry Shmidt Cc: Oleg Nesterov --- kernel/cgroup/cgroup-internal.h | 1 + kernel/cgroup/cgroup-v1.c | 17 +++++++++++++++- kernel/cgroup/cgroup.c | 43 ++++++++++++++++++++++++++++++++++------- 3 files changed, 53 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 5da09c74228d..36b740cb3d59 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -233,6 +233,7 @@ void cgroup_kn_unlock(struct kernfs_node *kn); int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, struct cgroup_namespace *ns); +void cgroup_favor_dynmods(struct cgroup_root *root, bool favor); void cgroup_free_root(struct cgroup_root *root); void init_cgroup_root(struct cgroup_fs_context *ctx); int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask); diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index afc6c0e9c966..2ade21b54dc4 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -875,6 +875,8 @@ static int cgroup1_show_options(struct seq_file *seq, struct kernfs_root *kf_roo seq_puts(seq, ",xattr"); if (root->flags & CGRP_ROOT_CPUSET_V2_MODE) seq_puts(seq, ",cpuset_v2_mode"); + if (root->flags & CGRP_ROOT_FAVOR_DYNMODS) + seq_puts(seq, ",favordynmods"); spin_lock(&release_agent_path_lock); if (strlen(root->release_agent_path)) @@ -898,6 +900,8 @@ enum cgroup1_param { Opt_noprefix, Opt_release_agent, Opt_xattr, + Opt_favordynmods, + Opt_nofavordynmods, }; const struct fs_parameter_spec cgroup1_fs_parameters[] = { @@ -909,6 +913,8 @@ const struct fs_parameter_spec cgroup1_fs_parameters[] = { fsparam_flag ("noprefix", Opt_noprefix), fsparam_string("release_agent", Opt_release_agent), fsparam_flag ("xattr", Opt_xattr), + fsparam_flag ("favordynmods", Opt_favordynmods), + fsparam_flag ("nofavordynmods", Opt_nofavordynmods), {} }; @@ -960,6 +966,12 @@ int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param) case Opt_xattr: ctx->flags |= CGRP_ROOT_XATTR; break; + case Opt_favordynmods: + ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS; + break; + case Opt_nofavordynmods: + ctx->flags &= ~CGRP_ROOT_FAVOR_DYNMODS; + break; case Opt_release_agent: /* Specifying two release agents is forbidden */ if (ctx->release_agent) @@ -1211,8 +1223,11 @@ static int cgroup1_root_to_use(struct fs_context *fc) init_cgroup_root(ctx); ret = cgroup_setup_root(root, ctx->subsys_mask); - if (ret) + if (!ret) + cgroup_favor_dynmods(root, ctx->flags & CGRP_ROOT_FAVOR_DYNMODS); + else cgroup_free_root(root); + return ret; } diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 9ce24d5cf2d5..7d023d42a6a5 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1305,6 +1305,20 @@ struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root) return root_cgrp->root; } 
+void cgroup_favor_dynmods(struct cgroup_root *root, bool favor) +{ + bool favoring = root->flags & CGRP_ROOT_FAVOR_DYNMODS; + + /* see the comment above CGRP_ROOT_FAVOR_DYNMODS definition */ + if (favor && !favoring) { + rcu_sync_enter(&cgroup_threadgroup_rwsem.rss); + root->flags |= CGRP_ROOT_FAVOR_DYNMODS; + } else if (!favor && favoring) { + rcu_sync_exit(&cgroup_threadgroup_rwsem.rss); + root->flags &= ~CGRP_ROOT_FAVOR_DYNMODS; + } +} + static int cgroup_init_root_id(struct cgroup_root *root) { int id; @@ -1365,6 +1379,7 @@ static void cgroup_destroy_root(struct cgroup_root *root) cgroup_root_count--; } + cgroup_favor_dynmods(root, false); cgroup_exit_root_id(root); mutex_unlock(&cgroup_mutex); @@ -1858,6 +1873,7 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, enum cgroup2_param { Opt_nsdelegate, Opt_nonsdelegate, + Opt_favordynmods, Opt_nofavordynmods, Opt_memory_localevents, Opt_memory_nolocalevents, Opt_memory_recursiveprot, Opt_memory_norecursiveprot, nr__cgroup2_params @@ -1866,6 +1882,8 @@ enum cgroup2_param { static const struct fs_parameter_spec cgroup2_fs_parameters[] = { fsparam_flag("nsdelegate", Opt_nsdelegate), fsparam_flag("nonsdelegate", Opt_nonsdelegate), + fsparam_flag("favordynmods", Opt_favordynmods), + fsparam_flag("nofavordynmods", Opt_nofavordynmods), fsparam_flag("memory_localevents", Opt_memory_localevents), fsparam_flag("memory_nolocalevents", Opt_memory_nolocalevents), fsparam_flag("memory_recursiveprot", Opt_memory_recursiveprot), @@ -1890,6 +1908,12 @@ static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param case Opt_nonsdelegate: ctx->flags &= ~CGRP_ROOT_NS_DELEGATE; return 0; + case Opt_favordynmods: + ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS; + return 0; + case Opt_nofavordynmods: + ctx->flags &= ~CGRP_ROOT_FAVOR_DYNMODS; + return 0; case Opt_memory_localevents: ctx->flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS; return 0; @@ -1914,6 +1938,9 @@ static void apply_cgroup_root_flags(unsigned int root_flags) else cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE; + cgroup_favor_dynmods(&cgrp_dfl_root, + root_flags & CGRP_ROOT_FAVOR_DYNMODS); + if (root_flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS) cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS; else @@ -1930,6 +1957,8 @@ static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root { if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) seq_puts(seq, ",nsdelegate"); + if (cgrp_dfl_root.flags & CGRP_ROOT_FAVOR_DYNMODS) + seq_puts(seq, ",favordynmods"); if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS) seq_puts(seq, ",memory_localevents"); if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT) @@ -1980,7 +2009,8 @@ void init_cgroup_root(struct cgroup_fs_context *ctx) cgrp->root = root; init_cgroup_housekeeping(cgrp); - root->flags = ctx->flags; + /* DYNMODS must be modified through cgroup_favor_dynmods() */ + root->flags = ctx->flags & ~CGRP_ROOT_FAVOR_DYNMODS; if (ctx->release_agent) strscpy(root->release_agent_path, ctx->release_agent, PATH_MAX); if (ctx->name) @@ -2202,6 +2232,10 @@ static int cgroup_init_fs_context(struct fs_context *fc) put_user_ns(fc->user_ns); fc->user_ns = get_user_ns(ctx->ns->user_ns); fc->global = true; + +#ifdef CONFIG_CGROUP_FAVOR_DYNMODS + ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS; +#endif return 0; } @@ -5854,12 +5888,6 @@ int __init cgroup_init(void) cgroup_rstat_boot(); - /* - * The latency of the synchronize_rcu() is too high for cgroups, - * avoid it at the cost of forcing all readers into the slow path. 
- */ - rcu_sync_enter_start(&cgroup_threadgroup_rwsem.rss); - get_user_ns(init_cgroup_ns.user_ns); mutex_lock(&cgroup_mutex); @@ -6771,6 +6799,7 @@ static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr, { return snprintf(buf, PAGE_SIZE, "nsdelegate\n" + "favordynmods\n" "memory_localevents\n" "memory_recursiveprot\n"); } -- cgit From b774926c733850037b15c50f893383aa71bd8695 Mon Sep 17 00:00:00 2001 From: Linyu Yuan Date: Mon, 27 Jun 2022 10:19:05 +0800 Subject: tracing: eprobe: Add missing log index Add trace_probe_log_set_index(1) to allow report correct error if user input wrong SYSTEM.EVENT format. Link: https://lore.kernel.org/all/1656296348-16111-2-git-send-email-quic_linyyuan@quicinc.com/ Reviewed-by: Tom Zanussi Signed-off-by: Linyu Yuan Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_eprobe.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index 7d4478525c66..b805b570305f 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -881,6 +881,7 @@ static int __trace_eprobe_create(int argc, const char *argv[]) if (!is_good_name(event) || !is_good_name(group)) goto parse_error; + trace_probe_log_set_index(1); sys_event = argv[1]; ret = traceprobe_parse_event_name(&sys_event, &sys_name, buf2, sys_event - argv[1]); -- cgit From f360ea5641dc9473ad485e882c8ac3b1aa2672ff Mon Sep 17 00:00:00 2001 From: Linyu Yuan Date: Mon, 27 Jun 2022 10:19:06 +0800 Subject: tracing: eprobe: Remove duplicate is_good_name() operation traceprobe_parse_event_name() already validate SYSTEM and EVENT name, there is no need to call is_good_name() after it. Link: https://lore.kernel.org/all/1656296348-16111-3-git-send-email-quic_linyyuan@quicinc.com/ Acked-by: Masami Hiramatsu (Google) Reviewed-by: Tom Zanussi Signed-off-by: Linyu Yuan Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_eprobe.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index b805b570305f..8979cb9ec37a 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -887,8 +887,6 @@ static int __trace_eprobe_create(int argc, const char *argv[]) sys_event - argv[1]); if (ret || !sys_name) goto parse_error; - if (!is_good_name(sys_event) || !is_good_name(sys_name)) - goto parse_error; mutex_lock(&event_mutex); event_call = find_and_get_event(sys_name, sys_event); -- cgit From 95c104c378dc7d4cb3fb9f289dc5354bfc285fe0 Mon Sep 17 00:00:00 2001 From: Linyu Yuan Date: Mon, 27 Jun 2022 10:19:07 +0800 Subject: tracing: Auto generate event name when creating a group of events Currently when creating a specific group of trace events, take kprobe event as example, the user must use the following format: p:GRP/EVENT [MOD:]KSYM[+OFFS]|KADDR [FETCHARGS], which means user must enter EVENT name, one example is: echo 'p:usb_gadget/config_usb_cfg_link config_usb_cfg_link $arg1' >> kprobe_events It is not simple if there are too many entries because the event name is the same as symbol name. This change allows user to specify no EVENT name, format changed as: p:GRP/ [MOD:]KSYM[+OFFS]|KADDR [FETCHARGS] It will generate event name automatically and one example is: echo 'p:usb_gadget/ config_usb_cfg_link $arg1' >> kprobe_events. 
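A rough user-space illustration of the new group-only syntax is below; the tracefs path, the "mygrp" group and the kernel_clone symbol are only examples, and it has to run as root.

    #include <stdio.h>
    #include <string.h>

    /* Sketch: add a kprobe giving only the group ("mygrp/") and a symbol,
     * letting the kernel generate the event name, then print the result.
     */
    int main(void)
    {
        const char *path = "/sys/kernel/tracing/kprobe_events";
        char line[256];
        FILE *f = fopen(path, "a");

        if (!f || fprintf(f, "p:mygrp/ kernel_clone\n") < 0 || fclose(f) == EOF) {
            perror("add kprobe");
            return 1;
        }

        f = fopen(path, "r");
        if (!f) {
            perror("read kprobe_events");
            return 1;
        }
        while (fgets(line, sizeof(line), f))
            if (strstr(line, "mygrp/"))
                fputs(line, stdout);    /* e.g. p:mygrp/p_kernel_clone_0 kernel_clone */
        fclose(f);
        return 0;
    }

With the relaxed delete syntax in the same series, a later write of '-:mygrp/' should remove every probe in that group again.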
Link: https://lore.kernel.org/all/1656296348-16111-4-git-send-email-quic_linyyuan@quicinc.com/ Acked-by: Masami Hiramatsu (Google) Reviewed-by: Tom Zanussi Signed-off-by: Linyu Yuan Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 8 ++++---- kernel/trace/trace_dynevent.c | 2 +- kernel/trace/trace_eprobe.c | 25 +++++++++++++------------ kernel/trace/trace_kprobe.c | 16 ++++++++++------ kernel/trace/trace_probe.c | 4 ++++ kernel/trace/trace_uprobe.c | 12 ++++++++---- 6 files changed, 40 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index b8dd54627075..7eb5bce62500 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5569,13 +5569,13 @@ static const char readme_msg[] = #endif #if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS) "\t accepts: event-definitions (one definition per line)\n" - "\t Format: p[:[/]] []\n" - "\t r[maxactive][:[/]] []\n" + "\t Format: p[:[/][]] []\n" + "\t r[maxactive][:[/][]] []\n" #ifdef CONFIG_HIST_TRIGGERS "\t s:[synthetic/] []\n" #endif - "\t e[:[/]] . []\n" - "\t -:[/]\n" + "\t e[:[/][]] . []\n" + "\t -:[/][]\n" #ifdef CONFIG_KPROBE_EVENTS "\t place: [:][+]|\n" "place (kretprobe): [:][+]%return|\n" diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c index 076b447a1b88..154996684fb5 100644 --- a/kernel/trace/trace_dynevent.c +++ b/kernel/trace/trace_dynevent.c @@ -101,7 +101,7 @@ int dyn_event_release(const char *raw_command, struct dyn_event_operations *type event = p + 1; *p = '\0'; } - if (event[0] == '\0') { + if (!system && event[0] == '\0') { ret = -EINVAL; goto out; } diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index 8979cb9ec37a..a30f21499e81 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -125,6 +125,7 @@ static bool eprobe_dyn_event_match(const char *system, const char *event, * We match the following: * event only - match all eprobes with event name * system and event only - match all system/event probes + * system only - match all system probes * * The below has the above satisfied with more arguments: * @@ -143,7 +144,7 @@ static bool eprobe_dyn_event_match(const char *system, const char *event, return false; /* Must match the event name */ - if (strcmp(trace_probe_name(&ep->tp), event) != 0) + if (event[0] != '\0' && strcmp(trace_probe_name(&ep->tp), event) != 0) return false; /* No arguments match all */ @@ -848,7 +849,7 @@ static int __trace_eprobe_create(int argc, const char *argv[]) { /* * Argument syntax: - * e[:[GRP/]ENAME] SYSTEM.EVENT [FETCHARGS] + * e[:[GRP/][ENAME]] SYSTEM.EVENT [FETCHARGS] * Fetch args: * =$[:TYPE] */ @@ -858,6 +859,7 @@ static int __trace_eprobe_create(int argc, const char *argv[]) struct trace_eprobe *ep = NULL; char buf1[MAX_EVENT_NAME_LEN]; char buf2[MAX_EVENT_NAME_LEN]; + char gbuf[MAX_EVENT_NAME_LEN]; int ret = 0; int i; @@ -869,25 +871,24 @@ static int __trace_eprobe_create(int argc, const char *argv[]) event = strchr(&argv[0][1], ':'); if (event) { event++; - ret = traceprobe_parse_event_name(&event, &group, buf1, + ret = traceprobe_parse_event_name(&event, &group, gbuf, event - argv[0]); if (ret) goto parse_error; - } else { - strscpy(buf1, argv[1], MAX_EVENT_NAME_LEN); - sanitize_event_name(buf1); - event = buf1; } - if (!is_good_name(event) || !is_good_name(group)) - goto parse_error; trace_probe_log_set_index(1); sys_event = argv[1]; - ret = traceprobe_parse_event_name(&sys_event, &sys_name, buf2, - sys_event - argv[1]); - if 
(ret || !sys_name) + ret = traceprobe_parse_event_name(&sys_event, &sys_name, buf2, 0); + if (!sys_event || !sys_name) goto parse_error; + if (!event) { + strscpy(buf1, argv[1], MAX_EVENT_NAME_LEN); + sanitize_event_name(buf1); + event = buf1; + } + mutex_lock(&event_mutex); event_call = find_and_get_event(sys_name, sys_event); ep = alloc_event_probe(group, event, event_call, argc - 2); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index a245ea673715..23f7f0ec4f4c 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -163,7 +163,8 @@ static bool trace_kprobe_match(const char *system, const char *event, { struct trace_kprobe *tk = to_trace_kprobe(ev); - return strcmp(trace_probe_name(&tk->tp), event) == 0 && + return (event[0] == '\0' || + strcmp(trace_probe_name(&tk->tp), event) == 0) && (!system || strcmp(trace_probe_group_name(&tk->tp), system) == 0) && trace_kprobe_match_command_head(tk, argc, argv); } @@ -708,11 +709,11 @@ static int __trace_kprobe_create(int argc, const char *argv[]) /* * Argument syntax: * - Add kprobe: - * p[:[GRP/]EVENT] [MOD:]KSYM[+OFFS]|KADDR [FETCHARGS] + * p[:[GRP/][EVENT]] [MOD:]KSYM[+OFFS]|KADDR [FETCHARGS] * - Add kretprobe: - * r[MAXACTIVE][:[GRP/]EVENT] [MOD:]KSYM[+0] [FETCHARGS] + * r[MAXACTIVE][:[GRP/][EVENT]] [MOD:]KSYM[+0] [FETCHARGS] * Or - * p:[GRP/]EVENT] [MOD:]KSYM[+0]%return [FETCHARGS] + * p[:[GRP/][EVENT]] [MOD:]KSYM[+0]%return [FETCHARGS] * * Fetch args: * $retval : fetch return value @@ -739,6 +740,7 @@ static int __trace_kprobe_create(int argc, const char *argv[]) long offset = 0; void *addr = NULL; char buf[MAX_EVENT_NAME_LEN]; + char gbuf[MAX_EVENT_NAME_LEN]; unsigned int flags = TPARG_FL_KERNEL; switch (argv[0][0]) { @@ -833,11 +835,13 @@ static int __trace_kprobe_create(int argc, const char *argv[]) trace_probe_log_set_index(0); if (event) { - ret = traceprobe_parse_event_name(&event, &group, buf, + ret = traceprobe_parse_event_name(&event, &group, gbuf, event - argv[0]); if (ret) goto parse_error; - } else { + } + + if (!event) { /* Make a new event name */ if (symbol) snprintf(buf, MAX_EVENT_NAME_LEN, "%c_%s_%ld", diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 80863c6508e5..850a88abd33b 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -257,6 +257,10 @@ int traceprobe_parse_event_name(const char **pevent, const char **pgroup, } len = strlen(event); if (len == 0) { + if (slash) { + *pevent = NULL; + return 0; + } trace_probe_log_err(offset, NO_EVENT_NAME); return -EINVAL; } else if (len > MAX_EVENT_NAME_LEN) { diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index c3dc4f859a6b..a3fec28961d6 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -312,7 +312,8 @@ static bool trace_uprobe_match(const char *system, const char *event, { struct trace_uprobe *tu = to_trace_uprobe(ev); - return strcmp(trace_probe_name(&tu->tp), event) == 0 && + return (event[0] == '\0' || + strcmp(trace_probe_name(&tu->tp), event) == 0) && (!system || strcmp(trace_probe_group_name(&tu->tp), system) == 0) && trace_uprobe_match_command_head(tu, argc, argv); } @@ -532,7 +533,7 @@ end: /* * Argument syntax: - * - Add uprobe: p|r[:[GRP/]EVENT] PATH:OFFSET[%return][(REF)] [FETCHARGS] + * - Add uprobe: p|r[:[GRP/][EVENT]] PATH:OFFSET[%return][(REF)] [FETCHARGS] */ static int __trace_uprobe_create(int argc, const char **argv) { @@ -540,6 +541,7 @@ static int __trace_uprobe_create(int argc, const char **argv) const char 
*event = NULL, *group = UPROBE_EVENT_SYSTEM; char *arg, *filename, *rctr, *rctr_end, *tmp; char buf[MAX_EVENT_NAME_LEN]; + char gbuf[MAX_EVENT_NAME_LEN]; enum probe_print_type ptype; struct path path; unsigned long offset, ref_ctr_offset; @@ -644,11 +646,13 @@ static int __trace_uprobe_create(int argc, const char **argv) /* setup a probe */ trace_probe_log_set_index(0); if (event) { - ret = traceprobe_parse_event_name(&event, &group, buf, + ret = traceprobe_parse_event_name(&event, &group, gbuf, event - argv[0]); if (ret) goto fail_address_parse; - } else { + } + + if (!event) { char *tail; char *ptr; -- cgit From ed29b0b4fd835b058ddd151c49d021e28d631ee6 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 23 May 2022 17:05:03 -0600 Subject: io_uring: move to separate directory In preparation for splitting io_uring up a bit, move it into its own top level directory. It didn't really belong in fs/ anyway, as it's not a file system only API. This adds io_uring/ and moves the core files in there, and updates the MAINTAINERS file for the new location. Signed-off-by: Jens Axboe --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index da0bf6fe9ecd..f35674e89621 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -91,7 +91,7 @@ #include "stats.h" #include "../workqueue_internal.h" -#include "../../fs/io-wq.h" +#include "../../io_uring/io-wq.h" #include "../smpboot.h" /* -- cgit From 7c2645a2a30a45d3dc4c98b315a51be44ec69a67 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Fri, 8 Jul 2022 10:50:55 -0600 Subject: dma-mapping: allow EREMOTEIO return code for P2PDMA transfers Add EREMOTEIO error return to dma_map_sgtable() which will be used by .map_sg() implementations that detect P2PDMA pages that the underlying DMA device cannot access. Signed-off-by: Logan Gunthorpe Reviewed-by: Jason Gunthorpe Reviewed-by: Christoph Hellwig Signed-off-by: Christoph Hellwig --- kernel/dma/mapping.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 1bfe11b1edb6..746d46825d08 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -197,7 +197,7 @@ static int __dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, if (ents > 0) debug_dma_map_sg(dev, sg, nents, ents, dir, attrs); else if (WARN_ON_ONCE(ents != -EINVAL && ents != -ENOMEM && - ents != -EIO)) + ents != -EIO && ents != -EREMOTEIO)) return -EIO; return ents; @@ -255,6 +255,9 @@ EXPORT_SYMBOL(dma_map_sg_attrs); * complete the mapping. Should succeed if retried later. * -EIO Legacy error code with an unknown meaning. eg. this is * returned if a lower level call returned DMA_MAPPING_ERROR. + * -EREMOTEIO The DMA device cannot access P2PDMA memory specified in + * the sg_table. This will not succeed if retried. + * */ int dma_map_sgtable(struct device *dev, struct sg_table *sgt, enum dma_data_direction dir, unsigned long attrs) -- cgit From f02ad36d4f76645e7e1c21f572260e9a2e61c26b Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Fri, 8 Jul 2022 10:50:56 -0600 Subject: dma-direct: support PCI P2PDMA pages in dma-direct map_sg Add PCI P2PDMA support for dma_direct_map_sg() so that it can map PCI P2PDMA pages directly without a hack in the callers. This allows for heterogeneous SGLs that contain both P2PDMA and regular pages. 
A P2PDMA page may have three possible outcomes when being mapped: 1) If the data path between the two devices doesn't go through the root port, then it should be mapped with a PCI bus address 2) If the data path goes through the host bridge, it should be mapped normally, as though it were a CPU physical address 3) It is not possible for the two devices to communicate and thus the mapping operation should fail (and it will return -EREMOTEIO). SGL segments that contain PCI bus addresses are marked with sg_dma_mark_pci_p2pdma() and are ignored when unmapped. P2PDMA mappings are also failed if swiotlb needs to be used on the mapping. Signed-off-by: Logan Gunthorpe Reviewed-by: Christoph Hellwig Signed-off-by: Christoph Hellwig --- kernel/dma/direct.c | 43 +++++++++++++++++++++++++++++++++++++------ kernel/dma/direct.h | 8 +++++++- 2 files changed, 44 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index e978f36e6be8..133a4be2d3e5 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -454,29 +454,60 @@ void dma_direct_sync_sg_for_cpu(struct device *dev, arch_sync_dma_for_cpu_all(); } +/* + * Unmaps segments, except for ones marked as pci_p2pdma which do not + * require any further action as they contain a bus address. + */ void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir, unsigned long attrs) { struct scatterlist *sg; int i; - for_each_sg(sgl, sg, nents, i) - dma_direct_unmap_page(dev, sg->dma_address, sg_dma_len(sg), dir, - attrs); + for_each_sg(sgl, sg, nents, i) { + if (sg_is_dma_bus_address(sg)) + sg_dma_unmark_bus_address(sg); + else + dma_direct_unmap_page(dev, sg->dma_address, + sg_dma_len(sg), dir, attrs); + } } #endif int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir, unsigned long attrs) { - int i; + struct pci_p2pdma_map_state p2pdma_state = {}; + enum pci_p2pdma_map_type map; struct scatterlist *sg; + int i, ret; for_each_sg(sgl, sg, nents, i) { + if (is_pci_p2pdma_page(sg_page(sg))) { + map = pci_p2pdma_map_segment(&p2pdma_state, dev, sg); + switch (map) { + case PCI_P2PDMA_MAP_BUS_ADDR: + continue; + case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: + /* + * Any P2P mapping that traverses the PCI + * host bridge must be mapped with CPU physical + * address and not PCI bus addresses. This is + * done with dma_direct_map_page() below. 
+ */ + break; + default: + ret = -EREMOTEIO; + goto out_unmap; + } + } + sg->dma_address = dma_direct_map_page(dev, sg_page(sg), sg->offset, sg->length, dir, attrs); - if (sg->dma_address == DMA_MAPPING_ERROR) + if (sg->dma_address == DMA_MAPPING_ERROR) { + ret = -EIO; goto out_unmap; + } sg_dma_len(sg) = sg->length; } @@ -484,7 +515,7 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents, out_unmap: dma_direct_unmap_sg(dev, sgl, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC); - return -EIO; + return ret; } dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr, diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h index a78c0ba70645..e38ffc5e6bdd 100644 --- a/kernel/dma/direct.h +++ b/kernel/dma/direct.h @@ -8,6 +8,7 @@ #define _KERNEL_DMA_DIRECT_H #include +#include int dma_direct_get_sgtable(struct device *dev, struct sg_table *sgt, void *cpu_addr, dma_addr_t dma_addr, size_t size, @@ -87,10 +88,15 @@ static inline dma_addr_t dma_direct_map_page(struct device *dev, phys_addr_t phys = page_to_phys(page) + offset; dma_addr_t dma_addr = phys_to_dma(dev, phys); - if (is_swiotlb_force_bounce(dev)) + if (is_swiotlb_force_bounce(dev)) { + if (is_pci_p2pdma_page(page)) + return DMA_MAPPING_ERROR; return swiotlb_map(dev, phys, size, dir, attrs); + } if (unlikely(!dma_capable(dev, dma_addr, size, true))) { + if (is_pci_p2pdma_page(page)) + return DMA_MAPPING_ERROR; if (is_swiotlb_active(dev)) return swiotlb_map(dev, phys, size, dir, attrs); -- cgit From 159bf19270e80b5bc4b13aa88072dcb390b4d297 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Fri, 8 Jul 2022 10:50:57 -0600 Subject: dma-mapping: add flags to dma_map_ops to indicate PCI P2PDMA support Add a flags member to the dma_map_ops structure with one flag to indicate support for PCI P2PDMA. Also, add a helper to check if a device supports PCI P2PDMA. Signed-off-by: Logan Gunthorpe Reviewed-by: Jason Gunthorpe Reviewed-by: Christoph Hellwig Signed-off-by: Christoph Hellwig --- kernel/dma/mapping.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'kernel') diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 746d46825d08..a9ce5d728231 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -723,6 +723,24 @@ int dma_supported(struct device *dev, u64 mask) } EXPORT_SYMBOL(dma_supported); +bool dma_pci_p2pdma_supported(struct device *dev) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + /* if ops is not set, dma direct will be used which supports P2PDMA */ + if (!ops) + return true; + + /* + * Note: dma_ops_bypass is not checked here because P2PDMA should + * not be used with dma mapping ops that do not have support even + * if the specific device is bypassing them. + */ + + return ops->flags & DMA_F_PCI_P2PDMA_SUPPORTED; +} +EXPORT_SYMBOL_GPL(dma_pci_p2pdma_supported); + #ifdef CONFIG_ARCH_HAS_DMA_SET_MASK void arch_dma_set_mask(struct device *dev, u64 mask); #else -- cgit From bd82ea52f0ee2890b698155a6d7bf9ea5bd8930d Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Sat, 23 Jul 2022 19:17:10 +0200 Subject: bpf, devmap: Compute proper xdp_frame len redirecting frames Even if it is currently forbidden to XDP_REDIRECT a multi-frag xdp_frame into a devmap, compute proper xdp_frame length in __xdp_enqueue and is_valid_dst routines running xdp_get_frame_len(). 
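Taken together, the DMA-mapping changes above suggest a driver-side calling pattern roughly like the hypothetical sketch below; the function is not from any in-tree driver and the fallback policy is only an example.

    #include <linux/dma-mapping.h>
    #include <linux/scatterlist.h>

    /* Hypothetical sketch: only hand a P2PDMA-containing sg_table to the DMA
     * API when the device's dma_map_ops advertise support, and treat
     * -EREMOTEIO as a permanent failure that must not be retried.
     */
    static int demo_map_p2p_sgtable(struct device *dev, struct sg_table *sgt)
    {
        int ret;

        if (!dma_pci_p2pdma_supported(dev))
            return -EOPNOTSUPP;     /* caller falls back to host memory */

        ret = dma_map_sgtable(dev, sgt, DMA_TO_DEVICE, 0);
        if (ret == -EREMOTEIO)      /* peer unreachable: do not retry */
            return ret;
        if (ret)                    /* e.g. -ENOMEM: may succeed later */
            return ret;

        /* ... program the hardware, then dma_unmap_sgtable() when done ... */
        return 0;
    }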
Signed-off-by: Lorenzo Bianconi Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/894d99c01139e921bdb6868158ff8e67f661c072.1658596075.git.lorenzo@kernel.org --- kernel/bpf/devmap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 1400561efb15..a0e02b009487 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -477,7 +477,7 @@ static inline int __xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf, if (!dev->netdev_ops->ndo_xdp_xmit) return -EOPNOTSUPP; - err = xdp_ok_fwd_dev(dev, xdpf->len); + err = xdp_ok_fwd_dev(dev, xdp_get_frame_len(xdpf)); if (unlikely(err)) return err; @@ -536,7 +536,7 @@ static bool is_valid_dst(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf) !obj->dev->netdev_ops->ndo_xdp_xmit) return false; - if (xdp_ok_fwd_dev(obj->dev, xdpf->len)) + if (xdp_ok_fwd_dev(obj->dev, xdp_get_frame_len(xdpf))) return false; return true; -- cgit From 8386c414e27caba8501119948e9551e52b527f59 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Fri, 15 Jul 2022 14:49:58 +0900 Subject: PM: hibernate: defer device probing when resuming from hibernation syzbot is reporting hung task at misc_open() [1], for there is a race window of AB-BA deadlock which involves probe_count variable. Currently wait_for_device_probe() from snapshot_open() from misc_open() can sleep forever with misc_mtx held if probe_count cannot become 0. When a device is probed by hub_event() work function, probe_count is incremented before the probe function starts, and probe_count is decremented after the probe function completed. There are three cases that can prevent probe_count from dropping to 0. (a) A device being probed stopped responding (i.e. broken/malicious hardware). (b) A process emulating a USB device using /dev/raw-gadget interface stopped responding for some reason. (c) New device probe requests keeps coming in before existing device probe requests complete. The phenomenon syzbot is reporting is (b). A process which is holding system_transition_mutex and misc_mtx is waiting for probe_count to become 0 inside wait_for_device_probe(), but the probe function which is called from hub_event() work function is waiting for the processes which are blocked at mutex_lock(&misc_mtx) to respond via /dev/raw-gadget interface. This patch mitigates (b) by deferring wait_for_device_probe() from snapshot_open() to snapshot_write() and snapshot_ioctl(). Please note that the possibility of (b) remains as long as any thread which is emulating a USB device via /dev/raw-gadget interface can be blocked by uninterruptible blocking operations (e.g. mutex_lock()). Please also note that (a) and (c) are not addressed. Regarding (c), we should change the code to wait for only one device which contains the image for resuming from hibernation. I don't know how to address (a), for use of timeout for wait_for_device_probe() might result in loss of user data in the image. Maybe we should require the userland to wait for the image device before opening /dev/snapshot interface. Link: https://syzkaller.appspot.com/bug?extid=358c9ab4c93da7b7238c [1] Reported-by: syzbot Signed-off-by: Tetsuo Handa Tested-by: syzbot Signed-off-by: Rafael J. 
Wysocki --- kernel/power/user.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/user.c b/kernel/power/user.c index ad241b4ff64c..d43c2aa583b2 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -26,6 +26,7 @@ #include "power.h" +static bool need_wait; static struct snapshot_data { struct snapshot_handle handle; @@ -78,7 +79,7 @@ static int snapshot_open(struct inode *inode, struct file *filp) * Resuming. We may need to wait for the image device to * appear. */ - wait_for_device_probe(); + need_wait = true; data->swap = -1; data->mode = O_WRONLY; @@ -168,6 +169,11 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf, ssize_t res; loff_t pg_offp = *offp & ~PAGE_MASK; + if (need_wait) { + wait_for_device_probe(); + need_wait = false; + } + lock_system_sleep(); data = filp->private_data; @@ -244,6 +250,11 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, loff_t size; sector_t offset; + if (need_wait) { + wait_for_device_probe(); + need_wait = false; + } + if (_IOC_TYPE(cmd) != SNAPSHOT_IOC_MAGIC) return -ENOTTY; if (_IOC_NR(cmd) > SNAPSHOT_IOC_MAXNR) -- cgit From 8d36694245f2aec17f03a5e0b2797953d2e36704 Mon Sep 17 00:00:00 2001 From: Shivnandan Kumar Date: Fri, 15 Jul 2022 17:55:39 +0530 Subject: PM: QoS: Add check to make sure CPU freq is non-negative CPU frequency should never be negative. If some client driver calls freq_qos_update_request with a negative value which will be very high in absolute terms, then frequency QoS sets max CPU freq at fmax as it considers it's absolute value but it will add plist node with negative priority. plist node has priority from INT_MIN (highest) to INT_MAX(lowest). Once priority is set as negative, another client will not be able to reduce CPU frequency. Adding check to make sure CPU freq is non-negative will fix this problem. Signed-off-by: Shivnandan Kumar [ rjw: Changelog edits ] Signed-off-by: Rafael J. Wysocki --- kernel/power/qos.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/qos.c b/kernel/power/qos.c index ec7e1e85923e..af51ed6d45ef 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -531,7 +531,7 @@ int freq_qos_add_request(struct freq_constraints *qos, { int ret; - if (IS_ERR_OR_NULL(qos) || !req) + if (IS_ERR_OR_NULL(qos) || !req || value < 0) return -EINVAL; if (WARN(freq_qos_request_active(req), @@ -563,7 +563,7 @@ EXPORT_SYMBOL_GPL(freq_qos_add_request); */ int freq_qos_update_request(struct freq_qos_request *req, s32 new_value) { - if (!req) + if (!req || new_value < 0) return -EINVAL; if (WARN(!freq_qos_request_active(req), -- cgit From 7c56a8733d0a2a4be2438a7512566e5ce552fccf Mon Sep 17 00:00:00 2001 From: Laurent Dufour Date: Wed, 13 Jul 2022 17:47:27 +0200 Subject: watchdog: export lockup_detector_reconfigure In some circumstances it may be interesting to reconfigure the watchdog from inside the kernel. On PowerPC, this may helpful before and after a LPAR migration (LPM) is initiated, because it implies some latencies, watchdog, and especially NMI watchdog is expected to be triggered during this operation. Reconfiguring the watchdog with a factor, would prevent it to happen too frequently during LPM. Rename lockup_detector_reconfigure() as __lockup_detector_reconfigure() and create a new function lockup_detector_reconfigure() calling __lockup_detector_reconfigure() under the protection of watchdog_mutex. 
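A hypothetical arch-side caller of the newly exported helper could look like the sketch below; the scaling factor, the hook names and touching watchdog_thresh directly are all illustrative, and the declaration of lockup_detector_reconfigure() is assumed to come from <linux/nmi.h> in the same series.

    #include <linux/nmi.h>

    static int demo_saved_thresh;

    /* Before a long platform operation (e.g. a partition migration): widen
     * the lockup window so the expected stalls do not fire the watchdog.
     */
    static void demo_migration_prepare(void)
    {
        demo_saved_thresh = watchdog_thresh;
        watchdog_thresh = demo_saved_thresh * 2;
        lockup_detector_reconfigure();
    }

    /* After the operation completes: restore the normal threshold. */
    static void demo_migration_done(void)
    {
        watchdog_thresh = demo_saved_thresh;
        lockup_detector_reconfigure();
    }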
Signed-off-by: Laurent Dufour [mpe: Squash in build fix from Laurent, reported by Sachin] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220713154729.80789-3-ldufour@linux.ibm.com --- kernel/watchdog.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 20a7a55e62b6..41596c415111 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -541,7 +541,7 @@ int lockup_detector_offline_cpu(unsigned int cpu) return 0; } -static void lockup_detector_reconfigure(void) +static void __lockup_detector_reconfigure(void) { cpus_read_lock(); watchdog_nmi_stop(); @@ -561,6 +561,13 @@ static void lockup_detector_reconfigure(void) __lockup_detector_cleanup(); } +void lockup_detector_reconfigure(void) +{ + mutex_lock(&watchdog_mutex); + __lockup_detector_reconfigure(); + mutex_unlock(&watchdog_mutex); +} + /* * Create the watchdog infrastructure and configure the detector(s). */ @@ -577,13 +584,13 @@ static __init void lockup_detector_setup(void) return; mutex_lock(&watchdog_mutex); - lockup_detector_reconfigure(); + __lockup_detector_reconfigure(); softlockup_initialized = true; mutex_unlock(&watchdog_mutex); } #else /* CONFIG_SOFTLOCKUP_DETECTOR */ -static void lockup_detector_reconfigure(void) +static void __lockup_detector_reconfigure(void) { cpus_read_lock(); watchdog_nmi_stop(); @@ -591,9 +598,13 @@ static void lockup_detector_reconfigure(void) watchdog_nmi_start(); cpus_read_unlock(); } +void lockup_detector_reconfigure(void) +{ + __lockup_detector_reconfigure(); +} static inline void lockup_detector_setup(void) { - lockup_detector_reconfigure(); + __lockup_detector_reconfigure(); } #endif /* !CONFIG_SOFTLOCKUP_DETECTOR */ @@ -633,7 +644,7 @@ static void proc_watchdog_update(void) { /* Remove impossible cpus to keep sysctl output clean. */ cpumask_and(&watchdog_cpumask, &watchdog_cpumask, cpu_possible_mask); - lockup_detector_reconfigure(); + __lockup_detector_reconfigure(); } /* -- cgit From c808f4632349bda65b2ec41220f0a2035f780619 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 27 Jul 2022 07:54:55 -1000 Subject: cgroup: remove "no" prefixed mount options MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 30312730bd02 ("cgroup: Add "no" prefixed mount options") added "no" prefixed mount options to allow turning them off and 6a010a49b63a ("cgroup: Make !percpu threadgroup_rwsem operations optional") added one more "no" prefixed mount option. However, Michal pointed out that the "no" prefixed options aren't necessary in allowing mount options to be turned off: # grep group /proc/mounts cgroup2 /sys/fs/cgroup cgroup2 rw,nosuid,nodev,relatime,nsdelegate,memory_recursiveprot 0 0 # mount -o remount,nsdelegate,memory_recursiveprot none /sys/fs/cgroup # grep cgroup /proc/mounts cgroup2 /sys/fs/cgroup cgroup2 rw,relatime,nsdelegate,memory_recursiveprot 0 0 Note that this is different from the remount behavior when the mount(1) is invoked without the device argument - "none": # grep cgroup /proc/mounts cgroup2 /sys/fs/cgroup cgroup2 rw,nosuid,nodev,noexec,relatime,nsdelegate,memory_recursiveprot 0 0 # mount -o remount,nsdelegate,memory_recursiveprot /sys/fs/cgroup # grep cgroup /proc/mounts cgroup2 /sys/fs/cgroup cgroup2 rw,nosuid,nodev,noexec,relatime,nsdelegate,memory_recursiveprot 0 0 While a bit confusing, given that there is a way to turn off the options, there's no reason to have the explicit "no" prefixed options. Let's remove them. 
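The same behaviour can be exercised at the mount(2) level; in the sketch below the mount point, the "none" source and the option string are just examples, and the call needs CAP_SYS_ADMIN.

    #include <stdio.h>
    #include <sys/mount.h>

    /* Remount cgroup2 listing only the options that should remain set; any
     * cgroup2-specific option left out of the string is turned off, so no
     * "no" prefixed variants are required.
     */
    int main(void)
    {
        if (mount("none", "/sys/fs/cgroup", NULL, MS_REMOUNT,
                  "nsdelegate,memory_recursiveprot") != 0) {
            perror("remount cgroup2");
            return 1;
        }
        return 0;
    }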
Signed-off-by: Tejun Heo Cc: Michal Koutný Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 24 ++++-------------------- 1 file changed, 4 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 7d023d42a6a5..c85dadf0d53b 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1872,22 +1872,18 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, } enum cgroup2_param { - Opt_nsdelegate, Opt_nonsdelegate, - Opt_favordynmods, Opt_nofavordynmods, - Opt_memory_localevents, Opt_memory_nolocalevents, - Opt_memory_recursiveprot, Opt_memory_norecursiveprot, + Opt_nsdelegate, + Opt_favordynmods, + Opt_memory_localevents, + Opt_memory_recursiveprot, nr__cgroup2_params }; static const struct fs_parameter_spec cgroup2_fs_parameters[] = { fsparam_flag("nsdelegate", Opt_nsdelegate), - fsparam_flag("nonsdelegate", Opt_nonsdelegate), fsparam_flag("favordynmods", Opt_favordynmods), - fsparam_flag("nofavordynmods", Opt_nofavordynmods), fsparam_flag("memory_localevents", Opt_memory_localevents), - fsparam_flag("memory_nolocalevents", Opt_memory_nolocalevents), fsparam_flag("memory_recursiveprot", Opt_memory_recursiveprot), - fsparam_flag("memory_norecursiveprot", Opt_memory_norecursiveprot), {} }; @@ -1905,27 +1901,15 @@ static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param case Opt_nsdelegate: ctx->flags |= CGRP_ROOT_NS_DELEGATE; return 0; - case Opt_nonsdelegate: - ctx->flags &= ~CGRP_ROOT_NS_DELEGATE; - return 0; case Opt_favordynmods: ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS; return 0; - case Opt_nofavordynmods: - ctx->flags &= ~CGRP_ROOT_FAVOR_DYNMODS; - return 0; case Opt_memory_localevents: ctx->flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS; return 0; - case Opt_memory_nolocalevents: - ctx->flags &= ~CGRP_ROOT_MEMORY_LOCAL_EVENTS; - return 0; case Opt_memory_recursiveprot: ctx->flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT; return 0; - case Opt_memory_norecursiveprot: - ctx->flags &= ~CGRP_ROOT_MEMORY_RECURSIVE_PROT; - return 0; } return -EINVAL; } -- cgit From 8419702489f3be7f9e4fcf12c04d9d3f00114d35 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Wed, 27 Jul 2022 13:15:22 -0600 Subject: dma-mapping: reformat comment to suppress htmldoc warning make html doc reports a cryptic warning with the commit named below: kernel/dma/mapping.c:258: WARNING: Option list ends without a blank line; unexpected unindent. Seems the parser is a bit fussy about the tabbing and having a single space tab causes the warning. To suppress the warning add another tab to the list and reindent everything. Fixes: 7c2645a2a30a ("dma-mapping: allow EREMOTEIO return code for P2PDMA transfers") Signed-off-by: Logan Gunthorpe Signed-off-by: Christoph Hellwig --- kernel/dma/mapping.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index a9ce5d728231..49cbf3e33de7 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -249,15 +249,15 @@ EXPORT_SYMBOL(dma_map_sg_attrs); * Returns 0 on success or a negative error code on error. The following * error codes are supported with the given meaning: * - * -EINVAL An invalid argument, unaligned access or other error - * in usage. Will not succeed if retried. - * -ENOMEM Insufficient resources (like memory or IOVA space) to - * complete the mapping. Should succeed if retried later. - * -EIO Legacy error code with an unknown meaning. eg. 
this is - * returned if a lower level call returned DMA_MAPPING_ERROR. - * -EREMOTEIO The DMA device cannot access P2PDMA memory specified in - * the sg_table. This will not succeed if retried. - * + * -EINVAL An invalid argument, unaligned access or other error + * in usage. Will not succeed if retried. + * -ENOMEM Insufficient resources (like memory or IOVA space) to + * complete the mapping. Should succeed if retried later. + * -EIO Legacy error code with an unknown meaning. eg. this is + * returned if a lower level call returned + * DMA_MAPPING_ERROR. + * -EREMOTEIO The DMA device cannot access P2PDMA memory specified + * in the sg_table. This will not succeed if retried. */ int dma_map_sgtable(struct device *dev, struct sg_table *sgt, enum dma_data_direction dir, unsigned long attrs) -- cgit From 5c850d31880e00f063fa2a3746ba212c4bcc510f Mon Sep 17 00:00:00 2001 From: Tianyu Lan Date: Thu, 28 Jul 2022 03:24:20 -0400 Subject: swiotlb: fix passing local variable to debugfs_create_ulong() Debugfs node will be run-timely checked and so local variable should be not passed to debugfs_create_ulong(). Fix it via debugfs_create_file() to create io_tlb_used node and calculate used io tlb number with fops_io_tlb_used attribute. Fixes: 20347fca71a3 ("swiotlb: split up the global swiotlb lock") Signed-off-by: Tianyu Lan Signed-off-by: Christoph Hellwig --- kernel/dma/swiotlb.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index cc50f1fb127f..c5a9190b218f 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -912,17 +912,23 @@ bool is_swiotlb_active(struct device *dev) } EXPORT_SYMBOL_GPL(is_swiotlb_active); +static int io_tlb_used_get(void *data, u64 *val) +{ + *val = mem_used(&io_tlb_default_mem); + return 0; +} +DEFINE_DEBUGFS_ATTRIBUTE(fops_io_tlb_used, io_tlb_used_get, NULL, "%llu\n"); + static void swiotlb_create_debugfs_files(struct io_tlb_mem *mem, const char *dirname) { - unsigned long used = mem_used(mem); - mem->debugfs = debugfs_create_dir(dirname, io_tlb_default_mem.debugfs); if (!mem->nslabs) return; debugfs_create_ulong("io_tlb_nslabs", 0400, mem->debugfs, &mem->nslabs); - debugfs_create_ulong("io_tlb_used", 0400, mem->debugfs, &used); + debugfs_create_file("io_tlb_used", 0400, mem->debugfs, NULL, + &fops_io_tlb_used); } static int __init __maybe_unused swiotlb_create_default_debugfs(void) -- cgit From 265792d0dede9259f0ca56bb3efcc23eceee7d01 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 27 Jul 2022 20:58:15 -0400 Subject: cgroup: Skip subtree root in cgroup_update_dfl_csses() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The cgroup_update_dfl_csses() function updates css associations when a cgroup's subtree_control file is modified. Any changes made to a cgroup's subtree_control file, however, will only affect its descendants but not the cgroup itself. So there is no point in migrating csses associated with that cgroup. We can skip them instead. 
Signed-off-by: Waiman Long Reviewed-by: Michal Koutný Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index c85dadf0d53b..85fa4c8587a8 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -2971,6 +2971,15 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { struct cgrp_cset_link *link; + /* + * As cgroup_update_dfl_csses() is only called by + * cgroup_apply_control(). The csses associated with the + * given cgrp will not be affected by changes made to + * its subtree_control file. We can skip them. + */ + if (dsct == cgrp) + continue; + list_for_each_entry(link, &dsct->cset_links, cset_link) cgroup_migrate_add_src(link->cset, dsct, &mgctx); } -- cgit From 58250ae350de8d28ce91ade4605d32c9e7f062a8 Mon Sep 17 00:00:00 2001 From: Fedor Tokarev Date: Mon, 11 Jul 2022 23:13:17 +0200 Subject: bpf: btf: Fix vsnprintf return value check vsnprintf returns the number of characters which would have been written if enough space had been available, excluding the terminating null byte. Thus, the return value of 'len_left' means that the last character has been dropped. Signed-off-by: Fedor Tokarev Signed-off-by: Andrii Nakryiko Acked-by: Alan Maguire Link: https://lore.kernel.org/bpf/20220711211317.GA1143610@laptop --- kernel/bpf/btf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 7ac971ea98d1..7e64447659f3 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6643,7 +6643,7 @@ static void btf_snprintf_show(struct btf_show *show, const char *fmt, if (len < 0) { ssnprintf->len_left = 0; ssnprintf->len = len; - } else if (len > ssnprintf->len_left) { + } else if (len >= ssnprintf->len_left) { /* no space, drive on to get length we would have written */ ssnprintf->len_left = 0; ssnprintf->len += len; -- cgit From dc81f8d1e8ea3f5dfa88919cb834a135a6a536b8 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Fri, 29 Jul 2022 12:41:06 -0700 Subject: bpf: Fix test_progs -j error with fentry/fexit tests When multiple threads are attaching/detaching fentry/fexit programs to the same trampoline, we may call register_fentry on the same trampoline twice: register_fentry(), unregister_fentry(), then register_fentry again. This causes ftrace_set_filter_ip() for the same ip on tr->fops twice, which leaves duplicated ip in tr->fops. The extra ip is not cleaned up properly on unregister and thus causes failures with further register in register_ftrace_direct_multi(): register_ftrace_direct_multi() { ... for (i = 0; i < size; i++) { hlist_for_each_entry(entry, &hash->buckets[i], hlist) { if (ftrace_find_rec_direct(entry->ip)) goto out_unlock; } } ... } This can be triggered with parallel fentry/fexit tests with test_progs: ./test_progs -t fentry,fexit -j Fix this by resetting tr->fops in ftrace_set_filter_ip(), so that there will never be duplicated entries in tr->fops. Fixes: 00963a2e75a8 ("bpf: Support bpf_trampoline on functions with IPMODIFY (e.g. 
livepatch)") Reported-by: Andrii Nakryiko Signed-off-by: Song Liu Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220729194106.1207472-1-song@kernel.org --- kernel/bpf/trampoline.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 42e387a12694..7ec7e23559ad 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -255,7 +255,7 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr) return -ENOENT; if (tr->func.ftrace_managed) { - ftrace_set_filter_ip(tr->fops, (unsigned long)ip, 0, 0); + ftrace_set_filter_ip(tr->fops, (unsigned long)ip, 0, 1); ret = register_ftrace_direct_multi(tr->fops, (long)new_addr); } else { ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, NULL, new_addr); -- cgit From 3b317abc71598bda8ff9a9c483ad8ae167b18382 Mon Sep 17 00:00:00 2001 From: Xu Kuohai Date: Thu, 28 Jul 2022 07:40:48 -0400 Subject: bpf: Fix NULL pointer dereference when registering bpf trampoline A panic was reported on arm64: [ 44.517109] audit: type=1334 audit(1658859870.268:59): prog-id=19 op=LOAD [ 44.622031] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000010 [ 44.624321] Mem abort info: [ 44.625049] ESR = 0x0000000096000004 [ 44.625935] EC = 0x25: DABT (current EL), IL = 32 bits [ 44.627182] SET = 0, FnV = 0 [ 44.627930] EA = 0, S1PTW = 0 [ 44.628684] FSC = 0x04: level 0 translation fault [ 44.629788] Data abort info: [ 44.630474] ISV = 0, ISS = 0x00000004 [ 44.631362] CM = 0, WnR = 0 [ 44.632041] user pgtable: 4k pages, 48-bit VAs, pgdp=0000000100ab5000 [ 44.633494] [0000000000000010] pgd=0000000000000000, p4d=0000000000000000 [ 44.635202] Internal error: Oops: 96000004 [#1] SMP [ 44.636452] Modules linked in: xfs crct10dif_ce ghash_ce virtio_blk virtio_console virtio_mmio qemu_fw_cfg [ 44.638713] CPU: 2 PID: 1 Comm: systemd Not tainted 5.19.0-rc7 #1 [ 44.640164] Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015 [ 44.641799] pstate: 00400005 (nzcv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) [ 44.643404] pc : ftrace_set_filter_ip+0x24/0xa0 [ 44.644659] lr : bpf_trampoline_update.constprop.0+0x428/0x4a0 [ 44.646118] sp : ffff80000803b9f0 [ 44.646950] x29: ffff80000803b9f0 x28: ffff0b5d80364400 x27: ffff80000803bb48 [ 44.648721] x26: ffff8000085ad000 x25: ffff0b5d809d2400 x24: 0000000000000000 [ 44.650493] x23: 00000000ffffffed x22: ffff0b5dd7ea0900 x21: 0000000000000000 [ 44.652279] x20: 0000000000000000 x19: 0000000000000000 x18: ffffffffffffffff [ 44.654067] x17: 0000000000000000 x16: 0000000000000000 x15: ffffffffffffffff [ 44.655787] x14: ffff0b5d809d2498 x13: ffff0b5d809d2432 x12: 0000000005f5e100 [ 44.657535] x11: abcc77118461cefd x10: 000000000000005f x9 : ffffa7219cb5b190 [ 44.659254] x8 : ffffa7219c8e0000 x7 : 0000000000000000 x6 : ffffa7219db075e0 [ 44.661066] x5 : ffffa7219d3130e0 x4 : ffffa7219cab9da0 x3 : 0000000000000000 [ 44.662837] x2 : 0000000000000000 x1 : ffffa7219cb7a5c0 x0 : 0000000000000000 [ 44.664675] Call trace: [ 44.665274] ftrace_set_filter_ip+0x24/0xa0 [ 44.666327] bpf_trampoline_update.constprop.0+0x428/0x4a0 [ 44.667696] __bpf_trampoline_link_prog+0xcc/0x1c0 [ 44.668834] bpf_trampoline_link_prog+0x40/0x64 [ 44.669919] bpf_tracing_prog_attach+0x120/0x490 [ 44.671011] link_create+0xe0/0x2b0 [ 44.671869] __sys_bpf+0x484/0xd30 [ 44.672706] __arm64_sys_bpf+0x30/0x40 [ 44.673678] invoke_syscall+0x78/0x100 [ 44.674623] el0_svc_common.constprop.0+0x4c/0xf4 [ 44.675783] do_el0_svc+0x38/0x4c [ 
44.676624] el0_svc+0x34/0x100 [ 44.677429] el0t_64_sync_handler+0x11c/0x150 [ 44.678532] el0t_64_sync+0x190/0x194 [ 44.679439] Code: 2a0203f4 f90013f5 2a0303f5 f9001fe1 (f9400800) [ 44.680959] ---[ end trace 0000000000000000 ]--- [ 44.682111] Kernel panic - not syncing: Oops: Fatal exception [ 44.683488] SMP: stopping secondary CPUs [ 44.684551] Kernel Offset: 0x2721948e0000 from 0xffff800008000000 [ 44.686095] PHYS_OFFSET: 0xfffff4a380000000 [ 44.687144] CPU features: 0x010,00022811,19001080 [ 44.688308] Memory Limit: none [ 44.689082] ---[ end Kernel panic - not syncing: Oops: Fatal exception ]--- It's caused by a NULL tr->fops passed to ftrace_set_filter_ip(). tr->fops is initialized to NULL and is assigned to an allocated memory address if CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS is enabled. Since there is no direct call on arm64 yet, the config can't be enabled. To fix it, call ftrace_set_filter_ip() only if tr->fops is not NULL. Fixes: 00963a2e75a8 ("bpf: Support bpf_trampoline on functions with IPMODIFY (e.g. livepatch)") Reported-by: Bruno Goncalves Signed-off-by: Xu Kuohai Signed-off-by: Andrii Nakryiko Tested-by: Bruno Goncalves Acked-by: Song Liu Acked-by: Jiri Olsa Link: https://lore.kernel.org/bpf/20220728114048.3540461-1-xukuohai@huaweicloud.com --- kernel/bpf/trampoline.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 7ec7e23559ad..c122d8b3ddc9 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -248,8 +248,11 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr) int ret; faddr = ftrace_location((unsigned long)ip); - if (faddr) + if (faddr) { + if (!tr->fops) + return -ENOTSUPP; tr->func.ftrace_managed = true; + } if (bpf_trampoline_module_get(tr)) return -ENOENT; -- cgit From 14250fa4839b3a48c979e7faaf4cbcce619d02bd Mon Sep 17 00:00:00 2001 From: Yang Li Date: Tue, 26 Jul 2022 06:27:33 +0800 Subject: bpf: Remove unneeded semicolon Eliminate the following coccicheck warning: /kernel/bpf/trampoline.c:101:2-3: Unneeded semicolon Reported-by: Abaci Robot Signed-off-by: Yang Li Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220725222733.55613-1-yang.lee@linux.alibaba.com --- kernel/bpf/trampoline.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index c122d8b3ddc9..0f532e6a717f 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -98,7 +98,7 @@ static int bpf_tramp_ftrace_ops_func(struct ftrace_ops *ops, enum ftrace_ops_cmd default: ret = -EINVAL; break; - }; + } mutex_unlock(&tr->mutex); return ret; -- cgit From 591c32bddbe20ba0e172d9def3c7f22b9c926ad9 Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Thu, 14 Jul 2022 08:47:44 +0100 Subject: kernel/hung_task: fix address space of proc_dohung_task_timeout_secs The proc_dohung_task_timeout_secs() function is incorrectly marked as having a __user buffer as argument 3. However this is not the case and it is casing multiple sparse warnings. 
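For reference, the sparse-clean proc_handler signature takes a plain kernel pointer for the buffer; a minimal hypothetical handler matching it, delegating to an existing helper, is sketched below.

    #include <linux/sysctl.h>

    /* Sketch only: the third argument is void *, not void __user *, since
     * sysctl handlers are fed a kernel buffer.
     */
    static int demo_timeout_handler(struct ctl_table *table, int write,
                                    void *buffer, size_t *lenp, loff_t *ppos)
    {
        return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
    }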
Fix the following warnings by removing __user from the argument: kernel/hung_task.c:237:52: warning: incorrect type in argument 3 (different address spaces) kernel/hung_task.c:237:52: expected void * kernel/hung_task.c:237:52: got void [noderef] __user *buffer kernel/hung_task.c:287:35: warning: incorrect type in initializer (incompatible argument 3 (different address spaces)) kernel/hung_task.c:287:35: expected int ( [usertype] *proc_handler )( ... ) kernel/hung_task.c:287:35: got int ( * )( ... ) kernel/hung_task.c:295:35: warning: incorrect type in initializer (incompatible argument 3 (different address spaces)) kernel/hung_task.c:295:35: expected int ( [usertype] *proc_handler )( ... ) kernel/hung_task.c:295:35: got int ( * )( ... ) Link: https://lkml.kernel.org/r/20220714074744.189017-1-ben.dooks@sifive.com Signed-off-by: Ben Dooks Cc: Signed-off-by: Andrew Morton --- kernel/hung_task.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/hung_task.c b/kernel/hung_task.c index cff3ae8c818f..bb2354f73ded 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -229,7 +229,7 @@ static long hung_timeout_jiffies(unsigned long last_checked, * Process updating of timeout sysctl */ static int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, - void __user *buffer, + void *buffer, size_t *lenp, loff_t *ppos) { int ret; -- cgit From 787dbea11a5d6843999ff71a3fb9aa1ed6d5d889 Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Thu, 21 Jul 2022 20:55:09 +0100 Subject: profile: setup_profiling_timer() is mostly not implemented The setup_profiling_timer() is mostly un-implemented by many architectures. In many places it isn't guarded by CONFIG_PROFILE which is needed for it to be used. Make it a weak symbol in kernel/profile.c and remove the 'return -EINVAL' implementations from the kernel. There are a couple of architectures which do return 0 from the setup_profiling_timer() function but they don't seem to do anything else with it. To keep the /proc compatibility for now, leave these for a future update or removal. On ARM, this fixes the following sparse warning: arch/arm/kernel/smp.c:793:5: warning: symbol 'setup_profiling_timer' was not declared. Should it be static?
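As an illustrative aside (not part of this patch), the weak-symbol pattern the change relies on can be seen in a minimal user-space form; setup_timer() is a made-up stand-in for setup_profiling_timer(), and -22 stands in for -EINVAL.

/* weak_demo.c - build with: cc weak_demo.c -o weak_demo */
#include <stdio.h>

/*
 * Weak default, analogous to the fallback added to kernel/profile.c.
 * Any ordinary (strong) definition of setup_timer() linked in from
 * another object file silently overrides it at link time.
 */
__attribute__((weak)) int setup_timer(unsigned int mult)
{
	(void)mult;
	return -22;	/* stands in for -EINVAL */
}

int main(void)
{
	printf("setup_timer(1) = %d\n", setup_timer(1));
	return 0;
}

Linking in a second object file that provides a non-weak setup_timer() would make main() call that version instead, which is how an architecture can keep supplying its own implementation while the generic fallback covers everyone else.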
Link: https://lkml.kernel.org/r/20220721195509.418205-1-ben-linux@fluff.org Signed-off-by: Ben Dooks Signed-off-by: Andrew Morton --- kernel/profile.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/profile.c b/kernel/profile.c index ae82ddfc6a68..7ea01ba30e75 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -425,6 +425,12 @@ read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos) return read; } +/* default is to not implement this call */ +int __weak setup_profiling_timer(unsigned mult) +{ + return -EINVAL; +} + /* * Writing to /proc/profile resets the counters * @@ -435,8 +441,6 @@ static ssize_t write_profile(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { #ifdef CONFIG_SMP - extern int setup_profiling_timer(unsigned int multiplier); - if (count == sizeof(int)) { unsigned int multiplier; -- cgit From 102227b970a15256f5ffd12a6a276ddf978e6caf Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Fri, 29 Jul 2022 11:38:40 +0200 Subject: rv: Add Runtime Verification (RV) interface RV is a lightweight (yet rigorous) method that complements classical exhaustive verification techniques (such as model checking and theorem proving) with a more practical approach to complex systems. RV works by analyzing the trace of the system's actual execution, comparing it against a formal specification of the system behavior. RV can give precise information on the runtime behavior of the monitored system while enabling the reaction for unexpected events, avoiding, for example, the propagation of a failure on safety-critical systems. The development of this interface roots in the development of the paper: De Oliveira, Daniel Bristot; Cucinotta, Tommaso; De Oliveira, Romulo Silva. Efficient formal verification for the Linux kernel. In: International Conference on Software Engineering and Formal Methods. Springer, Cham, 2019. p. 315-332. And: De Oliveira, Daniel Bristot. Automata-based formal analysis and verification of the real-time Linux kernel. PhD Thesis, 2020. The RV interface resembles the tracing/ interface on purpose. The current path for the RV interface is /sys/kernel/tracing/rv/. It presents these files: "available_monitors" - List the available monitors, one per line. For example: # cat available_monitors wip wwnr "enabled_monitors" - Lists the enabled monitors, one per line; - Writing to it enables a given monitor; - Writing a monitor name with a '!' prefix disables it; - Truncating the file disables all enabled monitors. For example: # cat enabled_monitors # echo wip > enabled_monitors # echo wwnr >> enabled_monitors # cat enabled_monitors wip wwnr # echo '!wip' >> enabled_monitors # cat enabled_monitors wwnr # echo > enabled_monitors # cat enabled_monitors # Note that more than one monitor can be enabled concurrently. "monitoring_on" - It is an on/off general switcher for monitoring. Note that it does not disable enabled monitors or detach events, but stops the per-entity monitors from monitoring the events received from the system. It resembles the "tracing_on" switcher. "monitors/" Each monitor will have its own directory inside "monitors/". There the monitor specific files will be presented. The "monitors/" directory resembles the "events" directory on tracefs. For example: # cd monitors/wip/ # ls desc enable # cat desc wakeup in preemptive per-cpu testing monitor. # cat enable 0 For further information, see the comments in the header of kernel/trace/rv/rv.c from this patch.
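To make the registration flow concrete before the diff, here is a condensed, hypothetical monitor module written in the shape this series later uses for the wip monitor. The header paths and the empty callback bodies are assumptions for illustration and are not part of this patch.

#include <linux/init.h>
#include <linux/module.h>
#include <linux/rv.h>	/* assumed home of struct rv_monitor and rv_register_monitor() */

static int enable_example(void)
{
	/* attach the tracepoint probes that feed events into the model */
	return 0;
}

static void disable_example(void)
{
	/* detach the probes and stop the verification session */
}

static void reset_example(void)
{
	/* re-sync per-entity monitor state, e.g. after monitoring_on toggles */
}

static struct rv_monitor rv_example = {
	.name		= "example",
	.description	= "skeleton showing the rv_register_monitor() flow.",
	.enable		= enable_example,
	.disable	= disable_example,
	.reset		= reset_example,
	.enabled	= 0,
};

static int __init example_init(void)
{
	return rv_register_monitor(&rv_example);
}

static void __exit example_exit(void)
{
	rv_unregister_monitor(&rv_example);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");

Once registered, such a monitor would show up in available_monitors and could be switched on with "echo example > enabled_monitors".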
Link: https://lkml.kernel.org/r/a4bfe038f50cb047bfb343ad0e12b0e646ab308b.1659052063.git.bristot@kernel.org Cc: Wim Van Sebroeck Cc: Guenter Roeck Cc: Jonathan Corbet Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Will Deacon Cc: Catalin Marinas Cc: Marco Elver Cc: Dmitry Vyukov Cc: "Paul E. McKenney" Cc: Shuah Khan Cc: Gabriele Paoloni Cc: Juri Lelli Cc: Clark Williams Cc: Tao Zhou Cc: Randy Dunlap Cc: linux-doc@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: linux-trace-devel@vger.kernel.org Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) --- kernel/trace/Kconfig | 2 + kernel/trace/Makefile | 1 + kernel/trace/rv/Kconfig | 12 + kernel/trace/rv/Makefile | 3 + kernel/trace/rv/rv.c | 782 +++++++++++++++++++++++++++++++++++++++++++++++ kernel/trace/rv/rv.h | 33 ++ kernel/trace/trace.c | 2 + kernel/trace/trace.h | 9 + 8 files changed, 844 insertions(+) create mode 100644 kernel/trace/rv/Kconfig create mode 100644 kernel/trace/rv/Makefile create mode 100644 kernel/trace/rv/rv.c create mode 100644 kernel/trace/rv/rv.h (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index ccd6a5ade3e9..1052126bdca2 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -1106,4 +1106,6 @@ config HIST_TRIGGERS_DEBUG If unsure, say N. +source "kernel/trace/rv/Kconfig" + endif # FTRACE diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 0d261774d6f3..c6651e16b557 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -106,5 +106,6 @@ obj-$(CONFIG_FPROBE) += fprobe.o obj-$(CONFIG_RETHOOK) += rethook.o obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o +obj-$(CONFIG_RV) += rv/ libftrace-y := ftrace.o diff --git a/kernel/trace/rv/Kconfig b/kernel/trace/rv/Kconfig new file mode 100644 index 000000000000..6d127cdb00dd --- /dev/null +++ b/kernel/trace/rv/Kconfig @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +menuconfig RV + bool "Runtime Verification" + depends on TRACING + help + Enable the kernel runtime verification infrastructure. RV is a + lightweight (yet rigorous) method that complements classical + exhaustive verification techniques (such as model checking and + theorem proving). RV works by analyzing the trace of the system's + actual execution, comparing it against a formal specification of + the system behavior. diff --git a/kernel/trace/rv/Makefile b/kernel/trace/rv/Makefile new file mode 100644 index 000000000000..fd995379df67 --- /dev/null +++ b/kernel/trace/rv/Makefile @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0 + +obj-$(CONFIG_RV) += rv.o diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c new file mode 100644 index 000000000000..731cc961cc70 --- /dev/null +++ b/kernel/trace/rv/rv.c @@ -0,0 +1,782 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2019-2022 Red Hat, Inc. Daniel Bristot de Oliveira + * + * This is the online Runtime Verification (RV) interface. + * + * RV is a lightweight (yet rigorous) method that complements classical + * exhaustive verification techniques (such as model checking and + * theorem proving) with a more practical approach to complex systems. + * + * RV works by analyzing the trace of the system's actual execution, + * comparing it against a formal specification of the system behavior. + * RV can give precise information on the runtime behavior of the + * monitored system while enabling the reaction for unexpected + * events, avoiding, for example, the propagation of a failure on + * safety-critical systems. 
+ * + * The development of this interface roots in the development of the + * paper: + * + * De Oliveira, Daniel Bristot; Cucinotta, Tommaso; De Oliveira, Romulo + * Silva. Efficient formal verification for the Linux kernel. In: + * International Conference on Software Engineering and Formal Methods. + * Springer, Cham, 2019. p. 315-332. + * + * And: + * + * De Oliveira, Daniel Bristot, et al. Automata-based formal analysis + * and verification of the real-time Linux kernel. PhD Thesis, 2020. + * + * == Runtime monitor interface == + * + * A monitor is the central part of the runtime verification of a system. + * + * The monitor stands in between the formal specification of the desired + * (or undesired) behavior, and the trace of the actual system. + * + * In Linux terms, the runtime verification monitors are encapsulated + * inside the "RV monitor" abstraction. A RV monitor includes a reference + * model of the system, a set of instances of the monitor (per-cpu monitor, + * per-task monitor, and so on), and the helper functions that glue the + * monitor to the system via trace. Generally, a monitor includes some form + * of trace output as a reaction for event parsing and exceptions, + * as depicted bellow: + * + * Linux +----- RV Monitor ----------------------------------+ Formal + * Realm | | Realm + * +-------------------+ +----------------+ +-----------------+ + * | Linux kernel | | Monitor | | Reference | + * | Tracing | -> | Instance(s) | <- | Model | + * | (instrumentation) | | (verification) | | (specification) | + * +-------------------+ +----------------+ +-----------------+ + * | | | + * | V | + * | +----------+ | + * | | Reaction | | + * | +--+--+--+-+ | + * | | | | | + * | | | +-> trace output ? | + * +------------------------|--|----------------------+ + * | +----> panic ? + * +-------> + * + * This file implements the interface for loading RV monitors, and + * to control the verification session. + * + * == Registering monitors == + * + * The struct rv_monitor defines a set of callback functions to control + * a verification session. For instance, when a given monitor is enabled, + * the "enable" callback function is called to hook the instrumentation + * functions to the kernel trace events. The "disable" function is called + * when disabling the verification session. + * + * A RV monitor is registered via: + * int rv_register_monitor(struct rv_monitor *monitor); + * And unregistered via: + * int rv_unregister_monitor(struct rv_monitor *monitor); + * + * == User interface == + * + * The user interface resembles kernel tracing interface. It presents + * these files: + * + * "available_monitors" + * - List the available monitors, one per line. + * + * For example: + * # cat available_monitors + * wip + * wwnr + * + * "enabled_monitors" + * - Lists the enabled monitors, one per line; + * - Writing to it enables a given monitor; + * - Writing a monitor name with a '!' prefix disables it; + * - Truncating the file disables all enabled monitors. + * + * For example: + * # cat enabled_monitors + * # echo wip > enabled_monitors + * # echo wwnr >> enabled_monitors + * # cat enabled_monitors + * wip + * wwnr + * # echo '!wip' >> enabled_monitors + * # cat enabled_monitors + * wwnr + * # echo > enabled_monitors + * # cat enabled_monitors + * # + * + * Note that more than one monitor can be enabled concurrently. + * + * "monitoring_on" + * - It is an on/off general switcher for monitoring. 
Note + * that it does not disable enabled monitors or detach events, + * but stops the per-entity monitors from monitoring the events + * received from the instrumentation. It resembles the "tracing_on" + * switcher. + * + * "monitors/" + * Each monitor will have its own directory inside "monitors/". There + * the monitor specific files will be presented. + * The "monitors/" directory resembles the "events" directory on + * tracefs. + * + * For example: + * # cd monitors/wip/ + * # ls + * desc enable + * # cat desc + * auto-generated wakeup in preemptive monitor. + * # cat enable + * 0 + */ + +#include +#include +#include +#include + +#include "rv.h" + +DEFINE_MUTEX(rv_interface_lock); + +static struct rv_interface rv_root; + +struct dentry *get_monitors_root(void) +{ + return rv_root.monitors_dir; +} + +/* + * Interface for the monitor register. + */ +static LIST_HEAD(rv_monitors_list); + +static int task_monitor_count; +static bool task_monitor_slots[RV_PER_TASK_MONITORS]; + +int rv_get_task_monitor_slot(void) +{ + int i; + + lockdep_assert_held(&rv_interface_lock); + + if (task_monitor_count == RV_PER_TASK_MONITORS) + return -EBUSY; + + task_monitor_count++; + + for (i = 0; i < RV_PER_TASK_MONITORS; i++) { + if (task_monitor_slots[i] == false) { + task_monitor_slots[i] = true; + return i; + } + } + + WARN_ONCE(1, "RV task_monitor_count and slots are out of sync\n"); + + return -EINVAL; +} + +void rv_put_task_monitor_slot(int slot) +{ + lockdep_assert_held(&rv_interface_lock); + + if (slot < 0 || slot >= RV_PER_TASK_MONITORS) { + WARN_ONCE(1, "RV releasing an invalid slot!: %d\n", slot); + return; + } + + WARN_ONCE(!task_monitor_slots[slot], "RV releasing unused task_monitor_slots: %d\n", + slot); + + task_monitor_count--; + task_monitor_slots[slot] = false; +} + +/* + * This section collects the monitor/ files and folders. + */ +static ssize_t monitor_enable_read_data(struct file *filp, char __user *user_buf, size_t count, + loff_t *ppos) +{ + struct rv_monitor_def *mdef = filp->private_data; + const char *buff; + + buff = mdef->monitor->enabled ? "1\n" : "0\n"; + + return simple_read_from_buffer(user_buf, count, ppos, buff, strlen(buff)+1); +} + +/* + * __rv_disable_monitor - disabled an enabled monitor + */ +static int __rv_disable_monitor(struct rv_monitor_def *mdef, bool sync) +{ + lockdep_assert_held(&rv_interface_lock); + + if (mdef->monitor->enabled) { + mdef->monitor->enabled = 0; + mdef->monitor->disable(); + + /* + * Wait for the execution of all events to finish. + * Otherwise, the data used by the monitor could + * be inconsistent. i.e., if the monitor is re-enabled. + */ + if (sync) + tracepoint_synchronize_unregister(); + return 1; + } + return 0; +} + +/** + * rv_disable_monitor - disable a given runtime monitor + * + * Returns 0 on success. + */ +int rv_disable_monitor(struct rv_monitor_def *mdef) +{ + __rv_disable_monitor(mdef, true); + return 0; +} + +/** + * rv_enable_monitor - enable a given runtime monitor + * + * Returns 0 on success, error otherwise. + */ +int rv_enable_monitor(struct rv_monitor_def *mdef) +{ + int retval; + + lockdep_assert_held(&rv_interface_lock); + + if (mdef->monitor->enabled) + return 0; + + retval = mdef->monitor->enable(); + + if (!retval) + mdef->monitor->enabled = 1; + + return retval; +} + +/* + * interface for enabling/disabling a monitor. 
+ */ +static ssize_t monitor_enable_write_data(struct file *filp, const char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct rv_monitor_def *mdef = filp->private_data; + int retval; + bool val; + + retval = kstrtobool_from_user(user_buf, count, &val); + if (retval) + return retval; + + retval = count; + + mutex_lock(&rv_interface_lock); + + if (val) + retval = rv_enable_monitor(mdef); + else + retval = rv_disable_monitor(mdef); + + mutex_unlock(&rv_interface_lock); + + return retval ? : count; +} + +static const struct file_operations interface_enable_fops = { + .open = simple_open, + .llseek = no_llseek, + .write = monitor_enable_write_data, + .read = monitor_enable_read_data, +}; + +/* + * Interface to read monitors description. + */ +static ssize_t monitor_desc_read_data(struct file *filp, char __user *user_buf, size_t count, + loff_t *ppos) +{ + struct rv_monitor_def *mdef = filp->private_data; + char buff[256]; + + memset(buff, 0, sizeof(buff)); + + snprintf(buff, sizeof(buff), "%s\n", mdef->monitor->description); + + return simple_read_from_buffer(user_buf, count, ppos, buff, strlen(buff) + 1); +} + +static const struct file_operations interface_desc_fops = { + .open = simple_open, + .llseek = no_llseek, + .read = monitor_desc_read_data, +}; + +/* + * During the registration of a monitor, this function creates + * the monitor dir, where the specific options of the monitor + * are exposed. + */ +static int create_monitor_dir(struct rv_monitor_def *mdef) +{ + struct dentry *root = get_monitors_root(); + const char *name = mdef->monitor->name; + struct dentry *tmp; + int retval; + + mdef->root_d = rv_create_dir(name, root); + if (!mdef->root_d) + return -ENOMEM; + + tmp = rv_create_file("enable", RV_MODE_WRITE, mdef->root_d, mdef, &interface_enable_fops); + if (!tmp) { + retval = -ENOMEM; + goto out_remove_root; + } + + tmp = rv_create_file("desc", RV_MODE_READ, mdef->root_d, mdef, &interface_desc_fops); + if (!tmp) { + retval = -ENOMEM; + goto out_remove_root; + } + + return 0; + +out_remove_root: + rv_remove(mdef->root_d); + return retval; +} + +/* + * Available/Enable monitor shared seq functions. + */ +static int monitors_show(struct seq_file *m, void *p) +{ + struct rv_monitor_def *mon_def = p; + + seq_printf(m, "%s\n", mon_def->monitor->name); + return 0; +} + +/* + * Used by the seq file operations at the end of a read + * operation. + */ +static void monitors_stop(struct seq_file *m, void *p) +{ + mutex_unlock(&rv_interface_lock); +} + +/* + * Available monitor seq functions. + */ +static void *available_monitors_start(struct seq_file *m, loff_t *pos) +{ + mutex_lock(&rv_interface_lock); + return seq_list_start(&rv_monitors_list, *pos); +} + +static void *available_monitors_next(struct seq_file *m, void *p, loff_t *pos) +{ + return seq_list_next(p, &rv_monitors_list, pos); +} + +/* + * Enable monitor seq functions. 
+ */ +static void *enabled_monitors_next(struct seq_file *m, void *p, loff_t *pos) +{ + struct rv_monitor_def *m_def = p; + + (*pos)++; + + list_for_each_entry_continue(m_def, &rv_monitors_list, list) { + if (m_def->monitor->enabled) + return m_def; + } + + return NULL; +} + +static void *enabled_monitors_start(struct seq_file *m, loff_t *pos) +{ + struct rv_monitor_def *m_def; + loff_t l; + + mutex_lock(&rv_interface_lock); + + if (list_empty(&rv_monitors_list)) + return NULL; + + m_def = list_entry(&rv_monitors_list, struct rv_monitor_def, list); + + for (l = 0; l <= *pos; ) { + m_def = enabled_monitors_next(m, m_def, &l); + if (!m_def) + break; + } + + return m_def; +} + +/* + * available/enabled monitors seq definition. + */ +static const struct seq_operations available_monitors_seq_ops = { + .start = available_monitors_start, + .next = available_monitors_next, + .stop = monitors_stop, + .show = monitors_show +}; + +static const struct seq_operations enabled_monitors_seq_ops = { + .start = enabled_monitors_start, + .next = enabled_monitors_next, + .stop = monitors_stop, + .show = monitors_show +}; + +/* + * available_monitors interface. + */ +static int available_monitors_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &available_monitors_seq_ops); +}; + +static const struct file_operations available_monitors_ops = { + .open = available_monitors_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release +}; + +/* + * enabled_monitors interface. + */ +static void disable_all_monitors(void) +{ + struct rv_monitor_def *mdef; + int enabled = 0; + + mutex_lock(&rv_interface_lock); + + list_for_each_entry(mdef, &rv_monitors_list, list) + enabled += __rv_disable_monitor(mdef, false); + + if (enabled) { + /* + * Wait for the execution of all events to finish. + * Otherwise, the data used by the monitor could + * be inconsistent. i.e., if the monitor is re-enabled. + */ + tracepoint_synchronize_unregister(); + } + + mutex_unlock(&rv_interface_lock); +} + +static int enabled_monitors_open(struct inode *inode, struct file *file) +{ + if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) + disable_all_monitors(); + + return seq_open(file, &enabled_monitors_seq_ops); +}; + +static ssize_t enabled_monitors_write(struct file *filp, const char __user *user_buf, + size_t count, loff_t *ppos) +{ + char buff[MAX_RV_MONITOR_NAME_SIZE + 2]; + struct rv_monitor_def *mdef; + int retval = -EINVAL; + bool enable = true; + char *ptr = buff; + int len; + + if (count < 1 || count > MAX_RV_MONITOR_NAME_SIZE + 1) + return -EINVAL; + + memset(buff, 0, sizeof(buff)); + + retval = simple_write_to_buffer(buff, sizeof(buff) - 1, ppos, user_buf, count); + if (retval < 0) + return -EFAULT; + + ptr = strim(buff); + + if (ptr[0] == '!') { + enable = false; + ptr++; + } + + len = strlen(ptr); + if (!len) + return count; + + mutex_lock(&rv_interface_lock); + + retval = -EINVAL; + + list_for_each_entry(mdef, &rv_monitors_list, list) { + if (strcmp(ptr, mdef->monitor->name) != 0) + continue; + + /* + * Monitor found! + */ + if (enable) + retval = rv_enable_monitor(mdef); + else + retval = rv_disable_monitor(mdef); + + if (!retval) + retval = count; + + break; + } + + mutex_unlock(&rv_interface_lock); + return retval; +} + +static const struct file_operations enabled_monitors_ops = { + .open = enabled_monitors_open, + .read = seq_read, + .write = enabled_monitors_write, + .llseek = seq_lseek, + .release = seq_release, +}; + +/* + * Monitoring on global switcher! 
+ */ +static bool __read_mostly monitoring_on; + +/** + * rv_monitoring_on - checks if monitoring is on + * + * Returns 1 if on, 0 otherwise. + */ +bool rv_monitoring_on(void) +{ + /* Ensures that concurrent monitors read consistent monitoring_on */ + smp_rmb(); + return READ_ONCE(monitoring_on); +} + +/* + * monitoring_on general switcher. + */ +static ssize_t monitoring_on_read_data(struct file *filp, char __user *user_buf, + size_t count, loff_t *ppos) +{ + const char *buff; + + buff = rv_monitoring_on() ? "1\n" : "0\n"; + + return simple_read_from_buffer(user_buf, count, ppos, buff, strlen(buff) + 1); +} + +static void turn_monitoring_off(void) +{ + WRITE_ONCE(monitoring_on, false); + /* Ensures that concurrent monitors read consistent monitoring_on */ + smp_wmb(); +} + +static void reset_all_monitors(void) +{ + struct rv_monitor_def *mdef; + + list_for_each_entry(mdef, &rv_monitors_list, list) { + if (mdef->monitor->enabled) + mdef->monitor->reset(); + } +} + +static void turn_monitoring_on(void) +{ + WRITE_ONCE(monitoring_on, true); + /* Ensures that concurrent monitors read consistent monitoring_on */ + smp_wmb(); +} + +static void turn_monitoring_on_with_reset(void) +{ + lockdep_assert_held(&rv_interface_lock); + + if (rv_monitoring_on()) + return; + + /* + * Monitors might be out of sync with the system if events were not + * processed because of !rv_monitoring_on(). + * + * Reset all monitors, forcing a re-sync. + */ + reset_all_monitors(); + turn_monitoring_on(); +} + +static ssize_t monitoring_on_write_data(struct file *filp, const char __user *user_buf, + size_t count, loff_t *ppos) +{ + int retval; + bool val; + + retval = kstrtobool_from_user(user_buf, count, &val); + if (retval) + return retval; + + mutex_lock(&rv_interface_lock); + + if (val) + turn_monitoring_on_with_reset(); + else + turn_monitoring_off(); + + /* + * Wait for the execution of all events to finish + * before returning to user-space. + */ + tracepoint_synchronize_unregister(); + + mutex_unlock(&rv_interface_lock); + + return count; +} + +static const struct file_operations monitoring_on_fops = { + .open = simple_open, + .llseek = no_llseek, + .write = monitoring_on_write_data, + .read = monitoring_on_read_data, +}; + +static void destroy_monitor_dir(struct rv_monitor_def *mdef) +{ + rv_remove(mdef->root_d); +} + +/** + * rv_register_monitor - register a rv monitor. + * @monitor: The rv_monitor to be registered. + * + * Returns 0 if successful, error otherwise. + */ +int rv_register_monitor(struct rv_monitor *monitor) +{ + struct rv_monitor_def *r; + int retval = 0; + + if (strlen(monitor->name) >= MAX_RV_MONITOR_NAME_SIZE) { + pr_info("Monitor %s has a name longer than %d\n", monitor->name, + MAX_RV_MONITOR_NAME_SIZE); + return -1; + } + + mutex_lock(&rv_interface_lock); + + list_for_each_entry(r, &rv_monitors_list, list) { + if (strcmp(monitor->name, r->monitor->name) == 0) { + pr_info("Monitor %s is already registered\n", monitor->name); + retval = -1; + goto out_unlock; + } + } + + r = kzalloc(sizeof(struct rv_monitor_def), GFP_KERNEL); + if (!r) { + retval = -ENOMEM; + goto out_unlock; + } + + r->monitor = monitor; + + retval = create_monitor_dir(r); + if (retval) { + kfree(r); + goto out_unlock; + } + + list_add_tail(&r->list, &rv_monitors_list); + +out_unlock: + mutex_unlock(&rv_interface_lock); + return retval; +} + +/** + * rv_unregister_monitor - unregister a rv monitor. + * @monitor: The rv_monitor to be unregistered. + * + * Returns 0 if successful, error otherwise. 
+ */ +int rv_unregister_monitor(struct rv_monitor *monitor) +{ + struct rv_monitor_def *ptr, *next; + + mutex_lock(&rv_interface_lock); + + list_for_each_entry_safe(ptr, next, &rv_monitors_list, list) { + if (strcmp(monitor->name, ptr->monitor->name) == 0) { + rv_disable_monitor(ptr); + list_del(&ptr->list); + destroy_monitor_dir(ptr); + } + } + + mutex_unlock(&rv_interface_lock); + return 0; +} + +int __init rv_init_interface(void) +{ + struct dentry *tmp; + + rv_root.root_dir = rv_create_dir("rv", NULL); + if (!rv_root.root_dir) + goto out_err; + + rv_root.monitors_dir = rv_create_dir("monitors", rv_root.root_dir); + if (!rv_root.monitors_dir) + goto out_err; + + tmp = rv_create_file("available_monitors", RV_MODE_READ, rv_root.root_dir, NULL, + &available_monitors_ops); + if (!tmp) + goto out_err; + + tmp = rv_create_file("enabled_monitors", RV_MODE_WRITE, rv_root.root_dir, NULL, + &enabled_monitors_ops); + if (!tmp) + goto out_err; + + tmp = rv_create_file("monitoring_on", RV_MODE_WRITE, rv_root.root_dir, NULL, + &monitoring_on_fops); + if (!tmp) + goto out_err; + + turn_monitoring_on(); + + return 0; + +out_err: + rv_remove(rv_root.root_dir); + printk(KERN_ERR "RV: Error while creating the RV interface\n"); + return 1; +} diff --git a/kernel/trace/rv/rv.h b/kernel/trace/rv/rv.h new file mode 100644 index 000000000000..50014aa224a7 --- /dev/null +++ b/kernel/trace/rv/rv.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include + +struct rv_interface { + struct dentry *root_dir; + struct dentry *monitors_dir; +}; + +#include "../trace.h" +#include +#include + +#define RV_MODE_WRITE TRACE_MODE_WRITE +#define RV_MODE_READ TRACE_MODE_READ + +#define rv_create_dir tracefs_create_dir +#define rv_create_file tracefs_create_file +#define rv_remove tracefs_remove + +#define MAX_RV_MONITOR_NAME_SIZE 32 + +extern struct mutex rv_interface_lock; + +struct rv_monitor_def { + struct list_head list; + struct rv_monitor *monitor; + struct dentry *root_d; + bool task_monitor; +}; + +struct dentry *get_monitors_root(void); +int rv_disable_monitor(struct rv_monitor_def *mdef); +int rv_enable_monitor(struct rv_monitor_def *mdef); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 7eb5bce62500..301305ec134b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -9772,6 +9772,8 @@ static __init int tracer_init_tracefs(void) tracer_init_tracefs_work_func(NULL); } + rv_init_interface(); + return 0; } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index ff816fb41e48..900e75d96c84 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -2005,4 +2005,13 @@ struct trace_min_max_param { extern const struct file_operations trace_min_max_fops; +#ifdef CONFIG_RV +extern int rv_init_interface(void); +#else +static inline int rv_init_interface(void) +{ + return 0; +} +#endif + #endif /* _LINUX_KERNEL_TRACE_H */ -- cgit From 04acadcb4453cf8011dd3d4ce8d97fecac42d325 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Fri, 29 Jul 2022 11:38:41 +0200 Subject: rv: Add runtime reactors interface A runtime monitor can cause a reaction to the detection of an exception on the model's execution. By default, the monitors have tracing reactions, printing the monitor output via tracepoints. But other reactions can be added (on-demand) via this interface. The user interface resembles the kernel tracing interface and presents these files: "available_reactors" - Reading shows the available reactors, one per line. 
For example: # cat available_reactors nop panic printk "reacting_on" - It is an on/off general switch for reactors, disabling all reactions. "monitors/MONITOR/reactors" - List available reactors, with the select reaction for the given MONITOR inside []. The default one is the nop (no operation) reactor. - Writing the name of a reactor enables it to the given MONITOR. For example: # cat monitors/wip/reactors [nop] panic printk # echo panic > monitors/wip/reactors # cat monitors/wip/reactors nop [panic] printk Link: https://lkml.kernel.org/r/1794eb994637457bdeaa6bad0b8263d2f7eece0c.1659052063.git.bristot@kernel.org Cc: Wim Van Sebroeck Cc: Guenter Roeck Cc: Jonathan Corbet Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Will Deacon Cc: Catalin Marinas Cc: Marco Elver Cc: Dmitry Vyukov Cc: "Paul E. McKenney" Cc: Shuah Khan Cc: Gabriele Paoloni Cc: Juri Lelli Cc: Clark Williams Cc: Tao Zhou Cc: Randy Dunlap Cc: linux-doc@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: linux-trace-devel@vger.kernel.org Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) --- kernel/trace/rv/Kconfig | 11 + kernel/trace/rv/Makefile | 1 + kernel/trace/rv/rv.c | 9 + kernel/trace/rv/rv.h | 35 +++ kernel/trace/rv/rv_reactors.c | 508 ++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 564 insertions(+) create mode 100644 kernel/trace/rv/rv_reactors.c (limited to 'kernel') diff --git a/kernel/trace/rv/Kconfig b/kernel/trace/rv/Kconfig index 6d127cdb00dd..3eb5d48ab4f6 100644 --- a/kernel/trace/rv/Kconfig +++ b/kernel/trace/rv/Kconfig @@ -10,3 +10,14 @@ menuconfig RV theorem proving). RV works by analyzing the trace of the system's actual execution, comparing it against a formal specification of the system behavior. + +config RV_REACTORS + bool "Runtime verification reactors" + default y + depends on RV + help + Enables the online runtime verification reactors. A runtime + monitor can cause a reaction to the detection of an exception + on the model's execution. By default, the monitors have + tracing reactions, printing the monitor output via tracepoints, + but other reactions can be added (on-demand) via this interface. 
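As an illustrative aside (not part of this patch), an out-of-tree reactor would follow the same shape as the rv_nop reactor registered at the end of this change. The module below is a hypothetical sketch: the "warn" name, the header path, and the pr_warn() reaction are chosen only for the example.

#include <linux/init.h>
#include <linux/module.h>
#include <linux/printk.h>
#include <linux/rv.h>	/* assumed home of struct rv_reactor and rv_register_reactor() */

/* React to a model exception by logging the monitor-provided message. */
static void rv_warn_reaction(char *msg)
{
	pr_warn("%s", msg);
}

static struct rv_reactor rv_warn = {
	.name = "warn",
	.description = "log the exception message with pr_warn().",
	.react = rv_warn_reaction,
};

static int __init warn_reactor_init(void)
{
	return rv_register_reactor(&rv_warn);
}

static void __exit warn_reactor_exit(void)
{
	rv_unregister_reactor(&rv_warn);
}

module_init(warn_reactor_init);
module_exit(warn_reactor_exit);
MODULE_LICENSE("GPL");

After loading, the new name would appear in available_reactors and could be selected per monitor, e.g. "echo warn > monitors/wip/reactors".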
diff --git a/kernel/trace/rv/Makefile b/kernel/trace/rv/Makefile index fd995379df67..8944274d9b41 100644 --- a/kernel/trace/rv/Makefile +++ b/kernel/trace/rv/Makefile @@ -1,3 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_RV) += rv.o +obj-$(CONFIG_RV_REACTORS) += rv_reactors.o diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c index 731cc961cc70..45cf64eb2600 100644 --- a/kernel/trace/rv/rv.c +++ b/kernel/trace/rv/rv.c @@ -353,6 +353,10 @@ static int create_monitor_dir(struct rv_monitor_def *mdef) goto out_remove_root; } + retval = reactor_populate_monitor(mdef); + if (retval) + goto out_remove_root; + return 0; out_remove_root: @@ -669,6 +673,7 @@ static const struct file_operations monitoring_on_fops = { static void destroy_monitor_dir(struct rv_monitor_def *mdef) { + reactor_cleanup_monitor(mdef); rv_remove(mdef->root_d); } @@ -747,6 +752,7 @@ int rv_unregister_monitor(struct rv_monitor *monitor) int __init rv_init_interface(void) { struct dentry *tmp; + int retval; rv_root.root_dir = rv_create_dir("rv", NULL); if (!rv_root.root_dir) @@ -770,6 +776,9 @@ int __init rv_init_interface(void) &monitoring_on_fops); if (!tmp) goto out_err; + retval = init_rv_reactors(rv_root.root_dir); + if (retval) + goto out_err; turn_monitoring_on(); diff --git a/kernel/trace/rv/rv.h b/kernel/trace/rv/rv.h index 50014aa224a7..db6cb0913dbd 100644 --- a/kernel/trace/rv/rv.h +++ b/kernel/trace/rv/rv.h @@ -18,16 +18,51 @@ struct rv_interface { #define rv_remove tracefs_remove #define MAX_RV_MONITOR_NAME_SIZE 32 +#define MAX_RV_REACTOR_NAME_SIZE 32 extern struct mutex rv_interface_lock; +#ifdef CONFIG_RV_REACTORS +struct rv_reactor_def { + struct list_head list; + struct rv_reactor *reactor; + /* protected by the monitor interface lock */ + int counter; +}; +#endif + struct rv_monitor_def { struct list_head list; struct rv_monitor *monitor; struct dentry *root_d; +#ifdef CONFIG_RV_REACTORS + struct rv_reactor_def *rdef; + bool reacting; +#endif bool task_monitor; }; struct dentry *get_monitors_root(void); int rv_disable_monitor(struct rv_monitor_def *mdef); int rv_enable_monitor(struct rv_monitor_def *mdef); + +#ifdef CONFIG_RV_REACTORS +int reactor_populate_monitor(struct rv_monitor_def *mdef); +void reactor_cleanup_monitor(struct rv_monitor_def *mdef); +int init_rv_reactors(struct dentry *root_dir); +#else +static inline int reactor_populate_monitor(struct rv_monitor_def *mdef) +{ + return 0; +} + +static inline void reactor_cleanup_monitor(struct rv_monitor_def *mdef) +{ + return; +} + +static inline int init_rv_reactors(struct dentry *root_dir) +{ + return 0; +} +#endif diff --git a/kernel/trace/rv/rv_reactors.c b/kernel/trace/rv/rv_reactors.c new file mode 100644 index 000000000000..a6522c196382 --- /dev/null +++ b/kernel/trace/rv/rv_reactors.c @@ -0,0 +1,508 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2019-2022 Red Hat, Inc. Daniel Bristot de Oliveira + * + * Runtime reactor interface. + * + * A runtime monitor can cause a reaction to the detection of an + * exception on the model's execution. By default, the monitors have + * tracing reactions, printing the monitor output via tracepoints. + * But other reactions can be added (on-demand) via this interface. + * + * == Registering reactors == + * + * The struct rv_reactor defines a callback function to be executed + * in case of a model exception happens. The callback function + * receives a message to be (optionally) printed before executing + * the reaction. 
+ * + * A RV reactor is registered via: + * int rv_register_reactor(struct rv_reactor *reactor) + * And unregistered via: + * int rv_unregister_reactor(struct rv_reactor *reactor) + * + * These functions are exported to modules, enabling reactors to be + * dynamically loaded. + * + * == User interface == + * + * The user interface resembles the kernel tracing interface and + * presents these files: + * + * "available_reactors" + * - List the available reactors, one per line. + * + * For example: + * # cat available_reactors + * nop + * panic + * printk + * + * "reacting_on" + * - It is an on/off general switch for reactors, disabling + * all reactions. + * + * "monitors/MONITOR/reactors" + * - List available reactors, with the select reaction for the given + * MONITOR inside []. The default one is the nop (no operation) + * reactor. + * - Writing the name of an reactor enables it to the given + * MONITOR. + * + * For example: + * # cat monitors/wip/reactors + * [nop] + * panic + * printk + * # echo panic > monitors/wip/reactors + * # cat monitors/wip/reactors + * nop + * [panic] + * printk + */ + +#include + +#include "rv.h" + +/* + * Interface for the reactor register. + */ +static LIST_HEAD(rv_reactors_list); + +static struct rv_reactor_def *get_reactor_rdef_by_name(char *name) +{ + struct rv_reactor_def *r; + + list_for_each_entry(r, &rv_reactors_list, list) { + if (strcmp(name, r->reactor->name) == 0) + return r; + } + return NULL; +} + +/* + * Available reactors seq functions. + */ +static int reactors_show(struct seq_file *m, void *p) +{ + struct rv_reactor_def *rea_def = p; + + seq_printf(m, "%s\n", rea_def->reactor->name); + return 0; +} + +static void reactors_stop(struct seq_file *m, void *p) +{ + mutex_unlock(&rv_interface_lock); +} + +static void *reactors_start(struct seq_file *m, loff_t *pos) +{ + mutex_lock(&rv_interface_lock); + return seq_list_start(&rv_reactors_list, *pos); +} + +static void *reactors_next(struct seq_file *m, void *p, loff_t *pos) +{ + return seq_list_next(p, &rv_reactors_list, pos); +} + +/* + * available_reactors seq definition. + */ +static const struct seq_operations available_reactors_seq_ops = { + .start = reactors_start, + .next = reactors_next, + .stop = reactors_stop, + .show = reactors_show +}; + +/* + * available_reactors interface. + */ +static int available_reactors_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &available_reactors_seq_ops); +}; + +static const struct file_operations available_reactors_ops = { + .open = available_reactors_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release +}; + +/* + * Monitor's reactor file. + */ +static int monitor_reactor_show(struct seq_file *m, void *p) +{ + struct rv_monitor_def *mdef = m->private; + struct rv_reactor_def *rdef = p; + + if (mdef->rdef == rdef) + seq_printf(m, "[%s]\n", rdef->reactor->name); + else + seq_printf(m, "%s\n", rdef->reactor->name); + return 0; +} + +/* + * available_reactors seq definition. 
+ */ +static const struct seq_operations monitor_reactors_seq_ops = { + .start = reactors_start, + .next = reactors_next, + .stop = reactors_stop, + .show = monitor_reactor_show +}; + +static void monitor_swap_reactors(struct rv_monitor_def *mdef, struct rv_reactor_def *rdef, + bool reacting) +{ + bool monitor_enabled; + + /* nothing to do */ + if (mdef->rdef == rdef) + return; + + monitor_enabled = mdef->monitor->enabled; + if (monitor_enabled) + rv_disable_monitor(mdef); + + /* swap reactor's usage */ + mdef->rdef->counter--; + rdef->counter++; + + mdef->rdef = rdef; + mdef->reacting = reacting; + mdef->monitor->react = rdef->reactor->react; + + if (monitor_enabled) + rv_enable_monitor(mdef); +} + +static ssize_t +monitor_reactors_write(struct file *file, const char __user *user_buf, + size_t count, loff_t *ppos) +{ + char buff[MAX_RV_REACTOR_NAME_SIZE + 2]; + struct rv_monitor_def *mdef; + struct rv_reactor_def *rdef; + struct seq_file *seq_f; + int retval = -EINVAL; + bool enable; + char *ptr; + int len; + + if (count < 1 || count > MAX_RV_REACTOR_NAME_SIZE + 1) + return -EINVAL; + + memset(buff, 0, sizeof(buff)); + + retval = simple_write_to_buffer(buff, sizeof(buff) - 1, ppos, user_buf, count); + if (retval < 0) + return -EFAULT; + + ptr = strim(buff); + + len = strlen(ptr); + if (!len) + return count; + + /* + * See monitor_reactors_open() + */ + seq_f = file->private_data; + mdef = seq_f->private; + + mutex_lock(&rv_interface_lock); + + retval = -EINVAL; + + list_for_each_entry(rdef, &rv_reactors_list, list) { + if (strcmp(ptr, rdef->reactor->name) != 0) + continue; + + if (rdef == get_reactor_rdef_by_name("nop")) + enable = false; + else + enable = true; + + monitor_swap_reactors(mdef, rdef, enable); + + retval = count; + break; + } + + mutex_unlock(&rv_interface_lock); + + return retval; +} + +/* + * available_reactors interface. + */ +static int monitor_reactors_open(struct inode *inode, struct file *file) +{ + struct rv_monitor_def *mdef = inode->i_private; + struct seq_file *seq_f; + int ret; + + ret = seq_open(file, &monitor_reactors_seq_ops); + if (ret < 0) + return ret; + + /* + * seq_open stores the seq_file on the file->private data. + */ + seq_f = file->private_data; + + /* + * Copy the create file "private" data to the seq_file private data. + */ + seq_f->private = mdef; + + return 0; +}; + +static const struct file_operations monitor_reactors_ops = { + .open = monitor_reactors_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, + .write = monitor_reactors_write +}; + +static int __rv_register_reactor(struct rv_reactor *reactor) +{ + struct rv_reactor_def *r; + + list_for_each_entry(r, &rv_reactors_list, list) { + if (strcmp(reactor->name, r->reactor->name) == 0) { + pr_info("Reactor %s is already registered\n", reactor->name); + return -EINVAL; + } + } + + r = kzalloc(sizeof(struct rv_reactor_def), GFP_KERNEL); + if (!r) + return -ENOMEM; + + r->reactor = reactor; + r->counter = 0; + + list_add_tail(&r->list, &rv_reactors_list); + + return 0; +} + +/** + * rv_register_reactor - register a rv reactor. + * @reactor: The rv_reactor to be registered. + * + * Returns 0 if successful, error otherwise. 
+ */ +int rv_register_reactor(struct rv_reactor *reactor) +{ + int retval = 0; + + if (strlen(reactor->name) >= MAX_RV_REACTOR_NAME_SIZE) { + pr_info("Reactor %s has a name longer than %d\n", + reactor->name, MAX_RV_MONITOR_NAME_SIZE); + return -EINVAL; + } + + mutex_lock(&rv_interface_lock); + retval = __rv_register_reactor(reactor); + mutex_unlock(&rv_interface_lock); + return retval; +} + +/** + * rv_unregister_reactor - unregister a rv reactor. + * @reactor: The rv_reactor to be unregistered. + * + * Returns 0 if successful, error otherwise. + */ +int rv_unregister_reactor(struct rv_reactor *reactor) +{ + struct rv_reactor_def *ptr, *next; + + mutex_lock(&rv_interface_lock); + + list_for_each_entry_safe(ptr, next, &rv_reactors_list, list) { + if (strcmp(reactor->name, ptr->reactor->name) == 0) { + + if (!ptr->counter) { + list_del(&ptr->list); + } else { + printk(KERN_WARNING + "rv: the rv_reactor %s is in use by %d monitor(s)\n", + ptr->reactor->name, ptr->counter); + printk(KERN_WARNING "rv: the rv_reactor %s cannot be removed\n", + ptr->reactor->name); + return -EBUSY; + } + } + } + + mutex_unlock(&rv_interface_lock); + return 0; +} + +/* + * reacting_on interface. + */ +static bool __read_mostly reacting_on; + +/** + * rv_reacting_on - checks if reacting is on + * + * Returns 1 if on, 0 otherwise. + */ +bool rv_reacting_on(void) +{ + /* Ensures that concurrent monitors read consistent reacting_on */ + smp_rmb(); + return READ_ONCE(reacting_on); +} + +static ssize_t reacting_on_read_data(struct file *filp, + char __user *user_buf, + size_t count, loff_t *ppos) +{ + char *buff; + + buff = rv_reacting_on() ? "1\n" : "0\n"; + + return simple_read_from_buffer(user_buf, count, ppos, buff, strlen(buff)+1); +} + +static void turn_reacting_off(void) +{ + WRITE_ONCE(reacting_on, false); + /* Ensures that concurrent monitors read consistent reacting_on */ + smp_wmb(); +} + +static void turn_reacting_on(void) +{ + WRITE_ONCE(reacting_on, true); + /* Ensures that concurrent monitors read consistent reacting_on */ + smp_wmb(); +} + +static ssize_t reacting_on_write_data(struct file *filp, const char __user *user_buf, + size_t count, loff_t *ppos) +{ + int retval; + bool val; + + retval = kstrtobool_from_user(user_buf, count, &val); + if (retval) + return retval; + + mutex_lock(&rv_interface_lock); + + if (val) + turn_reacting_on(); + else + turn_reacting_off(); + + /* + * Wait for the execution of all events to finish + * before returning to user-space. + */ + tracepoint_synchronize_unregister(); + + mutex_unlock(&rv_interface_lock); + + return count; +} + +static const struct file_operations reacting_on_fops = { + .open = simple_open, + .llseek = no_llseek, + .write = reacting_on_write_data, + .read = reacting_on_read_data, +}; + +/** + * reactor_populate_monitor - creates per monitor reactors file + * @mdef: monitor's definition. + * + * Returns 0 if successful, error otherwise. + */ +int reactor_populate_monitor(struct rv_monitor_def *mdef) +{ + struct dentry *tmp; + + tmp = rv_create_file("reactors", RV_MODE_WRITE, mdef->root_d, mdef, &monitor_reactors_ops); + if (!tmp) + return -ENOMEM; + + /* + * Configure as the rv_nop reactor. + */ + mdef->rdef = get_reactor_rdef_by_name("nop"); + mdef->rdef->counter++; + mdef->reacting = false; + + return 0; +} + +/** + * reactor_cleanup_monitor - cleanup a monitor reference + * @mdef: monitor's definition. 
+ */ +void reactor_cleanup_monitor(struct rv_monitor_def *mdef) +{ + lockdep_assert_held(&rv_interface_lock); + mdef->rdef->counter--; + WARN_ON_ONCE(mdef->rdef->counter < 0); +} + +/* + * Nop reactor register + */ +static void rv_nop_reaction(char *msg) +{ +} + +static struct rv_reactor rv_nop = { + .name = "nop", + .description = "no-operation reactor: do nothing.", + .react = rv_nop_reaction +}; + +int init_rv_reactors(struct dentry *root_dir) +{ + struct dentry *available, *reacting; + int retval; + + available = rv_create_file("available_reactors", RV_MODE_READ, root_dir, NULL, + &available_reactors_ops); + if (!available) + goto out_err; + + reacting = rv_create_file("reacting_on", RV_MODE_WRITE, root_dir, NULL, &reacting_on_fops); + if (!reacting) + goto rm_available; + + retval = __rv_register_reactor(&rv_nop); + if (retval) + goto rm_reacting; + + turn_reacting_on(); + + return 0; + +rm_reacting: + rv_remove(reacting); +rm_available: + rv_remove(available); +out_err: + return -ENOMEM; +} -- cgit From 792575348ff70e05c6040d02fce38e949ef92c37 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Fri, 29 Jul 2022 11:38:43 +0200 Subject: rv/include: Add deterministic automata monitor definition via C macros In Linux terms, the runtime verification monitors are encapsulated inside the "RV monitor" abstraction. The "RV monitor" includes a set of instances of the monitor (per-cpu monitor, per-task monitor, and so on), the helper functions that glue the monitor to the system reference model, and the trace output as a reaction for event parsing and exceptions, as depicted below: Linux +----- RV Monitor ----------------------------------+ Formal Realm | | Realm +-------------------+ +----------------+ +-----------------+ | Linux kernel | | Monitor | | Reference | | Tracing | -> | Instance(s) | <- | Model | | (instrumentation) | | (verification) | | (specification) | +-------------------+ +----------------+ +-----------------+ | | | | V | | +----------+ | | | Reaction | | | +--+--+--+-+ | | | | | | | | | +-> trace output ? | +------------------------|--|----------------------+ | +----> panic ? +-------> Add the rv/da_monitor.h, enabling automatic code generation for the *Monitor Instance(s)* using C macros, and code to support it. The benefits of the usage of macro for monitor synthesis are 3-fold as it: - Reduces the code duplication; - Facilitates the bug fix/improvement; - Avoids the case of developers changing the core of the monitor code to manipulate the model in a (let's say) non-standard way. This initial implementation presents three different types of monitor instances: - DECLARE_DA_MON_GLOBAL(name, type) - DECLARE_DA_MON_PER_CPU(name, type) - DECLARE_DA_MON_PER_TASK(name, type) The first declares the functions for a global deterministic automata monitor, the second for monitors with per-cpu instances, and the third with per-task instances. Link: https://lkml.kernel.org/r/51b0bf425a281e226dfeba7401d2115d6091f84e.1659052063.git.bristot@kernel.org Cc: Wim Van Sebroeck Cc: Guenter Roeck Cc: Jonathan Corbet Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Will Deacon Cc: Catalin Marinas Cc: Marco Elver Cc: Dmitry Vyukov Cc: "Paul E. 
McKenney" Cc: Shuah Khan Cc: Gabriele Paoloni Cc: Juri Lelli Cc: Clark Williams Cc: Tao Zhou Cc: Randy Dunlap Cc: linux-doc@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: linux-trace-devel@vger.kernel.org Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) --- kernel/fork.c | 14 ++++++++++++++ kernel/trace/rv/Kconfig | 11 +++++++++++ kernel/trace/rv/rv.c | 5 +++++ 3 files changed, 30 insertions(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 9d44f2d46c69..6f1f82ccd5f2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1964,6 +1964,18 @@ static void copy_oom_score_adj(u64 clone_flags, struct task_struct *tsk) mutex_unlock(&oom_adj_mutex); } +#ifdef CONFIG_RV +static void rv_task_fork(struct task_struct *p) +{ + int i; + + for (i = 0; i < RV_PER_TASK_MONITORS; i++) + p->rv[i].da_mon.monitoring = false; +} +#else +#define rv_task_fork(p) do {} while (0) +#endif + /* * This creates a new process as a copy of the old one, * but does not actually start it yet. @@ -2399,6 +2411,8 @@ static __latent_entropy struct task_struct *copy_process( */ copy_seccomp(p); + rv_task_fork(p); + rseq_fork(p, clone_flags); /* Don't start children in a dying pid namespace */ diff --git a/kernel/trace/rv/Kconfig b/kernel/trace/rv/Kconfig index 3eb5d48ab4f6..8714800e22ad 100644 --- a/kernel/trace/rv/Kconfig +++ b/kernel/trace/rv/Kconfig @@ -1,5 +1,16 @@ # SPDX-License-Identifier: GPL-2.0-only # +config DA_MON_EVENTS + bool + +config DA_MON_EVENTS_IMPLICIT + select DA_MON_EVENTS + bool + +config DA_MON_EVENTS_ID + select DA_MON_EVENTS + bool + menuconfig RV bool "Runtime Verification" depends on TRACING diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c index 45cf64eb2600..df6678c86334 100644 --- a/kernel/trace/rv/rv.c +++ b/kernel/trace/rv/rv.c @@ -140,6 +140,11 @@ #include #include +#ifdef CONFIG_DA_MON_EVENTS +#define CREATE_TRACE_POINTS +#include +#endif + #include "rv.h" DEFINE_MUTEX(rv_interface_lock); -- cgit From ff0aaf671230d409a68fd7400f41e9eb3ac61dd8 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Fri, 29 Jul 2022 11:38:45 +0200 Subject: Documentation/rv: Add a basic documentation Add the runtime-verification.rst document, explaining the basics of RV and how to use the interface. Link: https://lkml.kernel.org/r/4be7d1a88ab1e2eb0767521e1ab52a149a154bc4.1659052063.git.bristot@kernel.org Cc: Wim Van Sebroeck Cc: Guenter Roeck Cc: Jonathan Corbet Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Will Deacon Cc: Catalin Marinas Cc: Marco Elver Cc: Dmitry Vyukov Cc: "Paul E. McKenney" Cc: Shuah Khan Cc: Gabriele Paoloni Cc: Juri Lelli Cc: Clark Williams Cc: Tao Zhou Cc: Randy Dunlap Cc: linux-doc@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: linux-trace-devel@vger.kernel.org Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) --- kernel/trace/rv/Kconfig | 3 +++ kernel/trace/rv/rv.c | 3 +++ 2 files changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/rv/Kconfig b/kernel/trace/rv/Kconfig index 8714800e22ad..0d9552b406c6 100644 --- a/kernel/trace/rv/Kconfig +++ b/kernel/trace/rv/Kconfig @@ -22,6 +22,9 @@ menuconfig RV actual execution, comparing it against a formal specification of the system behavior. 
+ For further information, see: + Documentation/trace/rv/runtime-verification.rst + config RV_REACTORS bool "Runtime verification reactors" default y diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c index df6678c86334..6c97cc2d754a 100644 --- a/kernel/trace/rv/rv.c +++ b/kernel/trace/rv/rv.c @@ -133,6 +133,9 @@ * auto-generated wakeup in preemptive monitor. * # cat enable * 0 + * + * For further information, see: + * Documentation/trace/rv/runtime-verification.rst */ #include -- cgit From 8812d21219b9c649dd25eb93915e00939944aeb7 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Fri, 29 Jul 2022 11:38:51 +0200 Subject: rv/monitor: Add the wip monitor skeleton created by dot2k THIS CODE IS NOT LINKED TO THE MAKEFILE. This model does not compile because it lacks the instrumentation part, which will be added next. In the typical case, there will be only one patch, but it was split into two patches for educational purposes. This is the direct output this command line: $ dot2k -d tools/verification/models/wip.dot -t per_cpu Link: https://lkml.kernel.org/r/5eb7a9118917e8a814c5e49853a72fc62be0a101.1659052063.git.bristot@kernel.org Cc: Wim Van Sebroeck Cc: Guenter Roeck Cc: Jonathan Corbet Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Will Deacon Cc: Catalin Marinas Cc: Marco Elver Cc: Dmitry Vyukov Cc: "Paul E. McKenney" Cc: Shuah Khan Cc: Gabriele Paoloni Cc: Juri Lelli Cc: Clark Williams Cc: Tao Zhou Cc: Randy Dunlap Cc: linux-doc@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: linux-trace-devel@vger.kernel.org Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) --- kernel/trace/rv/monitors/wip/wip.c | 109 +++++++++++++++++++++++++++++++++++++ kernel/trace/rv/monitors/wip/wip.h | 46 ++++++++++++++++ 2 files changed, 155 insertions(+) create mode 100644 kernel/trace/rv/monitors/wip/wip.c create mode 100644 kernel/trace/rv/monitors/wip/wip.h (limited to 'kernel') diff --git a/kernel/trace/rv/monitors/wip/wip.c b/kernel/trace/rv/monitors/wip/wip.c new file mode 100644 index 000000000000..79a054ca0cde --- /dev/null +++ b/kernel/trace/rv/monitors/wip/wip.c @@ -0,0 +1,109 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include + +#define MODULE_NAME "wip" + +/* + * XXX: include required tracepoint headers, e.g., + * #include + */ +#include + +/* + * This is the self-generated part of the monitor. Generally, there is no need + * to touch this section. + */ +#include "wip.h" + +/* + * Declare the deterministic automata monitor. + * + * The rv monitor reference is needed for the monitor declaration. + */ +struct rv_monitor rv_wip; +DECLARE_DA_MON_PER_CPU(wip, unsigned char); + +/* + * This is the instrumentation part of the monitor. + * + * This is the section where manual work is required. Here the kernel events + * are translated into model's event. 
+ * + */ +static void handle_preempt_disable(void *data, /* XXX: fill header */) +{ + da_handle_event_wip(preempt_disable_wip); +} + +static void handle_preempt_enable(void *data, /* XXX: fill header */) +{ + da_handle_event_wip(preempt_enable_wip); +} + +static void handle_sched_waking(void *data, /* XXX: fill header */) +{ + da_handle_event_wip(sched_waking_wip); +} + +static int enable_wip(void) +{ + int retval; + + retval = da_monitor_init_wip(); + if (retval) + return retval; + + rv_attach_trace_probe("wip", /* XXX: tracepoint */, handle_preempt_disable); + rv_attach_trace_probe("wip", /* XXX: tracepoint */, handle_preempt_enable); + rv_attach_trace_probe("wip", /* XXX: tracepoint */, handle_sched_waking); + + return 0; +} + +static void disable_wip(void) +{ + rv_wip.enabled = 0; + + rv_detach_trace_probe("wip", /* XXX: tracepoint */, handle_preempt_disable); + rv_detach_trace_probe("wip", /* XXX: tracepoint */, handle_preempt_enable); + rv_detach_trace_probe("wip", /* XXX: tracepoint */, handle_sched_waking); + + da_monitor_destroy_wip(); +} + +/* + * This is the monitor register section. + */ +struct rv_monitor rv_wip = { + .name = "wip", + .description = "auto-generated wip", + .enable = enable_wip, + .disable = disable_wip, + .reset = da_monitor_reset_all_wip, + .enabled = 0, +}; + +static int register_wip(void) +{ + rv_register_monitor(&rv_wip); + return 0; +} + +static void unregister_wip(void) +{ + rv_unregister_monitor(&rv_wip); +} + +module_init(register_wip); +module_exit(unregister_wip); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("dot2k: auto-generated"); +MODULE_DESCRIPTION("wip"); diff --git a/kernel/trace/rv/monitors/wip/wip.h b/kernel/trace/rv/monitors/wip/wip.h new file mode 100644 index 000000000000..c1c47e2305ef --- /dev/null +++ b/kernel/trace/rv/monitors/wip/wip.h @@ -0,0 +1,46 @@ +/* + * Automatically generated C representation of wip automaton + * For further information about this format, see kernel documentation: + * Documentation/trace/rv/deterministic_automata.rst + */ + +enum states_wip { + preemptive_wip = 0, + non_preemptive_wip, + state_max_wip +}; + +#define INVALID_STATE state_max_wip + +enum events_wip { + preempt_disable_wip = 0, + preempt_enable_wip, + sched_waking_wip, + event_max_wip +}; + +struct automaton_wip { + char *state_names[state_max_wip]; + char *event_names[event_max_wip]; + unsigned char function[state_max_wip][event_max_wip]; + unsigned char initial_state; + bool final_states[state_max_wip]; +}; + +struct automaton_wip automaton_wip = { + .state_names = { + "preemptive", + "non_preemptive" + }, + .event_names = { + "preempt_disable", + "preempt_enable", + "sched_waking" + }, + .function = { + { non_preemptive_wip, INVALID_STATE, INVALID_STATE }, + { INVALID_STATE, preemptive_wip, non_preemptive_wip }, + }, + .initial_state = preemptive_wip, + .final_states = { 1, 0 }, +}; -- cgit From 10bde81c74863472047f31304064018c40f488ee Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Fri, 29 Jul 2022 11:38:52 +0200 Subject: rv/monitor: Add the wip monitor The wakeup in preemptive (wip) monitor verifies if the wakeup events always take place with preemption disabled: | | v #==================# H preemptive H <+ #==================# | | | | preempt_disable | preempt_enable v | sched_waking +------------------+ | +--------------- | | | | | non_preemptive | | +--------------> | | -+ +------------------+ The wakeup event always takes place with preemption disabled because of the scheduler synchronization. 
However, because the preempt_count and its trace event are not atomic with regard to interrupts, some inconsistencies might happen. The documentation illustrates one of these cases. Link: https://lkml.kernel.org/r/c98ca678df81115fddc04921b3c79720c836b18f.1659052063.git.bristot@kernel.org Cc: Wim Van Sebroeck Cc: Guenter Roeck Cc: Jonathan Corbet Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Will Deacon Cc: Catalin Marinas Cc: Marco Elver Cc: Dmitry Vyukov Cc: "Paul E. McKenney" Cc: Shuah Khan Cc: Gabriele Paoloni Cc: Juri Lelli Cc: Clark Williams Cc: Tao Zhou Cc: Randy Dunlap Cc: linux-doc@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: linux-trace-devel@vger.kernel.org Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) --- kernel/trace/rv/Kconfig | 13 ++++++++++ kernel/trace/rv/Makefile | 1 + kernel/trace/rv/monitors/wip/wip.c | 51 +++++++++++--------------------------- 3 files changed, 29 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/rv/Kconfig b/kernel/trace/rv/Kconfig index 0d9552b406c6..e50f3346164a 100644 --- a/kernel/trace/rv/Kconfig +++ b/kernel/trace/rv/Kconfig @@ -25,6 +25,19 @@ menuconfig RV For further information, see: Documentation/trace/rv/runtime-verification.rst +config RV_MON_WIP + depends on RV + depends on PREEMPT_TRACER + select DA_MON_EVENTS_IMPLICIT + bool "wip monitor" + help + Enable wip (wakeup in preemptive) sample monitor that illustrates + the usage of per-cpu monitors, and one limitation of the + preempt_disable/enable events. + + For further information, see: + Documentation/trace/rv/monitor_wip.rst + config RV_REACTORS bool "Runtime verification reactors" default y diff --git a/kernel/trace/rv/Makefile b/kernel/trace/rv/Makefile index 8944274d9b41..b41109d2750a 100644 --- a/kernel/trace/rv/Makefile +++ b/kernel/trace/rv/Makefile @@ -2,3 +2,4 @@ obj-$(CONFIG_RV) += rv.o obj-$(CONFIG_RV_REACTORS) += rv_reactors.o +obj-$(CONFIG_RV_MON_WIP) += monitors/wip/wip.o diff --git a/kernel/trace/rv/monitors/wip/wip.c b/kernel/trace/rv/monitors/wip/wip.c index 79a054ca0cde..83cace53b9fa 100644 --- a/kernel/trace/rv/monitors/wip/wip.c +++ b/kernel/trace/rv/monitors/wip/wip.c @@ -10,44 +10,26 @@ #define MODULE_NAME "wip" -/* - * XXX: include required tracepoint headers, e.g., - * #include - */ #include +#include +#include -/* - * This is the self-generated part of the monitor. Generally, there is no need - * to touch this section. - */ #include "wip.h" -/* - * Declare the deterministic automata monitor. - * - * The rv monitor reference is needed for the monitor declaration. - */ struct rv_monitor rv_wip; DECLARE_DA_MON_PER_CPU(wip, unsigned char); -/* - * This is the instrumentation part of the monitor. - * - * This is the section where manual work is required. Here the kernel events - * are translated into model's event. 
- * - */ -static void handle_preempt_disable(void *data, /* XXX: fill header */) +static void handle_preempt_disable(void *data, unsigned long ip, unsigned long parent_ip) { da_handle_event_wip(preempt_disable_wip); } -static void handle_preempt_enable(void *data, /* XXX: fill header */) +static void handle_preempt_enable(void *data, unsigned long ip, unsigned long parent_ip) { - da_handle_event_wip(preempt_enable_wip); + da_handle_start_event_wip(preempt_enable_wip); } -static void handle_sched_waking(void *data, /* XXX: fill header */) +static void handle_sched_waking(void *data, struct task_struct *task) { da_handle_event_wip(sched_waking_wip); } @@ -60,9 +42,9 @@ static int enable_wip(void) if (retval) return retval; - rv_attach_trace_probe("wip", /* XXX: tracepoint */, handle_preempt_disable); - rv_attach_trace_probe("wip", /* XXX: tracepoint */, handle_preempt_enable); - rv_attach_trace_probe("wip", /* XXX: tracepoint */, handle_sched_waking); + rv_attach_trace_probe("wip", preempt_enable, handle_preempt_enable); + rv_attach_trace_probe("wip", sched_waking, handle_sched_waking); + rv_attach_trace_probe("wip", preempt_disable, handle_preempt_disable); return 0; } @@ -71,19 +53,16 @@ static void disable_wip(void) { rv_wip.enabled = 0; - rv_detach_trace_probe("wip", /* XXX: tracepoint */, handle_preempt_disable); - rv_detach_trace_probe("wip", /* XXX: tracepoint */, handle_preempt_enable); - rv_detach_trace_probe("wip", /* XXX: tracepoint */, handle_sched_waking); + rv_detach_trace_probe("wip", preempt_disable, handle_preempt_disable); + rv_detach_trace_probe("wip", preempt_enable, handle_preempt_enable); + rv_detach_trace_probe("wip", sched_waking, handle_sched_waking); da_monitor_destroy_wip(); } -/* - * This is the monitor register section. - */ struct rv_monitor rv_wip = { .name = "wip", - .description = "auto-generated wip", + .description = "wakeup in preemptive per-cpu testing monitor.", .enable = enable_wip, .disable = disable_wip, .reset = da_monitor_reset_all_wip, @@ -105,5 +84,5 @@ module_init(register_wip); module_exit(unregister_wip); MODULE_LICENSE("GPL"); -MODULE_AUTHOR("dot2k: auto-generated"); -MODULE_DESCRIPTION("wip"); +MODULE_AUTHOR("Daniel Bristot de Oliveira "); +MODULE_DESCRIPTION("wip: wakeup in preemptive - per-cpu sample monitor."); -- cgit From ccc319dcb450d57b7befe924453d06804d83ba73 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Fri, 29 Jul 2022 11:38:53 +0200 Subject: rv/monitor: Add the wwnr monitor Per task wakeup while not running (wwnr) monitor. This model is broken, the reason is that a task can be running in the processor without being set as RUNNABLE. Think about a task about to sleep: 1: set_current_state(TASK_UNINTERRUPTIBLE); 2: schedule(); And then imagine an IRQ happening in between the lines one and two, waking the task up. BOOM, the wakeup will happen while the task is running. Q: Why do we need this model, so? A: To test the reactors. Link: https://lkml.kernel.org/r/473c0fc39967250fdebcff8b620311c11dccad30.1659052063.git.bristot@kernel.org Cc: Wim Van Sebroeck Cc: Guenter Roeck Cc: Jonathan Corbet Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Will Deacon Cc: Catalin Marinas Cc: Marco Elver Cc: Dmitry Vyukov Cc: "Paul E. 
McKenney" Cc: Shuah Khan Cc: Gabriele Paoloni Cc: Juri Lelli Cc: Clark Williams Cc: Tao Zhou Cc: Randy Dunlap Cc: linux-doc@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: linux-trace-devel@vger.kernel.org Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) --- kernel/trace/rv/Kconfig | 12 +++++ kernel/trace/rv/Makefile | 1 + kernel/trace/rv/monitors/wwnr/wwnr.c | 87 ++++++++++++++++++++++++++++++++++++ kernel/trace/rv/monitors/wwnr/wwnr.h | 46 +++++++++++++++++++ 4 files changed, 146 insertions(+) create mode 100644 kernel/trace/rv/monitors/wwnr/wwnr.c create mode 100644 kernel/trace/rv/monitors/wwnr/wwnr.h (limited to 'kernel') diff --git a/kernel/trace/rv/Kconfig b/kernel/trace/rv/Kconfig index e50f3346164a..b259d6e8dc7c 100644 --- a/kernel/trace/rv/Kconfig +++ b/kernel/trace/rv/Kconfig @@ -38,6 +38,18 @@ config RV_MON_WIP For further information, see: Documentation/trace/rv/monitor_wip.rst +config RV_MON_WWNR + depends on RV + select DA_MON_EVENTS_ID + bool "wwnr monitor" + help + Enable wwnr (wakeup while not running) sample monitor, this is a + sample monitor that illustrates the usage of per-task monitor. + The model is borken on purpose: it serves to test reactors. + + For further information, see: + Documentation/trace/rv/monitor_wwnr.rst + config RV_REACTORS bool "Runtime verification reactors" default y diff --git a/kernel/trace/rv/Makefile b/kernel/trace/rv/Makefile index b41109d2750a..af0ff9a46418 100644 --- a/kernel/trace/rv/Makefile +++ b/kernel/trace/rv/Makefile @@ -3,3 +3,4 @@ obj-$(CONFIG_RV) += rv.o obj-$(CONFIG_RV_REACTORS) += rv_reactors.o obj-$(CONFIG_RV_MON_WIP) += monitors/wip/wip.o +obj-$(CONFIG_RV_MON_WWNR) += monitors/wwnr/wwnr.o diff --git a/kernel/trace/rv/monitors/wwnr/wwnr.c b/kernel/trace/rv/monitors/wwnr/wwnr.c new file mode 100644 index 000000000000..599225d9cf38 --- /dev/null +++ b/kernel/trace/rv/monitors/wwnr/wwnr.c @@ -0,0 +1,87 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include + +#define MODULE_NAME "wwnr" + +#include +#include + +#include "wwnr.h" + +struct rv_monitor rv_wwnr; +DECLARE_DA_MON_PER_TASK(wwnr, unsigned char); + +static void handle_switch(void *data, bool preempt, struct task_struct *p, + struct task_struct *n, unsigned int prev_state) +{ + /* start monitoring only after the first suspension */ + if (prev_state == TASK_INTERRUPTIBLE) + da_handle_start_event_wwnr(p, switch_out_wwnr); + else + da_handle_event_wwnr(p, switch_out_wwnr); + + da_handle_event_wwnr(n, switch_in_wwnr); +} + +static void handle_wakeup(void *data, struct task_struct *p) +{ + da_handle_event_wwnr(p, wakeup_wwnr); +} + +static int enable_wwnr(void) +{ + int retval; + + retval = da_monitor_init_wwnr(); + if (retval) + return retval; + + rv_attach_trace_probe("wwnr", sched_switch, handle_switch); + rv_attach_trace_probe("wwnr", sched_wakeup, handle_wakeup); + + return 0; +} + +static void disable_wwnr(void) +{ + rv_wwnr.enabled = 0; + + rv_detach_trace_probe("wwnr", sched_switch, handle_switch); + rv_detach_trace_probe("wwnr", sched_wakeup, handle_wakeup); + + da_monitor_destroy_wwnr(); +} + +struct rv_monitor rv_wwnr = { + .name = "wwnr", + .description = "wakeup while not running per-task testing model.", + .enable = enable_wwnr, + .disable = disable_wwnr, + .reset = da_monitor_reset_all_wwnr, + .enabled = 0, +}; + +static int register_wwnr(void) +{ + rv_register_monitor(&rv_wwnr); + return 0; +} + +static void unregister_wwnr(void) +{ + 
rv_unregister_monitor(&rv_wwnr); +} + +module_init(register_wwnr); +module_exit(unregister_wwnr); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Daniel Bristot de Oliveira "); +MODULE_DESCRIPTION("wwnr: wakeup while not running monitor"); diff --git a/kernel/trace/rv/monitors/wwnr/wwnr.h b/kernel/trace/rv/monitors/wwnr/wwnr.h new file mode 100644 index 000000000000..d1afe55cdd4c --- /dev/null +++ b/kernel/trace/rv/monitors/wwnr/wwnr.h @@ -0,0 +1,46 @@ +/* + * Automatically generated C representation of wwnr automaton + * For further information about this format, see kernel documentation: + * Documentation/trace/rv/deterministic_automata.rst + */ + +enum states_wwnr { + not_running_wwnr = 0, + running_wwnr, + state_max_wwnr +}; + +#define INVALID_STATE state_max_wwnr + +enum events_wwnr { + switch_in_wwnr = 0, + switch_out_wwnr, + wakeup_wwnr, + event_max_wwnr +}; + +struct automaton_wwnr { + char *state_names[state_max_wwnr]; + char *event_names[event_max_wwnr]; + unsigned char function[state_max_wwnr][event_max_wwnr]; + unsigned char initial_state; + bool final_states[state_max_wwnr]; +}; + +struct automaton_wwnr automaton_wwnr = { + .state_names = { + "not_running", + "running" + }, + .event_names = { + "switch_in", + "switch_out", + "wakeup" + }, + .function = { + { running_wwnr, INVALID_STATE, not_running_wwnr }, + { INVALID_STATE, not_running_wwnr, INVALID_STATE }, + }, + .initial_state = not_running_wwnr, + .final_states = { 1, 0 }, +}; -- cgit From 135b881ea88566f27dd4acc5d2ed83ad418a3a69 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Fri, 29 Jul 2022 11:38:54 +0200 Subject: rv/reactor: Add the printk reactor A reactor that printks the reaction message. Link: https://lkml.kernel.org/r/b65f18a7fd6dc6659a3008fd7b7392de3465d47b.1659052063.git.bristot@kernel.org Cc: Wim Van Sebroeck Cc: Guenter Roeck Cc: Jonathan Corbet Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Will Deacon Cc: Catalin Marinas Cc: Marco Elver Cc: Dmitry Vyukov Cc: "Paul E. McKenney" Cc: Shuah Khan Cc: Gabriele Paoloni Cc: Juri Lelli Cc: Clark Williams Cc: Tao Zhou Cc: Randy Dunlap Cc: linux-doc@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: linux-trace-devel@vger.kernel.org Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) --- kernel/trace/rv/Kconfig | 8 ++++++++ kernel/trace/rv/Makefile | 3 ++- kernel/trace/rv/reactor_printk.c | 42 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 52 insertions(+), 1 deletion(-) create mode 100644 kernel/trace/rv/reactor_printk.c (limited to 'kernel') diff --git a/kernel/trace/rv/Kconfig b/kernel/trace/rv/Kconfig index b259d6e8dc7c..e82d5015e6ab 100644 --- a/kernel/trace/rv/Kconfig +++ b/kernel/trace/rv/Kconfig @@ -60,3 +60,11 @@ config RV_REACTORS on the model's execution. By default, the monitors have tracing reactions, printing the monitor output via tracepoints, but other reactions can be added (on-demand) via this interface. + +config RV_REACT_PRINTK + bool "Printk reactor" + depends on RV_REACTORS + default y + help + Enables the printk reactor. The printk reactor emits a printk() + message if an exception is found. 
diff --git a/kernel/trace/rv/Makefile b/kernel/trace/rv/Makefile index af0ff9a46418..a13c750a35c1 100644 --- a/kernel/trace/rv/Makefile +++ b/kernel/trace/rv/Makefile @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_RV) += rv.o -obj-$(CONFIG_RV_REACTORS) += rv_reactors.o obj-$(CONFIG_RV_MON_WIP) += monitors/wip/wip.o obj-$(CONFIG_RV_MON_WWNR) += monitors/wwnr/wwnr.o +obj-$(CONFIG_RV_REACTORS) += rv_reactors.o +obj-$(CONFIG_RV_REACT_PRINTK) += reactor_printk.o diff --git a/kernel/trace/rv/reactor_printk.c b/kernel/trace/rv/reactor_printk.c new file mode 100644 index 000000000000..31899f953af4 --- /dev/null +++ b/kernel/trace/rv/reactor_printk.c @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2019-2022 Red Hat, Inc. Daniel Bristot de Oliveira + * + * Printk RV reactor: + * Prints the exception msg to the kernel message log. + */ +#include +#include +#include +#include +#include +#include + +static void rv_printk_reaction(char *msg) +{ + printk_deferred(msg); +} + +static struct rv_reactor rv_printk = { + .name = "printk", + .description = "prints the exception msg to the kernel message log.", + .react = rv_printk_reaction +}; + +static int register_react_printk(void) +{ + rv_register_reactor(&rv_printk); + return 0; +} + +static void unregister_react_printk(void) +{ + rv_unregister_reactor(&rv_printk); +} + +module_init(register_react_printk); +module_exit(unregister_react_printk); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Daniel Bristot de Oliveira"); +MODULE_DESCRIPTION("printk rv reactor: printk if an exception is hit."); -- cgit From e88043c0ac16f19960048372dcffc6df7c05c5b8 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Fri, 29 Jul 2022 11:38:55 +0200 Subject: rv/reactor: Add the panic reactor Sample reactor that panics the system when an exception is found. This is useful both to capture a vmcore, or to fail-safe a critical system. Link: https://lkml.kernel.org/r/729aae3aba95f35738b8f8180e626d747d1d9da2.1659052063.git.bristot@kernel.org Cc: Wim Van Sebroeck Cc: Guenter Roeck Cc: Jonathan Corbet Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Will Deacon Cc: Catalin Marinas Cc: Marco Elver Cc: Dmitry Vyukov Cc: "Paul E. McKenney" Cc: Shuah Khan Cc: Gabriele Paoloni Cc: Juri Lelli Cc: Clark Williams Cc: Tao Zhou Cc: Randy Dunlap Cc: linux-doc@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: linux-trace-devel@vger.kernel.org Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) --- kernel/trace/rv/Kconfig | 8 ++++++++ kernel/trace/rv/Makefile | 1 + kernel/trace/rv/reactor_panic.c | 43 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 52 insertions(+) create mode 100644 kernel/trace/rv/reactor_panic.c (limited to 'kernel') diff --git a/kernel/trace/rv/Kconfig b/kernel/trace/rv/Kconfig index e82d5015e6ab..831779607e84 100644 --- a/kernel/trace/rv/Kconfig +++ b/kernel/trace/rv/Kconfig @@ -68,3 +68,11 @@ config RV_REACT_PRINTK help Enables the printk reactor. The printk reactor emits a printk() message if an exception is found. + +config RV_REACT_PANIC + bool "Panic reactor" + depends on RV_REACTORS + default y + help + Enables the panic reactor. The panic reactor emits a printk() + message if an exception is found and panic()s the system. 
diff --git a/kernel/trace/rv/Makefile b/kernel/trace/rv/Makefile index a13c750a35c1..963d14875b45 100644 --- a/kernel/trace/rv/Makefile +++ b/kernel/trace/rv/Makefile @@ -5,3 +5,4 @@ obj-$(CONFIG_RV_MON_WIP) += monitors/wip/wip.o obj-$(CONFIG_RV_MON_WWNR) += monitors/wwnr/wwnr.o obj-$(CONFIG_RV_REACTORS) += rv_reactors.o obj-$(CONFIG_RV_REACT_PRINTK) += reactor_printk.o +obj-$(CONFIG_RV_REACT_PANIC) += reactor_panic.o diff --git a/kernel/trace/rv/reactor_panic.c b/kernel/trace/rv/reactor_panic.c new file mode 100644 index 000000000000..b698d05dd069 --- /dev/null +++ b/kernel/trace/rv/reactor_panic.c @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2019-2022 Red Hat, Inc. Daniel Bristot de Oliveira + * + * Panic RV reactor: + * Prints the exception msg to the kernel message log and panic(). + */ + +#include +#include +#include +#include +#include +#include + +static void rv_panic_reaction(char *msg) +{ + panic(msg); +} + +static struct rv_reactor rv_panic = { + .name = "panic", + .description = "panic the system if an exception is found.", + .react = rv_panic_reaction +}; + +static int register_react_panic(void) +{ + rv_register_reactor(&rv_panic); + return 0; +} + +static void unregister_react_panic(void) +{ + rv_unregister_reactor(&rv_panic); +} + +module_init(register_react_panic); +module_exit(unregister_react_panic); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Daniel Bristot de Oliveira"); +MODULE_DESCRIPTION("panic rv reactor: panic if an exception is found."); -- cgit From a870544ca9d215449e91ebc01e35d80b23151c78 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Wed, 22 Jun 2022 08:38:37 +0200 Subject: kernel: remove platform_has() infrastructure The only use case of the platform_has() infrastructure has been removed again, so remove the whole feature. 
Signed-off-by: Juergen Gross Tested-by: Oleksandr Tyshchenko # Arm64 guest using Xen Reviewed-by: Stefano Stabellini Link: https://lore.kernel.org/r/20220622063838.8854-3-jgross@suse.com Signed-off-by: Juergen Gross --- kernel/Makefile | 2 +- kernel/platform-feature.c | 27 --------------------------- 2 files changed, 1 insertion(+), 28 deletions(-) delete mode 100644 kernel/platform-feature.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index a7e1f49ab2b3..318789c728d3 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -7,7 +7,7 @@ obj-y = fork.o exec_domain.o panic.o \ cpu.o exit.o softirq.o resource.o \ sysctl.o capability.o ptrace.o user.o \ signal.o sys.o umh.o workqueue.o pid.o task_work.o \ - extable.o params.o platform-feature.o \ + extable.o params.o \ kthread.o sys_ni.o nsproxy.o \ notifier.o ksysfs.o cred.o reboot.o \ async.o range.o smpboot.o ucount.o regset.o diff --git a/kernel/platform-feature.c b/kernel/platform-feature.c deleted file mode 100644 index cb6a6c3e4fed..000000000000 --- a/kernel/platform-feature.c +++ /dev/null @@ -1,27 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include -#include -#include -#include - -#define PLATFORM_FEAT_ARRAY_SZ BITS_TO_LONGS(PLATFORM_FEAT_N) -static unsigned long __read_mostly platform_features[PLATFORM_FEAT_ARRAY_SZ]; - -void platform_set(unsigned int feature) -{ - set_bit(feature, platform_features); -} -EXPORT_SYMBOL_GPL(platform_set); - -void platform_clear(unsigned int feature) -{ - clear_bit(feature, platform_features); -} -EXPORT_SYMBOL_GPL(platform_clear); - -bool platform_has(unsigned int feature) -{ - return test_bit(feature, platform_features); -} -EXPORT_SYMBOL_GPL(platform_has); -- cgit From 28f6c37a2910f565b4f5960df52b2eccae28c891 Mon Sep 17 00:00:00 2001 From: Chen Zhongjin Date: Mon, 1 Aug 2022 11:37:19 +0800 Subject: kprobes: Forbid probing on trampoline and BPF code areas kernel_text_address() treats ftrace_trampoline, kprobe_insn_slot and bpf_text_address as valid kprobe addresses - which is not ideal. These text areas are removable and changeable without any notification to kprobes, and probing on them can trigger unexpected behavior: https://lkml.org/lkml/2022/7/26/1148 Considering that jump_label and static_call text are already forbiden to probe, kernel_text_address() should be replaced with core_kernel_text() and is_module_text_address() to check other text areas which are unsafe to kprobe. [ mingo: Rewrote the changelog. 
] Fixes: 5b485629ba0d ("kprobes, extable: Identify kprobes trampolines as kernel text area") Fixes: 74451e66d516 ("bpf: make jited programs visible in traces") Signed-off-by: Chen Zhongjin Signed-off-by: Ingo Molnar Acked-by: Masami Hiramatsu (Google) Link: https://lore.kernel.org/r/20220801033719.228248-1-chenzhongjin@huawei.com --- kernel/kprobes.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index f214f8c088ed..80697e5e03e4 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1560,7 +1560,8 @@ static int check_kprobe_address_safe(struct kprobe *p, preempt_disable(); /* Ensure it is not in reserved area nor out of text */ - if (!kernel_text_address((unsigned long) p->addr) || + if (!(core_kernel_text((unsigned long) p->addr) || + is_module_text_address((unsigned long) p->addr)) || within_kprobe_blacklist((unsigned long) p->addr) || jump_label_text_reserved(p->addr, p->addr) || static_call_text_reserved(p->addr, p->addr) || -- cgit From 59927cbe3f30c5ed7fc5d33487ef06611394a548 Mon Sep 17 00:00:00 2001 From: Zhiqiang Liu Date: Wed, 20 Jul 2022 10:46:48 +0800 Subject: tracing: Use free_trace_buffer() in allocate_trace_buffers() In allocate_trace_buffers(), if allocating tr->max_buffer fails, we can directly call free_trace_buffer to free tr->array_buffer. Link: https://lkml.kernel.org/r/65f0702d-07f6-08de-2a07-4c50af56a67b@huawei.com Signed-off-by: Zhiqiang Liu Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 301305ec134b..27febd4ee33e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -9101,6 +9101,16 @@ allocate_trace_buffer(struct trace_array *tr, struct array_buffer *buf, int size return 0; } +static void free_trace_buffer(struct array_buffer *buf) +{ + if (buf->buffer) { + ring_buffer_free(buf->buffer); + buf->buffer = NULL; + free_percpu(buf->data); + buf->data = NULL; + } +} + static int allocate_trace_buffers(struct trace_array *tr, int size) { int ret; @@ -9113,10 +9123,7 @@ static int allocate_trace_buffers(struct trace_array *tr, int size) ret = allocate_trace_buffer(tr, &tr->max_buffer, allocate_snapshot ? size : 1); if (MEM_FAIL(ret, "Failed to allocate trace buffer\n")) { - ring_buffer_free(tr->array_buffer.buffer); - tr->array_buffer.buffer = NULL; - free_percpu(tr->array_buffer.data); - tr->array_buffer.data = NULL; + free_trace_buffer(&tr->array_buffer); return -ENOMEM; } tr->allocated_snapshot = allocate_snapshot; @@ -9131,16 +9138,6 @@ static int allocate_trace_buffers(struct trace_array *tr, int size) return 0; } -static void free_trace_buffer(struct array_buffer *buf) -{ - if (buf->buffer) { - ring_buffer_free(buf->buffer); - buf->buffer = NULL; - free_percpu(buf->data); - buf->data = NULL; - } -} - static void free_trace_buffers(struct trace_array *tr) { if (!tr) -- cgit From 2f63e5d2e391837c70741311b5b70d2fbd15d138 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Mon, 1 Aug 2022 11:32:15 +0900 Subject: tracing/eprobe: Show syntax error logs in error_log file Show the syntax errors for event probes in error_log file as same as other dynamic events, so that user can understand what is the problem. 
Link: https://lkml.kernel.org/r/165932113556.2850673.3483079297896607612.stgit@devnote2 Signed-off-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_eprobe.c | 11 +++++++++-- kernel/trace/trace_probe.h | 5 ++++- 2 files changed, 13 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index a30f21499e81..4a0e9d927443 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -839,8 +839,11 @@ static int trace_eprobe_tp_update_arg(struct trace_eprobe *ep, const char *argv[ if (ret) return ret; - if (ep->tp.args[i].code->op == FETCH_OP_TP_ARG) + if (ep->tp.args[i].code->op == FETCH_OP_TP_ARG) { ret = trace_eprobe_tp_arg_update(ep, i); + if (ret) + trace_probe_log_err(0, BAD_ATTACH_ARG); + } return ret; } @@ -880,8 +883,10 @@ static int __trace_eprobe_create(int argc, const char *argv[]) trace_probe_log_set_index(1); sys_event = argv[1]; ret = traceprobe_parse_event_name(&sys_event, &sys_name, buf2, 0); - if (!sys_event || !sys_name) + if (!sys_event || !sys_name) { + trace_probe_log_err(0, NO_EVENT_INFO); goto parse_error; + } if (!event) { strscpy(buf1, argv[1], MAX_EVENT_NAME_LEN); @@ -896,6 +901,8 @@ static int __trace_eprobe_create(int argc, const char *argv[]) if (IS_ERR(ep)) { ret = PTR_ERR(ep); + if (ret == -ENODEV) + trace_probe_log_err(0, BAD_ATTACH_EVENT); /* This must return -ENOMEM or missing event, else there is a bug */ WARN_ON_ONCE(ret != -ENOMEM && ret != -ENODEV); ep = NULL; diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 92cc149af0fd..3b3869ae8cfd 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -442,7 +442,10 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, C(FAIL_REG_PROBE, "Failed to register probe event"),\ C(DIFF_PROBE_TYPE, "Probe type is different from existing probe"),\ C(DIFF_ARG_TYPE, "Argument type or name is different from existing probe"),\ - C(SAME_PROBE, "There is already the exact same probe event"), + C(SAME_PROBE, "There is already the exact same probe event"),\ + C(NO_EVENT_INFO, "This requires both group and event name to attach"),\ + C(BAD_ATTACH_EVENT, "Attached event does not exist"),\ + C(BAD_ATTACH_ARG, "Attached event does not have this field"), #undef C #define C(a, b) TP_ERR_##a -- cgit From 151c8e499f4705010780189377f85b57400ccbf5 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Tue, 2 Aug 2022 14:56:10 +0200 Subject: wireguard: ratelimiter: use hrtimer in selftest Using msleep() is problematic because it's compared against ratelimiter.c's ktime_get_coarse_boottime_ns(), which means on systems with slow jiffies (such as UML's forced HZ=100), the result is inaccurate. So switch to using schedule_hrtimeout(). However, hrtimer gives us access only to the traditional posix timers, and none of the _COARSE variants. So now, rather than being too imprecise like jiffies, it's too precise. One solution would be to give it a large "range" value, but this will still fire early on a loaded system. A better solution is to align the timeout to the actual coarse timer, and then round up to the nearest tick, plus change. So add the timeout to the current coarse time, and then schedule_hrtimer() until the absolute computed time. This should hopefully reduce flakes in CI as well. 
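A minimal sketch of the timing scheme described above; the helper name and exact rounding are illustrative and not the actual ratelimiter selftest code:

#include <linux/hrtimer.h>
#include <linux/jiffies.h>
#include <linux/ktime.h>
#include <linux/ktime.h>
#include <linux/sched.h>
#include <linux/timekeeping.h>

/*
 * Illustrative only: sleep until the coarse boottime clock, which the
 * ratelimiter compares against, has advanced by at least @msec. The
 * deadline is absolute and padded by one tick so the wakeup cannot land
 * before the coarse clock has moved ("round up to the nearest tick,
 * plus change").
 */
static void coarse_msleep(unsigned int msec)
{
	ktime_t end = ktime_add_ms(ktime_get_coarse_boottime(), msec);

	end = ktime_add_ns(end, TICK_NSEC);
	set_current_state(TASK_UNINTERRUPTIBLE);
	schedule_hrtimeout_range_clock(&end, 0, HRTIMER_MODE_ABS, CLOCK_BOOTTIME);
}

Since WireGuard can be built as a module and schedule_hrtimeout_range_clock() is the only schedule_hrtimeout*() variant that lets the caller pick the clock, the hunk below adds the EXPORT_SYMBOL_GPL that such a helper needs.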
Note that we keep the retry loop in case the entire function is running behind, because the test could still be scheduled out, by either the kernel or by the hypervisor's kernel, in which case restarting the test and hoping to not be scheduled out still helps. Fixes: e7096c131e51 ("net: WireGuard secure network tunnel") Suggested-by: Thomas Gleixner Signed-off-by: Jason A. Donenfeld Signed-off-by: Jakub Kicinski --- kernel/time/hrtimer.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 0ea8702eb516..23af5eca11b1 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -2311,6 +2311,7 @@ schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta, return !t.task ? 0 : -EINTR; } +EXPORT_SYMBOL_GPL(schedule_hrtimeout_range_clock); /** * schedule_hrtimeout_range - sleep until timeout -- cgit From b6e8d40d43ae4dec00c8fea2593eeea3114b8f44 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 2 Aug 2022 21:54:51 -0400 Subject: sched, cpuset: Fix dl_cpu_busy() panic due to empty cs->cpus_allowed With cgroup v2, the cpuset's cpus_allowed mask can be empty indicating that the cpuset will just use the effective CPUs of its parent. So cpuset_can_attach() can call task_can_attach() with an empty mask. This can lead to cpumask_any_and() returns nr_cpu_ids causing the call to dl_bw_of() to crash due to percpu value access of an out of bound CPU value. For example: [80468.182258] BUG: unable to handle page fault for address: ffffffff8b6648b0 : [80468.191019] RIP: 0010:dl_cpu_busy+0x30/0x2b0 : [80468.207946] Call Trace: [80468.208947] cpuset_can_attach+0xa0/0x140 [80468.209953] cgroup_migrate_execute+0x8c/0x490 [80468.210931] cgroup_update_dfl_csses+0x254/0x270 [80468.211898] cgroup_subtree_control_write+0x322/0x400 [80468.212854] kernfs_fop_write_iter+0x11c/0x1b0 [80468.213777] new_sync_write+0x11f/0x1b0 [80468.214689] vfs_write+0x1eb/0x280 [80468.215592] ksys_write+0x5f/0xe0 [80468.216463] do_syscall_64+0x5c/0x80 [80468.224287] entry_SYSCALL_64_after_hwframe+0x44/0xae Fix that by using effective_cpus instead. For cgroup v1, effective_cpus is the same as cpus_allowed. For v2, effective_cpus is the real cpumask to be used by tasks within the cpuset anyway. Also update task_can_attach()'s 2nd argument name to cs_effective_cpus to reflect the change. In addition, a check is added to task_can_attach() to guard against the possibility that cpumask_any_and() may return a value >= nr_cpu_ids. 
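A short sketch of the failure mode being guarded against, with simplified declarations; this is not the exact kernel code path, only an illustration of why the return value must be range-checked before it is used as a per-CPU index:

#include <linux/cpumask.h>

/*
 * Illustrative sketch, not kernel code: cpumask_any_and() on two masks
 * with an empty intersection returns nr_cpu_ids, one past the last valid
 * CPU. dl_cpu_busy() uses the CPU to look up per-CPU scheduler data
 * (via dl_bw_of()), so an unchecked nr_cpu_ids dereferences memory past
 * the per-CPU area, which is the page fault in the splat above.
 */
static bool pick_cpu_for_dl_check(const struct cpumask *effective, int *cpu)
{
	*cpu = cpumask_any_and(cpu_active_mask, effective);

	/* Empty cpuset (a cgroup v2 leaf using its parent's CPUs): nothing to pick. */
	return *cpu < nr_cpu_ids;
}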
Fixes: 7f51412a415d ("sched/deadline: Fix bandwidth check/update when migrating tasks between exclusive cpusets") Signed-off-by: Waiman Long Signed-off-by: Ingo Molnar Acked-by: Juri Lelli Link: https://lore.kernel.org/r/20220803015451.2219567-1-longman@redhat.com --- kernel/cgroup/cpuset.c | 2 +- kernel/sched/core.c | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 71a418858a5e..58aadfda9b8b 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -2239,7 +2239,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset) goto out_unlock; cgroup_taskset_for_each(task, css, tset) { - ret = task_can_attach(task, cs->cpus_allowed); + ret = task_can_attach(task, cs->effective_cpus); if (ret) goto out_unlock; ret = security_task_setscheduler(task); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5555e49c4e12..addc3c2d2122 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8980,7 +8980,7 @@ int cpuset_cpumask_can_shrink(const struct cpumask *cur, } int task_can_attach(struct task_struct *p, - const struct cpumask *cs_cpus_allowed) + const struct cpumask *cs_effective_cpus) { int ret = 0; @@ -8999,9 +8999,11 @@ int task_can_attach(struct task_struct *p, } if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span, - cs_cpus_allowed)) { - int cpu = cpumask_any_and(cpu_active_mask, cs_cpus_allowed); + cs_effective_cpus)) { + int cpu = cpumask_any_and(cpu_active_mask, cs_effective_cpus); + if (unlikely(cpu >= nr_cpu_ids)) + return -EINVAL; ret = dl_cpu_busy(cpu, p); } -- cgit From dcca34754a3f5290406403b8066e3b15dda9f4bf Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 3 Aug 2022 10:43:42 +0200 Subject: exit: Fix typo in comment: s/sub-theads/sub-threads Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/exit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 64c938ce36fe..84021b24f79e 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1051,7 +1051,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) * p->signal fields because the whole thread group is dead * and nobody can change them. * - * psig->stats_lock also protects us from our sub-theads + * psig->stats_lock also protects us from our sub-threads * which can reap other children at the same time. Until * we change k_getrusage()-like users to rely on this lock * we have to take ->siglock as well. -- cgit From 87514b2c24f294c32e9e743b095541dcf43928f7 Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Thu, 21 Jul 2022 15:51:55 +0100 Subject: sched/rt: Fix Sparse warnings due to undefined rt.c declarations There are several symbols defined in kernel/sched/sched.h but get wrapped in CONFIG_CGROUP_SCHED, even though dummy versions get built in rt.c and therefore trigger Sparse warnings: kernel/sched/rt.c:309:6: warning: symbol 'unregister_rt_sched_group' was not declared. Should it be static? kernel/sched/rt.c:311:6: warning: symbol 'free_rt_sched_group' was not declared. Should it be static? kernel/sched/rt.c:313:5: warning: symbol 'alloc_rt_sched_group' was not declared. Should it be static? Fix this by moving them outside the CONFIG_CGROUP_SCHED block. [ mingo: Refreshed to the latest scheduler tree, tweaked changelog. 
] Signed-off-by: Ben Dooks Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20220721145155.358366-1-ben-linux@fluff.org --- kernel/sched/sched.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index aad7f5ee9666..1429315610d9 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -480,9 +480,6 @@ extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b); extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b); extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq); -extern void unregister_rt_sched_group(struct task_group *tg); -extern void free_rt_sched_group(struct task_group *tg); -extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent); extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int cpu, struct sched_rt_entity *parent); @@ -520,6 +517,10 @@ struct cfs_bandwidth { }; #endif /* CONFIG_CGROUP_SCHED */ +extern void unregister_rt_sched_group(struct task_group *tg); +extern void free_rt_sched_group(struct task_group *tg); +extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent); + /* * u64_u32_load/u64_u32_store * -- cgit From 99643bab36b642be10bf09cd3285c37c9e5b597f Mon Sep 17 00:00:00 2001 From: Slark Xiao Date: Wed, 20 Jul 2022 17:12:20 +0800 Subject: perf/core: Fix ';;' typo Remove double ';;'. Signed-off-by: Slark Xiao Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20220720091220.14200-1-slark_xiao@163.com --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index c9d32d4d2e20..bd23f3e9bdae 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4457,7 +4457,7 @@ int perf_event_read_local(struct perf_event *event, u64 *value, *value = local64_read(&event->count); if (enabled || running) { - u64 __enabled, __running, __now;; + u64 __enabled, __running, __now; calc_timer_values(event, &__now, &__enabled, &__running); if (enabled) -- cgit From 751d4cbc43879229dbc124afefe240b70fd29a85 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 4 Aug 2022 10:21:19 +0100 Subject: sched/core: Do not requeue task on CPU excluded from cpus_mask The following warning was triggered on a large machine early in boot on a distribution kernel but the same problem should also affect mainline. WARNING: CPU: 439 PID: 10 at ../kernel/workqueue.c:2231 process_one_work+0x4d/0x440 Call Trace: rescuer_thread+0x1f6/0x360 kthread+0x156/0x180 ret_from_fork+0x22/0x30 Commit c6e7bd7afaeb ("sched/core: Optimize ttwu() spinning on p->on_cpu") optimises ttwu by queueing a task that is descheduling on the wakelist, but does not check if the task descheduling is still allowed to run on that CPU. In this warning, the problematic task is a workqueue rescue thread which checks if the rescue is for a per-cpu workqueue and running on the wrong CPU. While this is early in boot and it should be possible to create workers, the rescue thread may still used if the MAYDAY_INITIAL_TIMEOUT is reached or MAYDAY_INTERVAL and on a sufficiently large machine, the rescue thread is being used frequently. Tracing confirmed that the task should have migrated properly using the stopper thread to handle the migration. 
However, a parallel wakeup from udev running on another CPU that does not share CPU cache observes p->on_cpu and uses task_cpu(p), queues the task on the old CPU and triggers the warning. Check that the wakee task that is descheduling is still allowed to run on its current CPU and if not, wait for the descheduling to complete and select an allowed CPU. Fixes: c6e7bd7afaeb ("sched/core: Optimize ttwu() spinning on p->on_cpu") Signed-off-by: Mel Gorman Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20220804092119.20137-1-mgorman@techsingularity.net --- kernel/sched/core.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index addc3c2d2122..02afa1cc3c8c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3802,7 +3802,7 @@ bool cpus_share_cache(int this_cpu, int that_cpu) return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); } -static inline bool ttwu_queue_cond(int cpu) +static inline bool ttwu_queue_cond(struct task_struct *p, int cpu) { /* * Do not complicate things with the async wake_list while the CPU is @@ -3811,6 +3811,10 @@ static inline bool ttwu_queue_cond(int cpu) if (!cpu_active(cpu)) return false; + /* Ensure the task will still be allowed to run on the CPU. */ + if (!cpumask_test_cpu(cpu, p->cpus_ptr)) + return false; + /* * If the CPU does not share cache, then queue the task on the * remote rqs wakelist to avoid accessing remote data. @@ -3840,7 +3844,7 @@ static inline bool ttwu_queue_cond(int cpu) static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) { - if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu)) { + if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(p, cpu)) { sched_clock_cpu(cpu); /* Sync clocks across CPUs */ __ttwu_queue_wakelist(p, cpu, wake_flags); return true; -- cgit From f482aa98652795846cc55da98ebe331eb74f3d0b Mon Sep 17 00:00:00 2001 From: Peilin Ye Date: Wed, 3 Aug 2022 15:23:43 -0700 Subject: audit, io_uring, io-wq: Fix memory leak in io_sq_thread() and io_wqe_worker() Currently @audit_context is allocated twice for io_uring workers: 1. copy_process() calls audit_alloc(); 2. io_sq_thread() or io_wqe_worker() calls audit_alloc_kernel() (which is effectively audit_alloc()) and overwrites @audit_context, causing: BUG: memory leak unreferenced object 0xffff888144547400 (size 1024): <...> hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 01 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [] audit_alloc+0x133/0x210 [] copy_process+0xcd3/0x2340 [] create_io_thread+0x63/0x90 [] create_io_worker+0xb4/0x230 [] io_wqe_enqueue+0x248/0x3b0 [] io_queue_iowq+0xba/0x200 [] io_queue_async+0x113/0x180 [] io_req_task_submit+0x18f/0x1a0 [] io_apoll_task_func+0xdd/0x120 [] tctx_task_work+0x11f/0x570 [] task_work_run+0x7e/0xc0 [] get_signal+0xc18/0xf10 [] arch_do_signal_or_restart+0x2b/0x730 [] exit_to_user_mode_prepare+0x5e/0x180 [] syscall_exit_to_user_mode+0x12/0x20 [] do_syscall_64+0x40/0x80 Then, 3. io_sq_thread() or io_wqe_worker() frees @audit_context using audit_free(); 4. do_exit() eventually calls audit_free() again, which is okay because audit_free() does a NULL check. As suggested by Paul Moore, fix it by deleting audit_alloc_kernel() and redundant audit_free() calls. 
Fixes: 5bd2182d58e9 ("audit,io_uring,io-wq: add some basic audit support to io_uring") Suggested-by: Paul Moore Cc: stable@vger.kernel.org Signed-off-by: Peilin Ye Acked-by: Paul Moore Link: https://lore.kernel.org/r/20220803222343.31673-1-yepeilin.cs@gmail.com Signed-off-by: Jens Axboe --- kernel/auditsc.c | 25 ------------------------- 1 file changed, 25 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 3a8c9d744800..dd8d9ab747c3 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1073,31 +1073,6 @@ int audit_alloc(struct task_struct *tsk) return 0; } -/** - * audit_alloc_kernel - allocate an audit_context for a kernel task - * @tsk: the kernel task - * - * Similar to the audit_alloc() function, but intended for kernel private - * threads. Returns zero on success, negative values on failure. - */ -int audit_alloc_kernel(struct task_struct *tsk) -{ - /* - * At the moment we are just going to call into audit_alloc() to - * simplify the code, but there two things to keep in mind with this - * approach: - * - * 1. Filtering internal kernel tasks is a bit laughable in almost all - * cases, but there is at least one case where there is a benefit: - * the '-a task,never' case allows the admin to effectively disable - * task auditing at runtime. - * - * 2. The {set,clear}_task_syscall_work() ops likely have zero effect - * on these internal kernel tasks, but they probably don't hurt either. - */ - return audit_alloc(tsk); -} - static inline void audit_free_context(struct audit_context *context) { /* resetting is extra work, but it is likely just noise */ -- cgit From f1a15b977ff864513133ecb611eb28603d32c1b4 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 4 Aug 2022 17:33:48 +0300 Subject: rv: Unlock on error path in rv_unregister_reactor() Unlock the "rv_interface_lock" mutex before returning. Link: https://lkml.kernel.org/r/YuvYzNfGMgV+PIhd@kili Fixes: 04acadcb4453 ("rv: Add runtime reactors interface") Signed-off-by: Dan Carpenter Signed-off-by: Steven Rostedt (Google) --- kernel/trace/rv/rv_reactors.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/rv/rv_reactors.c b/kernel/trace/rv/rv_reactors.c index a6522c196382..6aae106695b6 100644 --- a/kernel/trace/rv/rv_reactors.c +++ b/kernel/trace/rv/rv_reactors.c @@ -329,6 +329,7 @@ int rv_register_reactor(struct rv_reactor *reactor) int rv_unregister_reactor(struct rv_reactor *reactor) { struct rv_reactor_def *ptr, *next; + int ret = 0; mutex_lock(&rv_interface_lock); @@ -343,13 +344,14 @@ int rv_unregister_reactor(struct rv_reactor *reactor) ptr->reactor->name, ptr->counter); printk(KERN_WARNING "rv: the rv_reactor %s cannot be removed\n", ptr->reactor->name); - return -EBUSY; + ret = -EBUSY; + break; } } } mutex_unlock(&rv_interface_lock); - return 0; + return ret; } /* -- cgit From 62d468e5e10012e8b67d066ba7bac0a8afdc3cee Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 2 Aug 2022 15:56:51 +0200 Subject: bpf: Cleanup ftrace hash in bpf_trampoline_put We need to release possible hash from trampoline fops object before removing it, otherwise we leak it. Fixes: 00963a2e75a8 ("bpf: Support bpf_trampoline on functions with IPMODIFY (e.g. 
livepatch)") Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20220802135651.1794015-1-jolsa@kernel.org --- kernel/bpf/trampoline.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 0f532e6a717f..ff87e38af8a7 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -841,7 +841,10 @@ void bpf_trampoline_put(struct bpf_trampoline *tr) * multiple rcu callbacks. */ hlist_del(&tr->hlist); - kfree(tr->fops); + if (tr->fops) { + ftrace_free_filter(tr->fops); + kfree(tr->fops); + } kfree(tr); out: mutex_unlock(&trampoline_mutex); -- cgit From 221f9d9cdf429df8c3843b4291f4f412fde11543 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Tue, 19 Jul 2022 10:56:20 +0200 Subject: posix-timers: Make do_clock_gettime() static MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit do_clock_gettime() is used only in posix-stubs.c, so make it static. It avoids a compiler warning too: time/posix-stubs.c:73:5: warning: no previous prototype for ‘do_clock_gettime’ [-Wmissing-prototypes] Signed-off-by: Jiri Slaby Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20220719085620.30567-1-jslaby@suse.cz --- kernel/time/posix-stubs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index fcb3b21d8bdc..90ea5f373e50 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -70,7 +70,7 @@ SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, return do_sys_settimeofday64(&new_tp, NULL); } -int do_clock_gettime(clockid_t which_clock, struct timespec64 *tp) +static int do_clock_gettime(clockid_t which_clock, struct timespec64 *tp) { switch (which_clock) { case CLOCK_REALTIME: @@ -90,6 +90,7 @@ int do_clock_gettime(clockid_t which_clock, struct timespec64 *tp) return 0; } + SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, struct __kernel_timespec __user *, tp) { -- cgit From 6644aabbd8973a9f8008cabfd054a36b69a3a3f5 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Thu, 4 Aug 2022 13:11:39 -0700 Subject: bpf: Use proper target btf when exporting attach_btf_obj_id When attaching to program, the program itself might not be attached to anything (and, hence, might not have attach_btf), so we can't unconditionally use 'prog->aux->dst_prog->aux->attach_btf'. 
Instead, use bpf_prog_get_target_btf to pick proper target BTF: * when attached to dst_prog, use dst_prog->aux->btf * when attached to kernel btf, use prog->aux->attach_btf Fixes: b79c9fc9551b ("bpf: implement BPF_PROG_QUERY for BPF_LSM_CGROUP") Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann Acked-by: Hao Luo Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20220804201140.1340684-1-sdf@google.com --- kernel/bpf/syscall.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 83c7136c5788..7dc3f8003631 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3886,6 +3886,7 @@ static int bpf_prog_get_info_by_fd(struct file *file, union bpf_attr __user *uattr) { struct bpf_prog_info __user *uinfo = u64_to_user_ptr(attr->info.info); + struct btf *attach_btf = bpf_prog_get_target_btf(prog); struct bpf_prog_info info; u32 info_len = attr->info.info_len; struct bpf_prog_kstats stats; @@ -4088,10 +4089,8 @@ static int bpf_prog_get_info_by_fd(struct file *file, if (prog->aux->btf) info.btf_id = btf_obj_id(prog->aux->btf); info.attach_btf_id = prog->aux->attach_btf_id; - if (prog->aux->attach_btf) - info.attach_btf_obj_id = btf_obj_id(prog->aux->attach_btf); - else if (prog->aux->dst_prog) - info.attach_btf_obj_id = btf_obj_id(prog->aux->dst_prog->aux->attach_btf); + if (attach_btf) + info.attach_btf_obj_id = btf_obj_id(attach_btf); ulen = info.nr_func_info; info.nr_func_info = prog->aux->func_info_cnt; -- cgit From 7251ceb51af972603552fcea2db316ed2b9d95ba Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 17 May 2022 17:07:31 +0200 Subject: sysctl: Merge adjacent CONFIG_TREE_RCU blocks There are two adjacent sysctl entries protected by the same CONFIG_TREE_RCU config symbol. Merge them into a single block to improve readability. Use the more common "#ifdef" form while at it. Signed-off-by: Geert Uytterhoeven Reviewed-by: Paul E. McKenney Signed-off-by: Luis Chamberlain --- kernel/sysctl.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index b233714a1c78..8a0e85a95138 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2061,7 +2061,7 @@ static struct ctl_table kern_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, -#if defined(CONFIG_TREE_RCU) +#ifdef CONFIG_TREE_RCU { .procname = "panic_on_rcu_stall", .data = &sysctl_panic_on_rcu_stall, @@ -2071,8 +2071,6 @@ static struct ctl_table kern_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, -#endif -#if defined(CONFIG_TREE_RCU) { .procname = "max_rcu_stall_to_panic", .data = &sysctl_max_rcu_stall_to_panic, -- cgit From 5bfd5d3e2ec883a3db3414a42d94d23961a790ed Mon Sep 17 00:00:00 2001 From: Fanjun Kong Date: Sun, 22 May 2022 13:29:33 +0800 Subject: kernel/sysctl.c: Clean up indentation, replace spaces with tab. This patch fixes two coding style issues: 1. Clean up indentation, replace spaces with tab 2. 
Add space after ',' Signed-off-by: Fanjun Kong Signed-off-by: Luis Chamberlain --- kernel/sysctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8a0e85a95138..223376959d29 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1315,8 +1315,8 @@ int proc_dointvec_ms_jiffies_minmax(struct ctl_table *table, int write, int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { - return do_proc_dointvec(table,write,buffer,lenp,ppos, - do_proc_dointvec_userhz_jiffies_conv,NULL); + return do_proc_dointvec(table, write, buffer, lenp, ppos, + do_proc_dointvec_userhz_jiffies_conv, NULL); } /** -- cgit From 374a723c7448bbea22846884ba336ed83b085aab Mon Sep 17 00:00:00 2001 From: Fanjun Kong Date: Mon, 16 May 2022 17:07:53 +0800 Subject: kernel/sysctl.c: Remove trailing white space This patch removes the trailing white space in kernel/sysysctl.c Signed-off-by: Fanjun Kong Reviewed-by: Muchun Song [mcgrof: fix commit message subject] Signed-off-by: Luis Chamberlain --- kernel/sysctl.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 223376959d29..205d605cacc5 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -492,12 +492,12 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, int *i, vleft, first = 1, err = 0; size_t left; char *p; - + if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) { *lenp = 0; return 0; } - + i = (int *) tbl_data; vleft = table->maxlen / sizeof(*i); left = *lenp; @@ -729,7 +729,7 @@ int proc_dobool(struct ctl_table *table, int write, void *buffer, * @ppos: file position * * Reads/writes up to table->maxlen/sizeof(unsigned int) integer - * values from/to the user buffer, treated as an ASCII string. + * values from/to the user buffer, treated as an ASCII string. * * Returns 0 on success. */ @@ -1273,7 +1273,7 @@ static int do_proc_dointvec_ms_jiffies_minmax_conv(bool *negp, unsigned long *lv * @ppos: file position * * Reads/writes up to table->maxlen/sizeof(unsigned int) integer - * values from/to the user buffer, treated as an ASCII string. + * values from/to the user buffer, treated as an ASCII string. * The values read are assumed to be in seconds, and are converted into * jiffies. * @@ -1306,8 +1306,8 @@ int proc_dointvec_ms_jiffies_minmax(struct ctl_table *table, int write, * @ppos: pointer to the file position * * Reads/writes up to table->maxlen/sizeof(unsigned int) integer - * values from/to the user buffer, treated as an ASCII string. - * The values read are assumed to be in 1/USER_HZ seconds, and + * values from/to the user buffer, treated as an ASCII string. + * The values read are assumed to be in 1/USER_HZ seconds, and * are converted into jiffies. * * Returns 0 on success. -- cgit From 46dae32fe625a75f549c3a70edc77b778197bb05 Mon Sep 17 00:00:00 2001 From: Youngmin Nam Date: Tue, 12 Jul 2022 18:47:15 +0900 Subject: time: Correct the prototype of ns_to_kernel_old_timeval and ns_to_timespec64 In ns_to_kernel_old_timeval() definition, the function argument is defined with const identifier in kernel/time/time.c, but the prototype in include/linux/time32.h looks different. 
- The function is defined in kernel/time/time.c as below: struct __kernel_old_timeval ns_to_kernel_old_timeval(const s64 nsec) - The function is decalared in include/linux/time32.h as below: extern struct __kernel_old_timeval ns_to_kernel_old_timeval(s64 nsec); Because the variable of arithmethic types isn't modified in the calling scope, there's no need to mark arguments as const, which was already mentioned during review (Link[1) of the original patch. Likewise remove the "const" keyword in both definition and declaration of ns_to_timespec64() as requested by Arnd (Link[2]). Fixes: a84d1169164b ("y2038: Introduce struct __kernel_old_timeval") Signed-off-by: Youngmin Nam Signed-off-by: Thomas Gleixner Reviewed-by: Arnd Bergmann Link: https://lore.kernel.org/all/20220712094715.2918823-1-youngmin.nam@samsung.com Link[1]: https://lore.kernel.org/all/20180310081123.thin6wphgk7tongy@gmail.com/ Link[2]: https://lore.kernel.org/all/CAK8P3a3nknJgEDESGdJH91jMj6R_xydFqWASd8r5BbesdvMBgA@mail.gmail.com/ --- kernel/time/time.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/time.c b/kernel/time/time.c index 29923b20e0e4..526257b3727c 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -449,7 +449,7 @@ time64_t mktime64(const unsigned int year0, const unsigned int mon0, } EXPORT_SYMBOL(mktime64); -struct __kernel_old_timeval ns_to_kernel_old_timeval(const s64 nsec) +struct __kernel_old_timeval ns_to_kernel_old_timeval(s64 nsec) { struct timespec64 ts = ns_to_timespec64(nsec); struct __kernel_old_timeval tv; @@ -503,7 +503,7 @@ EXPORT_SYMBOL(set_normalized_timespec64); * * Returns the timespec64 representation of the nsec parameter. */ -struct timespec64 ns_to_timespec64(const s64 nsec) +struct timespec64 ns_to_timespec64(s64 nsec) { struct timespec64 ts = { 0, 0 }; s32 rem; -- cgit From 275c30bcee66a27d1aa97a215d607ad6d49804cb Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Tue, 9 Aug 2022 23:30:32 +0200 Subject: bpf: Don't reinit map value in prealloc_lru_pop The LRU map that is preallocated may have its elements reused while another program holds a pointer to it from bpf_map_lookup_elem. Hence, only check_and_free_fields is appropriate when the element is being deleted, as it ensures proper synchronization against concurrent access of the map value. After that, we cannot call check_and_init_map_value again as it may rewrite bpf_spin_lock, bpf_timer, and kptr fields while they can be concurrently accessed from a BPF program. This is safe to do as when the map entry is deleted, concurrent access is protected against by check_and_free_fields, i.e. an existing timer would be freed, and any existing kptr will be released by it. The program can create further timers and kptrs after check_and_free_fields, but they will eventually be released once the preallocated items are freed on map destruction, even if the item is never reused again. Hence, the deleted item sitting in the free list can still have resources attached to it, and they would never leak. With spin_lock, we never touch the field at all on delete or update, as we may end up modifying the state of the lock. Since the verifier ensures that a bpf_spin_lock call is always paired with bpf_spin_unlock call, the program will eventually release the lock so that on reuse the new user of the value can take the lock. 
Essentially, for the preallocated case, we must assume that the map value may always be in use by the program, even when it is sitting in the freelist, and handle things accordingly, i.e. use proper synchronization inside check_and_free_fields, and never reinitialize the special fields when it is reused on update. Fixes: 68134668c17f ("bpf: Add map side support for bpf timers.") Acked-by: Yonghong Song Signed-off-by: Kumar Kartikeya Dwivedi Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20220809213033.24147-3-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/hashtab.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index da7578426a46..4d793a92301b 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -311,12 +311,8 @@ static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key, struct htab_elem *l; if (node) { - u32 key_size = htab->map.key_size; - l = container_of(node, struct htab_elem, lru_node); - memcpy(l->key, key, key_size); - check_and_init_map_value(&htab->map, - l->key + round_up(key_size, 8)); + memcpy(l->key, key, htab->map.key_size); return l; } -- cgit From 86f44fcec22ce2979507742bc53db8400e454f46 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 8 Aug 2022 20:58:09 -0700 Subject: bpf: Disallow bpf programs call prog_run command. The verifier cannot perform sufficient validation of bpf_attr->test.ctx_in pointer, therefore bpf programs should not be allowed to call BPF_PROG_RUN command from within the program. To fix this issue split bpf_sys_bpf() bpf helper into normal kern_sys_bpf() kernel function that can only be used by the kernel light skeleton directly. Reported-by: YiFei Zhu Fixes: b1d18a7574d0 ("bpf: Extend sys_bpf commands for bpf_syscall programs.") Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 7dc3f8003631..a1cb0bdc5ad6 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -5071,9 +5071,6 @@ static bool syscall_prog_is_valid_access(int off, int size, BPF_CALL_3(bpf_sys_bpf, int, cmd, union bpf_attr *, attr, u32, attr_size) { - struct bpf_prog * __maybe_unused prog; - struct bpf_tramp_run_ctx __maybe_unused run_ctx; - switch (cmd) { case BPF_MAP_CREATE: case BPF_MAP_UPDATE_ELEM: @@ -5083,6 +5080,18 @@ BPF_CALL_3(bpf_sys_bpf, int, cmd, union bpf_attr *, attr, u32, attr_size) case BPF_LINK_CREATE: case BPF_RAW_TRACEPOINT_OPEN: break; + default: + return -EINVAL; + } + return __sys_bpf(cmd, KERNEL_BPFPTR(attr), attr_size); +} + +int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size) +{ + struct bpf_prog * __maybe_unused prog; + struct bpf_tramp_run_ctx __maybe_unused run_ctx; + + switch (cmd) { #ifdef CONFIG_BPF_JIT /* __bpf_prog_enter_sleepable used by trampoline and JIT */ case BPF_PROG_TEST_RUN: if (attr->test.data_in || attr->test.data_out || @@ -5113,11 +5122,10 @@ BPF_CALL_3(bpf_sys_bpf, int, cmd, union bpf_attr *, attr, u32, attr_size) return 0; #endif default: - return -EINVAL; + return ____bpf_sys_bpf(cmd, attr, size); } - return __sys_bpf(cmd, KERNEL_BPFPTR(attr), attr_size); } -EXPORT_SYMBOL(bpf_sys_bpf); +EXPORT_SYMBOL(kern_sys_bpf); static const struct bpf_func_proto bpf_sys_bpf_proto = { .func = bpf_sys_bpf, -- cgit From f76fa6b338055054f80c72b29c97fb95c1becadc Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Wed, 10 
Aug 2022 16:05:30 +0800 Subject: bpf: Acquire map uref in .init_seq_private for array map iterator bpf_iter_attach_map() acquires a map uref, and the uref may be released before or in the middle of iterating map elements. For example, the uref could be released in bpf_iter_detach_map() as part of bpf_link_release(), or could be released in bpf_map_put_with_uref() as part of bpf_map_release(). Alternative fix is acquiring an extra bpf_link reference just like a pinned map iterator does, but it introduces unnecessary dependency on bpf_link instead of bpf_map. So choose another fix: acquiring an extra map uref in .init_seq_private for array map iterator. Fixes: d3cc2ab546ad ("bpf: Implement bpf iterator for array maps") Signed-off-by: Hou Tao Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20220810080538.1845898-2-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/arraymap.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index d3e734bf8056..624527401d4d 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -649,6 +649,11 @@ static int bpf_iter_init_array_map(void *priv_data, seq_info->percpu_value_buf = value_buf; } + /* bpf_iter_attach_map() acquires a map uref, and the uref may be + * released before or in the middle of iterating map elements, so + * acquire an extra map uref for iterator. + */ + bpf_map_inc_with_uref(map); seq_info->map = map; return 0; } @@ -657,6 +662,7 @@ static void bpf_iter_fini_array_map(void *priv_data) { struct bpf_iter_seq_array_map_info *seq_info = priv_data; + bpf_map_put_with_uref(seq_info->map); kfree(seq_info->percpu_value_buf); } -- cgit From ef1e93d2eeb58a1f08c37b22a2314b94bc045f15 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Wed, 10 Aug 2022 16:05:31 +0800 Subject: bpf: Acquire map uref in .init_seq_private for hash map iterator bpf_iter_attach_map() acquires a map uref, and the uref may be released before or in the middle of iterating map elements. For example, the uref could be released in bpf_iter_detach_map() as part of bpf_link_release(), or could be released in bpf_map_put_with_uref() as part of bpf_map_release(). So acquiring an extra map uref in bpf_iter_init_hash_map() and releasing it in bpf_iter_fini_hash_map(). 
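Both iterator fixes follow the same refcounting idea: the iterator's private data takes its own counted reference on the map in .init_seq_private and drops it in .fini_seq_private, so releasing the attach-time uref cannot free the map mid-walk. Below is a minimal userspace analogue of that pattern, with made-up names rather than the kernel API:

#include <stdio.h>
#include <stdlib.h>

struct obj {
    int refcnt;          /* simplified, not atomic */
    int data[4];
};

static struct obj *obj_get(struct obj *o) { o->refcnt++; return o; }

static void obj_put(struct obj *o)
{
    if (--o->refcnt == 0) {
        printf("freeing object\n");
        free(o);
    }
}

struct iter {
    struct obj *o;
    int pos;
};

/* init_seq_private analogue: take an extra reference for the iterator */
static void iter_init(struct iter *it, struct obj *o)
{
    it->o = obj_get(o);
    it->pos = 0;
}

/* fini_seq_private analogue: drop the iterator's reference */
static void iter_fini(struct iter *it)
{
    obj_put(it->o);
}

int main(void)
{
    struct obj *o = calloc(1, sizeof(*o));
    struct iter it;

    o->refcnt = 1;                 /* the "attach" reference */
    iter_init(&it, o);
    obj_put(o);                    /* attach reference dropped early... */
    for (; it.pos < 4; it.pos++)   /* ...but iteration stays safe */
        printf("%d\n", it.o->data[it.pos]);
    iter_fini(&it);                /* last reference, object freed here */
    return 0;
}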
Fixes: d6c4503cc296 ("bpf: Implement bpf iterator for hash maps") Signed-off-by: Hou Tao Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20220810080538.1845898-3-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/hashtab.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 4d793a92301b..6c530a5e560a 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -2060,6 +2060,7 @@ static int bpf_iter_init_hash_map(void *priv_data, seq_info->percpu_value_buf = value_buf; } + bpf_map_inc_with_uref(map); seq_info->map = map; seq_info->htab = container_of(map, struct bpf_htab, map); return 0; @@ -2069,6 +2070,7 @@ static void bpf_iter_fini_hash_map(void *priv_data) { struct bpf_iter_seq_hash_map_info *seq_info = priv_data; + bpf_map_put_with_uref(seq_info->map); kfree(seq_info->percpu_value_buf); } -- cgit From d247049f4fd088e4e40294819a932a6057b3632c Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Wed, 10 Aug 2022 16:05:35 +0800 Subject: bpf: Only allow sleepable program for resched-able iterator When a sleepable program is attached to a hash map iterator, might_fault() will report "BUG: sleeping function called from invalid context..." if CONFIG_DEBUG_ATOMIC_SLEEP is enabled. The reason is that rcu_read_lock() is held in bpf_hash_map_seq_next() and won't be released until all elements are traversed or bpf_hash_map_seq_stop() is called. Fixing it by reusing BPF_ITER_RESCHED to indicate that only non-sleepable program is allowed for iterator without BPF_ITER_RESCHED. We can revise bpf_iter_link_attach() later if there are other conditions which may cause rcu_read_lock() or spin_lock() issues. Signed-off-by: Hou Tao Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20220810080538.1845898-7-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/bpf_iter.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 2726a5950cfa..24b755eca0b3 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -68,13 +68,18 @@ static void bpf_iter_done_stop(struct seq_file *seq) iter_priv->done_stop = true; } +static inline bool bpf_iter_target_support_resched(const struct bpf_iter_target_info *tinfo) +{ + return tinfo->reg_info->feature & BPF_ITER_RESCHED; +} + static bool bpf_iter_support_resched(struct seq_file *seq) { struct bpf_iter_priv_data *iter_priv; iter_priv = container_of(seq->private, struct bpf_iter_priv_data, target_private); - return iter_priv->tinfo->reg_info->feature & BPF_ITER_RESCHED; + return bpf_iter_target_support_resched(iter_priv->tinfo); } /* maximum visited objects before bailing out */ @@ -537,6 +542,10 @@ int bpf_iter_link_attach(const union bpf_attr *attr, bpfptr_t uattr, if (!tinfo) return -ENOENT; + /* Only allow sleepable program for resched-able iterator */ + if (prog->aux->sleepable && !bpf_iter_target_support_resched(tinfo)) + return -EINVAL; + link = kzalloc(sizeof(*link), GFP_USER | __GFP_NOWARN); if (!link) return -ENOMEM; -- cgit From cf8c1e967224c931119d3447f2213d1f645a1a2a Mon Sep 17 00:00:00 2001 From: Hawkins Jiawei Date: Fri, 5 Aug 2022 15:48:36 +0800 Subject: net: refactor bpf_sk_reuseport_detach() Refactor sk_user_data dereference using more generic function __rcu_dereference_sk_user_data_with_flags(), which improve its maintainability Suggested-by: Jakub Kicinski Signed-off-by: Hawkins Jiawei Reviewed-by: Jakub Sitnicki Signed-off-by: Jakub 
Kicinski --- kernel/bpf/reuseport_array.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c index e2618fb5870e..85fa9dbfa8bf 100644 --- a/kernel/bpf/reuseport_array.c +++ b/kernel/bpf/reuseport_array.c @@ -21,14 +21,11 @@ static struct reuseport_array *reuseport_array(struct bpf_map *map) /* The caller must hold the reuseport_lock */ void bpf_sk_reuseport_detach(struct sock *sk) { - uintptr_t sk_user_data; + struct sock __rcu **socks; write_lock_bh(&sk->sk_callback_lock); - sk_user_data = (uintptr_t)sk->sk_user_data; - if (sk_user_data & SK_USER_DATA_BPF) { - struct sock __rcu **socks; - - socks = (void *)(sk_user_data & SK_USER_DATA_PTRMASK); + socks = __rcu_dereference_sk_user_data_with_flags(sk, SK_USER_DATA_BPF); + if (socks) { WRITE_ONCE(sk->sk_user_data, NULL); /* * Do not move this NULL assignment outside of -- cgit From 4e4588f1c4d2e67c993208f0550ef3fae33abce4 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 10 Aug 2022 23:52:28 -0700 Subject: bpf: Shut up kern_sys_bpf warning. Shut up this warning: kernel/bpf/syscall.c:5089:5: warning: no previous prototype for function 'kern_sys_bpf' [-Wmissing-prototypes] int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size) Reported-by: Jakub Kicinski Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a1cb0bdc5ad6..a4d40d98428a 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -5086,6 +5086,14 @@ BPF_CALL_3(bpf_sys_bpf, int, cmd, union bpf_attr *, attr, u32, attr_size) return __sys_bpf(cmd, KERNEL_BPFPTR(attr), attr_size); } + +/* To shut up -Wmissing-prototypes. + * This function is used by the kernel light skeleton + * to load bpf programs when modules are loaded or during kernel boot. + * See tools/lib/bpf/skel_internal.h + */ +int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size); + int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size) { struct bpf_prog * __maybe_unused prog; -- cgit From aa6d1e5b502866a4364dfeb2cdecf7c258c400ee Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Wed, 10 Aug 2022 07:07:10 +0200 Subject: xen: remove XEN_SCRUB_PAGES in xen.config Commit 197ecb3802c0 ("xen/balloon: add runtime control for scrubbing ballooned out pages") changed config XEN_SCRUB_PAGES to config XEN_SCRUB_PAGES_DEFAULT. As xen.config sets 'XEN_BALLOON=y' and XEN_SCRUB_PAGES_DEFAULT defaults to yes, there is no further need to set this config in the xen.config file. Remove setting XEN_SCRUB_PAGES in xen.config, which is without effect since the commit above anyway. 
Signed-off-by: Lukas Bulwahn Reviewed-by: Juergen Gross Link: https://lore.kernel.org/r/20220810050712.9539-3-lukas.bulwahn@gmail.com Signed-off-by: Juergen Gross --- kernel/configs/xen.config | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/configs/xen.config b/kernel/configs/xen.config index ff756221f112..436f806aa1ed 100644 --- a/kernel/configs/xen.config +++ b/kernel/configs/xen.config @@ -34,7 +34,6 @@ CONFIG_INPUT_XEN_KBDDEV_FRONTEND=m CONFIG_XEN_SCSI_FRONTEND=m # others CONFIG_XEN_BALLOON=y -CONFIG_XEN_SCRUB_PAGES=y CONFIG_XEN_DEV_EVTCHN=m CONFIG_XEN_BLKDEV_FRONTEND=m CONFIG_XEN_NETDEV_FRONTEND=m -- cgit From 41a55567b9e31cb852670684404654ec4fd0d8d6 Mon Sep 17 00:00:00 2001 From: David Gow Date: Wed, 13 Jul 2022 08:52:20 +0800 Subject: module: kunit: Load .kunit_test_suites section when CONFIG_KUNIT=m The new KUnit module handling has KUnit test suites listed in a .kunit_test_suites section of each module. This should be loaded when the module is, but at the moment this only happens if KUnit is built-in. Also load this when KUnit is enabled as a module: it'll not be usable unless KUnit is loaded, but such modules are likely to depend on KUnit anyway, so it's unlikely to ever be loaded needlessly. Fixes: 3d6e44623841 ("kunit: unify module and builtin suite definitions") Signed-off-by: David Gow Reviewed-by: Brendan Higgins Tested-by: Geert Uytterhoeven Signed-off-by: Shuah Khan --- kernel/module/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module/main.c b/kernel/module/main.c index 6a477c622544..a4e4d84b6f4e 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -2099,7 +2099,7 @@ static int find_module_sections(struct module *mod, struct load_info *info) sizeof(*mod->static_call_sites), &mod->num_static_call_sites); #endif -#ifdef CONFIG_KUNIT +#if IS_ENABLED(CONFIG_KUNIT) mod->kunit_suites = section_objs(info, ".kunit_test_suites", sizeof(*mod->kunit_suites), &mod->num_kunit_suites); -- cgit From 2b97cf76289a4fcae66d7959b0d74a87207d7068 Mon Sep 17 00:00:00 2001 From: Hao Jia Date: Sat, 6 Aug 2022 20:05:08 +0800 Subject: sched/psi: Zero the memory of struct psi_group After commit 5f69a6577bc3 ("psi: dont alloc memory for psi by default"), the memory used by struct psi_group is no longer allocated and zeroed in cgroup_create(). Since the memory of struct psi_group is not zeroed, the data in this memory is random, which will lead to inaccurate psi statistics when creating a new cgroup. So we use kzalloc() to allocate and zero the struct psi_group and remove the redundant zeroing in group_init(). Steps to reproduce: 1. Use cgroup v2 and enable CONFIG_PSI 2.
Create a new cgroup, and query psi statistics mkdir /sys/fs/cgroup/test cat /sys/fs/cgroup/test/cpu.pressure some avg10=0.00 avg60=0.00 avg300=47927752200.00 total=12884901 full avg10=561815124.00 avg60=125835394188.00 avg300=1077090462000.00 total=10273561772 cat /sys/fs/cgroup/test/io.pressure some avg10=1040093132823.95 avg60=1203770351379.21 avg300=3862252669559.46 total=4294967296 full avg10=921884564601.39 avg60=0.00 avg300=1984507298.35 total=442381631 cat /sys/fs/cgroup/test/memory.pressure some avg10=232476085778.11 avg60=0.00 avg300=0.00 total=0 full avg10=0.00 avg60=0.00 avg300=2585658472280.57 total=12884901 Fixes: commit 5f69a6577bc3 ("psi: dont alloc memory for psi by default") Signed-off-by: Hao Jia Reviewed-by: Ingo Molnar Acked-by: Johannes Weiner Signed-off-by: Tejun Heo --- kernel/sched/psi.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index ec66b40bdd40..5ee615a59fe1 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -190,12 +190,8 @@ static void group_init(struct psi_group *group) /* Init trigger-related members */ mutex_init(&group->trigger_lock); INIT_LIST_HEAD(&group->triggers); - memset(group->nr_triggers, 0, sizeof(group->nr_triggers)); - group->poll_states = 0; group->poll_min_period = U32_MAX; - memset(group->polling_total, 0, sizeof(group->polling_total)); group->polling_next_update = ULLONG_MAX; - group->polling_until = 0; init_waitqueue_head(&group->poll_wait); timer_setup(&group->poll_timer, poll_timer_fn, 0); rcu_assign_pointer(group->poll_task, NULL); @@ -957,7 +953,7 @@ int psi_cgroup_alloc(struct cgroup *cgroup) if (static_branch_likely(&psi_disabled)) return 0; - cgroup->psi = kmalloc(sizeof(struct psi_group), GFP_KERNEL); + cgroup->psi = kzalloc(sizeof(struct psi_group), GFP_KERNEL); if (!cgroup->psi) return -ENOMEM; -- cgit From 76b079ef4cc954fc2c2e0333a01855b0b2b6bdee Mon Sep 17 00:00:00 2001 From: Hao Jia Date: Sat, 6 Aug 2022 20:05:09 +0800 Subject: sched/psi: Remove unused parameter nbytes of psi_trigger_create() psi_trigger_create()'s 'nbytes' parameter is not used, so we can remove it. Signed-off-by: Hao Jia Reviewed-by: Ingo Molnar Acked-by: Johannes Weiner Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 2 +- kernel/sched/psi.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index ffaccd6373f1..df7df5843b4f 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3698,7 +3698,7 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf, } psi = cgroup_ino(cgrp) == 1 ? 
&psi_system : cgrp->psi; - new = psi_trigger_create(psi, buf, nbytes, res); + new = psi_trigger_create(psi, buf, res); if (IS_ERR(new)) { cgroup_put(cgrp); return PTR_ERR(new); diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 5ee615a59fe1..ecb4b4ff4ce0 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -1087,7 +1087,7 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) } struct psi_trigger *psi_trigger_create(struct psi_group *group, - char *buf, size_t nbytes, enum psi_res res) + char *buf, enum psi_res res) { struct psi_trigger *t; enum psi_states state; @@ -1316,7 +1316,7 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf, return -EBUSY; } - new = psi_trigger_create(&psi_system, buf, nbytes, res); + new = psi_trigger_create(&psi_system, buf, res); if (IS_ERR(new)) { mutex_unlock(&seq->lock); return PTR_ERR(new); -- cgit From 4f7e7236435ca0abe005c674ebd6892c6e83aeb3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 15 Aug 2022 13:27:38 -1000 Subject: cgroup: Fix threadgroup_rwsem <-> cpus_read_lock() deadlock Bringing up a CPU may involve creating and destroying tasks which requires read-locking threadgroup_rwsem, so threadgroup_rwsem nests inside cpus_read_lock(). However, cpuset's ->attach(), which may be called with thredagroup_rwsem write-locked, also wants to disable CPU hotplug and acquires cpus_read_lock(), leading to a deadlock. Fix it by guaranteeing that ->attach() is always called with CPU hotplug disabled and removing cpus_read_lock() call from cpuset_attach(). Signed-off-by: Tejun Heo Reviewed-and-tested-by: Imran Khan Reported-and-tested-by: Xuewen Yan Fixes: 05c7b7a92cc8 ("cgroup/cpuset: Fix a race between cpuset_attach() and cpu hotplug") Cc: stable@vger.kernel.org # v5.17+ --- kernel/cgroup/cgroup.c | 77 +++++++++++++++++++++++++++++++++++--------------- kernel/cgroup/cpuset.c | 3 +- 2 files changed, 55 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index df7df5843b4f..e1387499b336 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -2369,6 +2369,47 @@ int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) } EXPORT_SYMBOL_GPL(task_cgroup_path); +/** + * cgroup_attach_lock - Lock for ->attach() + * @lock_threadgroup: whether to down_write cgroup_threadgroup_rwsem + * + * cgroup migration sometimes needs to stabilize threadgroups against forks and + * exits by write-locking cgroup_threadgroup_rwsem. However, some ->attach() + * implementations (e.g. cpuset), also need to disable CPU hotplug. + * Unfortunately, letting ->attach() operations acquire cpus_read_lock() can + * lead to deadlocks. + * + * Bringing up a CPU may involve creating and destroying tasks which requires + * read-locking threadgroup_rwsem, so threadgroup_rwsem nests inside + * cpus_read_lock(). If we call an ->attach() which acquires the cpus lock while + * write-locking threadgroup_rwsem, the locking order is reversed and we end up + * waiting for an on-going CPU hotplug operation which in turn is waiting for + * the threadgroup_rwsem to be released to create new tasks. For more details: + * + * http://lkml.kernel.org/r/20220711174629.uehfmqegcwn2lqzu@wubuntu + * + * Resolve the situation by always acquiring cpus_read_lock() before optionally + * write-locking cgroup_threadgroup_rwsem. This allows ->attach() to assume that + * CPU hotplug is disabled on entry. 
+ */ +static void cgroup_attach_lock(bool lock_threadgroup) +{ + cpus_read_lock(); + if (lock_threadgroup) + percpu_down_write(&cgroup_threadgroup_rwsem); +} + +/** + * cgroup_attach_unlock - Undo cgroup_attach_lock() + * @lock_threadgroup: whether to up_write cgroup_threadgroup_rwsem + */ +static void cgroup_attach_unlock(bool lock_threadgroup) +{ + if (lock_threadgroup) + percpu_up_write(&cgroup_threadgroup_rwsem); + cpus_read_unlock(); +} + /** * cgroup_migrate_add_task - add a migration target task to a migration context * @task: target task @@ -2841,8 +2882,7 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, } struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, - bool *locked) - __acquires(&cgroup_threadgroup_rwsem) + bool *threadgroup_locked) { struct task_struct *tsk; pid_t pid; @@ -2859,12 +2899,8 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, * Therefore, we can skip the global lock. */ lockdep_assert_held(&cgroup_mutex); - if (pid || threadgroup) { - percpu_down_write(&cgroup_threadgroup_rwsem); - *locked = true; - } else { - *locked = false; - } + *threadgroup_locked = pid || threadgroup; + cgroup_attach_lock(*threadgroup_locked); rcu_read_lock(); if (pid) { @@ -2895,17 +2931,14 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, goto out_unlock_rcu; out_unlock_threadgroup: - if (*locked) { - percpu_up_write(&cgroup_threadgroup_rwsem); - *locked = false; - } + cgroup_attach_unlock(*threadgroup_locked); + *threadgroup_locked = false; out_unlock_rcu: rcu_read_unlock(); return tsk; } -void cgroup_procs_write_finish(struct task_struct *task, bool locked) - __releases(&cgroup_threadgroup_rwsem) +void cgroup_procs_write_finish(struct task_struct *task, bool threadgroup_locked) { struct cgroup_subsys *ss; int ssid; @@ -2913,8 +2946,8 @@ void cgroup_procs_write_finish(struct task_struct *task, bool locked) /* release reference from cgroup_procs_write_start() */ put_task_struct(task); - if (locked) - percpu_up_write(&cgroup_threadgroup_rwsem); + cgroup_attach_unlock(threadgroup_locked); + for_each_subsys(ss, ssid) if (ss->post_attach) ss->post_attach(); @@ -3000,8 +3033,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) * write-locking can be skipped safely. 
*/ has_tasks = !list_empty(&mgctx.preloaded_src_csets); - if (has_tasks) - percpu_down_write(&cgroup_threadgroup_rwsem); + cgroup_attach_lock(has_tasks); /* NULL dst indicates self on default hierarchy */ ret = cgroup_migrate_prepare_dst(&mgctx); @@ -3022,8 +3054,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) ret = cgroup_migrate_execute(&mgctx); out_finish: cgroup_migrate_finish(&mgctx); - if (has_tasks) - percpu_up_write(&cgroup_threadgroup_rwsem); + cgroup_attach_unlock(has_tasks); return ret; } @@ -4971,13 +5002,13 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, struct task_struct *task; const struct cred *saved_cred; ssize_t ret; - bool locked; + bool threadgroup_locked; dst_cgrp = cgroup_kn_lock_live(of->kn, false); if (!dst_cgrp) return -ENODEV; - task = cgroup_procs_write_start(buf, threadgroup, &locked); + task = cgroup_procs_write_start(buf, threadgroup, &threadgroup_locked); ret = PTR_ERR_OR_ZERO(task); if (ret) goto out_unlock; @@ -5003,7 +5034,7 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, ret = cgroup_attach_task(dst_cgrp, task, threadgroup); out_finish: - cgroup_procs_write_finish(task, locked); + cgroup_procs_write_finish(task, threadgroup_locked); out_unlock: cgroup_kn_unlock(of->kn); diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 58aadfda9b8b..1f3a55297f39 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -2289,7 +2289,7 @@ static void cpuset_attach(struct cgroup_taskset *tset) cgroup_taskset_first(tset, &css); cs = css_cs(css); - cpus_read_lock(); + lockdep_assert_cpus_held(); /* see cgroup_attach_lock() */ percpu_down_write(&cpuset_rwsem); guarantee_online_mems(cs, &cpuset_attach_nodemask_to); @@ -2343,7 +2343,6 @@ static void cpuset_attach(struct cgroup_taskset *tset) wake_up(&cpuset_attach_wq); percpu_up_write(&cpuset_rwsem); - cpus_read_unlock(); } /* The various types of files and directories in a cpuset file system */ -- cgit From 14b20b784f59bdd95f6f1cfb112c9818bcec4d84 Mon Sep 17 00:00:00 2001 From: YiFei Zhu Date: Tue, 16 Aug 2022 20:55:16 +0000 Subject: bpf: Restrict bpf_sys_bpf to CAP_PERFMON The verifier cannot perform sufficient validation of any pointers passed into bpf_attr and treats them as integers rather than pointers. The helper will then read from arbitrary pointers passed into it. Restrict the helper to CAP_PERFMON since the security model in BPF of arbitrary kernel read is CAP_BPF + CAP_PERFMON. Fixes: af2ac3e13e45 ("bpf: Prepare bpf syscall to be used from kernel and user space.") Signed-off-by: YiFei Zhu Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220816205517.682470-1-zhuyifei@google.com --- kernel/bpf/syscall.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a4d40d98428a..27760627370d 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -5197,7 +5197,7 @@ syscall_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_sys_bpf: - return &bpf_sys_bpf_proto; + return !perfmon_capable() ? 
NULL : &bpf_sys_bpf_proto; case BPF_FUNC_btf_find_by_name_kind: return &bpf_btf_find_by_name_kind_proto; case BPF_FUNC_sys_close: -- cgit From fc4aaf9fb3c99bcb326d52f9d320ed5680bd1cee Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 16 Aug 2022 10:34:40 +0100 Subject: net: Fix suspicious RCU usage in bpf_sk_reuseport_detach() bpf_sk_reuseport_detach() calls __rcu_dereference_sk_user_data_with_flags() to obtain the value of sk->sk_user_data, but that function is only usable if the RCU read lock is held, and neither that function nor any of its callers hold it. Fix this by adding a new helper, __locked_read_sk_user_data_with_flags() that checks to see if sk->sk_callback_lock() is held and use that here instead. Alternatively, making __rcu_dereference_sk_user_data_with_flags() use rcu_dereference_checked() might suffice. Without this, the following warning can be occasionally observed: ============================= WARNING: suspicious RCU usage 6.0.0-rc1-build2+ #563 Not tainted ----------------------------- include/net/sock.h:592 suspicious rcu_dereference_check() usage! other info that might help us debug this: rcu_scheduler_active = 2, debug_locks = 1 5 locks held by locktest/29873: #0: ffff88812734b550 (&sb->s_type->i_mutex_key#9){+.+.}-{3:3}, at: __sock_release+0x77/0x121 #1: ffff88812f5621b0 (sk_lock-AF_INET){+.+.}-{0:0}, at: tcp_close+0x1c/0x70 #2: ffff88810312f5c8 (&h->lhash2[i].lock){+.+.}-{2:2}, at: inet_unhash+0x76/0x1c0 #3: ffffffff83768bb8 (reuseport_lock){+...}-{2:2}, at: reuseport_detach_sock+0x18/0xdd #4: ffff88812f562438 (clock-AF_INET){++..}-{2:2}, at: bpf_sk_reuseport_detach+0x24/0xa4 stack backtrace: CPU: 1 PID: 29873 Comm: locktest Not tainted 6.0.0-rc1-build2+ #563 Hardware name: ASUS All Series/H97-PLUS, BIOS 2306 10/09/2014 Call Trace: dump_stack_lvl+0x4c/0x5f bpf_sk_reuseport_detach+0x6d/0xa4 reuseport_detach_sock+0x75/0xdd inet_unhash+0xa5/0x1c0 tcp_set_state+0x169/0x20f ? lockdep_sock_is_held+0x3a/0x3a ? __lock_release.isra.0+0x13e/0x220 ? reacquire_held_locks+0x1bb/0x1bb ? hlock_class+0x31/0x96 ? mark_lock+0x9e/0x1af __tcp_close+0x50/0x4b6 tcp_close+0x28/0x70 inet_release+0x8e/0xa7 __sock_release+0x95/0x121 sock_close+0x14/0x17 __fput+0x20f/0x36a task_work_run+0xa3/0xcc exit_to_user_mode_prepare+0x9c/0x14d syscall_exit_to_user_mode+0x18/0x44 entry_SYSCALL_64_after_hwframe+0x63/0xcd Fixes: cf8c1e967224 ("net: refactor bpf_sk_reuseport_detach()") Signed-off-by: David Howells cc: Hawkins Jiawei Link: https://lore.kernel.org/r/166064248071.3502205.10036394558814861778.stgit@warthog.procyon.org.uk Signed-off-by: Jakub Kicinski --- kernel/bpf/reuseport_array.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c index 85fa9dbfa8bf..82c61612f382 100644 --- a/kernel/bpf/reuseport_array.c +++ b/kernel/bpf/reuseport_array.c @@ -24,7 +24,7 @@ void bpf_sk_reuseport_detach(struct sock *sk) struct sock __rcu **socks; write_lock_bh(&sk->sk_callback_lock); - socks = __rcu_dereference_sk_user_data_with_flags(sk, SK_USER_DATA_BPF); + socks = __locked_read_sk_user_data_with_flags(sk, SK_USER_DATA_BPF); if (socks) { WRITE_ONCE(sk->sk_user_data, NULL); /* -- cgit From 7d6620f107bae6ed687ff07668e8e8f855487aa9 Mon Sep 17 00:00:00 2001 From: Pu Lehui Date: Sat, 13 Aug 2022 21:40:30 +0800 Subject: bpf, cgroup: Fix kernel BUG in purge_effective_progs Syzkaller reported a triggered kernel BUG as follows: ------------[ cut here ]------------ kernel BUG at kernel/bpf/cgroup.c:925! 
invalid opcode: 0000 [#1] PREEMPT SMP NOPTI CPU: 1 PID: 194 Comm: detach Not tainted 5.19.0-14184-g69dac8e431af #8 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014 RIP: 0010:__cgroup_bpf_detach+0x1f2/0x2a0 Code: 00 e8 92 60 30 00 84 c0 75 d8 4c 89 e0 31 f6 85 f6 74 19 42 f6 84 28 48 05 00 00 02 75 0e 48 8b 80 c0 00 00 00 48 85 c0 75 e5 <0f> 0b 48 8b 0c5 RSP: 0018:ffffc9000055bdb0 EFLAGS: 00000246 RAX: 0000000000000000 RBX: ffff888100ec0800 RCX: ffffc900000f1000 RDX: 0000000000000000 RSI: 0000000000000001 RDI: ffff888100ec4578 RBP: 0000000000000000 R08: ffff888100ec0800 R09: 0000000000000040 R10: 0000000000000000 R11: 0000000000000000 R12: ffff888100ec4000 R13: 000000000000000d R14: ffffc90000199000 R15: ffff888100effb00 FS: 00007f68213d2b80(0000) GS:ffff88813bc80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000055f74a0e5850 CR3: 0000000102836000 CR4: 00000000000006e0 Call Trace: cgroup_bpf_prog_detach+0xcc/0x100 __sys_bpf+0x2273/0x2a00 __x64_sys_bpf+0x17/0x20 do_syscall_64+0x3b/0x90 entry_SYSCALL_64_after_hwframe+0x63/0xcd RIP: 0033:0x7f68214dbcb9 Code: 08 44 89 e0 5b 41 5c c3 66 0f 1f 84 00 00 00 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff8 RSP: 002b:00007ffeb487db68 EFLAGS: 00000246 ORIG_RAX: 0000000000000141 RAX: ffffffffffffffda RBX: 000000000000000b RCX: 00007f68214dbcb9 RDX: 0000000000000090 RSI: 00007ffeb487db70 RDI: 0000000000000009 RBP: 0000000000000003 R08: 0000000000000012 R09: 0000000b00000003 R10: 00007ffeb487db70 R11: 0000000000000246 R12: 00007ffeb487dc20 R13: 0000000000000004 R14: 0000000000000001 R15: 000055f74a1011b0 Modules linked in: ---[ end trace 0000000000000000 ]--- Repetition steps: For the following cgroup tree, root | cg1 | cg2 1. attach prog2 to cg2, and then attach prog1 to cg1, both bpf progs attach type is NONE or OVERRIDE. 2. write 1 to /proc/thread-self/fail-nth for failslab. 3. detach prog1 for cg1, and then kernel BUG occur. Failslab injection will cause kmalloc fail and fall back to purge_effective_progs. The problem is that cg2 have attached another prog, so when go through cg2 layer, iteration will add pos to 1, and subsequent operations will be skipped by the following condition, and cg will meet NULL in the end. `if (pos && !(cg->bpf.flags[atype] & BPF_F_ALLOW_MULTI))` The NULL cg means no link or prog match, this is as expected, and it's not a bug. So here just skip the no match situation. 
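The corrected loop shape, in the abstract: search each layer for the entry, and when a layer has no match simply move on to the next one instead of asserting. A toy userspace sketch of that control flow, using made-up data rather than the cgroup structures:

#include <stdio.h>

#define NLAYERS 3
#define NENTRIES 4

static int layers[NLAYERS][NENTRIES] = {
    { 1, 2, 3, 4 },
    { 5, 6, 7, 8 },   /* the target lives only in this layer */
    { 9, 10, 11, 12 },
};

int main(void)
{
    int target = 7;
    int l, i, pos;

    for (l = 0; l < NLAYERS; l++) {
        pos = 0;
        for (i = 0; i < NENTRIES; i++) {
            if (layers[l][i] == target)
                goto found;
            pos++;
        }
        /* no match in this layer: skip it rather than BUG() */
        continue;
found:
        printf("layer %d: purge entry at index %d\n", l, pos);
    }
    return 0;
}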
Fixes: 4c46091ee985 ("bpf: Fix KASAN use-after-free Read in compute_effective_progs") Signed-off-by: Pu Lehui Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220813134030.1972696-1-pulehui@huawei.com --- kernel/bpf/cgroup.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 59b7eb60d5b4..4a400cd63731 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -921,8 +921,10 @@ static void purge_effective_progs(struct cgroup *cgrp, struct bpf_prog *prog, pos++; } } + + /* no link or prog match, skip the cgroup of this layer */ + continue; found: - BUG_ON(!cg); progs = rcu_dereference_protected( desc->bpf.effective[atype], lockdep_is_held(&cgroup_mutex)); -- cgit From a8faed3a02eeb75857a3b5d660fa80fe79db77a3 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 7 Aug 2022 15:09:34 -0700 Subject: kernel/sys_ni: add compat entry for fadvise64_64 When CONFIG_ADVISE_SYSCALLS is not set/enabled and CONFIG_COMPAT is set/enabled, the riscv compat_syscall_table references 'compat_sys_fadvise64_64', which is not defined: riscv64-linux-ld: arch/riscv/kernel/compat_syscall_table.o:(.rodata+0x6f8): undefined reference to `compat_sys_fadvise64_64' Add 'fadvise64_64' to kernel/sys_ni.c as a conditional COMPAT function so that when CONFIG_ADVISE_SYSCALLS is not set, there is a fallback function available. Link: https://lkml.kernel.org/r/20220807220934.5689-1-rdunlap@infradead.org Fixes: d3ac21cacc24 ("mm: Support compiling out madvise and fadvise") Signed-off-by: Randy Dunlap Suggested-by: Arnd Bergmann Reviewed-by: Arnd Bergmann Cc: Josh Triplett Cc: Paul Walmsley Cc: Palmer Dabbelt Cc: Albert Ou Cc: Signed-off-by: Andrew Morton --- kernel/sys_ni.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index a492f159624f..860b2dcf3ac4 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -277,6 +277,7 @@ COND_SYSCALL(landlock_restrict_self); /* mm/fadvise.c */ COND_SYSCALL(fadvise64_64); +COND_SYSCALL_COMPAT(fadvise64_64); /* mm/, CONFIG_MMU only */ COND_SYSCALL(swapon); -- cgit From 9c80e79906b4ca440d09e7f116609262bb747909 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 12 Aug 2022 19:05:09 -0700 Subject: kprobes: don't call disarm_kprobe() for disabled kprobes The assumption in __disable_kprobe() is wrong, and it could try to disarm an already disarmed kprobe and fire the WARN_ONCE() below. [0] We can easily reproduce this issue. 1. Write 0 to /sys/kernel/debug/kprobes/enabled. # echo 0 > /sys/kernel/debug/kprobes/enabled 2. Run execsnoop. At this time, one kprobe is disabled. # /usr/share/bcc/tools/execsnoop & [1] 2460 PCOMM PID PPID RET ARGS # cat /sys/kernel/debug/kprobes/list ffffffff91345650 r __x64_sys_execve+0x0 [FTRACE] ffffffff91345650 k __x64_sys_execve+0x0 [DISABLED][FTRACE] 3. Write 1 to /sys/kernel/debug/kprobes/enabled, which changes kprobes_all_disarmed to false but does not arm the disabled kprobe. # echo 1 > /sys/kernel/debug/kprobes/enabled # cat /sys/kernel/debug/kprobes/list ffffffff91345650 r __x64_sys_execve+0x0 [FTRACE] ffffffff91345650 k __x64_sys_execve+0x0 [DISABLED][FTRACE] 4. Kill execsnoop, when __disable_kprobe() calls disarm_kprobe() for the disabled kprobe and hits the WARN_ONCE() in __disarm_kprobe_ftrace(). 
# fg /usr/share/bcc/tools/execsnoop ^C Actually, WARN_ONCE() is fired twice, and __unregister_kprobe_top() misses some cleanups and leaves the aggregated kprobe in the hash table. Then, __unregister_trace_kprobe() initialises tk->rp.kp.list and creates an infinite loop like this. aggregated kprobe.list -> kprobe.list -. ^ | '.__.' In this situation, these commands fall into the infinite loop and result in RCU stall or soft lockup. cat /sys/kernel/debug/kprobes/list : show_kprobe_addr() enters into the infinite loop with RCU. /usr/share/bcc/tools/execsnoop : warn_kprobe_rereg() holds kprobe_mutex, and __get_valid_kprobe() is stuck in the loop. To avoid the issue, make sure we don't call disarm_kprobe() for disabled kprobes. [0] Failed to disarm kprobe-ftrace at __x64_sys_execve+0x0/0x40 (error -2) WARNING: CPU: 6 PID: 2460 at kernel/kprobes.c:1130 __disarm_kprobe_ftrace.isra.19 (kernel/kprobes.c:1129) Modules linked in: ena CPU: 6 PID: 2460 Comm: execsnoop Not tainted 5.19.0+ #28 Hardware name: Amazon EC2 c5.2xlarge/, BIOS 1.0 10/16/2017 RIP: 0010:__disarm_kprobe_ftrace.isra.19 (kernel/kprobes.c:1129) Code: 24 8b 02 eb c1 80 3d c4 83 f2 01 00 75 d4 48 8b 75 00 89 c2 48 c7 c7 90 fa 0f 92 89 04 24 c6 05 ab 83 01 e8 e4 94 f0 ff <0f> 0b 8b 04 24 eb b1 89 c6 48 c7 c7 60 fa 0f 92 89 04 24 e8 cc 94 RSP: 0018:ffff9e6ec154bd98 EFLAGS: 00010282 RAX: 0000000000000000 RBX: ffffffff930f7b00 RCX: 0000000000000001 RDX: 0000000080000001 RSI: ffffffff921461c5 RDI: 00000000ffffffff RBP: ffff89c504286da8 R08: 0000000000000000 R09: c0000000fffeffff R10: 0000000000000000 R11: ffff9e6ec154bc28 R12: ffff89c502394e40 R13: ffff89c502394c00 R14: ffff9e6ec154bc00 R15: 0000000000000000 FS: 00007fe800398740(0000) GS:ffff89c812d80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000000c00057f010 CR3: 0000000103b54006 CR4: 00000000007706e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: __disable_kprobe (kernel/kprobes.c:1716) disable_kprobe (kernel/kprobes.c:2392) __disable_trace_kprobe (kernel/trace/trace_kprobe.c:340) disable_trace_kprobe (kernel/trace/trace_kprobe.c:429) perf_trace_event_unreg.isra.2 (./include/linux/tracepoint.h:93 kernel/trace/trace_event_perf.c:168) perf_kprobe_destroy (kernel/trace/trace_event_perf.c:295) _free_event (kernel/events/core.c:4971) perf_event_release_kernel (kernel/events/core.c:5176) perf_release (kernel/events/core.c:5186) __fput (fs/file_table.c:321) task_work_run (./include/linux/sched.h:2056 (discriminator 1) kernel/task_work.c:179 (discriminator 1)) exit_to_user_mode_prepare (./include/linux/resume_user_mode.h:49 kernel/entry/common.c:169 kernel/entry/common.c:201) syscall_exit_to_user_mode (./arch/x86/include/asm/jump_label.h:55 ./arch/x86/include/asm/nospec-branch.h:384 ./arch/x86/include/asm/entry-common.h:94 kernel/entry/common.c:133 kernel/entry/common.c:296) do_syscall_64 (arch/x86/entry/common.c:87) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:120) RIP: 0033:0x7fe7ff210654 Code: 15 79 89 20 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb be 0f 1f 00 8b 05 9a cd 20 00 48 63 ff 85 c0 75 11 b8 03 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 3a f3 c3 48 83 ec 18 48 89 7c 24 08 e8 34 fc RSP: 002b:00007ffdbd1d3538 EFLAGS: 00000246 ORIG_RAX: 0000000000000003 RAX: 0000000000000000 RBX: 0000000000000008 RCX: 00007fe7ff210654 RDX: 0000000000000000 RSI: 0000000000002401 RDI: 0000000000000008 RBP: 0000000000000000 R08: 94ae31d6fda838a4 
R09: 00007fe8001c9d30 R10: 00007ffdbd1d34b0 R11: 0000000000000246 R12: 00007ffdbd1d3600 R13: 0000000000000000 R14: fffffffffffffffc R15: 00007ffdbd1d3560 Link: https://lkml.kernel.org/r/20220813020509.90805-1-kuniyu@amazon.com Fixes: 69d54b916d83 ("kprobes: makes kprobes/enabled works correctly for optimized kprobes.") Signed-off-by: Kuniyuki Iwashima Reported-by: Ayushman Dutta Cc: "Naveen N. Rao" Cc: Anil S Keshavamurthy Cc: "David S. Miller" Cc: Masami Hiramatsu Cc: Wang Nan Cc: Kuniyuki Iwashima Cc: Kuniyuki Iwashima Cc: Ayushman Dutta Cc: Signed-off-by: Andrew Morton --- kernel/kprobes.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 80697e5e03e4..08350e35aba2 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1707,11 +1707,12 @@ static struct kprobe *__disable_kprobe(struct kprobe *p) /* Try to disarm and disable this/parent probe */ if (p == orig_p || aggr_kprobe_disabled(orig_p)) { /* - * If 'kprobes_all_disarmed' is set, 'orig_p' - * should have already been disarmed, so - * skip unneed disarming process. + * Don't be lazy here. Even if 'kprobes_all_disarmed' + * is false, 'orig_p' might not have been armed yet. + * Note arm_all_kprobes() __tries__ to arm all kprobes + * on the best effort basis. */ - if (!kprobes_all_disarmed) { + if (!kprobes_all_disarmed && !kprobe_disabled(orig_p)) { ret = disarm_kprobe(orig_p, true); if (ret) { p->flags &= ~KPROBE_FLAG_DISABLED; -- cgit From d8a64313c171464aedd6289378a51c8f0f524acb Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Thu, 11 Aug 2022 09:17:34 +0200 Subject: tracing: React to error return from traceprobe_parse_event_name() The function traceprobe_parse_event_name() may set the first two function arguments to a non-null value and still return -EINVAL to indicate an unsuccessful completion of the function. Hence, it is not sufficient to just check the result of the two function arguments for being not null, but the return value also needs to be checked. Commit 95c104c378dc ("tracing: Auto generate event name when creating a group of events") changed the error-return-value checking of the second traceprobe_parse_event_name() invocation in __trace_eprobe_create() and removed checking the return value to jump to the error handling case. Reinstate using the return value in the error-return-value checking.
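The general point is that an output argument can be populated even on failure, so the caller has to test the return code as well. A small userspace illustration of that calling convention, with a hypothetical parse function standing in for the tracing code:

#include <stdio.h>
#include <errno.h>

/*
 * Returns 0 on success, -EINVAL on failure. Note that *name may be
 * set to a non-NULL value even when the function fails.
 */
static int parse_event_name(const char *arg, const char **name)
{
    *name = arg;               /* output filled in early... */
    if (!arg || arg[0] == '\0')
        return -EINVAL;        /* ...but parsing can still fail */
    return 0;
}

int main(void)
{
    const char *name = NULL;
    int ret = parse_event_name("", &name);

    /* Checking only "name != NULL" would miss the error. */
    if (ret || !name) {
        printf("parse error (%d)\n", ret);
        return 1;
    }
    printf("event: %s\n", name);
    return 0;
}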
Link: https://lkml.kernel.org/r/20220811071734.20700-1-lukas.bulwahn@gmail.com Fixes: 95c104c378dc ("tracing: Auto generate event name when creating a group of events") Acked-by: Linyu Yuan Signed-off-by: Lukas Bulwahn Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_eprobe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index 4a0e9d927443..550671985fd1 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -883,7 +883,7 @@ static int __trace_eprobe_create(int argc, const char *argv[]) trace_probe_log_set_index(1); sys_event = argv[1]; ret = traceprobe_parse_event_name(&sys_event, &sys_name, buf2, 0); - if (!sys_event || !sys_name) { + if (ret || !sys_event || !sys_name) { trace_probe_log_err(0, NO_EVENT_INFO); goto parse_error; } -- cgit From 7249921d94ff64f67b733eca0b68853a62032b3d Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 16 Aug 2022 19:28:17 -0400 Subject: tracing/perf: Fix double put of trace event when init fails If in perf_trace_event_init(), the perf_trace_event_open() fails, then it will call perf_trace_event_unreg() which will not only unregister the perf trace event, but will also call the put() function of the tp_event. The problem here is that the trace_event_try_get_ref() is called by the caller of perf_trace_event_init() and if perf_trace_event_init() returns a failure, it will then call trace_event_put(). But since the perf_trace_event_unreg() already called the trace_event_put() function, it triggers a WARN_ON(). WARNING: CPU: 1 PID: 30309 at kernel/trace/trace_dynevent.c:46 trace_event_dyn_put_ref+0x15/0x20 If perf_trace_event_reg() does not call the trace_event_try_get_ref() then the perf_trace_event_unreg() should not be calling trace_event_put(). This breaks symmetry and causes bugs like these. Pull out the trace_event_put() from perf_trace_event_unreg() and call it in the locations that perf_trace_event_unreg() is called. This not only fixes this bug, but also brings back the proper symmetry of the reg/unreg vs get/put logic. 
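The rule being restored is symmetry: the side that takes a reference is the side that drops it, and an unregister helper must not put a reference it never acquired. A minimal userspace sketch of the balanced pattern, using invented names:

#include <stdio.h>

static int refcount;

static void event_get(void) { refcount++; }
static void event_put(void) { refcount--; }

static int event_open(void)   { return -1; /* pretend open fails */ }
static void event_unreg(void) { /* undo registration only, no put here */ }

static int event_init(void)
{
    int ret = event_open();

    if (ret) {
        event_unreg();   /* balanced: unreg does not touch the refcount */
        return ret;
    }
    return 0;
}

int main(void)
{
    event_get();                          /* caller takes the reference... */
    if (event_init())
        event_put();                      /* ...and drops it on error */
    printf("refcount = %d\n", refcount);  /* prints 0, no double put */
    return 0;
}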
Link: https://lore.kernel.org/all/cover.1660347763.git.kjlx@templeofstupid.com/ Link: https://lkml.kernel.org/r/20220816192817.43d5e17f@gandalf.local.home Cc: stable@vger.kernel.org Fixes: 1d18538e6a092 ("tracing: Have dynamic events have a ref counter") Reported-by: Krister Johansen Reviewed-by: Krister Johansen Tested-by: Krister Johansen Acked-by: Jiri Olsa Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_event_perf.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index a114549720d6..61e3a2620fa3 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -157,7 +157,7 @@ static void perf_trace_event_unreg(struct perf_event *p_event) int i; if (--tp_event->perf_refcount > 0) - goto out; + return; tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER, NULL); @@ -176,8 +176,6 @@ static void perf_trace_event_unreg(struct perf_event *p_event) perf_trace_buf[i] = NULL; } } -out: - trace_event_put_ref(tp_event); } static int perf_trace_event_open(struct perf_event *p_event) @@ -241,6 +239,7 @@ void perf_trace_destroy(struct perf_event *p_event) mutex_lock(&event_mutex); perf_trace_event_close(p_event); perf_trace_event_unreg(p_event); + trace_event_put_ref(p_event->tp_event); mutex_unlock(&event_mutex); } @@ -292,6 +291,7 @@ void perf_kprobe_destroy(struct perf_event *p_event) mutex_lock(&event_mutex); perf_trace_event_close(p_event); perf_trace_event_unreg(p_event); + trace_event_put_ref(p_event->tp_event); mutex_unlock(&event_mutex); destroy_local_trace_kprobe(p_event->tp_event); @@ -347,6 +347,7 @@ void perf_uprobe_destroy(struct perf_event *p_event) mutex_lock(&event_mutex); perf_trace_event_close(p_event); perf_trace_event_unreg(p_event); + trace_event_put_ref(p_event->tp_event); mutex_unlock(&event_mutex); destroy_local_trace_uprobe(p_event->tp_event); } -- cgit From c3b0f72e805f0801f05fa2aa52011c4bfc694c44 Mon Sep 17 00:00:00 2001 From: Yang Jihong Date: Thu, 18 Aug 2022 11:26:59 +0800 Subject: ftrace: Fix NULL pointer dereference in is_ftrace_trampoline when ftrace is dead ftrace_startup does not remove ops from ftrace_ops_list when ftrace_startup_enable fails: register_ftrace_function ftrace_startup __register_ftrace_function ... add_ftrace_ops(&ftrace_ops_list, ops) ... ... ftrace_startup_enable // if ftrace failed to modify, ftrace_disabled is set to 1 ... return 0 // ops is in the ftrace_ops_list. When ftrace_disabled = 1, unregister_ftrace_function simply returns without doing anything: unregister_ftrace_function ftrace_shutdown if (unlikely(ftrace_disabled)) return -ENODEV; // return here, __unregister_ftrace_function is not executed, // as a result, ops is still in the ftrace_ops_list __unregister_ftrace_function ... If ops is dynamically allocated, it will be free later, in this case, is_ftrace_trampoline accesses NULL pointer: is_ftrace_trampoline ftrace_ops_trampoline do_for_each_ftrace_op(op, ftrace_ops_list) // OOPS! op may be NULL! 
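A compressed userspace sketch of the rollback idea applied by the fix further below: if the enable step fails after the ops has already been linked into the list, it must be unlinked again before the error is returned (invented names, single-linked list kept deliberately simple):

#include <stdio.h>
#include <errno.h>

struct ops {
    struct ops *next;
    const char *name;
};

static struct ops *ops_list;
static int enable_broken = 1;   /* simulate the disabled/failing state */

static void add_ops(struct ops *op)    { op->next = ops_list; ops_list = op; }
static void remove_ops(struct ops *op) /* handles only the list head, for brevity */
{
    if (ops_list == op)
        ops_list = op->next;
}

static int startup_enable(void) { return enable_broken ? -1 : 0; }

static int register_ops(struct ops *op)
{
    add_ops(op);
    if (startup_enable()) {
        /* roll back: do not leave op on the list */
        remove_ops(op);
        return -ENODEV;
    }
    return 0;
}

int main(void)
{
    struct ops op = { .name = "demo" };

    if (register_ops(&op))
        printf("register failed, list %s\n",
               ops_list ? "still has a stale entry!" : "is clean");
    return 0;
}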
Syzkaller reports as follows: [ 1203.506103] BUG: kernel NULL pointer dereference, address: 000000000000010b [ 1203.508039] #PF: supervisor read access in kernel mode [ 1203.508798] #PF: error_code(0x0000) - not-present page [ 1203.509558] PGD 800000011660b067 P4D 800000011660b067 PUD 130fb8067 PMD 0 [ 1203.510560] Oops: 0000 [#1] SMP KASAN PTI [ 1203.511189] CPU: 6 PID: 29532 Comm: syz-executor.2 Tainted: G B W 5.10.0 #8 [ 1203.512324] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 [ 1203.513895] RIP: 0010:is_ftrace_trampoline+0x26/0xb0 [ 1203.514644] Code: ff eb d3 90 41 55 41 54 49 89 fc 55 53 e8 f2 00 fd ff 48 8b 1d 3b 35 5d 03 e8 e6 00 fd ff 48 8d bb 90 00 00 00 e8 2a 81 26 00 <48> 8b ab 90 00 00 00 48 85 ed 74 1d e8 c9 00 fd ff 48 8d bb 98 00 [ 1203.518838] RSP: 0018:ffffc900012cf960 EFLAGS: 00010246 [ 1203.520092] RAX: 0000000000000000 RBX: 000000000000007b RCX: ffffffff8a331866 [ 1203.521469] RDX: 0000000000000000 RSI: 0000000000000008 RDI: 000000000000010b [ 1203.522583] RBP: 0000000000000000 R08: 0000000000000000 R09: ffffffff8df18b07 [ 1203.523550] R10: fffffbfff1be3160 R11: 0000000000000001 R12: 0000000000478399 [ 1203.524596] R13: 0000000000000000 R14: ffff888145088000 R15: 0000000000000008 [ 1203.525634] FS: 00007f429f5f4700(0000) GS:ffff8881daf00000(0000) knlGS:0000000000000000 [ 1203.526801] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 1203.527626] CR2: 000000000000010b CR3: 0000000170e1e001 CR4: 00000000003706e0 [ 1203.528611] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 1203.529605] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Therefore, when ftrace_startup_enable fails, we need to rollback registration process and remove ops from ftrace_ops_list. Link: https://lkml.kernel.org/r/20220818032659.56209-1-yangjihong1@huawei.com Suggested-by: Steven Rostedt Signed-off-by: Yang Jihong Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ftrace.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 601ccf1b2f09..4baa99363b16 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2937,6 +2937,16 @@ int ftrace_startup(struct ftrace_ops *ops, int command) ftrace_startup_enable(command); + /* + * If ftrace is in an undefined state, we just remove ops from list + * to prevent the NULL pointer, instead of totally rolling it back and + * free trampoline, because those actions could cause further damage. + */ + if (unlikely(ftrace_disabled)) { + __unregister_ftrace_function(ops); + return -ENODEV; + } + ops->flags &= ~FTRACE_OPS_FL_ADDING; return 0; -- cgit From 2673c60ee67e71f2ebe34386e62d348f71edee47 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Sat, 20 Aug 2022 09:43:17 -0400 Subject: tracing/eprobes: Do not allow eprobes to use $stack, or % for regs While playing with event probes (eprobes), I tried to see what would happen if I attempted to retrieve the instruction pointer (%rip) knowing that event probes do not use pt_regs. 
The result was: BUG: kernel NULL pointer dereference, address: 0000000000000024 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP PTI CPU: 1 PID: 1847 Comm: trace-cmd Not tainted 5.19.0-rc5-test+ #309 Hardware name: Hewlett-Packard HP Compaq Pro 6300 SFF/339A, BIOS K01 v03.03 07/14/2016 RIP: 0010:get_event_field.isra.0+0x0/0x50 Code: ff 48 c7 c7 c0 8f 74 a1 e8 3d 8b f5 ff e8 88 09 f6 ff 4c 89 e7 e8 50 6a 13 00 48 89 ef 5b 5d 41 5c 41 5d e9 42 6a 13 00 66 90 <48> 63 47 24 8b 57 2c 48 01 c6 8b 47 28 83 f8 02 74 0e 83 f8 04 74 RSP: 0018:ffff916c394bbaf0 EFLAGS: 00010086 RAX: ffff916c854041d8 RBX: ffff916c8d9fbf50 RCX: ffff916c255d2000 RDX: 0000000000000000 RSI: ffff916c255d2008 RDI: 0000000000000000 RBP: 0000000000000000 R08: ffff916c3a2a0c08 R09: ffff916c394bbda8 R10: 0000000000000000 R11: 0000000000000000 R12: ffff916c854041d8 R13: ffff916c854041b0 R14: 0000000000000000 R15: 0000000000000000 FS: 0000000000000000(0000) GS:ffff916c9ea40000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000024 CR3: 000000011b60a002 CR4: 00000000001706e0 Call Trace: get_eprobe_size+0xb4/0x640 ? __mod_node_page_state+0x72/0xc0 __eprobe_trace_func+0x59/0x1a0 ? __mod_lruvec_page_state+0xaa/0x1b0 ? page_remove_file_rmap+0x14/0x230 ? page_remove_rmap+0xda/0x170 event_triggers_call+0x52/0xe0 trace_event_buffer_commit+0x18f/0x240 trace_event_raw_event_sched_wakeup_template+0x7a/0xb0 try_to_wake_up+0x260/0x4c0 __wake_up_common+0x80/0x180 __wake_up_common_lock+0x7c/0xc0 do_notify_parent+0x1c9/0x2a0 exit_notify+0x1a9/0x220 do_exit+0x2ba/0x450 do_group_exit+0x2d/0x90 __x64_sys_exit_group+0x14/0x20 do_syscall_64+0x3b/0x90 entry_SYSCALL_64_after_hwframe+0x46/0xb0 Obviously this is not the desired result. Move the testing for TPARG_FL_TPOINT which is only used for event probes to the top of the "$" variable check, as all the other variables are not used for event probes. Also add a check in the register parsing "%" to fail if an event probe is used. 
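The shape of the fix: dispatch on the event-probe flag before any of the kprobe-only variables are considered, and reject register syntax outright when that flag is set. A simplified userspace sketch with a hypothetical flag and stubbed actions:

#include <stdio.h>
#include <string.h>
#include <errno.h>

#define FL_TPOINT 0x1   /* "this is an event probe" */

static int parse_var(const char *arg, unsigned int flags)
{
    /* Event probes only understand event fields: handle them first. */
    if (flags & FL_TPOINT) {
        printf("fetch event field '%s'\n", arg);
        return 0;
    }
    if (strcmp(arg, "retval") == 0) {
        printf("fetch return value\n");
        return 0;
    }
    return -EINVAL;
}

static int parse_arg(const char *arg, unsigned int flags)
{
    if (arg[0] == '%') {
        /* registers are meaningless without pt_regs */
        if (flags & FL_TPOINT)
            return -EINVAL;
        printf("fetch register %s\n", arg + 1);
        return 0;
    }
    if (arg[0] == '$')
        return parse_var(arg + 1, flags);
    return -EINVAL;
}

int main(void)
{
    printf("%d\n", parse_arg("%ip", FL_TPOINT));       /* rejected */
    printf("%d\n", parse_arg("$next_pid", FL_TPOINT)); /* treated as a field */
    return 0;
}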
Link: https://lkml.kernel.org/r/20220820134400.564426983@goodmis.org Cc: stable@vger.kernel.org Cc: Ingo Molnar Cc: Andrew Morton Cc: Tzvetomir Stoyanov Cc: Tom Zanussi Fixes: 7491e2c44278 ("tracing: Add a probe that attaches to trace events") Acked-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_probe.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 850a88abd33b..dec657af363c 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -283,7 +283,14 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, int ret = 0; int len; - if (strcmp(arg, "retval") == 0) { + if (flags & TPARG_FL_TPOINT) { + if (code->data) + return -EFAULT; + code->data = kstrdup(arg, GFP_KERNEL); + if (!code->data) + return -ENOMEM; + code->op = FETCH_OP_TP_ARG; + } else if (strcmp(arg, "retval") == 0) { if (flags & TPARG_FL_RETURN) { code->op = FETCH_OP_RETVAL; } else { @@ -323,13 +330,6 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, code->op = FETCH_OP_ARG; code->param = (unsigned int)param - 1; #endif - } else if (flags & TPARG_FL_TPOINT) { - if (code->data) - return -EFAULT; - code->data = kstrdup(arg, GFP_KERNEL); - if (!code->data) - return -ENOMEM; - code->op = FETCH_OP_TP_ARG; } else goto inval_var; @@ -384,6 +384,11 @@ parse_probe_arg(char *arg, const struct fetch_type *type, break; case '%': /* named register */ + if (flags & TPARG_FL_TPOINT) { + /* eprobes do not handle registers */ + trace_probe_log_err(offs, BAD_VAR); + break; + } ret = regs_query_register_offset(arg + 1); if (ret >= 0) { code->op = FETCH_OP_REG; -- cgit From 02333de90e5945e2fe7fc75b15b4eb9aee187f0a Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Sat, 20 Aug 2022 09:43:18 -0400 Subject: tracing/eprobes: Do not hardcode $comm as a string The variable $comm is hard coded as a string, which is true for both kprobes and uprobes, but for event probes (eprobes) it is a field name. In most cases the "comm" field would be a string, but there's no guarantee of that fact. Do not assume that comm is a string. Not to mention, it currently forces comm fields to fault, as string processing for event probes is currently broken. Link: https://lkml.kernel.org/r/20220820134400.756152112@goodmis.org Cc: stable@vger.kernel.org Cc: Ingo Molnar Cc: Andrew Morton Cc: Masami Hiramatsu Cc: Tzvetomir Stoyanov Cc: Tom Zanussi Fixes: 7491e2c44278 ("tracing: Add a probe that attaches to trace events") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_probe.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index dec657af363c..4daabbb8b772 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -622,9 +622,10 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, /* * Since $comm and immediate string can not be dereferenced, - * we can find those by strcmp. + * we can find those by strcmp. But ignore for eprobes. */ - if (strcmp(arg, "$comm") == 0 || strncmp(arg, "\\\"", 2) == 0) { + if (!(flags & TPARG_FL_TPOINT) && + (strcmp(arg, "$comm") == 0 || strncmp(arg, "\\\"", 2) == 0)) { /* The type of $comm must be "string", and not an array. 
*/ if (parg->count || (t && strcmp(t, "string"))) goto out; -- cgit From f04dec93466a0481763f3b56cdadf8076e28bfbf Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Sat, 20 Aug 2022 09:43:19 -0400 Subject: tracing/eprobes: Fix reading of string fields Currently when an event probe (eprobe) hooks to a string field, it does not display it as a string, but instead as a number. This makes the field rather useless. Handle the different kinds of strings, dynamic, static, relational/dynamic etc. Now when a string field is used, the ":string" type can be used to display it: echo "e:sw sched/sched_switch comm=$next_comm:string" > dynamic_events Link: https://lkml.kernel.org/r/20220820134400.959640191@goodmis.org Cc: stable@vger.kernel.org Cc: Ingo Molnar Cc: Andrew Morton Cc: Tzvetomir Stoyanov Cc: Tom Zanussi Fixes: 7491e2c44278 ("tracing: Add a probe that attaches to trace events") Acked-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_eprobe.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index 550671985fd1..a1d3423ab74f 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -311,6 +311,27 @@ static unsigned long get_event_field(struct fetch_insn *code, void *rec) addr = rec + field->offset; + if (is_string_field(field)) { + switch (field->filter_type) { + case FILTER_DYN_STRING: + val = (unsigned long)(rec + (*(unsigned int *)addr & 0xffff)); + break; + case FILTER_RDYN_STRING: + val = (unsigned long)(addr + (*(unsigned int *)addr & 0xffff)); + break; + case FILTER_STATIC_STRING: + val = (unsigned long)addr; + break; + case FILTER_PTR_STRING: + val = (unsigned long)(*(char *)addr); + break; + default: + WARN_ON_ONCE(1); + return 0; + } + return val; + } + switch (field->size) { case 1: if (field->is_signed) -- cgit From 6a832ec3d680b3a4f4fad5752672827d71bae501 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Sat, 20 Aug 2022 09:43:20 -0400 Subject: tracing/eprobes: Have event probes be consistent with kprobes and uprobes Currently, if a symbol "@" is attempted to be used with an event probe (eprobes), it will cause a NULL pointer dereference crash. Both kprobes and uprobes can reference data other than the main registers. Such as immediate address, symbols and the current task name. Have eprobes do the same thing. For "comm", if "comm" is used and the event being attached to does not have the "comm" field, then make it the "$comm" that kprobes has. This is consistent to the way histograms and filters work. 
Link: https://lkml.kernel.org/r/20220820134401.136924220@goodmis.org Cc: stable@vger.kernel.org Cc: Ingo Molnar Cc: Andrew Morton Cc: Masami Hiramatsu Cc: Tzvetomir Stoyanov Cc: Tom Zanussi Fixes: 7491e2c44278 ("tracing: Add a probe that attaches to trace events") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_eprobe.c | 70 +++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 64 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index a1d3423ab74f..1783e3478912 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -227,6 +227,7 @@ static int trace_eprobe_tp_arg_update(struct trace_eprobe *ep, int i) struct probe_arg *parg = &ep->tp.args[i]; struct ftrace_event_field *field; struct list_head *head; + int ret = -ENOENT; head = trace_get_fields(ep->event); list_for_each_entry(field, head, link) { @@ -236,9 +237,20 @@ static int trace_eprobe_tp_arg_update(struct trace_eprobe *ep, int i) return 0; } } + + /* + * Argument not found on event. But allow for comm and COMM + * to be used to get the current->comm. + */ + if (strcmp(parg->code->data, "COMM") == 0 || + strcmp(parg->code->data, "comm") == 0) { + parg->code->op = FETCH_OP_COMM; + ret = 0; + } + kfree(parg->code->data); parg->code->data = NULL; - return -ENOENT; + return ret; } static int eprobe_event_define_fields(struct trace_event_call *event_call) @@ -363,16 +375,38 @@ static unsigned long get_event_field(struct fetch_insn *code, void *rec) static int get_eprobe_size(struct trace_probe *tp, void *rec) { + struct fetch_insn *code; struct probe_arg *arg; int i, len, ret = 0; for (i = 0; i < tp->nr_args; i++) { arg = tp->args + i; - if (unlikely(arg->dynamic)) { + if (arg->dynamic) { unsigned long val; - val = get_event_field(arg->code, rec); - len = process_fetch_insn_bottom(arg->code + 1, val, NULL, NULL); + code = arg->code; + retry: + switch (code->op) { + case FETCH_OP_TP_ARG: + val = get_event_field(code, rec); + break; + case FETCH_OP_IMM: + val = code->immediate; + break; + case FETCH_OP_COMM: + val = (unsigned long)current->comm; + break; + case FETCH_OP_DATA: + val = (unsigned long)code->data; + break; + case FETCH_NOP_SYMBOL: /* Ignore a place holder */ + code++; + goto retry; + default: + continue; + } + code++; + len = process_fetch_insn_bottom(code, val, NULL, NULL); if (len > 0) ret += len; } @@ -390,8 +424,28 @@ process_fetch_insn(struct fetch_insn *code, void *rec, void *dest, { unsigned long val; - val = get_event_field(code, rec); - return process_fetch_insn_bottom(code + 1, val, dest, base); + retry: + switch (code->op) { + case FETCH_OP_TP_ARG: + val = get_event_field(code, rec); + break; + case FETCH_OP_IMM: + val = code->immediate; + break; + case FETCH_OP_COMM: + val = (unsigned long)current->comm; + break; + case FETCH_OP_DATA: + val = (unsigned long)code->data; + break; + case FETCH_NOP_SYMBOL: /* Ignore a place holder */ + code++; + goto retry; + default: + return -EILSEQ; + } + code++; + return process_fetch_insn_bottom(code, val, dest, base); } NOKPROBE_SYMBOL(process_fetch_insn) @@ -866,6 +920,10 @@ static int trace_eprobe_tp_update_arg(struct trace_eprobe *ep, const char *argv[ trace_probe_log_err(0, BAD_ATTACH_ARG); } + /* Handle symbols "@" */ + if (!ret) + ret = traceprobe_update_arg(&ep->tp.args[i]); + return ret; } -- cgit From ab8384442ee512fc0fc72deeb036110843d0e7ff Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Sat, 20 Aug 2022 09:43:21 -0400 Subject: 
tracing/probes: Have kprobes and uprobes use $COMM too Both $comm and $COMM can be used to get current->comm in eprobes and the filtering and histogram logic. Make kprobes and uprobes consistent in this regard and allow both $comm and $COMM as well. Currently kprobes and uprobes only handle $comm, which is inconsistent with the other utilities, and can be confusing to users. Link: https://lkml.kernel.org/r/20220820134401.317014913@goodmis.org Link: https://lore.kernel.org/all/20220820220442.776e1ddaf8836e82edb34d01@kernel.org/ Cc: stable@vger.kernel.org Cc: Ingo Molnar Cc: Andrew Morton Cc: Tzvetomir Stoyanov Cc: Tom Zanussi Fixes: 533059281ee5 ("tracing: probeevent: Introduce new argument fetching code") Suggested-by: Masami Hiramatsu (Google) Acked-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_probe.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 4daabbb8b772..36dff277de46 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -314,7 +314,7 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, } } else goto inval_var; - } else if (strcmp(arg, "comm") == 0) { + } else if (strcmp(arg, "comm") == 0 || strcmp(arg, "COMM") == 0) { code->op = FETCH_OP_COMM; #ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API } else if (((flags & TPARG_FL_MASK) == @@ -625,7 +625,8 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, * we can find those by strcmp. But ignore for eprobes. */ if (!(flags & TPARG_FL_TPOINT) && - (strcmp(arg, "$comm") == 0 || strncmp(arg, "\\\"", 2) == 0)) { + (strcmp(arg, "$comm") == 0 || strcmp(arg, "$COMM") == 0 || + strncmp(arg, "\\\"", 2) == 0)) { /* The type of $comm must be "string", and not an array. */ if (parg->count || (t && strcmp(t, "string"))) goto out; -- cgit From b2380577d4fe1c0ef3fa50417f1e441c016e4cbe Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Sat, 20 Aug 2022 09:43:22 -0400 Subject: tracing: Have filter accept "common_cpu" to be consistent Make filtering consistent with histograms. As "cpu" can be a field of an event, allow for "common_cpu" to keep it from being confused with the "cpu" field of the event. 
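
For context, the generic field table this patch extends ends up looking roughly like this (condensed from the trace_events.c hunk below, error handling omitted). Every spelling resolves to the same data, so a filter can say "common_cpu" without colliding with an event's own "cpu" field:

__generic_field(int, CPU, FILTER_CPU);
__generic_field(int, cpu, FILTER_CPU);
__generic_field(int, common_cpu, FILTER_CPU);	/* new alias, matches histograms */
__generic_field(char *, COMM, FILTER_COMM);
__generic_field(char *, comm, FILTER_COMM);
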
Link: https://lkml.kernel.org/r/20220820134401.513062765@goodmis.org Link: https://lore.kernel.org/all/20220820220920.e42fa32b70505b1904f0a0ad@kernel.org/ Cc: stable@vger.kernel.org Cc: Ingo Molnar Cc: Andrew Morton Cc: Tzvetomir Stoyanov Cc: Tom Zanussi Fixes: 1e3bac71c5053 ("tracing/histogram: Rename "cpu" to "common_cpu"") Suggested-by: Masami Hiramatsu (Google) Acked-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 181f08186d32..0356cae0cf74 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -176,6 +176,7 @@ static int trace_define_generic_fields(void) __generic_field(int, CPU, FILTER_CPU); __generic_field(int, cpu, FILTER_CPU); + __generic_field(int, common_cpu, FILTER_CPU); __generic_field(char *, COMM, FILTER_COMM); __generic_field(char *, comm, FILTER_COMM); -- cgit From 123d6455771ec577ce65f8d1bda548fb0eb7ef21 Mon Sep 17 00:00:00 2001 From: Wang Jingjin Date: Mon, 1 Aug 2022 16:47:45 +0800 Subject: ftrace: Fix build warning for ops_references_rec() not used MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The change that made IPMODIFY and DIRECT ops work together needed access to the ops_references_ip() function, which it pulled out of the module only code. But now if both CONFIG_MODULES and CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS is not set, we get the below warning: ‘ops_references_rec’ defined but not used. Since ops_references_rec() only calls ops_references_ip() replace the usage of ops_references_rec() with ops_references_ip() and encompass the function with an #ifdef of DIRECT_CALLS || MODULES being defined. Link: https://lkml.kernel.org/r/20220801084745.1187987-1-wangjingjin1@huawei.com Fixes: 53cd885bc5c3 ("ftrace: Allow IPMODIFY and DIRECT ops on the same function") Signed-off-by: Wang Jingjin Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ftrace.c | 79 +++++++++++++++++++++------------------------------ 1 file changed, 33 insertions(+), 46 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 126c769d36c3..439e2ab6905e 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1861,8 +1861,6 @@ static void ftrace_hash_rec_enable_modify(struct ftrace_ops *ops, ftrace_hash_rec_update_modify(ops, filter_hash, 1); } -static bool ops_references_ip(struct ftrace_ops *ops, unsigned long ip); - /* * Try to update IPMODIFY flag on each ftrace_rec. Return 0 if it is OK * or no-needed to update, -EBUSY if it detects a conflict of the flag @@ -3118,49 +3116,6 @@ static inline int ops_traces_mod(struct ftrace_ops *ops) ftrace_hash_empty(ops->func_hash->notrace_hash); } -/* - * Check if the current ops references the given ip. - * - * If the ops traces all functions, then it was already accounted for. - * If the ops does not trace the current record function, skip it. - * If the ops ignores the function via notrace filter, skip it. 
- */ -static bool -ops_references_ip(struct ftrace_ops *ops, unsigned long ip) -{ - /* If ops isn't enabled, ignore it */ - if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) - return false; - - /* If ops traces all then it includes this function */ - if (ops_traces_mod(ops)) - return true; - - /* The function must be in the filter */ - if (!ftrace_hash_empty(ops->func_hash->filter_hash) && - !__ftrace_lookup_ip(ops->func_hash->filter_hash, ip)) - return false; - - /* If in notrace hash, we ignore it too */ - if (ftrace_lookup_ip(ops->func_hash->notrace_hash, ip)) - return false; - - return true; -} - -/* - * Check if the current ops references the record. - * - * If the ops traces all functions, then it was already accounted for. - * If the ops does not trace the current record function, skip it. - * If the ops ignores the function via notrace filter, skip it. - */ -static bool -ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec) -{ - return ops_references_ip(ops, rec->ip); -} - static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs) { bool init_nop = ftrace_need_init_nop(); @@ -6822,6 +6777,38 @@ static int ftrace_get_trampoline_kallsym(unsigned int symnum, return -ERANGE; } +#if defined(CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS) || defined(CONFIG_MODULES) +/* + * Check if the current ops references the given ip. + * + * If the ops traces all functions, then it was already accounted for. + * If the ops does not trace the current record function, skip it. + * If the ops ignores the function via notrace filter, skip it. + */ +static bool +ops_references_ip(struct ftrace_ops *ops, unsigned long ip) +{ + /* If ops isn't enabled, ignore it */ + if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) + return false; + + /* If ops traces all then it includes this function */ + if (ops_traces_mod(ops)) + return true; + + /* The function must be in the filter */ + if (!ftrace_hash_empty(ops->func_hash->filter_hash) && + !__ftrace_lookup_ip(ops->func_hash->filter_hash, ip)) + return false; + + /* If in notrace hash, we ignore it too */ + if (ftrace_lookup_ip(ops->func_hash->notrace_hash, ip)) + return false; + + return true; +} +#endif + #ifdef CONFIG_MODULES #define next_to_ftrace_page(p) container_of(p, struct ftrace_page, next) @@ -6834,7 +6821,7 @@ static int referenced_filters(struct dyn_ftrace *rec) int cnt = 0; for (ops = ftrace_ops_list; ops != &ftrace_list_end; ops = ops->next) { - if (ops_references_rec(ops, rec)) { + if (ops_references_ip(ops, rec->ip)) { if (WARN_ON_ONCE(ops->flags & FTRACE_OPS_FL_DIRECT)) continue; if (WARN_ON_ONCE(ops->flags & FTRACE_OPS_FL_IPMODIFY)) -- cgit From ad982c3be4e60c7d39c03f782733503cbd88fd2a Mon Sep 17 00:00:00 2001 From: Gaosheng Cui Date: Mon, 22 Aug 2022 10:29:05 +0800 Subject: audit: fix potential double free on error path from fsnotify_add_inode_mark Audit_alloc_mark() assign pathname to audit_mark->path, on error path from fsnotify_add_inode_mark(), fsnotify_put_mark will free memory of audit_mark->path, but the caller of audit_alloc_mark will free the pathname again, so there will be double free problem. Fix this by resetting audit_mark->path to NULL pointer on error path from fsnotify_add_inode_mark(). 
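
The ownership rule behind the fix generalizes well: if a callee's error path drops the last reference to an object that has already taken a caller-supplied buffer, it must clear its pointer to that buffer first, otherwise both sides free it. A minimal self-contained illustration with made-up names (this is the shape of the bug and the fix, not the audit code itself):

#include <stdlib.h>
#include <string.h>

struct mark { char *path; };

/* stub that always fails, standing in for fsnotify_add_inode_mark() */
static int attach_mark(struct mark *m) { (void)m; return -1; }

static void put_mark(struct mark *m)
{
	free(m->path);		/* frees the path only if the mark still owns it */
	free(m);
}

/* ownership of 'path' transfers to the mark only on success */
static struct mark *alloc_mark(char *path)
{
	struct mark *m = malloc(sizeof(*m));

	if (!m)
		return NULL;
	m->path = path;
	if (attach_mark(m) < 0) {
		m->path = NULL;	/* the fix: hand 'path' back to the caller */
		put_mark(m);
		return NULL;
	}
	return m;
}

int main(void)
{
	char *path = strdup("/some/watched/file");
	struct mark *m = alloc_mark(path);

	if (!m)
		free(path);	/* caller frees exactly once, no double free */
	else
		put_mark(m);
	return 0;
}
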
Cc: stable@vger.kernel.org Fixes: 7b1293234084d ("fsnotify: Add group pointer in fsnotify_init_mark()") Signed-off-by: Gaosheng Cui Reviewed-by: Jan Kara Signed-off-by: Paul Moore --- kernel/audit_fsnotify.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c index 6432a37ac1c9..c565fbf66ac8 100644 --- a/kernel/audit_fsnotify.c +++ b/kernel/audit_fsnotify.c @@ -102,6 +102,7 @@ struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pa ret = fsnotify_add_inode_mark(&audit_mark->mark, inode, 0); if (ret < 0) { + audit_mark->path = NULL; fsnotify_put_mark(&audit_mark->mark); audit_mark = ERR_PTR(ret); } -- cgit From 763f4fb76e24959c370cdaa889b2492ba6175580 Mon Sep 17 00:00:00 2001 From: Jing-Ting Wu Date: Tue, 23 Aug 2022 13:41:46 +0800 Subject: cgroup: Fix race condition at rebind_subsystems() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Root cause: The rebind_subsystems() is no lock held when move css object from A list to B list,then let B's head be treated as css node at list_for_each_entry_rcu(). Solution: Add grace period before invalidating the removed rstat_css_node. Reported-by: Jing-Ting Wu Suggested-by: Michal Koutný Signed-off-by: Jing-Ting Wu Tested-by: Jing-Ting Wu Link: https://lore.kernel.org/linux-arm-kernel/d8f0bc5e2fb6ed259f9334c83279b4c011283c41.camel@mediatek.com/T/ Acked-by: Mukesh Ojha Fixes: a7df69b81aac ("cgroup: rstat: support cgroup1") Cc: stable@vger.kernel.org # v5.13+ Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index e1387499b336..e4bb5d57f4d1 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1820,6 +1820,7 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) if (ss->css_rstat_flush) { list_del_rcu(&css->rstat_css_node); + synchronize_rcu(); list_add_rcu(&css->rstat_css_node, &dcgrp->rstat_css_list); } -- cgit From 0947ae1121083d363d522ff7518ee72b55bd8d29 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 23 Aug 2022 14:58:04 -0700 Subject: bpf: Fix a data-race around bpf_jit_limit. While reading bpf_jit_limit, it can be changed concurrently via sysctl, WRITE_ONCE() in __do_proc_doulongvec_minmax(). The size of bpf_jit_limit is long, so we need to add a paired READ_ONCE() to avoid load-tearing. 
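
The pairing this establishes, condensed from the hunk below plus the sysctl side described above (the writer line is paraphrased, not quoted):

/* writer, sysctl handler: */
WRITE_ONCE(bpf_jit_limit, new_limit);

/* reader, charging JIT memory: READ_ONCE() pairs with the WRITE_ONCE()
 * above so the long-sized load cannot be torn or silently reloaded */
if (atomic_long_add_return(size, &bpf_jit_current) > READ_ONCE(bpf_jit_limit)) {
	/* over the limit: back out the charge and fail */
}
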
Fixes: ede95a63b5e8 ("bpf: add bpf_jit_limit knob to restrict unpriv allocations") Signed-off-by: Kuniyuki Iwashima Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220823215804.2177-1-kuniyu@amazon.com --- kernel/bpf/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index c1e10d088dbb..3d9eb3ae334c 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -971,7 +971,7 @@ pure_initcall(bpf_jit_charge_init); int bpf_jit_charge_modmem(u32 size) { - if (atomic_long_add_return(size, &bpf_jit_current) > bpf_jit_limit) { + if (atomic_long_add_return(size, &bpf_jit_current) > READ_ONCE(bpf_jit_limit)) { if (!bpf_capable()) { atomic_long_sub(size, &bpf_jit_current); return -EPERM; -- cgit From 43626dade36fa74d3329046f4ae2d7fdefe401c6 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Thu, 25 Aug 2022 17:38:38 +0900 Subject: cgroup: Add missing cpus_read_lock() to cgroup_attach_task_all() syzbot is hitting percpu_rwsem_assert_held(&cpu_hotplug_lock) warning at cpuset_attach() [1], for commit 4f7e7236435ca0ab ("cgroup: Fix threadgroup_rwsem <-> cpus_read_lock() deadlock") missed that cpuset_attach() is also called from cgroup_attach_task_all(). Add cpus_read_lock() like what cgroup_procs_write_start() does. Link: https://syzkaller.appspot.com/bug?extid=29d3a3b4d86c8136ad9e [1] Reported-by: syzbot Signed-off-by: Tetsuo Handa Fixes: 4f7e7236435ca0ab ("cgroup: Fix threadgroup_rwsem <-> cpus_read_lock() deadlock") Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup-v1.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 2ade21b54dc4..ff6a8099eb2a 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -59,6 +59,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) int retval = 0; mutex_lock(&cgroup_mutex); + cpus_read_lock(); percpu_down_write(&cgroup_threadgroup_rwsem); for_each_root(root) { struct cgroup *from_cgrp; @@ -72,6 +73,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) break; } percpu_up_write(&cgroup_threadgroup_rwsem); + cpus_read_unlock(); mutex_unlock(&cgroup_mutex); return retval; -- cgit From 2fc31465c5373b5ca4edf2e5238558cb62902311 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Tue, 23 Aug 2022 20:52:59 +0200 Subject: bpf: Do mark_chain_precision for ARG_CONST_ALLOC_SIZE_OR_ZERO Precision markers need to be propagated whenever we have an ARG_CONST_* style argument, as the verifier cannot consider imprecise scalars to be equivalent for the purposes of states_equal check when such arguments refine the return value (in this case, set mem_size for PTR_TO_MEM). The resultant mem_size for the R0 is derived from the constant value, and if the verifier incorrectly prunes states considering them equivalent where such arguments exist (by seeing that both registers have reg->precise as false in regsafe), we can end up with invalid programs passing the verifier which can do access beyond what should have been the correct mem_size in that explored state. 
To show a concrete example of the problem: 0000000000000000 : 0: r2 = *(u32 *)(r1 + 80) 1: r1 = *(u32 *)(r1 + 76) 2: r3 = r1 3: r3 += 4 4: if r3 > r2 goto +18 5: w2 = 0 6: *(u32 *)(r1 + 0) = r2 7: r1 = *(u32 *)(r1 + 0) 8: r2 = 1 9: if w1 == 0 goto +1 10: r2 = -1 0000000000000058 : 11: r1 = 0 ll 13: r3 = 0 14: call bpf_ringbuf_reserve 15: if r0 == 0 goto +7 16: r1 = r0 17: r1 += 16777215 18: w2 = 0 19: *(u8 *)(r1 + 0) = r2 20: r1 = r0 21: r2 = 0 22: call bpf_ringbuf_submit 00000000000000b8 : 23: w0 = 0 24: exit For the first case, the single line execution's exploration will prune the search at insn 14 for the branch insn 9's second leg as it will be verified first using r2 = -1 (UINT_MAX), while as w1 at insn 9 will always be 0 so at runtime we don't get error for being greater than UINT_MAX/4 from bpf_ringbuf_reserve. The verifier during regsafe just sees reg->precise as false for both r2 registers in both states, hence considers them equal for purposes of states_equal. If we propagated precise markers using the backtracking support, we would use the precise marking to then ensure that old r2 (UINT_MAX) was within the new r2 (1) and this would never be true, so the verification would rightfully fail. The end result is that the out of bounds access at instruction 19 would be permitted without this fix. Note that reg->precise is always set to true when user does not have CAP_BPF (or when subprog count is greater than 1 (i.e. use of any static or global functions)), hence this is only a problem when precision marks need to be explicitly propagated (i.e. privileged users with CAP_BPF). A simplified test case has been included in the next patch to prevent future regressions. Fixes: 457f44363a88 ("bpf: Implement BPF ring buffer and verifier support for it") Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220823185300.406-2-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 096fdac70165..30c6eebce146 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6066,6 +6066,9 @@ skip_type_check: return -EACCES; } meta->mem_size = reg->var_off.value; + err = mark_chain_precision(env, regno); + if (err) + return err; break; case ARG_PTR_TO_INT: case ARG_PTR_TO_LONG: -- cgit From d4fefa4801a1c2f9c0c7a48fbb0fdf384e89a4ab Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Thu, 25 Aug 2022 15:32:40 -0400 Subject: audit: move audit_return_fixup before the filters The success and return_code are needed by the filters. Move audit_return_fixup() before the filters. This was causing syscall auditing events to be missed. 
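
The resulting order in __audit_syscall_exit(), condensed from the diff below; the exit filters compare against the success and return code stored in the context (for rules such as -F success= and -F exit=), so the fixup has to run before them:

audit_return_fixup(context, success, return_code);	/* moved before the filters */

/* run through both filters to ensure we set the filterkey properly */
audit_filter_syscall(current, context);
audit_filter_inodes(current, context);
if (context->current_state < AUDIT_STATE_RECORD)
	goto out;

audit_log_exit();
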
Link: https://github.com/linux-audit/audit-kernel/issues/138 Cc: stable@vger.kernel.org Fixes: 12c5e81d3fd0 ("audit: prepare audit_context for use in calling contexts beyond syscalls") Signed-off-by: Richard Guy Briggs [PM: manual merge required] Signed-off-by: Paul Moore --- kernel/auditsc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index dd8d9ab747c3..79a5da1bc5bb 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1940,6 +1940,7 @@ void __audit_uring_exit(int success, long code) goto out; } + audit_return_fixup(ctx, success, code); if (ctx->context == AUDIT_CTX_SYSCALL) { /* * NOTE: See the note in __audit_uring_entry() about the case @@ -1981,7 +1982,6 @@ void __audit_uring_exit(int success, long code) audit_filter_inodes(current, ctx); if (ctx->current_state != AUDIT_STATE_RECORD) goto out; - audit_return_fixup(ctx, success, code); audit_log_exit(); out: @@ -2065,13 +2065,13 @@ void __audit_syscall_exit(int success, long return_code) if (!list_empty(&context->killed_trees)) audit_kill_trees(context); + audit_return_fixup(context, success, return_code); /* run through both filters to ensure we set the filterkey properly */ audit_filter_syscall(current, context); audit_filter_inodes(current, context); if (context->current_state < AUDIT_STATE_RECORD) goto out; - audit_return_fixup(context, success, return_code); audit_log_exit(); out: -- cgit From a657182a5c5150cdfacb6640aad1d2712571a409 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 25 Aug 2022 23:26:47 +0200 Subject: bpf: Don't use tnum_range on array range checking for poke descriptors Hsin-Wei reported a KASAN splat triggered by their BPF runtime fuzzer which is based on a customized syzkaller: BUG: KASAN: slab-out-of-bounds in bpf_int_jit_compile+0x1257/0x13f0 Read of size 8 at addr ffff888004e90b58 by task syz-executor.0/1489 CPU: 1 PID: 1489 Comm: syz-executor.0 Not tainted 5.19.0 #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1.1 04/01/2014 Call Trace: dump_stack_lvl+0x9c/0xc9 print_address_description.constprop.0+0x1f/0x1f0 ? bpf_int_jit_compile+0x1257/0x13f0 kasan_report.cold+0xeb/0x197 ? kvmalloc_node+0x170/0x200 ? bpf_int_jit_compile+0x1257/0x13f0 bpf_int_jit_compile+0x1257/0x13f0 ? arch_prepare_bpf_dispatcher+0xd0/0xd0 ? rcu_read_lock_sched_held+0x43/0x70 bpf_prog_select_runtime+0x3e8/0x640 ? bpf_obj_name_cpy+0x149/0x1b0 bpf_prog_load+0x102f/0x2220 ? __bpf_prog_put.constprop.0+0x220/0x220 ? find_held_lock+0x2c/0x110 ? __might_fault+0xd6/0x180 ? lock_downgrade+0x6e0/0x6e0 ? lock_is_held_type+0xa6/0x120 ? __might_fault+0x147/0x180 __sys_bpf+0x137b/0x6070 ? bpf_perf_link_attach+0x530/0x530 ? new_sync_read+0x600/0x600 ? __fget_files+0x255/0x450 ? lock_downgrade+0x6e0/0x6e0 ? fput+0x30/0x1a0 ? ksys_write+0x1a8/0x260 __x64_sys_bpf+0x7a/0xc0 ? syscall_enter_from_user_mode+0x21/0x70 do_syscall_64+0x3b/0x90 entry_SYSCALL_64_after_hwframe+0x63/0xcd RIP: 0033:0x7f917c4e2c2d The problem here is that a range of tnum_range(0, map->max_entries - 1) has limited ability to represent the concrete tight range with the tnum as the set of resulting states from value + mask can result in a superset of the actual intended range, and as such a tnum_in(range, reg->var_off) check may yield true when it shouldn't, for example tnum_range(0, 2) would result in 00XX -> v = 0000, m = 0011 such that the intended set of {0, 1, 2} is here represented by a less precise superset of {0, 1, 2, 3}. 
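
To make the imprecision concrete, here is a small userspace re-implementation of the two tnum helpers involved (simplified from the kernel's tnum code, for illustration only). With max_entries == 3, the old check builds tnum_range(0, 2) == {value 0, mask 3}, which stands for {0, 1, 2, 3}, so a known-constant key of 3 still satisfies tnum_in() even though index 3 is out of bounds:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct tnum { uint64_t value; uint64_t mask; };

/* simplified tnum_range(): the mask covers every bit where min and max differ */
static struct tnum tnum_range(uint64_t min, uint64_t max)
{
	uint64_t chi = min ^ max;
	unsigned int bits = chi ? 64 - __builtin_clzll(chi) : 0;	/* fls64() */
	uint64_t delta = (1ULL << bits) - 1;

	return (struct tnum){ .value = min & ~delta, .mask = delta };
}

/* simplified tnum_in(): is every value of b also a possible value of a? */
static bool tnum_in(struct tnum a, struct tnum b)
{
	if (b.mask & ~a.mask)
		return false;
	b.value &= ~a.mask;
	return a.value == b.value;
}

int main(void)
{
	struct tnum range = tnum_range(0, 2);		/* {value 0, mask 3} = {0,1,2,3} */
	struct tnum key   = { .value = 3, .mask = 0 };	/* known constant 3 */

	/* prints 1: the out-of-bounds key passes the old range check */
	printf("%d\n", tnum_in(range, key));
	return 0;
}
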
As the register is known const scalar, really just use the concrete reg->var_off.value for the upper index check. Fixes: d2e4c1e6c294 ("bpf: Constant map key tracking for prog array pokes") Reported-by: Hsin-Wei Hung Signed-off-by: Daniel Borkmann Cc: Shung-Hsi Yu Acked-by: John Fastabend Link: https://lore.kernel.org/r/984b37f9fdf7ac36831d2137415a4a915744c1b6.1661462653.git.daniel@iogearbox.net Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 30c6eebce146..3eadb14e090b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7033,8 +7033,7 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx]; struct bpf_reg_state *regs = cur_regs(env), *reg; struct bpf_map *map = meta->map_ptr; - struct tnum range; - u64 val; + u64 val, max; int err; if (func_id != BPF_FUNC_tail_call) @@ -7044,10 +7043,11 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, return -EINVAL; } - range = tnum_range(0, map->max_entries - 1); reg = ®s[BPF_REG_3]; + val = reg->var_off.value; + max = map->max_entries; - if (!register_is_const(reg) || !tnum_in(range, reg->var_off)) { + if (!(register_is_const(reg) && val < max)) { bpf_map_key_store(aux, BPF_MAP_KEY_POISON); return 0; } @@ -7055,8 +7055,6 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, err = mark_chain_precision(env, BPF_REG_3); if (err) return err; - - val = reg->var_off.value; if (bpf_map_key_unseen(aux)) bpf_map_key_store(aux, val); else if (!bpf_map_key_poisoned(aux) && -- cgit From 8238b4579866b7c1bb99883cfe102a43db5506ff Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 26 Aug 2022 09:17:08 -0400 Subject: wait_on_bit: add an acquire memory barrier There are several places in the kernel where wait_on_bit is not followed by a memory barrier (for example, in drivers/md/dm-bufio.c:new_read). On architectures with weak memory ordering, it may happen that memory accesses that follow wait_on_bit are reordered before wait_on_bit and they may return invalid data. Fix this class of bugs by introducing a new function "test_bit_acquire" that works like test_bit, but has acquire memory ordering semantics. Signed-off-by: Mikulas Patocka Acked-by: Will Deacon Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds --- kernel/sched/wait_bit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c index d4788f810b55..0b1cd985dc27 100644 --- a/kernel/sched/wait_bit.c +++ b/kernel/sched/wait_bit.c @@ -47,7 +47,7 @@ __wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_ prepare_to_wait(wq_head, &wbq_entry->wq_entry, mode); if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags)) ret = (*action)(&wbq_entry->key, mode); - } while (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret); + } while (test_bit_acquire(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret); finish_wait(wq_head, &wbq_entry->wq_entry); -- cgit From f09bddbd86619bf6213c96142a3b6b6a84818798 Mon Sep 17 00:00:00 2001 From: Stephen Brennan Date: Mon, 8 Aug 2022 13:54:10 -0700 Subject: vmcoreinfo: add kallsyms_num_syms symbol The rest of the kallsyms symbols are useless without knowing the number of symbols in the table. 
In an earlier patch, I somehow dropped the kallsyms_num_syms symbol, so add it back in. Link: https://lkml.kernel.org/r/20220808205410.18590-1-stephen.s.brennan@oracle.com Fixes: 5fd8fea935a1 ("vmcoreinfo: include kallsyms symbols") Signed-off-by: Stephen Brennan Cc: Baoquan He Cc: Dave Young Cc: Vivek Goyal Signed-off-by: Andrew Morton --- kernel/crash_core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 07b26df453a9..a0eb4d5cf557 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -494,6 +494,7 @@ static int __init crash_save_vmcoreinfo_init(void) #ifdef CONFIG_KALLSYMS VMCOREINFO_SYMBOL(kallsyms_names); + VMCOREINFO_SYMBOL(kallsyms_num_syms); VMCOREINFO_SYMBOL(kallsyms_token_table); VMCOREINFO_SYMBOL(kallsyms_token_index); #ifdef CONFIG_KALLSYMS_BASE_RELATIVE -- cgit From c2e406596571659451f4b95e37ddfd5a8ef1d0dc Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 2 Sep 2022 14:31:07 +0200 Subject: sched/debug: fix dentry leak in update_sched_domain_debugfs Kuyo reports that the pattern of using debugfs_remove(debugfs_lookup()) leaks a dentry and with a hotplug stress test, the machine eventually runs out of memory. Fix this up by using the newly created debugfs_lookup_and_remove() call instead which properly handles the dentry reference counting logic. Cc: Major Chen Cc: stable Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Juri Lelli Cc: Vincent Guittot Cc: Dietmar Eggemann Cc: Steven Rostedt Cc: Ben Segall Cc: Mel Gorman Cc: Daniel Bristot de Oliveira Cc: Valentin Schneider Cc: Matthias Brugger Reported-by: Kuyo Chang Tested-by: Kuyo Chang Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220902123107.109274-2-gregkh@linuxfoundation.org Signed-off-by: Greg Kroah-Hartman --- kernel/sched/debug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index bb3d63bdf4ae..667876da8382 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -416,7 +416,7 @@ void update_sched_domain_debugfs(void) char buf[32]; snprintf(buf, sizeof(buf), "cpu%d", cpu); - debugfs_remove(debugfs_lookup(buf, sd_dentry)); + debugfs_lookup_and_remove(buf, sd_dentry); d_cpu = debugfs_create_dir(buf, sd_dentry); i = 0; -- cgit From 85eaeb5058f0f04dffb124c97c86b4f18db0b833 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Wed, 24 Aug 2022 09:10:36 +0300 Subject: IB/core: Fix a nested dead lock as part of ODP flow Fix a nested dead lock as part of ODP flow by using mmput_async(). From the below call trace [1] can see that calling mmput() once we have the umem_odp->umem_mutex locked as required by ib_umem_odp_map_dma_and_lock() might trigger in the same task the exit_mmap()->__mmu_notifier_release()->mlx5_ib_invalidate_range() which may dead lock when trying to lock the same mutex. Moving to use mmput_async() will solve the problem as the above exit_mmap() flow will be called in other task and will be executed once the lock will be available. 
[1] [64843.077665] task:kworker/u133:2 state:D stack: 0 pid:80906 ppid: 2 flags:0x00004000 [64843.077672] Workqueue: mlx5_ib_page_fault mlx5_ib_eqe_pf_action [mlx5_ib] [64843.077719] Call Trace: [64843.077722] [64843.077724] __schedule+0x23d/0x590 [64843.077729] schedule+0x4e/0xb0 [64843.077735] schedule_preempt_disabled+0xe/0x10 [64843.077740] __mutex_lock.constprop.0+0x263/0x490 [64843.077747] __mutex_lock_slowpath+0x13/0x20 [64843.077752] mutex_lock+0x34/0x40 [64843.077758] mlx5_ib_invalidate_range+0x48/0x270 [mlx5_ib] [64843.077808] __mmu_notifier_release+0x1a4/0x200 [64843.077816] exit_mmap+0x1bc/0x200 [64843.077822] ? walk_page_range+0x9c/0x120 [64843.077828] ? __cond_resched+0x1a/0x50 [64843.077833] ? mutex_lock+0x13/0x40 [64843.077839] ? uprobe_clear_state+0xac/0x120 [64843.077860] mmput+0x5f/0x140 [64843.077867] ib_umem_odp_map_dma_and_lock+0x21b/0x580 [ib_core] [64843.077931] pagefault_real_mr+0x9a/0x140 [mlx5_ib] [64843.077962] pagefault_mr+0xb4/0x550 [mlx5_ib] [64843.077992] pagefault_single_data_segment.constprop.0+0x2ac/0x560 [mlx5_ib] [64843.078022] mlx5_ib_eqe_pf_action+0x528/0x780 [mlx5_ib] [64843.078051] process_one_work+0x22b/0x3d0 [64843.078059] worker_thread+0x53/0x410 [64843.078065] ? process_one_work+0x3d0/0x3d0 [64843.078073] kthread+0x12a/0x150 [64843.078079] ? set_kthread_struct+0x50/0x50 [64843.078085] ret_from_fork+0x22/0x30 [64843.078093] Fixes: 36f30e486dce ("IB/core: Improve ODP to use hmm_range_fault()") Reviewed-by: Maor Gottlieb Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/74d93541ea533ef7daec6f126deb1072500aeb16.1661251841.git.leonro@nvidia.com Signed-off-by: Leon Romanovsky --- kernel/fork.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 90c85b17bf69..8a9e92068b15 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1225,6 +1225,7 @@ void mmput_async(struct mm_struct *mm) schedule_work(&mm->async_put_work); } } +EXPORT_SYMBOL_GPL(mmput_async); #endif /** -- cgit From baf2c002402702accc30dcc8f19199f303638306 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Tue, 23 Aug 2022 17:20:29 +0200 Subject: rv/monitors: Make monitor's automata definition static Monitor's automata definition is only used locally, so make them static for all existing monitors. 
Link: https://lore.kernel.org/all/202208210332.gtHXje45-lkp@intel.com Link: https://lore.kernel.org/all/202208210358.6HH3OrVs-lkp@intel.com Link: https://lkml.kernel.org/r/a50e27c3738d6ef809f4201857229fed64799234.1661266564.git.bristot@kernel.org Fixes: ccc319dcb450 ("rv/monitor: Add the wwnr monitor") Fixes: 8812d21219b9 ("rv/monitor: Add the wip monitor skeleton created by dot2k") Reported-by: kernel test robot Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) --- kernel/trace/rv/monitors/wip/wip.h | 2 +- kernel/trace/rv/monitors/wwnr/wwnr.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/rv/monitors/wip/wip.h b/kernel/trace/rv/monitors/wip/wip.h index c1c47e2305ef..dacc37b62a2c 100644 --- a/kernel/trace/rv/monitors/wip/wip.h +++ b/kernel/trace/rv/monitors/wip/wip.h @@ -27,7 +27,7 @@ struct automaton_wip { bool final_states[state_max_wip]; }; -struct automaton_wip automaton_wip = { +static struct automaton_wip automaton_wip = { .state_names = { "preemptive", "non_preemptive" diff --git a/kernel/trace/rv/monitors/wwnr/wwnr.h b/kernel/trace/rv/monitors/wwnr/wwnr.h index d1afe55cdd4c..118e576b91b4 100644 --- a/kernel/trace/rv/monitors/wwnr/wwnr.h +++ b/kernel/trace/rv/monitors/wwnr/wwnr.h @@ -27,7 +27,7 @@ struct automaton_wwnr { bool final_states[state_max_wwnr]; }; -struct automaton_wwnr automaton_wwnr = { +static struct automaton_wwnr automaton_wwnr = { .state_names = { "not_running", "running" -- cgit From 54be5509422ebee333709c92441aa7185ca182fa Mon Sep 17 00:00:00 2001 From: Alison Schofield Date: Mon, 29 Aug 2022 10:10:48 -0700 Subject: tracepoint: Allow trace events in modules with TAINT_TEST Commit 2852ca7fba9f ("panic: Taint kernel if tests are run") introduced a new taint type, TAINT_TEST, to signal that an in-kernel test module has been loaded. TAINT_TEST taint type defaults into a 'bad_taint' list for kernel tracing and blocks the creation of trace events. This causes a problem for CXL testing where loading the cxl_test module makes all CXL modules out-of-tree, blocking any trace events. Trace events are in development for CXL at the moment and this issue was found in test with v6.0-rc1. Link: https://lkml.kernel.org/r/20220829171048.263065-1-alison.schofield@intel.com Fixes: 2852ca7fba9f7 ("panic: Taint kernel if tests are run") Reported-by: Ira Weiny Suggested-by: Dan Williams Tested-by: Ira Weiny Reviewed-by: David Gow Signed-off-by: Alison Schofield Signed-off-by: Steven Rostedt (Google) --- kernel/tracepoint.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 64ea283f2f86..ef42c1a11920 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -571,7 +571,8 @@ static void for_each_tracepoint_range( bool trace_module_has_bad_taint(struct module *mod) { return mod->taints & ~((1 << TAINT_OOT_MODULE) | (1 << TAINT_CRAP) | - (1 << TAINT_UNSIGNED_MODULE)); + (1 << TAINT_UNSIGNED_MODULE) | + (1 << TAINT_TEST)); } static BLOCKING_NOTIFIER_HEAD(tracepoint_notify_list); @@ -647,7 +648,7 @@ static int tracepoint_module_coming(struct module *mod) /* * We skip modules that taint the kernel, especially those with different * module headers (for forced load), to make sure we don't cause a crash. - * Staging, out-of-tree, and unsigned GPL modules are fine. + * Staging, out-of-tree, unsigned GPL, and test modules are fine. 
*/ if (trace_module_has_bad_taint(mod)) return 0; -- cgit From 54c3931957f6a6194d5972eccc36d052964b2abe Mon Sep 17 00:00:00 2001 From: Yipeng Zou Date: Thu, 1 Sep 2022 18:45:14 +0800 Subject: tracing: hold caller_addr to hardirq_{enable,disable}_ip Currently, The arguments passing to lockdep_hardirqs_{on,off} was fixed in CALLER_ADDR0. The function trace_hardirqs_on_caller should have been intended to use caller_addr to represent the address that caller wants to be traced. For example, lockdep log in riscv showing the last {enabled,disabled} at __trace_hardirqs_{on,off} all the time(if called by): [ 57.853175] hardirqs last enabled at (2519): __trace_hardirqs_on+0xc/0x14 [ 57.853848] hardirqs last disabled at (2520): __trace_hardirqs_off+0xc/0x14 After use trace_hardirqs_xx_caller, we can get more effective information: [ 53.781428] hardirqs last enabled at (2595): restore_all+0xe/0x66 [ 53.782185] hardirqs last disabled at (2596): ret_from_exception+0xa/0x10 Link: https://lkml.kernel.org/r/20220901104515.135162-2-zouyipeng@huawei.com Cc: stable@vger.kernel.org Fixes: c3bc8fd637a96 ("tracing: Centralize preemptirq tracepoints and unify their usage") Signed-off-by: Yipeng Zou Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_preemptirq.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c index 95b58bd757ce..1e130da1b742 100644 --- a/kernel/trace/trace_preemptirq.c +++ b/kernel/trace/trace_preemptirq.c @@ -95,14 +95,14 @@ __visible void trace_hardirqs_on_caller(unsigned long caller_addr) } lockdep_hardirqs_on_prepare(); - lockdep_hardirqs_on(CALLER_ADDR0); + lockdep_hardirqs_on(caller_addr); } EXPORT_SYMBOL(trace_hardirqs_on_caller); NOKPROBE_SYMBOL(trace_hardirqs_on_caller); __visible void trace_hardirqs_off_caller(unsigned long caller_addr) { - lockdep_hardirqs_off(CALLER_ADDR0); + lockdep_hardirqs_off(caller_addr); if (!this_cpu_read(tracing_irq_cpu)) { this_cpu_write(tracing_irq_cpu, 1); -- cgit From cecf8e128ec69149fe53c9a7bafa505a4bee25d9 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Sun, 4 Sep 2022 13:12:29 +0900 Subject: tracing: Fix to check event_mutex is held while accessing trigger list Since the check_user_trigger() is called outside of RCU read lock, this list_for_each_entry_rcu() caused a suspicious RCU usage warning. # echo hist:keys=pid > events/sched/sched_stat_runtime/trigger # cat events/sched/sched_stat_runtime/trigger [ 43.167032] [ 43.167418] ============================= [ 43.167992] WARNING: suspicious RCU usage [ 43.168567] 5.19.0-rc5-00029-g19ebe4651abf #59 Not tainted [ 43.169283] ----------------------------- [ 43.169863] kernel/trace/trace_events_trigger.c:145 RCU-list traversed in non-reader section!! ... However, this file->triggers list is safe when it is accessed under event_mutex is held. To fix this warning, adds a lockdep_is_held check to the list_for_each_entry_rcu(). 
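
The fix relies on the optional fourth argument of list_for_each_entry_rcu(), which names the non-RCU condition that makes the traversal safe and silences the false lockdep splat. A generic sketch of the pattern (names made up, not the trigger code):

#include <linux/list.h>
#include <linux/lockdep.h>
#include <linux/mutex.h>
#include <linux/rculist.h>

struct item {
	struct list_head node;
	bool flagged;
};

static LIST_HEAD(items);
static DEFINE_MUTEX(items_mutex);	/* update-side lock for 'items' */

/* caller holds items_mutex instead of rcu_read_lock(); the extra condition
 * tells RCU-lockdep why the lockless-looking walk is still safe */
static bool any_flagged(void)
{
	struct item *it;

	list_for_each_entry_rcu(it, &items, node,
				lockdep_is_held(&items_mutex)) {
		if (it->flagged)
			return true;
	}
	return false;
}
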
Link: https://lkml.kernel.org/r/166226474977.223837.1992182913048377113.stgit@devnote2 Cc: stable@vger.kernel.org Fixes: 7491e2c44278 ("tracing: Add a probe that attaches to trace events") Signed-off-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_trigger.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index cb866c3141af..918730d74932 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -142,7 +142,8 @@ static bool check_user_trigger(struct trace_event_file *file) { struct event_trigger_data *data; - list_for_each_entry_rcu(data, &file->triggers, list) { + list_for_each_entry_rcu(data, &file->triggers, list, + lockdep_is_held(&event_mutex)) { if (data->flags & EVENT_TRIGGER_FL_PROBE) continue; return true; -- cgit From 93d71986a60c37395e0a1d2f7fe170ef1e0a8ba4 Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Tue, 6 Sep 2022 22:12:10 +0800 Subject: rv/reactor: add __init/__exit annotations to module init/exit funcs Add missing __init/__exit annotations to module init/exit funcs. Link: https://lkml.kernel.org/r/20220906141210.132607-1-xiujianfeng@huawei.com Fixes: 135b881ea885 ("rv/reactor: Add the printk reactor") Fixes: e88043c0ac16 ("rv/reactor: Add the panic reactor") Signed-off-by: Xiu Jianfeng Acked-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) --- kernel/trace/rv/reactor_panic.c | 4 ++-- kernel/trace/rv/reactor_printk.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/rv/reactor_panic.c b/kernel/trace/rv/reactor_panic.c index b698d05dd069..d65f6c25a87c 100644 --- a/kernel/trace/rv/reactor_panic.c +++ b/kernel/trace/rv/reactor_panic.c @@ -24,13 +24,13 @@ static struct rv_reactor rv_panic = { .react = rv_panic_reaction }; -static int register_react_panic(void) +static int __init register_react_panic(void) { rv_register_reactor(&rv_panic); return 0; } -static void unregister_react_panic(void) +static void __exit unregister_react_panic(void) { rv_unregister_reactor(&rv_panic); } diff --git a/kernel/trace/rv/reactor_printk.c b/kernel/trace/rv/reactor_printk.c index 31899f953af4..4b6b7106a477 100644 --- a/kernel/trace/rv/reactor_printk.c +++ b/kernel/trace/rv/reactor_printk.c @@ -23,13 +23,13 @@ static struct rv_reactor rv_printk = { .react = rv_printk_reaction }; -static int register_react_printk(void) +static int __init register_react_printk(void) { rv_register_reactor(&rv_printk); return 0; } -static void unregister_react_printk(void) +static void __exit unregister_react_printk(void) { rv_unregister_reactor(&rv_printk); } -- cgit From 81c12e922b97b94f64e84aa9bfe5a9d1fca2dc25 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Wed, 31 Aug 2022 00:38:18 -0600 Subject: Revert "swiotlb: panic if nslabs is too small" This reverts commit 0bf28fc40d89b1a3e00d1b79473bad4e9ca20ad1. Reasons: 1. new panic()s shouldn't be added [1]. 2. It does no "cleanup" but breaks MIPS [2]. 
v2: properly solved the conflict [3] with commit 20347fca71a38 ("swiotlb: split up the global swiotlb lock") Reported-by: kernel test robot Reported-by: Dan Carpenter [1] https://lore.kernel.org/r/CAHk-=wit-DmhMfQErY29JSPjFgebx_Ld+pnerc4J2Ag990WwAA@mail.gmail.com/ [2] https://lore.kernel.org/r/20220820012031.1285979-1-yuzhao@google.com/ [3] https://lore.kernel.org/r/202208310701.LKr1WDCh-lkp@intel.com/ Fixes: 0bf28fc40d89b ("swiotlb: panic if nslabs is too small") Signed-off-by: Yu Zhao Signed-off-by: Christoph Hellwig --- kernel/dma/swiotlb.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index c5a9190b218f..dd8863987e0c 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -326,9 +326,6 @@ void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags, swiotlb_adjust_nareas(num_possible_cpus()); nslabs = default_nslabs; - if (nslabs < IO_TLB_MIN_SLABS) - panic("%s: nslabs = %lu too small\n", __func__, nslabs); - /* * By default allocate the bounce buffer memory from low memory, but * allow to pick a location everywhere for hypervisors with guest @@ -341,8 +338,7 @@ retry: else tlb = memblock_alloc_low(bytes, PAGE_SIZE); if (!tlb) { - pr_warn("%s: Failed to allocate %zu bytes tlb structure\n", - __func__, bytes); + pr_warn("%s: failed to allocate tlb structure\n", __func__); return; } -- cgit From 2995b8002cefc7b0b00b8f9a0ce36601a8c390c0 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 15 Aug 2022 20:28:40 +0100 Subject: dma-debug: improve search for partial syncs When bucket_find_contains() tries to find the original entry for a partial sync, it manages to constrain its search in a way that is both too restrictive and not restrictive enough. A driver which only uses single mappings rather than scatterlists might not set max_seg_size, but could still technically perform a partial sync at an offset of more than 64KB into a sufficiently large mapping, so we could stop searching too early before reaching a legitimate entry. Conversely, if no valid entry is present and max_range is large enough, we can pointlessly search buckets that we've already searched, or that represent an impossible wrapping around the bottom of the address space. At worst, the (legitimate) case of max_seg_size == UINT_MAX can make the loop infinite. Replace the fragile and frankly hard-to-follow "range" logic with a simple counted loop for the number of possible hash buckets below the given address. 
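
The shape of the replacement loop, condensed from the hunk below: at most (dev_addr >> HASH_FN_SHIFT) + 1 buckets, capped at HASH_SIZE, can sit at or below the address, so the walk is strictly bounded and cannot wrap past zero even when max_seg_size is UINT_MAX:

int limit = min(HASH_SIZE, (index.dev_addr >> HASH_FN_SHIFT) + 1);

for (int i = 0; i < limit; i++) {
	entry = __hash_bucket_find(*bucket, ref, containing_match);
	if (entry)
		return entry;

	/* nothing found: drop this bucket and step down to the previous one */
	put_hash_bucket(*bucket, *flags);
	index.dev_addr -= (1 << HASH_FN_SHIFT);
	*bucket = get_hash_bucket(&index, flags);
}
return NULL;	/* no containing mapping recorded */
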
Reported-by: Yunfei Wang Signed-off-by: Robin Murphy Signed-off-by: Christoph Hellwig --- kernel/dma/debug.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index 2caafd13f8aa..18c93c2276ca 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -350,11 +350,10 @@ static struct dma_debug_entry *bucket_find_contain(struct hash_bucket **bucket, unsigned long *flags) { - unsigned int max_range = dma_get_max_seg_size(ref->dev); struct dma_debug_entry *entry, index = *ref; - unsigned int range = 0; + int limit = min(HASH_SIZE, (index.dev_addr >> HASH_FN_SHIFT) + 1); - while (range <= max_range) { + for (int i = 0; i < limit; i++) { entry = __hash_bucket_find(*bucket, ref, containing_match); if (entry) @@ -364,7 +363,6 @@ static struct dma_debug_entry *bucket_find_contain(struct hash_bucket **bucket, * Nothing found, go back a hash bucket */ put_hash_bucket(*bucket, *flags); - range += (1 << HASH_FN_SHIFT); index.dev_addr -= (1 << HASH_FN_SHIFT); *bucket = get_hash_bucket(&index, flags); } -- cgit From 3f0461613ebcdc8c4073e235053d06d5aa58750f Mon Sep 17 00:00:00 2001 From: Chao Gao Date: Fri, 19 Aug 2022 16:45:37 +0800 Subject: swiotlb: avoid potential left shift overflow The second operand passed to slot_addr() is declared as int or unsigned int in all call sites. The left-shift to get the offset of a slot can overflow if swiotlb size is larger than 4G. Convert the macro to an inline function and declare the second argument as phys_addr_t to avoid the potential overflow. Fixes: 26a7e094783d ("swiotlb: refactor swiotlb_tbl_map_single") Signed-off-by: Chao Gao Reviewed-by: Dongli Zhang Signed-off-by: Christoph Hellwig --- kernel/dma/swiotlb.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index dd8863987e0c..1ce8977d911c 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -575,7 +575,10 @@ static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size } } -#define slot_addr(start, idx) ((start) + ((idx) << IO_TLB_SHIFT)) +static inline phys_addr_t slot_addr(phys_addr_t start, phys_addr_t idx) +{ + return start + (idx << IO_TLB_SHIFT); +} /* * Carefully handle integer overflow which can occur when boundary_mask == ~0UL. -- cgit From 43b919017fe755ccd2b19afb2dc2d3da8f840038 Mon Sep 17 00:00:00 2001 From: Chao Gao Date: Fri, 26 Aug 2022 17:50:46 +0800 Subject: swiotlb: fix a typo "overwirte" isn't a word. It should be "overwrite". Signed-off-by: Chao Gao Signed-off-by: Christoph Hellwig --- kernel/dma/swiotlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 1ce8977d911c..0ef6b12f961d 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -764,7 +764,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, /* * When dir == DMA_FROM_DEVICE we could omit the copy from the orig * to the tlb buffer, if we knew for sure the device will - * overwirte the entire current content. But we don't. Thus + * overwrite the entire current content. But we don't. Thus * unconditional bounce may prevent leaking swiotlb content (i.e. * kernel memory) to user-space. 
*/ -- cgit From 9fc18f6d56d5b79d527c17a8100a0965d18345cf Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 21 Aug 2022 16:06:44 +0200 Subject: dma-mapping: mark dma_supported static Now that the remaining users in drivers are gone, this function can be marked static. Signed-off-by: Christoph Hellwig --- kernel/dma/mapping.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 49cbf3e33de7..27f272381cf2 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -707,7 +707,7 @@ int dma_mmap_noncontiguous(struct device *dev, struct vm_area_struct *vma, } EXPORT_SYMBOL_GPL(dma_mmap_noncontiguous); -int dma_supported(struct device *dev, u64 mask) +static int dma_supported(struct device *dev, u64 mask) { const struct dma_map_ops *ops = get_dma_ops(dev); @@ -721,7 +721,6 @@ int dma_supported(struct device *dev, u64 mask) return 1; return ops->dma_supported(dev, mask); } -EXPORT_SYMBOL(dma_supported); bool dma_pci_p2pdma_supported(struct device *dev) { -- cgit From 1efda38d6f9ba26ac88b359c6277f1172db03f1e Mon Sep 17 00:00:00 2001 From: "Christian A. Ehrhardt" Date: Wed, 7 Sep 2022 22:09:17 +0200 Subject: kprobes: Prohibit probes in gate area The system call gate area counts as kernel text but trying to install a kprobe in this area fails with an Oops later on. To fix this explicitly disallow the gate area for kprobes. Found by syzkaller with the following reproducer: perf_event_open$cgroup(&(0x7f00000001c0)={0x6, 0x80, 0x0, 0x0, 0x0, 0x0, 0x80ffff, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, @perf_config_ext={0x0, 0xffffffffff600000}}, 0xffffffffffffffff, 0x0, 0xffffffffffffffff, 0x0) Sample report: BUG: unable to handle page fault for address: fffffbfff3ac6000 PGD 6dfcb067 P4D 6dfcb067 PUD 6df8f067 PMD 6de4d067 PTE 0 Oops: 0000 [#1] PREEMPT SMP KASAN NOPTI CPU: 0 PID: 21978 Comm: syz-executor.2 Not tainted 6.0.0-rc3-00363-g7726d4c3e60b-dirty #6 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014 RIP: 0010:__insn_get_emulate_prefix arch/x86/lib/insn.c:91 [inline] RIP: 0010:insn_get_emulate_prefix arch/x86/lib/insn.c:106 [inline] RIP: 0010:insn_get_prefixes.part.0+0xa8/0x1110 arch/x86/lib/insn.c:134 Code: 49 be 00 00 00 00 00 fc ff df 48 8b 40 60 48 89 44 24 08 e9 81 00 00 00 e8 e5 4b 39 ff 4c 89 fa 4c 89 f9 48 c1 ea 03 83 e1 07 <42> 0f b6 14 32 38 ca 7f 08 84 d2 0f 85 06 10 00 00 48 89 d8 48 89 RSP: 0018:ffffc900088bf860 EFLAGS: 00010246 RAX: 0000000000040000 RBX: ffffffff9b9bebc0 RCX: 0000000000000000 RDX: 1ffffffff3ac6000 RSI: ffffc90002d82000 RDI: ffffc900088bf9e8 RBP: ffffffff9d630001 R08: 0000000000000000 R09: ffffc900088bf9e8 R10: 0000000000000000 R11: 0000000000000001 R12: 0000000000000001 R13: ffffffff9d630000 R14: dffffc0000000000 R15: ffffffff9d630000 FS: 00007f63eef63640(0000) GS:ffff88806d000000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: fffffbfff3ac6000 CR3: 0000000029d90005 CR4: 0000000000770ef0 PKRU: 55555554 Call Trace: insn_get_prefixes arch/x86/lib/insn.c:131 [inline] insn_get_opcode arch/x86/lib/insn.c:272 [inline] insn_get_modrm+0x64a/0x7b0 arch/x86/lib/insn.c:343 insn_get_sib+0x29a/0x330 arch/x86/lib/insn.c:421 insn_get_displacement+0x350/0x6b0 arch/x86/lib/insn.c:464 insn_get_immediate arch/x86/lib/insn.c:632 [inline] insn_get_length 
arch/x86/lib/insn.c:707 [inline] insn_decode+0x43a/0x490 arch/x86/lib/insn.c:747 can_probe+0xfc/0x1d0 arch/x86/kernel/kprobes/core.c:282 arch_prepare_kprobe+0x79/0x1c0 arch/x86/kernel/kprobes/core.c:739 prepare_kprobe kernel/kprobes.c:1160 [inline] register_kprobe kernel/kprobes.c:1641 [inline] register_kprobe+0xb6e/0x1690 kernel/kprobes.c:1603 __register_trace_kprobe kernel/trace/trace_kprobe.c:509 [inline] __register_trace_kprobe+0x26a/0x2d0 kernel/trace/trace_kprobe.c:477 create_local_trace_kprobe+0x1f7/0x350 kernel/trace/trace_kprobe.c:1833 perf_kprobe_init+0x18c/0x280 kernel/trace/trace_event_perf.c:271 perf_kprobe_event_init+0xf8/0x1c0 kernel/events/core.c:9888 perf_try_init_event+0x12d/0x570 kernel/events/core.c:11261 perf_init_event kernel/events/core.c:11325 [inline] perf_event_alloc.part.0+0xf7f/0x36a0 kernel/events/core.c:11619 perf_event_alloc kernel/events/core.c:12059 [inline] __do_sys_perf_event_open+0x4a8/0x2a00 kernel/events/core.c:12157 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x38/0x90 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd RIP: 0033:0x7f63ef7efaed Code: 02 b8 ff ff ff ff c3 66 0f 1f 44 00 00 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b0 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007f63eef63028 EFLAGS: 00000246 ORIG_RAX: 000000000000012a RAX: ffffffffffffffda RBX: 00007f63ef90ff80 RCX: 00007f63ef7efaed RDX: 0000000000000000 RSI: ffffffffffffffff RDI: 00000000200001c0 RBP: 00007f63ef86019c R08: 0000000000000000 R09: 0000000000000000 R10: ffffffffffffffff R11: 0000000000000246 R12: 0000000000000000 R13: 0000000000000002 R14: 00007f63ef90ff80 R15: 00007f63eef43000 Modules linked in: CR2: fffffbfff3ac6000 ---[ end trace 0000000000000000 ]--- RIP: 0010:__insn_get_emulate_prefix arch/x86/lib/insn.c:91 [inline] RIP: 0010:insn_get_emulate_prefix arch/x86/lib/insn.c:106 [inline] RIP: 0010:insn_get_prefixes.part.0+0xa8/0x1110 arch/x86/lib/insn.c:134 Code: 49 be 00 00 00 00 00 fc ff df 48 8b 40 60 48 89 44 24 08 e9 81 00 00 00 e8 e5 4b 39 ff 4c 89 fa 4c 89 f9 48 c1 ea 03 83 e1 07 <42> 0f b6 14 32 38 ca 7f 08 84 d2 0f 85 06 10 00 00 48 89 d8 48 89 RSP: 0018:ffffc900088bf860 EFLAGS: 00010246 RAX: 0000000000040000 RBX: ffffffff9b9bebc0 RCX: 0000000000000000 RDX: 1ffffffff3ac6000 RSI: ffffc90002d82000 RDI: ffffc900088bf9e8 RBP: ffffffff9d630001 R08: 0000000000000000 R09: ffffc900088bf9e8 R10: 0000000000000000 R11: 0000000000000001 R12: 0000000000000001 R13: ffffffff9d630000 R14: dffffc0000000000 R15: ffffffff9d630000 FS: 00007f63eef63640(0000) GS:ffff88806d000000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: fffffbfff3ac6000 CR3: 0000000029d90005 CR4: 0000000000770ef0 PKRU: 55555554 ================================================================== Link: https://lkml.kernel.org/r/20220907200917.654103-1-lk@c--e.de cc: "Naveen N. Rao" cc: Anil S Keshavamurthy cc: "David S. Miller" Cc: stable@vger.kernel.org Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Acked-by: Masami Hiramatsu (Google) Signed-off-by: Christian A. 
Ehrhardt Signed-off-by: Steven Rostedt (Google) --- kernel/kprobes.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 08350e35aba2..ca9d834d0b84 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1562,6 +1562,7 @@ static int check_kprobe_address_safe(struct kprobe *p, /* Ensure it is not in reserved area nor out of text */ if (!(core_kernel_text((unsigned long) p->addr) || is_module_text_address((unsigned long) p->addr)) || + in_gate_area_no_mm((unsigned long) p->addr) || within_kprobe_blacklist((unsigned long) p->addr) || jump_label_text_reserved(p->addr, p->addr) || static_call_text_reserved(p->addr, p->addr) || -- cgit
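
To close, a hypothetical test-module fragment showing the effect of the check (this is not part of the patch; the address is the x86-64 vsyscall gate page taken from the reproducer above, so the example is x86-64 specific):

#include <linux/init.h>
#include <linux/kprobes.h>
#include <linux/module.h>

static struct kprobe kp = {
	/* x86-64 vsyscall gate page, as used by the syzkaller reproducer */
	.addr = (kprobe_opcode_t *)0xffffffffff600000UL,
};

static int __init gate_kprobe_init(void)
{
	int ret = register_kprobe(&kp);

	/* with the patch applied, check_kprobe_address_safe() rejects the
	 * gate area and this prints -22 (-EINVAL) instead of oopsing later */
	pr_info("register_kprobe on gate area: %d\n", ret);
	if (!ret)
		unregister_kprobe(&kp);	/* should not happen */
	return 0;
}

static void __exit gate_kprobe_exit(void)
{
}

module_init(gate_kprobe_init);
module_exit(gate_kprobe_exit);
MODULE_LICENSE("GPL");
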