diff options
Diffstat (limited to 'kernel')
255 files changed, 14604 insertions, 8235 deletions
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz index 38ef6d06888e..ce1435cb08b1 100644 --- a/kernel/Kconfig.hz +++ b/kernel/Kconfig.hz @@ -30,7 +30,7 @@ choice 250 Hz is a good compromise choice allowing server performance while also showing good interactive responsiveness even on SMP and NUMA systems. If you are going to be using NTSC video - or multimedia, selected 300Hz instead. + or multimedia, select 300Hz instead. config HZ_300 bool "300 HZ" diff --git a/kernel/audit.c b/kernel/audit.c index 5f5bf85bcc90..61b5744d0bb6 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1956,8 +1956,8 @@ static inline int audit_expand(struct audit_buffer *ab, int extra) * will be called a second time. Currently, we assume that a printk * can't format message larger than 1024 bytes, so we don't either. */ -static void audit_log_vformat(struct audit_buffer *ab, const char *fmt, - va_list args) +static __printf(2, 0) +void audit_log_vformat(struct audit_buffer *ab, const char *fmt, va_list args) { int len, avail; struct sk_buff *skb; @@ -2285,7 +2285,7 @@ void audit_log_path_denied(int type, const char *operation) { struct audit_buffer *ab; - if (!audit_enabled || audit_dummy_context()) + if (!audit_enabled) return; /* Generate log with subject, operation, outcome. */ diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 367eaf2c78b7..0ebbbe37a60f 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -347,12 +347,17 @@ static void audit_remove_parent_watches(struct audit_parent *parent) /* Get path information necessary for adding watches. 
*/ static int audit_get_nd(struct audit_watch *watch, struct path *parent) { - struct dentry *d = kern_path_locked(watch->path, parent); + struct dentry *d; + + d = kern_path_locked_negative(watch->path, parent); if (IS_ERR(d)) return PTR_ERR(d); - /* update watch filter fields */ - watch->dev = d->d_sb->s_dev; - watch->ino = d_backing_inode(d)->i_ino; + + if (d_is_positive(d)) { + /* update watch filter fields */ + watch->dev = d->d_sb->s_dev; + watch->ino = d_backing_inode(d)->i_ino; + } inode_unlock(d_backing_inode(parent->dentry)); dput(d); @@ -418,11 +423,10 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list) /* caller expects mutex locked */ mutex_lock(&audit_filter_mutex); - if (ret && ret != -ENOENT) { + if (ret) { audit_put_watch(watch); return ret; } - ret = 0; /* either find an old parent or attach a new one */ parent = audit_find_parent(d_backing_inode(parent_path.dentry)); diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 410028633621..3a335c50e6e3 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -14,7 +14,7 @@ obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o mprog.o obj-$(CONFIG_BPF_JIT) += trampoline.o -obj-$(CONFIG_BPF_SYSCALL) += btf.o memalloc.o +obj-$(CONFIG_BPF_SYSCALL) += btf.o memalloc.o rqspinlock.o ifeq ($(CONFIG_MMU)$(CONFIG_64BIT),yy) obj-$(CONFIG_BPF_SYSCALL) += arena.o range_tree.o endif @@ -53,6 +53,9 @@ obj-$(CONFIG_BPF_SYSCALL) += relo_core.o obj-$(CONFIG_BPF_SYSCALL) += btf_iter.o obj-$(CONFIG_BPF_SYSCALL) += btf_relocate.o obj-$(CONFIG_BPF_SYSCALL) += kmem_cache_iter.o +ifeq ($(CONFIG_DMA_SHARED_BUFFER),y) +obj-$(CONFIG_BPF_SYSCALL) += dmabuf_iter.o +endif CFLAGS_REMOVE_percpu_freelist.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_bpf_lru_list.o = $(CC_FLAGS_FTRACE) diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 095a9554e1de..0d56cea71602 100644 --- a/kernel/bpf/arena.c +++ 
b/kernel/bpf/arena.c @@ -287,7 +287,7 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf) return VM_FAULT_SIGSEGV; /* Account into memcg of the process that created bpf_arena */ - ret = bpf_map_alloc_pages(map, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE, 1, &page); + ret = bpf_map_alloc_pages(map, NUMA_NO_NODE, 1, &page); if (ret) { range_tree_set(&arena->rt, vmf->pgoff, 1); return VM_FAULT_SIGSEGV; @@ -465,8 +465,7 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt if (ret) goto out_free_pages; - ret = bpf_map_alloc_pages(&arena->map, GFP_KERNEL | __GFP_ZERO, - node_id, page_cnt, pages); + ret = bpf_map_alloc_pages(&arena->map, node_id, page_cnt, pages); if (ret) goto out; @@ -577,8 +576,8 @@ __bpf_kfunc void bpf_arena_free_pages(void *p__map, void *ptr__ign, u32 page_cnt __bpf_kfunc_end_defs(); BTF_KFUNCS_START(arena_kfuncs) -BTF_ID_FLAGS(func, bpf_arena_alloc_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE) -BTF_ID_FLAGS(func, bpf_arena_free_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_arena_alloc_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE | KF_ARENA_RET | KF_ARENA_ARG2) +BTF_ID_FLAGS(func, bpf_arena_free_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE | KF_ARENA_ARG2) BTF_KFUNCS_END(arena_kfuncs) static const struct btf_kfunc_id_set common_kfunc_set = { diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c index 54ff2a85d4c0..148da8f7ff36 100644 --- a/kernel/bpf/bpf_cgrp_storage.c +++ b/kernel/bpf/bpf_cgrp_storage.c @@ -161,6 +161,7 @@ BPF_CALL_5(bpf_cgrp_storage_get, struct bpf_map *, map, struct cgroup *, cgroup, void *, value, u64, flags, gfp_t, gfp_flags) { struct bpf_local_storage_data *sdata; + bool nobusy; WARN_ON_ONCE(!bpf_rcu_lock_held()); if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE)) @@ -169,21 +170,21 @@ BPF_CALL_5(bpf_cgrp_storage_get, struct bpf_map *, map, struct cgroup *, cgroup, if (!cgroup) return (unsigned long)NULL; - if (!bpf_cgrp_storage_trylock()) - return (unsigned long)NULL; + nobusy = 
bpf_cgrp_storage_trylock(); - sdata = cgroup_storage_lookup(cgroup, map, true); + sdata = cgroup_storage_lookup(cgroup, map, nobusy); if (sdata) goto unlock; /* only allocate new storage, when the cgroup is refcounted */ if (!percpu_ref_is_dying(&cgroup->self.refcnt) && - (flags & BPF_LOCAL_STORAGE_GET_F_CREATE)) + (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) && nobusy) sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map, value, BPF_NOEXIST, false, gfp_flags); unlock: - bpf_cgrp_storage_unlock(); + if (nobusy) + bpf_cgrp_storage_unlock(); return IS_ERR_OR_NULL(sdata) ? (unsigned long)NULL : (unsigned long)sdata->data; } diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 106735145948..380e9a7cac75 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -335,7 +335,7 @@ static void cache_btf_id(struct bpf_iter_target_info *tinfo, tinfo->btf_id = prog->aux->attach_btf_id; } -bool bpf_iter_prog_supported(struct bpf_prog *prog) +int bpf_iter_prog_supported(struct bpf_prog *prog) { const char *attach_fname = prog->aux->attach_func_name; struct bpf_iter_target_info *tinfo = NULL, *iter; @@ -344,7 +344,7 @@ bool bpf_iter_prog_supported(struct bpf_prog *prog) int prefix_len = strlen(prefix); if (strncmp(attach_fname, prefix, prefix_len)) - return false; + return -EINVAL; mutex_lock(&targets_mutex); list_for_each_entry(iter, &targets, list) { @@ -360,12 +360,11 @@ bool bpf_iter_prog_supported(struct bpf_prog *prog) } mutex_unlock(&targets_mutex); - if (tinfo) { - prog->aux->ctx_arg_info_size = tinfo->reg_info->ctx_arg_info_size; - prog->aux->ctx_arg_info = tinfo->reg_info->ctx_arg_info; - } + if (!tinfo) + return -EINVAL; - return tinfo != NULL; + return bpf_prog_ctx_arg_info_init(prog, tinfo->reg_info->ctx_arg_info, + tinfo->reg_info->ctx_arg_info_size); } const struct bpf_func_proto * diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 967492b65185..0a59df1c550a 100644 --- a/kernel/bpf/bpf_lsm.c +++ 
b/kernel/bpf/bpf_lsm.c @@ -316,7 +316,9 @@ BTF_ID(func, bpf_lsm_inode_getxattr) BTF_ID(func, bpf_lsm_inode_mknod) BTF_ID(func, bpf_lsm_inode_need_killpriv) BTF_ID(func, bpf_lsm_inode_post_setxattr) +BTF_ID(func, bpf_lsm_inode_post_removexattr) BTF_ID(func, bpf_lsm_inode_readlink) +BTF_ID(func, bpf_lsm_inode_removexattr) BTF_ID(func, bpf_lsm_inode_rename) BTF_ID(func, bpf_lsm_inode_rmdir) BTF_ID(func, bpf_lsm_inode_setattr) diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 040fb1cd840b..96113633e391 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -146,39 +146,7 @@ void bpf_struct_ops_image_free(void *image) } #define MAYBE_NULL_SUFFIX "__nullable" -#define MAX_STUB_NAME 128 - -/* Return the type info of a stub function, if it exists. - * - * The name of a stub function is made up of the name of the struct_ops and - * the name of the function pointer member, separated by "__". For example, - * if the struct_ops type is named "foo_ops" and the function pointer - * member is named "bar", the stub function name would be "foo_ops__bar". - */ -static const struct btf_type * -find_stub_func_proto(const struct btf *btf, const char *st_op_name, - const char *member_name) -{ - char stub_func_name[MAX_STUB_NAME]; - const struct btf_type *func_type; - s32 btf_id; - int cp; - - cp = snprintf(stub_func_name, MAX_STUB_NAME, "%s__%s", - st_op_name, member_name); - if (cp >= MAX_STUB_NAME) { - pr_warn("Stub function name too long\n"); - return NULL; - } - btf_id = btf_find_by_name_kind(btf, stub_func_name, BTF_KIND_FUNC); - if (btf_id < 0) - return NULL; - func_type = btf_type_by_id(btf, btf_id); - if (!func_type) - return NULL; - - return btf_type_by_id(btf, func_type->type); /* FUNC_PROTO */ -} +#define REFCOUNTED_SUFFIX "__ref" /* Prepare argument info for every nullable argument of a member of a * struct_ops type. 
@@ -203,27 +171,44 @@ find_stub_func_proto(const struct btf *btf, const char *st_op_name, static int prepare_arg_info(struct btf *btf, const char *st_ops_name, const char *member_name, - const struct btf_type *func_proto, + const struct btf_type *func_proto, void *stub_func_addr, struct bpf_struct_ops_arg_info *arg_info) { const struct btf_type *stub_func_proto, *pointed_type; + bool is_nullable = false, is_refcounted = false; const struct btf_param *stub_args, *args; struct bpf_ctx_arg_aux *info, *info_buf; u32 nargs, arg_no, info_cnt = 0; + char ksym[KSYM_SYMBOL_LEN]; + const char *stub_fname; + const char *suffix; + s32 stub_func_id; u32 arg_btf_id; int offset; - stub_func_proto = find_stub_func_proto(btf, st_ops_name, member_name); - if (!stub_func_proto) - return 0; + stub_fname = kallsyms_lookup((unsigned long)stub_func_addr, NULL, NULL, NULL, ksym); + if (!stub_fname) { + pr_warn("Cannot find the stub function name for the %s in struct %s\n", + member_name, st_ops_name); + return -ENOENT; + } + + stub_func_id = btf_find_by_name_kind(btf, stub_fname, BTF_KIND_FUNC); + if (stub_func_id < 0) { + pr_warn("Cannot find the stub function %s in btf\n", stub_fname); + return -ENOENT; + } + + stub_func_proto = btf_type_by_id(btf, stub_func_id); + stub_func_proto = btf_type_by_id(btf, stub_func_proto->type); /* Check if the number of arguments of the stub function is the same * as the number of arguments of the function pointer. 
*/ nargs = btf_type_vlen(func_proto); if (nargs != btf_type_vlen(stub_func_proto)) { - pr_warn("the number of arguments of the stub function %s__%s does not match the number of arguments of the member %s of struct %s\n", - st_ops_name, member_name, member_name, st_ops_name); + pr_warn("the number of arguments of the stub function %s does not match the number of arguments of the member %s of struct %s\n", + stub_fname, member_name, st_ops_name); return -EINVAL; } @@ -241,10 +226,18 @@ static int prepare_arg_info(struct btf *btf, info = info_buf; for (arg_no = 0; arg_no < nargs; arg_no++) { /* Skip arguments that is not suffixed with - * "__nullable". + * "__nullable or __ref". */ - if (!btf_param_match_suffix(btf, &stub_args[arg_no], - MAYBE_NULL_SUFFIX)) + is_nullable = btf_param_match_suffix(btf, &stub_args[arg_no], + MAYBE_NULL_SUFFIX); + is_refcounted = btf_param_match_suffix(btf, &stub_args[arg_no], + REFCOUNTED_SUFFIX); + + if (is_nullable) + suffix = MAYBE_NULL_SUFFIX; + else if (is_refcounted) + suffix = REFCOUNTED_SUFFIX; + else continue; /* Should be a pointer to struct */ @@ -253,30 +246,34 @@ static int prepare_arg_info(struct btf *btf, &arg_btf_id); if (!pointed_type || !btf_type_is_struct(pointed_type)) { - pr_warn("stub function %s__%s has %s tagging to an unsupported type\n", - st_ops_name, member_name, MAYBE_NULL_SUFFIX); + pr_warn("stub function %s has %s tagging to an unsupported type\n", + stub_fname, suffix); goto err_out; } offset = btf_ctx_arg_offset(btf, func_proto, arg_no); if (offset < 0) { - pr_warn("stub function %s__%s has an invalid trampoline ctx offset for arg#%u\n", - st_ops_name, member_name, arg_no); + pr_warn("stub function %s has an invalid trampoline ctx offset for arg#%u\n", + stub_fname, arg_no); goto err_out; } if (args[arg_no].type != stub_args[arg_no].type) { - pr_warn("arg#%u type in stub function %s__%s does not match with its original func_proto\n", - arg_no, st_ops_name, member_name); + pr_warn("arg#%u type in stub 
function %s does not match with its original func_proto\n", + arg_no, stub_fname); goto err_out; } /* Fill the information of the new argument */ - info->reg_type = - PTR_TRUSTED | PTR_TO_BTF_ID | PTR_MAYBE_NULL; info->btf_id = arg_btf_id; info->btf = btf; info->offset = offset; + if (is_nullable) { + info->reg_type = PTR_TRUSTED | PTR_TO_BTF_ID | PTR_MAYBE_NULL; + } else if (is_refcounted) { + info->reg_type = PTR_TRUSTED | PTR_TO_BTF_ID; + info->refcounted = true; + } info++; info_cnt++; @@ -324,6 +321,13 @@ static bool is_module_member(const struct btf *btf, u32 id) return !strcmp(btf_name_by_offset(btf, t->name_off), "module"); } +int bpf_struct_ops_supported(const struct bpf_struct_ops *st_ops, u32 moff) +{ + void *func_ptr = *(void **)(st_ops->cfi_stubs + moff); + + return func_ptr ? 0 : -ENOTSUPP; +} + int bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc, struct btf *btf, struct bpf_verifier_log *log) @@ -386,8 +390,11 @@ int bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc, st_ops_desc->value_type = btf_type_by_id(btf, value_id); for_each_member(i, t, member) { - const struct btf_type *func_proto; + const struct btf_type *func_proto, *ret_type; + void **stub_func_addr; + u32 moff; + moff = __btf_member_bit_offset(t, member) / 8; mname = btf_name_by_offset(btf, member->name_off); if (!*mname) { pr_warn("anon member in struct %s is not supported\n", @@ -413,9 +420,23 @@ int bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc, func_proto = btf_type_resolve_func_ptr(btf, member->type, NULL); - if (!func_proto) + + /* The member is not a function pointer or + * the function pointer is not supported. 
+ */ + if (!func_proto || bpf_struct_ops_supported(st_ops, moff)) continue; + if (func_proto->type) { + ret_type = btf_type_resolve_ptr(btf, func_proto->type, NULL); + if (ret_type && !__btf_type_is_struct(ret_type)) { + pr_warn("func ptr %s in struct %s returns non-struct pointer, which is not supported\n", + mname, st_ops->name); + err = -EOPNOTSUPP; + goto errout; + } + } + if (btf_distill_func_proto(log, btf, func_proto, mname, &st_ops->func_models[i])) { @@ -425,8 +446,9 @@ int bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc, goto errout; } + stub_func_addr = *(void **)(st_ops->cfi_stubs + moff); err = prepare_arg_info(btf, st_ops->name, mname, - func_proto, + func_proto, stub_func_addr, arg_info + i); if (err) goto errout; @@ -579,7 +601,7 @@ int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, if (model->ret_size > 0) flags |= BPF_TRAMP_F_RET_FENTRY_RET; - size = arch_bpf_trampoline_size(model, flags, tlinks, NULL); + size = arch_bpf_trampoline_size(model, flags, tlinks, stub_func); if (size <= 0) return size ? : -EFAULT; @@ -1152,13 +1174,6 @@ void bpf_struct_ops_put(const void *kdata) bpf_map_put(&st_map->map); } -int bpf_struct_ops_supported(const struct bpf_struct_ops *st_ops, u32 moff) -{ - void *func_ptr = *(void **)(st_ops->cfi_stubs + moff); - - return func_ptr ? 
0 : -ENOTSUPP; -} - static bool bpf_struct_ops_valid_to_reg(struct bpf_map *map) { struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index c3223e0db2f5..1d2cf898e21e 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -26,6 +26,7 @@ #include <linux/bsearch.h> #include <linux/kobject.h> #include <linux/sysfs.h> +#include <linux/overflow.h> #include <net/netfilter/nf_bpf_link.h> @@ -606,6 +607,7 @@ s32 bpf_find_btf_id(const char *name, u32 kind, struct btf **btf_p) spin_unlock_bh(&btf_idr_lock); return ret; } +EXPORT_SYMBOL_GPL(bpf_find_btf_id); const struct btf_type *btf_type_skip_modifiers(const struct btf *btf, u32 id, u32 *res_id) @@ -2575,7 +2577,7 @@ static int btf_ref_type_check_meta(struct btf_verifier_env *env, return -EINVAL; } - if (btf_type_kflag(t)) { + if (btf_type_kflag(t) && !btf_type_is_type_tag(t)) { btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); return -EINVAL; } @@ -3332,6 +3334,8 @@ static int btf_find_kptr(const struct btf *btf, const struct btf_type *t, u32 off, int sz, struct btf_field_info *info, u32 field_mask) { enum btf_field_type type; + const char *tag_value; + bool is_type_tag; u32 res_id; /* Permit modifiers on the pointer itself */ @@ -3341,19 +3345,20 @@ static int btf_find_kptr(const struct btf *btf, const struct btf_type *t, if (!btf_type_is_ptr(t)) return BTF_FIELD_IGNORE; t = btf_type_by_id(btf, t->type); - - if (!btf_type_is_type_tag(t)) + is_type_tag = btf_type_is_type_tag(t) && !btf_type_kflag(t); + if (!is_type_tag) return BTF_FIELD_IGNORE; /* Reject extra tags */ if (btf_type_is_type_tag(btf_type_by_id(btf, t->type))) return -EINVAL; - if (!strcmp("kptr_untrusted", __btf_name_by_offset(btf, t->name_off))) + tag_value = __btf_name_by_offset(btf, t->name_off); + if (!strcmp("kptr_untrusted", tag_value)) type = BPF_KPTR_UNREF; - else if (!strcmp("kptr", __btf_name_by_offset(btf, t->name_off))) + else if (!strcmp("kptr", tag_value)) type = 
BPF_KPTR_REF; - else if (!strcmp("percpu_kptr", __btf_name_by_offset(btf, t->name_off))) + else if (!strcmp("percpu_kptr", tag_value)) type = BPF_KPTR_PERCPU; - else if (!strcmp("uptr", __btf_name_by_offset(btf, t->name_off))) + else if (!strcmp("uptr", tag_value)) type = BPF_UPTR; else return -EINVAL; @@ -3477,6 +3482,15 @@ static int btf_get_field_type(const struct btf *btf, const struct btf_type *var_ goto end; } } + if (field_mask & BPF_RES_SPIN_LOCK) { + if (!strcmp(name, "bpf_res_spin_lock")) { + if (*seen_mask & BPF_RES_SPIN_LOCK) + return -E2BIG; + *seen_mask |= BPF_RES_SPIN_LOCK; + type = BPF_RES_SPIN_LOCK; + goto end; + } + } if (field_mask & BPF_TIMER) { if (!strcmp(name, "bpf_timer")) { if (*seen_mask & BPF_TIMER) @@ -3655,6 +3669,7 @@ static int btf_find_field_one(const struct btf *btf, switch (field_type) { case BPF_SPIN_LOCK: + case BPF_RES_SPIN_LOCK: case BPF_TIMER: case BPF_WORKQUEUE: case BPF_LIST_NODE: @@ -3943,11 +3958,12 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type /* This needs to be kzalloc to zero out padding and unused fields, see * comment in btf_record_equal. 
*/ - rec = kzalloc(offsetof(struct btf_record, fields[cnt]), GFP_KERNEL | __GFP_NOWARN); + rec = kzalloc(struct_size(rec, fields, cnt), GFP_KERNEL | __GFP_NOWARN); if (!rec) return ERR_PTR(-ENOMEM); rec->spin_lock_off = -EINVAL; + rec->res_spin_lock_off = -EINVAL; rec->timer_off = -EINVAL; rec->wq_off = -EINVAL; rec->refcount_off = -EINVAL; @@ -3975,6 +3991,11 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type /* Cache offset for faster lookup at runtime */ rec->spin_lock_off = rec->fields[i].offset; break; + case BPF_RES_SPIN_LOCK: + WARN_ON_ONCE(rec->spin_lock_off >= 0); + /* Cache offset for faster lookup at runtime */ + rec->res_spin_lock_off = rec->fields[i].offset; + break; case BPF_TIMER: WARN_ON_ONCE(rec->timer_off >= 0); /* Cache offset for faster lookup at runtime */ @@ -4018,9 +4039,15 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type rec->cnt++; } + if (rec->spin_lock_off >= 0 && rec->res_spin_lock_off >= 0) { + ret = -EINVAL; + goto end; + } + /* bpf_{list_head, rb_node} require bpf_spin_lock */ if ((btf_record_has_field(rec, BPF_LIST_HEAD) || - btf_record_has_field(rec, BPF_RB_ROOT)) && rec->spin_lock_off < 0) { + btf_record_has_field(rec, BPF_RB_ROOT)) && + (rec->spin_lock_off < 0 && rec->res_spin_lock_off < 0)) { ret = -EINVAL; goto end; } @@ -4944,11 +4971,6 @@ static s32 btf_decl_tag_check_meta(struct btf_verifier_env *env, return -EINVAL; } - if (btf_type_kflag(t)) { - btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); - return -EINVAL; - } - component_idx = btf_type_decl_tag(t)->component_idx; if (component_idx < -1) { btf_verifier_log_type(env, t, "Invalid component_idx"); @@ -5562,7 +5584,7 @@ btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf) if (id < 0) continue; - new_aof = krealloc(aof, offsetof(struct btf_id_set, ids[aof->cnt + 1]), + new_aof = krealloc(aof, struct_size(new_aof, ids, aof->cnt + 1), GFP_KERNEL | __GFP_NOWARN); if (!new_aof) { 
ret = -ENOMEM; @@ -5589,7 +5611,7 @@ btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf) if (ret != BTF_FIELD_FOUND) continue; - new_aof = krealloc(aof, offsetof(struct btf_id_set, ids[aof->cnt + 1]), + new_aof = krealloc(aof, struct_size(new_aof, ids, aof->cnt + 1), GFP_KERNEL | __GFP_NOWARN); if (!new_aof) { ret = -ENOMEM; @@ -5626,7 +5648,7 @@ btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf) continue; parse: tab_cnt = tab ? tab->cnt : 0; - new_tab = krealloc(tab, offsetof(struct btf_struct_metas, types[tab_cnt + 1]), + new_tab = krealloc(tab, struct_size(new_tab, types, tab_cnt + 1), GFP_KERNEL | __GFP_NOWARN); if (!new_tab) { ret = -ENOMEM; @@ -5638,7 +5660,7 @@ btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf) type = &tab->types[tab->cnt]; type->btf_id = i; - record = btf_parse_fields(btf, t, BPF_SPIN_LOCK | BPF_LIST_HEAD | BPF_LIST_NODE | + record = btf_parse_fields(btf, t, BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK | BPF_LIST_HEAD | BPF_LIST_NODE | BPF_RB_ROOT | BPF_RB_NODE | BPF_REFCOUNT | BPF_KPTR, t->size); /* The record cannot be unset, treat it as an error if so */ @@ -6362,16 +6384,15 @@ struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog) return prog->aux->attach_btf; } -static bool is_int_ptr(struct btf *btf, const struct btf_type *t) +static bool is_void_or_int_ptr(struct btf *btf, const struct btf_type *t) { /* skip modifiers */ t = btf_type_skip_modifiers(btf, t->type, NULL); - - return btf_type_is_int(t); + return btf_type_is_void(t) || btf_type_is_int(t); } -static u32 get_ctx_arg_idx(struct btf *btf, const struct btf_type *func_proto, - int off) +u32 btf_ctx_arg_idx(struct btf *btf, const struct btf_type *func_proto, + int off) { const struct btf_param *args; const struct btf_type *t; @@ -6507,6 +6528,8 @@ static const struct bpf_raw_tp_null_args raw_tp_null_args[] = { /* rxrpc */ { "rxrpc_recvdata", 0x1 }, { "rxrpc_resend", 0x10 }, + { "rxrpc_tq", 0x10 }, + { "rxrpc_client", 0x1 }, 
/* skb */ {"kfree_skb", 0x1000}, /* sunrpc */ @@ -6518,6 +6541,7 @@ static const struct bpf_raw_tp_null_args raw_tp_null_args[] = { { "xprt_put_cong", 0x10 }, /* tcp */ { "tcp_send_reset", 0x11 }, + { "tcp_sendmsg_locked", 0x100 }, /* tegra_apb_dma */ { "tegra_dma_tx_status", 0x100 }, /* timer_migration */ @@ -6529,6 +6553,103 @@ static const struct bpf_raw_tp_null_args raw_tp_null_args[] = { { "mr_integ_alloc", 0x2000 }, /* bpf_testmod */ { "bpf_testmod_test_read", 0x0 }, + /* amdgpu */ + { "amdgpu_vm_bo_map", 0x1 }, + { "amdgpu_vm_bo_unmap", 0x1 }, + /* netfs */ + { "netfs_folioq", 0x1 }, + /* xfs from xfs_defer_pending_class */ + { "xfs_defer_create_intent", 0x1 }, + { "xfs_defer_cancel_list", 0x1 }, + { "xfs_defer_pending_finish", 0x1 }, + { "xfs_defer_pending_abort", 0x1 }, + { "xfs_defer_relog_intent", 0x1 }, + { "xfs_defer_isolate_paused", 0x1 }, + { "xfs_defer_item_pause", 0x1 }, + { "xfs_defer_item_unpause", 0x1 }, + /* xfs from xfs_defer_pending_item_class */ + { "xfs_defer_add_item", 0x1 }, + { "xfs_defer_cancel_item", 0x1 }, + { "xfs_defer_finish_item", 0x1 }, + /* xfs from xfs_icwalk_class */ + { "xfs_ioc_free_eofblocks", 0x10 }, + { "xfs_blockgc_free_space", 0x10 }, + /* xfs from xfs_btree_cur_class */ + { "xfs_btree_updkeys", 0x100 }, + { "xfs_btree_overlapped_query_range", 0x100 }, + /* xfs from xfs_imap_class*/ + { "xfs_map_blocks_found", 0x10000 }, + { "xfs_map_blocks_alloc", 0x10000 }, + { "xfs_iomap_alloc", 0x1000 }, + { "xfs_iomap_found", 0x1000 }, + /* xfs from xfs_fs_class */ + { "xfs_inodegc_flush", 0x1 }, + { "xfs_inodegc_push", 0x1 }, + { "xfs_inodegc_start", 0x1 }, + { "xfs_inodegc_stop", 0x1 }, + { "xfs_inodegc_queue", 0x1 }, + { "xfs_inodegc_throttle", 0x1 }, + { "xfs_fs_sync_fs", 0x1 }, + { "xfs_blockgc_start", 0x1 }, + { "xfs_blockgc_stop", 0x1 }, + { "xfs_blockgc_worker", 0x1 }, + { "xfs_blockgc_flush_all", 0x1 }, + /* xfs_scrub */ + { "xchk_nlinks_live_update", 0x10 }, + /* xfs_scrub from xchk_metapath_class */ + { 
"xchk_metapath_lookup", 0x100 }, + /* nfsd */ + { "nfsd_dirent", 0x1 }, + { "nfsd_file_acquire", 0x1001 }, + { "nfsd_file_insert_err", 0x1 }, + { "nfsd_file_cons_err", 0x1 }, + /* nfs4 */ + { "nfs4_setup_sequence", 0x1 }, + { "pnfs_update_layout", 0x10000 }, + { "nfs4_inode_callback_event", 0x200 }, + { "nfs4_inode_stateid_callback_event", 0x200 }, + /* nfs from pnfs_layout_event */ + { "pnfs_mds_fallback_pg_init_read", 0x10000 }, + { "pnfs_mds_fallback_pg_init_write", 0x10000 }, + { "pnfs_mds_fallback_pg_get_mirror_count", 0x10000 }, + { "pnfs_mds_fallback_read_done", 0x10000 }, + { "pnfs_mds_fallback_write_done", 0x10000 }, + { "pnfs_mds_fallback_read_pagelist", 0x10000 }, + { "pnfs_mds_fallback_write_pagelist", 0x10000 }, + /* coda */ + { "coda_dec_pic_run", 0x10 }, + { "coda_dec_pic_done", 0x10 }, + /* cfg80211 */ + { "cfg80211_scan_done", 0x11 }, + { "rdev_set_coalesce", 0x10 }, + { "cfg80211_report_wowlan_wakeup", 0x100 }, + { "cfg80211_inform_bss_frame", 0x100 }, + { "cfg80211_michael_mic_failure", 0x10000 }, + /* cfg80211 from wiphy_work_event */ + { "wiphy_work_queue", 0x10 }, + { "wiphy_work_run", 0x10 }, + { "wiphy_work_cancel", 0x10 }, + { "wiphy_work_flush", 0x10 }, + /* hugetlbfs */ + { "hugetlbfs_alloc_inode", 0x10 }, + /* spufs */ + { "spufs_context", 0x10 }, + /* kvm_hv */ + { "kvm_page_fault_enter", 0x100 }, + /* dpu */ + { "dpu_crtc_setup_mixer", 0x100 }, + /* binder */ + { "binder_transaction", 0x100 }, + /* bcachefs */ + { "btree_path_free", 0x100 }, + /* hfi1_tx */ + { "hfi1_sdma_progress", 0x1000 }, + /* iptfs */ + { "iptfs_ingress_postq_event", 0x1000 }, + /* neigh */ + { "neigh_update", 0x10 }, + /* snd_firewire_lib */ + { "amdtp_packet", 0x100 }, }; bool btf_ctx_access(int off, int size, enum bpf_access_type type, @@ -6551,7 +6672,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, tname, off); return false; } - arg = get_ctx_arg_idx(btf, t, off); + arg = btf_ctx_arg_idx(btf, t, off); args = (const struct btf_param *)(t 
+ 1); /* if (t == NULL) Fall back to default BPF prog with * MAX_BPF_FUNC_REG_ARGS u64 arguments. @@ -6656,14 +6777,11 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, } } - if (t->type == 0) - /* This is a pointer to void. - * It is the same as scalar from the verifier safety pov. - * No further pointer walking is allowed. - */ - return true; - - if (is_int_ptr(btf, t)) + /* + * If it's a pointer to void, it's the same as scalar from the verifier + * safety POV. Either way, no futher pointer walking is allowed. + */ + if (is_void_or_int_ptr(btf, t)) return true; /* this is a pointer to another type */ @@ -6679,6 +6797,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, info->reg_type = ctx_arg_info->reg_type; info->btf = ctx_arg_info->btf ? : btf_vmlinux; info->btf_id = ctx_arg_info->btf_id; + info->ref_obj_id = ctx_arg_info->ref_obj_id; return true; } } @@ -6708,10 +6827,10 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, /* Is this a func with potential NULL args? */ if (strcmp(tname, raw_tp_null_args[i].func)) continue; - if (raw_tp_null_args[i].mask & (0x1 << (arg * 4))) + if (raw_tp_null_args[i].mask & (0x1ULL << (arg * 4))) info->reg_type |= PTR_MAYBE_NULL; /* Is the current arg IS_ERR? 
*/ - if (raw_tp_null_args[i].mask & (0x2 << (arg * 4))) + if (raw_tp_null_args[i].mask & (0x2ULL << (arg * 4))) ptr_err_raw_tp = true; break; } @@ -6745,7 +6864,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, info->btf_id = t->type; t = btf_type_by_id(btf, t->type); - if (btf_type_is_type_tag(t)) { + if (btf_type_is_type_tag(t) && !btf_type_kflag(t)) { tag_value = __btf_name_by_offset(btf, t->name_off); if (strcmp(tag_value, "user") == 0) info->reg_type |= MEM_USER; @@ -7004,7 +7123,7 @@ error: /* check type tag */ t = btf_type_by_id(btf, mtype->type); - if (btf_type_is_type_tag(t)) { + if (btf_type_is_type_tag(t) && !btf_type_kflag(t)) { tag_value = __btf_name_by_offset(btf, t->name_off); /* check __user tag */ if (strcmp(tag_value, "user") == 0) @@ -7541,7 +7660,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog) return 0; if (!prog->aux->func_info) { - bpf_log(log, "Verifier bug\n"); + verifier_bug(env, "func_info undefined"); return -EFAULT; } @@ -7565,7 +7684,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog) tname = btf_name_by_offset(btf, fn_t->name_off); if (prog->aux->func_info_aux[subprog].unreliable) { - bpf_log(log, "Verifier bug in function %s()\n", tname); + verifier_bug(env, "unreliable BTF for function %s()", tname); return -EFAULT; } if (prog_type == BPF_PROG_TYPE_EXT) @@ -8442,7 +8561,7 @@ static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook, /* Grow set */ set = krealloc(tab->sets[hook], - offsetof(struct btf_id_set8, pairs[set_cnt + add_set->cnt]), + struct_size(set, pairs, set_cnt + add_set->cnt), GFP_KERNEL | __GFP_NOWARN); if (!set) { ret = -ENOMEM; @@ -8526,6 +8645,7 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type) case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: case BPF_PROG_TYPE_CGROUP_SOCKOPT: case BPF_PROG_TYPE_CGROUP_SYSCTL: + case BPF_PROG_TYPE_SOCK_OPS: return BTF_KFUNC_HOOK_CGROUP; case BPF_PROG_TYPE_SCHED_ACT: return 
BTF_KFUNC_HOOK_SCHED_ACT; @@ -8727,7 +8847,7 @@ int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_c } tab = krealloc(btf->dtor_kfunc_tab, - offsetof(struct btf_id_dtor_kfunc_tab, dtors[tab_cnt + add_cnt]), + struct_size(tab, dtors, tab_cnt + add_cnt), GFP_KERNEL | __GFP_NOWARN); if (!tab) { ret = -ENOMEM; @@ -9285,8 +9405,7 @@ btf_add_struct_ops(struct btf *btf, struct bpf_struct_ops *st_ops, tab = btf->struct_ops_tab; if (!tab) { - tab = kzalloc(offsetof(struct btf_struct_ops_tab, ops[4]), - GFP_KERNEL); + tab = kzalloc(struct_size(tab, ops, 4), GFP_KERNEL); if (!tab) return -ENOMEM; tab->capacity = 4; @@ -9299,8 +9418,7 @@ btf_add_struct_ops(struct btf *btf, struct bpf_struct_ops *st_ops, if (tab->cnt == tab->capacity) { new_tab = krealloc(tab, - offsetof(struct btf_struct_ops_tab, - ops[tab->capacity * 2]), + struct_size(tab, ops, tab->capacity * 2), GFP_KERNEL); if (!new_tab) return -ENOMEM; diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 46e5db65dbc8..9122c39870bf 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -41,6 +41,19 @@ static int __init cgroup_bpf_wq_init(void) } core_initcall(cgroup_bpf_wq_init); +static int cgroup_bpf_lifetime_notify(struct notifier_block *nb, + unsigned long action, void *data); + +static struct notifier_block cgroup_bpf_lifetime_nb = { + .notifier_call = cgroup_bpf_lifetime_notify, +}; + +void __init cgroup_bpf_lifetime_notifier_init(void) +{ + BUG_ON(blocking_notifier_chain_register(&cgroup_lifetime_notifier, + &cgroup_bpf_lifetime_nb)); +} + /* __always_inline is necessary to prevent indirect call through run_prog * function pointer. 
*/ @@ -206,7 +219,7 @@ bpf_cgroup_atype_find(enum bpf_attach_type attach_type, u32 attach_btf_id) } #endif /* CONFIG_BPF_LSM */ -void cgroup_bpf_offline(struct cgroup *cgrp) +static void cgroup_bpf_offline(struct cgroup *cgrp) { cgroup_get(cgrp); percpu_ref_kill(&cgrp->bpf.refcnt); @@ -369,7 +382,7 @@ static struct bpf_prog *prog_list_prog(struct bpf_prog_list *pl) /* count number of elements in the list. * it's slow but the list cannot be long */ -static u32 prog_list_length(struct hlist_head *head) +static u32 prog_list_length(struct hlist_head *head, int *preorder_cnt) { struct bpf_prog_list *pl; u32 cnt = 0; @@ -377,6 +390,8 @@ static u32 prog_list_length(struct hlist_head *head) hlist_for_each_entry(pl, head, node) { if (!prog_list_prog(pl)) continue; + if (preorder_cnt && (pl->flags & BPF_F_PREORDER)) + (*preorder_cnt)++; cnt++; } return cnt; @@ -400,7 +415,7 @@ static bool hierarchy_allows_attach(struct cgroup *cgrp, if (flags & BPF_F_ALLOW_MULTI) return true; - cnt = prog_list_length(&p->bpf.progs[atype]); + cnt = prog_list_length(&p->bpf.progs[atype], NULL); WARN_ON_ONCE(cnt > 1); if (cnt == 1) return !!(flags & BPF_F_ALLOW_OVERRIDE); @@ -423,12 +438,12 @@ static int compute_effective_progs(struct cgroup *cgrp, struct bpf_prog_array *progs; struct bpf_prog_list *pl; struct cgroup *p = cgrp; - int cnt = 0; + int i, j, cnt = 0, preorder_cnt = 0, fstart, bstart, init_bstart; /* count number of effective programs by walking parents */ do { if (cnt == 0 || (p->bpf.flags[atype] & BPF_F_ALLOW_MULTI)) - cnt += prog_list_length(&p->bpf.progs[atype]); + cnt += prog_list_length(&p->bpf.progs[atype], &preorder_cnt); p = cgroup_parent(p); } while (p); @@ -439,20 +454,34 @@ static int compute_effective_progs(struct cgroup *cgrp, /* populate the array with effective progs */ cnt = 0; p = cgrp; + fstart = preorder_cnt; + bstart = preorder_cnt - 1; do { if (cnt > 0 && !(p->bpf.flags[atype] & BPF_F_ALLOW_MULTI)) continue; + init_bstart = bstart; hlist_for_each_entry(pl, 
&p->bpf.progs[atype], node) { if (!prog_list_prog(pl)) continue; - item = &progs->items[cnt]; + if (pl->flags & BPF_F_PREORDER) { + item = &progs->items[bstart]; + bstart--; + } else { + item = &progs->items[fstart]; + fstart++; + } item->prog = prog_list_prog(pl); bpf_cgroup_storages_assign(item->cgroup_storage, pl->storage); cnt++; } + + /* reverse pre-ordering progs at this cgroup level */ + for (i = bstart + 1, j = init_bstart; i < j; i++, j--) + swap(progs->items[i], progs->items[j]); + } while ((p = cgroup_parent(p))); *array = progs; @@ -475,7 +504,7 @@ static void activate_effective_progs(struct cgroup *cgrp, * cgroup_bpf_inherit() - inherit effective programs from parent * @cgrp: the cgroup to modify */ -int cgroup_bpf_inherit(struct cgroup *cgrp) +static int cgroup_bpf_inherit(struct cgroup *cgrp) { /* has to use marco instead of const int, since compiler thinks * that array below is variable length @@ -518,6 +547,27 @@ cleanup: return -ENOMEM; } +static int cgroup_bpf_lifetime_notify(struct notifier_block *nb, + unsigned long action, void *data) +{ + struct cgroup *cgrp = data; + int ret = 0; + + if (cgrp->root != &cgrp_dfl_root) + return NOTIFY_OK; + + switch (action) { + case CGROUP_LIFETIME_ONLINE: + ret = cgroup_bpf_inherit(cgrp); + break; + case CGROUP_LIFETIME_OFFLINE: + cgroup_bpf_offline(cgrp); + break; + } + + return notifier_from_errno(ret); +} + static int update_effective_progs(struct cgroup *cgrp, enum cgroup_bpf_attach_type atype) { @@ -663,7 +713,7 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp, */ return -EPERM; - if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS) + if (prog_list_length(progs, NULL) >= BPF_CGROUP_MAX_PROGS) return -E2BIG; pl = find_attach_entry(progs, prog, link, replace_prog, @@ -698,6 +748,7 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp, pl->prog = prog; pl->link = link; + pl->flags = flags; bpf_cgroup_storages_assign(pl->storage, storage); cgrp->bpf.flags[atype] = saved_flags; @@ -1073,7 +1124,7 @@ 
static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, lockdep_is_held(&cgroup_mutex)); total_cnt += bpf_prog_array_length(effective); } else { - total_cnt += prog_list_length(&cgrp->bpf.progs[atype]); + total_cnt += prog_list_length(&cgrp->bpf.progs[atype], NULL); } } @@ -1105,7 +1156,7 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, u32 id; progs = &cgrp->bpf.progs[atype]; - cnt = min_t(int, prog_list_length(progs), total_cnt); + cnt = min_t(int, prog_list_length(progs, NULL), total_cnt); i = 0; hlist_for_each_entry(pl, progs, node) { prog = prog_list_prog(pl); @@ -1636,10 +1687,6 @@ cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) if (func_proto) return func_proto; - func_proto = cgroup_current_func_proto(func_id, prog); - if (func_proto) - return func_proto; - switch (func_id) { case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; @@ -2187,10 +2234,6 @@ sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) if (func_proto) return func_proto; - func_proto = cgroup_current_func_proto(func_id, prog); - if (func_proto) - return func_proto; - switch (func_id) { case BPF_FUNC_sysctl_get_name: return &bpf_sysctl_get_name_proto; @@ -2334,10 +2377,6 @@ cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) if (func_proto) return func_proto; - func_proto = cgroup_current_func_proto(func_id, prog); - if (func_proto) - return func_proto; - switch (func_id) { #ifdef CONFIG_NET case BPF_FUNC_get_netns_cookie: @@ -2584,23 +2623,3 @@ cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return NULL; } } - -/* Common helpers for cgroup hooks with valid process context. 
*/ -const struct bpf_func_proto * -cgroup_current_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) -{ - switch (func_id) { - case BPF_FUNC_get_current_uid_gid: - return &bpf_get_current_uid_gid_proto; - case BPF_FUNC_get_current_comm: - return &bpf_get_current_comm_proto; -#ifdef CONFIG_CGROUP_NET_CLASSID - case BPF_FUNC_get_cgroup_classid: - return &bpf_get_cgroup_classid_curr_proto; -#endif - case BPF_FUNC_current_task_under_cgroup: - return &bpf_current_task_under_cgroup_proto; - default: - return NULL; - } -} diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index da729cbbaeb9..c20babbf998f 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1663,14 +1663,17 @@ EXPORT_SYMBOL_GPL(__bpf_call_base); INSN_3(JMP, JSET, K), \ INSN_2(JMP, JA), \ INSN_2(JMP32, JA), \ + /* Atomic operations. */ \ + INSN_3(STX, ATOMIC, B), \ + INSN_3(STX, ATOMIC, H), \ + INSN_3(STX, ATOMIC, W), \ + INSN_3(STX, ATOMIC, DW), \ /* Store instructions. */ \ /* Register based. */ \ INSN_3(STX, MEM, B), \ INSN_3(STX, MEM, H), \ INSN_3(STX, MEM, W), \ INSN_3(STX, MEM, DW), \ - INSN_3(STX, ATOMIC, W), \ - INSN_3(STX, ATOMIC, DW), \ /* Immediate based. 
*/ \ INSN_3(ST, MEM, B), \ INSN_3(ST, MEM, H), \ @@ -2152,24 +2155,33 @@ out: if (BPF_SIZE(insn->code) == BPF_W) \ atomic_##KOP((u32) SRC, (atomic_t *)(unsigned long) \ (DST + insn->off)); \ - else \ + else if (BPF_SIZE(insn->code) == BPF_DW) \ atomic64_##KOP((u64) SRC, (atomic64_t *)(unsigned long) \ (DST + insn->off)); \ + else \ + goto default_label; \ break; \ case BOP | BPF_FETCH: \ if (BPF_SIZE(insn->code) == BPF_W) \ SRC = (u32) atomic_fetch_##KOP( \ (u32) SRC, \ (atomic_t *)(unsigned long) (DST + insn->off)); \ - else \ + else if (BPF_SIZE(insn->code) == BPF_DW) \ SRC = (u64) atomic64_fetch_##KOP( \ (u64) SRC, \ (atomic64_t *)(unsigned long) (DST + insn->off)); \ + else \ + goto default_label; \ break; STX_ATOMIC_DW: STX_ATOMIC_W: + STX_ATOMIC_H: + STX_ATOMIC_B: switch (IMM) { + /* Atomic read-modify-write instructions support only W and DW + * size modifiers. + */ ATOMIC_ALU_OP(BPF_ADD, add) ATOMIC_ALU_OP(BPF_AND, and) ATOMIC_ALU_OP(BPF_OR, or) @@ -2181,20 +2193,63 @@ out: SRC = (u32) atomic_xchg( (atomic_t *)(unsigned long) (DST + insn->off), (u32) SRC); - else + else if (BPF_SIZE(insn->code) == BPF_DW) SRC = (u64) atomic64_xchg( (atomic64_t *)(unsigned long) (DST + insn->off), (u64) SRC); + else + goto default_label; break; case BPF_CMPXCHG: if (BPF_SIZE(insn->code) == BPF_W) BPF_R0 = (u32) atomic_cmpxchg( (atomic_t *)(unsigned long) (DST + insn->off), (u32) BPF_R0, (u32) SRC); - else + else if (BPF_SIZE(insn->code) == BPF_DW) BPF_R0 = (u64) atomic64_cmpxchg( (atomic64_t *)(unsigned long) (DST + insn->off), (u64) BPF_R0, (u64) SRC); + else + goto default_label; + break; + /* Atomic load and store instructions support all size + * modifiers. 
+ */ + case BPF_LOAD_ACQ: + switch (BPF_SIZE(insn->code)) { +#define LOAD_ACQUIRE(SIZEOP, SIZE) \ + case BPF_##SIZEOP: \ + DST = (SIZE)smp_load_acquire( \ + (SIZE *)(unsigned long)(SRC + insn->off)); \ + break; + LOAD_ACQUIRE(B, u8) + LOAD_ACQUIRE(H, u16) + LOAD_ACQUIRE(W, u32) +#ifdef CONFIG_64BIT + LOAD_ACQUIRE(DW, u64) +#endif +#undef LOAD_ACQUIRE + default: + goto default_label; + } + break; + case BPF_STORE_REL: + switch (BPF_SIZE(insn->code)) { +#define STORE_RELEASE(SIZEOP, SIZE) \ + case BPF_##SIZEOP: \ + smp_store_release( \ + (SIZE *)(unsigned long)(DST + insn->off), (SIZE)SRC); \ + break; + STORE_RELEASE(B, u8) + STORE_RELEASE(H, u16) + STORE_RELEASE(W, u32) +#ifdef CONFIG_64BIT + STORE_RELEASE(DW, u64) +#endif +#undef STORE_RELEASE + default: + goto default_label; + } break; default: @@ -2290,20 +2345,21 @@ void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth) insn->code = BPF_JMP | BPF_CALL_ARGS; } #endif -#else +#endif + static unsigned int __bpf_prog_ret0_warn(const void *ctx, const struct bpf_insn *insn) { /* If this handler ever gets executed, then BPF_JIT_ALWAYS_ON - * is not working properly, so warn about it! + * is not working properly, or interpreter is being used when + * prog->jit_requested is not 0, so warn about it! */ WARN_ON_ONCE(1); return 0; } -#endif -bool bpf_prog_map_compatible(struct bpf_map *map, - const struct bpf_prog *fp) +static bool __bpf_prog_map_compatible(struct bpf_map *map, + const struct bpf_prog *fp) { enum bpf_prog_type prog_type = resolve_prog_type(fp); bool ret; @@ -2312,14 +2368,6 @@ bool bpf_prog_map_compatible(struct bpf_map *map, if (fp->kprobe_override) return false; - /* XDP programs inserted into maps are not guaranteed to run on - * a particular netdev (and can run outside driver context entirely - * in the case of devmap and cpumap). Until device checks - * are implemented, prohibit adding dev-bound programs to program maps. 
- */ - if (bpf_prog_is_dev_bound(aux)) - return false; - spin_lock(&map->owner.lock); if (!map->owner.type) { /* There's no owner yet where we could check for @@ -2353,6 +2401,19 @@ bool bpf_prog_map_compatible(struct bpf_map *map, return ret; } +bool bpf_prog_map_compatible(struct bpf_map *map, const struct bpf_prog *fp) +{ + /* XDP programs inserted into maps are not guaranteed to run on + * a particular netdev (and can run outside driver context entirely + * in the case of devmap and cpumap). Until device checks + * are implemented, prohibit adding dev-bound programs to program maps. + */ + if (bpf_prog_is_dev_bound(fp->aux)) + return false; + + return __bpf_prog_map_compatible(map, fp); +} + static int bpf_check_tail_call(const struct bpf_prog *fp) { struct bpf_prog_aux *aux = fp->aux; @@ -2365,7 +2426,7 @@ static int bpf_check_tail_call(const struct bpf_prog *fp) if (!map_type_contains_progs(map)) continue; - if (!bpf_prog_map_compatible(map, fp)) { + if (!__bpf_prog_map_compatible(map, fp)) { ret = -EINVAL; goto out; } @@ -2380,8 +2441,18 @@ static void bpf_prog_select_func(struct bpf_prog *fp) { #ifndef CONFIG_BPF_JIT_ALWAYS_ON u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1); + u32 idx = (round_up(stack_depth, 32) / 32) - 1; - fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1]; + /* may_goto may cause stack size > 512, leading to idx out-of-bounds. + * But for non-JITed programs, we don't need bpf_func, so no bounds + * check needed. + */ + if (!fp->jit_requested && + !WARN_ON_ONCE(idx >= ARRAY_SIZE(interpreters))) { + fp->bpf_func = interpreters[idx]; + } else { + fp->bpf_func = __bpf_prog_ret0_warn; + } #else fp->bpf_func = __bpf_prog_ret0_warn; #endif @@ -2403,7 +2474,7 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) /* In case of BPF to BPF calls, verifier did all the prep * work with regards to JITing, etc. 
*/ - bool jit_needed = false; + bool jit_needed = fp->jit_requested; if (fp->bpf_func) goto finalize; @@ -2906,6 +2977,11 @@ const struct bpf_func_proto * __weak bpf_get_trace_vprintk_proto(void) return NULL; } +const struct bpf_func_proto * __weak bpf_get_perf_event_read_value_proto(void) +{ + return NULL; +} + u64 __weak bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy) @@ -3058,6 +3134,32 @@ void __weak arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, { } +bool __weak bpf_jit_supports_timed_may_goto(void) +{ + return false; +} + +u64 __weak arch_bpf_timed_may_goto(void) +{ + return 0; +} + +u64 bpf_check_timed_may_goto(struct bpf_timed_may_goto *p) +{ + u64 time = ktime_get_mono_fast_ns(); + + /* Populate the timestamp for this stack frame, and refresh count. */ + if (!p->timestamp) { + p->timestamp = time; + return BPF_MAX_TIMED_LOOPS; + } + /* Check if we've exhausted our time slice, and zero count. */ + if (time - p->timestamp >= (NSEC_PER_SEC / 4)) + return 0; + /* Refresh the count for the stack frame. */ + return BPF_MAX_TIMED_LOOPS; +} + /* for configs without MMU or 32-bit */ __weak const struct bpf_map_ops arena_map_ops; __weak u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena) diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 774accbd4a22..67e8a2fc1a99 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -33,8 +33,8 @@ #include <trace/events/xdp.h> #include <linux/btf_ids.h> -#include <linux/netdevice.h> /* netif_receive_skb_list */ -#include <linux/etherdevice.h> /* eth_type_trans */ +#include <linux/netdevice.h> +#include <net/gro.h> /* General idea: XDP packets getting XDP redirected to another CPU, * will maximum be stored/queued for one driver ->poll() call. 
It is @@ -68,6 +68,7 @@ struct bpf_cpu_map_entry { struct bpf_cpumap_val value; struct bpf_prog *prog; + struct gro_node gro; struct completion kthread_running; struct rcu_work free_work; @@ -133,22 +134,23 @@ static void __cpu_map_ring_cleanup(struct ptr_ring *ring) } } -static void cpu_map_bpf_prog_run_skb(struct bpf_cpu_map_entry *rcpu, - struct list_head *listp, - struct xdp_cpumap_stats *stats) +static u32 cpu_map_bpf_prog_run_skb(struct bpf_cpu_map_entry *rcpu, + void **skbs, u32 skb_n, + struct xdp_cpumap_stats *stats) { - struct sk_buff *skb, *tmp; struct xdp_buff xdp; - u32 act; + u32 act, pass = 0; int err; - list_for_each_entry_safe(skb, tmp, listp, list) { + for (u32 i = 0; i < skb_n; i++) { + struct sk_buff *skb = skbs[i]; + act = bpf_prog_run_generic_xdp(skb, &xdp, rcpu->prog); switch (act) { case XDP_PASS: + skbs[pass++] = skb; break; case XDP_REDIRECT: - skb_list_del_init(skb); err = xdp_do_generic_redirect(skb->dev, skb, &xdp, rcpu->prog); if (unlikely(err)) { @@ -157,7 +159,7 @@ static void cpu_map_bpf_prog_run_skb(struct bpf_cpu_map_entry *rcpu, } else { stats->redirect++; } - return; + break; default: bpf_warn_invalid_xdp_action(NULL, rcpu->prog, act); fallthrough; @@ -165,12 +167,15 @@ static void cpu_map_bpf_prog_run_skb(struct bpf_cpu_map_entry *rcpu, trace_xdp_exception(skb->dev, rcpu->prog, act); fallthrough; case XDP_DROP: - skb_list_del_init(skb); - kfree_skb(skb); + napi_consume_skb(skb, true); stats->drop++; - return; + break; } } + + stats->pass += pass; + + return pass; } static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu, @@ -204,7 +209,6 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu, stats->drop++; } else { frames[nframes++] = xdpf; - stats->pass++; } break; case XDP_REDIRECT: @@ -228,43 +232,65 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu, } xdp_clear_return_frame_no_direct(); + stats->pass += nframes; return nframes; } #define CPUMAP_BATCH 8 -static int 
cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames, - int xdp_n, struct xdp_cpumap_stats *stats, - struct list_head *list) +struct cpu_map_ret { + u32 xdp_n; + u32 skb_n; +}; + +static void cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames, + void **skbs, struct cpu_map_ret *ret, + struct xdp_cpumap_stats *stats) { struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; - int nframes; if (!rcpu->prog) - return xdp_n; + goto out; - rcu_read_lock_bh(); + rcu_read_lock(); bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); - nframes = cpu_map_bpf_prog_run_xdp(rcpu, frames, xdp_n, stats); + ret->xdp_n = cpu_map_bpf_prog_run_xdp(rcpu, frames, ret->xdp_n, stats); + if (unlikely(ret->skb_n)) + ret->skb_n = cpu_map_bpf_prog_run_skb(rcpu, skbs, ret->skb_n, + stats); if (stats->redirect) xdp_do_flush(); - if (unlikely(!list_empty(list))) - cpu_map_bpf_prog_run_skb(rcpu, list, stats); - bpf_net_ctx_clear(bpf_net_ctx); - rcu_read_unlock_bh(); /* resched point, may call do_softirq() */ + rcu_read_unlock(); - return nframes; +out: + if (unlikely(ret->skb_n) && ret->xdp_n) + memmove(&skbs[ret->xdp_n], skbs, ret->skb_n * sizeof(*skbs)); +} + +static void cpu_map_gro_flush(struct bpf_cpu_map_entry *rcpu, bool empty) +{ + /* + * If the ring is not empty, there'll be a new iteration soon, and we + * only need to do a full flush if a tick is long (> 1 ms). + * If the ring is empty, to not hold GRO packets in the stack for too + * long, do a full flush. + * This is equivalent to how NAPI decides whether to perform a full + * flush. 
+ */ + gro_flush(&rcpu->gro, !empty && HZ >= 1000); + gro_normal_list(&rcpu->gro); } static int cpu_map_kthread_run(void *data) { struct bpf_cpu_map_entry *rcpu = data; unsigned long last_qs = jiffies; + u32 packets = 0; complete(&rcpu->kthread_running); set_current_state(TASK_INTERRUPTIBLE); @@ -277,11 +303,11 @@ static int cpu_map_kthread_run(void *data) while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) { struct xdp_cpumap_stats stats = {}; /* zero stats */ unsigned int kmem_alloc_drops = 0, sched = 0; - gfp_t gfp = __GFP_ZERO | GFP_ATOMIC; - int i, n, m, nframes, xdp_n; + struct cpu_map_ret ret = { }; void *frames[CPUMAP_BATCH]; void *skbs[CPUMAP_BATCH]; - LIST_HEAD(list); + u32 i, n, m; + bool empty; /* Release CPU reschedule checks */ if (__ptr_ring_empty(rcpu->queue)) { @@ -306,7 +332,7 @@ static int cpu_map_kthread_run(void *data) */ n = __ptr_ring_consume_batched(rcpu->queue, frames, CPUMAP_BATCH); - for (i = 0, xdp_n = 0; i < n; i++) { + for (i = 0; i < n; i++) { void *f = frames[i]; struct page *page; @@ -314,11 +340,11 @@ static int cpu_map_kthread_run(void *data) struct sk_buff *skb = f; __ptr_clear_bit(0, &skb); - list_add_tail(&skb->list, &list); + skbs[ret.skb_n++] = skb; continue; } - frames[xdp_n++] = f; + frames[ret.xdp_n++] = f; page = virt_to_page(f); /* Bring struct page memory area to curr CPU. 
Read by @@ -328,40 +354,51 @@ static int cpu_map_kthread_run(void *data) prefetchw(page); } + local_bh_disable(); + /* Support running another XDP prog on this CPU */ - nframes = cpu_map_bpf_prog_run(rcpu, frames, xdp_n, &stats, &list); - if (nframes) { - m = kmem_cache_alloc_bulk(net_hotdata.skbuff_cache, - gfp, nframes, skbs); - if (unlikely(m == 0)) { - for (i = 0; i < nframes; i++) - skbs[i] = NULL; /* effect: xdp_return_frame */ - kmem_alloc_drops += nframes; - } + cpu_map_bpf_prog_run(rcpu, frames, skbs, &ret, &stats); + if (!ret.xdp_n) + goto stats; + + m = napi_skb_cache_get_bulk(skbs, ret.xdp_n); + if (unlikely(m < ret.xdp_n)) { + for (i = m; i < ret.xdp_n; i++) + xdp_return_frame(frames[i]); + + if (ret.skb_n) + memmove(&skbs[m], &skbs[ret.xdp_n], + ret.skb_n * sizeof(*skbs)); + + kmem_alloc_drops += ret.xdp_n - m; + ret.xdp_n = m; } - local_bh_disable(); - for (i = 0; i < nframes; i++) { + for (i = 0; i < ret.xdp_n; i++) { struct xdp_frame *xdpf = frames[i]; - struct sk_buff *skb = skbs[i]; - skb = __xdp_build_skb_from_frame(xdpf, skb, - xdpf->dev_rx); - if (!skb) { - xdp_return_frame(xdpf); - continue; - } - - list_add_tail(&skb->list, &list); + /* Can fail only when !skb -- already handled above */ + __xdp_build_skb_from_frame(xdpf, skbs[i], xdpf->dev_rx); } +stats: /* Feedback loop via tracepoint. * NB: keep before recv to allow measuring enqueue/dequeue latency. 
*/ trace_xdp_cpumap_kthread(rcpu->map_id, n, kmem_alloc_drops, sched, &stats); - netif_receive_skb_list(&list); + for (i = 0; i < ret.xdp_n + ret.skb_n; i++) + gro_receive_skb(&rcpu->gro, skbs[i]); + + /* Flush either every 64 packets or in case of empty ring */ + packets += n; + empty = __ptr_ring_empty(rcpu->queue); + if (packets >= NAPI_POLL_WEIGHT || empty) { + cpu_map_gro_flush(rcpu, empty); + packets = 0; + } + local_bh_enable(); /* resched point, may call do_softirq() */ } __set_current_state(TASK_RUNNING); @@ -430,6 +467,7 @@ __cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value, rcpu->cpu = cpu; rcpu->map_id = map->id; rcpu->value.qsize = value->qsize; + gro_init(&rcpu->gro); if (fd > 0 && __cpu_map_load_bpf_program(rcpu, map, fd)) goto free_ptr_ring; @@ -458,6 +496,7 @@ free_prog: if (rcpu->prog) bpf_prog_put(rcpu->prog); free_ptr_ring: + gro_cleanup(&rcpu->gro); ptr_ring_cleanup(rcpu->queue, NULL); free_queue: kfree(rcpu->queue); @@ -487,6 +526,7 @@ static void __cpu_map_entry_free(struct work_struct *work) if (rcpu->prog) bpf_prog_put(rcpu->prog); + gro_cleanup(&rcpu->gro); /* The queue should be empty at this point */ __cpu_map_ring_cleanup(rcpu->queue); ptr_ring_cleanup(rcpu->queue, NULL); diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c index cfa1c18e3a48..9876c5fe6c2a 100644 --- a/kernel/bpf/cpumask.c +++ b/kernel/bpf/cpumask.c @@ -45,6 +45,10 @@ __bpf_kfunc_start_defs(); * * bpf_cpumask_create() allocates memory using the BPF memory allocator, and * will not block. It may return NULL if no memory is available. + * + * Return: + * * A pointer to a new struct bpf_cpumask instance on success. + * * NULL if the BPF memory allocator is out of memory. */ __bpf_kfunc struct bpf_cpumask *bpf_cpumask_create(void) { @@ -71,6 +75,10 @@ __bpf_kfunc struct bpf_cpumask *bpf_cpumask_create(void) * Acquires a reference to a BPF cpumask. 
The cpumask returned by this function * must either be embedded in a map as a kptr, or freed with * bpf_cpumask_release(). + * + * Return: + * * The struct bpf_cpumask pointer passed to the function. + * */ __bpf_kfunc struct bpf_cpumask *bpf_cpumask_acquire(struct bpf_cpumask *cpumask) { @@ -106,6 +114,9 @@ CFI_NOSEAL(bpf_cpumask_release_dtor); * * Find the index of the first nonzero bit of the cpumask. A struct bpf_cpumask * pointer may be safely passed to this function. + * + * Return: + * * The index of the first nonzero bit in the struct cpumask. */ __bpf_kfunc u32 bpf_cpumask_first(const struct cpumask *cpumask) { @@ -119,6 +130,9 @@ __bpf_kfunc u32 bpf_cpumask_first(const struct cpumask *cpumask) * * Find the index of the first unset bit of the cpumask. A struct bpf_cpumask * pointer may be safely passed to this function. + * + * Return: + * * The index of the first zero bit in the struct cpumask. */ __bpf_kfunc u32 bpf_cpumask_first_zero(const struct cpumask *cpumask) { @@ -133,6 +147,9 @@ __bpf_kfunc u32 bpf_cpumask_first_zero(const struct cpumask *cpumask) * * Find the index of the first nonzero bit of the AND of two cpumasks. * struct bpf_cpumask pointers may be safely passed to @src1 and @src2. + * + * Return: + * * The index of the first bit that is nonzero in both cpumask instances. */ __bpf_kfunc u32 bpf_cpumask_first_and(const struct cpumask *src1, const struct cpumask *src2) @@ -414,12 +431,47 @@ __bpf_kfunc u32 bpf_cpumask_any_and_distribute(const struct cpumask *src1, * @cpumask: The cpumask being queried. * * Count the number of set bits in the given cpumask. + * + * Return: + * * The number of bits set in the mask. */ __bpf_kfunc u32 bpf_cpumask_weight(const struct cpumask *cpumask) { return cpumask_weight(cpumask); } +/** + * bpf_cpumask_populate() - Populate the CPU mask from the contents of + * a BPF memory region. + * + * @cpumask: The cpumask being populated. + * @src: The BPF memory holding the bit pattern. 
+ * @src__sz: Length of the BPF memory region in bytes. + * + * Return: + * * 0 if the struct cpumask * instance was populated successfully. + * * -EACCES if the memory region is too small to populate the cpumask. + * * -EINVAL if the memory region is not aligned to the size of a long + * and the architecture does not support efficient unaligned accesses. + */ +__bpf_kfunc int bpf_cpumask_populate(struct cpumask *cpumask, void *src, size_t src__sz) +{ + unsigned long source = (unsigned long)src; + + /* The memory region must be large enough to populate the entire CPU mask. */ + if (src__sz < bitmap_size(nr_cpu_ids)) + return -EACCES; + + /* If avoiding unaligned accesses, the input region must be aligned to the nearest long. */ + if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && + !IS_ALIGNED(source, sizeof(long))) + return -EINVAL; + + bitmap_copy(cpumask_bits(cpumask), src, nr_cpu_ids); + + return 0; +} + __bpf_kfunc_end_defs(); BTF_KFUNCS_START(cpumask_kfunc_btf_ids) @@ -448,6 +500,7 @@ BTF_ID_FLAGS(func, bpf_cpumask_copy, KF_RCU) BTF_ID_FLAGS(func, bpf_cpumask_any_distribute, KF_RCU) BTF_ID_FLAGS(func, bpf_cpumask_any_and_distribute, KF_RCU) BTF_ID_FLAGS(func, bpf_cpumask_weight, KF_RCU) +BTF_ID_FLAGS(func, bpf_cpumask_populate, KF_RCU) BTF_KFUNCS_END(cpumask_kfunc_btf_ids) static const struct btf_kfunc_id_set cpumask_kfunc_set = { diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index 309c4aa1b026..20883c6b1546 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -202,7 +202,7 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, insn->dst_reg, class == BPF_ALU ? 
'w' : 'r', insn->dst_reg); } else if (is_addr_space_cast(insn)) { - verbose(cbs->private_data, "(%02x) r%d = addr_space_cast(r%d, %d, %d)\n", + verbose(cbs->private_data, "(%02x) r%d = addr_space_cast(r%d, %u, %u)\n", insn->code, insn->dst_reg, insn->src_reg, ((u32)insn->imm) >> 16, (u16)insn->imm); } else if (is_mov_percpu_addr(insn)) { @@ -267,6 +267,18 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, BPF_SIZE(insn->code) == BPF_DW ? "64" : "", bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, insn->src_reg); + } else if (BPF_MODE(insn->code) == BPF_ATOMIC && + insn->imm == BPF_LOAD_ACQ) { + verbose(cbs->private_data, "(%02x) r%d = load_acquire((%s *)(r%d %+d))\n", + insn->code, insn->dst_reg, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->src_reg, insn->off); + } else if (BPF_MODE(insn->code) == BPF_ATOMIC && + insn->imm == BPF_STORE_REL) { + verbose(cbs->private_data, "(%02x) store_release((%s *)(r%d %+d), r%d)\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->dst_reg, insn->off, insn->src_reg); } else { verbose(cbs->private_data, "BUG_%02x\n", insn->code); } @@ -369,7 +381,7 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, insn->code, class == BPF_JMP32 ? 
'w' : 'r', insn->dst_reg, bpf_jmp_string[BPF_OP(insn->code) >> 4], - insn->imm, insn->off); + (u32)insn->imm, insn->off); } } else { verbose(cbs->private_data, "(%02x) %s\n", diff --git a/kernel/bpf/dmabuf_iter.c b/kernel/bpf/dmabuf_iter.c new file mode 100644 index 000000000000..4dd7ef7c145c --- /dev/null +++ b/kernel/bpf/dmabuf_iter.c @@ -0,0 +1,150 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2025 Google LLC */ +#include <linux/bpf.h> +#include <linux/btf_ids.h> +#include <linux/dma-buf.h> +#include <linux/kernel.h> +#include <linux/seq_file.h> + +static void *dmabuf_iter_seq_start(struct seq_file *seq, loff_t *pos) +{ + if (*pos) + return NULL; + + return dma_buf_iter_begin(); +} + +static void *dmabuf_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct dma_buf *dmabuf = v; + + ++*pos; + + return dma_buf_iter_next(dmabuf); +} + +struct bpf_iter__dmabuf { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct dma_buf *, dmabuf); +}; + +static int __dmabuf_seq_show(struct seq_file *seq, void *v, bool in_stop) +{ + struct bpf_iter_meta meta = { + .seq = seq, + }; + struct bpf_iter__dmabuf ctx = { + .meta = &meta, + .dmabuf = v, + }; + struct bpf_prog *prog = bpf_iter_get_info(&meta, in_stop); + + if (prog) + return bpf_iter_run_prog(prog, &ctx); + + return 0; +} + +static int dmabuf_iter_seq_show(struct seq_file *seq, void *v) +{ + return __dmabuf_seq_show(seq, v, false); +} + +static void dmabuf_iter_seq_stop(struct seq_file *seq, void *v) +{ + struct dma_buf *dmabuf = v; + + if (dmabuf) + dma_buf_put(dmabuf); +} + +static const struct seq_operations dmabuf_iter_seq_ops = { + .start = dmabuf_iter_seq_start, + .next = dmabuf_iter_seq_next, + .stop = dmabuf_iter_seq_stop, + .show = dmabuf_iter_seq_show, +}; + +static void bpf_iter_dmabuf_show_fdinfo(const struct bpf_iter_aux_info *aux, + struct seq_file *seq) +{ + seq_puts(seq, "dmabuf iter\n"); +} + +static const struct bpf_iter_seq_info dmabuf_iter_seq_info = { + 
.seq_ops = &dmabuf_iter_seq_ops, + .init_seq_private = NULL, + .fini_seq_private = NULL, + .seq_priv_size = 0, +}; + +static struct bpf_iter_reg bpf_dmabuf_reg_info = { + .target = "dmabuf", + .feature = BPF_ITER_RESCHED, + .show_fdinfo = bpf_iter_dmabuf_show_fdinfo, + .ctx_arg_info_size = 1, + .ctx_arg_info = { + { offsetof(struct bpf_iter__dmabuf, dmabuf), + PTR_TO_BTF_ID_OR_NULL }, + }, + .seq_info = &dmabuf_iter_seq_info, +}; + +DEFINE_BPF_ITER_FUNC(dmabuf, struct bpf_iter_meta *meta, struct dma_buf *dmabuf) +BTF_ID_LIST_SINGLE(bpf_dmabuf_btf_id, struct, dma_buf) + +static int __init dmabuf_iter_init(void) +{ + bpf_dmabuf_reg_info.ctx_arg_info[0].btf_id = bpf_dmabuf_btf_id[0]; + return bpf_iter_reg_target(&bpf_dmabuf_reg_info); +} + +late_initcall(dmabuf_iter_init); + +struct bpf_iter_dmabuf { + /* + * opaque iterator state; having __u64 here allows to preserve correct + * alignment requirements in vmlinux.h, generated from BTF + */ + __u64 __opaque[1]; +} __aligned(8); + +/* Non-opaque version of bpf_iter_dmabuf */ +struct bpf_iter_dmabuf_kern { + struct dma_buf *dmabuf; +} __aligned(8); + +__bpf_kfunc_start_defs(); + +__bpf_kfunc int bpf_iter_dmabuf_new(struct bpf_iter_dmabuf *it) +{ + struct bpf_iter_dmabuf_kern *kit = (void *)it; + + BUILD_BUG_ON(sizeof(*kit) > sizeof(*it)); + BUILD_BUG_ON(__alignof__(*kit) != __alignof__(*it)); + + kit->dmabuf = NULL; + return 0; +} + +__bpf_kfunc struct dma_buf *bpf_iter_dmabuf_next(struct bpf_iter_dmabuf *it) +{ + struct bpf_iter_dmabuf_kern *kit = (void *)it; + + if (kit->dmabuf) + kit->dmabuf = dma_buf_iter_next(kit->dmabuf); + else + kit->dmabuf = dma_buf_iter_begin(); + + return kit->dmabuf; +} + +__bpf_kfunc void bpf_iter_dmabuf_destroy(struct bpf_iter_dmabuf *it) +{ + struct bpf_iter_dmabuf_kern *kit = (void *)it; + + if (kit->dmabuf) + dma_buf_put(kit->dmabuf); +} + +__bpf_kfunc_end_defs(); diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 4a9eeb7aef85..71f9931ac64c 100644 --- a/kernel/bpf/hashtab.c 
+++ b/kernel/bpf/hashtab.c @@ -16,6 +16,7 @@ #include "bpf_lru_list.h" #include "map_in_map.h" #include <linux/bpf_mem_alloc.h> +#include <asm/rqspinlock.h> #define HTAB_CREATE_FLAG_MASK \ (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE | \ @@ -78,7 +79,7 @@ */ struct bucket { struct hlist_nulls_head head; - raw_spinlock_t raw_lock; + rqspinlock_t raw_lock; }; #define HASHTAB_MAP_LOCK_COUNT 8 @@ -104,8 +105,6 @@ struct bpf_htab { u32 n_buckets; /* number of hash buckets */ u32 elem_size; /* size of each element in bytes */ u32 hashrnd; - struct lock_class_key lockdep_key; - int __percpu *map_locked[HASHTAB_MAP_LOCK_COUNT]; }; /* each htab element is struct htab_elem + key + value */ @@ -140,45 +139,26 @@ static void htab_init_buckets(struct bpf_htab *htab) for (i = 0; i < htab->n_buckets; i++) { INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i); - raw_spin_lock_init(&htab->buckets[i].raw_lock); - lockdep_set_class(&htab->buckets[i].raw_lock, - &htab->lockdep_key); + raw_res_spin_lock_init(&htab->buckets[i].raw_lock); cond_resched(); } } -static inline int htab_lock_bucket(const struct bpf_htab *htab, - struct bucket *b, u32 hash, - unsigned long *pflags) +static inline int htab_lock_bucket(struct bucket *b, unsigned long *pflags) { unsigned long flags; + int ret; - hash = hash & min_t(u32, HASHTAB_MAP_LOCK_MASK, htab->n_buckets - 1); - - preempt_disable(); - local_irq_save(flags); - if (unlikely(__this_cpu_inc_return(*(htab->map_locked[hash])) != 1)) { - __this_cpu_dec(*(htab->map_locked[hash])); - local_irq_restore(flags); - preempt_enable(); - return -EBUSY; - } - - raw_spin_lock(&b->raw_lock); + ret = raw_res_spin_lock_irqsave(&b->raw_lock, flags); + if (ret) + return ret; *pflags = flags; - return 0; } -static inline void htab_unlock_bucket(const struct bpf_htab *htab, - struct bucket *b, u32 hash, - unsigned long flags) +static inline void htab_unlock_bucket(struct bucket *b, unsigned long flags) { - hash = hash & min_t(u32, HASHTAB_MAP_LOCK_MASK, 
htab->n_buckets - 1); - raw_spin_unlock(&b->raw_lock); - __this_cpu_dec(*(htab->map_locked[hash])); - local_irq_restore(flags); - preempt_enable(); + raw_res_spin_unlock_irqrestore(&b->raw_lock, flags); } static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node); @@ -195,20 +175,30 @@ static bool htab_is_percpu(const struct bpf_htab *htab) htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH; } +static inline bool is_fd_htab(const struct bpf_htab *htab) +{ + return htab->map.map_type == BPF_MAP_TYPE_HASH_OF_MAPS; +} + +static inline void *htab_elem_value(struct htab_elem *l, u32 key_size) +{ + return l->key + round_up(key_size, 8); +} + static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size, void __percpu *pptr) { - *(void __percpu **)(l->key + key_size) = pptr; + *(void __percpu **)htab_elem_value(l, key_size) = pptr; } static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size) { - return *(void __percpu **)(l->key + key_size); + return *(void __percpu **)htab_elem_value(l, key_size); } static void *fd_htab_map_get_ptr(const struct bpf_map *map, struct htab_elem *l) { - return *(void **)(l->key + roundup(map->key_size, 8)); + return *(void **)htab_elem_value(l, map->key_size); } static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i) @@ -216,9 +206,13 @@ static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i) return (struct htab_elem *) (htab->elems + i * (u64)htab->elem_size); } +/* Both percpu and fd htab support in-place update, so no need for + * extra elem. LRU itself can remove the least used element, so + * there is no need for an extra elem during map_update. 
+ */ static bool htab_has_extra_elems(struct bpf_htab *htab) { - return !htab_is_percpu(htab) && !htab_is_lru(htab); + return !htab_is_percpu(htab) && !htab_is_lru(htab) && !is_fd_htab(htab); } static void htab_free_prealloced_timers_and_wq(struct bpf_htab *htab) @@ -235,10 +229,10 @@ static void htab_free_prealloced_timers_and_wq(struct bpf_htab *htab) elem = get_htab_elem(htab, i); if (btf_record_has_field(htab->map.record, BPF_TIMER)) bpf_obj_free_timer(htab->map.record, - elem->key + round_up(htab->map.key_size, 8)); + htab_elem_value(elem, htab->map.key_size)); if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE)) bpf_obj_free_workqueue(htab->map.record, - elem->key + round_up(htab->map.key_size, 8)); + htab_elem_value(elem, htab->map.key_size)); cond_resched(); } } @@ -265,7 +259,8 @@ static void htab_free_prealloced_fields(struct bpf_htab *htab) cond_resched(); } } else { - bpf_obj_free_fields(htab->map.record, elem->key + round_up(htab->map.key_size, 8)); + bpf_obj_free_fields(htab->map.record, + htab_elem_value(elem, htab->map.key_size)); cond_resched(); } cond_resched(); @@ -473,8 +468,6 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) { bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH || attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH); - bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH || - attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH); /* percpu_lru means each cpu has its own LRU list. * it is different from BPF_MAP_TYPE_PERCPU_HASH where * the map's value itself is percpu. 
percpu_lru has @@ -483,14 +476,12 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU); bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC); struct bpf_htab *htab; - int err, i; + int err; htab = bpf_map_area_alloc(sizeof(*htab), NUMA_NO_NODE); if (!htab) return ERR_PTR(-ENOMEM); - lockdep_register_key(&htab->lockdep_key); - bpf_map_init_from_attr(&htab->map, attr); if (percpu_lru) { @@ -536,15 +527,6 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) if (!htab->buckets) goto free_elem_count; - for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) { - htab->map_locked[i] = bpf_map_alloc_percpu(&htab->map, - sizeof(int), - sizeof(int), - GFP_USER); - if (!htab->map_locked[i]) - goto free_map_locked; - } - if (htab->map.map_flags & BPF_F_ZERO_SEED) htab->hashrnd = 0; else @@ -580,10 +562,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) if (err) goto free_map_locked; - if (!percpu && !lru) { - /* lru itself can remove the least used element, so - * there is no need for an extra elem during map_update. 
- */ + if (htab_has_extra_elems(htab)) { err = alloc_extra_elems(htab); if (err) goto free_prealloc; @@ -607,15 +586,12 @@ free_prealloc: free_map_locked: if (htab->use_percpu_counter) percpu_counter_destroy(&htab->pcount); - for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) - free_percpu(htab->map_locked[i]); bpf_map_area_free(htab->buckets); bpf_mem_alloc_destroy(&htab->pcpu_ma); bpf_mem_alloc_destroy(&htab->ma); free_elem_count: bpf_map_free_elem_count(&htab->map); free_htab: - lockdep_unregister_key(&htab->lockdep_key); bpf_map_area_free(htab); return ERR_PTR(err); } @@ -704,7 +680,7 @@ static void *htab_map_lookup_elem(struct bpf_map *map, void *key) struct htab_elem *l = __htab_map_lookup_elem(map, key); if (l) - return l->key + round_up(map->key_size, 8); + return htab_elem_value(l, map->key_size); return NULL; } @@ -743,7 +719,7 @@ static __always_inline void *__htab_lru_map_lookup_elem(struct bpf_map *map, if (l) { if (mark) bpf_lru_node_set_ref(&l->lru_node); - return l->key + round_up(map->key_size, 8); + return htab_elem_value(l, map->key_size); } return NULL; @@ -787,6 +763,9 @@ static int htab_lru_map_gen_lookup(struct bpf_map *map, static void check_and_free_fields(struct bpf_htab *htab, struct htab_elem *elem) { + if (IS_ERR_OR_NULL(htab->map.record)) + return; + if (htab_is_percpu(htab)) { void __percpu *pptr = htab_elem_get_ptr(elem, htab->map.key_size); int cpu; @@ -794,7 +773,7 @@ static void check_and_free_fields(struct bpf_htab *htab, for_each_possible_cpu(cpu) bpf_obj_free_fields(htab->map.record, per_cpu_ptr(pptr, cpu)); } else { - void *map_value = elem->key + round_up(htab->map.key_size, 8); + void *map_value = htab_elem_value(elem, htab->map.key_size); bpf_obj_free_fields(htab->map.record, map_value); } @@ -817,7 +796,7 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node) b = __select_bucket(htab, tgt_l->hash); head = &b->head; - ret = htab_lock_bucket(htab, b, tgt_l->hash, &flags); + ret = htab_lock_bucket(b, &flags); 
if (ret) return false; @@ -828,7 +807,7 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node) break; } - htab_unlock_bucket(htab, b, tgt_l->hash, flags); + htab_unlock_bucket(b, flags); if (l == tgt_l) check_and_free_fields(htab, l); @@ -999,8 +978,7 @@ static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr, static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab) { - return htab->map.map_type == BPF_MAP_TYPE_HASH_OF_MAPS && - BITS_PER_LONG == 64; + return is_fd_htab(htab) && BITS_PER_LONG == 64; } static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, @@ -1070,11 +1048,9 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, htab_elem_set_ptr(l_new, key_size, pptr); } else if (fd_htab_map_needs_adjust(htab)) { size = round_up(size, 8); - memcpy(l_new->key + round_up(key_size, 8), value, size); + memcpy(htab_elem_value(l_new, key_size), value, size); } else { - copy_map_value(&htab->map, - l_new->key + round_up(key_size, 8), - value); + copy_map_value(&htab->map, htab_elem_value(l_new, key_size), value); } l_new->hash = hash; @@ -1103,10 +1079,9 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - struct htab_elem *l_new = NULL, *l_old; + struct htab_elem *l_new, *l_old; struct hlist_nulls_head *head; unsigned long flags; - void *old_map_ptr; struct bucket *b; u32 key_size, hash; int ret; @@ -1137,7 +1112,7 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value, if (l_old) { /* grab the element lock and update value in place */ copy_map_value_locked(map, - l_old->key + round_up(key_size, 8), + htab_elem_value(l_old, key_size), value, false); return 0; } @@ -1147,7 +1122,7 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value, */ } - ret = htab_lock_bucket(htab, b, hash, &flags); + ret = htab_lock_bucket(b, &flags); 
if (ret) return ret; @@ -1165,7 +1140,7 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value, * and update element in place */ copy_map_value_locked(map, - l_old->key + round_up(key_size, 8), + htab_elem_value(l_old, key_size), value, false); ret = 0; goto err; @@ -1187,27 +1162,17 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value, hlist_nulls_del_rcu(&l_old->hash_node); /* l_old has already been stashed in htab->extra_elems, free - * its special fields before it is available for reuse. Also - * save the old map pointer in htab of maps before unlock - * and release it after unlock. + * its special fields before it is available for reuse. */ - old_map_ptr = NULL; - if (htab_is_prealloc(htab)) { - if (map->ops->map_fd_put_ptr) - old_map_ptr = fd_htab_map_get_ptr(map, l_old); + if (htab_is_prealloc(htab)) check_and_free_fields(htab, l_old); - } - } - htab_unlock_bucket(htab, b, hash, flags); - if (l_old) { - if (old_map_ptr) - map->ops->map_fd_put_ptr(map, old_map_ptr, true); - if (!htab_is_prealloc(htab)) - free_htab_elem(htab, l_old); } + htab_unlock_bucket(b, flags); + if (l_old && !htab_is_prealloc(htab)) + free_htab_elem(htab, l_old); return 0; err: - htab_unlock_bucket(htab, b, hash, flags); + htab_unlock_bucket(b, flags); return ret; } @@ -1251,10 +1216,9 @@ static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value l_new = prealloc_lru_pop(htab, key, hash); if (!l_new) return -ENOMEM; - copy_map_value(&htab->map, - l_new->key + round_up(map->key_size, 8), value); + copy_map_value(&htab->map, htab_elem_value(l_new, map->key_size), value); - ret = htab_lock_bucket(htab, b, hash, &flags); + ret = htab_lock_bucket(b, &flags); if (ret) goto err_lock_bucket; @@ -1275,7 +1239,7 @@ static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value ret = 0; err: - htab_unlock_bucket(htab, b, hash, flags); + htab_unlock_bucket(b, flags); err_lock_bucket: if (ret) @@ -1286,13 
+1250,14 @@ err_lock_bucket: return ret; } -static long __htab_percpu_map_update_elem(struct bpf_map *map, void *key, +static long htab_map_update_elem_in_place(struct bpf_map *map, void *key, void *value, u64 map_flags, - bool onallcpus) + bool percpu, bool onallcpus) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - struct htab_elem *l_new = NULL, *l_old; + struct htab_elem *l_new, *l_old; struct hlist_nulls_head *head; + void *old_map_ptr = NULL; unsigned long flags; struct bucket *b; u32 key_size, hash; @@ -1312,7 +1277,7 @@ static long __htab_percpu_map_update_elem(struct bpf_map *map, void *key, b = __select_bucket(htab, hash); head = &b->head; - ret = htab_lock_bucket(htab, b, hash, &flags); + ret = htab_lock_bucket(b, &flags); if (ret) return ret; @@ -1323,21 +1288,29 @@ static long __htab_percpu_map_update_elem(struct bpf_map *map, void *key, goto err; if (l_old) { - /* per-cpu hash map can update value in-place */ - pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size), - value, onallcpus); + /* Update value in-place */ + if (percpu) { + pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size), + value, onallcpus); + } else { + void **inner_map_pptr = htab_elem_value(l_old, key_size); + + old_map_ptr = *inner_map_pptr; + WRITE_ONCE(*inner_map_pptr, *(void **)value); + } } else { l_new = alloc_htab_elem(htab, key, value, key_size, - hash, true, onallcpus, NULL); + hash, percpu, onallcpus, NULL); if (IS_ERR(l_new)) { ret = PTR_ERR(l_new); goto err; } hlist_nulls_add_head_rcu(&l_new->hash_node, head); } - ret = 0; err: - htab_unlock_bucket(htab, b, hash, flags); + htab_unlock_bucket(b, flags); + if (old_map_ptr) + map->ops->map_fd_put_ptr(map, old_map_ptr, true); return ret; } @@ -1378,7 +1351,7 @@ static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, return -ENOMEM; } - ret = htab_lock_bucket(htab, b, hash, &flags); + ret = htab_lock_bucket(b, &flags); if (ret) goto err_lock_bucket; @@ -1402,7 +1375,7 @@ static 
long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, } ret = 0; err: - htab_unlock_bucket(htab, b, hash, flags); + htab_unlock_bucket(b, flags); err_lock_bucket: if (l_new) { bpf_map_dec_elem_count(&htab->map); @@ -1414,7 +1387,7 @@ err_lock_bucket: static long htab_percpu_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { - return __htab_percpu_map_update_elem(map, key, value, map_flags, false); + return htab_map_update_elem_in_place(map, key, value, map_flags, true, false); } static long htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, @@ -1444,7 +1417,7 @@ static long htab_map_delete_elem(struct bpf_map *map, void *key) b = __select_bucket(htab, hash); head = &b->head; - ret = htab_lock_bucket(htab, b, hash, &flags); + ret = htab_lock_bucket(b, &flags); if (ret) return ret; @@ -1454,7 +1427,7 @@ static long htab_map_delete_elem(struct bpf_map *map, void *key) else ret = -ENOENT; - htab_unlock_bucket(htab, b, hash, flags); + htab_unlock_bucket(b, flags); if (l) free_htab_elem(htab, l); @@ -1480,7 +1453,7 @@ static long htab_lru_map_delete_elem(struct bpf_map *map, void *key) b = __select_bucket(htab, hash); head = &b->head; - ret = htab_lock_bucket(htab, b, hash, &flags); + ret = htab_lock_bucket(b, &flags); if (ret) return ret; @@ -1491,7 +1464,7 @@ static long htab_lru_map_delete_elem(struct bpf_map *map, void *key) else ret = -ENOENT; - htab_unlock_bucket(htab, b, hash, flags); + htab_unlock_bucket(b, flags); if (l) htab_lru_push_free(htab, l); return ret; @@ -1531,10 +1504,10 @@ static void htab_free_malloced_timers_and_wq(struct bpf_htab *htab) /* We only free timer on uref dropping to zero */ if (btf_record_has_field(htab->map.record, BPF_TIMER)) bpf_obj_free_timer(htab->map.record, - l->key + round_up(htab->map.key_size, 8)); + htab_elem_value(l, htab->map.key_size)); if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE)) bpf_obj_free_workqueue(htab->map.record, - l->key + 
round_up(htab->map.key_size, 8)); + htab_elem_value(l, htab->map.key_size)); } cond_resched_rcu(); } @@ -1558,7 +1531,6 @@ static void htab_map_free_timers_and_wq(struct bpf_map *map) static void htab_map_free(struct bpf_map *map) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - int i; /* bpf_free_used_maps() or close(map_fd) will trigger this map_free callback. * bpf_free_used_maps() is called after bpf prog is no longer executing. @@ -1583,9 +1555,6 @@ static void htab_map_free(struct bpf_map *map) bpf_mem_alloc_destroy(&htab->ma); if (htab->use_percpu_counter) percpu_counter_destroy(&htab->pcount); - for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) - free_percpu(htab->map_locked[i]); - lockdep_unregister_key(&htab->lockdep_key); bpf_map_area_free(htab); } @@ -1628,7 +1597,7 @@ static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key, b = __select_bucket(htab, hash); head = &b->head; - ret = htab_lock_bucket(htab, b, hash, &bflags); + ret = htab_lock_bucket(b, &bflags); if (ret) return ret; @@ -1650,22 +1619,19 @@ static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key, off += roundup_value_size; } } else { - u32 roundup_key_size = round_up(map->key_size, 8); + void *src = htab_elem_value(l, map->key_size); if (flags & BPF_F_LOCK) - copy_map_value_locked(map, value, l->key + - roundup_key_size, - true); + copy_map_value_locked(map, value, src, true); else - copy_map_value(map, value, l->key + - roundup_key_size); + copy_map_value(map, value, src); /* Zeroing special fields in the temp buffer */ check_and_init_map_value(map, value); } hlist_nulls_del_rcu(&l->hash_node); out_unlock: - htab_unlock_bucket(htab, b, hash, bflags); + htab_unlock_bucket(b, bflags); if (l) { if (is_lru_map) @@ -1715,12 +1681,12 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map, bool is_percpu) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - u32 bucket_cnt, total, key_size, value_size, roundup_key_size; 
void *keys = NULL, *values = NULL, *value, *dst_key, *dst_val; void __user *uvalues = u64_to_user_ptr(attr->batch.values); void __user *ukeys = u64_to_user_ptr(attr->batch.keys); void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch); u32 batch, max_count, size, bucket_size, map_id; + u32 bucket_cnt, total, key_size, value_size; struct htab_elem *node_to_free = NULL; u64 elem_map_flags, map_flags; struct hlist_nulls_head *head; @@ -1755,7 +1721,6 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map, return -ENOENT; key_size = htab->map.key_size; - roundup_key_size = round_up(htab->map.key_size, 8); value_size = htab->map.value_size; size = round_up(value_size, 8); if (is_percpu) @@ -1787,7 +1752,7 @@ again_nocopy: head = &b->head; /* do not grab the lock unless need it (bucket_cnt > 0). */ if (locked) { - ret = htab_lock_bucket(htab, b, batch, &flags); + ret = htab_lock_bucket(b, &flags); if (ret) { rcu_read_unlock(); bpf_enable_instrumentation(); @@ -1810,7 +1775,7 @@ again_nocopy: /* Note that since bucket_cnt > 0 here, it is implicit * that the locked was grabbed, so release it. */ - htab_unlock_bucket(htab, b, batch, flags); + htab_unlock_bucket(b, flags); rcu_read_unlock(); bpf_enable_instrumentation(); goto after_loop; @@ -1821,7 +1786,7 @@ again_nocopy: /* Note that since bucket_cnt > 0 here, it is implicit * that the locked was grabbed, so release it. 
*/ - htab_unlock_bucket(htab, b, batch, flags); + htab_unlock_bucket(b, flags); rcu_read_unlock(); bpf_enable_instrumentation(); kvfree(keys); @@ -1847,8 +1812,8 @@ again_nocopy: off += size; } } else { - value = l->key + roundup_key_size; - if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) { + value = htab_elem_value(l, key_size); + if (is_fd_htab(htab)) { struct bpf_map **inner_map = value; /* Actual value is the id of the inner map */ @@ -1884,7 +1849,7 @@ again_nocopy: dst_val += value_size; } - htab_unlock_bucket(htab, b, batch, flags); + htab_unlock_bucket(b, flags); locked = false; while (node_to_free) { @@ -2098,11 +2063,11 @@ static void *bpf_hash_map_seq_next(struct seq_file *seq, void *v, loff_t *pos) static int __bpf_hash_map_seq_show(struct seq_file *seq, struct htab_elem *elem) { struct bpf_iter_seq_hash_map_info *info = seq->private; - u32 roundup_key_size, roundup_value_size; struct bpf_iter__bpf_map_elem ctx = {}; struct bpf_map *map = info->map; struct bpf_iter_meta meta; int ret = 0, off = 0, cpu; + u32 roundup_value_size; struct bpf_prog *prog; void __percpu *pptr; @@ -2112,10 +2077,9 @@ static int __bpf_hash_map_seq_show(struct seq_file *seq, struct htab_elem *elem) ctx.meta = &meta; ctx.map = info->map; if (elem) { - roundup_key_size = round_up(map->key_size, 8); ctx.key = elem->key; if (!info->percpu_value_buf) { - ctx.value = elem->key + roundup_key_size; + ctx.value = htab_elem_value(elem, map->key_size); } else { roundup_value_size = round_up(map->value_size, 8); pptr = htab_elem_get_ptr(elem, map->key_size); @@ -2200,7 +2164,6 @@ static long bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_ struct hlist_nulls_head *head; struct hlist_nulls_node *n; struct htab_elem *elem; - u32 roundup_key_size; int i, num_elems = 0; void __percpu *pptr; struct bucket *b; @@ -2215,7 +2178,6 @@ static long bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_ is_percpu = htab_is_percpu(htab); - roundup_key_size = 
round_up(map->key_size, 8); /* migration has been disabled, so percpu value prepared here will be * the same as the one seen by the bpf program with * bpf_map_lookup_elem(). @@ -2224,14 +2186,14 @@ static long bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_ b = &htab->buckets[i]; rcu_read_lock(); head = &b->head; - hlist_nulls_for_each_entry_rcu(elem, n, head, hash_node) { + hlist_nulls_for_each_entry_safe(elem, n, head, hash_node) { key = elem->key; if (is_percpu) { /* current cpu value for percpu map */ pptr = htab_elem_get_ptr(elem, map->key_size); val = this_cpu_ptr(pptr); } else { - val = elem->key + roundup_key_size; + val = htab_elem_value(elem, map->key_size); } num_elems++; ret = callback_fn((u64)(long)map, (u64)(long)key, @@ -2354,7 +2316,7 @@ static int htab_percpu_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn *insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem); *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3); *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, - offsetof(struct htab_elem, key) + map->key_size); + offsetof(struct htab_elem, key) + roundup(map->key_size, 8)); *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0); *insn++ = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0); @@ -2446,8 +2408,8 @@ int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, ret = __htab_lru_percpu_map_update_elem(map, key, value, map_flags, true); else - ret = __htab_percpu_map_update_elem(map, key, value, map_flags, - true); + ret = htab_map_update_elem_in_place(map, key, value, map_flags, + true, true); rcu_read_unlock(); return ret; @@ -2571,24 +2533,23 @@ int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value) return ret; } -/* only called from syscall */ +/* Only called from syscall */ int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file, void *key, void *value, u64 map_flags) { void *ptr; int ret; - u32 ufd = *(u32 *)value; - ptr = map->ops->map_fd_get_ptr(map, map_file, ufd); + ptr = 
map->ops->map_fd_get_ptr(map, map_file, *(int *)value); if (IS_ERR(ptr)) return PTR_ERR(ptr); /* The htab bucket lock is always held during update operations in fd * htab map, and the following rcu_read_lock() is only used to avoid - * the WARN_ON_ONCE in htab_map_update_elem(). + * the WARN_ON_ONCE in htab_map_update_elem_in_place(). */ rcu_read_lock(); - ret = htab_map_update_elem(map, key, &ptr, map_flags); + ret = htab_map_update_elem_in_place(map, key, &ptr, map_flags, false, false); rcu_read_unlock(); if (ret) map->ops->map_fd_put_ptr(map, ptr, false); diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index f27ce162427a..b71e428ad936 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -23,6 +23,7 @@ #include <linux/btf_ids.h> #include <linux/bpf_mem_alloc.h> #include <linux/kasan.h> +#include <linux/bpf_verifier.h> #include "../../lib/kstrtox.h" @@ -129,7 +130,8 @@ const struct bpf_func_proto bpf_map_peek_elem_proto = { BPF_CALL_3(bpf_map_lookup_percpu_elem, struct bpf_map *, map, void *, key, u32, cpu) { - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && + !rcu_read_lock_bh_held()); return (unsigned long) map->ops->map_lookup_percpu_elem(map, key, cpu); } @@ -1284,8 +1286,7 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u atomic_set(&t->cancelling, 0); INIT_WORK(&t->cb.delete_work, bpf_timer_delete_work); - hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT); - t->timer.function = bpf_timer_cb; + hrtimer_setup(&t->timer, bpf_timer_cb, clockid, HRTIMER_MODE_REL_SOFT); cb->value = (void *)async - map->record->timer_off; break; case BPF_ASYNC_TYPE_WQ: @@ -1714,16 +1715,6 @@ void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr) memset(ptr, 0, sizeof(*ptr)); } -static int bpf_dynptr_check_off_len(const struct bpf_dynptr_kern *ptr, u32 offset, u32 len) -{ - u32 size = __bpf_dynptr_size(ptr); - - if (len > size || offset > 
size - len) - return -E2BIG; - - return 0; -} - BPF_CALL_4(bpf_dynptr_from_mem, void *, data, u32, size, u64, flags, struct bpf_dynptr_kern *, ptr) { int err; @@ -1759,8 +1750,8 @@ static const struct bpf_func_proto bpf_dynptr_from_mem_proto = { .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT | MEM_WRITE, }; -BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern *, src, - u32, offset, u64, flags) +static int __bpf_dynptr_read(void *dst, u32 len, const struct bpf_dynptr_kern *src, + u32 offset, u64 flags) { enum bpf_dynptr_type type; int err; @@ -1793,6 +1784,12 @@ BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern } } +BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern *, src, + u32, offset, u64, flags) +{ + return __bpf_dynptr_read(dst, len, src, offset, flags); +} + static const struct bpf_func_proto bpf_dynptr_read_proto = { .func = bpf_dynptr_read, .gpl_only = false, @@ -1804,8 +1801,8 @@ static const struct bpf_func_proto bpf_dynptr_read_proto = { .arg5_type = ARG_ANYTHING, }; -BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u32, offset, void *, src, - u32, len, u64, flags) +int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u32 offset, void *src, + u32 len, u64 flags) { enum bpf_dynptr_type type; int err; @@ -1843,6 +1840,12 @@ BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u32, offset, v } } +BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u32, offset, void *, src, + u32, len, u64, flags) +{ + return __bpf_dynptr_write(dst, offset, src, len, flags); +} + static const struct bpf_func_proto bpf_dynptr_write_proto = { .func = bpf_dynptr_write, .gpl_only = false, @@ -1901,6 +1904,12 @@ const struct bpf_func_proto bpf_probe_read_user_str_proto __weak; const struct bpf_func_proto bpf_probe_read_kernel_proto __weak; const struct bpf_func_proto bpf_probe_read_kernel_str_proto __weak; const struct 
bpf_func_proto bpf_task_pt_regs_proto __weak; +const struct bpf_func_proto bpf_perf_event_read_proto __weak; +const struct bpf_func_proto bpf_send_signal_proto __weak; +const struct bpf_func_proto bpf_send_signal_thread_proto __weak; +const struct bpf_func_proto bpf_get_task_stack_sleepable_proto __weak; +const struct bpf_func_proto bpf_get_task_stack_proto __weak; +const struct bpf_func_proto bpf_get_branch_snapshot_proto __weak; const struct bpf_func_proto * bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) @@ -1954,6 +1963,8 @@ bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_current_pid_tgid_proto; case BPF_FUNC_get_ns_current_pid_tgid: return &bpf_get_ns_current_pid_tgid_proto; + case BPF_FUNC_get_current_uid_gid: + return &bpf_get_current_uid_gid_proto; default: break; } @@ -2011,7 +2022,21 @@ bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_current_cgroup_id_proto; case BPF_FUNC_get_current_ancestor_cgroup_id: return &bpf_get_current_ancestor_cgroup_id_proto; + case BPF_FUNC_current_task_under_cgroup: + return &bpf_current_task_under_cgroup_proto; #endif +#ifdef CONFIG_CGROUP_NET_CLASSID + case BPF_FUNC_get_cgroup_classid: + return &bpf_get_cgroup_classid_curr_proto; +#endif + case BPF_FUNC_task_storage_get: + if (bpf_prog_check_recur(prog)) + return &bpf_task_storage_get_recur_proto; + return &bpf_task_storage_get_proto; + case BPF_FUNC_task_storage_delete: + if (bpf_prog_check_recur(prog)) + return &bpf_task_storage_delete_recur_proto; + return &bpf_task_storage_delete_proto; default: break; } @@ -2026,6 +2051,8 @@ bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_current_task_proto; case BPF_FUNC_get_current_task_btf: return &bpf_get_current_task_btf_proto; + case BPF_FUNC_get_current_comm: + return &bpf_get_current_comm_proto; case BPF_FUNC_probe_read_user: return &bpf_probe_read_user_proto; case 
BPF_FUNC_probe_read_kernel: @@ -2036,6 +2063,10 @@ bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_probe_read_kernel_str: return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ? NULL : &bpf_probe_read_kernel_str_proto; + case BPF_FUNC_copy_from_user: + return &bpf_copy_from_user_proto; + case BPF_FUNC_copy_from_user_task: + return &bpf_copy_from_user_task_proto; case BPF_FUNC_snprintf_btf: return &bpf_snprintf_btf_proto; case BPF_FUNC_snprintf: @@ -2044,6 +2075,21 @@ bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_task_pt_regs_proto; case BPF_FUNC_trace_vprintk: return bpf_get_trace_vprintk_proto(); + case BPF_FUNC_perf_event_read_value: + return bpf_get_perf_event_read_value_proto(); + case BPF_FUNC_perf_event_read: + return &bpf_perf_event_read_proto; + case BPF_FUNC_send_signal: + return &bpf_send_signal_proto; + case BPF_FUNC_send_signal_thread: + return &bpf_send_signal_thread_proto; + case BPF_FUNC_get_task_stack: + return prog->sleepable ? 
&bpf_get_task_stack_sleepable_proto + : &bpf_get_task_stack_proto; + case BPF_FUNC_get_branch_snapshot: + return &bpf_get_branch_snapshot_proto; + case BPF_FUNC_find_vma: + return &bpf_find_vma_proto; default: return NULL; } @@ -2280,6 +2326,26 @@ __bpf_kfunc struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head) return __bpf_list_del(head, true); } +__bpf_kfunc struct bpf_list_node *bpf_list_front(struct bpf_list_head *head) +{ + struct list_head *h = (struct list_head *)head; + + if (list_empty(h) || unlikely(!h->next)) + return NULL; + + return (struct bpf_list_node *)h->next; +} + +__bpf_kfunc struct bpf_list_node *bpf_list_back(struct bpf_list_head *head) +{ + struct list_head *h = (struct list_head *)head; + + if (list_empty(h) || unlikely(!h->next)) + return NULL; + + return (struct bpf_list_node *)h->prev; +} + __bpf_kfunc struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root, struct bpf_rb_node *node) { @@ -2353,6 +2419,33 @@ __bpf_kfunc struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root) return (struct bpf_rb_node *)rb_first_cached(r); } +__bpf_kfunc struct bpf_rb_node *bpf_rbtree_root(struct bpf_rb_root *root) +{ + struct rb_root_cached *r = (struct rb_root_cached *)root; + + return (struct bpf_rb_node *)r->rb_root.rb_node; +} + +__bpf_kfunc struct bpf_rb_node *bpf_rbtree_left(struct bpf_rb_root *root, struct bpf_rb_node *node) +{ + struct bpf_rb_node_kern *node_internal = (struct bpf_rb_node_kern *)node; + + if (READ_ONCE(node_internal->owner) != root) + return NULL; + + return (struct bpf_rb_node *)node_internal->rb_node.rb_left; +} + +__bpf_kfunc struct bpf_rb_node *bpf_rbtree_right(struct bpf_rb_root *root, struct bpf_rb_node *node) +{ + struct bpf_rb_node_kern *node_internal = (struct bpf_rb_node_kern *)node; + + if (READ_ONCE(node_internal->owner) != root) + return NULL; + + return (struct bpf_rb_node *)node_internal->rb_node.rb_right; +} + /** * bpf_task_acquire - Acquire a reference to a task. 
A task acquired by this * kfunc which is not stored in a map as a kptr, must be released by calling @@ -2758,6 +2851,61 @@ __bpf_kfunc int bpf_dynptr_clone(const struct bpf_dynptr *p, return 0; } +/** + * bpf_dynptr_copy() - Copy data from one dynptr to another. + * @dst_ptr: Destination dynptr - where data should be copied to + * @dst_off: Offset into the destination dynptr + * @src_ptr: Source dynptr - where data should be copied from + * @src_off: Offset into the source dynptr + * @size: Length of the data to copy from source to destination + * + * Copies data from source dynptr to destination dynptr. + * Returns 0 on success; negative error, otherwise. + */ +__bpf_kfunc int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, u32 dst_off, + struct bpf_dynptr *src_ptr, u32 src_off, u32 size) +{ + struct bpf_dynptr_kern *dst = (struct bpf_dynptr_kern *)dst_ptr; + struct bpf_dynptr_kern *src = (struct bpf_dynptr_kern *)src_ptr; + void *src_slice, *dst_slice; + char buf[256]; + u32 off; + + src_slice = bpf_dynptr_slice(src_ptr, src_off, NULL, size); + dst_slice = bpf_dynptr_slice_rdwr(dst_ptr, dst_off, NULL, size); + + if (src_slice && dst_slice) { + memmove(dst_slice, src_slice, size); + return 0; + } + + if (src_slice) + return __bpf_dynptr_write(dst, dst_off, src_slice, size, 0); + + if (dst_slice) + return __bpf_dynptr_read(dst_slice, size, src, src_off, 0); + + if (bpf_dynptr_check_off_len(dst, dst_off, size) || + bpf_dynptr_check_off_len(src, src_off, size)) + return -E2BIG; + + off = 0; + while (off < size) { + u32 chunk_sz = min_t(u32, sizeof(buf), size - off); + int err; + + err = __bpf_dynptr_read(buf, chunk_sz, src, src_off + off, 0); + if (err) + return err; + err = __bpf_dynptr_write(dst, dst_off + off, buf, chunk_sz, 0); + if (err) + return err; + + off += chunk_sz; + } + return 0; +} + __bpf_kfunc void *bpf_cast_to_kern_ctx(void *obj) { return obj; @@ -2855,9 +3003,9 @@ __bpf_kfunc int bpf_wq_start(struct bpf_wq *wq, unsigned int flags) __bpf_kfunc int 
bpf_wq_set_callback_impl(struct bpf_wq *wq, int (callback_fn)(void *map, int *key, void *value), unsigned int flags, - void *aux__ign) + void *aux__prog) { - struct bpf_prog_aux *aux = (struct bpf_prog_aux *)aux__ign; + struct bpf_prog_aux *aux = (struct bpf_prog_aux *)aux__prog; struct bpf_async_kern *async = (struct bpf_async_kern *)wq; if (flags) @@ -3067,6 +3215,50 @@ __bpf_kfunc int bpf_copy_from_user_str(void *dst, u32 dst__sz, const void __user return ret + 1; } +/** + * bpf_copy_from_user_task_str() - Copy a string from an task's address space + * @dst: Destination address, in kernel space. This buffer must be + * at least @dst__sz bytes long. + * @dst__sz: Maximum number of bytes to copy, includes the trailing NUL. + * @unsafe_ptr__ign: Source address in the task's address space. + * @tsk: The task whose address space will be used + * @flags: The only supported flag is BPF_F_PAD_ZEROS + * + * Copies a NUL terminated string from a task's address space to @dst__sz + * buffer. If user string is too long this will still ensure zero termination + * in the @dst__sz buffer unless buffer size is 0. + * + * If BPF_F_PAD_ZEROS flag is set, memset the tail of @dst__sz to 0 on success + * and memset all of @dst__sz on failure. + * + * Return: The number of copied bytes on success including the NUL terminator. + * A negative error code on failure. 
+ */ +__bpf_kfunc int bpf_copy_from_user_task_str(void *dst, u32 dst__sz, + const void __user *unsafe_ptr__ign, + struct task_struct *tsk, u64 flags) +{ + int ret; + + if (unlikely(flags & ~BPF_F_PAD_ZEROS)) + return -EINVAL; + + if (unlikely(dst__sz == 0)) + return 0; + + ret = copy_remote_vm_str(tsk, (unsigned long)unsafe_ptr__ign, dst, dst__sz, 0); + if (ret < 0) { + if (flags & BPF_F_PAD_ZEROS) + memset(dst, 0, dst__sz); + return ret; + } + + if (flags & BPF_F_PAD_ZEROS) + memset(dst + ret, 0, dst__sz - ret); + + return ret + 1; +} + /* Keep unsinged long in prototype so that kfunc is usable when emitted to * vmlinux.h in BPF programs directly, but note that while in BPF prog, the * unsigned long always points to 8-byte region on stack, the kernel may only @@ -3082,6 +3274,10 @@ __bpf_kfunc void bpf_local_irq_restore(unsigned long *flags__irq_flag) local_irq_restore(*flags__irq_flag); } +__bpf_kfunc void __bpf_trap(void) +{ +} + __bpf_kfunc_end_defs(); BTF_KFUNCS_START(generic_btf_ids) @@ -3097,11 +3293,16 @@ BTF_ID_FLAGS(func, bpf_list_push_front_impl) BTF_ID_FLAGS(func, bpf_list_push_back_impl) BTF_ID_FLAGS(func, bpf_list_pop_front, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_list_pop_back, KF_ACQUIRE | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_list_front, KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_list_back, KF_RET_NULL) BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_task_release, KF_RELEASE) BTF_ID_FLAGS(func, bpf_rbtree_remove, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_rbtree_add_impl) BTF_ID_FLAGS(func, bpf_rbtree_first, KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_rbtree_root, KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_rbtree_left, KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_rbtree_right, KF_RET_NULL) #ifdef CONFIG_CGROUPS BTF_ID_FLAGS(func, bpf_cgroup_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL) @@ -3162,6 +3363,7 @@ BTF_ID_FLAGS(func, bpf_dynptr_is_null) BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly) BTF_ID_FLAGS(func, 
bpf_dynptr_size) BTF_ID_FLAGS(func, bpf_dynptr_clone) +BTF_ID_FLAGS(func, bpf_dynptr_copy) #ifdef CONFIG_NET BTF_ID_FLAGS(func, bpf_modify_return_test_tp) #endif @@ -3174,12 +3376,27 @@ BTF_ID_FLAGS(func, bpf_iter_bits_new, KF_ITER_NEW) BTF_ID_FLAGS(func, bpf_iter_bits_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_bits_destroy, KF_ITER_DESTROY) BTF_ID_FLAGS(func, bpf_copy_from_user_str, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_copy_from_user_task_str, KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_get_kmem_cache) BTF_ID_FLAGS(func, bpf_iter_kmem_cache_new, KF_ITER_NEW | KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_iter_kmem_cache_next, KF_ITER_NEXT | KF_RET_NULL | KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_iter_kmem_cache_destroy, KF_ITER_DESTROY | KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_local_irq_save) BTF_ID_FLAGS(func, bpf_local_irq_restore) +BTF_ID_FLAGS(func, bpf_probe_read_user_dynptr) +BTF_ID_FLAGS(func, bpf_probe_read_kernel_dynptr) +BTF_ID_FLAGS(func, bpf_probe_read_user_str_dynptr) +BTF_ID_FLAGS(func, bpf_probe_read_kernel_str_dynptr) +BTF_ID_FLAGS(func, bpf_copy_from_user_dynptr, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_copy_from_user_str_dynptr, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_copy_from_user_task_dynptr, KF_SLEEPABLE | KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_copy_from_user_task_str_dynptr, KF_SLEEPABLE | KF_TRUSTED_ARGS) +#ifdef CONFIG_DMA_SHARED_BUFFER +BTF_ID_FLAGS(func, bpf_iter_dmabuf_new, KF_ITER_NEW | KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_iter_dmabuf_next, KF_ITER_NEXT | KF_RET_NULL | KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_iter_dmabuf_destroy, KF_ITER_DESTROY | KF_SLEEPABLE) +#endif +BTF_ID_FLAGS(func, __bpf_trap) BTF_KFUNCS_END(common_btf_ids) static const struct btf_kfunc_id_set common_kfunc_set = { diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index dc3aa91a6ba0..5c2e96b19392 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -421,7 +421,7 @@ static int bpf_iter_link_pin_kernel(struct dentry *parent, int ret; inode_lock(parent->d_inode); - 
dentry = lookup_one_len(name, parent, strlen(name)); + dentry = lookup_noperm(&QSTR(name), parent); if (IS_ERR(dentry)) { inode_unlock(parent->d_inode); return PTR_ERR(dentry); diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index e8a772e64324..be66d7e520e0 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -15,6 +15,7 @@ #include <net/ipv6.h> #include <uapi/linux/btf.h> #include <linux/btf_ids.h> +#include <asm/rqspinlock.h> #include <linux/bpf_mem_alloc.h> /* Intermediate node */ @@ -36,7 +37,7 @@ struct lpm_trie { size_t n_entries; size_t max_prefixlen; size_t data_size; - raw_spinlock_t lock; + rqspinlock_t lock; }; /* This trie implements a longest prefix match algorithm that can be used to @@ -342,7 +343,9 @@ static long trie_update_elem(struct bpf_map *map, if (!new_node) return -ENOMEM; - raw_spin_lock_irqsave(&trie->lock, irq_flags); + ret = raw_res_spin_lock_irqsave(&trie->lock, irq_flags); + if (ret) + goto out_free; new_node->prefixlen = key->prefixlen; RCU_INIT_POINTER(new_node->child[0], NULL); @@ -356,8 +359,7 @@ static long trie_update_elem(struct bpf_map *map, */ slot = &trie->root; - while ((node = rcu_dereference_protected(*slot, - lockdep_is_held(&trie->lock)))) { + while ((node = rcu_dereference(*slot))) { matchlen = longest_prefix_match(trie, node, key); if (node->prefixlen != matchlen || @@ -442,8 +444,8 @@ static long trie_update_elem(struct bpf_map *map, rcu_assign_pointer(*slot, im_node); out: - raw_spin_unlock_irqrestore(&trie->lock, irq_flags); - + raw_res_spin_unlock_irqrestore(&trie->lock, irq_flags); +out_free: if (ret) bpf_mem_cache_free(&trie->ma, new_node); bpf_mem_cache_free_rcu(&trie->ma, free_node); @@ -467,7 +469,9 @@ static long trie_delete_elem(struct bpf_map *map, void *_key) if (key->prefixlen > trie->max_prefixlen) return -EINVAL; - raw_spin_lock_irqsave(&trie->lock, irq_flags); + ret = raw_res_spin_lock_irqsave(&trie->lock, irq_flags); + if (ret) + return ret; /* Walk the tree looking for an 
exact key/length match and keeping * track of the path we traverse. We will need to know the node @@ -478,8 +482,7 @@ static long trie_delete_elem(struct bpf_map *map, void *_key) trim = &trie->root; trim2 = trim; parent = NULL; - while ((node = rcu_dereference_protected( - *trim, lockdep_is_held(&trie->lock)))) { + while ((node = rcu_dereference(*trim))) { matchlen = longest_prefix_match(trie, node, key); if (node->prefixlen != matchlen || @@ -543,7 +546,7 @@ static long trie_delete_elem(struct bpf_map *map, void *_key) free_node = node; out: - raw_spin_unlock_irqrestore(&trie->lock, irq_flags); + raw_res_spin_unlock_irqrestore(&trie->lock, irq_flags); bpf_mem_cache_free_rcu(&trie->ma, free_parent); bpf_mem_cache_free_rcu(&trie->ma, free_node); @@ -592,7 +595,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr) offsetof(struct bpf_lpm_trie_key_u8, data); trie->max_prefixlen = trie->data_size * 8; - raw_spin_lock_init(&trie->lock); + raw_res_spin_lock_init(&trie->lock); /* Allocate intermediate and leaf nodes from the same allocator */ leaf_size = sizeof(struct lpm_trie_node) + trie->data_size + diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 1a4fec330eaa..42ae8d595c2c 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -25,6 +25,7 @@ #include <linux/rhashtable.h> #include <linux/rtnetlink.h> #include <linux/rwsem.h> +#include <net/netdev_lock.h> #include <net/xdp.h> /* Protects offdevs, members of bpf_offload_netdev and offload members @@ -528,13 +529,14 @@ struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr) return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&offmap->map, attr); - rtnl_lock(); - down_write(&bpf_devs_lock); offmap->netdev = __dev_get_by_index(net, attr->map_ifindex); err = bpf_dev_offload_check(offmap->netdev); if (err) - goto err_unlock; + goto err_unlock_rtnl; + + netdev_lock_ops(offmap->netdev); + down_write(&bpf_devs_lock); ondev = bpf_offload_find_netdev(offmap->netdev); if (!ondev) { @@ -548,12 
+550,15 @@ struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr) list_add_tail(&offmap->offloads, &ondev->maps); up_write(&bpf_devs_lock); + netdev_unlock_ops(offmap->netdev); rtnl_unlock(); return &offmap->map; err_unlock: up_write(&bpf_devs_lock); + netdev_unlock_ops(offmap->netdev); +err_unlock_rtnl: rtnl_unlock(); bpf_map_area_free(offmap); return ERR_PTR(err); diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c index 034cf87b54e9..632762b57299 100644 --- a/kernel/bpf/percpu_freelist.c +++ b/kernel/bpf/percpu_freelist.c @@ -14,11 +14,9 @@ int pcpu_freelist_init(struct pcpu_freelist *s) for_each_possible_cpu(cpu) { struct pcpu_freelist_head *head = per_cpu_ptr(s->freelist, cpu); - raw_spin_lock_init(&head->lock); + raw_res_spin_lock_init(&head->lock); head->first = NULL; } - raw_spin_lock_init(&s->extralist.lock); - s->extralist.first = NULL; return 0; } @@ -34,58 +32,39 @@ static inline void pcpu_freelist_push_node(struct pcpu_freelist_head *head, WRITE_ONCE(head->first, node); } -static inline void ___pcpu_freelist_push(struct pcpu_freelist_head *head, +static inline bool ___pcpu_freelist_push(struct pcpu_freelist_head *head, struct pcpu_freelist_node *node) { - raw_spin_lock(&head->lock); - pcpu_freelist_push_node(head, node); - raw_spin_unlock(&head->lock); -} - -static inline bool pcpu_freelist_try_push_extra(struct pcpu_freelist *s, - struct pcpu_freelist_node *node) -{ - if (!raw_spin_trylock(&s->extralist.lock)) + if (raw_res_spin_lock(&head->lock)) return false; - - pcpu_freelist_push_node(&s->extralist, node); - raw_spin_unlock(&s->extralist.lock); + pcpu_freelist_push_node(head, node); + raw_res_spin_unlock(&head->lock); return true; } -static inline void ___pcpu_freelist_push_nmi(struct pcpu_freelist *s, - struct pcpu_freelist_node *node) +void __pcpu_freelist_push(struct pcpu_freelist *s, + struct pcpu_freelist_node *node) { - int cpu, orig_cpu; + struct pcpu_freelist_head *head; + int cpu; - orig_cpu = 
raw_smp_processor_id(); - while (1) { - for_each_cpu_wrap(cpu, cpu_possible_mask, orig_cpu) { - struct pcpu_freelist_head *head; + if (___pcpu_freelist_push(this_cpu_ptr(s->freelist), node)) + return; + while (true) { + for_each_cpu_wrap(cpu, cpu_possible_mask, raw_smp_processor_id()) { + if (cpu == raw_smp_processor_id()) + continue; head = per_cpu_ptr(s->freelist, cpu); - if (raw_spin_trylock(&head->lock)) { - pcpu_freelist_push_node(head, node); - raw_spin_unlock(&head->lock); - return; - } - } - - /* cannot lock any per cpu lock, try extralist */ - if (pcpu_freelist_try_push_extra(s, node)) + if (raw_res_spin_lock(&head->lock)) + continue; + pcpu_freelist_push_node(head, node); + raw_res_spin_unlock(&head->lock); return; + } } } -void __pcpu_freelist_push(struct pcpu_freelist *s, - struct pcpu_freelist_node *node) -{ - if (in_nmi()) - ___pcpu_freelist_push_nmi(s, node); - else - ___pcpu_freelist_push(this_cpu_ptr(s->freelist), node); -} - void pcpu_freelist_push(struct pcpu_freelist *s, struct pcpu_freelist_node *node) { @@ -120,71 +99,29 @@ void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size, static struct pcpu_freelist_node *___pcpu_freelist_pop(struct pcpu_freelist *s) { + struct pcpu_freelist_node *node = NULL; struct pcpu_freelist_head *head; - struct pcpu_freelist_node *node; int cpu; for_each_cpu_wrap(cpu, cpu_possible_mask, raw_smp_processor_id()) { head = per_cpu_ptr(s->freelist, cpu); if (!READ_ONCE(head->first)) continue; - raw_spin_lock(&head->lock); + if (raw_res_spin_lock(&head->lock)) + continue; node = head->first; if (node) { WRITE_ONCE(head->first, node->next); - raw_spin_unlock(&head->lock); + raw_res_spin_unlock(&head->lock); return node; } - raw_spin_unlock(&head->lock); + raw_res_spin_unlock(&head->lock); } - - /* per cpu lists are all empty, try extralist */ - if (!READ_ONCE(s->extralist.first)) - return NULL; - raw_spin_lock(&s->extralist.lock); - node = s->extralist.first; - if (node) - 
WRITE_ONCE(s->extralist.first, node->next); - raw_spin_unlock(&s->extralist.lock); - return node; -} - -static struct pcpu_freelist_node * -___pcpu_freelist_pop_nmi(struct pcpu_freelist *s) -{ - struct pcpu_freelist_head *head; - struct pcpu_freelist_node *node; - int cpu; - - for_each_cpu_wrap(cpu, cpu_possible_mask, raw_smp_processor_id()) { - head = per_cpu_ptr(s->freelist, cpu); - if (!READ_ONCE(head->first)) - continue; - if (raw_spin_trylock(&head->lock)) { - node = head->first; - if (node) { - WRITE_ONCE(head->first, node->next); - raw_spin_unlock(&head->lock); - return node; - } - raw_spin_unlock(&head->lock); - } - } - - /* cannot pop from per cpu lists, try extralist */ - if (!READ_ONCE(s->extralist.first) || !raw_spin_trylock(&s->extralist.lock)) - return NULL; - node = s->extralist.first; - if (node) - WRITE_ONCE(s->extralist.first, node->next); - raw_spin_unlock(&s->extralist.lock); return node; } struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s) { - if (in_nmi()) - return ___pcpu_freelist_pop_nmi(s); return ___pcpu_freelist_pop(s); } diff --git a/kernel/bpf/percpu_freelist.h b/kernel/bpf/percpu_freelist.h index 3c76553cfe57..914798b74967 100644 --- a/kernel/bpf/percpu_freelist.h +++ b/kernel/bpf/percpu_freelist.h @@ -5,15 +5,15 @@ #define __PERCPU_FREELIST_H__ #include <linux/spinlock.h> #include <linux/percpu.h> +#include <asm/rqspinlock.h> struct pcpu_freelist_head { struct pcpu_freelist_node *first; - raw_spinlock_t lock; + rqspinlock_t lock; }; struct pcpu_freelist { struct pcpu_freelist_head __percpu *freelist; - struct pcpu_freelist_head extralist; }; struct pcpu_freelist_node { diff --git a/kernel/bpf/preload/bpf_preload_kern.c b/kernel/bpf/preload/bpf_preload_kern.c index 0c63bc2cd895..774e5a538811 100644 --- a/kernel/bpf/preload/bpf_preload_kern.c +++ b/kernel/bpf/preload/bpf_preload_kern.c @@ -89,4 +89,6 @@ static void __exit fini(void) } late_initcall(load); module_exit(fini); +MODULE_IMPORT_NS("BPF_INTERNAL"); 
MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Embedded BPF programs for introspection in bpffs"); diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c index d869f51ea93a..9a5f94371e50 100644 --- a/kernel/bpf/queue_stack_maps.c +++ b/kernel/bpf/queue_stack_maps.c @@ -9,13 +9,14 @@ #include <linux/slab.h> #include <linux/btf_ids.h> #include "percpu_freelist.h" +#include <asm/rqspinlock.h> #define QUEUE_STACK_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_ACCESS_MASK) struct bpf_queue_stack { struct bpf_map map; - raw_spinlock_t lock; + rqspinlock_t lock; u32 head, tail; u32 size; /* max_entries + 1 */ @@ -78,7 +79,7 @@ static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr) qs->size = size; - raw_spin_lock_init(&qs->lock); + raw_res_spin_lock_init(&qs->lock); return &qs->map; } @@ -98,12 +99,8 @@ static long __queue_map_get(struct bpf_map *map, void *value, bool delete) int err = 0; void *ptr; - if (in_nmi()) { - if (!raw_spin_trylock_irqsave(&qs->lock, flags)) - return -EBUSY; - } else { - raw_spin_lock_irqsave(&qs->lock, flags); - } + if (raw_res_spin_lock_irqsave(&qs->lock, flags)) + return -EBUSY; if (queue_stack_map_is_empty(qs)) { memset(value, 0, qs->map.value_size); @@ -120,7 +117,7 @@ static long __queue_map_get(struct bpf_map *map, void *value, bool delete) } out: - raw_spin_unlock_irqrestore(&qs->lock, flags); + raw_res_spin_unlock_irqrestore(&qs->lock, flags); return err; } @@ -133,12 +130,8 @@ static long __stack_map_get(struct bpf_map *map, void *value, bool delete) void *ptr; u32 index; - if (in_nmi()) { - if (!raw_spin_trylock_irqsave(&qs->lock, flags)) - return -EBUSY; - } else { - raw_spin_lock_irqsave(&qs->lock, flags); - } + if (raw_res_spin_lock_irqsave(&qs->lock, flags)) + return -EBUSY; if (queue_stack_map_is_empty(qs)) { memset(value, 0, qs->map.value_size); @@ -157,7 +150,7 @@ static long __stack_map_get(struct bpf_map *map, void *value, bool delete) qs->head = index; out: - raw_spin_unlock_irqrestore(&qs->lock, 
flags); + raw_res_spin_unlock_irqrestore(&qs->lock, flags); return err; } @@ -203,12 +196,8 @@ static long queue_stack_map_push_elem(struct bpf_map *map, void *value, if (flags & BPF_NOEXIST || flags > BPF_EXIST) return -EINVAL; - if (in_nmi()) { - if (!raw_spin_trylock_irqsave(&qs->lock, irq_flags)) - return -EBUSY; - } else { - raw_spin_lock_irqsave(&qs->lock, irq_flags); - } + if (raw_res_spin_lock_irqsave(&qs->lock, irq_flags)) + return -EBUSY; if (queue_stack_map_is_full(qs)) { if (!replace) { @@ -227,7 +216,7 @@ static long queue_stack_map_push_elem(struct bpf_map *map, void *value, qs->head = 0; out: - raw_spin_unlock_irqrestore(&qs->lock, irq_flags); + raw_res_spin_unlock_irqrestore(&qs->lock, irq_flags); return err; } diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index 1499d8caa9a3..719d73299397 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -11,6 +11,7 @@ #include <linux/kmemleak.h> #include <uapi/linux/btf.h> #include <linux/btf_ids.h> +#include <asm/rqspinlock.h> #define RINGBUF_CREATE_FLAG_MASK (BPF_F_NUMA_NODE) @@ -29,7 +30,7 @@ struct bpf_ringbuf { u64 mask; struct page **pages; int nr_pages; - raw_spinlock_t spinlock ____cacheline_aligned_in_smp; + rqspinlock_t spinlock ____cacheline_aligned_in_smp; /* For user-space producer ring buffers, an atomic_t busy bit is used * to synchronize access to the ring buffers in the kernel, rather than * the spinlock that is used for kernel-producer ring buffers. 
This is @@ -173,7 +174,7 @@ static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node) if (!rb) return NULL; - raw_spin_lock_init(&rb->spinlock); + raw_res_spin_lock_init(&rb->spinlock); atomic_set(&rb->busy, 0); init_waitqueue_head(&rb->waitq); init_irq_work(&rb->work, bpf_ringbuf_notify); @@ -416,12 +417,8 @@ static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size) cons_pos = smp_load_acquire(&rb->consumer_pos); - if (in_nmi()) { - if (!raw_spin_trylock_irqsave(&rb->spinlock, flags)) - return NULL; - } else { - raw_spin_lock_irqsave(&rb->spinlock, flags); - } + if (raw_res_spin_lock_irqsave(&rb->spinlock, flags)) + return NULL; pend_pos = rb->pending_pos; prod_pos = rb->producer_pos; @@ -446,7 +443,7 @@ static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size) */ if (new_prod_pos - cons_pos > rb->mask || new_prod_pos - pend_pos > rb->mask) { - raw_spin_unlock_irqrestore(&rb->spinlock, flags); + raw_res_spin_unlock_irqrestore(&rb->spinlock, flags); return NULL; } @@ -458,7 +455,7 @@ static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size) /* pairs with consumer's smp_load_acquire() */ smp_store_release(&rb->producer_pos, new_prod_pos); - raw_spin_unlock_irqrestore(&rb->spinlock, flags); + raw_res_spin_unlock_irqrestore(&rb->spinlock, flags); return (void *)hdr + BPF_RINGBUF_HDR_SZ; } diff --git a/kernel/bpf/rqspinlock.c b/kernel/bpf/rqspinlock.c new file mode 100644 index 000000000000..338305c8852c --- /dev/null +++ b/kernel/bpf/rqspinlock.c @@ -0,0 +1,737 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Resilient Queued Spin Lock + * + * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P. + * (C) Copyright 2013-2014,2018 Red Hat, Inc. + * (C) Copyright 2015 Intel Corp. + * (C) Copyright 2015 Hewlett-Packard Enterprise Development LP + * (C) Copyright 2024-2025 Meta Platforms, Inc. and affiliates. 
+ * + * Authors: Waiman Long <longman@redhat.com> + * Peter Zijlstra <peterz@infradead.org> + * Kumar Kartikeya Dwivedi <memxor@gmail.com> + */ + +#include <linux/smp.h> +#include <linux/bug.h> +#include <linux/bpf.h> +#include <linux/err.h> +#include <linux/cpumask.h> +#include <linux/percpu.h> +#include <linux/hardirq.h> +#include <linux/mutex.h> +#include <linux/prefetch.h> +#include <asm/byteorder.h> +#ifdef CONFIG_QUEUED_SPINLOCKS +#include <asm/qspinlock.h> +#endif +#include <trace/events/lock.h> +#include <asm/rqspinlock.h> +#include <linux/timekeeping.h> + +/* + * Include queued spinlock definitions and statistics code + */ +#ifdef CONFIG_QUEUED_SPINLOCKS +#include "../locking/qspinlock.h" +#include "../locking/lock_events.h" +#include "rqspinlock.h" +#include "../locking/mcs_spinlock.h" +#endif + +/* + * The basic principle of a queue-based spinlock can best be understood + * by studying a classic queue-based spinlock implementation called the + * MCS lock. A copy of the original MCS lock paper ("Algorithms for Scalable + * Synchronization on Shared-Memory Multiprocessors by Mellor-Crummey and + * Scott") is available at + * + * https://bugzilla.kernel.org/show_bug.cgi?id=206115 + * + * This queued spinlock implementation is based on the MCS lock, however to + * make it fit the 4 bytes we assume spinlock_t to be, and preserve its + * existing API, we must modify it somehow. + * + * In particular; where the traditional MCS lock consists of a tail pointer + * (8 bytes) and needs the next pointer (another 8 bytes) of its own node to + * unlock the next pending (next->locked), we compress both these: {tail, + * next->locked} into a single u32 value. + * + * Since a spinlock disables recursion of its own context and there is a limit + * to the contexts that can nest; namely: task, softirq, hardirq, nmi. As there + * are at most 4 nesting levels, it can be encoded by a 2-bit number. 
Now + * we can encode the tail by combining the 2-bit nesting level with the cpu + * number. With one byte for the lock value and 3 bytes for the tail, only a + * 32-bit word is now needed. Even though we only need 1 bit for the lock, + * we extend it to a full byte to achieve better performance for architectures + * that support atomic byte write. + * + * We also change the first spinner to spin on the lock bit instead of its + * node; whereby avoiding the need to carry a node from lock to unlock, and + * preserving existing lock API. This also makes the unlock code simpler and + * faster. + * + * N.B. The current implementation only supports architectures that allow + * atomic operations on smaller 8-bit and 16-bit data types. + * + */ + +struct rqspinlock_timeout { + u64 timeout_end; + u64 duration; + u64 cur; + u16 spin; +}; + +#define RES_TIMEOUT_VAL 2 + +DEFINE_PER_CPU_ALIGNED(struct rqspinlock_held, rqspinlock_held_locks); +EXPORT_SYMBOL_GPL(rqspinlock_held_locks); + +static bool is_lock_released(rqspinlock_t *lock, u32 mask, struct rqspinlock_timeout *ts) +{ + if (!(atomic_read_acquire(&lock->val) & (mask))) + return true; + return false; +} + +static noinline int check_deadlock_AA(rqspinlock_t *lock, u32 mask, + struct rqspinlock_timeout *ts) +{ + struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks); + int cnt = min(RES_NR_HELD, rqh->cnt); + + /* + * Return an error if we hold the lock we are attempting to acquire. + * We'll iterate over max 32 locks; no need to do is_lock_released. + */ + for (int i = 0; i < cnt - 1; i++) { + if (rqh->locks[i] == lock) + return -EDEADLK; + } + return 0; +} + +/* + * This focuses on the most common case of ABBA deadlocks (or ABBA involving + * more locks, which reduce to ABBA). This is not exhaustive, and we rely on + * timeouts as the final line of defense. 
+ */ +static noinline int check_deadlock_ABBA(rqspinlock_t *lock, u32 mask, + struct rqspinlock_timeout *ts) +{ + struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks); + int rqh_cnt = min(RES_NR_HELD, rqh->cnt); + void *remote_lock; + int cpu; + + /* + * Find the CPU holding the lock that we want to acquire. If there is a + * deadlock scenario, we will read a stable set on the remote CPU and + * find the target. This would be a constant time operation instead of + * O(NR_CPUS) if we could determine the owning CPU from a lock value, but + * that requires increasing the size of the lock word. + */ + for_each_possible_cpu(cpu) { + struct rqspinlock_held *rqh_cpu = per_cpu_ptr(&rqspinlock_held_locks, cpu); + int real_cnt = READ_ONCE(rqh_cpu->cnt); + int cnt = min(RES_NR_HELD, real_cnt); + + /* + * Let's ensure to break out of this loop if the lock is available for + * us to potentially acquire. + */ + if (is_lock_released(lock, mask, ts)) + return 0; + + /* + * Skip ourselves, and CPUs whose count is less than 2, as they need at + * least one held lock and one acquisition attempt (reflected as top + * most entry) to participate in an ABBA deadlock. + * + * If cnt is more than RES_NR_HELD, it means the current lock being + * acquired won't appear in the table, and other locks in the table are + * already held, so we can't determine ABBA. + */ + if (cpu == smp_processor_id() || real_cnt < 2 || real_cnt > RES_NR_HELD) + continue; + + /* + * Obtain the entry at the top, this corresponds to the lock the + * remote CPU is attempting to acquire in a deadlock situation, + * and would be one of the locks we hold on the current CPU. + */ + remote_lock = READ_ONCE(rqh_cpu->locks[cnt - 1]); + /* + * If it is NULL, we've raced and cannot determine a deadlock + * conclusively, skip this CPU. + */ + if (!remote_lock) + continue; + /* + * Find if the lock we're attempting to acquire is held by this CPU. 
+ * Don't consider the topmost entry, as that must be the latest lock + * being held or acquired. For a deadlock, the target CPU must also + * attempt to acquire a lock we hold, so for this search only 'cnt - 1' + * entries are important. + */ + for (int i = 0; i < cnt - 1; i++) { + if (READ_ONCE(rqh_cpu->locks[i]) != lock) + continue; + /* + * We found our lock as held on the remote CPU. Is the + * acquisition attempt on the remote CPU for a lock held + * by us? If so, we have a deadlock situation, and need + * to recover. + */ + for (int i = 0; i < rqh_cnt - 1; i++) { + if (rqh->locks[i] == remote_lock) + return -EDEADLK; + } + /* + * Inconclusive; retry again later. + */ + return 0; + } + } + return 0; +} + +static noinline int check_deadlock(rqspinlock_t *lock, u32 mask, + struct rqspinlock_timeout *ts) +{ + int ret; + + ret = check_deadlock_AA(lock, mask, ts); + if (ret) + return ret; + ret = check_deadlock_ABBA(lock, mask, ts); + if (ret) + return ret; + + return 0; +} + +static noinline int check_timeout(rqspinlock_t *lock, u32 mask, + struct rqspinlock_timeout *ts) +{ + u64 time = ktime_get_mono_fast_ns(); + u64 prev = ts->cur; + + if (!ts->timeout_end) { + ts->cur = time; + ts->timeout_end = time + ts->duration; + return 0; + } + + if (time > ts->timeout_end) + return -ETIMEDOUT; + + /* + * A millisecond interval passed from last time? Trigger deadlock + * checks. + */ + if (prev + NSEC_PER_MSEC < time) { + ts->cur = time; + return check_deadlock(lock, mask, ts); + } + + return 0; +} + +/* + * Do not amortize with spins when res_smp_cond_load_acquire is defined, + * as the macro does internal amortization for us. + */ +#ifndef res_smp_cond_load_acquire +#define RES_CHECK_TIMEOUT(ts, ret, mask) \ + ({ \ + if (!(ts).spin++) \ + (ret) = check_timeout((lock), (mask), &(ts)); \ + (ret); \ + }) +#else +#define RES_CHECK_TIMEOUT(ts, ret, mask) \ + ({ (ret) = check_timeout((lock), (mask), &(ts)); }) +#endif + +/* + * Initialize the 'spin' member. 
+ * Set spin member to 0 to trigger AA/ABBA checks immediately. + */ +#define RES_INIT_TIMEOUT(ts) ({ (ts).spin = 0; }) + +/* + * We only need to reset 'timeout_end', 'spin' will just wrap around as necessary. + * Duration is defined for each spin attempt, so set it here. + */ +#define RES_RESET_TIMEOUT(ts, _duration) ({ (ts).timeout_end = 0; (ts).duration = _duration; }) + +/* + * Provide a test-and-set fallback for cases when queued spin lock support is + * absent from the architecture. + */ +int __lockfunc resilient_tas_spin_lock(rqspinlock_t *lock) +{ + struct rqspinlock_timeout ts; + int val, ret = 0; + + RES_INIT_TIMEOUT(ts); + grab_held_lock_entry(lock); + + /* + * Since the waiting loop's time is dependent on the amount of + * contention, a short timeout unlike rqspinlock waiting loops + * isn't enough. Choose a second as the timeout value. + */ + RES_RESET_TIMEOUT(ts, NSEC_PER_SEC); +retry: + val = atomic_read(&lock->val); + + if (val || !atomic_try_cmpxchg(&lock->val, &val, 1)) { + if (RES_CHECK_TIMEOUT(ts, ret, ~0u)) + goto out; + cpu_relax(); + goto retry; + } + + return 0; +out: + release_held_lock_entry(); + return ret; +} +EXPORT_SYMBOL_GPL(resilient_tas_spin_lock); + +#ifdef CONFIG_QUEUED_SPINLOCKS + +/* + * Per-CPU queue node structures; we can never have more than 4 nested + * contexts: task, softirq, hardirq, nmi. + * + * Exactly fits one 64-byte cacheline on a 64-bit architecture. + */ +static DEFINE_PER_CPU_ALIGNED(struct qnode, rqnodes[_Q_MAX_NODES]); + +#ifndef res_smp_cond_load_acquire +#define res_smp_cond_load_acquire(v, c) smp_cond_load_acquire(v, c) +#endif + +#define res_atomic_cond_read_acquire(v, c) res_smp_cond_load_acquire(&(v)->counter, (c)) + +/** + * resilient_queued_spin_lock_slowpath - acquire the queued spinlock + * @lock: Pointer to queued spinlock structure + * @val: Current value of the queued spinlock 32-bit word + * + * Return: + * * 0 - Lock was acquired successfully. 
+ * * -EDEADLK - Lock acquisition failed because of AA/ABBA deadlock. + * * -ETIMEDOUT - Lock acquisition failed because of timeout. + * + * (queue tail, pending bit, lock value) + * + * fast : slow : unlock + * : : + * uncontended (0,0,0) -:--> (0,0,1) ------------------------------:--> (*,*,0) + * : | ^--------.------. / : + * : v \ \ | : + * pending : (0,1,1) +--> (0,1,0) \ | : + * : | ^--' | | : + * : v | | : + * uncontended : (n,x,y) +--> (n,0,0) --' | : + * queue : | ^--' | : + * : v | : + * contended : (*,x,y) +--> (*,0,0) ---> (*,0,1) -' : + * queue : ^--' : + */ +int __lockfunc resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val) +{ + struct mcs_spinlock *prev, *next, *node; + struct rqspinlock_timeout ts; + int idx, ret = 0; + u32 old, tail; + + BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS)); + + if (resilient_virt_spin_lock_enabled()) + return resilient_virt_spin_lock(lock); + + RES_INIT_TIMEOUT(ts); + + /* + * Wait for in-progress pending->locked hand-overs with a bounded + * number of spins so that we guarantee forward progress. + * + * 0,1,0 -> 0,0,1 + */ + if (val == _Q_PENDING_VAL) { + int cnt = _Q_PENDING_LOOPS; + val = atomic_cond_read_relaxed(&lock->val, + (VAL != _Q_PENDING_VAL) || !cnt--); + } + + /* + * If we observe any contention; queue. + */ + if (val & ~_Q_LOCKED_MASK) + goto queue; + + /* + * trylock || pending + * + * 0,0,* -> 0,1,* -> 0,0,1 pending, trylock + */ + val = queued_fetch_set_pending_acquire(lock); + + /* + * If we observe contention, there is a concurrent locker. + * + * Undo and queue; our setting of PENDING might have made the + * n,0,0 -> 0,0,0 transition fail and it will now be waiting + * on @next to become !NULL. + */ + if (unlikely(val & ~_Q_LOCKED_MASK)) { + + /* Undo PENDING if we set it. */ + if (!(val & _Q_PENDING_MASK)) + clear_pending(lock); + + goto queue; + } + + /* + * Grab an entry in the held locks array, to enable deadlock detection. 
+ */ + grab_held_lock_entry(lock); + + /* + * We're pending, wait for the owner to go away. + * + * 0,1,1 -> *,1,0 + * + * this wait loop must be a load-acquire such that we match the + * store-release that clears the locked bit and create lock + * sequentiality; this is because not all + * clear_pending_set_locked() implementations imply full + * barriers. + */ + if (val & _Q_LOCKED_MASK) { + RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT); + res_smp_cond_load_acquire(&lock->locked, !VAL || RES_CHECK_TIMEOUT(ts, ret, _Q_LOCKED_MASK)); + } + + if (ret) { + /* + * We waited for the locked bit to go back to 0, as the pending + * waiter, but timed out. We need to clear the pending bit since + * we own it. Once a stuck owner has been recovered, the lock + * must be restored to a valid state, hence removing the pending + * bit is necessary. + * + * *,1,* -> *,0,* + */ + clear_pending(lock); + lockevent_inc(rqspinlock_lock_timeout); + goto err_release_entry; + } + + /* + * take ownership and clear the pending bit. + * + * 0,1,0 -> 0,0,1 + */ + clear_pending_set_locked(lock); + lockevent_inc(lock_pending); + return 0; + + /* + * End of pending bit optimistic spinning and beginning of MCS + * queuing. + */ +queue: + lockevent_inc(lock_slowpath); + /* + * Grab deadlock detection entry for the queue path. + */ + grab_held_lock_entry(lock); + + node = this_cpu_ptr(&rqnodes[0].mcs); + idx = node->count++; + tail = encode_tail(smp_processor_id(), idx); + + trace_contention_begin(lock, LCB_F_SPIN); + + /* + * 4 nodes are allocated based on the assumption that there will + * not be nested NMIs taking spinlocks. That may not be true in + * some architectures even though the chance of needing more than + * 4 nodes will still be extremely unlikely. When that happens, + * we fall back to spinning on the lock directly without using + * any MCS node. This is not the most elegant solution, but is + * simple enough. 
+ */ + if (unlikely(idx >= _Q_MAX_NODES)) { + lockevent_inc(lock_no_node); + RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT); + while (!queued_spin_trylock(lock)) { + if (RES_CHECK_TIMEOUT(ts, ret, ~0u)) { + lockevent_inc(rqspinlock_lock_timeout); + goto err_release_node; + } + cpu_relax(); + } + goto release; + } + + node = grab_mcs_node(node, idx); + + /* + * Keep counts of non-zero index values: + */ + lockevent_cond_inc(lock_use_node2 + idx - 1, idx); + + /* + * Ensure that we increment the head node->count before initialising + * the actual node. If the compiler is kind enough to reorder these + * stores, then an IRQ could overwrite our assignments. + */ + barrier(); + + node->locked = 0; + node->next = NULL; + + /* + * We touched a (possibly) cold cacheline in the per-cpu queue node; + * attempt the trylock once more in the hope someone let go while we + * weren't watching. + */ + if (queued_spin_trylock(lock)) + goto release; + + /* + * Ensure that the initialisation of @node is complete before we + * publish the updated tail via xchg_tail() and potentially link + * @node into the waitqueue via WRITE_ONCE(prev->next, node) below. + */ + smp_wmb(); + + /* + * Publish the updated tail. + * We have already touched the queueing cacheline; don't bother with + * pending stuff. + * + * p,*,* -> n,*,* + */ + old = xchg_tail(lock, tail); + next = NULL; + + /* + * if there was a previous node; link it and wait until reaching the + * head of the waitqueue. + */ + if (old & _Q_TAIL_MASK) { + int val; + + prev = decode_tail(old, rqnodes); + + /* Link @node into the waitqueue. */ + WRITE_ONCE(prev->next, node); + + val = arch_mcs_spin_lock_contended(&node->locked); + if (val == RES_TIMEOUT_VAL) { + ret = -EDEADLK; + goto waitq_timeout; + } + + /* + * While waiting for the MCS lock, the next pointer may have + * been set by another lock waiter. 
We optimistically load + * the next pointer & prefetch the cacheline for writing + * to reduce latency in the upcoming MCS unlock operation. + */ + next = READ_ONCE(node->next); + if (next) + prefetchw(next); + } + + /* + * we're at the head of the waitqueue, wait for the owner & pending to + * go away. + * + * *,x,y -> *,0,0 + * + * this wait loop must use a load-acquire such that we match the + * store-release that clears the locked bit and create lock + * sequentiality; this is because the set_locked() function below + * does not imply a full barrier. + * + * We use RES_DEF_TIMEOUT * 2 as the duration, as RES_DEF_TIMEOUT is + * meant to span maximum allowed time per critical section, and we may + * have both the owner of the lock and the pending bit waiter ahead of + * us. + */ + RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT * 2); + val = res_atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK) || + RES_CHECK_TIMEOUT(ts, ret, _Q_LOCKED_PENDING_MASK)); + +waitq_timeout: + if (ret) { + /* + * If the tail is still pointing to us, then we are the final waiter, + * and are responsible for resetting the tail back to 0. Otherwise, if + * the cmpxchg operation fails, we signal the next waiter to take exit + * and try the same. For a waiter with tail node 'n': + * + * n,*,* -> 0,*,* + * + * When performing cmpxchg for the whole word (NR_CPUS > 16k), it is + * possible locked/pending bits keep changing and we see failures even + * when we remain the head of wait queue. However, eventually, + * pending bit owner will unset the pending bit, and new waiters + * will queue behind us. This will leave the lock owner in + * charge, and it will eventually either set locked bit to 0, or + * leave it as 1, allowing us to make progress. + * + * We terminate the whole wait queue for two reasons. Firstly, + * we eschew per-waiter timeouts with one applied at the head of + * the wait queue. 
This allows everyone to break out faster + * once we've seen the owner / pending waiter not responding for + * the timeout duration from the head. Secondly, it avoids + * complicated synchronization, because when not leaving in FIFO + * order, prev's next pointer needs to be fixed up etc. + */ + if (!try_cmpxchg_tail(lock, tail, 0)) { + next = smp_cond_load_relaxed(&node->next, VAL); + WRITE_ONCE(next->locked, RES_TIMEOUT_VAL); + } + lockevent_inc(rqspinlock_lock_timeout); + goto err_release_node; + } + + /* + * claim the lock: + * + * n,0,0 -> 0,0,1 : lock, uncontended + * *,*,0 -> *,*,1 : lock, contended + * + * If the queue head is the only one in the queue (lock value == tail) + * and nobody is pending, clear the tail code and grab the lock. + * Otherwise, we only need to grab the lock. + */ + + /* + * Note: at this point: (val & _Q_PENDING_MASK) == 0, because of the + * above wait condition, therefore any concurrent setting of + * PENDING will make the uncontended transition fail. + */ + if ((val & _Q_TAIL_MASK) == tail) { + if (atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL)) + goto release; /* No contention */ + } + + /* + * Either somebody is queued behind us or _Q_PENDING_VAL got set + * which will then detect the remaining tail and queue behind us + * ensuring we'll see a @next. + */ + set_locked(lock); + + /* + * contended path; wait for next if not observed yet, release. 
+ */ + if (!next) + next = smp_cond_load_relaxed(&node->next, (VAL)); + + arch_mcs_spin_unlock_contended(&next->locked); + +release: + trace_contention_end(lock, 0); + + /* + * release the node + */ + __this_cpu_dec(rqnodes[0].mcs.count); + return ret; +err_release_node: + trace_contention_end(lock, ret); + __this_cpu_dec(rqnodes[0].mcs.count); +err_release_entry: + release_held_lock_entry(); + return ret; +} +EXPORT_SYMBOL_GPL(resilient_queued_spin_lock_slowpath); + +#endif /* CONFIG_QUEUED_SPINLOCKS */ + +__bpf_kfunc_start_defs(); + +__bpf_kfunc int bpf_res_spin_lock(struct bpf_res_spin_lock *lock) +{ + int ret; + + BUILD_BUG_ON(sizeof(rqspinlock_t) != sizeof(struct bpf_res_spin_lock)); + BUILD_BUG_ON(__alignof__(rqspinlock_t) != __alignof__(struct bpf_res_spin_lock)); + + preempt_disable(); + ret = res_spin_lock((rqspinlock_t *)lock); + if (unlikely(ret)) { + preempt_enable(); + return ret; + } + return 0; +} + +__bpf_kfunc void bpf_res_spin_unlock(struct bpf_res_spin_lock *lock) +{ + res_spin_unlock((rqspinlock_t *)lock); + preempt_enable(); +} + +__bpf_kfunc int bpf_res_spin_lock_irqsave(struct bpf_res_spin_lock *lock, unsigned long *flags__irq_flag) +{ + u64 *ptr = (u64 *)flags__irq_flag; + unsigned long flags; + int ret; + + preempt_disable(); + local_irq_save(flags); + ret = res_spin_lock((rqspinlock_t *)lock); + if (unlikely(ret)) { + local_irq_restore(flags); + preempt_enable(); + return ret; + } + *ptr = flags; + return 0; +} + +__bpf_kfunc void bpf_res_spin_unlock_irqrestore(struct bpf_res_spin_lock *lock, unsigned long *flags__irq_flag) +{ + u64 *ptr = (u64 *)flags__irq_flag; + unsigned long flags = *ptr; + + res_spin_unlock((rqspinlock_t *)lock); + local_irq_restore(flags); + preempt_enable(); +} + +__bpf_kfunc_end_defs(); + +BTF_KFUNCS_START(rqspinlock_kfunc_ids) +BTF_ID_FLAGS(func, bpf_res_spin_lock, KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_res_spin_unlock) +BTF_ID_FLAGS(func, bpf_res_spin_lock_irqsave, KF_RET_NULL) +BTF_ID_FLAGS(func, 
bpf_res_spin_unlock_irqrestore) +BTF_KFUNCS_END(rqspinlock_kfunc_ids) + +static const struct btf_kfunc_id_set rqspinlock_kfunc_set = { + .owner = THIS_MODULE, + .set = &rqspinlock_kfunc_ids, +}; + +static __init int rqspinlock_register_kfuncs(void) +{ + return register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &rqspinlock_kfunc_set); +} +late_initcall(rqspinlock_register_kfuncs); diff --git a/kernel/bpf/rqspinlock.h b/kernel/bpf/rqspinlock.h new file mode 100644 index 000000000000..5d8cb1b1aab4 --- /dev/null +++ b/kernel/bpf/rqspinlock.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Resilient Queued Spin Lock defines + * + * (C) Copyright 2024-2025 Meta Platforms, Inc. and affiliates. + * + * Authors: Kumar Kartikeya Dwivedi <memxor@gmail.com> + */ +#ifndef __LINUX_RQSPINLOCK_H +#define __LINUX_RQSPINLOCK_H + +#include "../locking/qspinlock.h" + +/* + * try_cmpxchg_tail - Return result of cmpxchg of tail word with a new value + * @lock: Pointer to queued spinlock structure + * @tail: The tail to compare against + * @new_tail: The new queue tail code word + * Return: Bool to indicate whether the cmpxchg operation succeeded + * + * This is used by the head of the wait queue to clean up the queue. + * Provides relaxed ordering, since observers only rely on initialized + * state of the node which was made visible through the xchg_tail operation, + * i.e. through the smp_wmb preceding xchg_tail. + * + * We avoid using 16-bit cmpxchg, which is not available on all architectures. + */ +static __always_inline bool try_cmpxchg_tail(struct qspinlock *lock, u32 tail, u32 new_tail) +{ + u32 old, new; + + old = atomic_read(&lock->val); + do { + /* + * Is the tail part we compare to already stale? Fail. + */ + if ((old & _Q_TAIL_MASK) != tail) + return false; + /* + * Encode latest locked/pending state for new tail. 
+ */ + new = (old & _Q_LOCKED_PENDING_MASK) | new_tail; + } while (!atomic_try_cmpxchg_relaxed(&lock->val, &old, new)); + + return true; +} + +#endif /* __LINUX_RQSPINLOCK_H */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index e1e42e918ba7..4b5f29168618 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -36,6 +36,7 @@ #include <linux/memcontrol.h> #include <linux/trace_events.h> #include <linux/tracepoint.h> +#include <linux/overflow.h> #include <net/netfilter/nf_bpf_link.h> #include <net/netkit.h> @@ -569,7 +570,24 @@ static void bpf_map_release_memcg(struct bpf_map *map) } #endif -int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid, +static bool can_alloc_pages(void) +{ + return preempt_count() == 0 && !irqs_disabled() && + !IS_ENABLED(CONFIG_PREEMPT_RT); +} + +static struct page *__bpf_alloc_page(int nid) +{ + if (!can_alloc_pages()) + return try_alloc_pages(nid, 0); + + return alloc_pages_node(nid, + GFP_KERNEL | __GFP_ZERO | __GFP_ACCOUNT + | __GFP_NOWARN, + 0); +} + +int bpf_map_alloc_pages(const struct bpf_map *map, int nid, unsigned long nr_pages, struct page **pages) { unsigned long i, j; @@ -582,14 +600,14 @@ int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid, old_memcg = set_active_memcg(memcg); #endif for (i = 0; i < nr_pages; i++) { - pg = alloc_pages_node(nid, gfp | __GFP_ACCOUNT, 0); + pg = __bpf_alloc_page(nid); if (pg) { pages[i] = pg; continue; } for (j = 0; j < i; j++) - __free_page(pages[j]); + free_pages_nolock(pages[j], 0); ret = -ENOMEM; break; } @@ -648,6 +666,7 @@ void btf_record_free(struct btf_record *rec) case BPF_RB_ROOT: case BPF_RB_NODE: case BPF_SPIN_LOCK: + case BPF_RES_SPIN_LOCK: case BPF_TIMER: case BPF_REFCOUNT: case BPF_WORKQUEUE: @@ -675,7 +694,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec) if (IS_ERR_OR_NULL(rec)) return NULL; - size = offsetof(struct btf_record, fields[rec->cnt]); + size = struct_size(rec, fields, rec->cnt); new_rec = 
kmemdup(rec, size, GFP_KERNEL | __GFP_NOWARN); if (!new_rec) return ERR_PTR(-ENOMEM); @@ -700,6 +719,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec) case BPF_RB_ROOT: case BPF_RB_NODE: case BPF_SPIN_LOCK: + case BPF_RES_SPIN_LOCK: case BPF_TIMER: case BPF_REFCOUNT: case BPF_WORKQUEUE: @@ -729,7 +749,7 @@ bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *r return false; if (rec_a->cnt != rec_b->cnt) return false; - size = offsetof(struct btf_record, fields[rec_a->cnt]); + size = struct_size(rec_a, fields, rec_a->cnt); /* btf_parse_fields uses kzalloc to allocate a btf_record, so unused * members are zeroed out. So memcmp is safe to do without worrying * about padding/unused fields. @@ -777,6 +797,7 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj) switch (fields[i].type) { case BPF_SPIN_LOCK: + case BPF_RES_SPIN_LOCK: break; case BPF_TIMER: bpf_timer_cancel_and_free(field_ptr); @@ -1212,7 +1233,7 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token, return -EINVAL; map->record = btf_parse_fields(btf, value_type, - BPF_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD | + BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD | BPF_RB_ROOT | BPF_REFCOUNT | BPF_WORKQUEUE | BPF_UPTR, map->value_size); if (!IS_ERR_OR_NULL(map->record)) { @@ -1231,6 +1252,7 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token, case 0: continue; case BPF_SPIN_LOCK: + case BPF_RES_SPIN_LOCK: if (map->map_type != BPF_MAP_TYPE_HASH && map->map_type != BPF_MAP_TYPE_ARRAY && map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE && @@ -1315,7 +1337,7 @@ static bool bpf_net_capable(void) #define BPF_MAP_CREATE_LAST_FIELD map_token_fd /* called via syscall */ -static int map_create(union bpf_attr *attr) +static int map_create(union bpf_attr *attr, bool kernel) { const struct bpf_map_ops *ops; struct bpf_token *token = NULL; @@ -1505,7 +1527,7 @@ static int map_create(union bpf_attr 
*attr) attr->btf_vmlinux_value_type_id; } - err = security_bpf_map_create(map, attr, token); + err = security_bpf_map_create(map, attr, token, kernel); if (err) goto free_map_sec; @@ -1562,7 +1584,7 @@ struct bpf_map *bpf_map_get(u32 ufd) return map; } -EXPORT_SYMBOL(bpf_map_get); +EXPORT_SYMBOL_NS(bpf_map_get, "BPF_INTERNAL"); struct bpf_map *bpf_map_get_with_uref(u32 ufd) { @@ -1593,11 +1615,8 @@ struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref) struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map) { - spin_lock_bh(&map_idr_lock); - map = __bpf_map_inc_not_zero(map, false); - spin_unlock_bh(&map_idr_lock); - - return map; + lockdep_assert(rcu_read_lock_held()); + return __bpf_map_inc_not_zero(map, false); } EXPORT_SYMBOL_GPL(bpf_map_inc_not_zero); @@ -2314,6 +2333,7 @@ static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred) kvfree(prog->aux->jited_linfo); kvfree(prog->aux->linfo); kfree(prog->aux->kfunc_tab); + kfree(prog->aux->ctx_arg_info); if (prog->aux->attach_btf) btf_put(prog->aux->attach_btf); @@ -2944,7 +2964,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) if (err < 0) goto free_prog; - err = security_bpf_prog_load(prog, attr, token); + err = security_bpf_prog_load(prog, attr, token, uattr.is_kernel); if (err) goto free_prog_sec; @@ -3345,7 +3365,7 @@ struct bpf_link *bpf_link_get_from_fd(u32 ufd) bpf_link_inc(link); return link; } -EXPORT_SYMBOL(bpf_link_get_from_fd); +EXPORT_SYMBOL_NS(bpf_link_get_from_fd, "BPF_INTERNAL"); static void bpf_tracing_link_release(struct bpf_link *link) { @@ -3780,14 +3800,14 @@ static int bpf_perf_link_fill_kprobe(const struct perf_event *event, static int bpf_perf_link_fill_uprobe(const struct perf_event *event, struct bpf_link_info *info) { + u64 ref_ctr_offset, offset; char __user *uname; - u64 addr, offset; u32 ulen, type; int err; uname = u64_to_user_ptr(info->perf_event.uprobe.file_name); ulen = info->perf_event.uprobe.name_len; - err = 
bpf_perf_link_fill_common(event, uname, &ulen, &offset, &addr, + err = bpf_perf_link_fill_common(event, uname, &ulen, &offset, &ref_ctr_offset, &type, NULL); if (err) return err; @@ -3799,6 +3819,7 @@ static int bpf_perf_link_fill_uprobe(const struct perf_event *event, info->perf_event.uprobe.name_len = ulen; info->perf_event.uprobe.offset = offset; info->perf_event.uprobe.cookie = event->bpf_cookie; + info->perf_event.uprobe.ref_ctr_offset = ref_ctr_offset; return 0; } #endif @@ -4169,7 +4190,8 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, #define BPF_F_ATTACH_MASK_BASE \ (BPF_F_ALLOW_OVERRIDE | \ BPF_F_ALLOW_MULTI | \ - BPF_F_REPLACE) + BPF_F_REPLACE | \ + BPF_F_PREORDER) #define BPF_F_ATTACH_MASK_MPROG \ (BPF_F_REPLACE | \ @@ -4733,6 +4755,8 @@ static int bpf_prog_get_info_by_fd(struct file *file, info.recursion_misses = stats.misses; info.verified_insns = prog->aux->verified_insns; + if (prog->aux->btf) + info.btf_id = btf_obj_id(prog->aux->btf); if (!bpf_capable()) { info.jited_prog_len = 0; @@ -4879,8 +4903,6 @@ static int bpf_prog_get_info_by_fd(struct file *file, } } - if (prog->aux->btf) - info.btf_id = btf_obj_id(prog->aux->btf); info.attach_btf_id = prog->aux->attach_btf_id; if (attach_btf) info.attach_btf_obj_id = btf_obj_id(attach_btf); @@ -5121,15 +5143,34 @@ static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_ return btf_new_fd(attr, uattr, uattr_size); } -#define BPF_BTF_GET_FD_BY_ID_LAST_FIELD btf_id +#define BPF_BTF_GET_FD_BY_ID_LAST_FIELD fd_by_id_token_fd static int bpf_btf_get_fd_by_id(const union bpf_attr *attr) { + struct bpf_token *token = NULL; + if (CHECK_ATTR(BPF_BTF_GET_FD_BY_ID)) return -EINVAL; - if (!capable(CAP_SYS_ADMIN)) + if (attr->open_flags & ~BPF_F_TOKEN_FD) + return -EINVAL; + + if (attr->open_flags & BPF_F_TOKEN_FD) { + token = bpf_token_get_from_fd(attr->fd_by_id_token_fd); + if (IS_ERR(token)) + return PTR_ERR(token); + if (!bpf_token_allow_cmd(token, 
BPF_BTF_GET_FD_BY_ID)) { + bpf_token_put(token); + token = NULL; + } + } + + if (!bpf_token_capable(token, CAP_SYS_ADMIN)) { + bpf_token_put(token); return -EPERM; + } + + bpf_token_put(token); return btf_get_fd_by_id(attr->btf_id); } @@ -5768,13 +5809,13 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size) if (copy_from_bpfptr(&attr, uattr, size) != 0) return -EFAULT; - err = security_bpf(cmd, &attr, size); + err = security_bpf(cmd, &attr, size, uattr.is_kernel); if (err < 0) return err; switch (cmd) { case BPF_MAP_CREATE: - err = map_create(&attr); + err = map_create(&attr, uattr.is_kernel); break; case BPF_MAP_LOOKUP_ELEM: err = map_lookup_elem(&attr); @@ -5981,7 +6022,7 @@ int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size) return ____bpf_sys_bpf(cmd, attr, size); } } -EXPORT_SYMBOL(kern_sys_bpf); +EXPORT_SYMBOL_NS(kern_sys_bpf, "BPF_INTERNAL"); static const struct bpf_func_proto bpf_sys_bpf_proto = { .func = bpf_sys_bpf, diff --git a/kernel/bpf/sysfs_btf.c b/kernel/bpf/sysfs_btf.c index 81d6cf90584a..941d0d2427e3 100644 --- a/kernel/bpf/sysfs_btf.c +++ b/kernel/bpf/sysfs_btf.c @@ -7,14 +7,46 @@ #include <linux/kobject.h> #include <linux/init.h> #include <linux/sysfs.h> +#include <linux/mm.h> +#include <linux/io.h> +#include <linux/btf.h> /* See scripts/link-vmlinux.sh, gen_btf() func for details */ extern char __start_BTF[]; extern char __stop_BTF[]; +static int btf_sysfs_vmlinux_mmap(struct file *filp, struct kobject *kobj, + const struct bin_attribute *attr, + struct vm_area_struct *vma) +{ + unsigned long pages = PAGE_ALIGN(attr->size) >> PAGE_SHIFT; + size_t vm_size = vma->vm_end - vma->vm_start; + phys_addr_t addr = virt_to_phys(__start_BTF); + unsigned long pfn = addr >> PAGE_SHIFT; + + if (attr->private != __start_BTF || !PAGE_ALIGNED(addr)) + return -EINVAL; + + if (vma->vm_pgoff) + return -EINVAL; + + if (vma->vm_flags & (VM_WRITE | VM_EXEC | VM_MAYSHARE)) + return -EACCES; + + if (pfn + pages < pfn) + return 
-EINVAL; + + if ((vm_size >> PAGE_SHIFT) > pages) + return -EINVAL; + + vm_flags_mod(vma, VM_DONTDUMP, VM_MAYEXEC | VM_MAYWRITE); + return remap_pfn_range(vma, vma->vm_start, pfn, vm_size, vma->vm_page_prot); +} + static struct bin_attribute bin_attr_btf_vmlinux __ro_after_init = { .attr = { .name = "vmlinux", .mode = 0444, }, .read_new = sysfs_bin_attr_simple_read, + .mmap = btf_sysfs_vmlinux_mmap, }; struct kobject *btf_kobj; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6e604caa870c..a7d6e0c5928b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -322,6 +322,7 @@ struct bpf_kfunc_call_arg_meta { struct btf *arg_btf; u32 arg_btf_id; bool arg_owning_ref; + bool arg_prog; struct { struct btf_field *field; @@ -456,7 +457,7 @@ static bool subprog_is_exc_cb(struct bpf_verifier_env *env, int subprog) static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) { - return btf_record_has_field(reg_btf_record(reg), BPF_SPIN_LOCK); + return btf_record_has_field(reg_btf_record(reg), BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK); } static bool type_is_rdonly_mem(u32 type) @@ -579,6 +580,13 @@ static bool is_cmpxchg_insn(const struct bpf_insn *insn) insn->imm == BPF_CMPXCHG; } +static bool is_atomic_load_insn(const struct bpf_insn *insn) +{ + return BPF_CLASS(insn->code) == BPF_STX && + BPF_MODE(insn->code) == BPF_ATOMIC && + insn->imm == BPF_LOAD_ACQ; +} + static int __get_spi(s32 off) { return (-off - 1) / BPF_REG_SIZE; @@ -1148,7 +1156,8 @@ static int release_irq_state(struct bpf_verifier_state *state, int id); static int mark_stack_slot_irq_flag(struct bpf_verifier_env *env, struct bpf_kfunc_call_arg_meta *meta, - struct bpf_reg_state *reg, int insn_idx) + struct bpf_reg_state *reg, int insn_idx, + int kfunc_class) { struct bpf_func_state *state = func(env, reg); struct bpf_stack_state *slot; @@ -1170,6 +1179,7 @@ static int mark_stack_slot_irq_flag(struct bpf_verifier_env *env, st->type = PTR_TO_STACK; /* we don't have dedicated reg type 
*/ st->live |= REG_LIVE_WRITTEN; st->ref_obj_id = id; + st->irq.kfunc_class = kfunc_class; for (i = 0; i < BPF_REG_SIZE; i++) slot->slot_type[i] = STACK_IRQ_FLAG; @@ -1178,7 +1188,8 @@ static int mark_stack_slot_irq_flag(struct bpf_verifier_env *env, return 0; } -static int unmark_stack_slot_irq_flag(struct bpf_verifier_env *env, struct bpf_reg_state *reg) +static int unmark_stack_slot_irq_flag(struct bpf_verifier_env *env, struct bpf_reg_state *reg, + int kfunc_class) { struct bpf_func_state *state = func(env, reg); struct bpf_stack_state *slot; @@ -1192,6 +1203,15 @@ static int unmark_stack_slot_irq_flag(struct bpf_verifier_env *env, struct bpf_r slot = &state->stack[spi]; st = &slot->spilled_ptr; + if (st->irq.kfunc_class != kfunc_class) { + const char *flag_kfunc = st->irq.kfunc_class == IRQ_NATIVE_KFUNC ? "native" : "lock"; + const char *used_kfunc = kfunc_class == IRQ_NATIVE_KFUNC ? "native" : "lock"; + + verbose(env, "irq flag acquired by %s kfuncs cannot be restored with %s kfuncs\n", + flag_kfunc, used_kfunc); + return -EINVAL; + } + err = release_irq_state(env->cur_state, st->ref_obj_id); WARN_ON_ONCE(err && err != -EACCES); if (err) { @@ -1409,6 +1429,8 @@ static int copy_reference_state(struct bpf_verifier_state *dst, const struct bpf dst->active_preempt_locks = src->active_preempt_locks; dst->active_rcu_lock = src->active_rcu_lock; dst->active_irq_id = src->active_irq_id; + dst->active_lock_id = src->active_lock_id; + dst->active_lock_ptr = src->active_lock_ptr; return 0; } @@ -1508,6 +1530,8 @@ static int acquire_lock_state(struct bpf_verifier_env *env, int insn_idx, enum r s->ptr = ptr; state->active_locks++; + state->active_lock_id = id; + state->active_lock_ptr = ptr; return 0; } @@ -1545,18 +1569,37 @@ static void release_reference_state(struct bpf_verifier_state *state, int idx) return; } +static bool find_reference_state(struct bpf_verifier_state *state, int ptr_id) +{ + int i; + + for (i = 0; i < state->acquired_refs; i++) + if 
(state->refs[i].id == ptr_id) + return true; + + return false; +} + static int release_lock_state(struct bpf_verifier_state *state, int type, int id, void *ptr) { + void *prev_ptr = NULL; + u32 prev_id = 0; int i; for (i = 0; i < state->acquired_refs; i++) { - if (state->refs[i].type != type) - continue; - if (state->refs[i].id == id && state->refs[i].ptr == ptr) { + if (state->refs[i].type == type && state->refs[i].id == id && + state->refs[i].ptr == ptr) { release_reference_state(state, i); state->active_locks--; + /* Reassign active lock (id, ptr). */ + state->active_lock_id = prev_id; + state->active_lock_ptr = prev_ptr; return 0; } + if (state->refs[i].type & REF_TYPE_LOCK_MASK) { + prev_id = state->refs[i].id; + prev_ptr = state->refs[i].ptr; + } } return -EINVAL; } @@ -1591,7 +1634,7 @@ static struct bpf_reference_state *find_lock_state(struct bpf_verifier_state *st for (i = 0; i < state->acquired_refs; i++) { struct bpf_reference_state *s = &state->refs[i]; - if (s->type != type) + if (!(s->type & type)) continue; if (s->id == id && s->ptr == ptr) @@ -1600,6 +1643,14 @@ static struct bpf_reference_state *find_lock_state(struct bpf_verifier_state *st return NULL; } +static void update_peak_states(struct bpf_verifier_env *env) +{ + u32 cur_states; + + cur_states = env->explored_states_size + env->free_list_size; + env->peak_states = max(env->peak_states, cur_states); +} + static void free_func_state(struct bpf_func_state *state) { if (!state) @@ -1622,6 +1673,50 @@ static void free_verifier_state(struct bpf_verifier_state *state, kfree(state); } +/* struct bpf_verifier_state->{parent,loop_entry} refer to states + * that are in either of env->{explored_states,free_list}. + * In both cases the state is contained in struct bpf_verifier_state_list.
+ */ +static struct bpf_verifier_state_list *state_parent_as_list(struct bpf_verifier_state *st) +{ + if (st->parent) + return container_of(st->parent, struct bpf_verifier_state_list, state); + return NULL; +} + +static struct bpf_verifier_state_list *state_loop_entry_as_list(struct bpf_verifier_state *st) +{ + if (st->loop_entry) + return container_of(st->loop_entry, struct bpf_verifier_state_list, state); + return NULL; +} + +/* A state can be freed if it is no longer referenced: + * - is in the env->free_list; + * - has no children states; + * - is not used as loop_entry. + * + * Freeing a state can make its loop_entry free-able. + */ +static void maybe_free_verifier_state(struct bpf_verifier_env *env, + struct bpf_verifier_state_list *sl) +{ + struct bpf_verifier_state_list *loop_entry_sl; + + while (sl && sl->in_free_list && + sl->state.branches == 0 && + sl->state.used_as_loop_entry == 0) { + loop_entry_sl = state_loop_entry_as_list(&sl->state); + if (loop_entry_sl) + loop_entry_sl->state.used_as_loop_entry--; + list_del(&sl->node); + free_verifier_state(&sl->state, false); + kfree(sl); + env->free_list_size--; + sl = loop_entry_sl; + } +} + /* copy verifier state from src to dst growing dst stack space * when necessary to accommodate larger src stack */ @@ -1661,6 +1756,7 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, dst_state->callback_unroll_depth = src->callback_unroll_depth; dst_state->used_as_loop_entry = src->used_as_loop_entry; dst_state->may_goto_depth = src->may_goto_depth; + dst_state->loop_entry = src->loop_entry; for (i = 0; i <= src->curframe; i++) { dst = dst_state->frame[i]; if (!dst) { @@ -1681,7 +1777,7 @@ static u32 state_htab_size(struct bpf_verifier_env *env) return env->prog->len; } -static struct bpf_verifier_state_list **explored_state(struct bpf_verifier_env *env, int idx) +static struct list_head *explored_state(struct bpf_verifier_env *env, int idx) { struct bpf_verifier_state *cur = env->cur_state; struct
bpf_func_state *state = cur->frame[cur->curframe]; @@ -1789,16 +1885,13 @@ static bool same_callsites(struct bpf_verifier_state *a, struct bpf_verifier_sta * # Find outermost loop entry known for n * def get_loop_entry(n): * h = entries.get(n, None) - * while h in entries and entries[h] != h: + * while h in entries: * h = entries[h] * return h * - * # Update n's loop entry if h's outermost entry comes - * # before n's outermost entry in current DFS path. + * # Update n's loop entry if h comes before n in current DFS path. * def update_loop_entry(n, h): - * n1 = get_loop_entry(n) or n - * h1 = get_loop_entry(h) or h - * if h1 in path and depths[h1] <= depths[n1]: + * if h in path and depths[h] < depths[entries.get(n, n)]: * entries[n] = h1 * * def dfs(n, depth): @@ -1810,7 +1903,7 @@ static bool same_callsites(struct bpf_verifier_state *a, struct bpf_verifier_sta * # Case A: explore succ and update cur's loop entry * # only if succ's entry is in current DFS path. * dfs(succ, depth + 1) - * h = get_loop_entry(succ) + * h = entries.get(succ, None) * update_loop_entry(n, h) * else: * # Case B or C depending on `h1 in path` check in update_loop_entry(). @@ -1822,46 +1915,46 @@ static bool same_callsites(struct bpf_verifier_state *a, struct bpf_verifier_sta * and cur's loop entry has to be updated (case A), handle this in * update_branch_counts(); * - use st->branch > 0 as a signal that st is in the current DFS path; - * - handle cases B and C in is_state_visited(); - * - update topmost loop entry for intermediate states in get_loop_entry(). + * - handle cases B and C in is_state_visited().
*/ -static struct bpf_verifier_state *get_loop_entry(struct bpf_verifier_state *st) +static struct bpf_verifier_state *get_loop_entry(struct bpf_verifier_env *env, + struct bpf_verifier_state *st) { - struct bpf_verifier_state *topmost = st->loop_entry, *old; + struct bpf_verifier_state *topmost = st->loop_entry; + u32 steps = 0; - while (topmost && topmost->loop_entry && topmost != topmost->loop_entry) + while (topmost && topmost->loop_entry) { + if (verifier_bug_if(steps++ > st->dfs_depth, env, "infinite loop")) + return ERR_PTR(-EFAULT); topmost = topmost->loop_entry; - /* Update loop entries for intermediate states to avoid this - * traversal in future get_loop_entry() calls. - */ - while (st && st->loop_entry != topmost) { - old = st->loop_entry; - st->loop_entry = topmost; - st = old; } return topmost; } -static void update_loop_entry(struct bpf_verifier_state *cur, struct bpf_verifier_state *hdr) +static void update_loop_entry(struct bpf_verifier_env *env, + struct bpf_verifier_state *cur, struct bpf_verifier_state *hdr) { - struct bpf_verifier_state *cur1, *hdr1; - - cur1 = get_loop_entry(cur) ?: cur; - hdr1 = get_loop_entry(hdr) ?: hdr; - /* The head1->branches check decides between cases B and C in - * comment for get_loop_entry(). If hdr1->branches == 0 then + /* The hdr->branches check decides between cases B and C in + * comment for get_loop_entry(). If hdr->branches == 0 then * head's topmost loop entry is not in current DFS path, * hence 'cur' and 'hdr' are not in the same loop and there is * no need to update cur->loop_entry. 
*/ - if (hdr1->branches && hdr1->dfs_depth <= cur1->dfs_depth) { + if (hdr->branches && hdr->dfs_depth < (cur->loop_entry ?: cur)->dfs_depth) { + if (cur->loop_entry) { + cur->loop_entry->used_as_loop_entry--; + maybe_free_verifier_state(env, state_loop_entry_as_list(cur)); + } cur->loop_entry = hdr; - hdr->used_as_loop_entry = true; + hdr->used_as_loop_entry++; } } static void update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifier_state *st) { + struct bpf_verifier_state_list *sl = NULL, *parent_sl; + struct bpf_verifier_state *parent; + while (st) { u32 br = --st->branches; @@ -1871,7 +1964,7 @@ static void update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifi * This is a part of 'case A' in get_loop_entry() comment. */ if (br == 0 && st->parent && st->loop_entry) - update_loop_entry(st->parent, st->loop_entry); + update_loop_entry(env, st->parent, st->loop_entry); /* WARN_ON(br > 1) technically makes sense here, * but see comment in push_stack(), hence: @@ -1881,7 +1974,12 @@ static void update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifi br); if (br) break; - st = st->parent; + parent = st->parent; + parent_sl = state_parent_as_list(st); + if (sl) + maybe_free_verifier_state(env, sl); + st = parent; + sl = parent_sl; } } @@ -3206,6 +3304,21 @@ bpf_jit_find_kfunc_model(const struct bpf_prog *prog, return res ? 
&res->func_model : NULL; } +static int add_kfunc_in_insns(struct bpf_verifier_env *env, + struct bpf_insn *insn, int cnt) +{ + int i, ret; + + for (i = 0; i < cnt; i++, insn++) { + if (bpf_pseudo_kfunc_call(insn)) { + ret = add_kfunc_call(env, insn->imm, insn->off); + if (ret < 0) + return ret; + } + } + return 0; +} + static int add_subprog_and_kfunc(struct bpf_verifier_env *env) { struct bpf_subprog_info *subprog = env->subprog_info; @@ -3269,6 +3382,15 @@ static int add_subprog_and_kfunc(struct bpf_verifier_env *env) return 0; } +static int jmp_offset(struct bpf_insn *insn) +{ + u8 code = insn->code; + + if (code == (BPF_JMP32 | BPF_JA)) + return insn->imm; + return insn->off; +} + static int check_subprogs(struct bpf_verifier_env *env) { int i, subprog_start, subprog_end, off, cur_subprog = 0; @@ -3295,10 +3417,7 @@ static int check_subprogs(struct bpf_verifier_env *env) goto next; if (BPF_OP(code) == BPF_EXIT || BPF_OP(code) == BPF_CALL) goto next; - if (code == (BPF_JMP32 | BPF_JA)) - off = i + insn[i].imm + 1; - else - off = i + insn[i].off + 1; + off = i + jmp_offset(&insn[i]) + 1; if (off < subprog_start || off >= subprog_end) { verbose(env, "jump out of range from insn %d to %d\n", i, off); return -EINVAL; @@ -3338,12 +3457,11 @@ static int mark_reg_read(struct bpf_verifier_env *env, /* if read wasn't screened by an earlier write ... */ if (writes && state->live & REG_LIVE_WRITTEN) break; - if (parent->live & REG_LIVE_DONE) { - verbose(env, "verifier BUG type %s var_off %lld off %d\n", - reg_type_str(env, parent->type), - parent->var_off.value, parent->off); + if (verifier_bug_if(parent->live & REG_LIVE_DONE, env, + "type %s var_off %lld off %d", + reg_type_str(env, parent->type), + parent->var_off.value, parent->off)) return -EFAULT; - } /* The first condition is more likely to be true than the * second, checked it first. 
*/ @@ -3483,7 +3601,7 @@ static bool is_reg64(struct bpf_verifier_env *env, struct bpf_insn *insn, } if (class == BPF_STX) { - /* BPF_STX (including atomic variants) has multiple source + /* BPF_STX (including atomic variants) has one or more source * operands, one of which is a ptr. Check whether the caller is * asking about it. */ @@ -3528,16 +3646,16 @@ static int insn_def_regno(const struct bpf_insn *insn) case BPF_ST: return -1; case BPF_STX: - if ((BPF_MODE(insn->code) == BPF_ATOMIC || - BPF_MODE(insn->code) == BPF_PROBE_ATOMIC) && - (insn->imm & BPF_FETCH)) { + if (BPF_MODE(insn->code) == BPF_ATOMIC || + BPF_MODE(insn->code) == BPF_PROBE_ATOMIC) { if (insn->imm == BPF_CMPXCHG) return BPF_REG_0; - else + else if (insn->imm == BPF_LOAD_ACQ) + return insn->dst_reg; + else if (insn->imm & BPF_FETCH) return insn->src_reg; - } else { - return -1; } + return -1; default: return insn->dst_reg; } @@ -3736,14 +3854,14 @@ static int push_insn_history(struct bpf_verifier_env *env, struct bpf_verifier_s /* atomic instructions push insn_flags twice, for READ and * WRITE sides, but they should agree on stack slot */ - WARN_ONCE((env->cur_hist_ent->flags & insn_flags) && - (env->cur_hist_ent->flags & insn_flags) != insn_flags, - "verifier insn history bug: insn_idx %d cur flags %x new flags %x\n", - env->insn_idx, env->cur_hist_ent->flags, insn_flags); + verifier_bug_if((env->cur_hist_ent->flags & insn_flags) && + (env->cur_hist_ent->flags & insn_flags) != insn_flags, + env, "insn history: insn_idx %d cur flags %x new flags %x", + env->insn_idx, env->cur_hist_ent->flags, insn_flags); env->cur_hist_ent->flags |= insn_flags; - WARN_ONCE(env->cur_hist_ent->linked_regs != 0, - "verifier insn history bug: insn_idx %d linked_regs != 0: %#llx\n", - env->insn_idx, env->cur_hist_ent->linked_regs); + verifier_bug_if(env->cur_hist_ent->linked_regs != 0, env, + "insn history: insn_idx %d linked_regs: %#llx", + env->insn_idx, env->cur_hist_ent->linked_regs); 
env->cur_hist_ent->linked_regs = linked_regs; return 0; } @@ -3828,6 +3946,17 @@ static const char *disasm_kfunc_name(void *data, const struct bpf_insn *insn) return btf_name_by_offset(desc_btf, func->name_off); } +static void verbose_insn(struct bpf_verifier_env *env, struct bpf_insn *insn) +{ + const struct bpf_insn_cbs cbs = { + .cb_call = disasm_kfunc_name, + .cb_print = verbose, + .private_data = env, + }; + + print_bpf_insn(&cbs, insn, env->allow_ptr_leaks); +} + static inline void bt_init(struct backtrack_state *bt, u32 frame) { bt->frame = frame; @@ -3855,8 +3984,7 @@ static inline u32 bt_empty(struct backtrack_state *bt) static inline int bt_subprog_enter(struct backtrack_state *bt) { if (bt->frame == MAX_CALL_FRAMES - 1) { - verbose(bt->env, "BUG subprog enter from frame %d\n", bt->frame); - WARN_ONCE(1, "verifier backtracking bug"); + verifier_bug(bt->env, "subprog enter from frame %d", bt->frame); return -EFAULT; } bt->frame++; @@ -3866,8 +3994,7 @@ static inline int bt_subprog_enter(struct backtrack_state *bt) static inline int bt_subprog_exit(struct backtrack_state *bt) { if (bt->frame == 0) { - verbose(bt->env, "BUG subprog exit from frame 0\n"); - WARN_ONCE(1, "verifier backtracking bug"); + verifier_bug(bt->env, "subprog exit from frame 0"); return -EFAULT; } bt->frame--; @@ -4028,11 +4155,6 @@ static bool calls_callback(struct bpf_verifier_env *env, int insn_idx); static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, struct bpf_insn_hist_entry *hist, struct backtrack_state *bt) { - const struct bpf_insn_cbs cbs = { - .cb_call = disasm_kfunc_name, - .cb_print = verbose, - .private_data = env, - }; struct bpf_insn *insn = env->prog->insnsi + idx; u8 class = BPF_CLASS(insn->code); u8 opcode = BPF_OP(insn->code); @@ -4050,7 +4172,7 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, fmt_stack_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, bt_stack_mask(bt)); verbose(env, "stack=%s before ", 
env->tmp_str_buf); verbose(env, "%d: ", idx); - print_bpf_insn(&cbs, insn, env->allow_ptr_leaks); + verbose_insn(env, insn); } /* If there is a history record that some registers gained range at this insn, @@ -4097,7 +4219,7 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, * dreg still needs precision before this insn */ } - } else if (class == BPF_LDX) { + } else if (class == BPF_LDX || is_atomic_load_insn(insn)) { if (!bt_is_reg_set(bt, dreg)) return 0; bt_clear_reg(bt, dreg); @@ -4150,14 +4272,15 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, * should be literally next instruction in * caller program */ - WARN_ONCE(idx + 1 != subseq_idx, "verifier backtracking bug"); + verifier_bug_if(idx + 1 != subseq_idx, env, + "extra insn from subprog"); /* r1-r5 are invalidated after subprog call, * so for global func call it shouldn't be set * anymore */ if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) { - verbose(env, "BUG regs %x\n", bt_reg_mask(bt)); - WARN_ONCE(1, "verifier backtracking bug"); + verifier_bug(env, "global subprog unexpected regs %x", + bt_reg_mask(bt)); return -EFAULT; } /* global subprog always sets R0 */ @@ -4171,16 +4294,17 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, * the current frame should be zero by now */ if (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) { - verbose(env, "BUG regs %x\n", bt_reg_mask(bt)); - WARN_ONCE(1, "verifier backtracking bug"); + verifier_bug(env, "static subprog unexpected regs %x", + bt_reg_mask(bt)); return -EFAULT; } /* we are now tracking register spills correctly, * so any instance of leftover slots is a bug */ if (bt_stack_mask(bt) != 0) { - verbose(env, "BUG stack slots %llx\n", bt_stack_mask(bt)); - WARN_ONCE(1, "verifier backtracking bug (subprog leftover stack slots)"); + verifier_bug(env, + "static subprog leftover stack slots %llx", + bt_stack_mask(bt)); return -EFAULT; } /* propagate r1-r5 to the caller */ @@ 
-4203,13 +4327,13 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, * not actually arguments passed directly to callback subprogs */ if (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) { - verbose(env, "BUG regs %x\n", bt_reg_mask(bt)); - WARN_ONCE(1, "verifier backtracking bug"); + verifier_bug(env, "callback unexpected regs %x", + bt_reg_mask(bt)); return -EFAULT; } if (bt_stack_mask(bt) != 0) { - verbose(env, "BUG stack slots %llx\n", bt_stack_mask(bt)); - WARN_ONCE(1, "verifier backtracking bug (callback leftover stack slots)"); + verifier_bug(env, "callback leftover stack slots %llx", + bt_stack_mask(bt)); return -EFAULT; } /* clear r1-r5 in callback subprog's mask */ @@ -4228,11 +4352,11 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, /* regular helper call sets R0 */ bt_clear_reg(bt, BPF_REG_0); if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) { - /* if backtracing was looking for registers R1-R5 + /* if backtracking was looking for registers R1-R5 * they should have been found already. */ - verbose(env, "BUG regs %x\n", bt_reg_mask(bt)); - WARN_ONCE(1, "verifier backtracking bug"); + verifier_bug(env, "backtracking call unexpected regs %x", + bt_reg_mask(bt)); return -EFAULT; } } else if (opcode == BPF_EXIT) { @@ -4250,8 +4374,8 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, for (i = BPF_REG_1; i <= BPF_REG_5; i++) bt_clear_reg(bt, i); if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) { - verbose(env, "BUG regs %x\n", bt_reg_mask(bt)); - WARN_ONCE(1, "verifier backtracking bug"); + verifier_bug(env, "backtracking exit unexpected regs %x", + bt_reg_mask(bt)); return -EFAULT; } @@ -4286,8 +4410,10 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, * before it would be equally necessary to * propagate it to dreg. 
*/ - bt_set_reg(bt, dreg); - bt_set_reg(bt, sreg); + if (!hist || !(hist->flags & INSN_F_SRC_REG_STACK)) + bt_set_reg(bt, sreg); + if (!hist || !(hist->flags & INSN_F_DST_REG_STACK)) + bt_set_reg(bt, dreg); } else if (BPF_SRC(insn->code) == BPF_K) { /* dreg <cond> K * Only dreg still needs precision before @@ -4592,9 +4718,8 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) return 0; } - verbose(env, "BUG backtracking func entry subprog %d reg_mask %x stack_mask %llx\n", - st->frame[0]->subprogno, bt_reg_mask(bt), bt_stack_mask(bt)); - WARN_ONCE(1, "verifier backtracking bug"); + verifier_bug(env, "backtracking func entry subprog %d reg_mask %x stack_mask %llx", + st->frame[0]->subprogno, bt_reg_mask(bt), bt_stack_mask(bt)); return -EFAULT; } @@ -4630,8 +4755,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) * It means the backtracking missed the spot where * particular register was initialized with a constant. */ - verbose(env, "BUG backtracking idx %d\n", i); - WARN_ONCE(1, "verifier backtracking bug"); + verifier_bug(env, "backtracking idx %d", i); return -EFAULT; } } @@ -4656,12 +4780,10 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) bitmap_from_u64(mask, bt_frame_stack_mask(bt, fr)); for_each_set_bit(i, mask, 64) { - if (i >= func->allocated_stack / BPF_REG_SIZE) { - verbose(env, "BUG backtracking (stack slot %d, total slots %d)\n", - i, func->allocated_stack / BPF_REG_SIZE); - WARN_ONCE(1, "verifier backtracking bug (stack slot out of bounds)"); + if (verifier_bug_if(i >= func->allocated_stack / BPF_REG_SIZE, + env, "stack slot %d, total slots %d", + i, func->allocated_stack / BPF_REG_SIZE)) return -EFAULT; - } if (!is_spilled_scalar_reg(&func->stack[i])) { bt_clear_frame_slot(bt, fr, i); @@ -5982,18 +6104,10 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, /* check access to 'struct bpf_context' fields. 
Supports fixed offsets only */ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size, - enum bpf_access_type t, enum bpf_reg_type *reg_type, - struct btf **btf, u32 *btf_id, bool *is_retval, bool is_ldsx) + enum bpf_access_type t, struct bpf_insn_access_aux *info) { - struct bpf_insn_access_aux info = { - .reg_type = *reg_type, - .log = &env->log, - .is_retval = false, - .is_ldsx = is_ldsx, - }; - if (env->ops->is_valid_access && - env->ops->is_valid_access(off, size, t, env->prog, &info)) { + env->ops->is_valid_access(off, size, t, env->prog, info)) { /* A non zero info.ctx_field_size indicates that this field is a * candidate for later verifier transformation to load the whole * field and then apply a mask when accessed with a narrower @@ -6001,14 +6115,15 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, * will only allow for whole field access and rejects any other * type of narrower access. */ - *reg_type = info.reg_type; - *is_retval = info.is_retval; - - if (base_type(*reg_type) == PTR_TO_BTF_ID) { - *btf = info.btf; - *btf_id = info.btf_id; + if (base_type(info->reg_type) == PTR_TO_BTF_ID) { + if (info->ref_obj_id && + !find_reference_state(env->cur_state, info->ref_obj_id)) { + verbose(env, "invalid bpf_context access off=%d. Reference may already be released\n", + off); + return -EACCES; + } } else { - env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size; + env->insn_aux_data[insn_idx].ctx_field_size = info->ctx_field_size; } /* remember the offset of last byte accessed in ctx */ if (env->prog->aux->max_ctx_offset < off + size) @@ -6118,6 +6233,26 @@ static bool is_arena_reg(struct bpf_verifier_env *env, int regno) return reg->type == PTR_TO_ARENA; } +/* Return false if @regno contains a pointer whose type isn't supported for + * atomic instruction @insn. 
+ */ +static bool atomic_ptr_type_ok(struct bpf_verifier_env *env, int regno, + struct bpf_insn *insn) +{ + if (is_ctx_reg(env, regno)) + return false; + if (is_pkt_reg(env, regno)) + return false; + if (is_flow_key_reg(env, regno)) + return false; + if (is_sk_reg(env, regno)) + return false; + if (is_arena_reg(env, regno)) + return bpf_jit_supports_insn(insn, true); + + return true; +} + static u32 *reg2btf_ids[__BPF_REG_TYPE_MAX] = { #ifdef CONFIG_NET [PTR_TO_SOCKET] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK], @@ -6421,21 +6556,18 @@ continue_func: /* find the callee */ next_insn = i + insn[i].imm + 1; sidx = find_subprog(env, next_insn); - if (sidx < 0) { - WARN_ONCE(1, "verifier bug. No program starts at insn %d\n", - next_insn); + if (verifier_bug_if(sidx < 0, env, "callee not found at insn %d", next_insn)) return -EFAULT; - } if (subprog[sidx].is_async_cb) { if (subprog[sidx].has_tail_call) { - verbose(env, "verifier bug. subprog has tail_call and async cb\n"); + verifier_bug(env, "subprog has tail_call and async cb"); return -EFAULT; } /* async callbacks don't increase bpf prog stack size unless called directly */ if (!bpf_pseudo_call(insn + i)) continue; if (subprog[sidx].is_exception_cb) { - verbose(env, "insn %d cannot call exception cb directly\n", i); + verbose(env, "insn %d cannot call exception cb directly", i); return -EINVAL; } } @@ -6535,11 +6667,8 @@ static int get_callee_stack_depth(struct bpf_verifier_env *env, int start = idx + insn->imm + 1, subprog; subprog = find_subprog(env, start); - if (subprog < 0) { - WARN_ONCE(1, "verifier bug. 
No program starts at insn %d\n", - start); + if (verifier_bug_if(subprog < 0, env, "get stack depth: no program at insn %d", start)) return -EFAULT; - } return env->subprog_info[subprog].stack_depth; } #endif @@ -7365,11 +7494,12 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (!err && value_regno >= 0 && (t == BPF_READ || rdonly_mem)) mark_reg_unknown(env, regs, value_regno); } else if (reg->type == PTR_TO_CTX) { - bool is_retval = false; struct bpf_retval_range range; - enum bpf_reg_type reg_type = SCALAR_VALUE; - struct btf *btf = NULL; - u32 btf_id = 0; + struct bpf_insn_access_aux info = { + .reg_type = SCALAR_VALUE, + .is_ldsx = is_ldsx, + .log = &env->log, + }; if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { @@ -7381,8 +7511,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (err < 0) return err; - err = check_ctx_access(env, insn_idx, off, size, t, ®_type, &btf, - &btf_id, &is_retval, is_ldsx); + err = check_ctx_access(env, insn_idx, off, size, t, &info); if (err) verbose_linfo(env, insn_idx, "; "); if (!err && t == BPF_READ && value_regno >= 0) { @@ -7390,8 +7519,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn * PTR_TO_PACKET[_META,_END]. In the latter * case, we know the offset is zero. 
*/ - if (reg_type == SCALAR_VALUE) { - if (is_retval && get_func_retval_range(env->prog, &range)) { + if (info.reg_type == SCALAR_VALUE) { + if (info.is_retval && get_func_retval_range(env->prog, &range)) { err = __mark_reg_s32_range(env, regs, value_regno, range.minval, range.maxval); if (err) @@ -7402,7 +7531,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } else { mark_reg_known_zero(env, regs, value_regno); - if (type_may_be_null(reg_type)) + if (type_may_be_null(info.reg_type)) regs[value_regno].id = ++env->id_gen; /* A load of ctx field could have different * actual load size with the one encoded in the @@ -7410,12 +7539,13 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn * a sub-register. */ regs[value_regno].subreg_def = DEF_NOT_SUBREG; - if (base_type(reg_type) == PTR_TO_BTF_ID) { - regs[value_regno].btf = btf; - regs[value_regno].btf_id = btf_id; + if (base_type(info.reg_type) == PTR_TO_BTF_ID) { + regs[value_regno].btf = info.btf; + regs[value_regno].btf_id = info.btf_id; + regs[value_regno].ref_obj_id = info.ref_obj_id; } } - regs[value_regno].type = reg_type; + regs[value_regno].type = info.reg_type; } } else if (reg->type == PTR_TO_STACK) { @@ -7518,27 +7648,72 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn static int save_aux_ptr_type(struct bpf_verifier_env *env, enum bpf_reg_type type, bool allow_trust_mismatch); -static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_insn *insn) +static int check_load_mem(struct bpf_verifier_env *env, struct bpf_insn *insn, + bool strict_alignment_once, bool is_ldsx, + bool allow_trust_mismatch, const char *ctx) { - int load_reg; + struct bpf_reg_state *regs = cur_regs(env); + enum bpf_reg_type src_reg_type; int err; - switch (insn->imm) { - case BPF_ADD: - case BPF_ADD | BPF_FETCH: - case BPF_AND: - case BPF_AND | BPF_FETCH: - case BPF_OR: - case BPF_OR | BPF_FETCH: - case BPF_XOR: 
- case BPF_XOR | BPF_FETCH: - case BPF_XCHG: - case BPF_CMPXCHG: - break; - default: - verbose(env, "BPF_ATOMIC uses invalid atomic opcode %02x\n", insn->imm); - return -EINVAL; - } + /* check src operand */ + err = check_reg_arg(env, insn->src_reg, SRC_OP); + if (err) + return err; + + /* check dst operand */ + err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK); + if (err) + return err; + + src_reg_type = regs[insn->src_reg].type; + + /* Check if (src_reg + off) is readable. The state of dst_reg will be + * updated by this call. + */ + err = check_mem_access(env, env->insn_idx, insn->src_reg, insn->off, + BPF_SIZE(insn->code), BPF_READ, insn->dst_reg, + strict_alignment_once, is_ldsx); + err = err ?: save_aux_ptr_type(env, src_reg_type, + allow_trust_mismatch); + err = err ?: reg_bounds_sanity_check(env, ®s[insn->dst_reg], ctx); + + return err; +} + +static int check_store_reg(struct bpf_verifier_env *env, struct bpf_insn *insn, + bool strict_alignment_once) +{ + struct bpf_reg_state *regs = cur_regs(env); + enum bpf_reg_type dst_reg_type; + int err; + + /* check src1 operand */ + err = check_reg_arg(env, insn->src_reg, SRC_OP); + if (err) + return err; + + /* check src2 operand */ + err = check_reg_arg(env, insn->dst_reg, SRC_OP); + if (err) + return err; + + dst_reg_type = regs[insn->dst_reg].type; + + /* Check if (dst_reg + off) is writeable. 
*/ + err = check_mem_access(env, env->insn_idx, insn->dst_reg, insn->off, + BPF_SIZE(insn->code), BPF_WRITE, insn->src_reg, + strict_alignment_once, false); + err = err ?: save_aux_ptr_type(env, dst_reg_type, false); + + return err; +} + +static int check_atomic_rmw(struct bpf_verifier_env *env, + struct bpf_insn *insn) +{ + int load_reg; + int err; if (BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) { verbose(env, "invalid atomic operand size\n"); @@ -7574,11 +7749,7 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i return -EACCES; } - if (is_ctx_reg(env, insn->dst_reg) || - is_pkt_reg(env, insn->dst_reg) || - is_flow_key_reg(env, insn->dst_reg) || - is_sk_reg(env, insn->dst_reg) || - (is_arena_reg(env, insn->dst_reg) && !bpf_jit_supports_insn(insn, true))) { + if (!atomic_ptr_type_ok(env, insn->dst_reg, insn)) { verbose(env, "BPF_ATOMIC stores into R%d %s is not allowed\n", insn->dst_reg, reg_type_str(env, reg_state(env, insn->dst_reg)->type)); @@ -7605,12 +7776,12 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i /* Check whether we can read the memory, with second call for fetch * case to simulate the register fill. */ - err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, + err = check_mem_access(env, env->insn_idx, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_READ, -1, true, false); if (!err && load_reg >= 0) - err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, - BPF_SIZE(insn->code), BPF_READ, load_reg, - true, false); + err = check_mem_access(env, env->insn_idx, insn->dst_reg, + insn->off, BPF_SIZE(insn->code), + BPF_READ, load_reg, true, false); if (err) return err; @@ -7620,13 +7791,86 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i return err; } /* Check whether we can write into the same memory. 
*/ - err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, + err = check_mem_access(env, env->insn_idx, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_WRITE, -1, true, false); if (err) return err; return 0; } +static int check_atomic_load(struct bpf_verifier_env *env, + struct bpf_insn *insn) +{ + int err; + + err = check_load_mem(env, insn, true, false, false, "atomic_load"); + if (err) + return err; + + if (!atomic_ptr_type_ok(env, insn->src_reg, insn)) { + verbose(env, "BPF_ATOMIC loads from R%d %s is not allowed\n", + insn->src_reg, + reg_type_str(env, reg_state(env, insn->src_reg)->type)); + return -EACCES; + } + + return 0; +} + +static int check_atomic_store(struct bpf_verifier_env *env, + struct bpf_insn *insn) +{ + int err; + + err = check_store_reg(env, insn, true); + if (err) + return err; + + if (!atomic_ptr_type_ok(env, insn->dst_reg, insn)) { + verbose(env, "BPF_ATOMIC stores into R%d %s is not allowed\n", + insn->dst_reg, + reg_type_str(env, reg_state(env, insn->dst_reg)->type)); + return -EACCES; + } + + return 0; +} + +static int check_atomic(struct bpf_verifier_env *env, struct bpf_insn *insn) +{ + switch (insn->imm) { + case BPF_ADD: + case BPF_ADD | BPF_FETCH: + case BPF_AND: + case BPF_AND | BPF_FETCH: + case BPF_OR: + case BPF_OR | BPF_FETCH: + case BPF_XOR: + case BPF_XOR | BPF_FETCH: + case BPF_XCHG: + case BPF_CMPXCHG: + return check_atomic_rmw(env, insn); + case BPF_LOAD_ACQ: + if (BPF_SIZE(insn->code) == BPF_DW && BITS_PER_LONG != 64) { + verbose(env, + "64-bit load-acquires are only supported on 64-bit arches\n"); + return -EOPNOTSUPP; + } + return check_atomic_load(env, insn); + case BPF_STORE_REL: + if (BPF_SIZE(insn->code) == BPF_DW && BITS_PER_LONG != 64) { + verbose(env, + "64-bit store-releases are only supported on 64-bit arches\n"); + return -EOPNOTSUPP; + } + return check_atomic_store(env, insn); + default: + verbose(env, "BPF_ATOMIC uses invalid atomic opcode %02x\n", + insn->imm); + return -EINVAL; + } +} + /* 
When register 'regno' is used to read the stack (either directly or through * a helper function) make sure that it's within stack boundary and, depending * on the access type and privileges, that all elements of the stack are @@ -7729,7 +7973,7 @@ static int check_stack_range_initialized( slot = -i - 1; spi = slot / BPF_REG_SIZE; if (state->allocated_stack <= slot) { - verbose(env, "verifier bug: allocated_stack too small\n"); + verbose(env, "allocated_stack too small\n"); return -EFAULT; } @@ -7985,6 +8229,12 @@ static int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg return err; } +enum { + PROCESS_SPIN_LOCK = (1 << 0), + PROCESS_RES_LOCK = (1 << 1), + PROCESS_LOCK_IRQ = (1 << 2), +}; + /* Implementation details: * bpf_map_lookup returns PTR_TO_MAP_VALUE_OR_NULL. * bpf_obj_new returns PTR_TO_BTF_ID | MEM_ALLOC | PTR_MAYBE_NULL. @@ -8007,30 +8257,33 @@ static int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg * env->cur_state->active_locks remembers which map value element or allocated * object got locked and clears it after bpf_spin_unlock. */ -static int process_spin_lock(struct bpf_verifier_env *env, int regno, - bool is_lock) +static int process_spin_lock(struct bpf_verifier_env *env, int regno, int flags) { + bool is_lock = flags & PROCESS_SPIN_LOCK, is_res_lock = flags & PROCESS_RES_LOCK; + const char *lock_str = is_res_lock ? "bpf_res_spin" : "bpf_spin"; struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; struct bpf_verifier_state *cur = env->cur_state; bool is_const = tnum_is_const(reg->var_off); + bool is_irq = flags & PROCESS_LOCK_IRQ; u64 val = reg->var_off.value; struct bpf_map *map = NULL; struct btf *btf = NULL; struct btf_record *rec; + u32 spin_lock_off; int err; if (!is_const) { verbose(env, - "R%d doesn't have constant offset. bpf_spin_lock has to be at the constant offset\n", - regno); + "R%d doesn't have constant offset. 
%s_lock has to be at the constant offset\n", + regno, lock_str); return -EINVAL; } if (reg->type == PTR_TO_MAP_VALUE) { map = reg->map_ptr; if (!map->btf) { verbose(env, - "map '%s' has to have BTF in order to use bpf_spin_lock\n", - map->name); + "map '%s' has to have BTF in order to use %s_lock\n", + map->name, lock_str); return -EINVAL; } } else { @@ -8038,36 +8291,53 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno, } rec = reg_btf_record(reg); - if (!btf_record_has_field(rec, BPF_SPIN_LOCK)) { - verbose(env, "%s '%s' has no valid bpf_spin_lock\n", map ? "map" : "local", - map ? map->name : "kptr"); + if (!btf_record_has_field(rec, is_res_lock ? BPF_RES_SPIN_LOCK : BPF_SPIN_LOCK)) { + verbose(env, "%s '%s' has no valid %s_lock\n", map ? "map" : "local", + map ? map->name : "kptr", lock_str); return -EINVAL; } - if (rec->spin_lock_off != val + reg->off) { - verbose(env, "off %lld doesn't point to 'struct bpf_spin_lock' that is at %d\n", - val + reg->off, rec->spin_lock_off); + spin_lock_off = is_res_lock ? 
rec->res_spin_lock_off : rec->spin_lock_off; + if (spin_lock_off != val + reg->off) { + verbose(env, "off %lld doesn't point to 'struct %s_lock' that is at %d\n", + val + reg->off, lock_str, spin_lock_off); return -EINVAL; } if (is_lock) { void *ptr; + int type; if (map) ptr = map; else ptr = btf; - if (cur->active_locks) { - verbose(env, - "Locking two bpf_spin_locks are not allowed\n"); - return -EINVAL; + if (!is_res_lock && cur->active_locks) { + if (find_lock_state(env->cur_state, REF_TYPE_LOCK, 0, NULL)) { + verbose(env, + "Locking two bpf_spin_locks are not allowed\n"); + return -EINVAL; + } + } else if (is_res_lock && cur->active_locks) { + if (find_lock_state(env->cur_state, REF_TYPE_RES_LOCK | REF_TYPE_RES_LOCK_IRQ, reg->id, ptr)) { + verbose(env, "Acquiring the same lock again, AA deadlock detected\n"); + return -EINVAL; + } } - err = acquire_lock_state(env, env->insn_idx, REF_TYPE_LOCK, reg->id, ptr); + + if (is_res_lock && is_irq) + type = REF_TYPE_RES_LOCK_IRQ; + else if (is_res_lock) + type = REF_TYPE_RES_LOCK; + else + type = REF_TYPE_LOCK; + err = acquire_lock_state(env, env->insn_idx, type, reg->id, ptr); if (err < 0) { verbose(env, "Failed to acquire lock state\n"); return err; } } else { void *ptr; + int type; if (map) ptr = map; @@ -8075,12 +8345,26 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno, ptr = btf; if (!cur->active_locks) { - verbose(env, "bpf_spin_unlock without taking a lock\n"); + verbose(env, "%s_unlock without taking a lock\n", lock_str); return -EINVAL; } - if (release_lock_state(env->cur_state, REF_TYPE_LOCK, reg->id, ptr)) { - verbose(env, "bpf_spin_unlock of different lock\n"); + if (is_res_lock && is_irq) + type = REF_TYPE_RES_LOCK_IRQ; + else if (is_res_lock) + type = REF_TYPE_RES_LOCK; + else + type = REF_TYPE_LOCK; + if (!find_lock_state(cur, type, reg->id, ptr)) { + verbose(env, "%s_unlock of different lock\n", lock_str); + return -EINVAL; + } + if (reg->id != cur->active_lock_id || ptr != 
cur->active_lock_ptr) { + verbose(env, "%s_unlock cannot be out of order\n", lock_str); + return -EINVAL; + } + if (release_lock_state(cur, type, reg->id, ptr)) { + verbose(env, "%s_unlock of different lock\n", lock_str); return -EINVAL; } @@ -8118,7 +8402,7 @@ static int process_timer_func(struct bpf_verifier_env *env, int regno, return -EINVAL; } if (meta->map_ptr) { - verbose(env, "verifier bug. Two map pointers in a timer helper\n"); + verifier_bug(env, "Two map pointers in a timer helper"); return -EFAULT; } meta->map_uid = reg->map_uid; @@ -8431,10 +8715,12 @@ static struct bpf_verifier_state *find_prev_entry(struct bpf_verifier_env *env, { struct bpf_verifier_state_list *sl; struct bpf_verifier_state *st; + struct list_head *pos, *head; /* Explored states are pushed in stack order, most recent states come first */ - sl = *explored_state(env, insn_idx); - for (; sl; sl = sl->next) { + head = explored_state(env, insn_idx); + list_for_each(pos, head) { + sl = container_of(pos, struct bpf_verifier_state_list, node); /* If st->branches != 0 state is a part of current DFS verification path, * hence cur & st for a loop. 
*/ @@ -9404,11 +9690,11 @@ skip_type_check: return -EACCES; } if (meta->func_id == BPF_FUNC_spin_lock) { - err = process_spin_lock(env, regno, true); + err = process_spin_lock(env, regno, PROCESS_SPIN_LOCK); if (err) return err; } else if (meta->func_id == BPF_FUNC_spin_unlock) { - err = process_spin_lock(env, regno, false); + err = process_spin_lock(env, regno, 0); if (err) return err; } else { @@ -9666,7 +9952,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY) goto error; if (env->subprog_cnt > 1 && !allow_tail_call_in_subprogs(env)) { - verbose(env, "tail_calls are not allowed in non-JITed programs with bpf-to-bpf calls\n"); + verbose(env, "mixing of tail_calls and bpf-to-bpf calls is not supported\n"); return -EINVAL; } break; @@ -9988,8 +10274,7 @@ static int setup_func_entry(struct bpf_verifier_env *env, int subprog, int calls } if (state->frame[state->curframe + 1]) { - verbose(env, "verifier bug. Frame %d already allocated\n", - state->curframe + 1); + verifier_bug(env, "Frame %d already allocated", state->curframe + 1); return -EFAULT; } @@ -10103,8 +10388,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, if (err) return err; } else { - bpf_log(log, "verifier bug: unrecognized arg#%d type %d\n", - i, arg->arg_type); + verifier_bug(env, "unrecognized arg#%d type %d", i, arg->arg_type); return -EFAULT; } } @@ -10167,13 +10451,13 @@ static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *ins env->subprog_info[subprog].is_cb = true; if (bpf_pseudo_kfunc_call(insn) && !is_callback_calling_kfunc(insn->imm)) { - verbose(env, "verifier bug: kfunc %s#%d not marked as callback-calling\n", - func_id_name(insn->imm), insn->imm); + verifier_bug(env, "kfunc %s#%d not marked as callback-calling", + func_id_name(insn->imm), insn->imm); return -EFAULT; } else if (!bpf_pseudo_kfunc_call(insn) && !is_callback_calling_function(insn->imm)) { /* helper */ 
- verbose(env, "verifier bug: helper %s#%d not marked as callback-calling\n", - func_id_name(insn->imm), insn->imm); + verifier_bug(env, "helper %s#%d not marked as callback-calling", + func_id_name(insn->imm), insn->imm); return -EFAULT; } @@ -10225,10 +10509,9 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, target_insn = *insn_idx + insn->imm + 1; subprog = find_subprog(env, target_insn); - if (subprog < 0) { - verbose(env, "verifier bug. No program starts at insn %d\n", target_insn); + if (verifier_bug_if(subprog < 0, env, "target of func call at insn %d is not a program", + target_insn)) return -EFAULT; - } caller = state->frame[state->curframe]; err = btf_check_subprog_call(env, subprog, caller->regs); @@ -10237,23 +10520,18 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (subprog_is_global(env, subprog)) { const char *sub_name = subprog_name(env, subprog); - /* Only global subprogs cannot be called with a lock held. */ if (env->cur_state->active_locks) { verbose(env, "global function calls are not allowed while holding a lock,\n" "use static function instead\n"); return -EINVAL; } - /* Only global subprogs cannot be called with preemption disabled. 
*/ - if (env->cur_state->active_preempt_locks) { - verbose(env, "global function calls are not allowed with preemption disabled,\n" - "use static function instead\n"); - return -EINVAL; - } - - if (env->cur_state->active_irq_id) { - verbose(env, "global function calls are not allowed with IRQs disabled,\n" - "use static function instead\n"); + if (env->subprog_info[subprog].might_sleep && + (env->cur_state->active_rcu_lock || env->cur_state->active_preempt_locks || + env->cur_state->active_irq_id || !in_sleepable(env))) { + verbose(env, "global functions that may sleep are not allowed in non-sleepable context,\n" + "i.e., in a RCU/IRQ/preempt-disabled section, or in\n" + "a non-sleepable BPF program context\n"); return -EINVAL; } @@ -10752,6 +11030,8 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, static int check_reference_leak(struct bpf_verifier_env *env, bool exception_exit) { struct bpf_verifier_state *state = env->cur_state; + enum bpf_prog_type type = resolve_prog_type(env->prog); + struct bpf_reg_state *reg = reg_state(env, BPF_REG_0); bool refs_lingering = false; int i; @@ -10761,6 +11041,12 @@ static int check_reference_leak(struct bpf_verifier_env *env, bool exception_exi for (i = 0; i < state->acquired_refs; i++) { if (state->refs[i].type != REF_TYPE_PTR) continue; + /* Allow struct_ops programs to return a referenced kptr back to + * kernel. Type checks are performed later in check_return_code. 
+ */ + if (type == BPF_PROG_TYPE_STRUCT_OPS && !exception_exit && + reg->ref_obj_id == state->refs[i].id) + continue; verbose(env, "Unreleased reference id=%d alloc_insn=%d\n", state->refs[i].id, state->refs[i].insn_idx); refs_lingering = true; @@ -10824,7 +11110,7 @@ static int check_bpf_snprintf_call(struct bpf_verifier_env *env, err = fmt_map->ops->map_direct_value_addr(fmt_map, &fmt_addr, fmt_map_off); if (err) { - verbose(env, "verifier bug\n"); + verbose(env, "failed to retrieve map value address\n"); return -EFAULT; } fmt = (char *)(long)fmt_addr + fmt_map_off; @@ -11287,7 +11573,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn regs[BPF_REG_0].map_uid = meta.map_uid; regs[BPF_REG_0].type = PTR_TO_MAP_VALUE | ret_flag; if (!type_may_be_null(ret_flag) && - btf_record_has_field(meta.map_ptr->record, BPF_SPIN_LOCK)) { + btf_record_has_field(meta.map_ptr->record, BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK)) { regs[BPF_REG_0].id = ++env->id_gen; } break; @@ -11459,10 +11745,10 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn /* mark_btf_func_reg_size() is used when the reg size is determined by * the BTF func_proto's return value size and argument. 
*/ -static void mark_btf_func_reg_size(struct bpf_verifier_env *env, u32 regno, - size_t reg_size) +static void __mark_btf_func_reg_size(struct bpf_verifier_env *env, struct bpf_reg_state *regs, + u32 regno, size_t reg_size) { - struct bpf_reg_state *reg = &cur_regs(env)[regno]; + struct bpf_reg_state *reg = &regs[regno]; if (regno == BPF_REG_0) { /* Function return value */ @@ -11480,6 +11766,12 @@ static void mark_btf_func_reg_size(struct bpf_verifier_env *env, u32 regno, } } +static void mark_btf_func_reg_size(struct bpf_verifier_env *env, u32 regno, + size_t reg_size) +{ + return __mark_btf_func_reg_size(env, cur_regs(env), regno, reg_size); +} + static bool is_kfunc_acquire(struct bpf_kfunc_call_arg_meta *meta) { return meta->kfunc_flags & KF_ACQUIRE; @@ -11591,6 +11883,11 @@ static bool is_kfunc_arg_irq_flag(const struct btf *btf, const struct btf_param return btf_param_match_suffix(btf, arg, "__irq_flag"); } +static bool is_kfunc_arg_prog(const struct btf *btf, const struct btf_param *arg) +{ + return btf_param_match_suffix(btf, arg, "__prog"); +} + static bool is_kfunc_arg_scalar_with_name(const struct btf *btf, const struct btf_param *arg, const char *name) @@ -11617,6 +11914,7 @@ enum { KF_ARG_RB_ROOT_ID, KF_ARG_RB_NODE_ID, KF_ARG_WORKQUEUE_ID, + KF_ARG_RES_SPIN_LOCK_ID, }; BTF_ID_LIST(kf_arg_btf_ids) @@ -11626,6 +11924,7 @@ BTF_ID(struct, bpf_list_node) BTF_ID(struct, bpf_rb_root) BTF_ID(struct, bpf_rb_node) BTF_ID(struct, bpf_wq) +BTF_ID(struct, bpf_res_spin_lock) static bool __is_kfunc_ptr_arg_type(const struct btf *btf, const struct btf_param *arg, int type) @@ -11674,6 +11973,21 @@ static bool is_kfunc_arg_wq(const struct btf *btf, const struct btf_param *arg) return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_WORKQUEUE_ID); } +static bool is_kfunc_arg_res_spin_lock(const struct btf *btf, const struct btf_param *arg) +{ + return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_RES_SPIN_LOCK_ID); +} + +static bool is_rbtree_node_type(const struct btf_type *t) +{ + 
return t == btf_type_by_id(btf_vmlinux, kf_arg_btf_ids[KF_ARG_RB_NODE_ID]); +} + +static bool is_list_node_type(const struct btf_type *t) +{ + return t == btf_type_by_id(btf_vmlinux, kf_arg_btf_ids[KF_ARG_LIST_NODE_ID]); +} + static bool is_kfunc_arg_callback(struct bpf_verifier_env *env, const struct btf *btf, const struct btf_param *arg) { @@ -11745,6 +12059,7 @@ enum kfunc_ptr_arg_type { KF_ARG_PTR_TO_MAP, KF_ARG_PTR_TO_WORKQUEUE, KF_ARG_PTR_TO_IRQ_FLAG, + KF_ARG_PTR_TO_RES_SPIN_LOCK, }; enum special_kfunc_type { @@ -11755,6 +12070,8 @@ enum special_kfunc_type { KF_bpf_list_push_back_impl, KF_bpf_list_pop_front, KF_bpf_list_pop_back, + KF_bpf_list_front, + KF_bpf_list_back, KF_bpf_cast_to_kern_ctx, KF_bpf_rdonly_cast, KF_bpf_rcu_read_lock, @@ -11762,6 +12079,9 @@ enum special_kfunc_type { KF_bpf_rbtree_remove, KF_bpf_rbtree_add_impl, KF_bpf_rbtree_first, + KF_bpf_rbtree_root, + KF_bpf_rbtree_left, + KF_bpf_rbtree_right, KF_bpf_dynptr_from_skb, KF_bpf_dynptr_from_xdp, KF_bpf_dynptr_slice, @@ -11781,37 +12101,15 @@ enum special_kfunc_type { KF_bpf_iter_num_new, KF_bpf_iter_num_next, KF_bpf_iter_num_destroy, + KF_bpf_set_dentry_xattr, + KF_bpf_remove_dentry_xattr, + KF_bpf_res_spin_lock, + KF_bpf_res_spin_unlock, + KF_bpf_res_spin_lock_irqsave, + KF_bpf_res_spin_unlock_irqrestore, + KF___bpf_trap, }; -BTF_SET_START(special_kfunc_set) -BTF_ID(func, bpf_obj_new_impl) -BTF_ID(func, bpf_obj_drop_impl) -BTF_ID(func, bpf_refcount_acquire_impl) -BTF_ID(func, bpf_list_push_front_impl) -BTF_ID(func, bpf_list_push_back_impl) -BTF_ID(func, bpf_list_pop_front) -BTF_ID(func, bpf_list_pop_back) -BTF_ID(func, bpf_cast_to_kern_ctx) -BTF_ID(func, bpf_rdonly_cast) -BTF_ID(func, bpf_rbtree_remove) -BTF_ID(func, bpf_rbtree_add_impl) -BTF_ID(func, bpf_rbtree_first) -#ifdef CONFIG_NET -BTF_ID(func, bpf_dynptr_from_skb) -BTF_ID(func, bpf_dynptr_from_xdp) -#endif -BTF_ID(func, bpf_dynptr_slice) -BTF_ID(func, bpf_dynptr_slice_rdwr) -BTF_ID(func, bpf_dynptr_clone) -BTF_ID(func, 
bpf_percpu_obj_new_impl) -BTF_ID(func, bpf_percpu_obj_drop_impl) -BTF_ID(func, bpf_throw) -BTF_ID(func, bpf_wq_set_callback_impl) -#ifdef CONFIG_CGROUPS -BTF_ID(func, bpf_iter_css_task_new) -#endif -BTF_SET_END(special_kfunc_set) - BTF_ID_LIST(special_kfunc_list) BTF_ID(func, bpf_obj_new_impl) BTF_ID(func, bpf_obj_drop_impl) @@ -11820,6 +12118,8 @@ BTF_ID(func, bpf_list_push_front_impl) BTF_ID(func, bpf_list_push_back_impl) BTF_ID(func, bpf_list_pop_front) BTF_ID(func, bpf_list_pop_back) +BTF_ID(func, bpf_list_front) +BTF_ID(func, bpf_list_back) BTF_ID(func, bpf_cast_to_kern_ctx) BTF_ID(func, bpf_rdonly_cast) BTF_ID(func, bpf_rcu_read_lock) @@ -11827,6 +12127,9 @@ BTF_ID(func, bpf_rcu_read_unlock) BTF_ID(func, bpf_rbtree_remove) BTF_ID(func, bpf_rbtree_add_impl) BTF_ID(func, bpf_rbtree_first) +BTF_ID(func, bpf_rbtree_root) +BTF_ID(func, bpf_rbtree_left) +BTF_ID(func, bpf_rbtree_right) #ifdef CONFIG_NET BTF_ID(func, bpf_dynptr_from_skb) BTF_ID(func, bpf_dynptr_from_xdp) @@ -11859,6 +12162,18 @@ BTF_ID(func, bpf_local_irq_restore) BTF_ID(func, bpf_iter_num_new) BTF_ID(func, bpf_iter_num_next) BTF_ID(func, bpf_iter_num_destroy) +#ifdef CONFIG_BPF_LSM +BTF_ID(func, bpf_set_dentry_xattr) +BTF_ID(func, bpf_remove_dentry_xattr) +#else +BTF_ID_UNUSED +BTF_ID_UNUSED +#endif +BTF_ID(func, bpf_res_spin_lock) +BTF_ID(func, bpf_res_spin_unlock) +BTF_ID(func, bpf_res_spin_lock_irqsave) +BTF_ID(func, bpf_res_spin_unlock_irqrestore) +BTF_ID(func, __bpf_trap) static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta) { @@ -11952,6 +12267,9 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, if (is_kfunc_arg_irq_flag(meta->btf, &args[argno])) return KF_ARG_PTR_TO_IRQ_FLAG; + if (is_kfunc_arg_res_spin_lock(meta->btf, &args[argno])) + return KF_ARG_PTR_TO_RES_SPIN_LOCK; + if ((base_type(reg->type) == PTR_TO_BTF_ID || reg2btf_ids[base_type(reg->type)])) { if (!btf_type_is_struct(ref_t)) { verbose(env, "kernel function %s args#%d pointer type %s %s is not supported\n", @@ 
-12059,13 +12377,19 @@ static int process_irq_flag(struct bpf_verifier_env *env, int regno, struct bpf_kfunc_call_arg_meta *meta) { struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno]; + int err, kfunc_class = IRQ_NATIVE_KFUNC; bool irq_save; - int err; - if (meta->func_id == special_kfunc_list[KF_bpf_local_irq_save]) { + if (meta->func_id == special_kfunc_list[KF_bpf_local_irq_save] || + meta->func_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave]) { irq_save = true; - } else if (meta->func_id == special_kfunc_list[KF_bpf_local_irq_restore]) { + if (meta->func_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave]) + kfunc_class = IRQ_LOCK_KFUNC; + } else if (meta->func_id == special_kfunc_list[KF_bpf_local_irq_restore] || + meta->func_id == special_kfunc_list[KF_bpf_res_spin_unlock_irqrestore]) { irq_save = false; + if (meta->func_id == special_kfunc_list[KF_bpf_res_spin_unlock_irqrestore]) + kfunc_class = IRQ_LOCK_KFUNC; } else { verbose(env, "verifier internal error: unknown irq flags kfunc\n"); return -EFAULT; @@ -12081,7 +12405,7 @@ static int process_irq_flag(struct bpf_verifier_env *env, int regno, if (err) return err; - err = mark_stack_slot_irq_flag(env, meta, reg, env->insn_idx); + err = mark_stack_slot_irq_flag(env, meta, reg, env->insn_idx, kfunc_class); if (err) return err; } else { @@ -12095,7 +12419,7 @@ static int process_irq_flag(struct bpf_verifier_env *env, int regno, if (err) return err; - err = unmark_stack_slot_irq_flag(env, reg); + err = unmark_stack_slot_irq_flag(env, reg, kfunc_class); if (err) return err; } @@ -12222,7 +12546,7 @@ static int check_reg_allocation_locked(struct bpf_verifier_env *env, struct bpf_ if (!env->cur_state->active_locks) return -EINVAL; - s = find_lock_state(env->cur_state, REF_TYPE_LOCK, id, ptr); + s = find_lock_state(env->cur_state, REF_TYPE_LOCK_MASK, id, ptr); if (!s) { verbose(env, "held lock and object are not in the same allocation\n"); return -EINVAL; @@ -12235,14 +12559,19 @@ static bool 
is_bpf_list_api_kfunc(u32 btf_id) return btf_id == special_kfunc_list[KF_bpf_list_push_front_impl] || btf_id == special_kfunc_list[KF_bpf_list_push_back_impl] || btf_id == special_kfunc_list[KF_bpf_list_pop_front] || - btf_id == special_kfunc_list[KF_bpf_list_pop_back]; + btf_id == special_kfunc_list[KF_bpf_list_pop_back] || + btf_id == special_kfunc_list[KF_bpf_list_front] || + btf_id == special_kfunc_list[KF_bpf_list_back]; } static bool is_bpf_rbtree_api_kfunc(u32 btf_id) { return btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl] || btf_id == special_kfunc_list[KF_bpf_rbtree_remove] || - btf_id == special_kfunc_list[KF_bpf_rbtree_first]; + btf_id == special_kfunc_list[KF_bpf_rbtree_first] || + btf_id == special_kfunc_list[KF_bpf_rbtree_root] || + btf_id == special_kfunc_list[KF_bpf_rbtree_left] || + btf_id == special_kfunc_list[KF_bpf_rbtree_right]; } static bool is_bpf_iter_num_api_kfunc(u32 btf_id) @@ -12258,9 +12587,18 @@ static bool is_bpf_graph_api_kfunc(u32 btf_id) btf_id == special_kfunc_list[KF_bpf_refcount_acquire_impl]; } +static bool is_bpf_res_spin_lock_kfunc(u32 btf_id) +{ + return btf_id == special_kfunc_list[KF_bpf_res_spin_lock] || + btf_id == special_kfunc_list[KF_bpf_res_spin_unlock] || + btf_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave] || + btf_id == special_kfunc_list[KF_bpf_res_spin_unlock_irqrestore]; +} + static bool kfunc_spin_allowed(u32 btf_id) { - return is_bpf_graph_api_kfunc(btf_id) || is_bpf_iter_num_api_kfunc(btf_id); + return is_bpf_graph_api_kfunc(btf_id) || is_bpf_iter_num_api_kfunc(btf_id) || + is_bpf_res_spin_lock_kfunc(btf_id); } static bool is_sync_callback_calling_kfunc(u32 btf_id) @@ -12333,7 +12671,9 @@ static bool check_kfunc_is_graph_node_api(struct bpf_verifier_env *env, break; case BPF_RB_NODE: ret = (kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_remove] || - kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl]); + kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl] || + kfunc_btf_id == 
special_kfunc_list[KF_bpf_rbtree_left] || + kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_right]); break; default: verbose(env, "verifier internal error: unexpected graph node argument type %s\n", @@ -12553,6 +12893,17 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ if (is_kfunc_arg_ignore(btf, &args[i])) continue; + if (is_kfunc_arg_prog(btf, &args[i])) { + /* Used to reject repeated use of __prog. */ + if (meta->arg_prog) { + verbose(env, "Only 1 prog->aux argument supported per-kfunc\n"); + return -EFAULT; + } + meta->arg_prog = true; + cur_aux(env)->arg_prog = regno; + continue; + } + if (btf_type_is_scalar(t)) { if (reg->type != SCALAR_VALUE) { verbose(env, "R%d is not a scalar\n", regno); @@ -12692,6 +13043,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ case KF_ARG_PTR_TO_CONST_STR: case KF_ARG_PTR_TO_WORKQUEUE: case KF_ARG_PTR_TO_IRQ_FLAG: + case KF_ARG_PTR_TO_RES_SPIN_LOCK: break; default: WARN_ON_ONCE(1); @@ -12846,22 +13198,22 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ return ret; break; case KF_ARG_PTR_TO_RB_NODE: - if (meta->func_id == special_kfunc_list[KF_bpf_rbtree_remove]) { - if (!type_is_non_owning_ref(reg->type) || reg->ref_obj_id) { - verbose(env, "rbtree_remove node input must be non-owning ref\n"); + if (meta->func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) { + if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) { + verbose(env, "arg#%d expected pointer to allocated object\n", i); return -EINVAL; } - if (in_rbtree_lock_required_cb(env)) { - verbose(env, "rbtree_remove not allowed in rbtree cb\n"); + if (!reg->ref_obj_id) { + verbose(env, "allocated object must be referenced\n"); return -EINVAL; } } else { - if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) { - verbose(env, "arg#%d expected pointer to allocated object\n", i); + if (!type_is_non_owning_ref(reg->type) && !reg->ref_obj_id) { + verbose(env, "%s can only take 
non-owning or refcounted bpf_rb_node pointer\n", func_name); return -EINVAL; } - if (!reg->ref_obj_id) { - verbose(env, "allocated object must be referenced\n"); + if (in_rbtree_lock_required_cb(env)) { + verbose(env, "%s not allowed in rbtree cb\n", func_name); return -EINVAL; } } @@ -12990,6 +13342,28 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ if (ret < 0) return ret; break; + case KF_ARG_PTR_TO_RES_SPIN_LOCK: + { + int flags = PROCESS_RES_LOCK; + + if (reg->type != PTR_TO_MAP_VALUE && reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) { + verbose(env, "arg#%d doesn't point to map value or allocated object\n", i); + return -EINVAL; + } + + if (!is_bpf_res_spin_lock_kfunc(meta->func_id)) + return -EFAULT; + if (meta->func_id == special_kfunc_list[KF_bpf_res_spin_lock] || + meta->func_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave]) + flags |= PROCESS_SPIN_LOCK; + if (meta->func_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave] || + meta->func_id == special_kfunc_list[KF_bpf_res_spin_unlock_irqrestore]) + flags |= PROCESS_LOCK_IRQ; + ret = process_spin_lock(env, regno, flags); + if (ret < 0) + return ret; + break; + } } } @@ -13044,6 +13418,178 @@ static int fetch_kfunc_meta(struct bpf_verifier_env *env, return 0; } +/* check special kfuncs and return: + * 1 - not fall-through to 'else' branch, continue verification + * 0 - fall-through to 'else' branch + * < 0 - not fall-through to 'else' branch, return error + */ +static int check_special_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_call_arg_meta *meta, + struct bpf_reg_state *regs, struct bpf_insn_aux_data *insn_aux, + const struct btf_type *ptr_type, struct btf *desc_btf) +{ + const struct btf_type *ret_t; + int err = 0; + + if (meta->btf != btf_vmlinux) + return 0; + + if (meta->func_id == special_kfunc_list[KF_bpf_obj_new_impl] || + meta->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) { + struct btf_struct_meta *struct_meta; + struct btf 
*ret_btf; + u32 ret_btf_id; + + if (meta->func_id == special_kfunc_list[KF_bpf_obj_new_impl] && !bpf_global_ma_set) + return -ENOMEM; + + if (((u64)(u32)meta->arg_constant.value) != meta->arg_constant.value) { + verbose(env, "local type ID argument must be in range [0, U32_MAX]\n"); + return -EINVAL; + } + + ret_btf = env->prog->aux->btf; + ret_btf_id = meta->arg_constant.value; + + /* This may be NULL due to user not supplying a BTF */ + if (!ret_btf) { + verbose(env, "bpf_obj_new/bpf_percpu_obj_new requires prog BTF\n"); + return -EINVAL; + } + + ret_t = btf_type_by_id(ret_btf, ret_btf_id); + if (!ret_t || !__btf_type_is_struct(ret_t)) { + verbose(env, "bpf_obj_new/bpf_percpu_obj_new type ID argument must be of a struct\n"); + return -EINVAL; + } + + if (meta->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) { + if (ret_t->size > BPF_GLOBAL_PERCPU_MA_MAX_SIZE) { + verbose(env, "bpf_percpu_obj_new type size (%d) is greater than %d\n", + ret_t->size, BPF_GLOBAL_PERCPU_MA_MAX_SIZE); + return -EINVAL; + } + + if (!bpf_global_percpu_ma_set) { + mutex_lock(&bpf_percpu_ma_lock); + if (!bpf_global_percpu_ma_set) { + /* Charge memory allocated with bpf_global_percpu_ma to + * root memcg. The obj_cgroup for root memcg is NULL. 
+ */ + err = bpf_mem_alloc_percpu_init(&bpf_global_percpu_ma, NULL); + if (!err) + bpf_global_percpu_ma_set = true; + } + mutex_unlock(&bpf_percpu_ma_lock); + if (err) + return err; + } + + mutex_lock(&bpf_percpu_ma_lock); + err = bpf_mem_alloc_percpu_unit_init(&bpf_global_percpu_ma, ret_t->size); + mutex_unlock(&bpf_percpu_ma_lock); + if (err) + return err; + } + + struct_meta = btf_find_struct_meta(ret_btf, ret_btf_id); + if (meta->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) { + if (!__btf_type_is_scalar_struct(env, ret_btf, ret_t, 0)) { + verbose(env, "bpf_percpu_obj_new type ID argument must be of a struct of scalars\n"); + return -EINVAL; + } + + if (struct_meta) { + verbose(env, "bpf_percpu_obj_new type ID argument must not contain special fields\n"); + return -EINVAL; + } + } + + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC; + regs[BPF_REG_0].btf = ret_btf; + regs[BPF_REG_0].btf_id = ret_btf_id; + if (meta->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) + regs[BPF_REG_0].type |= MEM_PERCPU; + + insn_aux->obj_new_size = ret_t->size; + insn_aux->kptr_struct_meta = struct_meta; + } else if (meta->func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl]) { + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC; + regs[BPF_REG_0].btf = meta->arg_btf; + regs[BPF_REG_0].btf_id = meta->arg_btf_id; + + insn_aux->kptr_struct_meta = + btf_find_struct_meta(meta->arg_btf, + meta->arg_btf_id); + } else if (is_list_node_type(ptr_type)) { + struct btf_field *field = meta->arg_list_head.field; + + mark_reg_graph_node(regs, BPF_REG_0, &field->graph_root); + } else if (is_rbtree_node_type(ptr_type)) { + struct btf_field *field = meta->arg_rbtree_root.field; + + mark_reg_graph_node(regs, BPF_REG_0, &field->graph_root); + } else if (meta->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx]) { + mark_reg_known_zero(env, regs, BPF_REG_0); + 
regs[BPF_REG_0].type = PTR_TO_BTF_ID | PTR_TRUSTED; + regs[BPF_REG_0].btf = desc_btf; + regs[BPF_REG_0].btf_id = meta->ret_btf_id; + } else if (meta->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) { + ret_t = btf_type_by_id(desc_btf, meta->arg_constant.value); + if (!ret_t || !btf_type_is_struct(ret_t)) { + verbose(env, + "kfunc bpf_rdonly_cast type ID argument must be of a struct\n"); + return -EINVAL; + } + + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].type = PTR_TO_BTF_ID | PTR_UNTRUSTED; + regs[BPF_REG_0].btf = desc_btf; + regs[BPF_REG_0].btf_id = meta->arg_constant.value; + } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_slice] || + meta->func_id == special_kfunc_list[KF_bpf_dynptr_slice_rdwr]) { + enum bpf_type_flag type_flag = get_dynptr_type_flag(meta->initialized_dynptr.type); + + mark_reg_known_zero(env, regs, BPF_REG_0); + + if (!meta->arg_constant.found) { + verbose(env, "verifier internal error: bpf_dynptr_slice(_rdwr) no constant size\n"); + return -EFAULT; + } + + regs[BPF_REG_0].mem_size = meta->arg_constant.value; + + /* PTR_MAYBE_NULL will be added when is_kfunc_ret_null is checked */ + regs[BPF_REG_0].type = PTR_TO_MEM | type_flag; + + if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_slice]) { + regs[BPF_REG_0].type |= MEM_RDONLY; + } else { + /* this will set env->seen_direct_write to true */ + if (!may_access_direct_pkt_data(env, NULL, BPF_WRITE)) { + verbose(env, "the prog does not allow writes to packet data\n"); + return -EINVAL; + } + } + + if (!meta->initialized_dynptr.id) { + verbose(env, "verifier internal error: no dynptr id\n"); + return -EFAULT; + } + regs[BPF_REG_0].dynptr_id = meta->initialized_dynptr.id; + + /* we don't need to set BPF_REG_0's ref obj id + * because packet slices are not refcounted (see + * dynptr_type_refcounted) + */ + } else { + return 0; + } + + return 1; +} + static int check_return_code(struct bpf_verifier_env *env, int regno, const char *reg_name); static int 
check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, @@ -13058,7 +13604,6 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, struct bpf_insn_aux_data *insn_aux; int err, insn_idx = *insn_idx_p; const struct btf_param *args; - const struct btf_type *ret_t; struct btf *desc_btf; /* skip for now, but return error when we find this in fixup_kfunc_call */ @@ -13075,6 +13620,36 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, insn_aux->is_iter_next = is_iter_next_kfunc(&meta); + if (!insn->off && + (insn->imm == special_kfunc_list[KF_bpf_res_spin_lock] || + insn->imm == special_kfunc_list[KF_bpf_res_spin_lock_irqsave])) { + struct bpf_verifier_state *branch; + struct bpf_reg_state *regs; + + branch = push_stack(env, env->insn_idx + 1, env->insn_idx, false); + if (!branch) { + verbose(env, "failed to push state for failed lock acquisition\n"); + return -ENOMEM; + } + + regs = branch->frame[branch->curframe]->regs; + + /* Clear r0-r5 registers in forked state */ + for (i = 0; i < CALLER_SAVED_REGS; i++) + mark_reg_not_init(env, regs, caller_saved[i]); + + mark_reg_unknown(env, regs, BPF_REG_0); + err = __mark_reg_s32_range(env, regs, BPF_REG_0, -MAX_ERRNO, -1); + if (err) { + verbose(env, "failed to mark s32 range for retval in forked state for lock\n"); + return err; + } + __mark_btf_func_reg_size(env, regs, BPF_REG_0, sizeof(u32)); + } else if (!insn->off && insn->imm == special_kfunc_list[KF___bpf_trap]) { + verbose(env, "unexpected __bpf_trap() due to uninitialized variable?\n"); + return -EFAULT; + } + if (is_kfunc_destructive(&meta) && !capable(CAP_SYS_BOOT)) { verbose(env, "destructive kfunc calls require CAP_SYS_BOOT capability\n"); return -EACCES; @@ -13245,168 +13820,16 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (btf_type_is_scalar(t)) { mark_reg_unknown(env, regs, BPF_REG_0); + if (meta.btf == btf_vmlinux && (meta.func_id == 
special_kfunc_list[KF_bpf_res_spin_lock] || + meta.func_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave])) + __mark_reg_const_zero(env, &regs[BPF_REG_0]); mark_btf_func_reg_size(env, BPF_REG_0, t->size); } else if (btf_type_is_ptr(t)) { ptr_type = btf_type_skip_modifiers(desc_btf, t->type, &ptr_type_id); - - if (meta.btf == btf_vmlinux && btf_id_set_contains(&special_kfunc_set, meta.func_id)) { - if (meta.func_id == special_kfunc_list[KF_bpf_obj_new_impl] || - meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) { - struct btf_struct_meta *struct_meta; - struct btf *ret_btf; - u32 ret_btf_id; - - if (meta.func_id == special_kfunc_list[KF_bpf_obj_new_impl] && !bpf_global_ma_set) - return -ENOMEM; - - if (((u64)(u32)meta.arg_constant.value) != meta.arg_constant.value) { - verbose(env, "local type ID argument must be in range [0, U32_MAX]\n"); - return -EINVAL; - } - - ret_btf = env->prog->aux->btf; - ret_btf_id = meta.arg_constant.value; - - /* This may be NULL due to user not supplying a BTF */ - if (!ret_btf) { - verbose(env, "bpf_obj_new/bpf_percpu_obj_new requires prog BTF\n"); - return -EINVAL; - } - - ret_t = btf_type_by_id(ret_btf, ret_btf_id); - if (!ret_t || !__btf_type_is_struct(ret_t)) { - verbose(env, "bpf_obj_new/bpf_percpu_obj_new type ID argument must be of a struct\n"); - return -EINVAL; - } - - if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) { - if (ret_t->size > BPF_GLOBAL_PERCPU_MA_MAX_SIZE) { - verbose(env, "bpf_percpu_obj_new type size (%d) is greater than %d\n", - ret_t->size, BPF_GLOBAL_PERCPU_MA_MAX_SIZE); - return -EINVAL; - } - - if (!bpf_global_percpu_ma_set) { - mutex_lock(&bpf_percpu_ma_lock); - if (!bpf_global_percpu_ma_set) { - /* Charge memory allocated with bpf_global_percpu_ma to - * root memcg. The obj_cgroup for root memcg is NULL. 
- */ - err = bpf_mem_alloc_percpu_init(&bpf_global_percpu_ma, NULL); - if (!err) - bpf_global_percpu_ma_set = true; - } - mutex_unlock(&bpf_percpu_ma_lock); - if (err) - return err; - } - - mutex_lock(&bpf_percpu_ma_lock); - err = bpf_mem_alloc_percpu_unit_init(&bpf_global_percpu_ma, ret_t->size); - mutex_unlock(&bpf_percpu_ma_lock); - if (err) - return err; - } - - struct_meta = btf_find_struct_meta(ret_btf, ret_btf_id); - if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) { - if (!__btf_type_is_scalar_struct(env, ret_btf, ret_t, 0)) { - verbose(env, "bpf_percpu_obj_new type ID argument must be of a struct of scalars\n"); - return -EINVAL; - } - - if (struct_meta) { - verbose(env, "bpf_percpu_obj_new type ID argument must not contain special fields\n"); - return -EINVAL; - } - } - - mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC; - regs[BPF_REG_0].btf = ret_btf; - regs[BPF_REG_0].btf_id = ret_btf_id; - if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) - regs[BPF_REG_0].type |= MEM_PERCPU; - - insn_aux->obj_new_size = ret_t->size; - insn_aux->kptr_struct_meta = struct_meta; - } else if (meta.func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl]) { - mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC; - regs[BPF_REG_0].btf = meta.arg_btf; - regs[BPF_REG_0].btf_id = meta.arg_btf_id; - - insn_aux->kptr_struct_meta = - btf_find_struct_meta(meta.arg_btf, - meta.arg_btf_id); - } else if (meta.func_id == special_kfunc_list[KF_bpf_list_pop_front] || - meta.func_id == special_kfunc_list[KF_bpf_list_pop_back]) { - struct btf_field *field = meta.arg_list_head.field; - - mark_reg_graph_node(regs, BPF_REG_0, &field->graph_root); - } else if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_remove] || - meta.func_id == special_kfunc_list[KF_bpf_rbtree_first]) { - struct btf_field *field = meta.arg_rbtree_root.field; - - mark_reg_graph_node(regs, 
BPF_REG_0, &field->graph_root); - } else if (meta.func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx]) { - mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].type = PTR_TO_BTF_ID | PTR_TRUSTED; - regs[BPF_REG_0].btf = desc_btf; - regs[BPF_REG_0].btf_id = meta.ret_btf_id; - } else if (meta.func_id == special_kfunc_list[KF_bpf_rdonly_cast]) { - ret_t = btf_type_by_id(desc_btf, meta.arg_constant.value); - if (!ret_t || !btf_type_is_struct(ret_t)) { - verbose(env, - "kfunc bpf_rdonly_cast type ID argument must be of a struct\n"); - return -EINVAL; - } - - mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].type = PTR_TO_BTF_ID | PTR_UNTRUSTED; - regs[BPF_REG_0].btf = desc_btf; - regs[BPF_REG_0].btf_id = meta.arg_constant.value; - } else if (meta.func_id == special_kfunc_list[KF_bpf_dynptr_slice] || - meta.func_id == special_kfunc_list[KF_bpf_dynptr_slice_rdwr]) { - enum bpf_type_flag type_flag = get_dynptr_type_flag(meta.initialized_dynptr.type); - - mark_reg_known_zero(env, regs, BPF_REG_0); - - if (!meta.arg_constant.found) { - verbose(env, "verifier internal error: bpf_dynptr_slice(_rdwr) no constant size\n"); - return -EFAULT; - } - - regs[BPF_REG_0].mem_size = meta.arg_constant.value; - - /* PTR_MAYBE_NULL will be added when is_kfunc_ret_null is checked */ - regs[BPF_REG_0].type = PTR_TO_MEM | type_flag; - - if (meta.func_id == special_kfunc_list[KF_bpf_dynptr_slice]) { - regs[BPF_REG_0].type |= MEM_RDONLY; - } else { - /* this will set env->seen_direct_write to true */ - if (!may_access_direct_pkt_data(env, NULL, BPF_WRITE)) { - verbose(env, "the prog does not allow writes to packet data\n"); - return -EINVAL; - } - } - - if (!meta.initialized_dynptr.id) { - verbose(env, "verifier internal error: no dynptr id\n"); - return -EFAULT; - } - regs[BPF_REG_0].dynptr_id = meta.initialized_dynptr.id; - - /* we don't need to set BPF_REG_0's ref obj id - * because packet slices are not refcounted (see - * dynptr_type_refcounted) - */ - } else { - 
verbose(env, "kernel function %s unhandled dynamic return type\n", - meta.func_name); - return -EFAULT; - } + err = check_special_kfunc(env, &meta, regs, insn_aux, ptr_type, desc_btf); + if (err) { + if (err < 0) + return err; } else if (btf_type_is_void(ptr_type)) { /* kfunc returning 'void *' is equivalent to returning scalar */ mark_reg_unknown(env, regs, BPF_REG_0); @@ -13475,14 +13898,14 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (is_kfunc_ret_null(&meta)) regs[BPF_REG_0].id = id; regs[BPF_REG_0].ref_obj_id = id; - } else if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_first]) { + } else if (is_rbtree_node_type(ptr_type) || is_list_node_type(ptr_type)) { ref_set_non_owning(env, &regs[BPF_REG_0]); } if (reg_may_point_to_spin_lock(&regs[BPF_REG_0]) && !regs[BPF_REG_0].id) regs[BPF_REG_0].id = ++env->id_gen; } else if (btf_type_is_void(t)) { - if (meta.btf == btf_vmlinux && btf_id_set_contains(&special_kfunc_set, meta.func_id)) { + if (meta.btf == btf_vmlinux) { if (meta.func_id == special_kfunc_list[KF_bpf_obj_drop_impl] || meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_drop_impl]) { insn_aux->kptr_struct_meta = @@ -15971,6 +16394,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, struct bpf_reg_state *eq_branch_regs; struct linked_regs linked_regs = {}; u8 opcode = BPF_OP(insn->code); + int insn_flags = 0; bool is_jmp32; int pred = -1; int err; @@ -16029,6 +16453,11 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, insn->src_reg); return -EACCES; } + + if (src_reg->type == PTR_TO_STACK) + insn_flags |= INSN_F_SRC_REG_STACK; + if (dst_reg->type == PTR_TO_STACK) + insn_flags |= INSN_F_DST_REG_STACK; } else { if (insn->src_reg != BPF_REG_0) { verbose(env, "BPF_JMP/JMP32 uses reserved fields\n"); @@ -16038,6 +16467,15 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, memset(src_reg, 0, sizeof(*src_reg)); src_reg->type = SCALAR_VALUE; __mark_reg_known(src_reg, insn->imm); + + if 
(dst_reg->type == PTR_TO_STACK) + insn_flags |= INSN_F_DST_REG_STACK; + } + + if (insn_flags) { + err = push_insn_history(env, this_branch, insn_flags, 0); + if (err) + return err; } is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32; @@ -16399,13 +16837,14 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char const char *exit_ctx = "At program exit"; struct tnum enforce_attach_type_range = tnum_unknown; const struct bpf_prog *prog = env->prog; - struct bpf_reg_state *reg; + struct bpf_reg_state *reg = reg_state(env, regno); struct bpf_retval_range range = retval_range(0, 1); enum bpf_prog_type prog_type = resolve_prog_type(env->prog); int err; struct bpf_func_state *frame = env->cur_state->frame[0]; const bool is_subprog = frame->subprogno; bool return_32bit = false; + const struct btf_type *reg_type, *ret_type = NULL; /* LSM and struct_ops func-ptr's return type could be "void" */ if (!is_subprog || frame->in_exception_callback_fn) { @@ -16414,10 +16853,26 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char if (prog->expected_attach_type == BPF_LSM_CGROUP) /* See below, can be 0 or 0-1 depending on hook. */ break; - fallthrough; + if (!prog->aux->attach_func_proto->type) + return 0; + break; case BPF_PROG_TYPE_STRUCT_OPS: if (!prog->aux->attach_func_proto->type) return 0; + + if (frame->in_exception_callback_fn) + break; + + /* Allow a struct_ops program to return a referenced kptr if it + * matches the operator's return type and is in its unmodified + * form. A scalar zero (i.e., a null pointer) is also allowed. + */ + reg_type = reg->btf ? 
btf_type_by_id(reg->btf, reg->btf_id) : NULL; + ret_type = btf_type_resolve_ptr(prog->aux->attach_btf, + prog->aux->attach_func_proto->type, + NULL); + if (ret_type && ret_type == reg_type && reg->ref_obj_id) + return __check_ptr_off_reg(env, reg, regno, false); break; default: break; @@ -16439,8 +16894,6 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char return -EACCES; } - reg = cur_regs(env) + regno; - if (frame->in_async_callback_fn) { /* enforce return zero from async callbacks like timer */ exit_ctx = "At async callback return"; @@ -16539,6 +16992,11 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char case BPF_PROG_TYPE_NETFILTER: range = retval_range(NF_DROP, NF_ACCEPT); break; + case BPF_PROG_TYPE_STRUCT_OPS: + if (!ret_type) + return 0; + range = retval_range(0, 0); + break; case BPF_PROG_TYPE_EXT: /* freplace program can return anything as its return value * depends on the to-be-replaced kernel func or bpf program. @@ -16582,6 +17040,14 @@ static void mark_subprog_changes_pkt_data(struct bpf_verifier_env *env, int off) subprog->changes_pkt_data = true; } +static void mark_subprog_might_sleep(struct bpf_verifier_env *env, int off) +{ + struct bpf_subprog_info *subprog; + + subprog = find_containing_subprog(env, off); + subprog->might_sleep = true; +} + /* 't' is an index of a call-site. * 'w' is a callee entry point. * Eventually this function would be called when env->cfg.insn_state[w] == EXPLORED. 
@@ -16595,6 +17061,7 @@ static void merge_callee_effects(struct bpf_verifier_env *env, int t, int w) caller = find_containing_subprog(env, t); callee = find_containing_subprog(env, w); caller->changes_pkt_data |= callee->changes_pkt_data; + caller->might_sleep |= callee->might_sleep; } /* non-recursive DFS pseudo code @@ -16753,27 +17220,6 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns, /* Bitmask with 1s for all caller saved registers */ #define ALL_CALLER_SAVED_REGS ((1u << CALLER_SAVED_REGS) - 1) -/* Return a bitmask specifying which caller saved registers are - * clobbered by a call to a helper *as if* this helper follows - * bpf_fastcall contract: - * - includes R0 if function is non-void; - * - includes R1-R5 if corresponding parameter has is described - * in the function prototype. - */ -static u32 helper_fastcall_clobber_mask(const struct bpf_func_proto *fn) -{ - u32 mask; - int i; - - mask = 0; - if (fn->ret_type != RET_VOID) - mask |= BIT(BPF_REG_0); - for (i = 0; i < ARRAY_SIZE(fn->arg_type); ++i) - if (fn->arg_type[i] != ARG_DONTCARE) - mask |= BIT(BPF_REG_1 + i); - return mask; -} - /* True if do_misc_fixups() replaces calls to helper number 'imm', * replacement patch is presumed to follow bpf_fastcall contract * (see mark_fastcall_pattern_for_call() below). @@ -16790,24 +17236,54 @@ static bool verifier_inlines_helper_call(struct bpf_verifier_env *env, s32 imm) } } -/* Same as helper_fastcall_clobber_mask() but for kfuncs, see comment above */ -static u32 kfunc_fastcall_clobber_mask(struct bpf_kfunc_call_arg_meta *meta) +struct call_summary { + u8 num_params; + bool is_void; + bool fastcall; +}; + +/* If @call is a kfunc or helper call, fills @cs and returns true, + * otherwise returns false. 
+ */ +static bool get_call_summary(struct bpf_verifier_env *env, struct bpf_insn *call, + struct call_summary *cs) { - u32 vlen, i, mask; + struct bpf_kfunc_call_arg_meta meta; + const struct bpf_func_proto *fn; + int i; - vlen = btf_type_vlen(meta->func_proto); - mask = 0; - if (!btf_type_is_void(btf_type_by_id(meta->btf, meta->func_proto->type))) - mask |= BIT(BPF_REG_0); - for (i = 0; i < vlen; ++i) - mask |= BIT(BPF_REG_1 + i); - return mask; -} + if (bpf_helper_call(call)) { -/* Same as verifier_inlines_helper_call() but for kfuncs, see comment above */ -static bool is_fastcall_kfunc_call(struct bpf_kfunc_call_arg_meta *meta) -{ - return meta->kfunc_flags & KF_FASTCALL; + if (get_helper_proto(env, call->imm, &fn) < 0) + /* error would be reported later */ + return false; + cs->fastcall = fn->allow_fastcall && + (verifier_inlines_helper_call(env, call->imm) || + bpf_jit_inlines_helper_call(call->imm)); + cs->is_void = fn->ret_type == RET_VOID; + cs->num_params = 0; + for (i = 0; i < ARRAY_SIZE(fn->arg_type); ++i) { + if (fn->arg_type[i] == ARG_DONTCARE) + break; + cs->num_params++; + } + return true; + } + + if (bpf_pseudo_kfunc_call(call)) { + int err; + + err = fetch_kfunc_meta(env, call, &meta, NULL); + if (err < 0) + /* error would be reported later */ + return false; + cs->num_params = btf_type_vlen(meta.func_proto); + cs->fastcall = meta.kfunc_flags & KF_FASTCALL; + cs->is_void = btf_type_is_void(btf_type_by_id(meta.btf, meta.func_proto->type)); + return true; + } + + return false; } /* LLVM define a bpf_fastcall function attribute. 
@@ -16890,39 +17366,23 @@ static void mark_fastcall_pattern_for_call(struct bpf_verifier_env *env, { struct bpf_insn *insns = env->prog->insnsi, *stx, *ldx; struct bpf_insn *call = &env->prog->insnsi[insn_idx]; - const struct bpf_func_proto *fn; - u32 clobbered_regs_mask = ALL_CALLER_SAVED_REGS; + u32 clobbered_regs_mask; + struct call_summary cs; u32 expected_regs_mask; - bool can_be_inlined = false; s16 off; int i; - if (bpf_helper_call(call)) { - if (get_helper_proto(env, call->imm, &fn) < 0) - /* error would be reported later */ - return; - clobbered_regs_mask = helper_fastcall_clobber_mask(fn); - can_be_inlined = fn->allow_fastcall && - (verifier_inlines_helper_call(env, call->imm) || - bpf_jit_inlines_helper_call(call->imm)); - } - - if (bpf_pseudo_kfunc_call(call)) { - struct bpf_kfunc_call_arg_meta meta; - int err; - - err = fetch_kfunc_meta(env, call, &meta, NULL); - if (err < 0) - /* error would be reported later */ - return; - - clobbered_regs_mask = kfunc_fastcall_clobber_mask(&meta); - can_be_inlined = is_fastcall_kfunc_call(&meta); - } - - if (clobbered_regs_mask == ALL_CALLER_SAVED_REGS) + if (!get_call_summary(env, call, &cs)) return; + /* A bitmask specifying which caller saved registers are clobbered + * by a call to a helper/kfunc *as if* this helper/kfunc follows + * bpf_fastcall contract: + * - includes R0 if function is non-void; + * - includes R1-R5 if corresponding parameter has is described + * in the function prototype. + */ + clobbered_regs_mask = GENMASK(cs.num_params, cs.is_void ? 1 : 0); /* e.g. if helper call clobbers r{0,1}, expect r{2,3,4,5} in the pattern */ expected_regs_mask = ~clobbered_regs_mask & ALL_CALLER_SAVED_REGS; @@ -16980,7 +17440,7 @@ static void mark_fastcall_pattern_for_call(struct bpf_verifier_env *env, * don't set 'fastcall_spills_num' for call B so that remove_fastcall_spills_fills() * does not remove spill/fill pair {4,6}. 
*/ - if (can_be_inlined) + if (cs.fastcall) env->insn_aux_data[insn_idx].fastcall_spills_num = i - 1; else subprog->keep_fastcall_stack = 1; @@ -17062,9 +17522,20 @@ static int visit_insn(int t, struct bpf_verifier_env *env) mark_prune_point(env, t); mark_jmp_point(env, t); } - if (bpf_helper_call(insn) && bpf_helper_changes_pkt_data(insn->imm)) - mark_subprog_changes_pkt_data(env, t); - if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) { + if (bpf_helper_call(insn)) { + const struct bpf_func_proto *fp; + + ret = get_helper_proto(env, insn->imm, &fp); + /* If called in a non-sleepable context program will be + * rejected anyway, so we should end up with precise + * sleepable marks on subprogs, except for dead code + * elimination. + */ + if (ret == 0 && fp->might_sleep) + mark_subprog_might_sleep(env, t); + if (bpf_helper_changes_pkt_data(insn->imm)) + mark_subprog_changes_pkt_data(env, t); + } else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) { struct bpf_kfunc_call_arg_meta meta; ret = fetch_kfunc_meta(env, insn, &meta, NULL); @@ -17083,6 +17554,13 @@ static int visit_insn(int t, struct bpf_verifier_env *env) */ mark_force_checkpoint(env, t); } + /* Same as helpers, if called in a non-sleepable context + * program will be rejected anyway, so we should end up + * with precise sleepable marks on subprogs, except for + * dead code elimination. 
+ */ + if (ret == 0 && is_kfunc_sleepable(&meta)) + mark_subprog_might_sleep(env, t); } return visit_func_call_insn(t, insns, env, insn->src_reg == BPF_PSEUDO_CALL); @@ -17125,9 +17603,8 @@ static int visit_insn(int t, struct bpf_verifier_env *env) static int check_cfg(struct bpf_verifier_env *env) { int insn_cnt = env->prog->len; - int *insn_stack, *insn_state; + int *insn_stack, *insn_state, *insn_postorder; int ex_insn_beg, i, ret = 0; - bool ex_done = false; insn_state = env->cfg.insn_state = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL); if (!insn_state) @@ -17139,6 +17616,17 @@ static int check_cfg(struct bpf_verifier_env *env) return -ENOMEM; } + insn_postorder = env->cfg.insn_postorder = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL); + if (!insn_postorder) { + kvfree(insn_state); + kvfree(insn_stack); + return -ENOMEM; + } + + ex_insn_beg = env->exception_callback_subprog + ? env->subprog_info[env->exception_callback_subprog].start + : 0; + insn_state[0] = DISCOVERED; /* mark 1st insn as discovered */ insn_stack[0] = 0; /* 0 is the first instruction */ env->cfg.cur_stack = 1; @@ -17152,6 +17640,7 @@ walk_cfg: case DONE_EXPLORING: insn_state[t] = EXPLORED; env->cfg.cur_stack--; + insn_postorder[env->cfg.cur_postorder++] = t; break; case KEEP_EXPLORING: break; @@ -17170,13 +17659,10 @@ walk_cfg: goto err_free; } - if (env->exception_callback_subprog && !ex_done) { - ex_insn_beg = env->subprog_info[env->exception_callback_subprog].start; - + if (ex_insn_beg && insn_state[ex_insn_beg] != EXPLORED) { insn_state[ex_insn_beg] = DISCOVERED; insn_stack[0] = ex_insn_beg; env->cfg.cur_stack = 1; - ex_done = true; goto walk_cfg; } @@ -17199,6 +17685,7 @@ walk_cfg: } ret = 0; /* cfg looks good */ env->prog->aux->changes_pkt_data = env->subprog_info[0].changes_pkt_data; + env->prog->aux->might_sleep = env->subprog_info[0].might_sleep; err_free: kvfree(insn_state); @@ -17815,18 +18302,22 @@ static void clean_verifier_state(struct bpf_verifier_env *env, static void 
clean_live_states(struct bpf_verifier_env *env, int insn, struct bpf_verifier_state *cur) { + struct bpf_verifier_state *loop_entry; struct bpf_verifier_state_list *sl; + struct list_head *pos, *head; - sl = *explored_state(env, insn); - while (sl) { + head = explored_state(env, insn); + list_for_each(pos, head) { + sl = container_of(pos, struct bpf_verifier_state_list, node); if (sl->state.branches) - goto next; + continue; + loop_entry = get_loop_entry(env, &sl->state); + if (!IS_ERR_OR_NULL(loop_entry) && loop_entry->branches) + continue; if (sl->state.insn_idx != insn || !same_callsites(&sl->state, cur)) - goto next; + continue; clean_verifier_state(env, &sl->state); -next: - sl = sl->next; } } @@ -18127,7 +18618,8 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old, case STACK_IRQ_FLAG: old_reg = &old->stack[spi].spilled_ptr; cur_reg = &cur->stack[spi].spilled_ptr; - if (!check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap)) + if (!check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap) || + old_reg->irq.kfunc_class != cur_reg->irq.kfunc_class) return false; break; case STACK_MISC: @@ -18162,6 +18654,10 @@ static bool refsafe(struct bpf_verifier_state *old, struct bpf_verifier_state *c if (!check_ids(old->active_irq_id, cur->active_irq_id, idmap)) return false; + if (!check_ids(old->active_lock_id, cur->active_lock_id, idmap) || + old->active_lock_ptr != cur->active_lock_ptr) + return false; + for (i = 0; i < old->acquired_refs; i++) { if (!check_ids(old->refs[i].id, cur->refs[i].id, idmap) || old->refs[i].type != cur->refs[i].type) @@ -18171,6 +18667,8 @@ static bool refsafe(struct bpf_verifier_state *old, struct bpf_verifier_state *c case REF_TYPE_IRQ: break; case REF_TYPE_LOCK: + case REF_TYPE_RES_LOCK: + case REF_TYPE_RES_LOCK_IRQ: if (old->refs[i].ptr != cur->refs[i].ptr) return false; break; @@ -18210,15 +18708,17 @@ static bool refsafe(struct bpf_verifier_state *old, struct bpf_verifier_state *c * the current state 
will reach 'bpf_exit' instruction safely */ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_state *old, - struct bpf_func_state *cur, enum exact_level exact) + struct bpf_func_state *cur, u32 insn_idx, enum exact_level exact) { - int i; + u16 live_regs = env->insn_aux_data[insn_idx].live_regs_before; + u16 i; if (old->callback_depth > cur->callback_depth) return false; for (i = 0; i < MAX_BPF_REG; i++) - if (!regsafe(env, &old->regs[i], &cur->regs[i], + if (((1 << i) & live_regs) && + !regsafe(env, &old->regs[i], &cur->regs[i], &env->idmap_scratch, exact)) return false; @@ -18239,6 +18739,7 @@ static bool states_equal(struct bpf_verifier_env *env, struct bpf_verifier_state *cur, enum exact_level exact) { + u32 insn_idx; int i; if (old->curframe != cur->curframe) @@ -18262,9 +18763,12 @@ static bool states_equal(struct bpf_verifier_env *env, * and all frame states need to be equivalent */ for (i = 0; i <= old->curframe; i++) { + insn_idx = i == old->curframe + ? 
env->insn_idx + : old->frame[i + 1]->callsite; if (old->frame[i]->callsite != cur->frame[i]->callsite) return false; - if (!func_states_equal(env, old->frame[i], cur->frame[i], exact)) + if (!func_states_equal(env, old->frame[i], cur->frame[i], insn_idx, exact)) return false; } return true; @@ -18517,10 +19021,11 @@ static bool iter_active_depths_differ(struct bpf_verifier_state *old, struct bpf static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) { struct bpf_verifier_state_list *new_sl; - struct bpf_verifier_state_list *sl, **pprev; + struct bpf_verifier_state_list *sl; struct bpf_verifier_state *cur = env->cur_state, *new, *loop_entry; int i, j, n, err, states_cnt = 0; bool force_new_state, add_new_state, force_exact; + struct list_head *pos, *tmp, *head; force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx) || /* Avoid accumulating infinitely long jmp history */ @@ -18539,15 +19044,14 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) env->insn_processed - env->prev_insn_processed >= 8) add_new_state = true; - pprev = explored_state(env, insn_idx); - sl = *pprev; - clean_live_states(env, insn_idx, cur); - while (sl) { + head = explored_state(env, insn_idx); + list_for_each_safe(pos, tmp, head) { + sl = container_of(pos, struct bpf_verifier_state_list, node); states_cnt++; if (sl->state.insn_idx != insn_idx) - goto next; + continue; if (sl->state.branches) { struct bpf_func_state *frame = sl->state.frame[sl->state.curframe]; @@ -18621,7 +19125,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) spi = __get_spi(iter_reg->off + iter_reg->var_off.value); iter_state = &func(env, iter_reg)->stack[spi].spilled_ptr; if (iter_state->iter.state == BPF_ITER_STATE_ACTIVE) { - update_loop_entry(cur, &sl->state); + update_loop_entry(env, cur, &sl->state); goto hit; } } @@ -18630,7 +19134,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) if 
(is_may_goto_insn_at(env, insn_idx)) { if (sl->state.may_goto_depth != cur->may_goto_depth && states_equal(env, &sl->state, cur, RANGE_WITHIN)) { - update_loop_entry(cur, &sl->state); + update_loop_entry(env, cur, &sl->state); goto hit; } } @@ -18697,11 +19201,13 @@ skip_inf_loop_check: * * Additional details are in the comment before get_loop_entry(). */ - loop_entry = get_loop_entry(&sl->state); + loop_entry = get_loop_entry(env, &sl->state); + if (IS_ERR(loop_entry)) + return PTR_ERR(loop_entry); force_exact = loop_entry && loop_entry->branches > 0; if (states_equal(env, &sl->state, cur, force_exact ? RANGE_WITHIN : NOT_EXACT)) { if (force_exact) - update_loop_entry(cur, loop_entry); + update_loop_entry(env, cur, loop_entry); hit: sl->hit_cnt++; /* reached equivalent register/stack state, @@ -18750,31 +19256,13 @@ miss: /* the state is unlikely to be useful. Remove it to * speed up verification */ - *pprev = sl->next; - if (sl->state.frame[0]->regs[0].live & REG_LIVE_DONE && - !sl->state.used_as_loop_entry) { - u32 br = sl->state.branches; - - WARN_ONCE(br, - "BUG live_done but branches_to_explore %d\n", - br); - free_verifier_state(&sl->state, false); - kfree(sl); - env->peak_states--; - } else { - /* cannot free this state, since parentage chain may - * walk it later. 
Add it for free_list instead to - * be freed at the end of verification - */ - sl->next = env->free_list; - env->free_list = sl; - } - sl = *pprev; - continue; + sl->in_free_list = true; + list_del(&sl->node); + list_add(&sl->node, &env->free_list); + env->free_list_size++; + env->explored_states_size--; + maybe_free_verifier_state(env, sl); } -next: - pprev = &sl->next; - sl = *pprev; } if (env->max_states_per_insn < states_cnt) @@ -18799,7 +19287,8 @@ next: if (!new_sl) return -ENOMEM; env->total_states++; - env->peak_states++; + env->explored_states_size++; + update_peak_states(env); env->prev_jmps_processed = env->jmps_processed; env->prev_insn_processed = env->insn_processed; @@ -18823,8 +19312,8 @@ next: cur->first_insn_idx = insn_idx; cur->insn_hist_start = cur->insn_hist_end; cur->dfs_depth = new->dfs_depth + 1; - new_sl->next = *explored_state(env, insn_idx); - *explored_state(env, insn_idx) = new_sl; + list_add(&new_sl->node, head); + /* connect new state to parentage chain. Current frame needs all * registers connected. 
Only r6 - r9 of the callers are alive (pushed * to the stack implicitly by JITs) so in callers' frames connect just @@ -19011,19 +19500,13 @@ static int do_check(struct bpf_verifier_env *env) } if (env->log.level & BPF_LOG_LEVEL) { - const struct bpf_insn_cbs cbs = { - .cb_call = disasm_kfunc_name, - .cb_print = verbose, - .private_data = env, - }; - if (verifier_state_scratched(env)) print_insn_state(env, state, state->curframe); verbose_linfo(env, env->insn_idx, "; "); env->prev_log_pos = env->log.end_pos; verbose(env, "%d: ", env->insn_idx); - print_bpf_insn(&cbs, insn, env->allow_ptr_leaks); + verbose_insn(env, insn); env->prev_insn_print_pos = env->log.end_pos - env->prev_log_pos; env->prev_log_pos = env->log.end_pos; } @@ -19045,37 +19528,18 @@ static int do_check(struct bpf_verifier_env *env) return err; } else if (class == BPF_LDX) { - enum bpf_reg_type src_reg_type; - - /* check for reserved fields is already done */ - - /* check src operand */ - err = check_reg_arg(env, insn->src_reg, SRC_OP); - if (err) - return err; - - err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK); - if (err) - return err; + bool is_ldsx = BPF_MODE(insn->code) == BPF_MEMSX; - src_reg_type = regs[insn->src_reg].type; - - /* check that memory (src_reg + off) is readable, - * the state of dst_reg will be updated by this func + /* Check for reserved fields is already done in + * resolve_pseudo_ldimm64(). 
+ */ - err = check_mem_access(env, env->insn_idx, insn->src_reg, - insn->off, BPF_SIZE(insn->code), - BPF_READ, insn->dst_reg, false, - BPF_MODE(insn->code) == BPF_MEMSX); - err = err ?: save_aux_ptr_type(env, src_reg_type, true); - err = err ?: reg_bounds_sanity_check(env, &regs[insn->dst_reg], "ldx"); + err = check_load_mem(env, insn, false, is_ldsx, true, + "ldx"); if (err) return err; } else if (class == BPF_STX) { - enum bpf_reg_type dst_reg_type; - if (BPF_MODE(insn->code) == BPF_ATOMIC) { - err = check_atomic(env, env->insn_idx, insn); + err = check_atomic(env, insn); if (err) return err; env->insn_idx++; @@ -19087,25 +19551,7 @@ static int do_check(struct bpf_verifier_env *env) return -EINVAL; } - /* check src1 operand */ - err = check_reg_arg(env, insn->src_reg, SRC_OP); - if (err) - return err; - /* check src2 operand */ - err = check_reg_arg(env, insn->dst_reg, SRC_OP); - if (err) - return err; - - dst_reg_type = regs[insn->dst_reg].type; - - /* check that memory (dst_reg + off) is writeable */ - err = check_mem_access(env, env->insn_idx, insn->dst_reg, - insn->off, BPF_SIZE(insn->code), - BPF_WRITE, insn->src_reg, false, false); - if (err) - return err; - - err = save_aux_ptr_type(env, dst_reg_type, false); + err = check_store_reg(env, insn, false); if (err) return err; } else if (class == BPF_ST) { @@ -19245,6 +19691,9 @@ process_bpf_exit: return err; break; } else { + if (verifier_bug_if(env->cur_state->loop_entry, env, + "broken loop detection")) + return -EFAULT; do_print_state = true; continue; } @@ -19504,7 +19953,7 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, } } - if (btf_record_has_field(map->record, BPF_SPIN_LOCK)) { + if (btf_record_has_field(map->record, BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK)) { if (prog_type == BPF_PROG_TYPE_SOCKET_FILTER) { verbose(env, "socket filter progs cannot use bpf_spin_lock yet\n"); return -EINVAL; @@ -20302,10 +20751,9 @@ static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env, if 
(bpf_pseudo_kfunc_call(&insn)) continue; - if (WARN_ON(load_reg == -1)) { - verbose(env, "verifier bug. zext_dst is set, but no reg is defined\n"); + if (verifier_bug_if(load_reg == -1, env, + "zext_dst is set, but no reg is defined")) return -EFAULT; - } zext_patch[0] = insn; zext_patch[1].dst_reg = load_reg; @@ -20334,7 +20782,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) { struct bpf_subprog_info *subprogs = env->subprog_info; const struct bpf_verifier_ops *ops = env->ops; - int i, cnt, size, ctx_field_size, delta = 0, epilogue_cnt = 0; + int i, cnt, size, ctx_field_size, ret, delta = 0, epilogue_cnt = 0; const int insn_cnt = env->prog->len; struct bpf_insn *epilogue_buf = env->epilogue_buf; struct bpf_insn *insn_buf = env->insn_buf; @@ -20363,6 +20811,10 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) return -ENOMEM; env->prog = new_prog; delta += cnt - 1; + + ret = add_kfunc_in_insns(env, epilogue_buf, epilogue_cnt - 1); + if (ret < 0) + return ret; } } @@ -20383,6 +20835,10 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) env->prog = new_prog; delta += cnt - 1; + + ret = add_kfunc_in_insns(env, insn_buf, cnt - 1); + if (ret < 0) + return ret; } } @@ -20415,7 +20871,9 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) insn->code == (BPF_ST | BPF_MEM | BPF_W) || insn->code == (BPF_ST | BPF_MEM | BPF_DW)) { type = BPF_WRITE; - } else if ((insn->code == (BPF_STX | BPF_ATOMIC | BPF_W) || + } else if ((insn->code == (BPF_STX | BPF_ATOMIC | BPF_B) || + insn->code == (BPF_STX | BPF_ATOMIC | BPF_H) || + insn->code == (BPF_STX | BPF_ATOMIC | BPF_W) || insn->code == (BPF_STX | BPF_ATOMIC | BPF_DW)) && env->insn_aux_data[i + delta].ptr_type == PTR_TO_ARENA) { insn->code = BPF_STX | BPF_PROBE_ATOMIC | BPF_SIZE(insn->code); @@ -20612,11 +21070,9 @@ static int jit_subprogs(struct bpf_verifier_env *env) * propagated in any case. 
*/ subprog = find_subprog(env, i + insn->imm + 1); - if (subprog < 0) { - WARN_ONCE(1, "verifier bug. No program starts at insn %d\n", - i + insn->imm + 1); + if (verifier_bug_if(subprog < 0, env, "No program to jit at insn %d", + i + insn->imm + 1)) return -EFAULT; - } /* temporarily remember subprog id inside insn instead of * aux_data, since next loop will split up all insns into funcs */ @@ -20723,6 +21179,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) func[i]->aux->tail_call_reachable = env->subprog_info[i].tail_call_reachable; func[i]->aux->exception_cb = env->subprog_info[i].is_exception_cb; func[i]->aux->changes_pkt_data = env->subprog_info[i].changes_pkt_data; + func[i]->aux->might_sleep = env->subprog_info[i].might_sleep; if (!i) func[i]->aux->exception_boundary = env->seen_exception; func[i] = bpf_int_jit_compile(func[i]); @@ -20939,6 +21396,14 @@ static void specialize_kfunc(struct bpf_verifier_env *env, */ env->seen_direct_write = seen_direct_write; } + + if (func_id == special_kfunc_list[KF_bpf_set_dentry_xattr] && + bpf_lsm_has_d_inode_locked(prog)) + *addr = (unsigned long)bpf_set_dentry_xattr_locked; + + if (func_id == special_kfunc_list[KF_bpf_remove_dentry_xattr] && + bpf_lsm_has_d_inode_locked(prog)) + *addr = (unsigned long)bpf_remove_dentry_xattr_locked; } static void __fixup_collection_insert_kfunc(struct bpf_insn_aux_data *insn_aux, @@ -21050,13 +21515,17 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) { insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1); *cnt = 1; - } else if (is_bpf_wq_set_callback_impl_kfunc(desc->func_id)) { - struct bpf_insn ld_addrs[2] = { BPF_LD_IMM64(BPF_REG_4, (long)env->prog->aux) }; + } - insn_buf[0] = ld_addrs[0]; - insn_buf[1] = ld_addrs[1]; - insn_buf[2] = *insn; - *cnt = 3; + if (env->insn_aux_data[insn_idx].arg_prog) { + u32 regno = env->insn_aux_data[insn_idx].arg_prog; + struct bpf_insn ld_addrs[2] = { 
BPF_LD_IMM64(regno, (long)env->prog->aux) }; + int idx = *cnt; + + insn_buf[idx++] = ld_addrs[0]; + insn_buf[idx++] = ld_addrs[1]; + insn_buf[idx++] = *insn; + *cnt = idx; } return 0; } @@ -21373,7 +21842,50 @@ static int do_misc_fixups(struct bpf_verifier_env *env) goto next_insn; } - if (is_may_goto_insn(insn)) { + if (is_may_goto_insn(insn) && bpf_jit_supports_timed_may_goto()) { + int stack_off_cnt = -stack_depth - 16; + + /* + * Two 8 byte slots, depth-16 stores the count, and + * depth-8 stores the start timestamp of the loop. + * + * The starting value of count is BPF_MAX_TIMED_LOOPS + * (0xffff). Every iteration loads it and subs it by 1, + * until the value becomes 0 in AX (thus, 1 in stack), + * after which we call arch_bpf_timed_may_goto, which + * either sets AX to 0xffff to keep looping, or to 0 + * upon timeout. AX is then stored into the stack. In + * the next iteration, we either see 0 and break out, or + * continue iterating until the next time value is 0 + * after subtraction, rinse and repeat. + */ + stack_depth_extra = 16; + insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_AX, BPF_REG_10, stack_off_cnt); + if (insn->off >= 0) + insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off + 5); + else + insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off - 1); + insn_buf[2] = BPF_ALU64_IMM(BPF_SUB, BPF_REG_AX, 1); + insn_buf[3] = BPF_JMP_IMM(BPF_JNE, BPF_REG_AX, 0, 2); + /* + * AX is used as an argument to pass in stack_off_cnt + * (to add to r10/fp), and also as the return value of + * the call to arch_bpf_timed_may_goto. 
+ */ + insn_buf[4] = BPF_MOV64_IMM(BPF_REG_AX, stack_off_cnt); + insn_buf[5] = BPF_EMIT_CALL(arch_bpf_timed_may_goto); + insn_buf[6] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_AX, stack_off_cnt); + cnt = 7; + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + goto next_insn; + } else if (is_may_goto_insn(insn)) { int stack_off = -stack_depth - 8; stack_depth_extra = 8; @@ -21897,6 +22409,13 @@ next_insn: if (subprogs[cur_subprog + 1].start == i + delta + 1) { subprogs[cur_subprog].stack_depth += stack_depth_extra; subprogs[cur_subprog].stack_extra = stack_depth_extra; + + stack_depth = subprogs[cur_subprog].stack_depth; + if (stack_depth > MAX_BPF_STACK && !prog->jit_requested) { + verbose(env, "stack size %d(extra %d) is too large\n", + stack_depth, stack_depth_extra); + return -EINVAL; + } cur_subprog++; stack_depth = subprogs[cur_subprog].stack_depth; stack_depth_extra = 0; @@ -21907,23 +22426,33 @@ next_insn: env->prog->aux->stack_depth = subprogs[0].stack_depth; for (i = 0; i < env->subprog_cnt; i++) { + int delta = bpf_jit_supports_timed_may_goto() ? 2 : 1; int subprog_start = subprogs[i].start; int stack_slots = subprogs[i].stack_extra / 8; + int slots = delta, cnt = 0; if (!stack_slots) continue; - if (stack_slots > 1) { - verbose(env, "verifier bug: stack_slots supports may_goto only\n"); + /* We need two slots in case timed may_goto is supported. 
*/ + if (stack_slots > slots) { + verifier_bug(env, "stack_slots supports may_goto only"); return -EFAULT; } - /* Add ST insn to subprog prologue to init extra stack */ - insn_buf[0] = BPF_ST_MEM(BPF_DW, BPF_REG_FP, - -subprogs[i].stack_depth, BPF_MAX_LOOPS); + stack_depth = subprogs[i].stack_depth; + if (bpf_jit_supports_timed_may_goto()) { + insn_buf[cnt++] = BPF_ST_MEM(BPF_DW, BPF_REG_FP, -stack_depth, + BPF_MAX_TIMED_LOOPS); + insn_buf[cnt++] = BPF_ST_MEM(BPF_DW, BPF_REG_FP, -stack_depth + 8, 0); + } else { + /* Add ST insn to subprog prologue to init extra stack */ + insn_buf[cnt++] = BPF_ST_MEM(BPF_DW, BPF_REG_FP, -stack_depth, + BPF_MAX_LOOPS); + } /* Copy first actual insn to preserve it */ - insn_buf[1] = env->prog->insnsi[subprog_start]; + insn_buf[cnt++] = env->prog->insnsi[subprog_start]; - new_prog = bpf_patch_insn_data(env, subprog_start, insn_buf, 2); + new_prog = bpf_patch_insn_data(env, subprog_start, insn_buf, cnt); if (!new_prog) return -ENOMEM; env->prog = prog = new_prog; @@ -21933,7 +22462,7 @@ next_insn: * to insn after BPF_ST that inits may_goto count. * Adjustment will succeed because bpf_patch_insn_data() didn't fail. */ - WARN_ON(adjust_jmp_off(env->prog, subprog_start, 1)); + WARN_ON(adjust_jmp_off(env->prog, subprog_start, delta)); } /* Since poke tab is now finalized, publish aux to tracker. 
*/ @@ -22131,31 +22660,29 @@ static int remove_fastcall_spills_fills(struct bpf_verifier_env *env) static void free_states(struct bpf_verifier_env *env) { - struct bpf_verifier_state_list *sl, *sln; + struct bpf_verifier_state_list *sl; + struct list_head *head, *pos, *tmp; int i; - sl = env->free_list; - while (sl) { - sln = sl->next; + list_for_each_safe(pos, tmp, &env->free_list) { + sl = container_of(pos, struct bpf_verifier_state_list, node); free_verifier_state(&sl->state, false); kfree(sl); - sl = sln; } - env->free_list = NULL; + INIT_LIST_HEAD(&env->free_list); if (!env->explored_states) return; for (i = 0; i < state_htab_size(env); i++) { - sl = env->explored_states[i]; + head = &env->explored_states[i]; - while (sl) { - sln = sl->next; + list_for_each_safe(pos, tmp, head) { + sl = container_of(pos, struct bpf_verifier_state_list, node); free_verifier_state(&sl->state, false); kfree(sl); - sl = sln; } - env->explored_states[i] = NULL; + INIT_LIST_HEAD(&env->explored_states[i]); } } @@ -22163,6 +22690,7 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) { bool pop_log = !(env->log.level & BPF_LOG_LEVEL2); struct bpf_subprog_info *sub = subprog_info(env, subprog); + struct bpf_prog_aux *aux = env->prog->aux; struct bpf_verifier_state *state; struct bpf_reg_state *regs; int ret, i; @@ -22270,6 +22798,13 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) mark_reg_known_zero(env, regs, BPF_REG_1); } + /* Acquire references for struct_ops program arguments tagged with "__ref" */ + if (!subprog && env->prog->type == BPF_PROG_TYPE_STRUCT_OPS) { + for (i = 0; i < aux->ctx_arg_info_size; i++) + aux->ctx_arg_info[i].ref_obj_id = aux->ctx_arg_info[i].refcounted ? 
+ acquire_reference(env, 0) : 0; + } + ret = do_check(env); out: /* check for NULL is necessary, since cur_state can be freed inside @@ -22392,6 +22927,15 @@ static void print_verification_stats(struct bpf_verifier_env *env) env->peak_states, env->longest_mark_read_walk); } +int bpf_prog_ctx_arg_info_init(struct bpf_prog *prog, + const struct bpf_ctx_arg_aux *info, u32 cnt) +{ + prog->aux->ctx_arg_info = kmemdup_array(info, cnt, sizeof(*info), GFP_KERNEL); + prog->aux->ctx_arg_info_size = cnt; + + return prog->aux->ctx_arg_info ? 0 : -ENOMEM; +} + static int check_struct_ops_btf_id(struct bpf_verifier_env *env) { const struct btf_type *t, *func_proto; @@ -22399,10 +22943,11 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env) const struct bpf_struct_ops *st_ops; const struct btf_member *member; struct bpf_prog *prog = env->prog; - u32 btf_id, member_idx; + bool has_refcounted_arg = false; + u32 btf_id, member_idx, member_off; struct btf *btf; const char *mname; - int err; + int i, err; if (!prog->gpl_compatible) { verbose(env, "struct ops programs must have a GPL compatible license\n"); @@ -22450,7 +22995,8 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env) return -EINVAL; } - err = bpf_struct_ops_supported(st_ops, __btf_member_bit_offset(t, member) / 8); + member_off = __btf_member_bit_offset(t, member) / 8; + err = bpf_struct_ops_supported(st_ops, member_off); if (err) { verbose(env, "attach to unsupported member %s of struct %s\n", mname, st_ops->name); @@ -22472,17 +23018,32 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env) return -EACCES; } - /* btf_ctx_access() used this to provide argument type info */ - prog->aux->ctx_arg_info = - st_ops_desc->arg_info[member_idx].info; - prog->aux->ctx_arg_info_size = - st_ops_desc->arg_info[member_idx].cnt; + for (i = 0; i < st_ops_desc->arg_info[member_idx].cnt; i++) { + if (st_ops_desc->arg_info[member_idx].info->refcounted) { + has_refcounted_arg = true; + break; + } + 
} + + /* Tail call is not allowed for programs with refcounted arguments since we + * cannot guarantee that valid refcounted kptrs will be passed to the callee. + */ + for (i = 0; i < env->subprog_cnt; i++) { + if (has_refcounted_arg && env->subprog_info[i].has_tail_call) { + verbose(env, "program with __ref argument cannot tail call\n"); + return -EINVAL; + } + } + + prog->aux->st_ops = st_ops; + prog->aux->attach_st_ops_member_off = member_off; prog->aux->attach_func_proto = func_proto; prog->aux->attach_func_name = mname; env->ops = st_ops->verifier_ops; - return 0; + return bpf_prog_ctx_arg_info_init(prog, st_ops_desc->arg_info[member_idx].info, + st_ops_desc->arg_info[member_idx].cnt); } #define SECURITY_PREFIX "security_" @@ -22558,6 +23119,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, if (tgt_prog) { struct bpf_prog_aux *aux = tgt_prog->aux; bool tgt_changes_pkt_data; + bool tgt_might_sleep; if (bpf_prog_is_dev_bound(prog->aux) && !bpf_prog_dev_bound_match(prog, tgt_prog)) { @@ -22600,6 +23162,15 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, "Extension program changes packet data, while original does not\n"); return -EINVAL; } + + tgt_might_sleep = aux->func + ? aux->func[subprog]->aux->might_sleep + : aux->might_sleep; + if (prog->aux->might_sleep && !tgt_might_sleep) { + bpf_log(log, + "Extension program may sleep, while original does not\n"); + return -EINVAL; + } } if (!tgt_prog->jited) { bpf_log(log, "Can attach to only JITed progs\n"); @@ -22856,6 +23427,33 @@ BTF_ID(func, __rcu_read_unlock) #endif BTF_SET_END(btf_id_deny) +/* fexit and fmod_ret can't be used to attach to __noreturn functions. + * Currently, we must manually list all __noreturn functions here. Once a more + * robust solution is implemented, this workaround can be removed. 
+ */ +BTF_SET_START(noreturn_deny) +#ifdef CONFIG_IA32_EMULATION +BTF_ID(func, __ia32_sys_exit) +BTF_ID(func, __ia32_sys_exit_group) +#endif +#ifdef CONFIG_KUNIT +BTF_ID(func, __kunit_abort) +BTF_ID(func, kunit_try_catch_throw) +#endif +#ifdef CONFIG_MODULES +BTF_ID(func, __module_put_and_kthread_exit) +#endif +#ifdef CONFIG_X86_64 +BTF_ID(func, __x64_sys_exit) +BTF_ID(func, __x64_sys_exit_group) +#endif +BTF_ID(func, do_exit) +BTF_ID(func, do_group_exit) +BTF_ID(func, kthread_complete_and_exit) +BTF_ID(func, kthread_exit) +BTF_ID(func, make_task_dead) +BTF_SET_END(noreturn_deny) + static bool can_be_sleepable(struct bpf_prog *prog) { if (prog->type == BPF_PROG_TYPE_TRACING) { @@ -22932,9 +23530,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) prog->aux->attach_btf_trace = true; return 0; } else if (prog->expected_attach_type == BPF_TRACE_ITER) { - if (!bpf_iter_prog_supported(prog)) - return -EINVAL; - return 0; + return bpf_iter_prog_supported(prog); } if (prog->type == BPF_PROG_TYPE_LSM) { @@ -22944,6 +23540,11 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) } else if (prog->type == BPF_PROG_TYPE_TRACING && btf_id_set_contains(&btf_id_deny, btf_id)) { return -EINVAL; + } else if ((prog->expected_attach_type == BPF_TRACE_FEXIT || + prog->expected_attach_type == BPF_MODIFY_RETURN) && + btf_id_set_contains(&noreturn_deny, btf_id)) { + verbose(env, "Attaching fexit/fmod_ret to __noreturn functions is rejected.\n"); + return -EINVAL; } key = bpf_trampoline_compute_key(tgt_prog, prog->aux->attach_btf, btf_id); @@ -23036,6 +23637,302 @@ static int process_fd_array(struct bpf_verifier_env *env, union bpf_attr *attr, return 0; } +static bool can_fallthrough(struct bpf_insn *insn) +{ + u8 class = BPF_CLASS(insn->code); + u8 opcode = BPF_OP(insn->code); + + if (class != BPF_JMP && class != BPF_JMP32) + return true; + + if (opcode == BPF_EXIT || opcode == BPF_JA) + return false; + + return true; +} + +static bool can_jump(struct bpf_insn 
*insn) +{ + u8 class = BPF_CLASS(insn->code); + u8 opcode = BPF_OP(insn->code); + + if (class != BPF_JMP && class != BPF_JMP32) + return false; + + switch (opcode) { + case BPF_JA: + case BPF_JEQ: + case BPF_JNE: + case BPF_JLT: + case BPF_JLE: + case BPF_JGT: + case BPF_JGE: + case BPF_JSGT: + case BPF_JSGE: + case BPF_JSLT: + case BPF_JSLE: + case BPF_JCOND: + return true; + } + + return false; +} + +static int insn_successors(struct bpf_prog *prog, u32 idx, u32 succ[2]) +{ + struct bpf_insn *insn = &prog->insnsi[idx]; + int i = 0, insn_sz; + u32 dst; + + insn_sz = bpf_is_ldimm64(insn) ? 2 : 1; + if (can_fallthrough(insn) && idx + 1 < prog->len) + succ[i++] = idx + insn_sz; + + if (can_jump(insn)) { + dst = idx + jmp_offset(insn) + 1; + if (i == 0 || succ[0] != dst) + succ[i++] = dst; + } + + return i; +} + +/* Each field is a register bitmask */ +struct insn_live_regs { + u16 use; /* registers read by instruction */ + u16 def; /* registers written by instruction */ + u16 in; /* registers that may be alive before instruction */ + u16 out; /* registers that may be alive after instruction */ +}; + +/* Bitmask with 1s for all caller saved registers */ +#define ALL_CALLER_SAVED_REGS ((1u << CALLER_SAVED_REGS) - 1) + +/* Compute info->{use,def} fields for the instruction */ +static void compute_insn_live_regs(struct bpf_verifier_env *env, + struct bpf_insn *insn, + struct insn_live_regs *info) +{ + struct call_summary cs; + u8 class = BPF_CLASS(insn->code); + u8 code = BPF_OP(insn->code); + u8 mode = BPF_MODE(insn->code); + u16 src = BIT(insn->src_reg); + u16 dst = BIT(insn->dst_reg); + u16 r0 = BIT(0); + u16 def = 0; + u16 use = 0xffff; + + switch (class) { + case BPF_LD: + switch (mode) { + case BPF_IMM: + if (BPF_SIZE(insn->code) == BPF_DW) { + def = dst; + use = 0; + } + break; + case BPF_LD | BPF_ABS: + case BPF_LD | BPF_IND: + /* stick with defaults */ + break; + } + break; + case BPF_LDX: + switch (mode) { + case BPF_MEM: + case BPF_MEMSX: + def = dst; + use = 
src; + break; + } + break; + case BPF_ST: + switch (mode) { + case BPF_MEM: + def = 0; + use = dst; + break; + } + break; + case BPF_STX: + switch (mode) { + case BPF_MEM: + def = 0; + use = dst | src; + break; + case BPF_ATOMIC: + switch (insn->imm) { + case BPF_CMPXCHG: + use = r0 | dst | src; + def = r0; + break; + case BPF_LOAD_ACQ: + def = dst; + use = src; + break; + case BPF_STORE_REL: + def = 0; + use = dst | src; + break; + default: + use = dst | src; + if (insn->imm & BPF_FETCH) + def = src; + else + def = 0; + } + break; + } + break; + case BPF_ALU: + case BPF_ALU64: + switch (code) { + case BPF_END: + use = dst; + def = dst; + break; + case BPF_MOV: + def = dst; + if (BPF_SRC(insn->code) == BPF_K) + use = 0; + else + use = src; + break; + default: + def = dst; + if (BPF_SRC(insn->code) == BPF_K) + use = dst; + else + use = dst | src; + } + break; + case BPF_JMP: + case BPF_JMP32: + switch (code) { + case BPF_JA: + case BPF_JCOND: + def = 0; + use = 0; + break; + case BPF_EXIT: + def = 0; + use = r0; + break; + case BPF_CALL: + def = ALL_CALLER_SAVED_REGS; + use = def & ~BIT(BPF_REG_0); + if (get_call_summary(env, insn, &cs)) + use = GENMASK(cs.num_params, 1); + break; + default: + def = 0; + if (BPF_SRC(insn->code) == BPF_K) + use = dst; + else + use = dst | src; + } + break; + } + + info->def = def; + info->use = use; +} + +/* Compute may-live registers after each instruction in the program. + * The register is live after the instruction I if it is read by some + * instruction S following I during program execution and is not + * overwritten between I and S. + * + * Store result in env->insn_aux_data[i].live_regs. 
+ */ +static int compute_live_registers(struct bpf_verifier_env *env) +{ + struct bpf_insn_aux_data *insn_aux = env->insn_aux_data; + struct bpf_insn *insns = env->prog->insnsi; + struct insn_live_regs *state; + int insn_cnt = env->prog->len; + int err = 0, i, j; + bool changed; + + /* Use the following algorithm: + * - define the following: + * - I.use : a set of all registers read by instruction I; + * - I.def : a set of all registers written by instruction I; + * - I.in : a set of all registers that may be alive before I execution; + * - I.out : a set of all registers that may be alive after I execution; + * - insn_successors(I): a set of instructions S that might immediately + * follow I for some program execution; + * - associate separate empty sets 'I.in' and 'I.out' with each instruction; + * - visit each instruction in a postorder and update + * state[i].in, state[i].out as follows: + * + * state[i].out = U [state[s].in for S in insn_successors(i)] + * state[i].in = (state[i].out / state[i].def) U state[i].use + * + * (where U stands for set union, / stands for set difference) + * - repeat the computation while {in,out} fields changes for + * any instruction. 
+ */ + state = kvcalloc(insn_cnt, sizeof(*state), GFP_KERNEL); + if (!state) { + err = -ENOMEM; + goto out; + } + + for (i = 0; i < insn_cnt; ++i) + compute_insn_live_regs(env, &insns[i], &state[i]); + + changed = true; + while (changed) { + changed = false; + for (i = 0; i < env->cfg.cur_postorder; ++i) { + int insn_idx = env->cfg.insn_postorder[i]; + struct insn_live_regs *live = &state[insn_idx]; + int succ_num; + u32 succ[2]; + u16 new_out = 0; + u16 new_in = 0; + + succ_num = insn_successors(env->prog, insn_idx, succ); + for (int s = 0; s < succ_num; ++s) + new_out |= state[succ[s]].in; + new_in = (new_out & ~live->def) | live->use; + if (new_out != live->out || new_in != live->in) { + live->in = new_in; + live->out = new_out; + changed = true; + } + } + } + + for (i = 0; i < insn_cnt; ++i) + insn_aux[i].live_regs_before = state[i].in; + + if (env->log.level & BPF_LOG_LEVEL2) { + verbose(env, "Live regs before insn:\n"); + for (i = 0; i < insn_cnt; ++i) { + verbose(env, "%3d: ", i); + for (j = BPF_REG_0; j < BPF_REG_10; ++j) + if (insn_aux[i].live_regs_before & BIT(j)) + verbose(env, "%d", j); + else + verbose(env, "."); + verbose(env, " "); + verbose_insn(env, &insns[i]); + if (bpf_is_ldimm64(&insns[i])) + i++; + } + } + +out: + kvfree(state); + kvfree(env->cfg.insn_postorder); + env->cfg.insn_postorder = NULL; + env->cfg.cur_postorder = 0; + return err; +} + int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size) { u64 start_time = ktime_get_ns(); @@ -23113,12 +24010,16 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 env->test_reg_invariants = attr->prog_flags & BPF_F_TEST_REG_INVARIANTS; env->explored_states = kvcalloc(state_htab_size(env), - sizeof(struct bpf_verifier_state_list *), + sizeof(struct list_head), GFP_USER); ret = -ENOMEM; if (!env->explored_states) goto skip_full_check; + for (i = 0; i < state_htab_size(env); i++) + INIT_LIST_HEAD(&env->explored_states[i]); + 
INIT_LIST_HEAD(&env->free_list); + ret = check_btf_info_early(env, attr, uattr); if (ret < 0) goto skip_full_check; @@ -23153,6 +24054,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 if (ret) goto skip_full_check; + ret = compute_live_registers(env); + if (ret < 0) + goto skip_full_check; + ret = mark_fastcall_patterns(env); if (ret < 0) goto skip_full_check; @@ -23291,6 +24196,7 @@ err_unlock: vfree(env->insn_aux_data); kvfree(env->insn_hist); err_free_env: + kvfree(env->cfg.insn_postorder); kvfree(env); return ret; } diff --git a/kernel/capability.c b/kernel/capability.c index e089d2628c29..829f49ae07b9 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -286,22 +286,6 @@ bool has_ns_capability(struct task_struct *t, } /** - * has_capability - Does a task have a capability in init_user_ns - * @t: The task in question - * @cap: The capability to be tested for - * - * Return true if the specified task has the given superior capability - * currently in effect to the initial user namespace, false if not. - * - * Note that this does not set PF_SUPERPRIV on the task. - */ -bool has_capability(struct task_struct *t, int cap) -{ - return has_ns_capability(t, &init_user_ns, cap); -} -EXPORT_SYMBOL(has_capability); - -/** * has_ns_capability_noaudit - Does a task have a capability (unaudited) * in a specific user ns. 
* @t: The task in question diff --git a/kernel/cfi.c b/kernel/cfi.c index 19be79639542..422fa4f958ae 100644 --- a/kernel/cfi.c +++ b/kernel/cfi.c @@ -73,14 +73,11 @@ static bool is_module_cfi_trap(unsigned long addr) struct module *mod; bool found = false; - rcu_read_lock_sched_notrace(); - + guard(rcu)(); mod = __module_address(addr); if (mod) found = is_trap(addr, mod->kcfi_traps, mod->kcfi_traps_end); - rcu_read_unlock_sched_notrace(); - return found; } #else /* CONFIG_MODULES */ diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 95ab39e1ec8f..b14e61c64a34 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -270,9 +270,9 @@ int cgroup_task_count(const struct cgroup *cgrp); /* * rstat.c */ -int cgroup_rstat_init(struct cgroup *cgrp); -void cgroup_rstat_exit(struct cgroup *cgrp); -void cgroup_rstat_boot(void); +int css_rstat_init(struct cgroup_subsys_state *css); +void css_rstat_exit(struct cgroup_subsys_state *css); +int ss_rstat_init(struct cgroup_subsys *ss); void cgroup_base_stat_cputime_show(struct seq_file *seq); /* diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 11ea8d24ac72..fa24c032ed6f 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -851,7 +851,7 @@ static int cgroup1_rename(struct kernfs_node *kn, struct kernfs_node *new_parent if (kernfs_type(kn) != KERNFS_DIR) return -ENOTDIR; - if (kn->parent != new_parent) + if (rcu_access_pointer(kn->__parent) != new_parent) return -EIO; /* diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index f231fe3a0744..a723b7dc6e4e 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -90,11 +90,14 @@ DEFINE_MUTEX(cgroup_mutex); DEFINE_SPINLOCK(css_set_lock); -#ifdef CONFIG_PROVE_RCU +#if (defined CONFIG_PROVE_RCU || defined CONFIG_LOCKDEP) EXPORT_SYMBOL_GPL(cgroup_mutex); EXPORT_SYMBOL_GPL(css_set_lock); #endif +struct blocking_notifier_head cgroup_lifetime_notifier = + 
BLOCKING_NOTIFIER_INIT(cgroup_lifetime_notifier); + DEFINE_SPINLOCK(trace_cgroup_path_lock); char trace_cgroup_path[TRACE_CGROUP_PATH_LEN]; static bool cgroup_debug __read_mostly; @@ -161,10 +164,14 @@ static struct static_key_true *cgroup_subsys_on_dfl_key[] = { }; #undef SUBSYS -static DEFINE_PER_CPU(struct cgroup_rstat_cpu, cgrp_dfl_root_rstat_cpu); +static DEFINE_PER_CPU(struct css_rstat_cpu, root_rstat_cpu); +static DEFINE_PER_CPU(struct cgroup_rstat_base_cpu, root_rstat_base_cpu); /* the default hierarchy */ -struct cgroup_root cgrp_dfl_root = { .cgrp.rstat_cpu = &cgrp_dfl_root_rstat_cpu }; +struct cgroup_root cgrp_dfl_root = { + .cgrp.self.rstat_cpu = &root_rstat_cpu, + .cgrp.rstat_base_cpu = &root_rstat_base_cpu, +}; EXPORT_SYMBOL_GPL(cgrp_dfl_root); /* @@ -633,9 +640,22 @@ int cgroup_task_count(const struct cgroup *cgrp) return count; } +static struct cgroup *kn_priv(struct kernfs_node *kn) +{ + struct kernfs_node *parent; + /* + * The parent can not be replaced due to KERNFS_ROOT_INVARIANT_PARENT. + * Therefore it is always safe to dereference this pointer outside of a + * RCU section. 
+ */ + parent = rcu_dereference_check(kn->__parent, + kernfs_root_flags(kn) & KERNFS_ROOT_INVARIANT_PARENT); + return parent->priv; +} + struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) { - struct cgroup *cgrp = of->kn->parent->priv; + struct cgroup *cgrp = kn_priv(of->kn); struct cftype *cft = of_cft(of); /* @@ -1322,6 +1342,7 @@ static void cgroup_destroy_root(struct cgroup_root *root) { struct cgroup *cgrp = &root->cgrp; struct cgrp_cset_link *link, *tmp_link; + int ret; trace_cgroup_destroy_root(root); @@ -1330,6 +1351,10 @@ static void cgroup_destroy_root(struct cgroup_root *root) BUG_ON(atomic_read(&root->nr_cgrps)); BUG_ON(!list_empty(&cgrp->self.children)); + ret = blocking_notifier_call_chain(&cgroup_lifetime_notifier, + CGROUP_LIFETIME_OFFLINE, cgrp); + WARN_ON_ONCE(notifier_to_errno(ret)); + /* Rebind all subsystems back to the default hierarchy */ WARN_ON(rebind_subsystems(&cgrp_dfl_root, root->subsys_mask)); @@ -1358,7 +1383,6 @@ static void cgroup_destroy_root(struct cgroup_root *root) cgroup_unlock(); - cgroup_rstat_exit(cgrp); kernfs_destroy_root(root->kf_root); cgroup_free_root(root); } @@ -1612,7 +1636,7 @@ void cgroup_kn_unlock(struct kernfs_node *kn) if (kernfs_type(kn) == KERNFS_DIR) cgrp = kn->priv; else - cgrp = kn->parent->priv; + cgrp = kn_priv(kn); cgroup_unlock(); @@ -1644,7 +1668,7 @@ struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline) if (kernfs_type(kn) == KERNFS_DIR) cgrp = kn->priv; else - cgrp = kn->parent->priv; + cgrp = kn_priv(kn); /* * We're gonna grab cgroup_mutex which nests outside kernfs @@ -1682,7 +1706,7 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) cfile->kn = NULL; spin_unlock_irq(&cgroup_file_kn_lock); - del_timer_sync(&cfile->notify_timer); + timer_delete_sync(&cfile->notify_timer); } kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name)); @@ -1702,7 +1726,7 @@ static void css_clear_dir(struct cgroup_subsys_state *css) css->flags &= 
~CSS_VISIBLE; - if (!css->ss) { + if (css_is_self(css)) { if (cgroup_on_dfl(cgrp)) { cgroup_addrm_files(css, cgrp, cgroup_base_files, false); @@ -1734,7 +1758,7 @@ static int css_populate_dir(struct cgroup_subsys_state *css) if (css->flags & CSS_VISIBLE) return 0; - if (!css->ss) { + if (css_is_self(css)) { if (cgroup_on_dfl(cgrp)) { ret = cgroup_addrm_files(css, cgrp, cgroup_base_files, true); @@ -1863,13 +1887,6 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) } spin_unlock_irq(&css_set_lock); - if (ss->css_rstat_flush) { - list_del_rcu(&css->rstat_css_node); - synchronize_rcu(); - list_add_rcu(&css->rstat_css_node, - &dcgrp->rstat_css_list); - } - /* default hierarchy doesn't enable controllers by default */ dst_root->subsys_mask |= 1 << ssid; if (dst_root == &cgrp_dfl_root) { @@ -2052,7 +2069,6 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) cgrp->dom_cgrp = cgrp; cgrp->max_descendants = INT_MAX; cgrp->max_depth = INT_MAX; - INIT_LIST_HEAD(&cgrp->rstat_css_list); prev_cputime_init(&cgrp->prev_cputime); for_each_subsys(ss, ssid) @@ -2118,7 +2134,8 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) root->kf_root = kernfs_create_root(kf_sops, KERNFS_ROOT_CREATE_DEACTIVATED | KERNFS_ROOT_SUPPORT_EXPORTOP | - KERNFS_ROOT_SUPPORT_USER_XATTR, + KERNFS_ROOT_SUPPORT_USER_XATTR | + KERNFS_ROOT_INVARIANT_PARENT, root_cgrp); if (IS_ERR(root->kf_root)) { ret = PTR_ERR(root->kf_root); @@ -2132,7 +2149,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) if (ret) goto destroy_root; - ret = cgroup_rstat_init(root_cgrp); + ret = css_rstat_init(&root_cgrp->self); if (ret) goto destroy_root; @@ -2140,10 +2157,9 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) if (ret) goto exit_stats; - if (root == &cgrp_dfl_root) { - ret = cgroup_bpf_inherit(root_cgrp); - WARN_ON_ONCE(ret); - } + ret = blocking_notifier_call_chain(&cgroup_lifetime_notifier, + CGROUP_LIFETIME_ONLINE, root_cgrp); + 
WARN_ON_ONCE(notifier_to_errno(ret)); trace_cgroup_setup_root(root); @@ -2174,7 +2190,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) goto out; exit_stats: - cgroup_rstat_exit(root_cgrp); + css_rstat_exit(&root_cgrp->self); destroy_root: kernfs_destroy_root(root->kf_root); root->kf_root = NULL; @@ -2339,9 +2355,37 @@ static struct file_system_type cgroup2_fs_type = { }; #ifdef CONFIG_CPUSETS_V1 +enum cpuset_param { + Opt_cpuset_v2_mode, +}; + +static const struct fs_parameter_spec cpuset_fs_parameters[] = { + fsparam_flag ("cpuset_v2_mode", Opt_cpuset_v2_mode), + {} +}; + +static int cpuset_parse_param(struct fs_context *fc, struct fs_parameter *param) +{ + struct cgroup_fs_context *ctx = cgroup_fc2context(fc); + struct fs_parse_result result; + int opt; + + opt = fs_parse(fc, cpuset_fs_parameters, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_cpuset_v2_mode: + ctx->flags |= CGRP_ROOT_CPUSET_V2_MODE; + return 0; + } + return -EINVAL; +} + static const struct fs_context_operations cpuset_fs_context_ops = { .get_tree = cgroup1_get_tree, .free = cgroup_fs_context_free, + .parse_param = cpuset_parse_param, }; /* @@ -2378,6 +2422,7 @@ static int cpuset_init_fs_context(struct fs_context *fc) static struct file_system_type cpuset_fs_type = { .name = "cpuset", .init_fs_context = cpuset_init_fs_context, + .parameters = cpuset_fs_parameters, .fs_flags = FS_USERNS_MOUNT, }; #endif @@ -4115,7 +4160,7 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct cgroup_file_ctx *ctx = of->priv; - struct cgroup *cgrp = of->kn->parent->priv; + struct cgroup *cgrp = kn_priv(of->kn); struct cftype *cft = of_cft(of); struct cgroup_subsys_state *css; int ret; @@ -5401,8 +5446,9 @@ static void css_free_rwork_fn(struct work_struct *work) struct cgroup *cgrp = css->cgroup; percpu_ref_exit(&css->refcnt); + css_rstat_exit(css); - if (ss) { + if (!css_is_self(css)) { /* css free path */ struct 
cgroup_subsys_state *parent = css->parent; int id = css->id; @@ -5431,7 +5477,6 @@ static void css_free_rwork_fn(struct work_struct *work) cgroup_put(cgroup_parent(cgrp)); kernfs_put(cgrp->kn); psi_cgroup_free(cgrp); - cgroup_rstat_exit(cgrp); kfree(cgrp); } else { /* @@ -5456,14 +5501,10 @@ static void css_release_work_fn(struct work_struct *work) css->flags |= CSS_RELEASED; list_del_rcu(&css->sibling); - if (ss) { + if (!css_is_self(css)) { struct cgroup *parent_cgrp; - /* css release path */ - if (!list_empty(&css->rstat_css_node)) { - cgroup_rstat_flush(cgrp); - list_del_rcu(&css->rstat_css_node); - } + css_rstat_flush(css); cgroup_idr_replace(&ss->css_idr, NULL, css->id); if (ss->css_released) @@ -5489,7 +5530,7 @@ static void css_release_work_fn(struct work_struct *work) /* cgroup release path */ TRACE_CGROUP_PATH(release, cgrp); - cgroup_rstat_flush(cgrp); + css_rstat_flush(&cgrp->self); spin_lock_irq(&css_set_lock); for (tcgrp = cgroup_parent(cgrp); tcgrp; @@ -5537,7 +5578,6 @@ static void init_and_link_css(struct cgroup_subsys_state *css, css->id = -1; INIT_LIST_HEAD(&css->sibling); INIT_LIST_HEAD(&css->children); - INIT_LIST_HEAD(&css->rstat_css_node); css->serial_nr = css_serial_nr_next++; atomic_set(&css->online_cnt, 0); @@ -5546,9 +5586,6 @@ static void init_and_link_css(struct cgroup_subsys_state *css, css_get(css->parent); } - if (ss->css_rstat_flush) - list_add_rcu(&css->rstat_css_node, &cgrp->rstat_css_list); - BUG_ON(cgroup_css(cgrp, ss)); } @@ -5641,6 +5678,10 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, goto err_free_css; css->id = err; + err = css_rstat_init(css); + if (err) + goto err_free_css; + /* @css is ready to be brought online now, make it visible */ list_add_tail_rcu(&css->sibling, &parent_css->children); cgroup_idr_replace(&ss->css_idr, css, css->id); @@ -5654,7 +5695,6 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, err_list_del: list_del_rcu(&css->sibling); err_free_css: - 
list_del_rcu(&css->rstat_css_node); INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn); queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork); return ERR_PTR(err); @@ -5670,7 +5710,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, struct cgroup_root *root = parent->root; struct cgroup *cgrp, *tcgrp; struct kernfs_node *kn; - int level = parent->level + 1; + int i, level = parent->level + 1; int ret; /* allocate the cgroup and its ID, 0 is reserved for the root */ @@ -5682,17 +5722,13 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, if (ret) goto out_free_cgrp; - ret = cgroup_rstat_init(cgrp); - if (ret) - goto out_cancel_ref; - /* create the directory */ kn = kernfs_create_dir_ns(parent->kn, name, mode, current_fsuid(), current_fsgid(), cgrp, NULL); if (IS_ERR(kn)) { ret = PTR_ERR(kn); - goto out_stat_exit; + goto out_cancel_ref; } cgrp->kn = kn; @@ -5702,15 +5738,20 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, cgrp->root = root; cgrp->level = level; - ret = psi_cgroup_alloc(cgrp); + /* + * Now that init_cgroup_housekeeping() has been called and cgrp->self + * is setup, it is safe to perform rstat initialization on it. 
+ */ + ret = css_rstat_init(&cgrp->self); if (ret) goto out_kernfs_remove; - if (cgrp->root == &cgrp_dfl_root) { - ret = cgroup_bpf_inherit(cgrp); - if (ret) - goto out_psi_free; - } + ret = psi_cgroup_alloc(cgrp); + if (ret) + goto out_stat_exit; + + for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) + cgrp->ancestors[tcgrp->level] = tcgrp; /* * New cgroup inherits effective freeze counter, and @@ -5728,24 +5769,6 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, set_bit(CGRP_FROZEN, &cgrp->flags); } - spin_lock_irq(&css_set_lock); - for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) { - cgrp->ancestors[tcgrp->level] = tcgrp; - - if (tcgrp != cgrp) { - tcgrp->nr_descendants++; - - /* - * If the new cgroup is frozen, all ancestor cgroups - * get a new frozen descendant, but their state can't - * change because of this. - */ - if (cgrp->freezer.e_freeze) - tcgrp->freezer.nr_frozen_descendants++; - } - } - spin_unlock_irq(&css_set_lock); - if (notify_on_release(parent)) set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); @@ -5754,7 +5777,29 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, cgrp->self.serial_nr = css_serial_nr_next++; + ret = blocking_notifier_call_chain_robust(&cgroup_lifetime_notifier, + CGROUP_LIFETIME_ONLINE, + CGROUP_LIFETIME_OFFLINE, cgrp); + ret = notifier_to_errno(ret); + if (ret) + goto out_psi_free; + /* allocation complete, commit to creation */ + spin_lock_irq(&css_set_lock); + for (i = 0; i < level; i++) { + tcgrp = cgrp->ancestors[i]; + tcgrp->nr_descendants++; + + /* + * If the new cgroup is frozen, all ancestor cgroups get a new + * frozen descendant, but their state can't change because of + * this. 
+ */ + if (cgrp->freezer.e_freeze) + tcgrp->freezer.nr_frozen_descendants++; + } + spin_unlock_irq(&css_set_lock); + list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children); atomic_inc(&root->nr_cgrps); cgroup_get_live(parent); @@ -5772,10 +5817,10 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, out_psi_free: psi_cgroup_free(cgrp); +out_stat_exit: + css_rstat_exit(&cgrp->self); out_kernfs_remove: kernfs_remove(cgrp->kn); -out_stat_exit: - cgroup_rstat_exit(cgrp); out_cancel_ref: percpu_ref_exit(&cgrp->self.refcnt); out_free_cgrp: @@ -5909,6 +5954,12 @@ static void kill_css(struct cgroup_subsys_state *css) if (css->flags & CSS_DYING) return; + /* + * Call css_killed(), if defined, before setting the CSS_DYING flag + */ + if (css->ss->css_killed) + css->ss->css_killed(css); + css->flags |= CSS_DYING; /* @@ -5966,7 +6017,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) struct cgroup *tcgrp, *parent = cgroup_parent(cgrp); struct cgroup_subsys_state *css; struct cgrp_cset_link *link; - int ssid; + int ssid, ret; lockdep_assert_held(&cgroup_mutex); @@ -6024,8 +6075,9 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) cgroup1_check_for_release(parent); - if (cgrp->root == &cgrp_dfl_root) - cgroup_bpf_offline(cgrp); + ret = blocking_notifier_call_chain(&cgroup_lifetime_notifier, + CGROUP_LIFETIME_OFFLINE, cgrp); + WARN_ON_ONCE(notifier_to_errno(ret)); /* put the base reference */ percpu_ref_kill(&cgrp->self.refcnt); @@ -6087,6 +6139,9 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) } else { css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL); BUG_ON(css->id < 0); + + BUG_ON(ss_rstat_init(ss)); + BUG_ON(css_rstat_init(css)); } /* Update the init_css_set to contain a subsys @@ -6135,6 +6190,8 @@ int __init cgroup_init_early(void) ss->id, ss->name); WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN, "cgroup_subsys_name %s too long\n", 
cgroup_subsys_name[i]); + WARN(ss->early_init && ss->css_rstat_flush, + "cgroup rstat cannot be used with early init subsystem\n"); ss->id = i; ss->name = cgroup_subsys_name[i]; @@ -6163,7 +6220,7 @@ int __init cgroup_init(void) BUG_ON(cgroup_init_cftypes(NULL, cgroup_psi_files)); BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files)); - cgroup_rstat_boot(); + BUG_ON(ss_rstat_init(NULL)); get_user_ns(init_cgroup_ns.user_ns); @@ -6176,6 +6233,8 @@ int __init cgroup_init(void) hash_add(css_set_table, &init_css_set.hlist, css_set_hash(init_css_set.subsys)); + cgroup_bpf_lifetime_notifier_init(); + BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0)); cgroup_unlock(); diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h index 976a8bc3ff60..383963e28ac6 100644 --- a/kernel/cgroup/cpuset-internal.h +++ b/kernel/cgroup/cpuset-internal.h @@ -33,6 +33,7 @@ enum prs_errcode { PERR_CPUSEMPTY, PERR_HKEEPING, PERR_ACCESS, + PERR_REMOTE, }; /* bits in struct cpuset flags field */ diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 39c1fc643d77..6d3ac19cc2ac 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -61,10 +61,17 @@ static const char * const perr_strings[] = { [PERR_CPUSEMPTY] = "cpuset.cpus and cpuset.cpus.exclusive are empty", [PERR_HKEEPING] = "partition config conflicts with housekeeping setup", [PERR_ACCESS] = "Enable partition not permitted", + [PERR_REMOTE] = "Have remote partition underneath", }; /* - * Exclusive CPUs distributed out to sub-partitions of top_cpuset + * For local partitions, update to subpartitions_cpus & isolated_cpus is done + * in update_parent_effective_cpumask(). For remote partitions, it is done in + * the remote_partition_*() and remote_cpus_update() helpers. 
+ */ +/* + * Exclusive CPUs distributed out to local or remote sub-partitions of + * top_cpuset */ static cpumask_var_t subpartitions_cpus; @@ -86,7 +93,6 @@ static struct list_head remote_children; * A flag to force sched domain rebuild at the end of an operation. * It can be set in * - update_partition_sd_lb() - * - remote_partition_check() * - update_cpumasks_hier() * - cpuset_update_flag() * - cpuset_hotplug_update_tasks() @@ -186,6 +192,20 @@ static inline void notify_partition_change(struct cpuset *cs, int old_prs) WRITE_ONCE(cs->prs_err, PERR_NONE); } +/* + * The top_cpuset is always synchronized to cpu_active_mask and we should avoid + * using cpu_online_mask as much as possible. An active CPU is always an online + * CPU, but not vice versa. cpu_active_mask and cpu_online_mask can differ + * during hotplug operations. A CPU is marked active at the last stage of CPU + * bringup (CPUHP_AP_ACTIVE). It is also the stage where cpuset hotplug code + * will be called to update the sched domains so that the scheduler can move + * a normal task to a newly active CPU or remove tasks away from a newly + * inactivated CPU. The online bit is set much earlier in the CPU bringup + * process and cleared much later in CPU teardown. + * + * If cpu_online_mask is used while a hotunplug operation is happening in + * parallel, we may leave an offline CPU in cpu_allowed or some other masks. + */ static struct cpuset top_cpuset = { .flags = BIT(CS_ONLINE) | BIT(CS_CPU_EXCLUSIVE) | BIT(CS_MEM_EXCLUSIVE) | BIT(CS_SCHED_LOAD_BALANCE), @@ -349,18 +369,18 @@ static inline bool partition_is_populated(struct cpuset *cs, * appropriate cpus. * * One way or another, we guarantee to return some non-empty subset - * of cpu_online_mask. + * of cpu_active_mask. * * Call with callback_lock or cpuset_mutex held. 
*/ -static void guarantee_online_cpus(struct task_struct *tsk, +static void guarantee_active_cpus(struct task_struct *tsk, struct cpumask *pmask) { const struct cpumask *possible_mask = task_cpu_possible_mask(tsk); struct cpuset *cs; - if (WARN_ON(!cpumask_and(pmask, possible_mask, cpu_online_mask))) - cpumask_copy(pmask, cpu_online_mask); + if (WARN_ON(!cpumask_and(pmask, possible_mask, cpu_active_mask))) + cpumask_copy(pmask, cpu_active_mask); rcu_read_lock(); cs = task_cs(tsk); @@ -1089,9 +1109,14 @@ void cpuset_reset_sched_domains(void) * * Iterate through each task of @cs updating its cpus_allowed to the * effective cpuset's. As this function is called with cpuset_mutex held, - * cpuset membership stays stable. For top_cpuset, task_cpu_possible_mask() - * is used instead of effective_cpus to make sure all offline CPUs are also - * included as hotplug code won't update cpumasks for tasks in top_cpuset. + * cpuset membership stays stable. + * + * For top_cpuset, task_cpu_possible_mask() is used instead of effective_cpus + * to make sure all offline CPUs are also included as hotplug code won't + * update cpumasks for tasks in top_cpuset. + * + * As task_cpu_possible_mask() can be task dependent in arm64, we have to + * do cpu masking per task instead of doing it once for all. */ void cpuset_update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus) { @@ -1105,9 +1130,11 @@ void cpuset_update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus) if (top_cs) { /* - * Percpu kthreads in top_cpuset are ignored + * PF_NO_SETAFFINITY tasks are ignored. + * All per cpu kthreads should have PF_NO_SETAFFINITY + * flag set, see kthread_set_per_cpu(). 
*/ - if (kthread_is_per_cpu(task)) + if (task->flags & PF_NO_SETAFFINITY) continue; cpumask_andnot(new_cpus, possible_mask, subpartitions_cpus); } else { @@ -1151,7 +1178,7 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, * * Return: 0 if successful, an error code otherwise */ -static int update_partition_exclusive(struct cpuset *cs, int new_prs) +static int update_partition_exclusive_flag(struct cpuset *cs, int new_prs) { bool exclusive = (new_prs > PRS_MEMBER); @@ -1234,12 +1261,12 @@ static void reset_partition_data(struct cpuset *cs) } /* - * partition_xcpus_newstate - Exclusive CPUs state change + * isolated_cpus_update - Update the isolated_cpus mask * @old_prs: old partition_root_state * @new_prs: new partition_root_state * @xcpus: exclusive CPUs with state change */ -static void partition_xcpus_newstate(int old_prs, int new_prs, struct cpumask *xcpus) +static void isolated_cpus_update(int old_prs, int new_prs, struct cpumask *xcpus) { WARN_ON_ONCE(old_prs == new_prs); if (new_prs == PRS_ISOLATED) @@ -1273,8 +1300,8 @@ static bool partition_xcpus_add(int new_prs, struct cpuset *parent, isolcpus_updated = (new_prs != parent->partition_root_state); if (isolcpus_updated) - partition_xcpus_newstate(parent->partition_root_state, new_prs, - xcpus); + isolated_cpus_update(parent->partition_root_state, new_prs, + xcpus); cpumask_andnot(parent->effective_cpus, parent->effective_cpus, xcpus); return isolcpus_updated; @@ -1304,8 +1331,8 @@ static bool partition_xcpus_del(int old_prs, struct cpuset *parent, isolcpus_updated = (old_prs != parent->partition_root_state); if (isolcpus_updated) - partition_xcpus_newstate(old_prs, parent->partition_root_state, - xcpus); + isolated_cpus_update(old_prs, parent->partition_root_state, + xcpus); cpumask_and(xcpus, xcpus, cpu_active_mask); cpumask_or(parent->effective_cpus, parent->effective_cpus, xcpus); @@ -1340,20 +1367,55 @@ EXPORT_SYMBOL_GPL(cpuset_cpu_is_isolated); * 
compute_effective_exclusive_cpumask - compute effective exclusive CPUs * @cs: cpuset * @xcpus: effective exclusive CPUs value to be set - * Return: true if xcpus is not empty, false otherwise. + * @real_cs: the real cpuset (can be NULL) + * Return: 0 if there is no sibling conflict, > 0 otherwise * - * Starting with exclusive_cpus (cpus_allowed if exclusive_cpus is not set), - * it must be a subset of parent's effective_xcpus. + * If exclusive_cpus isn't explicitly set or a real_cs is provided, we have to + * scan the sibling cpusets and exclude their exclusive_cpus or effective_xcpus + * as well. The provision of real_cs means that a cpumask is being changed and + * the given cs is a trial one. */ -static bool compute_effective_exclusive_cpumask(struct cpuset *cs, - struct cpumask *xcpus) +static int compute_effective_exclusive_cpumask(struct cpuset *cs, + struct cpumask *xcpus, + struct cpuset *real_cs) { + struct cgroup_subsys_state *css; struct cpuset *parent = parent_cs(cs); + struct cpuset *sibling; + int retval = 0; if (!xcpus) xcpus = cs->effective_xcpus; - return cpumask_and(xcpus, user_xcpus(cs), parent->effective_xcpus); + cpumask_and(xcpus, user_xcpus(cs), parent->effective_xcpus); + + if (!real_cs) { + if (!cpumask_empty(cs->exclusive_cpus)) + return 0; + } else { + cs = real_cs; + } + + /* + * Exclude exclusive CPUs from siblings + */ + rcu_read_lock(); + cpuset_for_each_child(sibling, css, parent) { + if (sibling == cs) + continue; + + if (cpumask_intersects(xcpus, sibling->exclusive_cpus)) { + cpumask_andnot(xcpus, xcpus, sibling->exclusive_cpus); + retval++; + continue; + } + if (cpumask_intersects(xcpus, sibling->effective_xcpus)) { + cpumask_andnot(xcpus, xcpus, sibling->effective_xcpus); + retval++; + } + } + rcu_read_unlock(); + return retval; } static inline bool is_remote_partition(struct cpuset *cs) @@ -1391,21 +1453,26 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs, * The requested exclusive_cpus must not be allocated 
to other * partitions and it can't use up all the root's effective_cpus. * - * Note that if there is any local partition root above it or - * remote partition root underneath it, its exclusive_cpus must - * have overlapped with subpartitions_cpus. + * The effective_xcpus mask can contain offline CPUs, but there must + * be at least one or more online CPUs present before it can be enabled. + * + * Note that creating a remote partition with any local partition root + * above it or remote partition root underneath it is not allowed. */ - compute_effective_exclusive_cpumask(cs, tmp->new_cpus); - if (cpumask_empty(tmp->new_cpus) || - cpumask_intersects(tmp->new_cpus, subpartitions_cpus) || + compute_effective_exclusive_cpumask(cs, tmp->new_cpus, NULL); + WARN_ON_ONCE(cpumask_intersects(tmp->new_cpus, subpartitions_cpus)); + if (!cpumask_intersects(tmp->new_cpus, cpu_active_mask) || cpumask_subset(top_cpuset.effective_cpus, tmp->new_cpus)) return PERR_INVCPUS; spin_lock_irq(&callback_lock); isolcpus_updated = partition_xcpus_add(new_prs, NULL, tmp->new_cpus); list_add(&cs->remote_sibling, &remote_children); + cpumask_copy(cs->effective_xcpus, tmp->new_cpus); spin_unlock_irq(&callback_lock); update_unbound_workqueue_cpumask(isolcpus_updated); + cpuset_force_rebuild(); + cs->prs_err = 0; /* * Propagate changes in top_cpuset's effective_cpus down the hierarchy. 
@@ -1428,20 +1495,24 @@ static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp) { bool isolcpus_updated; - compute_effective_exclusive_cpumask(cs, tmp->new_cpus); WARN_ON_ONCE(!is_remote_partition(cs)); - WARN_ON_ONCE(!cpumask_subset(tmp->new_cpus, subpartitions_cpus)); + WARN_ON_ONCE(!cpumask_subset(cs->effective_xcpus, subpartitions_cpus)); spin_lock_irq(&callback_lock); list_del_init(&cs->remote_sibling); isolcpus_updated = partition_xcpus_del(cs->partition_root_state, - NULL, tmp->new_cpus); - cs->partition_root_state = -cs->partition_root_state; - if (!cs->prs_err) - cs->prs_err = PERR_INVCPUS; + NULL, cs->effective_xcpus); + if (cs->prs_err) + cs->partition_root_state = -cs->partition_root_state; + else + cs->partition_root_state = PRS_MEMBER; + + /* effective_xcpus may need to be changed */ + compute_effective_exclusive_cpumask(cs, NULL, NULL); reset_partition_data(cs); spin_unlock_irq(&callback_lock); update_unbound_workqueue_cpumask(isolcpus_updated); + cpuset_force_rebuild(); /* * Propagate changes in top_cpuset's effective_cpus down the hierarchy. @@ -1453,14 +1524,15 @@ static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp) /* * remote_cpus_update - cpus_exclusive change of remote partition * @cs: the cpuset to be updated - * @newmask: the new effective_xcpus mask + * @xcpus: the new exclusive_cpus mask, if non-NULL + * @excpus: the new effective_xcpus mask * @tmp: temporary masks * * top_cpuset and subpartitions_cpus will be updated or partition can be * invalidated. 
*/ -static void remote_cpus_update(struct cpuset *cs, struct cpumask *newmask, - struct tmpmasks *tmp) +static void remote_cpus_update(struct cpuset *cs, struct cpumask *xcpus, + struct cpumask *excpus, struct tmpmasks *tmp) { bool adding, deleting; int prs = cs->partition_root_state; @@ -1471,29 +1543,46 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *newmask, WARN_ON_ONCE(!cpumask_subset(cs->effective_xcpus, subpartitions_cpus)); - if (cpumask_empty(newmask)) + if (cpumask_empty(excpus)) { + cs->prs_err = PERR_CPUSEMPTY; goto invalidate; + } - adding = cpumask_andnot(tmp->addmask, newmask, cs->effective_xcpus); - deleting = cpumask_andnot(tmp->delmask, cs->effective_xcpus, newmask); + adding = cpumask_andnot(tmp->addmask, excpus, cs->effective_xcpus); + deleting = cpumask_andnot(tmp->delmask, cs->effective_xcpus, excpus); /* * Additions of remote CPUs is only allowed if those CPUs are * not allocated to other partitions and there are effective_cpus * left in the top cpuset. */ - if (adding && (!capable(CAP_SYS_ADMIN) || - cpumask_intersects(tmp->addmask, subpartitions_cpus) || - cpumask_subset(top_cpuset.effective_cpus, tmp->addmask))) - goto invalidate; + if (adding) { + WARN_ON_ONCE(cpumask_intersects(tmp->addmask, subpartitions_cpus)); + if (!capable(CAP_SYS_ADMIN)) + cs->prs_err = PERR_ACCESS; + else if (cpumask_intersects(tmp->addmask, subpartitions_cpus) || + cpumask_subset(top_cpuset.effective_cpus, tmp->addmask)) + cs->prs_err = PERR_NOCPUS; + if (cs->prs_err) + goto invalidate; + } spin_lock_irq(&callback_lock); if (adding) isolcpus_updated += partition_xcpus_add(prs, NULL, tmp->addmask); if (deleting) isolcpus_updated += partition_xcpus_del(prs, NULL, tmp->delmask); + /* + * Need to update effective_xcpus and exclusive_cpus now as + * update_sibling_cpumasks() below may iterate back to the same cs. 
+ */ + cpumask_copy(cs->effective_xcpus, excpus); + if (xcpus) + cpumask_copy(cs->exclusive_cpus, xcpus); spin_unlock_irq(&callback_lock); update_unbound_workqueue_cpumask(isolcpus_updated); + if (adding || deleting) + cpuset_force_rebuild(); /* * Propagate changes in top_cpuset's effective_cpus down the hierarchy. @@ -1507,47 +1596,6 @@ invalidate: } /* - * remote_partition_check - check if a child remote partition needs update - * @cs: the cpuset to be updated - * @newmask: the new effective_xcpus mask - * @delmask: temporary mask for deletion (not in tmp) - * @tmp: temporary masks - * - * This should be called before the given cs has updated its cpus_allowed - * and/or effective_xcpus. - */ -static void remote_partition_check(struct cpuset *cs, struct cpumask *newmask, - struct cpumask *delmask, struct tmpmasks *tmp) -{ - struct cpuset *child, *next; - int disable_cnt = 0; - - /* - * Compute the effective exclusive CPUs that will be deleted. - */ - if (!cpumask_andnot(delmask, cs->effective_xcpus, newmask) || - !cpumask_intersects(delmask, subpartitions_cpus)) - return; /* No deletion of exclusive CPUs in partitions */ - - /* - * Searching the remote children list to look for those that will - * be impacted by the deletion of exclusive CPUs. - * - * Since a cpuset must be removed from the remote children list - * before it can go offline and holding cpuset_mutex will prevent - * any change in cpuset status. RCU read lock isn't needed. 
- */ - lockdep_assert_held(&cpuset_mutex); - list_for_each_entry_safe(child, next, &remote_children, remote_sibling) - if (cpumask_intersects(child->effective_cpus, delmask)) { - remote_partition_disable(child, tmp); - disable_cnt++; - } - if (disable_cnt) - cpuset_force_rebuild(); -} - -/* * prstate_housekeeping_conflict - check for partition & housekeeping conflicts * @prstate: partition root state to be checked * @new_cpus: cpu mask @@ -1601,7 +1649,7 @@ static bool prstate_housekeeping_conflict(int prstate, struct cpumask *new_cpus) * The partcmd_update command is used by update_cpumasks_hier() with newmask * NULL and update_cpumask() with newmask set. The partcmd_invalidate is used * by update_cpumask() with NULL newmask. In both cases, the callers won't - * check for error and so partition_root_state and prs_error will be updated + * check for error and so partition_root_state and prs_err will be updated * directly. */ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, @@ -1614,11 +1662,12 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, int old_prs, new_prs; int part_error = PERR_NONE; /* Partition error? 
*/ int subparts_delta = 0; - struct cpumask *xcpus; /* cs effective_xcpus */ int isolcpus_updated = 0; + struct cpumask *xcpus = user_xcpus(cs); bool nocpu; lockdep_assert_held(&cpuset_mutex); + WARN_ON_ONCE(is_remote_partition(cs)); /* For local partition only */ /* * new_prs will only be changed for the partcmd_update and @@ -1626,7 +1675,6 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, */ adding = deleting = false; old_prs = new_prs = cs->partition_root_state; - xcpus = user_xcpus(cs); if (cmd == partcmd_invalidate) { if (is_prs_invalid(old_prs)) @@ -1661,12 +1709,19 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, if ((cmd == partcmd_enable) || (cmd == partcmd_enablei)) { /* + * Need to call compute_effective_exclusive_cpumask() in case + * exclusive_cpus not set. Sibling conflict should only happen + * if exclusive_cpus isn't set. + */ + xcpus = tmp->delmask; + if (compute_effective_exclusive_cpumask(cs, xcpus, NULL)) + WARN_ON_ONCE(!cpumask_empty(cs->exclusive_cpus)); + + /* * Enabling partition root is not allowed if its - * effective_xcpus is empty or doesn't overlap with - * parent's effective_xcpus. + * effective_xcpus is empty. */ - if (cpumask_empty(xcpus) || - !cpumask_intersects(xcpus, parent->effective_xcpus)) + if (cpumask_empty(xcpus)) return PERR_INVCPUS; if (prstate_housekeeping_conflict(new_prs, xcpus)) @@ -1679,19 +1734,33 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, if (nocpu) return PERR_NOCPUS; - cpumask_copy(tmp->delmask, xcpus); + /* + * This function will only be called when all the preliminary + * checks have passed. At this point, the following condition + * should hold. + * + * (cs->effective_xcpus & cpu_active_mask) ⊆ parent->effective_cpus + * + * Warn if it is not the case. 
+ */ + cpumask_and(tmp->new_cpus, xcpus, cpu_active_mask); + WARN_ON_ONCE(!cpumask_subset(tmp->new_cpus, parent->effective_cpus)); + deleting = true; subparts_delta++; new_prs = (cmd == partcmd_enable) ? PRS_ROOT : PRS_ISOLATED; } else if (cmd == partcmd_disable) { /* - * May need to add cpus to parent's effective_cpus for - * valid partition root. + * May need to add cpus back to parent's effective_cpus + * (and maybe removed from subpartitions_cpus/isolated_cpus) + * for valid partition root. xcpus may contain CPUs that + * shouldn't be removed from the two global cpumasks. */ - adding = !is_prs_invalid(old_prs) && - cpumask_and(tmp->addmask, xcpus, parent->effective_xcpus); - if (adding) + if (is_partition_valid(cs)) { + cpumask_copy(tmp->addmask, cs->effective_xcpus); + adding = true; subparts_delta--; + } new_prs = PRS_MEMBER; } else if (newmask) { /* @@ -1701,6 +1770,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, part_error = PERR_CPUSEMPTY; goto write_error; } + /* Check newmask again, whether cpus are available for parent/cs */ nocpu |= tasks_nocpu_error(parent, cs, newmask); @@ -1732,6 +1802,15 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, parent->effective_xcpus); } /* + * The new CPUs to be removed from parent's effective CPUs + * must be present. + */ + if (deleting) { + cpumask_and(tmp->new_cpus, tmp->delmask, cpu_active_mask); + WARN_ON_ONCE(!cpumask_subset(tmp->new_cpus, parent->effective_cpus)); + } + + /* * Make partition invalid if parent's effective_cpus could * become empty and there are tasks in the parent. */ @@ -1829,7 +1908,7 @@ write_error: * CPU lists in cs haven't been updated yet. So defer it to later. 
*/ if ((old_prs != new_prs) && (cmd != partcmd_update)) { - int err = update_partition_exclusive(cs, new_prs); + int err = update_partition_exclusive_flag(cs, new_prs); if (err) return err; @@ -1867,7 +1946,7 @@ write_error: update_unbound_workqueue_cpumask(isolcpus_updated); if ((old_prs != new_prs) && (cmd == partcmd_update)) - update_partition_exclusive(cs, new_prs); + update_partition_exclusive_flag(cs, new_prs); if (adding || deleting) { cpuset_update_tasks_cpumask(parent, tmp->addmask); @@ -1917,7 +1996,7 @@ static void compute_partition_effective_cpumask(struct cpuset *cs, * 2) All the effective_cpus will be used up and cp * has tasks */ - compute_effective_exclusive_cpumask(cs, new_ecpus); + compute_effective_exclusive_cpumask(cs, new_ecpus, NULL); cpumask_and(new_ecpus, new_ecpus, cpu_active_mask); rcu_read_lock(); @@ -1925,6 +2004,11 @@ static void compute_partition_effective_cpumask(struct cpuset *cs, if (!is_partition_valid(child)) continue; + /* + * There shouldn't be a remote partition underneath another + * partition root. + */ + WARN_ON_ONCE(is_remote_partition(child)); child->prs_err = 0; if (!cpumask_subset(child->effective_xcpus, cs->effective_xcpus)) @@ -1980,32 +2064,39 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp, bool remote = is_remote_partition(cp); bool update_parent = false; + old_prs = new_prs = cp->partition_root_state; + /* - * Skip descendent remote partition that acquires CPUs - * directly from top cpuset unless it is cs. + * For child remote partition root (!= cs), we need to call + * remote_cpus_update() if effective_xcpus will be changed. + * Otherwise, we can skip the whole subtree. + * + * remote_cpus_update() will reuse tmp->new_cpus only after + * its value is being processed. 
*/ if (remote && (cp != cs)) { - pos_css = css_rightmost_descendant(pos_css); - continue; - } + compute_effective_exclusive_cpumask(cp, tmp->new_cpus, NULL); + if (cpumask_equal(cp->effective_xcpus, tmp->new_cpus)) { + pos_css = css_rightmost_descendant(pos_css); + continue; + } + rcu_read_unlock(); + remote_cpus_update(cp, NULL, tmp->new_cpus, tmp); + rcu_read_lock(); - /* - * Update effective_xcpus if exclusive_cpus set. - * The case when exclusive_cpus isn't set is handled later. - */ - if (!cpumask_empty(cp->exclusive_cpus) && (cp != cs)) { - spin_lock_irq(&callback_lock); - compute_effective_exclusive_cpumask(cp, NULL); - spin_unlock_irq(&callback_lock); + /* Remote partition may be invalidated */ + new_prs = cp->partition_root_state; + remote = (new_prs == old_prs); } - old_prs = new_prs = cp->partition_root_state; - if (remote || (is_partition_valid(parent) && - is_partition_valid(cp))) + if (remote || (is_partition_valid(parent) && is_partition_valid(cp))) compute_partition_effective_cpumask(cp, tmp->new_cpus); else compute_effective_cpumask(tmp->new_cpus, cp, parent); + if (remote) + goto get_css; /* Ready to update cpuset data */ + /* * A partition with no effective_cpus is allowed as long as * there is no task associated with it. Call @@ -2025,9 +2116,6 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp, if (is_in_v2_mode() && !remote && cpumask_empty(tmp->new_cpus)) cpumask_copy(tmp->new_cpus, parent->effective_cpus); - if (remote) - goto get_css; - /* * Skip the whole subtree if * 1) the cpumask remains the same, @@ -2088,6 +2176,9 @@ get_css: spin_lock_irq(&callback_lock); cpumask_copy(cp->effective_cpus, tmp->new_cpus); cp->partition_root_state = new_prs; + if (!cpumask_empty(cp->exclusive_cpus) && (cp != cs)) + compute_effective_exclusive_cpumask(cp, NULL, NULL); + /* * Make sure effective_xcpus is properly set for a valid * partition root. 
@@ -2174,7 +2265,14 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, parent); if (cpumask_equal(tmp->new_cpus, sibling->effective_cpus)) continue; + } else if (is_remote_partition(sibling)) { + /* + * Change in a sibling cpuset won't affect a remote + * partition root. + */ + continue; } + if (!css_tryget_online(&sibling->css)) continue; @@ -2202,7 +2300,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, bool force = false; int old_prs = cs->partition_root_state; - /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */ + /* top_cpuset.cpus_allowed tracks cpu_active_mask; it's read-only */ if (cs == &top_cpuset) return -EACCES; @@ -2231,8 +2329,9 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, * trialcs->effective_xcpus is used as a temporary cpumask * for checking validity of the partition root. */ + trialcs->partition_root_state = PRS_MEMBER; if (!cpumask_empty(trialcs->exclusive_cpus) || is_partition_valid(cs)) - compute_effective_exclusive_cpumask(trialcs, NULL); + compute_effective_exclusive_cpumask(trialcs, NULL, cs); } /* Nothing to do if the cpus didn't change */ @@ -2305,19 +2404,13 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, * Call remote_cpus_update() to handle valid remote partition */ if (is_remote_partition(cs)) - remote_cpus_update(cs, xcpus, &tmp); + remote_cpus_update(cs, NULL, xcpus, &tmp); else if (invalidate) update_parent_effective_cpumask(cs, partcmd_invalidate, NULL, &tmp); else update_parent_effective_cpumask(cs, partcmd_update, xcpus, &tmp); - } else if (!cpumask_empty(cs->exclusive_cpus)) { - /* - * Use trialcs->effective_cpus as a temp cpumask - */ - remote_partition_check(cs, trialcs->effective_xcpus, - trialcs->effective_cpus, &tmp); } spin_lock_irq(&callback_lock); @@ -2369,8 +2462,15 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (cpumask_equal(cs->exclusive_cpus, 
trialcs->exclusive_cpus)) return 0; - if (*buf) - compute_effective_exclusive_cpumask(trialcs, NULL); + if (*buf) { + trialcs->partition_root_state = PRS_MEMBER; + /* + * Reject the change if there is exclusive CPUs conflict with + * the siblings. + */ + if (compute_effective_exclusive_cpumask(trialcs, NULL, cs)) + return -EINVAL; + } /* * Check all the descendants in update_cpumasks_hier() if @@ -2401,8 +2501,8 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (invalidate) remote_partition_disable(cs, &tmp); else - remote_cpus_update(cs, trialcs->effective_xcpus, - &tmp); + remote_cpus_update(cs, trialcs->exclusive_cpus, + trialcs->effective_xcpus, &tmp); } else if (invalidate) { update_parent_effective_cpumask(cs, partcmd_invalidate, NULL, &tmp); @@ -2410,12 +2510,6 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs, update_parent_effective_cpumask(cs, partcmd_update, trialcs->effective_xcpus, &tmp); } - } else if (!cpumask_empty(trialcs->exclusive_cpus)) { - /* - * Use trialcs->effective_cpus as a temp cpumask - */ - remote_partition_check(cs, trialcs->effective_xcpus, - trialcs->effective_cpus, &tmp); } spin_lock_irq(&callback_lock); cpumask_copy(cs->exclusive_cpus, trialcs->exclusive_cpus); @@ -2782,7 +2876,7 @@ static int update_prstate(struct cpuset *cs, int new_prs) int err = PERR_NONE, old_prs = cs->partition_root_state; struct cpuset *parent = parent_cs(cs); struct tmpmasks tmpmask; - bool new_xcpus_state = false; + bool isolcpus_updated = false; if (old_prs == new_prs) return 0; @@ -2796,18 +2890,7 @@ static int update_prstate(struct cpuset *cs, int new_prs) if (alloc_cpumasks(NULL, &tmpmask)) return -ENOMEM; - /* - * Setup effective_xcpus if not properly set yet, it will be cleared - * later if partition becomes invalid. 
- */ - if ((new_prs > 0) && cpumask_empty(cs->exclusive_cpus)) { - spin_lock_irq(&callback_lock); - cpumask_and(cs->effective_xcpus, - cs->cpus_allowed, parent->effective_xcpus); - spin_unlock_irq(&callback_lock); - } - - err = update_partition_exclusive(cs, new_prs); + err = update_partition_exclusive_flag(cs, new_prs); if (err) goto out; @@ -2821,6 +2904,19 @@ static int update_prstate(struct cpuset *cs, int new_prs) } /* + * We don't support the creation of a new local partition with + * a remote partition underneath it. This unsupported + * setting can happen only if parent is the top_cpuset because + * a remote partition cannot be created underneath an existing + * local or remote partition. + */ + if ((parent == &top_cpuset) && + cpumask_intersects(cs->exclusive_cpus, subpartitions_cpus)) { + err = PERR_REMOTE; + goto out; + } + + /* * If parent is valid partition, enable local partiion. * Otherwise, enable a remote partition. */ @@ -2835,8 +2931,9 @@ static int update_prstate(struct cpuset *cs, int new_prs) } else if (old_prs && new_prs) { /* * A change in load balance state only, no change in cpumasks. + * Need to update isolated_cpus. 
*/ - new_xcpus_state = true; + isolcpus_updated = true; } else { /* * Switching back to member is always allowed even if it @@ -2860,7 +2957,7 @@ out: */ if (err) { new_prs = -new_prs; - update_partition_exclusive(cs, new_prs); + update_partition_exclusive_flag(cs, new_prs); } spin_lock_irq(&callback_lock); @@ -2868,14 +2965,18 @@ out: WRITE_ONCE(cs->prs_err, err); if (!is_partition_valid(cs)) reset_partition_data(cs); - else if (new_xcpus_state) - partition_xcpus_newstate(old_prs, new_prs, cs->effective_xcpus); + else if (isolcpus_updated) + isolated_cpus_update(old_prs, new_prs, cs->effective_xcpus); spin_unlock_irq(&callback_lock); - update_unbound_workqueue_cpumask(new_xcpus_state); + update_unbound_workqueue_cpumask(isolcpus_updated); - /* Force update if switching back to member */ + /* Force update if switching back to member & update effective_xcpus */ update_cpumasks_hier(cs, &tmpmask, !new_prs); + /* A newly created partition must have effective_xcpus set */ + WARN_ON_ONCE(!old_prs && (new_prs > 0) + && cpumask_empty(cs->effective_xcpus)); + /* Update sched domains and load balance flag */ update_partition_sd_lb(cs, old_prs); @@ -3018,7 +3119,7 @@ static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task) lockdep_assert_held(&cpuset_mutex); if (cs != &top_cpuset) - guarantee_online_cpus(task, cpus_attach); + guarantee_active_cpus(task, cpus_attach); else cpumask_andnot(cpus_attach, task_cpu_possible_mask(task), subpartitions_cpus); @@ -3208,7 +3309,7 @@ int cpuset_common_seq_show(struct seq_file *sf, void *v) return ret; } -static int sched_partition_show(struct seq_file *seq, void *v) +static int cpuset_partition_show(struct seq_file *seq, void *v) { struct cpuset *cs = css_cs(seq_css(seq)); const char *err, *type = NULL; @@ -3239,7 +3340,7 @@ static int sched_partition_show(struct seq_file *seq, void *v) return 0; } -static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf, +static ssize_t 
cpuset_partition_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct cpuset *cs = css_cs(of_css(of)); @@ -3260,11 +3361,8 @@ static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf, css_get(&cs->css); cpus_read_lock(); mutex_lock(&cpuset_mutex); - if (!is_cpuset_online(cs)) - goto out_unlock; - - retval = update_prstate(cs, val); -out_unlock: + if (is_cpuset_online(cs)) + retval = update_prstate(cs, val); mutex_unlock(&cpuset_mutex); cpus_read_unlock(); css_put(&cs->css); @@ -3308,8 +3406,8 @@ static struct cftype dfl_files[] = { { .name = "cpus.partition", - .seq_show = sched_partition_show, - .write = sched_partition_write, + .seq_show = cpuset_partition_show, + .write = cpuset_partition_write, .private = FILE_PARTITION_ROOT, .flags = CFTYPE_NOT_ON_ROOT, .file_offset = offsetof(struct cpuset, partition_file), @@ -3463,11 +3561,7 @@ out_unlock: * will call rebuild_sched_domains_locked(). That is not needed * in the default hierarchy where only changes in partition * will cause repartitioning. - * - * If the cpuset has the 'sched.partition' flag enabled, simulate - * turning 'sched.partition" off. */ - static void cpuset_css_offline(struct cgroup_subsys_state *css) { struct cpuset *cs = css_cs(css); @@ -3475,9 +3569,6 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css) cpus_read_lock(); mutex_lock(&cpuset_mutex); - if (is_partition_valid(cs)) - update_prstate(cs, 0); - if (!cpuset_v2() && is_sched_load_balance(cs)) cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); @@ -3488,6 +3579,27 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css) cpus_read_unlock(); } +/* + * If a dying cpuset has the 'cpus.partition' enabled, turn it off by + * changing it back to member to free its exclusive CPUs back to the pool to + * be used by other online cpusets. 
+ */ +static void cpuset_css_killed(struct cgroup_subsys_state *css) +{ + struct cpuset *cs = css_cs(css); + + cpus_read_lock(); + mutex_lock(&cpuset_mutex); + + /* Reset valid partition back to member */ + if (is_partition_valid(cs)) + update_prstate(cs, PRS_MEMBER); + + mutex_unlock(&cpuset_mutex); + cpus_read_unlock(); + +} + static void cpuset_css_free(struct cgroup_subsys_state *css) { struct cpuset *cs = css_cs(css); @@ -3609,6 +3721,7 @@ struct cgroup_subsys cpuset_cgrp_subsys = { .css_alloc = cpuset_css_alloc, .css_online = cpuset_css_online, .css_offline = cpuset_css_offline, + .css_killed = cpuset_css_killed, .css_free = cpuset_css_free, .can_attach = cpuset_can_attach, .cancel_attach = cpuset_cancel_attach, @@ -3739,10 +3852,10 @@ retry: if (remote && cpumask_empty(&new_cpus) && partition_is_populated(cs, NULL)) { + cs->prs_err = PERR_HOTPLUG; remote_partition_disable(cs, tmp); compute_effective_cpumask(&new_cpus, cs, parent); remote = false; - cpuset_force_rebuild(); } /* @@ -3951,7 +4064,7 @@ void __init cpuset_init_smp(void) * * Description: Returns the cpumask_var_t cpus_allowed of the cpuset * attached to the specified @tsk. Guaranteed to return some non-empty - * subset of cpu_online_mask, even if this means going outside the + * subset of cpu_active_mask, even if this means going outside the * tasks cpuset, except when the task is in the top cpuset. **/ @@ -3965,7 +4078,7 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) cs = task_cs(tsk); if (cs != &top_cpuset) - guarantee_online_cpus(tsk, pmask); + guarantee_active_cpus(tsk, pmask); /* * Tasks in the top cpuset won't get update to their cpumasks * when a hotplug online/offline event happens. So we include all @@ -3979,7 +4092,7 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) * allowable online cpu left, we fall back to all possible cpus. 
*/ cpumask_andnot(pmask, possible_mask, subpartitions_cpus); - if (!cpumask_intersects(pmask, cpu_online_mask)) + if (!cpumask_intersects(pmask, cpu_active_mask)) cpumask_copy(pmask, possible_mask); } diff --git a/kernel/cgroup/misc.c b/kernel/cgroup/misc.c index 2fa3a4fb2aaf..6a01d91ea4cb 100644 --- a/kernel/cgroup/misc.c +++ b/kernel/cgroup/misc.c @@ -24,6 +24,10 @@ static const char *const misc_res_name[] = { /* AMD SEV-ES ASIDs resource */ "sev_es", #endif +#ifdef CONFIG_INTEL_TDX_HOST + /* Intel TDX HKIDs resource */ + "tdx", +#endif }; /* Root misc cgroup */ diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 4bb587d5d34f..ce4752ab9e09 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -9,18 +9,52 @@ #include <trace/events/cgroup.h> -static DEFINE_SPINLOCK(cgroup_rstat_lock); -static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock); +static DEFINE_SPINLOCK(rstat_base_lock); +static DEFINE_PER_CPU(raw_spinlock_t, rstat_base_cpu_lock); static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu); -static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu) +/* + * Determines whether a given css can participate in rstat. + * css's that are cgroup::self use rstat for base stats. + * Other css's associated with a subsystem use rstat only when + * they define the ss->css_rstat_flush callback. 
+ */ +static inline bool css_uses_rstat(struct cgroup_subsys_state *css) +{ + return css_is_self(css) || css->ss->css_rstat_flush != NULL; +} + +static struct css_rstat_cpu *css_rstat_cpu( + struct cgroup_subsys_state *css, int cpu) +{ + return per_cpu_ptr(css->rstat_cpu, cpu); +} + +static struct cgroup_rstat_base_cpu *cgroup_rstat_base_cpu( + struct cgroup *cgrp, int cpu) { - return per_cpu_ptr(cgrp->rstat_cpu, cpu); + return per_cpu_ptr(cgrp->rstat_base_cpu, cpu); +} + +static spinlock_t *ss_rstat_lock(struct cgroup_subsys *ss) +{ + if (ss) + return &ss->rstat_ss_lock; + + return &rstat_base_lock; +} + +static raw_spinlock_t *ss_rstat_cpu_lock(struct cgroup_subsys *ss, int cpu) +{ + if (ss) + return per_cpu_ptr(ss->rstat_ss_cpu_lock, cpu); + + return per_cpu_ptr(&rstat_base_cpu_lock, cpu); } /* - * Helper functions for rstat per CPU lock (cgroup_rstat_cpu_lock). + * Helper functions for rstat per CPU locks. * * This makes it easier to diagnose locking issues and contention in * production environments. The parameter @fast_path determine the @@ -28,20 +62,23 @@ static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu) * operations without handling high-frequency fast-path "update" events. */ static __always_inline -unsigned long _cgroup_rstat_cpu_lock(raw_spinlock_t *cpu_lock, int cpu, - struct cgroup *cgrp, const bool fast_path) +unsigned long _css_rstat_cpu_lock(struct cgroup_subsys_state *css, int cpu, + const bool fast_path) { + struct cgroup *cgrp = css->cgroup; + raw_spinlock_t *cpu_lock; unsigned long flags; bool contended; /* - * The _irqsave() is needed because cgroup_rstat_lock is - * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring - * this lock with the _irq() suffix only disables interrupts on - * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables - * interrupts on both configurations. The _irqsave() ensures - * that interrupts are always disabled and later restored. 
+ * The _irqsave() is needed because the locks used for flushing are + * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring this lock + * with the _irq() suffix only disables interrupts on a non-PREEMPT_RT + * kernel. The raw_spinlock_t below disables interrupts on both + * configurations. The _irqsave() ensures that interrupts are always + * disabled and later restored. */ + cpu_lock = ss_rstat_cpu_lock(css->ss, cpu); contended = !raw_spin_trylock_irqsave(cpu_lock, flags); if (contended) { if (fast_path) @@ -61,50 +98,59 @@ unsigned long _cgroup_rstat_cpu_lock(raw_spinlock_t *cpu_lock, int cpu, } static __always_inline -void _cgroup_rstat_cpu_unlock(raw_spinlock_t *cpu_lock, int cpu, - struct cgroup *cgrp, unsigned long flags, - const bool fast_path) +void _css_rstat_cpu_unlock(struct cgroup_subsys_state *css, int cpu, + unsigned long flags, const bool fast_path) { + struct cgroup *cgrp = css->cgroup; + raw_spinlock_t *cpu_lock; + if (fast_path) trace_cgroup_rstat_cpu_unlock_fastpath(cgrp, cpu, false); else trace_cgroup_rstat_cpu_unlock(cgrp, cpu, false); + cpu_lock = ss_rstat_cpu_lock(css->ss, cpu); raw_spin_unlock_irqrestore(cpu_lock, flags); } /** - * cgroup_rstat_updated - keep track of updated rstat_cpu - * @cgrp: target cgroup + * css_rstat_updated - keep track of updated rstat_cpu + * @css: target cgroup subsystem state * @cpu: cpu on which rstat_cpu was updated * - * @cgrp's rstat_cpu on @cpu was updated. Put it on the parent's matching - * rstat_cpu->updated_children list. See the comment on top of - * cgroup_rstat_cpu definition for details. + * @css's rstat_cpu on @cpu was updated. Put it on the parent's matching + * rstat_cpu->updated_children list. See the comment on top of + * css_rstat_cpu definition for details. 
*/ -__bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu) +__bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu) { - raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu); unsigned long flags; /* + * Since bpf programs can call this function, prevent access to + * uninitialized rstat pointers. + */ + if (!css_uses_rstat(css)) + return; + + /* * Speculative already-on-list test. This may race leading to * temporary inaccuracies, which is fine. * * Because @parent's updated_children is terminated with @parent - * instead of NULL, we can tell whether @cgrp is on the list by + * instead of NULL, we can tell whether @css is on the list by * testing the next pointer for NULL. */ - if (data_race(cgroup_rstat_cpu(cgrp, cpu)->updated_next)) + if (data_race(css_rstat_cpu(css, cpu)->updated_next)) return; - flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, cgrp, true); + flags = _css_rstat_cpu_lock(css, cpu, true); - /* put @cgrp and all ancestors on the corresponding updated lists */ + /* put @css and all ancestors on the corresponding updated lists */ while (true) { - struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); - struct cgroup *parent = cgroup_parent(cgrp); - struct cgroup_rstat_cpu *prstatc; + struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu); + struct cgroup_subsys_state *parent = css->parent; + struct css_rstat_cpu *prstatc; /* * Both additions and removals are bottom-up. 
If a cgroup @@ -115,53 +161,78 @@ __bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu) /* Root has no parent to link it to, but mark it busy */ if (!parent) { - rstatc->updated_next = cgrp; + rstatc->updated_next = css; break; } - prstatc = cgroup_rstat_cpu(parent, cpu); + prstatc = css_rstat_cpu(parent, cpu); rstatc->updated_next = prstatc->updated_children; - prstatc->updated_children = cgrp; + prstatc->updated_children = css; - cgrp = parent; + css = parent; } - _cgroup_rstat_cpu_unlock(cpu_lock, cpu, cgrp, flags, true); + _css_rstat_cpu_unlock(css, cpu, flags, true); } /** - * cgroup_rstat_push_children - push children cgroups into the given list + * css_rstat_push_children - push children css's into the given list * @head: current head of the list (= subtree root) * @child: first child of the root * @cpu: target cpu - * Return: A new singly linked list of cgroups to be flush + * Return: A new singly linked list of css's to be flushed * - * Iteratively traverse down the cgroup_rstat_cpu updated tree level by + * Iteratively traverse down the css_rstat_cpu updated tree level by * level and push all the parents first before their next level children - * into a singly linked list built from the tail backward like "pushing" - * cgroups into a stack. The root is pushed by the caller. + * into a singly linked list via the rstat_flush_next pointer built from the + * tail backward like "pushing" css's into a stack. The root is pushed by + * the caller. 
*/ -static struct cgroup *cgroup_rstat_push_children(struct cgroup *head, - struct cgroup *child, int cpu) +static struct cgroup_subsys_state *css_rstat_push_children( + struct cgroup_subsys_state *head, + struct cgroup_subsys_state *child, int cpu) { - struct cgroup *chead = child; /* Head of child cgroup level */ - struct cgroup *ghead = NULL; /* Head of grandchild cgroup level */ - struct cgroup *parent, *grandchild; - struct cgroup_rstat_cpu *crstatc; + struct cgroup_subsys_state *cnext = child; /* Next head of child css level */ + struct cgroup_subsys_state *ghead = NULL; /* Head of grandchild css level */ + struct cgroup_subsys_state *parent, *grandchild; + struct css_rstat_cpu *crstatc; child->rstat_flush_next = NULL; + /* + * The subsystem rstat lock must be held for the whole duration from + * here as the rstat_flush_next list is being constructed to when + * it is consumed later in css_rstat_flush(). + */ + lockdep_assert_held(ss_rstat_lock(head->ss)); + + /* + * Notation: -> updated_next pointer + * => rstat_flush_next pointer + * + * Assuming the following sample updated_children lists: + * P: C1 -> C2 -> P + * C1: G11 -> G12 -> C1 + * C2: G21 -> G22 -> C2 + * + * After 1st iteration: + * head => C2 => C1 => NULL + * ghead => G21 => G11 => NULL + * + * After 2nd iteration: + * head => G12 => G11 => G22 => G21 => C2 => C1 => NULL + */ next_level: - while (chead) { - child = chead; - chead = child->rstat_flush_next; - parent = cgroup_parent(child); + while (cnext) { + child = cnext; + cnext = child->rstat_flush_next; + parent = child->parent; - /* updated_next is parent cgroup terminated */ + /* updated_next is parent cgroup terminated if !NULL */ while (child != parent) { child->rstat_flush_next = head; head = child; - crstatc = cgroup_rstat_cpu(child, cpu); + crstatc = css_rstat_cpu(child, cpu); grandchild = crstatc->updated_children; if (grandchild != child) { /* Push the grand child to the next level */ @@ -175,7 +246,7 @@ next_level: } if (ghead) { - 
chead = ghead; + cnext = ghead; ghead = NULL; goto next_level; } @@ -183,31 +254,31 @@ next_level: } /** - * cgroup_rstat_updated_list - return a list of updated cgroups to be flushed - * @root: root of the cgroup subtree to traverse + * css_rstat_updated_list - build a list of updated css's to be flushed + * @root: root of the css subtree to traverse * @cpu: target cpu - * Return: A singly linked list of cgroups to be flushed + * Return: A singly linked list of css's to be flushed * * Walks the updated rstat_cpu tree on @cpu from @root. During traversal, - * each returned cgroup is unlinked from the updated tree. + * each returned css is unlinked from the updated tree. * * The only ordering guarantee is that, for a parent and a child pair * covered by a given traversal, the child is before its parent in * the list. * * Note that updated_children is self terminated and points to a list of - * child cgroups if not empty. Whereas updated_next is like a sibling link - * within the children list and terminated by the parent cgroup. An exception - * here is the cgroup root whose updated_next can be self terminated. + * child css's if not empty. Whereas updated_next is like a sibling link + * within the children list and terminated by the parent css. An exception + * here is the css root whose updated_next can be self terminated. 
*/ -static struct cgroup *cgroup_rstat_updated_list(struct cgroup *root, int cpu) +static struct cgroup_subsys_state *css_rstat_updated_list( + struct cgroup_subsys_state *root, int cpu) { - raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu); - struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(root, cpu); - struct cgroup *head = NULL, *parent, *child; + struct css_rstat_cpu *rstatc = css_rstat_cpu(root, cpu); + struct cgroup_subsys_state *head = NULL, *parent, *child; unsigned long flags; - flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, root, false); + flags = _css_rstat_cpu_lock(root, cpu, false); /* Return NULL if this subtree is not on-list */ if (!rstatc->updated_next) @@ -217,17 +288,17 @@ static struct cgroup *cgroup_rstat_updated_list(struct cgroup *root, int cpu) * Unlink @root from its parent. As the updated_children list is * singly linked, we have to walk it to find the removal point. */ - parent = cgroup_parent(root); + parent = root->parent; if (parent) { - struct cgroup_rstat_cpu *prstatc; - struct cgroup **nextp; + struct css_rstat_cpu *prstatc; + struct cgroup_subsys_state **nextp; - prstatc = cgroup_rstat_cpu(parent, cpu); + prstatc = css_rstat_cpu(parent, cpu); nextp = &prstatc->updated_children; while (*nextp != root) { - struct cgroup_rstat_cpu *nrstatc; + struct css_rstat_cpu *nrstatc; - nrstatc = cgroup_rstat_cpu(*nextp, cpu); + nrstatc = css_rstat_cpu(*nextp, cpu); WARN_ON_ONCE(*nextp == parent); nextp = &nrstatc->updated_next; } @@ -242,16 +313,16 @@ static struct cgroup *cgroup_rstat_updated_list(struct cgroup *root, int cpu) child = rstatc->updated_children; rstatc->updated_children = root; if (child != root) - head = cgroup_rstat_push_children(head, child, cpu); + head = css_rstat_push_children(head, child, cpu); unlock_ret: - _cgroup_rstat_cpu_unlock(cpu_lock, cpu, root, flags, false); + _css_rstat_cpu_unlock(root, cpu, flags, false); return head; } /* * A hook for bpf stat collectors to attach to and flush their stats. 
- * Together with providing bpf kfuncs for cgroup_rstat_updated() and - * cgroup_rstat_flush(), this enables a complete workflow where bpf progs that + * Together with providing bpf kfuncs for css_rstat_updated() and + * css_rstat_flush(), this enables a complete workflow where bpf progs that * collect cgroup stats can integrate with rstat for efficient flushing. * * A static noinline declaration here could cause the compiler to optimize away @@ -271,7 +342,7 @@ __weak noinline void bpf_rstat_flush(struct cgroup *cgrp, __bpf_hook_end(); /* - * Helper functions for locking cgroup_rstat_lock. + * Helper functions for locking. * * This makes it easier to diagnose locking issues and contention in * production environments. The parameter @cpu_in_loop indicate lock @@ -279,114 +350,186 @@ __bpf_hook_end(); * value -1 is used when obtaining the main lock else this is the CPU * number processed last. */ -static inline void __cgroup_rstat_lock(struct cgroup *cgrp, int cpu_in_loop) - __acquires(&cgroup_rstat_lock) +static inline void __css_rstat_lock(struct cgroup_subsys_state *css, + int cpu_in_loop) + __acquires(ss_rstat_lock(css->ss)) { + struct cgroup *cgrp = css->cgroup; + spinlock_t *lock; bool contended; - contended = !spin_trylock_irq(&cgroup_rstat_lock); + lock = ss_rstat_lock(css->ss); + contended = !spin_trylock_irq(lock); if (contended) { trace_cgroup_rstat_lock_contended(cgrp, cpu_in_loop, contended); - spin_lock_irq(&cgroup_rstat_lock); + spin_lock_irq(lock); } trace_cgroup_rstat_locked(cgrp, cpu_in_loop, contended); } -static inline void __cgroup_rstat_unlock(struct cgroup *cgrp, int cpu_in_loop) - __releases(&cgroup_rstat_lock) +static inline void __css_rstat_unlock(struct cgroup_subsys_state *css, + int cpu_in_loop) + __releases(ss_rstat_lock(css->ss)) { + struct cgroup *cgrp = css->cgroup; + spinlock_t *lock; + + lock = ss_rstat_lock(css->ss); trace_cgroup_rstat_unlock(cgrp, cpu_in_loop, false); - spin_unlock_irq(&cgroup_rstat_lock); + 
spin_unlock_irq(lock); } /** - * cgroup_rstat_flush - flush stats in @cgrp's subtree - * @cgrp: target cgroup + * css_rstat_flush - flush stats in @css's rstat subtree + * @css: target cgroup subsystem state * - * Collect all per-cpu stats in @cgrp's subtree into the global counters - * and propagate them upwards. After this function returns, all cgroups in - * the subtree have up-to-date ->stat. + * Collect all per-cpu stats in @css's subtree into the global counters + * and propagate them upwards. After this function returns, all rstat + * nodes in the subtree have up-to-date ->stat. * - * This also gets all cgroups in the subtree including @cgrp off the + * This also gets all rstat nodes in the subtree including @css off the * ->updated_children lists. * * This function may block. */ -__bpf_kfunc void cgroup_rstat_flush(struct cgroup *cgrp) +__bpf_kfunc void css_rstat_flush(struct cgroup_subsys_state *css) { int cpu; + bool is_self = css_is_self(css); + + /* + * Since bpf programs can call this function, prevent access to + * uninitialized rstat pointers. 
+ */ + if (!css_uses_rstat(css)) + return; might_sleep(); for_each_possible_cpu(cpu) { - struct cgroup *pos = cgroup_rstat_updated_list(cgrp, cpu); + struct cgroup_subsys_state *pos; /* Reacquire for each CPU to avoid disabling IRQs too long */ - __cgroup_rstat_lock(cgrp, cpu); + __css_rstat_lock(css, cpu); + pos = css_rstat_updated_list(css, cpu); for (; pos; pos = pos->rstat_flush_next) { - struct cgroup_subsys_state *css; - - cgroup_base_stat_flush(pos, cpu); - bpf_rstat_flush(pos, cgroup_parent(pos), cpu); - - rcu_read_lock(); - list_for_each_entry_rcu(css, &pos->rstat_css_list, - rstat_css_node) - css->ss->css_rstat_flush(css, cpu); - rcu_read_unlock(); + if (is_self) { + cgroup_base_stat_flush(pos->cgroup, cpu); + bpf_rstat_flush(pos->cgroup, + cgroup_parent(pos->cgroup), cpu); + } else + pos->ss->css_rstat_flush(pos, cpu); } - __cgroup_rstat_unlock(cgrp, cpu); + __css_rstat_unlock(css, cpu); if (!cond_resched()) cpu_relax(); } } -int cgroup_rstat_init(struct cgroup *cgrp) +int css_rstat_init(struct cgroup_subsys_state *css) { + struct cgroup *cgrp = css->cgroup; int cpu; + bool is_self = css_is_self(css); + + if (is_self) { + /* the root cgrp has rstat_base_cpu preallocated */ + if (!cgrp->rstat_base_cpu) { + cgrp->rstat_base_cpu = alloc_percpu(struct cgroup_rstat_base_cpu); + if (!cgrp->rstat_base_cpu) + return -ENOMEM; + } + } else if (css->ss->css_rstat_flush == NULL) + return 0; + + /* the root cgrp's self css has rstat_cpu preallocated */ + if (!css->rstat_cpu) { + css->rstat_cpu = alloc_percpu(struct css_rstat_cpu); + if (!css->rstat_cpu) { + if (is_self) + free_percpu(cgrp->rstat_base_cpu); - /* the root cgrp has rstat_cpu preallocated */ - if (!cgrp->rstat_cpu) { - cgrp->rstat_cpu = alloc_percpu(struct cgroup_rstat_cpu); - if (!cgrp->rstat_cpu) return -ENOMEM; + } } /* ->updated_children list is self terminated */ for_each_possible_cpu(cpu) { - struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); + struct css_rstat_cpu *rstatc = 
css_rstat_cpu(css, cpu); + + rstatc->updated_children = css; + + if (is_self) { + struct cgroup_rstat_base_cpu *rstatbc; - rstatc->updated_children = cgrp; - u64_stats_init(&rstatc->bsync); + rstatbc = cgroup_rstat_base_cpu(cgrp, cpu); + u64_stats_init(&rstatbc->bsync); + } } return 0; } -void cgroup_rstat_exit(struct cgroup *cgrp) +void css_rstat_exit(struct cgroup_subsys_state *css) { int cpu; - cgroup_rstat_flush(cgrp); + if (!css_uses_rstat(css)) + return; + + css_rstat_flush(css); /* sanity check */ for_each_possible_cpu(cpu) { - struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); + struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu); - if (WARN_ON_ONCE(rstatc->updated_children != cgrp) || + if (WARN_ON_ONCE(rstatc->updated_children != css) || WARN_ON_ONCE(rstatc->updated_next)) return; } - free_percpu(cgrp->rstat_cpu); - cgrp->rstat_cpu = NULL; + if (css_is_self(css)) { + struct cgroup *cgrp = css->cgroup; + + free_percpu(cgrp->rstat_base_cpu); + cgrp->rstat_base_cpu = NULL; + } + + free_percpu(css->rstat_cpu); + css->rstat_cpu = NULL; } -void __init cgroup_rstat_boot(void) +/** + * ss_rstat_init - subsystem-specific rstat initialization + * @ss: target subsystem + * + * If @ss is NULL, the static locks associated with the base stats + * are initialized. If @ss is non-NULL, the subsystem-specific locks + * are initialized. + */ +int __init ss_rstat_init(struct cgroup_subsys *ss) { int cpu; +#ifdef CONFIG_SMP + /* + * On uniprocessor machines, arch_spinlock_t is defined as an empty + * struct. Avoid allocating a size of zero by having this block + * excluded in this case. It's acceptable to leave the subsystem locks + * unitialized since the associated lock functions are no-ops in the + * non-smp case. 
+ */ + if (ss) { + ss->rstat_ss_cpu_lock = alloc_percpu(raw_spinlock_t); + if (!ss->rstat_ss_cpu_lock) + return -ENOMEM; + } +#endif + + spin_lock_init(ss_rstat_lock(ss)); for_each_possible_cpu(cpu) - raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu)); + raw_spin_lock_init(ss_rstat_cpu_lock(ss, cpu)); + + return 0; } /* @@ -419,9 +562,9 @@ static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat, static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu) { - struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); + struct cgroup_rstat_base_cpu *rstatbc = cgroup_rstat_base_cpu(cgrp, cpu); struct cgroup *parent = cgroup_parent(cgrp); - struct cgroup_rstat_cpu *prstatc; + struct cgroup_rstat_base_cpu *prstatbc; struct cgroup_base_stat delta; unsigned seq; @@ -431,15 +574,15 @@ static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu) /* fetch the current per-cpu values */ do { - seq = __u64_stats_fetch_begin(&rstatc->bsync); - delta = rstatc->bstat; - } while (__u64_stats_fetch_retry(&rstatc->bsync, seq)); + seq = __u64_stats_fetch_begin(&rstatbc->bsync); + delta = rstatbc->bstat; + } while (__u64_stats_fetch_retry(&rstatbc->bsync, seq)); /* propagate per-cpu delta to cgroup and per-cpu global statistics */ - cgroup_base_stat_sub(&delta, &rstatc->last_bstat); + cgroup_base_stat_sub(&delta, &rstatbc->last_bstat); cgroup_base_stat_add(&cgrp->bstat, &delta); - cgroup_base_stat_add(&rstatc->last_bstat, &delta); - cgroup_base_stat_add(&rstatc->subtree_bstat, &delta); + cgroup_base_stat_add(&rstatbc->last_bstat, &delta); + cgroup_base_stat_add(&rstatbc->subtree_bstat, &delta); /* propagate cgroup and per-cpu global delta to parent (unless that's root) */ if (cgroup_parent(parent)) { @@ -448,73 +591,73 @@ static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu) cgroup_base_stat_add(&parent->bstat, &delta); cgroup_base_stat_add(&cgrp->last_bstat, &delta); - delta = rstatc->subtree_bstat; - prstatc = cgroup_rstat_cpu(parent, 
cpu); - cgroup_base_stat_sub(&delta, &rstatc->last_subtree_bstat); - cgroup_base_stat_add(&prstatc->subtree_bstat, &delta); - cgroup_base_stat_add(&rstatc->last_subtree_bstat, &delta); + delta = rstatbc->subtree_bstat; + prstatbc = cgroup_rstat_base_cpu(parent, cpu); + cgroup_base_stat_sub(&delta, &rstatbc->last_subtree_bstat); + cgroup_base_stat_add(&prstatbc->subtree_bstat, &delta); + cgroup_base_stat_add(&rstatbc->last_subtree_bstat, &delta); } } -static struct cgroup_rstat_cpu * +static struct cgroup_rstat_base_cpu * cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp, unsigned long *flags) { - struct cgroup_rstat_cpu *rstatc; + struct cgroup_rstat_base_cpu *rstatbc; - rstatc = get_cpu_ptr(cgrp->rstat_cpu); - *flags = u64_stats_update_begin_irqsave(&rstatc->bsync); - return rstatc; + rstatbc = get_cpu_ptr(cgrp->rstat_base_cpu); + *flags = u64_stats_update_begin_irqsave(&rstatbc->bsync); + return rstatbc; } static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp, - struct cgroup_rstat_cpu *rstatc, + struct cgroup_rstat_base_cpu *rstatbc, unsigned long flags) { - u64_stats_update_end_irqrestore(&rstatc->bsync, flags); - cgroup_rstat_updated(cgrp, smp_processor_id()); - put_cpu_ptr(rstatc); + u64_stats_update_end_irqrestore(&rstatbc->bsync, flags); + css_rstat_updated(&cgrp->self, smp_processor_id()); + put_cpu_ptr(rstatbc); } void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec) { - struct cgroup_rstat_cpu *rstatc; + struct cgroup_rstat_base_cpu *rstatbc; unsigned long flags; - rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags); - rstatc->bstat.cputime.sum_exec_runtime += delta_exec; - cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags); + rstatbc = cgroup_base_stat_cputime_account_begin(cgrp, &flags); + rstatbc->bstat.cputime.sum_exec_runtime += delta_exec; + cgroup_base_stat_cputime_account_end(cgrp, rstatbc, flags); } void __cgroup_account_cputime_field(struct cgroup *cgrp, enum cpu_usage_stat index, u64 
delta_exec) { - struct cgroup_rstat_cpu *rstatc; + struct cgroup_rstat_base_cpu *rstatbc; unsigned long flags; - rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags); + rstatbc = cgroup_base_stat_cputime_account_begin(cgrp, &flags); switch (index) { case CPUTIME_NICE: - rstatc->bstat.ntime += delta_exec; + rstatbc->bstat.ntime += delta_exec; fallthrough; case CPUTIME_USER: - rstatc->bstat.cputime.utime += delta_exec; + rstatbc->bstat.cputime.utime += delta_exec; break; case CPUTIME_SYSTEM: case CPUTIME_IRQ: case CPUTIME_SOFTIRQ: - rstatc->bstat.cputime.stime += delta_exec; + rstatbc->bstat.cputime.stime += delta_exec; break; #ifdef CONFIG_SCHED_CORE case CPUTIME_FORCEIDLE: - rstatc->bstat.forceidle_sum += delta_exec; + rstatbc->bstat.forceidle_sum += delta_exec; break; #endif default: break; } - cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags); + cgroup_base_stat_cputime_account_end(cgrp, rstatbc, flags); } /* @@ -573,12 +716,12 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq) struct cgroup_base_stat bstat; if (cgroup_parent(cgrp)) { - cgroup_rstat_flush(cgrp); - __cgroup_rstat_lock(cgrp, -1); + css_rstat_flush(&cgrp->self); + __css_rstat_lock(&cgrp->self, -1); bstat = cgrp->bstat; cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime, &bstat.cputime.utime, &bstat.cputime.stime); - __cgroup_rstat_unlock(cgrp, -1); + __css_rstat_unlock(&cgrp->self, -1); } else { root_cgroup_cputime(&bstat); } @@ -600,10 +743,10 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq) cgroup_force_idle_show(seq, &bstat); } -/* Add bpf kfuncs for cgroup_rstat_updated() and cgroup_rstat_flush() */ +/* Add bpf kfuncs for css_rstat_updated() and css_rstat_flush() */ BTF_KFUNCS_START(bpf_rstat_kfunc_ids) -BTF_ID_FLAGS(func, cgroup_rstat_updated) -BTF_ID_FLAGS(func, cgroup_rstat_flush, KF_SLEEPABLE) +BTF_ID_FLAGS(func, css_rstat_updated) +BTF_ID_FLAGS(func, css_rstat_flush, KF_SLEEPABLE) BTF_KFUNCS_END(bpf_rstat_kfunc_ids) static const struct 
btf_kfunc_id_set bpf_rstat_kfunc_set = { diff --git a/kernel/configs/debug.config b/kernel/configs/debug.config index 20552f163930..e81327d2cd63 100644 --- a/kernel/configs/debug.config +++ b/kernel/configs/debug.config @@ -73,7 +73,6 @@ CONFIG_DEBUG_VM=y CONFIG_DEBUG_VM_PGFLAGS=y CONFIG_DEBUG_VM_RB=y CONFIG_DEBUG_VM_VMACACHE=y -CONFIG_GENERIC_PTDUMP=y CONFIG_KASAN=y CONFIG_KASAN_GENERIC=y CONFIG_KASAN_INLINE=y @@ -113,3 +112,8 @@ CONFIG_BRANCH_PROFILE_NONE=y CONFIG_DYNAMIC_FTRACE=y CONFIG_FTRACE=y CONFIG_FUNCTION_TRACER=y +# +# Preemption +# +CONFIG_DEBUG_PREEMPT=y +CONFIG_PREEMPT=y diff --git a/kernel/configs/xen.config b/kernel/configs/xen.config index 6878b9a49be8..1875a0a5047a 100644 --- a/kernel/configs/xen.config +++ b/kernel/configs/xen.config @@ -13,6 +13,8 @@ CONFIG_SCSI=y CONFIG_FB=y CONFIG_INPUT_MISC=y CONFIG_MEMORY_HOTPLUG=y +CONFIG_MEMORY_HOTREMOVE=y +CONFIG_ZONE_DEVICE=y CONFIG_TTY=y # Technically not required but otherwise produces # pretty useless systems starting from allnoconfig @@ -47,3 +49,4 @@ CONFIG_XEN_GNTDEV=m CONFIG_XEN_GRANT_DEV_ALLOC=m CONFIG_SWIOTLB_XEN=y CONFIG_XEN_PRIVCMD=m +CONFIG_XEN_UNPOPULATED_ALLOC=y diff --git a/kernel/cpu.c b/kernel/cpu.c index ad755db29efd..a59e009e0be4 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -526,6 +526,7 @@ void lockdep_assert_cpus_held(void) percpu_rwsem_assert_held(&cpu_hotplug_lock); } +EXPORT_SYMBOL_GPL(lockdep_assert_cpus_held); #ifdef CONFIG_LOCKDEP int lockdep_is_cpus_held(void) @@ -2068,11 +2069,6 @@ static struct cpuhp_step cpuhp_hp_states[] = { .teardown.single = NULL, .cant_stop = true, }, - [CPUHP_PERF_PREPARE] = { - .name = "perf:prepare", - .startup.single = perf_event_init_cpu, - .teardown.single = perf_event_exit_cpu, - }, [CPUHP_RANDOM_PREPARE] = { .name = "random:prepare", .startup.single = random_prepare_cpu, diff --git a/kernel/crash_reserve.c b/kernel/crash_reserve.c index a620fb4b2116..aff7c0fdbefa 100644 --- a/kernel/crash_reserve.c +++ b/kernel/crash_reserve.c @@ -375,11 
+375,10 @@ static int __init reserve_crashkernel_low(unsigned long long low_size) return 0; } -void __init reserve_crashkernel_generic(char *cmdline, - unsigned long long crash_size, - unsigned long long crash_base, - unsigned long long crash_low_size, - bool high) +void __init reserve_crashkernel_generic(unsigned long long crash_size, + unsigned long long crash_base, + unsigned long long crash_low_size, + bool high) { unsigned long long search_end = CRASH_ADDR_LOW_MAX, search_base = 0; bool fixed_base = false; diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index ce1bb2301c06..0b9495187fba 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -837,10 +837,6 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs) { struct kgdb_state kgdb_var; struct kgdb_state *ks = &kgdb_var; - int ret = 0; - - if (arch_kgdb_ops.enable_nmi) - arch_kgdb_ops.enable_nmi(0); /* * Avoid entering the debugger if we were triggered due to an oops * but panic_timeout indicates the system should automatically @@ -858,15 +854,11 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs) ks->linux_regs = regs; if (kgdb_reenter_check(ks)) - goto out; /* Ouch, double exception ! */ + return 0; /* Ouch, double exception ! */ if (kgdb_info[ks->cpu].enter_kgdb != 0) - goto out; + return 0; - ret = kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER); -out: - if (arch_kgdb_ops.enable_nmi) - arch_kgdb_ops.enable_nmi(1); - return ret; + return kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER); } NOKPROBE_SYMBOL(kgdb_handle_exception); diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 6a77f1c779c4..9b11b10b120c 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -334,7 +334,7 @@ poll_again: *cp = '\0'; p_tmp = strrchr(buffer, ' '); p_tmp = (p_tmp ? 
p_tmp + 1 : buffer); - strscpy(tmpbuffer, p_tmp, sizeof(tmpbuffer)); + strscpy(tmpbuffer, p_tmp); *cp = tmp; len = strlen(tmpbuffer); @@ -452,7 +452,7 @@ poll_again: char *kdb_getstr(char *buffer, size_t bufsize, const char *prompt) { if (prompt && kdb_prompt_str != prompt) - strscpy(kdb_prompt_str, prompt, CMD_BUFLEN); + strscpy(kdb_prompt_str, prompt); kdb_printf("%s", kdb_prompt_str); kdb_nextline = 1; /* Prompt and input resets line number */ return kdb_read(buffer, bufsize); diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 5f4be507d79f..7a4d2d4689a5 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -25,7 +25,6 @@ #include <linux/smp.h> #include <linux/utsname.h> #include <linux/vmalloc.h> -#include <linux/atomic.h> #include <linux/moduleparam.h> #include <linux/mm.h> #include <linux/init.h> @@ -105,7 +104,7 @@ static kdbmsg_t kdbmsgs[] = { KDBMSG(NOENVVALUE, "Environment variable should have value"), KDBMSG(NOTIMP, "Command not implemented"), KDBMSG(ENVFULL, "Environment full"), - KDBMSG(ENVBUFFULL, "Environment buffer full"), + KDBMSG(KMALLOCFAILED, "Failed to allocate memory"), KDBMSG(TOOMANYBPT, "Too many breakpoints defined"), #ifdef CONFIG_CPU_XSCALE KDBMSG(TOOMANYDBREGS, "More breakpoints than ibcr registers defined"), @@ -130,13 +129,9 @@ static const int __nkdb_err = ARRAY_SIZE(kdbmsgs); /* - * Initial environment. This is all kept static and local to - * this file. We don't want to rely on the memory allocation - * mechanisms in the kernel, so we use a very limited allocate-only - * heap for new and altered environment variables. The entire - * environment is limited to a fixed number of entries (add more - * to __env[] if required) and a fixed amount of heap (add more to - * KDB_ENVBUFSIZE if required). + * Initial environment. This is all kept static and local to this file. 
+ * The entire environment is limited to a fixed number of entries + * (add more to __env[] if required) */ static char *__env[31] = { @@ -259,35 +254,6 @@ char *kdbgetenv(const char *match) } /* - * kdballocenv - This function is used to allocate bytes for - * environment entries. - * Parameters: - * bytes The number of bytes to allocate in the static buffer. - * Returns: - * A pointer to the allocated space in the buffer on success. - * NULL if bytes > size available in the envbuffer. - * Remarks: - * We use a static environment buffer (envbuffer) to hold the values - * of dynamically generated environment variables (see kdb_set). Buffer - * space once allocated is never free'd, so over time, the amount of space - * (currently 512 bytes) will be exhausted if env variables are changed - * frequently. - */ -static char *kdballocenv(size_t bytes) -{ -#define KDB_ENVBUFSIZE 512 - static char envbuffer[KDB_ENVBUFSIZE]; - static int envbufsize; - char *ep = NULL; - - if ((KDB_ENVBUFSIZE - envbufsize) >= bytes) { - ep = &envbuffer[envbufsize]; - envbufsize += bytes; - } - return ep; -} - -/* * kdbgetulenv - This function will return the value of an unsigned * long-valued environment variable. * Parameters: @@ -348,9 +314,9 @@ static int kdb_setenv(const char *var, const char *val) varlen = strlen(var); vallen = strlen(val); - ep = kdballocenv(varlen + vallen + 2); - if (ep == (char *)0) - return KDB_ENVBUFFULL; + ep = kmalloc(varlen + vallen + 2, GFP_KDB); + if (!ep) + return KDB_KMALLOCFAILED; sprintf(ep, "%s=%s", var, val); @@ -359,6 +325,7 @@ static int kdb_setenv(const char *var, const char *val) && ((strncmp(__env[i], var, varlen) == 0) && ((__env[i][varlen] == '\0') || (__env[i][varlen] == '=')))) { + kfree_const(__env[i]); __env[i] = ep; return 0; } @@ -2119,32 +2086,6 @@ static int kdb_dmesg(int argc, const char **argv) return 0; } #endif /* CONFIG_PRINTK */ - -/* Make sure we balance enable/disable calls, must disable first. 
*/ -static atomic_t kdb_nmi_disabled; - -static int kdb_disable_nmi(int argc, const char *argv[]) -{ - if (atomic_read(&kdb_nmi_disabled)) - return 0; - atomic_set(&kdb_nmi_disabled, 1); - arch_kgdb_ops.enable_nmi(0); - return 0; -} - -static int kdb_param_enable_nmi(const char *val, const struct kernel_param *kp) -{ - if (!atomic_add_unless(&kdb_nmi_disabled, -1, 0)) - return -EINVAL; - arch_kgdb_ops.enable_nmi(1); - return 0; -} - -static const struct kernel_param_ops kdb_param_ops_enable_nmi = { - .set = kdb_param_enable_nmi, -}; -module_param_cb(enable_nmi, &kdb_param_ops_enable_nmi, NULL, 0600); - /* * kdb_cpu - This function implements the 'cpu' command. * cpu [<cpunum>] @@ -2836,20 +2777,10 @@ static kdbtab_t maintab[] = { }, }; -static kdbtab_t nmicmd = { - .name = "disable_nmi", - .func = kdb_disable_nmi, - .usage = "", - .help = "Disable NMI entry to KDB", - .flags = KDB_ENABLE_ALWAYS_SAFE, -}; - /* Initialize the kdb command table. */ static void __init kdb_inittab(void) { kdb_register_table(maintab, ARRAY_SIZE(maintab)); - if (arch_kgdb_ops.enable_nmi) - kdb_register_table(&nmicmd, 1); } /* Execute any commands defined in kdb_cmds. 
*/ diff --git a/kernel/dma/coherent.c b/kernel/dma/coherent.c index 3b2bdca9f1d4..77c8d9487a9a 100644 --- a/kernel/dma/coherent.c +++ b/kernel/dma/coherent.c @@ -336,16 +336,22 @@ static phys_addr_t dma_reserved_default_memory_size __initdata; static int rmem_dma_device_init(struct reserved_mem *rmem, struct device *dev) { - if (!rmem->priv) { - struct dma_coherent_mem *mem; + struct dma_coherent_mem *mem = rmem->priv; + if (!mem) { mem = dma_init_coherent_memory(rmem->base, rmem->base, rmem->size, true); if (IS_ERR(mem)) return PTR_ERR(mem); rmem->priv = mem; } - dma_assign_coherent_memory(dev, rmem->priv); + + /* Warn if the device potentially can't use the reserved memory */ + if (mem->device_base + rmem->size - 1 > + min_not_zero(dev->coherent_dma_mask, dev->bus_dma_limit)) + dev_warn(dev, "reserved memory is beyond device's set DMA address range\n"); + + dma_assign_coherent_memory(dev, mem); return 0; } diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index 055da410ac71..8df0dfaaca18 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -64,8 +64,7 @@ struct cma *dma_contiguous_default_area; * Users, who want to set the size of global CMA area for their system * should use cma= kernel parameter. 
*/ -static const phys_addr_t size_bytes __initconst = - (phys_addr_t)CMA_SIZE_MBYTES * SZ_1M; +#define size_bytes ((phys_addr_t)CMA_SIZE_MBYTES * SZ_1M) static phys_addr_t size_cmdline __initdata = -1; static phys_addr_t base_cmdline __initdata; static phys_addr_t limit_cmdline __initdata; diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index b8fe0b3d0ffb..24c359d9c879 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -13,6 +13,7 @@ #include <linux/vmalloc.h> #include <linux/set_memory.h> #include <linux/slab.h> +#include <linux/pci-p2pdma.h> #include "direct.h" /* @@ -462,34 +463,33 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir, unsigned long attrs) { struct pci_p2pdma_map_state p2pdma_state = {}; - enum pci_p2pdma_map_type map; struct scatterlist *sg; int i, ret; for_each_sg(sgl, sg, nents, i) { - if (is_pci_p2pdma_page(sg_page(sg))) { - map = pci_p2pdma_map_segment(&p2pdma_state, dev, sg); - switch (map) { - case PCI_P2PDMA_MAP_BUS_ADDR: - continue; - case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: - /* - * Any P2P mapping that traverses the PCI - * host bridge must be mapped with CPU physical - * address and not PCI bus addresses. This is - * done with dma_direct_map_page() below. - */ - break; - default: - ret = -EREMOTEIO; + switch (pci_p2pdma_state(&p2pdma_state, dev, sg_page(sg))) { + case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: + /* + * Any P2P mapping that traverses the PCI host bridge + * must be mapped with CPU physical address and not PCI + * bus addresses. 
+ */ + break; + case PCI_P2PDMA_MAP_NONE: + sg->dma_address = dma_direct_map_page(dev, sg_page(sg), + sg->offset, sg->length, dir, attrs); + if (sg->dma_address == DMA_MAPPING_ERROR) { + ret = -EIO; goto out_unmap; } - } - - sg->dma_address = dma_direct_map_page(dev, sg_page(sg), - sg->offset, sg->length, dir, attrs); - if (sg->dma_address == DMA_MAPPING_ERROR) { - ret = -EIO; + break; + case PCI_P2PDMA_MAP_BUS_ADDR: + sg->dma_address = pci_p2pdma_bus_addr_map(&p2pdma_state, + sg_phys(sg)); + sg_dma_mark_bus_address(sg); + continue; + default: + ret = -EREMOTEIO; goto out_unmap; } sg_dma_len(sg) = sg->length; diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index cda127027e48..107e4a4d251d 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -443,6 +443,24 @@ bool __dma_need_sync(struct device *dev, dma_addr_t dma_addr) } EXPORT_SYMBOL_GPL(__dma_need_sync); +/** + * dma_need_unmap - does this device need dma_unmap_* operations + * @dev: device to check + * + * If this function returns %false, drivers can skip calling dma_unmap_* after + * finishing an I/O. This function must be called after all mappings that might + * need to be unmapped have been performed. 
+ */ +bool dma_need_unmap(struct device *dev) +{ + if (!dma_map_direct(dev, get_dma_ops(dev))) + return true; + if (!dev->dma_skip_sync) + return true; + return IS_ENABLED(CONFIG_DMA_API_DEBUG); +} +EXPORT_SYMBOL_GPL(dma_need_unmap); + static void dma_setup_need_sync(struct device *dev) { const struct dma_map_ops *ops = get_dma_ops(dev); @@ -910,6 +928,19 @@ int dma_set_coherent_mask(struct device *dev, u64 mask) } EXPORT_SYMBOL(dma_set_coherent_mask); +static bool __dma_addressing_limited(struct device *dev) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + if (min_not_zero(dma_get_mask(dev), dev->bus_dma_limit) < + dma_get_required_mask(dev)) + return true; + + if (unlikely(ops) || use_dma_iommu(dev)) + return false; + return !dma_direct_all_ram_mapped(dev); +} + /** * dma_addressing_limited - return if the device is addressing limited * @dev: device to check @@ -920,15 +951,11 @@ EXPORT_SYMBOL(dma_set_coherent_mask); */ bool dma_addressing_limited(struct device *dev) { - const struct dma_map_ops *ops = get_dma_ops(dev); - - if (min_not_zero(dma_get_mask(dev), dev->bus_dma_limit) < - dma_get_required_mask(dev)) - return true; - - if (unlikely(ops) || use_dma_iommu(dev)) + if (!__dma_addressing_limited(dev)) return false; - return !dma_direct_all_ram_mapped(dev); + + dev_dbg(dev, "device is DMA addressing limited\n"); + return true; } EXPORT_SYMBOL_GPL(dma_addressing_limited); diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 20154572ede9..a8dd1f27417c 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -146,7 +146,7 @@ static inline bool report_single_step(unsigned long work) return work & SYSCALL_WORK_SYSCALL_EXIT_TRAP; } -static void syscall_exit_work(struct pt_regs *regs, unsigned long work) +void syscall_exit_work(struct pt_regs *regs, unsigned long work) { bool step; @@ -173,53 +173,6 @@ static void syscall_exit_work(struct pt_regs *regs, unsigned long work) ptrace_report_syscall_exit(regs, step); } -/* - * Syscall 
specific exit to user mode preparation. Runs with interrupts - * enabled. - */ -static void syscall_exit_to_user_mode_prepare(struct pt_regs *regs) -{ - unsigned long work = READ_ONCE(current_thread_info()->syscall_work); - unsigned long nr = syscall_get_nr(current, regs); - - CT_WARN_ON(ct_state() != CT_STATE_KERNEL); - - if (IS_ENABLED(CONFIG_PROVE_LOCKING)) { - if (WARN(irqs_disabled(), "syscall %lu left IRQs disabled", nr)) - local_irq_enable(); - } - - rseq_syscall(regs); - - /* - * Do one-time syscall specific work. If these work items are - * enabled, we want to run them exactly once per syscall exit with - * interrupts enabled. - */ - if (unlikely(work & SYSCALL_WORK_EXIT)) - syscall_exit_work(regs, work); -} - -static __always_inline void __syscall_exit_to_user_mode_work(struct pt_regs *regs) -{ - syscall_exit_to_user_mode_prepare(regs); - local_irq_disable_exit_to_user(); - exit_to_user_mode_prepare(regs); -} - -void syscall_exit_to_user_mode_work(struct pt_regs *regs) -{ - __syscall_exit_to_user_mode_work(regs); -} - -__visible noinstr void syscall_exit_to_user_mode(struct pt_regs *regs) -{ - instrumentation_begin(); - __syscall_exit_to_user_mode_work(regs); - instrumentation_end(); - exit_to_user_mode(); -} - noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs) { enter_from_user_mode(regs); diff --git a/kernel/events/core.c b/kernel/events/core.c index 4ce9795e5519..f34c99f8ce8f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1193,8 +1193,8 @@ static void __perf_mux_hrtimer_init(struct perf_cpu_pmu_context *cpc, int cpu) cpc->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval); raw_spin_lock_init(&cpc->hrtimer_lock); - hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD); - timer->function = perf_mux_hrtimer_handler; + hrtimer_setup(timer, perf_mux_hrtimer_handler, CLOCK_MONOTONIC, + HRTIMER_MODE_ABS_PINNED_HARD); } static int perf_mux_hrtimer_restart(struct perf_cpu_pmu_context *cpc) @@ -1270,6 +1270,10 
@@ static void put_ctx(struct perf_event_context *ctx) if (ctx->task && ctx->task != TASK_TOMBSTONE) put_task_struct(ctx->task); call_rcu(&ctx->rcu_head, free_ctx); + } else { + smp_mb__after_atomic(); /* pairs with wait_var_event() */ + if (ctx->task == TASK_TOMBSTONE) + wake_up_var(&ctx->refcount); } } @@ -2167,7 +2171,7 @@ static void perf_put_aux_event(struct perf_event *event) * If the event is an aux_event, tear down all links to * it from other events. */ - for_each_sibling_event(iter, event->group_leader) { + for_each_sibling_event(iter, event) { if (iter->aux_event != event) continue; @@ -2325,7 +2329,11 @@ static void perf_child_detach(struct perf_event *event) if (WARN_ON_ONCE(!parent_event)) return; + /* + * Can't check this from an IPI, the holder is likely another CPU. + * lockdep_assert_held(&parent_event->child_mutex); + */ sync_child_event(event); list_del_init(&event->child_list); @@ -2343,6 +2351,11 @@ event_filter_match(struct perf_event *event) perf_cgroup_match(event); } +static inline bool is_event_in_freq_mode(struct perf_event *event) +{ + return event->attr.freq && event->attr.sample_freq; +} + static void event_sched_out(struct perf_event *event, struct perf_event_context *ctx) { @@ -2380,7 +2393,7 @@ event_sched_out(struct perf_event *event, struct perf_event_context *ctx) if (!is_software_event(event)) cpc->active_oncpu--; - if (event->attr.freq && event->attr.sample_freq) { + if (is_event_in_freq_mode(event)) { ctx->nr_freq--; epc->nr_freq--; } @@ -2450,7 +2463,9 @@ ctx_time_update_event(struct perf_event_context *ctx, struct perf_event *event) #define DETACH_GROUP 0x01UL #define DETACH_CHILD 0x02UL -#define DETACH_DEAD 0x04UL +#define DETACH_EXIT 0x04UL +#define DETACH_REVOKE 0x08UL +#define DETACH_DEAD 0x10UL /* * Cross CPU call to remove a performance event @@ -2465,6 +2480,7 @@ __perf_remove_from_context(struct perf_event *event, void *info) { struct perf_event_pmu_context *pmu_ctx = event->pmu_ctx; + enum perf_event_state state = 
PERF_EVENT_STATE_OFF; unsigned long flags = (unsigned long)info; ctx_time_update(cpuctx, ctx); @@ -2473,16 +2489,22 @@ __perf_remove_from_context(struct perf_event *event, * Ensure event_sched_out() switches to OFF, at the very least * this avoids raising perf_pending_task() at this time. */ - if (flags & DETACH_DEAD) + if (flags & DETACH_EXIT) + state = PERF_EVENT_STATE_EXIT; + if (flags & DETACH_REVOKE) + state = PERF_EVENT_STATE_REVOKED; + if (flags & DETACH_DEAD) { event->pending_disable = 1; + state = PERF_EVENT_STATE_DEAD; + } event_sched_out(event, ctx); + perf_event_set_state(event, min(event->state, state)); + if (flags & DETACH_GROUP) perf_group_detach(event); if (flags & DETACH_CHILD) perf_child_detach(event); list_del_event(event, ctx); - if (flags & DETACH_DEAD) - event->state = PERF_EVENT_STATE_DEAD; if (!pmu_ctx->nr_events) { pmu_ctx->rotate_necessary = 0; @@ -2623,6 +2645,41 @@ void perf_event_disable_inatomic(struct perf_event *event) static void perf_log_throttle(struct perf_event *event, int enable); static void perf_log_itrace_start(struct perf_event *event); +static void perf_event_unthrottle(struct perf_event *event, bool start) +{ + event->hw.interrupts = 0; + if (start) + event->pmu->start(event, 0); + if (event == event->group_leader) + perf_log_throttle(event, 1); +} + +static void perf_event_throttle(struct perf_event *event) +{ + event->pmu->stop(event, 0); + event->hw.interrupts = MAX_INTERRUPTS; + if (event == event->group_leader) + perf_log_throttle(event, 0); +} + +static void perf_event_unthrottle_group(struct perf_event *event, bool skip_start_event) +{ + struct perf_event *sibling, *leader = event->group_leader; + + perf_event_unthrottle(leader, skip_start_event ? leader != event : true); + for_each_sibling_event(sibling, leader) + perf_event_unthrottle(sibling, skip_start_event ? 
sibling != event : true); +} + +static void perf_event_throttle_group(struct perf_event *event) +{ + struct perf_event *sibling, *leader = event->group_leader; + + perf_event_throttle(leader); + for_each_sibling_event(sibling, leader) + perf_event_throttle(sibling); +} + static int event_sched_in(struct perf_event *event, struct perf_event_context *ctx) { @@ -2651,10 +2708,8 @@ event_sched_in(struct perf_event *event, struct perf_event_context *ctx) * ticks already, also for a heavily scheduling task there is little * guarantee it'll get a tick in a timely manner. */ - if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) { - perf_log_throttle(event, 1); - event->hw.interrupts = 0; - } + if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) + perf_event_unthrottle(event, false); perf_pmu_disable(event->pmu); @@ -2669,7 +2724,7 @@ event_sched_in(struct perf_event *event, struct perf_event_context *ctx) if (!is_software_event(event)) cpc->active_oncpu++; - if (event->attr.freq && event->attr.sample_freq) { + if (is_event_in_freq_mode(event)) { ctx->nr_freq++; epc->nr_freq++; } @@ -3938,7 +3993,7 @@ static int merge_sched_in(struct perf_event *event, void *data) perf_event_set_state(event, PERF_EVENT_STATE_ERROR); if (*perf_event_fasync(event)) - event->pending_kill = POLL_HUP; + event->pending_kill = POLL_ERR; perf_event_wakeup(event); } else { @@ -4232,14 +4287,10 @@ static void perf_adjust_freq_unthr_events(struct list_head *event_list) hwc = &event->hw; - if (hwc->interrupts == MAX_INTERRUPTS) { - hwc->interrupts = 0; - perf_log_throttle(event, 1); - if (!event->attr.freq || !event->attr.sample_freq) - event->pmu->start(event, 0); - } + if (hwc->interrupts == MAX_INTERRUPTS) + perf_event_unthrottle_group(event, is_event_in_freq_mode(event)); - if (!event->attr.freq || !event->attr.sample_freq) + if (!is_event_in_freq_mode(event)) continue; /* @@ -4511,7 +4562,8 @@ out: static void perf_remove_from_owner(struct perf_event *event); static void 
perf_event_exit_event(struct perf_event *event, - struct perf_event_context *ctx); + struct perf_event_context *ctx, + bool revoke); /* * Removes all events from the current task that have been marked @@ -4538,7 +4590,7 @@ static void perf_event_remove_on_exec(struct perf_event_context *ctx) modified = true; - perf_event_exit_event(event, ctx); + perf_event_exit_event(event, ctx, false); } raw_spin_lock_irqsave(&ctx->lock, flags); @@ -4886,7 +4938,7 @@ find_get_context(struct task_struct *task, struct perf_event *event) if (!task) { /* Must be root to operate on a CPU event: */ - err = perf_allow_cpu(&event->attr); + err = perf_allow_cpu(); if (err) return ERR_PTR(err); @@ -5120,6 +5172,7 @@ static bool is_sb_event(struct perf_event *event) attr->context_switch || attr->text_poke || attr->bpf_event) return true; + return false; } @@ -5513,33 +5566,11 @@ static bool exclusive_event_installable(struct perf_event *event, static void perf_free_addr_filters(struct perf_event *event); -static void perf_pending_task_sync(struct perf_event *event) -{ - struct callback_head *head = &event->pending_task; - - if (!event->pending_work) - return; - /* - * If the task is queued to the current task's queue, we - * obviously can't wait for it to complete. Simply cancel it. - */ - if (task_work_cancel(current, head)) { - event->pending_work = 0; - local_dec(&event->ctx->nr_no_switch_fast); - return; - } - - /* - * All accesses related to the event are within the same RCU section in - * perf_pending_task(). The RCU grace period before the event is freed - * will make sure all those accesses are complete by then. 
- */ - rcuwait_wait_event(&event->pending_work_wait, !event->pending_work, TASK_UNINTERRUPTIBLE); -} - /* vs perf_event_alloc() error */ static void __free_event(struct perf_event *event) { + struct pmu *pmu = event->pmu; + if (event->attach_state & PERF_ATTACH_CALLCHAIN) put_callchain_buffers(); @@ -5569,6 +5600,7 @@ static void __free_event(struct perf_event *event) * put_pmu_ctx() needs an event->ctx reference, because of * epc->ctx. */ + WARN_ON_ONCE(!pmu); WARN_ON_ONCE(!event->ctx); WARN_ON_ONCE(event->pmu_ctx->ctx != event->ctx); put_pmu_ctx(event->pmu_ctx); @@ -5581,8 +5613,13 @@ static void __free_event(struct perf_event *event) if (event->ctx) put_ctx(event->ctx); - if (event->pmu) - module_put(event->pmu->module); + if (pmu) { + module_put(pmu->module); + scoped_guard (spinlock, &pmu->events_lock) { + list_del(&event->pmu_list); + wake_up_var(pmu); + } + } call_rcu(&event->rcu_head, free_event_rcu); } @@ -5594,7 +5631,6 @@ static void _free_event(struct perf_event *event) { irq_work_sync(&event->pending_irq); irq_work_sync(&event->pending_disable_irq); - perf_pending_task_sync(event); unaccount_event(event); @@ -5620,13 +5656,13 @@ static void _free_event(struct perf_event *event) /* * Used to free events which have a known refcount of 1, such as in error paths - * where the event isn't exposed yet and inherited events. + * of inherited events. 
*/ static void free_event(struct perf_event *event) { if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1, - "unexpected event refcount: %ld; ptr=%p\n", - atomic_long_read(&event->refcount), event)) { + "unexpected event refcount: %ld; ptr=%p\n", + atomic_long_read(&event->refcount), event)) { /* leak to avoid use-after-free */ return; } @@ -5687,10 +5723,17 @@ static void perf_remove_from_owner(struct perf_event *event) static void put_event(struct perf_event *event) { + struct perf_event *parent; + if (!atomic_long_dec_and_test(&event->refcount)) return; + parent = event->parent; _free_event(event); + + /* Matches the refcount bump in inherit_event() */ + if (parent) + put_event(parent); } /* @@ -5702,7 +5745,6 @@ int perf_event_release_kernel(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; struct perf_event *child, *tmp; - LIST_HEAD(free_list); /* * If we got here through err_alloc: free_event(event); we will not @@ -5731,15 +5773,17 @@ int perf_event_release_kernel(struct perf_event *event) * Thus this guarantees that we will in fact observe and kill _ALL_ * child events. */ - perf_remove_from_context(event, DETACH_GROUP|DETACH_DEAD); + if (event->state > PERF_EVENT_STATE_REVOKED) { + perf_remove_from_context(event, DETACH_GROUP|DETACH_DEAD); + } else { + event->state = PERF_EVENT_STATE_DEAD; + } perf_event_ctx_unlock(event, ctx); again: mutex_lock(&event->child_mutex); list_for_each_entry(child, &event->child_list, child_list) { - void *var = NULL; - /* * Cannot change, child events are not migrated, see the * comment with perf_event_ctx_lock_nested(). @@ -5772,50 +5816,30 @@ again: tmp = list_first_entry_or_null(&event->child_list, struct perf_event, child_list); if (tmp == child) { - perf_remove_from_context(child, DETACH_GROUP); - list_move(&child->child_list, &free_list); - /* - * This matches the refcount bump in inherit_event(); - * this can't be the last reference. 
- */ - put_event(event); + perf_remove_from_context(child, DETACH_GROUP | DETACH_CHILD); } else { - var = &ctx->refcount; + child = NULL; } mutex_unlock(&event->child_mutex); mutex_unlock(&ctx->mutex); - put_ctx(ctx); - if (var) { - /* - * If perf_event_free_task() has deleted all events from the - * ctx while the child_mutex got released above, make sure to - * notify about the preceding put_ctx(). - */ - smp_mb(); /* pairs with wait_var_event() */ - wake_up_var(var); + if (child) { + /* Last reference unless ->pending_task work is pending */ + put_event(child); } + put_ctx(ctx); + goto again; } mutex_unlock(&event->child_mutex); - list_for_each_entry_safe(child, tmp, &free_list, child_list) { - void *var = &child->ctx->refcount; - - list_del(&child->child_list); - free_event(child); - - /* - * Wake any perf_event_free_task() waiting for this event to be - * freed. - */ - smp_mb(); /* pairs with wait_var_event() */ - wake_up_var(var); - } - no_ctx: - put_event(event); /* Must be the 'last' reference */ + /* + * Last reference unless ->pending_task work is pending on this event + * or any of its children. 
+ */ + put_event(event); return 0; } EXPORT_SYMBOL_GPL(perf_event_release_kernel); @@ -6081,14 +6105,20 @@ static __poll_t perf_poll(struct file *file, poll_table *wait) struct perf_buffer *rb; __poll_t events = EPOLLHUP; + if (event->state <= PERF_EVENT_STATE_REVOKED) + return EPOLLERR; + poll_wait(file, &event->waitq, wait); + if (event->state <= PERF_EVENT_STATE_REVOKED) + return EPOLLERR; + if (is_event_hup(event)) return events; if (unlikely(READ_ONCE(event->state) == PERF_EVENT_STATE_ERROR && event->attr.pinned)) - return events; + return EPOLLERR; /* * Pin the event->rb by taking event->mmap_mutex; otherwise @@ -6180,14 +6210,6 @@ static void __perf_event_period(struct perf_event *event, active = (event->state == PERF_EVENT_STATE_ACTIVE); if (active) { perf_pmu_disable(event->pmu); - /* - * We could be throttled; unthrottle now to avoid the tick - * trying to unthrottle while we already re-started the event. - */ - if (event->hw.interrupts == MAX_INTERRUPTS) { - event->hw.interrupts = 0; - perf_log_throttle(event, 1); - } event->pmu->stop(event, PERF_EF_UPDATE); } @@ -6195,6 +6217,14 @@ static void __perf_event_period(struct perf_event *event, if (active) { event->pmu->start(event, PERF_EF_RELOAD); + /* + * Once the period is force-reset, the event starts immediately. + * But the event/group could be throttled. Unthrottle the + * event/group now to avoid the next tick trying to unthrottle + * while we already re-started the event/group. 
+ */ + if (event->hw.interrupts == MAX_INTERRUPTS) + perf_event_unthrottle_group(event, true); perf_pmu_enable(event->pmu); } } @@ -6252,12 +6282,18 @@ static int perf_event_set_output(struct perf_event *event, static int perf_event_set_filter(struct perf_event *event, void __user *arg); static int perf_copy_attr(struct perf_event_attr __user *uattr, struct perf_event_attr *attr); +static int __perf_event_set_bpf_prog(struct perf_event *event, + struct bpf_prog *prog, + u64 bpf_cookie); static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg) { void (*func)(struct perf_event *); u32 flags = arg; + if (event->state <= PERF_EVENT_STATE_REVOKED) + return -ENODEV; + switch (cmd) { case PERF_EVENT_IOC_ENABLE: func = _perf_event_enable; @@ -6314,7 +6350,7 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon if (IS_ERR(prog)) return PTR_ERR(prog); - err = perf_event_set_bpf_prog(event, prog, 0); + err = __perf_event_set_bpf_prog(event, prog, 0); if (err) { bpf_prog_put(prog); return err; @@ -6633,9 +6669,22 @@ void ring_buffer_put(struct perf_buffer *rb) call_rcu(&rb->rcu_head, rb_free_rcu); } +typedef void (*mapped_f)(struct perf_event *event, struct mm_struct *mm); + +#define get_mapped(event, func) \ +({ struct pmu *pmu; \ + mapped_f f = NULL; \ + guard(rcu)(); \ + pmu = READ_ONCE(event->pmu); \ + if (pmu) \ + f = pmu->func; \ + f; \ +}) + static void perf_mmap_open(struct vm_area_struct *vma) { struct perf_event *event = vma->vm_file->private_data; + mapped_f mapped = get_mapped(event, event_mapped); atomic_inc(&event->mmap_count); atomic_inc(&event->rb->mmap_count); @@ -6643,8 +6692,8 @@ static void perf_mmap_open(struct vm_area_struct *vma) if (vma->vm_pgoff) atomic_inc(&event->rb->aux_mmap_count); - if (event->pmu->event_mapped) - event->pmu->event_mapped(event, vma->vm_mm); + if (mapped) + mapped(event, vma->vm_mm); } static void perf_pmu_output_stop(struct perf_event *event); @@ -6660,14 +6709,16 @@ 
static void perf_pmu_output_stop(struct perf_event *event); static void perf_mmap_close(struct vm_area_struct *vma) { struct perf_event *event = vma->vm_file->private_data; + mapped_f unmapped = get_mapped(event, event_unmapped); struct perf_buffer *rb = ring_buffer_get(event); struct user_struct *mmap_user = rb->mmap_user; int mmap_locked = rb->mmap_locked; unsigned long size = perf_data_size(rb); bool detach_rest = false; - if (event->pmu->event_unmapped) - event->pmu->event_unmapped(event, vma->vm_mm); + /* FIXIES vs perf_pmu_unregister() */ + if (unmapped) + unmapped(event, vma->vm_mm); /* * The AUX buffer is strictly a sub-buffer, serialize using aux_mutex @@ -6860,6 +6911,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) unsigned long nr_pages; long user_extra = 0, extra = 0; int ret, flags = 0; + mapped_f mapped; /* * Don't allow mmap() of inherited per-task counters. This would @@ -6890,6 +6942,16 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) mutex_lock(&event->mmap_mutex); ret = -EINVAL; + /* + * This relies on __pmu_detach_event() taking mmap_mutex after marking + * the event REVOKED. Either we observe the state, or __pmu_detach_event() + * will detach the rb created here. 
+ */ + if (event->state <= PERF_EVENT_STATE_REVOKED) { + ret = -ENODEV; + goto unlock; + } + if (vma->vm_pgoff == 0) { nr_pages -= 1; @@ -7068,8 +7130,9 @@ aux_unlock: if (!ret) ret = map_range(rb, vma); - if (!ret && event->pmu->event_mapped) - event->pmu->event_mapped(event, vma->vm_mm); + mapped = get_mapped(event, event_mapped); + if (mapped) + mapped(event, vma->vm_mm); return ret; } @@ -7080,6 +7143,9 @@ static int perf_fasync(int fd, struct file *filp, int on) struct perf_event *event = filp->private_data; int retval; + if (event->state <= PERF_EVENT_STATE_REVOKED) + return -ENODEV; + inode_lock(inode); retval = fasync_helper(fd, filp, on, &event->fasync); inode_unlock(inode); @@ -7231,12 +7297,6 @@ static void perf_pending_task(struct callback_head *head) int rctx; /* - * All accesses to the event must belong to the same implicit RCU read-side - * critical section as the ->pending_work reset. See comment in - * perf_pending_task_sync(). - */ - rcu_read_lock(); - /* * If we 'fail' here, that's OK, it means recursion is already disabled * and we won't recurse 'further'. 
*/ @@ -7246,9 +7306,8 @@ static void perf_pending_task(struct callback_head *head) event->pending_work = 0; perf_sigtrap(event); local_dec(&event->ctx->nr_no_switch_fast); - rcuwait_wake_up(&event->pending_work_wait); } - rcu_read_unlock(); + put_event(event); if (rctx >= 0) perf_swevent_put_recursion_context(rctx); @@ -9966,7 +10025,7 @@ void perf_event_text_poke(const void *addr, const void *old_bytes, void perf_event_itrace_started(struct perf_event *event) { - event->attach_state |= PERF_ATTACH_ITRACE; + WRITE_ONCE(event->attach_state, event->attach_state | PERF_ATTACH_ITRACE); } static void perf_log_itrace_start(struct perf_event *event) @@ -10049,14 +10108,13 @@ __perf_event_account_interrupt(struct perf_event *event, int throttle) hwc->interrupts = 1; } else { hwc->interrupts++; - if (unlikely(throttle && - hwc->interrupts > max_samples_per_tick)) { - __this_cpu_inc(perf_throttled_count); - tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS); - hwc->interrupts = MAX_INTERRUPTS; - perf_log_throttle(event, 0); - ret = 1; - } + } + + if (unlikely(throttle && hwc->interrupts >= max_samples_per_tick)) { + __this_cpu_inc(perf_throttled_count); + tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS); + perf_event_throttle_group(event); + ret = 1; } if (event->attr.freq) { @@ -10243,6 +10301,7 @@ static int __perf_event_overflow(struct perf_event *event, !task_work_add(current, &event->pending_task, notify_mode)) { event->pending_work = pending_id; local_inc(&event->ctx->nr_no_switch_fast); + WARN_ON_ONCE(!atomic_long_inc_not_zero(&event->refcount)); event->pending_addr = 0; if (valid_sample && (data->sample_flags & PERF_SAMPLE_ADDR)) @@ -11088,11 +11147,15 @@ static inline bool perf_event_is_tracing(struct perf_event *event) return false; } -int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog, - u64 bpf_cookie) +static int __perf_event_set_bpf_prog(struct perf_event *event, + struct bpf_prog *prog, + u64 bpf_cookie) { 
bool is_kprobe, is_uprobe, is_tracepoint, is_syscall_tp; + if (event->state <= PERF_EVENT_STATE_REVOKED) + return -ENODEV; + if (!perf_event_is_tracing(event)) return perf_event_set_bpf_handler(event, prog, bpf_cookie); @@ -11127,6 +11190,20 @@ int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog, return perf_event_attach_bpf_prog(event, prog, bpf_cookie); } +int perf_event_set_bpf_prog(struct perf_event *event, + struct bpf_prog *prog, + u64 bpf_cookie) +{ + struct perf_event_context *ctx; + int ret; + + ctx = perf_event_ctx_lock(event); + ret = __perf_event_set_bpf_prog(event, prog, bpf_cookie); + perf_event_ctx_unlock(event, ctx); + + return ret; +} + void perf_event_free_bpf_prog(struct perf_event *event) { if (!event->prog) @@ -11149,7 +11226,15 @@ static void perf_event_free_filter(struct perf_event *event) { } -int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog, +static int __perf_event_set_bpf_prog(struct perf_event *event, + struct bpf_prog *prog, + u64 bpf_cookie) +{ + return -ENOENT; +} + +int perf_event_set_bpf_prog(struct perf_event *event, + struct bpf_prog *prog, u64 bpf_cookie) { return -ENOENT; @@ -11679,8 +11764,7 @@ static void perf_swevent_init_hrtimer(struct perf_event *event) if (!is_sampling_event(event)) return; - hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); - hwc->hrtimer.function = perf_swevent_hrtimer; + hrtimer_setup(&hwc->hrtimer, perf_swevent_hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); /* * Since hrtimers have a fixed rate, we can do a static freq->period @@ -12255,6 +12339,9 @@ int perf_pmu_register(struct pmu *_pmu, const char *name, int type) if (!pmu->event_idx) pmu->event_idx = perf_event_idx_default; + INIT_LIST_HEAD(&pmu->events); + spin_lock_init(&pmu->events_lock); + /* * Now that the PMU is complete, make it visible to perf_try_init_event(). 
*/ @@ -12268,21 +12355,143 @@ int perf_pmu_register(struct pmu *_pmu, const char *name, int type) } EXPORT_SYMBOL_GPL(perf_pmu_register); -void perf_pmu_unregister(struct pmu *pmu) +static void __pmu_detach_event(struct pmu *pmu, struct perf_event *event, + struct perf_event_context *ctx) +{ + /* + * De-schedule the event and mark it REVOKED. + */ + perf_event_exit_event(event, ctx, true); + + /* + * All _free_event() bits that rely on event->pmu: + * + * Notably, perf_mmap() relies on the ordering here. + */ + scoped_guard (mutex, &event->mmap_mutex) { + WARN_ON_ONCE(pmu->event_unmapped); + /* + * Mostly an empty lock sequence, such that perf_mmap(), which + * relies on mmap_mutex, is sure to observe the state change. + */ + } + + perf_event_free_bpf_prog(event); + perf_free_addr_filters(event); + + if (event->destroy) { + event->destroy(event); + event->destroy = NULL; + } + + if (event->pmu_ctx) { + put_pmu_ctx(event->pmu_ctx); + event->pmu_ctx = NULL; + } + + exclusive_event_destroy(event); + module_put(pmu->module); + + event->pmu = NULL; /* force fault instead of UAF */ +} + +static void pmu_detach_event(struct pmu *pmu, struct perf_event *event) +{ + struct perf_event_context *ctx; + + ctx = perf_event_ctx_lock(event); + __pmu_detach_event(pmu, event, ctx); + perf_event_ctx_unlock(event, ctx); + + scoped_guard (spinlock, &pmu->events_lock) + list_del(&event->pmu_list); +} + +static struct perf_event *pmu_get_event(struct pmu *pmu) +{ + struct perf_event *event; + + guard(spinlock)(&pmu->events_lock); + list_for_each_entry(event, &pmu->events, pmu_list) { + if (atomic_long_inc_not_zero(&event->refcount)) + return event; + } + + return NULL; +} + +static bool pmu_empty(struct pmu *pmu) +{ + guard(spinlock)(&pmu->events_lock); + return list_empty(&pmu->events); +} + +static void pmu_detach_events(struct pmu *pmu) +{ + struct perf_event *event; + + for (;;) { + event = pmu_get_event(pmu); + if (!event) + break; + + pmu_detach_event(pmu, event); + 
put_event(event); + } + + /* + * wait for pending _free_event()s + */ + wait_var_event(pmu, pmu_empty(pmu)); +} + +int perf_pmu_unregister(struct pmu *pmu) { scoped_guard (mutex, &pmus_lock) { + if (!idr_cmpxchg(&pmu_idr, pmu->type, pmu, NULL)) + return -EINVAL; + list_del_rcu(&pmu->entry); - idr_remove(&pmu_idr, pmu->type); } /* * We dereference the pmu list under both SRCU and regular RCU, so * synchronize against both of those. + * + * Notably, the entirety of event creation, from perf_init_event() + * (which will now fail, because of the above) until + * perf_install_in_context() should be under SRCU such that + * this synchronizes against event creation. This avoids trying to + * detach events that are not fully formed. */ synchronize_srcu(&pmus_srcu); synchronize_rcu(); + if (pmu->event_unmapped && !pmu_empty(pmu)) { + /* + * Can't force remove events when pmu::event_unmapped() + * is used in perf_mmap_close(). + */ + guard(mutex)(&pmus_lock); + idr_cmpxchg(&pmu_idr, pmu->type, NULL, pmu); + list_add_rcu(&pmu->entry, &pmus); + return -EBUSY; + } + + scoped_guard (mutex, &pmus_lock) + idr_remove(&pmu_idr, pmu->type); + + /* + * PMU is removed from the pmus list, so no new events will + * be created, now take care of the existing ones. + */ + pmu_detach_events(pmu); + + /* + * PMU is unused, make it go away. 
+ */ perf_pmu_free(pmu); + return 0; } EXPORT_SYMBOL_GPL(perf_pmu_unregister); @@ -12376,7 +12585,7 @@ static struct pmu *perf_init_event(struct perf_event *event) struct pmu *pmu; int type, ret; - guard(srcu)(&pmus_srcu); + guard(srcu)(&pmus_srcu); /* pmu idr/list access */ /* * Save original type before calling pmu->event_init() since certain @@ -12600,13 +12809,13 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, INIT_LIST_HEAD(&event->active_entry); INIT_LIST_HEAD(&event->addr_filters.list); INIT_HLIST_NODE(&event->hlist_entry); + INIT_LIST_HEAD(&event->pmu_list); init_waitqueue_head(&event->waitq); init_irq_work(&event->pending_irq, perf_pending_irq); event->pending_disable_irq = IRQ_WORK_INIT_HARD(perf_pending_disable); init_task_work(&event->pending_task, perf_pending_task); - rcuwait_init(&event->pending_work_wait); mutex_init(&event->mmap_mutex); raw_spin_lock_init(&event->addr_filters.lock); @@ -12672,7 +12881,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, hwc = &event->hw; hwc->sample_period = attr->sample_period; - if (attr->freq && attr->sample_freq) + if (is_event_in_freq_mode(event)) hwc->sample_period = 1; hwc->last_period = hwc->sample_period; @@ -12779,6 +12988,13 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, /* symmetric to unaccount_event() in _free_event() */ account_event(event); + /* + * Event creation should be under SRCU, see perf_pmu_unregister(). 
+ */ + lockdep_assert_held(&pmus_srcu); + scoped_guard (spinlock, &pmu->events_lock) + list_add(&event->pmu_list, &pmu->events); + return_ptr(event); } @@ -12849,7 +13065,7 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, } /* privileged levels capture (kernel, hv): check permissions */ if (mask & PERF_SAMPLE_BRANCH_PERM_PLM) { - ret = perf_allow_kernel(attr); + ret = perf_allow_kernel(); if (ret) return ret; } @@ -12978,6 +13194,9 @@ set: goto unlock; if (output_event) { + if (output_event->state <= PERF_EVENT_STATE_REVOKED) + goto unlock; + /* get the rb we want to redirect to */ rb = ring_buffer_get(output_event); if (!rb) @@ -13106,12 +13325,12 @@ SYSCALL_DEFINE5(perf_event_open, return err; /* Do we allow access to perf_event_open(2) ? */ - err = security_perf_event_open(&attr, PERF_SECURITY_OPEN); + err = security_perf_event_open(PERF_SECURITY_OPEN); if (err) return err; if (!attr.exclude_kernel) { - err = perf_allow_kernel(&attr); + err = perf_allow_kernel(); if (err) return err; } @@ -13131,7 +13350,7 @@ SYSCALL_DEFINE5(perf_event_open, /* Only privileged users can get physical addresses */ if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR)) { - err = perf_allow_kernel(&attr); + err = perf_allow_kernel(); if (err) return err; } @@ -13159,6 +13378,11 @@ SYSCALL_DEFINE5(perf_event_open, if (event_fd < 0) return event_fd; + /* + * Event creation should be under SRCU, see perf_pmu_unregister(). 
+ */ + guard(srcu)(&pmus_srcu); + CLASS(fd, group)(group_fd); // group_fd == -1 => empty if (group_fd != -1) { if (!is_perf_file(group)) { @@ -13166,6 +13390,10 @@ SYSCALL_DEFINE5(perf_event_open, goto err_fd; } group_leader = fd_file(group)->private_data; + if (group_leader->state <= PERF_EVENT_STATE_REVOKED) { + err = -ENODEV; + goto err_fd; + } if (flags & PERF_FLAG_FD_OUTPUT) output_event = group_leader; if (flags & PERF_FLAG_FD_NO_GROUP) @@ -13462,7 +13690,7 @@ err_cred: if (task) up_read(&task->signal->exec_update_lock); err_alloc: - free_event(event); + put_event(event); err_task: if (task) put_task_struct(task); @@ -13499,6 +13727,11 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, if (attr->aux_output || attr->aux_action) return ERR_PTR(-EINVAL); + /* + * Event creation should be under SRCU, see perf_pmu_unregister(). + */ + guard(srcu)(&pmus_srcu); + event = perf_event_alloc(attr, cpu, task, NULL, NULL, overflow_handler, context, -1); if (IS_ERR(event)) { @@ -13570,7 +13803,7 @@ err_unlock: perf_unpin_context(ctx); put_ctx(ctx); err_alloc: - free_event(event); + put_event(event); err: return ERR_PTR(err); } @@ -13710,10 +13943,12 @@ static void sync_child_event(struct perf_event *child_event) } static void -perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx) +perf_event_exit_event(struct perf_event *event, + struct perf_event_context *ctx, bool revoke) { struct perf_event *parent_event = event->parent; - unsigned long detach_flags = 0; + unsigned long detach_flags = DETACH_EXIT; + unsigned int attach_state; if (parent_event) { /* @@ -13728,28 +13963,38 @@ perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx) * Do destroy all inherited groups, we don't care about those * and being thorough is better. 
*/ - detach_flags = DETACH_GROUP | DETACH_CHILD; + detach_flags |= DETACH_GROUP | DETACH_CHILD; mutex_lock(&parent_event->child_mutex); + /* PERF_ATTACH_ITRACE might be set concurrently */ + attach_state = READ_ONCE(event->attach_state); } - perf_remove_from_context(event, detach_flags); - - raw_spin_lock_irq(&ctx->lock); - if (event->state > PERF_EVENT_STATE_EXIT) - perf_event_set_state(event, PERF_EVENT_STATE_EXIT); - raw_spin_unlock_irq(&ctx->lock); + if (revoke) + detach_flags |= DETACH_GROUP | DETACH_REVOKE; + perf_remove_from_context(event, detach_flags); /* * Child events can be freed. */ if (parent_event) { mutex_unlock(&parent_event->child_mutex); + /* - * Kick perf_poll() for is_event_hup(); + * Match the refcount initialization. Make sure it doesn't happen + * twice if pmu_detach_event() calls it on an already exited task. */ - perf_event_wakeup(parent_event); - free_event(event); - put_event(parent_event); + if (attach_state & PERF_ATTACH_CHILD) { + /* + * Kick perf_poll() for is_event_hup(); + */ + perf_event_wakeup(parent_event); + /* + * pmu_detach_event() will have an extra refcount. + * perf_pending_task() might have one too. + */ + put_event(event); + } + return; } @@ -13759,15 +14004,13 @@ perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx) perf_event_wakeup(event); } -static void perf_event_exit_task_context(struct task_struct *child) +static void perf_event_exit_task_context(struct task_struct *task, bool exit) { - struct perf_event_context *child_ctx, *clone_ctx = NULL; + struct perf_event_context *ctx, *clone_ctx = NULL; struct perf_event *child_event, *next; - WARN_ON_ONCE(child != current); - - child_ctx = perf_pin_task_context(child); - if (!child_ctx) + ctx = perf_pin_task_context(task); + if (!ctx) return; /* @@ -13780,27 +14023,28 @@ static void perf_event_exit_task_context(struct task_struct *child) * without ctx::mutex (it cannot because of the move_group double mutex * lock thing). 
See the comments in perf_install_in_context(). */ - mutex_lock(&child_ctx->mutex); + mutex_lock(&ctx->mutex); /* * In a single ctx::lock section, de-schedule the events and detach the * context from the task such that we cannot ever get it scheduled back * in. */ - raw_spin_lock_irq(&child_ctx->lock); - task_ctx_sched_out(child_ctx, NULL, EVENT_ALL); + raw_spin_lock_irq(&ctx->lock); + if (exit) + task_ctx_sched_out(ctx, NULL, EVENT_ALL); /* * Now that the context is inactive, destroy the task <-> ctx relation * and mark the context dead. */ - RCU_INIT_POINTER(child->perf_event_ctxp, NULL); - put_ctx(child_ctx); /* cannot be last */ - WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE); - put_task_struct(current); /* cannot be last */ + RCU_INIT_POINTER(task->perf_event_ctxp, NULL); + put_ctx(ctx); /* cannot be last */ + WRITE_ONCE(ctx->task, TASK_TOMBSTONE); + put_task_struct(task); /* cannot be last */ - clone_ctx = unclone_ctx(child_ctx); - raw_spin_unlock_irq(&child_ctx->lock); + clone_ctx = unclone_ctx(ctx); + raw_spin_unlock_irq(&ctx->lock); if (clone_ctx) put_ctx(clone_ctx); @@ -13810,28 +14054,48 @@ static void perf_event_exit_task_context(struct task_struct *child) * won't get any samples after PERF_RECORD_EXIT. We can however still * get a few PERF_RECORD_READ events. */ - perf_event_task(child, child_ctx, 0); + if (exit) + perf_event_task(task, ctx, 0); - list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry) - perf_event_exit_event(child_event, child_ctx); + list_for_each_entry_safe(child_event, next, &ctx->event_list, event_entry) + perf_event_exit_event(child_event, ctx, false); - mutex_unlock(&child_ctx->mutex); + mutex_unlock(&ctx->mutex); - put_ctx(child_ctx); + if (!exit) { + /* + * perf_event_release_kernel() could still have a reference on + * this context. In that case we must wait for these events to + * have been freed (in particular all their references to this + * task must've been dropped). 
+ * + * Without this copy_process() will unconditionally free this + * task (irrespective of its reference count) and + * _free_event()'s put_task_struct(event->hw.target) will be a + * use-after-free. + * + * Wait for all events to drop their context reference. + */ + wait_var_event(&ctx->refcount, + refcount_read(&ctx->refcount) == 1); + } + put_ctx(ctx); } /* - * When a child task exits, feed back event values to parent events. + * When a task exits, feed back event values to parent events. * * Can be called with exec_update_lock held when called from * setup_new_exec(). */ -void perf_event_exit_task(struct task_struct *child) +void perf_event_exit_task(struct task_struct *task) { struct perf_event *event, *tmp; - mutex_lock(&child->perf_event_mutex); - list_for_each_entry_safe(event, tmp, &child->perf_event_list, + WARN_ON_ONCE(task != current); + + mutex_lock(&task->perf_event_mutex); + list_for_each_entry_safe(event, tmp, &task->perf_event_list, owner_entry) { list_del_init(&event->owner_entry); @@ -13842,44 +14106,23 @@ void perf_event_exit_task(struct task_struct *child) */ smp_store_release(&event->owner, NULL); } - mutex_unlock(&child->perf_event_mutex); + mutex_unlock(&task->perf_event_mutex); - perf_event_exit_task_context(child); + perf_event_exit_task_context(task, true); /* * The perf_event_exit_task_context calls perf_event_task - * with child's task_ctx, which generates EXIT events for - * child contexts and sets child->perf_event_ctxp[] to NULL. + * with task's task_ctx, which generates EXIT events for + * task contexts and sets task->perf_event_ctxp[] to NULL. * At this point we need to send EXIT events to cpu contexts. */ - perf_event_task(child, NULL, 0); + perf_event_task(task, NULL, 0); /* * Detach the perf_ctx_data for the system-wide event. 
*/ guard(percpu_read)(&global_ctx_data_rwsem); - detach_task_ctx_data(child); -} - -static void perf_free_event(struct perf_event *event, - struct perf_event_context *ctx) -{ - struct perf_event *parent = event->parent; - - if (WARN_ON_ONCE(!parent)) - return; - - mutex_lock(&parent->child_mutex); - list_del_init(&event->child_list); - mutex_unlock(&parent->child_mutex); - - put_event(parent); - - raw_spin_lock_irq(&ctx->lock); - perf_group_detach(event); - list_del_event(event, ctx); - raw_spin_unlock_irq(&ctx->lock); - free_event(event); + detach_task_ctx_data(task); } /* @@ -13891,48 +14134,7 @@ static void perf_free_event(struct perf_event *event, */ void perf_event_free_task(struct task_struct *task) { - struct perf_event_context *ctx; - struct perf_event *event, *tmp; - - ctx = rcu_access_pointer(task->perf_event_ctxp); - if (!ctx) - return; - - mutex_lock(&ctx->mutex); - raw_spin_lock_irq(&ctx->lock); - /* - * Destroy the task <-> ctx relation and mark the context dead. - * - * This is important because even though the task hasn't been - * exposed yet the context has been (through child_list). - */ - RCU_INIT_POINTER(task->perf_event_ctxp, NULL); - WRITE_ONCE(ctx->task, TASK_TOMBSTONE); - put_task_struct(task); /* cannot be last */ - raw_spin_unlock_irq(&ctx->lock); - - - list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry) - perf_free_event(event, ctx); - - mutex_unlock(&ctx->mutex); - - /* - * perf_event_release_kernel() could've stolen some of our - * child events and still have them on its free_list. In that - * case we must wait for these events to have been freed (in - * particular all their references to this task must've been - * dropped). - * - * Without this copy_process() will unconditionally free this - * task (irrespective of its reference count) and - * _free_event()'s put_task_struct(event->hw.target) will be a - * use-after-free. - * - * Wait for all events to drop their context reference. 
- */ - wait_var_event(&ctx->refcount, refcount_read(&ctx->refcount) == 1); - put_ctx(ctx); /* must be last */ + perf_event_exit_task_context(task, false); } void perf_event_delayed_put(struct task_struct *task) @@ -13970,12 +14172,12 @@ const struct perf_event_attr *perf_event_attrs(struct perf_event *event) return &event->attr; } -int perf_allow_kernel(struct perf_event_attr *attr) +int perf_allow_kernel(void) { if (sysctl_perf_event_paranoid > 1 && !perfmon_capable()) return -EACCES; - return security_perf_event_open(attr, PERF_SECURITY_KERNEL); + return security_perf_event_open(PERF_SECURITY_KERNEL); } EXPORT_SYMBOL_GPL(perf_allow_kernel); @@ -14009,6 +14211,14 @@ inherit_event(struct perf_event *parent_event, if (parent_event->parent) parent_event = parent_event->parent; + if (parent_event->state <= PERF_EVENT_STATE_REVOKED) + return NULL; + + /* + * Event creation should be under SRCU, see perf_pmu_unregister(). + */ + guard(srcu)(&pmus_srcu); + child_event = perf_event_alloc(&parent_event->attr, parent_event->cpu, child, @@ -14017,6 +14227,9 @@ inherit_event(struct perf_event *parent_event, if (IS_ERR(child_event)) return child_event; + get_ctx(child_ctx); + child_event->ctx = child_ctx; + pmu_ctx = find_get_pmu_context(child_event->pmu, child_ctx, child_event); if (IS_ERR(pmu_ctx)) { free_event(child_event); @@ -14038,8 +14251,6 @@ inherit_event(struct perf_event *parent_event, return NULL; } - get_ctx(child_ctx); - /* * Make the child state follow the state of the parent event, * not its attr.disabled bit. 
We hold the parent's mutex, @@ -14060,7 +14271,6 @@ inherit_event(struct perf_event *parent_event, local64_set(&hwc->period_left, sample_period); } - child_event->ctx = child_ctx; child_event->overflow_handler = parent_event->overflow_handler; child_event->overflow_handler_context = parent_event->overflow_handler_context; diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 5130b119d0ae..d2aef87c7e9f 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -679,7 +679,15 @@ int rb_alloc_aux(struct perf_buffer *rb, struct perf_event *event, { bool overwrite = !(flags & RING_BUFFER_WRITABLE); int node = (event->cpu == -1) ? -1 : cpu_to_node(event->cpu); - int ret = -ENOMEM, max_order; + bool use_contiguous_pages = event->pmu->capabilities & ( + PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_AUX_PREFER_LARGE); + /* + * Initialize max_order to 0 for page allocation. This allocates single + * pages to minimize memory fragmentation. This is overridden if the + * PMU needs or prefers contiguous pages (use_contiguous_pages = true). + */ + int max_order = 0; + int ret = -ENOMEM; if (!has_aux(event)) return -EOPNOTSUPP; @@ -689,8 +697,8 @@ int rb_alloc_aux(struct perf_buffer *rb, struct perf_event *event, if (!overwrite) { /* - * Watermark defaults to half the buffer, and so does the - * max_order, to aid PMU drivers in double buffering. + * Watermark defaults to half the buffer, to aid PMU drivers + * in double buffering. */ if (!watermark) watermark = min_t(unsigned long, @@ -698,16 +706,19 @@ int rb_alloc_aux(struct perf_buffer *rb, struct perf_event *event, (unsigned long)nr_pages << (PAGE_SHIFT - 1)); /* - * Use aux_watermark as the basis for chunking to - * help PMU drivers honor the watermark. + * If using contiguous pages, use aux_watermark as the basis + * for chunking to help PMU drivers honor the watermark. 
*/ - max_order = get_order(watermark); + if (use_contiguous_pages) + max_order = get_order(watermark); } else { /* - * We need to start with the max_order that fits in nr_pages, - * not the other way around, hence ilog2() and not get_order. + * If using contiguous pages, we need to start with the + * max_order that fits in nr_pages, not the other way around, + * hence ilog2() and not get_order. */ - max_order = ilog2(nr_pages); + if (use_contiguous_pages) + max_order = ilog2(nr_pages); watermark = 0; } diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 70c84b9d7be3..8d783b5882b6 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -173,6 +173,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, DEFINE_FOLIO_VMA_WALK(pvmw, old_folio, vma, addr, 0); int err; struct mmu_notifier_range range; + pte_t pte; mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, addr, addr + PAGE_SIZE); @@ -192,6 +193,16 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, if (!page_vma_mapped_walk(&pvmw)) goto unlock; VM_BUG_ON_PAGE(addr != pvmw.address, old_page); + pte = ptep_get(pvmw.pte); + + /* + * Handle PFN swap PTES, such as device-exclusive ones, that actually + * map pages: simply trigger GUP again to fix it up. 
+ */ + if (unlikely(!pte_present(pte))) { + page_vma_mapped_walk_done(&pvmw); + goto unlock; + } if (new_page) { folio_get(new_folio); @@ -206,7 +217,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, inc_mm_counter(mm, MM_ANONPAGES); } - flush_cache_page(vma, addr, pte_pfn(ptep_get(pvmw.pte))); + flush_cache_page(vma, addr, pte_pfn(pte)); ptep_clear_flush(vma, addr, pvmw.pte); if (new_page) set_pte_at(mm, addr, pvmw.pte, @@ -1692,7 +1703,8 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) } vma = _install_special_mapping(mm, area->vaddr, PAGE_SIZE, - VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO, + VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO| + VM_SEALED_SYSMAP, &xol_mapping); if (IS_ERR(vma)) { ret = PTR_ERR(vma); @@ -1944,6 +1956,9 @@ static void free_ret_instance(struct uprobe_task *utask, * to-be-reused return instances for future uretprobes. If ri_timer() * happens to be running right now, though, we fallback to safety and * just perform RCU-delated freeing of ri. + * Admittedly, this is a rather simple use of seqcount, but it nicely + * abstracts away all the necessary memory barriers, so we use + * a well-supported kernel primitive here. */ if (raw_seqcount_try_begin(&utask->ri_seqcount, seq)) { /* immediate reuse of ri without RCU GP is OK */ @@ -2004,12 +2019,20 @@ static void ri_timer(struct timer_list *timer) /* RCU protects return_instance from freeing. */ guard(rcu)(); - write_seqcount_begin(&utask->ri_seqcount); + /* + * See free_ret_instance() for notes on seqcount use. + * We also employ raw API variants to avoid lockdep false-positive + * warning complaining about enabled preemption. The timer can only be + * invoked once for a uprobe_task. Therefore there can only be one + * writer. The reader does not require an even sequence count to make + * progress, so it is OK to remain preemptible on PREEMPT_RT. 
+ */ + raw_write_seqcount_begin(&utask->ri_seqcount); for_each_ret_instance_rcu(ri, utask->return_instances) hprobe_expire(&ri->hprobe, false); - write_seqcount_end(&utask->ri_seqcount); + raw_write_seqcount_end(&utask->ri_seqcount); } static struct uprobe_task *alloc_utask(void) diff --git a/kernel/exit.c b/kernel/exit.c index c2e6c7b7779f..38645039dd8f 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -133,8 +133,13 @@ struct release_task_post { static void __unhash_process(struct release_task_post *post, struct task_struct *p, bool group_dead) { + struct pid *pid = task_pid(p); + nr_threads--; + detach_pid(post->pids, p, PIDTYPE_PID); + wake_up_all(&pid->wait_pidfd); + if (group_dead) { detach_pid(post->pids, p, PIDTYPE_TGID); detach_pid(post->pids, p, PIDTYPE_PGID); @@ -253,7 +258,8 @@ repeat: pidfs_exit(p); cgroup_release(p); - thread_pid = get_pid(p->thread_pid); + /* Retrieve @thread_pid before __unhash_process() may set it to NULL. */ + thread_pid = task_pid(p); write_lock_irq(&tasklist_lock); ptrace_release_task(p); @@ -268,6 +274,9 @@ repeat: leader = p->group_leader; if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) { + /* for pidfs_exit() and do_notify_parent() */ + if (leader->signal->flags & SIGNAL_GROUP_EXIT) + leader->exit_code = leader->signal->group_exit_code; /* * If we were the last child thread and the leader has * exited already, and the leader's parent ignores SIGCHLD, @@ -279,8 +288,8 @@ repeat: } write_unlock_irq(&tasklist_lock); + /* @thread_pid can't go away until free_pids() below */ proc_flush_pid(thread_pid); - put_pid(thread_pid); add_device_randomness(&p->se.sum_exec_runtime, sizeof(p->se.sum_exec_runtime)); free_pids(post.pids); @@ -756,12 +765,6 @@ static void exit_notify(struct task_struct *tsk, int group_dead) kill_orphaned_pgrp(tsk->group_leader, NULL); tsk->exit_state = EXIT_ZOMBIE; - /* - * Ignore thread-group leaders that exited before all - * subthreads did. 
- */ - if (!delay_group_leader(tsk)) - do_notify_pidfd(tsk); if (unlikely(tsk->ptrace)) { int sig = thread_group_leader(tsk) && @@ -774,6 +777,8 @@ static void exit_notify(struct task_struct *tsk, int group_dead) do_notify_parent(tsk, tsk->exit_signal); } else { autoreap = true; + /* untraced sub-thread */ + do_notify_pidfd(tsk); } if (autoreap) { @@ -937,12 +942,12 @@ void __noreturn do_exit(long code) tsk->exit_code = code; taskstats_exit(tsk, group_dead); + trace_sched_process_exit(tsk, group_dead); exit_mm(); if (group_dead) acct_process(); - trace_sched_process_exit(tsk); exit_sem(tsk); exit_shm(tsk); diff --git a/kernel/fork.c b/kernel/fork.c index f11ac96b7587..85afccfdf3b1 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -311,11 +311,9 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node) * so memcg accounting is performed manually on assigning/releasing * stacks to tasks. Drop __GFP_ACCOUNT. */ - stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN, - VMALLOC_START, VMALLOC_END, + stack = __vmalloc_node(THREAD_SIZE, THREAD_ALIGN, THREADINFO_GFP & ~__GFP_ACCOUNT, - PAGE_KERNEL, - 0, node, __builtin_return_address(0)); + node, __builtin_return_address(0)); if (!stack) return -ENOMEM; @@ -436,35 +434,6 @@ static struct kmem_cache *vm_area_cachep; /* SLAB cache for mm_struct structures (tsk->mm) */ static struct kmem_cache *mm_cachep; -#ifdef CONFIG_PER_VMA_LOCK - -/* SLAB cache for vm_area_struct.lock */ -static struct kmem_cache *vma_lock_cachep; - -static bool vma_lock_alloc(struct vm_area_struct *vma) -{ - vma->vm_lock = kmem_cache_alloc(vma_lock_cachep, GFP_KERNEL); - if (!vma->vm_lock) - return false; - - init_rwsem(&vma->vm_lock->lock); - vma->vm_lock_seq = UINT_MAX; - - return true; -} - -static inline void vma_lock_free(struct vm_area_struct *vma) -{ - kmem_cache_free(vma_lock_cachep, vma->vm_lock); -} - -#else /* CONFIG_PER_VMA_LOCK */ - -static inline bool vma_lock_alloc(struct vm_area_struct *vma) { return true; } -static 
inline void vma_lock_free(struct vm_area_struct *vma) {} - -#endif /* CONFIG_PER_VMA_LOCK */ - struct vm_area_struct *vm_area_alloc(struct mm_struct *mm) { struct vm_area_struct *vma; @@ -474,14 +443,46 @@ struct vm_area_struct *vm_area_alloc(struct mm_struct *mm) return NULL; vma_init(vma, mm); - if (!vma_lock_alloc(vma)) { - kmem_cache_free(vm_area_cachep, vma); - return NULL; - } return vma; } +static void vm_area_init_from(const struct vm_area_struct *src, + struct vm_area_struct *dest) +{ + dest->vm_mm = src->vm_mm; + dest->vm_ops = src->vm_ops; + dest->vm_start = src->vm_start; + dest->vm_end = src->vm_end; + dest->anon_vma = src->anon_vma; + dest->vm_pgoff = src->vm_pgoff; + dest->vm_file = src->vm_file; + dest->vm_private_data = src->vm_private_data; + vm_flags_init(dest, src->vm_flags); + memcpy(&dest->vm_page_prot, &src->vm_page_prot, + sizeof(dest->vm_page_prot)); + /* + * src->shared.rb may be modified concurrently when called from + * dup_mmap(), but the clone will reinitialize it. + */ + data_race(memcpy(&dest->shared, &src->shared, sizeof(dest->shared))); + memcpy(&dest->vm_userfaultfd_ctx, &src->vm_userfaultfd_ctx, + sizeof(dest->vm_userfaultfd_ctx)); +#ifdef CONFIG_ANON_VMA_NAME + dest->anon_name = src->anon_name; +#endif +#ifdef CONFIG_SWAP + memcpy(&dest->swap_readahead_info, &src->swap_readahead_info, + sizeof(dest->swap_readahead_info)); +#endif +#ifndef CONFIG_MMU + dest->vm_region = src->vm_region; +#endif +#ifdef CONFIG_NUMA + dest->vm_policy = src->vm_policy; +#endif +} + struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) { struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); @@ -491,15 +492,8 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) ASSERT_EXCLUSIVE_WRITER(orig->vm_flags); ASSERT_EXCLUSIVE_WRITER(orig->vm_file); - /* - * orig->shared.rb may be modified concurrently, but the clone - * will be reinitialized. 
- */ - data_race(memcpy(new, orig, sizeof(*new))); - if (!vma_lock_alloc(new)) { - kmem_cache_free(vm_area_cachep, new); - return NULL; - } + vm_area_init_from(orig, new); + vma_lock_init(new, true); INIT_LIST_HEAD(&new->anon_vma_chain); vma_numab_state_init(new); dup_anon_vma_name(orig, new); @@ -507,35 +501,15 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) return new; } -void __vm_area_free(struct vm_area_struct *vma) +void vm_area_free(struct vm_area_struct *vma) { + /* The vma should be detached while being destroyed. */ + vma_assert_detached(vma); vma_numab_state_free(vma); free_anon_vma_name(vma); - vma_lock_free(vma); kmem_cache_free(vm_area_cachep, vma); } -#ifdef CONFIG_PER_VMA_LOCK -static void vm_area_free_rcu_cb(struct rcu_head *head) -{ - struct vm_area_struct *vma = container_of(head, struct vm_area_struct, - vm_rcu); - - /* The vma should not be locked while being destroyed. */ - VM_BUG_ON_VMA(rwsem_is_locked(&vma->vm_lock->lock), vma); - __vm_area_free(vma); -} -#endif - -void vm_area_free(struct vm_area_struct *vma) -{ -#ifdef CONFIG_PER_VMA_LOCK - call_rcu(&vma->vm_rcu, vm_area_free_rcu_cb); -#else - __vm_area_free(vma); -#endif -} - static void account_kernel_stack(struct task_struct *tsk, int account) { if (IS_ENABLED(CONFIG_VMAP_STACK)) { @@ -694,6 +668,11 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, tmp = vm_area_dup(mpnt); if (!tmp) goto fail_nomem; + + /* track_pfn_copy() will later take care of copying internal state. 
*/ + if (unlikely(tmp->vm_flags & VM_PFNMAP)) + untrack_pfn_clear(tmp); + retval = vma_dup_policy(mpnt, tmp); if (retval) goto fail_nomem_policy; @@ -826,6 +805,36 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) #define mm_free_pgd(mm) #endif /* CONFIG_MMU */ +#ifdef CONFIG_MM_ID +static DEFINE_IDA(mm_ida); + +static inline int mm_alloc_id(struct mm_struct *mm) +{ + int ret; + + ret = ida_alloc_range(&mm_ida, MM_ID_MIN, MM_ID_MAX, GFP_KERNEL); + if (ret < 0) + return ret; + mm->mm_id = ret; + return 0; +} + +static inline void mm_free_id(struct mm_struct *mm) +{ + const mm_id_t id = mm->mm_id; + + mm->mm_id = MM_ID_DUMMY; + if (id == MM_ID_DUMMY) + return; + if (WARN_ON_ONCE(id < MM_ID_MIN || id > MM_ID_MAX)) + return; + ida_free(&mm_ida, id); +} +#else /* !CONFIG_MM_ID */ +static inline int mm_alloc_id(struct mm_struct *mm) { return 0; } +static inline void mm_free_id(struct mm_struct *mm) {} +#endif /* CONFIG_MM_ID */ + static void check_mm(struct mm_struct *mm) { int i; @@ -929,6 +938,7 @@ void __mmdrop(struct mm_struct *mm) WARN_ON_ONCE(mm == current->active_mm); mm_free_pgd(mm); + mm_free_id(mm); destroy_context(mm); mmu_notifier_subscriptions_destroy(mm); check_mm(mm); @@ -1263,6 +1273,15 @@ static void mm_init_uprobes_state(struct mm_struct *mm) #endif } +static void mmap_init_lock(struct mm_struct *mm) +{ + init_rwsem(&mm->mmap_lock); + mm_lock_seqcount_init(mm); +#ifdef CONFIG_PER_VMA_LOCK + rcuwait_init(&mm->vma_writer_wait); +#endif +} + static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, struct user_namespace *user_ns) { @@ -1287,6 +1306,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, RCU_INIT_POINTER(mm->exe_file, NULL); mmu_notifier_subscriptions_init(mm); init_tlb_flush_pending(mm); + futex_mm_init(mm); #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !defined(CONFIG_SPLIT_PMD_PTLOCKS) mm->pmd_huge_pte = NULL; #endif @@ -1304,6 +1324,9 @@ static struct mm_struct 
*mm_init(struct mm_struct *mm, struct task_struct *p, if (mm_alloc_pgd(mm)) goto fail_nopgd; + if (mm_alloc_id(mm)) + goto fail_noid; + if (init_new_context(p, mm)) goto fail_nocontext; @@ -1323,6 +1346,8 @@ fail_pcpu: fail_cid: destroy_context(mm); fail_nocontext: + mm_free_id(mm); +fail_noid: mm_free_pgd(mm); fail_nopgd: free_mm(mm); @@ -1364,6 +1389,7 @@ static inline void __mmput(struct mm_struct *mm) if (mm->binfmt) module_put(mm->binfmt->module); lru_gen_del_mm(mm); + futex_hash_free(mm); mmdrop(mm); } @@ -1559,6 +1585,17 @@ struct mm_struct *get_task_mm(struct task_struct *task) } EXPORT_SYMBOL_GPL(get_task_mm); +static bool may_access_mm(struct mm_struct *mm, struct task_struct *task, unsigned int mode) +{ + if (mm == current->mm) + return true; + if (ptrace_may_access(task, mode)) + return true; + if ((mode & PTRACE_MODE_READ) && perfmon_capable()) + return true; + return false; +} + struct mm_struct *mm_access(struct task_struct *task, unsigned int mode) { struct mm_struct *mm; @@ -1571,7 +1608,7 @@ struct mm_struct *mm_access(struct task_struct *task, unsigned int mode) mm = get_task_mm(task); if (!mm) { mm = ERR_PTR(-ESRCH); - } else if (mm != current->mm && !ptrace_may_access(task, mode)) { + } else if (!may_access_mm(mm, task, mode)) { mmput(mm); mm = ERR_PTR(-EACCES); } @@ -1891,8 +1928,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) #ifdef CONFIG_POSIX_TIMERS INIT_HLIST_HEAD(&sig->posix_timers); INIT_HLIST_HEAD(&sig->ignored_posix_timers); - hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - sig->real_timer.function = it_real_fn; + hrtimer_setup(&sig->real_timer, it_real_fn, CLOCK_MONOTONIC, HRTIMER_MODE_REL); #endif task_lock(current->group_leader); @@ -2003,17 +2039,16 @@ static inline void rcu_copy_process(struct task_struct *p) } /** - * __pidfd_prepare - allocate a new pidfd_file and reserve a pidfd + * pidfd_prepare - allocate a new pidfd_file and reserve a pidfd * @pid: the struct pid for 
which to create a pidfd * @flags: flags of the new @pidfd - * @ret: Where to return the file for the pidfd. + * @ret_file: return the new pidfs file * * Allocate a new file that stashes @pid and reserve a new pidfd number in the * caller's file descriptor table. The pidfd is reserved but not installed yet. * - * The helper doesn't perform checks on @pid which makes it useful for pidfds - * created via CLONE_PIDFD where @pid has no task attached when the pidfd and - * pidfd file are prepared. + * The helper verifies that @pid is still in use, without PIDFD_THREAD the + * task identified by @pid must be a thread-group leader. * * If this function returns successfully the caller is responsible to either * call fd_install() passing the returned pidfd and pidfd file as arguments in @@ -2030,59 +2065,50 @@ static inline void rcu_copy_process(struct task_struct *p) * error, a negative error code is returned from the function and the * last argument remains unchanged. */ -static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret) +int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret_file) { - struct file *pidfd_file; + struct file *pidfs_file; + + /* + * PIDFD_STALE is only allowed to be passed if the caller knows + * that @pid is already registered in pidfs and thus + * PIDFD_INFO_EXIT information is guaranteed to be available. + */ + if (!(flags & PIDFD_STALE)) { + /* + * While holding the pidfd waitqueue lock removing the + * task linkage for the thread-group leader pid + * (PIDTYPE_TGID) isn't possible. Thus, if there's still + * task linkage for PIDTYPE_PID not having thread-group + * leader linkage for the pid means it wasn't a + * thread-group leader in the first place. + */ + guard(spinlock_irq)(&pid->wait_pidfd.lock); + + /* Task has already been reaped. 
*/ + if (!pid_has_task(pid, PIDTYPE_PID)) + return -ESRCH; + /* + * If this struct pid isn't used as a thread-group + * leader but the caller requested to create a + * thread-group leader pidfd then report ENOENT. + */ + if (!(flags & PIDFD_THREAD) && !pid_has_task(pid, PIDTYPE_TGID)) + return -ENOENT; + } CLASS(get_unused_fd, pidfd)(O_CLOEXEC); if (pidfd < 0) return pidfd; - pidfd_file = pidfs_alloc_file(pid, flags | O_RDWR); - if (IS_ERR(pidfd_file)) - return PTR_ERR(pidfd_file); + pidfs_file = pidfs_alloc_file(pid, flags | O_RDWR); + if (IS_ERR(pidfs_file)) + return PTR_ERR(pidfs_file); - *ret = pidfd_file; + *ret_file = pidfs_file; return take_fd(pidfd); } -/** - * pidfd_prepare - allocate a new pidfd_file and reserve a pidfd - * @pid: the struct pid for which to create a pidfd - * @flags: flags of the new @pidfd - * @ret: Where to return the pidfd. - * - * Allocate a new file that stashes @pid and reserve a new pidfd number in the - * caller's file descriptor table. The pidfd is reserved but not installed yet. - * - * The helper verifies that @pid is still in use, without PIDFD_THREAD the - * task identified by @pid must be a thread-group leader. - * - * If this function returns successfully the caller is responsible to either - * call fd_install() passing the returned pidfd and pidfd file as arguments in - * order to install the pidfd into its file descriptor table or they must use - * put_unused_fd() and fput() on the returned pidfd and pidfd file - * respectively. - * - * This function is useful when a pidfd must already be reserved but there - * might still be points of failure afterwards and the caller wants to ensure - * that no pidfd is leaked into its file descriptor table. - * - * Return: On success, a reserved pidfd is returned from the function and a new - * pidfd file is returned in the last argument to the function. On - * error, a negative error code is returned from the function and the - * last argument remains unchanged. 
- */ -int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret) -{ - bool thread = flags & PIDFD_THREAD; - - if (!pid || !pid_has_task(pid, thread ? PIDTYPE_PID : PIDTYPE_TGID)) - return -EINVAL; - - return __pidfd_prepare(pid, flags, ret); -} - static void __delayed_free_task(struct rcu_head *rhp) { struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); @@ -2129,6 +2155,13 @@ static void rv_task_fork(struct task_struct *p) #define rv_task_fork(p) do {} while (0) #endif +static bool need_futex_hash_allocate_default(u64 clone_flags) +{ + if ((clone_flags & (CLONE_THREAD | CLONE_VM)) != (CLONE_THREAD | CLONE_VM)) + return false; + return true; +} + /* * This creates a new process as a copy of the old one, * but does not actually start it yet. @@ -2429,7 +2462,7 @@ __latent_entropy struct task_struct *copy_process( * Note that no task has been attached to @pid yet indicate * that via CLONE_PIDFD. */ - retval = __pidfd_prepare(pid, flags | PIDFD_CLONE, &pidfile); + retval = pidfd_prepare(pid, flags | PIDFD_STALE, &pidfile); if (retval < 0) goto bad_fork_free_pid; pidfd = retval; @@ -2510,6 +2543,21 @@ __latent_entropy struct task_struct *copy_process( goto bad_fork_cancel_cgroup; /* + * Allocate a default futex hash for the user process once the first + * thread spawns. + */ + if (need_futex_hash_allocate_default(clone_flags)) { + retval = futex_hash_allocate_default(); + if (retval) + goto bad_fork_core_free; + /* + * If we fail beyond this point we don't free the allocated + * futex hash map. We assume that another thread will be created + * and makes use of it. The hash map will be freed once the main + * thread terminates. + */ + } + /* * From this point on we must avoid any synchronous user-space * communication until we take the tasklist-lock. 
In particular, we do * not want user-space to be able to predict the process start-time by @@ -3180,6 +3228,11 @@ void __init mm_cache_init(void) void __init proc_caches_init(void) { + struct kmem_cache_args args = { + .use_freeptr_offset = true, + .freeptr_offset = offsetof(struct vm_area_struct, vm_freeptr), + }; + sighand_cachep = kmem_cache_create("sighand_cache", sizeof(struct sighand_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU| @@ -3196,11 +3249,10 @@ void __init proc_caches_init(void) sizeof(struct fs_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); - - vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT); -#ifdef CONFIG_PER_VMA_LOCK - vma_lock_cachep = KMEM_CACHE(vma_lock, SLAB_PANIC|SLAB_ACCOUNT); -#endif + vm_area_cachep = kmem_cache_create("vm_area_struct", + sizeof(struct vm_area_struct), &args, + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU| + SLAB_ACCOUNT); mmap_init(); nsproxy_cache_init(); } diff --git a/kernel/futex/core.c b/kernel/futex/core.c index 3db8567f5a44..19a2c65f3d37 100644 --- a/kernel/futex/core.c +++ b/kernel/futex/core.c @@ -36,9 +36,15 @@ #include <linux/pagemap.h> #include <linux/debugfs.h> #include <linux/plist.h> +#include <linux/gfp.h> +#include <linux/vmalloc.h> #include <linux/memblock.h> #include <linux/fault-inject.h> #include <linux/slab.h> +#include <linux/prctl.h> +#include <linux/rcuref.h> +#include <linux/mempolicy.h> +#include <linux/mmap_lock.h> #include "futex.h" #include "../locking/rtmutex_common.h" @@ -49,12 +55,24 @@ * reside in the same cacheline. 
*/ static struct { - struct futex_hash_bucket *queues; - unsigned long hashsize; + unsigned long hashmask; + unsigned int hashshift; + struct futex_hash_bucket *queues[MAX_NUMNODES]; } __futex_data __read_mostly __aligned(2*sizeof(long)); -#define futex_queues (__futex_data.queues) -#define futex_hashsize (__futex_data.hashsize) +#define futex_hashmask (__futex_data.hashmask) +#define futex_hashshift (__futex_data.hashshift) +#define futex_queues (__futex_data.queues) + +struct futex_private_hash { + rcuref_t users; + unsigned int hash_mask; + struct rcu_head rcu; + void *mm; + bool custom; + bool immutable; + struct futex_hash_bucket queues[]; +}; /* * Fault injections for futexes. @@ -107,21 +125,328 @@ late_initcall(fail_futex_debugfs); #endif /* CONFIG_FAIL_FUTEX */ +static struct futex_hash_bucket * +__futex_hash(union futex_key *key, struct futex_private_hash *fph); + +#ifdef CONFIG_FUTEX_PRIVATE_HASH +static inline bool futex_key_is_private(union futex_key *key) +{ + /* + * Relies on get_futex_key() to set either bit for shared + * futexes -- see comment with union futex_key. + */ + return !(key->both.offset & (FUT_OFF_INODE | FUT_OFF_MMSHARED)); +} + +bool futex_private_hash_get(struct futex_private_hash *fph) +{ + if (fph->immutable) + return true; + return rcuref_get(&fph->users); +} + +void futex_private_hash_put(struct futex_private_hash *fph) +{ + /* Ignore return value, last put is verified via rcuref_is_dead() */ + if (fph->immutable) + return; + if (rcuref_put(&fph->users)) + wake_up_var(fph->mm); +} + /** - * futex_hash - Return the hash bucket in the global hash - * @key: Pointer to the futex key for which the hash is calculated + * futex_hash_get - Get an additional reference for the local hash. + * @hb: ptr to the private local hash. * - * We hash on the keys returned from get_futex_key (see below) and return the - * corresponding hash bucket in the global hash. + * Obtain an additional reference for the already obtained hash bucket. 
The + * caller must already own a reference. */ +void futex_hash_get(struct futex_hash_bucket *hb) +{ + struct futex_private_hash *fph = hb->priv; + + if (!fph) + return; + WARN_ON_ONCE(!futex_private_hash_get(fph)); +} + +void futex_hash_put(struct futex_hash_bucket *hb) +{ + struct futex_private_hash *fph = hb->priv; + + if (!fph) + return; + futex_private_hash_put(fph); +} + +static struct futex_hash_bucket * +__futex_hash_private(union futex_key *key, struct futex_private_hash *fph) +{ + u32 hash; + + if (!futex_key_is_private(key)) + return NULL; + + if (!fph) + fph = rcu_dereference(key->private.mm->futex_phash); + if (!fph || !fph->hash_mask) + return NULL; + + hash = jhash2((void *)&key->private.address, + sizeof(key->private.address) / 4, + key->both.offset); + return &fph->queues[hash & fph->hash_mask]; +} + +static void futex_rehash_private(struct futex_private_hash *old, + struct futex_private_hash *new) +{ + struct futex_hash_bucket *hb_old, *hb_new; + unsigned int slots = old->hash_mask + 1; + unsigned int i; + + for (i = 0; i < slots; i++) { + struct futex_q *this, *tmp; + + hb_old = &old->queues[i]; + + spin_lock(&hb_old->lock); + plist_for_each_entry_safe(this, tmp, &hb_old->chain, list) { + + plist_del(&this->list, &hb_old->chain); + futex_hb_waiters_dec(hb_old); + + WARN_ON_ONCE(this->lock_ptr != &hb_old->lock); + + hb_new = __futex_hash(&this->key, new); + futex_hb_waiters_inc(hb_new); + /* + * The new pointer isn't published yet but an already + * moved user can be unqueued due to timeout or signal.
+ */ + spin_lock_nested(&hb_new->lock, SINGLE_DEPTH_NESTING); + plist_add(&this->list, &hb_new->chain); + this->lock_ptr = &hb_new->lock; + spin_unlock(&hb_new->lock); + } + spin_unlock(&hb_old->lock); + } +} + +static bool __futex_pivot_hash(struct mm_struct *mm, + struct futex_private_hash *new) +{ + struct futex_private_hash *fph; + + WARN_ON_ONCE(mm->futex_phash_new); + + fph = rcu_dereference_protected(mm->futex_phash, + lockdep_is_held(&mm->futex_hash_lock)); + if (fph) { + if (!rcuref_is_dead(&fph->users)) { + mm->futex_phash_new = new; + return false; + } + + futex_rehash_private(fph, new); + } + rcu_assign_pointer(mm->futex_phash, new); + kvfree_rcu(fph, rcu); + return true; +} + +static void futex_pivot_hash(struct mm_struct *mm) +{ + scoped_guard(mutex, &mm->futex_hash_lock) { + struct futex_private_hash *fph; + + fph = mm->futex_phash_new; + if (fph) { + mm->futex_phash_new = NULL; + __futex_pivot_hash(mm, fph); + } + } +} + +struct futex_private_hash *futex_private_hash(void) +{ + struct mm_struct *mm = current->mm; + /* + * Ideally we don't loop. If there is a replacement in progress + * then a new private hash is already prepared and a reference can't be + * obtained once the last user dropped its. + * In that case we block on mm_struct::futex_hash_lock and either have + * to perform the replacement or wait while someone else is doing the + * job. Either way, on the second iteration we acquire a reference on the + * new private hash or loop again because a new replacement has been + * requested.
+ */ +again: + scoped_guard(rcu) { + struct futex_private_hash *fph; + + fph = rcu_dereference(mm->futex_phash); + if (!fph) + return NULL; + + if (fph->immutable) + return fph; + if (rcuref_get(&fph->users)) + return fph; + } + futex_pivot_hash(mm); + goto again; +} + struct futex_hash_bucket *futex_hash(union futex_key *key) { - u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4, - key->both.offset); + struct futex_private_hash *fph; + struct futex_hash_bucket *hb; + +again: + scoped_guard(rcu) { + hb = __futex_hash(key, NULL); + fph = hb->priv; + + if (!fph || futex_private_hash_get(fph)) + return hb; + } + futex_pivot_hash(key->private.mm); + goto again; +} + +#else /* !CONFIG_FUTEX_PRIVATE_HASH */ + +static struct futex_hash_bucket * +__futex_hash_private(union futex_key *key, struct futex_private_hash *fph) +{ + return NULL; +} + +struct futex_hash_bucket *futex_hash(union futex_key *key) +{ + return __futex_hash(key, NULL); +} + +#endif /* CONFIG_FUTEX_PRIVATE_HASH */ + +#ifdef CONFIG_FUTEX_MPOL + +static int __futex_key_to_node(struct mm_struct *mm, unsigned long addr) +{ + struct vm_area_struct *vma = vma_lookup(mm, addr); + struct mempolicy *mpol; + int node = FUTEX_NO_NODE; + + if (!vma) + return FUTEX_NO_NODE; + + mpol = vma_policy(vma); + if (!mpol) + return FUTEX_NO_NODE; + + switch (mpol->mode) { + case MPOL_PREFERRED: + node = first_node(mpol->nodes); + break; + case MPOL_PREFERRED_MANY: + case MPOL_BIND: + if (mpol->home_node != NUMA_NO_NODE) + node = mpol->home_node; + break; + default: + break; + } + + return node; +} + +static int futex_key_to_node_opt(struct mm_struct *mm, unsigned long addr) +{ + int seq, node; + + guard(rcu)(); + + if (!mmap_lock_speculate_try_begin(mm, &seq)) + return -EBUSY; + + node = __futex_key_to_node(mm, addr); + + if (mmap_lock_speculate_retry(mm, seq)) + return -EAGAIN; + + return node; +} + +static int futex_mpol(struct mm_struct *mm, unsigned long addr) +{ + int node; + + node = 
futex_key_to_node_opt(mm, addr); + if (node >= FUTEX_NO_NODE) + return node; - return &futex_queues[hash & (futex_hashsize - 1)]; + guard(mmap_read_lock)(mm); + return __futex_key_to_node(mm, addr); } +#else /* !CONFIG_FUTEX_MPOL */ + +static int futex_mpol(struct mm_struct *mm, unsigned long addr) +{ + return FUTEX_NO_NODE; +} + +#endif /* CONFIG_FUTEX_MPOL */ + +/** + * __futex_hash - Return the hash bucket + * @key: Pointer to the futex key for which the hash is calculated + * @fph: Pointer to private hash if known + * + * We hash on the keys returned from get_futex_key (see below) and return the + * corresponding hash bucket. + * If the FUTEX is PROCESS_PRIVATE then a per-process hash bucket (from the + * private hash) is returned if existing. Otherwise a hash bucket from the + * global hash is returned. + */ +static struct futex_hash_bucket * +__futex_hash(union futex_key *key, struct futex_private_hash *fph) +{ + int node = key->both.node; + u32 hash; + + if (node == FUTEX_NO_NODE) { + struct futex_hash_bucket *hb; + + hb = __futex_hash_private(key, fph); + if (hb) + return hb; + } + + hash = jhash2((u32 *)key, + offsetof(typeof(*key), both.offset) / sizeof(u32), + key->both.offset); + + if (node == FUTEX_NO_NODE) { + /* + * In case of !FLAGS_NUMA, use some unused hash bits to pick a + * node -- this ensures regular futexes are interleaved across + * the nodes and avoids having to allocate multiple + * hash-tables. + * + * NOTE: this isn't perfectly uniform, but it is fast and + * handles sparse node masks. + */ + node = (hash >> futex_hashshift) % nr_node_ids; + if (!node_possible(node)) { + node = find_next_bit_wrap(node_possible_map.bits, + nr_node_ids, node); + } + } + + return &futex_queues[node][hash & futex_hashmask]; +} /** * futex_setup_timer - set up the sleeping hrtimer. 
@@ -227,25 +552,60 @@ int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key, struct page *page; struct folio *folio; struct address_space *mapping; - int err, ro = 0; + int node, err, size, ro = 0; + bool node_updated = false; bool fshared; fshared = flags & FLAGS_SHARED; + size = futex_size(flags); + if (flags & FLAGS_NUMA) + size *= 2; /* * The futex address must be "naturally" aligned. */ key->both.offset = address % PAGE_SIZE; - if (unlikely((address % sizeof(u32)) != 0)) + if (unlikely((address % size) != 0)) return -EINVAL; address -= key->both.offset; - if (unlikely(!access_ok(uaddr, sizeof(u32)))) + if (unlikely(!access_ok(uaddr, size))) return -EFAULT; if (unlikely(should_fail_futex(fshared))) return -EFAULT; + node = FUTEX_NO_NODE; + + if (flags & FLAGS_NUMA) { + u32 __user *naddr = (void *)uaddr + size / 2; + + if (futex_get_value(&node, naddr)) + return -EFAULT; + + if (node != FUTEX_NO_NODE && + ((unsigned int)node >= MAX_NUMNODES || !node_possible(node))) + return -EINVAL; /* unsigned compare also rejects negative node ids from user space */ + } + + if (node == FUTEX_NO_NODE && (flags & FLAGS_MPOL)) { + node = futex_mpol(mm, address); + node_updated = true; + } + + if (flags & FLAGS_NUMA) { + u32 __user *naddr = (void *)uaddr + size / 2; + + if (node == FUTEX_NO_NODE) { + node = numa_node_id(); + node_updated = true; + } + if (node_updated && futex_put_value(node, naddr)) + return -EFAULT; + } + + key->both.node = node; + /* * PROCESS_PRIVATE futexes are fast. * As the mm cannot disappear under us and the 'key' only needs @@ -502,13 +862,9 @@ void __futex_unqueue(struct futex_q *q) } /* The key must be already stored in q->key.
*/ -struct futex_hash_bucket *futex_q_lock(struct futex_q *q) +void futex_q_lock(struct futex_q *q, struct futex_hash_bucket *hb) __acquires(&hb->lock) { - struct futex_hash_bucket *hb; - - hb = futex_hash(&q->key); - /* * Increment the counter before taking the lock so that * a potential waker won't miss a to-be-slept task that is @@ -522,14 +878,13 @@ struct futex_hash_bucket *futex_q_lock(struct futex_q *q) q->lock_ptr = &hb->lock; spin_lock(&hb->lock); - return hb; } void futex_q_unlock(struct futex_hash_bucket *hb) __releases(&hb->lock) { - spin_unlock(&hb->lock); futex_hb_waiters_dec(hb); + spin_unlock(&hb->lock); } void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb, @@ -568,6 +923,8 @@ int futex_unqueue(struct futex_q *q) spinlock_t *lock_ptr; int ret = 0; + /* RCU so lock_ptr is not going away during locking. */ + guard(rcu)(); /* In the common case we don't take the spinlock, which is nice. */ retry: /* @@ -606,6 +963,24 @@ retry: return ret; } +void futex_q_lockptr_lock(struct futex_q *q) +{ + spinlock_t *lock_ptr; + + /* + * See futex_unqueue() why lock_ptr can change. + */ + guard(rcu)(); +retry: + lock_ptr = READ_ONCE(q->lock_ptr); + spin_lock(lock_ptr); + + if (unlikely(lock_ptr != q->lock_ptr)) { + spin_unlock(lock_ptr); + goto retry; + } +} + /* * PI futexes can not be requeued and must remove themselves from the hash * bucket. The hash bucket lock (i.e. lock_ptr) is held. @@ -949,10 +1324,20 @@ static void exit_pi_state_list(struct task_struct *curr) { struct list_head *next, *head = &curr->pi_state_list; struct futex_pi_state *pi_state; - struct futex_hash_bucket *hb; union futex_key key = FUTEX_KEY_INIT; /* + * The mutex mm_struct::futex_hash_lock might be acquired. + */ + might_sleep(); + /* + * Ensure the hash remains stable (no resize) during the while loop + * below. The hb pointer is acquired under the pi_lock so we can't block + * on the mutex. 
+ */ + WARN_ON(curr != current); + guard(private_hash)(); + /* * We are a ZOMBIE and nobody can enqueue itself on * pi_state_list anymore, but we have to be careful * versus waiters unqueueing themselves: @@ -962,50 +1347,52 @@ static void exit_pi_state_list(struct task_struct *curr) next = head->next; pi_state = list_entry(next, struct futex_pi_state, list); key = pi_state->key; - hb = futex_hash(&key); - - /* - * We can race against put_pi_state() removing itself from the - * list (a waiter going away). put_pi_state() will first - * decrement the reference count and then modify the list, so - * its possible to see the list entry but fail this reference - * acquire. - * - * In that case; drop the locks to let put_pi_state() make - * progress and retry the loop. - */ - if (!refcount_inc_not_zero(&pi_state->refcount)) { + if (1) { + CLASS(hb, hb)(&key); + + /* + * We can race against put_pi_state() removing itself from the + * list (a waiter going away). put_pi_state() will first + * decrement the reference count and then modify the list, so + * its possible to see the list entry but fail this reference + * acquire. + * + * In that case; drop the locks to let put_pi_state() make + * progress and retry the loop. 
+ */ + if (!refcount_inc_not_zero(&pi_state->refcount)) { + raw_spin_unlock_irq(&curr->pi_lock); + cpu_relax(); + raw_spin_lock_irq(&curr->pi_lock); + continue; + } raw_spin_unlock_irq(&curr->pi_lock); - cpu_relax(); - raw_spin_lock_irq(&curr->pi_lock); - continue; - } - raw_spin_unlock_irq(&curr->pi_lock); - spin_lock(&hb->lock); - raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); - raw_spin_lock(&curr->pi_lock); - /* - * We dropped the pi-lock, so re-check whether this - * task still owns the PI-state: - */ - if (head->next != next) { - /* retain curr->pi_lock for the loop invariant */ - raw_spin_unlock(&pi_state->pi_mutex.wait_lock); + spin_lock(&hb->lock); + raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); + raw_spin_lock(&curr->pi_lock); + /* + * We dropped the pi-lock, so re-check whether this + * task still owns the PI-state: + */ + if (head->next != next) { + /* retain curr->pi_lock for the loop invariant */ + raw_spin_unlock(&pi_state->pi_mutex.wait_lock); + spin_unlock(&hb->lock); + put_pi_state(pi_state); + continue; + } + + WARN_ON(pi_state->owner != curr); + WARN_ON(list_empty(&pi_state->list)); + list_del_init(&pi_state->list); + pi_state->owner = NULL; + + raw_spin_unlock(&curr->pi_lock); + raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); spin_unlock(&hb->lock); - put_pi_state(pi_state); - continue; } - WARN_ON(pi_state->owner != curr); - WARN_ON(list_empty(&pi_state->list)); - list_del_init(&pi_state->list); - pi_state->owner = NULL; - - raw_spin_unlock(&curr->pi_lock); - raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); - spin_unlock(&hb->lock); - rt_mutex_futex_unlock(&pi_state->pi_mutex); put_pi_state(pi_state); @@ -1125,29 +1512,304 @@ void futex_exit_release(struct task_struct *tsk) futex_cleanup_end(tsk, FUTEX_STATE_DEAD); } +static void futex_hash_bucket_init(struct futex_hash_bucket *fhb, + struct futex_private_hash *fph) +{ +#ifdef CONFIG_FUTEX_PRIVATE_HASH + fhb->priv = fph; +#endif + atomic_set(&fhb->waiters, 0); + 
plist_head_init(&fhb->chain); + spin_lock_init(&fhb->lock); +} + +#define FH_CUSTOM 0x01 +#define FH_IMMUTABLE 0x02 + +#ifdef CONFIG_FUTEX_PRIVATE_HASH +void futex_hash_free(struct mm_struct *mm) +{ + struct futex_private_hash *fph; + + kvfree(mm->futex_phash_new); + fph = rcu_dereference_raw(mm->futex_phash); + if (fph) { + WARN_ON_ONCE(rcuref_read(&fph->users) > 1); + kvfree(fph); + } +} + +static bool futex_pivot_pending(struct mm_struct *mm) +{ + struct futex_private_hash *fph; + + guard(rcu)(); + + if (!mm->futex_phash_new) + return true; + + fph = rcu_dereference(mm->futex_phash); + return rcuref_is_dead(&fph->users); +} + +static bool futex_hash_less(struct futex_private_hash *a, + struct futex_private_hash *b) +{ + /* user provided always wins */ + if (!a->custom && b->custom) + return true; + if (a->custom && !b->custom) + return false; + + /* zero-sized hash wins */ + if (!b->hash_mask) + return true; + if (!a->hash_mask) + return false; + + /* keep the biggest */ + if (a->hash_mask < b->hash_mask) + return true; + if (a->hash_mask > b->hash_mask) + return false; + + return false; /* equal */ +} + +static int futex_hash_allocate(unsigned int hash_slots, unsigned int flags) +{ + struct mm_struct *mm = current->mm; + struct futex_private_hash *fph; + bool custom = flags & FH_CUSTOM; + int i; + + if (hash_slots && (hash_slots == 1 || !is_power_of_2(hash_slots))) + return -EINVAL; + + /* + * Once we've disabled the global hash there is no way back. + */ + scoped_guard(rcu) { + fph = rcu_dereference(mm->futex_phash); + if (fph && (!fph->hash_mask || fph->immutable)) { + if (custom) + return -EBUSY; + return 0; + } + } + + fph = kvzalloc(struct_size(fph, queues, hash_slots), GFP_KERNEL_ACCOUNT | __GFP_NOWARN); + if (!fph) + return -ENOMEM; + + rcuref_init(&fph->users, 1); + fph->hash_mask = hash_slots ? 
hash_slots - 1 : 0; + fph->custom = custom; + fph->immutable = !!(flags & FH_IMMUTABLE); + fph->mm = mm; + + for (i = 0; i < hash_slots; i++) + futex_hash_bucket_init(&fph->queues[i], fph); + + if (custom) { + /* + * Only let prctl() wait / retry; don't unduly delay clone(). + */ +again: + wait_var_event(mm, futex_pivot_pending(mm)); + } + + scoped_guard(mutex, &mm->futex_hash_lock) { + struct futex_private_hash *free __free(kvfree) = NULL; + struct futex_private_hash *cur, *new; + + cur = rcu_dereference_protected(mm->futex_phash, + lockdep_is_held(&mm->futex_hash_lock)); + new = mm->futex_phash_new; + mm->futex_phash_new = NULL; + + if (fph) { + if (cur && !new) { + /* + * If we have an existing hash, but do not yet have + * allocated a replacement hash, drop the initial + * reference on the existing hash. + */ + futex_private_hash_put(cur); + } + + if (new) { + /* + * Two updates raced; throw out the lesser one. + */ + if (futex_hash_less(new, fph)) { + free = new; + new = fph; + } else { + free = fph; + } + } else { + new = fph; + } + fph = NULL; + } + + if (new) { + /* + * Will set mm->futex_phash_new on failure; + * futex_private_hash_get() will try again. 
+ */ + if (!__futex_pivot_hash(mm, new) && custom) + goto again; + } + } + return 0; +} + +int futex_hash_allocate_default(void) +{ + unsigned int threads, buckets, current_buckets = 0; + struct futex_private_hash *fph; + + if (!current->mm) + return 0; + + scoped_guard(rcu) { + threads = min_t(unsigned int, + get_nr_threads(current), + num_online_cpus()); + + fph = rcu_dereference(current->mm->futex_phash); + if (fph) { + if (fph->custom) + return 0; + + current_buckets = fph->hash_mask + 1; + } + } + + /* + * The default allocation will remain within + * 16 <= threads * 4 <= global hash size + */ + buckets = roundup_pow_of_two(4 * threads); + buckets = clamp(buckets, 16, futex_hashmask + 1); + + if (current_buckets >= buckets) + return 0; + + return futex_hash_allocate(buckets, 0); +} + +static int futex_hash_get_slots(void) +{ + struct futex_private_hash *fph; + + guard(rcu)(); + fph = rcu_dereference(current->mm->futex_phash); + if (fph && fph->hash_mask) + return fph->hash_mask + 1; + return 0; +} + +static int futex_hash_get_immutable(void) +{ + struct futex_private_hash *fph; + + guard(rcu)(); + fph = rcu_dereference(current->mm->futex_phash); + if (fph && fph->immutable) + return 1; + if (fph && !fph->hash_mask) + return 1; + return 0; +} + +#else + +static int futex_hash_allocate(unsigned int hash_slots, unsigned int flags) +{ + return -EINVAL; +} + +static int futex_hash_get_slots(void) +{ + return 0; +} + +static int futex_hash_get_immutable(void) +{ + return 0; +} +#endif + +int futex_hash_prctl(unsigned long arg2, unsigned long arg3, unsigned long arg4) +{ + unsigned int flags = FH_CUSTOM; + int ret; + + switch (arg2) { + case PR_FUTEX_HASH_SET_SLOTS: + if (arg4 & ~FH_FLAG_IMMUTABLE) + return -EINVAL; + if (arg4 & FH_FLAG_IMMUTABLE) + flags |= FH_IMMUTABLE; + ret = futex_hash_allocate(arg3, flags); + break; + + case PR_FUTEX_HASH_GET_SLOTS: + ret = futex_hash_get_slots(); + break; + + case PR_FUTEX_HASH_GET_IMMUTABLE: + ret = 
futex_hash_get_immutable(); + break; + + default: + ret = -EINVAL; + break; + } + return ret; +} + static int __init futex_init(void) { - unsigned int futex_shift; - unsigned long i; + unsigned long hashsize, i; + unsigned int order, n; + unsigned long size; #ifdef CONFIG_BASE_SMALL - futex_hashsize = 16; + hashsize = 16; #else - futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus()); + hashsize = 256 * num_possible_cpus(); + hashsize /= num_possible_nodes(); + hashsize = max(4, hashsize); + hashsize = roundup_pow_of_two(hashsize); #endif + futex_hashshift = ilog2(hashsize); + size = sizeof(struct futex_hash_bucket) * hashsize; + order = get_order(size); + + for_each_node(n) { + struct futex_hash_bucket *table; + + if (order > MAX_PAGE_ORDER) + table = vmalloc_huge_node(size, GFP_KERNEL, n); + else + table = alloc_pages_exact_nid(n, size, GFP_KERNEL); + + BUG_ON(!table); - futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues), - futex_hashsize, 0, 0, - &futex_shift, NULL, - futex_hashsize, futex_hashsize); - futex_hashsize = 1UL << futex_shift; + for (i = 0; i < hashsize; i++) + futex_hash_bucket_init(&table[i], NULL); - for (i = 0; i < futex_hashsize; i++) { - atomic_set(&futex_queues[i].waiters, 0); - plist_head_init(&futex_queues[i].chain); - spin_lock_init(&futex_queues[i].lock); + futex_queues[n] = table; } + futex_hashmask = hashsize - 1; + pr_info("futex hash table entries: %lu (%lu bytes on %d NUMA nodes, total %lu KiB, %s).\n", + hashsize, size, num_possible_nodes(), size * num_possible_nodes() / 1024, + order > MAX_PAGE_ORDER ? 
"vmalloc" : "linear"); return 0; } core_initcall(futex_init); diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h index 6b2f4c7eb720..fcd1617212ee 100644 --- a/kernel/futex/futex.h +++ b/kernel/futex/futex.h @@ -7,6 +7,7 @@ #include <linux/sched/wake_q.h> #include <linux/compat.h> #include <linux/uaccess.h> +#include <linux/cleanup.h> #ifdef CONFIG_PREEMPT_RT #include <linux/rcuwait.h> @@ -38,6 +39,7 @@ #define FLAGS_HAS_TIMEOUT 0x0040 #define FLAGS_NUMA 0x0080 #define FLAGS_STRICT 0x0100 +#define FLAGS_MPOL 0x0200 /* FUTEX_ to FLAGS_ */ static inline unsigned int futex_to_flags(unsigned int op) @@ -53,7 +55,7 @@ static inline unsigned int futex_to_flags(unsigned int op) return flags; } -#define FUTEX2_VALID_MASK (FUTEX2_SIZE_MASK | FUTEX2_PRIVATE) +#define FUTEX2_VALID_MASK (FUTEX2_SIZE_MASK | FUTEX2_NUMA | FUTEX2_MPOL | FUTEX2_PRIVATE) /* FUTEX2_ to FLAGS_ */ static inline unsigned int futex2_to_flags(unsigned int flags2) @@ -66,6 +68,9 @@ static inline unsigned int futex2_to_flags(unsigned int flags2) if (flags2 & FUTEX2_NUMA) flags |= FLAGS_NUMA; + if (flags2 & FUTEX2_MPOL) + flags |= FLAGS_MPOL; + return flags; } @@ -86,6 +91,19 @@ static inline bool futex_flags_valid(unsigned int flags) if ((flags & FLAGS_SIZE_MASK) != FLAGS_SIZE_32) return false; + /* + * Must be able to represent both FUTEX_NO_NODE and every valid nodeid + * in a futex word. 
+ */ + if (flags & FLAGS_NUMA) { + int bits = 8 * futex_size(flags); + u64 max = ~0ULL; + + max >>= 64 - bits; + if (nr_node_ids >= max) + return false; + } + return true; } @@ -117,6 +135,7 @@ struct futex_hash_bucket { atomic_t waiters; spinlock_t lock; struct plist_head chain; + struct futex_private_hash *priv; } ____cacheline_aligned_in_smp; /* @@ -156,6 +175,7 @@ typedef void (futex_wake_fn)(struct wake_q_head *wake_q, struct futex_q *q); * @requeue_pi_key: the requeue_pi target futex key * @bitset: bitset for the optional bitmasked wakeup * @requeue_state: State field for futex_requeue_pi() + * @drop_hb_ref: Waiter should drop the extra hash bucket reference if true * @requeue_wait: RCU wait for futex_requeue_pi() (RT only) * * We use this hashed waitqueue, instead of a normal wait_queue_entry_t, so @@ -182,6 +202,7 @@ struct futex_q { union futex_key *requeue_pi_key; u32 bitset; atomic_t requeue_state; + bool drop_hb_ref; #ifdef CONFIG_PREEMPT_RT struct rcuwait requeue_wait; #endif @@ -196,12 +217,35 @@ enum futex_access { extern int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key, enum futex_access rw); - +extern void futex_q_lockptr_lock(struct futex_q *q); extern struct hrtimer_sleeper * futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout, int flags, u64 range_ns); extern struct futex_hash_bucket *futex_hash(union futex_key *key); +#ifdef CONFIG_FUTEX_PRIVATE_HASH +extern void futex_hash_get(struct futex_hash_bucket *hb); +extern void futex_hash_put(struct futex_hash_bucket *hb); + +extern struct futex_private_hash *futex_private_hash(void); +extern bool futex_private_hash_get(struct futex_private_hash *fph); +extern void futex_private_hash_put(struct futex_private_hash *fph); + +#else /* !CONFIG_FUTEX_PRIVATE_HASH */ +static inline void futex_hash_get(struct futex_hash_bucket *hb) { } +static inline void futex_hash_put(struct futex_hash_bucket *hb) { } +static inline struct futex_private_hash 
*futex_private_hash(void) { return NULL; } +static inline bool futex_private_hash_get(struct futex_private_hash *fph) { return false; } +static inline void futex_private_hash_put(struct futex_private_hash *fph) { } +#endif + +DEFINE_CLASS(hb, struct futex_hash_bucket *, + if (_T) futex_hash_put(_T), + futex_hash(key), union futex_key *key); + +DEFINE_CLASS(private_hash, struct futex_private_hash *, + if (_T) futex_private_hash_put(_T), + futex_private_hash(), void); /** * futex_match - Check whether two futex keys are equal @@ -219,9 +263,9 @@ static inline int futex_match(union futex_key *key1, union futex_key *key2) } extern int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, - struct futex_q *q, struct futex_hash_bucket **hb); -extern void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q, - struct hrtimer_sleeper *timeout); + struct futex_q *q, union futex_key *key2, + struct task_struct *task); +extern void futex_do_wait(struct futex_q *q, struct hrtimer_sleeper *timeout); extern bool __futex_wake_mark(struct futex_q *q); extern void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q); @@ -256,7 +300,7 @@ static inline int futex_cmpxchg_value_locked(u32 *curval, u32 __user *uaddr, u32 * This looks a bit overkill, but generally just results in a couple * of instructions.
*/ -static __always_inline int futex_read_inatomic(u32 *dest, u32 __user *from) +static __always_inline int futex_get_value(u32 *dest, u32 __user *from) { u32 val; @@ -273,12 +317,26 @@ Efault: return -EFAULT; } +static __always_inline int futex_put_value(u32 val, u32 __user *to) +{ + if (can_do_masked_user_access()) + to = masked_user_access_begin(to); + else if (!user_write_access_begin(to, sizeof(*to))) + return -EFAULT; + unsafe_put_user(val, to, Efault); /* a store, so it needs the write-access window */ + user_write_access_end(); + return 0; +Efault: + user_write_access_end(); + return -EFAULT; +} + static inline int futex_get_value_locked(u32 *dest, u32 __user *from) { int ret; pagefault_disable(); - ret = futex_read_inatomic(dest, from); + ret = futex_get_value(dest, from); pagefault_enable(); return ret; @@ -354,7 +412,7 @@ static inline int futex_hb_waiters_pending(struct futex_hash_bucket *hb) #endif } -extern struct futex_hash_bucket *futex_q_lock(struct futex_q *q); +extern void futex_q_lock(struct futex_q *q, struct futex_hash_bucket *hb); extern void futex_q_unlock(struct futex_hash_bucket *hb); diff --git a/kernel/futex/pi.c b/kernel/futex/pi.c index 7a941845f7ee..dacb2330f1fb 100644 --- a/kernel/futex/pi.c +++ b/kernel/futex/pi.c @@ -806,7 +806,7 @@ handle_err: break; } - spin_lock(q->lock_ptr); + futex_q_lockptr_lock(q); raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); /* @@ -920,7 +920,6 @@ int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int tryl struct hrtimer_sleeper timeout, *to; struct task_struct *exiting = NULL; struct rt_mutex_waiter rt_waiter; - struct futex_hash_bucket *hb; struct futex_q q = futex_q_init; DEFINE_WAKE_Q(wake_q); int res, ret; @@ -939,151 +938,183 @@ retry: goto out; retry_private: - hb = futex_q_lock(&q); + if (1) { + CLASS(hb, hb)(&q.key); - ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, - &exiting, 0); - if (unlikely(ret)) { - /* - * Atomic work succeeded and we got the lock, - * or failed. Either way, we do _not_ block.
- */ - switch (ret) { - case 1: - /* We got the lock. */ - ret = 0; - goto out_unlock_put_key; - case -EFAULT: - goto uaddr_faulted; - case -EBUSY: - case -EAGAIN: - /* - * Two reasons for this: - * - EBUSY: Task is exiting and we just wait for the - * exit to complete. - * - EAGAIN: The user space value changed. - */ - futex_q_unlock(hb); + futex_q_lock(&q, hb); + + ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, + &exiting, 0); + if (unlikely(ret)) { /* - * Handle the case where the owner is in the middle of - * exiting. Wait for the exit to complete otherwise - * this task might loop forever, aka. live lock. + * Atomic work succeeded and we got the lock, + * or failed. Either way, we do _not_ block. */ - wait_for_owner_exiting(ret, exiting); - cond_resched(); - goto retry; - default: - goto out_unlock_put_key; + switch (ret) { + case 1: + /* We got the lock. */ + ret = 0; + goto out_unlock_put_key; + case -EFAULT: + goto uaddr_faulted; + case -EBUSY: + case -EAGAIN: + /* + * Two reasons for this: + * - EBUSY: Task is exiting and we just wait for the + * exit to complete. + * - EAGAIN: The user space value changed. + */ + futex_q_unlock(hb); + /* + * Handle the case where the owner is in the middle of + * exiting. Wait for the exit to complete otherwise + * this task might loop forever, aka. live lock. + */ + wait_for_owner_exiting(ret, exiting); + cond_resched(); + goto retry; + default: + goto out_unlock_put_key; + } } - } - WARN_ON(!q.pi_state); + WARN_ON(!q.pi_state); - /* - * Only actually queue now that the atomic ops are done: - */ - __futex_queue(&q, hb, current); + /* + * Only actually queue now that the atomic ops are done: + */ + __futex_queue(&q, hb, current); - if (trylock) { - ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex); - /* Fixup the trylock return value: */ - ret = ret ? 
0 : -EWOULDBLOCK; - goto no_block; - } + if (trylock) { + ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex); + /* Fixup the trylock return value: */ + ret = ret ? 0 : -EWOULDBLOCK; + goto no_block; + } - /* - * Must be done before we enqueue the waiter, here is unfortunately - * under the hb lock, but that *should* work because it does nothing. - */ - rt_mutex_pre_schedule(); + /* + * Caution; releasing @hb in-scope. The hb->lock is still locked + * while the reference is dropped. The reference can not be dropped + * after the unlock because if a user initiated resize is in progress + * then we might need to wake him. This can not be done after the + * rt_mutex_pre_schedule() invocation. The hb will remain valid because + * the thread, performing resize, will block on hb->lock during + * the requeue. + */ + futex_hash_put(no_free_ptr(hb)); + /* + * Must be done before we enqueue the waiter, here is unfortunately + * under the hb lock, but that *should* work because it does nothing. + */ + rt_mutex_pre_schedule(); - rt_mutex_init_waiter(&rt_waiter); + rt_mutex_init_waiter(&rt_waiter); - /* - * On PREEMPT_RT, when hb->lock becomes an rt_mutex, we must not - * hold it while doing rt_mutex_start_proxy(), because then it will - * include hb->lock in the blocking chain, even through we'll not in - * fact hold it while blocking. This will lead it to report -EDEADLK - * and BUG when futex_unlock_pi() interleaves with this. - * - * Therefore acquire wait_lock while holding hb->lock, but drop the - * latter before calling __rt_mutex_start_proxy_lock(). This - * interleaves with futex_unlock_pi() -- which does a similar lock - * handoff -- such that the latter can observe the futex_q::pi_state - * before __rt_mutex_start_proxy_lock() is done. 
- */ - raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock); - spin_unlock(q.lock_ptr); - /* - * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter - * such that futex_unlock_pi() is guaranteed to observe the waiter when - * it sees the futex_q::pi_state. - */ - ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current, &wake_q); - raw_spin_unlock_irq_wake(&q.pi_state->pi_mutex.wait_lock, &wake_q); + /* + * On PREEMPT_RT, when hb->lock becomes an rt_mutex, we must not + * hold it while doing rt_mutex_start_proxy(), because then it will + * include hb->lock in the blocking chain, even through we'll not in + * fact hold it while blocking. This will lead it to report -EDEADLK + * and BUG when futex_unlock_pi() interleaves with this. + * + * Therefore acquire wait_lock while holding hb->lock, but drop the + * latter before calling __rt_mutex_start_proxy_lock(). This + * interleaves with futex_unlock_pi() -- which does a similar lock + * handoff -- such that the latter can observe the futex_q::pi_state + * before __rt_mutex_start_proxy_lock() is done. + */ + raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock); + spin_unlock(q.lock_ptr); + /* + * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter + * such that futex_unlock_pi() is guaranteed to observe the waiter when + * it sees the futex_q::pi_state. 
+ */ + ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current, &wake_q); + raw_spin_unlock_irq_wake(&q.pi_state->pi_mutex.wait_lock, &wake_q); - if (ret) { - if (ret == 1) - ret = 0; - goto cleanup; - } + if (ret) { + if (ret == 1) + ret = 0; + goto cleanup; + } - if (unlikely(to)) - hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS); + if (unlikely(to)) + hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS); - ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter); + ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter); cleanup: - /* - * If we failed to acquire the lock (deadlock/signal/timeout), we must - * must unwind the above, however we canont lock hb->lock because - * rt_mutex already has a waiter enqueued and hb->lock can itself try - * and enqueue an rt_waiter through rtlock. - * - * Doing the cleanup without holding hb->lock can cause inconsistent - * state between hb and pi_state, but only in the direction of not - * seeing a waiter that is leaving. - * - * See futex_unlock_pi(), it deals with this inconsistency. - * - * There be dragons here, since we must deal with the inconsistency on - * the way out (here), it is impossible to detect/warn about the race - * the other way around (missing an incoming waiter). - * - * What could possibly go wrong... - */ - if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter)) - ret = 0; + /* + * If we failed to acquire the lock (deadlock/signal/timeout), we must + * unwind the above, however we canont lock hb->lock because + * rt_mutex already has a waiter enqueued and hb->lock can itself try + * and enqueue an rt_waiter through rtlock. + * + * Doing the cleanup without holding hb->lock can cause inconsistent + * state between hb and pi_state, but only in the direction of not + * seeing a waiter that is leaving. + * + * See futex_unlock_pi(), it deals with this inconsistency. 
+ * + * There be dragons here, since we must deal with the inconsistency on + * the way out (here), it is impossible to detect/warn about the race + * the other way around (missing an incoming waiter). + * + * What could possibly go wrong... + */ + if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter)) + ret = 0; - /* - * Now that the rt_waiter has been dequeued, it is safe to use - * spinlock/rtlock (which might enqueue its own rt_waiter) and fix up - * the - */ - spin_lock(q.lock_ptr); - /* - * Waiter is unqueued. - */ - rt_mutex_post_schedule(); + /* + * Now that the rt_waiter has been dequeued, it is safe to use + * spinlock/rtlock (which might enqueue its own rt_waiter) and fix up + * the + */ + futex_q_lockptr_lock(&q); + /* + * Waiter is unqueued. + */ + rt_mutex_post_schedule(); no_block: - /* - * Fixup the pi_state owner and possibly acquire the lock if we - * haven't already. - */ - res = fixup_pi_owner(uaddr, &q, !ret); - /* - * If fixup_pi_owner() returned an error, propagate that. If it acquired - * the lock, clear our -ETIMEDOUT or -EINTR. - */ - if (res) - ret = (res < 0) ? res : 0; - - futex_unqueue_pi(&q); - spin_unlock(q.lock_ptr); - goto out; + /* + * Fixup the pi_state owner and possibly acquire the lock if we + * haven't already. + */ + res = fixup_pi_owner(uaddr, &q, !ret); + /* + * If fixup_pi_owner() returned an error, propagate that. If it acquired + * the lock, clear our -ETIMEDOUT or -EINTR. + */ + if (res) + ret = (res < 0) ? 
res : 0; + + futex_unqueue_pi(&q); + spin_unlock(q.lock_ptr); + if (q.drop_hb_ref) { + CLASS(hb, hb)(&q.key); + /* Additional reference from futex_unlock_pi() */ + futex_hash_put(hb); + } + goto out; out_unlock_put_key: - futex_q_unlock(hb); + futex_q_unlock(hb); + goto out; + +uaddr_faulted: + futex_q_unlock(hb); + + ret = fault_in_user_writeable(uaddr); + if (ret) + goto out; + + if (!(flags & FLAGS_SHARED)) + goto retry_private; + + goto retry; + } out: if (to) { @@ -1091,18 +1122,6 @@ out: destroy_hrtimer_on_stack(&to->timer); } return ret != -EINTR ? ret : -ERESTARTNOINTR; - -uaddr_faulted: - futex_q_unlock(hb); - - ret = fault_in_user_writeable(uaddr); - if (ret) - goto out; - - if (!(flags & FLAGS_SHARED)) - goto retry_private; - - goto retry; } /* @@ -1114,7 +1133,6 @@ int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) { u32 curval, uval, vpid = task_pid_vnr(current); union futex_key key = FUTEX_KEY_INIT; - struct futex_hash_bucket *hb; struct futex_q *top_waiter; int ret; @@ -1134,7 +1152,7 @@ retry: if (ret) return ret; - hb = futex_hash(&key); + CLASS(hb, hb)(&key); spin_lock(&hb->lock); retry_hb: @@ -1187,6 +1205,12 @@ retry_hb: */ rt_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex); if (!rt_waiter) { + /* + * Acquire a reference for the leaving waiter to ensure + * valid futex_q::lock_ptr. + */ + futex_hash_get(hb); + top_waiter->drop_hb_ref = true; __futex_unqueue(top_waiter); raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); goto retry_hb; diff --git a/kernel/futex/requeue.c b/kernel/futex/requeue.c index b47bb764b352..c716a66f8692 100644 --- a/kernel/futex/requeue.c +++ b/kernel/futex/requeue.c @@ -87,6 +87,11 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, futex_hb_waiters_inc(hb2); plist_add(&q->list, &hb2->chain); q->lock_ptr = &hb2->lock; + /* + * hb1 and hb2 belong to the same futex_hash_bucket_private + * because if we managed get a reference on hb1 then it can't be + * replaced. 
Therefore we avoid put(hb1)+get(hb2) here. + */ } q->key = *key2; } @@ -231,7 +236,12 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, WARN_ON(!q->rt_waiter); q->rt_waiter = NULL; - + /* + * Acquire a reference for the waiter to ensure valid + * futex_q::lock_ptr. + */ + futex_hash_get(hb); + q->drop_hb_ref = true; q->lock_ptr = &hb->lock; /* Signal locked state to the waiter */ @@ -371,7 +381,6 @@ int futex_requeue(u32 __user *uaddr1, unsigned int flags1, union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; int task_count = 0, ret; struct futex_pi_state *pi_state = NULL; - struct futex_hash_bucket *hb1, *hb2; struct futex_q *this, *next; DEFINE_WAKE_Q(wake_q); @@ -443,240 +452,242 @@ retry: if (requeue_pi && futex_match(&key1, &key2)) return -EINVAL; - hb1 = futex_hash(&key1); - hb2 = futex_hash(&key2); - retry_private: - futex_hb_waiters_inc(hb2); - double_lock_hb(hb1, hb2); + if (1) { + CLASS(hb, hb1)(&key1); + CLASS(hb, hb2)(&key2); - if (likely(cmpval != NULL)) { - u32 curval; + futex_hb_waiters_inc(hb2); + double_lock_hb(hb1, hb2); - ret = futex_get_value_locked(&curval, uaddr1); + if (likely(cmpval != NULL)) { + u32 curval; - if (unlikely(ret)) { - double_unlock_hb(hb1, hb2); - futex_hb_waiters_dec(hb2); + ret = futex_get_value_locked(&curval, uaddr1); - ret = get_user(curval, uaddr1); - if (ret) - return ret; + if (unlikely(ret)) { + futex_hb_waiters_dec(hb2); + double_unlock_hb(hb1, hb2); - if (!(flags1 & FLAGS_SHARED)) - goto retry_private; + ret = get_user(curval, uaddr1); + if (ret) + return ret; - goto retry; - } - if (curval != *cmpval) { - ret = -EAGAIN; - goto out_unlock; - } - } + if (!(flags1 & FLAGS_SHARED)) + goto retry_private; - if (requeue_pi) { - struct task_struct *exiting = NULL; + goto retry; + } + if (curval != *cmpval) { + ret = -EAGAIN; + goto out_unlock; + } + } - /* - * Attempt to acquire uaddr2 and wake the top waiter. If we - * intend to requeue waiters, force setting the FUTEX_WAITERS - * bit. 
We force this here where we are able to easily handle - * faults rather in the requeue loop below. - * - * Updates topwaiter::requeue_state if a top waiter exists. - */ - ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1, - &key2, &pi_state, - &exiting, nr_requeue); + if (requeue_pi) { + struct task_struct *exiting = NULL; - /* - * At this point the top_waiter has either taken uaddr2 or - * is waiting on it. In both cases pi_state has been - * established and an initial refcount on it. In case of an - * error there's nothing. - * - * The top waiter's requeue_state is up to date: - * - * - If the lock was acquired atomically (ret == 1), then - * the state is Q_REQUEUE_PI_LOCKED. - * - * The top waiter has been dequeued and woken up and can - * return to user space immediately. The kernel/user - * space state is consistent. In case that there must be - * more waiters requeued the WAITERS bit in the user - * space futex is set so the top waiter task has to go - * into the syscall slowpath to unlock the futex. This - * will block until this requeue operation has been - * completed and the hash bucket locks have been - * dropped. - * - * - If the trylock failed with an error (ret < 0) then - * the state is either Q_REQUEUE_PI_NONE, i.e. "nothing - * happened", or Q_REQUEUE_PI_IGNORE when there was an - * interleaved early wakeup. - * - * - If the trylock did not succeed (ret == 0) then the - * state is either Q_REQUEUE_PI_IN_PROGRESS or - * Q_REQUEUE_PI_WAIT if an early wakeup interleaved. - * This will be cleaned up in the loop below, which - * cannot fail because futex_proxy_trylock_atomic() did - * the same sanity checks for requeue_pi as the loop - * below does. - */ - switch (ret) { - case 0: - /* We hold a reference on the pi state. */ - break; - - case 1: /* - * futex_proxy_trylock_atomic() acquired the user space - * futex. Adjust task_count. + * Attempt to acquire uaddr2 and wake the top waiter. 
If we + * intend to requeue waiters, force setting the FUTEX_WAITERS + * bit. We force this here where we are able to easily handle + * faults rather in the requeue loop below. + * + * Updates topwaiter::requeue_state if a top waiter exists. */ - task_count++; - ret = 0; - break; + ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1, + &key2, &pi_state, + &exiting, nr_requeue); - /* - * If the above failed, then pi_state is NULL and - * waiter::requeue_state is correct. - */ - case -EFAULT: - double_unlock_hb(hb1, hb2); - futex_hb_waiters_dec(hb2); - ret = fault_in_user_writeable(uaddr2); - if (!ret) - goto retry; - return ret; - case -EBUSY: - case -EAGAIN: - /* - * Two reasons for this: - * - EBUSY: Owner is exiting and we just wait for the - * exit to complete. - * - EAGAIN: The user space value changed. - */ - double_unlock_hb(hb1, hb2); - futex_hb_waiters_dec(hb2); /* - * Handle the case where the owner is in the middle of - * exiting. Wait for the exit to complete otherwise - * this task might loop forever, aka. live lock. + * At this point the top_waiter has either taken uaddr2 or + * is waiting on it. In both cases pi_state has been + * established and an initial refcount on it. In case of an + * error there's nothing. + * + * The top waiter's requeue_state is up to date: + * + * - If the lock was acquired atomically (ret == 1), then + * the state is Q_REQUEUE_PI_LOCKED. + * + * The top waiter has been dequeued and woken up and can + * return to user space immediately. The kernel/user + * space state is consistent. In case that there must be + * more waiters requeued the WAITERS bit in the user + * space futex is set so the top waiter task has to go + * into the syscall slowpath to unlock the futex. This + * will block until this requeue operation has been + * completed and the hash bucket locks have been + * dropped. + * + * - If the trylock failed with an error (ret < 0) then + * the state is either Q_REQUEUE_PI_NONE, i.e. 
"nothing + * happened", or Q_REQUEUE_PI_IGNORE when there was an + * interleaved early wakeup. + * + * - If the trylock did not succeed (ret == 0) then the + * state is either Q_REQUEUE_PI_IN_PROGRESS or + * Q_REQUEUE_PI_WAIT if an early wakeup interleaved. + * This will be cleaned up in the loop below, which + * cannot fail because futex_proxy_trylock_atomic() did + * the same sanity checks for requeue_pi as the loop + * below does. */ - wait_for_owner_exiting(ret, exiting); - cond_resched(); - goto retry; - default: - goto out_unlock; - } - } - - plist_for_each_entry_safe(this, next, &hb1->chain, list) { - if (task_count - nr_wake >= nr_requeue) - break; - - if (!futex_match(&this->key, &key1)) - continue; - - /* - * FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI should always - * be paired with each other and no other futex ops. - * - * We should never be requeueing a futex_q with a pi_state, - * which is awaiting a futex_unlock_pi(). - */ - if ((requeue_pi && !this->rt_waiter) || - (!requeue_pi && this->rt_waiter) || - this->pi_state) { - ret = -EINVAL; - break; + switch (ret) { + case 0: + /* We hold a reference on the pi state. */ + break; + + case 1: + /* + * futex_proxy_trylock_atomic() acquired the user space + * futex. Adjust task_count. + */ + task_count++; + ret = 0; + break; + + /* + * If the above failed, then pi_state is NULL and + * waiter::requeue_state is correct. + */ + case -EFAULT: + futex_hb_waiters_dec(hb2); + double_unlock_hb(hb1, hb2); + ret = fault_in_user_writeable(uaddr2); + if (!ret) + goto retry; + return ret; + case -EBUSY: + case -EAGAIN: + /* + * Two reasons for this: + * - EBUSY: Owner is exiting and we just wait for the + * exit to complete. + * - EAGAIN: The user space value changed. + */ + futex_hb_waiters_dec(hb2); + double_unlock_hb(hb1, hb2); + /* + * Handle the case where the owner is in the middle of + * exiting. Wait for the exit to complete otherwise + * this task might loop forever, aka. live lock. 
+ */ + wait_for_owner_exiting(ret, exiting); + cond_resched(); + goto retry; + default: + goto out_unlock; + } } - /* Plain futexes just wake or requeue and are done */ - if (!requeue_pi) { - if (++task_count <= nr_wake) - this->wake(&wake_q, this); - else - requeue_futex(this, hb1, hb2, &key2); - continue; - } + plist_for_each_entry_safe(this, next, &hb1->chain, list) { + if (task_count - nr_wake >= nr_requeue) + break; - /* Ensure we requeue to the expected futex for requeue_pi. */ - if (!futex_match(this->requeue_pi_key, &key2)) { - ret = -EINVAL; - break; - } + if (!futex_match(&this->key, &key1)) + continue; - /* - * Requeue nr_requeue waiters and possibly one more in the case - * of requeue_pi if we couldn't acquire the lock atomically. - * - * Prepare the waiter to take the rt_mutex. Take a refcount - * on the pi_state and store the pointer in the futex_q - * object of the waiter. - */ - get_pi_state(pi_state); - - /* Don't requeue when the waiter is already on the way out. */ - if (!futex_requeue_pi_prepare(this, pi_state)) { /* - * Early woken waiter signaled that it is on the - * way out. Drop the pi_state reference and try the - * next waiter. @this->pi_state is still NULL. + * FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI should always + * be paired with each other and no other futex ops. + * + * We should never be requeueing a futex_q with a pi_state, + * which is awaiting a futex_unlock_pi(). */ - put_pi_state(pi_state); - continue; - } + if ((requeue_pi && !this->rt_waiter) || + (!requeue_pi && this->rt_waiter) || + this->pi_state) { + ret = -EINVAL; + break; + } + + /* Plain futexes just wake or requeue and are done */ + if (!requeue_pi) { + if (++task_count <= nr_wake) + this->wake(&wake_q, this); + else + requeue_futex(this, hb1, hb2, &key2); + continue; + } + + /* Ensure we requeue to the expected futex for requeue_pi. 
*/ + if (!futex_match(this->requeue_pi_key, &key2)) { + ret = -EINVAL; + break; + } - ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex, - this->rt_waiter, - this->task); - - if (ret == 1) { - /* - * We got the lock. We do neither drop the refcount - * on pi_state nor clear this->pi_state because the - * waiter needs the pi_state for cleaning up the - * user space value. It will drop the refcount - * after doing so. this::requeue_state is updated - * in the wakeup as well. - */ - requeue_pi_wake_futex(this, &key2, hb2); - task_count++; - } else if (!ret) { - /* Waiter is queued, move it to hb2 */ - requeue_futex(this, hb1, hb2, &key2); - futex_requeue_pi_complete(this, 0); - task_count++; - } else { /* - * rt_mutex_start_proxy_lock() detected a potential - * deadlock when we tried to queue that waiter. - * Drop the pi_state reference which we took above - * and remove the pointer to the state from the - * waiters futex_q object. + * Requeue nr_requeue waiters and possibly one more in the case + * of requeue_pi if we couldn't acquire the lock atomically. + * + * Prepare the waiter to take the rt_mutex. Take a refcount + * on the pi_state and store the pointer in the futex_q + * object of the waiter. */ - this->pi_state = NULL; - put_pi_state(pi_state); - futex_requeue_pi_complete(this, ret); - /* - * We stop queueing more waiters and let user space - * deal with the mess. - */ - break; + get_pi_state(pi_state); + + /* Don't requeue when the waiter is already on the way out. */ + if (!futex_requeue_pi_prepare(this, pi_state)) { + /* + * Early woken waiter signaled that it is on the + * way out. Drop the pi_state reference and try the + * next waiter. @this->pi_state is still NULL. + */ + put_pi_state(pi_state); + continue; + } + + ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex, + this->rt_waiter, + this->task); + + if (ret == 1) { + /* + * We got the lock. 
We do neither drop the refcount + * on pi_state nor clear this->pi_state because the + * waiter needs the pi_state for cleaning up the + * user space value. It will drop the refcount + * after doing so. this::requeue_state is updated + * in the wakeup as well. + */ + requeue_pi_wake_futex(this, &key2, hb2); + task_count++; + } else if (!ret) { + /* Waiter is queued, move it to hb2 */ + requeue_futex(this, hb1, hb2, &key2); + futex_requeue_pi_complete(this, 0); + task_count++; + } else { + /* + * rt_mutex_start_proxy_lock() detected a potential + * deadlock when we tried to queue that waiter. + * Drop the pi_state reference which we took above + * and remove the pointer to the state from the + * waiters futex_q object. + */ + this->pi_state = NULL; + put_pi_state(pi_state); + futex_requeue_pi_complete(this, ret); + /* + * We stop queueing more waiters and let user space + * deal with the mess. + */ + break; + } } - } - /* - * We took an extra initial reference to the pi_state in - * futex_proxy_trylock_atomic(). We need to drop it here again. - */ - put_pi_state(pi_state); + /* + * We took an extra initial reference to the pi_state in + * futex_proxy_trylock_atomic(). We need to drop it here again. + */ + put_pi_state(pi_state); out_unlock: - double_unlock_hb(hb1, hb2); + futex_hb_waiters_dec(hb2); + double_unlock_hb(hb1, hb2); + } wake_up_q(&wake_q); - futex_hb_waiters_dec(hb2); return ret ? ret : task_count; } @@ -769,7 +780,6 @@ int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, { struct hrtimer_sleeper timeout, *to; struct rt_mutex_waiter rt_waiter; - struct futex_hash_bucket *hb; union futex_key key2 = FUTEX_KEY_INIT; struct futex_q q = futex_q_init; struct rt_mutex_base *pi_mutex; @@ -805,35 +815,28 @@ int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, * Prepare to wait on uaddr. On success, it holds hb->lock and q * is initialized. 
*/ - ret = futex_wait_setup(uaddr, val, flags, &q, &hb); + ret = futex_wait_setup(uaddr, val, flags, &q, &key2, current); if (ret) goto out; - /* - * The check above which compares uaddrs is not sufficient for - * shared futexes. We need to compare the keys: - */ - if (futex_match(&q.key, &key2)) { - futex_q_unlock(hb); - ret = -EINVAL; - goto out; - } - /* Queue the futex_q, drop the hb lock, wait for wakeup. */ - futex_wait_queue(hb, &q, to); + futex_do_wait(&q, to); switch (futex_requeue_pi_wakeup_sync(&q)) { case Q_REQUEUE_PI_IGNORE: - /* The waiter is still on uaddr1 */ - spin_lock(&hb->lock); - ret = handle_early_requeue_pi_wakeup(hb, &q, to); - spin_unlock(&hb->lock); + { + CLASS(hb, hb)(&q.key); + /* The waiter is still on uaddr1 */ + spin_lock(&hb->lock); + ret = handle_early_requeue_pi_wakeup(hb, &q, to); + spin_unlock(&hb->lock); + } break; case Q_REQUEUE_PI_LOCKED: /* The requeue acquired the lock */ if (q.pi_state && (q.pi_state->owner != current)) { - spin_lock(q.lock_ptr); + futex_q_lockptr_lock(&q); ret = fixup_pi_owner(uaddr2, &q, true); /* * Drop the reference to the pi state which the @@ -860,7 +863,7 @@ int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter)) ret = 0; - spin_lock(q.lock_ptr); + futex_q_lockptr_lock(&q); debug_rt_mutex_free_waiter(&rt_waiter); /* * Fixup the pi_state owner and possibly acquire the lock if we @@ -892,6 +895,11 @@ int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, default: BUG(); } + if (q.drop_hb_ref) { + CLASS(hb, hb)(&q.key); + /* Additional reference from requeue_pi_wake_futex() */ + futex_hash_put(hb); + } out: if (to) { diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c index 25877d4f2f8f..e2bbe5509ec2 100644 --- a/kernel/futex/waitwake.c +++ b/kernel/futex/waitwake.c @@ -154,7 +154,6 @@ void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q) */ int futex_wake(u32 __user *uaddr, unsigned int flags, int 
nr_wake, u32 bitset) { - struct futex_hash_bucket *hb; struct futex_q *this, *next; union futex_key key = FUTEX_KEY_INIT; DEFINE_WAKE_Q(wake_q); @@ -170,7 +169,7 @@ int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) if ((flags & FLAGS_STRICT) && !nr_wake) return 0; - hb = futex_hash(&key); + CLASS(hb, hb)(&key); /* Make sure we really have tasks to wakeup */ if (!futex_hb_waiters_pending(hb)) @@ -253,7 +252,6 @@ int futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, int nr_wake, int nr_wake2, int op) { union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; - struct futex_hash_bucket *hb1, *hb2; struct futex_q *this, *next; int ret, op_ret; DEFINE_WAKE_Q(wake_q); @@ -266,67 +264,69 @@ retry: if (unlikely(ret != 0)) return ret; - hb1 = futex_hash(&key1); - hb2 = futex_hash(&key2); - retry_private: - double_lock_hb(hb1, hb2); - op_ret = futex_atomic_op_inuser(op, uaddr2); - if (unlikely(op_ret < 0)) { - double_unlock_hb(hb1, hb2); - - if (!IS_ENABLED(CONFIG_MMU) || - unlikely(op_ret != -EFAULT && op_ret != -EAGAIN)) { - /* - * we don't get EFAULT from MMU faults if we don't have - * an MMU, but we might get them from range checking - */ - ret = op_ret; - return ret; - } - - if (op_ret == -EFAULT) { - ret = fault_in_user_writeable(uaddr2); - if (ret) + if (1) { + CLASS(hb, hb1)(&key1); + CLASS(hb, hb2)(&key2); + + double_lock_hb(hb1, hb2); + op_ret = futex_atomic_op_inuser(op, uaddr2); + if (unlikely(op_ret < 0)) { + double_unlock_hb(hb1, hb2); + + if (!IS_ENABLED(CONFIG_MMU) || + unlikely(op_ret != -EFAULT && op_ret != -EAGAIN)) { + /* + * we don't get EFAULT from MMU faults if we don't have + * an MMU, but we might get them from range checking + */ + ret = op_ret; return ret; - } - - cond_resched(); - if (!(flags & FLAGS_SHARED)) - goto retry_private; - goto retry; - } + } - plist_for_each_entry_safe(this, next, &hb1->chain, list) { - if (futex_match (&this->key, &key1)) { - if (this->pi_state || 
this->rt_waiter) { - ret = -EINVAL; - goto out_unlock; + if (op_ret == -EFAULT) { + ret = fault_in_user_writeable(uaddr2); + if (ret) + return ret; } - this->wake(&wake_q, this); - if (++ret >= nr_wake) - break; + + cond_resched(); + if (!(flags & FLAGS_SHARED)) + goto retry_private; + goto retry; } - } - if (op_ret > 0) { - op_ret = 0; - plist_for_each_entry_safe(this, next, &hb2->chain, list) { - if (futex_match (&this->key, &key2)) { + plist_for_each_entry_safe(this, next, &hb1->chain, list) { + if (futex_match(&this->key, &key1)) { if (this->pi_state || this->rt_waiter) { ret = -EINVAL; goto out_unlock; } this->wake(&wake_q, this); - if (++op_ret >= nr_wake2) + if (++ret >= nr_wake) break; } } - ret += op_ret; - } + + if (op_ret > 0) { + op_ret = 0; + plist_for_each_entry_safe(this, next, &hb2->chain, list) { + if (futex_match(&this->key, &key2)) { + if (this->pi_state || this->rt_waiter) { + ret = -EINVAL; + goto out_unlock; + } + this->wake(&wake_q, this); + if (++op_ret >= nr_wake2) + break; + } + } + ret += op_ret; + } out_unlock: - double_unlock_hb(hb1, hb2); + double_unlock_hb(hb1, hb2); + } wake_up_q(&wake_q); return ret; } @@ -334,23 +334,12 @@ out_unlock: static long futex_wait_restart(struct restart_block *restart); /** - * futex_wait_queue() - futex_queue() and wait for wakeup, timeout, or signal - * @hb: the futex hash bucket, must be locked by the caller + * futex_do_wait() - wait for wakeup, timeout, or signal * @q: the futex_q to queue up on * @timeout: the prepared hrtimer_sleeper, or null for no timeout */ -void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q, - struct hrtimer_sleeper *timeout) +void futex_do_wait(struct futex_q *q, struct hrtimer_sleeper *timeout) { - /* - * The task state is guaranteed to be set before another task can - * wake it. 
set_current_state() is implemented using smp_store_mb() and - * futex_queue() calls spin_unlock() upon completion, both serializing - * access to the hash list and forcing another memory barrier. - */ - set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); - futex_queue(q, hb, current); - /* Arm the timer */ if (timeout) hrtimer_sleeper_start_expires(timeout, HRTIMER_MODE_ABS); @@ -412,12 +401,17 @@ int futex_unqueue_multiple(struct futex_vector *v, int count) */ int futex_wait_multiple_setup(struct futex_vector *vs, int count, int *woken) { - struct futex_hash_bucket *hb; bool retry = false; int ret, i; u32 uval; /* + * Make sure to have a reference on the private_hash such that we + * don't block on rehash after changing the task state below. + */ + guard(private_hash)(); + + /* * Enqueuing multiple futexes is tricky, because we need to enqueue * each futex on the list before dealing with the next one to avoid * deadlocking on the hash bucket. But, before enqueuing, we need to @@ -451,20 +445,24 @@ retry: struct futex_q *q = &vs[i].q; u32 val = vs[i].w.val; - hb = futex_q_lock(q); - ret = futex_get_value_locked(&uval, uaddr); + if (1) { + CLASS(hb, hb)(&q->key); - if (!ret && uval == val) { - /* - * The bucket lock can't be held while dealing with the - * next futex. Queue each futex at this moment so hb can - * be unlocked. - */ - futex_queue(q, hb, current); - continue; - } + futex_q_lock(q, hb); + ret = futex_get_value_locked(&uval, uaddr); + + if (!ret && uval == val) { + /* + * The bucket lock can't be held while dealing with the + * next futex. Queue each futex at this moment so hb can + * be unlocked. + */ + futex_queue(q, hb, current); + continue; + } - futex_q_unlock(hb); + futex_q_unlock(hb); + } __set_current_state(TASK_RUNNING); /* @@ -578,7 +576,8 @@ int futex_wait_multiple(struct futex_vector *vs, unsigned int count, * @val: the expected value * @flags: futex flags (FLAGS_SHARED, etc.) 
* @q: the associated futex_q - * @hb: storage for hash_bucket pointer to be returned to caller + * @key2: the second futex_key if used for requeue PI + * @task: Task queueing this futex * * Setup the futex_q and locate the hash_bucket. Get the futex value and * compare it with the expected value. Handle atomic faults internally. @@ -586,10 +585,12 @@ int futex_wait_multiple(struct futex_vector *vs, unsigned int count, * * Return: * - 0 - uaddr contains val and hb has been locked; - * - <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked + * - <0 - On error and the hb is unlocked. A possible reason: the uaddr can not + * be read, does not contain the expected value or is not properly aligned. */ int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, - struct futex_q *q, struct futex_hash_bucket **hb) + struct futex_q *q, union futex_key *key2, + struct task_struct *task) { u32 uval; int ret; @@ -618,26 +619,45 @@ retry: return ret; retry_private: - *hb = futex_q_lock(q); + if (1) { + CLASS(hb, hb)(&q->key); - ret = futex_get_value_locked(&uval, uaddr); + futex_q_lock(q, hb); - if (ret) { - futex_q_unlock(*hb); + ret = futex_get_value_locked(&uval, uaddr); - ret = get_user(uval, uaddr); - if (ret) - return ret; + if (ret) { + futex_q_unlock(hb); - if (!(flags & FLAGS_SHARED)) - goto retry_private; + ret = get_user(uval, uaddr); + if (ret) + return ret; - goto retry; - } + if (!(flags & FLAGS_SHARED)) + goto retry_private; + + goto retry; + } - if (uval != val) { - futex_q_unlock(*hb); - ret = -EWOULDBLOCK; + if (uval != val) { + futex_q_unlock(hb); + return -EWOULDBLOCK; + } + + if (key2 && futex_match(&q->key, key2)) { + futex_q_unlock(hb); + return -EINVAL; + } + + /* + * The task state is guaranteed to be set before another task can + * wake it. 
set_current_state() is implemented using smp_store_mb() and + * futex_queue() calls spin_unlock() upon completion, both serializing + * access to the hash list and forcing another memory barrier. + */ + if (task == current) + set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); + futex_queue(q, hb, task); } return ret; @@ -647,7 +667,6 @@ int __futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, struct hrtimer_sleeper *to, u32 bitset) { struct futex_q q = futex_q_init; - struct futex_hash_bucket *hb; int ret; if (!bitset) @@ -660,12 +679,12 @@ retry: * Prepare to wait on uaddr. On success, it holds hb->lock and q * is initialized. */ - ret = futex_wait_setup(uaddr, val, flags, &q, &hb); + ret = futex_wait_setup(uaddr, val, flags, &q, NULL, current); if (ret) return ret; /* futex_queue and wait for wakeup, timeout, or a signal. */ - futex_wait_queue(hb, &q, to); + futex_do_wait(&q, to); /* If we were woken (and unqueued), we succeeded, whatever. */ if (!futex_unqueue(&q)) diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh index 00529c81cc40..c9e5dc068e85 100755 --- a/kernel/gen_kheaders.sh +++ b/kernel/gen_kheaders.sh @@ -89,7 +89,6 @@ rm -f "${tmpdir}.contents.txt" # Create archive and try to normalize metadata for reproducibility. tar "${KBUILD_BUILD_TIMESTAMP:+--mtime=$KBUILD_BUILD_TIMESTAMP}" \ - --exclude=".__afs*" --exclude=".nfs*" \ --owner=0 --group=0 --sort=name --numeric-owner --mode=u=rw,go=r,a+X \ -I $XZ -cf $tarfile -C "${tmpdir}/" . 
> /dev/null diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 04efa7a6e69b..dc898ec93463 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -93,6 +93,43 @@ static struct notifier_block panic_block = { .notifier_call = hung_task_panic, }; + +#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER +static void debug_show_blocker(struct task_struct *task) +{ + struct task_struct *g, *t; + unsigned long owner; + struct mutex *lock; + + RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "No rcu lock held"); + + lock = READ_ONCE(task->blocker_mutex); + if (!lock) + return; + + owner = mutex_get_owner(lock); + if (unlikely(!owner)) { + pr_err("INFO: task %s:%d is blocked on a mutex, but the owner is not found.\n", + task->comm, task->pid); + return; + } + + /* Ensure the owner information is correct. */ + for_each_process_thread(g, t) { + if ((unsigned long)t == owner) { + pr_err("INFO: task %s:%d is blocked on a mutex likely owned by task %s:%d.\n", + task->comm, task->pid, t->comm, t->pid); + sched_show_task(t); + return; + } + } +} +#else +static inline void debug_show_blocker(struct task_struct *task) +{ +} +#endif + static void check_hung_task(struct task_struct *t, unsigned long timeout) { unsigned long switch_count = t->nvcsw + t->nivcsw; @@ -152,6 +189,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" " disables this message.\n"); sched_show_task(t); + debug_show_blocker(t); hung_task_show_lock = true; if (sysctl_hung_task_all_cpu_backtrace) diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 875f25ed6f71..3f02a0e45254 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -47,10 +47,6 @@ config GENERIC_IRQ_INJECTION config HARDIRQS_SW_RESEND bool -# Edge style eoi based handler (cell) -config IRQ_EDGE_EOI_HANDLER - bool - # Generic configurable interrupt chip implementation config GENERIC_IRQ_CHIP bool @@ -96,6 +92,7 @@ config GENERIC_MSI_IRQ bool select 
IRQ_DOMAIN_HIERARCHY +# irqchip drivers should select this if they call iommu_dma_prepare_msi() config IRQ_MSI_IOMMU bool diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index ae60cae24e9a..d0af8a8b3ae6 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -43,18 +43,16 @@ unsigned long probe_irq_on(void) * flush such a longstanding irq before considering it as spurious. */ for_each_irq_desc_reverse(i, desc) { - raw_spin_lock_irq(&desc->lock); + guard(raw_spinlock_irq)(&desc->lock); if (!desc->action && irq_settings_can_probe(desc)) { /* * Some chips need to know about probing in * progress: */ if (desc->irq_data.chip->irq_set_type) - desc->irq_data.chip->irq_set_type(&desc->irq_data, - IRQ_TYPE_PROBE); + desc->irq_data.chip->irq_set_type(&desc->irq_data, IRQ_TYPE_PROBE); irq_activate_and_startup(desc, IRQ_NORESEND); } - raw_spin_unlock_irq(&desc->lock); } /* Wait for longstanding interrupts to trigger. */ @@ -66,13 +64,12 @@ unsigned long probe_irq_on(void) * happened in the previous stage, it may have masked itself) */ for_each_irq_desc_reverse(i, desc) { - raw_spin_lock_irq(&desc->lock); + guard(raw_spinlock_irq)(&desc->lock); if (!desc->action && irq_settings_can_probe(desc)) { desc->istate |= IRQS_AUTODETECT | IRQS_WAITING; if (irq_activate_and_startup(desc, IRQ_NORESEND)) desc->istate |= IRQS_PENDING; } - raw_spin_unlock_irq(&desc->lock); } /* @@ -84,18 +81,16 @@ unsigned long probe_irq_on(void) * Now filter out any obviously spurious interrupts */ for_each_irq_desc(i, desc) { - raw_spin_lock_irq(&desc->lock); - + guard(raw_spinlock_irq)(&desc->lock); if (desc->istate & IRQS_AUTODETECT) { /* It triggered already - consider it spurious. 
*/ if (!(desc->istate & IRQS_WAITING)) { desc->istate &= ~IRQS_AUTODETECT; irq_shutdown_and_deactivate(desc); - } else - if (i < 32) - mask |= 1 << i; + } else if (i < 32) { + mask |= 1 << i; + } } - raw_spin_unlock_irq(&desc->lock); } return mask; @@ -121,7 +116,7 @@ unsigned int probe_irq_mask(unsigned long val) int i; for_each_irq_desc(i, desc) { - raw_spin_lock_irq(&desc->lock); + guard(raw_spinlock_irq)(&desc->lock); if (desc->istate & IRQS_AUTODETECT) { if (i < 16 && !(desc->istate & IRQS_WAITING)) mask |= 1 << i; @@ -129,7 +124,6 @@ unsigned int probe_irq_mask(unsigned long val) desc->istate &= ~IRQS_AUTODETECT; irq_shutdown_and_deactivate(desc); } - raw_spin_unlock_irq(&desc->lock); } mutex_unlock(&probing_active); @@ -160,8 +154,7 @@ int probe_irq_off(unsigned long val) struct irq_desc *desc; for_each_irq_desc(i, desc) { - raw_spin_lock_irq(&desc->lock); - + guard(raw_spinlock_irq)(&desc->lock); if (desc->istate & IRQS_AUTODETECT) { if (!(desc->istate & IRQS_WAITING)) { if (!nr_of_irqs) @@ -171,7 +164,6 @@ int probe_irq_off(unsigned long val) desc->istate &= ~IRQS_AUTODETECT; irq_shutdown_and_deactivate(desc); } - raw_spin_unlock_irq(&desc->lock); } mutex_unlock(&probing_active); diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index c901436ebd9f..b0e0a7332993 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -34,98 +34,80 @@ struct irqaction chained_action = { }; /** - * irq_set_chip - set the irq chip for an irq - * @irq: irq number - * @chip: pointer to irq chip description structure + * irq_set_chip - set the irq chip for an irq + * @irq: irq number + * @chip: pointer to irq chip description structure */ int irq_set_chip(unsigned int irq, const struct irq_chip *chip) { - unsigned long flags; - struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); + int ret = -EINVAL; - if (!desc) - return -EINVAL; - - desc->irq_data.chip = (struct irq_chip *)(chip ?: &no_irq_chip); - irq_put_desc_unlock(desc, flags); - /* - * For !CONFIG_SPARSE_IRQ make 
the irq show up in - * allocated_irqs. - */ - irq_mark_irq(irq); - return 0; + scoped_irqdesc_get_and_lock(irq, 0) { + scoped_irqdesc->irq_data.chip = (struct irq_chip *)(chip ?: &no_irq_chip); + ret = 0; + } + /* For !CONFIG_SPARSE_IRQ make the irq show up in allocated_irqs. */ + if (!ret) + irq_mark_irq(irq); + return ret; } EXPORT_SYMBOL(irq_set_chip); /** - * irq_set_irq_type - set the irq trigger type for an irq - * @irq: irq number - * @type: IRQ_TYPE_{LEVEL,EDGE}_* value - see include/linux/irq.h + * irq_set_irq_type - set the irq trigger type for an irq + * @irq: irq number + * @type: IRQ_TYPE_{LEVEL,EDGE}_* value - see include/linux/irq.h */ int irq_set_irq_type(unsigned int irq, unsigned int type) { - unsigned long flags; - struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); - int ret = 0; - - if (!desc) - return -EINVAL; - - ret = __irq_set_trigger(desc, type); - irq_put_desc_busunlock(desc, flags); - return ret; + scoped_irqdesc_get_and_buslock(irq, IRQ_GET_DESC_CHECK_GLOBAL) + return __irq_set_trigger(scoped_irqdesc, type); + return -EINVAL; } EXPORT_SYMBOL(irq_set_irq_type); /** - * irq_set_handler_data - set irq handler data for an irq - * @irq: Interrupt number - * @data: Pointer to interrupt specific data + * irq_set_handler_data - set irq handler data for an irq + * @irq: Interrupt number + * @data: Pointer to interrupt specific data * - * Set the hardware irq controller data for an irq + * Set the hardware irq controller data for an irq */ int irq_set_handler_data(unsigned int irq, void *data) { - unsigned long flags; - struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); - - if (!desc) - return -EINVAL; - desc->irq_common_data.handler_data = data; - irq_put_desc_unlock(desc, flags); - return 0; + scoped_irqdesc_get_and_lock(irq, 0) { + scoped_irqdesc->irq_common_data.handler_data = data; + return 0; + } + return -EINVAL; } EXPORT_SYMBOL(irq_set_handler_data); /** - * irq_set_msi_desc_off - set MSI 
descriptor data for an irq at offset - * @irq_base: Interrupt number base - * @irq_offset: Interrupt number offset - * @entry: Pointer to MSI descriptor data + * irq_set_msi_desc_off - set MSI descriptor data for an irq at offset + * @irq_base: Interrupt number base + * @irq_offset: Interrupt number offset + * @entry: Pointer to MSI descriptor data * - * Set the MSI descriptor entry for an irq at offset + * Set the MSI descriptor entry for an irq at offset */ -int irq_set_msi_desc_off(unsigned int irq_base, unsigned int irq_offset, - struct msi_desc *entry) -{ - unsigned long flags; - struct irq_desc *desc = irq_get_desc_lock(irq_base + irq_offset, &flags, IRQ_GET_DESC_CHECK_GLOBAL); - - if (!desc) - return -EINVAL; - desc->irq_common_data.msi_desc = entry; - if (entry && !irq_offset) - entry->irq = irq_base; - irq_put_desc_unlock(desc, flags); - return 0; +int irq_set_msi_desc_off(unsigned int irq_base, unsigned int irq_offset, struct msi_desc *entry) +{ + scoped_irqdesc_get_and_lock(irq_base + irq_offset, IRQ_GET_DESC_CHECK_GLOBAL) { + scoped_irqdesc->irq_common_data.msi_desc = entry; + if (entry && !irq_offset) + entry->irq = irq_base; + return 0; + } + return -EINVAL; } /** - * irq_set_msi_desc - set MSI descriptor data for an irq - * @irq: Interrupt number - * @entry: Pointer to MSI descriptor data + * irq_set_msi_desc - set MSI descriptor data for an irq + * @irq: Interrupt number + * @entry: Pointer to MSI descriptor data * - * Set the MSI descriptor entry for an irq + * Set the MSI descriptor entry for an irq */ int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry) { @@ -133,22 +115,19 @@ int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry) } /** - * irq_set_chip_data - set irq chip data for an irq - * @irq: Interrupt number - * @data: Pointer to chip specific data + * irq_set_chip_data - set irq chip data for an irq + * @irq: Interrupt number + * @data: Pointer to chip specific data * - * Set the hardware irq chip data for an irq + * Set 
the hardware irq chip data for an irq */ int irq_set_chip_data(unsigned int irq, void *data) { - unsigned long flags; - struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); - - if (!desc) - return -EINVAL; - desc->irq_data.chip_data = data; - irq_put_desc_unlock(desc, flags); - return 0; + scoped_irqdesc_get_and_lock(irq, 0) { + scoped_irqdesc->irq_data.chip_data = data; + return 0; + } + return -EINVAL; } EXPORT_SYMBOL(irq_set_chip_data); @@ -223,6 +202,19 @@ __irq_startup_managed(struct irq_desc *desc, const struct cpumask *aff, return IRQ_STARTUP_ABORT; return IRQ_STARTUP_MANAGED; } + +void irq_startup_managed(struct irq_desc *desc) +{ + /* + * Only start it up when the disable depth is 1, so that a disable, + * hotunplug, hotplug sequence does not end up enabling it during + * hotplug unconditionally. + */ + desc->depth--; + if (!desc->depth) + irq_startup(desc, IRQ_RESEND, IRQ_START_COND); +} + #else static __always_inline int __irq_startup_managed(struct irq_desc *desc, const struct cpumask *aff, @@ -232,6 +224,21 @@ __irq_startup_managed(struct irq_desc *desc, const struct cpumask *aff, } #endif +static void irq_enable(struct irq_desc *desc) +{ + if (!irqd_irq_disabled(&desc->irq_data)) { + unmask_irq(desc); + } else { + irq_state_clr_disabled(desc); + if (desc->irq_data.chip->irq_enable) { + desc->irq_data.chip->irq_enable(&desc->irq_data); + irq_state_clr_masked(desc); + } else { + unmask_irq(desc); + } + } +} + static int __irq_startup(struct irq_desc *desc) { struct irq_data *d = irq_desc_get_irq_data(desc); @@ -275,6 +282,7 @@ int irq_startup(struct irq_desc *desc, bool resend, bool force) ret = __irq_startup(desc); break; case IRQ_STARTUP_ABORT: + desc->depth = 1; irqd_set_managed_shutdown(d); return 0; } @@ -307,7 +315,13 @@ void irq_shutdown(struct irq_desc *desc) { if (irqd_is_started(&desc->irq_data)) { clear_irq_resend(desc); - desc->depth = 1; + /* + * Increment disable depth, so that a managed shutdown on + * CPU hotunplug preserves the 
actual disabled state when the + * CPU comes back online. See irq_startup_managed(). + */ + desc->depth++; + if (desc->irq_data.chip->irq_shutdown) { desc->irq_data.chip->irq_shutdown(&desc->irq_data); irq_state_set_disabled(desc); @@ -332,21 +346,6 @@ void irq_shutdown_and_deactivate(struct irq_desc *desc) irq_domain_deactivate_irq(&desc->irq_data); } -void irq_enable(struct irq_desc *desc) -{ - if (!irqd_irq_disabled(&desc->irq_data)) { - unmask_irq(desc); - } else { - irq_state_clr_disabled(desc); - if (desc->irq_data.chip->irq_enable) { - desc->irq_data.chip->irq_enable(&desc->irq_data); - irq_state_clr_masked(desc); - } else { - unmask_irq(desc); - } - } -} - static void __irq_disable(struct irq_desc *desc, bool mask) { if (irqd_irq_disabled(&desc->irq_data)) { @@ -450,48 +449,6 @@ void unmask_threaded_irq(struct irq_desc *desc) unmask_irq(desc); } -/* - * handle_nested_irq - Handle a nested irq from a irq thread - * @irq: the interrupt number - * - * Handle interrupts which are nested into a threaded interrupt - * handler. The handler function is called inside the calling - * threads context. 
- */ -void handle_nested_irq(unsigned int irq) -{ - struct irq_desc *desc = irq_to_desc(irq); - struct irqaction *action; - irqreturn_t action_ret; - - might_sleep(); - - raw_spin_lock_irq(&desc->lock); - - desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); - - action = desc->action; - if (unlikely(!action || irqd_irq_disabled(&desc->irq_data))) { - desc->istate |= IRQS_PENDING; - raw_spin_unlock_irq(&desc->lock); - return; - } - - kstat_incr_irqs_this_cpu(desc); - atomic_inc(&desc->threads_active); - raw_spin_unlock_irq(&desc->lock); - - action_ret = IRQ_NONE; - for_each_action_of_desc(desc, action) - action_ret |= action->thread_fn(action->irq, action->dev_id); - - if (!irq_settings_no_debug(desc)) - note_interrupt(desc, action_ret); - - wake_threads_waitq(desc); -} -EXPORT_SYMBOL_GPL(handle_nested_irq); - static bool irq_check_poll(struct irq_desc *desc) { if (!(desc->istate & IRQS_POLL_INPROGRESS)) @@ -499,7 +456,7 @@ static bool irq_check_poll(struct irq_desc *desc) return irq_wait_for_poll(desc); } -static bool irq_may_run(struct irq_desc *desc) +static bool irq_can_handle_pm(struct irq_desc *desc) { unsigned int mask = IRQD_IRQ_INPROGRESS | IRQD_WAKEUP_ARMED; @@ -524,77 +481,111 @@ static bool irq_may_run(struct irq_desc *desc) return irq_check_poll(desc); } +static inline bool irq_can_handle_actions(struct irq_desc *desc) +{ + desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); + + if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { + desc->istate |= IRQS_PENDING; + return false; + } + return true; +} + +static inline bool irq_can_handle(struct irq_desc *desc) +{ + if (!irq_can_handle_pm(desc)) + return false; + + return irq_can_handle_actions(desc); +} + /** - * handle_simple_irq - Simple and software-decoded IRQs. 
- * @desc: the interrupt description structure for this irq + * handle_nested_irq - Handle a nested irq from a irq thread + * @irq: the interrupt number * - * Simple interrupts are either sent from a demultiplexing interrupt - * handler or come from hardware, where no interrupt hardware control - * is necessary. - * - * Note: The caller is expected to handle the ack, clear, mask and - * unmask issues if necessary. + * Handle interrupts which are nested into a threaded interrupt + * handler. The handler function is called inside the calling threads + * context. */ -void handle_simple_irq(struct irq_desc *desc) +void handle_nested_irq(unsigned int irq) { - raw_spin_lock(&desc->lock); + struct irq_desc *desc = irq_to_desc(irq); + struct irqaction *action; + irqreturn_t action_ret; - if (!irq_may_run(desc)) - goto out_unlock; + might_sleep(); - desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); + scoped_guard(raw_spinlock_irq, &desc->lock) { + if (!irq_can_handle_actions(desc)) + return; - if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { - desc->istate |= IRQS_PENDING; - goto out_unlock; + action = desc->action; + kstat_incr_irqs_this_cpu(desc); + atomic_inc(&desc->threads_active); } + action_ret = IRQ_NONE; + for_each_action_of_desc(desc, action) + action_ret |= action->thread_fn(action->irq, action->dev_id); + + if (!irq_settings_no_debug(desc)) + note_interrupt(desc, action_ret); + + wake_threads_waitq(desc); +} +EXPORT_SYMBOL_GPL(handle_nested_irq); + +/** + * handle_simple_irq - Simple and software-decoded IRQs. + * @desc: the interrupt description structure for this irq + * + * Simple interrupts are either sent from a demultiplexing interrupt + * handler or come from hardware, where no interrupt hardware control is + * necessary. + * + * Note: The caller is expected to handle the ack, clear, mask and unmask + * issues if necessary. 
+ */ +void handle_simple_irq(struct irq_desc *desc) +{ + guard(raw_spinlock)(&desc->lock); + + if (!irq_can_handle(desc)) + return; + kstat_incr_irqs_this_cpu(desc); handle_irq_event(desc); - -out_unlock: - raw_spin_unlock(&desc->lock); } EXPORT_SYMBOL_GPL(handle_simple_irq); /** - * handle_untracked_irq - Simple and software-decoded IRQs. - * @desc: the interrupt description structure for this irq + * handle_untracked_irq - Simple and software-decoded IRQs. + * @desc: the interrupt description structure for this irq * - * Untracked interrupts are sent from a demultiplexing interrupt - * handler when the demultiplexer does not know which device it its - * multiplexed irq domain generated the interrupt. IRQ's handled - * through here are not subjected to stats tracking, randomness, or - * spurious interrupt detection. + * Untracked interrupts are sent from a demultiplexing interrupt handler + * when the demultiplexer does not know which device it its multiplexed irq + * domain generated the interrupt. IRQ's handled through here are not + * subjected to stats tracking, randomness, or spurious interrupt + * detection. * - * Note: Like handle_simple_irq, the caller is expected to handle - * the ack, clear, mask and unmask issues if necessary. + * Note: Like handle_simple_irq, the caller is expected to handle the ack, + * clear, mask and unmask issues if necessary. 
*/ void handle_untracked_irq(struct irq_desc *desc) { - raw_spin_lock(&desc->lock); - - if (!irq_may_run(desc)) - goto out_unlock; - - desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); + scoped_guard(raw_spinlock, &desc->lock) { + if (!irq_can_handle(desc)) + return; - if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { - desc->istate |= IRQS_PENDING; - goto out_unlock; + desc->istate &= ~IRQS_PENDING; + irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); } - desc->istate &= ~IRQS_PENDING; - irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); - raw_spin_unlock(&desc->lock); - __handle_irq_event_percpu(desc); - raw_spin_lock(&desc->lock); - irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); - -out_unlock: - raw_spin_unlock(&desc->lock); + scoped_guard(raw_spinlock, &desc->lock) + irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); } EXPORT_SYMBOL_GPL(handle_untracked_irq); @@ -617,40 +608,26 @@ static void cond_unmask_irq(struct irq_desc *desc) } /** - * handle_level_irq - Level type irq handler - * @desc: the interrupt description structure for this irq + * handle_level_irq - Level type irq handler + * @desc: the interrupt description structure for this irq * - * Level type interrupts are active as long as the hardware line has - * the active level. This may require to mask the interrupt and unmask - * it after the associated handler has acknowledged the device, so the - * interrupt line is back to inactive. + * Level type interrupts are active as long as the hardware line has the + * active level. This may require to mask the interrupt and unmask it after + * the associated handler has acknowledged the device, so the interrupt + * line is back to inactive. 
*/ void handle_level_irq(struct irq_desc *desc) { - raw_spin_lock(&desc->lock); + guard(raw_spinlock)(&desc->lock); mask_ack_irq(desc); - if (!irq_may_run(desc)) - goto out_unlock; - - desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); - - /* - * If its disabled or no action available - * keep it masked and get out of here - */ - if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { - desc->istate |= IRQS_PENDING; - goto out_unlock; - } + if (!irq_can_handle(desc)) + return; kstat_incr_irqs_this_cpu(desc); handle_irq_event(desc); cond_unmask_irq(desc); - -out_unlock: - raw_spin_unlock(&desc->lock); } EXPORT_SYMBOL_GPL(handle_level_irq); @@ -675,42 +652,43 @@ static void cond_unmask_eoi_irq(struct irq_desc *desc, struct irq_chip *chip) } } +static inline void cond_eoi_irq(struct irq_chip *chip, struct irq_data *data) +{ + if (!(chip->flags & IRQCHIP_EOI_IF_HANDLED)) + chip->irq_eoi(data); +} + /** - * handle_fasteoi_irq - irq handler for transparent controllers - * @desc: the interrupt description structure for this irq + * handle_fasteoi_irq - irq handler for transparent controllers + * @desc: the interrupt description structure for this irq * - * Only a single callback will be issued to the chip: an ->eoi() - * call when the interrupt has been serviced. This enables support - * for modern forms of interrupt handlers, which handle the flow - * details in hardware, transparently. + * Only a single callback will be issued to the chip: an ->eoi() call when + * the interrupt has been serviced. This enables support for modern forms + * of interrupt handlers, which handle the flow details in hardware, + * transparently. 
*/ void handle_fasteoi_irq(struct irq_desc *desc) { struct irq_chip *chip = desc->irq_data.chip; - raw_spin_lock(&desc->lock); + guard(raw_spinlock)(&desc->lock); /* * When an affinity change races with IRQ handling, the next interrupt * can arrive on the new CPU before the original CPU has completed * handling the previous one - it may need to be resent. */ - if (!irq_may_run(desc)) { + if (!irq_can_handle_pm(desc)) { if (irqd_needs_resend_when_in_progress(&desc->irq_data)) desc->istate |= IRQS_PENDING; - goto out; + cond_eoi_irq(chip, &desc->irq_data); + return; } - desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); - - /* - * If its disabled or no action available - * then mask it and get out of here: - */ - if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { - desc->istate |= IRQS_PENDING; + if (!irq_can_handle_actions(desc)) { mask_irq(desc); - goto out; + cond_eoi_irq(chip, &desc->irq_data); + return; } kstat_incr_irqs_this_cpu(desc); @@ -726,13 +704,6 @@ void handle_fasteoi_irq(struct irq_desc *desc) */ if (unlikely(desc->istate & IRQS_PENDING)) check_irq_resend(desc, false); - - raw_spin_unlock(&desc->lock); - return; -out: - if (!(chip->flags & IRQCHIP_EOI_IF_HANDLED)) - chip->irq_eoi(&desc->irq_data); - raw_spin_unlock(&desc->lock); } EXPORT_SYMBOL_GPL(handle_fasteoi_irq); @@ -770,40 +741,27 @@ void handle_fasteoi_nmi(struct irq_desc *desc) EXPORT_SYMBOL_GPL(handle_fasteoi_nmi); /** - * handle_edge_irq - edge type IRQ handler - * @desc: the interrupt description structure for this irq + * handle_edge_irq - edge type IRQ handler + * @desc: the interrupt description structure for this irq * - * Interrupt occurs on the falling and/or rising edge of a hardware - * signal. The occurrence is latched into the irq controller hardware - * and must be acked in order to be reenabled. After the ack another - * interrupt can happen on the same source even before the first one - * is handled by the associated event handler. 
If this happens it - * might be necessary to disable (mask) the interrupt depending on the - * controller hardware. This requires to reenable the interrupt inside - * of the loop which handles the interrupts which have arrived while - * the handler was running. If all pending interrupts are handled, the - * loop is left. + * Interrupt occurs on the falling and/or rising edge of a hardware + * signal. The occurrence is latched into the irq controller hardware and + * must be acked in order to be reenabled. After the ack another interrupt + * can happen on the same source even before the first one is handled by + * the associated event handler. If this happens it might be necessary to + * disable (mask) the interrupt depending on the controller hardware. This + * requires to reenable the interrupt inside of the loop which handles the + * interrupts which have arrived while the handler was running. If all + * pending interrupts are handled, the loop is left. */ void handle_edge_irq(struct irq_desc *desc) { - raw_spin_lock(&desc->lock); + guard(raw_spinlock)(&desc->lock); - desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); - - if (!irq_may_run(desc)) { + if (!irq_can_handle(desc)) { desc->istate |= IRQS_PENDING; mask_ack_irq(desc); - goto out_unlock; - } - - /* - * If its disabled or no action available then mask it and get - * out of here. 
- */ - if (irqd_irq_disabled(&desc->irq_data) || !desc->action) { - desc->istate |= IRQS_PENDING; - mask_ack_irq(desc); - goto out_unlock; + return; } kstat_incr_irqs_this_cpu(desc); @@ -814,7 +772,7 @@ void handle_edge_irq(struct irq_desc *desc) do { if (unlikely(!desc->action)) { mask_irq(desc); - goto out_unlock; + return; } /* @@ -830,61 +788,10 @@ void handle_edge_irq(struct irq_desc *desc) handle_irq_event(desc); - } while ((desc->istate & IRQS_PENDING) && - !irqd_irq_disabled(&desc->irq_data)); - -out_unlock: - raw_spin_unlock(&desc->lock); + } while ((desc->istate & IRQS_PENDING) && !irqd_irq_disabled(&desc->irq_data)); } EXPORT_SYMBOL(handle_edge_irq); -#ifdef CONFIG_IRQ_EDGE_EOI_HANDLER -/** - * handle_edge_eoi_irq - edge eoi type IRQ handler - * @desc: the interrupt description structure for this irq - * - * Similar as the above handle_edge_irq, but using eoi and w/o the - * mask/unmask logic. - */ -void handle_edge_eoi_irq(struct irq_desc *desc) -{ - struct irq_chip *chip = irq_desc_get_chip(desc); - - raw_spin_lock(&desc->lock); - - desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); - - if (!irq_may_run(desc)) { - desc->istate |= IRQS_PENDING; - goto out_eoi; - } - - /* - * If its disabled or no action available then mask it and get - * out of here. 
- */ - if (irqd_irq_disabled(&desc->irq_data) || !desc->action) { - desc->istate |= IRQS_PENDING; - goto out_eoi; - } - - kstat_incr_irqs_this_cpu(desc); - - do { - if (unlikely(!desc->action)) - goto out_eoi; - - handle_irq_event(desc); - - } while ((desc->istate & IRQS_PENDING) && - !irqd_irq_disabled(&desc->irq_data)); - -out_eoi: - chip->irq_eoi(&desc->irq_data); - raw_spin_unlock(&desc->lock); -} -#endif - /** * handle_percpu_irq - Per CPU local irq handler * @desc: the interrupt description structure for this irq @@ -1054,35 +961,23 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle, } } -void -__irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, - const char *name) +void __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, + const char *name) { - unsigned long flags; - struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, 0); - - if (!desc) - return; - - __irq_do_set_handler(desc, handle, is_chained, name); - irq_put_desc_busunlock(desc, flags); + scoped_irqdesc_get_and_lock(irq, 0) + __irq_do_set_handler(scoped_irqdesc, handle, is_chained, name); } EXPORT_SYMBOL_GPL(__irq_set_handler); -void -irq_set_chained_handler_and_data(unsigned int irq, irq_flow_handler_t handle, - void *data) +void irq_set_chained_handler_and_data(unsigned int irq, irq_flow_handler_t handle, + void *data) { - unsigned long flags; - struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, 0); - - if (!desc) - return; - - desc->irq_common_data.handler_data = data; - __irq_do_set_handler(desc, handle, 1, NULL); + scoped_irqdesc_get_and_buslock(irq, 0) { + struct irq_desc *desc = scoped_irqdesc; - irq_put_desc_busunlock(desc, flags); + desc->irq_common_data.handler_data = data; + __irq_do_set_handler(desc, handle, 1, NULL); + } } EXPORT_SYMBOL_GPL(irq_set_chained_handler_and_data); @@ -1097,38 +992,34 @@ EXPORT_SYMBOL_GPL(irq_set_chip_and_handler_name); void irq_modify_status(unsigned int irq, unsigned long 
clr, unsigned long set) { - unsigned long flags, trigger, tmp; - struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); - - if (!desc) - return; - - /* - * Warn when a driver sets the no autoenable flag on an already - * active interrupt. - */ - WARN_ON_ONCE(!desc->depth && (set & _IRQ_NOAUTOEN)); - - irq_settings_clr_and_set(desc, clr, set); + scoped_irqdesc_get_and_lock(irq, 0) { + struct irq_desc *desc = scoped_irqdesc; + unsigned long trigger, tmp; + /* + * Warn when a driver sets the no autoenable flag on an already + * active interrupt. + */ + WARN_ON_ONCE(!desc->depth && (set & _IRQ_NOAUTOEN)); - trigger = irqd_get_trigger_type(&desc->irq_data); + irq_settings_clr_and_set(desc, clr, set); - irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU | - IRQD_TRIGGER_MASK | IRQD_LEVEL); - if (irq_settings_has_no_balance_set(desc)) - irqd_set(&desc->irq_data, IRQD_NO_BALANCING); - if (irq_settings_is_per_cpu(desc)) - irqd_set(&desc->irq_data, IRQD_PER_CPU); - if (irq_settings_is_level(desc)) - irqd_set(&desc->irq_data, IRQD_LEVEL); + trigger = irqd_get_trigger_type(&desc->irq_data); - tmp = irq_settings_get_trigger_mask(desc); - if (tmp != IRQ_TYPE_NONE) - trigger = tmp; + irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU | + IRQD_TRIGGER_MASK | IRQD_LEVEL); + if (irq_settings_has_no_balance_set(desc)) + irqd_set(&desc->irq_data, IRQD_NO_BALANCING); + if (irq_settings_is_per_cpu(desc)) + irqd_set(&desc->irq_data, IRQD_PER_CPU); + if (irq_settings_is_level(desc)) + irqd_set(&desc->irq_data, IRQD_LEVEL); - irqd_set(&desc->irq_data, trigger); + tmp = irq_settings_get_trigger_mask(desc); + if (tmp != IRQ_TYPE_NONE) + trigger = tmp; - irq_put_desc_unlock(desc, flags); + irqd_set(&desc->irq_data, trigger); + } } EXPORT_SYMBOL_GPL(irq_modify_status); @@ -1141,25 +1032,21 @@ EXPORT_SYMBOL_GPL(irq_modify_status); */ void irq_cpu_online(void) { - struct irq_desc *desc; - struct irq_chip *chip; - unsigned long flags; unsigned int irq; 
for_each_active_irq(irq) { - desc = irq_to_desc(irq); + struct irq_desc *desc = irq_to_desc(irq); + struct irq_chip *chip; + if (!desc) continue; - raw_spin_lock_irqsave(&desc->lock, flags); - + guard(raw_spinlock_irqsave)(&desc->lock); chip = irq_data_get_irq_chip(&desc->irq_data); if (chip && chip->irq_cpu_online && (!(chip->flags & IRQCHIP_ONOFFLINE_ENABLED) || !irqd_irq_disabled(&desc->irq_data))) chip->irq_cpu_online(&desc->irq_data); - - raw_spin_unlock_irqrestore(&desc->lock, flags); } } @@ -1171,25 +1058,21 @@ void irq_cpu_online(void) */ void irq_cpu_offline(void) { - struct irq_desc *desc; - struct irq_chip *chip; - unsigned long flags; unsigned int irq; for_each_active_irq(irq) { - desc = irq_to_desc(irq); + struct irq_desc *desc = irq_to_desc(irq); + struct irq_chip *chip; + if (!desc) continue; - raw_spin_lock_irqsave(&desc->lock, flags); - + guard(raw_spinlock_irqsave)(&desc->lock); chip = irq_data_get_irq_chip(&desc->irq_data); if (chip && chip->irq_cpu_offline && (!(chip->flags & IRQCHIP_ONOFFLINE_ENABLED) || !irqd_irq_disabled(&desc->irq_data))) chip->irq_cpu_offline(&desc->irq_data); - - raw_spin_unlock_irqrestore(&desc->lock, flags); } } #endif @@ -1198,102 +1081,69 @@ void irq_cpu_offline(void) #ifdef CONFIG_IRQ_FASTEOI_HIERARCHY_HANDLERS /** - * handle_fasteoi_ack_irq - irq handler for edge hierarchy - * stacked on transparent controllers + * handle_fasteoi_ack_irq - irq handler for edge hierarchy stacked on + * transparent controllers * - * @desc: the interrupt description structure for this irq + * @desc: the interrupt description structure for this irq * - * Like handle_fasteoi_irq(), but for use with hierarchy where - * the irq_chip also needs to have its ->irq_ack() function - * called. + * Like handle_fasteoi_irq(), but for use with hierarchy where the irq_chip + * also needs to have its ->irq_ack() function called. 
*/ void handle_fasteoi_ack_irq(struct irq_desc *desc) { struct irq_chip *chip = desc->irq_data.chip; - raw_spin_lock(&desc->lock); - - if (!irq_may_run(desc)) - goto out; + guard(raw_spinlock)(&desc->lock); - desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); + if (!irq_can_handle_pm(desc)) { + cond_eoi_irq(chip, &desc->irq_data); + return; + } - /* - * If its disabled or no action available - * then mask it and get out of here: - */ - if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { - desc->istate |= IRQS_PENDING; + if (unlikely(!irq_can_handle_actions(desc))) { mask_irq(desc); - goto out; + cond_eoi_irq(chip, &desc->irq_data); + return; } kstat_incr_irqs_this_cpu(desc); if (desc->istate & IRQS_ONESHOT) mask_irq(desc); - /* Start handling the irq */ desc->irq_data.chip->irq_ack(&desc->irq_data); handle_irq_event(desc); cond_unmask_eoi_irq(desc, chip); - - raw_spin_unlock(&desc->lock); - return; -out: - if (!(chip->flags & IRQCHIP_EOI_IF_HANDLED)) - chip->irq_eoi(&desc->irq_data); - raw_spin_unlock(&desc->lock); } EXPORT_SYMBOL_GPL(handle_fasteoi_ack_irq); /** - * handle_fasteoi_mask_irq - irq handler for level hierarchy - * stacked on transparent controllers + * handle_fasteoi_mask_irq - irq handler for level hierarchy stacked on + * transparent controllers * - * @desc: the interrupt description structure for this irq + * @desc: the interrupt description structure for this irq * - * Like handle_fasteoi_irq(), but for use with hierarchy where - * the irq_chip also needs to have its ->irq_mask_ack() function - * called. + * Like handle_fasteoi_irq(), but for use with hierarchy where the irq_chip + * also needs to have its ->irq_mask_ack() function called. 
*/ void handle_fasteoi_mask_irq(struct irq_desc *desc) { struct irq_chip *chip = desc->irq_data.chip; - raw_spin_lock(&desc->lock); + guard(raw_spinlock)(&desc->lock); mask_ack_irq(desc); - if (!irq_may_run(desc)) - goto out; - - desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); - - /* - * If its disabled or no action available - * then mask it and get out of here: - */ - if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { - desc->istate |= IRQS_PENDING; - mask_irq(desc); - goto out; + if (!irq_can_handle(desc)) { + cond_eoi_irq(chip, &desc->irq_data); + return; } kstat_incr_irqs_this_cpu(desc); - if (desc->istate & IRQS_ONESHOT) - mask_irq(desc); handle_irq_event(desc); cond_unmask_eoi_irq(desc, chip); - - raw_spin_unlock(&desc->lock); - return; -out: - if (!(chip->flags & IRQCHIP_EOI_IF_HANDLED)) - chip->irq_eoi(&desc->irq_data); - raw_spin_unlock(&desc->lock); } EXPORT_SYMBOL_GPL(handle_fasteoi_mask_irq); diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c index 15a7654eff68..f07529ae4895 100644 --- a/kernel/irq/cpuhotplug.c +++ b/kernel/irq/cpuhotplug.c @@ -177,9 +177,8 @@ void irq_migrate_all_off_this_cpu(void) bool affinity_broken; desc = irq_to_desc(irq); - raw_spin_lock(&desc->lock); - affinity_broken = migrate_one_irq(desc); - raw_spin_unlock(&desc->lock); + scoped_guard(raw_spinlock, &desc->lock) + affinity_broken = migrate_one_irq(desc); if (affinity_broken) { pr_debug_ratelimited("IRQ %u: no longer affine to CPU%u\n", @@ -219,7 +218,7 @@ static void irq_restore_affinity_of_irq(struct irq_desc *desc, unsigned int cpu) return; if (irqd_is_managed_and_shutdown(data)) - irq_startup(desc, IRQ_RESEND, IRQ_START_COND); + irq_startup_managed(desc); /* * If the interrupt can only be directed to a single target @@ -244,9 +243,8 @@ int irq_affinity_online_cpu(unsigned int cpu) irq_lock_sparse(); for_each_active_irq(irq) { desc = irq_to_desc(irq); - raw_spin_lock_irq(&desc->lock); - irq_restore_affinity_of_irq(desc, cpu); - 
raw_spin_unlock_irq(&desc->lock); + scoped_guard(raw_spinlock_irq, &desc->lock) + irq_restore_affinity_of_irq(desc, cpu); } irq_unlock_sparse(); diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c index ca142b9a4db3..3527defd2890 100644 --- a/kernel/irq/debugfs.c +++ b/kernel/irq/debugfs.c @@ -160,7 +160,7 @@ static int irq_debug_show(struct seq_file *m, void *p) struct irq_desc *desc = m->private; struct irq_data *data; - raw_spin_lock_irq(&desc->lock); + guard(raw_spinlock_irq)(&desc->lock); data = irq_desc_get_irq_data(desc); seq_printf(m, "handler: %ps\n", desc->handle_irq); seq_printf(m, "device: %s\n", desc->dev_name); @@ -178,7 +178,6 @@ static int irq_debug_show(struct seq_file *m, void *p) seq_printf(m, "node: %d\n", irq_data_get_node(data)); irq_debug_show_masks(m, desc); irq_debug_show_data(m, data, 0); - raw_spin_unlock_irq(&desc->lock); return 0; } @@ -226,12 +225,12 @@ void irq_debugfs_copy_devname(int irq, struct device *dev) void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *desc) { - char name [10]; + char name [12]; if (!irq_dir || !desc || desc->debugfs_file) return; - sprintf(name, "%d", irq); + sprintf(name, "%u", irq); desc->debugfs_file = debugfs_create_file(name, 0644, irq_dir, desc, &dfs_irq_ops); } diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index c4a8bca5f2b0..bf59e37d650a 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -40,10 +40,9 @@ void irq_gc_mask_disable_reg(struct irq_data *d) struct irq_chip_type *ct = irq_data_get_chip_type(d); u32 mask = d->mask; - irq_gc_lock(gc); + guard(raw_spinlock)(&gc->lock); irq_reg_writel(gc, mask, ct->regs.disable); *ct->mask_cache &= ~mask; - irq_gc_unlock(gc); } EXPORT_SYMBOL_GPL(irq_gc_mask_disable_reg); @@ -60,10 +59,9 @@ void irq_gc_mask_set_bit(struct irq_data *d) struct irq_chip_type *ct = irq_data_get_chip_type(d); u32 mask = d->mask; - irq_gc_lock(gc); + guard(raw_spinlock)(&gc->lock); *ct->mask_cache |= mask; irq_reg_writel(gc, 
*ct->mask_cache, ct->regs.mask); - irq_gc_unlock(gc); } EXPORT_SYMBOL_GPL(irq_gc_mask_set_bit); @@ -80,10 +78,9 @@ void irq_gc_mask_clr_bit(struct irq_data *d) struct irq_chip_type *ct = irq_data_get_chip_type(d); u32 mask = d->mask; - irq_gc_lock(gc); + guard(raw_spinlock)(&gc->lock); *ct->mask_cache &= ~mask; irq_reg_writel(gc, *ct->mask_cache, ct->regs.mask); - irq_gc_unlock(gc); } EXPORT_SYMBOL_GPL(irq_gc_mask_clr_bit); @@ -100,10 +97,9 @@ void irq_gc_unmask_enable_reg(struct irq_data *d) struct irq_chip_type *ct = irq_data_get_chip_type(d); u32 mask = d->mask; - irq_gc_lock(gc); + guard(raw_spinlock)(&gc->lock); irq_reg_writel(gc, mask, ct->regs.enable); *ct->mask_cache |= mask; - irq_gc_unlock(gc); } EXPORT_SYMBOL_GPL(irq_gc_unmask_enable_reg); @@ -117,9 +113,8 @@ void irq_gc_ack_set_bit(struct irq_data *d) struct irq_chip_type *ct = irq_data_get_chip_type(d); u32 mask = d->mask; - irq_gc_lock(gc); + guard(raw_spinlock)(&gc->lock); irq_reg_writel(gc, mask, ct->regs.ack); - irq_gc_unlock(gc); } EXPORT_SYMBOL_GPL(irq_gc_ack_set_bit); @@ -133,9 +128,8 @@ void irq_gc_ack_clr_bit(struct irq_data *d) struct irq_chip_type *ct = irq_data_get_chip_type(d); u32 mask = ~d->mask; - irq_gc_lock(gc); + guard(raw_spinlock)(&gc->lock); irq_reg_writel(gc, mask, ct->regs.ack); - irq_gc_unlock(gc); } /** @@ -156,11 +150,10 @@ void irq_gc_mask_disable_and_ack_set(struct irq_data *d) struct irq_chip_type *ct = irq_data_get_chip_type(d); u32 mask = d->mask; - irq_gc_lock(gc); + guard(raw_spinlock)(&gc->lock); irq_reg_writel(gc, mask, ct->regs.disable); *ct->mask_cache &= ~mask; irq_reg_writel(gc, mask, ct->regs.ack); - irq_gc_unlock(gc); } EXPORT_SYMBOL_GPL(irq_gc_mask_disable_and_ack_set); @@ -174,9 +167,8 @@ void irq_gc_eoi(struct irq_data *d) struct irq_chip_type *ct = irq_data_get_chip_type(d); u32 mask = d->mask; - irq_gc_lock(gc); + guard(raw_spinlock)(&gc->lock); irq_reg_writel(gc, mask, ct->regs.eoi); - irq_gc_unlock(gc); } /** @@ -196,12 +188,11 @@ int 
irq_gc_set_wake(struct irq_data *d, unsigned int on) if (!(mask & gc->wake_enabled)) return -EINVAL; - irq_gc_lock(gc); + guard(raw_spinlock)(&gc->lock); if (on) gc->wake_active |= mask; else gc->wake_active &= ~mask; - irq_gc_unlock(gc); return 0; } EXPORT_SYMBOL_GPL(irq_gc_set_wake); @@ -288,7 +279,6 @@ int irq_domain_alloc_generic_chips(struct irq_domain *d, { struct irq_domain_chip_generic *dgc; struct irq_chip_generic *gc; - unsigned long flags; int numchips, i; size_t dgc_sz; size_t gc_sz; @@ -340,9 +330,8 @@ int irq_domain_alloc_generic_chips(struct irq_domain *d, goto err; } - raw_spin_lock_irqsave(&gc_lock, flags); - list_add_tail(&gc->list, &gc_list); - raw_spin_unlock_irqrestore(&gc_lock, flags); + scoped_guard (raw_spinlock_irqsave, &gc_lock) + list_add_tail(&gc->list, &gc_list); /* Calc pointer to the next generic chip */ tmp += gc_sz; } @@ -459,7 +448,6 @@ int irq_map_generic_chip(struct irq_domain *d, unsigned int virq, struct irq_chip_generic *gc; struct irq_chip_type *ct; struct irq_chip *chip; - unsigned long flags; int idx; gc = __irq_get_domain_generic_chip(d, hw_irq); @@ -479,9 +467,8 @@ int irq_map_generic_chip(struct irq_domain *d, unsigned int virq, /* We only init the cache for the first mapping of a generic chip */ if (!gc->installed) { - raw_spin_lock_irqsave(&gc->lock, flags); + guard(raw_spinlock_irqsave)(&gc->lock); irq_gc_init_mask_cache(gc, dgc->gc_flags); - raw_spin_unlock_irqrestore(&gc->lock, flags); } /* Mark the interrupt as installed */ @@ -548,9 +535,8 @@ void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk, struct irq_chip *chip = &ct->chip; unsigned int i; - raw_spin_lock(&gc_lock); - list_add_tail(&gc->list, &gc_list); - raw_spin_unlock(&gc_lock); + scoped_guard (raw_spinlock, &gc_lock) + list_add_tail(&gc->list, &gc_list); irq_gc_init_mask_cache(gc, flags); @@ -616,9 +602,8 @@ void irq_remove_generic_chip(struct irq_chip_generic *gc, u32 msk, { unsigned int i, virq; - raw_spin_lock(&gc_lock); - 
list_del(&gc->list); - raw_spin_unlock(&gc_lock); + scoped_guard (raw_spinlock, &gc_lock) + list_del(&gc->list); for (i = 0; msk; msk >>= 1, i++) { if (!(msk & 0x01)) diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index a979523640d0..aebfe225c9a6 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -87,10 +87,10 @@ extern void __enable_irq(struct irq_desc *desc); extern int irq_activate(struct irq_desc *desc); extern int irq_activate_and_startup(struct irq_desc *desc, bool resend); extern int irq_startup(struct irq_desc *desc, bool resend, bool force); +extern void irq_startup_managed(struct irq_desc *desc); extern void irq_shutdown(struct irq_desc *desc); extern void irq_shutdown_and_deactivate(struct irq_desc *desc); -extern void irq_enable(struct irq_desc *desc); extern void irq_disable(struct irq_desc *desc); extern void irq_percpu_enable(struct irq_desc *desc, unsigned int cpu); extern void irq_percpu_disable(struct irq_desc *desc, unsigned int cpu); @@ -98,18 +98,12 @@ extern void mask_irq(struct irq_desc *desc); extern void unmask_irq(struct irq_desc *desc); extern void unmask_threaded_irq(struct irq_desc *desc); -extern unsigned int kstat_irqs_desc(struct irq_desc *desc, const struct cpumask *cpumask); - #ifdef CONFIG_SPARSE_IRQ static inline void irq_mark_irq(unsigned int irq) { } #else extern void irq_mark_irq(unsigned int irq); #endif -extern int __irq_get_irqchip_state(struct irq_data *data, - enum irqchip_irq_state which, - bool *state); - irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc); irqreturn_t handle_irq_event_percpu(struct irq_desc *desc); irqreturn_t handle_irq_event(struct irq_desc *desc); @@ -139,8 +133,6 @@ static inline void unregister_handler_proc(unsigned int irq, extern bool irq_can_set_affinity_usr(unsigned int irq); -extern void irq_set_thread_affinity(struct irq_desc *desc); - extern int irq_do_set_affinity(struct irq_data *data, const struct cpumask *dest, bool force); @@ -150,6 +142,10 @@ 
extern int irq_setup_affinity(struct irq_desc *desc); static inline int irq_setup_affinity(struct irq_desc *desc) { return 0; } #endif + +#define for_each_action_of_desc(desc, act) \ + for (act = desc->action; act; act = act->next) + /* Inline functions for support of irq chips on slow busses */ static inline void chip_bus_lock(struct irq_desc *desc) { @@ -169,38 +165,33 @@ static inline void chip_bus_sync_unlock(struct irq_desc *desc) #define IRQ_GET_DESC_CHECK_GLOBAL (_IRQ_DESC_CHECK) #define IRQ_GET_DESC_CHECK_PERCPU (_IRQ_DESC_CHECK | _IRQ_DESC_PERCPU) -#define for_each_action_of_desc(desc, act) \ - for (act = desc->action; act; act = act->next) - -struct irq_desc * -__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus, - unsigned int check); +struct irq_desc *__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus, + unsigned int check); void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus); -static inline struct irq_desc * -irq_get_desc_buslock(unsigned int irq, unsigned long *flags, unsigned int check) -{ - return __irq_get_desc_lock(irq, flags, true, check); -} +__DEFINE_CLASS_IS_CONDITIONAL(irqdesc_lock, true); +__DEFINE_UNLOCK_GUARD(irqdesc_lock, struct irq_desc, + __irq_put_desc_unlock(_T->lock, _T->flags, _T->bus), + unsigned long flags; bool bus); -static inline void -irq_put_desc_busunlock(struct irq_desc *desc, unsigned long flags) +static inline class_irqdesc_lock_t class_irqdesc_lock_constructor(unsigned int irq, bool bus, + unsigned int check) { - __irq_put_desc_unlock(desc, flags, true); -} + class_irqdesc_lock_t _t = { .bus = bus, }; -static inline struct irq_desc * -irq_get_desc_lock(unsigned int irq, unsigned long *flags, unsigned int check) -{ - return __irq_get_desc_lock(irq, flags, false, check); -} + _t.lock = __irq_get_desc_lock(irq, &_t.flags, bus, check); -static inline void -irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags) -{ - __irq_put_desc_unlock(desc, flags, 
false); + return _t; } +#define scoped_irqdesc_get_and_lock(_irq, _check) \ + scoped_guard(irqdesc_lock, _irq, false, _check) + +#define scoped_irqdesc_get_and_buslock(_irq, _check) \ + scoped_guard(irqdesc_lock, _irq, true, _check) + +#define scoped_irqdesc ((struct irq_desc *)(__guard_ptr(irqdesc_lock)(&scope))) + #define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors) static inline unsigned int irqd_get(struct irq_data *d) @@ -442,6 +433,7 @@ static inline struct cpumask *irq_desc_get_pending_mask(struct irq_desc *desc) return desc->pending_mask; } bool irq_fixup_move_pending(struct irq_desc *desc, bool force_clear); +void irq_force_complete_move(struct irq_desc *desc); #else /* CONFIG_GENERIC_PENDING_IRQ */ static inline bool irq_can_move_pcntxt(struct irq_data *data) { @@ -467,6 +459,7 @@ static inline bool irq_fixup_move_pending(struct irq_desc *desc, bool fclear) { return false; } +static inline void irq_force_complete_move(struct irq_desc *desc) { } #endif /* !CONFIG_GENERIC_PENDING_IRQ */ #if !defined(CONFIG_IRQ_DOMAIN) || !defined(CONFIG_IRQ_DOMAIN_HIERARCHY) diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 287830739783..b64c57b44c20 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -246,8 +246,7 @@ static struct kobject *irq_kobj_base; #define IRQ_ATTR_RO(_name) \ static struct kobj_attribute _name##_attr = __ATTR_RO(_name) -static ssize_t per_cpu_count_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) +static ssize_t per_cpu_count_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); ssize_t ret = 0; @@ -257,112 +256,83 @@ static ssize_t per_cpu_count_show(struct kobject *kobj, for_each_possible_cpu(cpu) { unsigned int c = irq_desc_kstat_cpu(desc, cpu); - ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%u", p, c); + ret += sysfs_emit_at(buf, ret, "%s%u", p, c); p = ","; } - ret += scnprintf(buf + ret, 
PAGE_SIZE - ret, "\n"); + ret += sysfs_emit_at(buf, ret, "\n"); return ret; } IRQ_ATTR_RO(per_cpu_count); -static ssize_t chip_name_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) +static ssize_t chip_name_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); - ssize_t ret = 0; - - raw_spin_lock_irq(&desc->lock); - if (desc->irq_data.chip && desc->irq_data.chip->name) { - ret = scnprintf(buf, PAGE_SIZE, "%s\n", - desc->irq_data.chip->name); - } - raw_spin_unlock_irq(&desc->lock); - return ret; + guard(raw_spinlock_irq)(&desc->lock); + if (desc->irq_data.chip && desc->irq_data.chip->name) + return sysfs_emit(buf, "%s\n", desc->irq_data.chip->name); + return 0; } IRQ_ATTR_RO(chip_name); -static ssize_t hwirq_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) +static ssize_t hwirq_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); - ssize_t ret = 0; - raw_spin_lock_irq(&desc->lock); + guard(raw_spinlock_irq)(&desc->lock); if (desc->irq_data.domain) - ret = sprintf(buf, "%lu\n", desc->irq_data.hwirq); - raw_spin_unlock_irq(&desc->lock); - - return ret; + return sysfs_emit(buf, "%lu\n", desc->irq_data.hwirq); + return 0; } IRQ_ATTR_RO(hwirq); -static ssize_t type_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) +static ssize_t type_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); - ssize_t ret = 0; - raw_spin_lock_irq(&desc->lock); - ret = sprintf(buf, "%s\n", - irqd_is_level_type(&desc->irq_data) ? "level" : "edge"); - raw_spin_unlock_irq(&desc->lock); - - return ret; + guard(raw_spinlock_irq)(&desc->lock); + return sysfs_emit(buf, "%s\n", irqd_is_level_type(&desc->irq_data) ? 
"level" : "edge"); } IRQ_ATTR_RO(type); -static ssize_t wakeup_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) +static ssize_t wakeup_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); - ssize_t ret = 0; - - raw_spin_lock_irq(&desc->lock); - ret = sprintf(buf, "%s\n", str_enabled_disabled(irqd_is_wakeup_set(&desc->irq_data))); - raw_spin_unlock_irq(&desc->lock); - - return ret; + guard(raw_spinlock_irq)(&desc->lock); + return sysfs_emit(buf, "%s\n", str_enabled_disabled(irqd_is_wakeup_set(&desc->irq_data))); } IRQ_ATTR_RO(wakeup); -static ssize_t name_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) +static ssize_t name_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); - ssize_t ret = 0; - raw_spin_lock_irq(&desc->lock); + guard(raw_spinlock_irq)(&desc->lock); if (desc->name) - ret = scnprintf(buf, PAGE_SIZE, "%s\n", desc->name); - raw_spin_unlock_irq(&desc->lock); - - return ret; + return sysfs_emit(buf, "%s\n", desc->name); + return 0; } IRQ_ATTR_RO(name); -static ssize_t actions_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) +static ssize_t actions_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); struct irqaction *action; ssize_t ret = 0; char *p = ""; - raw_spin_lock_irq(&desc->lock); - for_each_action_of_desc(desc, action) { - ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%s", - p, action->name); - p = ","; + scoped_guard(raw_spinlock_irq, &desc->lock) { + for_each_action_of_desc(desc, action) { + ret += sysfs_emit_at(buf, ret, "%s%s", p, action->name); + p = ","; + } } - raw_spin_unlock_irq(&desc->lock); if (ret) - ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n"); - + ret += sysfs_emit_at(buf, ret, "\n"); return ret; } IRQ_ATTR_RO(actions); 
@@ -418,19 +388,14 @@ static int __init irq_sysfs_init(void) int irq; /* Prevent concurrent irq alloc/free */ - irq_lock_sparse(); - + guard(mutex)(&sparse_irq_lock); irq_kobj_base = kobject_create_and_add("irq", kernel_kobj); - if (!irq_kobj_base) { - irq_unlock_sparse(); + if (!irq_kobj_base) return -ENOMEM; - } /* Add the already allocated interrupts */ for_each_irq_desc(irq, desc) irq_sysfs_add(irq, desc); - irq_unlock_sparse(); - return 0; } postcore_initcall(irq_sysfs_init); @@ -573,12 +538,12 @@ err: return -ENOMEM; } -static int irq_expand_nr_irqs(unsigned int nr) +static bool irq_expand_nr_irqs(unsigned int nr) { if (nr > MAX_SPARSE_IRQS) - return -ENOMEM; + return false; nr_irqs = nr; - return 0; + return true; } int __init early_irq_init(void) @@ -656,11 +621,9 @@ EXPORT_SYMBOL(irq_to_desc); static void free_desc(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); - unsigned long flags; - raw_spin_lock_irqsave(&desc->lock, flags); - desc_set_defaults(irq, desc, irq_desc_get_node(desc), NULL, NULL); - raw_spin_unlock_irqrestore(&desc->lock, flags); + scoped_guard(raw_spinlock_irqsave, &desc->lock) + desc_set_defaults(irq, desc, irq_desc_get_node(desc), NULL, NULL); delete_irq_desc(irq); } @@ -679,16 +642,15 @@ static inline int alloc_descs(unsigned int start, unsigned int cnt, int node, return start; } -static int irq_expand_nr_irqs(unsigned int nr) +static inline bool irq_expand_nr_irqs(unsigned int nr) { - return -ENOMEM; + return false; } void irq_mark_irq(unsigned int irq) { - mutex_lock(&sparse_irq_lock); + guard(mutex)(&sparse_irq_lock); irq_insert_desc(irq, irq_desc + irq); - mutex_unlock(&sparse_irq_lock); } #ifdef CONFIG_GENERIC_IRQ_LEGACY @@ -827,11 +789,9 @@ void irq_free_descs(unsigned int from, unsigned int cnt) if (from >= nr_irqs || (from + cnt) > nr_irqs) return; - mutex_lock(&sparse_irq_lock); + guard(mutex)(&sparse_irq_lock); for (i = 0; i < cnt; i++) free_desc(from + i); - - mutex_unlock(&sparse_irq_lock); } 
EXPORT_SYMBOL_GPL(irq_free_descs); @@ -848,11 +808,10 @@ EXPORT_SYMBOL_GPL(irq_free_descs); * * Returns the first irq number or error code */ -int __ref -__irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node, - struct module *owner, const struct irq_affinity_desc *affinity) +int __ref __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node, + struct module *owner, const struct irq_affinity_desc *affinity) { - int start, ret; + int start; if (!cnt) return -EINVAL; @@ -870,22 +829,17 @@ __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node, from = arch_dynirq_lower_bound(from); } - mutex_lock(&sparse_irq_lock); + guard(mutex)(&sparse_irq_lock); start = irq_find_free_area(from, cnt); - ret = -EEXIST; if (irq >=0 && start != irq) - goto unlock; + return -EEXIST; if (start + cnt > nr_irqs) { - ret = irq_expand_nr_irqs(start + cnt); - if (ret) - goto unlock; + if (!irq_expand_nr_irqs(start + cnt)) + return -ENOMEM; } - ret = alloc_descs(start, cnt, node, affinity, owner); -unlock: - mutex_unlock(&sparse_irq_lock); - return ret; + return alloc_descs(start, cnt, node, affinity, owner); } EXPORT_SYMBOL_GPL(__irq_alloc_descs); @@ -900,27 +854,27 @@ unsigned int irq_get_next_irq(unsigned int offset) return irq_find_at_or_after(offset); } -struct irq_desc * -__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus, - unsigned int check) +struct irq_desc *__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus, + unsigned int check) { - struct irq_desc *desc = irq_to_desc(irq); + struct irq_desc *desc; - if (desc) { - if (check & _IRQ_DESC_CHECK) { - if ((check & _IRQ_DESC_PERCPU) && - !irq_settings_is_per_cpu_devid(desc)) - return NULL; - - if (!(check & _IRQ_DESC_PERCPU) && - irq_settings_is_per_cpu_devid(desc)) - return NULL; - } + desc = irq_to_desc(irq); + if (!desc) + return NULL; + + if (check & _IRQ_DESC_CHECK) { + if ((check & _IRQ_DESC_PERCPU) && !irq_settings_is_per_cpu_devid(desc)) + 
return NULL; - if (bus) - chip_bus_lock(desc); - raw_spin_lock_irqsave(&desc->lock, *flags); + if (!(check & _IRQ_DESC_PERCPU) && irq_settings_is_per_cpu_devid(desc)) + return NULL; } + + if (bus) + chip_bus_lock(desc); + raw_spin_lock_irqsave(&desc->lock, *flags); + return desc; } @@ -991,7 +945,7 @@ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) return desc && desc->kstat_irqs ? per_cpu(desc->kstat_irqs->cnt, cpu) : 0; } -unsigned int kstat_irqs_desc(struct irq_desc *desc, const struct cpumask *cpumask) +static unsigned int kstat_irqs_desc(struct irq_desc *desc, const struct cpumask *cpumask) { unsigned int sum = 0; int cpu; diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index ec6d8e72d980..c8b6de09047b 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -398,7 +398,7 @@ void irq_domain_remove(struct irq_domain *domain) * If the going away domain is the default one, reset it. */ if (unlikely(irq_default_domain == domain)) - irq_set_default_host(NULL); + irq_set_default_domain(NULL); mutex_unlock(&irq_domain_mutex); @@ -480,33 +480,6 @@ struct irq_domain *irq_domain_create_simple(struct fwnode_handle *fwnode, } EXPORT_SYMBOL_GPL(irq_domain_create_simple); -/** - * irq_domain_add_legacy() - Allocate and register a legacy revmap irq_domain. - * @of_node: pointer to interrupt controller's device tree node. - * @size: total number of irqs in legacy mapping - * @first_irq: first number of irq block assigned to the domain - * @first_hwirq: first hwirq number to use for the translation. Should normally - * be '0', but a positive integer can be used if the effective - * hwirqs numbering does not begin at zero. - * @ops: map/unmap domain callbacks - * @host_data: Controller private data pointer - * - * Note: the map() callback will be called before this function returns - * for all legacy interrupts except 0 (which is always the invalid irq for - * a legacy controller). 
- */ -struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, - unsigned int size, - unsigned int first_irq, - irq_hw_number_t first_hwirq, - const struct irq_domain_ops *ops, - void *host_data) -{ - return irq_domain_create_legacy(of_node_to_fwnode(of_node), size, - first_irq, first_hwirq, ops, host_data); -} -EXPORT_SYMBOL_GPL(irq_domain_add_legacy); - struct irq_domain *irq_domain_create_legacy(struct fwnode_handle *fwnode, unsigned int size, unsigned int first_irq, @@ -573,7 +546,7 @@ struct irq_domain *irq_find_matching_fwspec(struct irq_fwspec *fwspec, EXPORT_SYMBOL_GPL(irq_find_matching_fwspec); /** - * irq_set_default_host() - Set a "default" irq domain + * irq_set_default_domain() - Set a "default" irq domain * @domain: default domain pointer * * For convenience, it's possible to set a "default" domain that will be used @@ -581,16 +554,16 @@ EXPORT_SYMBOL_GPL(irq_find_matching_fwspec); * platforms that want to manipulate a few hard coded interrupt numbers that * aren't properly represented in the device-tree. */ -void irq_set_default_host(struct irq_domain *domain) +void irq_set_default_domain(struct irq_domain *domain) { pr_debug("Default domain set to @0x%p\n", domain); irq_default_domain = domain; } -EXPORT_SYMBOL_GPL(irq_set_default_host); +EXPORT_SYMBOL_GPL(irq_set_default_domain); /** - * irq_get_default_host() - Retrieve the "default" irq domain + * irq_get_default_domain() - Retrieve the "default" irq domain * * Returns: the default domain, if any. * @@ -598,11 +571,11 @@ EXPORT_SYMBOL_GPL(irq_set_default_host); * systems that cannot implement a firmware->fwnode mapping (which * both DT and ACPI provide). 
*/ -struct irq_domain *irq_get_default_host(void) +struct irq_domain *irq_get_default_domain(void) { return irq_default_domain; } -EXPORT_SYMBOL_GPL(irq_get_default_host); +EXPORT_SYMBOL_GPL(irq_get_default_domain); static bool irq_domain_is_nomap(struct irq_domain *domain) { @@ -885,7 +858,7 @@ void of_phandle_args_to_fwspec(struct device_node *np, const u32 *args, { int i; - fwspec->fwnode = of_node_to_fwnode(np); + fwspec->fwnode = of_fwnode_handle(np); fwspec->param_count = count; for (i = 0; i < count; i++) @@ -1133,6 +1106,31 @@ int irq_domain_xlate_twocell(struct irq_domain *d, struct device_node *ctrlr, EXPORT_SYMBOL_GPL(irq_domain_xlate_twocell); /** + * irq_domain_xlate_twothreecell() - Generic xlate for direct two or three cell bindings + * @d: Interrupt domain involved in the translation + * @ctrlr: The device tree node for the device whose interrupt is translated + * @intspec: The interrupt specifier data from the device tree + * @intsize: The number of entries in @intspec + * @out_hwirq: Pointer to storage for the hardware interrupt number + * @out_type: Pointer to storage for the interrupt type + * + * Device Tree interrupt specifier translation function for two or three + * cell bindings, where the cell values map directly to the hardware + * interrupt number and the type specifier. 
+ */ +int irq_domain_xlate_twothreecell(struct irq_domain *d, struct device_node *ctrlr, + const u32 *intspec, unsigned int intsize, + irq_hw_number_t *out_hwirq, unsigned int *out_type) +{ + struct irq_fwspec fwspec; + + of_phandle_args_to_fwspec(ctrlr, intspec, intsize, &fwspec); + + return irq_domain_translate_twothreecell(d, &fwspec, out_hwirq, out_type); +} +EXPORT_SYMBOL_GPL(irq_domain_xlate_twothreecell); + +/** * irq_domain_xlate_onetwocell() - Generic xlate for one or two cell bindings * @d: Interrupt domain involved in the translation * @ctrlr: The device tree node for the device whose interrupt is translated @@ -1216,6 +1214,37 @@ int irq_domain_translate_twocell(struct irq_domain *d, } EXPORT_SYMBOL_GPL(irq_domain_translate_twocell); +/** + * irq_domain_translate_twothreecell() - Generic translate for direct two or three cell + * bindings + * @d: Interrupt domain involved in the translation + * @fwspec: The firmware interrupt specifier to translate + * @out_hwirq: Pointer to storage for the hardware interrupt number + * @out_type: Pointer to storage for the interrupt type + * + * Firmware interrupt specifier translation function for two or three cell + * specifications, where the parameter values map directly to the hardware + * interrupt number and the type specifier. 
+ */ +int irq_domain_translate_twothreecell(struct irq_domain *d, struct irq_fwspec *fwspec, + unsigned long *out_hwirq, unsigned int *out_type) +{ + if (fwspec->param_count == 2) { + *out_hwirq = fwspec->param[0]; + *out_type = fwspec->param[1] & IRQ_TYPE_SENSE_MASK; + return 0; + } + + if (fwspec->param_count == 3) { + *out_hwirq = fwspec->param[1]; + *out_type = fwspec->param[2] & IRQ_TYPE_SENSE_MASK; + return 0; + } + + return -EINVAL; +} +EXPORT_SYMBOL_GPL(irq_domain_translate_twothreecell); + int irq_domain_alloc_descs(int virq, unsigned int cnt, irq_hw_number_t hwirq, int node, const struct irq_affinity_desc *affinity) { @@ -1252,47 +1281,6 @@ void irq_domain_reset_irq_data(struct irq_data *irq_data) EXPORT_SYMBOL_GPL(irq_domain_reset_irq_data); #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY -/** - * irq_domain_create_hierarchy - Add a irqdomain into the hierarchy - * @parent: Parent irq domain to associate with the new domain - * @flags: Irq domain flags associated to the domain - * @size: Size of the domain. See below - * @fwnode: Optional fwnode of the interrupt controller - * @ops: Pointer to the interrupt domain callbacks - * @host_data: Controller private data pointer - * - * If @size is 0 a tree domain is created, otherwise a linear domain. - * - * If successful the parent is associated to the new domain and the - * domain flags are set. - * Returns pointer to IRQ domain, or NULL on failure. - */ -struct irq_domain *irq_domain_create_hierarchy(struct irq_domain *parent, - unsigned int flags, - unsigned int size, - struct fwnode_handle *fwnode, - const struct irq_domain_ops *ops, - void *host_data) -{ - struct irq_domain_info info = { - .fwnode = fwnode, - .size = size, - .hwirq_max = size, - .ops = ops, - .host_data = host_data, - .domain_flags = flags, - .parent = parent, - }; - struct irq_domain *d; - - if (!info.size) - info.hwirq_max = ~0U; - - d = irq_domain_instantiate(&info); - return IS_ERR(d) ? 
NULL : d; -} -EXPORT_SYMBOL_GPL(irq_domain_create_hierarchy); - static void irq_domain_insert_irq(int virq) { struct irq_data *data; @@ -1589,9 +1577,8 @@ static void irq_domain_free_irqs_hierarchy(struct irq_domain *domain, } } -int irq_domain_alloc_irqs_hierarchy(struct irq_domain *domain, - unsigned int irq_base, - unsigned int nr_irqs, void *arg) +static int irq_domain_alloc_irqs_hierarchy(struct irq_domain *domain, unsigned int irq_base, + unsigned int nr_irqs, void *arg) { if (!domain->ops->alloc) { pr_debug("domain->ops->alloc() is NULL\n"); @@ -2009,7 +1996,7 @@ static void irq_domain_check_hierarchy(struct irq_domain *domain) domain->flags |= IRQ_DOMAIN_FLAG_HIERARCHY; } #else /* CONFIG_IRQ_DOMAIN_HIERARCHY */ -/** +/* * irq_domain_get_irq_data - Get irq_data associated with @virq and @domain * @domain: domain to match * @virq: IRQ number to get irq_data @@ -2023,7 +2010,7 @@ struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain, } EXPORT_SYMBOL_GPL(irq_domain_get_irq_data); -/** +/* * irq_domain_set_info - Set the complete data for a @virq in @domain * @domain: Interrupt domain to match * @virq: IRQ number diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index f300bb6be3bd..c94837382037 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -35,14 +35,14 @@ static int __init setup_forced_irqthreads(char *arg) early_param("threadirqs", setup_forced_irqthreads); #endif +static int __irq_get_irqchip_state(struct irq_data *d, enum irqchip_irq_state which, bool *state); + static void __synchronize_hardirq(struct irq_desc *desc, bool sync_chip) { struct irq_data *irqd = irq_desc_get_irq_data(desc); bool inprogress; do { - unsigned long flags; - /* * Wait until we're out of the critical section. This might * give the wrong answer due to the lack of memory barriers. @@ -51,7 +51,7 @@ static void __synchronize_hardirq(struct irq_desc *desc, bool sync_chip) cpu_relax(); /* Ok, that indicated we're done: double-check carefully. 
*/ - raw_spin_lock_irqsave(&desc->lock, flags); + guard(raw_spinlock_irqsave)(&desc->lock); inprogress = irqd_irq_inprogress(&desc->irq_data); /* @@ -67,33 +67,30 @@ static void __synchronize_hardirq(struct irq_desc *desc, bool sync_chip) __irq_get_irqchip_state(irqd, IRQCHIP_STATE_ACTIVE, &inprogress); } - raw_spin_unlock_irqrestore(&desc->lock, flags); - /* Oops, that failed? */ } while (inprogress); } /** - * synchronize_hardirq - wait for pending hard IRQ handlers (on other CPUs) - * @irq: interrupt number to wait for + * synchronize_hardirq - wait for pending hard IRQ handlers (on other CPUs) + * @irq: interrupt number to wait for * - * This function waits for any pending hard IRQ handlers for this - * interrupt to complete before returning. If you use this - * function while holding a resource the IRQ handler may need you - * will deadlock. It does not take associated threaded handlers - * into account. + * This function waits for any pending hard IRQ handlers for this interrupt + * to complete before returning. If you use this function while holding a + * resource the IRQ handler may need you will deadlock. It does not take + * associated threaded handlers into account. * - * Do not use this for shutdown scenarios where you must be sure - * that all parts (hardirq and threaded handler) have completed. + * Do not use this for shutdown scenarios where you must be sure that all + * parts (hardirq and threaded handler) have completed. * - * Returns: false if a threaded handler is active. + * Returns: false if a threaded handler is active. * - * This function may be called - with care - from IRQ context. + * This function may be called - with care - from IRQ context. * - * It does not check whether there is an interrupt in flight at the - * hardware level, but not serviced yet, as this might deadlock when - * called with interrupts disabled and the target CPU of the interrupt - * is the current CPU. 
+ * It does not check whether there is an interrupt in flight at the + * hardware level, but not serviced yet, as this might deadlock when called + * with interrupts disabled and the target CPU of the interrupt is the + * current CPU. */ bool synchronize_hardirq(unsigned int irq) { @@ -119,19 +116,19 @@ static void __synchronize_irq(struct irq_desc *desc) } /** - * synchronize_irq - wait for pending IRQ handlers (on other CPUs) - * @irq: interrupt number to wait for + * synchronize_irq - wait for pending IRQ handlers (on other CPUs) + * @irq: interrupt number to wait for * - * This function waits for any pending IRQ handlers for this interrupt - * to complete before returning. If you use this function while - * holding a resource the IRQ handler may need you will deadlock. + * This function waits for any pending IRQ handlers for this interrupt to + * complete before returning. If you use this function while holding a + * resource the IRQ handler may need you will deadlock. * - * Can only be called from preemptible code as it might sleep when - * an interrupt thread is associated to @irq. + * Can only be called from preemptible code as it might sleep when + * an interrupt thread is associated to @irq. * - * It optionally makes sure (when the irq chip supports that method) - * that the interrupt is not pending in any CPU and waiting for - * service. + * It optionally makes sure (when the irq chip supports that method) + * that the interrupt is not pending in any CPU and waiting for + * service. 
*/ void synchronize_irq(unsigned int irq) { @@ -154,8 +151,8 @@ static bool __irq_can_set_affinity(struct irq_desc *desc) } /** - * irq_can_set_affinity - Check if the affinity of a given irq can be set - * @irq: Interrupt to check + * irq_can_set_affinity - Check if the affinity of a given irq can be set + * @irq: Interrupt to check * */ int irq_can_set_affinity(unsigned int irq) @@ -179,15 +176,15 @@ bool irq_can_set_affinity_usr(unsigned int irq) } /** - * irq_set_thread_affinity - Notify irq threads to adjust affinity - * @desc: irq descriptor which has affinity changed + * irq_set_thread_affinity - Notify irq threads to adjust affinity + * @desc: irq descriptor which has affinity changed * - * We just set IRQTF_AFFINITY and delegate the affinity setting - * to the interrupt thread itself. We can not call - * set_cpus_allowed_ptr() here as we hold desc->lock and this - * code can be called from hard interrupt context. + * Just set IRQTF_AFFINITY and delegate the affinity setting to the + * interrupt thread itself. We can not call set_cpus_allowed_ptr() here as + * we hold desc->lock and this code can be called from hard interrupt + * context. */ -void irq_set_thread_affinity(struct irq_desc *desc) +static void irq_set_thread_affinity(struct irq_desc *desc) { struct irqaction *action; @@ -398,14 +395,8 @@ int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask, * an interrupt which is already started or which has already been configured * as managed will also fail, as these mean invalid init state or double init. */ -int irq_update_affinity_desc(unsigned int irq, - struct irq_affinity_desc *affinity) +int irq_update_affinity_desc(unsigned int irq, struct irq_affinity_desc *affinity) { - struct irq_desc *desc; - unsigned long flags; - bool activated; - int ret = 0; - /* * Supporting this with the reservation scheme used by x86 needs * some more thought. Fail it for now. 
@@ -413,60 +404,50 @@ int irq_update_affinity_desc(unsigned int irq, if (IS_ENABLED(CONFIG_GENERIC_IRQ_RESERVATION_MODE)) return -EOPNOTSUPP; - desc = irq_get_desc_buslock(irq, &flags, 0); - if (!desc) - return -EINVAL; + scoped_irqdesc_get_and_buslock(irq, 0) { + struct irq_desc *desc = scoped_irqdesc; + bool activated; - /* Requires the interrupt to be shut down */ - if (irqd_is_started(&desc->irq_data)) { - ret = -EBUSY; - goto out_unlock; - } + /* Requires the interrupt to be shut down */ + if (irqd_is_started(&desc->irq_data)) + return -EBUSY; - /* Interrupts which are already managed cannot be modified */ - if (irqd_affinity_is_managed(&desc->irq_data)) { - ret = -EBUSY; - goto out_unlock; - } - - /* - * Deactivate the interrupt. That's required to undo - * anything an earlier activation has established. - */ - activated = irqd_is_activated(&desc->irq_data); - if (activated) - irq_domain_deactivate_irq(&desc->irq_data); - - if (affinity->is_managed) { - irqd_set(&desc->irq_data, IRQD_AFFINITY_MANAGED); - irqd_set(&desc->irq_data, IRQD_MANAGED_SHUTDOWN); - } + /* Interrupts which are already managed cannot be modified */ + if (irqd_affinity_is_managed(&desc->irq_data)) + return -EBUSY; + /* + * Deactivate the interrupt. That's required to undo + * anything an earlier activation has established. 
+ */ + activated = irqd_is_activated(&desc->irq_data); + if (activated) + irq_domain_deactivate_irq(&desc->irq_data); - cpumask_copy(desc->irq_common_data.affinity, &affinity->mask); + if (affinity->is_managed) { + irqd_set(&desc->irq_data, IRQD_AFFINITY_MANAGED); + irqd_set(&desc->irq_data, IRQD_MANAGED_SHUTDOWN); + } - /* Restore the activation state */ - if (activated) - irq_domain_activate_irq(&desc->irq_data, false); + cpumask_copy(desc->irq_common_data.affinity, &affinity->mask); -out_unlock: - irq_put_desc_busunlock(desc, flags); - return ret; + /* Restore the activation state */ + if (activated) + irq_domain_activate_irq(&desc->irq_data, false); + return 0; + } + return -EINVAL; } static int __irq_set_affinity(unsigned int irq, const struct cpumask *mask, bool force) { struct irq_desc *desc = irq_to_desc(irq); - unsigned long flags; - int ret; if (!desc) return -EINVAL; - raw_spin_lock_irqsave(&desc->lock, flags); - ret = irq_set_affinity_locked(irq_desc_get_irq_data(desc), mask, force); - raw_spin_unlock_irqrestore(&desc->lock, flags); - return ret; + guard(raw_spinlock_irqsave)(&desc->lock); + return irq_set_affinity_locked(irq_desc_get_irq_data(desc), mask, force); } /** @@ -499,39 +480,36 @@ int irq_force_affinity(unsigned int irq, const struct cpumask *cpumask) } EXPORT_SYMBOL_GPL(irq_force_affinity); -int __irq_apply_affinity_hint(unsigned int irq, const struct cpumask *m, - bool setaffinity) +int __irq_apply_affinity_hint(unsigned int irq, const struct cpumask *m, bool setaffinity) { - unsigned long flags; - struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); + int ret = -EINVAL; - if (!desc) - return -EINVAL; - desc->affinity_hint = m; - irq_put_desc_unlock(desc, flags); - if (m && setaffinity) + scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_GLOBAL) { + scoped_irqdesc->affinity_hint = m; + ret = 0; + } + + if (!ret && m && setaffinity) __irq_set_affinity(irq, m, false); - return 0; + return ret; } 
EXPORT_SYMBOL_GPL(__irq_apply_affinity_hint); static void irq_affinity_notify(struct work_struct *work) { - struct irq_affinity_notify *notify = - container_of(work, struct irq_affinity_notify, work); + struct irq_affinity_notify *notify = container_of(work, struct irq_affinity_notify, work); struct irq_desc *desc = irq_to_desc(notify->irq); cpumask_var_t cpumask; - unsigned long flags; if (!desc || !alloc_cpumask_var(&cpumask, GFP_KERNEL)) goto out; - raw_spin_lock_irqsave(&desc->lock, flags); - if (irq_move_pending(&desc->irq_data)) - irq_get_pending(cpumask, desc); - else - cpumask_copy(cpumask, desc->irq_common_data.affinity); - raw_spin_unlock_irqrestore(&desc->lock, flags); + scoped_guard(raw_spinlock_irqsave, &desc->lock) { + if (irq_move_pending(&desc->irq_data)) + irq_get_pending(cpumask, desc); + else + cpumask_copy(cpumask, desc->irq_common_data.affinity); + } notify->notify(notify, cpumask); @@ -541,22 +519,20 @@ out: } /** - * irq_set_affinity_notifier - control notification of IRQ affinity changes - * @irq: Interrupt for which to enable/disable notification - * @notify: Context for notification, or %NULL to disable - * notification. Function pointers must be initialised; - * the other fields will be initialised by this function. - * - * Must be called in process context. Notification may only be enabled - * after the IRQ is allocated and must be disabled before the IRQ is - * freed using free_irq(). + * irq_set_affinity_notifier - control notification of IRQ affinity changes + * @irq: Interrupt for which to enable/disable notification + * @notify: Context for notification, or %NULL to disable + * notification. Function pointers must be initialised; + * the other fields will be initialised by this function. + * + * Must be called in process context. Notification may only be enabled + * after the IRQ is allocated and must be disabled before the IRQ is freed + * using free_irq(). 
*/ -int -irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify) +int irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify) { struct irq_desc *desc = irq_to_desc(irq); struct irq_affinity_notify *old_notify; - unsigned long flags; /* The release function is promised process context */ might_sleep(); @@ -571,10 +547,10 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify) INIT_WORK(&notify->work, irq_affinity_notify); } - raw_spin_lock_irqsave(&desc->lock, flags); - old_notify = desc->affinity_notify; - desc->affinity_notify = notify; - raw_spin_unlock_irqrestore(&desc->lock, flags); + scoped_guard(raw_spinlock_irqsave, &desc->lock) { + old_notify = desc->affinity_notify; + desc->affinity_notify = notify; + } if (old_notify) { if (cancel_work_sync(&old_notify->work)) { @@ -595,7 +571,8 @@ EXPORT_SYMBOL_GPL(irq_set_affinity_notifier); int irq_setup_affinity(struct irq_desc *desc) { struct cpumask *set = irq_default_affinity; - int ret, node = irq_desc_get_node(desc); + int node = irq_desc_get_node(desc); + static DEFINE_RAW_SPINLOCK(mask_lock); static struct cpumask mask; @@ -603,7 +580,7 @@ int irq_setup_affinity(struct irq_desc *desc) if (!__irq_can_set_affinity(desc)) return 0; - raw_spin_lock(&mask_lock); + guard(raw_spinlock)(&mask_lock); /* * Preserve the managed affinity setting and a userspace affinity * setup, but make sure that one of the targets is online.
@@ -628,9 +605,7 @@ int irq_setup_affinity(struct irq_desc *desc) if (cpumask_intersects(&mask, nodemask)) cpumask_and(&mask, &mask, nodemask); } - ret = irq_do_set_affinity(&desc->irq_data, &mask, false); - raw_spin_unlock(&mask_lock); - return ret; + return irq_do_set_affinity(&desc->irq_data, &mask, false); } #else /* Wrapper for ALPHA specific affinity selector magic */ @@ -643,44 +618,36 @@ int irq_setup_affinity(struct irq_desc *desc) /** - * irq_set_vcpu_affinity - Set vcpu affinity for the interrupt - * @irq: interrupt number to set affinity - * @vcpu_info: vCPU specific data or pointer to a percpu array of vCPU - * specific data for percpu_devid interrupts - * - * This function uses the vCPU specific data to set the vCPU - * affinity for an irq. The vCPU specific data is passed from - * outside, such as KVM. One example code path is as below: - * KVM -> IOMMU -> irq_set_vcpu_affinity(). + * irq_set_vcpu_affinity - Set vcpu affinity for the interrupt + * @irq: interrupt number to set affinity + * @vcpu_info: vCPU specific data or pointer to a percpu array of vCPU + * specific data for percpu_devid interrupts + * + * This function uses the vCPU specific data to set the vCPU affinity for + * an irq. The vCPU specific data is passed from outside, such as KVM. One + * example code path is as below: KVM -> IOMMU -> irq_set_vcpu_affinity(). 
*/ int irq_set_vcpu_affinity(unsigned int irq, void *vcpu_info) { - unsigned long flags; - struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); - struct irq_data *data; - struct irq_chip *chip; - int ret = -ENOSYS; + scoped_irqdesc_get_and_lock(irq, 0) { + struct irq_desc *desc = scoped_irqdesc; + struct irq_data *data; + struct irq_chip *chip; - if (!desc) - return -EINVAL; - - data = irq_desc_get_irq_data(desc); - do { - chip = irq_data_get_irq_chip(data); - if (chip && chip->irq_set_vcpu_affinity) - break; -#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY - data = data->parent_data; -#else - data = NULL; -#endif - } while (data); + data = irq_desc_get_irq_data(desc); + do { + chip = irq_data_get_irq_chip(data); + if (chip && chip->irq_set_vcpu_affinity) + break; - if (data) - ret = chip->irq_set_vcpu_affinity(data, vcpu_info); - irq_put_desc_unlock(desc, flags); + data = irqd_get_parent_data(data); + } while (data); - return ret; + if (!data) + return -ENOSYS; + return chip->irq_set_vcpu_affinity(data, vcpu_info); + } + return -EINVAL; } EXPORT_SYMBOL_GPL(irq_set_vcpu_affinity); @@ -692,26 +659,23 @@ void __disable_irq(struct irq_desc *desc) static int __disable_irq_nosync(unsigned int irq) { - unsigned long flags; - struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); - - if (!desc) - return -EINVAL; - __disable_irq(desc); - irq_put_desc_busunlock(desc, flags); - return 0; + scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_GLOBAL) { + __disable_irq(scoped_irqdesc); + return 0; + } + return -EINVAL; } /** - * disable_irq_nosync - disable an irq without waiting - * @irq: Interrupt to disable + * disable_irq_nosync - disable an irq without waiting + * @irq: Interrupt to disable * - * Disable the selected interrupt line. Disables and Enables are - * nested. - * Unlike disable_irq(), this function does not ensure existing - * instances of the IRQ handler have completed before returning. + * Disable the selected interrupt line. 
Disables and Enables are + * nested. + * Unlike disable_irq(), this function does not ensure existing + * instances of the IRQ handler have completed before returning. * - * This function may be called from IRQ context. + * This function may be called from IRQ context. */ void disable_irq_nosync(unsigned int irq) { @@ -720,17 +684,17 @@ void disable_irq_nosync(unsigned int irq) EXPORT_SYMBOL(disable_irq_nosync); /** - * disable_irq - disable an irq and wait for completion - * @irq: Interrupt to disable + * disable_irq - disable an irq and wait for completion + * @irq: Interrupt to disable * - * Disable the selected interrupt line. Enables and Disables are - * nested. - * This function waits for any pending IRQ handlers for this interrupt - * to complete before returning. If you use this function while - * holding a resource the IRQ handler may need you will deadlock. + * Disable the selected interrupt line. Enables and Disables are nested. * - * Can only be called from preemptible code as it might sleep when - * an interrupt thread is associated to @irq. + * This function waits for any pending IRQ handlers for this interrupt to + * complete before returning. If you use this function while holding a + * resource the IRQ handler may need you will deadlock. + * + * Can only be called from preemptible code as it might sleep when an + * interrupt thread is associated to @irq. * */ void disable_irq(unsigned int irq) @@ -742,40 +706,39 @@ void disable_irq(unsigned int irq) EXPORT_SYMBOL(disable_irq); /** - * disable_hardirq - disables an irq and waits for hardirq completion - * @irq: Interrupt to disable + * disable_hardirq - disables an irq and waits for hardirq completion + * @irq: Interrupt to disable + * + * Disable the selected interrupt line. Enables and Disables are nested. * - * Disable the selected interrupt line. Enables and Disables are - * nested. - * This function waits for any pending hard IRQ handlers for this - * interrupt to complete before returning. 
If you use this function while - * holding a resource the hard IRQ handler may need you will deadlock. + * This function waits for any pending hard IRQ handlers for this interrupt + * to complete before returning. If you use this function while holding a + * resource the hard IRQ handler may need you will deadlock. * - * When used to optimistically disable an interrupt from atomic context - * the return value must be checked. + * When used to optimistically disable an interrupt from atomic context the + * return value must be checked. * - * Returns: false if a threaded handler is active. + * Returns: false if a threaded handler is active. * - * This function may be called - with care - from IRQ context. + * This function may be called - with care - from IRQ context. */ bool disable_hardirq(unsigned int irq) { if (!__disable_irq_nosync(irq)) return synchronize_hardirq(irq); - return false; } EXPORT_SYMBOL_GPL(disable_hardirq); /** - * disable_nmi_nosync - disable an nmi without waiting - * @irq: Interrupt to disable - * - * Disable the selected interrupt line. Disables and enables are - * nested. - * The interrupt to disable must have been requested through request_nmi. - * Unlike disable_nmi(), this function does not ensure existing - * instances of the IRQ handler have completed before returning. + * disable_nmi_nosync - disable an nmi without waiting + * @irq: Interrupt to disable + * + * Disable the selected interrupt line. Disables and enables are nested. + * + * The interrupt to disable must have been requested through request_nmi. + * Unlike disable_nmi(), this function does not ensure existing + * instances of the IRQ handler have completed before returning. 
*/ void disable_nmi_nosync(unsigned int irq) { @@ -815,41 +778,34 @@ void __enable_irq(struct irq_desc *desc) } /** - * enable_irq - enable handling of an irq - * @irq: Interrupt to enable + * enable_irq - enable handling of an irq + * @irq: Interrupt to enable * - * Undoes the effect of one call to disable_irq(). If this - * matches the last disable, processing of interrupts on this - * IRQ line is re-enabled. + * Undoes the effect of one call to disable_irq(). If this matches the + * last disable, processing of interrupts on this IRQ line is re-enabled. * - * This function may be called from IRQ context only when - * desc->irq_data.chip->bus_lock and desc->chip->bus_sync_unlock are NULL ! + * This function may be called from IRQ context only when + * desc->irq_data.chip->bus_lock and desc->chip->bus_sync_unlock are NULL ! */ void enable_irq(unsigned int irq) { - unsigned long flags; - struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); + scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_GLOBAL) { + struct irq_desc *desc = scoped_irqdesc; - if (!desc) - return; - if (WARN(!desc->irq_data.chip, - KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq)) - goto out; - - __enable_irq(desc); -out: - irq_put_desc_busunlock(desc, flags); + if (WARN(!desc->irq_data.chip, "enable_irq before setup/request_irq: irq %u\n", irq)) + return; + __enable_irq(desc); + } } EXPORT_SYMBOL(enable_irq); /** - * enable_nmi - enable handling of an nmi - * @irq: Interrupt to enable + * enable_nmi - enable handling of an nmi + * @irq: Interrupt to enable * - * The interrupt to enable must have been requested through request_nmi. - * Undoes the effect of one call to disable_nmi(). If this - * matches the last disable, processing of interrupts on this - * IRQ line is re-enabled. + * The interrupt to enable must have been requested through request_nmi. + * Undoes the effect of one call to disable_nmi(). 
If this matches the last + * disable, processing of interrupts on this IRQ line is re-enabled. */ void enable_nmi(unsigned int irq) { @@ -871,65 +827,59 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on) } /** - * irq_set_irq_wake - control irq power management wakeup - * @irq: interrupt to control - * @on: enable/disable power management wakeup - * - * Enable/disable power management wakeup mode, which is - * disabled by default. Enables and disables must match, - * just as they match for non-wakeup mode support. - * - * Wakeup mode lets this IRQ wake the system from sleep - * states like "suspend to RAM". - * - * Note: irq enable/disable state is completely orthogonal - * to the enable/disable state of irq wake. An irq can be - * disabled with disable_irq() and still wake the system as - * long as the irq has wake enabled. If this does not hold, - * then the underlying irq chip and the related driver need - * to be investigated. + * irq_set_irq_wake - control irq power management wakeup + * @irq: interrupt to control + * @on: enable/disable power management wakeup + * + * Enable/disable power management wakeup mode, which is disabled by + * default. Enables and disables must match, just as they match for + * non-wakeup mode support. + * + * Wakeup mode lets this IRQ wake the system from sleep states like + * "suspend to RAM". + * + * Note: irq enable/disable state is completely orthogonal to the + * enable/disable state of irq wake. An irq can be disabled with + * disable_irq() and still wake the system as long as the irq has wake + * enabled. If this does not hold, then the underlying irq chip and the + * related driver need to be investigated. 
*/ int irq_set_irq_wake(unsigned int irq, unsigned int on) { - unsigned long flags; - struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); - int ret = 0; + scoped_irqdesc_get_and_buslock(irq, IRQ_GET_DESC_CHECK_GLOBAL) { + struct irq_desc *desc = scoped_irqdesc; + int ret = 0; - if (!desc) - return -EINVAL; - - /* Don't use NMIs as wake up interrupts please */ - if (irq_is_nmi(desc)) { - ret = -EINVAL; - goto out_unlock; - } + /* Don't use NMIs as wake up interrupts please */ + if (irq_is_nmi(desc)) + return -EINVAL; - /* wakeup-capable irqs can be shared between drivers that - * don't need to have the same sleep mode behaviors. - */ - if (on) { - if (desc->wake_depth++ == 0) { - ret = set_irq_wake_real(irq, on); - if (ret) - desc->wake_depth = 0; - else - irqd_set(&desc->irq_data, IRQD_WAKEUP_STATE); - } - } else { - if (desc->wake_depth == 0) { - WARN(1, "Unbalanced IRQ %d wake disable\n", irq); - } else if (--desc->wake_depth == 0) { - ret = set_irq_wake_real(irq, on); - if (ret) - desc->wake_depth = 1; - else - irqd_clear(&desc->irq_data, IRQD_WAKEUP_STATE); + /* + * wakeup-capable irqs can be shared between drivers that + * don't need to have the same sleep mode behaviors. + */ + if (on) { + if (desc->wake_depth++ == 0) { + ret = set_irq_wake_real(irq, on); + if (ret) + desc->wake_depth = 0; + else + irqd_set(&desc->irq_data, IRQD_WAKEUP_STATE); + } + } else { + if (desc->wake_depth == 0) { + WARN(1, "Unbalanced IRQ %d wake disable\n", irq); + } else if (--desc->wake_depth == 0) { + ret = set_irq_wake_real(irq, on); + if (ret) + desc->wake_depth = 1; + else + irqd_clear(&desc->irq_data, IRQD_WAKEUP_STATE); + } } + return ret; } - -out_unlock: - irq_put_desc_busunlock(desc, flags); - return ret; + return -EINVAL; } EXPORT_SYMBOL(irq_set_irq_wake); @@ -938,22 +888,17 @@ EXPORT_SYMBOL(irq_set_irq_wake); * particular irq has been exclusively allocated or is available * for driver use. 
*/ -int can_request_irq(unsigned int irq, unsigned long irqflags) +bool can_request_irq(unsigned int irq, unsigned long irqflags) { - unsigned long flags; - struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); - int canrequest = 0; - - if (!desc) - return 0; + scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_GLOBAL) { + struct irq_desc *desc = scoped_irqdesc; - if (irq_settings_can_request(desc)) { - if (!desc->action || - irqflags & desc->action->flags & IRQF_SHARED) - canrequest = 1; + if (irq_settings_can_request(desc)) { + if (!desc->action || irqflags & desc->action->flags & IRQF_SHARED) + return true; + } } - irq_put_desc_unlock(desc, flags); - return canrequest; + return false; } int __irq_set_trigger(struct irq_desc *desc, unsigned long flags) @@ -1014,16 +959,11 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned long flags) #ifdef CONFIG_HARDIRQS_SW_RESEND int irq_set_parent(int irq, int parent_irq) { - unsigned long flags; - struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); - - if (!desc) - return -EINVAL; - - desc->parent_irq = parent_irq; - - irq_put_desc_unlock(desc, flags); - return 0; + scoped_irqdesc_get_and_lock(irq, 0) { + scoped_irqdesc->parent_irq = parent_irq; + return 0; + } + return -EINVAL; } EXPORT_SYMBOL_GPL(irq_set_parent); #endif @@ -1077,19 +1017,19 @@ static void irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *a return; } - raw_spin_lock_irq(&desc->lock); - /* - * This code is triggered unconditionally. Check the affinity - * mask pointer. For CPU_MASK_OFFSTACK=n this is optimized out. - */ - if (cpumask_available(desc->irq_common_data.affinity)) { - const struct cpumask *m; + scoped_guard(raw_spinlock_irq, &desc->lock) { + /* + * This code is triggered unconditionally. Check the affinity + * mask pointer. For CPU_MASK_OFFSTACK=n this is optimized out. 
+ */ + if (cpumask_available(desc->irq_common_data.affinity)) { + const struct cpumask *m; - m = irq_data_get_effective_affinity_mask(&desc->irq_data); - cpumask_copy(mask, m); - valid = true; + m = irq_data_get_effective_affinity_mask(&desc->irq_data); + cpumask_copy(mask, m); + valid = true; + } } - raw_spin_unlock_irq(&desc->lock); if (valid) set_cpus_allowed_ptr(current, mask); @@ -1257,9 +1197,8 @@ static void irq_wake_secondary(struct irq_desc *desc, struct irqaction *action) if (WARN_ON_ONCE(!secondary)) return; - raw_spin_lock_irq(&desc->lock); + guard(raw_spinlock_irq)(&desc->lock); __irq_wake_thread(desc, secondary); - raw_spin_unlock_irq(&desc->lock); } /* @@ -1332,21 +1271,19 @@ static int irq_thread(void *data) } /** - * irq_wake_thread - wake the irq thread for the action identified by dev_id - * @irq: Interrupt line - * @dev_id: Device identity for which the thread should be woken - * + * irq_wake_thread - wake the irq thread for the action identified by dev_id + * @irq: Interrupt line + * @dev_id: Device identity for which the thread should be woken */ void irq_wake_thread(unsigned int irq, void *dev_id) { struct irq_desc *desc = irq_to_desc(irq); struct irqaction *action; - unsigned long flags; if (!desc || WARN_ON(irq_settings_is_per_cpu_devid(desc))) return; - raw_spin_lock_irqsave(&desc->lock, flags); + guard(raw_spinlock_irqsave)(&desc->lock); for_each_action_of_desc(desc, action) { if (action->dev_id == dev_id) { if (action->thread) @@ -1354,7 +1291,6 @@ void irq_wake_thread(unsigned int irq, void *dev_id) break; } } - raw_spin_unlock_irqrestore(&desc->lock, flags); } EXPORT_SYMBOL_GPL(irq_wake_thread); @@ -1985,9 +1921,8 @@ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id) * There is no interrupt on the fly anymore. Deactivate it * completely. 
*/ - raw_spin_lock_irqsave(&desc->lock, flags); - irq_domain_deactivate_irq(&desc->irq_data); - raw_spin_unlock_irqrestore(&desc->lock, flags); + scoped_guard(raw_spinlock_irqsave, &desc->lock) + irq_domain_deactivate_irq(&desc->irq_data); irq_release_resources(desc); chip_bus_sync_unlock(desc); @@ -2003,20 +1938,19 @@ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id) } /** - * free_irq - free an interrupt allocated with request_irq - * @irq: Interrupt line to free - * @dev_id: Device identity to free + * free_irq - free an interrupt allocated with request_irq + * @irq: Interrupt line to free + * @dev_id: Device identity to free * - * Remove an interrupt handler. The handler is removed and if the - * interrupt line is no longer in use by any driver it is disabled. - * On a shared IRQ the caller must ensure the interrupt is disabled - * on the card it drives before calling this function. The function - * does not return until any executing interrupts for this IRQ - * have completed. + * Remove an interrupt handler. The handler is removed and if the interrupt + * line is no longer in use by any driver it is disabled. On a shared IRQ + * the caller must ensure the interrupt is disabled on the card it drives + * before calling this function. The function does not return until any + * executing interrupts for this IRQ have completed. * - * This function must not be called from interrupt context. + * This function must not be called from interrupt context. * - * Returns the devname argument passed to request_irq. + * Returns the devname argument passed to request_irq. 
*/ const void *free_irq(unsigned int irq, void *dev_id) { @@ -2073,8 +2007,6 @@ static const void *__cleanup_nmi(unsigned int irq, struct irq_desc *desc) const void *free_nmi(unsigned int irq, void *dev_id) { struct irq_desc *desc = irq_to_desc(irq); - unsigned long flags; - const void *devname; if (!desc || WARN_ON(!irq_is_nmi(desc))) return NULL; @@ -2086,53 +2018,46 @@ const void *free_nmi(unsigned int irq, void *dev_id) if (WARN_ON(desc->depth == 0)) disable_nmi_nosync(irq); - raw_spin_lock_irqsave(&desc->lock, flags); - + guard(raw_spinlock_irqsave)(&desc->lock); irq_nmi_teardown(desc); - devname = __cleanup_nmi(irq, desc); - - raw_spin_unlock_irqrestore(&desc->lock, flags); - - return devname; + return __cleanup_nmi(irq, desc); } /** - * request_threaded_irq - allocate an interrupt line - * @irq: Interrupt line to allocate - * @handler: Function to be called when the IRQ occurs. - * Primary handler for threaded interrupts. - * If handler is NULL and thread_fn != NULL - * the default primary handler is installed. - * @thread_fn: Function called from the irq handler thread - * If NULL, no irq thread is created - * @irqflags: Interrupt type flags - * @devname: An ascii name for the claiming device - * @dev_id: A cookie passed back to the handler function - * - * This call allocates interrupt resources and enables the - * interrupt line and IRQ handling. From the point this - * call is made your handler function may be invoked. Since - * your handler function must clear any interrupt the board - * raises, you must take care both to initialise your hardware - * and to set up the interrupt handler in the right order. - * - * If you want to set up a threaded irq handler for your device - * then you need to supply @handler and @thread_fn. @handler is - * still called in hard interrupt context and has to check - * whether the interrupt originates from the device. 
If yes it - * needs to disable the interrupt on the device and return - * IRQ_WAKE_THREAD which will wake up the handler thread and run - * @thread_fn. This split handler design is necessary to support - * shared interrupts. - * - * Dev_id must be globally unique. Normally the address of the - * device data structure is used as the cookie. Since the handler - * receives this value it makes sense to use it. - * - * If your interrupt is shared you must pass a non NULL dev_id - * as this is required when freeing the interrupt. - * - * Flags: + * request_threaded_irq - allocate an interrupt line + * @irq: Interrupt line to allocate + * @handler: Function to be called when the IRQ occurs. + * Primary handler for threaded interrupts. + * If handler is NULL and thread_fn != NULL + * the default primary handler is installed. + * @thread_fn: Function called from the irq handler thread + * If NULL, no irq thread is created + * @irqflags: Interrupt type flags + * @devname: An ascii name for the claiming device + * @dev_id: A cookie passed back to the handler function + * + * This call allocates interrupt resources and enables the interrupt line + * and IRQ handling. From the point this call is made your handler function + * may be invoked. Since your handler function must clear any interrupt the + * board raises, you must take care both to initialise your hardware and to + * set up the interrupt handler in the right order. + * + * If you want to set up a threaded irq handler for your device then you + * need to supply @handler and @thread_fn. @handler is still called in hard + * interrupt context and has to check whether the interrupt originates from + * the device. If yes it needs to disable the interrupt on the device and + * return IRQ_WAKE_THREAD which will wake up the handler thread and run + * @thread_fn. This split handler design is necessary to support shared + * interrupts. + * + * @dev_id must be globally unique. 
Normally the address of the device data + * structure is used as the cookie. Since the handler receives this value + * it makes sense to use it. + * + * If your interrupt is shared you must pass a non NULL dev_id as this is + * required when freeing the interrupt. + * + * Flags: * * IRQF_SHARED Interrupt is shared * IRQF_TRIGGER_* Specify active edge(s) or level @@ -2230,21 +2155,20 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, EXPORT_SYMBOL(request_threaded_irq); /** - * request_any_context_irq - allocate an interrupt line - * @irq: Interrupt line to allocate - * @handler: Function to be called when the IRQ occurs. - * Threaded handler for threaded interrupts. - * @flags: Interrupt type flags - * @name: An ascii name for the claiming device - * @dev_id: A cookie passed back to the handler function - * - * This call allocates interrupt resources and enables the - * interrupt line and IRQ handling. It selects either a - * hardirq or threaded handling method depending on the - * context. - * - * On failure, it returns a negative value. On success, - * it returns either IRQC_IS_HARDIRQ or IRQC_IS_NESTED. + * request_any_context_irq - allocate an interrupt line + * @irq: Interrupt line to allocate + * @handler: Function to be called when the IRQ occurs. + * Threaded handler for threaded interrupts. + * @flags: Interrupt type flags + * @name: An ascii name for the claiming device + * @dev_id: A cookie passed back to the handler function + * + * This call allocates interrupt resources and enables the interrupt line + * and IRQ handling. It selects either a hardirq or threaded handling + * method depending on the context. + * + * Returns: On failure, it returns a negative value. On success, it returns either + * IRQC_IS_HARDIRQ or IRQC_IS_NESTED. 
*/ int request_any_context_irq(unsigned int irq, irq_handler_t handler, unsigned long flags, const char *name, void *dev_id) @@ -2271,37 +2195,35 @@ int request_any_context_irq(unsigned int irq, irq_handler_t handler, EXPORT_SYMBOL_GPL(request_any_context_irq); /** - * request_nmi - allocate an interrupt line for NMI delivery - * @irq: Interrupt line to allocate - * @handler: Function to be called when the IRQ occurs. - * Threaded handler for threaded interrupts. - * @irqflags: Interrupt type flags - * @name: An ascii name for the claiming device - * @dev_id: A cookie passed back to the handler function - * - * This call allocates interrupt resources and enables the - * interrupt line and IRQ handling. It sets up the IRQ line - * to be handled as an NMI. - * - * An interrupt line delivering NMIs cannot be shared and IRQ handling - * cannot be threaded. - * - * Interrupt lines requested for NMI delivering must produce per cpu - * interrupts and have auto enabling setting disabled. - * - * Dev_id must be globally unique. Normally the address of the - * device data structure is used as the cookie. Since the handler - * receives this value it makes sense to use it. - * - * If the interrupt line cannot be used to deliver NMIs, function - * will fail and return a negative value. + * request_nmi - allocate an interrupt line for NMI delivery + * @irq: Interrupt line to allocate + * @handler: Function to be called when the IRQ occurs. + * Threaded handler for threaded interrupts. + * @irqflags: Interrupt type flags + * @name: An ascii name for the claiming device + * @dev_id: A cookie passed back to the handler function + * + * This call allocates interrupt resources and enables the interrupt line + * and IRQ handling. It sets up the IRQ line to be handled as an NMI. + * + * An interrupt line delivering NMIs cannot be shared and IRQ handling + * cannot be threaded. 
+ * + * Interrupt lines requested for NMI delivering must produce per cpu + * interrupts and have auto enabling setting disabled. + * + * @dev_id must be globally unique. Normally the address of the device data + * structure is used as the cookie. Since the handler receives this value + * it makes sense to use it. + * + * If the interrupt line cannot be used to deliver NMIs, function will fail + * and return a negative value. */ int request_nmi(unsigned int irq, irq_handler_t handler, unsigned long irqflags, const char *name, void *dev_id) { struct irqaction *action; struct irq_desc *desc; - unsigned long flags; int retval; if (irq == IRQ_NOTCONNECTED) @@ -2343,21 +2265,17 @@ int request_nmi(unsigned int irq, irq_handler_t handler, if (retval) goto err_irq_setup; - raw_spin_lock_irqsave(&desc->lock, flags); - - /* Setup NMI state */ - desc->istate |= IRQS_NMI; - retval = irq_nmi_setup(desc); - if (retval) { - __cleanup_nmi(irq, desc); - raw_spin_unlock_irqrestore(&desc->lock, flags); - return -EINVAL; + scoped_guard(raw_spinlock_irqsave, &desc->lock) { + /* Setup NMI state */ + desc->istate |= IRQS_NMI; + retval = irq_nmi_setup(desc); + if (retval) { + __cleanup_nmi(irq, desc); + return -EINVAL; + } + return 0; } - raw_spin_unlock_irqrestore(&desc->lock, flags); - - return 0; - err_irq_setup: irq_chip_pm_put(&desc->irq_data); err_out: @@ -2368,35 +2286,25 @@ err_out: void enable_percpu_irq(unsigned int irq, unsigned int type) { - unsigned int cpu = smp_processor_id(); - unsigned long flags; - struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_PERCPU); + scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_PERCPU) { + struct irq_desc *desc = scoped_irqdesc; - if (!desc) - return; - - /* - * If the trigger type is not specified by the caller, then - * use the default for this interrupt. 
- */ - type &= IRQ_TYPE_SENSE_MASK; - if (type == IRQ_TYPE_NONE) - type = irqd_get_trigger_type(&desc->irq_data); - - if (type != IRQ_TYPE_NONE) { - int ret; - - ret = __irq_set_trigger(desc, type); - - if (ret) { - WARN(1, "failed to set type for IRQ%d\n", irq); - goto out; + /* + * If the trigger type is not specified by the caller, then + * use the default for this interrupt. + */ + type &= IRQ_TYPE_SENSE_MASK; + if (type == IRQ_TYPE_NONE) + type = irqd_get_trigger_type(&desc->irq_data); + + if (type != IRQ_TYPE_NONE) { + if (__irq_set_trigger(desc, type)) { + WARN(1, "failed to set type for IRQ%d\n", irq); + return; + } } + irq_percpu_enable(desc, smp_processor_id()); } - - irq_percpu_enable(desc, cpu); -out: - irq_put_desc_unlock(desc, flags); } EXPORT_SYMBOL_GPL(enable_percpu_irq); @@ -2414,33 +2322,16 @@ void enable_percpu_nmi(unsigned int irq, unsigned int type) */ bool irq_percpu_is_enabled(unsigned int irq) { - unsigned int cpu = smp_processor_id(); - struct irq_desc *desc; - unsigned long flags; - bool is_enabled; - - desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_PERCPU); - if (!desc) - return false; - - is_enabled = cpumask_test_cpu(cpu, desc->percpu_enabled); - irq_put_desc_unlock(desc, flags); - - return is_enabled; + scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_PERCPU) + return cpumask_test_cpu(smp_processor_id(), scoped_irqdesc->percpu_enabled); + return false; } EXPORT_SYMBOL_GPL(irq_percpu_is_enabled); void disable_percpu_irq(unsigned int irq) { - unsigned int cpu = smp_processor_id(); - unsigned long flags; - struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_PERCPU); - - if (!desc) - return; - - irq_percpu_disable(desc, cpu); - irq_put_desc_unlock(desc, flags); + scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_PERCPU) + irq_percpu_disable(scoped_irqdesc, smp_processor_id()); } EXPORT_SYMBOL_GPL(disable_percpu_irq); @@ -2456,71 +2347,47 @@ static struct irqaction *__free_percpu_irq(unsigned int irq, 
void __percpu *dev_ { struct irq_desc *desc = irq_to_desc(irq); struct irqaction *action; - unsigned long flags; WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq); if (!desc) return NULL; - raw_spin_lock_irqsave(&desc->lock, flags); + scoped_guard(raw_spinlock_irqsave, &desc->lock) { + action = desc->action; + if (!action || action->percpu_dev_id != dev_id) { + WARN(1, "Trying to free already-free IRQ %d\n", irq); + return NULL; + } - action = desc->action; - if (!action || action->percpu_dev_id != dev_id) { - WARN(1, "Trying to free already-free IRQ %d\n", irq); - goto bad; - } + if (!cpumask_empty(desc->percpu_enabled)) { + WARN(1, "percpu IRQ %d still enabled on CPU%d!\n", + irq, cpumask_first(desc->percpu_enabled)); + return NULL; + } - if (!cpumask_empty(desc->percpu_enabled)) { - WARN(1, "percpu IRQ %d still enabled on CPU%d!\n", - irq, cpumask_first(desc->percpu_enabled)); - goto bad; + /* Found it - now remove it from the list of entries: */ + desc->action = NULL; + desc->istate &= ~IRQS_NMI; } - /* Found it - now remove it from the list of entries: */ - desc->action = NULL; - - desc->istate &= ~IRQS_NMI; - - raw_spin_unlock_irqrestore(&desc->lock, flags); - unregister_handler_proc(irq, action); - irq_chip_pm_put(&desc->irq_data); module_put(desc->owner); return action; - -bad: - raw_spin_unlock_irqrestore(&desc->lock, flags); - return NULL; } /** - * remove_percpu_irq - free a per-cpu interrupt - * @irq: Interrupt line to free - * @act: irqaction for the interrupt + * free_percpu_irq - free an interrupt allocated with request_percpu_irq + * @irq: Interrupt line to free + * @dev_id: Device identity to free * - * Used to remove interrupts statically setup by the early boot process. 
- */ -void remove_percpu_irq(unsigned int irq, struct irqaction *act) -{ - struct irq_desc *desc = irq_to_desc(irq); - - if (desc && irq_settings_is_per_cpu_devid(desc)) - __free_percpu_irq(irq, act->percpu_dev_id); -} - -/** - * free_percpu_irq - free an interrupt allocated with request_percpu_irq - * @irq: Interrupt line to free - * @dev_id: Device identity to free + * Remove a percpu interrupt handler. The handler is removed, but the + * interrupt line is not disabled. This must be done on each CPU before + * calling this function. The function does not return until any executing + * interrupts for this IRQ have completed. * - * Remove a percpu interrupt handler. The handler is removed, but - * the interrupt line is not disabled. This must be done on each - * CPU before calling this function. The function does not return - * until any executing interrupts for this IRQ have completed. - * - * This function must not be called from interrupt context. + * This function must not be called from interrupt context. */ void free_percpu_irq(unsigned int irq, void __percpu *dev_id) { @@ -2549,9 +2416,9 @@ void free_percpu_nmi(unsigned int irq, void __percpu *dev_id) } /** - * setup_percpu_irq - setup a per-cpu interrupt - * @irq: Interrupt line to setup - * @act: irqaction for the interrupt + * setup_percpu_irq - setup a per-cpu interrupt + * @irq: Interrupt line to setup + * @act: irqaction for the interrupt * * Used to statically setup per-cpu interrupts in the early boot process. */ @@ -2576,21 +2443,20 @@ int setup_percpu_irq(unsigned int irq, struct irqaction *act) } /** - * __request_percpu_irq - allocate a percpu interrupt line - * @irq: Interrupt line to allocate - * @handler: Function to be called when the IRQ occurs. 
- * @flags: Interrupt type flags (IRQF_TIMER only) - * @devname: An ascii name for the claiming device - * @dev_id: A percpu cookie passed back to the handler function - * - * This call allocates interrupt resources and enables the - * interrupt on the local CPU. If the interrupt is supposed to be - * enabled on other CPUs, it has to be done on each CPU using - * enable_percpu_irq(). - * - * Dev_id must be globally unique. It is a per-cpu variable, and - * the handler gets called with the interrupted CPU's instance of - * that variable. + * __request_percpu_irq - allocate a percpu interrupt line + * @irq: Interrupt line to allocate + * @handler: Function to be called when the IRQ occurs. + * @flags: Interrupt type flags (IRQF_TIMER only) + * @devname: An ascii name for the claiming device + * @dev_id: A percpu cookie passed back to the handler function + * + * This call allocates interrupt resources and enables the interrupt on the + * local CPU. If the interrupt is supposed to be enabled on other CPUs, it + * has to be done on each CPU using enable_percpu_irq(). + * + * @dev_id must be globally unique. It is a per-cpu variable, and + * the handler gets called with the interrupted CPU's instance of + * that variable. */ int __request_percpu_irq(unsigned int irq, irq_handler_t handler, unsigned long flags, const char *devname, @@ -2638,32 +2504,31 @@ int __request_percpu_irq(unsigned int irq, irq_handler_t handler, EXPORT_SYMBOL_GPL(__request_percpu_irq); /** - * request_percpu_nmi - allocate a percpu interrupt line for NMI delivery - * @irq: Interrupt line to allocate - * @handler: Function to be called when the IRQ occurs. - * @name: An ascii name for the claiming device - * @dev_id: A percpu cookie passed back to the handler function + * request_percpu_nmi - allocate a percpu interrupt line for NMI delivery + * @irq: Interrupt line to allocate + * @handler: Function to be called when the IRQ occurs. 
+ * @name: An ascii name for the claiming device + * @dev_id: A percpu cookie passed back to the handler function * - * This call allocates interrupt resources for a per CPU NMI. Per CPU NMIs - * have to be setup on each CPU by calling prepare_percpu_nmi() before - * being enabled on the same CPU by using enable_percpu_nmi(). + * This call allocates interrupt resources for a per CPU NMI. Per CPU NMIs + * have to be setup on each CPU by calling prepare_percpu_nmi() before + * being enabled on the same CPU by using enable_percpu_nmi(). * - * Dev_id must be globally unique. It is a per-cpu variable, and - * the handler gets called with the interrupted CPU's instance of - * that variable. + * @dev_id must be globally unique. It is a per-cpu variable, and the + * handler gets called with the interrupted CPU's instance of that + * variable. * - * Interrupt lines requested for NMI delivering should have auto enabling - * setting disabled. + * Interrupt lines requested for NMI delivering should have auto enabling + * setting disabled. * - * If the interrupt line cannot be used to deliver NMIs, function - * will fail returning a negative value. + * If the interrupt line cannot be used to deliver NMIs, function + * will fail returning a negative value. 
*/ int request_percpu_nmi(unsigned int irq, irq_handler_t handler, const char *name, void __percpu *dev_id) { struct irqaction *action; struct irq_desc *desc; - unsigned long flags; int retval; if (!handler) @@ -2699,10 +2564,8 @@ int request_percpu_nmi(unsigned int irq, irq_handler_t handler, if (retval) goto err_irq_setup; - raw_spin_lock_irqsave(&desc->lock, flags); - desc->istate |= IRQS_NMI; - raw_spin_unlock_irqrestore(&desc->lock, flags); - + scoped_guard(raw_spinlock_irqsave, &desc->lock) + desc->istate |= IRQS_NMI; return 0; err_irq_setup: @@ -2714,83 +2577,58 @@ err_out: } /** - * prepare_percpu_nmi - performs CPU local setup for NMI delivery - * @irq: Interrupt line to prepare for NMI delivery + * prepare_percpu_nmi - performs CPU local setup for NMI delivery + * @irq: Interrupt line to prepare for NMI delivery * - * This call prepares an interrupt line to deliver NMI on the current CPU, - * before that interrupt line gets enabled with enable_percpu_nmi(). + * This call prepares an interrupt line to deliver NMI on the current CPU, + * before that interrupt line gets enabled with enable_percpu_nmi(). * - * As a CPU local operation, this should be called from non-preemptible - * context. + * As a CPU local operation, this should be called from non-preemptible + * context. * - * If the interrupt line cannot be used to deliver NMIs, function - * will fail returning a negative value. + * If the interrupt line cannot be used to deliver NMIs, function will fail + * returning a negative value. 
*/ int prepare_percpu_nmi(unsigned int irq) { - unsigned long flags; - struct irq_desc *desc; - int ret = 0; + int ret = -EINVAL; WARN_ON(preemptible()); - desc = irq_get_desc_lock(irq, &flags, - IRQ_GET_DESC_CHECK_PERCPU); - if (!desc) - return -EINVAL; - - if (WARN(!irq_is_nmi(desc), - KERN_ERR "prepare_percpu_nmi called for a non-NMI interrupt: irq %u\n", - irq)) { - ret = -EINVAL; - goto out; - } + scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_PERCPU) { + if (WARN(!irq_is_nmi(scoped_irqdesc), + "prepare_percpu_nmi called for a non-NMI interrupt: irq %u\n", irq)) + return -EINVAL; - ret = irq_nmi_setup(desc); - if (ret) { - pr_err("Failed to setup NMI delivery: irq %u\n", irq); - goto out; + ret = irq_nmi_setup(scoped_irqdesc); + if (ret) + pr_err("Failed to setup NMI delivery: irq %u\n", irq); } - -out: - irq_put_desc_unlock(desc, flags); return ret; } /** - * teardown_percpu_nmi - undoes NMI setup of IRQ line - * @irq: Interrupt line from which CPU local NMI configuration should be - * removed - * - * This call undoes the setup done by prepare_percpu_nmi(). + * teardown_percpu_nmi - undoes NMI setup of IRQ line + * @irq: Interrupt line from which CPU local NMI configuration should be removed * - * IRQ line should not be enabled for the current CPU. + * This call undoes the setup done by prepare_percpu_nmi(). * - * As a CPU local operation, this should be called from non-preemptible - * context. + * IRQ line should not be enabled for the current CPU. + * As a CPU local operation, this should be called from non-preemptible + * context. 
*/ void teardown_percpu_nmi(unsigned int irq) { - unsigned long flags; - struct irq_desc *desc; - WARN_ON(preemptible()); - desc = irq_get_desc_lock(irq, &flags, - IRQ_GET_DESC_CHECK_PERCPU); - if (!desc) - return; - - if (WARN_ON(!irq_is_nmi(desc))) - goto out; - - irq_nmi_teardown(desc); -out: - irq_put_desc_unlock(desc, flags); + scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_PERCPU) { + if (WARN_ON(!irq_is_nmi(scoped_irqdesc))) + return; + irq_nmi_teardown(scoped_irqdesc); + } } -int __irq_get_irqchip_state(struct irq_data *data, enum irqchip_irq_state which, - bool *state) +static int __irq_get_irqchip_state(struct irq_data *data, enum irqchip_irq_state which, bool *state) { struct irq_chip *chip; int err = -EINVAL; @@ -2814,87 +2652,62 @@ int __irq_get_irqchip_state(struct irq_data *data, enum irqchip_irq_state which, } /** - * irq_get_irqchip_state - returns the irqchip state of a interrupt. - * @irq: Interrupt line that is forwarded to a VM - * @which: One of IRQCHIP_STATE_* the caller wants to know about - * @state: a pointer to a boolean where the state is to be stored + * irq_get_irqchip_state - returns the irqchip state of a interrupt. + * @irq: Interrupt line that is forwarded to a VM + * @which: One of IRQCHIP_STATE_* the caller wants to know about + * @state: a pointer to a boolean where the state is to be stored * - * This call snapshots the internal irqchip state of an - * interrupt, returning into @state the bit corresponding to - * stage @which + * This call snapshots the internal irqchip state of an interrupt, + * returning into @state the bit corresponding to stage @which * - * This function should be called with preemption disabled if the - * interrupt controller has per-cpu registers. + * This function should be called with preemption disabled if the interrupt + * controller has per-cpu registers. 
*/ -int irq_get_irqchip_state(unsigned int irq, enum irqchip_irq_state which, - bool *state) +int irq_get_irqchip_state(unsigned int irq, enum irqchip_irq_state which, bool *state) { - struct irq_desc *desc; - struct irq_data *data; - unsigned long flags; - int err = -EINVAL; - - desc = irq_get_desc_buslock(irq, &flags, 0); - if (!desc) - return err; + scoped_irqdesc_get_and_buslock(irq, 0) { + struct irq_data *data = irq_desc_get_irq_data(scoped_irqdesc); - data = irq_desc_get_irq_data(desc); - - err = __irq_get_irqchip_state(data, which, state); - - irq_put_desc_busunlock(desc, flags); - return err; + return __irq_get_irqchip_state(data, which, state); + } + return -EINVAL; } EXPORT_SYMBOL_GPL(irq_get_irqchip_state); /** - * irq_set_irqchip_state - set the state of a forwarded interrupt. - * @irq: Interrupt line that is forwarded to a VM - * @which: State to be restored (one of IRQCHIP_STATE_*) - * @val: Value corresponding to @which + * irq_set_irqchip_state - set the state of a forwarded interrupt. + * @irq: Interrupt line that is forwarded to a VM + * @which: State to be restored (one of IRQCHIP_STATE_*) + * @val: Value corresponding to @which * - * This call sets the internal irqchip state of an interrupt, - * depending on the value of @which. + * This call sets the internal irqchip state of an interrupt, depending on + * the value of @which. * - * This function should be called with migration disabled if the - * interrupt controller has per-cpu registers. + * This function should be called with migration disabled if the interrupt + * controller has per-cpu registers. 
*/ -int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which, - bool val) +int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which, bool val) { - struct irq_desc *desc; - struct irq_data *data; - struct irq_chip *chip; - unsigned long flags; - int err = -EINVAL; + scoped_irqdesc_get_and_buslock(irq, 0) { + struct irq_data *data = irq_desc_get_irq_data(scoped_irqdesc); + struct irq_chip *chip; - desc = irq_get_desc_buslock(irq, &flags, 0); - if (!desc) - return err; + do { + chip = irq_data_get_irq_chip(data); - data = irq_desc_get_irq_data(desc); + if (WARN_ON_ONCE(!chip)) + return -ENODEV; - do { - chip = irq_data_get_irq_chip(data); - if (WARN_ON_ONCE(!chip)) { - err = -ENODEV; - goto out_unlock; - } - if (chip->irq_set_irqchip_state) - break; -#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY - data = data->parent_data; -#else - data = NULL; -#endif - } while (data); + if (chip->irq_set_irqchip_state) + break; - if (data) - err = chip->irq_set_irqchip_state(data, which, val); + data = irqd_get_parent_data(data); + } while (data); -out_unlock: - irq_put_desc_busunlock(desc, flags); - return err; + if (data) + return chip->irq_set_irqchip_state(data, which, val); + } + return -EINVAL; } EXPORT_SYMBOL_GPL(irq_set_irqchip_state); diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index eb150afd671f..f2b2929986ff 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -35,6 +35,16 @@ bool irq_fixup_move_pending(struct irq_desc *desc, bool force_clear) return true; } +void irq_force_complete_move(struct irq_desc *desc) +{ + for (struct irq_data *d = irq_desc_get_irq_data(desc); d; d = irqd_get_parent_data(d)) { + if (d->chip && d->chip->irq_force_complete_move) { + d->chip->irq_force_complete_move(d); + return; + } + } +} + void irq_move_masked_irq(struct irq_data *idata) { struct irq_desc *desc = irq_data_to_desc(idata); @@ -117,3 +127,13 @@ void __irq_move_irq(struct irq_data *idata) if (!masked) 
idata->chip->irq_unmask(idata); } + +bool irq_can_move_in_process_context(struct irq_data *data) +{ + /* + * Get the top level irq_data in the hierarchy, which is optimized + * away when CONFIG_IRQ_DOMAIN_HIERARCHY is disabled. + */ + data = irq_desc_get_irq_data(irq_data_to_desc(data)); + return irq_can_move_pcntxt(data); +} diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 396a067a8a56..9febe797a5f6 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -15,6 +15,7 @@ #include <linux/mutex.h> #include <linux/pci.h> #include <linux/slab.h> +#include <linux/seq_file.h> #include <linux/sysfs.h> #include <linux/types.h> #include <linux/xarray.h> @@ -58,7 +59,8 @@ struct msi_ctrl { static void msi_domain_free_locked(struct device *dev, struct msi_ctrl *ctrl); static unsigned int msi_domain_get_hwsize(struct device *dev, unsigned int domid); static inline int msi_sysfs_create_group(struct device *dev); - +static int msi_domain_prepare_irqs(struct irq_domain *domain, struct device *dev, + int nvec, msi_alloc_info_t *arg); /** * msi_alloc_desc - Allocate an initialized msi_desc @@ -342,26 +344,30 @@ int msi_setup_device_data(struct device *dev) } /** - * msi_lock_descs - Lock the MSI descriptor storage of a device + * __msi_lock_descs - Lock the MSI descriptor storage of a device * @dev: Device to operate on + * + * Internal function for guard(msi_descs_lock). Don't use in code. */ -void msi_lock_descs(struct device *dev) +void __msi_lock_descs(struct device *dev) { mutex_lock(&dev->msi.data->mutex); } -EXPORT_SYMBOL_GPL(msi_lock_descs); +EXPORT_SYMBOL_GPL(__msi_lock_descs); /** - * msi_unlock_descs - Unlock the MSI descriptor storage of a device + * __msi_unlock_descs - Unlock the MSI descriptor storage of a device * @dev: Device to operate on + * + * Internal function for guard(msi_descs_lock). Don't use in code. 
*/ -void msi_unlock_descs(struct device *dev) +void __msi_unlock_descs(struct device *dev) { /* Invalidate the index which was cached by the iterator */ dev->msi.data->__iter_idx = MSI_XA_MAX_INDEX; mutex_unlock(&dev->msi.data->mutex); } -EXPORT_SYMBOL_GPL(msi_unlock_descs); +EXPORT_SYMBOL_GPL(__msi_unlock_descs); static struct msi_desc *msi_find_desc(struct msi_device_data *md, unsigned int domid, enum msi_desc_filter filter) @@ -447,7 +453,6 @@ EXPORT_SYMBOL_GPL(msi_next_desc); unsigned int msi_domain_get_virq(struct device *dev, unsigned int domid, unsigned int index) { struct msi_desc *desc; - unsigned int ret = 0; bool pcimsi = false; struct xarray *xa; @@ -461,7 +466,7 @@ unsigned int msi_domain_get_virq(struct device *dev, unsigned int domid, unsigne if (dev_is_pci(dev) && domid == MSI_DEFAULT_DOMAIN) pcimsi = to_pci_dev(dev)->msi_enabled; - msi_lock_descs(dev); + guard(msi_descs_lock)(dev); xa = &dev->msi.data->__domains[domid].store; desc = xa_load(xa, pcimsi ? 0 : index); if (desc && desc->irq) { @@ -470,16 +475,12 @@ unsigned int msi_domain_get_virq(struct device *dev, unsigned int domid, unsigne * PCI-MSIX and platform MSI use a descriptor per * interrupt. */ - if (pcimsi) { - if (index < desc->nvec_used) - ret = desc->irq + index; - } else { - ret = desc->irq; - } + if (!pcimsi) + return desc->irq; + if (index < desc->nvec_used) + return desc->irq + index; } - - msi_unlock_descs(dev); - return ret; + return 0; } EXPORT_SYMBOL_GPL(msi_domain_get_virq); @@ -756,12 +757,30 @@ static int msi_domain_translate(struct irq_domain *domain, struct irq_fwspec *fw return info->ops->msi_translate(domain, fwspec, hwirq, type); } +#ifdef CONFIG_GENERIC_IRQ_DEBUGFS +static void msi_domain_debug_show(struct seq_file *m, struct irq_domain *d, + struct irq_data *irqd, int ind) +{ + struct msi_desc *desc = irqd ? 
irq_data_get_msi_desc(irqd) : NULL; + + if (!desc) + return; + + seq_printf(m, "\n%*saddress_hi: 0x%08x", ind + 1, "", desc->msg.address_hi); + seq_printf(m, "\n%*saddress_lo: 0x%08x", ind + 1, "", desc->msg.address_lo); + seq_printf(m, "\n%*smsg_data: 0x%08x\n", ind + 1, "", desc->msg.data); +} +#endif + static const struct irq_domain_ops msi_domain_ops = { .alloc = msi_domain_alloc, .free = msi_domain_free, .activate = msi_domain_activate, .deactivate = msi_domain_deactivate, .translate = msi_domain_translate, +#ifdef CONFIG_GENERIC_IRQ_DEBUGFS + .debug_show = msi_domain_debug_show, +#endif }; static irq_hw_number_t msi_domain_ops_get_hwirq(struct msi_domain_info *info, @@ -777,6 +796,10 @@ static int msi_domain_ops_prepare(struct irq_domain *domain, struct device *dev, return 0; } +static void msi_domain_ops_teardown(struct irq_domain *domain, msi_alloc_info_t *arg) +{ +} + static void msi_domain_ops_set_desc(msi_alloc_info_t *arg, struct msi_desc *desc) { @@ -802,6 +825,7 @@ static struct msi_domain_ops msi_domain_ops_default = { .get_hwirq = msi_domain_ops_get_hwirq, .msi_init = msi_domain_ops_init, .msi_prepare = msi_domain_ops_prepare, + .msi_teardown = msi_domain_ops_teardown, .set_desc = msi_domain_ops_set_desc, }; @@ -823,6 +847,8 @@ static void msi_domain_update_dom_ops(struct msi_domain_info *info) ops->msi_init = msi_domain_ops_default.msi_init; if (ops->msi_prepare == NULL) ops->msi_prepare = msi_domain_ops_default.msi_prepare; + if (ops->msi_teardown == NULL) + ops->msi_teardown = msi_domain_ops_default.msi_teardown; if (ops->set_desc == NULL) ops->set_desc = msi_domain_ops_default.set_desc; } @@ -886,6 +912,32 @@ struct irq_domain *msi_create_irq_domain(struct fwnode_handle *fwnode, } /** + * msi_create_parent_irq_domain - Create an MSI-parent interrupt domain + * @info: MSI irqdomain creation info + * @msi_parent_ops: MSI parent callbacks and configuration + * + * Return: pointer to the created &struct irq_domain or %NULL on failure + */ +struct 
irq_domain *msi_create_parent_irq_domain(struct irq_domain_info *info, + const struct msi_parent_ops *msi_parent_ops) +{ + struct irq_domain *d; + + info->hwirq_max = max(info->hwirq_max, info->size); + info->size = info->hwirq_max; + info->domain_flags |= IRQ_DOMAIN_FLAG_MSI_PARENT; + info->bus_token = msi_parent_ops->bus_select_token; + + d = irq_domain_instantiate(info); + if (IS_ERR(d)) + return NULL; + + d->msi_parent_ops = msi_parent_ops; + return d; +} +EXPORT_SYMBOL_GPL(msi_create_parent_irq_domain); + +/** * msi_parent_init_dev_msi_info - Delegate initialization of device MSI info down * in the domain hierarchy * @dev: The device for which the domain should be created @@ -979,9 +1031,8 @@ bool msi_create_device_irq_domain(struct device *dev, unsigned int domid, void *chip_data) { struct irq_domain *domain, *parent = dev->msi.domain; - struct fwnode_handle *fwnode, *fwnalloced = NULL; - struct msi_domain_template *bundle; const struct msi_parent_ops *pops; + struct fwnode_handle *fwnode; if (!irq_domain_is_msi_parent(parent)) return false; @@ -989,7 +1040,8 @@ bool msi_create_device_irq_domain(struct device *dev, unsigned int domid, if (domid >= MSI_MAX_DEVICE_IRQDOMAINS) return false; - bundle = kmemdup(template, sizeof(*bundle), GFP_KERNEL); + struct msi_domain_template *bundle __free(kfree) = + kmemdup(template, sizeof(*bundle), GFP_KERNEL); if (!bundle) return false; @@ -998,6 +1050,7 @@ bool msi_create_device_irq_domain(struct device *dev, unsigned int domid, bundle->info.ops = &bundle->ops; bundle->info.data = domain_data; bundle->info.chip_data = chip_data; + bundle->info.alloc_data = &bundle->alloc_info; pops = parent->msi_parent_ops; snprintf(bundle->name, sizeof(bundle->name), "%s%s-%s", @@ -1012,41 +1065,43 @@ bool msi_create_device_irq_domain(struct device *dev, unsigned int domid, * node as they are not guaranteed to have a fwnode. They are never * looked up and always handled in the context of the device. 
*/ - if (bundle->info.flags & MSI_FLAG_USE_DEV_FWNODE) - fwnode = dev->fwnode; + struct fwnode_handle *fwnode_alloced __free(irq_domain_free_fwnode) = NULL; + + if (!(bundle->info.flags & MSI_FLAG_USE_DEV_FWNODE)) + fwnode = fwnode_alloced = irq_domain_alloc_named_fwnode(bundle->name); else - fwnode = fwnalloced = irq_domain_alloc_named_fwnode(bundle->name); + fwnode = dev->fwnode; if (!fwnode) - goto free_bundle; + return false; if (msi_setup_device_data(dev)) - goto free_fwnode; - - msi_lock_descs(dev); + return false; + guard(msi_descs_lock)(dev); if (WARN_ON_ONCE(msi_get_device_domain(dev, domid))) - goto fail; + return false; if (!pops->init_dev_msi_info(dev, parent, parent, &bundle->info)) - goto fail; + return false; domain = __msi_create_irq_domain(fwnode, &bundle->info, IRQ_DOMAIN_FLAG_MSI_DEVICE, parent); if (!domain) - goto fail; + return false; domain->dev = dev; dev->msi.data->__domains[domid].domain = domain; - msi_unlock_descs(dev); - return true; -fail: - msi_unlock_descs(dev); -free_fwnode: - irq_domain_free_fwnode(fwnalloced); -free_bundle: - kfree(bundle); - return false; + if (msi_domain_prepare_irqs(domain, dev, hwsize, &bundle->alloc_info)) { + dev->msi.data->__domains[domid].domain = NULL; + irq_domain_remove(domain); + return false; + } + + /* @bundle and @fwnode_alloced are now in use. 
Prevent cleanup */ + retain_and_null_ptr(bundle); + retain_and_null_ptr(fwnode_alloced); + return true; } /** @@ -1060,23 +1115,21 @@ void msi_remove_device_irq_domain(struct device *dev, unsigned int domid) struct msi_domain_info *info; struct irq_domain *domain; - msi_lock_descs(dev); - + guard(msi_descs_lock)(dev); domain = msi_get_device_domain(dev, domid); - if (!domain || !irq_domain_is_msi_device(domain)) - goto unlock; + return; dev->msi.data->__domains[domid].domain = NULL; info = domain->host_data; + + info->ops->msi_teardown(domain, info->alloc_data); + if (irq_domain_is_msi_device(domain)) fwnode = domain->fwnode; irq_domain_remove(domain); irq_domain_free_fwnode(fwnode); kfree(container_of(info, struct msi_domain_template, info)); - -unlock: - msi_unlock_descs(dev); } /** @@ -1092,16 +1145,14 @@ bool msi_match_device_irq_domain(struct device *dev, unsigned int domid, { struct msi_domain_info *info; struct irq_domain *domain; - bool ret = false; - msi_lock_descs(dev); + guard(msi_descs_lock)(dev); domain = msi_get_device_domain(dev, domid); if (domain && irq_domain_is_msi_device(domain)) { info = domain->host_data; - ret = info->bus_token == bus_token; + return info->bus_token == bus_token; } - msi_unlock_descs(dev); - return ret; + return false; } static int msi_domain_prepare_irqs(struct irq_domain *domain, struct device *dev, @@ -1143,7 +1194,7 @@ static bool msi_check_reservation_mode(struct irq_domain *domain, if (!(info->flags & MSI_FLAG_MUST_REACTIVATE)) return false; - if (IS_ENABLED(CONFIG_PCI_MSI) && pci_msi_ignore_mask) + if (info->flags & MSI_FLAG_NO_MASK) return false; /* @@ -1219,6 +1270,24 @@ static int msi_init_virq(struct irq_domain *domain, int virq, unsigned int vflag return 0; } +static int populate_alloc_info(struct irq_domain *domain, struct device *dev, + unsigned int nirqs, msi_alloc_info_t *arg) +{ + struct msi_domain_info *info = domain->host_data; + + /* + * If the caller has provided a template alloc info, use that. 
Once + * all users of msi_create_irq_domain() have been eliminated, this + * should be the only source of allocation information, and the + * prepare call below should be finally removed. + */ + if (!info->alloc_data) + return msi_domain_prepare_irqs(domain, dev, nirqs, arg); + + *arg = *info->alloc_data; + return 0; +} + static int __msi_domain_alloc_irqs(struct device *dev, struct irq_domain *domain, struct msi_ctrl *ctrl) { @@ -1231,7 +1300,7 @@ static int __msi_domain_alloc_irqs(struct device *dev, struct irq_domain *domain unsigned long idx; int i, ret, virq; - ret = msi_domain_prepare_irqs(domain, dev, ctrl->nirqs, &arg); + ret = populate_alloc_info(domain, dev, ctrl->nirqs, &arg); if (ret) return ret; @@ -1372,12 +1441,9 @@ int msi_domain_alloc_irqs_range_locked(struct device *dev, unsigned int domid, int msi_domain_alloc_irqs_range(struct device *dev, unsigned int domid, unsigned int first, unsigned int last) { - int ret; - msi_lock_descs(dev); - ret = msi_domain_alloc_irqs_range_locked(dev, domid, first, last); - msi_unlock_descs(dev); - return ret; + guard(msi_descs_lock)(dev); + return msi_domain_alloc_irqs_range_locked(dev, domid, first, last); } EXPORT_SYMBOL_GPL(msi_domain_alloc_irqs_range); @@ -1481,12 +1547,8 @@ struct msi_map msi_domain_alloc_irq_at(struct device *dev, unsigned int domid, u const struct irq_affinity_desc *affdesc, union msi_instance_cookie *icookie) { - struct msi_map map; - - msi_lock_descs(dev); - map = __msi_domain_alloc_irq_at(dev, domid, index, affdesc, icookie); - msi_unlock_descs(dev); - return map; + guard(msi_descs_lock)(dev); + return __msi_domain_alloc_irq_at(dev, domid, index, affdesc, icookie); } /** @@ -1523,13 +1585,11 @@ int msi_device_domain_alloc_wired(struct irq_domain *domain, unsigned int hwirq, icookie.value = ((u64)type << 32) | hwirq; - msi_lock_descs(dev); + guard(msi_descs_lock)(dev); if (WARN_ON_ONCE(msi_get_device_domain(dev, domid) != domain)) map.index = -EINVAL; else map = 
__msi_domain_alloc_irq_at(dev, domid, MSI_ANY_INDEX, NULL, &icookie); - msi_unlock_descs(dev); - return map.index >= 0 ? map.virq : map.index; } @@ -1622,9 +1682,8 @@ void msi_domain_free_irqs_range_locked(struct device *dev, unsigned int domid, void msi_domain_free_irqs_range(struct device *dev, unsigned int domid, unsigned int first, unsigned int last) { - msi_lock_descs(dev); + guard(msi_descs_lock)(dev); msi_domain_free_irqs_range_locked(dev, domid, first, last); - msi_unlock_descs(dev); } EXPORT_SYMBOL_GPL(msi_domain_free_irqs_all); @@ -1654,9 +1713,8 @@ void msi_domain_free_irqs_all_locked(struct device *dev, unsigned int domid) */ void msi_domain_free_irqs_all(struct device *dev, unsigned int domid) { - msi_lock_descs(dev); + guard(msi_descs_lock)(dev); msi_domain_free_irqs_all_locked(dev, domid); - msi_unlock_descs(dev); } /** @@ -1675,12 +1733,11 @@ void msi_device_domain_free_wired(struct irq_domain *domain, unsigned int virq) if (WARN_ON_ONCE(!dev || !desc || domain->bus_token != DOMAIN_BUS_WIRED_TO_MSI)) return; - msi_lock_descs(dev); - if (!WARN_ON_ONCE(msi_get_device_domain(dev, MSI_DEFAULT_DOMAIN) != domain)) { - msi_domain_free_irqs_range_locked(dev, MSI_DEFAULT_DOMAIN, desc->msi_index, - desc->msi_index); - } - msi_unlock_descs(dev); + guard(msi_descs_lock)(dev); + if (WARN_ON_ONCE(msi_get_device_domain(dev, MSI_DEFAULT_DOMAIN) != domain)) + return; + msi_domain_free_irqs_range_locked(dev, MSI_DEFAULT_DOMAIN, desc->msi_index, + desc->msi_index); } /** diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index c556bc49d213..445912d51033 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -46,8 +46,7 @@ void irq_pm_install_action(struct irq_desc *desc, struct irqaction *action) desc->cond_suspend_depth++; WARN_ON_ONCE(desc->no_suspend_depth && - (desc->no_suspend_depth + - desc->cond_suspend_depth) != desc->nr_actions); + (desc->no_suspend_depth + desc->cond_suspend_depth) != desc->nr_actions); } /* @@ -134,14 +133,12 @@ void suspend_device_irqs(void) 
int irq; for_each_irq_desc(irq, desc) { - unsigned long flags; bool sync; if (irq_settings_is_nested_thread(desc)) continue; - raw_spin_lock_irqsave(&desc->lock, flags); - sync = suspend_device_irq(desc); - raw_spin_unlock_irqrestore(&desc->lock, flags); + scoped_guard(raw_spinlock_irqsave, &desc->lock) + sync = suspend_device_irq(desc); if (sync) synchronize_irq(irq); @@ -186,18 +183,15 @@ static void resume_irqs(bool want_early) int irq; for_each_irq_desc(irq, desc) { - unsigned long flags; - bool is_early = desc->action && - desc->action->flags & IRQF_EARLY_RESUME; + bool is_early = desc->action && desc->action->flags & IRQF_EARLY_RESUME; if (!is_early && want_early) continue; if (irq_settings_is_nested_thread(desc)) continue; - raw_spin_lock_irqsave(&desc->lock, flags); + guard(raw_spinlock_irqsave)(&desc->lock); resume_irq(desc); - raw_spin_unlock_irqrestore(&desc->lock, flags); } } @@ -207,22 +201,16 @@ static void resume_irqs(bool want_early) */ void rearm_wake_irq(unsigned int irq) { - unsigned long flags; - struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); + scoped_irqdesc_get_and_buslock(irq, IRQ_GET_DESC_CHECK_GLOBAL) { + struct irq_desc *desc = scoped_irqdesc; - if (!desc) - return; - - if (!(desc->istate & IRQS_SUSPENDED) || - !irqd_is_wakeup_set(&desc->irq_data)) - goto unlock; - - desc->istate &= ~IRQS_SUSPENDED; - irqd_set(&desc->irq_data, IRQD_WAKEUP_ARMED); - __enable_irq(desc); + if (!(desc->istate & IRQS_SUSPENDED) || !irqd_is_wakeup_set(&desc->irq_data)) + return; -unlock: - irq_put_desc_busunlock(desc, flags); + desc->istate &= ~IRQS_SUSPENDED; + irqd_set(&desc->irq_data, IRQD_WAKEUP_ARMED); + __enable_irq(desc); + } } /** diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 8e29809de38d..29c2404e743b 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -81,20 +81,18 @@ static int show_irq_affinity(int type, struct seq_file *m) static int irq_affinity_hint_proc_show(struct seq_file *m, void *v) { 
struct irq_desc *desc = irq_to_desc((long)m->private); - unsigned long flags; cpumask_var_t mask; if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) return -ENOMEM; - raw_spin_lock_irqsave(&desc->lock, flags); - if (desc->affinity_hint) - cpumask_copy(mask, desc->affinity_hint); - raw_spin_unlock_irqrestore(&desc->lock, flags); + scoped_guard(raw_spinlock_irq, &desc->lock) { + if (desc->affinity_hint) + cpumask_copy(mask, desc->affinity_hint); + } seq_printf(m, "%*pb\n", cpumask_pr_args(mask)); free_cpumask_var(mask); - return 0; } @@ -295,32 +293,26 @@ static int irq_spurious_proc_show(struct seq_file *m, void *v) #define MAX_NAMELEN 128 -static int name_unique(unsigned int irq, struct irqaction *new_action) +static bool name_unique(unsigned int irq, struct irqaction *new_action) { struct irq_desc *desc = irq_to_desc(irq); struct irqaction *action; - unsigned long flags; - int ret = 1; - raw_spin_lock_irqsave(&desc->lock, flags); + guard(raw_spinlock_irq)(&desc->lock); for_each_action_of_desc(desc, action) { if ((action != new_action) && action->name && - !strcmp(new_action->name, action->name)) { - ret = 0; - break; - } + !strcmp(new_action->name, action->name)) + return false; } - raw_spin_unlock_irqrestore(&desc->lock, flags); - return ret; + return true; } void register_handler_proc(unsigned int irq, struct irqaction *action) { - char name [MAX_NAMELEN]; + char name[MAX_NAMELEN]; struct irq_desc *desc = irq_to_desc(irq); - if (!desc->dir || action->dir || !action->name || - !name_unique(irq, action)) + if (!desc->dir || action->dir || !action->name || !name_unique(irq, action)) return; snprintf(name, MAX_NAMELEN, "%s", action->name); @@ -347,17 +339,16 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc) * added, not when the descriptor is created, so multiple * tasks might try to register at the same time. 
*/ - mutex_lock(&register_lock); + guard(mutex)(&register_lock); if (desc->dir) - goto out_unlock; - - sprintf(name, "%d", irq); + return; /* create /proc/irq/1234 */ + sprintf(name, "%u", irq); desc->dir = proc_mkdir(name, root_irq_dir); if (!desc->dir) - goto out_unlock; + return; #ifdef CONFIG_SMP umode_t umode = S_IRUGO; @@ -366,31 +357,27 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc) umode |= S_IWUSR; /* create /proc/irq/<irq>/smp_affinity */ - proc_create_data("smp_affinity", umode, desc->dir, - &irq_affinity_proc_ops, irqp); + proc_create_data("smp_affinity", umode, desc->dir, &irq_affinity_proc_ops, irqp); /* create /proc/irq/<irq>/affinity_hint */ proc_create_single_data("affinity_hint", 0444, desc->dir, - irq_affinity_hint_proc_show, irqp); + irq_affinity_hint_proc_show, irqp); /* create /proc/irq/<irq>/smp_affinity_list */ proc_create_data("smp_affinity_list", umode, desc->dir, &irq_affinity_list_proc_ops, irqp); - proc_create_single_data("node", 0444, desc->dir, irq_node_proc_show, - irqp); + proc_create_single_data("node", 0444, desc->dir, irq_node_proc_show, irqp); # ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK proc_create_single_data("effective_affinity", 0444, desc->dir, - irq_effective_aff_proc_show, irqp); + irq_effective_aff_proc_show, irqp); proc_create_single_data("effective_affinity_list", 0444, desc->dir, - irq_effective_aff_list_proc_show, irqp); + irq_effective_aff_list_proc_show, irqp); # endif #endif proc_create_single_data("spurious", 0444, desc->dir, - irq_spurious_proc_show, (void *)(long)irq); + irq_spurious_proc_show, (void *)(long)irq); -out_unlock: - mutex_unlock(&register_lock); } void unregister_irq_proc(unsigned int irq, struct irq_desc *desc) @@ -468,7 +455,6 @@ int show_interrupts(struct seq_file *p, void *v) int i = *(loff_t *) v, j; struct irqaction *action; struct irq_desc *desc; - unsigned long flags; if (i > ACTUAL_NR_IRQS) return 0; @@ -487,13 +473,13 @@ int show_interrupts(struct seq_file *p, void *v) 
seq_putc(p, '\n'); } - rcu_read_lock(); + guard(rcu)(); desc = irq_to_desc(i); if (!desc || irq_settings_is_hidden(desc)) - goto outsparse; + return 0; if (!desc->action || irq_desc_is_chained(desc) || !desc->kstat_irqs) - goto outsparse; + return 0; seq_printf(p, "%*d:", prec, i); for_each_online_cpu(j) { @@ -503,7 +489,7 @@ int show_interrupts(struct seq_file *p, void *v) } seq_putc(p, ' '); - raw_spin_lock_irqsave(&desc->lock, flags); + guard(raw_spinlock_irq)(&desc->lock); if (desc->irq_data.chip) { if (desc->irq_data.chip->irq_print_chip) desc->irq_data.chip->irq_print_chip(&desc->irq_data, p); @@ -532,9 +518,6 @@ int show_interrupts(struct seq_file *p, void *v) } seq_putc(p, '\n'); - raw_spin_unlock_irqrestore(&desc->lock, flags); -outsparse: - rcu_read_unlock(); return 0; } #endif diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index 1b7fa72968bd..ca9cc1b806a9 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -30,18 +30,17 @@ static DEFINE_RAW_SPINLOCK(irq_resend_lock); */ static void resend_irqs(struct tasklet_struct *unused) { - struct irq_desc *desc; - - raw_spin_lock_irq(&irq_resend_lock); + guard(raw_spinlock_irq)(&irq_resend_lock); while (!hlist_empty(&irq_resend_list)) { - desc = hlist_entry(irq_resend_list.first, struct irq_desc, - resend_node); + struct irq_desc *desc; + + desc = hlist_entry(irq_resend_list.first, struct irq_desc, resend_node); hlist_del_init(&desc->resend_node); + raw_spin_unlock(&irq_resend_lock); desc->handle_irq(desc); raw_spin_lock(&irq_resend_lock); } - raw_spin_unlock_irq(&irq_resend_lock); } /* Tasklet to handle resend: */ @@ -75,19 +74,18 @@ static int irq_sw_resend(struct irq_desc *desc) } /* Add to resend_list and activate the softirq: */ - raw_spin_lock(&irq_resend_lock); - if (hlist_unhashed(&desc->resend_node)) - hlist_add_head(&desc->resend_node, &irq_resend_list); - raw_spin_unlock(&irq_resend_lock); + scoped_guard(raw_spinlock, &irq_resend_lock) { + if (hlist_unhashed(&desc->resend_node)) + 
hlist_add_head(&desc->resend_node, &irq_resend_list); + } tasklet_schedule(&resend_tasklet); return 0; } void clear_irq_resend(struct irq_desc *desc) { - raw_spin_lock(&irq_resend_lock); + guard(raw_spinlock)(&irq_resend_lock); hlist_del_init(&desc->resend_node); - raw_spin_unlock(&irq_resend_lock); } void irq_resend_init(struct irq_desc *desc) @@ -172,30 +170,24 @@ int check_irq_resend(struct irq_desc *desc, bool inject) */ int irq_inject_interrupt(unsigned int irq) { - struct irq_desc *desc; - unsigned long flags; - int err; + int err = -EINVAL; /* Try the state injection hardware interface first */ if (!irq_set_irqchip_state(irq, IRQCHIP_STATE_PENDING, true)) return 0; /* That failed, try via the resend mechanism */ - desc = irq_get_desc_buslock(irq, &flags, 0); - if (!desc) - return -EINVAL; + scoped_irqdesc_get_and_buslock(irq, 0) { + struct irq_desc *desc = scoped_irqdesc; - /* - * Only try to inject when the interrupt is: - * - not NMI type - * - activated - */ - if (irq_is_nmi(desc) || !irqd_is_activated(&desc->irq_data)) - err = -EINVAL; - else - err = check_irq_resend(desc, true); - - irq_put_desc_busunlock(desc, flags); + /* + * Only try to inject when the interrupt is: + * - not NMI type + * - activated + */ + if (!irq_is_nmi(desc) && irqd_is_activated(&desc->irq_data)) + err = check_irq_resend(desc, true); + } return err; } EXPORT_SYMBOL_GPL(irq_inject_interrupt); diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 02b2daf07441..8f26982e7300 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -34,8 +34,9 @@ static atomic_t irq_poll_active; * true and let the handler run. 
*/ bool irq_wait_for_poll(struct irq_desc *desc) - __must_hold(&desc->lock) { + lockdep_assert_held(&desc->lock); + if (WARN_ONCE(irq_poll_cpu == smp_processor_id(), "irq poll in progress on cpu %d for irq %d\n", smp_processor_id(), desc->irq_data.irq)) @@ -59,37 +60,35 @@ bool irq_wait_for_poll(struct irq_desc *desc) /* * Recovery handler for misrouted interrupts. */ -static int try_one_irq(struct irq_desc *desc, bool force) +static bool try_one_irq(struct irq_desc *desc, bool force) { - irqreturn_t ret = IRQ_NONE; struct irqaction *action; + bool ret = false; - raw_spin_lock(&desc->lock); + guard(raw_spinlock)(&desc->lock); /* * PER_CPU, nested thread interrupts and interrupts explicitly * marked polled are excluded from polling. */ - if (irq_settings_is_per_cpu(desc) || - irq_settings_is_nested_thread(desc) || + if (irq_settings_is_per_cpu(desc) || irq_settings_is_nested_thread(desc) || irq_settings_is_polled(desc)) - goto out; + return false; /* * Do not poll disabled interrupts unless the spurious * disabled poller asks explicitly. */ if (irqd_irq_disabled(&desc->irq_data) && !force) - goto out; + return false; /* * All handlers must agree on IRQF_SHARED, so we test just the * first. 
*/ action = desc->action; - if (!action || !(action->flags & IRQF_SHARED) || - (action->flags & __IRQF_TIMER)) - goto out; + if (!action || !(action->flags & IRQF_SHARED) || (action->flags & __IRQF_TIMER)) + return false; /* Already running on another processor */ if (irqd_irq_inprogress(&desc->irq_data)) { @@ -98,21 +97,19 @@ static int try_one_irq(struct irq_desc *desc, bool force) * CPU to go looking for our mystery interrupt too */ desc->istate |= IRQS_PENDING; - goto out; + return false; } /* Mark it poll in progress */ desc->istate |= IRQS_POLL_INPROGRESS; do { if (handle_irq_event(desc) == IRQ_HANDLED) - ret = IRQ_HANDLED; + ret = true; /* Make sure that there is still a valid action */ action = desc->action; } while ((desc->istate & IRQS_PENDING) && action); desc->istate &= ~IRQS_POLL_INPROGRESS; -out: - raw_spin_unlock(&desc->lock); - return ret == IRQ_HANDLED; + return ret; } static int misrouted_irq(int irq) @@ -157,8 +154,7 @@ static void poll_spurious_irqs(struct timer_list *unused) continue; /* Racy but it doesn't matter */ - state = desc->istate; - barrier(); + state = READ_ONCE(desc->istate); if (!(state & IRQS_SPURIOUS_DISABLED)) continue; @@ -168,8 +164,7 @@ static void poll_spurious_irqs(struct timer_list *unused) } out: atomic_dec(&irq_poll_active); - mod_timer(&poll_spurious_irq_timer, - jiffies + POLL_SPURIOUS_IRQ_INTERVAL); + mod_timer(&poll_spurious_irq_timer, jiffies + POLL_SPURIOUS_IRQ_INTERVAL); } static inline int bad_action_ret(irqreturn_t action_ret) @@ -193,17 +188,13 @@ static void __report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret) { unsigned int irq = irq_desc_get_irq(desc); struct irqaction *action; - unsigned long flags; - if (bad_action_ret(action_ret)) { - printk(KERN_ERR "irq event %d: bogus return value %x\n", - irq, action_ret); - } else { - printk(KERN_ERR "irq %d: nobody cared (try booting with " - "the \"irqpoll\" option)\n", irq); - } + if (bad_action_ret(action_ret)) + pr_err("irq event %d: bogus return 
value %x\n", irq, action_ret); + else + pr_err("irq %d: nobody cared (try booting with the \"irqpoll\" option)\n", irq); dump_stack(); - printk(KERN_ERR "handlers:\n"); + pr_err("handlers:\n"); /* * We need to take desc->lock here. note_interrupt() is called @@ -211,15 +202,13 @@ static void __report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret) * with something else removing an action. It's ok to take * desc->lock here. See synchronize_irq(). */ - raw_spin_lock_irqsave(&desc->lock, flags); + guard(raw_spinlock_irqsave)(&desc->lock); for_each_action_of_desc(desc, action) { - printk(KERN_ERR "[<%p>] %ps", action->handler, action->handler); + pr_err("[<%p>] %ps", action->handler, action->handler); if (action->thread_fn) - printk(KERN_CONT " threaded [<%p>] %ps", - action->thread_fn, action->thread_fn); - printk(KERN_CONT "\n"); + pr_cont(" threaded [<%p>] %ps", action->thread_fn, action->thread_fn); + pr_cont("\n"); } - raw_spin_unlock_irqrestore(&desc->lock, flags); } static void report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret) @@ -232,18 +221,17 @@ static void report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret) } } -static inline int -try_misrouted_irq(unsigned int irq, struct irq_desc *desc, - irqreturn_t action_ret) +static inline bool try_misrouted_irq(unsigned int irq, struct irq_desc *desc, + irqreturn_t action_ret) { struct irqaction *action; if (!irqfixup) - return 0; + return false; /* We didn't actually handle the IRQ - see if it was misrouted? */ if (action_ret == IRQ_NONE) - return 1; + return true; /* * But for 'irqfixup == 2' we also do it for handled interrupts if @@ -251,19 +239,16 @@ try_misrouted_irq(unsigned int irq, struct irq_desc *desc, * traditional PC timer interrupt.. Legacy) */ if (irqfixup < 2) - return 0; + return false; if (!irq) - return 1; + return true; /* * Since we don't get the descriptor lock, "action" can - * change under us. We don't really care, but we don't - * want to follow a NULL pointer. 
So tell the compiler to - * just load it once by using a barrier. + * change under us. */ - action = desc->action; - barrier(); + action = READ_ONCE(desc->action); return action && (action->flags & IRQF_IRQPOLL); } @@ -273,8 +258,7 @@ void note_interrupt(struct irq_desc *desc, irqreturn_t action_ret) { unsigned int irq; - if (desc->istate & IRQS_POLL_INPROGRESS || - irq_settings_is_polled(desc)) + if (desc->istate & IRQS_POLL_INPROGRESS || irq_settings_is_polled(desc)) return; if (bad_action_ret(action_ret)) { @@ -420,13 +404,12 @@ void note_interrupt(struct irq_desc *desc, irqreturn_t action_ret) /* * Now kill the IRQ */ - printk(KERN_EMERG "Disabling IRQ #%d\n", irq); + pr_emerg("Disabling IRQ #%d\n", irq); desc->istate |= IRQS_SPURIOUS_DISABLED; desc->depth++; irq_disable(desc); - mod_timer(&poll_spurious_irq_timer, - jiffies + POLL_SPURIOUS_IRQ_INTERVAL); + mod_timer(&poll_spurious_irq_timer, jiffies + POLL_SPURIOUS_IRQ_INTERVAL); } desc->irqs_unhandled = 0; } @@ -436,11 +419,9 @@ bool noirqdebug __read_mostly; int noirqdebug_setup(char *str) { noirqdebug = 1; - printk(KERN_INFO "IRQ lockup detection disabled\n"); - + pr_info("IRQ lockup detection disabled\n"); return 1; } - __setup("noirqdebug", noirqdebug_setup); module_param(noirqdebug, bool, 0644); MODULE_PARM_DESC(noirqdebug, "Disable irq lockup detection when true"); @@ -452,12 +433,10 @@ static int __init irqfixup_setup(char *str) return 1; } irqfixup = 1; - printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n"); - printk(KERN_WARNING "This may impact system performance.\n"); - + pr_warn("Misrouted IRQ fixup support enabled.\n"); + pr_warn("This may impact system performance.\n"); return 1; } - __setup("irqfixup", irqfixup_setup); module_param(irqfixup, int, 0644); @@ -468,11 +447,8 @@ static int __init irqpoll_setup(char *str) return 1; } irqfixup = 2; - printk(KERN_WARNING "Misrouted IRQ fixup and polling support " - "enabled\n"); - printk(KERN_WARNING "This may significantly impact system " - 
"performance\n"); + pr_warn("Misrouted IRQ fixup and polling support enabled\n"); + pr_warn("This may significantly impact system performance\n"); return 1; } - __setup("irqpoll", irqpoll_setup); diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 93a822d3c468..7cb19e601426 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -653,13 +653,12 @@ static int __jump_label_mod_text_reserved(void *start, void *end) struct module *mod; int ret; - preempt_disable(); - mod = __module_text_address((unsigned long)start); - WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod); - if (!try_module_get(mod)) - mod = NULL; - preempt_enable(); - + scoped_guard(rcu) { + mod = __module_text_address((unsigned long)start); + WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod); + if (!try_module_get(mod)) + mod = NULL; + } if (!mod) return 0; @@ -746,9 +745,9 @@ static int jump_label_add_module(struct module *mod) kfree(jlm); return -ENOMEM; } - preempt_disable(); - jlm2->mod = __module_address((unsigned long)key); - preempt_enable(); + scoped_guard(rcu) + jlm2->mod = __module_address((unsigned long)key); + jlm2->entries = static_key_entries(key); jlm2->next = NULL; static_key_set_mod(key, jlm2); @@ -906,13 +905,13 @@ static void jump_label_update(struct static_key *key) return; } - preempt_disable(); - mod = __module_address((unsigned long)key); - if (mod) { - stop = mod->jump_entries + mod->num_jump_entries; - init = mod->state == MODULE_STATE_COMING; + scoped_guard(rcu) { + mod = __module_address((unsigned long)key); + if (mod) { + stop = mod->jump_entries + mod->num_jump_entries; + init = mod->state == MODULE_STATE_COMING; + } } - preempt_enable(); #endif entry = static_key_entries(key); /* if there are no users, entry can be NULL */ diff --git a/kernel/kcsan/kcsan_test.c b/kernel/kcsan/kcsan_test.c index 117d9d4d3c3b..c2871180edcc 100644 --- a/kernel/kcsan/kcsan_test.c +++ b/kernel/kcsan/kcsan_test.c @@ -1500,8 +1500,8 @@ static int 
access_thread(void *arg) func(); } } while (!torture_must_stop()); - del_timer_sync(&timer); - destroy_timer_on_stack(&timer); + timer_delete_sync(&timer); + timer_destroy_on_stack(&timer); torture_kthread_stopping("access_thread"); return 0; diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index c0bdc1686154..9c59fa480b0b 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -210,6 +210,16 @@ int sanity_check_segment_list(struct kimage *image) } #endif + /* + * The destination addresses are searched from system RAM rather than + * being allocated from the buddy allocator, so they are not guaranteed + * to be accepted by the current kernel. Accept the destination + * addresses before kexec swaps their content with the segments' source + * pages to avoid accessing memory before it is accepted. + */ + for (i = 0; i < nr_segments; i++) + accept_memory(image->segment[i].mem, image->segment[i].memsz); + return 0; } @@ -867,6 +877,60 @@ int kimage_load_segment(struct kimage *image, return result; } +void *kimage_map_segment(struct kimage *image, + unsigned long addr, unsigned long size) +{ + unsigned long src_page_addr, dest_page_addr = 0; + unsigned long eaddr = addr + size; + kimage_entry_t *ptr, entry; + struct page **src_pages; + unsigned int npages; + void *vaddr = NULL; + int i; + + /* + * Collect the source pages and map them in a contiguous VA range. 
+ */ + npages = PFN_UP(eaddr) - PFN_DOWN(addr); + src_pages = kmalloc_array(npages, sizeof(*src_pages), GFP_KERNEL); + if (!src_pages) { + pr_err("Could not allocate ima pages array.\n"); + return NULL; + } + + i = 0; + for_each_kimage_entry(image, ptr, entry) { + if (entry & IND_DESTINATION) { + dest_page_addr = entry & PAGE_MASK; + } else if (entry & IND_SOURCE) { + if (dest_page_addr >= addr && dest_page_addr < eaddr) { + src_page_addr = entry & PAGE_MASK; + src_pages[i++] = + virt_to_page(__va(src_page_addr)); + if (i == npages) + break; + dest_page_addr += PAGE_SIZE; + } + } + } + + /* Sanity check. */ + WARN_ON(i < npages); + + vaddr = vmap(src_pages, npages, VM_MAP, PAGE_KERNEL); + kfree(src_pages); + + if (!vaddr) + pr_err("Could not map ima buffer.\n"); + + return vaddr; +} + +void kimage_unmap_segment(void *segment_buffer) +{ + vunmap(segment_buffer); +} + struct kexec_load_limit { /* Mutex protects the limit count. */ struct mutex mutex; @@ -1013,7 +1077,7 @@ int kernel_kexec(void) error = -EBUSY; goto Restore_console; } - suspend_console(); + console_suspend_all(); error = dpm_suspend_start(PMSG_FREEZE); if (error) goto Resume_console; @@ -1072,7 +1136,7 @@ int kernel_kexec(void) Resume_devices: dpm_resume_end(PMSG_RESTORE); Resume_console: - resume_console(); + console_resume_all(); thaw_processes(); Restore_console: pm_restore_console(); diff --git a/kernel/kexec_elf.c b/kernel/kexec_elf.c index d3689632e8b9..3a5c25b2adc9 100644 --- a/kernel/kexec_elf.c +++ b/kernel/kexec_elf.c @@ -390,7 +390,7 @@ int kexec_elf_load(struct kimage *image, struct elfhdr *ehdr, struct kexec_buf *kbuf, unsigned long *lowest_load_addr) { - unsigned long lowest_addr = UINT_MAX; + unsigned long lowest_addr = ULONG_MAX; int ret; size_t i; diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 3eedb8c226ad..0adb645072aa 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -38,6 +38,21 @@ void set_kexec_sig_enforced(void) } #endif +#ifdef CONFIG_IMA_KEXEC +static 
bool check_ima_segment_index(struct kimage *image, int i) +{ + if (image->is_ima_segment_index_set && i == image->ima_segment_index) + return true; + else + return false; +} +#else +static bool check_ima_segment_index(struct kimage *image, int i) +{ + return false; +} +#endif + static int kexec_calculate_store_digests(struct kimage *image); /* Maximum size in bytes for kernel/initrd files. */ @@ -186,6 +201,15 @@ kimage_validate_signature(struct kimage *image) } #endif +static int kexec_post_load(struct kimage *image, unsigned long flags) +{ +#ifdef CONFIG_IMA_KEXEC + if (!(flags & KEXEC_FILE_ON_CRASH)) + ima_kexec_post_load(image); +#endif + return machine_kexec_post_load(image); +} + /* * In file mode list of segments is prepared by kernel. Copy relevant * data from user space, do error checking, prepare segment list @@ -413,7 +437,7 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, kimage_terminate(image); - ret = machine_kexec_post_load(image); + ret = kexec_post_load(image, flags); if (ret) goto out; @@ -464,6 +488,12 @@ static int locate_mem_hole_top_down(unsigned long start, unsigned long end, continue; } + /* Make sure this does not conflict with exclude range */ + if (arch_check_excluded_range(image, temp_start, temp_end)) { + temp_start = temp_start - PAGE_SIZE; + continue; + } + /* We found a suitable memory range */ break; } while (1); @@ -498,6 +528,12 @@ static int locate_mem_hole_bottom_up(unsigned long start, unsigned long end, continue; } + /* Make sure this does not conflict with exclude range */ + if (arch_check_excluded_range(image, temp_start, temp_end)) { + temp_start = temp_start + PAGE_SIZE; + continue; + } + /* We found a suitable memory range */ break; } while (1); @@ -764,6 +800,13 @@ static int kexec_calculate_store_digests(struct kimage *image) if (ksegment->kbuf == pi->purgatory_buf) continue; + /* + * Skip the segment if ima_segment_index is set and matches + * the current index + */ + if 
(check_ima_segment_index(image, i)) + continue; + ret = crypto_shash_update(desc, ksegment->kbuf, ksegment->bufsz); if (ret) diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 88aeac84e4c0..ffe0c3d52306 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1547,7 +1547,7 @@ static int check_kprobe_address_safe(struct kprobe *p, /* Ensure the address is in a text area, and find a module if exists. */ *probed_mod = NULL; if (!core_kernel_text((unsigned long) p->addr)) { - guard(preempt)(); + guard(rcu)(); *probed_mod = __module_text_address((unsigned long) p->addr); if (!(*probed_mod)) return -EINVAL; diff --git a/kernel/kthread.c b/kernel/kthread.c index 5dc5b0d7238e..77c44924cf54 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -1362,14 +1362,14 @@ static void kthread_cancel_delayed_work_timer(struct kthread_work *work, struct kthread_worker *worker = work->worker; /* - * del_timer_sync() must be called to make sure that the timer + * timer_delete_sync() must be called to make sure that the timer * callback is not running. The lock must be temporary released * to avoid a deadlock with the callback. In the meantime, * any queuing is blocked by setting the canceling counter. */ work->canceling++; raw_spin_unlock_irqrestore(&worker->lock, *flags); - del_timer_sync(&dwork->timer); + timer_delete_sync(&dwork->timer); raw_spin_lock_irqsave(&worker->lock, *flags); work->canceling--; } diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 0cd39954d5a1..0e73fac55f8e 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -59,7 +59,7 @@ static void klp_find_object_module(struct klp_object *obj) if (!klp_is_module(obj)) return; - rcu_read_lock_sched(); + guard(rcu)(); /* * We do not want to block removal of patched modules and therefore * we do not take a reference here. 
The patches are removed by @@ -75,8 +75,6 @@ static void klp_find_object_module(struct klp_object *obj) */ if (mod && mod->klp_alive) obj->mod = mod; - - rcu_read_unlock_sched(); } static bool klp_initialized(void) @@ -601,9 +599,12 @@ static int klp_add_object_nops(struct klp_patch *patch, } /* - * Add 'nop' functions which simply return to the caller to run - * the original function. The 'nop' functions are added to a - * patch to facilitate a 'replace' mode. + * Add 'nop' functions which simply return to the caller to run the + * original function. + * + * They are added only when the atomic replace mode is used and only for + * functions which are currently livepatched but are no longer included + * in the new livepatch. */ static int klp_add_nops(struct klp_patch *patch) { diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c index ba069459c101..2351a19ac2a9 100644 --- a/kernel/livepatch/transition.c +++ b/kernel/livepatch/transition.c @@ -29,22 +29,13 @@ static unsigned int klp_signals_cnt; /* * When a livepatch is in progress, enable klp stack checking in - * cond_resched(). This helps CPU-bound kthreads get patched. + * schedule(). This helps CPU-bound kthreads get patched. 
*/ -#if defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) - -#define klp_cond_resched_enable() sched_dynamic_klp_enable() -#define klp_cond_resched_disable() sched_dynamic_klp_disable() - -#else /* !CONFIG_PREEMPT_DYNAMIC || !CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */ DEFINE_STATIC_KEY_FALSE(klp_sched_try_switch_key); -EXPORT_SYMBOL(klp_sched_try_switch_key); -#define klp_cond_resched_enable() static_branch_enable(&klp_sched_try_switch_key) -#define klp_cond_resched_disable() static_branch_disable(&klp_sched_try_switch_key) - -#endif /* CONFIG_PREEMPT_DYNAMIC && CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */ +#define klp_resched_enable() static_branch_enable(&klp_sched_try_switch_key) +#define klp_resched_disable() static_branch_disable(&klp_sched_try_switch_key) /* * This work can be performed periodically to finish patching or unpatching any @@ -365,26 +356,18 @@ static bool klp_try_switch_task(struct task_struct *task) void __klp_sched_try_switch(void) { - if (likely(!klp_patch_pending(current))) - return; - /* - * This function is called from cond_resched() which is called in many - * places throughout the kernel. Using the klp_mutex here might - * deadlock. - * - * Instead, disable preemption to prevent racing with other callers of - * klp_try_switch_task(). Thanks to task_call_func() they won't be - * able to switch this task while it's running. + * This function is called from __schedule() while a context switch is + * about to happen. Preemption is already disabled and klp_mutex + * can't be acquired. + * Disabled preemption is used to prevent racing with other callers of + * klp_try_switch_task(). Thanks to task_call_func() they won't be + * able to switch to this task while it's running. */ - preempt_disable(); + lockdep_assert_preemption_disabled(); - /* - * Make sure current didn't get patched between the above check and - * preempt_disable(). 
- */ - if (unlikely(!klp_patch_pending(current))) - goto out; + if (likely(!klp_patch_pending(current))) + return; /* * Enforce the order of the TIF_PATCH_PENDING read above and the @@ -395,11 +378,7 @@ void __klp_sched_try_switch(void) smp_rmb(); klp_try_switch_task(current); - -out: - preempt_enable(); } -EXPORT_SYMBOL(__klp_sched_try_switch); /* * Sends a fake signal to all non-kthread tasks with TIF_PATCH_PENDING set. @@ -508,7 +487,7 @@ void klp_try_complete_transition(void) } /* Done! Now cleanup the data structures. */ - klp_cond_resched_disable(); + klp_resched_disable(); patch = klp_transition_patch; klp_complete_transition(); @@ -560,7 +539,7 @@ void klp_start_transition(void) set_tsk_thread_flag(task, TIF_PATCH_PENDING); } - klp_cond_resched_enable(); + klp_resched_enable(); klp_signals_cnt = 0; } diff --git a/kernel/locking/lock_events_list.h b/kernel/locking/lock_events_list.h index 9ef9850aeebe..4e36258cc34f 100644 --- a/kernel/locking/lock_events_list.h +++ b/kernel/locking/lock_events_list.h @@ -50,6 +50,11 @@ LOCK_EVENT(lock_no_node) /* # of locking ops w/o using percpu node */ #endif /* CONFIG_QUEUED_SPINLOCKS */ /* + * Locking events for Resilient Queued Spin Lock + */ +LOCK_EVENT(rqspinlock_lock_timeout) /* # of locking ops that timeout */ + +/* * Locking events for rwsem */ LOCK_EVENT(rwsem_sleep_reader) /* # of reader sleeps */ diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index b15757e63626..dd2bbf73718b 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -219,6 +219,7 @@ static DECLARE_BITMAP(list_entries_in_use, MAX_LOCKDEP_ENTRIES); static struct hlist_head lock_keys_hash[KEYHASH_SIZE]; unsigned long nr_lock_classes; unsigned long nr_zapped_classes; +unsigned long nr_dynamic_keys; unsigned long max_lock_class_idx; struct lock_class lock_classes[MAX_LOCKDEP_KEYS]; DECLARE_BITMAP(lock_classes_in_use, MAX_LOCKDEP_KEYS); @@ -1238,6 +1239,7 @@ void lockdep_register_key(struct lock_class_key *key) goto 
out_unlock; } hlist_add_head_rcu(&key->hash_entry, hash_head); + nr_dynamic_keys++; out_unlock: graph_unlock(); restore_irqs: @@ -1977,41 +1979,6 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth, } /* - * We are about to add A -> B into the dependency graph, and in __bfs() a - * strong dependency path A -> .. -> B is found: hlock_class equals - * entry->class. - * - * If A -> .. -> B can replace A -> B in any __bfs() search (means the former - * is _stronger_ than or equal to the latter), we consider A -> B as redundant. - * For example if A -> .. -> B is -(EN)-> (i.e. A -(E*)-> .. -(*N)-> B), and A - * -> B is -(ER)-> or -(EN)->, then we don't need to add A -> B into the - * dependency graph, as any strong path ..-> A -> B ->.. we can get with - * having dependency A -> B, we could already get a equivalent path ..-> A -> - * .. -> B -> .. with A -> .. -> B. Therefore A -> B is redundant. - * - * We need to make sure both the start and the end of A -> .. -> B is not - * weaker than A -> B. For the start part, please see the comment in - * check_redundant(). For the end part, we need: - * - * Either - * - * a) A -> B is -(*R)-> (everything is not weaker than that) - * - * or - * - * b) A -> .. -> B is -(*N)-> (nothing is stronger than this) - * - */ -static inline bool hlock_equal(struct lock_list *entry, void *data) -{ - struct held_lock *hlock = (struct held_lock *)data; - - return hlock_class(hlock) == entry->class && /* Found A -> .. -> B */ - (hlock->read == 2 || /* A -> B is -(*R)-> */ - !entry->only_xr); /* A -> .. -> B is -(*N)-> */ -} - -/* * We are about to add B -> A into the dependency graph, and in __bfs() a * strong dependency path A -> .. -> B is found: hlock_class equals * entry->class. @@ -2916,6 +2883,41 @@ static inline bool usage_skip(struct lock_list *entry, void *mask) #ifdef CONFIG_LOCKDEP_SMALL /* + * We are about to add A -> B into the dependency graph, and in __bfs() a + * strong dependency path A -> .. 
-> B is found: hlock_class equals + * entry->class. + * + * If A -> .. -> B can replace A -> B in any __bfs() search (means the former + * is _stronger_ than or equal to the latter), we consider A -> B as redundant. + * For example if A -> .. -> B is -(EN)-> (i.e. A -(E*)-> .. -(*N)-> B), and A + * -> B is -(ER)-> or -(EN)->, then we don't need to add A -> B into the + * dependency graph, as any strong path ..-> A -> B ->.. we can get with + * having dependency A -> B, we could already get a equivalent path ..-> A -> + * .. -> B -> .. with A -> .. -> B. Therefore A -> B is redundant. + * + * We need to make sure both the start and the end of A -> .. -> B is not + * weaker than A -> B. For the start part, please see the comment in + * check_redundant(). For the end part, we need: + * + * Either + * + * a) A -> B is -(*R)-> (everything is not weaker than that) + * + * or + * + * b) A -> .. -> B is -(*N)-> (nothing is stronger than this) + * + */ +static inline bool hlock_equal(struct lock_list *entry, void *data) +{ + struct held_lock *hlock = (struct held_lock *)data; + + return hlock_class(hlock) == entry->class && /* Found A -> .. -> B */ + (hlock->read == 2 || /* A -> B is -(*R)-> */ + !entry->only_xr); /* A -> .. -> B is -(*N)-> */ +} + +/* * Check that the dependency graph starting at <src> can lead to * <target> or not. If it can, <src> -> <target> dependency is already * in the graph. 
@@ -5101,6 +5103,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, lockevent_inc(lockdep_nocheck); } + if (DEBUG_LOCKS_WARN_ON(subclass >= MAX_LOCKDEP_SUBCLASSES)) + return 0; + if (subclass < NR_LOCKDEP_CACHING_CLASSES) class = lock->class_cache[subclass]; /* @@ -6264,6 +6269,9 @@ static void zap_class(struct pending_free *pf, struct lock_class *class) hlist_del_rcu(&class->hash_entry); WRITE_ONCE(class->key, NULL); WRITE_ONCE(class->name, NULL); + /* Class allocated but not used, -1 in nr_unused_locks */ + if (class->usage_mask == 0) + debug_atomic_dec(nr_unused_locks); nr_lock_classes--; __clear_bit(class - lock_classes, lock_classes_in_use); if (class - lock_classes == max_lock_class_idx) @@ -6603,6 +6611,7 @@ void lockdep_unregister_key(struct lock_class_key *key) pf = get_pending_free(); __lockdep_free_key_range(pf, key, 1); need_callback = prepare_call_rcu_zapped(pf); + nr_dynamic_keys--; } lockdep_unlock(); raw_local_irq_restore(flags); diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index 20f9ef58d3d0..82156caf77d1 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h @@ -138,6 +138,7 @@ extern unsigned long nr_lock_classes; extern unsigned long nr_zapped_classes; extern unsigned long nr_zapped_lock_chains; extern unsigned long nr_list_entries; +extern unsigned long nr_dynamic_keys; long lockdep_next_lockchain(long i); unsigned long lock_chain_count(void); extern unsigned long nr_stack_trace_entries; diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c index 6db0f43fc4df..b52c07c4707c 100644 --- a/kernel/locking/lockdep_proc.c +++ b/kernel/locking/lockdep_proc.c @@ -286,6 +286,8 @@ static int lockdep_stats_show(struct seq_file *m, void *v) #endif seq_printf(m, " lock-classes: %11lu [max: %lu]\n", nr_lock_classes, MAX_LOCKDEP_KEYS); + seq_printf(m, " dynamic-keys: %11lu\n", + nr_dynamic_keys); seq_printf(m, " direct dependencies: %11lu 
[max: %lu]\n", nr_list_entries, MAX_LOCKDEP_ENTRIES); seq_printf(m, " indirect dependencies: %11lu\n", diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index cc33470f4de9..ce0362f0a871 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -362,6 +362,60 @@ static struct lock_torture_ops raw_spin_lock_irq_ops = { .name = "raw_spin_lock_irq" }; +#ifdef CONFIG_BPF_SYSCALL + +#include <asm/rqspinlock.h> +static rqspinlock_t rqspinlock; + +static int torture_raw_res_spin_write_lock(int tid __maybe_unused) +{ + raw_res_spin_lock(&rqspinlock); + return 0; +} + +static void torture_raw_res_spin_write_unlock(int tid __maybe_unused) +{ + raw_res_spin_unlock(&rqspinlock); +} + +static struct lock_torture_ops raw_res_spin_lock_ops = { + .writelock = torture_raw_res_spin_write_lock, + .write_delay = torture_spin_lock_write_delay, + .task_boost = torture_rt_boost, + .writeunlock = torture_raw_res_spin_write_unlock, + .readlock = NULL, + .read_delay = NULL, + .readunlock = NULL, + .name = "raw_res_spin_lock" +}; + +static int torture_raw_res_spin_write_lock_irq(int tid __maybe_unused) +{ + unsigned long flags; + + raw_res_spin_lock_irqsave(&rqspinlock, flags); + cxt.cur_ops->flags = flags; + return 0; +} + +static void torture_raw_res_spin_write_unlock_irq(int tid __maybe_unused) +{ + raw_res_spin_unlock_irqrestore(&rqspinlock, cxt.cur_ops->flags); +} + +static struct lock_torture_ops raw_res_spin_lock_irq_ops = { + .writelock = torture_raw_res_spin_write_lock_irq, + .write_delay = torture_spin_lock_write_delay, + .task_boost = torture_rt_boost, + .writeunlock = torture_raw_res_spin_write_unlock_irq, + .readlock = NULL, + .read_delay = NULL, + .readunlock = NULL, + .name = "raw_res_spin_lock_irq" +}; + +#endif + static DEFINE_RWLOCK(torture_rwlock); static int torture_rwlock_write_lock(int tid __maybe_unused) @@ -1168,6 +1222,9 @@ static int __init lock_torture_init(void) &lock_busted_ops, &spin_lock_ops, &spin_lock_irq_ops, 
&raw_spin_lock_ops, &raw_spin_lock_irq_ops, +#ifdef CONFIG_BPF_SYSCALL + &raw_res_spin_lock_ops, &raw_res_spin_lock_irq_ops, +#endif &rw_lock_ops, &rw_lock_irq_ops, &mutex_lock_ops, &ww_mutex_lock_ops, diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h index 85251d8771d9..5c92ba199b90 100644 --- a/kernel/locking/mcs_spinlock.h +++ b/kernel/locking/mcs_spinlock.h @@ -15,12 +15,6 @@ #include <asm/mcs_spinlock.h> -struct mcs_spinlock { - struct mcs_spinlock *next; - int locked; /* 1 if lock acquired */ - int count; /* nesting count, see qspinlock.c */ -}; - #ifndef arch_mcs_spin_lock_contended /* * Using smp_cond_load_acquire() provides the acquire semantics @@ -30,9 +24,7 @@ struct mcs_spinlock { * spinning, and smp_cond_load_acquire() provides that behavior. */ #define arch_mcs_spin_lock_contended(l) \ -do { \ - smp_cond_load_acquire(l, VAL); \ -} while (0) + smp_cond_load_acquire(l, VAL) #endif #ifndef arch_mcs_spin_unlock_contended diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 19b636f60a24..555e2b3a665a 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -72,6 +72,14 @@ static inline unsigned long __owner_flags(unsigned long owner) return owner & MUTEX_FLAGS; } +/* Do not use the return value as a pointer directly. */ +unsigned long mutex_get_owner(struct mutex *lock) +{ + unsigned long owner = atomic_long_read(&lock->owner); + + return (unsigned long)__owner_task(owner); +} + /* * Returns: __mutex_owner(lock) on failure or NULL on success. 
*/ @@ -182,6 +190,9 @@ static void __mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, struct list_head *list) { +#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER + WRITE_ONCE(current->blocker_mutex, lock); +#endif debug_mutex_add_waiter(lock, waiter, current); list_add_tail(&waiter->list, list); @@ -197,6 +208,9 @@ __mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter) __mutex_clear_flag(lock, MUTEX_FLAGS); debug_mutex_remove_waiter(lock, waiter, current); +#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER + WRITE_ONCE(current->blocker_mutex, NULL); +#endif } /* diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index 6083883c4fe0..ef234469baac 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -138,7 +138,8 @@ static int percpu_rwsem_wake_function(struct wait_queue_entry *wq_entry, return !reader; /* wake (readers until) 1 writer */ } -static void percpu_rwsem_wait(struct percpu_rw_semaphore *sem, bool reader) +static void percpu_rwsem_wait(struct percpu_rw_semaphore *sem, bool reader, + bool freeze) { DEFINE_WAIT_FUNC(wq_entry, percpu_rwsem_wake_function); bool wait; @@ -156,7 +157,8 @@ static void percpu_rwsem_wait(struct percpu_rw_semaphore *sem, bool reader) spin_unlock_irq(&sem->waiters.lock); while (wait) { - set_current_state(TASK_UNINTERRUPTIBLE); + set_current_state(TASK_UNINTERRUPTIBLE | + (freeze ? 
TASK_FREEZABLE : 0)); if (!smp_load_acquire(&wq_entry.private)) break; schedule(); @@ -164,7 +166,8 @@ static void percpu_rwsem_wait(struct percpu_rw_semaphore *sem, bool reader) __set_current_state(TASK_RUNNING); } -bool __sched __percpu_down_read(struct percpu_rw_semaphore *sem, bool try) +bool __sched __percpu_down_read(struct percpu_rw_semaphore *sem, bool try, + bool freeze) { if (__percpu_down_read_trylock(sem)) return true; @@ -174,7 +177,7 @@ bool __sched __percpu_down_read(struct percpu_rw_semaphore *sem, bool try) trace_contention_begin(sem, LCB_F_PERCPU | LCB_F_READ); preempt_enable(); - percpu_rwsem_wait(sem, /* .reader = */ true); + percpu_rwsem_wait(sem, /* .reader = */ true, freeze); preempt_disable(); trace_contention_end(sem, 0); @@ -184,7 +187,7 @@ EXPORT_SYMBOL_GPL(__percpu_down_read); #define per_cpu_sum(var) \ ({ \ - typeof(var) __sum = 0; \ + TYPEOF_UNQUAL(var) __sum = 0; \ int cpu; \ compiletime_assert_atomic_type(__sum); \ for_each_possible_cpu(cpu) \ @@ -237,7 +240,7 @@ void __sched percpu_down_write(struct percpu_rw_semaphore *sem) */ if (!__percpu_down_write_trylock(sem)) { trace_contention_begin(sem, LCB_F_PERCPU | LCB_F_WRITE); - percpu_rwsem_wait(sem, /* .reader = */ false); + percpu_rwsem_wait(sem, /* .reader = */ false, false); contended = true; } diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 7d96bed718e4..af8d122bb649 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -25,8 +25,9 @@ #include <trace/events/lock.h> /* - * Include queued spinlock statistics code + * Include queued spinlock definitions and statistics code */ +#include "qspinlock.h" #include "qspinlock_stat.h" /* @@ -67,36 +68,6 @@ */ #include "mcs_spinlock.h" -#define MAX_NODES 4 - -/* - * On 64-bit architectures, the mcs_spinlock structure will be 16 bytes in - * size and four of them will fit nicely in one 64-byte cacheline. For - * pvqspinlock, however, we need more space for extra data. 
To accommodate - * that, we insert two more long words to pad it up to 32 bytes. IOW, only - * two of them can fit in a cacheline in this case. That is OK as it is rare - * to have more than 2 levels of slowpath nesting in actual use. We don't - * want to penalize pvqspinlocks to optimize for a rare case in native - * qspinlocks. - */ -struct qnode { - struct mcs_spinlock mcs; -#ifdef CONFIG_PARAVIRT_SPINLOCKS - long reserved[2]; -#endif -}; - -/* - * The pending bit spinning loop count. - * This heuristic is used to limit the number of lockword accesses - * made by atomic_cond_read_relaxed when waiting for the lock to - * transition out of the "== _Q_PENDING_VAL" state. We don't spin - * indefinitely because there's no guarantee that we'll make forward - * progress. - */ -#ifndef _Q_PENDING_LOOPS -#define _Q_PENDING_LOOPS 1 -#endif /* * Per-CPU queue node structures; we can never have more than 4 nested @@ -106,161 +77,7 @@ struct qnode { * * PV doubles the storage and uses the second cacheline for PV state. */ -static DEFINE_PER_CPU_ALIGNED(struct qnode, qnodes[MAX_NODES]); - -/* - * We must be able to distinguish between no-tail and the tail at 0:0, - * therefore increment the cpu number by one. - */ - -static inline __pure u32 encode_tail(int cpu, int idx) -{ - u32 tail; - - tail = (cpu + 1) << _Q_TAIL_CPU_OFFSET; - tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */ - - return tail; -} - -static inline __pure struct mcs_spinlock *decode_tail(u32 tail) -{ - int cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1; - int idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET; - - return per_cpu_ptr(&qnodes[idx].mcs, cpu); -} - -static inline __pure -struct mcs_spinlock *grab_mcs_node(struct mcs_spinlock *base, int idx) -{ - return &((struct qnode *)base + idx)->mcs; -} - -#define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK) - -#if _Q_PENDING_BITS == 8 -/** - * clear_pending - clear the pending bit. 
- * @lock: Pointer to queued spinlock structure - * - * *,1,* -> *,0,* - */ -static __always_inline void clear_pending(struct qspinlock *lock) -{ - WRITE_ONCE(lock->pending, 0); -} - -/** - * clear_pending_set_locked - take ownership and clear the pending bit. - * @lock: Pointer to queued spinlock structure - * - * *,1,0 -> *,0,1 - * - * Lock stealing is not allowed if this function is used. - */ -static __always_inline void clear_pending_set_locked(struct qspinlock *lock) -{ - WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL); -} - -/* - * xchg_tail - Put in the new queue tail code word & retrieve previous one - * @lock : Pointer to queued spinlock structure - * @tail : The new queue tail code word - * Return: The previous queue tail code word - * - * xchg(lock, tail), which heads an address dependency - * - * p,*,* -> n,*,* ; prev = xchg(lock, node) - */ -static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) -{ - /* - * We can use relaxed semantics since the caller ensures that the - * MCS node is properly initialized before updating the tail. - */ - return (u32)xchg_relaxed(&lock->tail, - tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET; -} - -#else /* _Q_PENDING_BITS == 8 */ - -/** - * clear_pending - clear the pending bit. - * @lock: Pointer to queued spinlock structure - * - * *,1,* -> *,0,* - */ -static __always_inline void clear_pending(struct qspinlock *lock) -{ - atomic_andnot(_Q_PENDING_VAL, &lock->val); -} - -/** - * clear_pending_set_locked - take ownership and clear the pending bit. 
- * @lock: Pointer to queued spinlock structure - * - * *,1,0 -> *,0,1 - */ -static __always_inline void clear_pending_set_locked(struct qspinlock *lock) -{ - atomic_add(-_Q_PENDING_VAL + _Q_LOCKED_VAL, &lock->val); -} - -/** - * xchg_tail - Put in the new queue tail code word & retrieve previous one - * @lock : Pointer to queued spinlock structure - * @tail : The new queue tail code word - * Return: The previous queue tail code word - * - * xchg(lock, tail) - * - * p,*,* -> n,*,* ; prev = xchg(lock, node) - */ -static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) -{ - u32 old, new; - - old = atomic_read(&lock->val); - do { - new = (old & _Q_LOCKED_PENDING_MASK) | tail; - /* - * We can use relaxed semantics since the caller ensures that - * the MCS node is properly initialized before updating the - * tail. - */ - } while (!atomic_try_cmpxchg_relaxed(&lock->val, &old, new)); - - return old; -} -#endif /* _Q_PENDING_BITS == 8 */ - -/** - * queued_fetch_set_pending_acquire - fetch the whole lock value and set pending - * @lock : Pointer to queued spinlock structure - * Return: The previous lock value - * - * *,*,* -> *,1,* - */ -#ifndef queued_fetch_set_pending_acquire -static __always_inline u32 queued_fetch_set_pending_acquire(struct qspinlock *lock) -{ - return atomic_fetch_or_acquire(_Q_PENDING_VAL, &lock->val); -} -#endif - -/** - * set_locked - Set the lock bit and own the lock - * @lock: Pointer to queued spinlock structure - * - * *,*,0 -> *,0,1 - */ -static __always_inline void set_locked(struct qspinlock *lock) -{ - WRITE_ONCE(lock->locked, _Q_LOCKED_VAL); -} - +static DEFINE_PER_CPU_ALIGNED(struct qnode, qnodes[_Q_MAX_NODES]); /* * Generate the native code for queued_spin_unlock_slowpath(); provide NOPs for @@ -410,7 +227,7 @@ pv_queue: * any MCS node. This is not the most elegant solution, but is * simple enough. 
*/ - if (unlikely(idx >= MAX_NODES)) { + if (unlikely(idx >= _Q_MAX_NODES)) { lockevent_inc(lock_no_node); while (!queued_spin_trylock(lock)) cpu_relax(); @@ -465,7 +282,7 @@ pv_queue: * head of the waitqueue. */ if (old & _Q_TAIL_MASK) { - prev = decode_tail(old); + prev = decode_tail(old, qnodes); /* Link @node into the waitqueue. */ WRITE_ONCE(prev->next, node); diff --git a/kernel/locking/qspinlock.h b/kernel/locking/qspinlock.h new file mode 100644 index 000000000000..d69958a844f7 --- /dev/null +++ b/kernel/locking/qspinlock.h @@ -0,0 +1,201 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Queued spinlock defines + * + * This file contains macro definitions and functions shared between different + * qspinlock slow path implementations. + */ +#ifndef __LINUX_QSPINLOCK_H +#define __LINUX_QSPINLOCK_H + +#include <asm-generic/percpu.h> +#include <linux/percpu-defs.h> +#include <asm-generic/qspinlock.h> +#include <asm-generic/mcs_spinlock.h> + +#define _Q_MAX_NODES 4 + +/* + * The pending bit spinning loop count. + * This heuristic is used to limit the number of lockword accesses + * made by atomic_cond_read_relaxed when waiting for the lock to + * transition out of the "== _Q_PENDING_VAL" state. We don't spin + * indefinitely because there's no guarantee that we'll make forward + * progress. + */ +#ifndef _Q_PENDING_LOOPS +#define _Q_PENDING_LOOPS 1 +#endif + +/* + * On 64-bit architectures, the mcs_spinlock structure will be 16 bytes in + * size and four of them will fit nicely in one 64-byte cacheline. For + * pvqspinlock, however, we need more space for extra data. To accommodate + * that, we insert two more long words to pad it up to 32 bytes. IOW, only + * two of them can fit in a cacheline in this case. That is OK as it is rare + * to have more than 2 levels of slowpath nesting in actual use. We don't + * want to penalize pvqspinlocks to optimize for a rare case in native + * qspinlocks. 
+ */ +struct qnode { + struct mcs_spinlock mcs; +#ifdef CONFIG_PARAVIRT_SPINLOCKS + long reserved[2]; +#endif +}; + +/* + * We must be able to distinguish between no-tail and the tail at 0:0, + * therefore increment the cpu number by one. + */ + +static inline __pure u32 encode_tail(int cpu, int idx) +{ + u32 tail; + + tail = (cpu + 1) << _Q_TAIL_CPU_OFFSET; + tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */ + + return tail; +} + +static inline __pure struct mcs_spinlock *decode_tail(u32 tail, + struct qnode __percpu *qnodes) +{ + int cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1; + int idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET; + + return per_cpu_ptr(&qnodes[idx].mcs, cpu); +} + +static inline __pure +struct mcs_spinlock *grab_mcs_node(struct mcs_spinlock *base, int idx) +{ + return &((struct qnode *)base + idx)->mcs; +} + +#define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK) + +#if _Q_PENDING_BITS == 8 +/** + * clear_pending - clear the pending bit. + * @lock: Pointer to queued spinlock structure + * + * *,1,* -> *,0,* + */ +static __always_inline void clear_pending(struct qspinlock *lock) +{ + WRITE_ONCE(lock->pending, 0); +} + +/** + * clear_pending_set_locked - take ownership and clear the pending bit. + * @lock: Pointer to queued spinlock structure + * + * *,1,0 -> *,0,1 + * + * Lock stealing is not allowed if this function is used. 
+ */ +static __always_inline void clear_pending_set_locked(struct qspinlock *lock) +{ + WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL); +} + +/* + * xchg_tail - Put in the new queue tail code word & retrieve previous one + * @lock : Pointer to queued spinlock structure + * @tail : The new queue tail code word + * Return: The previous queue tail code word + * + * xchg(lock, tail), which heads an address dependency + * + * p,*,* -> n,*,* ; prev = xchg(lock, node) + */ +static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) +{ + /* + * We can use relaxed semantics since the caller ensures that the + * MCS node is properly initialized before updating the tail. + */ + return (u32)xchg_relaxed(&lock->tail, + tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET; +} + +#else /* _Q_PENDING_BITS == 8 */ + +/** + * clear_pending - clear the pending bit. + * @lock: Pointer to queued spinlock structure + * + * *,1,* -> *,0,* + */ +static __always_inline void clear_pending(struct qspinlock *lock) +{ + atomic_andnot(_Q_PENDING_VAL, &lock->val); +} + +/** + * clear_pending_set_locked - take ownership and clear the pending bit. + * @lock: Pointer to queued spinlock structure + * + * *,1,0 -> *,0,1 + */ +static __always_inline void clear_pending_set_locked(struct qspinlock *lock) +{ + atomic_add(-_Q_PENDING_VAL + _Q_LOCKED_VAL, &lock->val); +} + +/** + * xchg_tail - Put in the new queue tail code word & retrieve previous one + * @lock : Pointer to queued spinlock structure + * @tail : The new queue tail code word + * Return: The previous queue tail code word + * + * xchg(lock, tail) + * + * p,*,* -> n,*,* ; prev = xchg(lock, node) + */ +static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) +{ + u32 old, new; + + old = atomic_read(&lock->val); + do { + new = (old & _Q_LOCKED_PENDING_MASK) | tail; + /* + * We can use relaxed semantics since the caller ensures that + * the MCS node is properly initialized before updating the + * tail. 
+ */ + } while (!atomic_try_cmpxchg_relaxed(&lock->val, &old, new)); + + return old; +} +#endif /* _Q_PENDING_BITS == 8 */ + +/** + * queued_fetch_set_pending_acquire - fetch the whole lock value and set pending + * @lock : Pointer to queued spinlock structure + * Return: The previous lock value + * + * *,*,* -> *,1,* + */ +#ifndef queued_fetch_set_pending_acquire +static __always_inline u32 queued_fetch_set_pending_acquire(struct qspinlock *lock) +{ + return atomic_fetch_or_acquire(_Q_PENDING_VAL, &lock->val); +} +#endif + +/** + * set_locked - Set the lock bit and own the lock + * @lock: Pointer to queued spinlock structure + * + * *,*,0 -> *,0,1 + */ +static __always_inline void set_locked(struct qspinlock *lock) +{ + WRITE_ONCE(lock->locked, _Q_LOCKED_VAL); +} + +#endif /* __LINUX_QSPINLOCK_H */ diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig index d7762ef5949a..39278737bb68 100644 --- a/kernel/module/Kconfig +++ b/kernel/module/Kconfig @@ -192,6 +192,11 @@ config GENDWARFKSYMS depends on !DEBUG_INFO_REDUCED && !DEBUG_INFO_SPLIT # Requires ELF object files. depends on !LTO + # To avoid conflicts with the discarded __gendwarfksyms_ptr symbols on + # X86, requires pahole before commit 47dcb534e253 ("btf_encoder: Stop + # indexing symbols for VARs") or after commit 9810758003ce ("btf_encoder: + # Verify 0 address DWARF variables are in ELF section"). + depends on !X86 || !DEBUG_INFO_BTF || PAHOLE_VERSION < 128 || PAHOLE_VERSION > 129 help Calculate symbol versions from DWARF debugging information using gendwarfksyms. Requires DEBUG_INFO to be enabled. 
diff --git a/kernel/module/internal.h b/kernel/module/internal.h index d09b46ef032f..626cf8668a7e 100644 --- a/kernel/module/internal.h +++ b/kernel/module/internal.h @@ -124,17 +124,6 @@ char *module_next_tag_pair(char *string, unsigned long *secsize); #define for_each_modinfo_entry(entry, info, name) \ for (entry = get_modinfo(info, name); entry; entry = get_next_modinfo(info, name, entry)) -static inline void module_assert_mutex_or_preempt(void) -{ -#ifdef CONFIG_LOCKDEP - if (unlikely(!debug_locks)) - return; - - WARN_ON_ONCE(!rcu_read_lock_sched_held() && - !lockdep_is_held(&module_mutex)); -#endif -} - static inline unsigned long kernel_symbol_value(const struct kernel_symbol *sym) { #ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS diff --git a/kernel/module/kallsyms.c b/kernel/module/kallsyms.c index bf65e0c3c86f..00a60796327c 100644 --- a/kernel/module/kallsyms.c +++ b/kernel/module/kallsyms.c @@ -177,19 +177,15 @@ void add_kallsyms(struct module *mod, const struct load_info *info) unsigned long strtab_size; void *data_base = mod->mem[MOD_DATA].base; void *init_data_base = mod->mem[MOD_INIT_DATA].base; + struct mod_kallsyms *kallsyms; - /* Set up to point into init section. */ - mod->kallsyms = (void __rcu *)init_data_base + - info->mod_kallsyms_init_off; + kallsyms = init_data_base + info->mod_kallsyms_init_off; - rcu_read_lock(); - /* The following is safe since this pointer cannot change */ - rcu_dereference(mod->kallsyms)->symtab = (void *)symsec->sh_addr; - rcu_dereference(mod->kallsyms)->num_symtab = symsec->sh_size / sizeof(Elf_Sym); + kallsyms->symtab = (void *)symsec->sh_addr; + kallsyms->num_symtab = symsec->sh_size / sizeof(Elf_Sym); /* Make sure we get permanent strtab: don't use info->strtab. 
*/ - rcu_dereference(mod->kallsyms)->strtab = - (void *)info->sechdrs[info->index.str].sh_addr; - rcu_dereference(mod->kallsyms)->typetab = init_data_base + info->init_typeoffs; + kallsyms->strtab = (void *)info->sechdrs[info->index.str].sh_addr; + kallsyms->typetab = init_data_base + info->init_typeoffs; /* * Now populate the cut down core kallsyms for after init @@ -199,20 +195,19 @@ void add_kallsyms(struct module *mod, const struct load_info *info) mod->core_kallsyms.strtab = s = data_base + info->stroffs; mod->core_kallsyms.typetab = data_base + info->core_typeoffs; strtab_size = info->core_typeoffs - info->stroffs; - src = rcu_dereference(mod->kallsyms)->symtab; - for (ndst = i = 0; i < rcu_dereference(mod->kallsyms)->num_symtab; i++) { - rcu_dereference(mod->kallsyms)->typetab[i] = elf_type(src + i, info); + src = kallsyms->symtab; + for (ndst = i = 0; i < kallsyms->num_symtab; i++) { + kallsyms->typetab[i] = elf_type(src + i, info); if (i == 0 || is_livepatch_module(mod) || is_core_symbol(src + i, info->sechdrs, info->hdr->e_shnum, info->index.pcpu)) { ssize_t ret; mod->core_kallsyms.typetab[ndst] = - rcu_dereference(mod->kallsyms)->typetab[i]; + kallsyms->typetab[i]; dst[ndst] = src[i]; dst[ndst++].st_name = s - mod->core_kallsyms.strtab; - ret = strscpy(s, - &rcu_dereference(mod->kallsyms)->strtab[src[i].st_name], + ret = strscpy(s, &kallsyms->strtab[src[i].st_name], strtab_size); if (ret < 0) break; @@ -220,7 +215,9 @@ void add_kallsyms(struct module *mod, const struct load_info *info) strtab_size -= ret + 1; } } - rcu_read_unlock(); + + /* Set up to point into init section. 
*/ + rcu_assign_pointer(mod->kallsyms, kallsyms); mod->core_kallsyms.num_symtab = ndst; } @@ -260,7 +257,7 @@ static const char *find_kallsyms_symbol(struct module *mod, { unsigned int i, best = 0; unsigned long nextval, bestval; - struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms); + struct mod_kallsyms *kallsyms = rcu_dereference(mod->kallsyms); struct module_memory *mod_mem; /* At worse, next value is at end of module */ @@ -319,7 +316,7 @@ void * __weak dereference_module_function_descriptor(struct module *mod, /* * For kallsyms to ask for address resolution. NULL means not found. Careful - * not to lock to avoid deadlock on oopses, simply disable preemption. + * not to lock to avoid deadlock on oopses, RCU is enough. */ int module_address_lookup(unsigned long addr, unsigned long *size, @@ -332,7 +329,7 @@ int module_address_lookup(unsigned long addr, int ret = 0; struct module *mod; - preempt_disable(); + guard(rcu)(); mod = __module_address(addr); if (mod) { if (modname) @@ -350,8 +347,6 @@ int module_address_lookup(unsigned long addr, if (sym) ret = strscpy(namebuf, sym, KSYM_NAME_LEN); } - preempt_enable(); - return ret; } @@ -359,7 +354,7 @@ int lookup_module_symbol_name(unsigned long addr, char *symname) { struct module *mod; - preempt_disable(); + guard(rcu)(); list_for_each_entry_rcu(mod, &modules, list) { if (mod->state == MODULE_STATE_UNFORMED) continue; @@ -371,12 +366,10 @@ int lookup_module_symbol_name(unsigned long addr, char *symname) goto out; strscpy(symname, sym, KSYM_NAME_LEN); - preempt_enable(); return 0; } } out: - preempt_enable(); return -ERANGE; } @@ -385,13 +378,13 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, { struct module *mod; - preempt_disable(); + guard(rcu)(); list_for_each_entry_rcu(mod, &modules, list) { struct mod_kallsyms *kallsyms; if (mod->state == MODULE_STATE_UNFORMED) continue; - kallsyms = rcu_dereference_sched(mod->kallsyms); + kallsyms = 
rcu_dereference(mod->kallsyms); if (symnum < kallsyms->num_symtab) { const Elf_Sym *sym = &kallsyms->symtab[symnum]; @@ -400,12 +393,10 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, strscpy(name, kallsyms_symbol_name(kallsyms, symnum), KSYM_NAME_LEN); strscpy(module_name, mod->name, MODULE_NAME_LEN); *exported = is_exported(name, *value, mod); - preempt_enable(); return 0; } symnum -= kallsyms->num_symtab; } - preempt_enable(); return -ERANGE; } @@ -413,7 +404,7 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, static unsigned long __find_kallsyms_symbol_value(struct module *mod, const char *name) { unsigned int i; - struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms); + struct mod_kallsyms *kallsyms = rcu_dereference(mod->kallsyms); for (i = 0; i < kallsyms->num_symtab; i++) { const Elf_Sym *sym = &kallsyms->symtab[i]; @@ -453,23 +444,15 @@ static unsigned long __module_kallsyms_lookup_name(const char *name) /* Look for this name: can be of form module:name. */ unsigned long module_kallsyms_lookup_name(const char *name) { - unsigned long ret; - /* Don't lock: we're in enough trouble already. 
*/ - preempt_disable(); - ret = __module_kallsyms_lookup_name(name); - preempt_enable(); - return ret; + guard(rcu)(); + return __module_kallsyms_lookup_name(name); } unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name) { - unsigned long ret; - - preempt_disable(); - ret = __find_kallsyms_symbol_value(mod, name); - preempt_enable(); - return ret; + guard(rcu)(); + return __find_kallsyms_symbol_value(mod, name); } int module_kallsyms_on_each_symbol(const char *modname, @@ -490,10 +473,8 @@ int module_kallsyms_on_each_symbol(const char *modname, if (modname && strcmp(modname, mod->name)) continue; - /* Use rcu_dereference_sched() to remain compliant with the sparse tool */ - preempt_disable(); - kallsyms = rcu_dereference_sched(mod->kallsyms); - preempt_enable(); + kallsyms = rcu_dereference_check(mod->kallsyms, + lockdep_is_held(&module_mutex)); for (i = 0; i < kallsyms->num_symtab; i++) { const Elf_Sym *sym = &kallsyms->symtab[i]; diff --git a/kernel/module/main.c b/kernel/module/main.c index a256cc919ad7..5c6ab20240a6 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -67,7 +67,7 @@ /* * Mutex protects: - * 1) List of modules (also safely readable with preempt_disable), + * 1) List of modules (also safely readable within RCU read section), * 2) module_use links, * 3) mod_tree.addr_min/mod_tree.addr_max. * (delete and add uses RCU list operations). @@ -331,7 +331,7 @@ static bool find_exported_symbol_in_section(const struct symsearch *syms, /* * Find an exported symbol and return it, along with, (optional) crc and - * (optional) module which owns it. Needs preempt disabled or module_mutex. + * (optional) module which owns it. Needs RCU or module_mutex. 
*/ bool find_symbol(struct find_symbol_arg *fsa) { @@ -345,8 +345,6 @@ bool find_symbol(struct find_symbol_arg *fsa) struct module *mod; unsigned int i; - module_assert_mutex_or_preempt(); - for (i = 0; i < ARRAY_SIZE(arr); i++) if (find_exported_symbol_in_section(&arr[i], NULL, fsa)) return true; @@ -374,16 +372,14 @@ bool find_symbol(struct find_symbol_arg *fsa) } /* - * Search for module by name: must hold module_mutex (or preempt disabled - * for read-only access). + * Search for module by name: must hold module_mutex (or RCU for read-only + * access). */ struct module *find_module_all(const char *name, size_t len, bool even_unformed) { struct module *mod; - module_assert_mutex_or_preempt(); - list_for_each_entry_rcu(mod, &modules, list, lockdep_is_held(&module_mutex)) { if (!even_unformed && mod->state == MODULE_STATE_UNFORMED) @@ -454,8 +450,7 @@ bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr) struct module *mod; unsigned int cpu; - preempt_disable(); - + guard(rcu)(); list_for_each_entry_rcu(mod, &modules, list) { if (mod->state == MODULE_STATE_UNFORMED) continue; @@ -472,13 +467,10 @@ bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr) per_cpu_ptr(mod->percpu, get_boot_cpu_id()); } - preempt_enable(); return true; } } } - - preempt_enable(); return false; } @@ -795,8 +787,8 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, async_synchronize_full(); /* Store the name and taints of the last unloaded module for diagnostic purposes */ - strscpy(last_unloaded_module.name, mod->name, sizeof(last_unloaded_module.name)); - strscpy(last_unloaded_module.taints, module_flags(mod, buf, false), sizeof(last_unloaded_module.taints)); + strscpy(last_unloaded_module.name, mod->name); + strscpy(last_unloaded_module.taints, module_flags(mod, buf, false)); free_module(mod); /* someone could wait for the module in add_unformed_module() */ @@ -814,10 +806,9 @@ void __symbol_put(const char *symbol) .gplok = 
true, }; - preempt_disable(); + guard(rcu)(); BUG_ON(!find_symbol(&fsa)); module_put(fsa.owner); - preempt_enable(); } EXPORT_SYMBOL(__symbol_put); @@ -832,13 +823,12 @@ void symbol_put_addr(void *addr) /* * Even though we hold a reference on the module; we still need to - * disable preemption in order to safely traverse the data structure. + * be in an RCU read section to safely traverse the data structure. */ - preempt_disable(); + guard(rcu)(); modaddr = __module_text_address(a); BUG_ON(!modaddr); module_put(modaddr); - preempt_enable(); } EXPORT_SYMBOL_GPL(symbol_put_addr); @@ -1189,7 +1179,7 @@ static const struct kernel_symbol *resolve_symbol(struct module *mod, getname: /* We must make copy under the lock if we failed to get ref. */ - strncpy(ownername, module_name(fsa.owner), MODULE_NAME_LEN); + strscpy(ownername, module_name(fsa.owner), MODULE_NAME_LEN); unlock: mutex_unlock(&module_mutex); return fsa.sym; @@ -1341,7 +1331,7 @@ static void free_module(struct module *mod) mod_tree_remove(mod); /* Remove this module from bug list, this uses list_del_rcu */ module_bug_cleanup(mod); - /* Wait for RCU-sched synchronizing before releasing mod->list and buglist. */ + /* Wait for RCU synchronizing before releasing mod->list and buglist.
*/ synchronize_rcu(); if (try_add_tainted_module(mod)) pr_err("%s: adding tainted module to the unloaded tainted modules list failed.\n", @@ -1364,21 +1354,18 @@ void *__symbol_get(const char *symbol) .warn = true, }; - preempt_disable(); - if (!find_symbol(&fsa)) - goto fail; - if (fsa.license != GPL_ONLY) { - pr_warn("failing symbol_get of non-GPLONLY symbol %s.\n", - symbol); - goto fail; + scoped_guard(rcu) { + if (!find_symbol(&fsa)) + return NULL; + if (fsa.license != GPL_ONLY) { + pr_warn("failing symbol_get of non-GPLONLY symbol %s.\n", + symbol); + return NULL; + } + if (strong_try_module_get(fsa.owner)) + return NULL; } - if (strong_try_module_get(fsa.owner)) - goto fail; - preempt_enable(); return (void *)kernel_symbol_value(fsa.sym); -fail: - preempt_enable(); - return NULL; } EXPORT_SYMBOL_GPL(__symbol_get); @@ -2842,6 +2829,7 @@ static void module_deallocate(struct module *mod, struct load_info *info) { percpu_modfree(mod); module_arch_freeing_init(mod); + codetag_free_module_sections(mod); free_mod_mem(mod); } @@ -3013,7 +3001,7 @@ static noinline int do_init_module(struct module *mod) #endif /* * We want to free module_init, but be aware that kallsyms may be - * walking this with preempt disabled. In all the failure paths, we + * walking this within an RCU read section. In all the failure paths, we * call synchronize_rcu(), but we don't want to slow down the success * path. execmem_free() cannot be called in an interrupt, so do the * work and call synchronize_rcu() in a work queue. @@ -3680,28 +3668,23 @@ out: /* Given an address, look for it in the module exception tables. 
*/ const struct exception_table_entry *search_module_extables(unsigned long addr) { - const struct exception_table_entry *e = NULL; struct module *mod; - preempt_disable(); + guard(rcu)(); mod = __module_address(addr); if (!mod) - goto out; + return NULL; if (!mod->num_exentries) - goto out; - - e = search_extable(mod->extable, - mod->num_exentries, - addr); -out: - preempt_enable(); - + return NULL; /* - * Now, if we found one, we are running inside it now, hence - * we cannot unload the module, hence no refcnt needed. + * The address passed here belongs to a module that is currently + * invoked (we are running inside it). Therefore its module::refcnt + * needs already be >0 to ensure that it is not removed at this stage. + * All other user need to invoke this function within a RCU read + * section. */ - return e; + return search_extable(mod->extable, mod->num_exentries, addr); } /** @@ -3713,20 +3696,15 @@ out: */ bool is_module_address(unsigned long addr) { - bool ret; - - preempt_disable(); - ret = __module_address(addr) != NULL; - preempt_enable(); - - return ret; + guard(rcu)(); + return __module_address(addr) != NULL; } /** * __module_address() - get the module which contains an address. * @addr: the address. * - * Must be called with preempt disabled or module mutex held so that + * Must be called within RCU read section or module mutex held so that * module doesn't get freed during this. 
*/ struct module *__module_address(unsigned long addr) @@ -3744,8 +3722,6 @@ struct module *__module_address(unsigned long addr) return NULL; lookup: - module_assert_mutex_or_preempt(); - mod = mod_find(addr, &mod_tree); if (mod) { BUG_ON(!within_module(addr, mod)); @@ -3765,20 +3741,28 @@ lookup: */ bool is_module_text_address(unsigned long addr) { - bool ret; + guard(rcu)(); + return __module_text_address(addr) != NULL; +} - preempt_disable(); - ret = __module_text_address(addr) != NULL; - preempt_enable(); +void module_for_each_mod(int(*func)(struct module *mod, void *data), void *data) +{ + struct module *mod; - return ret; + guard(rcu)(); + list_for_each_entry_rcu(mod, &modules, list) { + if (mod->state == MODULE_STATE_UNFORMED) + continue; + if (func(mod, data)) + break; + } } /** * __module_text_address() - get the module whose code contains an address. * @addr: the address. * - * Must be called with preempt disabled or module mutex held so that + * Must be called within RCU read section or module mutex held so that * module doesn't get freed during this. 
*/ struct module *__module_text_address(unsigned long addr) @@ -3801,7 +3785,7 @@ void print_modules(void) printk(KERN_DEFAULT "Modules linked in:"); /* Most callers should already have preempt disabled, but make sure */ - preempt_disable(); + guard(rcu)(); list_for_each_entry_rcu(mod, &modules, list) { if (mod->state == MODULE_STATE_UNFORMED) continue; @@ -3809,7 +3793,6 @@ void print_modules(void) } print_unloaded_tainted_modules(); - preempt_enable(); if (last_unloaded_module.name[0]) pr_cont(" [last unloaded: %s%s]", last_unloaded_module.name, last_unloaded_module.taints); diff --git a/kernel/module/tracking.c b/kernel/module/tracking.c index 16742d1c630c..4fefec5b683c 100644 --- a/kernel/module/tracking.c +++ b/kernel/module/tracking.c @@ -21,8 +21,6 @@ int try_add_tainted_module(struct module *mod) { struct mod_unload_taint *mod_taint; - module_assert_mutex_or_preempt(); - if (!mod->taints) goto out; diff --git a/kernel/module/tree_lookup.c b/kernel/module/tree_lookup.c index 277197977d43..d3204c5c74eb 100644 --- a/kernel/module/tree_lookup.c +++ b/kernel/module/tree_lookup.c @@ -12,11 +12,11 @@ /* * Use a latched RB-tree for __module_address(); this allows us to use - * RCU-sched lookups of the address from any context. + * RCU lookups of the address from any context. * - * This is conditional on PERF_EVENTS || TRACING because those can really hit - * __module_address() hard by doing a lot of stack unwinding; potentially from - * NMI context. + * This is conditional on PERF_EVENTS || TRACING || CFI_CLANG because those can + * really hit __module_address() hard by doing a lot of stack unwinding; + * potentially from NMI context. 
*/ static __always_inline unsigned long __mod_tree_val(struct latch_tree_node *n) diff --git a/kernel/module/version.c b/kernel/module/version.c index 3718a8868321..2beefeba82d9 100644 --- a/kernel/module/version.c +++ b/kernel/module/version.c @@ -79,17 +79,17 @@ int check_modstruct_version(const struct load_info *info, .name = "module_layout", .gplok = true, }; + bool have_symbol; /* * Since this should be found in kernel (which can't be removed), no - * locking is necessary -- use preempt_disable() to placate lockdep. + * locking is necessary. Regardless use a RCU read section to keep + * lockdep happy. */ - preempt_disable(); - if (!find_symbol(&fsa)) { - preempt_enable(); - BUG(); - } - preempt_enable(); + scoped_guard(rcu) + have_symbol = find_symbol(&fsa); + BUG_ON(!have_symbol); + return check_version(info, "module_layout", mod, fsa.crc); } diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index c9d97ed20122..5f31fdff8a38 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -128,17 +128,13 @@ out_time: out_net: put_cgroup_ns(new_nsp->cgroup_ns); out_cgroup: - if (new_nsp->pid_ns_for_children) - put_pid_ns(new_nsp->pid_ns_for_children); + put_pid_ns(new_nsp->pid_ns_for_children); out_pid: - if (new_nsp->ipc_ns) - put_ipc_ns(new_nsp->ipc_ns); + put_ipc_ns(new_nsp->ipc_ns); out_ipc: - if (new_nsp->uts_ns) - put_uts_ns(new_nsp->uts_ns); + put_uts_ns(new_nsp->uts_ns); out_uts: - if (new_nsp->mnt_ns) - put_mnt_ns(new_nsp->mnt_ns); + put_mnt_ns(new_nsp->mnt_ns); out_ns: kmem_cache_free(nsproxy_cachep, new_nsp); return ERR_PTR(err); @@ -189,18 +185,12 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) void free_nsproxy(struct nsproxy *ns) { - if (ns->mnt_ns) - put_mnt_ns(ns->mnt_ns); - if (ns->uts_ns) - put_uts_ns(ns->uts_ns); - if (ns->ipc_ns) - put_ipc_ns(ns->ipc_ns); - if (ns->pid_ns_for_children) - put_pid_ns(ns->pid_ns_for_children); - if (ns->time_ns) - put_time_ns(ns->time_ns); - if (ns->time_ns_for_children) - 
put_time_ns(ns->time_ns_for_children); + put_mnt_ns(ns->mnt_ns); + put_uts_ns(ns->uts_ns); + put_ipc_ns(ns->ipc_ns); + put_pid_ns(ns->pid_ns_for_children); + put_time_ns(ns->time_ns); + put_time_ns(ns->time_ns_for_children); put_cgroup_ns(ns->cgroup_ns); put_net(ns->net_ns); kmem_cache_free(nsproxy_cachep, ns); diff --git a/kernel/padata.c b/kernel/padata.c index b3d4eacc4f5d..7eee94166357 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -358,7 +358,8 @@ static void padata_reorder(struct parallel_data *pd) * To avoid UAF issue, add pd ref here, and put pd ref after reorder_work finish. */ padata_get_pd(pd); - queue_work(pinst->serial_wq, &pd->reorder_work); + if (!queue_work(pinst->serial_wq, &pd->reorder_work)) + padata_put_pd(pd); } } diff --git a/kernel/panic.c b/kernel/panic.c index d8635d5cecb2..047ea3215312 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -97,6 +97,36 @@ static const struct ctl_table kern_panic_table[] = { }, #endif { + .procname = "panic", + .data = &panic_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "panic_on_oops", + .data = &panic_on_oops, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "panic_print", + .data = &panic_print, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, + { + .procname = "panic_on_warn", + .data = &panic_on_warn, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { .procname = "warn_limit", .data = &warn_limit, .maxlen = sizeof(warn_limit), @@ -511,6 +541,7 @@ const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = { TAINT_FLAG(AUX, 'X', ' ', true), TAINT_FLAG(RANDSTRUCT, 'T', ' ', true), TAINT_FLAG(TEST, 'N', ' ', true), + TAINT_FLAG(FWCTL, 'J', ' ', true), }; #undef TAINT_FLAG @@ -832,9 +863,15 @@ device_initcall(register_warn_debugfs); */ __visible noinstr void 
__stack_chk_fail(void) { + unsigned long flags; + instrumentation_begin(); + flags = user_access_save(); + panic("stack-protector: Kernel stack is corrupted in: %pB", __builtin_return_address(0)); + + user_access_restore(flags); instrumentation_end(); } EXPORT_SYMBOL(__stack_chk_fail); diff --git a/kernel/params.c b/kernel/params.c index 0074d29c9b80..b92d64161b75 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -551,7 +551,7 @@ struct module_param_attrs { unsigned int num; struct attribute_group grp; - struct param_attribute attrs[]; + struct param_attribute attrs[] __counted_by(num); }; #ifdef CONFIG_SYSFS @@ -651,35 +651,32 @@ static __modinit int add_sysfs_param(struct module_kobject *mk, } /* Enlarge allocations. */ - new_mp = krealloc(mk->mp, - sizeof(*mk->mp) + - sizeof(mk->mp->attrs[0]) * (mk->mp->num + 1), + new_mp = krealloc(mk->mp, struct_size(mk->mp, attrs, mk->mp->num + 1), GFP_KERNEL); if (!new_mp) return -ENOMEM; mk->mp = new_mp; + mk->mp->num++; /* Extra pointer for NULL terminator */ - new_attrs = krealloc(mk->mp->grp.attrs, - sizeof(mk->mp->grp.attrs[0]) * (mk->mp->num + 2), - GFP_KERNEL); + new_attrs = krealloc_array(mk->mp->grp.attrs, mk->mp->num + 1, + sizeof(mk->mp->grp.attrs[0]), GFP_KERNEL); if (!new_attrs) return -ENOMEM; mk->mp->grp.attrs = new_attrs; /* Tack new one on the end. */ - memset(&mk->mp->attrs[mk->mp->num], 0, sizeof(mk->mp->attrs[0])); - sysfs_attr_init(&mk->mp->attrs[mk->mp->num].mattr.attr); - mk->mp->attrs[mk->mp->num].param = kp; - mk->mp->attrs[mk->mp->num].mattr.show = param_attr_show; + memset(&mk->mp->attrs[mk->mp->num - 1], 0, sizeof(mk->mp->attrs[0])); + sysfs_attr_init(&mk->mp->attrs[mk->mp->num - 1].mattr.attr); + mk->mp->attrs[mk->mp->num - 1].param = kp; + mk->mp->attrs[mk->mp->num - 1].mattr.show = param_attr_show; /* Do not allow runtime DAC changes to make param writable. 
*/ if ((kp->perm & (S_IWUSR | S_IWGRP | S_IWOTH)) != 0) - mk->mp->attrs[mk->mp->num].mattr.store = param_attr_store; + mk->mp->attrs[mk->mp->num - 1].mattr.store = param_attr_store; else - mk->mp->attrs[mk->mp->num].mattr.store = NULL; - mk->mp->attrs[mk->mp->num].mattr.attr.name = (char *)name; - mk->mp->attrs[mk->mp->num].mattr.attr.mode = kp->perm; - mk->mp->num++; + mk->mp->attrs[mk->mp->num - 1].mattr.store = NULL; + mk->mp->attrs[mk->mp->num - 1].mattr.attr.name = (char *)name; + mk->mp->attrs[mk->mp->num - 1].mattr.attr.mode = kp->perm; /* Fix up all the pointers, since krealloc can move us */ for (i = 0; i < mk->mp->num; i++) @@ -763,38 +760,35 @@ void destroy_params(const struct kernel_param *params, unsigned num) params[i].ops->free(params[i].arg); } -static struct module_kobject * __init locate_module_kobject(const char *name) +struct module_kobject __modinit * lookup_or_create_module_kobject(const char *name) { struct module_kobject *mk; struct kobject *kobj; int err; kobj = kset_find_obj(module_kset, name); - if (kobj) { - mk = to_module_kobject(kobj); - } else { - mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL); - BUG_ON(!mk); - - mk->mod = THIS_MODULE; - mk->kobj.kset = module_kset; - err = kobject_init_and_add(&mk->kobj, &module_ktype, NULL, - "%s", name); -#ifdef CONFIG_MODULES - if (!err) - err = sysfs_create_file(&mk->kobj, &module_uevent.attr); -#endif - if (err) { - kobject_put(&mk->kobj); - pr_crit("Adding module '%s' to sysfs failed (%d), the system may be unstable.\n", - name, err); - return NULL; - } + if (kobj) + return to_module_kobject(kobj); - /* So that we hold reference in both cases. 
*/ - kobject_get(&mk->kobj); + mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL); + if (!mk) + return NULL; + + mk->mod = THIS_MODULE; + mk->kobj.kset = module_kset; + err = kobject_init_and_add(&mk->kobj, &module_ktype, NULL, "%s", name); + if (IS_ENABLED(CONFIG_MODULES) && !err) + err = sysfs_create_file(&mk->kobj, &module_uevent.attr); + if (err) { + kobject_put(&mk->kobj); + pr_crit("Adding module '%s' to sysfs failed (%d), the system may be unstable.\n", + name, err); + return NULL; } + /* So that we hold reference in both cases. */ + kobject_get(&mk->kobj); + return mk; } @@ -805,7 +799,7 @@ static void __init kernel_add_sysfs_param(const char *name, struct module_kobject *mk; int err; - mk = locate_module_kobject(name); + mk = lookup_or_create_module_kobject(name); if (!mk) return; @@ -876,7 +870,7 @@ static void __init version_sysfs_builtin(void) int err; for (vattr = __start___modver; vattr < __stop___modver; vattr++) { - mk = locate_module_kobject(vattr->module_name); + mk = lookup_or_create_module_kobject(vattr->module_name); if (mk) { err = sysfs_create_file(&mk->kobj, &vattr->mattr.attr); WARN_ON_ONCE(err); @@ -949,7 +943,9 @@ struct kset *module_kset; static void module_kobj_release(struct kobject *kobj) { struct module_kobject *mk = to_module_kobject(kobj); - complete(mk->kobj_completion); + + if (mk->kobj_completion) + complete(mk->kobj_completion); } const struct kobj_type module_ktype = { diff --git a/kernel/pid.c b/kernel/pid.c index 4ac2ce46817f..8317bcbc7cf7 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -100,6 +100,7 @@ void put_pid(struct pid *pid) ns = pid->numbers[pid->level].ns; if (refcount_dec_and_test(&pid->count)) { + WARN_ON_ONCE(pid->stashed); kmem_cache_free(ns->pid_cachep, pid); put_pid_ns(ns); } @@ -359,11 +360,6 @@ static void __change_pid(struct pid **pids, struct task_struct *task, hlist_del_rcu(&task->pid_links[type]); *pid_ptr = new; - if (type == PIDTYPE_PID) { - WARN_ON_ONCE(pid_has_task(pid, PIDTYPE_PID)); - 
wake_up_all(&pid->wait_pidfd); - } - for (tmp = PIDTYPE_MAX; --tmp >= 0; ) if (pid_has_task(pid, tmp)) return; diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index ca947ed32e3d..54a623680019 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -380,8 +380,7 @@ config CPU_PM config ENERGY_MODEL bool "Energy Model for devices with DVFS (CPUs, GPUs, etc)" - depends on SMP - depends on CPU_FREQ + depends on CPU_FREQ || PM_DEVFREQ help Several subsystems (thermal and/or the task scheduler for example) can leverage information about the energy consumed by devices to diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index 3874f0e97651..ea7995a25780 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -161,22 +161,10 @@ static void em_debug_create_pd(struct device *dev) {} static void em_debug_remove_pd(struct device *dev) {} #endif -static void em_destroy_table_rcu(struct rcu_head *rp) -{ - struct em_perf_table __rcu *table; - - table = container_of(rp, struct em_perf_table, rcu); - kfree(table); -} - static void em_release_table_kref(struct kref *kref) { - struct em_perf_table __rcu *table; - /* It was the last owner of this table so we can free */ - table = container_of(kref, struct em_perf_table, kref); - - call_rcu(&table->rcu, em_destroy_table_rcu); + kfree_rcu(container_of(kref, struct em_perf_table, kref), rcu); } /** @@ -185,7 +173,7 @@ static void em_release_table_kref(struct kref *kref) * * No return values. */ -void em_table_free(struct em_perf_table __rcu *table) +void em_table_free(struct em_perf_table *table) { kref_put(&table->kref, em_release_table_kref); } @@ -198,9 +186,9 @@ void em_table_free(struct em_perf_table __rcu *table) * has a user. * Returns allocated table or NULL. 
*/ -struct em_perf_table __rcu *em_table_alloc(struct em_perf_domain *pd) +struct em_perf_table *em_table_alloc(struct em_perf_domain *pd) { - struct em_perf_table __rcu *table; + struct em_perf_table *table; int table_size; table_size = sizeof(struct em_perf_state) * pd->nr_perf_states; @@ -239,12 +227,16 @@ static void em_init_performance(struct device *dev, struct em_perf_domain *pd, } static int em_compute_costs(struct device *dev, struct em_perf_state *table, - struct em_data_callback *cb, int nr_states, + const struct em_data_callback *cb, int nr_states, unsigned long flags) { unsigned long prev_cost = ULONG_MAX; int i, ret; + /* This is needed only for CPUs and EAS skip other devices */ + if (!_is_cpu_device(dev)) + return 0; + /* Compute the cost of each performance state. */ for (i = nr_states - 1; i >= 0; i--) { unsigned long power_res, cost; @@ -308,9 +300,9 @@ int em_dev_compute_costs(struct device *dev, struct em_perf_state *table, * Return 0 on success or an error code on failure. 
*/ int em_dev_update_perf_domain(struct device *dev, - struct em_perf_table __rcu *new_table) + struct em_perf_table *new_table) { - struct em_perf_table __rcu *old_table; + struct em_perf_table *old_table; struct em_perf_domain *pd; if (!dev) @@ -327,7 +319,8 @@ int em_dev_update_perf_domain(struct device *dev, kref_get(&new_table->kref); - old_table = pd->em_table; + old_table = rcu_dereference_protected(pd->em_table, + lockdep_is_held(&em_pd_mutex)); rcu_assign_pointer(pd->em_table, new_table); em_cpufreq_update_efficiencies(dev, new_table->state); @@ -341,7 +334,7 @@ EXPORT_SYMBOL_GPL(em_dev_update_perf_domain); static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd, struct em_perf_state *table, - struct em_data_callback *cb, + const struct em_data_callback *cb, unsigned long flags) { unsigned long power, freq, prev_freq = 0; @@ -396,10 +389,11 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd, } static int em_create_pd(struct device *dev, int nr_states, - struct em_data_callback *cb, cpumask_t *cpus, + const struct em_data_callback *cb, + const cpumask_t *cpus, unsigned long flags) { - struct em_perf_table __rcu *em_table; + struct em_perf_table *em_table; struct em_perf_domain *pd; struct device *cpu_dev; int cpu, ret, num_cpus; @@ -556,9 +550,10 @@ EXPORT_SYMBOL_GPL(em_cpu_get); * Return 0 on success */ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, - struct em_data_callback *cb, cpumask_t *cpus, - bool microwatts) + const struct em_data_callback *cb, + const cpumask_t *cpus, bool microwatts) { + struct em_perf_table *em_table; unsigned long cap, prev_cap = 0; unsigned long flags = 0; int cpu, ret; @@ -631,7 +626,9 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, dev->em_pd->min_perf_state = 0; dev->em_pd->max_perf_state = nr_states - 1; - em_cpufreq_update_efficiencies(dev, dev->em_pd->em_table->state); + em_table = 
rcu_dereference_protected(dev->em_pd->em_table, + lockdep_is_held(&em_pd_mutex)); + em_cpufreq_update_efficiencies(dev, em_table->state); em_debug_create_pd(dev); dev_info(dev, "EM: created perf domain\n"); @@ -668,7 +665,8 @@ void em_dev_unregister_perf_domain(struct device *dev) mutex_lock(&em_pd_mutex); em_debug_remove_pd(dev); - em_table_free(dev->em_pd->em_table); + em_table_free(rcu_dereference_protected(dev->em_pd->em_table, + lockdep_is_held(&em_pd_mutex))); kfree(dev->em_pd); dev->em_pd = NULL; @@ -676,9 +674,9 @@ void em_dev_unregister_perf_domain(struct device *dev) } EXPORT_SYMBOL_GPL(em_dev_unregister_perf_domain); -static struct em_perf_table __rcu *em_table_dup(struct em_perf_domain *pd) +static struct em_perf_table *em_table_dup(struct em_perf_domain *pd) { - struct em_perf_table __rcu *em_table; + struct em_perf_table *em_table; struct em_perf_state *ps, *new_ps; int ps_size; @@ -700,14 +698,16 @@ static struct em_perf_table __rcu *em_table_dup(struct em_perf_domain *pd) } static int em_recalc_and_update(struct device *dev, struct em_perf_domain *pd, - struct em_perf_table __rcu *em_table) + struct em_perf_table *em_table) { int ret; - ret = em_compute_costs(dev, em_table->state, NULL, pd->nr_perf_states, - pd->flags); - if (ret) - goto free_em_table; + if (!em_is_artificial(pd)) { + ret = em_compute_costs(dev, em_table->state, NULL, + pd->nr_perf_states, pd->flags); + if (ret) + goto free_em_table; + } ret = em_dev_update_perf_domain(dev, em_table); if (ret) @@ -727,11 +727,24 @@ free_em_table: * Adjustment of CPU performance values after boot, when all CPUs capacites * are correctly calculated. 
*/ -static void em_adjust_new_capacity(struct device *dev, - struct em_perf_domain *pd, - u64 max_cap) +static void em_adjust_new_capacity(unsigned int cpu, struct device *dev, + struct em_perf_domain *pd) { - struct em_perf_table __rcu *em_table; + unsigned long cpu_capacity = arch_scale_cpu_capacity(cpu); + struct em_perf_table *em_table; + struct em_perf_state *table; + unsigned long em_max_perf; + + rcu_read_lock(); + table = em_perf_state_from_pd(pd); + em_max_perf = table[pd->nr_perf_states - 1].performance; + rcu_read_unlock(); + + if (em_max_perf == cpu_capacity) + return; + + pr_debug("updating cpu%d cpu_cap=%lu old capacity=%lu\n", cpu, + cpu_capacity, em_max_perf); em_table = em_table_dup(pd); if (!em_table) { @@ -744,12 +757,27 @@ static void em_adjust_new_capacity(struct device *dev, em_recalc_and_update(dev, pd, em_table); } +/** + * em_adjust_cpu_capacity() - Adjust the EM for a CPU after a capacity update. + * @cpu: Target CPU. + * + * Adjust the existing EM for @cpu after a capacity update under the assumption + * that the capacity has been updated in the same way for all of the CPUs in + * the same perf domain. 
+ */ +void em_adjust_cpu_capacity(unsigned int cpu) +{ + struct device *dev = get_cpu_device(cpu); + struct em_perf_domain *pd; + + pd = em_pd_get(dev); + if (pd) + em_adjust_new_capacity(cpu, dev, pd); +} + static void em_check_capacity_update(void) { cpumask_var_t cpu_done_mask; - struct em_perf_state *table; - struct em_perf_domain *pd; - unsigned long cpu_capacity; int cpu; if (!zalloc_cpumask_var(&cpu_done_mask, GFP_KERNEL)) { @@ -760,7 +788,7 @@ static void em_check_capacity_update(void) /* Check if CPUs capacity has changed than update EM */ for_each_possible_cpu(cpu) { struct cpufreq_policy *policy; - unsigned long em_max_perf; + struct em_perf_domain *pd; struct device *dev; if (cpumask_test_cpu(cpu, cpu_done_mask)) @@ -775,32 +803,15 @@ static void em_check_capacity_update(void) } cpufreq_cpu_put(policy); - pd = em_cpu_get(cpu); + dev = get_cpu_device(cpu); + pd = em_pd_get(dev); if (!pd || em_is_artificial(pd)) continue; cpumask_or(cpu_done_mask, cpu_done_mask, em_span_cpus(pd)); - cpu_capacity = arch_scale_cpu_capacity(cpu); - - rcu_read_lock(); - table = em_perf_state_from_pd(pd); - em_max_perf = table[pd->nr_perf_states - 1].performance; - rcu_read_unlock(); - - /* - * Check if the CPU capacity has been adjusted during boot - * and trigger the update for new performance values. 
- */ - if (em_max_perf == cpu_capacity) - continue; - - pr_debug("updating cpu%d cpu_cap=%lu old capacity=%lu\n", - cpu, cpu_capacity, em_max_perf); - - dev = get_cpu_device(cpu); - em_adjust_new_capacity(dev, pd, cpu_capacity); + em_adjust_new_capacity(cpu, dev, pd); } free_cpumask_var(cpu_done_mask); @@ -822,7 +833,7 @@ static void em_update_workfn(struct work_struct *work) */ int em_dev_update_chip_binning(struct device *dev) { - struct em_perf_table __rcu *em_table; + struct em_perf_table *em_table; struct em_perf_domain *pd; int i, ret; diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 10a01af63a80..519fb09de5e0 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -11,6 +11,7 @@ #define pr_fmt(fmt) "PM: hibernation: " fmt +#include <crypto/acompress.h> #include <linux/blkdev.h> #include <linux/export.h> #include <linux/suspend.h> @@ -89,6 +90,11 @@ void hibernate_release(void) atomic_inc(&hibernate_atomic); } +bool hibernation_in_progress(void) +{ + return !atomic_read(&hibernate_atomic); +} + bool hibernation_available(void) { return nohibernate == 0 && @@ -132,10 +138,15 @@ bool system_entering_hibernation(void) EXPORT_SYMBOL(system_entering_hibernation); #ifdef CONFIG_PM_DEBUG +static unsigned int pm_test_delay = 5; +module_param(pm_test_delay, uint, 0644); +MODULE_PARM_DESC(pm_test_delay, + "Number of seconds to wait before resuming from hibernation test"); static void hibernation_debug_sleep(void) { - pr_info("debug: Waiting for 5 seconds.\n"); - mdelay(5000); + pr_info("hibernation debug: Waiting for %d second(s).\n", + pm_test_delay); + mdelay(pm_test_delay * 1000); } static int hibernation_test(int level) @@ -411,7 +422,7 @@ int hibernation_snapshot(int platform_mode) goto Thaw; } - suspend_console(); + console_suspend_all(); pm_restrict_gfp_mask(); error = dpm_suspend(PMSG_FREEZE); @@ -437,7 +448,7 @@ int hibernation_snapshot(int platform_mode) if (error || !in_suspend) pm_restore_gfp_mask(); - resume_console(); 
+ console_resume_all(); dpm_complete(msg); Close: @@ -547,7 +558,7 @@ int hibernation_restore(int platform_mode) int error; pm_prepare_console(); - suspend_console(); + console_suspend_all(); pm_restrict_gfp_mask(); error = dpm_suspend_start(PMSG_QUIESCE); if (!error) { @@ -561,7 +572,7 @@ int hibernation_restore(int platform_mode) } dpm_resume_end(PMSG_RECOVER); pm_restore_gfp_mask(); - resume_console(); + console_resume_all(); pm_restore_console(); return error; } @@ -586,7 +597,7 @@ int hibernation_platform_enter(void) goto Close; entering_platform_hibernation = true; - suspend_console(); + console_suspend_all(); error = dpm_suspend_start(PMSG_HIBERNATE); if (error) { if (hibernation_ops->recover) @@ -639,7 +650,7 @@ int hibernation_platform_enter(void) Resume_devices: entering_platform_hibernation = false; dpm_resume_end(PMSG_RESTORE); - resume_console(); + console_resume_all(); Close: hibernation_ops->end(); @@ -756,8 +767,8 @@ int hibernate(void) * Query for the compression algorithm support if compression is enabled. 
*/ if (!nocompress) { - strscpy(hib_comp_algo, hibernate_compressor, sizeof(hib_comp_algo)); - if (crypto_has_comp(hib_comp_algo, 0, 0) != 1) { + strscpy(hib_comp_algo, hibernate_compressor); + if (!crypto_has_acomp(hib_comp_algo, 0, CRYPTO_ALG_ASYNC)) { pr_err("%s compression is not available\n", hib_comp_algo); return -EOPNOTSUPP; } @@ -777,6 +788,8 @@ int hibernate(void) goto Restore; ksys_sync_helper(); + if (filesystem_freeze_enabled) + filesystems_freeze(); error = freeze_processes(); if (error) @@ -845,6 +858,7 @@ int hibernate(void) /* Don't bother checking whether freezer_test_done is true */ freezer_test_done = false; Exit: + filesystems_thaw(); pm_notifier_call_chain(PM_POST_HIBERNATION); Restore: pm_restore_console(); @@ -881,6 +895,9 @@ int hibernate_quiet_exec(int (*func)(void *data), void *data) if (error) goto restore; + if (filesystem_freeze_enabled) + filesystems_freeze(); + error = freeze_processes(); if (error) goto exit; @@ -901,7 +918,7 @@ int hibernate_quiet_exec(int (*func)(void *data), void *data) if (error) goto dpm_complete; - suspend_console(); + console_suspend_all(); error = dpm_suspend(PMSG_FREEZE); if (error) @@ -925,7 +942,7 @@ skip: dpm_resume: dpm_resume(PMSG_THAW); - resume_console(); + console_resume_all(); dpm_complete: dpm_complete(PMSG_THAW); @@ -940,6 +957,7 @@ thaw: thaw_processes(); exit: + filesystems_thaw(); pm_notifier_call_chain(PM_POST_HIBERNATION); restore: @@ -1005,10 +1023,10 @@ static int software_resume(void) */ if (!(swsusp_header_flags & SF_NOCOMPRESS_MODE)) { if (swsusp_header_flags & SF_COMPRESSION_ALG_LZ4) - strscpy(hib_comp_algo, COMPRESSION_ALGO_LZ4, sizeof(hib_comp_algo)); + strscpy(hib_comp_algo, COMPRESSION_ALGO_LZ4); else - strscpy(hib_comp_algo, COMPRESSION_ALGO_LZO, sizeof(hib_comp_algo)); - if (crypto_has_comp(hib_comp_algo, 0, 0) != 1) { + strscpy(hib_comp_algo, COMPRESSION_ALGO_LZO); + if (!crypto_has_acomp(hib_comp_algo, 0, CRYPTO_ALG_ASYNC)) { pr_err("%s compression is not available\n", 
hib_comp_algo); error = -EOPNOTSUPP; goto Unlock; @@ -1028,19 +1046,26 @@ static int software_resume(void) if (error) goto Restore; + if (filesystem_freeze_enabled) + filesystems_freeze(); + pm_pr_dbg("Preparing processes for hibernation restore.\n"); error = freeze_processes(); - if (error) + if (error) { + filesystems_thaw(); goto Close_Finish; + } error = freeze_kernel_threads(); if (error) { thaw_processes(); + filesystems_thaw(); goto Close_Finish; } error = load_image_and_restore(); thaw_processes(); + filesystems_thaw(); Finish: pm_notifier_call_chain(PM_POST_RESTORE); Restore: @@ -1446,22 +1471,21 @@ static const char * const comp_alg_enabled[] = { static int hibernate_compressor_param_set(const char *compressor, const struct kernel_param *kp) { - unsigned int sleep_flags; int index, ret; - sleep_flags = lock_system_sleep(); + if (!mutex_trylock(&system_transition_mutex)) + return -EBUSY; index = sysfs_match_string(comp_alg_enabled, compressor); if (index >= 0) { ret = param_set_copystring(comp_alg_enabled[index], kp); if (!ret) - strscpy(hib_comp_algo, comp_alg_enabled[index], - sizeof(hib_comp_algo)); + strscpy(hib_comp_algo, comp_alg_enabled[index]); } else { ret = index; } - unlock_system_sleep(sleep_flags); + mutex_unlock(&system_transition_mutex); if (ret) pr_debug("Cannot set specified compressor %s\n", diff --git a/kernel/power/main.c b/kernel/power/main.c index 6254814d4817..3d484630505a 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -557,6 +557,10 @@ static int __init pm_debugfs_init(void) late_initcall(pm_debugfs_init); #endif /* CONFIG_DEBUG_FS */ +bool pm_sleep_transition_in_progress(void) +{ + return pm_suspend_in_progress() || hibernation_in_progress(); +} #endif /* CONFIG_PM_SLEEP */ #ifdef CONFIG_PM_SLEEP_DEBUG @@ -594,7 +598,7 @@ power_attr(pm_print_times); static inline void pm_print_times_init(void) { - pm_print_times_enabled = !!initcall_debug; + pm_print_times_enabled = initcall_debug; } static ssize_t 
pm_wakeup_irq_show(struct kobject *kobj, @@ -613,7 +617,7 @@ bool pm_debug_messages_on __read_mostly; bool pm_debug_messages_should_print(void) { - return pm_debug_messages_on && pm_suspend_target_state != PM_SUSPEND_ON; + return pm_debug_messages_on && pm_sleep_transition_in_progress(); } EXPORT_SYMBOL_GPL(pm_debug_messages_should_print); @@ -962,6 +966,34 @@ power_attr(pm_freeze_timeout); #endif /* CONFIG_FREEZER*/ +#if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATION) +bool filesystem_freeze_enabled = false; + +static ssize_t freeze_filesystems_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%d\n", filesystem_freeze_enabled); +} + +static ssize_t freeze_filesystems_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t n) +{ + unsigned long val; + + if (kstrtoul(buf, 10, &val)) + return -EINVAL; + + if (val > 1) + return -EINVAL; + + filesystem_freeze_enabled = !!val; + return n; +} + +power_attr(freeze_filesystems); +#endif /* CONFIG_SUSPEND || CONFIG_HIBERNATION */ + static struct attribute * g[] = { &state_attr.attr, #ifdef CONFIG_PM_TRACE @@ -992,6 +1024,9 @@ static struct attribute * g[] = { #ifdef CONFIG_FREEZER &pm_freeze_timeout_attr.attr, #endif +#if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATION) + &freeze_filesystems_attr.attr, +#endif NULL, }; diff --git a/kernel/power/power.h b/kernel/power/power.h index c352dea2f67b..cb1d71562002 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -18,6 +18,10 @@ struct swsusp_info { unsigned long size; } __aligned(PAGE_SIZE); +#if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATION) +extern bool filesystem_freeze_enabled; +#endif + #ifdef CONFIG_HIBERNATION /* kernel/power/snapshot.c */ extern void __init hibernate_reserved_size_init(void); @@ -71,10 +75,14 @@ extern void enable_restore_image_protection(void); static inline void enable_restore_image_protection(void) {} #endif /* CONFIG_STRICT_KERNEL_RWX */ 
+extern bool hibernation_in_progress(void); + #else /* !CONFIG_HIBERNATION */ static inline void hibernate_reserved_size_init(void) {} static inline void hibernate_image_size_init(void) {} + +static inline bool hibernation_in_progress(void) { return false; } #endif /* !CONFIG_HIBERNATION */ #define power_attr(_name) \ diff --git a/kernel/power/process.c b/kernel/power/process.c index 66ac067d9ae6..dc0dfc349f22 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -189,7 +189,7 @@ void thaw_processes(void) oom_killer_enable(); - pr_info("Restarting tasks ... "); + pr_info("Restarting tasks: Starting\n"); __usermodehelper_set_disable_depth(UMH_FREEZING); thaw_workqueues(); @@ -208,7 +208,7 @@ void thaw_processes(void) usermodehelper_enable(); schedule(); - pr_cont("done.\n"); + pr_info("Restarting tasks: Done\n"); trace_suspend_resume(TPS("thaw_processes"), 0, false); } @@ -217,7 +217,7 @@ void thaw_kernel_threads(void) struct task_struct *g, *p; pm_nosig_freezing = false; - pr_info("Restarting kernel threads ... 
"); + pr_info("Restarting kernel threads ...\n"); thaw_workqueues(); @@ -229,5 +229,5 @@ void thaw_kernel_threads(void) read_unlock(&tasklist_lock); schedule(); - pr_cont("done.\n"); + pr_info("Done restarting kernel threads.\n"); } diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index c9fb559a6399..4e6e24e8b854 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -2270,9 +2270,9 @@ int snapshot_read_next(struct snapshot_handle *handle) */ void *kaddr; - kaddr = kmap_atomic(page); + kaddr = kmap_local_page(page); copy_page(buffer, kaddr); - kunmap_atomic(kaddr); + kunmap_local(kaddr); handle->buffer = buffer; } else { handle->buffer = page_address(page); @@ -2561,9 +2561,9 @@ static void copy_last_highmem_page(void) if (last_highmem_page) { void *dst; - dst = kmap_atomic(last_highmem_page); + dst = kmap_local_page(last_highmem_page); copy_page(dst, buffer); - kunmap_atomic(dst); + kunmap_local(dst); last_highmem_page = NULL; } } @@ -2881,13 +2881,13 @@ static inline void swap_two_pages_data(struct page *p1, struct page *p2, { void *kaddr1, *kaddr2; - kaddr1 = kmap_atomic(p1); - kaddr2 = kmap_atomic(p2); + kaddr1 = kmap_local_page(p1); + kaddr2 = kmap_local_page(p2); copy_page(buf, kaddr1); copy_page(kaddr1, kaddr2); copy_page(kaddr2, buf); - kunmap_atomic(kaddr2); - kunmap_atomic(kaddr1); + kunmap_local(kaddr2); + kunmap_local(kaddr1); } /** diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 09f8397bae15..76b141b9aac0 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -30,6 +30,7 @@ #include <trace/events/power.h> #include <linux/compiler.h> #include <linux/moduleparam.h> +#include <linux/fs.h> #include "power.h" @@ -91,6 +92,16 @@ static void s2idle_enter(void) { trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_TO_IDLE, true); + /* + * The correctness of the code below depends on the number of online + * CPUs being stable, but CPUs cannot be taken offline or put online + * while it is running. 
+ * + * The s2idle_lock must be acquired before the pending wakeup check to + * prevent pm_system_wakeup() from running as a whole between that check + * and the subsequent s2idle_state update in which case a wakeup event + * would get lost. + */ raw_spin_lock_irq(&s2idle_lock); if (pm_wakeup_pending()) goto out; @@ -98,8 +109,6 @@ static void s2idle_enter(void) s2idle_state = S2IDLE_STATE_ENTER; raw_spin_unlock_irq(&s2idle_lock); - cpus_read_lock(); - /* Push all the CPUs into the idle loop. */ wake_up_all_idle_cpus(); /* Make the current CPU wait so it can enter the idle loop too. */ @@ -112,8 +121,6 @@ static void s2idle_enter(void) */ wake_up_all_idle_cpus(); - cpus_read_unlock(); - raw_spin_lock_irq(&s2idle_lock); out: @@ -368,6 +375,8 @@ static int suspend_prepare(suspend_state_t state) if (error) goto Restore; + if (filesystem_freeze_enabled) + filesystems_freeze(); trace_suspend_resume(TPS("freeze_processes"), 0, true); error = suspend_freeze_processes(); trace_suspend_resume(TPS("freeze_processes"), 0, false); @@ -502,7 +511,7 @@ int suspend_devices_and_enter(suspend_state_t state) if (error) goto Close; - suspend_console(); + console_suspend_all(); suspend_test_start(); error = dpm_suspend_start(PMSG_SUSPEND); if (error) { @@ -521,9 +530,9 @@ int suspend_devices_and_enter(suspend_state_t state) suspend_test_start(); dpm_resume_end(PMSG_RESUME); suspend_test_finish("resume devices"); - trace_suspend_resume(TPS("resume_console"), state, true); - resume_console(); - trace_suspend_resume(TPS("resume_console"), state, false); + trace_suspend_resume(TPS("console_resume_all"), state, true); + console_resume_all(); + trace_suspend_resume(TPS("console_resume_all"), state, false); Close: platform_resume_end(state); @@ -544,6 +553,7 @@ int suspend_devices_and_enter(suspend_state_t state) static void suspend_finish(void) { suspend_thaw_processes(); + filesystems_thaw(); pm_notifier_call_chain(PM_POST_SUSPEND); pm_restore_console(); } @@ -582,6 +592,8 @@ static int 
enter_state(suspend_state_t state) ksys_sync_helper(); trace_suspend_resume(TPS("sync_filesystems"), 0, false); } + if (filesystem_freeze_enabled) + filesystems_freeze(); pm_pr_dbg("Preparing system for sleep (%s)\n", mem_sleep_labels[state]); pm_suspend_clear_flags(); @@ -603,6 +615,7 @@ static int enter_state(suspend_state_t state) pm_pr_dbg("Finishing wakeup.\n"); suspend_finish(); Unlock: + filesystems_thaw(); mutex_unlock(&system_transition_mutex); return error; } diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 82b884b67152..ad13c461b657 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -12,6 +12,7 @@ #define pr_fmt(fmt) "PM: " fmt +#include <crypto/acompress.h> #include <linux/module.h> #include <linux/file.h> #include <linux/delay.h> @@ -267,35 +268,26 @@ static void hib_end_io(struct bio *bio) bio_put(bio); } -static int hib_submit_io(blk_opf_t opf, pgoff_t page_off, void *addr, +static int hib_submit_io_sync(blk_opf_t opf, pgoff_t page_off, void *addr) +{ + return bdev_rw_virt(file_bdev(hib_resume_bdev_file), + page_off * (PAGE_SIZE >> 9), addr, PAGE_SIZE, opf); +} + +static int hib_submit_io_async(blk_opf_t opf, pgoff_t page_off, void *addr, struct hib_bio_batch *hb) { - struct page *page = virt_to_page(addr); struct bio *bio; - int error = 0; bio = bio_alloc(file_bdev(hib_resume_bdev_file), 1, opf, GFP_NOIO | __GFP_HIGH); bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9); - - if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { - pr_err("Adding page to bio failed at %llu\n", - (unsigned long long)bio->bi_iter.bi_sector); - bio_put(bio); - return -EFAULT; - } - - if (hb) { - bio->bi_end_io = hib_end_io; - bio->bi_private = hb; - atomic_inc(&hb->count); - submit_bio(bio); - } else { - error = submit_bio_wait(bio); - bio_put(bio); - } - - return error; + bio_add_virt_nofail(bio, addr, PAGE_SIZE); + bio->bi_end_io = hib_end_io; + bio->bi_private = hb; + atomic_inc(&hb->count); + submit_bio(bio); + return 0; } static int 
hib_wait_io(struct hib_bio_batch *hb) @@ -315,7 +307,7 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags) { int error; - hib_submit_io(REQ_OP_READ, swsusp_resume_block, swsusp_header, NULL); + hib_submit_io_sync(REQ_OP_READ, swsusp_resume_block, swsusp_header); if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) || !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) { memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10); @@ -328,8 +320,8 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags) swsusp_header->flags = flags; if (flags & SF_CRC32_MODE) swsusp_header->crc32 = handle->crc32; - error = hib_submit_io(REQ_OP_WRITE | REQ_SYNC, - swsusp_resume_block, swsusp_header, NULL); + error = hib_submit_io_sync(REQ_OP_WRITE | REQ_SYNC, + swsusp_resume_block, swsusp_header); } else { pr_err("Swap header not found!\n"); error = -ENODEV; @@ -379,36 +371,30 @@ static int swsusp_swap_check(void) static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb) { + gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY; void *src; int ret; if (!offset) return -ENOSPC; - if (hb) { - src = (void *)__get_free_page(GFP_NOIO | __GFP_NOWARN | - __GFP_NORETRY); - if (src) { - copy_page(src, buf); - } else { - ret = hib_wait_io(hb); /* Free pages */ - if (ret) - return ret; - src = (void *)__get_free_page(GFP_NOIO | - __GFP_NOWARN | - __GFP_NORETRY); - if (src) { - copy_page(src, buf); - } else { - WARN_ON_ONCE(1); - hb = NULL; /* Go synchronous */ - src = buf; - } - } - } else { - src = buf; + if (!hb) + goto sync_io; + + src = (void *)__get_free_page(gfp); + if (!src) { + ret = hib_wait_io(hb); /* Free pages */ + if (ret) + return ret; + src = (void *)__get_free_page(gfp); + if (WARN_ON_ONCE(!src)) + goto sync_io; } - return hib_submit_io(REQ_OP_WRITE | REQ_SYNC, offset, src, hb); + + copy_page(src, buf); + return hib_submit_io_async(REQ_OP_WRITE | REQ_SYNC, offset, src, hb); +sync_io: + return hib_submit_io_sync(REQ_OP_WRITE | 
REQ_SYNC, offset, buf); } static void release_swap_writer(struct swap_map_handle *handle) @@ -635,7 +621,8 @@ static int crc32_threadfn(void *data) */ struct cmp_data { struct task_struct *thr; /* thread */ - struct crypto_comp *cc; /* crypto compressor stream */ + struct crypto_acomp *cc; /* crypto compressor */ + struct acomp_req *cr; /* crypto request */ atomic_t ready; /* ready to start flag */ atomic_t stop; /* ready to stop flag */ int ret; /* return code */ @@ -656,7 +643,6 @@ static atomic_t compressed_size = ATOMIC_INIT(0); static int compress_threadfn(void *data) { struct cmp_data *d = data; - unsigned int cmp_len = 0; while (1) { wait_event(d->go, atomic_read_acquire(&d->ready) || @@ -670,11 +656,13 @@ static int compress_threadfn(void *data) } atomic_set(&d->ready, 0); - cmp_len = CMP_SIZE - CMP_HEADER; - d->ret = crypto_comp_compress(d->cc, d->unc, d->unc_len, - d->cmp + CMP_HEADER, - &cmp_len); - d->cmp_len = cmp_len; + acomp_request_set_callback(d->cr, CRYPTO_TFM_REQ_MAY_SLEEP, + NULL, NULL); + acomp_request_set_src_nondma(d->cr, d->unc, d->unc_len); + acomp_request_set_dst_nondma(d->cr, d->cmp + CMP_HEADER, + CMP_SIZE - CMP_HEADER); + d->ret = crypto_acomp_compress(d->cr); + d->cmp_len = d->cr->dlen; atomic_set(&compressed_size, atomic_read(&compressed_size) + d->cmp_len); atomic_set_release(&d->stop, 1); @@ -745,13 +733,20 @@ static int save_compressed_image(struct swap_map_handle *handle, init_waitqueue_head(&data[thr].go); init_waitqueue_head(&data[thr].done); - data[thr].cc = crypto_alloc_comp(hib_comp_algo, 0, 0); + data[thr].cc = crypto_alloc_acomp(hib_comp_algo, 0, CRYPTO_ALG_ASYNC); if (IS_ERR_OR_NULL(data[thr].cc)) { pr_err("Could not allocate comp stream %ld\n", PTR_ERR(data[thr].cc)); ret = -EFAULT; goto out_clean; } + data[thr].cr = acomp_request_alloc(data[thr].cc); + if (!data[thr].cr) { + pr_err("Could not allocate comp request\n"); + ret = -ENOMEM; + goto out_clean; + } + data[thr].thr = kthread_run(compress_threadfn, &data[thr], 
"image_compress/%u", thr); @@ -899,8 +894,8 @@ out_clean: for (thr = 0; thr < nr_threads; thr++) { if (data[thr].thr) kthread_stop(data[thr].thr); - if (data[thr].cc) - crypto_free_comp(data[thr].cc); + acomp_request_free(data[thr].cr); + crypto_free_acomp(data[thr].cc); } vfree(data); } @@ -1031,7 +1026,7 @@ static int get_swap_reader(struct swap_map_handle *handle, return -ENOMEM; } - error = hib_submit_io(REQ_OP_READ, offset, tmp->map, NULL); + error = hib_submit_io_sync(REQ_OP_READ, offset, tmp->map); if (error) { release_swap_reader(handle); return error; @@ -1055,7 +1050,10 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf, offset = handle->cur->entries[handle->k]; if (!offset) return -EFAULT; - error = hib_submit_io(REQ_OP_READ, offset, buf, hb); + if (hb) + error = hib_submit_io_async(REQ_OP_READ, offset, buf, hb); + else + error = hib_submit_io_sync(REQ_OP_READ, offset, buf); if (error) return error; if (++handle->k >= MAP_PAGE_ENTRIES) { @@ -1142,7 +1140,8 @@ static int load_image(struct swap_map_handle *handle, */ struct dec_data { struct task_struct *thr; /* thread */ - struct crypto_comp *cc; /* crypto compressor stream */ + struct crypto_acomp *cc; /* crypto compressor */ + struct acomp_req *cr; /* crypto request */ atomic_t ready; /* ready to start flag */ atomic_t stop; /* ready to stop flag */ int ret; /* return code */ @@ -1160,7 +1159,6 @@ struct dec_data { static int decompress_threadfn(void *data) { struct dec_data *d = data; - unsigned int unc_len = 0; while (1) { wait_event(d->go, atomic_read_acquire(&d->ready) || @@ -1174,10 +1172,13 @@ static int decompress_threadfn(void *data) } atomic_set(&d->ready, 0); - unc_len = UNC_SIZE; - d->ret = crypto_comp_decompress(d->cc, d->cmp + CMP_HEADER, d->cmp_len, - d->unc, &unc_len); - d->unc_len = unc_len; + acomp_request_set_callback(d->cr, CRYPTO_TFM_REQ_MAY_SLEEP, + NULL, NULL); + acomp_request_set_src_nondma(d->cr, d->cmp + CMP_HEADER, + d->cmp_len); + 
acomp_request_set_dst_nondma(d->cr, d->unc, UNC_SIZE); + d->ret = crypto_acomp_decompress(d->cr); + d->unc_len = d->cr->dlen; if (clean_pages_on_decompress) flush_icache_range((unsigned long)d->unc, @@ -1254,13 +1255,20 @@ static int load_compressed_image(struct swap_map_handle *handle, init_waitqueue_head(&data[thr].go); init_waitqueue_head(&data[thr].done); - data[thr].cc = crypto_alloc_comp(hib_comp_algo, 0, 0); + data[thr].cc = crypto_alloc_acomp(hib_comp_algo, 0, CRYPTO_ALG_ASYNC); if (IS_ERR_OR_NULL(data[thr].cc)) { pr_err("Could not allocate comp stream %ld\n", PTR_ERR(data[thr].cc)); ret = -EFAULT; goto out_clean; } + data[thr].cr = acomp_request_alloc(data[thr].cc); + if (!data[thr].cr) { + pr_err("Could not allocate comp request\n"); + ret = -ENOMEM; + goto out_clean; + } + data[thr].thr = kthread_run(decompress_threadfn, &data[thr], "image_decompress/%u", thr); @@ -1507,8 +1515,8 @@ out_clean: for (thr = 0; thr < nr_threads; thr++) { if (data[thr].thr) kthread_stop(data[thr].thr); - if (data[thr].cc) - crypto_free_comp(data[thr].cc); + acomp_request_free(data[thr].cr); + crypto_free_acomp(data[thr].cc); } vfree(data); } @@ -1570,8 +1578,8 @@ int swsusp_check(bool exclusive) BLK_OPEN_READ, holder, NULL); if (!IS_ERR(hib_resume_bdev_file)) { clear_page(swsusp_header); - error = hib_submit_io(REQ_OP_READ, swsusp_resume_block, - swsusp_header, NULL); + error = hib_submit_io_sync(REQ_OP_READ, swsusp_resume_block, + swsusp_header); if (error) goto put; @@ -1579,9 +1587,9 @@ int swsusp_check(bool exclusive) memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10); swsusp_header_flags = swsusp_header->flags; /* Reset swap signature now */ - error = hib_submit_io(REQ_OP_WRITE | REQ_SYNC, + error = hib_submit_io_sync(REQ_OP_WRITE | REQ_SYNC, swsusp_resume_block, - swsusp_header, NULL); + swsusp_header); } else { error = -EINVAL; } @@ -1630,13 +1638,12 @@ int swsusp_unmark(void) { int error; - hib_submit_io(REQ_OP_READ, swsusp_resume_block, - swsusp_header, NULL); + 
hib_submit_io_sync(REQ_OP_READ, swsusp_resume_block, swsusp_header); if (!memcmp(HIBERNATE_SIG,swsusp_header->sig, 10)) { memcpy(swsusp_header->sig,swsusp_header->orig_sig, 10); - error = hib_submit_io(REQ_OP_WRITE | REQ_SYNC, + error = hib_submit_io_sync(REQ_OP_WRITE | REQ_SYNC, swsusp_resume_block, - swsusp_header, NULL); + swsusp_header); } else { pr_err("Cannot find swsusp signature!\n"); error = -ENODEV; diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c index 52571dcad768..4e941999a53b 100644 --- a/kernel/power/wakelock.c +++ b/kernel/power/wakelock.c @@ -49,6 +49,9 @@ ssize_t pm_show_wakelocks(char *buf, bool show_active) len += sysfs_emit_at(buf, len, "%s ", wl->name); } + if (len > 0) + --len; + len += sysfs_emit_at(buf, len, "\n"); mutex_unlock(&wakelocks_lock); diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index a91bdf802967..48a24e7b309d 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -64,6 +64,7 @@ struct dev_printk_info; extern struct printk_ringbuffer *prb; extern bool printk_kthreads_running; +extern bool debug_non_panic_cpus; __printf(4, 0) int vprintk_store(int facility, int level, diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 057db78876cd..1eea80d0648e 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2375,6 +2375,22 @@ void printk_legacy_allow_panic_sync(void) } } +bool __read_mostly debug_non_panic_cpus; + +#ifdef CONFIG_PRINTK_CALLER +static int __init debug_non_panic_cpus_setup(char *str) +{ + debug_non_panic_cpus = true; + pr_info("allow messages from non-panic CPUs in panic()\n"); + + return 0; +} +early_param("debug_non_panic_cpus", debug_non_panic_cpus_setup); +module_param(debug_non_panic_cpus, bool, 0644); +MODULE_PARM_DESC(debug_non_panic_cpus, + "allow messages from non-panic CPUs in panic()"); +#endif + asmlinkage int vprintk_emit(int facility, int level, const struct dev_printk_info *dev_info, const char *fmt, va_list args) @@ -2391,7 
+2407,9 @@ asmlinkage int vprintk_emit(int facility, int level, * non-panic CPUs are generating any messages, they will be * silently dropped. */ - if (other_cpu_in_panic() && !panic_triggering_all_cpu_backtrace) + if (other_cpu_in_panic() && + !debug_non_panic_cpus && + !panic_triggering_all_cpu_backtrace) return 0; printk_get_console_flush_type(&ft); @@ -2731,11 +2749,11 @@ module_param_named(console_no_auto_verbose, printk_console_no_auto_verbose, bool MODULE_PARM_DESC(console_no_auto_verbose, "Disable console loglevel raise to highest on oops/panic/etc"); /** - * suspend_console - suspend the console subsystem + * console_suspend_all - suspend the console subsystem * * This disables printk() while we go into suspend states */ -void suspend_console(void) +void console_suspend_all(void) { struct console *con; @@ -2758,7 +2776,7 @@ void suspend_console(void) synchronize_srcu(&console_srcu); } -void resume_console(void) +void console_resume_all(void) { struct console_flush_type ft; struct console *con; @@ -3340,7 +3358,12 @@ void console_unblank(void) */ cookie = console_srcu_read_lock(); for_each_console_srcu(c) { - if ((console_srcu_read_flags(c) & CON_ENABLED) && c->unblank) { + short flags = console_srcu_read_flags(c); + + if (flags & CON_SUSPENDED) + continue; + + if ((flags & CON_ENABLED) && c->unblank) { found_unblank = true; break; } @@ -3377,7 +3400,12 @@ void console_unblank(void) cookie = console_srcu_read_lock(); for_each_console_srcu(c) { - if ((console_srcu_read_flags(c) & CON_ENABLED) && c->unblank) + short flags = console_srcu_read_flags(c); + + if (flags & CON_SUSPENDED) + continue; + + if ((flags & CON_ENABLED) && c->unblank) c->unblank(); } console_srcu_read_unlock(cookie); @@ -3495,10 +3523,10 @@ struct tty_driver *console_device(int *index) /* * Prevent further output on the passed console device so that (for example) - * serial drivers can disable console output before suspending a port, and can + * serial drivers can suspend console output 
before suspending a port, and can * re-enable output afterwards. */ -void console_stop(struct console *console) +void console_suspend(struct console *console) { __pr_flush(console, 1000, true); console_list_lock(); @@ -3513,9 +3541,9 @@ void console_stop(struct console *console) */ synchronize_srcu(&console_srcu); } -EXPORT_SYMBOL(console_stop); +EXPORT_SYMBOL(console_suspend); -void console_start(struct console *console) +void console_resume(struct console *console) { struct console_flush_type ft; bool is_nbcon; @@ -3540,7 +3568,7 @@ void console_start(struct console *console) __pr_flush(console, 1000, true); } -EXPORT_SYMBOL(console_start); +EXPORT_SYMBOL(console_resume); #ifdef CONFIG_PRINTK static int unregister_console_locked(struct console *console); @@ -4275,6 +4303,11 @@ void __init console_init(void) initcall_t call; initcall_entry_t *ce; +#ifdef CONFIG_NULL_TTY_DEFAULT_CONSOLE + if (!console_set_on_cmdline) + add_preferred_console("ttynull", 0, NULL); +#endif + /* Setup the default TTY line discipline. */ n_tty_init(); diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c index 88e8f3a61922..d9fb053cff67 100644 --- a/kernel/printk/printk_ringbuffer.c +++ b/kernel/printk/printk_ringbuffer.c @@ -2133,9 +2133,9 @@ static bool _prb_read_valid(struct printk_ringbuffer *rb, u64 *seq, * there may be other finalized records beyond that * need to be printed for a panic situation. If this * is the panic CPU, skip this - * non-existent/non-finalized record unless it is - * at or beyond the head, in which case it is not - * possible to continue. + * non-existent/non-finalized record unless non-panic + * CPUs are still running and their debugging is + * explicitly enabled. * * Note that new messages printed on panic CPU are * finalized when we are here. 
The only exception @@ -2143,10 +2143,13 @@ static bool _prb_read_valid(struct printk_ringbuffer *rb, u64 *seq, * But it would have the sequence number returned * by "prb_next_reserve_seq() - 1". */ - if (this_cpu_in_panic() && ((*seq + 1) < prb_next_reserve_seq(rb))) + if (this_cpu_in_panic() && + (!debug_non_panic_cpus || legacy_allow_panic_sync) && + ((*seq + 1) < prb_next_reserve_seq(rb))) { (*seq)++; - else + } else { return false; + } } } diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index aa42de4d2768..4d9b21f69eaa 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -68,6 +68,8 @@ config TREE_SRCU config FORCE_NEED_SRCU_NMI_SAFE bool "Force selection of NEED_SRCU_NMI_SAFE" depends on !TINY_SRCU + depends on RCU_EXPERT + depends on ARCH_HAS_NMI_SAFE_THIS_CPU_OPS select NEED_SRCU_NMI_SAFE default n help diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index eed2951a4962..9cf01832a6c3 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -57,6 +57,9 @@ /* Low-order bit definition for polled grace-period APIs. */ #define RCU_GET_STATE_COMPLETED 0x1 +/* A complete grace period count */ +#define RCU_SEQ_GP (RCU_SEQ_STATE_MASK + 1) + extern int sysctl_sched_rt_runtime; /* @@ -157,12 +160,21 @@ static inline bool rcu_seq_done(unsigned long *sp, unsigned long s) * Given a snapshot from rcu_seq_snap(), determine whether or not a * full update-side operation has occurred, but do not allow the * (ULONG_MAX / 2) safety-factor/guard-band. + * + * The token returned by get_state_synchronize_rcu_full() is based on + * rcu_state.gp_seq but it is tested in poll_state_synchronize_rcu_full() + * against the root rnp->gp_seq. Since rcu_seq_start() is first called + * on rcu_state.gp_seq and only later reflected on the root rnp->gp_seq, + * it is possible that rcu_seq_snap(rcu_state.gp_seq) returns 2 full grace + * periods ahead of the root rnp->gp_seq. 
To prevent false-positives with the + * full polling API that a wrap around instantly completed the GP, when nothing + * like that happened, adjust for the 2 GPs in the ULONG_CMP_LT(). */ static inline bool rcu_seq_done_exact(unsigned long *sp, unsigned long s) { unsigned long cur_s = READ_ONCE(*sp); - return ULONG_CMP_GE(cur_s, s) || ULONG_CMP_LT(cur_s, s - (3 * RCU_SEQ_STATE_MASK + 1)); + return ULONG_CMP_GE(cur_s, s) || ULONG_CMP_LT(cur_s, s - (2 * RCU_SEQ_GP)); } /* @@ -572,6 +584,8 @@ void do_trace_rcu_torture_read(const char *rcutorturename, unsigned long c_old, unsigned long c); void rcu_gp_set_torture_wait(int duration); +void rcu_set_gpwrap_lag(unsigned long lag); +int rcu_get_gpwrap_count(int cpu); #else static inline void rcutorture_get_gp_data(int *flags, unsigned long *gp_seq) { @@ -589,6 +603,8 @@ void do_trace_rcu_torture_read(const char *rcutorturename, do { } while (0) #endif static inline void rcu_gp_set_torture_wait(int duration) { } +static inline void rcu_set_gpwrap_lag(unsigned long lag) { } +static inline int rcu_get_gpwrap_count(int cpu) { return 0; } #endif unsigned long long rcutorture_gather_gp_seqs(void); void rcutorture_format_gp_seqs(unsigned long long seqs, char *cp, size_t len); diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index 0f3059b1b80d..b521d0455992 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -762,7 +762,7 @@ kfree_scale_thread(void *arg) } for (i = 0; i < kfree_alloc_num; i++) { - alloc_ptr = kmalloc(kfree_mult * sizeof(struct kfree_obj), GFP_KERNEL); + alloc_ptr = kcalloc(kfree_mult, sizeof(struct kfree_obj), GFP_KERNEL); if (!alloc_ptr) return -ENOMEM; diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 65095664f5c5..70ec0f21abc3 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -115,6 +115,10 @@ torture_param(int, nreaders, -1, "Number of RCU reader threads"); torture_param(int, object_debug, 0, "Enable debug-object double call_rcu() testing"); 
torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)"); torture_param(int, onoff_interval, 0, "Time between CPU hotplugs (jiffies), 0=disable"); +torture_param(bool, gpwrap_lag, true, "Enable grace-period wrap lag testing"); +torture_param(int, gpwrap_lag_gps, 8, "Value to set for set_gpwrap_lag during an active testing period."); +torture_param(int, gpwrap_lag_cycle_mins, 30, "Total cycle duration for gpwrap lag testing (in minutes)"); +torture_param(int, gpwrap_lag_active_mins, 5, "Duration for which gpwrap lag is active within each cycle (in minutes)"); torture_param(int, nocbs_nthreads, 0, "Number of NOCB toggle threads, 0 to disable"); torture_param(int, nocbs_toggle, 1000, "Time between toggling nocb state (ms)"); torture_param(int, preempt_duration, 0, "Preemption duration (ms), zero to disable"); @@ -413,6 +417,8 @@ struct rcu_torture_ops { bool (*reader_blocked)(void); unsigned long long (*gather_gp_seqs)(void); void (*format_gp_seqs)(unsigned long long seqs, char *cp, size_t len); + void (*set_gpwrap_lag)(unsigned long lag); + int (*get_gpwrap_count)(int cpu); long cbflood_max; int irq_capable; int can_boost; @@ -619,6 +625,8 @@ static struct rcu_torture_ops rcu_ops = { : NULL, .gather_gp_seqs = rcutorture_gather_gp_seqs, .format_gp_seqs = rcutorture_format_gp_seqs, + .set_gpwrap_lag = rcu_set_gpwrap_lag, + .get_gpwrap_count = rcu_get_gpwrap_count, .irq_capable = 1, .can_boost = IS_ENABLED(CONFIG_RCU_BOOST), .extendables = RCUTORTURE_MAX_EXTEND, @@ -2164,53 +2172,70 @@ rcutorture_loop_extend(int *readstate, bool insoftirq, struct torture_random_sta return &rtrsp[j]; } -/* - * Do one read-side critical section, returning false if there was - * no data to read. Can be invoked both from process context and - * from a timer handler. 
- */ -static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid) -{ - bool checkpolling = !(torture_random(trsp) & 0xfff); +struct rcu_torture_one_read_state { + bool checkpolling; unsigned long cookie; struct rcu_gp_oldstate cookie_full; - int i; unsigned long started; - unsigned long completed; - int newstate; struct rcu_torture *p; - int pipe_count; - bool preempted = false; - int readstate = 0; - struct rt_read_seg rtseg[RCUTORTURE_RDR_MAX_SEGS] = { { 0 } }; - struct rt_read_seg *rtrsp = &rtseg[0]; - struct rt_read_seg *rtrsp1; + int readstate; + struct rt_read_seg rtseg[RCUTORTURE_RDR_MAX_SEGS]; + struct rt_read_seg *rtrsp; unsigned long long ts; +}; - WARN_ON_ONCE(!rcu_is_watching()); - newstate = rcutorture_extend_mask(readstate, trsp); - rcutorture_one_extend(&readstate, newstate, myid < 0, trsp, rtrsp++); - if (checkpolling) { +static void init_rcu_torture_one_read_state(struct rcu_torture_one_read_state *rtorsp, + struct torture_random_state *trsp) +{ + memset(rtorsp, 0, sizeof(*rtorsp)); + rtorsp->checkpolling = !(torture_random(trsp) & 0xfff); + rtorsp->rtrsp = &rtorsp->rtseg[0]; +} + +/* + * Set up the first segment of a series of overlapping read-side + * critical sections. The caller must have actually initiated the + * outermost read-side critical section. 
+ */ +static bool rcu_torture_one_read_start(struct rcu_torture_one_read_state *rtorsp, + struct torture_random_state *trsp, long myid) +{ + if (rtorsp->checkpolling) { if (cur_ops->get_gp_state && cur_ops->poll_gp_state) - cookie = cur_ops->get_gp_state(); + rtorsp->cookie = cur_ops->get_gp_state(); if (cur_ops->get_gp_state_full && cur_ops->poll_gp_state_full) - cur_ops->get_gp_state_full(&cookie_full); + cur_ops->get_gp_state_full(&rtorsp->cookie_full); } - started = cur_ops->get_gp_seq(); - ts = rcu_trace_clock_local(); - p = rcu_dereference_check(rcu_torture_current, + rtorsp->started = cur_ops->get_gp_seq(); + rtorsp->ts = rcu_trace_clock_local(); + rtorsp->p = rcu_dereference_check(rcu_torture_current, !cur_ops->readlock_held || cur_ops->readlock_held()); - if (p == NULL) { + if (rtorsp->p == NULL) { /* Wait for rcu_torture_writer to get underway */ - rcutorture_one_extend(&readstate, 0, myid < 0, trsp, rtrsp); + rcutorture_one_extend(&rtorsp->readstate, 0, myid < 0, trsp, rtorsp->rtrsp); return false; } - if (p->rtort_mbtest == 0) + if (rtorsp->p->rtort_mbtest == 0) atomic_inc(&n_rcu_torture_mberror); - rcu_torture_reader_do_mbchk(myid, p, trsp); - rtrsp = rcutorture_loop_extend(&readstate, myid < 0, trsp, rtrsp); + rcu_torture_reader_do_mbchk(myid, rtorsp->p, trsp); + return true; +} + +/* + * Complete the last segment of a series of overlapping read-side + * critical sections and check for errors. + */ +static void rcu_torture_one_read_end(struct rcu_torture_one_read_state *rtorsp, + struct torture_random_state *trsp, long myid) +{ + int i; + unsigned long completed; + int pipe_count; + bool preempted = false; + struct rt_read_seg *rtrsp1; + preempt_disable(); - pipe_count = READ_ONCE(p->rtort_pipe_count); + pipe_count = READ_ONCE(rtorsp->p->rtort_pipe_count); if (pipe_count > RCU_TORTURE_PIPE_LEN) { // Should not happen in a correct RCU implementation, // happens quite often for torture_type=busted. 
@@ -2218,28 +2243,28 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid) } completed = cur_ops->get_gp_seq(); if (pipe_count > 1) { - do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, - ts, started, completed); + do_trace_rcu_torture_read(cur_ops->name, &rtorsp->p->rtort_rcu, + rtorsp->ts, rtorsp->started, completed); rcu_ftrace_dump(DUMP_ALL); } __this_cpu_inc(rcu_torture_count[pipe_count]); - completed = rcutorture_seq_diff(completed, started); + completed = rcutorture_seq_diff(completed, rtorsp->started); if (completed > RCU_TORTURE_PIPE_LEN) { /* Should not happen, but... */ completed = RCU_TORTURE_PIPE_LEN; } __this_cpu_inc(rcu_torture_batch[completed]); preempt_enable(); - if (checkpolling) { + if (rtorsp->checkpolling) { if (cur_ops->get_gp_state && cur_ops->poll_gp_state) - WARN_ONCE(cur_ops->poll_gp_state(cookie), + WARN_ONCE(cur_ops->poll_gp_state(rtorsp->cookie), "%s: Cookie check 2 failed %s(%d) %lu->%lu\n", __func__, rcu_torture_writer_state_getname(), rcu_torture_writer_state, - cookie, cur_ops->get_gp_state()); + rtorsp->cookie, cur_ops->get_gp_state()); if (cur_ops->get_gp_state_full && cur_ops->poll_gp_state_full) - WARN_ONCE(cur_ops->poll_gp_state_full(&cookie_full), + WARN_ONCE(cur_ops->poll_gp_state_full(&rtorsp->cookie_full), "%s: Cookie check 6 failed %s(%d) online %*pbl\n", __func__, rcu_torture_writer_state_getname(), @@ -2248,21 +2273,42 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid) } if (cur_ops->reader_blocked) preempted = cur_ops->reader_blocked(); - rcutorture_one_extend(&readstate, 0, myid < 0, trsp, rtrsp); - WARN_ON_ONCE(readstate); + rcutorture_one_extend(&rtorsp->readstate, 0, myid < 0, trsp, rtorsp->rtrsp); + WARN_ON_ONCE(rtorsp->readstate); // This next splat is expected behavior if leakpointer, especially // for CONFIG_RCU_STRICT_GRACE_PERIOD=y kernels. 
- WARN_ON_ONCE(leakpointer && READ_ONCE(p->rtort_pipe_count) > 1); + WARN_ON_ONCE(leakpointer && READ_ONCE(rtorsp->p->rtort_pipe_count) > 1); /* If error or close call, record the sequence of reader protections. */ if ((pipe_count > 1 || completed > 1) && !xchg(&err_segs_recorded, 1)) { i = 0; - for (rtrsp1 = &rtseg[0]; rtrsp1 < rtrsp; rtrsp1++) + for (rtrsp1 = &rtorsp->rtseg[0]; rtrsp1 < rtorsp->rtrsp; rtrsp1++) err_segs[i++] = *rtrsp1; rt_read_nsegs = i; rt_read_preempted = preempted; } +} +/* + * Do one read-side critical section, returning false if there was + * no data to read. Can be invoked both from process context and + * from a timer handler. + */ +static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid) +{ + int newstate; + struct rcu_torture_one_read_state rtors; + + WARN_ON_ONCE(!rcu_is_watching()); + init_rcu_torture_one_read_state(&rtors, trsp); + newstate = rcutorture_extend_mask(rtors.readstate, trsp); + rcutorture_one_extend(&rtors.readstate, newstate, myid < 0, trsp, rtors.rtrsp++); + if (!rcu_torture_one_read_start(&rtors, trsp, myid)) { + rcutorture_one_extend(&rtors.readstate, 0, myid < 0, trsp, rtors.rtrsp); + return false; + } + rtors.rtrsp = rcutorture_loop_extend(&rtors.readstate, myid < 0, trsp, rtors.rtrsp); + rcu_torture_one_read_end(&rtors, trsp, myid); return true; } @@ -2307,7 +2353,7 @@ rcu_torture_reader(void *arg) set_user_nice(current, MAX_NICE); if (irqreader && cur_ops->irq_capable) timer_setup_on_stack(&t, rcu_torture_timer, 0); - tick_dep_set_task(current, TICK_DEP_BIT_RCU); + tick_dep_set_task(current, TICK_DEP_BIT_RCU); // CPU bound, so need tick. 
do { if (irqreader && cur_ops->irq_capable) { if (!timer_pending(&t)) @@ -2324,8 +2370,8 @@ rcu_torture_reader(void *arg) stutter_wait("rcu_torture_reader"); } while (!torture_must_stop()); if (irqreader && cur_ops->irq_capable) { - del_timer_sync(&t); - destroy_timer_on_stack(&t); + timer_delete_sync(&t); + timer_destroy_on_stack(&t); } tick_dep_clear_task(current, TICK_DEP_BIT_RCU); torture_kthread_stopping("rcu_torture_reader"); @@ -2394,6 +2440,7 @@ rcu_torture_stats_print(void) int i; long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; + long n_gpwraps = 0; struct rcu_torture *rtcp; static unsigned long rtcv_snap = ULONG_MAX; static bool splatted; @@ -2404,6 +2451,8 @@ rcu_torture_stats_print(void) pipesummary[i] += READ_ONCE(per_cpu(rcu_torture_count, cpu)[i]); batchsummary[i] += READ_ONCE(per_cpu(rcu_torture_batch, cpu)[i]); } + if (cur_ops->get_gpwrap_count) + n_gpwraps += cur_ops->get_gpwrap_count(cpu); } for (i = RCU_TORTURE_PIPE_LEN; i >= 0; i--) { if (pipesummary[i] != 0) @@ -2435,8 +2484,9 @@ rcu_torture_stats_print(void) data_race(n_barrier_attempts), data_race(n_rcu_torture_barrier_error)); pr_cont("read-exits: %ld ", data_race(n_read_exits)); // Statistic. - pr_cont("nocb-toggles: %ld:%ld\n", + pr_cont("nocb-toggles: %ld:%ld ", atomic_long_read(&n_nocb_offload), atomic_long_read(&n_nocb_deoffload)); + pr_cont("gpwraps: %ld\n", n_gpwraps); pr_alert("%s%s ", torture_type, TORTURE_FLAG); if (atomic_read(&n_rcu_torture_mberror) || @@ -3036,7 +3086,7 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp) cver = READ_ONCE(rcu_torture_current_version); gps = cur_ops->get_gp_seq(); rfp->rcu_launder_gp_seq_start = gps; - tick_dep_set_task(current, TICK_DEP_BIT_RCU); + tick_dep_set_task(current, TICK_DEP_BIT_RCU); // CPU bound, so need tick. 
while (time_before(jiffies, stopat) && !shutdown_time_arrived() && !READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) { @@ -3607,6 +3657,57 @@ static int rcu_torture_preempt(void *unused) static enum cpuhp_state rcutor_hp; +static struct hrtimer gpwrap_lag_timer; +static bool gpwrap_lag_active; + +/* Timer handler for toggling RCU grace-period sequence overflow test lag value */ +static enum hrtimer_restart rcu_gpwrap_lag_timer(struct hrtimer *timer) +{ + ktime_t next_delay; + + if (gpwrap_lag_active) { + pr_alert("rcu-torture: Disabling gpwrap lag (value=0)\n"); + cur_ops->set_gpwrap_lag(0); + gpwrap_lag_active = false; + next_delay = ktime_set((gpwrap_lag_cycle_mins - gpwrap_lag_active_mins) * 60, 0); + } else { + pr_alert("rcu-torture: Enabling gpwrap lag (value=%d)\n", gpwrap_lag_gps); + cur_ops->set_gpwrap_lag(gpwrap_lag_gps); + gpwrap_lag_active = true; + next_delay = ktime_set(gpwrap_lag_active_mins * 60, 0); + } + + if (torture_must_stop_irq()) + return HRTIMER_NORESTART; + + hrtimer_forward_now(timer, next_delay); + return HRTIMER_RESTART; +} + +static int rcu_gpwrap_lag_init(void) +{ + if (!gpwrap_lag) + return 0; + + if (gpwrap_lag_cycle_mins <= 0 || gpwrap_lag_active_mins <= 0) { + pr_alert("rcu-torture: lag timing parameters must be positive\n"); + return -EINVAL; + } + + hrtimer_setup(&gpwrap_lag_timer, rcu_gpwrap_lag_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + gpwrap_lag_active = false; + hrtimer_start(&gpwrap_lag_timer, + ktime_set((gpwrap_lag_cycle_mins - gpwrap_lag_active_mins) * 60, 0), HRTIMER_MODE_REL); + + return 0; +} + +static void rcu_gpwrap_lag_cleanup(void) +{ + hrtimer_cancel(&gpwrap_lag_timer); + cur_ops->set_gpwrap_lag(0); + gpwrap_lag_active = false; +} static void rcu_torture_cleanup(void) { @@ -3776,6 +3877,9 @@ rcu_torture_cleanup(void) torture_cleanup_end(); if (cur_ops->gp_slow_unregister) cur_ops->gp_slow_unregister(NULL); + + if (gpwrap_lag && cur_ops->set_gpwrap_lag) + rcu_gpwrap_lag_cleanup(); } static void 
rcu_torture_leak_cb(struct rcu_head *rhp) @@ -4272,9 +4376,17 @@ rcu_torture_init(void) } if (object_debug) rcu_test_debug_objects(); - torture_init_end(); + if (cur_ops->gp_slow_register && !WARN_ON_ONCE(!cur_ops->gp_slow_unregister)) cur_ops->gp_slow_register(&rcu_fwd_cb_nodelay); + + if (gpwrap_lag && cur_ops->set_gpwrap_lag) { + firsterr = rcu_gpwrap_lag_init(); + if (torture_init_error(firsterr)) + goto unwind; + } + + torture_init_end(); return 0; unwind: diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index d2a694944553..48047260697e 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -690,7 +690,7 @@ void cleanup_srcu_struct(struct srcu_struct *ssp) for_each_possible_cpu(cpu) { struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu); - del_timer_sync(&sdp->delay_work); + timer_delete_sync(&sdp->delay_work); flush_work(&sdp->work); if (WARN_ON(rcu_segcblist_n_cbs(&sdp->srcu_cblist))) return; /* Forgot srcu_barrier(), so just leak it! */ @@ -1589,7 +1589,7 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_srcu); bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie) { if (cookie != SRCU_GET_STATE_COMPLETED && - !rcu_seq_done(&ssp->srcu_sup->srcu_gp_seq, cookie)) + !rcu_seq_done_exact(&ssp->srcu_sup->srcu_gp_seq, cookie)) return false; // Ensure that the end of the SRCU grace period happens before // any subsequent code that the caller might execute. diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 466668eb4fad..c0cc7ae41106 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1086,7 +1086,7 @@ static void rcu_tasks_postscan(struct list_head *hop) } if (!IS_ENABLED(CONFIG_TINY_RCU)) - del_timer_sync(&tasks_rcu_exit_srcu_stall_timer); + timer_delete_sync(&tasks_rcu_exit_srcu_stall_timer); } /* See if tasks are still holding out, complain if so. 
*/ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 659f83e71048..e8a4b720d7d2 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -80,6 +80,15 @@ static void rcu_sr_normal_gp_cleanup_work(struct work_struct *); static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = { .gpwrap = true, }; + +int rcu_get_gpwrap_count(int cpu) +{ + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + + return READ_ONCE(rdp->gpwrap_count); +} +EXPORT_SYMBOL_GPL(rcu_get_gpwrap_count); + static struct rcu_state rcu_state = { .level = { &rcu_state.node[0] }, .gp_state = RCU_GP_IDLE, @@ -757,6 +766,25 @@ void rcu_request_urgent_qs_task(struct task_struct *t) smp_store_release(per_cpu_ptr(&rcu_data.rcu_urgent_qs, cpu), true); } +static unsigned long seq_gpwrap_lag = ULONG_MAX / 4; + +/** + * rcu_set_gpwrap_lag - Set RCU GP sequence overflow lag value. + * @lag_gps: Set overflow lag to this many grace period worth of counters + * which is used by rcutorture to quickly force a gpwrap situation. + * @lag_gps = 0 means we reset it back to the boot-time value. + */ +void rcu_set_gpwrap_lag(unsigned long lag_gps) +{ + unsigned long lag_seq_count; + + lag_seq_count = (lag_gps == 0) + ? 
ULONG_MAX / 4 + : lag_gps << RCU_SEQ_CTR_SHIFT; + WRITE_ONCE(seq_gpwrap_lag, lag_seq_count); +} +EXPORT_SYMBOL_GPL(rcu_set_gpwrap_lag); + /* * When trying to report a quiescent state on behalf of some other CPU, * it is our responsibility to check for and handle potential overflow @@ -767,9 +795,11 @@ void rcu_request_urgent_qs_task(struct task_struct *t) static void rcu_gpnum_ovf(struct rcu_node *rnp, struct rcu_data *rdp) { raw_lockdep_assert_held_rcu_node(rnp); - if (ULONG_CMP_LT(rcu_seq_current(&rdp->gp_seq) + ULONG_MAX / 4, - rnp->gp_seq)) + if (ULONG_CMP_LT(rcu_seq_current(&rdp->gp_seq) + seq_gpwrap_lag, + rnp->gp_seq)) { WRITE_ONCE(rdp->gpwrap, true); + WRITE_ONCE(rdp->gpwrap_count, READ_ONCE(rdp->gpwrap_count) + 1); + } if (ULONG_CMP_LT(rdp->rcu_iw_gp_seq + ULONG_MAX / 4, rnp->gp_seq)) rdp->rcu_iw_gp_seq = rnp->gp_seq + ULONG_MAX / 4; } @@ -801,6 +831,10 @@ static int rcu_watching_snap_save(struct rcu_data *rdp) return 0; } +#ifndef arch_irq_stat_cpu +#define arch_irq_stat_cpu(cpu) 0 +#endif + /* * Returns positive if the specified CPU has passed through a quiescent state * by virtue of being in or having passed through an dynticks idle state since @@ -936,9 +970,9 @@ static int rcu_watching_snap_recheck(struct rcu_data *rdp) rsrp->cputime_irq = kcpustat_field(kcsp, CPUTIME_IRQ, cpu); rsrp->cputime_softirq = kcpustat_field(kcsp, CPUTIME_SOFTIRQ, cpu); rsrp->cputime_system = kcpustat_field(kcsp, CPUTIME_SYSTEM, cpu); - rsrp->nr_hardirqs = kstat_cpu_irqs_sum(rdp->cpu); - rsrp->nr_softirqs = kstat_cpu_softirqs_sum(rdp->cpu); - rsrp->nr_csw = nr_context_switches_cpu(rdp->cpu); + rsrp->nr_hardirqs = kstat_cpu_irqs_sum(cpu) + arch_irq_stat_cpu(cpu); + rsrp->nr_softirqs = kstat_cpu_softirqs_sum(cpu); + rsrp->nr_csw = nr_context_switches_cpu(cpu); rsrp->jiffies = jiffies; rsrp->gp_seq = rdp->gp_seq; } @@ -1060,38 +1094,6 @@ static bool rcu_future_gp_cleanup(struct rcu_node *rnp) return needmore; } -static void swake_up_one_online_ipi(void *arg) -{ - struct 
swait_queue_head *wqh = arg; - - swake_up_one(wqh); -} - -static void swake_up_one_online(struct swait_queue_head *wqh) -{ - int cpu = get_cpu(); - - /* - * If called from rcutree_report_cpu_starting(), wake up - * is dangerous that late in the CPU-down hotplug process. The - * scheduler might queue an ignored hrtimer. Defer the wake up - * to an online CPU instead. - */ - if (unlikely(cpu_is_offline(cpu))) { - int target; - - target = cpumask_any_and(housekeeping_cpumask(HK_TYPE_RCU), - cpu_online_mask); - - smp_call_function_single(target, swake_up_one_online_ipi, - wqh, 0); - put_cpu(); - } else { - put_cpu(); - swake_up_one(wqh); - } -} - /* * Awaken the grace-period kthread. Don't do a self-awaken (unless in an * interrupt or softirq handler, in which case we just might immediately @@ -1116,7 +1118,7 @@ static void rcu_gp_kthread_wake(void) return; WRITE_ONCE(rcu_state.gp_wake_time, jiffies); WRITE_ONCE(rcu_state.gp_wake_seq, READ_ONCE(rcu_state.gp_seq)); - swake_up_one_online(&rcu_state.gp_wq); + swake_up_one(&rcu_state.gp_wq); } /* @@ -1798,6 +1800,7 @@ static noinline_for_stack bool rcu_gp_init(void) struct rcu_data *rdp; struct rcu_node *rnp = rcu_get_root(); bool start_new_poll; + unsigned long old_gp_seq; WRITE_ONCE(rcu_state.gp_activity, jiffies); raw_spin_lock_irq_rcu_node(rnp); @@ -1825,7 +1828,12 @@ static noinline_for_stack bool rcu_gp_init(void) */ start_new_poll = rcu_sr_normal_gp_init(); /* Record GP times before starting GP, hence rcu_seq_start(). */ + old_gp_seq = rcu_state.gp_seq; rcu_seq_start(&rcu_state.gp_seq); + /* Ensure that rcu_seq_done_exact() guardband doesn't give false positives. 
*/ + WARN_ON_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && + rcu_seq_done_exact(&old_gp_seq, rcu_seq_snap(&rcu_state.gp_seq))); + ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq); trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("start")); rcu_poll_gp_seq_start(&rcu_state.gp_seq_polled_snap); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index a9a811d9d7a3..3830c19cf2f6 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -168,7 +168,7 @@ struct rcu_snap_record { u64 cputime_irq; /* Accumulated cputime of hard irqs */ u64 cputime_softirq;/* Accumulated cputime of soft irqs */ u64 cputime_system; /* Accumulated cputime of kernel tasks */ - unsigned long nr_hardirqs; /* Accumulated number of hard irqs */ + u64 nr_hardirqs; /* Accumulated number of hard irqs */ unsigned int nr_softirqs; /* Accumulated number of soft irqs */ unsigned long long nr_csw; /* Accumulated number of task switches */ unsigned long jiffies; /* Track jiffies value */ @@ -183,6 +183,7 @@ struct rcu_data { bool core_needs_qs; /* Core waits for quiescent state. */ bool beenonline; /* CPU online at least once. */ bool gpwrap; /* Possible ->gp_seq wrap. */ + unsigned int gpwrap_count; /* Count of GP sequence wrap. */ bool cpu_started; /* RCU watching this onlining CPU. */ struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ unsigned long grpmask; /* Mask to apply to leaf qsmask. 
*/ diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 8d4895c854c5..c36c7d5575ca 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -200,7 +200,7 @@ static void __rcu_report_exp_rnp(struct rcu_node *rnp, if (rnp->parent == NULL) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); if (wake) - swake_up_one_online(&rcu_state.expedited_wq); + swake_up_one(&rcu_state.expedited_wq); break; } diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 5ff3bc56ff51..1596812f7f12 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -206,7 +206,7 @@ static bool __wake_nocb_gp(struct rcu_data *rdp_gp, if (rdp_gp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) { WRITE_ONCE(rdp_gp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); - del_timer(&rdp_gp->nocb_timer); + timer_delete(&rdp_gp->nocb_timer); } if (force || READ_ONCE(rdp_gp->nocb_gp_sleep)) { @@ -216,7 +216,7 @@ static bool __wake_nocb_gp(struct rcu_data *rdp_gp, raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags); if (needwake) { trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DoWake")); - swake_up_one_online(&rdp_gp->nocb_gp_wq); + swake_up_one(&rdp_gp->nocb_gp_wq); } return needwake; @@ -554,19 +554,13 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone, rcu_nocb_unlock(rdp); wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_LAZY, TPS("WakeLazy")); - } else if (!irqs_disabled_flags(flags) && cpu_online(rdp->cpu)) { + } else if (!irqs_disabled_flags(flags)) { /* ... if queue was empty ... */ rcu_nocb_unlock(rdp); wake_nocb_gp(rdp, false); trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeEmpty")); } else { - /* - * Don't do the wake-up upfront on fragile paths. - * Also offline CPUs can't call swake_up_one_online() from - * (soft-)IRQs. 
Rely on the final deferred wake-up from - * rcutree_report_cpu_dead() - */ rcu_nocb_unlock(rdp); wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE, TPS("WakeEmptyIsDeferred")); @@ -822,7 +816,7 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) if (my_rdp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) { WRITE_ONCE(my_rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); - del_timer(&my_rdp->nocb_timer); + timer_delete(&my_rdp->nocb_timer); } WRITE_ONCE(my_rdp->nocb_gp_sleep, true); raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 3c0bbbbb686f..0b0f56f6abc8 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -29,7 +29,7 @@ static bool rcu_rdp_is_offloaded(struct rcu_data *rdp) (IS_ENABLED(CONFIG_HOTPLUG_CPU) && lockdep_is_cpus_held()) || lockdep_is_held(&rdp->nocb_lock) || lockdep_is_held(&rcu_state.nocb_mutex) || - (!(IS_ENABLED(CONFIG_PREEMPT_COUNT) && preemptible()) && + ((!(IS_ENABLED(CONFIG_PREEMPT_COUNT) && preemptible()) || softirq_count()) && rdp == this_cpu_ptr(&rcu_data)) || rcu_current_is_nocb_kthread(rdp)), "Unsafe read of RCU_NOCB offloaded state" diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 925fcdad5dea..56b21219442b 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -435,8 +435,8 @@ static void print_cpu_stat_info(int cpu) rsr.cputime_system = kcpustat_field(kcsp, CPUTIME_SYSTEM, cpu); pr_err("\t hardirqs softirqs csw/system\n"); - pr_err("\t number: %8ld %10d %12lld\n", - kstat_cpu_irqs_sum(cpu) - rsrp->nr_hardirqs, + pr_err("\t number: %8lld %10d %12lld\n", + kstat_cpu_irqs_sum(cpu) + arch_irq_stat_cpu(cpu) - rsrp->nr_hardirqs, kstat_cpu_softirqs_sum(cpu) - rsrp->nr_softirqs, nr_context_switches_cpu(cpu) - rsrp->nr_csw); pr_err("\tcputime: %8lld %10lld %12lld ==> %d(ms)\n", diff --git a/kernel/reboot.c b/kernel/reboot.c index 41ab9e1ba357..ec087827c85c 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -36,6 +36,8 @@ enum 
reboot_mode reboot_mode DEFAULT_REBOOT_MODE; EXPORT_SYMBOL_GPL(reboot_mode); enum reboot_mode panic_reboot_mode = REBOOT_UNDEFINED; +static enum hw_protection_action hw_protection_action = HWPROT_ACT_SHUTDOWN; + /* * This variable is used privately to keep track of whether or not * reboot_type is still set to its default value (i.e., reboot= hasn't @@ -229,6 +231,9 @@ EXPORT_SYMBOL(unregister_restart_handler); /** * do_kernel_restart - Execute kernel restart handler call chain * + * @cmd: pointer to buffer containing command to execute for restart + * or %NULL + * * Calls functions registered with register_restart_handler. * * Expected to be called from machine_restart as last step of the restart @@ -933,61 +938,86 @@ void orderly_reboot(void) } EXPORT_SYMBOL_GPL(orderly_reboot); +static const char *hw_protection_action_str(enum hw_protection_action action) +{ + switch (action) { + case HWPROT_ACT_SHUTDOWN: + return "shutdown"; + case HWPROT_ACT_REBOOT: + return "reboot"; + default: + return "undefined"; + } +} + +static enum hw_protection_action hw_failure_emergency_action; + /** - * hw_failure_emergency_poweroff_func - emergency poweroff work after a known delay - * @work: work_struct associated with the emergency poweroff function + * hw_failure_emergency_action_func - emergency action work after a known delay + * @work: work_struct associated with the emergency action function * * This function is called in very critical situations to force - * a kernel poweroff after a configurable timeout value. + * a kernel poweroff or reboot after a configurable timeout value. */ -static void hw_failure_emergency_poweroff_func(struct work_struct *work) +static void hw_failure_emergency_action_func(struct work_struct *work) { + const char *action_str = hw_protection_action_str(hw_failure_emergency_action); + + pr_emerg("Hardware protection timed-out. Trying forced %s\n", + action_str); + /* - * We have reached here after the emergency shutdown waiting period has - * expired. 
This means orderly_poweroff has not been able to shut off - * the system for some reason. + * We have reached here after the emergency action waiting period has + * expired. This means orderly_poweroff/reboot has not been able to + * shut off the system for some reason. * - * Try to shut down the system immediately using kernel_power_off - * if populated + * Try to shut off the system immediately if possible */ - pr_emerg("Hardware protection timed-out. Trying forced poweroff\n"); - kernel_power_off(); + + if (hw_failure_emergency_action == HWPROT_ACT_REBOOT) + kernel_restart(NULL); + else + kernel_power_off(); /* * Worst of the worst case trigger emergency restart */ - pr_emerg("Hardware protection shutdown failed. Trying emergency restart\n"); + pr_emerg("Hardware protection %s failed. Trying emergency restart\n", + action_str); emergency_restart(); } -static DECLARE_DELAYED_WORK(hw_failure_emergency_poweroff_work, - hw_failure_emergency_poweroff_func); +static DECLARE_DELAYED_WORK(hw_failure_emergency_action_work, + hw_failure_emergency_action_func); /** - * hw_failure_emergency_poweroff - Trigger an emergency system poweroff + * hw_failure_emergency_schedule - Schedule an emergency system shutdown or reboot + * + * @action: The hardware protection action to be taken + * @action_delay_ms: Time in milliseconds to elapse before triggering action * * This may be called from any critical situation to trigger a system shutdown - * after a given period of time. If time is negative this is not scheduled. + * or reboot after a given period of time. + * If time is negative this is not scheduled. 
*/ -static void hw_failure_emergency_poweroff(int poweroff_delay_ms) +static void hw_failure_emergency_schedule(enum hw_protection_action action, + int action_delay_ms) { - if (poweroff_delay_ms <= 0) + if (action_delay_ms <= 0) return; - schedule_delayed_work(&hw_failure_emergency_poweroff_work, - msecs_to_jiffies(poweroff_delay_ms)); + hw_failure_emergency_action = action; + schedule_delayed_work(&hw_failure_emergency_action_work, + msecs_to_jiffies(action_delay_ms)); } /** - * __hw_protection_shutdown - Trigger an emergency system shutdown or reboot + * __hw_protection_trigger - Trigger an emergency system shutdown or reboot * * @reason: Reason of emergency shutdown or reboot to be printed. * @ms_until_forced: Time to wait for orderly shutdown or reboot before * triggering it. Negative value disables the forced * shutdown or reboot. - * @shutdown: If true, indicates that a shutdown will happen - * after the critical tempeature is reached. - * If false, indicates that a reboot will happen - * after the critical tempeature is reached. + * @action: The hardware protection action to be taken. * * Initiate an emergency system shutdown or reboot in order to protect * hardware from further damage. Usage examples include a thermal protection. @@ -995,11 +1025,16 @@ static void hw_failure_emergency_poweroff(int poweroff_delay_ms) * pending even if the previous request has given a large timeout for forced * shutdown/reboot. */ -void __hw_protection_shutdown(const char *reason, int ms_until_forced, bool shutdown) +void __hw_protection_trigger(const char *reason, int ms_until_forced, + enum hw_protection_action action) { static atomic_t allow_proceed = ATOMIC_INIT(1); - pr_emerg("HARDWARE PROTECTION shutdown (%s)\n", reason); + if (action == HWPROT_ACT_DEFAULT) + action = hw_protection_action; + + pr_emerg("HARDWARE PROTECTION %s (%s)\n", + hw_protection_action_str(action), reason); /* Shutdown should be initiated only once. 
*/ if (!atomic_dec_and_test(&allow_proceed)) @@ -1009,13 +1044,55 @@ void __hw_protection_shutdown(const char *reason, int ms_until_forced, bool shut * Queue a backup emergency shutdown in the event of * orderly_poweroff failure */ - hw_failure_emergency_poweroff(ms_until_forced); - if (shutdown) + hw_failure_emergency_schedule(action, ms_until_forced); + if (action == HWPROT_ACT_REBOOT) + orderly_reboot(); + else orderly_poweroff(true); +} +EXPORT_SYMBOL_GPL(__hw_protection_trigger); + +static bool hw_protection_action_parse(const char *str, + enum hw_protection_action *action) +{ + if (sysfs_streq(str, "shutdown")) + *action = HWPROT_ACT_SHUTDOWN; + else if (sysfs_streq(str, "reboot")) + *action = HWPROT_ACT_REBOOT; else - orderly_reboot(); + return false; + + return true; +} + +static int __init hw_protection_setup(char *str) +{ + hw_protection_action_parse(str, &hw_protection_action); + return 1; +} +__setup("hw_protection=", hw_protection_setup); + +#ifdef CONFIG_SYSFS +static ssize_t hw_protection_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%s\n", + hw_protection_action_str(hw_protection_action)); +} +static ssize_t hw_protection_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, + size_t count) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!hw_protection_action_parse(buf, &hw_protection_action)) + return -EINVAL; + + return count; } -EXPORT_SYMBOL_GPL(__hw_protection_shutdown); +static struct kobj_attribute hw_protection_attr = __ATTR_RW(hw_protection); +#endif static int __init reboot_setup(char *str) { @@ -1276,6 +1353,7 @@ static struct kobj_attribute reboot_cpu_attr = __ATTR_RW(cpu); #endif static struct attribute *reboot_attrs[] = { + &hw_protection_attr.attr, &reboot_mode_attr.attr, #ifdef CONFIG_X86 &reboot_force_attr.attr, diff --git a/kernel/relay.c b/kernel/relay.c index a8ae436dc77e..5ac7e711e4b6 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -351,10 
+351,9 @@ static struct dentry *relay_create_buf_file(struct rchan *chan, struct dentry *dentry; char *tmpname; - tmpname = kzalloc(NAME_MAX + 1, GFP_KERNEL); + tmpname = kasprintf(GFP_KERNEL, "%s%d", chan->base_filename, cpu); if (!tmpname) return NULL; - snprintf(tmpname, NAME_MAX, "%s%d", chan->base_filename, cpu); /* Create file in fs */ dentry = chan->cb->create_buf_file(tmpname, chan->parent, diff --git a/kernel/resource.c b/kernel/resource.c index 12004452d999..8d3e6ed0bdc1 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -561,8 +561,7 @@ static int __region_intersects(struct resource *parent, resource_size_t start, struct resource res, o; bool covered; - res.start = start; - res.end = start + size - 1; + res = DEFINE_RES(start, size, 0); for (p = parent->child; p ; p = p->sibling) { if (!resource_intersection(p, &res, &o)) @@ -1714,18 +1713,13 @@ static int __init reserve_setup(char *str) * I/O port space; otherwise assume it's memory. */ if (io_start < 0x10000) { - res->flags = IORESOURCE_IO; + *res = DEFINE_RES_IO_NAMED(io_start, io_num, "reserved"); parent = &ioport_resource; } else { - res->flags = IORESOURCE_MEM; + *res = DEFINE_RES_MEM_NAMED(io_start, io_num, "reserved"); parent = &iomem_resource; } - res->name = "reserved"; - res->start = io_start; - res->end = io_start + io_num - 1; res->flags |= IORESOURCE_BUSY; - res->desc = IORES_DESC_NONE; - res->child = NULL; if (request_resource(parent, res) == 0) reserved = x+1; } @@ -1975,11 +1969,7 @@ get_free_mem_region(struct device *dev, struct resource *base, */ revoke_iomem(res); } else { - res->start = addr; - res->end = addr + size - 1; - res->name = name; - res->desc = desc; - res->flags = IORESOURCE_MEM; + *res = DEFINE_RES_NAMED_DESC(addr, size, name, IORESOURCE_MEM, desc); /* * Only succeed if the resource hosts an exclusive diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5bd8d7e7347d..62b3416f5e43 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -66,6 +66,7 @@ 
#include <linux/vtime.h> #include <linux/wait_api.h> #include <linux/workqueue_api.h> +#include <linux/livepatch_sched.h> #ifdef CONFIG_PREEMPT_DYNAMIC # ifdef CONFIG_GENERIC_ENTRY @@ -488,6 +489,16 @@ sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { } #endif /* CONFIG_SCHED_CORE */ +/* need a wrapper since we may need to trace from modules */ +EXPORT_TRACEPOINT_SYMBOL(sched_set_state_tp); + +/* Call via the helper macro trace_set_current_state. */ +void __trace_set_current_state(int state_value) +{ + trace_sched_set_state_tp(current, state_value); +} +EXPORT_SYMBOL(__trace_set_current_state); + /* * Serialization rules: * @@ -912,8 +923,7 @@ static void hrtick_rq_init(struct rq *rq) #ifdef CONFIG_SMP INIT_CSD(&rq->hrtick_csd, __hrtick_start, rq); #endif - hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); - rq->hrtick_timer.function = hrtick; + hrtimer_setup(&rq->hrtick_timer, hrtick, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); } #else /* CONFIG_SCHED_HRTICK */ static inline void hrtick_clear(struct rq *rq) @@ -1743,7 +1753,7 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p, } } -static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) +static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p, int flags) { enum uclamp_id clamp_id; @@ -1759,7 +1769,8 @@ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) if (unlikely(!p->sched_class->uclamp_enabled)) return; - if (p->se.sched_delayed) + /* Only inc the delayed task which being woken up. 
*/ + if (p->se.sched_delayed && !(flags & ENQUEUE_DELAYED)) return; for_each_clamp_id(clamp_id) @@ -2027,7 +2038,7 @@ static void __init init_uclamp(void) } #else /* !CONFIG_UCLAMP_TASK */ -static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { } +static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p, int flags) { } static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { } static inline void uclamp_fork(struct task_struct *p) { } static inline void uclamp_post_fork(struct task_struct *p) { } @@ -2063,12 +2074,14 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags) if (!(flags & ENQUEUE_NOCLOCK)) update_rq_clock(rq); - p->sched_class->enqueue_task(rq, p, flags); /* - * Must be after ->enqueue_task() because ENQUEUE_DELAYED can clear - * ->sched_delayed. + * Can be before ->enqueue_task() because uclamp considers the + * ENQUEUE_DELAYED task before its ->sched_delayed gets cleared + * in ->enqueue_task(). */ - uclamp_rq_inc(rq, p); + uclamp_rq_inc(rq, p, flags); + + p->sched_class->enqueue_task(rq, p, flags); psi_enqueue(p, flags); @@ -2274,6 +2287,12 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state * just go back and repeat. */ rq = task_rq_lock(p, &rf); + /* + * If task is sched_delayed, force dequeue it, to avoid always + * hitting the tick timeout in the queued case + */ + if (p->se.sched_delayed) + dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED); trace_sched_wait_task(p); running = task_on_cpu(rq, p); queued = task_on_rq_queued(p); @@ -5296,6 +5315,12 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev) */ finish_task_switch(prev); + /* + * This is a special case: the newly created task has just + * switched the context for the first time. It is returning from + * schedule for the first time in this path. 
+ */ + trace_sched_exit_tp(true, CALLER_ADDR0); preempt_enable(); if (current->set_child_tid) @@ -6556,12 +6581,14 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) * Otherwise marks the task's __state as RUNNING */ static bool try_to_block_task(struct rq *rq, struct task_struct *p, - unsigned long task_state) + unsigned long *task_state_p) { + unsigned long task_state = *task_state_p; int flags = DEQUEUE_NOCLOCK; if (signal_pending_state(task_state, p)) { WRITE_ONCE(p->__state, TASK_RUNNING); + *task_state_p = TASK_RUNNING; return false; } @@ -6635,12 +6662,15 @@ static void __sched notrace __schedule(int sched_mode) * as a preemption by schedule_debug() and RCU. */ bool preempt = sched_mode > SM_NONE; + bool is_switch = false; unsigned long *switch_count; unsigned long prev_state; struct rq_flags rf; struct rq *rq; int cpu; + trace_sched_entry_tp(preempt, CALLER_ADDR0); + cpu = smp_processor_id(); rq = cpu_rq(cpu); prev = rq->curr; @@ -6650,6 +6680,8 @@ static void __sched notrace __schedule(int sched_mode) if (sched_feat(HRTICK) || sched_feat(HRTICK_DL)) hrtick_clear(rq); + klp_sched_try_switch(prev); + local_irq_disable(); rcu_note_context_switch(preempt); @@ -6695,7 +6727,7 @@ static void __sched notrace __schedule(int sched_mode) goto picked; } } else if (!preempt && prev_state) { - try_to_block_task(rq, prev, prev_state); + try_to_block_task(rq, prev, &prev_state); switch_count = &prev->nvcsw; } @@ -6706,7 +6738,8 @@ picked: clear_preempt_need_resched(); rq->last_seen_need_resched_ns = 0; - if (likely(prev != next)) { + is_switch = prev != next; + if (likely(is_switch)) { rq->nr_switches++; /* * RCU users of rcu_dereference(rq->curr) may not see @@ -6751,6 +6784,7 @@ picked: __balance_callbacks(rq); raw_spin_rq_unlock_irq(rq); } + trace_sched_exit_tp(is_switch, CALLER_ADDR0); } void __noreturn do_task_dead(void) @@ -7308,7 +7342,6 @@ EXPORT_STATIC_CALL_TRAMP(might_resched); static 
DEFINE_STATIC_KEY_FALSE(sk_dynamic_cond_resched); int __sched dynamic_cond_resched(void) { - klp_sched_try_switch(); if (!static_branch_unlikely(&sk_dynamic_cond_resched)) return 0; return __cond_resched(); @@ -7480,7 +7513,6 @@ int sched_dynamic_mode(const char *str) #endif static DEFINE_MUTEX(sched_dynamic_mutex); -static bool klp_override; static void __sched_dynamic_update(int mode) { @@ -7488,8 +7520,7 @@ static void __sched_dynamic_update(int mode) * Avoid {NONE,VOLUNTARY} -> FULL transitions from ever ending up in * the ZERO state, which is invalid. */ - if (!klp_override) - preempt_dynamic_enable(cond_resched); + preempt_dynamic_enable(cond_resched); preempt_dynamic_enable(might_resched); preempt_dynamic_enable(preempt_schedule); preempt_dynamic_enable(preempt_schedule_notrace); @@ -7498,8 +7529,7 @@ static void __sched_dynamic_update(int mode) switch (mode) { case preempt_dynamic_none: - if (!klp_override) - preempt_dynamic_enable(cond_resched); + preempt_dynamic_enable(cond_resched); preempt_dynamic_disable(might_resched); preempt_dynamic_disable(preempt_schedule); preempt_dynamic_disable(preempt_schedule_notrace); @@ -7510,8 +7540,7 @@ static void __sched_dynamic_update(int mode) break; case preempt_dynamic_voluntary: - if (!klp_override) - preempt_dynamic_enable(cond_resched); + preempt_dynamic_enable(cond_resched); preempt_dynamic_enable(might_resched); preempt_dynamic_disable(preempt_schedule); preempt_dynamic_disable(preempt_schedule_notrace); @@ -7522,8 +7551,7 @@ static void __sched_dynamic_update(int mode) break; case preempt_dynamic_full: - if (!klp_override) - preempt_dynamic_disable(cond_resched); + preempt_dynamic_disable(cond_resched); preempt_dynamic_disable(might_resched); preempt_dynamic_enable(preempt_schedule); preempt_dynamic_enable(preempt_schedule_notrace); @@ -7534,8 +7562,7 @@ static void __sched_dynamic_update(int mode) break; case preempt_dynamic_lazy: - if (!klp_override) - preempt_dynamic_disable(cond_resched); + 
preempt_dynamic_disable(cond_resched); preempt_dynamic_disable(might_resched); preempt_dynamic_enable(preempt_schedule); preempt_dynamic_enable(preempt_schedule_notrace); @@ -7556,36 +7583,6 @@ void sched_dynamic_update(int mode) mutex_unlock(&sched_dynamic_mutex); } -#ifdef CONFIG_HAVE_PREEMPT_DYNAMIC_CALL - -static int klp_cond_resched(void) -{ - __klp_sched_try_switch(); - return __cond_resched(); -} - -void sched_dynamic_klp_enable(void) -{ - mutex_lock(&sched_dynamic_mutex); - - klp_override = true; - static_call_update(cond_resched, klp_cond_resched); - - mutex_unlock(&sched_dynamic_mutex); -} - -void sched_dynamic_klp_disable(void) -{ - mutex_lock(&sched_dynamic_mutex); - - klp_override = false; - __sched_dynamic_update(preempt_dynamic_mode); - - mutex_unlock(&sched_dynamic_mutex); -} - -#endif /* CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */ - static int __init setup_preempt_mode(char *str) { int mode = sched_dynamic_mode(str); @@ -8998,7 +8995,7 @@ void sched_online_group(struct task_group *tg, struct task_group *parent) unsigned long flags; spin_lock_irqsave(&task_group_lock, flags); - list_add_rcu(&tg->list, &task_groups); + list_add_tail_rcu(&tg->list, &task_groups); /* Root should already exist: */ WARN_ON(!parent); @@ -9184,11 +9181,15 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) struct task_struct *task; struct cgroup_subsys_state *css; + if (!rt_group_sched_enabled()) + goto scx_check; + cgroup_taskset_for_each(task, css, tset) { if (!sched_rt_can_attach(css_tg(css), task)) return -EINVAL; } -#endif +scx_check: +#endif /* CONFIG_RT_GROUP_SCHED */ return scx_cgroup_can_attach(tset); } @@ -9841,18 +9842,6 @@ static struct cftype cpu_legacy_files[] = { .seq_show = cpu_cfs_local_stat_show, }, #endif -#ifdef CONFIG_RT_GROUP_SCHED - { - .name = "rt_runtime_us", - .read_s64 = cpu_rt_runtime_read, - .write_s64 = cpu_rt_runtime_write, - }, - { - .name = "rt_period_us", - .read_u64 = cpu_rt_period_read_uint, - .write_u64 = cpu_rt_period_write_uint, 
- }, -#endif #ifdef CONFIG_UCLAMP_TASK_GROUP { .name = "uclamp.min", @@ -9870,6 +9859,55 @@ static struct cftype cpu_legacy_files[] = { { } /* Terminate */ }; +#ifdef CONFIG_RT_GROUP_SCHED +static struct cftype rt_group_files[] = { + { + .name = "rt_runtime_us", + .read_s64 = cpu_rt_runtime_read, + .write_s64 = cpu_rt_runtime_write, + }, + { + .name = "rt_period_us", + .read_u64 = cpu_rt_period_read_uint, + .write_u64 = cpu_rt_period_write_uint, + }, + { } /* Terminate */ +}; + +# ifdef CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED +DEFINE_STATIC_KEY_FALSE(rt_group_sched); +# else +DEFINE_STATIC_KEY_TRUE(rt_group_sched); +# endif + +static int __init setup_rt_group_sched(char *str) +{ + long val; + + if (kstrtol(str, 0, &val) || val < 0 || val > 1) { + pr_warn("Unable to set rt_group_sched\n"); + return 1; + } + if (val) + static_branch_enable(&rt_group_sched); + else + static_branch_disable(&rt_group_sched); + + return 1; +} +__setup("rt_group_sched=", setup_rt_group_sched); + +static int __init cpu_rt_group_init(void) +{ + if (!rt_group_sched_enabled()) + return 0; + + WARN_ON(cgroup_add_legacy_cftypes(&cpu_cgrp_subsys, rt_group_files)); + return 0; +} +subsys_initcall(cpu_rt_group_init); +#endif /* CONFIG_RT_GROUP_SCHED */ + static int cpu_extra_stat_show(struct seq_file *sf, struct cgroup_subsys_state *css) { @@ -10683,7 +10721,6 @@ void sched_mm_cid_after_execve(struct task_struct *t) smp_mb(); t->last_mm_cid = t->mm_cid = mm_cid_get(rq, t, mm); } - rseq_set_notify_resume(t); } void sched_mm_cid_fork(struct task_struct *t) diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 1a19d69b91ed..461242ec958a 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -81,9 +81,23 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) if (!cpufreq_this_cpu_can_update(sg_policy->policy)) return false; - if (unlikely(sg_policy->limits_changed)) { - sg_policy->limits_changed = false; - 
sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS); + if (unlikely(READ_ONCE(sg_policy->limits_changed))) { + WRITE_ONCE(sg_policy->limits_changed, false); + sg_policy->need_freq_update = true; + + /* + * The above limits_changed update must occur before the reads + * of policy limits in cpufreq_driver_resolve_freq() or a policy + * limits update might be missed, so use a memory barrier to + * ensure it. + * + * This pairs with the write memory barrier in sugov_limits(). + */ + smp_mb(); + + return true; + } else if (sg_policy->need_freq_update) { + /* ignore_dl_rate_limit() wants a new frequency to be found. */ return true; } @@ -95,10 +109,22 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time, unsigned int next_freq) { - if (sg_policy->need_freq_update) + if (sg_policy->need_freq_update) { sg_policy->need_freq_update = false; - else if (sg_policy->next_freq == next_freq) + /* + * The policy limits have changed, but if the return value of + * cpufreq_driver_resolve_freq() after applying the new limits + * is still equal to the previously selected frequency, the + * driver callback need not be invoked unless the driver + * specifically wants that to happen on every update of the + * policy limits. 
+ */ + if (sg_policy->next_freq == next_freq && + !cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS)) + return false; + } else if (sg_policy->next_freq == next_freq) { return false; + } sg_policy->next_freq = next_freq; sg_policy->last_freq_update_time = time; @@ -365,7 +391,7 @@ static inline bool sugov_hold_freq(struct sugov_cpu *sg_cpu) { return false; } static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu) { if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_min) - sg_cpu->sg_policy->limits_changed = true; + sg_cpu->sg_policy->need_freq_update = true; } static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu, @@ -604,7 +630,7 @@ static const struct kobj_type sugov_tunables_ktype = { /********************** cpufreq governor interface *********************/ -struct cpufreq_governor schedutil_gov; +static struct cpufreq_governor schedutil_gov; static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy) { @@ -871,10 +897,19 @@ static void sugov_limits(struct cpufreq_policy *policy) mutex_unlock(&sg_policy->work_lock); } - sg_policy->limits_changed = true; + /* + * The limits_changed update below must take place before the updates + * of policy limits in cpufreq_set_policy() or a policy limits update + * might be missed, so use a memory barrier to ensure it. + * + * This pairs with the memory barrier in sugov_should_update_freq(). 
+ */ + smp_wmb(); + + WRITE_ONCE(sg_policy->limits_changed, true); } -struct cpufreq_governor schedutil_gov = { +static struct cpufreq_governor schedutil_gov = { .name = "schedutil", .owner = THIS_MODULE, .flags = CPUFREQ_GOV_DYNAMIC_SWITCHING, @@ -892,4 +927,9 @@ struct cpufreq_governor *cpufreq_default_governor(void) } #endif +bool sugov_is_governor(struct cpufreq_policy *policy) +{ + return policy->governor == &schedutil_gov; +} + cpufreq_governor_init(schedutil_gov); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 03a33b597768..ad45a8fea245 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1382,8 +1382,7 @@ static void init_dl_task_timer(struct sched_dl_entity *dl_se) { struct hrtimer *timer = &dl_se->dl_timer; - hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); - timer->function = dl_task_timer; + hrtimer_setup(timer, dl_task_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); } /* @@ -1839,8 +1838,7 @@ static void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se) { struct hrtimer *timer = &dl_se->inactive_timer; - hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); - timer->function = inactive_task_timer; + hrtimer_setup(timer, inactive_task_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); } #define __node_2_dle(node) \ diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 56ae54e0ce6a..557246880a7e 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -588,6 +588,10 @@ static void register_sd(struct sched_domain *sd, struct dentry *parent) debugfs_create_file("flags", 0444, parent, &sd->flags, &sd_flags_fops); debugfs_create_file("groups_flags", 0444, parent, &sd->groups->flags, &sd_flags_fops); debugfs_create_u32("level", 0444, parent, (u32 *)&sd->level); + + if (sd->flags & SD_ASYM_PACKING) + debugfs_create_u32("group_asym_prefer_cpu", 0444, parent, + (u32 *)&sd->groups->asym_prefer_cpu); } void update_sched_domain_debugfs(void) diff --git a/kernel/sched/ext.c 
b/kernel/sched/ext.c index 21575d39c376..2c41c78be61e 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -26,7 +26,7 @@ enum scx_consts { * Iterating all tasks may take a while. Periodically drop * scx_tasks_lock to avoid causing e.g. CSD and RCU stalls. */ - SCX_OPS_TASK_ITER_BATCH = 32, + SCX_TASK_ITER_BATCH = 32, }; enum scx_exit_kind { @@ -44,9 +44,9 @@ enum scx_exit_kind { }; /* - * An exit code can be specified when exiting with scx_bpf_exit() or - * scx_ops_exit(), corresponding to exit_kind UNREG_BPF and UNREG_KERN - * respectively. The codes are 64bit of the format: + * An exit code can be specified when exiting with scx_bpf_exit() or scx_exit(), + * corresponding to exit_kind UNREG_BPF and UNREG_KERN respectively. The codes + * are 64bit of the format: * * Bits: [63 .. 48 47 .. 32 31 .. 0] * [ SYS ACT ] [ SYS RSN ] [ USR ] @@ -163,16 +163,21 @@ enum scx_ops_flags { /* * CPU cgroup support flags */ - SCX_OPS_HAS_CGROUP_WEIGHT = 1LLU << 16, /* cpu.weight */ + SCX_OPS_HAS_CGROUP_WEIGHT = 1LLU << 16, /* DEPRECATED, will be removed on 6.18 */ - SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE | - SCX_OPS_ENQ_LAST | - SCX_OPS_ENQ_EXITING | - SCX_OPS_ENQ_MIGRATION_DISABLED | - SCX_OPS_ALLOW_QUEUED_WAKEUP | - SCX_OPS_SWITCH_PARTIAL | - SCX_OPS_BUILTIN_IDLE_PER_NODE | - SCX_OPS_HAS_CGROUP_WEIGHT, + SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE | + SCX_OPS_ENQ_LAST | + SCX_OPS_ENQ_EXITING | + SCX_OPS_ENQ_MIGRATION_DISABLED | + SCX_OPS_ALLOW_QUEUED_WAKEUP | + SCX_OPS_SWITCH_PARTIAL | + SCX_OPS_BUILTIN_IDLE_PER_NODE | + SCX_OPS_HAS_CGROUP_WEIGHT, + + /* high 8 bits are internal, don't include in SCX_OPS_ALL_FLAGS */ + __SCX_OPS_INTERNAL_MASK = 0xffLLU << 56, + + SCX_OPS_HAS_CPU_PREEMPT = 1LLU << 56, }; /* argument container for ops.init_task() */ @@ -368,6 +373,15 @@ struct sched_ext_ops { * @running: A task is starting to run on its associated CPU * @p: task starting to run * + * Note that this callback may be called from a CPU other than the + * one the task 
is going to run on. This can happen when a task + * property is changed (i.e., affinity), since scx_next_task_scx(), + * which triggers this callback, may run on a CPU different from + * the task's assigned CPU. + * + * Therefore, always use scx_bpf_task_cpu(@p) to determine the + * target CPU the task is going to use. + * * See ->runnable() for explanation on the task state notifiers. */ void (*running)(struct task_struct *p); @@ -377,6 +391,15 @@ struct sched_ext_ops { * @p: task stopping to run * @runnable: is task @p still runnable? * + * Note that this callback may be called from a CPU other than the + * one the task was running on. This can happen when a task + * property is changed (i.e., affinity), since dequeue_task_scx(), + * which triggers this callback, may run on a CPU different from + * the task's assigned CPU. + * + * Therefore, always use scx_bpf_task_cpu(@p) to retrieve the CPU + * the task was running on. + * * See ->runnable() for explanation on the task state notifiers. If * !@runnable, ->quiescent() will be invoked after this operation * returns. @@ -465,6 +488,7 @@ struct sched_ext_ops { * idle CPU tracking and the following helpers become unavailable: * * - scx_bpf_select_cpu_dfl() + * - scx_bpf_select_cpu_and() * - scx_bpf_test_and_clear_cpu_idle() * - scx_bpf_pick_idle_cpu() * @@ -728,6 +752,9 @@ struct sched_ext_ops { * BPF scheduler is enabled. */ char name[SCX_OPS_NAME_LEN]; + + /* internal use only, must be NULL */ + void *priv; }; enum scx_opi { @@ -739,6 +766,98 @@ enum scx_opi { SCX_OPI_END = SCX_OP_IDX(init), }; +/* + * Collection of event counters. Event types are placed in descending order. + */ +struct scx_event_stats { + /* + * If ops.select_cpu() returns a CPU which can't be used by the task, + * the core scheduler code silently picks a fallback CPU. + */ + s64 SCX_EV_SELECT_CPU_FALLBACK; + + /* + * When dispatching to a local DSQ, the CPU may have gone offline in + * the meantime. 
In this case, the task is bounced to the global DSQ. + */ + s64 SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE; + + /* + * If SCX_OPS_ENQ_LAST is not set, the number of times that a task + * continued to run because there were no other tasks on the CPU. + */ + s64 SCX_EV_DISPATCH_KEEP_LAST; + + /* + * If SCX_OPS_ENQ_EXITING is not set, the number of times that a task + * is dispatched to a local DSQ when exiting. + */ + s64 SCX_EV_ENQ_SKIP_EXITING; + + /* + * If SCX_OPS_ENQ_MIGRATION_DISABLED is not set, the number of times a + * migration disabled task skips ops.enqueue() and is dispatched to its + * local DSQ. + */ + s64 SCX_EV_ENQ_SKIP_MIGRATION_DISABLED; + + /* + * Total number of times a task's time slice was refilled with the + * default value (SCX_SLICE_DFL). + */ + s64 SCX_EV_REFILL_SLICE_DFL; + + /* + * The total duration of bypass modes in nanoseconds. + */ + s64 SCX_EV_BYPASS_DURATION; + + /* + * The number of tasks dispatched in the bypassing mode. + */ + s64 SCX_EV_BYPASS_DISPATCH; + + /* + * The number of times the bypassing mode has been activated. + */ + s64 SCX_EV_BYPASS_ACTIVATE; +}; + +struct scx_sched { + struct sched_ext_ops ops; + DECLARE_BITMAP(has_op, SCX_OPI_END); + + /* + * Dispatch queues. + * + * The global DSQ (%SCX_DSQ_GLOBAL) is split per-node for scalability. + * This is to avoid live-locking in bypass mode where all tasks are + * dispatched to %SCX_DSQ_GLOBAL and all CPUs consume from it. If + * per-node split isn't sufficient, it can be further split. + */ + struct rhashtable dsq_hash; + struct scx_dispatch_q **global_dsqs; + + /* + * The event counters are in a per-CPU variable to minimize the + * accounting overhead. A system-wide view on the event counter is + * constructed when requested by scx_bpf_events(). 
+ */ + struct scx_event_stats __percpu *event_stats_cpu; + + bool warned_zero_slice; + + atomic_t exit_kind; + struct scx_exit_info *exit_info; + + struct kobject kobj; + + struct kthread_worker *helper; + struct irq_work error_irq_work; + struct kthread_work disable_work; + struct rcu_work rcu_work; +}; + enum scx_wake_flags { /* expose select WF_* flags as enums */ SCX_WAKE_FORK = WF_FORK, @@ -838,18 +957,18 @@ enum scx_tg_flags { SCX_TG_INITED = 1U << 1, }; -enum scx_ops_enable_state { - SCX_OPS_ENABLING, - SCX_OPS_ENABLED, - SCX_OPS_DISABLING, - SCX_OPS_DISABLED, +enum scx_enable_state { + SCX_ENABLING, + SCX_ENABLED, + SCX_DISABLING, + SCX_DISABLED, }; -static const char *scx_ops_enable_state_str[] = { - [SCX_OPS_ENABLING] = "enabling", - [SCX_OPS_ENABLED] = "enabled", - [SCX_OPS_DISABLING] = "disabling", - [SCX_OPS_DISABLED] = "disabled", +static const char *scx_enable_state_str[] = { + [SCX_ENABLING] = "enabling", + [SCX_ENABLED] = "enabled", + [SCX_DISABLING] = "disabling", + [SCX_DISABLED] = "disabled", }; /* @@ -898,6 +1017,16 @@ enum scx_ops_state { #define SCX_OPSS_QSEQ_MASK (~SCX_OPSS_STATE_MASK) /* + * NOTE: sched_ext is in the process of growing multiple scheduler support and + * scx_root usage is in a transitional state. Naked dereferences are safe if the + * caller is one of the tasks attached to SCX and explicit RCU dereference is + * necessary otherwise. Naked scx_root dereferences trigger sparse warnings but + * are used as temporary markers to indicate that the dereferences need to be + * updated to point to the associated scheduler instances rather than scx_root. + */ +static struct scx_sched __rcu *scx_root; + +/* * During exit, a task may schedule after losing its PIDs. When disabling the * BPF scheduler, we need to be able to iterate tasks in every state to * guarantee system safety. 
Maintain a dedicated task list which contains every @@ -907,33 +1036,17 @@ static DEFINE_SPINLOCK(scx_tasks_lock); static LIST_HEAD(scx_tasks); /* ops enable/disable */ -static struct kthread_worker *scx_ops_helper; -static DEFINE_MUTEX(scx_ops_enable_mutex); -DEFINE_STATIC_KEY_FALSE(__scx_ops_enabled); +static DEFINE_MUTEX(scx_enable_mutex); +DEFINE_STATIC_KEY_FALSE(__scx_enabled); DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem); -static atomic_t scx_ops_enable_state_var = ATOMIC_INIT(SCX_OPS_DISABLED); +static atomic_t scx_enable_state_var = ATOMIC_INIT(SCX_DISABLED); static unsigned long scx_in_softlockup; -static atomic_t scx_ops_breather_depth = ATOMIC_INIT(0); -static int scx_ops_bypass_depth; -static bool scx_ops_init_task_enabled; +static atomic_t scx_breather_depth = ATOMIC_INIT(0); +static int scx_bypass_depth; +static bool scx_init_task_enabled; static bool scx_switching_all; DEFINE_STATIC_KEY_FALSE(__scx_switched_all); -static struct sched_ext_ops scx_ops; -static bool scx_warned_zero_slice; - -DEFINE_STATIC_KEY_FALSE(scx_ops_allow_queued_wakeup); -static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_last); -static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_exiting); -static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_migration_disabled); -static DEFINE_STATIC_KEY_FALSE(scx_ops_cpu_preempt); - -static struct static_key_false scx_has_op[SCX_OPI_END] = - { [0 ... SCX_OPI_END-1] = STATIC_KEY_FALSE_INIT }; - -static atomic_t scx_exit_kind = ATOMIC_INIT(SCX_EXIT_DONE); -static struct scx_exit_info *scx_exit_info; - static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0); static atomic_long_t scx_hotplug_seq = ATOMIC_LONG_INIT(0); @@ -947,7 +1060,7 @@ static atomic_long_t scx_enable_seq = ATOMIC_LONG_INIT(0); /* * The maximum amount of time in jiffies that a task may be runnable without * being scheduled on a CPU. If this timeout is exceeded, it will trigger - * scx_ops_error(). + * scx_error(). 
*/ static unsigned long scx_watchdog_timeout; @@ -973,23 +1086,12 @@ static unsigned long __percpu *scx_kick_cpus_pnt_seqs; */ static DEFINE_PER_CPU(struct task_struct *, direct_dispatch_task); -/* - * Dispatch queues. - * - * The global DSQ (%SCX_DSQ_GLOBAL) is split per-node for scalability. This is - * to avoid live-locking in bypass mode where all tasks are dispatched to - * %SCX_DSQ_GLOBAL and all CPUs consume from it. If per-node split isn't - * sufficient, it can be further split. - */ -static struct scx_dispatch_q **global_dsqs; - static const struct rhashtable_params dsq_hash_params = { .key_len = sizeof_field(struct scx_dispatch_q, id), .key_offset = offsetof(struct scx_dispatch_q, id), .head_offset = offsetof(struct scx_dispatch_q, hash_node), }; -static struct rhashtable dsq_hash; static LLIST_HEAD(dsqs_to_free); /* dispatch buf */ @@ -1036,27 +1138,46 @@ static struct scx_dump_data scx_dump_data = { /* /sys/kernel/sched_ext interface */ static struct kset *scx_kset; -static struct kobject *scx_root_kobj; #define CREATE_TRACE_POINTS #include <trace/events/sched_ext.h> static void process_ddsp_deferred_locals(struct rq *rq); static void scx_bpf_kick_cpu(s32 cpu, u64 flags); -static __printf(3, 4) void scx_ops_exit_kind(enum scx_exit_kind kind, - s64 exit_code, - const char *fmt, ...); +static void scx_vexit(struct scx_sched *sch, enum scx_exit_kind kind, + s64 exit_code, const char *fmt, va_list args); -#define scx_ops_error_kind(err, fmt, args...) \ - scx_ops_exit_kind((err), 0, fmt, ##args) +static __printf(4, 5) void scx_exit(struct scx_sched *sch, + enum scx_exit_kind kind, s64 exit_code, + const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + scx_vexit(sch, kind, exit_code, fmt, args); + va_end(args); +} + +static __printf(3, 4) void scx_kf_exit(enum scx_exit_kind kind, s64 exit_code, + const char *fmt, ...) +{ + struct scx_sched *sch; + va_list args; -#define scx_ops_exit(code, fmt, args...) 
\ - scx_ops_exit_kind(SCX_EXIT_UNREG_KERN, (code), fmt, ##args) + rcu_read_lock(); + sch = rcu_dereference(scx_root); + if (sch) { + va_start(args, fmt); + scx_vexit(sch, kind, exit_code, fmt, args); + va_end(args); + } + rcu_read_unlock(); +} -#define scx_ops_error(fmt, args...) \ - scx_ops_error_kind(SCX_EXIT_ERROR, fmt, ##args) +#define scx_error(sch, fmt, args...) scx_exit((sch), SCX_EXIT_ERROR, 0, fmt, ##args) +#define scx_kf_error(fmt, args...) scx_kf_exit(SCX_EXIT_ERROR, 0, fmt, ##args) -#define SCX_HAS_OP(op) static_branch_likely(&scx_has_op[SCX_OP_IDX(op)]) +#define SCX_HAS_OP(sch, op) test_bit(SCX_OP_IDX(op), (sch)->has_op) static long jiffies_delta_msecs(unsigned long at, unsigned long now) { @@ -1086,12 +1207,14 @@ static bool u32_before(u32 a, u32 b) static struct scx_dispatch_q *find_global_dsq(struct task_struct *p) { - return global_dsqs[cpu_to_node(task_cpu(p))]; + struct scx_sched *sch = scx_root; + + return sch->global_dsqs[cpu_to_node(task_cpu(p))]; } -static struct scx_dispatch_q *find_user_dsq(u64 dsq_id) +static struct scx_dispatch_q *find_user_dsq(struct scx_sched *sch, u64 dsq_id) { - return rhashtable_lookup_fast(&dsq_hash, &dsq_id, dsq_hash_params); + return rhashtable_lookup_fast(&sch->dsq_hash, &dsq_id, dsq_hash_params); } /* @@ -1118,27 +1241,61 @@ static void scx_kf_disallow(u32 mask) current->scx.kf_mask &= ~mask; } -#define SCX_CALL_OP(mask, op, args...) \ +/* + * Track the rq currently locked. + * + * This allows kfuncs to safely operate on rq from any scx ops callback, + * knowing which rq is already locked. + */ +static DEFINE_PER_CPU(struct rq *, locked_rq); + +static inline void update_locked_rq(struct rq *rq) +{ + /* + * Check whether @rq is actually locked. This can help expose bugs + * or incorrect assumptions about the context in which a kfunc or + * callback is executed. 
+ */ + if (rq) + lockdep_assert_rq_held(rq); + __this_cpu_write(locked_rq, rq); +} + +/* + * Return the rq currently locked from an scx callback, or NULL if no rq is + * locked. + */ +static inline struct rq *scx_locked_rq(void) +{ + return __this_cpu_read(locked_rq); +} + +#define SCX_CALL_OP(sch, mask, op, rq, args...) \ do { \ + update_locked_rq(rq); \ if (mask) { \ scx_kf_allow(mask); \ - scx_ops.op(args); \ + (sch)->ops.op(args); \ scx_kf_disallow(mask); \ } else { \ - scx_ops.op(args); \ + (sch)->ops.op(args); \ } \ + update_locked_rq(NULL); \ } while (0) -#define SCX_CALL_OP_RET(mask, op, args...) \ +#define SCX_CALL_OP_RET(sch, mask, op, rq, args...) \ ({ \ - __typeof__(scx_ops.op(args)) __ret; \ + __typeof__((sch)->ops.op(args)) __ret; \ + \ + update_locked_rq(rq); \ if (mask) { \ scx_kf_allow(mask); \ - __ret = scx_ops.op(args); \ + __ret = (sch)->ops.op(args); \ scx_kf_disallow(mask); \ } else { \ - __ret = scx_ops.op(args); \ + __ret = (sch)->ops.op(args); \ } \ + update_locked_rq(NULL); \ __ret; \ }) @@ -1153,31 +1310,31 @@ do { \ * scx_kf_allowed_on_arg_tasks() to test whether the invocation is allowed on * the specific task. */ -#define SCX_CALL_OP_TASK(mask, op, task, args...) \ +#define SCX_CALL_OP_TASK(sch, mask, op, rq, task, args...) \ do { \ BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \ current->scx.kf_tasks[0] = task; \ - SCX_CALL_OP(mask, op, task, ##args); \ + SCX_CALL_OP((sch), mask, op, rq, task, ##args); \ current->scx.kf_tasks[0] = NULL; \ } while (0) -#define SCX_CALL_OP_TASK_RET(mask, op, task, args...) \ +#define SCX_CALL_OP_TASK_RET(sch, mask, op, rq, task, args...) 
\ ({ \ - __typeof__(scx_ops.op(task, ##args)) __ret; \ + __typeof__((sch)->ops.op(task, ##args)) __ret; \ BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \ current->scx.kf_tasks[0] = task; \ - __ret = SCX_CALL_OP_RET(mask, op, task, ##args); \ + __ret = SCX_CALL_OP_RET((sch), mask, op, rq, task, ##args); \ current->scx.kf_tasks[0] = NULL; \ __ret; \ }) -#define SCX_CALL_OP_2TASKS_RET(mask, op, task0, task1, args...) \ +#define SCX_CALL_OP_2TASKS_RET(sch, mask, op, rq, task0, task1, args...) \ ({ \ - __typeof__(scx_ops.op(task0, task1, ##args)) __ret; \ + __typeof__((sch)->ops.op(task0, task1, ##args)) __ret; \ BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \ current->scx.kf_tasks[0] = task0; \ current->scx.kf_tasks[1] = task1; \ - __ret = SCX_CALL_OP_RET(mask, op, task0, task1, ##args); \ + __ret = SCX_CALL_OP_RET((sch), mask, op, rq, task0, task1, ##args); \ current->scx.kf_tasks[0] = NULL; \ current->scx.kf_tasks[1] = NULL; \ __ret; \ @@ -1187,8 +1344,8 @@ do { \ static __always_inline bool scx_kf_allowed(u32 mask) { if (unlikely(!(current->scx.kf_mask & mask))) { - scx_ops_error("kfunc with mask 0x%x called from an operation only allowing 0x%x", - mask, current->scx.kf_mask); + scx_kf_error("kfunc with mask 0x%x called from an operation only allowing 0x%x", + mask, current->scx.kf_mask); return false; } @@ -1201,13 +1358,13 @@ static __always_inline bool scx_kf_allowed(u32 mask) */ if (unlikely(highest_bit(mask) == SCX_KF_CPU_RELEASE && (current->scx.kf_mask & higher_bits(SCX_KF_CPU_RELEASE)))) { - scx_ops_error("cpu_release kfunc called from a nested operation"); + scx_kf_error("cpu_release kfunc called from a nested operation"); return false; } if (unlikely(highest_bit(mask) == SCX_KF_DISPATCH && (current->scx.kf_mask & higher_bits(SCX_KF_DISPATCH)))) { - scx_ops_error("dispatch kfunc called from a nested operation"); + scx_kf_error("dispatch kfunc called from a nested operation"); return false; } @@ -1223,18 +1380,13 @@ static __always_inline bool 
scx_kf_allowed_on_arg_tasks(u32 mask, if (unlikely((p != current->scx.kf_tasks[0] && p != current->scx.kf_tasks[1]))) { - scx_ops_error("called on a task not being operated on"); + scx_kf_error("called on a task not being operated on"); return false; } return true; } -static bool scx_kf_allowed_if_unlocked(void) -{ - return !current->scx.kf_mask; -} - /** * nldsq_next_task - Iterate to the next task in a non-local DSQ * @dsq: user dsq being iterated @@ -1402,15 +1554,15 @@ static void scx_task_iter_stop(struct scx_task_iter *iter) * @iter: iterator to walk * * Visit the next task. See scx_task_iter_start() for details. Locks are dropped - * and re-acquired every %SCX_OPS_TASK_ITER_BATCH iterations to avoid causing - * stalls by holding scx_tasks_lock for too long. + * and re-acquired every %SCX_TASK_ITER_BATCH iterations to avoid causing stalls + * by holding scx_tasks_lock for too long. */ static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter) { struct list_head *cursor = &iter->cursor.tasks_node; struct sched_ext_entity *pos; - if (!(++iter->cnt % SCX_OPS_TASK_ITER_BATCH)) { + if (!(++iter->cnt % SCX_TASK_ITER_BATCH)) { scx_task_iter_unlock(iter); cond_resched(); scx_task_iter_relock(iter); @@ -1481,91 +1633,29 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter) return p; } -/* - * Collection of event counters. Event types are placed in descending order. - */ -struct scx_event_stats { - /* - * If ops.select_cpu() returns a CPU which can't be used by the task, - * the core scheduler code silently picks a fallback CPU. - */ - s64 SCX_EV_SELECT_CPU_FALLBACK; - - /* - * When dispatching to a local DSQ, the CPU may have gone offline in - * the meantime. In this case, the task is bounced to the global DSQ. - */ - s64 SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE; - - /* - * If SCX_OPS_ENQ_LAST is not set, the number of times that a task - * continued to run because there were no other tasks on the CPU. 
- */ - s64 SCX_EV_DISPATCH_KEEP_LAST; - - /* - * If SCX_OPS_ENQ_EXITING is not set, the number of times that a task - * is dispatched to a local DSQ when exiting. - */ - s64 SCX_EV_ENQ_SKIP_EXITING; - - /* - * If SCX_OPS_ENQ_MIGRATION_DISABLED is not set, the number of times a - * migration disabled task skips ops.enqueue() and is dispatched to its - * local DSQ. - */ - s64 SCX_EV_ENQ_SKIP_MIGRATION_DISABLED; - - /* - * The total number of tasks enqueued (or pick_task-ed) with a - * default time slice (SCX_SLICE_DFL). - */ - s64 SCX_EV_ENQ_SLICE_DFL; - - /* - * The total duration of bypass modes in nanoseconds. - */ - s64 SCX_EV_BYPASS_DURATION; - - /* - * The number of tasks dispatched in the bypassing mode. - */ - s64 SCX_EV_BYPASS_DISPATCH; - - /* - * The number of times the bypassing mode has been activated. - */ - s64 SCX_EV_BYPASS_ACTIVATE; -}; - -/* - * The event counter is organized by a per-CPU variable to minimize the - * accounting overhead without synchronization. A system-wide view on the - * event counter is constructed when requested by scx_bpf_get_event_stat(). - */ -static DEFINE_PER_CPU(struct scx_event_stats, event_stats_cpu); - /** * scx_add_event - Increase an event counter for 'name' by 'cnt' + * @sch: scx_sched to account events for * @name: an event name defined in struct scx_event_stats * @cnt: the number of the event occured * * This can be used when preemption is not disabled. */ -#define scx_add_event(name, cnt) do { \ - this_cpu_add(event_stats_cpu.name, cnt); \ - trace_sched_ext_event(#name, cnt); \ +#define scx_add_event(sch, name, cnt) do { \ + this_cpu_add((sch)->event_stats_cpu->name, (cnt)); \ + trace_sched_ext_event(#name, (cnt)); \ } while(0) /** * __scx_add_event - Increase an event counter for 'name' by 'cnt' + * @sch: scx_sched to account events for * @name: an event name defined in struct scx_event_stats * @cnt: the number of the event occured * * This should be used only when preemption is disabled. 
*/ -#define __scx_add_event(name, cnt) do { \ - __this_cpu_add(event_stats_cpu.name, cnt); \ +#define __scx_add_event(sch, name, cnt) do { \ + __this_cpu_add((sch)->event_stats_cpu->name, (cnt)); \ trace_sched_ext_event(#name, cnt); \ } while(0) @@ -1590,25 +1680,25 @@ static DEFINE_PER_CPU(struct scx_event_stats, event_stats_cpu); } while (0) -static void scx_bpf_events(struct scx_event_stats *events, size_t events__sz); +static void scx_read_events(struct scx_sched *sch, + struct scx_event_stats *events); -static enum scx_ops_enable_state scx_ops_enable_state(void) +static enum scx_enable_state scx_enable_state(void) { - return atomic_read(&scx_ops_enable_state_var); + return atomic_read(&scx_enable_state_var); } -static enum scx_ops_enable_state -scx_ops_set_enable_state(enum scx_ops_enable_state to) +static enum scx_enable_state scx_set_enable_state(enum scx_enable_state to) { - return atomic_xchg(&scx_ops_enable_state_var, to); + return atomic_xchg(&scx_enable_state_var, to); } -static bool scx_ops_tryset_enable_state(enum scx_ops_enable_state to, - enum scx_ops_enable_state from) +static bool scx_tryset_enable_state(enum scx_enable_state to, + enum scx_enable_state from) { int from_v = from; - return atomic_try_cmpxchg(&scx_ops_enable_state_var, &from_v, to); + return atomic_try_cmpxchg(&scx_enable_state_var, &from_v, to); } static bool scx_rq_bypassing(struct rq *rq) @@ -1633,8 +1723,14 @@ static void wait_ops_state(struct task_struct *p, unsigned long opss) } while (atomic_long_read_acquire(&p->scx.ops_state) == opss); } +static inline bool __cpu_valid(s32 cpu) +{ + return likely(cpu >= 0 && cpu < nr_cpu_ids && cpu_possible(cpu)); +} + /** - * ops_cpu_valid - Verify a cpu number + * ops_cpu_valid - Verify a cpu number, to be used on ops input args + * @sch: scx_sched to abort on error * @cpu: cpu number which came from a BPF ops * @where: extra information reported on error * @@ -1642,35 +1738,52 @@ static void wait_ops_state(struct task_struct *p, unsigned 
long opss) * Verify that it is in range and one of the possible cpus. If invalid, trigger * an ops error. */ -static bool ops_cpu_valid(s32 cpu, const char *where) +static bool ops_cpu_valid(struct scx_sched *sch, s32 cpu, const char *where) +{ + if (__cpu_valid(cpu)) { + return true; + } else { + scx_error(sch, "invalid CPU %d%s%s", cpu, where ? " " : "", where ?: ""); + return false; + } +} + +/** + * kf_cpu_valid - Verify a CPU number, to be used on kfunc input args + * @cpu: cpu number which came from a BPF ops + * @where: extra information reported on error + * + * The same as ops_cpu_valid() but @sch is implicit. + */ +static bool kf_cpu_valid(u32 cpu, const char *where) { - if (likely(cpu >= 0 && cpu < nr_cpu_ids && cpu_possible(cpu))) { + if (__cpu_valid(cpu)) { return true; } else { - scx_ops_error("invalid CPU %d%s%s", cpu, - where ? " " : "", where ?: ""); + scx_kf_error("invalid CPU %d%s%s", cpu, where ? " " : "", where ?: ""); return false; } } /** * ops_sanitize_err - Sanitize a -errno value + * @sch: scx_sched to error out on error * @ops_name: operation to blame on failure * @err: -errno value to sanitize * - * Verify @err is a valid -errno. If not, trigger scx_ops_error() and return + * Verify @err is a valid -errno. If not, trigger scx_error() and return * -%EPROTO. This is necessary because returning a rogue -errno up the chain can * cause misbehaviors. For an example, a large negative return from * ops.init_task() triggers an oops when passed up the call chain because the * value fails IS_ERR() test after being encoded with ERR_PTR() and then is * handled as a pointer. 
*/ -static int ops_sanitize_err(const char *ops_name, s32 err) +static int ops_sanitize_err(struct scx_sched *sch, const char *ops_name, s32 err) { if (err < 0 && err >= -MAX_ERRNO) return err; - scx_ops_error("ops.%s() returned an invalid errno %d", ops_name, err); + scx_error(sch, "ops.%s() returned an invalid errno %d", ops_name, err); return -EPROTO; } @@ -1777,7 +1890,7 @@ static void touch_core_sched_dispatch(struct rq *rq, struct task_struct *p) lockdep_assert_rq_held(rq); #ifdef CONFIG_SCHED_CORE - if (SCX_HAS_OP(core_sched_before)) + if (unlikely(SCX_HAS_OP(scx_root, core_sched_before))) touch_core_sched(rq, p); #endif } @@ -1815,8 +1928,14 @@ static void dsq_mod_nr(struct scx_dispatch_q *dsq, s32 delta) WRITE_ONCE(dsq->nr, dsq->nr + delta); } -static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, - u64 enq_flags) +static void refill_task_slice_dfl(struct task_struct *p) +{ + p->scx.slice = SCX_SLICE_DFL; + __scx_add_event(scx_root, SCX_EV_REFILL_SLICE_DFL, 1); +} + +static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq, + struct task_struct *p, u64 enq_flags) { bool is_local = dsq->id == SCX_DSQ_LOCAL; @@ -1827,7 +1946,7 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, if (!is_local) { raw_spin_lock(&dsq->lock); if (unlikely(dsq->id == SCX_DSQ_INVALID)) { - scx_ops_error("attempting to dispatch to a destroyed dsq"); + scx_error(sch, "attempting to dispatch to a destroyed dsq"); /* fall back to the global dsq */ raw_spin_unlock(&dsq->lock); dsq = find_global_dsq(p); @@ -1844,7 +1963,7 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, * disallow any internal DSQ from doing vtime ordering of * tasks. 
*/ - scx_ops_error("cannot use vtime ordering for built-in DSQs"); + scx_error(sch, "cannot use vtime ordering for built-in DSQs"); enq_flags &= ~SCX_ENQ_DSQ_PRIQ; } @@ -1858,8 +1977,8 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, */ if (unlikely(RB_EMPTY_ROOT(&dsq->priq) && nldsq_next_task(dsq, NULL, false))) - scx_ops_error("DSQ ID 0x%016llx already had FIFO-enqueued tasks", - dsq->id); + scx_error(sch, "DSQ ID 0x%016llx already had FIFO-enqueued tasks", + dsq->id); p->scx.dsq_flags |= SCX_TASK_DSQ_ON_PRIQ; rb_add(&p->scx.dsq_priq, &dsq->priq, scx_dsq_priq_less); @@ -1880,8 +1999,8 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, } else { /* a FIFO DSQ shouldn't be using PRIQ enqueuing */ if (unlikely(!RB_EMPTY_ROOT(&dsq->priq))) - scx_ops_error("DSQ ID 0x%016llx already had PRIQ-enqueued tasks", - dsq->id); + scx_error(sch, "DSQ ID 0x%016llx already had PRIQ-enqueued tasks", + dsq->id); if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT)) list_add(&p->scx.dsq_list.node, &dsq->list); @@ -1996,7 +2115,8 @@ static void dispatch_dequeue(struct rq *rq, struct task_struct *p) raw_spin_unlock(&dsq->lock); } -static struct scx_dispatch_q *find_dsq_for_dispatch(struct rq *rq, u64 dsq_id, +static struct scx_dispatch_q *find_dsq_for_dispatch(struct scx_sched *sch, + struct rq *rq, u64 dsq_id, struct task_struct *p) { struct scx_dispatch_q *dsq; @@ -2007,7 +2127,7 @@ static struct scx_dispatch_q *find_dsq_for_dispatch(struct rq *rq, u64 dsq_id, if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) { s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK; - if (!ops_cpu_valid(cpu, "in SCX_DSQ_LOCAL_ON dispatch verdict")) + if (!ops_cpu_valid(sch, cpu, "in SCX_DSQ_LOCAL_ON dispatch verdict")) return find_global_dsq(p); return &cpu_rq(cpu)->scx.local_dsq; @@ -2016,11 +2136,11 @@ static struct scx_dispatch_q *find_dsq_for_dispatch(struct rq *rq, u64 dsq_id, if (dsq_id == SCX_DSQ_GLOBAL) dsq = find_global_dsq(p); else - dsq = 
find_user_dsq(dsq_id); + dsq = find_user_dsq(sch, dsq_id); if (unlikely(!dsq)) { - scx_ops_error("non-existent DSQ 0x%llx for %s[%d]", - dsq_id, p->comm, p->pid); + scx_error(sch, "non-existent DSQ 0x%llx for %s[%d]", + dsq_id, p->comm, p->pid); return find_global_dsq(p); } @@ -2041,12 +2161,12 @@ static void mark_direct_dispatch(struct task_struct *ddsp_task, /* @p must match the task on the enqueue path */ if (unlikely(p != ddsp_task)) { if (IS_ERR(ddsp_task)) - scx_ops_error("%s[%d] already direct-dispatched", - p->comm, p->pid); + scx_kf_error("%s[%d] already direct-dispatched", + p->comm, p->pid); else - scx_ops_error("scheduling for %s[%d] but trying to direct-dispatch %s[%d]", - ddsp_task->comm, ddsp_task->pid, - p->comm, p->pid); + scx_kf_error("scheduling for %s[%d] but trying to direct-dispatch %s[%d]", + ddsp_task->comm, ddsp_task->pid, + p->comm, p->pid); return; } @@ -2057,11 +2177,12 @@ static void mark_direct_dispatch(struct task_struct *ddsp_task, p->scx.ddsp_enq_flags = enq_flags; } -static void direct_dispatch(struct task_struct *p, u64 enq_flags) +static void direct_dispatch(struct scx_sched *sch, struct task_struct *p, + u64 enq_flags) { struct rq *rq = task_rq(p); struct scx_dispatch_q *dsq = - find_dsq_for_dispatch(rq, p->scx.ddsp_dsq_id, p); + find_dsq_for_dispatch(sch, rq, p->scx.ddsp_dsq_id, p); touch_core_sched_dispatch(rq, p); @@ -2102,7 +2223,8 @@ static void direct_dispatch(struct task_struct *p, u64 enq_flags) return; } - dispatch_enqueue(dsq, p, p->scx.ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS); + dispatch_enqueue(sch, dsq, p, + p->scx.ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS); } static bool scx_rq_online(struct rq *rq) @@ -2120,6 +2242,7 @@ static bool scx_rq_online(struct rq *rq) static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, int sticky_cpu) { + struct scx_sched *sch = scx_root; struct task_struct **ddsp_taskp; unsigned long qseq; @@ -2138,7 +2261,7 @@ static void do_enqueue_task(struct rq *rq, struct 
task_struct *p, u64 enq_flags, goto local; if (scx_rq_bypassing(rq)) { - __scx_add_event(SCX_EV_BYPASS_DISPATCH, 1); + __scx_add_event(sch, SCX_EV_BYPASS_DISPATCH, 1); goto global; } @@ -2146,20 +2269,20 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, goto direct; /* see %SCX_OPS_ENQ_EXITING */ - if (!static_branch_unlikely(&scx_ops_enq_exiting) && + if (!(sch->ops.flags & SCX_OPS_ENQ_EXITING) && unlikely(p->flags & PF_EXITING)) { - __scx_add_event(SCX_EV_ENQ_SKIP_EXITING, 1); + __scx_add_event(sch, SCX_EV_ENQ_SKIP_EXITING, 1); goto local; } /* see %SCX_OPS_ENQ_MIGRATION_DISABLED */ - if (!static_branch_unlikely(&scx_ops_enq_migration_disabled) && + if (!(sch->ops.flags & SCX_OPS_ENQ_MIGRATION_DISABLED) && is_migration_disabled(p)) { - __scx_add_event(SCX_EV_ENQ_SKIP_MIGRATION_DISABLED, 1); + __scx_add_event(sch, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED, 1); goto local; } - if (!SCX_HAS_OP(enqueue)) + if (unlikely(!SCX_HAS_OP(sch, enqueue))) goto global; /* DSQ bypass didn't trigger, enqueue on the BPF scheduler */ @@ -2172,7 +2295,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, WARN_ON_ONCE(*ddsp_taskp); *ddsp_taskp = p; - SCX_CALL_OP_TASK(SCX_KF_ENQUEUE, enqueue, p, enq_flags); + SCX_CALL_OP_TASK(sch, SCX_KF_ENQUEUE, enqueue, rq, p, enq_flags); *ddsp_taskp = NULL; if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID) @@ -2186,7 +2309,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, return; direct: - direct_dispatch(p, enq_flags); + direct_dispatch(sch, p, enq_flags); return; local: @@ -2196,17 +2319,15 @@ local: * higher priority it becomes from scx_prio_less()'s POV. 
*/ touch_core_sched(rq, p); - p->scx.slice = SCX_SLICE_DFL; - __scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1); + refill_task_slice_dfl(p); local_norefill: - dispatch_enqueue(&rq->scx.local_dsq, p, enq_flags); + dispatch_enqueue(sch, &rq->scx.local_dsq, p, enq_flags); return; global: touch_core_sched(rq, p); /* see the comment in local: */ - p->scx.slice = SCX_SLICE_DFL; - __scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1); - dispatch_enqueue(find_global_dsq(p), p, enq_flags); + refill_task_slice_dfl(p); + dispatch_enqueue(sch, find_global_dsq(p), p, enq_flags); } static bool task_runnable(const struct task_struct *p) @@ -2224,7 +2345,7 @@ static void set_task_runnable(struct rq *rq, struct task_struct *p) } /* - * list_add_tail() must be used. scx_ops_bypass() depends on tasks being + * list_add_tail() must be used. scx_bypass() depends on tasks being * appended to the runnable_list. */ list_add_tail(&p->scx.runnable_node, &rq->scx.runnable_list); @@ -2239,6 +2360,7 @@ static void clr_task_runnable(struct task_struct *p, bool reset_runnable_at) static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags) { + struct scx_sched *sch = scx_root; int sticky_cpu = p->scx.sticky_cpu; if (enq_flags & ENQUEUE_WAKEUP) @@ -2268,8 +2390,8 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags rq->scx.nr_running++; add_nr_running(rq, 1); - if (SCX_HAS_OP(runnable) && !task_on_rq_migrating(p)) - SCX_CALL_OP_TASK(SCX_KF_REST, runnable, p, enq_flags); + if (SCX_HAS_OP(sch, runnable) && !task_on_rq_migrating(p)) + SCX_CALL_OP_TASK(sch, SCX_KF_REST, runnable, rq, p, enq_flags); if (enq_flags & SCX_ENQ_WAKEUP) touch_core_sched(rq, p); @@ -2280,11 +2402,12 @@ out: if ((enq_flags & SCX_ENQ_CPU_SELECTED) && unlikely(cpu_of(rq) != p->scx.selected_cpu)) - __scx_add_event(SCX_EV_SELECT_CPU_FALLBACK, 1); + __scx_add_event(sch, SCX_EV_SELECT_CPU_FALLBACK, 1); } -static void ops_dequeue(struct task_struct *p, u64 deq_flags) +static void ops_dequeue(struct rq 
*rq, struct task_struct *p, u64 deq_flags) { + struct scx_sched *sch = scx_root; unsigned long opss; /* dequeue is always temporary, don't reset runnable_at */ @@ -2303,8 +2426,9 @@ static void ops_dequeue(struct task_struct *p, u64 deq_flags) */ BUG(); case SCX_OPSS_QUEUED: - if (SCX_HAS_OP(dequeue)) - SCX_CALL_OP_TASK(SCX_KF_REST, dequeue, p, deq_flags); + if (SCX_HAS_OP(sch, dequeue)) + SCX_CALL_OP_TASK(sch, SCX_KF_REST, dequeue, rq, + p, deq_flags); if (atomic_long_try_cmpxchg(&p->scx.ops_state, &opss, SCX_OPSS_NONE)) @@ -2332,12 +2456,14 @@ static void ops_dequeue(struct task_struct *p, u64 deq_flags) static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags) { + struct scx_sched *sch = scx_root; + if (!(p->scx.flags & SCX_TASK_QUEUED)) { WARN_ON_ONCE(task_runnable(p)); return true; } - ops_dequeue(p, deq_flags); + ops_dequeue(rq, p, deq_flags); /* * A currently running task which is going off @rq first gets dequeued @@ -2351,13 +2477,13 @@ static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags * information meaningful to the BPF scheduler and can be suppressed by * skipping the callbacks if the task is !QUEUED. 
*/ - if (SCX_HAS_OP(stopping) && task_current(rq, p)) { + if (SCX_HAS_OP(sch, stopping) && task_current(rq, p)) { update_curr_scx(rq); - SCX_CALL_OP_TASK(SCX_KF_REST, stopping, p, false); + SCX_CALL_OP_TASK(sch, SCX_KF_REST, stopping, rq, p, false); } - if (SCX_HAS_OP(quiescent) && !task_on_rq_migrating(p)) - SCX_CALL_OP_TASK(SCX_KF_REST, quiescent, p, deq_flags); + if (SCX_HAS_OP(sch, quiescent) && !task_on_rq_migrating(p)) + SCX_CALL_OP_TASK(sch, SCX_KF_REST, quiescent, rq, p, deq_flags); if (deq_flags & SCX_DEQ_SLEEP) p->scx.flags |= SCX_TASK_DEQD_FOR_SLEEP; @@ -2374,20 +2500,23 @@ static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags static void yield_task_scx(struct rq *rq) { + struct scx_sched *sch = scx_root; struct task_struct *p = rq->curr; - if (SCX_HAS_OP(yield)) - SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, yield, p, NULL); + if (SCX_HAS_OP(sch, yield)) + SCX_CALL_OP_2TASKS_RET(sch, SCX_KF_REST, yield, rq, p, NULL); else p->scx.slice = 0; } static bool yield_to_task_scx(struct rq *rq, struct task_struct *to) { + struct scx_sched *sch = scx_root; struct task_struct *from = rq->curr; - if (SCX_HAS_OP(yield)) - return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, yield, from, to); + if (SCX_HAS_OP(sch, yield)) + return SCX_CALL_OP_2TASKS_RET(sch, SCX_KF_REST, yield, rq, + from, to); else return false; } @@ -2467,7 +2596,8 @@ static void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags, * * The caller must ensure that @p and @rq are on different CPUs. 
*/ -static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, +static bool task_can_run_on_remote_rq(struct scx_sched *sch, + struct task_struct *p, struct rq *rq, bool enforce) { int cpu = cpu_of(rq); @@ -2488,8 +2618,8 @@ static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, */ if (unlikely(is_migration_disabled(p))) { if (enforce) - scx_ops_error("SCX_DSQ_LOCAL[_ON] cannot move migration disabled %s[%d] from CPU %d to %d", - p->comm, p->pid, task_cpu(p), cpu); + scx_error(sch, "SCX_DSQ_LOCAL[_ON] cannot move migration disabled %s[%d] from CPU %d to %d", + p->comm, p->pid, task_cpu(p), cpu); return false; } @@ -2501,14 +2631,15 @@ static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, */ if (!task_allowed_on_cpu(p, cpu)) { if (enforce) - scx_ops_error("SCX_DSQ_LOCAL[_ON] target CPU %d not allowed for %s[%d]", - cpu, p->comm, p->pid); + scx_error(sch, "SCX_DSQ_LOCAL[_ON] target CPU %d not allowed for %s[%d]", + cpu, p->comm, p->pid); return false; } if (!scx_rq_online(rq)) { if (enforce) - __scx_add_event(SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE, 1); + __scx_add_event(scx_root, + SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE, 1); return false; } @@ -2580,12 +2711,13 @@ static bool consume_remote_task(struct rq *this_rq, struct task_struct *p, } #else /* CONFIG_SMP */ static inline void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags, struct rq *src_rq, struct rq *dst_rq) { WARN_ON_ONCE(1); } -static inline bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, bool enforce) { return false; } +static inline bool task_can_run_on_remote_rq(struct scx_sched *sch, struct task_struct *p, struct rq *rq, bool enforce) { return false; } static inline bool consume_remote_task(struct rq *this_rq, struct task_struct *p, struct scx_dispatch_q *dsq, struct rq *task_rq) { return false; } #endif /* CONFIG_SMP */ /** * move_task_between_dsqs() - Move a task from one DSQ to another + * @sch: scx_sched being 
operated on * @p: target task * @enq_flags: %SCX_ENQ_* * @src_dsq: DSQ @p is currently on, must not be a local DSQ @@ -2599,7 +2731,8 @@ static inline bool consume_remote_task(struct rq *this_rq, struct task_struct *p * On return, @src_dsq is unlocked and only @p's new task_rq, which is the * return value, is locked. */ -static struct rq *move_task_between_dsqs(struct task_struct *p, u64 enq_flags, +static struct rq *move_task_between_dsqs(struct scx_sched *sch, + struct task_struct *p, u64 enq_flags, struct scx_dispatch_q *src_dsq, struct scx_dispatch_q *dst_dsq) { @@ -2612,7 +2745,7 @@ static struct rq *move_task_between_dsqs(struct task_struct *p, u64 enq_flags, if (dst_dsq->id == SCX_DSQ_LOCAL) { dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq); if (src_rq != dst_rq && - unlikely(!task_can_run_on_remote_rq(p, dst_rq, true))) { + unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) { dst_dsq = find_global_dsq(p); dst_rq = src_rq; } @@ -2646,7 +2779,7 @@ static struct rq *move_task_between_dsqs(struct task_struct *p, u64 enq_flags, p->scx.dsq = NULL; raw_spin_unlock(&src_dsq->lock); - dispatch_enqueue(dst_dsq, p, enq_flags); + dispatch_enqueue(sch, dst_dsq, p, enq_flags); } return dst_rq; @@ -2658,13 +2791,13 @@ static struct rq *move_task_between_dsqs(struct task_struct *p, u64 enq_flags, * to the bypass mode can take a long time. Inject artificial delays while the * bypass mode is switching to guarantee timely completion. 
*/ -static void scx_ops_breather(struct rq *rq) +static void scx_breather(struct rq *rq) { u64 until; lockdep_assert_rq_held(rq); - if (likely(!atomic_read(&scx_ops_breather_depth))) + if (likely(!atomic_read(&scx_breather_depth))) return; raw_spin_rq_unlock(rq); @@ -2673,25 +2806,26 @@ static void scx_ops_breather(struct rq *rq) do { int cnt = 1024; - while (atomic_read(&scx_ops_breather_depth) && --cnt) + while (atomic_read(&scx_breather_depth) && --cnt) cpu_relax(); - } while (atomic_read(&scx_ops_breather_depth) && + } while (atomic_read(&scx_breather_depth) && time_before64(ktime_get_ns(), until)); raw_spin_rq_lock(rq); } -static bool consume_dispatch_q(struct rq *rq, struct scx_dispatch_q *dsq) +static bool consume_dispatch_q(struct scx_sched *sch, struct rq *rq, + struct scx_dispatch_q *dsq) { struct task_struct *p; retry: /* - * This retry loop can repeatedly race against scx_ops_bypass() - * dequeueing tasks from @dsq trying to put the system into the bypass - * mode. On some multi-socket machines (e.g. 2x Intel 8480c), this can - * live-lock the machine into soft lockups. Give a breather. + * This retry loop can repeatedly race against scx_bypass() dequeueing + * tasks from @dsq trying to put the system into the bypass mode. On + * some multi-socket machines (e.g. 2x Intel 8480c), this can live-lock + * the machine into soft lockups. Give a breather. 
*/ - scx_ops_breather(rq); + scx_breather(rq); /* * The caller can't expect to successfully consume a task if the task's @@ -2713,7 +2847,7 @@ retry: return true; } - if (task_can_run_on_remote_rq(p, rq, false)) { + if (task_can_run_on_remote_rq(sch, p, rq, false)) { if (likely(consume_remote_task(rq, p, dsq, task_rq))) return true; goto retry; @@ -2724,15 +2858,16 @@ retry: return false; } -static bool consume_global_dsq(struct rq *rq) +static bool consume_global_dsq(struct scx_sched *sch, struct rq *rq) { int node = cpu_to_node(cpu_of(rq)); - return consume_dispatch_q(rq, global_dsqs[node]); + return consume_dispatch_q(sch, rq, sch->global_dsqs[node]); } /** * dispatch_to_local_dsq - Dispatch a task to a local dsq + * @sch: scx_sched being operated on * @rq: current rq which is locked * @dst_dsq: destination DSQ * @p: task to dispatch @@ -2745,7 +2880,8 @@ static bool consume_global_dsq(struct rq *rq) * The caller must have exclusive ownership of @p (e.g. through * %SCX_OPSS_DISPATCHING). */ -static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq, +static void dispatch_to_local_dsq(struct scx_sched *sch, struct rq *rq, + struct scx_dispatch_q *dst_dsq, struct task_struct *p, u64 enq_flags) { struct rq *src_rq = task_rq(p); @@ -2761,14 +2897,15 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq, * If dispatching to @rq that @p is already on, no lock dancing needed. 
*/ if (rq == src_rq && rq == dst_rq) { - dispatch_enqueue(dst_dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS); + dispatch_enqueue(sch, dst_dsq, p, + enq_flags | SCX_ENQ_CLEAR_OPSS); return; } #ifdef CONFIG_SMP if (src_rq != dst_rq && - unlikely(!task_can_run_on_remote_rq(p, dst_rq, true))) { - dispatch_enqueue(find_global_dsq(p), p, + unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) { + dispatch_enqueue(sch, find_global_dsq(p), p, enq_flags | SCX_ENQ_CLEAR_OPSS); return; } @@ -2806,7 +2943,8 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq, */ if (src_rq == dst_rq) { p->scx.holding_cpu = -1; - dispatch_enqueue(&dst_rq->scx.local_dsq, p, enq_flags); + dispatch_enqueue(sch, &dst_rq->scx.local_dsq, p, + enq_flags); } else { move_remote_task_to_local_dsq(p, enq_flags, src_rq, dst_rq); @@ -2848,7 +2986,8 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq, * was valid in the first place. Make sure that the task is still owned by the * BPF scheduler and claim the ownership before dispatching. 
*/ -static void finish_dispatch(struct rq *rq, struct task_struct *p, +static void finish_dispatch(struct scx_sched *sch, struct rq *rq, + struct task_struct *p, unsigned long qseq_at_dispatch, u64 dsq_id, u64 enq_flags) { @@ -2901,15 +3040,15 @@ retry: BUG_ON(!(p->scx.flags & SCX_TASK_QUEUED)); - dsq = find_dsq_for_dispatch(this_rq(), dsq_id, p); + dsq = find_dsq_for_dispatch(sch, this_rq(), dsq_id, p); if (dsq->id == SCX_DSQ_LOCAL) - dispatch_to_local_dsq(rq, dsq, p, enq_flags); + dispatch_to_local_dsq(sch, rq, dsq, p, enq_flags); else - dispatch_enqueue(dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS); + dispatch_enqueue(sch, dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS); } -static void flush_dispatch_buf(struct rq *rq) +static void flush_dispatch_buf(struct scx_sched *sch, struct rq *rq) { struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); u32 u; @@ -2917,7 +3056,7 @@ static void flush_dispatch_buf(struct rq *rq) for (u = 0; u < dspc->cursor; u++) { struct scx_dsp_buf_ent *ent = &dspc->buf[u]; - finish_dispatch(rq, ent->task, ent->qseq, ent->dsq_id, + finish_dispatch(sch, rq, ent->task, ent->qseq, ent->dsq_id, ent->enq_flags); } @@ -2927,6 +3066,7 @@ static void flush_dispatch_buf(struct rq *rq) static int balance_one(struct rq *rq, struct task_struct *prev) { + struct scx_sched *sch = scx_root; struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); bool prev_on_scx = prev->sched_class == &ext_sched_class; bool prev_on_rq = prev->scx.flags & SCX_TASK_QUEUED; @@ -2936,7 +3076,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev) rq->scx.flags |= SCX_RQ_IN_BALANCE; rq->scx.flags &= ~(SCX_RQ_BAL_PENDING | SCX_RQ_BAL_KEEP); - if (static_branch_unlikely(&scx_ops_cpu_preempt) && + if ((sch->ops.flags & SCX_OPS_HAS_CPU_PREEMPT) && unlikely(rq->scx.cpu_released)) { /* * If the previous sched_class for the current CPU was not SCX, @@ -2944,8 +3084,9 @@ static int balance_one(struct rq *rq, struct task_struct *prev) * core. 
This callback complements ->cpu_release(), which is * emitted in switch_class(). */ - if (SCX_HAS_OP(cpu_acquire)) - SCX_CALL_OP(SCX_KF_REST, cpu_acquire, cpu_of(rq), NULL); + if (SCX_HAS_OP(sch, cpu_acquire)) + SCX_CALL_OP(sch, SCX_KF_REST, cpu_acquire, rq, + cpu_of(rq), NULL); rq->scx.cpu_released = false; } @@ -2959,8 +3100,8 @@ static int balance_one(struct rq *rq, struct task_struct *prev) * scheduler wants to handle this explicitly, it should * implement ->cpu_release(). * - * See scx_ops_disable_workfn() for the explanation on the - * bypassing test. + * See scx_disable_workfn() for the explanation on the bypassing + * test. */ if (prev_on_rq && prev->scx.slice && !scx_rq_bypassing(rq)) { rq->scx.flags |= SCX_RQ_BAL_KEEP; @@ -2972,10 +3113,11 @@ static int balance_one(struct rq *rq, struct task_struct *prev) if (rq->scx.local_dsq.nr) goto has_tasks; - if (consume_global_dsq(rq)) + if (consume_global_dsq(sch, rq)) goto has_tasks; - if (!SCX_HAS_OP(dispatch) || scx_rq_bypassing(rq) || !scx_rq_online(rq)) + if (unlikely(!SCX_HAS_OP(sch, dispatch)) || + scx_rq_bypassing(rq) || !scx_rq_online(rq)) goto no_tasks; dspc->rq = rq; @@ -2990,10 +3132,10 @@ static int balance_one(struct rq *rq, struct task_struct *prev) do { dspc->nr_tasks = 0; - SCX_CALL_OP(SCX_KF_DISPATCH, dispatch, cpu_of(rq), - prev_on_scx ? prev : NULL); + SCX_CALL_OP(sch, SCX_KF_DISPATCH, dispatch, rq, + cpu_of(rq), prev_on_scx ? prev : NULL); - flush_dispatch_buf(rq); + flush_dispatch_buf(sch, rq); if (prev_on_rq && prev->scx.slice) { rq->scx.flags |= SCX_RQ_BAL_KEEP; @@ -3001,7 +3143,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev) } if (rq->scx.local_dsq.nr) goto has_tasks; - if (consume_global_dsq(rq)) + if (consume_global_dsq(sch, rq)) goto has_tasks; /* @@ -3024,10 +3166,10 @@ no_tasks: * Didn't find another task to run. Keep running @prev unless * %SCX_OPS_ENQ_LAST is in effect. 
*/ - if (prev_on_rq && (!static_branch_unlikely(&scx_ops_enq_last) || - scx_rq_bypassing(rq))) { + if (prev_on_rq && + (!(sch->ops.flags & SCX_OPS_ENQ_LAST) || scx_rq_bypassing(rq))) { rq->scx.flags |= SCX_RQ_BAL_KEEP; - __scx_add_event(SCX_EV_DISPATCH_KEEP_LAST, 1); + __scx_add_event(sch, SCX_EV_DISPATCH_KEEP_LAST, 1); goto has_tasks; } rq->scx.flags &= ~SCX_RQ_IN_BALANCE; @@ -3087,32 +3229,36 @@ static void process_ddsp_deferred_locals(struct rq *rq) */ while ((p = list_first_entry_or_null(&rq->scx.ddsp_deferred_locals, struct task_struct, scx.dsq_list.node))) { + struct scx_sched *sch = scx_root; struct scx_dispatch_q *dsq; list_del_init(&p->scx.dsq_list.node); - dsq = find_dsq_for_dispatch(rq, p->scx.ddsp_dsq_id, p); + dsq = find_dsq_for_dispatch(sch, rq, p->scx.ddsp_dsq_id, p); if (!WARN_ON_ONCE(dsq->id != SCX_DSQ_LOCAL)) - dispatch_to_local_dsq(rq, dsq, p, p->scx.ddsp_enq_flags); + dispatch_to_local_dsq(sch, rq, dsq, p, + p->scx.ddsp_enq_flags); } } static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first) { + struct scx_sched *sch = scx_root; + if (p->scx.flags & SCX_TASK_QUEUED) { /* * Core-sched might decide to execute @p before it is * dispatched. Call ops_dequeue() to notify the BPF scheduler. 
*/ - ops_dequeue(p, SCX_DEQ_CORE_SCHED_EXEC); + ops_dequeue(rq, p, SCX_DEQ_CORE_SCHED_EXEC); dispatch_dequeue(rq, p); } p->se.exec_start = rq_clock_task(rq); /* see dequeue_task_scx() on why we skip when !QUEUED */ - if (SCX_HAS_OP(running) && (p->scx.flags & SCX_TASK_QUEUED)) - SCX_CALL_OP_TASK(SCX_KF_REST, running, p); + if (SCX_HAS_OP(sch, running) && (p->scx.flags & SCX_TASK_QUEUED)) + SCX_CALL_OP_TASK(sch, SCX_KF_REST, running, rq, p); clr_task_runnable(p, true); @@ -3155,6 +3301,7 @@ preempt_reason_from_class(const struct sched_class *class) static void switch_class(struct rq *rq, struct task_struct *next) { + struct scx_sched *sch = scx_root; const struct sched_class *next_class = next->sched_class; #ifdef CONFIG_SMP @@ -3165,7 +3312,7 @@ static void switch_class(struct rq *rq, struct task_struct *next) */ smp_store_release(&rq->scx.pnt_seq, rq->scx.pnt_seq + 1); #endif - if (!static_branch_unlikely(&scx_ops_cpu_preempt)) + if (!(sch->ops.flags & SCX_OPS_HAS_CPU_PREEMPT)) return; /* @@ -3187,14 +3334,14 @@ static void switch_class(struct rq *rq, struct task_struct *next) * next time that balance_scx() is invoked. 
*/ if (!rq->scx.cpu_released) { - if (SCX_HAS_OP(cpu_release)) { + if (SCX_HAS_OP(sch, cpu_release)) { struct scx_cpu_release_args args = { .reason = preempt_reason_from_class(next_class), .task = next, }; - SCX_CALL_OP(SCX_KF_CPU_RELEASE, - cpu_release, cpu_of(rq), &args); + SCX_CALL_OP(sch, SCX_KF_CPU_RELEASE, cpu_release, rq, + cpu_of(rq), &args); } rq->scx.cpu_released = true; } @@ -3203,11 +3350,12 @@ static void switch_class(struct rq *rq, struct task_struct *next) static void put_prev_task_scx(struct rq *rq, struct task_struct *p, struct task_struct *next) { + struct scx_sched *sch = scx_root; update_curr_scx(rq); /* see dequeue_task_scx() on why we skip when !QUEUED */ - if (SCX_HAS_OP(stopping) && (p->scx.flags & SCX_TASK_QUEUED)) - SCX_CALL_OP_TASK(SCX_KF_REST, stopping, p, true); + if (SCX_HAS_OP(sch, stopping) && (p->scx.flags & SCX_TASK_QUEUED)) + SCX_CALL_OP_TASK(sch, SCX_KF_REST, stopping, rq, p, true); if (p->scx.flags & SCX_TASK_QUEUED) { set_task_runnable(rq, p); @@ -3219,7 +3367,8 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p, * DSQ. */ if (p->scx.slice && !scx_rq_bypassing(rq)) { - dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD); + dispatch_enqueue(sch, &rq->scx.local_dsq, p, + SCX_ENQ_HEAD); goto switch_class; } @@ -3230,7 +3379,7 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p, * which should trigger an explicit follow-up scheduling event. */ if (sched_class_above(&ext_sched_class, next->sched_class)) { - WARN_ON_ONCE(!static_branch_unlikely(&scx_ops_enq_last)); + WARN_ON_ONCE(!(sch->ops.flags & SCX_OPS_ENQ_LAST)); do_enqueue_task(rq, p, SCX_ENQ_LAST, -1); } else { do_enqueue_task(rq, p, 0, -1); @@ -3283,7 +3432,7 @@ static struct task_struct *pick_task_scx(struct rq *rq) * Can happen while enabling as SCX_RQ_BAL_PENDING assertion is * conditional on scx_enabled() and may have been skipped. 
*/ - WARN_ON_ONCE(scx_ops_enable_state() == SCX_OPS_ENABLED); + WARN_ON_ONCE(scx_enable_state() == SCX_ENABLED); keep_prev = false; } @@ -3294,10 +3443,8 @@ static struct task_struct *pick_task_scx(struct rq *rq) */ if (keep_prev) { p = prev; - if (!p->scx.slice) { - p->scx.slice = SCX_SLICE_DFL; - __scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1); - } + if (!p->scx.slice) + refill_task_slice_dfl(p); } else { p = first_local_task(rq); if (!p) { @@ -3307,13 +3454,14 @@ static struct task_struct *pick_task_scx(struct rq *rq) } if (unlikely(!p->scx.slice)) { - if (!scx_rq_bypassing(rq) && !scx_warned_zero_slice) { + struct scx_sched *sch = scx_root; + + if (!scx_rq_bypassing(rq) && !sch->warned_zero_slice) { printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in %s()\n", p->comm, p->pid, __func__); - scx_warned_zero_slice = true; + sch->warned_zero_slice = true; } - p->scx.slice = SCX_SLICE_DFL; - __scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1); + refill_task_slice_dfl(p); } } @@ -3342,13 +3490,17 @@ static struct task_struct *pick_task_scx(struct rq *rq) bool scx_prio_less(const struct task_struct *a, const struct task_struct *b, bool in_fi) { + struct scx_sched *sch = scx_root; + /* * The const qualifiers are dropped from task_struct pointers when * calling ops.core_sched_before(). Accesses are controlled by the * verifier. 
*/ - if (SCX_HAS_OP(core_sched_before) && !scx_rq_bypassing(task_rq(a))) - return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, core_sched_before, + if (SCX_HAS_OP(sch, core_sched_before) && + !scx_rq_bypassing(task_rq(a))) + return SCX_CALL_OP_2TASKS_RET(sch, SCX_KF_REST, core_sched_before, + NULL, (struct task_struct *)a, (struct task_struct *)b); else @@ -3360,6 +3512,7 @@ bool scx_prio_less(const struct task_struct *a, const struct task_struct *b, static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags) { + struct scx_sched *sch = scx_root; bool rq_bypass; /* @@ -3376,7 +3529,7 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag return prev_cpu; rq_bypass = scx_rq_bypassing(task_rq(p)); - if (SCX_HAS_OP(select_cpu) && !rq_bypass) { + if (likely(SCX_HAS_OP(sch, select_cpu)) && !rq_bypass) { s32 cpu; struct task_struct **ddsp_taskp; @@ -3384,29 +3537,30 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag WARN_ON_ONCE(*ddsp_taskp); *ddsp_taskp = p; - cpu = SCX_CALL_OP_TASK_RET(SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU, - select_cpu, p, prev_cpu, wake_flags); + cpu = SCX_CALL_OP_TASK_RET(sch, + SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU, + select_cpu, NULL, p, prev_cpu, + wake_flags); p->scx.selected_cpu = cpu; *ddsp_taskp = NULL; - if (ops_cpu_valid(cpu, "from ops.select_cpu()")) + if (ops_cpu_valid(sch, cpu, "from ops.select_cpu()")) return cpu; else return prev_cpu; } else { s32 cpu; - cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, 0); + cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, NULL, 0); if (cpu >= 0) { - p->scx.slice = SCX_SLICE_DFL; + refill_task_slice_dfl(p); p->scx.ddsp_dsq_id = SCX_DSQ_LOCAL; - __scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1); } else { cpu = prev_cpu; } p->scx.selected_cpu = cpu; if (rq_bypass) - __scx_add_event(SCX_EV_BYPASS_DISPATCH, 1); + __scx_add_event(sch, SCX_EV_BYPASS_DISPATCH, 1); return cpu; } } @@ -3419,6 +3573,8 @@ static void task_woken_scx(struct rq *rq, 
struct task_struct *p) static void set_cpus_allowed_scx(struct task_struct *p, struct affinity_context *ac) { + struct scx_sched *sch = scx_root; + set_cpus_allowed_common(p, ac); /* @@ -3429,28 +3585,38 @@ static void set_cpus_allowed_scx(struct task_struct *p, * Fine-grained memory write control is enforced by BPF making the const * designation pointless. Cast it away when calling the operation. */ - if (SCX_HAS_OP(set_cpumask)) - SCX_CALL_OP_TASK(SCX_KF_REST, set_cpumask, p, - (struct cpumask *)p->cpus_ptr); + if (SCX_HAS_OP(sch, set_cpumask)) + SCX_CALL_OP_TASK(sch, SCX_KF_REST, set_cpumask, NULL, + p, (struct cpumask *)p->cpus_ptr); } static void handle_hotplug(struct rq *rq, bool online) { + struct scx_sched *sch = scx_root; int cpu = cpu_of(rq); atomic_long_inc(&scx_hotplug_seq); + /* + * scx_root updates are protected by cpus_read_lock() and will stay + * stable here. Note that we can't depend on scx_enabled() test as the + * hotplug ops need to be enabled before __scx_enabled is set. + */ + if (unlikely(!sch)) + return; + if (scx_enabled()) - scx_idle_update_selcpu_topology(&scx_ops); + scx_idle_update_selcpu_topology(&sch->ops); - if (online && SCX_HAS_OP(cpu_online)) - SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_online, cpu); - else if (!online && SCX_HAS_OP(cpu_offline)) - SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_offline, cpu); + if (online && SCX_HAS_OP(sch, cpu_online)) + SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cpu_online, NULL, cpu); + else if (!online && SCX_HAS_OP(sch, cpu_offline)) + SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cpu_offline, NULL, cpu); else - scx_ops_exit(SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG, - "cpu %d going %s, exiting scheduler", cpu, - online ? "online" : "offline"); + scx_exit(sch, SCX_EXIT_UNREG_KERN, + SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG, + "cpu %d going %s, exiting scheduler", cpu, + online ? 
"online" : "offline"); } void scx_rq_activate(struct rq *rq) @@ -3477,11 +3643,16 @@ static void rq_offline_scx(struct rq *rq) static bool check_rq_for_timeouts(struct rq *rq) { + struct scx_sched *sch; struct task_struct *p; struct rq_flags rf; bool timed_out = false; rq_lock_irqsave(rq, &rf); + sch = rcu_dereference_bh(scx_root); + if (unlikely(!sch)) + goto out_unlock; + list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) { unsigned long last_runnable = p->scx.runnable_at; @@ -3489,16 +3660,15 @@ static bool check_rq_for_timeouts(struct rq *rq) last_runnable + scx_watchdog_timeout))) { u32 dur_ms = jiffies_to_msecs(jiffies - last_runnable); - scx_ops_error_kind(SCX_EXIT_ERROR_STALL, - "%s[%d] failed to run for %u.%03us", - p->comm, p->pid, - dur_ms / 1000, dur_ms % 1000); + scx_exit(sch, SCX_EXIT_ERROR_STALL, 0, + "%s[%d] failed to run for %u.%03us", + p->comm, p->pid, dur_ms / 1000, dur_ms % 1000); timed_out = true; break; } } +out_unlock: rq_unlock_irqrestore(rq, &rf); - return timed_out; } @@ -3520,19 +3690,24 @@ static void scx_watchdog_workfn(struct work_struct *work) void scx_tick(struct rq *rq) { + struct scx_sched *sch; unsigned long last_check; if (!scx_enabled()) return; + sch = rcu_dereference_bh(scx_root); + if (unlikely(!sch)) + return; + last_check = READ_ONCE(scx_watchdog_timestamp); if (unlikely(time_after(jiffies, last_check + READ_ONCE(scx_watchdog_timeout)))) { u32 dur_ms = jiffies_to_msecs(jiffies - last_check); - scx_ops_error_kind(SCX_EXIT_ERROR_STALL, - "watchdog failed to check in for %u.%03us", - dur_ms / 1000, dur_ms % 1000); + scx_exit(sch, SCX_EXIT_ERROR_STALL, 0, + "watchdog failed to check in for %u.%03us", + dur_ms / 1000, dur_ms % 1000); } update_other_load_avgs(rq); @@ -3540,6 +3715,8 @@ void scx_tick(struct rq *rq) static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued) { + struct scx_sched *sch = scx_root; + update_curr_scx(rq); /* @@ -3549,8 +3726,8 @@ static void task_tick_scx(struct rq 
*rq, struct task_struct *curr, int queued) if (scx_rq_bypassing(rq)) { curr->scx.slice = 0; touch_core_sched(rq, curr); - } else if (SCX_HAS_OP(tick)) { - SCX_CALL_OP_TASK(SCX_KF_REST, tick, curr); + } else if (SCX_HAS_OP(sch, tick)) { + SCX_CALL_OP_TASK(sch, SCX_KF_REST, tick, rq, curr); } if (!curr->scx.slice) @@ -3615,21 +3792,23 @@ static void scx_set_task_state(struct task_struct *p, enum scx_task_state state) p->scx.flags |= state << SCX_TASK_STATE_SHIFT; } -static int scx_ops_init_task(struct task_struct *p, struct task_group *tg, bool fork) +static int scx_init_task(struct task_struct *p, struct task_group *tg, bool fork) { + struct scx_sched *sch = scx_root; int ret; p->scx.disallow = false; - if (SCX_HAS_OP(init_task)) { + if (SCX_HAS_OP(sch, init_task)) { struct scx_init_task_args args = { SCX_INIT_TASK_ARGS_CGROUP(tg) .fork = fork, }; - ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init_task, p, &args); + ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, init_task, NULL, + p, &args); if (unlikely(ret)) { - ret = ops_sanitize_err("init_task", ret); + ret = ops_sanitize_err(sch, "init_task", ret); return ret; } } @@ -3657,8 +3836,8 @@ static int scx_ops_init_task(struct task_struct *p, struct task_group *tg, bool task_rq_unlock(rq, p, &rf); } else if (p->policy == SCHED_EXT) { - scx_ops_error("ops.init_task() set task->scx.disallow for %s[%d] during fork", - p->comm, p->pid); + scx_error(sch, "ops.init_task() set task->scx.disallow for %s[%d] during fork", + p->comm, p->pid); } } @@ -3666,11 +3845,13 @@ static int scx_ops_init_task(struct task_struct *p, struct task_group *tg, bool return 0; } -static void scx_ops_enable_task(struct task_struct *p) +static void scx_enable_task(struct task_struct *p) { + struct scx_sched *sch = scx_root; + struct rq *rq = task_rq(p); u32 weight; - lockdep_assert_rq_held(task_rq(p)); + lockdep_assert_rq_held(rq); /* * Set the weight before calling ops.enable() so that the scheduler @@ -3683,26 +3864,31 @@ static void 
scx_ops_enable_task(struct task_struct *p) p->scx.weight = sched_weight_to_cgroup(weight); - if (SCX_HAS_OP(enable)) - SCX_CALL_OP_TASK(SCX_KF_REST, enable, p); + if (SCX_HAS_OP(sch, enable)) + SCX_CALL_OP_TASK(sch, SCX_KF_REST, enable, rq, p); scx_set_task_state(p, SCX_TASK_ENABLED); - if (SCX_HAS_OP(set_weight)) - SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, p, p->scx.weight); + if (SCX_HAS_OP(sch, set_weight)) + SCX_CALL_OP_TASK(sch, SCX_KF_REST, set_weight, rq, + p, p->scx.weight); } -static void scx_ops_disable_task(struct task_struct *p) +static void scx_disable_task(struct task_struct *p) { - lockdep_assert_rq_held(task_rq(p)); + struct scx_sched *sch = scx_root; + struct rq *rq = task_rq(p); + + lockdep_assert_rq_held(rq); WARN_ON_ONCE(scx_get_task_state(p) != SCX_TASK_ENABLED); - if (SCX_HAS_OP(disable)) - SCX_CALL_OP_TASK(SCX_KF_REST, disable, p); + if (SCX_HAS_OP(sch, disable)) + SCX_CALL_OP_TASK(sch, SCX_KF_REST, disable, rq, p); scx_set_task_state(p, SCX_TASK_READY); } -static void scx_ops_exit_task(struct task_struct *p) +static void scx_exit_task(struct task_struct *p) { + struct scx_sched *sch = scx_root; struct scx_exit_task_args args = { .cancelled = false, }; @@ -3718,15 +3904,16 @@ static void scx_ops_exit_task(struct task_struct *p) case SCX_TASK_READY: break; case SCX_TASK_ENABLED: - scx_ops_disable_task(p); + scx_disable_task(p); break; default: WARN_ON_ONCE(true); return; } - if (SCX_HAS_OP(exit_task)) - SCX_CALL_OP_TASK(SCX_KF_REST, exit_task, p, &args); + if (SCX_HAS_OP(sch, exit_task)) + SCX_CALL_OP_TASK(sch, SCX_KF_REST, exit_task, task_rq(p), + p, &args); scx_set_task_state(p, SCX_TASK_NONE); } @@ -3758,15 +3945,15 @@ int scx_fork(struct task_struct *p) { percpu_rwsem_assert_held(&scx_fork_rwsem); - if (scx_ops_init_task_enabled) - return scx_ops_init_task(p, task_group(p), true); + if (scx_init_task_enabled) + return scx_init_task(p, task_group(p), true); else return 0; } void scx_post_fork(struct task_struct *p) { - if 
(scx_ops_init_task_enabled) { + if (scx_init_task_enabled) { scx_set_task_state(p, SCX_TASK_READY); /* @@ -3779,7 +3966,7 @@ void scx_post_fork(struct task_struct *p) struct rq *rq; rq = task_rq_lock(p, &rf); - scx_ops_enable_task(p); + scx_enable_task(p); task_rq_unlock(rq, p, &rf); } } @@ -3799,7 +3986,7 @@ void scx_cancel_fork(struct task_struct *p) rq = task_rq_lock(p, &rf); WARN_ON_ONCE(scx_get_task_state(p) >= SCX_TASK_READY); - scx_ops_exit_task(p); + scx_exit_task(p); task_rq_unlock(rq, p, &rf); } @@ -3815,15 +4002,15 @@ void sched_ext_free(struct task_struct *p) spin_unlock_irqrestore(&scx_tasks_lock, flags); /* - * @p is off scx_tasks and wholly ours. scx_ops_enable()'s READY -> - * ENABLED transitions can't race us. Disable ops for @p. + * @p is off scx_tasks and wholly ours. scx_enable()'s READY -> ENABLED + * transitions can't race us. Disable ops for @p. */ if (scx_get_task_state(p) != SCX_TASK_NONE) { struct rq_flags rf; struct rq *rq; rq = task_rq_lock(p, &rf); - scx_ops_exit_task(p); + scx_exit_task(p); task_rq_unlock(rq, p, &rf); } } @@ -3831,11 +4018,14 @@ void sched_ext_free(struct task_struct *p) static void reweight_task_scx(struct rq *rq, struct task_struct *p, const struct load_weight *lw) { + struct scx_sched *sch = scx_root; + lockdep_assert_rq_held(task_rq(p)); p->scx.weight = sched_weight_to_cgroup(scale_load_down(lw->weight)); - if (SCX_HAS_OP(set_weight)) - SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, p, p->scx.weight); + if (SCX_HAS_OP(sch, set_weight)) + SCX_CALL_OP_TASK(sch, SCX_KF_REST, set_weight, rq, + p, p->scx.weight); } static void prio_changed_scx(struct rq *rq, struct task_struct *p, int oldprio) @@ -3844,20 +4034,22 @@ static void prio_changed_scx(struct rq *rq, struct task_struct *p, int oldprio) static void switching_to_scx(struct rq *rq, struct task_struct *p) { - scx_ops_enable_task(p); + struct scx_sched *sch = scx_root; + + scx_enable_task(p); /* * set_cpus_allowed_scx() is not called while @p is associated with a * 
different scheduler class. Keep the BPF scheduler up-to-date. */ - if (SCX_HAS_OP(set_cpumask)) - SCX_CALL_OP_TASK(SCX_KF_REST, set_cpumask, p, - (struct cpumask *)p->cpus_ptr); + if (SCX_HAS_OP(sch, set_cpumask)) + SCX_CALL_OP_TASK(sch, SCX_KF_REST, set_cpumask, rq, + p, (struct cpumask *)p->cpus_ptr); } static void switched_from_scx(struct rq *rq, struct task_struct *p) { - scx_ops_disable_task(p); + scx_disable_task(p); } static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p,int wake_flags) {} @@ -3899,55 +4091,25 @@ bool scx_can_stop_tick(struct rq *rq) DEFINE_STATIC_PERCPU_RWSEM(scx_cgroup_rwsem); static bool scx_cgroup_enabled; -static bool cgroup_warned_missing_weight; -static bool cgroup_warned_missing_idle; - -static void scx_cgroup_warn_missing_weight(struct task_group *tg) -{ - if (scx_ops_enable_state() == SCX_OPS_DISABLED || - cgroup_warned_missing_weight) - return; - - if ((scx_ops.flags & SCX_OPS_HAS_CGROUP_WEIGHT) || !tg->css.parent) - return; - - pr_warn("sched_ext: \"%s\" does not implement cgroup cpu.weight\n", - scx_ops.name); - cgroup_warned_missing_weight = true; -} - -static void scx_cgroup_warn_missing_idle(struct task_group *tg) -{ - if (!scx_cgroup_enabled || cgroup_warned_missing_idle) - return; - - if (!tg->idle) - return; - - pr_warn("sched_ext: \"%s\" does not implement cgroup cpu.idle\n", - scx_ops.name); - cgroup_warned_missing_idle = true; -} int scx_tg_online(struct task_group *tg) { + struct scx_sched *sch = scx_root; int ret = 0; WARN_ON_ONCE(tg->scx_flags & (SCX_TG_ONLINE | SCX_TG_INITED)); percpu_down_read(&scx_cgroup_rwsem); - scx_cgroup_warn_missing_weight(tg); - if (scx_cgroup_enabled) { - if (SCX_HAS_OP(cgroup_init)) { + if (SCX_HAS_OP(sch, cgroup_init)) { struct scx_cgroup_init_args args = { .weight = tg->scx_weight }; - ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_init, - tg->css.cgroup, &args); + ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, cgroup_init, + NULL, tg->css.cgroup, &args); if (ret) - ret = 
ops_sanitize_err("cgroup_init", ret); + ret = ops_sanitize_err(sch, "cgroup_init", ret); } if (ret == 0) tg->scx_flags |= SCX_TG_ONLINE | SCX_TG_INITED; @@ -3961,12 +4123,16 @@ int scx_tg_online(struct task_group *tg) void scx_tg_offline(struct task_group *tg) { + struct scx_sched *sch = scx_root; + WARN_ON_ONCE(!(tg->scx_flags & SCX_TG_ONLINE)); percpu_down_read(&scx_cgroup_rwsem); - if (SCX_HAS_OP(cgroup_exit) && (tg->scx_flags & SCX_TG_INITED)) - SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_exit, tg->css.cgroup); + if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_exit) && + (tg->scx_flags & SCX_TG_INITED)) + SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_exit, NULL, + tg->css.cgroup); tg->scx_flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED); percpu_up_read(&scx_cgroup_rwsem); @@ -3974,6 +4140,7 @@ void scx_tg_offline(struct task_group *tg) int scx_cgroup_can_attach(struct cgroup_taskset *tset) { + struct scx_sched *sch = scx_root; struct cgroup_subsys_state *css; struct task_struct *p; int ret; @@ -3998,8 +4165,9 @@ int scx_cgroup_can_attach(struct cgroup_taskset *tset) if (from == to) continue; - if (SCX_HAS_OP(cgroup_prep_move)) { - ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_prep_move, + if (SCX_HAS_OP(sch, cgroup_prep_move)) { + ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, + cgroup_prep_move, NULL, p, from, css->cgroup); if (ret) goto err; @@ -4012,18 +4180,21 @@ int scx_cgroup_can_attach(struct cgroup_taskset *tset) err: cgroup_taskset_for_each(p, css, tset) { - if (SCX_HAS_OP(cgroup_cancel_move) && p->scx.cgrp_moving_from) - SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_cancel_move, p, - p->scx.cgrp_moving_from, css->cgroup); + if (SCX_HAS_OP(sch, cgroup_cancel_move) && + p->scx.cgrp_moving_from) + SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_cancel_move, NULL, + p, p->scx.cgrp_moving_from, css->cgroup); p->scx.cgrp_moving_from = NULL; } percpu_up_read(&scx_cgroup_rwsem); - return ops_sanitize_err("cgroup_prep_move", ret); + return ops_sanitize_err(sch, "cgroup_prep_move", ret); } void 
scx_cgroup_move_task(struct task_struct *p) { + struct scx_sched *sch = scx_root; + if (!scx_cgroup_enabled) return; @@ -4031,9 +4202,11 @@ void scx_cgroup_move_task(struct task_struct *p) * @p must have ops.cgroup_prep_move() called on it and thus * cgrp_moving_from set. */ - if (SCX_HAS_OP(cgroup_move) && !WARN_ON_ONCE(!p->scx.cgrp_moving_from)) - SCX_CALL_OP_TASK(SCX_KF_UNLOCKED, cgroup_move, p, - p->scx.cgrp_moving_from, tg_cgrp(task_group(p))); + if (SCX_HAS_OP(sch, cgroup_move) && + !WARN_ON_ONCE(!p->scx.cgrp_moving_from)) + SCX_CALL_OP_TASK(sch, SCX_KF_UNLOCKED, cgroup_move, NULL, + p, p->scx.cgrp_moving_from, + tg_cgrp(task_group(p))); p->scx.cgrp_moving_from = NULL; } @@ -4044,6 +4217,7 @@ void scx_cgroup_finish_attach(void) void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) { + struct scx_sched *sch = scx_root; struct cgroup_subsys_state *css; struct task_struct *p; @@ -4051,9 +4225,10 @@ void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) goto out_unlock; cgroup_taskset_for_each(p, css, tset) { - if (SCX_HAS_OP(cgroup_cancel_move) && p->scx.cgrp_moving_from) - SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_cancel_move, p, - p->scx.cgrp_moving_from, css->cgroup); + if (SCX_HAS_OP(sch, cgroup_cancel_move) && + p->scx.cgrp_moving_from) + SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_cancel_move, NULL, + p, p->scx.cgrp_moving_from, css->cgroup); p->scx.cgrp_moving_from = NULL; } out_unlock: @@ -4062,11 +4237,13 @@ out_unlock: void scx_group_set_weight(struct task_group *tg, unsigned long weight) { + struct scx_sched *sch = scx_root; + percpu_down_read(&scx_cgroup_rwsem); if (scx_cgroup_enabled && tg->scx_weight != weight) { - if (SCX_HAS_OP(cgroup_set_weight)) - SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_set_weight, + if (SCX_HAS_OP(sch, cgroup_set_weight)) + SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_set_weight, NULL, tg_cgrp(tg), weight); tg->scx_weight = weight; } @@ -4076,9 +4253,7 @@ void scx_group_set_weight(struct task_group *tg, unsigned long weight) void 
scx_group_set_idle(struct task_group *tg, bool idle) { - percpu_down_read(&scx_cgroup_rwsem); - scx_cgroup_warn_missing_idle(tg); - percpu_up_read(&scx_cgroup_rwsem); + /* TODO: Implement ops->cgroup_set_idle() */ } static void scx_cgroup_lock(void) @@ -4157,29 +4332,6 @@ static void init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id) dsq->id = dsq_id; } -static struct scx_dispatch_q *create_dsq(u64 dsq_id, int node) -{ - struct scx_dispatch_q *dsq; - int ret; - - if (dsq_id & SCX_DSQ_FLAG_BUILTIN) - return ERR_PTR(-EINVAL); - - dsq = kmalloc_node(sizeof(*dsq), GFP_KERNEL, node); - if (!dsq) - return ERR_PTR(-ENOMEM); - - init_dsq(dsq, dsq_id); - - ret = rhashtable_insert_fast(&dsq_hash, &dsq->hash_node, - dsq_hash_params); - if (ret) { - kfree(dsq); - return ERR_PTR(ret); - } - return dsq; -} - static void free_dsq_irq_workfn(struct irq_work *irq_work) { struct llist_node *to_free = llist_del_all(&dsqs_to_free); @@ -4191,26 +4343,27 @@ static void free_dsq_irq_workfn(struct irq_work *irq_work) static DEFINE_IRQ_WORK(free_dsq_irq_work, free_dsq_irq_workfn); -static void destroy_dsq(u64 dsq_id) +static void destroy_dsq(struct scx_sched *sch, u64 dsq_id) { struct scx_dispatch_q *dsq; unsigned long flags; rcu_read_lock(); - dsq = find_user_dsq(dsq_id); + dsq = find_user_dsq(sch, dsq_id); if (!dsq) goto out_unlock_rcu; raw_spin_lock_irqsave(&dsq->lock, flags); if (dsq->nr) { - scx_ops_error("attempting to destroy in-use dsq 0x%016llx (nr=%u)", - dsq->id, dsq->nr); + scx_error(sch, "attempting to destroy in-use dsq 0x%016llx (nr=%u)", + dsq->id, dsq->nr); goto out_unlock_dsq; } - if (rhashtable_remove_fast(&dsq_hash, &dsq->hash_node, dsq_hash_params)) + if (rhashtable_remove_fast(&sch->dsq_hash, &dsq->hash_node, + dsq_hash_params)) goto out_unlock_dsq; /* @@ -4230,7 +4383,7 @@ out_unlock_rcu: } #ifdef CONFIG_EXT_GROUP_SCHED -static void scx_cgroup_exit(void) +static void scx_cgroup_exit(struct scx_sched *sch) { struct cgroup_subsys_state *css; @@ -4250,14 +4403,15 @@ 
static void scx_cgroup_exit(void) continue; tg->scx_flags &= ~SCX_TG_INITED; - if (!scx_ops.cgroup_exit) + if (!sch->ops.cgroup_exit) continue; if (WARN_ON_ONCE(!css_tryget(css))) continue; rcu_read_unlock(); - SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_exit, css->cgroup); + SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_exit, NULL, + css->cgroup); rcu_read_lock(); css_put(css); @@ -4265,16 +4419,13 @@ static void scx_cgroup_exit(void) rcu_read_unlock(); } -static int scx_cgroup_init(void) +static int scx_cgroup_init(struct scx_sched *sch) { struct cgroup_subsys_state *css; int ret; percpu_rwsem_assert_held(&scx_cgroup_rwsem); - cgroup_warned_missing_weight = false; - cgroup_warned_missing_idle = false; - /* * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk * cgroups and init, all online cgroups are initialized. @@ -4284,14 +4435,11 @@ static int scx_cgroup_init(void) struct task_group *tg = css_tg(css); struct scx_cgroup_init_args args = { .weight = tg->scx_weight }; - scx_cgroup_warn_missing_weight(tg); - scx_cgroup_warn_missing_idle(tg); - if ((tg->scx_flags & (SCX_TG_ONLINE | SCX_TG_INITED)) != SCX_TG_ONLINE) continue; - if (!scx_ops.cgroup_init) { + if (!sch->ops.cgroup_init) { tg->scx_flags |= SCX_TG_INITED; continue; } @@ -4300,11 +4448,11 @@ static int scx_cgroup_init(void) continue; rcu_read_unlock(); - ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_init, + ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, cgroup_init, NULL, css->cgroup, &args); if (ret) { css_put(css); - scx_ops_error("ops.cgroup_init() failed (%d)", ret); + scx_error(sch, "ops.cgroup_init() failed (%d)", ret); return ret; } tg->scx_flags |= SCX_TG_INITED; @@ -4321,8 +4469,8 @@ static int scx_cgroup_init(void) } #else -static void scx_cgroup_exit(void) {} -static int scx_cgroup_init(void) { return 0; } +static void scx_cgroup_exit(struct scx_sched *sch) {} +static int scx_cgroup_init(struct scx_sched *sch) { return 0; } #endif @@ -4339,8 +4487,7 @@ static int scx_cgroup_init(void) { 
return 0; } static ssize_t scx_attr_state_show(struct kobject *kobj, struct kobj_attribute *ka, char *buf) { - return sysfs_emit(buf, "%s\n", - scx_ops_enable_state_str[scx_ops_enable_state()]); + return sysfs_emit(buf, "%s\n", scx_enable_state_str[scx_enable_state()]); } SCX_ATTR(state); @@ -4385,15 +4532,51 @@ static const struct attribute_group scx_global_attr_group = { .attrs = scx_global_attrs, }; +static void free_exit_info(struct scx_exit_info *ei); + +static void scx_sched_free_rcu_work(struct work_struct *work) +{ + struct rcu_work *rcu_work = to_rcu_work(work); + struct scx_sched *sch = container_of(rcu_work, struct scx_sched, rcu_work); + struct rhashtable_iter rht_iter; + struct scx_dispatch_q *dsq; + int node; + + kthread_stop(sch->helper->task); + free_percpu(sch->event_stats_cpu); + + for_each_node_state(node, N_POSSIBLE) + kfree(sch->global_dsqs[node]); + kfree(sch->global_dsqs); + + rhashtable_walk_enter(&sch->dsq_hash, &rht_iter); + do { + rhashtable_walk_start(&rht_iter); + + while ((dsq = rhashtable_walk_next(&rht_iter)) && !IS_ERR(dsq)) + destroy_dsq(sch, dsq->id); + + rhashtable_walk_stop(&rht_iter); + } while (dsq == ERR_PTR(-EAGAIN)); + rhashtable_walk_exit(&rht_iter); + + rhashtable_free_and_destroy(&sch->dsq_hash, NULL, NULL); + free_exit_info(sch->exit_info); + kfree(sch); +} + static void scx_kobj_release(struct kobject *kobj) { - kfree(kobj); + struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj); + + INIT_RCU_WORK(&sch->rcu_work, scx_sched_free_rcu_work); + queue_rcu_work(system_unbound_wq, &sch->rcu_work); } static ssize_t scx_attr_ops_show(struct kobject *kobj, struct kobj_attribute *ka, char *buf) { - return sysfs_emit(buf, "%s\n", scx_ops.name); + return sysfs_emit(buf, "%s\n", scx_root->ops.name); } SCX_ATTR(ops); @@ -4404,16 +4587,17 @@ SCX_ATTR(ops); static ssize_t scx_attr_events_show(struct kobject *kobj, struct kobj_attribute *ka, char *buf) { + struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj); 
struct scx_event_stats events; int at = 0; - scx_bpf_events(&events, sizeof(events)); + scx_read_events(sch, &events); at += scx_attr_event_show(buf, at, &events, SCX_EV_SELECT_CPU_FALLBACK); at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE); at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_KEEP_LAST); at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SKIP_EXITING); at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED); - at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SLICE_DFL); + at += scx_attr_event_show(buf, at, &events, SCX_EV_REFILL_SLICE_DFL); at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_DURATION); at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_DISPATCH); at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_ACTIVATE); @@ -4436,7 +4620,7 @@ static const struct kobj_type scx_ktype = { static int scx_uevent(const struct kobject *kobj, struct kobj_uevent_env *env) { - return add_uevent_var(env, "SCXOPS=%s", scx_ops.name); + return add_uevent_var(env, "SCXOPS=%s", scx_root->ops.name); } static const struct kset_uevent_ops scx_uevent_ops = { @@ -4449,14 +4633,20 @@ static const struct kset_uevent_ops scx_uevent_ops = { */ bool task_should_scx(int policy) { - if (!scx_enabled() || - unlikely(scx_ops_enable_state() == SCX_OPS_DISABLING)) + if (!scx_enabled() || unlikely(scx_enable_state() == SCX_DISABLING)) return false; if (READ_ONCE(scx_switching_all)) return true; return policy == SCHED_EXT; } +bool scx_allow_ttwu_queue(const struct task_struct *p) +{ + return !scx_enabled() || + (scx_root->ops.flags & SCX_OPS_ALLOW_QUEUED_WAKEUP) || + p->sched_class != &ext_sched_class; +} + /** * scx_softlockup - sched_ext softlockup handler * @dur_s: number of seconds of CPU stuck due to soft lockup @@ -4469,40 +4659,48 @@ bool task_should_scx(int policy) */ void scx_softlockup(u32 dur_s) { - switch (scx_ops_enable_state()) { - case SCX_OPS_ENABLING: - case 
SCX_OPS_ENABLED: + struct scx_sched *sch; + + rcu_read_lock(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + goto out_unlock; + + switch (scx_enable_state()) { + case SCX_ENABLING: + case SCX_ENABLED: break; default: - return; + goto out_unlock; } - /* allow only one instance, cleared at the end of scx_ops_bypass() */ + /* allow only one instance, cleared at the end of scx_bypass() */ if (test_and_set_bit(0, &scx_in_softlockup)) - return; + goto out_unlock; printk_deferred(KERN_ERR "sched_ext: Soft lockup - CPU%d stuck for %us, disabling \"%s\"\n", - smp_processor_id(), dur_s, scx_ops.name); + smp_processor_id(), dur_s, scx_root->ops.name); /* * Some CPUs may be trapped in the dispatch paths. Enable breather - * immediately; otherwise, we might even be able to get to - * scx_ops_bypass(). + * immediately; otherwise, we might even be able to get to scx_bypass(). */ - atomic_inc(&scx_ops_breather_depth); + atomic_inc(&scx_breather_depth); - scx_ops_error("soft lockup - CPU#%d stuck for %us", - smp_processor_id(), dur_s); + scx_error(sch, "soft lockup - CPU#%d stuck for %us", smp_processor_id(), dur_s); +out_unlock: + rcu_read_unlock(); } static void scx_clear_softlockup(void) { if (test_and_clear_bit(0, &scx_in_softlockup)) - atomic_dec(&scx_ops_breather_depth); + atomic_dec(&scx_breather_depth); } /** - * scx_ops_bypass - [Un]bypass scx_ops and guarantee forward progress + * scx_bypass - [Un]bypass scx_ops and guarantee forward progress * @bypass: true for bypass, false for unbypass * * Bypassing guarantees that all runnable tasks make forward progress without @@ -4532,32 +4730,36 @@ static void scx_clear_softlockup(void) * * - scx_prio_less() reverts to the default core_sched_at order. 
*/ -static void scx_ops_bypass(bool bypass) +static void scx_bypass(bool bypass) { static DEFINE_RAW_SPINLOCK(bypass_lock); static unsigned long bypass_timestamp; - - int cpu; + struct scx_sched *sch; unsigned long flags; + int cpu; raw_spin_lock_irqsave(&bypass_lock, flags); + sch = rcu_dereference_bh(scx_root); + if (bypass) { - scx_ops_bypass_depth++; - WARN_ON_ONCE(scx_ops_bypass_depth <= 0); - if (scx_ops_bypass_depth != 1) + scx_bypass_depth++; + WARN_ON_ONCE(scx_bypass_depth <= 0); + if (scx_bypass_depth != 1) goto unlock; bypass_timestamp = ktime_get_ns(); - scx_add_event(SCX_EV_BYPASS_ACTIVATE, 1); + if (sch) + scx_add_event(sch, SCX_EV_BYPASS_ACTIVATE, 1); } else { - scx_ops_bypass_depth--; - WARN_ON_ONCE(scx_ops_bypass_depth < 0); - if (scx_ops_bypass_depth != 0) + scx_bypass_depth--; + WARN_ON_ONCE(scx_bypass_depth < 0); + if (scx_bypass_depth != 0) goto unlock; - scx_add_event(SCX_EV_BYPASS_DURATION, - ktime_get_ns() - bypass_timestamp); + if (sch) + scx_add_event(sch, SCX_EV_BYPASS_DURATION, + ktime_get_ns() - bypass_timestamp); } - atomic_inc(&scx_ops_breather_depth); + atomic_inc(&scx_breather_depth); /* * No task property is changing. 
We just need to make sure all currently @@ -4615,7 +4817,7 @@ static void scx_ops_bypass(bool bypass) raw_spin_rq_unlock(rq); } - atomic_dec(&scx_ops_breather_depth); + atomic_dec(&scx_breather_depth); unlock: raw_spin_unlock_irqrestore(&bypass_lock, flags); scx_clear_softlockup(); @@ -4623,7 +4825,7 @@ unlock: static void free_exit_info(struct scx_exit_info *ei) { - kfree(ei->dump); + kvfree(ei->dump); kfree(ei->msg); kfree(ei->bt); kfree(ei); @@ -4639,7 +4841,7 @@ static struct scx_exit_info *alloc_exit_info(size_t exit_dump_len) ei->bt = kcalloc(SCX_EXIT_BT_LEN, sizeof(ei->bt[0]), GFP_KERNEL); ei->msg = kzalloc(SCX_EXIT_MSG_LEN, GFP_KERNEL); - ei->dump = kzalloc(exit_dump_len, GFP_KERNEL); + ei->dump = kvzalloc(exit_dump_len, GFP_KERNEL); if (!ei->bt || !ei->msg || !ei->dump) { free_exit_info(ei); @@ -4671,42 +4873,36 @@ static const char *scx_exit_reason(enum scx_exit_kind kind) } } -static void scx_ops_disable_workfn(struct kthread_work *work) +static void scx_disable_workfn(struct kthread_work *work) { - struct scx_exit_info *ei = scx_exit_info; + struct scx_sched *sch = container_of(work, struct scx_sched, disable_work); + struct scx_exit_info *ei = sch->exit_info; struct scx_task_iter sti; struct task_struct *p; - struct rhashtable_iter rht_iter; - struct scx_dispatch_q *dsq; - int i, kind, cpu; + int kind, cpu; - kind = atomic_read(&scx_exit_kind); + kind = atomic_read(&sch->exit_kind); while (true) { - /* - * NONE indicates that a new scx_ops has been registered since - * disable was scheduled - don't kill the new ops. DONE - * indicates that the ops has already been disabled. - */ - if (kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE) + if (kind == SCX_EXIT_DONE) /* already disabled? 
*/ return; - if (atomic_try_cmpxchg(&scx_exit_kind, &kind, SCX_EXIT_DONE)) + WARN_ON_ONCE(kind == SCX_EXIT_NONE); + if (atomic_try_cmpxchg(&sch->exit_kind, &kind, SCX_EXIT_DONE)) break; } ei->kind = kind; ei->reason = scx_exit_reason(ei->kind); /* guarantee forward progress by bypassing scx_ops */ - scx_ops_bypass(true); + scx_bypass(true); - switch (scx_ops_set_enable_state(SCX_OPS_DISABLING)) { - case SCX_OPS_DISABLING: + switch (scx_set_enable_state(SCX_DISABLING)) { + case SCX_DISABLING: WARN_ONCE(true, "sched_ext: duplicate disabling instance?"); break; - case SCX_OPS_DISABLED: + case SCX_DISABLED: pr_warn("sched_ext: ops error detected without ops (%s)\n", - scx_exit_info->msg); - WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_DISABLED) != - SCX_OPS_DISABLING); + sch->exit_info->msg); + WARN_ON_ONCE(scx_set_enable_state(SCX_DISABLED) != SCX_DISABLING); goto done; default: break; @@ -4717,17 +4913,17 @@ static void scx_ops_disable_workfn(struct kthread_work *work) * we can safely use blocking synchronization constructs. Actually * disable ops. */ - mutex_lock(&scx_ops_enable_mutex); + mutex_lock(&scx_enable_mutex); static_branch_disable(&__scx_switched_all); WRITE_ONCE(scx_switching_all, false); /* * Shut down cgroup support before tasks so that the cgroup attach path - * doesn't race against scx_ops_exit_task(). + * doesn't race against scx_exit_task(). 
*/ scx_cgroup_lock(); - scx_cgroup_exit(); + scx_cgroup_exit(sch); scx_cgroup_unlock(); /* @@ -4736,7 +4932,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work) */ percpu_down_write(&scx_fork_rwsem); - scx_ops_init_task_enabled = false; + scx_init_task_enabled = false; scx_task_iter_start(&sti); while ((p = scx_task_iter_next_locked(&sti))) { @@ -4756,7 +4952,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work) sched_enq_and_set_task(&ctx); check_class_changed(task_rq(p), p, old_class, p->prio); - scx_ops_exit_task(p); + scx_exit_task(p); } scx_task_iter_stop(&sti); percpu_up_write(&scx_fork_rwsem); @@ -4771,98 +4967,71 @@ static void scx_ops_disable_workfn(struct kthread_work *work) } /* no task is on scx, turn off all the switches and flush in-progress calls */ - static_branch_disable(&__scx_ops_enabled); - for (i = SCX_OPI_BEGIN; i < SCX_OPI_END; i++) - static_branch_disable(&scx_has_op[i]); - static_branch_disable(&scx_ops_allow_queued_wakeup); - static_branch_disable(&scx_ops_enq_last); - static_branch_disable(&scx_ops_enq_exiting); - static_branch_disable(&scx_ops_enq_migration_disabled); - static_branch_disable(&scx_ops_cpu_preempt); + static_branch_disable(&__scx_enabled); + bitmap_zero(sch->has_op, SCX_OPI_END); scx_idle_disable(); synchronize_rcu(); if (ei->kind >= SCX_EXIT_ERROR) { pr_err("sched_ext: BPF scheduler \"%s\" disabled (%s)\n", - scx_ops.name, ei->reason); + sch->ops.name, ei->reason); if (ei->msg[0] != '\0') - pr_err("sched_ext: %s: %s\n", scx_ops.name, ei->msg); + pr_err("sched_ext: %s: %s\n", sch->ops.name, ei->msg); #ifdef CONFIG_STACKTRACE stack_trace_print(ei->bt, ei->bt_len, 2); #endif } else { pr_info("sched_ext: BPF scheduler \"%s\" disabled (%s)\n", - scx_ops.name, ei->reason); + sch->ops.name, ei->reason); } - if (scx_ops.exit) - SCX_CALL_OP(SCX_KF_UNLOCKED, exit, ei); + if (sch->ops.exit) + SCX_CALL_OP(sch, SCX_KF_UNLOCKED, exit, NULL, ei); cancel_delayed_work_sync(&scx_watchdog_work); /* - * Delete the 
kobject from the hierarchy eagerly in addition to just - * dropping a reference. Otherwise, if the object is deleted - * asynchronously, sysfs could observe an object of the same name still - * in the hierarchy when another scheduler is loaded. + * scx_root clearing must be inside cpus_read_lock(). See + * handle_hotplug(). */ - kobject_del(scx_root_kobj); - kobject_put(scx_root_kobj); - scx_root_kobj = NULL; - - memset(&scx_ops, 0, sizeof(scx_ops)); - - rhashtable_walk_enter(&dsq_hash, &rht_iter); - do { - rhashtable_walk_start(&rht_iter); - - while ((dsq = rhashtable_walk_next(&rht_iter)) && !IS_ERR(dsq)) - destroy_dsq(dsq->id); + cpus_read_lock(); + RCU_INIT_POINTER(scx_root, NULL); + cpus_read_unlock(); - rhashtable_walk_stop(&rht_iter); - } while (dsq == ERR_PTR(-EAGAIN)); - rhashtable_walk_exit(&rht_iter); + /* + * Delete the kobject from the hierarchy synchronously. Otherwise, sysfs + * could observe an object of the same name still in the hierarchy when + * the next scheduler is loaded. + */ + kobject_del(&sch->kobj); free_percpu(scx_dsp_ctx); scx_dsp_ctx = NULL; scx_dsp_max_batch = 0; - free_exit_info(scx_exit_info); - scx_exit_info = NULL; + mutex_unlock(&scx_enable_mutex); - mutex_unlock(&scx_ops_enable_mutex); - - WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_DISABLED) != - SCX_OPS_DISABLING); + WARN_ON_ONCE(scx_set_enable_state(SCX_DISABLED) != SCX_DISABLING); done: - scx_ops_bypass(false); -} - -static DEFINE_KTHREAD_WORK(scx_ops_disable_work, scx_ops_disable_workfn); - -static void schedule_scx_ops_disable_work(void) -{ - struct kthread_worker *helper = READ_ONCE(scx_ops_helper); - - /* - * We may be called spuriously before the first bpf_sched_ext_reg(). If - * scx_ops_helper isn't set up yet, there's nothing to do. 
- */ - if (helper) - kthread_queue_work(helper, &scx_ops_disable_work); + scx_bypass(false); } -static void scx_ops_disable(enum scx_exit_kind kind) +static void scx_disable(enum scx_exit_kind kind) { int none = SCX_EXIT_NONE; + struct scx_sched *sch; if (WARN_ON_ONCE(kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE)) kind = SCX_EXIT_ERROR; - atomic_try_cmpxchg(&scx_exit_kind, &none, kind); - - schedule_scx_ops_disable_work(); + rcu_read_lock(); + sch = rcu_dereference(scx_root); + if (sch) { + atomic_try_cmpxchg(&sch->exit_kind, &none, kind); + kthread_queue_work(sch->helper, &sch->disable_work); + } + rcu_read_unlock(); } static void dump_newline(struct seq_buf *s) @@ -4980,6 +5149,7 @@ static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx, struct task_struct *p, char marker) { static unsigned long bt[SCX_EXIT_BT_LEN]; + struct scx_sched *sch = scx_root; char dsq_id_buf[19] = "(n/a)"; unsigned long ops_state = atomic_long_read(&p->scx.ops_state); unsigned int bt_len = 0; @@ -5002,9 +5172,9 @@ static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx, p->scx.dsq_vtime, p->scx.slice, p->scx.weight); dump_line(s, " cpus=%*pb", cpumask_pr_args(p->cpus_ptr)); - if (SCX_HAS_OP(dump_task)) { + if (SCX_HAS_OP(sch, dump_task)) { ops_dump_init(s, " "); - SCX_CALL_OP(SCX_KF_REST, dump_task, dctx, p); + SCX_CALL_OP(sch, SCX_KF_REST, dump_task, NULL, dctx, p); ops_dump_exit(); } @@ -5021,6 +5191,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) { static DEFINE_SPINLOCK(dump_lock); static const char trunc_marker[] = "\n\n~~~~ TRUNCATED ~~~~\n"; + struct scx_sched *sch = scx_root; struct scx_dump_ctx dctx = { .kind = ei->kind, .exit_code = ei->exit_code, @@ -5049,9 +5220,9 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) dump_stack_trace(&s, " ", ei->bt, ei->bt_len); } - if (SCX_HAS_OP(dump)) { + if (SCX_HAS_OP(sch, dump)) { ops_dump_init(&s, ""); - SCX_CALL_OP(SCX_KF_UNLOCKED, dump, &dctx); + 
SCX_CALL_OP(sch, SCX_KF_UNLOCKED, dump, NULL, &dctx); ops_dump_exit(); } @@ -5072,7 +5243,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) idle = list_empty(&rq->scx.runnable_list) && rq->curr->sched_class == &idle_sched_class; - if (idle && !SCX_HAS_OP(dump_cpu)) + if (idle && !SCX_HAS_OP(sch, dump_cpu)) goto next; /* @@ -5106,9 +5277,10 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) cpumask_pr_args(rq->scx.cpus_to_wait)); used = seq_buf_used(&ns); - if (SCX_HAS_OP(dump_cpu)) { + if (SCX_HAS_OP(sch, dump_cpu)) { ops_dump_init(&ns, " "); - SCX_CALL_OP(SCX_KF_REST, dump_cpu, &dctx, cpu, idle); + SCX_CALL_OP(sch, SCX_KF_REST, dump_cpu, NULL, + &dctx, cpu, idle); ops_dump_exit(); } @@ -5142,13 +5314,13 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) dump_line(&s, "Event counters"); dump_line(&s, "--------------"); - scx_bpf_events(&events, sizeof(events)); + scx_read_events(sch, &events); scx_dump_event(s, &events, SCX_EV_SELECT_CPU_FALLBACK); scx_dump_event(s, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE); scx_dump_event(s, &events, SCX_EV_DISPATCH_KEEP_LAST); scx_dump_event(s, &events, SCX_EV_ENQ_SKIP_EXITING); scx_dump_event(s, &events, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED); - scx_dump_event(s, &events, SCX_EV_ENQ_SLICE_DFL); + scx_dump_event(s, &events, SCX_EV_REFILL_SLICE_DFL); scx_dump_event(s, &events, SCX_EV_BYPASS_DURATION); scx_dump_event(s, &events, SCX_EV_BYPASS_DISPATCH); scx_dump_event(s, &events, SCX_EV_BYPASS_ACTIVATE); @@ -5160,27 +5332,25 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) spin_unlock_irqrestore(&dump_lock, flags); } -static void scx_ops_error_irq_workfn(struct irq_work *irq_work) +static void scx_error_irq_workfn(struct irq_work *irq_work) { - struct scx_exit_info *ei = scx_exit_info; + struct scx_sched *sch = container_of(irq_work, struct scx_sched, error_irq_work); + struct scx_exit_info *ei = sch->exit_info; if (ei->kind >= 
SCX_EXIT_ERROR) - scx_dump_state(ei, scx_ops.exit_dump_len); + scx_dump_state(ei, sch->ops.exit_dump_len); - schedule_scx_ops_disable_work(); + kthread_queue_work(sch->helper, &sch->disable_work); } -static DEFINE_IRQ_WORK(scx_ops_error_irq_work, scx_ops_error_irq_workfn); - -static __printf(3, 4) void scx_ops_exit_kind(enum scx_exit_kind kind, - s64 exit_code, - const char *fmt, ...) +static void scx_vexit(struct scx_sched *sch, + enum scx_exit_kind kind, s64 exit_code, + const char *fmt, va_list args) { - struct scx_exit_info *ei = scx_exit_info; + struct scx_exit_info *ei = sch->exit_info; int none = SCX_EXIT_NONE; - va_list args; - if (!atomic_try_cmpxchg(&scx_exit_kind, &none, kind)) + if (!atomic_try_cmpxchg(&sch->exit_kind, &none, kind)) return; ei->exit_code = exit_code; @@ -5188,31 +5358,98 @@ static __printf(3, 4) void scx_ops_exit_kind(enum scx_exit_kind kind, if (kind >= SCX_EXIT_ERROR) ei->bt_len = stack_trace_save(ei->bt, SCX_EXIT_BT_LEN, 1); #endif - va_start(args, fmt); vscnprintf(ei->msg, SCX_EXIT_MSG_LEN, fmt, args); - va_end(args); /* * Set ei->kind and ->reason for scx_dump_state(). They'll be set again - * in scx_ops_disable_workfn(). + * in scx_disable_workfn(). 
*/ ei->kind = kind; ei->reason = scx_exit_reason(ei->kind); - irq_work_queue(&scx_ops_error_irq_work); + irq_work_queue(&sch->error_irq_work); } -static struct kthread_worker *scx_create_rt_helper(const char *name) +static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops) { - struct kthread_worker *helper; + struct scx_sched *sch; + int node, ret; - helper = kthread_run_worker(0, name); - if (helper) - sched_set_fifo(helper->task); - return helper; -} + sch = kzalloc(sizeof(*sch), GFP_KERNEL); + if (!sch) + return ERR_PTR(-ENOMEM); + + sch->exit_info = alloc_exit_info(ops->exit_dump_len); + if (!sch->exit_info) { + ret = -ENOMEM; + goto err_free_sch; + } -static void check_hotplug_seq(const struct sched_ext_ops *ops) + ret = rhashtable_init(&sch->dsq_hash, &dsq_hash_params); + if (ret < 0) + goto err_free_ei; + + sch->global_dsqs = kcalloc(nr_node_ids, sizeof(sch->global_dsqs[0]), + GFP_KERNEL); + if (!sch->global_dsqs) { + ret = -ENOMEM; + goto err_free_hash; + } + + for_each_node_state(node, N_POSSIBLE) { + struct scx_dispatch_q *dsq; + + dsq = kzalloc_node(sizeof(*dsq), GFP_KERNEL, node); + if (!dsq) { + ret = -ENOMEM; + goto err_free_gdsqs; + } + + init_dsq(dsq, SCX_DSQ_GLOBAL); + sch->global_dsqs[node] = dsq; + } + + sch->event_stats_cpu = alloc_percpu(struct scx_event_stats); + if (!sch->event_stats_cpu) + goto err_free_gdsqs; + + sch->helper = kthread_run_worker(0, "sched_ext_helper"); + if (!sch->helper) + goto err_free_event_stats; + sched_set_fifo(sch->helper->task); + + atomic_set(&sch->exit_kind, SCX_EXIT_NONE); + init_irq_work(&sch->error_irq_work, scx_error_irq_workfn); + kthread_init_work(&sch->disable_work, scx_disable_workfn); + sch->ops = *ops; + ops->priv = sch; + + sch->kobj.kset = scx_kset; + ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root"); + if (ret < 0) + goto err_stop_helper; + + return sch; + +err_stop_helper: + kthread_stop(sch->helper->task); +err_free_event_stats: + free_percpu(sch->event_stats_cpu); 
+err_free_gdsqs: + for_each_node_state(node, N_POSSIBLE) + kfree(sch->global_dsqs[node]); + kfree(sch->global_dsqs); +err_free_hash: + rhashtable_free_and_destroy(&sch->dsq_hash, NULL, NULL); +err_free_ei: + free_exit_info(sch->exit_info); +err_free_sch: + kfree(sch); + return ERR_PTR(ret); +} + +static void check_hotplug_seq(struct scx_sched *sch, + const struct sched_ext_ops *ops) { unsigned long long global_hotplug_seq; @@ -5224,21 +5461,22 @@ static void check_hotplug_seq(const struct sched_ext_ops *ops) if (ops->hotplug_seq) { global_hotplug_seq = atomic_long_read(&scx_hotplug_seq); if (ops->hotplug_seq != global_hotplug_seq) { - scx_ops_exit(SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG, - "expected hotplug seq %llu did not match actual %llu", - ops->hotplug_seq, global_hotplug_seq); + scx_exit(sch, SCX_EXIT_UNREG_KERN, + SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG, + "expected hotplug seq %llu did not match actual %llu", + ops->hotplug_seq, global_hotplug_seq); } } } -static int validate_ops(const struct sched_ext_ops *ops) +static int validate_ops(struct scx_sched *sch, const struct sched_ext_ops *ops) { /* * It doesn't make sense to specify the SCX_OPS_ENQ_LAST flag if the * ops.enqueue() callback isn't implemented. 
*/ if ((ops->flags & SCX_OPS_ENQ_LAST) && !ops->enqueue) { - scx_ops_error("SCX_OPS_ENQ_LAST requires ops.enqueue() to be implemented"); + scx_error(sch, "SCX_OPS_ENQ_LAST requires ops.enqueue() to be implemented"); return -EINVAL; } @@ -5248,19 +5486,23 @@ static int validate_ops(const struct sched_ext_ops *ops) */ if ((ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE) && (ops->update_idle && !(ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE))) { - scx_ops_error("SCX_OPS_BUILTIN_IDLE_PER_NODE requires CPU idle selection enabled"); + scx_error(sch, "SCX_OPS_BUILTIN_IDLE_PER_NODE requires CPU idle selection enabled"); return -EINVAL; } + if (ops->flags & SCX_OPS_HAS_CGROUP_WEIGHT) + pr_warn("SCX_OPS_HAS_CGROUP_WEIGHT is deprecated and a noop\n"); + return 0; } -static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) +static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) { + struct scx_sched *sch; struct scx_task_iter sti; struct task_struct *p; unsigned long timeout; - int i, cpu, node, ret; + int i, cpu, ret; if (!cpumask_equal(housekeeping_cpumask(HK_TYPE_DOMAIN), cpu_possible_mask)) { @@ -5268,87 +5510,25 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) return -EINVAL; } - mutex_lock(&scx_ops_enable_mutex); + mutex_lock(&scx_enable_mutex); - /* - * Clear event counters so a new scx scheduler gets - * fresh event counter values. 
- */ - for_each_possible_cpu(cpu) { - struct scx_event_stats *e = per_cpu_ptr(&event_stats_cpu, cpu); - memset(e, 0, sizeof(*e)); - } - - if (!scx_ops_helper) { - WRITE_ONCE(scx_ops_helper, - scx_create_rt_helper("sched_ext_ops_helper")); - if (!scx_ops_helper) { - ret = -ENOMEM; - goto err_unlock; - } - } - - if (!global_dsqs) { - struct scx_dispatch_q **dsqs; - - dsqs = kcalloc(nr_node_ids, sizeof(dsqs[0]), GFP_KERNEL); - if (!dsqs) { - ret = -ENOMEM; - goto err_unlock; - } - - for_each_node_state(node, N_POSSIBLE) { - struct scx_dispatch_q *dsq; - - dsq = kzalloc_node(sizeof(*dsq), GFP_KERNEL, node); - if (!dsq) { - for_each_node_state(node, N_POSSIBLE) - kfree(dsqs[node]); - kfree(dsqs); - ret = -ENOMEM; - goto err_unlock; - } - - init_dsq(dsq, SCX_DSQ_GLOBAL); - dsqs[node] = dsq; - } - - global_dsqs = dsqs; - } - - if (scx_ops_enable_state() != SCX_OPS_DISABLED) { + if (scx_enable_state() != SCX_DISABLED) { ret = -EBUSY; goto err_unlock; } - scx_root_kobj = kzalloc(sizeof(*scx_root_kobj), GFP_KERNEL); - if (!scx_root_kobj) { - ret = -ENOMEM; + sch = scx_alloc_and_add_sched(ops); + if (IS_ERR(sch)) { + ret = PTR_ERR(sch); goto err_unlock; } - scx_root_kobj->kset = scx_kset; - ret = kobject_init_and_add(scx_root_kobj, &scx_ktype, NULL, "root"); - if (ret < 0) - goto err; - - scx_exit_info = alloc_exit_info(ops->exit_dump_len); - if (!scx_exit_info) { - ret = -ENOMEM; - goto err_del; - } - /* - * Set scx_ops, transition to ENABLING and clear exit info to arm the - * disable path. Failure triggers full disabling from here on. + * Transition to ENABLING and clear exit info to arm the disable path. + * Failure triggers full disabling from here on. 
*/ - scx_ops = *ops; - - WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_ENABLING) != - SCX_OPS_DISABLED); - - atomic_set(&scx_exit_kind, SCX_EXIT_NONE); - scx_warned_zero_slice = false; + WARN_ON_ONCE(scx_set_enable_state(SCX_ENABLING) != SCX_DISABLED); + WARN_ON_ONCE(scx_root); atomic_long_set(&scx_nr_rejected, 0); @@ -5361,26 +5541,34 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) */ cpus_read_lock(); - if (scx_ops.init) { - ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init); + /* + * Make the scheduler instance visible. Must be inside cpus_read_lock(). + * See handle_hotplug(). + */ + rcu_assign_pointer(scx_root, sch); + + scx_idle_enable(ops); + + if (sch->ops.init) { + ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, init, NULL); if (ret) { - ret = ops_sanitize_err("init", ret); + ret = ops_sanitize_err(sch, "init", ret); cpus_read_unlock(); - scx_ops_error("ops.init() failed (%d)", ret); + scx_error(sch, "ops.init() failed (%d)", ret); goto err_disable; } } for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++) if (((void (**)(void))ops)[i]) - static_branch_enable_cpuslocked(&scx_has_op[i]); + set_bit(i, sch->has_op); - check_hotplug_seq(ops); + check_hotplug_seq(sch, ops); scx_idle_update_selcpu_topology(ops); cpus_read_unlock(); - ret = validate_ops(ops); + ret = validate_ops(sch, ops); if (ret) goto err_disable; @@ -5405,29 +5593,19 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) scx_watchdog_timeout / 2); /* - * Once __scx_ops_enabled is set, %current can be switched to SCX - * anytime. This can lead to stalls as some BPF schedulers (e.g. - * userspace scheduling) may not function correctly before all tasks are - * switched. Init in bypass mode to guarantee forward progress. + * Once __scx_enabled is set, %current can be switched to SCX anytime. + * This can lead to stalls as some BPF schedulers (e.g. userspace + * scheduling) may not function correctly before all tasks are switched. 
+ * Init in bypass mode to guarantee forward progress. */ - scx_ops_bypass(true); + scx_bypass(true); for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++) if (((void (**)(void))ops)[i]) - static_branch_enable(&scx_has_op[i]); - - if (ops->flags & SCX_OPS_ALLOW_QUEUED_WAKEUP) - static_branch_enable(&scx_ops_allow_queued_wakeup); - if (ops->flags & SCX_OPS_ENQ_LAST) - static_branch_enable(&scx_ops_enq_last); - if (ops->flags & SCX_OPS_ENQ_EXITING) - static_branch_enable(&scx_ops_enq_exiting); - if (ops->flags & SCX_OPS_ENQ_MIGRATION_DISABLED) - static_branch_enable(&scx_ops_enq_migration_disabled); - if (scx_ops.cpu_acquire || scx_ops.cpu_release) - static_branch_enable(&scx_ops_cpu_preempt); + set_bit(i, sch->has_op); - scx_idle_enable(ops); + if (sch->ops.cpu_acquire || sch->ops.cpu_release) + sch->ops.flags |= SCX_OPS_HAS_CPU_PREEMPT; /* * Lock out forks, cgroup on/offlining and moves before opening the @@ -5435,8 +5613,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) */ percpu_down_write(&scx_fork_rwsem); - WARN_ON_ONCE(scx_ops_init_task_enabled); - scx_ops_init_task_enabled = true; + WARN_ON_ONCE(scx_init_task_enabled); + scx_init_task_enabled = true; /* * Enable ops for every task. Fork is excluded by scx_fork_rwsem @@ -5445,14 +5623,14 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) * tasks. Prep all tasks first and then enable them with preemption * disabled. * - * All cgroups should be initialized before scx_ops_init_task() so that - * the BPF scheduler can reliably track each task's cgroup membership - * from scx_ops_init_task(). Lock out cgroup on/offlining and task - * migrations while tasks are being initialized so that - * scx_cgroup_can_attach() never sees uninitialized tasks. + * All cgroups should be initialized before scx_init_task() so that the + * BPF scheduler can reliably track each task's cgroup membership from + * scx_init_task(). 
Lock out cgroup on/offlining and task migrations + * while tasks are being initialized so that scx_cgroup_can_attach() + * never sees uninitialized tasks. */ scx_cgroup_lock(); - ret = scx_cgroup_init(); + ret = scx_cgroup_init(sch); if (ret) goto err_disable_unlock_all; @@ -5468,13 +5646,13 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) scx_task_iter_unlock(&sti); - ret = scx_ops_init_task(p, task_group(p), false); + ret = scx_init_task(p, task_group(p), false); if (ret) { put_task_struct(p); scx_task_iter_relock(&sti); scx_task_iter_stop(&sti); - scx_ops_error("ops.init_task() failed (%d) for %s[%d]", - ret, p->comm, p->pid); + scx_error(sch, "ops.init_task() failed (%d) for %s[%d]", + ret, p->comm, p->pid); goto err_disable_unlock_all; } @@ -5492,7 +5670,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) * all eligible tasks. */ WRITE_ONCE(scx_switching_all, !(ops->flags & SCX_OPS_SWITCH_PARTIAL)); - static_branch_enable(&__scx_ops_enabled); + static_branch_enable(&__scx_enabled); /* * We're fully committed and can't fail. The task READY -> ENABLED @@ -5523,10 +5701,10 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) scx_task_iter_stop(&sti); percpu_up_write(&scx_fork_rwsem); - scx_ops_bypass(false); + scx_bypass(false); - if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLED, SCX_OPS_ENABLING)) { - WARN_ON_ONCE(atomic_read(&scx_exit_kind) == SCX_EXIT_NONE); + if (!scx_tryset_enable_state(SCX_ENABLED, SCX_ENABLING)) { + WARN_ON_ONCE(atomic_read(&sch->exit_kind) == SCX_EXIT_NONE); goto err_disable; } @@ -5534,44 +5712,35 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) static_branch_enable(&__scx_switched_all); pr_info("sched_ext: BPF scheduler \"%s\" enabled%s\n", - scx_ops.name, scx_switched_all() ? "" : " (partial)"); - kobject_uevent(scx_root_kobj, KOBJ_ADD); - mutex_unlock(&scx_ops_enable_mutex); + sch->ops.name, scx_switched_all() ? 
"" : " (partial)"); + kobject_uevent(&sch->kobj, KOBJ_ADD); + mutex_unlock(&scx_enable_mutex); atomic_long_inc(&scx_enable_seq); return 0; -err_del: - kobject_del(scx_root_kobj); -err: - kobject_put(scx_root_kobj); - scx_root_kobj = NULL; - if (scx_exit_info) { - free_exit_info(scx_exit_info); - scx_exit_info = NULL; - } err_unlock: - mutex_unlock(&scx_ops_enable_mutex); + mutex_unlock(&scx_enable_mutex); return ret; err_disable_unlock_all: scx_cgroup_unlock(); percpu_up_write(&scx_fork_rwsem); - scx_ops_bypass(false); + scx_bypass(false); err_disable: - mutex_unlock(&scx_ops_enable_mutex); + mutex_unlock(&scx_enable_mutex); /* * Returning an error code here would not pass all the error information - * to userspace. Record errno using scx_ops_error() for cases - * scx_ops_error() wasn't already invoked and exit indicating success so - * that the error is notified through ops.exit() with all the details. + * to userspace. Record errno using scx_error() for cases scx_error() + * wasn't already invoked and exit indicating success so that the error + * is notified through ops.exit() with all the details. * - * Flush scx_ops_disable_work to ensure that error is reported before - * init completion. + * Flush scx_disable_work to ensure that error is reported before init + * completion. sch's base reference will be put by bpf_scx_unreg(). 
*/ - scx_ops_error("scx_ops_enable() failed (%d)", ret); - kthread_flush_work(&scx_ops_disable_work); + scx_error(sch, "scx_enable() failed (%d)", ret); + kthread_flush_work(&sch->disable_work); return 0; } @@ -5622,21 +5791,8 @@ static int bpf_scx_btf_struct_access(struct bpf_verifier_log *log, return -EACCES; } -static const struct bpf_func_proto * -bpf_scx_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) -{ - switch (func_id) { - case BPF_FUNC_task_storage_get: - return &bpf_task_storage_get_proto; - case BPF_FUNC_task_storage_delete: - return &bpf_task_storage_delete_proto; - default: - return bpf_base_func_proto(func_id, prog); - } -} - static const struct bpf_verifier_ops bpf_scx_verifier_ops = { - .get_func_proto = bpf_scx_get_func_proto, + .get_func_proto = bpf_base_func_proto, .is_valid_access = bpf_scx_is_valid_access, .btf_struct_access = bpf_scx_btf_struct_access, }; @@ -5715,13 +5871,17 @@ static int bpf_scx_check_member(const struct btf_type *t, static int bpf_scx_reg(void *kdata, struct bpf_link *link) { - return scx_ops_enable(kdata, link); + return scx_enable(kdata, link); } static void bpf_scx_unreg(void *kdata, struct bpf_link *link) { - scx_ops_disable(SCX_EXIT_UNREG); - kthread_flush_work(&scx_ops_disable_work); + struct sched_ext_ops *ops = kdata; + struct scx_sched *sch = ops->priv; + + scx_disable(SCX_EXIT_UNREG); + kthread_flush_work(&sch->disable_work); + kobject_put(&sch->kobj); } static int bpf_scx_init(struct btf *btf) @@ -5843,10 +6003,7 @@ static struct bpf_struct_ops bpf_sched_ext_ops = { static void sysrq_handle_sched_ext_reset(u8 key) { - if (scx_ops_helper) - scx_ops_disable(SCX_EXIT_SYSRQ); - else - pr_info("sched_ext: BPF scheduler not yet used\n"); + scx_disable(SCX_EXIT_SYSRQ); } static const struct sysrq_key_op sysrq_sched_ext_reset_op = { @@ -5994,13 +6151,14 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work) */ void print_scx_info(const char *log_lvl, struct task_struct *p) { - enum 
scx_ops_enable_state state = scx_ops_enable_state(); + struct scx_sched *sch = scx_root; + enum scx_enable_state state = scx_enable_state(); const char *all = READ_ONCE(scx_switching_all) ? "+all" : ""; char runnable_at_buf[22] = "?"; struct sched_class *class; unsigned long runnable_at; - if (state == SCX_OPS_DISABLED) + if (state == SCX_DISABLED) return; /* @@ -6009,8 +6167,8 @@ void print_scx_info(const char *log_lvl, struct task_struct *p) */ if (copy_from_kernel_nofault(&class, &p->sched_class, sizeof(class)) || class != &ext_sched_class) { - printk("%sSched_ext: %s (%s%s)", log_lvl, scx_ops.name, - scx_ops_enable_state_str[state], all); + printk("%sSched_ext: %s (%s%s)", log_lvl, sch->ops.name, + scx_enable_state_str[state], all); return; } @@ -6021,7 +6179,7 @@ void print_scx_info(const char *log_lvl, struct task_struct *p) /* print everything onto one line to conserve console space */ printk("%sSched_ext: %s (%s%s), task: runnable_at=%s", - log_lvl, scx_ops.name, scx_ops_enable_state_str[state], all, + log_lvl, sch->ops.name, scx_enable_state_str[state], all, runnable_at_buf); } @@ -6037,12 +6195,12 @@ static int scx_pm_handler(struct notifier_block *nb, unsigned long event, void * case PM_HIBERNATION_PREPARE: case PM_SUSPEND_PREPARE: case PM_RESTORE_PREPARE: - scx_ops_bypass(true); + scx_bypass(true); break; case PM_POST_HIBERNATION: case PM_POST_SUSPEND: case PM_POST_RESTORE: - scx_ops_bypass(false); + scx_bypass(false); break; } @@ -6065,7 +6223,6 @@ void __init init_sched_ext_class(void) WRITE_ONCE(v, SCX_ENQ_WAKEUP | SCX_DEQ_SLEEP | SCX_KICK_PREEMPT | SCX_TG_ONLINE); - BUG_ON(rhashtable_init(&dsq_hash, &dsq_hash_params)); scx_idle_init_masks(); scx_kick_cpus_pnt_seqs = @@ -6109,12 +6266,12 @@ static bool scx_dsq_insert_preamble(struct task_struct *p, u64 enq_flags) lockdep_assert_irqs_disabled(); if (unlikely(!p)) { - scx_ops_error("called with NULL task"); + scx_kf_error("called with NULL task"); return false; } if (unlikely(enq_flags & 
__SCX_ENQ_INTERNAL_MASK)) { - scx_ops_error("invalid enq_flags 0x%llx", enq_flags); + scx_kf_error("invalid enq_flags 0x%llx", enq_flags); return false; } @@ -6134,7 +6291,7 @@ static void scx_dsq_insert_commit(struct task_struct *p, u64 dsq_id, } if (unlikely(dspc->cursor >= scx_dsp_max_batch)) { - scx_ops_error("dispatch buffer overflow"); + scx_kf_error("dispatch buffer overflow"); return; } @@ -6266,6 +6423,7 @@ static const struct btf_kfunc_id_set scx_kfunc_set_enqueue_dispatch = { static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit, struct task_struct *p, u64 dsq_id, u64 enq_flags) { + struct scx_sched *sch = scx_root; struct scx_dispatch_q *src_dsq = kit->dsq, *dst_dsq; struct rq *this_rq, *src_rq, *locked_rq; bool dispatched = false; @@ -6300,7 +6458,7 @@ static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit, * cause similar live-lock conditions as consume_dispatch_q(). Insert a * breather if necessary. */ - scx_ops_breather(src_rq); + scx_breather(src_rq); locked_rq = src_rq; raw_spin_lock(&src_dsq->lock); @@ -6318,7 +6476,7 @@ static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit, } /* @p is still on $src_dsq and stable, determine the destination */ - dst_dsq = find_dsq_for_dispatch(this_rq, dsq_id, p); + dst_dsq = find_dsq_for_dispatch(sch, this_rq, dsq_id, p); /* * Apply vtime and slice updates before moving so that the new time is @@ -6331,7 +6489,7 @@ static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit, p->scx.slice = kit->slice; /* execute move */ - locked_rq = move_task_between_dsqs(p, enq_flags, src_dsq, dst_dsq); + locked_rq = move_task_between_dsqs(sch, p, enq_flags, src_dsq, dst_dsq); dispatched = true; out: if (in_balance) { @@ -6379,7 +6537,7 @@ __bpf_kfunc void scx_bpf_dispatch_cancel(void) if (dspc->cursor > 0) dspc->cursor--; else - scx_ops_error("dispatch buffer underflow"); + scx_kf_error("dispatch buffer underflow"); } /** @@ -6398,21 +6556,22 @@ __bpf_kfunc void scx_bpf_dispatch_cancel(void) */ __bpf_kfunc bool 
scx_bpf_dsq_move_to_local(u64 dsq_id) { + struct scx_sched *sch = scx_root; struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); struct scx_dispatch_q *dsq; if (!scx_kf_allowed(SCX_KF_DISPATCH)) return false; - flush_dispatch_buf(dspc->rq); + flush_dispatch_buf(sch, dspc->rq); - dsq = find_user_dsq(dsq_id); + dsq = find_user_dsq(sch, dsq_id); if (unlikely(!dsq)) { - scx_ops_error("invalid DSQ ID 0x%016llx", dsq_id); + scx_error(sch, "invalid DSQ ID 0x%016llx", dsq_id); return false; } - if (consume_dispatch_q(dspc->rq, dsq)) { + if (consume_dispatch_q(sch, dspc->rq, dsq)) { /* * A successfully consumed task can be dequeued before it starts * running while the CPU is trying to migrate other dispatched @@ -6666,10 +6825,36 @@ __bpf_kfunc_start_defs(); */ __bpf_kfunc s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) { + struct scx_dispatch_q *dsq; + struct scx_sched *sch; + s32 ret; + if (unlikely(node >= (int)nr_node_ids || (node < 0 && node != NUMA_NO_NODE))) return -EINVAL; - return PTR_ERR_OR_ZERO(create_dsq(dsq_id, node)); + + if (unlikely(dsq_id & SCX_DSQ_FLAG_BUILTIN)) + return -EINVAL; + + dsq = kmalloc_node(sizeof(*dsq), GFP_KERNEL, node); + if (!dsq) + return -ENOMEM; + + init_dsq(dsq, dsq_id); + + rcu_read_lock(); + + sch = rcu_dereference(scx_root); + if (sch) + ret = rhashtable_lookup_insert_fast(&sch->dsq_hash, &dsq->hash_node, + dsq_hash_params); + else + ret = -ENODEV; + + rcu_read_unlock(); + if (ret) + kfree(dsq); + return ret; } __bpf_kfunc_end_defs(); @@ -6708,7 +6893,7 @@ __bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags) struct rq *this_rq; unsigned long irq_flags; - if (!ops_cpu_valid(cpu, NULL)) + if (!kf_cpu_valid(cpu, NULL)) return; local_irq_save(irq_flags); @@ -6732,7 +6917,7 @@ __bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags) struct rq *target_rq = cpu_rq(cpu); if (unlikely(flags & (SCX_KICK_PREEMPT | SCX_KICK_WAIT))) - scx_ops_error("PREEMPT/WAIT cannot be used with SCX_KICK_IDLE"); + scx_kf_error("PREEMPT/WAIT cannot be used with 
SCX_KICK_IDLE"); if (raw_spin_rq_trylock(target_rq)) { if (can_skip_idle_kick(target_rq)) { @@ -6765,23 +6950,30 @@ out: */ __bpf_kfunc s32 scx_bpf_dsq_nr_queued(u64 dsq_id) { + struct scx_sched *sch; struct scx_dispatch_q *dsq; s32 ret; preempt_disable(); + sch = rcu_dereference_sched(scx_root); + if (unlikely(!sch)) { + ret = -ENODEV; + goto out; + } + if (dsq_id == SCX_DSQ_LOCAL) { ret = READ_ONCE(this_rq()->scx.local_dsq.nr); goto out; } else if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) { s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK; - if (ops_cpu_valid(cpu, NULL)) { + if (ops_cpu_valid(sch, cpu, NULL)) { ret = READ_ONCE(cpu_rq(cpu)->scx.local_dsq.nr); goto out; } } else { - dsq = find_user_dsq(dsq_id); + dsq = find_user_dsq(sch, dsq_id); if (dsq) { ret = READ_ONCE(dsq->nr); goto out; @@ -6804,7 +6996,13 @@ out: */ __bpf_kfunc void scx_bpf_destroy_dsq(u64 dsq_id) { - destroy_dsq(dsq_id); + struct scx_sched *sch; + + rcu_read_lock(); + sch = rcu_dereference(scx_root); + if (sch) + destroy_dsq(sch, dsq_id); + rcu_read_unlock(); } /** @@ -6821,16 +7019,27 @@ __bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, u64 flags) { struct bpf_iter_scx_dsq_kern *kit = (void *)it; + struct scx_sched *sch; BUILD_BUG_ON(sizeof(struct bpf_iter_scx_dsq_kern) > sizeof(struct bpf_iter_scx_dsq)); BUILD_BUG_ON(__alignof__(struct bpf_iter_scx_dsq_kern) != __alignof__(struct bpf_iter_scx_dsq)); + /* + * next() and destroy() will be called regardless of the return value. + * Always clear $kit->dsq. 
+ */ + kit->dsq = NULL; + + sch = rcu_dereference_check(scx_root, rcu_read_lock_bh_held()); + if (unlikely(!sch)) + return -ENODEV; + if (flags & ~__SCX_DSQ_ITER_USER_FLAGS) return -EINVAL; - kit->dsq = find_user_dsq(dsq_id); + kit->dsq = find_user_dsq(sch, dsq_id); if (!kit->dsq) return -ENOENT; @@ -6920,21 +7129,20 @@ static s32 __bstr_format(u64 *data_buf, char *line_buf, size_t line_size, if (data__sz % 8 || data__sz > MAX_BPRINTF_VARARGS * 8 || (data__sz && !data)) { - scx_ops_error("invalid data=%p and data__sz=%u", - (void *)data, data__sz); + scx_kf_error("invalid data=%p and data__sz=%u", (void *)data, data__sz); return -EINVAL; } ret = copy_from_kernel_nofault(data_buf, data, data__sz); if (ret < 0) { - scx_ops_error("failed to read data fields (%d)", ret); + scx_kf_error("failed to read data fields (%d)", ret); return ret; } ret = bpf_bprintf_prepare(fmt, UINT_MAX, data_buf, data__sz / 8, &bprintf_data); if (ret < 0) { - scx_ops_error("format preparation failed (%d)", ret); + scx_kf_error("format preparation failed (%d)", ret); return ret; } @@ -6942,8 +7150,7 @@ static s32 __bstr_format(u64 *data_buf, char *line_buf, size_t line_size, bprintf_data.bin_args); bpf_bprintf_cleanup(&bprintf_data); if (ret < 0) { - scx_ops_error("(\"%s\", %p, %u) failed to format", - fmt, data, data__sz); + scx_kf_error("(\"%s\", %p, %u) failed to format", fmt, data, data__sz); return ret; } @@ -6976,8 +7183,7 @@ __bpf_kfunc void scx_bpf_exit_bstr(s64 exit_code, char *fmt, raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags); if (bstr_format(&scx_exit_bstr_buf, fmt, data, data__sz) >= 0) - scx_ops_exit_kind(SCX_EXIT_UNREG_BPF, exit_code, "%s", - scx_exit_bstr_buf.line); + scx_kf_exit(SCX_EXIT_UNREG_BPF, exit_code, "%s", scx_exit_bstr_buf.line); raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags); } @@ -6997,8 +7203,7 @@ __bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data, raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags); if 
(bstr_format(&scx_exit_bstr_buf, fmt, data, data__sz) >= 0) - scx_ops_exit_kind(SCX_EXIT_ERROR_BPF, 0, "%s", - scx_exit_bstr_buf.line); + scx_kf_exit(SCX_EXIT_ERROR_BPF, 0, "%s", scx_exit_bstr_buf.line); raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags); } @@ -7022,7 +7227,7 @@ __bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data, s32 ret; if (raw_smp_processor_id() != dd->cpu) { - scx_ops_error("scx_bpf_dump() must only be called from ops.dump() and friends"); + scx_kf_error("scx_bpf_dump() must only be called from ops.dump() and friends"); return; } @@ -7063,7 +7268,7 @@ __bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data, */ __bpf_kfunc u32 scx_bpf_cpuperf_cap(s32 cpu) { - if (ops_cpu_valid(cpu, NULL)) + if (kf_cpu_valid(cpu, NULL)) return arch_scale_cpu_capacity(cpu); else return SCX_CPUPERF_ONE; @@ -7085,7 +7290,7 @@ __bpf_kfunc u32 scx_bpf_cpuperf_cap(s32 cpu) */ __bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu) { - if (ops_cpu_valid(cpu, NULL)) + if (kf_cpu_valid(cpu, NULL)) return arch_scale_freq_capacity(cpu); else return SCX_CPUPERF_ONE; @@ -7108,18 +7313,37 @@ __bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu) __bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf) { if (unlikely(perf > SCX_CPUPERF_ONE)) { - scx_ops_error("Invalid cpuperf target %u for CPU %d", perf, cpu); + scx_kf_error("Invalid cpuperf target %u for CPU %d", perf, cpu); return; } - if (ops_cpu_valid(cpu, NULL)) { - struct rq *rq = cpu_rq(cpu); + if (kf_cpu_valid(cpu, NULL)) { + struct rq *rq = cpu_rq(cpu), *locked_rq = scx_locked_rq(); + struct rq_flags rf; + + /* + * When called with an rq lock held, restrict the operation + * to the corresponding CPU to prevent ABBA deadlocks. + */ + if (locked_rq && rq != locked_rq) { + scx_kf_error("Invalid target CPU %d", cpu); + return; + } + + /* + * If no rq lock is held, allow to operate on any CPU by + * acquiring the corresponding rq lock. 
+ */ + if (!locked_rq) { + rq_lock_irqsave(rq, &rf); + update_rq_clock(rq); + } rq->scx.cpuperf_target = perf; + cpufreq_update_util(rq, 0); - rcu_read_lock_sched_notrace(); - cpufreq_update_util(cpu_rq(cpu), 0); - rcu_read_unlock_sched_notrace(); + if (!locked_rq) + rq_unlock_irqrestore(rq, &rf); } } @@ -7197,7 +7421,7 @@ __bpf_kfunc s32 scx_bpf_task_cpu(const struct task_struct *p) */ __bpf_kfunc struct rq *scx_bpf_cpu_rq(s32 cpu) { - if (!ops_cpu_valid(cpu, NULL)) + if (!kf_cpu_valid(cpu, NULL)) return NULL; return cpu_rq(cpu); @@ -7293,6 +7517,27 @@ __bpf_kfunc u64 scx_bpf_now(void) return clock; } +static void scx_read_events(struct scx_sched *sch, struct scx_event_stats *events) +{ + struct scx_event_stats *e_cpu; + int cpu; + + /* Aggregate per-CPU event counters into @events. */ + memset(events, 0, sizeof(*events)); + for_each_possible_cpu(cpu) { + e_cpu = per_cpu_ptr(sch->event_stats_cpu, cpu); + scx_agg_event(events, e_cpu, SCX_EV_SELECT_CPU_FALLBACK); + scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE); + scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_KEEP_LAST); + scx_agg_event(events, e_cpu, SCX_EV_ENQ_SKIP_EXITING); + scx_agg_event(events, e_cpu, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED); + scx_agg_event(events, e_cpu, SCX_EV_REFILL_SLICE_DFL); + scx_agg_event(events, e_cpu, SCX_EV_BYPASS_DURATION); + scx_agg_event(events, e_cpu, SCX_EV_BYPASS_DISPATCH); + scx_agg_event(events, e_cpu, SCX_EV_BYPASS_ACTIVATE); + } +} + /* * scx_bpf_events - Get a system-wide event counter to * @events: output buffer from a BPF program @@ -7301,23 +7546,16 @@ __bpf_kfunc u64 scx_bpf_now(void) __bpf_kfunc void scx_bpf_events(struct scx_event_stats *events, size_t events__sz) { - struct scx_event_stats e_sys, *e_cpu; - int cpu; + struct scx_sched *sch; + struct scx_event_stats e_sys; - /* Aggregate per-CPU event counters into the system-wide counters. 
*/ - memset(&e_sys, 0, sizeof(e_sys)); - for_each_possible_cpu(cpu) { - e_cpu = per_cpu_ptr(&event_stats_cpu, cpu); - scx_agg_event(&e_sys, e_cpu, SCX_EV_SELECT_CPU_FALLBACK); - scx_agg_event(&e_sys, e_cpu, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE); - scx_agg_event(&e_sys, e_cpu, SCX_EV_DISPATCH_KEEP_LAST); - scx_agg_event(&e_sys, e_cpu, SCX_EV_ENQ_SKIP_EXITING); - scx_agg_event(&e_sys, e_cpu, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED); - scx_agg_event(&e_sys, e_cpu, SCX_EV_ENQ_SLICE_DFL); - scx_agg_event(&e_sys, e_cpu, SCX_EV_BYPASS_DURATION); - scx_agg_event(&e_sys, e_cpu, SCX_EV_BYPASS_DISPATCH); - scx_agg_event(&e_sys, e_cpu, SCX_EV_BYPASS_ACTIVATE); - } + rcu_read_lock(); + sch = rcu_dereference(scx_root); + if (sch) + scx_read_events(sch, &e_sys); + else + memset(&e_sys, 0, sizeof(e_sys)); + rcu_read_unlock(); /* * We cannot entirely trust a BPF-provided size since a BPF program @@ -7350,12 +7588,6 @@ BTF_ID_FLAGS(func, scx_bpf_nr_cpu_ids) BTF_ID_FLAGS(func, scx_bpf_get_possible_cpumask, KF_ACQUIRE) BTF_ID_FLAGS(func, scx_bpf_get_online_cpumask, KF_ACQUIRE) BTF_ID_FLAGS(func, scx_bpf_put_cpumask, KF_RELEASE) -BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask, KF_ACQUIRE) -BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask, KF_ACQUIRE) -BTF_ID_FLAGS(func, scx_bpf_put_idle_cpumask, KF_RELEASE) -BTF_ID_FLAGS(func, scx_bpf_test_and_clear_cpu_idle) -BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_RCU) -BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_task_running, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_cpu_rq) diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index 1bda96b19a1b..6e5072f57771 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -8,6 +8,11 @@ */ #ifdef CONFIG_SCHED_CLASS_EXT +static inline bool scx_kf_allowed_if_unlocked(void) +{ + return !current->scx.kf_mask; +} + DECLARE_STATIC_KEY_FALSE(scx_ops_allow_queued_wakeup); void scx_tick(struct rq *rq); @@ -21,6 +26,7 @@ void scx_rq_activate(struct rq 
*rq); void scx_rq_deactivate(struct rq *rq); int scx_check_setscheduler(struct task_struct *p, int policy); bool task_should_scx(int policy); +bool scx_allow_ttwu_queue(const struct task_struct *p); void init_sched_ext_class(void); static inline u32 scx_cpuperf_target(s32 cpu) @@ -36,13 +42,6 @@ static inline bool task_on_scx(const struct task_struct *p) return scx_enabled() && p->sched_class == &ext_sched_class; } -static inline bool scx_allow_ttwu_queue(const struct task_struct *p) -{ - return !scx_enabled() || - static_branch_likely(&scx_ops_allow_queued_wakeup) || - p->sched_class != &ext_sched_class; -} - #ifdef CONFIG_SCHED_CORE bool scx_prio_less(const struct task_struct *a, const struct task_struct *b, bool in_fi); diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c index 52c36a70a3d0..66da03cc0b33 100644 --- a/kernel/sched/ext_idle.c +++ b/kernel/sched/ext_idle.c @@ -47,6 +47,13 @@ static struct scx_idle_cpus scx_idle_global_masks; static struct scx_idle_cpus **scx_idle_node_masks; /* + * Local per-CPU cpumasks (used to generate temporary idle cpumasks). + */ +static DEFINE_PER_CPU(cpumask_var_t, local_idle_cpumask); +static DEFINE_PER_CPU(cpumask_var_t, local_llc_idle_cpumask); +static DEFINE_PER_CPU(cpumask_var_t, local_numa_idle_cpumask); + +/* * Return the idle masks associated to a target @node. * * NUMA_NO_NODE identifies the global idle cpumask. @@ -392,6 +399,14 @@ void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops) } /* + * Return true if @p can run on all possible CPUs, false otherwise. + */ +static inline bool task_affinity_all(const struct task_struct *p) +{ + return p->nr_cpus_allowed >= num_possible_cpus(); +} + +/* * Built-in CPU idle selection policy: * * 1. Prioritize full-idle cores: @@ -403,13 +418,15 @@ void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops) * branch prediction optimizations. * * 3. 
Pick a CPU within the same LLC (Last-Level Cache): - * - if the above conditions aren't met, pick a CPU that shares the same LLC - * to maintain cache locality. + * - if the above conditions aren't met, pick a CPU that shares the same + * LLC, if the LLC domain is a subset of @cpus_allowed, to maintain + * cache locality. * * 4. Pick a CPU within the same NUMA node, if enabled: - * - choose a CPU from the same NUMA node to reduce memory access latency. + * - choose a CPU from the same NUMA node, if the node cpumask is a + * subset of @cpus_allowed, to reduce memory access latency. * - * 5. Pick any idle CPU usable by the task. + * 5. Pick any idle CPU within the @cpus_allowed domain. * * Step 3 and 4 are performed only if the system has, respectively, * multiple LLCs / multiple NUMA nodes (see scx_selcpu_topo_llc and @@ -424,35 +441,77 @@ void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops) * NOTE: tasks that can only run on 1 CPU are excluded by this logic, because * we never call ops.select_cpu() for them, see select_task_rq(). */ -s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64 flags) +s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, + const struct cpumask *cpus_allowed, u64 flags) { - const struct cpumask *llc_cpus = NULL; - const struct cpumask *numa_cpus = NULL; + const struct cpumask *llc_cpus = NULL, *numa_cpus = NULL; + const struct cpumask *allowed = cpus_allowed ?: p->cpus_ptr; int node = scx_cpu_node_if_enabled(prev_cpu); s32 cpu; + preempt_disable(); + + /* + * Determine the subset of CPUs usable by @p within @cpus_allowed. 
+ */ + if (allowed != p->cpus_ptr) { + struct cpumask *local_cpus = this_cpu_cpumask_var_ptr(local_idle_cpumask); + + if (task_affinity_all(p)) { + allowed = cpus_allowed; + } else if (cpumask_and(local_cpus, cpus_allowed, p->cpus_ptr)) { + allowed = local_cpus; + } else { + cpu = -EBUSY; + goto out_enable; + } + + /* + * If @prev_cpu is not in the allowed CPUs, skip topology + * optimizations and try to pick any idle CPU usable by the + * task. + * + * If %SCX_OPS_BUILTIN_IDLE_PER_NODE is enabled, prioritize + * the current node, as it may optimize some waker->wakee + * workloads. + */ + if (!cpumask_test_cpu(prev_cpu, allowed)) { + node = scx_cpu_node_if_enabled(smp_processor_id()); + cpu = scx_pick_idle_cpu(allowed, node, flags); + goto out_enable; + } + } + /* * This is necessary to protect llc_cpus. */ rcu_read_lock(); /* - * Determine the scheduling domain only if the task is allowed to run - * on all CPUs. + * Determine the subset of CPUs that the task can use in its + * current LLC and node. * - * This is done primarily for efficiency, as it avoids the overhead of - * updating a cpumask every time we need to select an idle CPU (which - * can be costly in large SMP systems), but it also aligns logically: - * if a task's scheduling domain is restricted by user-space (through - * CPU affinity), the task will simply use the flat scheduling domain - * defined by user-space. + * If the task can run on all CPUs, use the node and LLC cpumasks + * directly. 
*/ - if (p->nr_cpus_allowed >= num_possible_cpus()) { - if (static_branch_maybe(CONFIG_NUMA, &scx_selcpu_topo_numa)) - numa_cpus = numa_span(prev_cpu); + if (static_branch_maybe(CONFIG_NUMA, &scx_selcpu_topo_numa)) { + struct cpumask *local_cpus = this_cpu_cpumask_var_ptr(local_numa_idle_cpumask); + const struct cpumask *cpus = numa_span(prev_cpu); + + if (allowed == p->cpus_ptr && task_affinity_all(p)) + numa_cpus = cpus; + else if (cpus && cpumask_and(local_cpus, allowed, cpus)) + numa_cpus = local_cpus; + } - if (static_branch_maybe(CONFIG_SCHED_MC, &scx_selcpu_topo_llc)) - llc_cpus = llc_span(prev_cpu); + if (static_branch_maybe(CONFIG_SCHED_MC, &scx_selcpu_topo_llc)) { + struct cpumask *local_cpus = this_cpu_cpumask_var_ptr(local_llc_idle_cpumask); + const struct cpumask *cpus = llc_span(prev_cpu); + + if (allowed == p->cpus_ptr && task_affinity_all(p)) + llc_cpus = cpus; + else if (cpus && cpumask_and(local_cpus, allowed, cpus)) + llc_cpus = local_cpus; } /* @@ -490,7 +549,7 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64 cpu_rq(cpu)->scx.local_dsq.nr == 0 && (!(flags & SCX_PICK_IDLE_IN_NODE) || (waker_node == node)) && !cpumask_empty(idle_cpumask(waker_node)->cpu)) { - if (cpumask_test_cpu(cpu, p->cpus_ptr)) + if (cpumask_test_cpu(cpu, allowed)) goto out_unlock; } } @@ -535,7 +594,7 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64 * begin in prev_cpu's node and proceed to other nodes in * order of increasing distance. */ - cpu = scx_pick_idle_cpu(p->cpus_ptr, node, flags | SCX_PICK_IDLE_CORE); + cpu = scx_pick_idle_cpu(allowed, node, flags | SCX_PICK_IDLE_CORE); if (cpu >= 0) goto out_unlock; @@ -544,7 +603,7 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64 * core. 
*/ if (flags & SCX_PICK_IDLE_CORE) { - cpu = prev_cpu; + cpu = -EBUSY; goto out_unlock; } } @@ -583,12 +642,12 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64 * in prev_cpu's node and proceed to other nodes in order of * increasing distance. */ - cpu = scx_pick_idle_cpu(p->cpus_ptr, node, flags); - if (cpu >= 0) - goto out_unlock; + cpu = scx_pick_idle_cpu(allowed, node, flags); out_unlock: rcu_read_unlock(); +out_enable: + preempt_enable(); return cpu; } @@ -598,7 +657,7 @@ out_unlock: */ void scx_idle_init_masks(void) { - int node; + int i; /* Allocate global idle cpumasks */ BUG_ON(!alloc_cpumask_var(&scx_idle_global_masks.cpu, GFP_KERNEL)); @@ -609,13 +668,23 @@ void scx_idle_init_masks(void) sizeof(*scx_idle_node_masks), GFP_KERNEL); BUG_ON(!scx_idle_node_masks); - for_each_node(node) { - scx_idle_node_masks[node] = kzalloc_node(sizeof(**scx_idle_node_masks), - GFP_KERNEL, node); - BUG_ON(!scx_idle_node_masks[node]); + for_each_node(i) { + scx_idle_node_masks[i] = kzalloc_node(sizeof(**scx_idle_node_masks), + GFP_KERNEL, i); + BUG_ON(!scx_idle_node_masks[i]); - BUG_ON(!alloc_cpumask_var_node(&scx_idle_node_masks[node]->cpu, GFP_KERNEL, node)); - BUG_ON(!alloc_cpumask_var_node(&scx_idle_node_masks[node]->smt, GFP_KERNEL, node)); + BUG_ON(!alloc_cpumask_var_node(&scx_idle_node_masks[i]->cpu, GFP_KERNEL, i)); + BUG_ON(!alloc_cpumask_var_node(&scx_idle_node_masks[i]->smt, GFP_KERNEL, i)); + } + + /* Allocate local per-cpu idle cpumasks */ + for_each_possible_cpu(i) { + BUG_ON(!alloc_cpumask_var_node(&per_cpu(local_idle_cpumask, i), + GFP_KERNEL, cpu_to_node(i))); + BUG_ON(!alloc_cpumask_var_node(&per_cpu(local_llc_idle_cpumask, i), + GFP_KERNEL, cpu_to_node(i))); + BUG_ON(!alloc_cpumask_var_node(&per_cpu(local_numa_idle_cpumask, i), + GFP_KERNEL, cpu_to_node(i))); } } @@ -664,21 +733,12 @@ static void update_builtin_idle(int cpu, bool idle) */ void __scx_update_idle(struct rq *rq, bool idle, bool do_notify) { + struct scx_sched 
*sch = scx_root; int cpu = cpu_of(rq); lockdep_assert_rq_held(rq); /* - * Trigger ops.update_idle() only when transitioning from a task to - * the idle thread and vice versa. - * - * Idle transitions are indicated by do_notify being set to true, - * managed by put_prev_task_idle()/set_next_task_idle(). - */ - if (SCX_HAS_OP(update_idle) && do_notify && !scx_rq_bypassing(rq)) - SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle); - - /* * Update the idle masks: * - for real idle transitions (do_notify == true) * - for idle-to-idle transitions (indicated by the previous task @@ -695,6 +755,21 @@ void __scx_update_idle(struct rq *rq, bool idle, bool do_notify) if (static_branch_likely(&scx_builtin_idle_enabled)) if (do_notify || is_idle_task(rq->curr)) update_builtin_idle(cpu, idle); + + /* + * Trigger ops.update_idle() only when transitioning from a task to + * the idle thread and vice versa. + * + * Idle transitions are indicated by do_notify being set to true, + * managed by put_prev_task_idle()/set_next_task_idle(). + * + * This must come after builtin idle update so that BPF schedulers can + * create interlocking between ops.update_idle() and ops.enqueue() - + * either enqueue() sees the idle bit or update_idle() sees the task + * that enqueue() queued. 
+ */ + if (SCX_HAS_OP(sch, update_idle) && do_notify && !scx_rq_bypassing(rq)) + SCX_CALL_OP(sch, SCX_KF_REST, update_idle, rq, cpu_of(rq), idle); } static void reset_idle_masks(struct sched_ext_ops *ops) @@ -723,14 +798,14 @@ static void reset_idle_masks(struct sched_ext_ops *ops) void scx_idle_enable(struct sched_ext_ops *ops) { if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE)) - static_branch_enable(&scx_builtin_idle_enabled); + static_branch_enable_cpuslocked(&scx_builtin_idle_enabled); else - static_branch_disable(&scx_builtin_idle_enabled); + static_branch_disable_cpuslocked(&scx_builtin_idle_enabled); if (ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE) - static_branch_enable(&scx_builtin_idle_per_node); + static_branch_enable_cpuslocked(&scx_builtin_idle_per_node); else - static_branch_disable(&scx_builtin_idle_per_node); + static_branch_disable_cpuslocked(&scx_builtin_idle_per_node); #ifdef CONFIG_SMP reset_idle_masks(ops); @@ -750,7 +825,7 @@ void scx_idle_disable(void) static int validate_node(int node) { if (!static_branch_likely(&scx_builtin_idle_per_node)) { - scx_ops_error("per-node idle tracking is disabled"); + scx_kf_error("per-node idle tracking is disabled"); return -EOPNOTSUPP; } @@ -760,13 +835,13 @@ static int validate_node(int node) /* Make sure node is in a valid range */ if (node < 0 || node >= nr_node_ids) { - scx_ops_error("invalid node %d", node); + scx_kf_error("invalid node %d", node); return -EINVAL; } /* Make sure the node is part of the set of possible nodes */ if (!node_possible(node)) { - scx_ops_error("unavailable node %d", node); + scx_kf_error("unavailable node %d", node); return -EINVAL; } @@ -780,10 +855,72 @@ static bool check_builtin_idle_enabled(void) if (static_branch_likely(&scx_builtin_idle_enabled)) return true; - scx_ops_error("built-in idle tracking is disabled"); + scx_kf_error("built-in idle tracking is disabled"); return false; } +s32 select_cpu_from_kfunc(struct task_struct *p, s32 prev_cpu, u64 
wake_flags, + const struct cpumask *allowed, u64 flags) +{ + struct rq *rq; + struct rq_flags rf; + s32 cpu; + + if (!kf_cpu_valid(prev_cpu, NULL)) + return -EINVAL; + + if (!check_builtin_idle_enabled()) + return -EBUSY; + + /* + * If called from an unlocked context, acquire the task's rq lock, + * so that we can safely access p->cpus_ptr and p->nr_cpus_allowed. + * + * Otherwise, allow to use this kfunc only from ops.select_cpu() + * and ops.select_enqueue(). + */ + if (scx_kf_allowed_if_unlocked()) { + rq = task_rq_lock(p, &rf); + } else { + if (!scx_kf_allowed(SCX_KF_SELECT_CPU | SCX_KF_ENQUEUE)) + return -EPERM; + rq = scx_locked_rq(); + } + + /* + * Validate locking correctness to access p->cpus_ptr and + * p->nr_cpus_allowed: if we're holding an rq lock, we're safe; + * otherwise, assert that p->pi_lock is held. + */ + if (!rq) + lockdep_assert_held(&p->pi_lock); + +#ifdef CONFIG_SMP + /* + * This may also be called from ops.enqueue(), so we need to handle + * per-CPU tasks as well. For these tasks, we can skip all idle CPU + * selection optimizations and simply check whether the previously + * used CPU is idle and within the allowed cpumask. 
+ */ + if (p->nr_cpus_allowed == 1) { + if (cpumask_test_cpu(prev_cpu, allowed ?: p->cpus_ptr) && + scx_idle_test_and_clear_cpu(prev_cpu)) + cpu = prev_cpu; + else + cpu = -EBUSY; + } else { + cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, + allowed ?: p->cpus_ptr, flags); + } +#else + cpu = -EBUSY; +#endif + if (scx_kf_allowed_if_unlocked()) + task_rq_unlock(rq, p, &rf); + + return cpu; +} + /** * scx_bpf_cpu_node - Return the NUMA node the given @cpu belongs to, or * trigger an error if @cpu is invalid @@ -792,7 +929,7 @@ static bool check_builtin_idle_enabled(void) __bpf_kfunc int scx_bpf_cpu_node(s32 cpu) { #ifdef CONFIG_NUMA - if (!ops_cpu_valid(cpu, NULL)) + if (!kf_cpu_valid(cpu, NULL)) return NUMA_NO_NODE; return cpu_to_node(cpu); @@ -808,9 +945,10 @@ __bpf_kfunc int scx_bpf_cpu_node(s32 cpu) * @wake_flags: %SCX_WAKE_* flags * @is_idle: out parameter indicating whether the returned CPU is idle * - * Can only be called from ops.select_cpu() if the built-in CPU selection is - * enabled - ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE is set. - * @p, @prev_cpu and @wake_flags match ops.select_cpu(). + * Can be called from ops.select_cpu(), ops.enqueue(), or from an unlocked + * context such as a BPF test_run() call, as long as built-in CPU selection + * is enabled: ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE + * is set. * * Returns the picked CPU with *@is_idle indicating whether the picked CPU is * currently idle and thus a good candidate for direct dispatching. 
@@ -818,39 +956,52 @@ __bpf_kfunc int scx_bpf_cpu_node(s32 cpu) __bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool *is_idle) { -#ifdef CONFIG_SMP s32 cpu; -#endif - if (!ops_cpu_valid(prev_cpu, NULL)) - goto prev_cpu; - - if (!check_builtin_idle_enabled()) - goto prev_cpu; - - if (!scx_kf_allowed(SCX_KF_SELECT_CPU)) - goto prev_cpu; -#ifdef CONFIG_SMP - cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, 0); + cpu = select_cpu_from_kfunc(p, prev_cpu, wake_flags, NULL, 0); if (cpu >= 0) { *is_idle = true; return cpu; } -#endif - -prev_cpu: *is_idle = false; + return prev_cpu; } /** + * scx_bpf_select_cpu_and - Pick an idle CPU usable by task @p, + * prioritizing those in @cpus_allowed + * @p: task_struct to select a CPU for + * @prev_cpu: CPU @p was on previously + * @wake_flags: %SCX_WAKE_* flags + * @cpus_allowed: cpumask of allowed CPUs + * @flags: %SCX_PICK_IDLE* flags + * + * Can be called from ops.select_cpu(), ops.enqueue(), or from an unlocked + * context such as a BPF test_run() call, as long as built-in CPU selection + * is enabled: ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE + * is set. + * + * @p, @prev_cpu and @wake_flags match ops.select_cpu(). + * + * Returns the selected idle CPU, which will be automatically awakened upon + * returning from ops.select_cpu() and can be used for direct dispatch, or + * a negative value if no idle CPU is available. + */ +__bpf_kfunc s32 scx_bpf_select_cpu_and(struct task_struct *p, s32 prev_cpu, u64 wake_flags, + const struct cpumask *cpus_allowed, u64 flags) +{ + return select_cpu_from_kfunc(p, prev_cpu, wake_flags, cpus_allowed, flags); +} + +/** * scx_bpf_get_idle_cpumask_node - Get a referenced kptr to the * idle-tracking per-CPU cpumask of a target NUMA node. * @node: target NUMA node * * Returns an empty cpumask if idle tracking is not enabled, if @node is * not valid, or running on a UP kernel. 
In this case the actual error will - * be reported to the BPF scheduler via scx_ops_error(). + * be reported to the BPF scheduler via scx_error(). */ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask_node(int node) { @@ -875,7 +1026,7 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask_node(int node) __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void) { if (static_branch_unlikely(&scx_builtin_idle_per_node)) { - scx_ops_error("SCX_OPS_BUILTIN_IDLE_PER_NODE enabled"); + scx_kf_error("SCX_OPS_BUILTIN_IDLE_PER_NODE enabled"); return cpu_none_mask; } @@ -897,7 +1048,7 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void) * * Returns an empty cpumask if idle tracking is not enabled, if @node is * not valid, or running on a UP kernel. In this case the actual error will - * be reported to the BPF scheduler via scx_ops_error(). + * be reported to the BPF scheduler via scx_error(). */ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask_node(int node) { @@ -926,7 +1077,7 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask_node(int node) __bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(void) { if (static_branch_unlikely(&scx_builtin_idle_per_node)) { - scx_ops_error("SCX_OPS_BUILTIN_IDLE_PER_NODE enabled"); + scx_kf_error("SCX_OPS_BUILTIN_IDLE_PER_NODE enabled"); return cpu_none_mask; } @@ -973,7 +1124,7 @@ __bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) if (!check_builtin_idle_enabled()) return false; - if (ops_cpu_valid(cpu, NULL)) + if (kf_cpu_valid(cpu, NULL)) return scx_idle_test_and_clear_cpu(cpu); else return false; @@ -1034,7 +1185,7 @@ __bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) { if (static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node)) { - scx_ops_error("per-node idle tracking is enabled"); + scx_kf_error("per-node idle tracking is enabled"); return -EBUSY; } @@ -1111,7 +1262,7 @@ __bpf_kfunc s32 scx_bpf_pick_any_cpu(const struct 
cpumask *cpus_allowed, s32 cpu; if (static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node)) { - scx_ops_error("per-node idle tracking is enabled"); + scx_kf_error("per-node idle tracking is enabled"); return -EBUSY; } @@ -1142,6 +1293,8 @@ BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu_node, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu_node, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_select_cpu_and, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_RCU) BTF_KFUNCS_END(scx_kfunc_ids_idle) static const struct btf_kfunc_id_set scx_kfunc_set_idle = { @@ -1149,21 +1302,11 @@ static const struct btf_kfunc_id_set scx_kfunc_set_idle = { .set = &scx_kfunc_ids_idle, }; -BTF_KFUNCS_START(scx_kfunc_ids_select_cpu) -BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_RCU) -BTF_KFUNCS_END(scx_kfunc_ids_select_cpu) - -static const struct btf_kfunc_id_set scx_kfunc_set_select_cpu = { - .owner = THIS_MODULE, - .set = &scx_kfunc_ids_select_cpu, -}; - int scx_idle_init(void) { int ret; - ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_select_cpu) || - register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_idle) || + ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_idle) || register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &scx_kfunc_set_idle) || register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &scx_kfunc_set_idle); diff --git a/kernel/sched/ext_idle.h b/kernel/sched/ext_idle.h index 511cc2221f7a..37be78a7502b 100644 --- a/kernel/sched/ext_idle.h +++ b/kernel/sched/ext_idle.h @@ -27,7 +27,8 @@ static inline s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, int node } #endif /* CONFIG_SMP */ -s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64 flags); +s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, + const struct cpumask *cpus_allowed, u64 flags); void 
scx_idle_enable(struct sched_ext_ops *ops); void scx_idle_disable(void); int scx_idle_init(void); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a0c4cd26ee07..125912c0e9dd 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3795,6 +3795,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, update_entity_lag(cfs_rq, se); se->deadline -= se->vruntime; se->rel_deadline = 1; + cfs_rq->nr_queued--; if (!curr) __dequeue_entity(cfs_rq, se); update_load_sub(&cfs_rq->load, se->load.weight); @@ -3821,10 +3822,11 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, enqueue_load_avg(cfs_rq, se); if (se->on_rq) { - update_load_add(&cfs_rq->load, se->load.weight); place_entity(cfs_rq, se, 0); + update_load_add(&cfs_rq->load, se->load.weight); if (!curr) __enqueue_entity(cfs_rq, se); + cfs_rq->nr_queued++; /* * The entity's vruntime has been adjusted, so let's check @@ -4933,13 +4935,6 @@ static inline void util_est_update(struct cfs_rq *cfs_rq, goto done; /* - * To avoid overestimation of actual task utilization, skip updates if - * we cannot grant there is idle time in this CPU. - */ - if (dequeued > arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq)))) - return; - - /* * To avoid underestimate of task utilization, skip updates of EWMA if * we cannot grant that thread got all CPU time it wanted. */ @@ -6552,14 +6547,14 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *paren cfs_b->hierarchical_quota = parent ? 
parent->hierarchical_quota : RUNTIME_INF; INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq); - hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); - cfs_b->period_timer.function = sched_cfs_period_timer; + hrtimer_setup(&cfs_b->period_timer, sched_cfs_period_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_ABS_PINNED); /* Add a random offset so that timers interleave */ hrtimer_set_expires(&cfs_b->period_timer, get_random_u32_below(cfs_b->period)); - hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - cfs_b->slack_timer.function = sched_cfs_slack_timer; + hrtimer_setup(&cfs_b->slack_timer, sched_cfs_slack_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); cfs_b->slack_started = false; } @@ -6941,7 +6936,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) * Let's add the task's estimated utilization to the cfs_rq's * estimated utilization, before we update schedutil. */ - if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & ENQUEUE_RESTORE)))) + if (!p->se.sched_delayed || (flags & ENQUEUE_DELAYED)) util_est_enqueue(&rq->cfs, p); if (flags & ENQUEUE_DELAYED) { @@ -7081,9 +7076,6 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) h_nr_idle = task_has_idle_policy(p); if (task_sleep || task_delayed || !se->sched_delayed) h_nr_runnable = 1; - } else { - cfs_rq = group_cfs_rq(se); - slice = cfs_rq_min_slice(cfs_rq); } for_each_sched_entity(se) { @@ -7093,6 +7085,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) if (p && &p->se == se) return -1; + slice = cfs_rq_min_slice(cfs_rq); break; } @@ -7183,7 +7176,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) */ static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) { - if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & DEQUEUE_SAVE)))) + if (!p->se.sched_delayed) util_est_dequeue(&rq->cfs, p); util_est_update(&rq->cfs, p, flags & 
DEQUEUE_SLEEP); @@ -7198,6 +7191,11 @@ static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) return true; } +static inline unsigned int cfs_h_nr_delayed(struct rq *rq) +{ + return (rq->cfs.h_nr_queued - rq->cfs.h_nr_runnable); +} + #ifdef CONFIG_SMP /* Working cpumask for: sched_balance_rq(), sched_balance_newidle(). */ @@ -7359,8 +7357,12 @@ wake_affine_idle(int this_cpu, int prev_cpu, int sync) if (available_idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu)) return available_idle_cpu(prev_cpu) ? prev_cpu : this_cpu; - if (sync && cpu_rq(this_cpu)->nr_running == 1) - return this_cpu; + if (sync) { + struct rq *rq = cpu_rq(this_cpu); + + if ((rq->nr_running - cfs_h_nr_delayed(rq)) == 1) + return this_cpu; + } if (available_idle_cpu(prev_cpu)) return prev_cpu; @@ -10258,7 +10260,7 @@ sched_group_asym(struct lb_env *env, struct sg_lb_stats *sgs, struct sched_group (sgs->group_weight - sgs->idle_cpus != 1)) return false; - return sched_asym(env->sd, env->dst_cpu, group->asym_prefer_cpu); + return sched_asym(env->sd, env->dst_cpu, READ_ONCE(group->asym_prefer_cpu)); } /* One group has more than one SMT CPU while the other group does not */ @@ -10495,7 +10497,8 @@ static bool update_sd_pick_busiest(struct lb_env *env, case group_asym_packing: /* Prefer to move from lowest priority CPU's work */ - return sched_asym_prefer(sds->busiest->asym_prefer_cpu, sg->asym_prefer_cpu); + return sched_asym_prefer(READ_ONCE(sds->busiest->asym_prefer_cpu), + READ_ONCE(sg->asym_prefer_cpu)); case group_misfit_task: /* diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index 81bc8b329ef1..93b038d48900 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -40,7 +40,7 @@ int housekeeping_any_cpu(enum hk_type type) if (cpu < nr_cpu_ids) return cpu; - cpu = cpumask_any_and(housekeeping.cpumasks[type], cpu_online_mask); + cpu = cpumask_any_and_distribute(housekeeping.cpumasks[type], cpu_online_mask); if (likely(cpu < 
nr_cpu_ids)) return cpu; /* diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index bb56805e3d47..1396674fa722 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -1440,7 +1440,7 @@ void psi_trigger_destroy(struct psi_trigger *t) group->rtpoll_task, lockdep_is_held(&group->rtpoll_trigger_lock)); rcu_assign_pointer(group->rtpoll_task, NULL); - del_timer(&group->rtpoll_timer); + timer_delete(&group->rtpoll_timer); } } mutex_unlock(&group->rtpoll_trigger_lock); diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index a4774155ae12..e40422c37033 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -89,6 +89,7 @@ void init_rt_rq(struct rt_rq *rt_rq) rt_rq->rt_throttled = 0; rt_rq->rt_runtime = 0; raw_spin_lock_init(&rt_rq->rt_runtime_lock); + rt_rq->tg = &root_task_group; #endif } @@ -127,9 +128,8 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) raw_spin_lock_init(&rt_b->rt_runtime_lock); - hrtimer_init(&rt_b->rt_period_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL_HARD); - rt_b->rt_period_timer.function = sched_rt_period_timer; + hrtimer_setup(&rt_b->rt_period_timer, sched_rt_period_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_HARD); } static inline void do_start_rt_bandwidth(struct rt_bandwidth *rt_b) @@ -176,11 +176,14 @@ static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se) static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) { + /* Cannot fold with non-CONFIG_RT_GROUP_SCHED version, layout */ + WARN_ON(!rt_group_sched_enabled() && rt_rq->tg != &root_task_group); return rt_rq->rq; } static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) { + WARN_ON(!rt_group_sched_enabled() && rt_se->rt_rq->tg != &root_task_group); return rt_se->rt_rq; } @@ -188,11 +191,15 @@ static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se) { struct rt_rq *rt_rq = rt_se->rt_rq; + WARN_ON(!rt_group_sched_enabled() && rt_rq->tg != &root_task_group); return rt_rq->rq; } void 
unregister_rt_sched_group(struct task_group *tg) { + if (!rt_group_sched_enabled()) + return; + if (tg->rt_se) destroy_rt_bandwidth(&tg->rt_bandwidth); } @@ -201,6 +208,9 @@ void free_rt_sched_group(struct task_group *tg) { int i; + if (!rt_group_sched_enabled()) + return; + for_each_possible_cpu(i) { if (tg->rt_rq) kfree(tg->rt_rq[i]); @@ -245,6 +255,9 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) struct sched_rt_entity *rt_se; int i; + if (!rt_group_sched_enabled()) + return 1; + tg->rt_rq = kcalloc(nr_cpu_ids, sizeof(rt_rq), GFP_KERNEL); if (!tg->rt_rq) goto err; @@ -483,9 +496,6 @@ static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu) static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) { - if (!rt_rq->tg) - return RUNTIME_INF; - return rt_rq->rt_runtime; } @@ -498,6 +508,11 @@ typedef struct task_group *rt_rq_iter_t; static inline struct task_group *next_task_group(struct task_group *tg) { + if (!rt_group_sched_enabled()) { + WARN_ON(tg != &root_task_group); + return NULL; + } + do { tg = list_entry_rcu(tg->list.next, typeof(struct task_group), list); @@ -510,9 +525,9 @@ static inline struct task_group *next_task_group(struct task_group *tg) } #define for_each_rt_rq(rt_rq, iter, rq) \ - for (iter = container_of(&task_groups, typeof(*iter), list); \ - (iter = next_task_group(iter)) && \ - (rt_rq = iter->rt_rq[cpu_of(rq)]);) + for (iter = &root_task_group; \ + iter && (rt_rq = iter->rt_rq[cpu_of(rq)]); \ + iter = next_task_group(iter)) #define for_each_sched_rt_entity(rt_se) \ for (; rt_se; rt_se = rt_se->parent) @@ -1067,13 +1082,12 @@ inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) { struct rq *rq = rq_of_rt_rq(rt_rq); -#ifdef CONFIG_RT_GROUP_SCHED /* * Change rq's cpupri only if rt_rq is the top queue. 
*/ - if (&rq->rt != rt_rq) + if (IS_ENABLED(CONFIG_RT_GROUP_SCHED) && &rq->rt != rt_rq) return; -#endif + if (rq->online && prio < prev_prio) cpupri_set(&rq->rd->cpupri, rq->cpu, prio); } @@ -1083,13 +1097,12 @@ dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) { struct rq *rq = rq_of_rt_rq(rt_rq); -#ifdef CONFIG_RT_GROUP_SCHED /* * Change rq's cpupri only if rt_rq is the top queue. */ - if (&rq->rt != rt_rq) + if (IS_ENABLED(CONFIG_RT_GROUP_SCHED) && &rq->rt != rt_rq) return; -#endif + if (rq->online && rt_rq->highest_prio.curr != prev_prio) cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr); } @@ -1157,8 +1170,7 @@ inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) if (rt_se_boosted(rt_se)) rt_rq->rt_nr_boosted++; - if (rt_rq->tg) - start_rt_bandwidth(&rt_rq->tg->rt_bandwidth); + start_rt_bandwidth(&rt_rq->tg->rt_bandwidth); } static void @@ -1258,11 +1270,9 @@ static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_arr static inline struct sched_statistics * __schedstats_from_rt_se(struct sched_rt_entity *rt_se) { -#ifdef CONFIG_RT_GROUP_SCHED /* schedstats is not supported for rt group. 
*/ if (!rt_entity_is_task(rt_se)) return NULL; -#endif return &rt_task_of(rt_se)->stats; } @@ -1884,6 +1894,27 @@ static int find_lowest_rq(struct task_struct *task) return -1; } +static struct task_struct *pick_next_pushable_task(struct rq *rq) +{ + struct task_struct *p; + + if (!has_pushable_tasks(rq)) + return NULL; + + p = plist_first_entry(&rq->rt.pushable_tasks, + struct task_struct, pushable_tasks); + + BUG_ON(rq->cpu != task_cpu(p)); + BUG_ON(task_current(rq, p)); + BUG_ON(task_current_donor(rq, p)); + BUG_ON(p->nr_cpus_allowed <= 1); + + BUG_ON(!task_on_rq_queued(p)); + BUG_ON(!rt_task(p)); + + return p; +} + /* Will lock the rq it finds */ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) { @@ -1914,18 +1945,16 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) /* * We had to unlock the run queue. In * the mean time, task could have - * migrated already or had its affinity changed. - * Also make sure that it wasn't scheduled on its rq. + * migrated already or had its affinity changed, + * therefore check if the task is still at the + * head of the pushable tasks list. * It is possible the task was scheduled, set * "migrate_disabled" and then got preempted, so we must * check the task migration disable flag here too. 
*/ - if (unlikely(task_rq(task) != rq || + if (unlikely(is_migration_disabled(task) || !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_mask) || - task_on_cpu(rq, task) || - !rt_task(task) || - is_migration_disabled(task) || - !task_on_rq_queued(task))) { + task != pick_next_pushable_task(rq))) { double_unlock_balance(rq, lowest_rq); lowest_rq = NULL; @@ -1945,27 +1974,6 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) return lowest_rq; } -static struct task_struct *pick_next_pushable_task(struct rq *rq) -{ - struct task_struct *p; - - if (!has_pushable_tasks(rq)) - return NULL; - - p = plist_first_entry(&rq->rt.pushable_tasks, - struct task_struct, pushable_tasks); - - BUG_ON(rq->cpu != task_cpu(p)); - BUG_ON(task_current(rq, p)); - BUG_ON(task_current_donor(rq, p)); - BUG_ON(p->nr_cpus_allowed <= 1); - - BUG_ON(!task_on_rq_queued(p)); - BUG_ON(!rt_task(p)); - - return p; -} - /* * If the current CPU has more than one RT task, see if the non * running task can migrate over to a CPU that is running a task @@ -2603,8 +2611,9 @@ static int task_is_throttled_rt(struct task_struct *p, int cpu) { struct rt_rq *rt_rq; -#ifdef CONFIG_RT_GROUP_SCHED +#ifdef CONFIG_RT_GROUP_SCHED // XXX maybe add task_rt_rq(), see also sched_rt_period_rt_rq rt_rq = task_group(p)->rt_rq[cpu]; + WARN_ON(!rt_group_sched_enabled() && rt_rq->tg != &root_task_group); #else rt_rq = &cpu_rq(cpu)->rt; #endif @@ -2714,6 +2723,9 @@ static int tg_rt_schedulable(struct task_group *tg, void *data) tg->rt_bandwidth.rt_runtime && tg_has_rt_tasks(tg)) return -EBUSY; + if (WARN_ON(!rt_group_sched_enabled() && tg != &root_task_group)) + return -EBUSY; + total = to_ratio(period, runtime); /* @@ -2869,7 +2881,7 @@ static int sched_rt_global_constraints(void) int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) { /* Don't accept real-time tasks when there is no way for them to run */ - if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) + if 
(rt_group_sched_enabled() && rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) return 0; return 1; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 47972f34ea70..475bb5998295 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -813,15 +813,17 @@ struct rt_rq { #ifdef CONFIG_RT_GROUP_SCHED int rt_throttled; - u64 rt_time; - u64 rt_runtime; + u64 rt_time; /* consumed RT time, goes up in update_curr_rt */ + u64 rt_runtime; /* allotted RT time, "slice" from rt_bandwidth, RT sharing/balancing */ /* Nests inside the rq lock: */ raw_spinlock_t rt_runtime_lock; unsigned int rt_nr_boosted; - struct rq *rq; - struct task_group *tg; + struct rq *rq; /* this is always top-level rq, cache? */ +#endif +#ifdef CONFIG_CGROUP_SCHED + struct task_group *tg; /* this tg has "this" rt_rq on given CPU for runnable entities */ #endif }; @@ -1498,6 +1500,23 @@ static inline bool sched_group_cookie_match(struct rq *rq, } #endif /* !CONFIG_SCHED_CORE */ +#ifdef CONFIG_RT_GROUP_SCHED +# ifdef CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED +DECLARE_STATIC_KEY_FALSE(rt_group_sched); +static inline bool rt_group_sched_enabled(void) +{ + return static_branch_unlikely(&rt_group_sched); +} +# else +DECLARE_STATIC_KEY_TRUE(rt_group_sched); +static inline bool rt_group_sched_enabled(void) +{ + return static_branch_likely(&rt_group_sched); +} +# endif /* CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED */ +#else +# define rt_group_sched_enabled() false +#endif /* CONFIG_RT_GROUP_SCHED */ static inline void lockdep_assert_rq_held(struct rq *rq) { @@ -1717,10 +1736,10 @@ extern struct balance_callback balance_push_callback; #ifdef CONFIG_SCHED_CLASS_EXT extern const struct sched_class ext_sched_class; -DECLARE_STATIC_KEY_FALSE(__scx_ops_enabled); /* SCX BPF scheduler loaded */ +DECLARE_STATIC_KEY_FALSE(__scx_enabled); /* SCX BPF scheduler loaded */ DECLARE_STATIC_KEY_FALSE(__scx_switched_all); /* all fair class tasks on SCX */ -#define scx_enabled() static_branch_unlikely(&__scx_ops_enabled) 
+#define scx_enabled() static_branch_unlikely(&__scx_enabled) #define scx_switched_all() static_branch_unlikely(&__scx_switched_all) static inline void scx_rq_clock_update(struct rq *rq, u64 clock) @@ -2146,6 +2165,13 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) #endif #ifdef CONFIG_RT_GROUP_SCHED + /* + * p->rt.rt_rq is NULL initially and it is easier to assign + * root_task_group's rt_rq than switching in rt_rq_of_se() + * Clobbers tg(!) + */ + if (!rt_group_sched_enabled()) + tg = &root_task_group; p->rt.rt_rq = tg->rt_rq[cpu]; p->rt.parent = tg->rt_se[cpu]; #endif @@ -3509,8 +3535,6 @@ static inline bool sched_energy_enabled(void) return static_branch_unlikely(&sched_energy_present); } -extern struct cpufreq_governor schedutil_gov; - #else /* ! (CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */ #define perf_domain_span(pd) NULL diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c index c326de1344fb..547c1f05b667 100644 --- a/kernel/sched/syscalls.c +++ b/kernel/sched/syscalls.c @@ -634,13 +634,14 @@ change: * Do not allow real-time tasks into groups that have no runtime * assigned. 
*/ - if (rt_bandwidth_enabled() && rt_policy(policy) && + if (rt_group_sched_enabled() && + rt_bandwidth_enabled() && rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0 && !task_group_is_autogroup(task_group(p))) { retval = -EPERM; goto unlock; } -#endif +#endif /* CONFIG_RT_GROUP_SCHED */ #ifdef CONFIG_SMP if (dl_bandwidth_enabled() && dl_policy(policy) && !(attr->sched_flags & SCHED_FLAG_SUGOV)) { diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index f1ebc60d967f..b958fe48e020 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -212,8 +212,6 @@ static bool sched_energy_update; static bool sched_is_eas_possible(const struct cpumask *cpu_mask) { bool any_asym_capacity = false; - struct cpufreq_policy *policy; - struct cpufreq_governor *gov; int i; /* EAS is enabled for asymmetric CPU capacity topologies. */ @@ -248,25 +246,12 @@ static bool sched_is_eas_possible(const struct cpumask *cpu_mask) return false; } - /* Do not attempt EAS if schedutil is not being used. */ - for_each_cpu(i, cpu_mask) { - policy = cpufreq_cpu_get(i); - if (!policy) { - if (sched_debug()) { - pr_info("rd %*pbl: Checking EAS, cpufreq policy not set for CPU: %d", - cpumask_pr_args(cpu_mask), i); - } - return false; - } - gov = policy->governor; - cpufreq_cpu_put(policy); - if (gov != &schedutil_gov) { - if (sched_debug()) { - pr_info("rd %*pbl: Checking EAS, schedutil is mandatory\n", - cpumask_pr_args(cpu_mask)); - } - return false; + if (!cpufreq_ready_for_eas(cpu_mask)) { + if (sched_debug()) { + pr_info("rd %*pbl: Checking EAS: cpufreq is not ready\n", + cpumask_pr_args(cpu_mask)); } + return false; } return true; @@ -1333,6 +1318,64 @@ next: update_group_capacity(sd, cpu); } +#ifdef CONFIG_SMP + +/* Update the "asym_prefer_cpu" when arch_asym_cpu_priority() changes. 
*/ +void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio) +{ + int asym_prefer_cpu = cpu; + struct sched_domain *sd; + + guard(rcu)(); + + for_each_domain(cpu, sd) { + struct sched_group *sg; + int group_cpu; + + if (!(sd->flags & SD_ASYM_PACKING)) + continue; + + /* + * Groups of overlapping domain are replicated per NUMA + * node and will require updating "asym_prefer_cpu" on + * each local copy. + * + * If you are hitting this warning, consider moving + * "sg->asym_prefer_cpu" to "sg->sgc->asym_prefer_cpu" + * which is shared by all the overlapping groups. + */ + WARN_ON_ONCE(sd->flags & SD_OVERLAP); + + sg = sd->groups; + if (cpu != sg->asym_prefer_cpu) { + /* + * Since the parent is a superset of the current group, + * if the cpu is not the "asym_prefer_cpu" at the + * current level, it cannot be the preferred CPU at a + * higher levels either. + */ + if (!sched_asym_prefer(cpu, sg->asym_prefer_cpu)) + return; + + WRITE_ONCE(sg->asym_prefer_cpu, cpu); + continue; + } + + /* Ranking has improved; CPU is still the preferred one. */ + if (new_prio >= old_prio) + continue; + + for_each_cpu(group_cpu, sched_group_span(sg)) { + if (sched_asym_prefer(group_cpu, asym_prefer_cpu)) + asym_prefer_cpu = group_cpu; + } + + WRITE_ONCE(sg->asym_prefer_cpu, asym_prefer_cpu); + } +} + +#endif /* CONFIG_SMP */ + /* * Set of available CPUs grouped by their corresponding capacities * Each list entry contains a CPU mask reflecting CPUs that share the same @@ -2098,7 +2141,7 @@ int sched_numa_find_closest(const struct cpumask *cpus, int cpu) for (i = 0; i < sched_domains_numa_levels; i++) { if (!masks[i][j]) break; - cpu = cpumask_any_and(cpus, masks[i][j]); + cpu = cpumask_any_and_distribute(cpus, masks[i][j]); if (cpu < nr_cpu_ids) { found = cpu; break; @@ -2347,35 +2390,54 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve /* * Ensure topology masks are sane, i.e. 
there are no conflicts (overlaps) for - * any two given CPUs at this (non-NUMA) topology level. + * any two given CPUs on non-NUMA topology levels. */ -static bool topology_span_sane(struct sched_domain_topology_level *tl, - const struct cpumask *cpu_map, int cpu) +static bool topology_span_sane(const struct cpumask *cpu_map) { - int i = cpu + 1; + struct sched_domain_topology_level *tl; + struct cpumask *covered, *id_seen; + int cpu; - /* NUMA levels are allowed to overlap */ - if (tl->flags & SDTL_OVERLAP) - return true; + lockdep_assert_held(&sched_domains_mutex); + covered = sched_domains_tmpmask; + id_seen = sched_domains_tmpmask2; + + for_each_sd_topology(tl) { + + /* NUMA levels are allowed to overlap */ + if (tl->flags & SDTL_OVERLAP) + continue; + + cpumask_clear(covered); + cpumask_clear(id_seen); - /* - * Non-NUMA levels cannot partially overlap - they must be either - * completely equal or completely disjoint. Otherwise we can end up - * breaking the sched_group lists - i.e. a later get_group() pass - * breaks the linking done for an earlier span. - */ - for_each_cpu_from(i, cpu_map) { /* - * We should 'and' all those masks with 'cpu_map' to exactly - * match the topology we're about to build, but that can only - * remove CPUs, which only lessens our ability to detect - * overlaps + * Non-NUMA levels cannot partially overlap - they must be either + * completely equal or completely disjoint. Otherwise we can end up + * breaking the sched_group lists - i.e. a later get_group() pass + * breaks the linking done for an earlier span. 
*/ - if (!cpumask_equal(tl->mask(cpu), tl->mask(i)) && - cpumask_intersects(tl->mask(cpu), tl->mask(i))) - return false; - } + for_each_cpu(cpu, cpu_map) { + const struct cpumask *tl_cpu_mask = tl->mask(cpu); + int id; + + /* lowest bit set in this mask is used as a unique id */ + id = cpumask_first(tl_cpu_mask); + + if (cpumask_test_cpu(id, id_seen)) { + /* First CPU has already been seen, ensure identical spans */ + if (!cpumask_equal(tl->mask(id), tl_cpu_mask)) + return false; + } else { + /* First CPU hasn't been seen before, ensure it's a completely new span */ + if (cpumask_intersects(tl_cpu_mask, covered)) + return false; + cpumask_or(covered, covered, tl_cpu_mask); + cpumask_set_cpu(id, id_seen); + } + } + } return true; } @@ -2408,9 +2470,6 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att sd = NULL; for_each_sd_topology(tl) { - if (WARN_ON(!topology_span_sane(tl, cpu_map, i))) - goto error; - sd = build_sched_domain(tl, cpu_map, attr, sd, i); has_asym |= sd->flags & SD_ASYM_CPUCAPACITY; @@ -2424,6 +2483,9 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att } } + if (WARN_ON(!topology_span_sane(cpu_map))) + goto error; + /* Build the groups for the domains */ for_each_cpu(i, cpu_map) { for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { diff --git a/kernel/signal.c b/kernel/signal.c index 027ad9e97417..148082db9a55 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -176,9 +176,10 @@ static bool recalc_sigpending_tsk(struct task_struct *t) void recalc_sigpending(void) { - if (!recalc_sigpending_tsk(current) && !freezing(current)) - clear_thread_flag(TIF_SIGPENDING); - + if (!recalc_sigpending_tsk(current) && !freezing(current)) { + if (unlikely(test_thread_flag(TIF_SIGPENDING))) + clear_thread_flag(TIF_SIGPENDING); + } } EXPORT_SYMBOL(recalc_sigpending); @@ -2092,7 +2093,7 @@ static inline void posixtimer_sig_ignore(struct task_struct *tsk, struct sigqueu * from a non-periodic timer, 
then just drop the reference * count. Otherwise queue it on the ignored list. */ - if (tmr->it_signal && tmr->it_sig_periodic) + if (posixtimer_valid(tmr) && tmr->it_sig_periodic) hlist_add_head(&tmr->ignored_list, &tsk->signal->ignored_posix_timers); else posixtimer_putref(tmr); @@ -2179,11 +2180,9 @@ bool do_notify_parent(struct task_struct *tsk, int sig) WARN_ON_ONCE(!tsk->ptrace && (tsk->group_leader != tsk || !thread_group_empty(tsk))); - /* - * Notify for thread-group leaders without subthreads. - */ - if (thread_group_empty(tsk)) - do_notify_pidfd(tsk); + + /* ptraced, or group-leader without sub-threads */ + do_notify_pidfd(tsk); if (sig != SIGCHLD) { /* @@ -4982,9 +4981,20 @@ static const struct ctl_table signal_debug_table[] = { #endif }; +static const struct ctl_table signal_table[] = { + { + .procname = "print-fatal-signals", + .data = &print_fatal_signals, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +}; + static int __init init_signal_sysctls(void) { register_sysctl_init("debug", signal_debug_table); + register_sysctl_init("kernel", signal_table); return 0; } early_initcall(init_signal_sysctls); diff --git a/kernel/softirq.c b/kernel/softirq.c index 4dae6ac2e83f..513b1945987c 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -126,6 +126,18 @@ static DEFINE_PER_CPU(struct softirq_ctrl, softirq_ctrl) = { .lock = INIT_LOCAL_LOCK(softirq_ctrl.lock), }; +#ifdef CONFIG_DEBUG_LOCK_ALLOC +static struct lock_class_key bh_lock_key; +struct lockdep_map bh_lock_map = { + .name = "local_bh", + .key = &bh_lock_key, + .wait_type_outer = LD_WAIT_FREE, + .wait_type_inner = LD_WAIT_CONFIG, /* PREEMPT_RT makes BH preemptible. 
*/ + .lock_type = LD_LOCK_PERCPU, +}; +EXPORT_SYMBOL_GPL(bh_lock_map); +#endif + /** * local_bh_blocked() - Check for idle whether BH processing is blocked * @@ -148,6 +160,8 @@ void __local_bh_disable_ip(unsigned long ip, unsigned int cnt) WARN_ON_ONCE(in_hardirq()); + lock_map_acquire_read(&bh_lock_map); + /* First entry of a task into a BH disabled section? */ if (!current->softirq_disable_cnt) { if (preemptible()) { @@ -211,6 +225,8 @@ void __local_bh_enable_ip(unsigned long ip, unsigned int cnt) WARN_ON_ONCE(in_hardirq()); lockdep_assert_irqs_enabled(); + lock_map_release(&bh_lock_map); + local_irq_save(flags); curcnt = __this_cpu_read(softirq_ctrl.cnt); @@ -261,6 +277,8 @@ static inline void ksoftirqd_run_begin(void) /* Counterpart to ksoftirqd_run_begin() */ static inline void ksoftirqd_run_end(void) { + /* pairs with the lock_map_acquire_read() in ksoftirqd_run_begin() */ + lock_map_release(&bh_lock_map); __local_bh_enable(SOFTIRQ_OFFSET, true); WARN_ON_ONCE(in_interrupt()); local_irq_enable(); diff --git a/kernel/static_call_inline.c b/kernel/static_call_inline.c index bb7d066a7c39..269683d41aa9 100644 --- a/kernel/static_call_inline.c +++ b/kernel/static_call_inline.c @@ -206,7 +206,7 @@ void __static_call_update(struct static_call_key *key, void *tramp, void *func) continue; } - arch_static_call_transform(site_addr, NULL, func, + arch_static_call_transform(site_addr, tramp, func, static_call_is_tail(site)); } } @@ -325,13 +325,12 @@ static int __static_call_mod_text_reserved(void *start, void *end) struct module *mod; int ret; - preempt_disable(); - mod = __module_text_address((unsigned long)start); - WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod); - if (!try_module_get(mod)) - mod = NULL; - preempt_enable(); - + scoped_guard(rcu) { + mod = __module_text_address((unsigned long)start); + WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod); + if (!try_module_get(mod)) + mod = NULL; + } if (!mod) return 0; diff --git 
a/kernel/sys.c b/kernel/sys.c index 4efca8a97d62..adc0de0aa364 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -52,6 +52,7 @@ #include <linux/user_namespace.h> #include <linux/time_namespace.h> #include <linux/binfmts.h> +#include <linux/futex.h> #include <linux/sched.h> #include <linux/sched/autogroup.h> @@ -2815,6 +2816,14 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, return -EINVAL; error = arch_lock_shadow_stack_status(me, arg2); break; + case PR_TIMER_CREATE_RESTORE_IDS: + if (arg3 || arg4 || arg5) + return -EINVAL; + error = posixtimer_create_prctl(arg2); + break; + case PR_FUTEX_HASH: + error = futex_hash_prctl(arg2, arg3, arg4); + break; default: trace_task_prctl_unknown(option, arg2, arg3, arg4, arg5); error = -EINVAL; diff --git a/kernel/sysctl-test.c b/kernel/sysctl-test.c index eb2842bd0557..92f94ea28957 100644 --- a/kernel/sysctl-test.c +++ b/kernel/sysctl-test.c @@ -367,54 +367,6 @@ static void sysctl_test_api_dointvec_write_single_greater_int_max( KUNIT_EXPECT_EQ(test, 0, *((int *)table.data)); } -/* - * Test that registering an invalid extra value is not allowed. 
- */ -static void sysctl_test_register_sysctl_sz_invalid_extra_value( - struct kunit *test) -{ - unsigned char data = 0; - const struct ctl_table table_foo[] = { - { - .procname = "foo", - .data = &data, - .maxlen = sizeof(u8), - .mode = 0644, - .proc_handler = proc_dou8vec_minmax, - .extra1 = SYSCTL_FOUR, - .extra2 = SYSCTL_ONE_THOUSAND, - }, - }; - - const struct ctl_table table_bar[] = { - { - .procname = "bar", - .data = &data, - .maxlen = sizeof(u8), - .mode = 0644, - .proc_handler = proc_dou8vec_minmax, - .extra1 = SYSCTL_NEG_ONE, - .extra2 = SYSCTL_ONE_HUNDRED, - }, - }; - - const struct ctl_table table_qux[] = { - { - .procname = "qux", - .data = &data, - .maxlen = sizeof(u8), - .mode = 0644, - .proc_handler = proc_dou8vec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_TWO_HUNDRED, - }, - }; - - KUNIT_EXPECT_NULL(test, register_sysctl("foo", table_foo)); - KUNIT_EXPECT_NULL(test, register_sysctl("foo", table_bar)); - KUNIT_EXPECT_NOT_NULL(test, register_sysctl("foo", table_qux)); -} - static struct kunit_case sysctl_test_cases[] = { KUNIT_CASE(sysctl_test_api_dointvec_null_tbl_data), KUNIT_CASE(sysctl_test_api_dointvec_table_maxlen_unset), @@ -426,7 +378,6 @@ static struct kunit_case sysctl_test_cases[] = { KUNIT_CASE(sysctl_test_dointvec_write_happy_single_negative), KUNIT_CASE(sysctl_test_api_dointvec_write_single_less_int_min), KUNIT_CASE(sysctl_test_api_dointvec_write_single_greater_int_max), - KUNIT_CASE(sysctl_test_register_sysctl_sz_invalid_extra_value), {} }; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 4ebe6136b08d..9b4f0cff76ea 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -20,18 +20,12 @@ */ #include <linux/module.h> -#include <linux/mm.h> -#include <linux/swap.h> -#include <linux/slab.h> #include <linux/sysctl.h> #include <linux/bitmap.h> -#include <linux/signal.h> -#include <linux/panic.h> #include <linux/printk.h> #include <linux/proc_fs.h> #include <linux/security.h> #include <linux/ctype.h> -#include <linux/kmemleak.h> 
#include <linux/filter.h> #include <linux/fs.h> #include <linux/init.h> @@ -42,25 +36,19 @@ #include <linux/highuid.h> #include <linux/writeback.h> #include <linux/ratelimit.h> -#include <linux/hugetlb.h> #include <linux/initrd.h> #include <linux/key.h> #include <linux/times.h> #include <linux/limits.h> -#include <linux/dcache.h> #include <linux/syscalls.h> -#include <linux/vmstat.h> #include <linux/nfs_fs.h> #include <linux/acpi.h> #include <linux/reboot.h> -#include <linux/ftrace.h> -#include <linux/oom.h> #include <linux/kmod.h> #include <linux/capability.h> #include <linux/binfmts.h> #include <linux/sched/sysctl.h> #include <linux/mount.h> -#include <linux/userfaultfd_k.h> #include <linux/pid.h> #include "../lib/kstrtox.h" @@ -70,12 +58,8 @@ #ifdef CONFIG_X86 #include <asm/nmi.h> -#include <asm/stacktrace.h> #include <asm/io.h> #endif -#ifdef CONFIG_SPARC -#include <asm/setup.h> -#endif #ifdef CONFIG_RT_MUTEXES #include <linux/rtmutex.h> #endif @@ -122,12 +106,6 @@ enum sysctl_writes_mode { static enum sysctl_writes_mode sysctl_writes_strict = SYSCTL_WRITES_STRICT; #endif /* CONFIG_PROC_SYSCTL */ - -#if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \ - defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT) -int sysctl_legacy_va_layout; -#endif - #endif /* CONFIG_SYSCTL */ /* @@ -1603,13 +1581,6 @@ int proc_do_static_key(const struct ctl_table *table, int write, } static const struct ctl_table kern_table[] = { - { - .procname = "panic", - .data = &panic_timeout, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, #ifdef CONFIG_PROC_SYSCTL { .procname = "tainted", @@ -1627,45 +1598,6 @@ static const struct ctl_table kern_table[] = { .extra2 = SYSCTL_ONE, }, #endif - { - .procname = "print-fatal-signals", - .data = &print_fatal_signals, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#ifdef CONFIG_SPARC - { - .procname = "reboot-cmd", - .data = reboot_command, - .maxlen = 256, - .mode = 0644, - .proc_handler = 
proc_dostring, - }, - { - .procname = "stop-a", - .data = &stop_a_enabled, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "scons-poweroff", - .data = &scons_pwroff, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#ifdef CONFIG_SPARC64 - { - .procname = "tsb-ratio", - .data = &sysctl_tsb_ratio, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif #ifdef CONFIG_PARISC { .procname = "soft-power", @@ -1684,38 +1616,6 @@ static const struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif -#ifdef CONFIG_STACK_TRACER - { - .procname = "stack_tracer_enabled", - .data = &stack_tracer_enabled, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = stack_trace_sysctl, - }, -#endif -#ifdef CONFIG_TRACING - { - .procname = "ftrace_dump_on_oops", - .data = &ftrace_dump_on_oops, - .maxlen = MAX_TRACER_SIZE, - .mode = 0644, - .proc_handler = proc_dostring, - }, - { - .procname = "traceoff_on_warning", - .data = &__disable_trace_on_warning, - .maxlen = sizeof(__disable_trace_on_warning), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "tracepoint_printk", - .data = &tracepoint_printk, - .maxlen = sizeof(tracepoint_printk), - .mode = 0644, - .proc_handler = tracepoint_printk_sysctl, - }, -#endif #ifdef CONFIG_MODULES { .procname = "modprobe", @@ -1787,29 +1687,6 @@ static const struct ctl_table kern_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_MAXOLDUID, }, -#ifdef CONFIG_S390 - { - .procname = "userprocess_debug", - .data = &show_unhandled_signals, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif - { - .procname = "panic_on_oops", - .data = &panic_on_oops, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "panic_print", - .data = &panic_print, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = 
proc_doulongvec_minmax, - }, { .procname = "ngroups_max", .data = (void *)&ngroups_max, @@ -1843,15 +1720,6 @@ static const struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif -#if defined(CONFIG_S390) && defined(CONFIG_SMP) - { - .procname = "spin_retry", - .data = &spin_retry, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif #ifdef CONFIG_SYSCTL_ARCH_UNALIGN_NO_WARN { .procname = "ignore-unaligned-usertrap", @@ -1870,15 +1738,6 @@ static const struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif - { - .procname = "panic_on_warn", - .data = &panic_on_warn, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, #ifdef CONFIG_TREE_RCU { .procname = "panic_on_rcu_stall", @@ -1901,215 +1760,9 @@ static const struct ctl_table kern_table[] = { #endif }; -static const struct ctl_table vm_table[] = { - { - .procname = "overcommit_memory", - .data = &sysctl_overcommit_memory, - .maxlen = sizeof(sysctl_overcommit_memory), - .mode = 0644, - .proc_handler = overcommit_policy_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_TWO, - }, - { - .procname = "overcommit_ratio", - .data = &sysctl_overcommit_ratio, - .maxlen = sizeof(sysctl_overcommit_ratio), - .mode = 0644, - .proc_handler = overcommit_ratio_handler, - }, - { - .procname = "overcommit_kbytes", - .data = &sysctl_overcommit_kbytes, - .maxlen = sizeof(sysctl_overcommit_kbytes), - .mode = 0644, - .proc_handler = overcommit_kbytes_handler, - }, - { - .procname = "page-cluster", - .data = &page_cluster, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = (void *)&page_cluster_max, - }, - { - .procname = "dirtytime_expire_seconds", - .data = &dirtytime_expire_interval, - .maxlen = sizeof(dirtytime_expire_interval), - .mode = 0644, - .proc_handler = dirtytime_interval_handler, - .extra1 = 
SYSCTL_ZERO, - }, - { - .procname = "swappiness", - .data = &vm_swappiness, - .maxlen = sizeof(vm_swappiness), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_TWO_HUNDRED, - }, -#ifdef CONFIG_NUMA - { - .procname = "numa_stat", - .data = &sysctl_vm_numa_stat, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = sysctl_vm_numa_stat_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif - { - .procname = "drop_caches", - .data = &sysctl_drop_caches, - .maxlen = sizeof(int), - .mode = 0200, - .proc_handler = drop_caches_sysctl_handler, - .extra1 = SYSCTL_ONE, - .extra2 = SYSCTL_FOUR, - }, - { - .procname = "page_lock_unfairness", - .data = &sysctl_page_lock_unfairness, - .maxlen = sizeof(sysctl_page_lock_unfairness), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - }, -#ifdef CONFIG_MMU - { - .procname = "max_map_count", - .data = &sysctl_max_map_count, - .maxlen = sizeof(sysctl_max_map_count), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - }, -#else - { - .procname = "nr_trim_pages", - .data = &sysctl_nr_trim_pages, - .maxlen = sizeof(sysctl_nr_trim_pages), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - }, -#endif - { - .procname = "vfs_cache_pressure", - .data = &sysctl_vfs_cache_pressure, - .maxlen = sizeof(sysctl_vfs_cache_pressure), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - }, -#if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \ - defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT) - { - .procname = "legacy_va_layout", - .data = &sysctl_legacy_va_layout, - .maxlen = sizeof(sysctl_legacy_va_layout), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - }, -#endif -#ifdef CONFIG_NUMA - { - .procname = "zone_reclaim_mode", - .data = &node_reclaim_mode, - .maxlen = sizeof(node_reclaim_mode), - .mode = 0644, - .proc_handler = 
proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - }, -#endif -#ifdef CONFIG_SMP - { - .procname = "stat_interval", - .data = &sysctl_stat_interval, - .maxlen = sizeof(sysctl_stat_interval), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "stat_refresh", - .data = NULL, - .maxlen = 0, - .mode = 0600, - .proc_handler = vmstat_refresh, - }, -#endif -#ifdef CONFIG_MMU - { - .procname = "mmap_min_addr", - .data = &dac_mmap_min_addr, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = mmap_min_addr_handler, - }, -#endif -#if (defined(CONFIG_X86_32) && !defined(CONFIG_UML))|| \ - (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL)) - { - .procname = "vdso_enabled", -#ifdef CONFIG_X86_32 - .data = &vdso32_enabled, - .maxlen = sizeof(vdso32_enabled), -#else - .data = &vdso_enabled, - .maxlen = sizeof(vdso_enabled), -#endif - .mode = 0644, - .proc_handler = proc_dointvec, - .extra1 = SYSCTL_ZERO, - }, -#endif - { - .procname = "user_reserve_kbytes", - .data = &sysctl_user_reserve_kbytes, - .maxlen = sizeof(sysctl_user_reserve_kbytes), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - }, - { - .procname = "admin_reserve_kbytes", - .data = &sysctl_admin_reserve_kbytes, - .maxlen = sizeof(sysctl_admin_reserve_kbytes), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - }, -#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS - { - .procname = "mmap_rnd_bits", - .data = &mmap_rnd_bits, - .maxlen = sizeof(mmap_rnd_bits), - .mode = 0600, - .proc_handler = proc_dointvec_minmax, - .extra1 = (void *)&mmap_rnd_bits_min, - .extra2 = (void *)&mmap_rnd_bits_max, - }, -#endif -#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS - { - .procname = "mmap_rnd_compat_bits", - .data = &mmap_rnd_compat_bits, - .maxlen = sizeof(mmap_rnd_compat_bits), - .mode = 0600, - .proc_handler = proc_dointvec_minmax, - .extra1 = (void *)&mmap_rnd_compat_bits_min, - .extra2 = (void *)&mmap_rnd_compat_bits_max, - }, -#endif -}; - int __init sysctl_init_bases(void) { 
register_sysctl_init("kernel", kern_table); - register_sysctl_init("vm", vm_table); return 0; } diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 0ddccdff119a..577f0e6842d4 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -70,12 +70,10 @@ static DEFINE_SPINLOCK(rtcdev_lock); */ struct rtc_device *alarmtimer_get_rtcdev(void) { - unsigned long flags; struct rtc_device *ret; - spin_lock_irqsave(&rtcdev_lock, flags); + guard(spinlock_irqsave)(&rtcdev_lock); ret = rtcdev; - spin_unlock_irqrestore(&rtcdev_lock, flags); return ret; } @@ -83,7 +81,6 @@ EXPORT_SYMBOL_GPL(alarmtimer_get_rtcdev); static int alarmtimer_rtc_add_device(struct device *dev) { - unsigned long flags; struct rtc_device *rtc = to_rtc_device(dev); struct platform_device *pdev; int ret = 0; @@ -101,25 +98,18 @@ static int alarmtimer_rtc_add_device(struct device *dev) if (!IS_ERR(pdev)) device_init_wakeup(&pdev->dev, true); - spin_lock_irqsave(&rtcdev_lock, flags); - if (!IS_ERR(pdev) && !rtcdev) { - if (!try_module_get(rtc->owner)) { + scoped_guard(spinlock_irqsave, &rtcdev_lock) { + if (!IS_ERR(pdev) && !rtcdev && try_module_get(rtc->owner)) { + rtcdev = rtc; + /* hold a reference so it doesn't go away */ + get_device(dev); + pdev = NULL; + } else { ret = -1; - goto unlock; } - - rtcdev = rtc; - /* hold a reference so it doesn't go away */ - get_device(dev); - pdev = NULL; - } else { - ret = -1; } -unlock: - spin_unlock_irqrestore(&rtcdev_lock, flags); platform_device_unregister(pdev); - return ret; } @@ -198,7 +188,7 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer) struct alarm *alarm = container_of(timer, struct alarm, timer); struct alarm_base *base = &alarm_bases[alarm->type]; - scoped_guard (spinlock_irqsave, &base->lock) + scoped_guard(spinlock_irqsave, &base->lock) alarmtimer_dequeue(base, alarm); if (alarm->function) @@ -228,17 +218,16 @@ EXPORT_SYMBOL_GPL(alarm_expires_remaining); static int alarmtimer_suspend(struct device 
*dev) { ktime_t min, now, expires; - int i, ret, type; struct rtc_device *rtc; - unsigned long flags; struct rtc_time tm; + int i, ret, type; - spin_lock_irqsave(&freezer_delta_lock, flags); - min = freezer_delta; - expires = freezer_expires; - type = freezer_alarmtype; - freezer_delta = 0; - spin_unlock_irqrestore(&freezer_delta_lock, flags); + scoped_guard(spinlock_irqsave, &freezer_delta_lock) { + min = freezer_delta; + expires = freezer_expires; + type = freezer_alarmtype; + freezer_delta = 0; + } rtc = alarmtimer_get_rtcdev(); /* If we have no rtcdev, just return */ @@ -251,9 +240,8 @@ static int alarmtimer_suspend(struct device *dev) struct timerqueue_node *next; ktime_t delta; - spin_lock_irqsave(&base->lock, flags); - next = timerqueue_getnext(&base->timerqueue); - spin_unlock_irqrestore(&base->lock, flags); + scoped_guard(spinlock_irqsave, &base->lock) + next = timerqueue_getnext(&base->timerqueue); if (!next) continue; delta = ktime_sub(next->expires, base->get_ktime()); @@ -352,13 +340,12 @@ EXPORT_SYMBOL_GPL(alarm_init); void alarm_start(struct alarm *alarm, ktime_t start) { struct alarm_base *base = &alarm_bases[alarm->type]; - unsigned long flags; - spin_lock_irqsave(&base->lock, flags); - alarm->node.expires = start; - alarmtimer_enqueue(base, alarm); - hrtimer_start(&alarm->timer, alarm->node.expires, HRTIMER_MODE_ABS); - spin_unlock_irqrestore(&base->lock, flags); + scoped_guard(spinlock_irqsave, &base->lock) { + alarm->node.expires = start; + alarmtimer_enqueue(base, alarm); + hrtimer_start(&alarm->timer, alarm->node.expires, HRTIMER_MODE_ABS); + } trace_alarmtimer_start(alarm, base->get_ktime()); } @@ -381,13 +368,11 @@ EXPORT_SYMBOL_GPL(alarm_start_relative); void alarm_restart(struct alarm *alarm) { struct alarm_base *base = &alarm_bases[alarm->type]; - unsigned long flags; - spin_lock_irqsave(&base->lock, flags); + guard(spinlock_irqsave)(&base->lock); hrtimer_set_expires(&alarm->timer, alarm->node.expires); hrtimer_restart(&alarm->timer); 
alarmtimer_enqueue(base, alarm); - spin_unlock_irqrestore(&base->lock, flags); } EXPORT_SYMBOL_GPL(alarm_restart); @@ -401,14 +386,13 @@ EXPORT_SYMBOL_GPL(alarm_restart); int alarm_try_to_cancel(struct alarm *alarm) { struct alarm_base *base = &alarm_bases[alarm->type]; - unsigned long flags; int ret; - spin_lock_irqsave(&base->lock, flags); - ret = hrtimer_try_to_cancel(&alarm->timer); - if (ret >= 0) - alarmtimer_dequeue(base, alarm); - spin_unlock_irqrestore(&base->lock, flags); + scoped_guard(spinlock_irqsave, &base->lock) { + ret = hrtimer_try_to_cancel(&alarm->timer); + if (ret >= 0) + alarmtimer_dequeue(base, alarm); + } trace_alarmtimer_cancel(alarm, base->get_ktime()); return ret; @@ -479,7 +463,6 @@ EXPORT_SYMBOL_GPL(alarm_forward_now); static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type) { struct alarm_base *base; - unsigned long flags; ktime_t delta; switch(type) { @@ -498,13 +481,12 @@ static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type) delta = ktime_sub(absexp, base->get_ktime()); - spin_lock_irqsave(&freezer_delta_lock, flags); + guard(spinlock_irqsave)(&freezer_delta_lock); if (!freezer_delta || (delta < freezer_delta)) { freezer_delta = delta; freezer_expires = absexp; freezer_alarmtype = type; } - spin_unlock_irqrestore(&freezer_delta_lock, flags); } /** @@ -515,9 +497,9 @@ static enum alarmtimer_type clock2alarm(clockid_t clockid) { if (clockid == CLOCK_REALTIME_ALARM) return ALARM_REALTIME; - if (clockid == CLOCK_BOOTTIME_ALARM) - return ALARM_BOOTTIME; - return -1; + + WARN_ON_ONCE(clockid != CLOCK_BOOTTIME_ALARM); + return ALARM_BOOTTIME; } /** diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 2a7802ec480c..6a8bc7da9062 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -310,7 +310,7 @@ static void clocksource_verify_choose_cpus(void) { int cpu, i, n = verify_n_cpus; - if (n < 0) { + if (n < 0 || n >= num_online_cpus()) { /* Check all of the CPUs. 
*/ cpumask_copy(&cpus_chosen, cpu_online_mask); cpumask_clear_cpu(smp_processor_id(), &cpus_chosen); @@ -619,7 +619,7 @@ static inline void clocksource_stop_watchdog(void) { if (!watchdog_running || (watchdog && !list_empty(&watchdog_list))) return; - del_timer(&watchdog_timer); + timer_delete(&watchdog_timer); watchdog_running = 0; } @@ -1510,7 +1510,7 @@ static int __init boot_override_clocksource(char* str) { mutex_lock(&clocksource_mutex); if (str) - strscpy(override_name, str, sizeof(override_name)); + strscpy(override_name, str); mutex_unlock(&clocksource_mutex); return 1; } diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index deb1aa32814e..30899a8cc52c 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -117,16 +117,6 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = .csd = CSD_INIT(retrigger_next_event, NULL) }; -static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { - /* Make sure we catch unsupported clockids */ - [0 ... MAX_CLOCKS - 1] = HRTIMER_MAX_CLOCK_BASES, - - [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME, - [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC, - [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME, - [CLOCK_TAI] = HRTIMER_BASE_TAI, -}; - static inline bool hrtimer_base_is_online(struct hrtimer_cpu_base *base) { if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) @@ -376,7 +366,7 @@ static const struct debug_obj_descr hrtimer_debug_descr; static void *hrtimer_debug_hint(void *addr) { - return ((struct hrtimer *) addr)->function; + return ACCESS_PRIVATE((struct hrtimer *)addr, function); } /* @@ -475,19 +465,17 @@ static inline void debug_hrtimer_activate(struct hrtimer *timer, static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { } #endif -static inline void -debug_init(struct hrtimer *timer, clockid_t clockid, - enum hrtimer_mode mode) +static inline void debug_setup(struct hrtimer *timer, clockid_t clockid, enum hrtimer_mode mode) { debug_hrtimer_init(timer); - trace_hrtimer_init(timer, clockid, mode); + 
trace_hrtimer_setup(timer, clockid, mode); } -static inline void debug_init_on_stack(struct hrtimer *timer, clockid_t clockid, - enum hrtimer_mode mode) +static inline void debug_setup_on_stack(struct hrtimer *timer, clockid_t clockid, + enum hrtimer_mode mode) { debug_hrtimer_init_on_stack(timer); - trace_hrtimer_init(timer, clockid, mode); + trace_hrtimer_setup(timer, clockid, mode); } static inline void debug_activate(struct hrtimer *timer, @@ -1326,8 +1314,6 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, struct hrtimer_clock_base *base; unsigned long flags; - if (WARN_ON_ONCE(!timer->function)) - return; /* * Check whether the HRTIMER_MODE_SOFT bit and hrtimer.is_soft * match on CONFIG_PREEMPT_RT = n. With PREEMPT_RT check the hard @@ -1439,7 +1425,7 @@ static __always_inline bool is_migration_base(struct hrtimer_clock_base *base) * running. * * This prevents priority inversion: if the soft irq thread is preempted - * in the middle of a timer callback, then calling del_timer_sync() can + * in the middle of a timer callback, then calling hrtimer_cancel() can * lead to two issues: * * - If the caller is on a remote CPU then it has to spin wait for the timer @@ -1587,23 +1573,24 @@ u64 hrtimer_next_event_without(const struct hrtimer *exclude) static inline int hrtimer_clockid_to_base(clockid_t clock_id) { - if (likely(clock_id < MAX_CLOCKS)) { - int base = hrtimer_clock_to_base_table[clock_id]; - - if (likely(base != HRTIMER_MAX_CLOCK_BASES)) - return base; + switch (clock_id) { + case CLOCK_REALTIME: + return HRTIMER_BASE_REALTIME; + case CLOCK_MONOTONIC: + return HRTIMER_BASE_MONOTONIC; + case CLOCK_BOOTTIME: + return HRTIMER_BASE_BOOTTIME; + case CLOCK_TAI: + return HRTIMER_BASE_TAI; + default: + WARN(1, "Invalid clockid %d. Using MONOTONIC\n", clock_id); + return HRTIMER_BASE_MONOTONIC; } - WARN(1, "Invalid clockid %d. 
Using MONOTONIC\n", clock_id); - return HRTIMER_BASE_MONOTONIC; -} - -static enum hrtimer_restart hrtimer_dummy_timeout(struct hrtimer *unused) -{ - return HRTIMER_NORESTART; } -static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, - enum hrtimer_mode mode) +static void __hrtimer_setup(struct hrtimer *timer, + enum hrtimer_restart (*function)(struct hrtimer *), + clockid_t clock_id, enum hrtimer_mode mode) { bool softtimer = !!(mode & HRTIMER_MODE_SOFT); struct hrtimer_cpu_base *cpu_base; @@ -1636,41 +1623,14 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, timer->is_hard = !!(mode & HRTIMER_MODE_HARD); timer->base = &cpu_base->clock_base[base]; timerqueue_init(&timer->node); -} - -static void __hrtimer_setup(struct hrtimer *timer, - enum hrtimer_restart (*function)(struct hrtimer *), - clockid_t clock_id, enum hrtimer_mode mode) -{ - __hrtimer_init(timer, clock_id, mode); if (WARN_ON_ONCE(!function)) - timer->function = hrtimer_dummy_timeout; + ACCESS_PRIVATE(timer, function) = hrtimer_dummy_timeout; else - timer->function = function; + ACCESS_PRIVATE(timer, function) = function; } /** - * hrtimer_init - initialize a timer to the given clock - * @timer: the timer to be initialized - * @clock_id: the clock to be used - * @mode: The modes which are relevant for initialization: - * HRTIMER_MODE_ABS, HRTIMER_MODE_REL, HRTIMER_MODE_ABS_SOFT, - * HRTIMER_MODE_REL_SOFT - * - * The PINNED variants of the above can be handed in, - * but the PINNED bit is ignored as pinning happens - * when the hrtimer is started - */ -void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, - enum hrtimer_mode mode) -{ - debug_init(timer, clock_id, mode); - __hrtimer_init(timer, clock_id, mode); -} -EXPORT_SYMBOL_GPL(hrtimer_init); - -/** * hrtimer_setup - initialize a timer to the given clock * @timer: the timer to be initialized * @function: the callback function @@ -1686,7 +1646,7 @@ EXPORT_SYMBOL_GPL(hrtimer_init); void hrtimer_setup(struct 
hrtimer *timer, enum hrtimer_restart (*function)(struct hrtimer *), clockid_t clock_id, enum hrtimer_mode mode) { - debug_init(timer, clock_id, mode); + debug_setup(timer, clock_id, mode); __hrtimer_setup(timer, function, clock_id, mode); } EXPORT_SYMBOL_GPL(hrtimer_setup); @@ -1705,7 +1665,7 @@ void hrtimer_setup_on_stack(struct hrtimer *timer, enum hrtimer_restart (*function)(struct hrtimer *), clockid_t clock_id, enum hrtimer_mode mode) { - debug_init_on_stack(timer, clock_id, mode); + debug_setup_on_stack(timer, clock_id, mode); __hrtimer_setup(timer, function, clock_id, mode); } EXPORT_SYMBOL_GPL(hrtimer_setup_on_stack); @@ -1779,7 +1739,7 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, raw_write_seqcount_barrier(&base->seq); __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0); - fn = timer->function; + fn = ACCESS_PRIVATE(timer, function); /* * Clear the 'is relative' flag for the TIME_LOW_RES case. If the @@ -2054,7 +2014,7 @@ void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl, * Make the enqueue delivery mode check work on RT. If the sleeper * was initialized for hard interrupt delivery, force the mode bit. * This is a special case for hrtimer_sleepers because - * __hrtimer_init_sleeper() determines the delivery mode on RT so the + * __hrtimer_setup_sleeper() determines the delivery mode on RT so the * fiddling with this decision is avoided at the call sites. 
*/ if (IS_ENABLED(CONFIG_PREEMPT_RT) && sl->timer.is_hard) @@ -2064,8 +2024,8 @@ void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl, } EXPORT_SYMBOL_GPL(hrtimer_sleeper_start_expires); -static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl, - clockid_t clock_id, enum hrtimer_mode mode) +static void __hrtimer_setup_sleeper(struct hrtimer_sleeper *sl, + clockid_t clock_id, enum hrtimer_mode mode) { /* * On PREEMPT_RT enabled kernels hrtimers which are not explicitly @@ -2091,8 +2051,7 @@ static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl, mode |= HRTIMER_MODE_HARD; } - __hrtimer_init(&sl->timer, clock_id, mode); - sl->timer.function = hrtimer_wakeup; + __hrtimer_setup(&sl->timer, hrtimer_wakeup, clock_id, mode); sl->task = current; } @@ -2105,8 +2064,8 @@ static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl, void hrtimer_setup_sleeper_on_stack(struct hrtimer_sleeper *sl, clockid_t clock_id, enum hrtimer_mode mode) { - debug_init_on_stack(&sl->timer, clock_id, mode); - __hrtimer_init_sleeper(sl, clock_id, mode); + debug_setup_on_stack(&sl->timer, clock_id, mode); + __hrtimer_setup_sleeper(sl, clock_id, mode); } EXPORT_SYMBOL_GPL(hrtimer_setup_sleeper_on_stack); diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index bc4db9e5ab70..34eeacac2253 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -75,13 +75,11 @@ struct clocksource * __init __weak clocksource_default_clock(void) static struct clocksource refined_jiffies; -int register_refined_jiffies(long cycles_per_second) +void __init register_refined_jiffies(long cycles_per_second) { u64 nsec_per_tick, shift_hz; long cycles_per_tick; - - refined_jiffies = clocksource_jiffies; refined_jiffies.name = "refined-jiffies"; refined_jiffies.rating++; @@ -100,5 +98,4 @@ int register_refined_jiffies(long cycles_per_second) refined_jiffies.mult = ((u32)nsec_per_tick) << JIFFIES_SHIFT; __clocksource_register(&refined_jiffies); - return 0; } diff --git 
a/kernel/time/namespace.c b/kernel/time/namespace.c index 0775b9ec952a..e3642278df43 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -165,26 +165,26 @@ static struct timens_offset offset_from_ts(struct timespec64 off) * HVCLOCK * VVAR * - * The check for vdso_data->clock_mode is in the unlikely path of + * The check for vdso_clock->clock_mode is in the unlikely path of * the seq begin magic. So for the non-timens case most of the time * 'seq' is even, so the branch is not taken. * * If 'seq' is odd, i.e. a concurrent update is in progress, the extra check - * for vdso_data->clock_mode is a non-issue. The task is spin waiting for the + * for vdso_clock->clock_mode is a non-issue. The task is spin waiting for the * update to finish and for 'seq' to become even anyway. * - * Timens page has vdso_data->clock_mode set to VDSO_CLOCKMODE_TIMENS which + * Timens page has vdso_clock->clock_mode set to VDSO_CLOCKMODE_TIMENS which * enforces the time namespace handling path. */ -static void timens_setup_vdso_data(struct vdso_data *vdata, - struct time_namespace *ns) +static void timens_setup_vdso_clock_data(struct vdso_clock *vc, + struct time_namespace *ns) { - struct timens_offset *offset = vdata->offset; + struct timens_offset *offset = vc->offset; struct timens_offset monotonic = offset_from_ts(ns->offsets.monotonic); struct timens_offset boottime = offset_from_ts(ns->offsets.boottime); - vdata->seq = 1; - vdata->clock_mode = VDSO_CLOCKMODE_TIMENS; + vc->seq = 1; + vc->clock_mode = VDSO_CLOCKMODE_TIMENS; offset[CLOCK_MONOTONIC] = monotonic; offset[CLOCK_MONOTONIC_RAW] = monotonic; offset[CLOCK_MONOTONIC_COARSE] = monotonic; @@ -219,7 +219,8 @@ static DEFINE_MUTEX(offset_lock); static void timens_set_vvar_page(struct task_struct *task, struct time_namespace *ns) { - struct vdso_data *vdata; + struct vdso_time_data *vdata; + struct vdso_clock *vc; unsigned int i; if (ns == &init_time_ns) @@ -235,10 +236,11 @@ static void timens_set_vvar_page(struct 
task_struct *task, goto out; ns->frozen_offsets = true; - vdata = arch_get_vdso_data(page_address(ns->vvar_page)); + vdata = page_address(ns->vvar_page); + vc = vdata->clock_data; for (i = 0; i < CS_BASES; i++) - timens_setup_vdso_data(&vdata[i], ns); + timens_setup_vdso_clock_data(&vc[i], ns); out: mutex_unlock(&offset_lock); diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 163e7a2033b6..b837d3d9d325 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -678,8 +678,7 @@ void ntp_notify_cmos_timer(bool offset_set) static void __init ntp_init_cmos_sync(void) { - hrtimer_init(&sync_hrtimer, CLOCK_REALTIME, HRTIMER_MODE_ABS); - sync_hrtimer.function = sync_timer_callback; + hrtimer_setup(&sync_hrtimer, sync_timer_callback, CLOCK_REALTIME, HRTIMER_MODE_ABS); } #else /* CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC) */ static inline void __init ntp_init_cmos_sync(void) { } diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c index 1af0bb2cc45c..101a0f7c43e0 100644 --- a/kernel/time/posix-clock.c +++ b/kernel/time/posix-clock.c @@ -90,26 +90,6 @@ static long posix_clock_ioctl(struct file *fp, return err; } -#ifdef CONFIG_COMPAT -static long posix_clock_compat_ioctl(struct file *fp, - unsigned int cmd, unsigned long arg) -{ - struct posix_clock_context *pccontext = fp->private_data; - struct posix_clock *clk = get_posix_clock(fp); - int err = -ENOTTY; - - if (!clk) - return -ENODEV; - - if (clk->ops.ioctl) - err = clk->ops.ioctl(pccontext, cmd, arg); - - put_posix_clock(clk); - - return err; -} -#endif - static int posix_clock_open(struct inode *inode, struct file *fp) { int err; @@ -129,6 +109,7 @@ static int posix_clock_open(struct inode *inode, struct file *fp) goto out; } pccontext->clk = clk; + pccontext->fp = fp; if (clk->ops.open) { err = clk->ops.open(pccontext, fp->f_mode); if (err) { @@ -171,11 +152,9 @@ static const struct file_operations posix_clock_file_operations = { .read = posix_clock_read, .poll = posix_clock_poll, 
.unlocked_ioctl = posix_clock_ioctl, + .compat_ioctl = posix_clock_ioctl, .open = posix_clock_open, .release = posix_clock_release, -#ifdef CONFIG_COMPAT - .compat_ioctl = posix_clock_compat_ioctl, -#endif }; int posix_clock_register(struct posix_clock *clk, struct device *dev) @@ -251,7 +230,7 @@ static int pc_clock_adjtime(clockid_t id, struct __kernel_timex *tx) if (err) return err; - if ((cd.fp->f_mode & FMODE_WRITE) == 0) { + if (tx->modes && (cd.fp->f_mode & FMODE_WRITE) == 0) { err = -EACCES; goto out; } diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 1b675aee99a9..2053b1a4c9e4 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -9,34 +9,27 @@ * * These are all the functions necessary to implement POSIX clocks & timers */ -#include <linux/mm.h> +#include <linux/compat.h> +#include <linux/compiler.h> +#include <linux/init.h> +#include <linux/jhash.h> #include <linux/interrupt.h> -#include <linux/slab.h> -#include <linux/time.h> -#include <linux/mutex.h> -#include <linux/sched/task.h> - -#include <linux/uaccess.h> #include <linux/list.h> -#include <linux/init.h> -#include <linux/compiler.h> -#include <linux/hash.h> +#include <linux/memblock.h> +#include <linux/nospec.h> #include <linux/posix-clock.h> #include <linux/posix-timers.h> +#include <linux/prctl.h> +#include <linux/sched/task.h> +#include <linux/slab.h> #include <linux/syscalls.h> -#include <linux/wait.h> -#include <linux/workqueue.h> -#include <linux/export.h> -#include <linux/hashtable.h> -#include <linux/compat.h> -#include <linux/nospec.h> +#include <linux/time.h> #include <linux/time_namespace.h> +#include <linux/uaccess.h> #include "timekeeping.h" #include "posix-timers.h" -static struct kmem_cache *posix_timers_cache; - /* * Timers are managed in a hash table for lockless lookup. 
The hash key is * constructed from current::signal and the timer ID and the timer is @@ -46,39 +39,67 @@ static struct kmem_cache *posix_timers_cache; * This allows checkpoint/restore to reconstruct the exact timer IDs for * a process. */ -static DEFINE_HASHTABLE(posix_timers_hashtable, 9); -static DEFINE_SPINLOCK(hash_lock); +struct timer_hash_bucket { + spinlock_t lock; + struct hlist_head head; +}; + +static struct { + struct timer_hash_bucket *buckets; + unsigned long mask; + struct kmem_cache *cache; +} __timer_data __ro_after_init __aligned(4*sizeof(long)); + +#define timer_buckets (__timer_data.buckets) +#define timer_hashmask (__timer_data.mask) +#define posix_timers_cache (__timer_data.cache) static const struct k_clock * const posix_clocks[]; static const struct k_clock *clockid_to_kclock(const clockid_t id); static const struct k_clock clock_realtime, clock_monotonic; +#define TIMER_ANY_ID INT_MIN + /* SIGEV_THREAD_ID cannot share a bit with the other SIGEV values. */ #if SIGEV_THREAD_ID != (SIGEV_THREAD_ID & \ ~(SIGEV_SIGNAL | SIGEV_NONE | SIGEV_THREAD)) #error "SIGEV_THREAD_ID must not share bit with other SIGEV values!" 
#endif -static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags); +static struct k_itimer *__lock_timer(timer_t timer_id); -#define lock_timer(tid, flags) \ -({ struct k_itimer *__timr; \ - __cond_lock(&__timr->it_lock, __timr = __lock_timer(tid, flags)); \ - __timr; \ +#define lock_timer(tid) \ +({ struct k_itimer *__timr; \ + __cond_lock(&__timr->it_lock, __timr = __lock_timer(tid)); \ + __timr; \ }) -static int hash(struct signal_struct *sig, unsigned int nr) +static inline void unlock_timer(struct k_itimer *timr) { - return hash_32(hash32_ptr(sig) ^ nr, HASH_BITS(posix_timers_hashtable)); + if (likely((timr))) + spin_unlock_irq(&timr->it_lock); } -static struct k_itimer *__posix_timers_find(struct hlist_head *head, - struct signal_struct *sig, - timer_t id) +#define scoped_timer_get_or_fail(_id) \ + scoped_cond_guard(lock_timer, return -EINVAL, _id) + +#define scoped_timer (scope) + +DEFINE_CLASS(lock_timer, struct k_itimer *, unlock_timer(_T), __lock_timer(id), timer_t id); +DEFINE_CLASS_IS_COND_GUARD(lock_timer); + +static struct timer_hash_bucket *hash_bucket(struct signal_struct *sig, unsigned int nr) { + return &timer_buckets[jhash2((u32 *)&sig, sizeof(sig) / sizeof(u32), nr) & timer_hashmask]; +} + +static struct k_itimer *posix_timer_by_id(timer_t id) +{ + struct signal_struct *sig = current->signal; + struct timer_hash_bucket *bucket = hash_bucket(sig, id); struct k_itimer *timer; - hlist_for_each_entry_rcu(timer, head, t_hash, lockdep_is_held(&hash_lock)) { + hlist_for_each_entry_rcu(timer, &bucket->head, t_hash) { /* timer->it_signal can be set concurrently */ if ((READ_ONCE(timer->it_signal) == sig) && (timer->it_id == id)) return timer; @@ -86,46 +107,88 @@ static struct k_itimer *__posix_timers_find(struct hlist_head *head, return NULL; } -static struct k_itimer *posix_timer_by_id(timer_t id) +static inline struct signal_struct *posix_sig_owner(const struct k_itimer *timer) { - struct signal_struct *sig = current->signal; - 
struct hlist_head *head = &posix_timers_hashtable[hash(sig, id)]; + unsigned long val = (unsigned long)timer->it_signal; - return __posix_timers_find(head, sig, id); + /* + * Mask out bit 0, which acts as invalid marker to prevent + * posix_timer_by_id() detecting it as valid. + */ + return (struct signal_struct *)(val & ~1UL); } -static int posix_timer_add(struct k_itimer *timer) +static bool posix_timer_hashed(struct timer_hash_bucket *bucket, struct signal_struct *sig, + timer_t id) { - struct signal_struct *sig = current->signal; - struct hlist_head *head; - unsigned int cnt, id; + struct hlist_head *head = &bucket->head; + struct k_itimer *timer; - /* - * FIXME: Replace this by a per signal struct xarray once there is - * a plan to handle the resulting CRIU regression gracefully. - */ - for (cnt = 0; cnt <= INT_MAX; cnt++) { - spin_lock(&hash_lock); - id = sig->next_posix_timer_id; + hlist_for_each_entry_rcu(timer, head, t_hash, lockdep_is_held(&bucket->lock)) { + if ((posix_sig_owner(timer) == sig) && (timer->it_id == id)) + return true; + } + return false; +} - /* Write the next ID back. Clamp it to the positive space */ - sig->next_posix_timer_id = (id + 1) & INT_MAX; +static bool posix_timer_add_at(struct k_itimer *timer, struct signal_struct *sig, unsigned int id) +{ + struct timer_hash_bucket *bucket = hash_bucket(sig, id); - head = &posix_timers_hashtable[hash(sig, id)]; - if (!__posix_timers_find(head, sig, id)) { - hlist_add_head_rcu(&timer->t_hash, head); - spin_unlock(&hash_lock); - return id; + scoped_guard (spinlock, &bucket->lock) { + /* + * Validate under the lock as this could have raced against + * another thread ending up with the same ID, which is + * highly unlikely, but possible. + */ + if (!posix_timer_hashed(bucket, sig, id)) { + /* + * Set the timer ID and the signal pointer to make + * it identifiable in the hash table. The signal + * pointer has bit 0 set to indicate that it is not + * yet fully initialized. 
posix_timer_hashed() + * masks this bit out, but the syscall lookup fails + * to match due to it being set. This guarantees + * that there can't be duplicate timer IDs handed + * out. + */ + timer->it_id = (timer_t)id; + timer->it_signal = (struct signal_struct *)((unsigned long)sig | 1UL); + hlist_add_head_rcu(&timer->t_hash, &bucket->head); + return true; } - spin_unlock(&hash_lock); } - /* POSIX return code when no timer ID could be allocated */ - return -EAGAIN; + return false; } -static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) +static int posix_timer_add(struct k_itimer *timer, int req_id) { - spin_unlock_irqrestore(&timr->it_lock, flags); + struct signal_struct *sig = current->signal; + + if (unlikely(req_id != TIMER_ANY_ID)) { + if (!posix_timer_add_at(timer, sig, req_id)) + return -EBUSY; + + /* + * Move the ID counter past the requested ID, so that after + * switching back to normal mode the IDs are outside of the + * exact allocated region. That avoids ID collisions on the + * next regular timer_create() invocations. 
+ */ + atomic_set(&sig->next_posix_timer_id, req_id + 1); + return req_id; + } + + for (unsigned int cnt = 0; cnt <= INT_MAX; cnt++) { + /* Get the next timer ID and clamp it to positive space */ + unsigned int id = atomic_fetch_inc(&sig->next_posix_timer_id) & INT_MAX; + + if (posix_timer_add_at(timer, sig, id)) + return id; + cond_resched(); + } + /* POSIX return code when no timer ID could be allocated */ + return -EAGAIN; } static int posix_get_realtime_timespec(clockid_t which_clock, struct timespec64 *tp) @@ -220,15 +283,6 @@ static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec64 *tp) return 0; } -static __init int init_posix_timers(void) -{ - posix_timers_cache = kmem_cache_create("posix_timers_cache", - sizeof(struct k_itimer), 0, - SLAB_PANIC | SLAB_ACCOUNT, NULL); - return 0; -} -__initcall(init_posix_timers); - /* * The siginfo si_overrun field and the return value of timer_getoverrun(2) * are of type int. Clamp the overrun value to INT_MAX @@ -259,7 +313,7 @@ static bool __posixtimer_deliver_signal(struct kernel_siginfo *info, struct k_it * since the signal was queued. In either case, don't rearm and * drop the signal. */ - if (timr->it_signal_seq != timr->it_sigqueue_seq || WARN_ON_ONCE(!timr->it_signal)) + if (timr->it_signal_seq != timr->it_sigqueue_seq || WARN_ON_ONCE(!posixtimer_valid(timr))) return false; if (!timr->it_interval || WARN_ON_ONCE(timr->it_status != POSIX_TIMER_REQUEUE_PENDING)) @@ -304,6 +358,9 @@ void posix_timer_queue_signal(struct k_itimer *timr) { lockdep_assert_held(&timr->it_lock); + if (!posixtimer_valid(timr)) + return; + timr->it_status = timr->it_interval ? 
POSIX_TIMER_REQUEUE_PENDING : POSIX_TIMER_DISARMED; posixtimer_send_sigqueue(timr); } @@ -324,6 +381,21 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer) return HRTIMER_NORESTART; } +long posixtimer_create_prctl(unsigned long ctrl) +{ + switch (ctrl) { + case PR_TIMER_CREATE_RESTORE_IDS_OFF: + current->signal->timer_create_restore_ids = 0; + return 0; + case PR_TIMER_CREATE_RESTORE_IDS_ON: + current->signal->timer_create_restore_ids = 1; + return 0; + case PR_TIMER_CREATE_RESTORE_IDS_GET: + return current->signal->timer_create_restore_ids; + } + return -EINVAL; +} + static struct pid *good_sigevent(sigevent_t * event) { struct pid *pid = task_tgid(current); @@ -350,8 +422,12 @@ static struct pid *good_sigevent(sigevent_t * event) static struct k_itimer *alloc_posix_timer(void) { - struct k_itimer *tmr = kmem_cache_zalloc(posix_timers_cache, GFP_KERNEL); + struct k_itimer *tmr; + if (unlikely(!posix_timers_cache)) + return NULL; + + tmr = kmem_cache_zalloc(posix_timers_cache, GFP_KERNEL); if (!tmr) return tmr; @@ -373,15 +449,16 @@ void posixtimer_free_timer(struct k_itimer *tmr) static void posix_timer_unhash_and_free(struct k_itimer *tmr) { - spin_lock(&hash_lock); - hlist_del_rcu(&tmr->t_hash); - spin_unlock(&hash_lock); + struct timer_hash_bucket *bucket = hash_bucket(posix_sig_owner(tmr), tmr->it_id); + + scoped_guard (spinlock, &bucket->lock) + hlist_del_rcu(&tmr->t_hash); posixtimer_putref(tmr); } static int common_timer_create(struct k_itimer *new_timer) { - hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0); + hrtimer_setup(&new_timer->it.real.timer, posix_timer_fn, new_timer->it_clock, 0); return 0; } @@ -390,6 +467,7 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event, timer_t __user *created_timer_id) { const struct k_clock *kc = clockid_to_kclock(which_clock); + timer_t req_id = TIMER_ANY_ID; struct k_itimer *new_timer; int error, new_timer_id; @@ -404,26 +482,32 @@ static int 
do_timer_create(clockid_t which_clock, struct sigevent *event, spin_lock_init(&new_timer->it_lock); + /* Special case for CRIU to restore timers with a given timer ID. */ + if (unlikely(current->signal->timer_create_restore_ids)) { + if (copy_from_user(&req_id, created_timer_id, sizeof(req_id))) + return -EFAULT; + /* Valid IDs are 0..INT_MAX */ + if ((unsigned int)req_id > INT_MAX) + return -EINVAL; + } + /* * Add the timer to the hash table. The timer is not yet valid - * because new_timer::it_signal is still NULL. The timer id is also - * not yet visible to user space. + * after insertion, but has a unique ID allocated. */ - new_timer_id = posix_timer_add(new_timer); + new_timer_id = posix_timer_add(new_timer, req_id); if (new_timer_id < 0) { posixtimer_free_timer(new_timer); return new_timer_id; } - new_timer->it_id = (timer_t) new_timer_id; new_timer->it_clock = which_clock; new_timer->kclock = kc; new_timer->it_overrun = -1LL; if (event) { - rcu_read_lock(); - new_timer->it_pid = get_pid(good_sigevent(event)); - rcu_read_unlock(); + scoped_guard (rcu) + new_timer->it_pid = get_pid(good_sigevent(event)); if (!new_timer->it_pid) { error = -EINVAL; goto out; @@ -434,7 +518,6 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event, } else { new_timer->it_sigev_notify = SIGEV_SIGNAL; new_timer->sigq.info.si_signo = SIGALRM; - memset(&new_timer->sigq.info.si_value, 0, sizeof(sigval_t)); new_timer->sigq.info.si_value.sival_int = new_timer->it_id; new_timer->it_pid = get_pid(task_tgid(current)); } @@ -453,7 +536,7 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event, } /* * After succesful copy out, the timer ID is visible to user space - * now but not yet valid because new_timer::signal is still NULL. + * now but not yet valid because new_timer::signal low order bit is 1. * * Complete the initialization with the clock specific create * callback. 
@@ -462,14 +545,25 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event, if (error) goto out; - spin_lock_irq(¤t->sighand->siglock); - /* This makes the timer valid in the hash table */ - WRITE_ONCE(new_timer->it_signal, current->signal); - hlist_add_head(&new_timer->list, ¤t->signal->posix_timers); - spin_unlock_irq(¤t->sighand->siglock); /* - * After unlocking sighand::siglock @new_timer is subject to - * concurrent removal and cannot be touched anymore + * timer::it_lock ensures that __lock_timer() observes a fully + * initialized timer when it observes a valid timer::it_signal. + * + * sighand::siglock is required to protect signal::posix_timers. + */ + scoped_guard (spinlock_irq, &new_timer->it_lock) { + guard(spinlock)(¤t->sighand->siglock); + /* + * new_timer::it_signal contains the signal pointer with + * bit 0 set, which makes it invalid for syscall operations. + * Store the unmodified signal pointer to make it valid. + */ + WRITE_ONCE(new_timer->it_signal, current->signal); + hlist_add_head_rcu(&new_timer->list, ¤t->signal->posix_timers); + } + /* + * After unlocking @new_timer is subject to concurrent removal and + * cannot be touched anymore */ return 0; out: @@ -507,7 +601,7 @@ COMPAT_SYSCALL_DEFINE3(timer_create, clockid_t, which_clock, } #endif -static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) +static struct k_itimer *__lock_timer(timer_t timer_id) { struct k_itimer *timr; @@ -522,11 +616,11 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) * The hash lookup and the timers are RCU protected. * * Timers are added to the hash in invalid state where - * timr::it_signal == NULL. timer::it_signal is only set after the - * rest of the initialization succeeded. + * timr::it_signal is marked invalid. timer::it_signal is only set + * after the rest of the initialization succeeded. 
* * Timer destruction happens in steps: - * 1) Set timr::it_signal to NULL with timr::it_lock held + * 1) Set timr::it_signal marked invalid with timr::it_lock held * 2) Release timr::it_lock * 3) Remove from the hash under hash_lock * 4) Put the reference count. @@ -543,25 +637,21 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) * * The lookup validates locklessly that timr::it_signal == * current::it_signal and timr::it_id == @timer_id. timr::it_id - * can't change, but timr::it_signal becomes NULL during - * destruction. + * can't change, but timr::it_signal can become invalid during + * destruction, which makes the locked check fail. */ - rcu_read_lock(); + guard(rcu)(); timr = posix_timer_by_id(timer_id); if (timr) { - spin_lock_irqsave(&timr->it_lock, *flags); + spin_lock_irq(&timr->it_lock); /* * Validate under timr::it_lock that timr::it_signal is * still valid. Pairs with #1 above. */ - if (timr->it_signal == current->signal) { - rcu_read_unlock(); + if (timr->it_signal == current->signal) return timr; - } - spin_unlock_irqrestore(&timr->it_lock, *flags); + spin_unlock_irq(&timr->it_lock); } - rcu_read_unlock(); - return NULL; } @@ -652,24 +742,10 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) static int do_timer_gettime(timer_t timer_id, struct itimerspec64 *setting) { - const struct k_clock *kc; - struct k_itimer *timr; - unsigned long flags; - int ret = 0; - - timr = lock_timer(timer_id, &flags); - if (!timr) - return -EINVAL; - memset(setting, 0, sizeof(*setting)); - kc = timr->kclock; - if (WARN_ON_ONCE(!kc || !kc->timer_get)) - ret = -EINVAL; - else - kc->timer_get(timr, setting); - - unlock_timer(timr, flags); - return ret; + scoped_timer_get_or_fail(timer_id) + scoped_timer->kclock->timer_get(scoped_timer, setting); + return 0; } /* Get the time remaining on a POSIX.1b interval timer. 
*/ @@ -723,18 +799,8 @@ SYSCALL_DEFINE2(timer_gettime32, timer_t, timer_id, */ SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id) { - struct k_itimer *timr; - unsigned long flags; - int overrun; - - timr = lock_timer(timer_id, &flags); - if (!timr) - return -EINVAL; - - overrun = timer_overrun_to_int(timr); - unlock_timer(timr, flags); - - return overrun; + scoped_timer_get_or_fail(timer_id) + return timer_overrun_to_int(scoped_timer); } static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires, @@ -747,7 +813,7 @@ static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires, /* * Posix magic: Relative CLOCK_REALTIME timers are not affected by * clock modifications, so they become CLOCK_MONOTONIC based under the - * hood. See hrtimer_init(). Update timr->kclock, so the generic + * hood. See hrtimer_setup(). Update timr->kclock, so the generic * functions which use timr->kclock->clock_get_*() work. * * Note: it_clock stays unmodified, because the next timer_set() might @@ -756,8 +822,7 @@ static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires, if (timr->it_clock == CLOCK_REALTIME) timr->kclock = absolute ? &clock_realtime : &clock_monotonic; - hrtimer_init(&timr->it.real.timer, timr->it_clock, mode); - timr->it.real.timer.function = posix_timer_fn; + hrtimer_setup(&timr->it.real.timer, posix_timer_fn, timr->it_clock, mode); if (!absolute) expires = ktime_add_safe(expires, timer->base->get_time()); @@ -791,26 +856,13 @@ static void common_timer_wait_running(struct k_itimer *timer) * when the task which tries to delete or disarm the timer has preempted * the task which runs the expiry in task work context. 
*/ -static struct k_itimer *timer_wait_running(struct k_itimer *timer, - unsigned long *flags) +static void timer_wait_running(struct k_itimer *timer) { - const struct k_clock *kc = READ_ONCE(timer->kclock); - timer_t timer_id = READ_ONCE(timer->it_id); - - /* Prevent kfree(timer) after dropping the lock */ - rcu_read_lock(); - unlock_timer(timer, *flags); - /* * kc->timer_wait_running() might drop RCU lock. So @timer * cannot be touched anymore after the function returns! */ - if (!WARN_ON_ONCE(!kc->timer_wait_running)) - kc->timer_wait_running(timer); - - rcu_read_unlock(); - /* Relock the timer. It might be not longer hashed. */ - return lock_timer(timer_id, flags); + timer->kclock->timer_wait_running(timer); } /* @@ -865,15 +917,9 @@ int common_timer_set(struct k_itimer *timr, int flags, return 0; } -static int do_timer_settime(timer_t timer_id, int tmr_flags, - struct itimerspec64 *new_spec64, +static int do_timer_settime(timer_t timer_id, int tmr_flags, struct itimerspec64 *new_spec64, struct itimerspec64 *old_spec64) { - const struct k_clock *kc; - struct k_itimer *timr; - unsigned long flags; - int error; - if (!timespec64_valid(&new_spec64->it_interval) || !timespec64_valid(&new_spec64->it_value)) return -EINVAL; @@ -881,33 +927,28 @@ static int do_timer_settime(timer_t timer_id, int tmr_flags, if (old_spec64) memset(old_spec64, 0, sizeof(*old_spec64)); - timr = lock_timer(timer_id, &flags); -retry: - if (!timr) - return -EINVAL; + for (; ; old_spec64 = NULL) { + struct k_itimer *timr; - if (old_spec64) - old_spec64->it_interval = ktime_to_timespec64(timr->it_interval); + scoped_timer_get_or_fail(timer_id) { + timr = scoped_timer; - /* Prevent signal delivery and rearming. 
*/ - timr->it_signal_seq++; + if (old_spec64) + old_spec64->it_interval = ktime_to_timespec64(timr->it_interval); - kc = timr->kclock; - if (WARN_ON_ONCE(!kc || !kc->timer_set)) - error = -EINVAL; - else - error = kc->timer_set(timr, tmr_flags, new_spec64, old_spec64); - - if (error == TIMER_RETRY) { - // We already got the old time... - old_spec64 = NULL; - /* Unlocks and relocks the timer if it still exists */ - timr = timer_wait_running(timr, &flags); - goto retry; - } - unlock_timer(timr, flags); + /* Prevent signal delivery and rearming. */ + timr->it_signal_seq++; - return error; + int ret = timr->kclock->timer_set(timr, tmr_flags, new_spec64, old_spec64); + if (ret != TIMER_RETRY) + return ret; + + /* Protect the timer from being freed when leaving the lock scope */ + rcu_read_lock(); + } + timer_wait_running(timr); + rcu_read_unlock(); + } } /* Set a POSIX.1b interval timer */ @@ -978,110 +1019,58 @@ static inline void posix_timer_cleanup_ignored(struct k_itimer *tmr) } } -static inline int timer_delete_hook(struct k_itimer *timer) +static void posix_timer_delete(struct k_itimer *timer) { - const struct k_clock *kc = timer->kclock; - - /* Prevent signal delivery and rearming. */ + /* + * Invalidate the timer, remove it from the linked list and remove + * it from the ignored list if pending. + * + * The invalidation must be written with siglock held so that the + * signal code observes the invalidated timer::it_signal in + * do_sigaction(), which prevents it from moving a pending signal + * of a deleted timer to the ignore list. + * + * The invalidation also prevents signal queueing, signal delivery + * and therefore rearming from the signal delivery path. + * + * A concurrent lookup can still find the timer in the hash, but it + * will check timer::it_signal with timer::it_lock held and observe + * bit 0 set, which invalidates it. That also prevents the timer ID + * from being handed out before this timer is completely gone. 
+ */ timer->it_signal_seq++; - if (WARN_ON_ONCE(!kc || !kc->timer_del)) - return -EINVAL; - return kc->timer_del(timer); + scoped_guard (spinlock, ¤t->sighand->siglock) { + unsigned long sig = (unsigned long)timer->it_signal | 1UL; + + WRITE_ONCE(timer->it_signal, (struct signal_struct *)sig); + hlist_del_rcu(&timer->list); + posix_timer_cleanup_ignored(timer); + } + + while (timer->kclock->timer_del(timer) == TIMER_RETRY) { + guard(rcu)(); + spin_unlock_irq(&timer->it_lock); + timer_wait_running(timer); + spin_lock_irq(&timer->it_lock); + } } /* Delete a POSIX.1b interval timer. */ SYSCALL_DEFINE1(timer_delete, timer_t, timer_id) { struct k_itimer *timer; - unsigned long flags; - timer = lock_timer(timer_id, &flags); - -retry_delete: - if (!timer) - return -EINVAL; - - if (unlikely(timer_delete_hook(timer) == TIMER_RETRY)) { - /* Unlocks and relocks the timer if it still exists */ - timer = timer_wait_running(timer, &flags); - goto retry_delete; + scoped_timer_get_or_fail(timer_id) { + timer = scoped_timer; + posix_timer_delete(timer); } - - spin_lock(¤t->sighand->siglock); - hlist_del(&timer->list); - posix_timer_cleanup_ignored(timer); - /* - * A concurrent lookup could check timer::it_signal lockless. It - * will reevaluate with timer::it_lock held and observe the NULL. - * - * It must be written with siglock held so that the signal code - * observes timer->it_signal == NULL in do_sigaction(SIG_IGN), - * which prevents it from moving a pending signal of a deleted - * timer to the ignore list. - */ - WRITE_ONCE(timer->it_signal, NULL); - spin_unlock(¤t->sighand->siglock); - - unlock_timer(timer, flags); + /* Remove it from the hash, which frees up the timer ID */ posix_timer_unhash_and_free(timer); return 0; } /* - * Delete a timer if it is armed, remove it from the hash and schedule it - * for RCU freeing. - */ -static void itimer_delete(struct k_itimer *timer) -{ - unsigned long flags; - - /* - * irqsave is required to make timer_wait_running() work. 
- */ - spin_lock_irqsave(&timer->it_lock, flags); - -retry_delete: - /* - * Even if the timer is not longer accessible from other tasks - * it still might be armed and queued in the underlying timer - * mechanism. Worse, that timer mechanism might run the expiry - * function concurrently. - */ - if (timer_delete_hook(timer) == TIMER_RETRY) { - /* - * Timer is expired concurrently, prevent livelocks - * and pointless spinning on RT. - * - * timer_wait_running() drops timer::it_lock, which opens - * the possibility for another task to delete the timer. - * - * That's not possible here because this is invoked from - * do_exit() only for the last thread of the thread group. - * So no other task can access and delete that timer. - */ - if (WARN_ON_ONCE(timer_wait_running(timer, &flags) != timer)) - return; - - goto retry_delete; - } - hlist_del(&timer->list); - - posix_timer_cleanup_ignored(timer); - - /* - * Setting timer::it_signal to NULL is technically not required - * here as nothing can access the timer anymore legitimately via - * the hash table. Set it to NULL nevertheless so that all deletion - * paths are consistent. - */ - WRITE_ONCE(timer->it_signal, NULL); - - spin_unlock_irqrestore(&timer->it_lock, flags); - posix_timer_unhash_and_free(timer); -} - -/* * Invoked from do_exit() when the last thread of a thread group exits. * At that point no other task can access the timers of the dying * task anymore. 
@@ -1089,18 +1078,26 @@ retry_delete: void exit_itimers(struct task_struct *tsk) { struct hlist_head timers; + struct hlist_node *next; + struct k_itimer *timer; + + /* Clear restore mode for exec() */ + tsk->signal->timer_create_restore_ids = 0; if (hlist_empty(&tsk->signal->posix_timers)) return; /* Protect against concurrent read via /proc/$PID/timers */ - spin_lock_irq(&tsk->sighand->siglock); - hlist_move_list(&tsk->signal->posix_timers, &timers); - spin_unlock_irq(&tsk->sighand->siglock); + scoped_guard (spinlock_irq, &tsk->sighand->siglock) + hlist_move_list(&tsk->signal->posix_timers, &timers); /* The timers are not longer accessible via tsk::signal */ - while (!hlist_empty(&timers)) - itimer_delete(hlist_entry(timers.first, struct k_itimer, list)); + hlist_for_each_entry_safe(timer, next, &timers, list) { + scoped_guard (spinlock_irq, &timer->it_lock) + posix_timer_delete(timer); + posix_timer_unhash_and_free(timer); + cond_resched(); + } /* * There should be no timers on the ignored list. 
itimer_delete() has @@ -1545,3 +1542,31 @@ static const struct k_clock *clockid_to_kclock(const clockid_t id) return posix_clocks[array_index_nospec(idx, ARRAY_SIZE(posix_clocks))]; } + +static int __init posixtimer_init(void) +{ + unsigned long i, size; + unsigned int shift; + + posix_timers_cache = kmem_cache_create("posix_timers_cache", + sizeof(struct k_itimer), + __alignof__(struct k_itimer), + SLAB_ACCOUNT, NULL); + + if (IS_ENABLED(CONFIG_BASE_SMALL)) + size = 512; + else + size = roundup_pow_of_two(512 * num_possible_cpus()); + + timer_buckets = alloc_large_system_hash("posixtimers", sizeof(*timer_buckets), + size, 0, 0, &shift, NULL, size, size); + size = 1UL << shift; + timer_hashmask = size - 1; + + for (i = 0; i < size; i++) { + spin_lock_init(&timer_buckets[i].lock); + INIT_HLIST_HEAD(&timer_buckets[i].head); + } + return 0; +} +core_initcall(posixtimer_init); diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index fcca4e72f1ef..cc15fe293719 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -263,8 +263,7 @@ void __init generic_sched_clock_init(void) * Start the timer to keep sched_clock() properly updated and * sets the initial epoch. 
*/ - hrtimer_init(&sched_clock_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); - sched_clock_timer.function = sched_clock_poll; + hrtimer_setup(&sched_clock_timer, sched_clock_poll, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL_HARD); } diff --git a/kernel/time/sleep_timeout.c b/kernel/time/sleep_timeout.c index dfe939f6e4ec..5aa38b2cf40a 100644 --- a/kernel/time/sleep_timeout.c +++ b/kernel/time/sleep_timeout.c @@ -97,10 +97,10 @@ signed long __sched schedule_timeout(signed long timeout) timer.timer.expires = expire; add_timer(&timer.timer); schedule(); - del_timer_sync(&timer.timer); + timer_delete_sync(&timer.timer); /* Remove the timer from the object tracker */ - destroy_timer_on_stack(&timer.timer); + timer_destroy_on_stack(&timer.timer); timeout = expire - jiffies; diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c index e28f9210f8a1..a88b72b0f35e 100644 --- a/kernel/time/tick-broadcast-hrtimer.c +++ b/kernel/time/tick-broadcast-hrtimer.c @@ -100,7 +100,6 @@ static enum hrtimer_restart bc_handler(struct hrtimer *t) void tick_setup_hrtimer_broadcast(void) { - hrtimer_init(&bctimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); - bctimer.function = bc_handler; + hrtimer_setup(&bctimer, bc_handler, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); clockevents_register_device(&ce_broadcast_hrtimer); } diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index a47bcf71defc..9a3859443c04 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -509,6 +509,7 @@ void tick_resume(void) #ifdef CONFIG_SUSPEND static DEFINE_RAW_SPINLOCK(tick_freeze_lock); +static DEFINE_WAIT_OVERRIDE_MAP(tick_freeze_map, LD_WAIT_SLEEP); static unsigned int tick_freeze_depth; /** @@ -528,9 +529,22 @@ void tick_freeze(void) if (tick_freeze_depth == num_online_cpus()) { trace_suspend_resume(TPS("timekeeping_freeze"), smp_processor_id(), true); + /* + * All other CPUs have their 
interrupts disabled and are + * suspended to idle. Other tasks have been frozen so there + * is no scheduling happening. This means that there is no + * concurrency in the system at this point. Therefore it is + * okay to acquire a sleeping lock on PREEMPT_RT, such as a + * spinlock, because the lock cannot be held by other CPUs + * or threads and acquiring it cannot block. + * + * Inform lockdep about the situation. + */ + lock_map_acquire_try(&tick_freeze_map); system_state = SYSTEM_SUSPEND; sched_clock_suspend(); timekeeping_suspend(); + lock_map_release(&tick_freeze_map); } else { tick_suspend_local(); } @@ -552,8 +566,16 @@ void tick_unfreeze(void) raw_spin_lock(&tick_freeze_lock); if (tick_freeze_depth == num_online_cpus()) { + /* + * Similar to tick_freeze(). On resumption the first CPU may + * acquire uncontended sleeping locks while other CPUs block on + * tick_freeze_lock. + */ + lock_map_acquire_try(&tick_freeze_map); timekeeping_resume(); sched_clock_resume(); + lock_map_release(&tick_freeze_map); + system_state = SYSTEM_RUNNING; trace_suspend_resume(TPS("timekeeping_freeze"), smp_processor_id(), false); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index fa058510af9c..c527b421c865 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -1573,12 +1573,10 @@ void tick_setup_sched_timer(bool hrtimer) struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); /* Emulate tick processing via per-CPU hrtimers: */ - hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); + hrtimer_setup(&ts->sched_timer, tick_nohz_handler, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); - if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS) && hrtimer) { + if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS) && hrtimer) tick_sched_flag_set(ts, TS_FLAG_HIGHRES); - ts->sched_timer.function = tick_nohz_handler; - } /* Get the next period (per-CPU) */ hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); diff --git a/kernel/time/timekeeping.c 
b/kernel/time/timekeeping.c index 1e67d076f195..a009c91f7b05 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -164,10 +164,34 @@ static inline struct timespec64 tk_xtime(const struct timekeeper *tk) return ts; } +static inline struct timespec64 tk_xtime_coarse(const struct timekeeper *tk) +{ + struct timespec64 ts; + + ts.tv_sec = tk->xtime_sec; + ts.tv_nsec = tk->coarse_nsec; + return ts; +} + +/* + * Update the nanoseconds part for the coarse time keepers. They can't rely + * on xtime_nsec because xtime_nsec could be adjusted by a small negative + * amount when the multiplication factor of the clock is adjusted, which + * could cause the coarse clocks to go slightly backwards. See + * timekeeping_apply_adjustment(). Thus we keep a separate copy for the coarse + * clockids which only is updated when the clock has been set or we have + * accumulated time. + */ +static inline void tk_update_coarse_nsecs(struct timekeeper *tk) +{ + tk->coarse_nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; +} + static void tk_set_xtime(struct timekeeper *tk, const struct timespec64 *ts) { tk->xtime_sec = ts->tv_sec; tk->tkr_mono.xtime_nsec = (u64)ts->tv_nsec << tk->tkr_mono.shift; + tk_update_coarse_nsecs(tk); } static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts) @@ -175,6 +199,7 @@ static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts) tk->xtime_sec += ts->tv_sec; tk->tkr_mono.xtime_nsec += (u64)ts->tv_nsec << tk->tkr_mono.shift; tk_normalize_xtime(tk); + tk_update_coarse_nsecs(tk); } static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec64 wtm) @@ -708,6 +733,7 @@ static void timekeeping_forward_now(struct timekeeper *tk) tk_normalize_xtime(tk); delta -= incr; } + tk_update_coarse_nsecs(tk); } /** @@ -804,8 +830,8 @@ EXPORT_SYMBOL_GPL(ktime_get_with_offset); ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs) { struct timekeeper *tk = &tk_core.timekeeper; - unsigned int seq; ktime_t 
base, *offset = offsets[offs]; + unsigned int seq; u64 nsecs; WARN_ON(timekeeping_suspended); @@ -813,7 +839,7 @@ ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs) do { seq = read_seqcount_begin(&tk_core.seq); base = ktime_add(tk->tkr_mono.base, *offset); - nsecs = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; + nsecs = tk->coarse_nsec; } while (read_seqcount_retry(&tk_core.seq, seq)); @@ -2161,7 +2187,7 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode) struct timekeeper *real_tk = &tk_core.timekeeper; unsigned int clock_set = 0; int shift = 0, maxshift; - u64 offset; + u64 offset, orig_offset; guard(raw_spinlock_irqsave)(&tk_core.lock); @@ -2172,7 +2198,7 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode) offset = clocksource_delta(tk_clock_read(&tk->tkr_mono), tk->tkr_mono.cycle_last, tk->tkr_mono.mask, tk->tkr_mono.clock->max_raw_delta); - + orig_offset = offset; /* Check if there's really nothing to do */ if (offset < real_tk->cycle_interval && mode == TK_ADV_TICK) return false; @@ -2205,6 +2231,14 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode) */ clock_set |= accumulate_nsecs_to_secs(tk); + /* + * To avoid inconsistencies caused adjtimex TK_ADV_FREQ calls + * making small negative adjustments to the base xtime_nsec + * value, only update the coarse clocks if we accumulated time + */ + if (orig_offset != offset) + tk_update_coarse_nsecs(tk); + timekeeping_update_from_shadow(&tk_core, clock_set); return !!clock_set; @@ -2248,7 +2282,7 @@ void ktime_get_coarse_real_ts64(struct timespec64 *ts) do { seq = read_seqcount_begin(&tk_core.seq); - *ts = tk_xtime(tk); + *ts = tk_xtime_coarse(tk); } while (read_seqcount_retry(&tk_core.seq, seq)); } EXPORT_SYMBOL(ktime_get_coarse_real_ts64); @@ -2271,7 +2305,7 @@ void ktime_get_coarse_real_ts64_mg(struct timespec64 *ts) do { seq = read_seqcount_begin(&tk_core.seq); - *ts = tk_xtime(tk); + *ts = tk_xtime_coarse(tk); offset = tk_core.timekeeper.offs_real; } while 
(read_seqcount_retry(&tk_core.seq, seq)); @@ -2350,12 +2384,12 @@ void ktime_get_coarse_ts64(struct timespec64 *ts) do { seq = read_seqcount_begin(&tk_core.seq); - now = tk_xtime(tk); + now = tk_xtime_coarse(tk); mono = tk->wall_to_monotonic; } while (read_seqcount_retry(&tk_core.seq, seq)); set_normalized_timespec64(ts, now.tv_sec + mono.tv_sec, - now.tv_nsec + mono.tv_nsec); + now.tv_nsec + mono.tv_nsec); } EXPORT_SYMBOL(ktime_get_coarse_ts64); diff --git a/kernel/time/timer.c b/kernel/time/timer.c index c8f776dc6ee0..553fa469d7cc 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -386,32 +386,6 @@ static unsigned long round_jiffies_common(unsigned long j, int cpu, } /** - * __round_jiffies - function to round jiffies to a full second - * @j: the time in (absolute) jiffies that should be rounded - * @cpu: the processor number on which the timeout will happen - * - * __round_jiffies() rounds an absolute time in the future (in jiffies) - * up or down to (approximately) full seconds. This is useful for timers - * for which the exact time they fire does not matter too much, as long as - * they fire approximately every X seconds. - * - * By rounding these timers to whole seconds, all such timers will fire - * at the same time, rather than at various times spread out. The goal - * of this is to have the CPU wake up less, which saves power. - * - * The exact rounding is skewed for each processor to avoid all - * processors firing at the exact same time, which could lead - * to lock contention or spurious cache line bouncing. - * - * The return value is the rounded version of the @j parameter. 
- */ -unsigned long __round_jiffies(unsigned long j, int cpu) -{ - return round_jiffies_common(j, cpu, false); -} -EXPORT_SYMBOL_GPL(__round_jiffies); - -/** * __round_jiffies_relative - function to round jiffies to a full second * @j: the time in (relative) jiffies that should be rounded * @cpu: the processor number on which the timeout will happen @@ -483,22 +457,6 @@ unsigned long round_jiffies_relative(unsigned long j) EXPORT_SYMBOL_GPL(round_jiffies_relative); /** - * __round_jiffies_up - function to round jiffies up to a full second - * @j: the time in (absolute) jiffies that should be rounded - * @cpu: the processor number on which the timeout will happen - * - * This is the same as __round_jiffies() except that it will never - * round down. This is useful for timeouts for which the exact time - * of firing does not matter too much, as long as they don't fire too - * early. - */ -unsigned long __round_jiffies_up(unsigned long j, int cpu) -{ - return round_jiffies_common(j, cpu, true); -} -EXPORT_SYMBOL_GPL(__round_jiffies_up); - -/** * __round_jiffies_up_relative - function to round jiffies up to a full second * @j: the time in (relative) jiffies that should be rounded * @cpu: the processor number on which the timeout will happen @@ -744,7 +702,7 @@ static bool timer_fixup_init(void *addr, enum debug_obj_state state) switch (state) { case ODEBUG_STATE_ACTIVE: - del_timer_sync(timer); + timer_delete_sync(timer); debug_object_init(timer, &timer_debug_descr); return true; default: @@ -790,7 +748,7 @@ static bool timer_fixup_free(void *addr, enum debug_obj_state state) switch (state) { case ODEBUG_STATE_ACTIVE: - del_timer_sync(timer); + timer_delete_sync(timer); debug_object_free(timer, &timer_debug_descr); return true; default: @@ -850,7 +808,7 @@ static void do_init_timer(struct timer_list *timer, unsigned int flags, const char *name, struct lock_class_key *key); -void init_timer_on_stack_key(struct timer_list *timer, +void timer_init_key_on_stack(struct 
timer_list *timer, void (*func)(struct timer_list *), unsigned int flags, const char *name, struct lock_class_key *key) @@ -858,13 +816,13 @@ void init_timer_on_stack_key(struct timer_list *timer, debug_object_init_on_stack(timer, &timer_debug_descr); do_init_timer(timer, func, flags, name, key); } -EXPORT_SYMBOL_GPL(init_timer_on_stack_key); +EXPORT_SYMBOL_GPL(timer_init_key_on_stack); -void destroy_timer_on_stack(struct timer_list *timer) +void timer_destroy_on_stack(struct timer_list *timer) { debug_object_free(timer, &timer_debug_descr); } -EXPORT_SYMBOL_GPL(destroy_timer_on_stack); +EXPORT_SYMBOL_GPL(timer_destroy_on_stack); #else static inline void debug_timer_init(struct timer_list *timer) { } @@ -904,7 +862,7 @@ static void do_init_timer(struct timer_list *timer, } /** - * init_timer_key - initialize a timer + * timer_init_key - initialize a timer * @timer: the timer to be initialized * @func: timer callback function * @flags: timer flags @@ -912,17 +870,17 @@ static void do_init_timer(struct timer_list *timer, * @key: lockdep class key of the fake lock used for tracking timer * sync lock dependencies * - * init_timer_key() must be done to a timer prior to calling *any* of the + * timer_init_key() must be done to a timer prior to calling *any* of the * other timer functions. 
*/ -void init_timer_key(struct timer_list *timer, +void timer_init_key(struct timer_list *timer, void (*func)(struct timer_list *), unsigned int flags, const char *name, struct lock_class_key *key) { debug_init(timer); do_init_timer(timer, func, flags, name, key); } -EXPORT_SYMBOL(init_timer_key); +EXPORT_SYMBOL(timer_init_key); static inline void detach_timer(struct timer_list *timer, bool clear_pending) { @@ -1212,10 +1170,10 @@ EXPORT_SYMBOL(mod_timer_pending); * * mod_timer(timer, expires) is equivalent to: * - * del_timer(timer); timer->expires = expires; add_timer(timer); + * timer_delete(timer); timer->expires = expires; add_timer(timer); * * mod_timer() is more efficient than the above open coded sequence. In - * case that the timer is inactive, the del_timer() part is a NOP. The + * case that the timer is inactive, the timer_delete() part is a NOP. The * timer is in any case activated with the new expiry time @expires. * * Note that if there are multiple unserialized concurrent users of the @@ -1511,7 +1469,7 @@ static int __try_to_del_timer_sync(struct timer_list *timer, bool shutdown) } /** - * try_to_del_timer_sync - Try to deactivate a timer + * timer_delete_sync_try - Try to deactivate a timer * @timer: Timer to deactivate * * This function tries to deactivate a timer. 
On success the timer is not @@ -1526,11 +1484,11 @@ static int __try_to_del_timer_sync(struct timer_list *timer, bool shutdown) * * %1 - The timer was pending and deactivated * * %-1 - The timer callback function is running on a different CPU */ -int try_to_del_timer_sync(struct timer_list *timer) +int timer_delete_sync_try(struct timer_list *timer) { return __try_to_del_timer_sync(timer, false); } -EXPORT_SYMBOL(try_to_del_timer_sync); +EXPORT_SYMBOL(timer_delete_sync_try); #ifdef CONFIG_PREEMPT_RT static __init void timer_base_init_expiry_lock(struct timer_base *base) @@ -1900,7 +1858,7 @@ static void timer_recalc_next_expiry(struct timer_base *base) unsigned long clk, next, adj; unsigned lvl, offset = 0; - next = base->clk + NEXT_TIMER_MAX_DELTA; + next = base->clk + TIMER_NEXT_MAX_DELTA; clk = base->clk; for (lvl = 0; lvl < LVL_DEPTH; lvl++, offset += LVL_SIZE) { int pos = next_pending_bucket(base, offset, clk & LVL_MASK); @@ -1963,7 +1921,7 @@ static void timer_recalc_next_expiry(struct timer_base *base) WRITE_ONCE(base->next_expiry, next); base->next_expiry_recalc = false; - base->timers_pending = !(next == base->clk + NEXT_TIMER_MAX_DELTA); + base->timers_pending = !(next == base->clk + TIMER_NEXT_MAX_DELTA); } #ifdef CONFIG_NO_HZ_COMMON @@ -2015,7 +1973,7 @@ static unsigned long next_timer_interrupt(struct timer_base *base, * easy comparable to find out which base holds the first pending timer. */ if (!base->timers_pending) - WRITE_ONCE(base->next_expiry, basej + NEXT_TIMER_MAX_DELTA); + WRITE_ONCE(base->next_expiry, basej + TIMER_NEXT_MAX_DELTA); return base->next_expiry; } @@ -2399,7 +2357,7 @@ static inline void __run_timers(struct timer_base *base) * timer at this clk are that all matching timers have been * dequeued or no timer has been queued since * base::next_expiry was set to base::clk + - * NEXT_TIMER_MAX_DELTA. + * TIMER_NEXT_MAX_DELTA. 
*/ WARN_ON_ONCE(!levels && !base->next_expiry_recalc && base->timers_pending); @@ -2544,7 +2502,7 @@ int timers_prepare_cpu(unsigned int cpu) for (b = 0; b < NR_BASES; b++) { base = per_cpu_ptr(&timer_bases[b], cpu); base->clk = jiffies; - base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA; + base->next_expiry = base->clk + TIMER_NEXT_MAX_DELTA; base->next_expiry_recalc = false; base->timers_pending = false; base->is_idle = false; @@ -2599,7 +2557,7 @@ static void __init init_timer_cpu(int cpu) base->cpu = cpu; raw_spin_lock_init(&base->lock); base->clk = jiffies; - base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA; + base->next_expiry = base->clk + TIMER_NEXT_MAX_DELTA; timer_base_init_expiry_lock(base); } } @@ -2612,7 +2570,7 @@ static void __init init_timer_cpus(void) init_timer_cpu(cpu); } -void __init init_timers(void) +void __init timers_init(void) { init_timer_cpus(); posix_cputimers_init_work(); diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 1c311c46da50..b03d0ada6469 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -46,7 +46,7 @@ static void print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer, int idx, u64 now) { - SEQ_printf(m, " #%d: <%pK>, %ps", idx, taddr, timer->function); + SEQ_printf(m, " #%d: <%p>, %ps", idx, taddr, ACCESS_PRIVATE(timer, function)); SEQ_printf(m, ", S:%02x", timer->state); SEQ_printf(m, "\n"); SEQ_printf(m, " # expires at %Lu-%Lu nsecs [in %Ld to %Ld nsecs]\n", @@ -98,7 +98,7 @@ next_one: static void print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) { - SEQ_printf(m, " .base: %pK\n", base); + SEQ_printf(m, " .base: %p\n", base); SEQ_printf(m, " .index: %d\n", base->index); SEQ_printf(m, " .resolution: %u nsecs\n", hrtimer_resolution); diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c index 05d383143165..32ef27c71b57 100644 --- a/kernel/time/vsyscall.c +++ b/kernel/time/vsyscall.c @@ -15,29 +15,29 @@ #include 
"timekeeping_internal.h" -static inline void update_vdso_data(struct vdso_data *vdata, - struct timekeeper *tk) +static inline void update_vdso_time_data(struct vdso_time_data *vdata, struct timekeeper *tk) { + struct vdso_clock *vc = vdata->clock_data; struct vdso_timestamp *vdso_ts; u64 nsec, sec; - vdata[CS_HRES_COARSE].cycle_last = tk->tkr_mono.cycle_last; + vc[CS_HRES_COARSE].cycle_last = tk->tkr_mono.cycle_last; #ifdef CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT - vdata[CS_HRES_COARSE].max_cycles = tk->tkr_mono.clock->max_cycles; + vc[CS_HRES_COARSE].max_cycles = tk->tkr_mono.clock->max_cycles; #endif - vdata[CS_HRES_COARSE].mask = tk->tkr_mono.mask; - vdata[CS_HRES_COARSE].mult = tk->tkr_mono.mult; - vdata[CS_HRES_COARSE].shift = tk->tkr_mono.shift; - vdata[CS_RAW].cycle_last = tk->tkr_raw.cycle_last; + vc[CS_HRES_COARSE].mask = tk->tkr_mono.mask; + vc[CS_HRES_COARSE].mult = tk->tkr_mono.mult; + vc[CS_HRES_COARSE].shift = tk->tkr_mono.shift; + vc[CS_RAW].cycle_last = tk->tkr_raw.cycle_last; #ifdef CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT - vdata[CS_RAW].max_cycles = tk->tkr_raw.clock->max_cycles; + vc[CS_RAW].max_cycles = tk->tkr_raw.clock->max_cycles; #endif - vdata[CS_RAW].mask = tk->tkr_raw.mask; - vdata[CS_RAW].mult = tk->tkr_raw.mult; - vdata[CS_RAW].shift = tk->tkr_raw.shift; + vc[CS_RAW].mask = tk->tkr_raw.mask; + vc[CS_RAW].mult = tk->tkr_raw.mult; + vc[CS_RAW].shift = tk->tkr_raw.shift; /* CLOCK_MONOTONIC */ - vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC]; + vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC]; vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; nsec = tk->tkr_mono.xtime_nsec; @@ -55,7 +55,7 @@ static inline void update_vdso_data(struct vdso_data *vdata, nsec += (u64)tk->monotonic_to_boot.tv_nsec << tk->tkr_mono.shift; /* CLOCK_BOOTTIME */ - vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_BOOTTIME]; + vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_BOOTTIME]; vdso_ts->sec = sec; while (nsec >= (((u64)NSEC_PER_SEC) << 
tk->tkr_mono.shift)) { @@ -65,19 +65,20 @@ static inline void update_vdso_data(struct vdso_data *vdata, vdso_ts->nsec = nsec; /* CLOCK_MONOTONIC_RAW */ - vdso_ts = &vdata[CS_RAW].basetime[CLOCK_MONOTONIC_RAW]; + vdso_ts = &vc[CS_RAW].basetime[CLOCK_MONOTONIC_RAW]; vdso_ts->sec = tk->raw_sec; vdso_ts->nsec = tk->tkr_raw.xtime_nsec; /* CLOCK_TAI */ - vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_TAI]; + vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_TAI]; vdso_ts->sec = tk->xtime_sec + (s64)tk->tai_offset; vdso_ts->nsec = tk->tkr_mono.xtime_nsec; } void update_vsyscall(struct timekeeper *tk) { - struct vdso_data *vdata = __arch_get_k_vdso_data(); + struct vdso_time_data *vdata = vdso_k_time_data; + struct vdso_clock *vc = vdata->clock_data; struct vdso_timestamp *vdso_ts; s32 clock_mode; u64 nsec; @@ -86,54 +87,53 @@ void update_vsyscall(struct timekeeper *tk) vdso_write_begin(vdata); clock_mode = tk->tkr_mono.clock->vdso_clock_mode; - vdata[CS_HRES_COARSE].clock_mode = clock_mode; - vdata[CS_RAW].clock_mode = clock_mode; + vc[CS_HRES_COARSE].clock_mode = clock_mode; + vc[CS_RAW].clock_mode = clock_mode; /* CLOCK_REALTIME also required for time() */ - vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_REALTIME]; + vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_REALTIME]; vdso_ts->sec = tk->xtime_sec; vdso_ts->nsec = tk->tkr_mono.xtime_nsec; /* CLOCK_REALTIME_COARSE */ - vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_REALTIME_COARSE]; + vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_REALTIME_COARSE]; vdso_ts->sec = tk->xtime_sec; - vdso_ts->nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; + vdso_ts->nsec = tk->coarse_nsec; /* CLOCK_MONOTONIC_COARSE */ - vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC_COARSE]; + vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC_COARSE]; vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; - nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; + nsec = tk->coarse_nsec; nsec = nsec + tk->wall_to_monotonic.tv_nsec; 
vdso_ts->sec += __iter_div_u64_rem(nsec, NSEC_PER_SEC, &vdso_ts->nsec); /* * Read without the seqlock held by clock_getres(). - * Note: No need to have a second copy. */ - WRITE_ONCE(vdata[CS_HRES_COARSE].hrtimer_res, hrtimer_resolution); + WRITE_ONCE(vdata->hrtimer_res, hrtimer_resolution); /* * If the current clocksource is not VDSO capable, then spare the * update of the high resolution parts. */ if (clock_mode != VDSO_CLOCKMODE_NONE) - update_vdso_data(vdata, tk); + update_vdso_time_data(vdata, tk); __arch_update_vsyscall(vdata); vdso_write_end(vdata); - __arch_sync_vdso_data(vdata); + __arch_sync_vdso_time_data(vdata); } void update_vsyscall_tz(void) { - struct vdso_data *vdata = __arch_get_k_vdso_data(); + struct vdso_time_data *vdata = vdso_k_time_data; - vdata[CS_HRES_COARSE].tz_minuteswest = sys_tz.tz_minuteswest; - vdata[CS_HRES_COARSE].tz_dsttime = sys_tz.tz_dsttime; + vdata->tz_minuteswest = sys_tz.tz_minuteswest; + vdata->tz_dsttime = sys_tz.tz_dsttime; - __arch_sync_vdso_data(vdata); + __arch_sync_vdso_time_data(vdata); } /** @@ -150,7 +150,7 @@ void update_vsyscall_tz(void) */ unsigned long vdso_update_begin(void) { - struct vdso_data *vdata = __arch_get_k_vdso_data(); + struct vdso_time_data *vdata = vdso_k_time_data; unsigned long flags = timekeeper_lock_irqsave(); vdso_write_begin(vdata); @@ -167,9 +167,9 @@ unsigned long vdso_update_begin(void) */ void vdso_update_end(unsigned long flags) { - struct vdso_data *vdata = __arch_get_k_vdso_data(); + struct vdso_time_data *vdata = vdso_k_time_data; vdso_write_end(vdata); - __arch_sync_vdso_data(vdata); + __arch_sync_vdso_time_data(vdata); timekeeper_unlock_irqrestore(flags); } diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index d570b8b9c0a9..a3f35c7d83b6 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -263,6 +263,17 @@ config FUNCTION_GRAPH_RETADDR the function is called. This feature is off by default, and you can enable it via the trace option funcgraph-retaddr. 
+config FUNCTION_TRACE_ARGS + bool + depends on PROBE_EVENTS_BTF_ARGS + default y + help + If supported with function argument access API and BTF, then + the function tracer and function graph tracer will support printing + of function arguments. This feature is off by default, and can be + enabled via the trace option func-args (for the function tracer) and + funcgraph-args (for the function graph tracer) + config DYNAMIC_FTRACE bool "enable/disable function tracing dynamically" depends on FUNCTION_TRACER diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 3679a6d18934..3f6a7bdc6edf 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -893,11 +893,6 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, rcu_read_unlock(); } -static void blk_add_trace_bio_bounce(void *ignore, struct bio *bio) -{ - blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_BOUNCE, 0); -} - static void blk_add_trace_bio_complete(void *ignore, struct request_queue *q, struct bio *bio) { @@ -1089,8 +1084,6 @@ static void blk_register_tracepoints(void) WARN_ON(ret); ret = register_trace_block_rq_complete(blk_add_trace_rq_complete, NULL); WARN_ON(ret); - ret = register_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL); - WARN_ON(ret); ret = register_trace_block_bio_complete(blk_add_trace_bio_complete, NULL); WARN_ON(ret); ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge, NULL); @@ -1125,7 +1118,6 @@ static void blk_unregister_tracepoints(void) unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL); unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge, NULL); unregister_trace_block_bio_complete(blk_add_trace_bio_complete, NULL); - unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL); unregister_trace_block_rq_complete(blk_add_trace_rq_complete, NULL); unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL); 
unregister_trace_block_rq_merge(blk_add_trace_rq_merge, NULL); @@ -1462,7 +1454,6 @@ static const struct { [__BLK_TA_UNPLUG_TIMER] = {{ "UT", "unplug_timer" }, blk_log_unplug }, [__BLK_TA_INSERT] = {{ "I", "insert" }, blk_log_generic }, [__BLK_TA_SPLIT] = {{ "X", "split" }, blk_log_split }, - [__BLK_TA_BOUNCE] = {{ "B", "bounce" }, blk_log_generic }, [__BLK_TA_REMAP] = {{ "A", "remap" }, blk_log_remap }, }; @@ -1896,6 +1887,8 @@ void blk_fill_rwbs(char *rwbs, blk_opf_t opf) rwbs[i++] = 'S'; if (opf & REQ_META) rwbs[i++] = 'M'; + if (opf & REQ_ATOMIC) + rwbs[i++] = 'U'; rwbs[i] = '\0'; } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 997fb2a47c92..132c8be6f635 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -392,7 +392,7 @@ static const struct bpf_func_proto bpf_trace_printk_proto = { .arg2_type = ARG_CONST_SIZE, }; -static void __set_printk_clr_event(void) +static void __set_printk_clr_event(struct work_struct *work) { /* * This program might be calling bpf_trace_printk, @@ -405,10 +405,11 @@ static void __set_printk_clr_event(void) if (trace_set_clr_event("bpf_trace", "bpf_trace_printk", 1)) pr_warn_ratelimited("could not enable bpf_trace_printk events"); } +static DECLARE_WORK(set_printk_work, __set_printk_clr_event); const struct bpf_func_proto *bpf_get_trace_printk_proto(void) { - __set_printk_clr_event(); + schedule_work(&set_printk_work); return &bpf_trace_printk_proto; } @@ -451,7 +452,7 @@ static const struct bpf_func_proto bpf_trace_vprintk_proto = { const struct bpf_func_proto *bpf_get_trace_vprintk_proto(void) { - __set_printk_clr_event(); + schedule_work(&set_printk_work); return &bpf_trace_vprintk_proto; } @@ -571,7 +572,7 @@ BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags) return value; } -static const struct bpf_func_proto bpf_perf_event_read_proto = { +const struct bpf_func_proto bpf_perf_event_read_proto = { .func = bpf_perf_event_read, .gpl_only = true, .ret_type = RET_INTEGER, @@ 
-606,6 +607,11 @@ static const struct bpf_func_proto bpf_perf_event_read_value_proto = { .arg4_type = ARG_CONST_SIZE, }; +const struct bpf_func_proto *bpf_get_perf_event_read_value_proto(void) +{ + return &bpf_perf_event_read_value_proto; +} + static __always_inline u64 __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, u64 flags, struct perf_raw_record *raw, @@ -843,7 +849,7 @@ static int bpf_send_signal_common(u32 sig, enum pid_type type, struct task_struc if (unlikely(is_global_init(task))) return -EPERM; - if (!preemptible()) { + if (preempt_count() != 0 || irqs_disabled()) { /* Do an early check on signal validity. Otherwise, * the error is lost in deferred irq_work. */ @@ -876,7 +882,7 @@ BPF_CALL_1(bpf_send_signal, u32, sig) return bpf_send_signal_common(sig, PIDTYPE_TGID, NULL, 0); } -static const struct bpf_func_proto bpf_send_signal_proto = { +const struct bpf_func_proto bpf_send_signal_proto = { .func = bpf_send_signal, .gpl_only = false, .ret_type = RET_INTEGER, @@ -888,7 +894,7 @@ BPF_CALL_1(bpf_send_signal_thread, u32, sig) return bpf_send_signal_common(sig, PIDTYPE_PID, NULL, 0); } -static const struct bpf_func_proto bpf_send_signal_thread_proto = { +const struct bpf_func_proto bpf_send_signal_thread_proto = { .func = bpf_send_signal_thread, .gpl_only = false, .ret_type = RET_INTEGER, @@ -1179,7 +1185,7 @@ BPF_CALL_3(bpf_get_branch_snapshot, void *, buf, u32, size, u64, flags) return entry_cnt * br_entry_size; } -static const struct bpf_func_proto bpf_get_branch_snapshot_proto = { +const struct bpf_func_proto bpf_get_branch_snapshot_proto = { .func = bpf_get_branch_snapshot, .gpl_only = true, .ret_type = RET_INTEGER, @@ -1424,56 +1430,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) const struct bpf_func_proto *func_proto; switch (func_id) { - case BPF_FUNC_map_lookup_elem: - return &bpf_map_lookup_elem_proto; - case BPF_FUNC_map_update_elem: - return &bpf_map_update_elem_proto; - case 
BPF_FUNC_map_delete_elem: - return &bpf_map_delete_elem_proto; - case BPF_FUNC_map_push_elem: - return &bpf_map_push_elem_proto; - case BPF_FUNC_map_pop_elem: - return &bpf_map_pop_elem_proto; - case BPF_FUNC_map_peek_elem: - return &bpf_map_peek_elem_proto; - case BPF_FUNC_map_lookup_percpu_elem: - return &bpf_map_lookup_percpu_elem_proto; - case BPF_FUNC_ktime_get_ns: - return &bpf_ktime_get_ns_proto; - case BPF_FUNC_ktime_get_boot_ns: - return &bpf_ktime_get_boot_ns_proto; - case BPF_FUNC_tail_call: - return &bpf_tail_call_proto; - case BPF_FUNC_get_current_task: - return &bpf_get_current_task_proto; - case BPF_FUNC_get_current_task_btf: - return &bpf_get_current_task_btf_proto; - case BPF_FUNC_task_pt_regs: - return &bpf_task_pt_regs_proto; - case BPF_FUNC_get_current_uid_gid: - return &bpf_get_current_uid_gid_proto; - case BPF_FUNC_get_current_comm: - return &bpf_get_current_comm_proto; - case BPF_FUNC_trace_printk: - return bpf_get_trace_printk_proto(); case BPF_FUNC_get_smp_processor_id: return &bpf_get_smp_processor_id_proto; - case BPF_FUNC_get_numa_node_id: - return &bpf_get_numa_node_id_proto; - case BPF_FUNC_perf_event_read: - return &bpf_perf_event_read_proto; - case BPF_FUNC_get_prandom_u32: - return &bpf_get_prandom_u32_proto; - case BPF_FUNC_probe_read_user: - return &bpf_probe_read_user_proto; - case BPF_FUNC_probe_read_kernel: - return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ? - NULL : &bpf_probe_read_kernel_proto; - case BPF_FUNC_probe_read_user_str: - return &bpf_probe_read_user_str_proto; - case BPF_FUNC_probe_read_kernel_str: - return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ? - NULL : &bpf_probe_read_kernel_str_proto; #ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE case BPF_FUNC_probe_read: return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ? @@ -1482,65 +1440,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ? 
NULL : &bpf_probe_read_compat_str_proto; #endif -#ifdef CONFIG_CGROUPS - case BPF_FUNC_cgrp_storage_get: - return &bpf_cgrp_storage_get_proto; - case BPF_FUNC_cgrp_storage_delete: - return &bpf_cgrp_storage_delete_proto; - case BPF_FUNC_current_task_under_cgroup: - return &bpf_current_task_under_cgroup_proto; -#endif - case BPF_FUNC_send_signal: - return &bpf_send_signal_proto; - case BPF_FUNC_send_signal_thread: - return &bpf_send_signal_thread_proto; - case BPF_FUNC_perf_event_read_value: - return &bpf_perf_event_read_value_proto; - case BPF_FUNC_ringbuf_output: - return &bpf_ringbuf_output_proto; - case BPF_FUNC_ringbuf_reserve: - return &bpf_ringbuf_reserve_proto; - case BPF_FUNC_ringbuf_submit: - return &bpf_ringbuf_submit_proto; - case BPF_FUNC_ringbuf_discard: - return &bpf_ringbuf_discard_proto; - case BPF_FUNC_ringbuf_query: - return &bpf_ringbuf_query_proto; - case BPF_FUNC_jiffies64: - return &bpf_jiffies64_proto; - case BPF_FUNC_get_task_stack: - return prog->sleepable ? &bpf_get_task_stack_sleepable_proto - : &bpf_get_task_stack_proto; - case BPF_FUNC_copy_from_user: - return &bpf_copy_from_user_proto; - case BPF_FUNC_copy_from_user_task: - return &bpf_copy_from_user_task_proto; - case BPF_FUNC_snprintf_btf: - return &bpf_snprintf_btf_proto; - case BPF_FUNC_per_cpu_ptr: - return &bpf_per_cpu_ptr_proto; - case BPF_FUNC_this_cpu_ptr: - return &bpf_this_cpu_ptr_proto; - case BPF_FUNC_task_storage_get: - if (bpf_prog_check_recur(prog)) - return &bpf_task_storage_get_recur_proto; - return &bpf_task_storage_get_proto; - case BPF_FUNC_task_storage_delete: - if (bpf_prog_check_recur(prog)) - return &bpf_task_storage_delete_recur_proto; - return &bpf_task_storage_delete_proto; - case BPF_FUNC_for_each_map_elem: - return &bpf_for_each_map_elem_proto; - case BPF_FUNC_snprintf: - return &bpf_snprintf_proto; case BPF_FUNC_get_func_ip: return &bpf_get_func_ip_proto_tracing; - case BPF_FUNC_get_branch_snapshot: - return &bpf_get_branch_snapshot_proto; - case 
BPF_FUNC_find_vma: - return &bpf_find_vma_proto; - case BPF_FUNC_trace_vprintk: - return bpf_get_trace_vprintk_proto(); default: break; } @@ -1852,7 +1753,7 @@ static struct pt_regs *get_bpf_raw_tp_regs(void) struct bpf_raw_tp_regs *tp_regs = this_cpu_ptr(&bpf_raw_tp_regs); int nest_level = this_cpu_inc_return(bpf_raw_tp_nest_level); - if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(tp_regs->regs))) { + if (nest_level > ARRAY_SIZE(tp_regs->regs)) { this_cpu_dec(bpf_raw_tp_nest_level); return ERR_PTR(-EBUSY); } @@ -2332,10 +2233,9 @@ void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp) { struct module *mod; - preempt_disable(); + guard(rcu)(); mod = __module_address((unsigned long)btp); module_put(mod); - preempt_enable(); } static __always_inline @@ -2919,18 +2819,21 @@ static int get_modules_for_addrs(struct module ***mods, unsigned long *addrs, u3 u32 i, err = 0; for (i = 0; i < addrs_cnt; i++) { + bool skip_add = false; struct module *mod; - preempt_disable(); - mod = __module_address(addrs[i]); - /* Either no module or we it's already stored */ - if (!mod || has_module(&arr, mod)) { - preempt_enable(); - continue; + scoped_guard(rcu) { + mod = __module_address(addrs[i]); + /* Either no module or it's already stored */ + if (!mod || has_module(&arr, mod)) { + skip_add = true; + break; /* scoped_guard */ + } + if (!try_module_get(mod)) + err = -EINVAL; } - if (!try_module_get(mod)) - err = -EINVAL; - preempt_enable(); + if (skip_add) + continue; if (err) break; err = add_module(&arr, mod); @@ -2979,6 +2882,9 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr if (sizeof(u64) != sizeof(void *)) return -EOPNOTSUPP; + if (attr->link_create.flags) + return -EINVAL; + if (!is_kprobe_multi(prog)) return -EINVAL; @@ -3368,6 +3274,9 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr if (sizeof(u64) != sizeof(void *)) return -EOPNOTSUPP; + if (attr->link_create.flags) + return -EINVAL; + if 
(!is_uprobe_multi(prog)) return -EINVAL; @@ -3409,7 +3318,9 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr } if (pid) { + rcu_read_lock(); task = get_pid_task(find_vpid(pid), PIDTYPE_TGID); + rcu_read_unlock(); if (!task) { err = -ESRCH; goto error_path_put; @@ -3557,6 +3468,146 @@ static int __init bpf_kprobe_multi_kfuncs_init(void) late_initcall(bpf_kprobe_multi_kfuncs_init); +typedef int (*copy_fn_t)(void *dst, const void *src, u32 size, struct task_struct *tsk); + +/* + * The __always_inline is to make sure the compiler doesn't + * generate indirect calls into callbacks, which is expensive, + * on some kernel configurations. This allows compiler to put + * direct calls into all the specific callback implementations + * (copy_user_data_sleepable, copy_user_data_nofault, and so on) + */ +static __always_inline int __bpf_dynptr_copy_str(struct bpf_dynptr *dptr, u32 doff, u32 size, + const void *unsafe_src, + copy_fn_t str_copy_fn, + struct task_struct *tsk) +{ + struct bpf_dynptr_kern *dst; + u32 chunk_sz, off; + void *dst_slice; + int cnt, err; + char buf[256]; + + dst_slice = bpf_dynptr_slice_rdwr(dptr, doff, NULL, size); + if (likely(dst_slice)) + return str_copy_fn(dst_slice, unsafe_src, size, tsk); + + dst = (struct bpf_dynptr_kern *)dptr; + if (bpf_dynptr_check_off_len(dst, doff, size)) + return -E2BIG; + + for (off = 0; off < size; off += chunk_sz - 1) { + chunk_sz = min_t(u32, sizeof(buf), size - off); + /* Expect str_copy_fn to return count of copied bytes, including + * zero terminator. Next iteration increment off by chunk_sz - 1 to + * overwrite NUL. 
+ */ + cnt = str_copy_fn(buf, unsafe_src + off, chunk_sz, tsk); + if (cnt < 0) + return cnt; + err = __bpf_dynptr_write(dst, doff + off, buf, cnt, 0); + if (err) + return err; + if (cnt < chunk_sz || chunk_sz == 1) /* we are done */ + return off + cnt; + } + return off; +} + +static __always_inline int __bpf_dynptr_copy(const struct bpf_dynptr *dptr, u32 doff, + u32 size, const void *unsafe_src, + copy_fn_t copy_fn, struct task_struct *tsk) +{ + struct bpf_dynptr_kern *dst; + void *dst_slice; + char buf[256]; + u32 off, chunk_sz; + int err; + + dst_slice = bpf_dynptr_slice_rdwr(dptr, doff, NULL, size); + if (likely(dst_slice)) + return copy_fn(dst_slice, unsafe_src, size, tsk); + + dst = (struct bpf_dynptr_kern *)dptr; + if (bpf_dynptr_check_off_len(dst, doff, size)) + return -E2BIG; + + for (off = 0; off < size; off += chunk_sz) { + chunk_sz = min_t(u32, sizeof(buf), size - off); + err = copy_fn(buf, unsafe_src + off, chunk_sz, tsk); + if (err) + return err; + err = __bpf_dynptr_write(dst, doff + off, buf, chunk_sz, 0); + if (err) + return err; + } + return 0; +} + +static __always_inline int copy_user_data_nofault(void *dst, const void *unsafe_src, + u32 size, struct task_struct *tsk) +{ + return copy_from_user_nofault(dst, (const void __user *)unsafe_src, size); +} + +static __always_inline int copy_user_data_sleepable(void *dst, const void *unsafe_src, + u32 size, struct task_struct *tsk) +{ + int ret; + + if (!tsk) { /* Read from the current task */ + ret = copy_from_user(dst, (const void __user *)unsafe_src, size); + if (ret) + return -EFAULT; + return 0; + } + + ret = access_process_vm(tsk, (unsigned long)unsafe_src, dst, size, 0); + if (ret != size) + return -EFAULT; + return 0; +} + +static __always_inline int copy_kernel_data_nofault(void *dst, const void *unsafe_src, + u32 size, struct task_struct *tsk) +{ + return copy_from_kernel_nofault(dst, unsafe_src, size); +} + +static __always_inline int copy_user_str_nofault(void *dst, const void *unsafe_src, + 
u32 size, struct task_struct *tsk) +{ + return strncpy_from_user_nofault(dst, (const void __user *)unsafe_src, size); +} + +static __always_inline int copy_user_str_sleepable(void *dst, const void *unsafe_src, + u32 size, struct task_struct *tsk) +{ + int ret; + + if (unlikely(size == 0)) + return 0; + + if (tsk) { + ret = copy_remote_vm_str(tsk, (unsigned long)unsafe_src, dst, size, 0); + } else { + ret = strncpy_from_user(dst, (const void __user *)unsafe_src, size - 1); + /* strncpy_from_user does not guarantee NUL termination */ + if (ret >= 0) + ((char *)dst)[ret] = '\0'; + } + + if (ret < 0) + return ret; + return ret + 1; +} + +static __always_inline int copy_kernel_str_nofault(void *dst, const void *unsafe_src, + u32 size, struct task_struct *tsk) +{ + return strncpy_from_kernel_nofault(dst, unsafe_src, size); +} + __bpf_kfunc_start_defs(); __bpf_kfunc int bpf_send_signal_task(struct task_struct *task, int sig, enum pid_type type, @@ -3568,4 +3619,62 @@ __bpf_kfunc int bpf_send_signal_task(struct task_struct *task, int sig, enum pid return bpf_send_signal_common(sig, type, task, value); } +__bpf_kfunc int bpf_probe_read_user_dynptr(struct bpf_dynptr *dptr, u32 off, + u32 size, const void __user *unsafe_ptr__ign) +{ + return __bpf_dynptr_copy(dptr, off, size, (const void *)unsafe_ptr__ign, + copy_user_data_nofault, NULL); +} + +__bpf_kfunc int bpf_probe_read_kernel_dynptr(struct bpf_dynptr *dptr, u32 off, + u32 size, const void *unsafe_ptr__ign) +{ + return __bpf_dynptr_copy(dptr, off, size, unsafe_ptr__ign, + copy_kernel_data_nofault, NULL); +} + +__bpf_kfunc int bpf_probe_read_user_str_dynptr(struct bpf_dynptr *dptr, u32 off, + u32 size, const void __user *unsafe_ptr__ign) +{ + return __bpf_dynptr_copy_str(dptr, off, size, (const void *)unsafe_ptr__ign, + copy_user_str_nofault, NULL); +} + +__bpf_kfunc int bpf_probe_read_kernel_str_dynptr(struct bpf_dynptr *dptr, u32 off, + u32 size, const void *unsafe_ptr__ign) +{ + return __bpf_dynptr_copy_str(dptr, off, 
size, unsafe_ptr__ign, + copy_kernel_str_nofault, NULL); +} + +__bpf_kfunc int bpf_copy_from_user_dynptr(struct bpf_dynptr *dptr, u32 off, + u32 size, const void __user *unsafe_ptr__ign) +{ + return __bpf_dynptr_copy(dptr, off, size, (const void *)unsafe_ptr__ign, + copy_user_data_sleepable, NULL); +} + +__bpf_kfunc int bpf_copy_from_user_str_dynptr(struct bpf_dynptr *dptr, u32 off, + u32 size, const void __user *unsafe_ptr__ign) +{ + return __bpf_dynptr_copy_str(dptr, off, size, (const void *)unsafe_ptr__ign, + copy_user_str_sleepable, NULL); +} + +__bpf_kfunc int bpf_copy_from_user_task_dynptr(struct bpf_dynptr *dptr, u32 off, + u32 size, const void __user *unsafe_ptr__ign, + struct task_struct *tsk) +{ + return __bpf_dynptr_copy(dptr, off, size, (const void *)unsafe_ptr__ign, + copy_user_data_sleepable, tsk); +} + +__bpf_kfunc int bpf_copy_from_user_task_str_dynptr(struct bpf_dynptr *dptr, u32 off, + u32 size, const void __user *unsafe_ptr__ign, + struct task_struct *tsk) +{ + return __bpf_dynptr_copy_str(dptr, off, size, (const void *)unsafe_ptr__ign, + copy_user_str_sleepable, tsk); +} + __bpf_kfunc_end_defs(); diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 5dddfc2149f6..8d925cbdce3a 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -865,7 +865,7 @@ __ftrace_return_to_handler(struct ftrace_regs *fregs, unsigned long frame_pointe } /* - * After all architecures have selected HAVE_FUNCTION_GRAPH_FREGS, we can + * After all architectures have selected HAVE_FUNCTION_GRAPH_FREGS, we can * leave only ftrace_return_to_handler(fregs). 
*/ #ifdef CONFIG_HAVE_FUNCTION_GRAPH_FREGS diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c index 33082c4e8154..ba7ff14f5339 100644 --- a/kernel/trace/fprobe.c +++ b/kernel/trace/fprobe.c @@ -89,8 +89,11 @@ static bool delete_fprobe_node(struct fprobe_hlist_node *node) { lockdep_assert_held(&fprobe_mutex); - WRITE_ONCE(node->fp, NULL); - hlist_del_rcu(&node->hlist); + /* Avoid double deleting */ + if (READ_ONCE(node->fp) != NULL) { + WRITE_ONCE(node->fp, NULL); + hlist_del_rcu(&node->hlist); + } return !!find_first_fprobe_node(node->addr); } @@ -411,6 +414,103 @@ static void fprobe_graph_remove_ips(unsigned long *addrs, int num) ftrace_set_filter_ips(&fprobe_graph_ops.ops, addrs, num, 1, 0); } +#ifdef CONFIG_MODULES + +#define FPROBE_IPS_BATCH_INIT 8 +/* instruction pointer address list */ +struct fprobe_addr_list { + int index; + int size; + unsigned long *addrs; +}; + +static int fprobe_addr_list_add(struct fprobe_addr_list *alist, unsigned long addr) +{ + unsigned long *addrs; + + if (alist->index >= alist->size) + return -ENOMEM; + + alist->addrs[alist->index++] = addr; + if (alist->index < alist->size) + return 0; + + /* Expand the address list */ + addrs = kcalloc(alist->size * 2, sizeof(*addrs), GFP_KERNEL); + if (!addrs) + return -ENOMEM; + + memcpy(addrs, alist->addrs, alist->size * sizeof(*addrs)); + alist->size *= 2; + kfree(alist->addrs); + alist->addrs = addrs; + + return 0; +} + +static void fprobe_remove_node_in_module(struct module *mod, struct hlist_head *head, + struct fprobe_addr_list *alist) +{ + struct fprobe_hlist_node *node; + int ret = 0; + + hlist_for_each_entry_rcu(node, head, hlist, + lockdep_is_held(&fprobe_mutex)) { + if (!within_module(node->addr, mod)) + continue; + if (delete_fprobe_node(node)) + continue; + /* + * If failed to update alist, just continue to update hlist. + * Therefore, at list user handler will not hit anymore. 
+ */ + if (!ret) + ret = fprobe_addr_list_add(alist, node->addr); + } +} + +/* Handle module unloading to manage fprobe_ip_table. */ +static int fprobe_module_callback(struct notifier_block *nb, + unsigned long val, void *data) +{ + struct fprobe_addr_list alist = {.size = FPROBE_IPS_BATCH_INIT}; + struct module *mod = data; + int i; + + if (val != MODULE_STATE_GOING) + return NOTIFY_DONE; + + alist.addrs = kcalloc(alist.size, sizeof(*alist.addrs), GFP_KERNEL); + /* If failed to alloc memory, we can not remove ips from hash. */ + if (!alist.addrs) + return NOTIFY_DONE; + + mutex_lock(&fprobe_mutex); + for (i = 0; i < FPROBE_IP_TABLE_SIZE; i++) + fprobe_remove_node_in_module(mod, &fprobe_ip_table[i], &alist); + + if (alist.index < alist.size && alist.index > 0) + ftrace_set_filter_ips(&fprobe_graph_ops.ops, + alist.addrs, alist.index, 1, 0); + mutex_unlock(&fprobe_mutex); + + kfree(alist.addrs); + + return NOTIFY_DONE; +} + +static struct notifier_block fprobe_module_nb = { + .notifier_call = fprobe_module_callback, + .priority = 0, +}; + +static int __init init_fprobe_module(void) +{ + return register_module_notifier(&fprobe_module_nb); +} +early_initcall(init_fprobe_module); +#endif + static int symbols_cmp(const void *a, const void *b) { const char **str_a = (const char **) a; @@ -445,6 +545,7 @@ struct filter_match_data { size_t index; size_t size; unsigned long *addrs; + struct module **mods; }; static int filter_match_callback(void *data, const char *name, unsigned long addr) @@ -458,30 +559,47 @@ static int filter_match_callback(void *data, const char *name, unsigned long add if (!ftrace_location(addr)) return 0; - if (match->addrs) - match->addrs[match->index] = addr; + if (match->addrs) { + struct module *mod = __module_text_address(addr); + if (mod && !try_module_get(mod)) + return 0; + + match->mods[match->index] = mod; + match->addrs[match->index] = addr; + } match->index++; return match->index == match->size; } /* * Make IP list from the 
filter/no-filter glob patterns. - * Return the number of matched symbols, or -ENOENT. + * Return the number of matched symbols, or errno. + * If @addrs == NULL, this just counts the number of matched symbols. If @addrs + * is passed with an array, we need to pass the an @mods array of the same size + * to increment the module refcount for each symbol. + * This means we also need to call `module_put` for each element of @mods after + * using the @addrs. */ -static int ip_list_from_filter(const char *filter, const char *notfilter, - unsigned long *addrs, size_t size) +static int get_ips_from_filter(const char *filter, const char *notfilter, + unsigned long *addrs, struct module **mods, + size_t size) { struct filter_match_data match = { .filter = filter, .notfilter = notfilter, - .index = 0, .size = size, .addrs = addrs}; + .index = 0, .size = size, .addrs = addrs, .mods = mods}; int ret; + if (addrs && !mods) + return -EINVAL; + ret = kallsyms_on_each_symbol(filter_match_callback, &match); if (ret < 0) return ret; - ret = module_kallsyms_on_each_symbol(NULL, filter_match_callback, &match); - if (ret < 0) - return ret; + if (IS_ENABLED(CONFIG_MODULES)) { + ret = module_kallsyms_on_each_symbol(NULL, filter_match_callback, &match); + if (ret < 0) + return ret; + } return match.index ?: -ENOENT; } @@ -543,24 +661,35 @@ static int fprobe_init(struct fprobe *fp, unsigned long *addrs, int num) */ int register_fprobe(struct fprobe *fp, const char *filter, const char *notfilter) { - unsigned long *addrs; - int ret; + unsigned long *addrs __free(kfree) = NULL; + struct module **mods __free(kfree) = NULL; + int ret, num; if (!fp || !filter) return -EINVAL; - ret = ip_list_from_filter(filter, notfilter, NULL, FPROBE_IPS_MAX); - if (ret < 0) - return ret; + num = get_ips_from_filter(filter, notfilter, NULL, NULL, FPROBE_IPS_MAX); + if (num < 0) + return num; - addrs = kcalloc(ret, sizeof(unsigned long), GFP_KERNEL); + addrs = kcalloc(num, sizeof(*addrs), GFP_KERNEL); if (!addrs) 
return -ENOMEM; - ret = ip_list_from_filter(filter, notfilter, addrs, ret); - if (ret > 0) - ret = register_fprobe_ips(fp, addrs, ret); - kfree(addrs); + mods = kcalloc(num, sizeof(*mods), GFP_KERNEL); + if (!mods) + return -ENOMEM; + + ret = get_ips_from_filter(filter, notfilter, addrs, mods, num); + if (ret < 0) + return ret; + + ret = register_fprobe_ips(fp, addrs, ret); + + for (int i = 0; i < num; i++) { + if (mods[i]) + module_put(mods[i]); + } return ret; } EXPORT_SYMBOL_GPL(register_fprobe); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index fc88e0688daf..6981830c3128 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1293,8 +1293,12 @@ static void free_ftrace_hash_rcu(struct ftrace_hash *hash) void ftrace_free_filter(struct ftrace_ops *ops) { ftrace_ops_init(ops); + if (WARN_ON(ops->flags & FTRACE_OPS_FL_ENABLED)) + return; free_ftrace_hash(ops->func_hash->filter_hash); free_ftrace_hash(ops->func_hash->notrace_hash); + ops->func_hash->filter_hash = EMPTY_HASH; + ops->func_hash->notrace_hash = EMPTY_HASH; } EXPORT_SYMBOL_GPL(ftrace_free_filter); @@ -3254,6 +3258,31 @@ static int append_hash(struct ftrace_hash **hash, struct ftrace_hash *new_hash, } /* + * Remove functions from @hash that are in @notrace_hash + */ +static void remove_hash(struct ftrace_hash *hash, struct ftrace_hash *notrace_hash) +{ + struct ftrace_func_entry *entry; + struct hlist_node *tmp; + int size; + int i; + + /* If the notrace hash is empty, there's nothing to do */ + if (ftrace_hash_empty(notrace_hash)) + return; + + size = 1 << hash->size_bits; + for (i = 0; i < size; i++) { + hlist_for_each_entry_safe(entry, tmp, &hash->buckets[i], hlist) { + if (!__ftrace_lookup_ip(notrace_hash, entry->ip)) + continue; + remove_hash_entry(hash, entry); + kfree(entry); + } + } +} + +/* * Add to @hash only those that are in both @new_hash1 and @new_hash2 * * The notrace_hash updates uses just the intersect_hash() function @@ -3293,67 +3322,6 @@ static int 
intersect_hash(struct ftrace_hash **hash, struct ftrace_hash *new_has return 0; } -/* Return a new hash that has a union of all @ops->filter_hash entries */ -static struct ftrace_hash *append_hashes(struct ftrace_ops *ops) -{ - struct ftrace_hash *new_hash = NULL; - struct ftrace_ops *subops; - int size_bits; - int ret; - - if (ops->func_hash->filter_hash) - size_bits = ops->func_hash->filter_hash->size_bits; - else - size_bits = FTRACE_HASH_DEFAULT_BITS; - - list_for_each_entry(subops, &ops->subop_list, list) { - ret = append_hash(&new_hash, subops->func_hash->filter_hash, size_bits); - if (ret < 0) { - free_ftrace_hash(new_hash); - return NULL; - } - /* Nothing more to do if new_hash is empty */ - if (ftrace_hash_empty(new_hash)) - break; - } - /* Can't return NULL as that means this failed */ - return new_hash ? : EMPTY_HASH; -} - -/* Make @ops trace evenything except what all its subops do not trace */ -static struct ftrace_hash *intersect_hashes(struct ftrace_ops *ops) -{ - struct ftrace_hash *new_hash = NULL; - struct ftrace_ops *subops; - int size_bits; - int ret; - - list_for_each_entry(subops, &ops->subop_list, list) { - struct ftrace_hash *next_hash; - - if (!new_hash) { - size_bits = subops->func_hash->notrace_hash->size_bits; - new_hash = alloc_and_copy_ftrace_hash(size_bits, ops->func_hash->notrace_hash); - if (!new_hash) - return NULL; - continue; - } - size_bits = new_hash->size_bits; - next_hash = new_hash; - new_hash = alloc_ftrace_hash(size_bits); - ret = intersect_hash(&new_hash, next_hash, subops->func_hash->notrace_hash); - free_ftrace_hash(next_hash); - if (ret < 0) { - free_ftrace_hash(new_hash); - return NULL; - } - /* Nothing more to do if new_hash is empty */ - if (ftrace_hash_empty(new_hash)) - break; - } - return new_hash; -} - static bool ops_equal(struct ftrace_hash *A, struct ftrace_hash *B) { struct ftrace_func_entry *entry; @@ -3425,6 +3393,95 @@ static int ftrace_update_ops(struct ftrace_ops *ops, struct ftrace_hash *filter_ return 
0; } +static int add_first_hash(struct ftrace_hash **filter_hash, struct ftrace_hash **notrace_hash, + struct ftrace_ops_hash *func_hash) +{ + /* If the filter hash is not empty, simply remove the nohash from it */ + if (!ftrace_hash_empty(func_hash->filter_hash)) { + *filter_hash = copy_hash(func_hash->filter_hash); + if (!*filter_hash) + return -ENOMEM; + remove_hash(*filter_hash, func_hash->notrace_hash); + *notrace_hash = EMPTY_HASH; + + } else { + *notrace_hash = copy_hash(func_hash->notrace_hash); + if (!*notrace_hash) + return -ENOMEM; + *filter_hash = EMPTY_HASH; + } + return 0; +} + +static int add_next_hash(struct ftrace_hash **filter_hash, struct ftrace_hash **notrace_hash, + struct ftrace_ops_hash *ops_hash, struct ftrace_ops_hash *subops_hash) +{ + int size_bits; + int ret; + + /* If the subops trace all functions so must the main ops */ + if (ftrace_hash_empty(ops_hash->filter_hash) || + ftrace_hash_empty(subops_hash->filter_hash)) { + *filter_hash = EMPTY_HASH; + } else { + /* + * The main ops filter hash is not empty, so its + * notrace_hash had better be, as the notrace hash + * is only used for empty main filter hashes. + */ + WARN_ON_ONCE(!ftrace_hash_empty(ops_hash->notrace_hash)); + + size_bits = max(ops_hash->filter_hash->size_bits, + subops_hash->filter_hash->size_bits); + + /* Copy the subops hash */ + *filter_hash = alloc_and_copy_ftrace_hash(size_bits, subops_hash->filter_hash); + if (!*filter_hash) + return -ENOMEM; + /* Remove any notrace functions from the copy */ + remove_hash(*filter_hash, subops_hash->notrace_hash); + + ret = append_hash(filter_hash, ops_hash->filter_hash, + size_bits); + if (ret < 0) { + free_ftrace_hash(*filter_hash); + *filter_hash = EMPTY_HASH; + return ret; + } + } + + /* + * Only process notrace hashes if the main filter hash is empty + * (tracing all functions), otherwise the filter hash will just + * remove the notrace hash functions, and the notrace hash is + * not needed. 
+ */ + if (ftrace_hash_empty(*filter_hash)) { + /* + * Intersect the notrace functions. That is, if two + * subops are not tracing a set of functions, the + * main ops will only not trace the functions that are + * in both subops, but has to trace the functions that + * are only notrace in one of the subops, for the other + * subops to be able to trace them. + */ + size_bits = max(ops_hash->notrace_hash->size_bits, + subops_hash->notrace_hash->size_bits); + *notrace_hash = alloc_ftrace_hash(size_bits); + if (!*notrace_hash) + return -ENOMEM; + + ret = intersect_hash(notrace_hash, ops_hash->notrace_hash, + subops_hash->notrace_hash); + if (ret < 0) { + free_ftrace_hash(*notrace_hash); + *notrace_hash = EMPTY_HASH; + return ret; + } + } + return 0; +} + /** * ftrace_startup_subops - enable tracing for subops of an ops * @ops: Manager ops (used to pick all the functions of its subops) @@ -3437,11 +3494,10 @@ static int ftrace_update_ops(struct ftrace_ops *ops, struct ftrace_hash *filter_ */ int ftrace_startup_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, int command) { - struct ftrace_hash *filter_hash; - struct ftrace_hash *notrace_hash; + struct ftrace_hash *filter_hash = EMPTY_HASH; + struct ftrace_hash *notrace_hash = EMPTY_HASH; struct ftrace_hash *save_filter_hash; struct ftrace_hash *save_notrace_hash; - int size_bits; int ret; if (unlikely(ftrace_disabled)) @@ -3465,14 +3521,14 @@ int ftrace_startup_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, int /* For the first subops to ops just enable it normally */ if (list_empty(&ops->subop_list)) { - /* Just use the subops hashes */ - filter_hash = copy_hash(subops->func_hash->filter_hash); - notrace_hash = copy_hash(subops->func_hash->notrace_hash); - if (!filter_hash || !notrace_hash) { - free_ftrace_hash(filter_hash); - free_ftrace_hash(notrace_hash); - return -ENOMEM; - } + + /* The ops was empty, should have empty hashes */ + WARN_ON_ONCE(!ftrace_hash_empty(ops->func_hash->filter_hash)); + 
WARN_ON_ONCE(!ftrace_hash_empty(ops->func_hash->notrace_hash)); + + ret = add_first_hash(&filter_hash, &notrace_hash, subops->func_hash); + if (ret < 0) + return ret; save_filter_hash = ops->func_hash->filter_hash; save_notrace_hash = ops->func_hash->notrace_hash; @@ -3498,48 +3554,16 @@ int ftrace_startup_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, int /* * Here there's already something attached. Here are the rules: - * o If either filter_hash is empty then the final stays empty - * o Otherwise, the final is a superset of both hashes - * o If either notrace_hash is empty then the final stays empty - * o Otherwise, the final is an intersection between the hashes + * If the new subops and main ops filter hashes are not empty: + * o Make a copy of the subops filter hash + * o Remove all functions in the nohash from it. + * o Add in the main hash filter functions + * o Remove any of these functions from the main notrace hash */ - if (ftrace_hash_empty(ops->func_hash->filter_hash) || - ftrace_hash_empty(subops->func_hash->filter_hash)) { - filter_hash = EMPTY_HASH; - } else { - size_bits = max(ops->func_hash->filter_hash->size_bits, - subops->func_hash->filter_hash->size_bits); - filter_hash = alloc_and_copy_ftrace_hash(size_bits, ops->func_hash->filter_hash); - if (!filter_hash) - return -ENOMEM; - ret = append_hash(&filter_hash, subops->func_hash->filter_hash, - size_bits); - if (ret < 0) { - free_ftrace_hash(filter_hash); - return ret; - } - } - - if (ftrace_hash_empty(ops->func_hash->notrace_hash) || - ftrace_hash_empty(subops->func_hash->notrace_hash)) { - notrace_hash = EMPTY_HASH; - } else { - size_bits = max(ops->func_hash->filter_hash->size_bits, - subops->func_hash->filter_hash->size_bits); - notrace_hash = alloc_ftrace_hash(size_bits); - if (!notrace_hash) { - free_ftrace_hash(filter_hash); - return -ENOMEM; - } - ret = intersect_hash(&notrace_hash, ops->func_hash->filter_hash, - subops->func_hash->filter_hash); - if (ret < 0) { -
free_ftrace_hash(filter_hash); - free_ftrace_hash(notrace_hash); - return ret; - } - } + ret = add_next_hash(&filter_hash, &notrace_hash, ops->func_hash, subops->func_hash); + if (ret < 0) + return ret; list_add(&subops->list, &ops->subop_list); @@ -3555,6 +3579,45 @@ int ftrace_startup_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, int return ret; } +static int rebuild_hashes(struct ftrace_hash **filter_hash, struct ftrace_hash **notrace_hash, + struct ftrace_ops *ops) +{ + struct ftrace_ops_hash temp_hash; + struct ftrace_ops *subops; + bool first = true; + int ret; + + temp_hash.filter_hash = EMPTY_HASH; + temp_hash.notrace_hash = EMPTY_HASH; + + list_for_each_entry(subops, &ops->subop_list, list) { + *filter_hash = EMPTY_HASH; + *notrace_hash = EMPTY_HASH; + + if (first) { + ret = add_first_hash(filter_hash, notrace_hash, subops->func_hash); + if (ret < 0) + return ret; + first = false; + } else { + ret = add_next_hash(filter_hash, notrace_hash, + &temp_hash, subops->func_hash); + if (ret < 0) { + free_ftrace_hash(temp_hash.filter_hash); + free_ftrace_hash(temp_hash.notrace_hash); + return ret; + } + } + + free_ftrace_hash(temp_hash.filter_hash); + free_ftrace_hash(temp_hash.notrace_hash); + + temp_hash.filter_hash = *filter_hash; + temp_hash.notrace_hash = *notrace_hash; + } + return 0; +} + /** * ftrace_shutdown_subops - Remove a subops from a manager ops * @ops: A manager ops to remove @subops from @@ -3569,8 +3632,8 @@ int ftrace_startup_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, int */ int ftrace_shutdown_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, int command) { - struct ftrace_hash *filter_hash; - struct ftrace_hash *notrace_hash; + struct ftrace_hash *filter_hash = EMPTY_HASH; + struct ftrace_hash *notrace_hash = EMPTY_HASH; int ret; if (unlikely(ftrace_disabled)) @@ -3603,14 +3666,9 @@ int ftrace_shutdown_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, in } /* Rebuild the hashes without subops */ -
filter_hash = append_hashes(ops); - notrace_hash = intersect_hashes(ops); - if (!filter_hash || !notrace_hash) { - free_ftrace_hash(filter_hash); - free_ftrace_hash(notrace_hash); - list_add(&subops->list, &ops->subop_list); - return -ENOMEM; - } + ret = rebuild_hashes(&filter_hash, &notrace_hash, ops); + if (ret < 0) + return ret; ret = ftrace_update_ops(ops, filter_hash, notrace_hash); if (ret < 0) { @@ -3626,11 +3684,11 @@ int ftrace_shutdown_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, in static int ftrace_hash_move_and_update_subops(struct ftrace_ops *subops, struct ftrace_hash **orig_subhash, - struct ftrace_hash *hash, - int enable) + struct ftrace_hash *hash) { struct ftrace_ops *ops = subops->managed; - struct ftrace_hash **orig_hash; + struct ftrace_hash *notrace_hash; + struct ftrace_hash *filter_hash; struct ftrace_hash *save_hash; struct ftrace_hash *new_hash; int ret; @@ -3647,24 +3705,18 @@ static int ftrace_hash_move_and_update_subops(struct ftrace_ops *subops, return -ENOMEM; } - /* Create a new_hash to hold the ops new functions */ - if (enable) { - orig_hash = &ops->func_hash->filter_hash; - new_hash = append_hashes(ops); - } else { - orig_hash = &ops->func_hash->notrace_hash; - new_hash = intersect_hashes(ops); + ret = rebuild_hashes(&filter_hash, &notrace_hash, ops); + if (!ret) { + ret = ftrace_update_ops(ops, filter_hash, notrace_hash); + free_ftrace_hash(filter_hash); + free_ftrace_hash(notrace_hash); } - /* Move the hash over to the new hash */ - ret = __ftrace_hash_move_and_update_ops(ops, orig_hash, new_hash, enable); - - free_ftrace_hash(new_hash); - if (ret) { /* Put back the original hash */ - free_ftrace_hash_rcu(*orig_subhash); + new_hash = *orig_subhash; *orig_subhash = save_hash; + free_ftrace_hash_rcu(new_hash); } else { free_ftrace_hash_rcu(save_hash); } @@ -4888,7 +4940,7 @@ static int ftrace_hash_move_and_update_ops(struct ftrace_ops *ops, int enable) { if (ops->flags & FTRACE_OPS_FL_SUBOP) - return
ftrace_hash_move_and_update_subops(ops, orig_hash, hash, enable); + return ftrace_hash_move_and_update_subops(ops, orig_hash, hash); /* * If this ops is not enabled, it could be sharing its filters @@ -4907,7 +4959,7 @@ static int ftrace_hash_move_and_update_ops(struct ftrace_ops *ops, list_for_each_entry(subops, &op->subop_list, list) { if ((subops->flags & FTRACE_OPS_FL_ENABLED) && subops->func_hash == ops->func_hash) { - return ftrace_hash_move_and_update_subops(subops, orig_hash, hash, enable); + return ftrace_hash_move_and_update_subops(subops, orig_hash, hash); } } } while_for_each_ftrace_op(op); @@ -5912,9 +5964,10 @@ int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) /* Make a copy hash to place the new and the old entries in */ size = hash->count + direct_functions->count; - if (size > 32) - size = 32; - new_hash = alloc_ftrace_hash(fls(size)); + size = fls(size); + if (size > FTRACE_HASH_MAX_BITS) + size = FTRACE_HASH_MAX_BITS; + new_hash = alloc_ftrace_hash(size); if (!new_hash) goto out_unlock; @@ -6853,6 +6906,7 @@ ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer) } } } + cond_resched(); } while_for_each_ftrace_rec(); return fail ? 
-EINVAL : 0; @@ -7016,6 +7070,7 @@ static int ftrace_process_locs(struct module *mod, unsigned long *p; unsigned long addr; unsigned long flags = 0; /* Shut up gcc */ + unsigned long pages; int ret = -ENOMEM; count = end - start; @@ -7023,6 +7078,8 @@ static int ftrace_process_locs(struct module *mod, if (!count) return 0; + pages = DIV_ROUND_UP(count, ENTRIES_PER_PAGE); + /* * Sorting mcount in vmlinux at build time depend on * CONFIG_BUILDTIME_MCOUNT_SORT, while mcount loc in @@ -7067,7 +7124,9 @@ static int ftrace_process_locs(struct module *mod, pg = start_pg; while (p < end) { unsigned long end_offset; - addr = ftrace_call_adjust(*p++); + + addr = *p++; + /* * Some architecture linkers will pad between * the different mcount_loc sections of different @@ -7079,6 +7138,19 @@ static int ftrace_process_locs(struct module *mod, continue; } + /* + * If this is core kernel, make sure the address is in core + * or inittext, as weak functions get zeroed and KASLR can + * move them to something other than zero. It just will not + * move it to an area where kernel text is. + */ + if (!mod && !(is_kernel_text(addr) || is_kernel_inittext(addr))) { + skipped++; + continue; + } + + addr = ftrace_call_adjust(addr); + end_offset = (pg->index+1) * sizeof(pg->records[0]); if (end_offset > PAGE_SIZE << pg->order) { /* We should have allocated enough */ @@ -7118,11 +7190,41 @@ static int ftrace_process_locs(struct module *mod, /* We should have used all pages unless we skipped some */ if (pg_unuse) { - WARN_ON(!skipped); + unsigned long pg_remaining, remaining = 0; + unsigned long skip; + + /* Count the number of entries unused and compare it to skipped. 
*/ + pg_remaining = (ENTRIES_PER_PAGE << pg->order) - pg->index; + + if (!WARN(skipped < pg_remaining, "Extra allocated pages for ftrace")) { + + skip = skipped - pg_remaining; + + for (pg = pg_unuse; pg; pg = pg->next) + remaining += 1 << pg->order; + + pages -= remaining; + + skip = DIV_ROUND_UP(skip, ENTRIES_PER_PAGE); + + /* + * Check to see if the number of pages remaining would + * just fit the number of entries skipped. + */ + WARN(skip != remaining, "Extra allocated pages for ftrace: %lu with %lu skipped", + remaining, skipped); + } /* Need to synchronize with ftrace_location_range() */ synchronize_rcu(); ftrace_free_pages(pg_unuse); } + + if (!mod) { + count -= skipped; + pr_info("ftrace: allocating %ld entries in %ld pages\n", + count, pages); + } + return ret; } @@ -7768,9 +7870,6 @@ void __init ftrace_init(void) goto failed; } - pr_info("ftrace: allocating %ld entries in %ld pages\n", - count, DIV_ROUND_UP(count, ENTRIES_PER_PAGE)); - ret = ftrace_process_locs(NULL, __start_mcount_loc, __stop_mcount_loc); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index bb6089c2951e..3f9bf562beea 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -31,6 +31,7 @@ #include <asm/local64.h> #include <asm/local.h> +#include <asm/setup.h> #include "trace.h" @@ -48,9 +49,12 @@ static void update_pages_handler(struct work_struct *work); struct ring_buffer_meta { int magic; - int struct_size; - unsigned long text_addr; - unsigned long data_addr; + int struct_sizes; + unsigned long total_size; + unsigned long buffers_offset; +}; + +struct ring_buffer_cpu_meta { unsigned long first_buffer; unsigned long head_buffer; unsigned long commit_buffer; @@ -517,7 +521,7 @@ struct ring_buffer_per_cpu { struct mutex mapping_lock; unsigned long *subbuf_ids; /* ID to subbuf VA */ struct trace_buffer_meta *meta_page; - struct ring_buffer_meta *ring_meta; + struct ring_buffer_cpu_meta *ring_meta; /* ring buffer pages to update, > 0 to add, < 0 to 
remove */ long nr_pages_to_update; @@ -550,8 +554,7 @@ struct trace_buffer { unsigned long range_addr_start; unsigned long range_addr_end; - long last_text_delta; - long last_data_delta; + struct ring_buffer_meta *meta; unsigned int subbuf_size; unsigned int subbuf_order; @@ -1271,7 +1274,7 @@ static void rb_head_page_activate(struct ring_buffer_per_cpu *cpu_buffer) rb_set_list_to_head(head->list.prev); if (cpu_buffer->ring_meta) { - struct ring_buffer_meta *meta = cpu_buffer->ring_meta; + struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta; meta->head_buffer = (unsigned long)head->page; } } @@ -1569,7 +1572,7 @@ out_locked: static unsigned long rb_range_align_subbuf(unsigned long addr, int subbuf_size, int nr_subbufs) { - addr += sizeof(struct ring_buffer_meta) + + addr += sizeof(struct ring_buffer_cpu_meta) + sizeof(int) * nr_subbufs; return ALIGN(addr, subbuf_size); } @@ -1580,19 +1583,22 @@ rb_range_align_subbuf(unsigned long addr, int subbuf_size, int nr_subbufs) static void *rb_range_meta(struct trace_buffer *buffer, int nr_pages, int cpu) { int subbuf_size = buffer->subbuf_size + BUF_PAGE_HDR_SIZE; - unsigned long ptr = buffer->range_addr_start; - struct ring_buffer_meta *meta; + struct ring_buffer_cpu_meta *meta; + struct ring_buffer_meta *bmeta; + unsigned long ptr; int nr_subbufs; - if (!ptr) + bmeta = buffer->meta; + if (!bmeta) return NULL; + ptr = (unsigned long)bmeta + bmeta->buffers_offset; + meta = (struct ring_buffer_cpu_meta *)ptr; + /* When nr_pages passed in is zero, the first meta has already been initialized */ if (!nr_pages) { - meta = (struct ring_buffer_meta *)ptr; nr_subbufs = meta->nr_subbufs; } else { - meta = NULL; /* Include the reader page */ nr_subbufs = nr_pages + 1; } @@ -1624,7 +1630,7 @@ static void *rb_range_meta(struct trace_buffer *buffer, int nr_pages, int cpu) } /* Return the start of subbufs given the meta pointer */ -static void *rb_subbufs_from_meta(struct ring_buffer_meta *meta) +static void 
*rb_subbufs_from_meta(struct ring_buffer_cpu_meta *meta) { int subbuf_size = meta->subbuf_size; unsigned long ptr; @@ -1640,7 +1646,7 @@ static void *rb_subbufs_from_meta(struct ring_buffer_meta *meta) */ static void *rb_range_buffer(struct ring_buffer_per_cpu *cpu_buffer, int idx) { - struct ring_buffer_meta *meta; + struct ring_buffer_cpu_meta *meta; unsigned long ptr; int subbuf_size; @@ -1666,14 +1672,77 @@ static void *rb_range_buffer(struct ring_buffer_per_cpu *cpu_buffer, int idx) } /* + * See if the existing memory contains a valid meta section. + * if so, use that, otherwise initialize it. + */ +static bool rb_meta_init(struct trace_buffer *buffer, int scratch_size) +{ + unsigned long ptr = buffer->range_addr_start; + struct ring_buffer_meta *bmeta; + unsigned long total_size; + int struct_sizes; + + bmeta = (struct ring_buffer_meta *)ptr; + buffer->meta = bmeta; + + total_size = buffer->range_addr_end - buffer->range_addr_start; + + struct_sizes = sizeof(struct ring_buffer_cpu_meta); + struct_sizes |= sizeof(*bmeta) << 16; + + /* The first buffer will start word size after the meta page */ + ptr += sizeof(*bmeta); + ptr = ALIGN(ptr, sizeof(long)); + ptr += scratch_size; + + if (bmeta->magic != RING_BUFFER_META_MAGIC) { + pr_info("Ring buffer boot meta mismatch of magic\n"); + goto init; + } + + if (bmeta->struct_sizes != struct_sizes) { + pr_info("Ring buffer boot meta mismatch of struct size\n"); + goto init; + } + + if (bmeta->total_size != total_size) { + pr_info("Ring buffer boot meta mismatch of total size\n"); + goto init; + } + + if (bmeta->buffers_offset > bmeta->total_size) { + pr_info("Ring buffer boot meta mismatch of offset outside of total size\n"); + goto init; + } + + if (bmeta->buffers_offset != (void *)ptr - (void *)bmeta) { + pr_info("Ring buffer boot meta mismatch of first buffer offset\n"); + goto init; + } + + return true; + + init: + bmeta->magic = RING_BUFFER_META_MAGIC; + bmeta->struct_sizes = struct_sizes; + bmeta->total_size = 
total_size; + bmeta->buffers_offset = (void *)ptr - (void *)bmeta; + + /* Zero out the scatch pad */ + memset((void *)bmeta + sizeof(*bmeta), 0, bmeta->buffers_offset - sizeof(*bmeta)); + + return false; +} + +/* * See if the existing memory contains valid ring buffer data. * As the previous kernel must be the same as this kernel, all * the calculations (size of buffers and number of buffers) * must be the same. */ -static bool rb_meta_valid(struct ring_buffer_meta *meta, int cpu, - struct trace_buffer *buffer, int nr_pages, - unsigned long *subbuf_mask) +static bool rb_cpu_meta_valid(struct ring_buffer_cpu_meta *meta, int cpu, + struct trace_buffer *buffer, int nr_pages, + unsigned long *subbuf_mask) { int subbuf_size = PAGE_SIZE; struct buffer_data_page *subbuf; @@ -1684,20 +1753,6 @@ static bool rb_meta_valid(struct ring_buffer_meta *meta, int cpu, if (!subbuf_mask) return false; - /* Check the meta magic and meta struct size */ - if (meta->magic != RING_BUFFER_META_MAGIC || - meta->struct_size != sizeof(*meta)) { - pr_info("Ring buffer boot meta[%d] mismatch of magic or struct size\n", cpu); - return false; - } - - /* The subbuffer's size and number of subbuffers must match */ - if (meta->subbuf_size != subbuf_size || - meta->nr_subbufs != nr_pages + 1) { - pr_info("Ring buffer boot meta [%d] mismatch of subbuf_size/nr_pages\n", cpu); - return false; - } - buffers_start = meta->first_buffer; buffers_end = meta->first_buffer + (subbuf_size * meta->nr_subbufs); @@ -1743,7 +1798,7 @@ static bool rb_meta_valid(struct ring_buffer_meta *meta, int cpu, return true; } -static int rb_meta_subbuf_idx(struct ring_buffer_meta *meta, void *subbuf); +static int rb_meta_subbuf_idx(struct ring_buffer_cpu_meta *meta, void *subbuf); static int rb_read_data_buffer(struct buffer_data_page *dpage, int tail, int cpu, unsigned long long *timestamp, u64 *delta_ptr) @@ -1810,7 +1865,7 @@ static int rb_validate_buffer(struct buffer_data_page *dpage, int cpu) /* If the meta data has been 
validated, now validate the events */ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer) { - struct ring_buffer_meta *meta = cpu_buffer->ring_meta; + struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta; struct buffer_page *head_page; unsigned long entry_bytes = 0; unsigned long entries = 0; @@ -1832,10 +1887,12 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer) head_page = cpu_buffer->head_page; - /* If both the head and commit are on the reader_page then we are done. */ - if (head_page == cpu_buffer->reader_page && - head_page == cpu_buffer->commit_page) + /* If the commit_buffer is the reader page, update the commit page */ + if (meta->commit_buffer == (unsigned long)cpu_buffer->reader_page->page) { + cpu_buffer->commit_page = cpu_buffer->reader_page; + /* Nothing more to do, the only page is the reader page */ goto done; + } /* Iterate until finding the commit page */ for (i = 0; i < meta->nr_subbufs + 1; i++, rb_inc_page(&head_page)) { @@ -1891,24 +1948,13 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer) } } -/* Used to calculate data delta */ -static char rb_data_ptr[] = ""; - -#define THIS_TEXT_PTR ((unsigned long)rb_meta_init_text_addr) -#define THIS_DATA_PTR ((unsigned long)rb_data_ptr) - -static void rb_meta_init_text_addr(struct ring_buffer_meta *meta) -{ - meta->text_addr = THIS_TEXT_PTR; - meta->data_addr = THIS_DATA_PTR; -} - -static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages) +static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages, int scratch_size) { - struct ring_buffer_meta *meta; + struct ring_buffer_cpu_meta *meta; unsigned long *subbuf_mask; unsigned long delta; void *subbuf; + bool valid = false; int cpu; int i; @@ -1916,20 +1962,21 @@ static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages) subbuf_mask = bitmap_alloc(nr_pages + 1, GFP_KERNEL); /* If subbuf_mask fails to allocate, then 
rb_meta_valid() will return false */ + if (rb_meta_init(buffer, scratch_size)) + valid = true; + for (cpu = 0; cpu < nr_cpu_ids; cpu++) { void *next_meta; meta = rb_range_meta(buffer, nr_pages, cpu); - if (rb_meta_valid(meta, cpu, buffer, nr_pages, subbuf_mask)) { + if (valid && rb_cpu_meta_valid(meta, cpu, buffer, nr_pages, subbuf_mask)) { /* Make the mappings match the current address */ subbuf = rb_subbufs_from_meta(meta); delta = (unsigned long)subbuf - meta->first_buffer; meta->first_buffer += delta; meta->head_buffer += delta; meta->commit_buffer += delta; - buffer->last_text_delta = THIS_TEXT_PTR - meta->text_addr; - buffer->last_data_delta = THIS_DATA_PTR - meta->data_addr; continue; } @@ -1940,16 +1987,12 @@ static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages) memset(meta, 0, next_meta - (void *)meta); - meta->magic = RING_BUFFER_META_MAGIC; - meta->struct_size = sizeof(*meta); - meta->nr_subbufs = nr_pages + 1; meta->subbuf_size = PAGE_SIZE; subbuf = rb_subbufs_from_meta(meta); meta->first_buffer = (unsigned long)subbuf; - rb_meta_init_text_addr(meta); /* * The buffers[] array holds the order of the sub-buffers @@ -1971,7 +2014,7 @@ static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages) static void *rbm_start(struct seq_file *m, loff_t *pos) { struct ring_buffer_per_cpu *cpu_buffer = m->private; - struct ring_buffer_meta *meta = cpu_buffer->ring_meta; + struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta; unsigned long val; if (!meta) @@ -1996,7 +2039,7 @@ static void *rbm_next(struct seq_file *m, void *v, loff_t *pos) static int rbm_show(struct seq_file *m, void *v) { struct ring_buffer_per_cpu *cpu_buffer = m->private; - struct ring_buffer_meta *meta = cpu_buffer->ring_meta; + struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta; unsigned long val = (unsigned long)v; if (val == 1) { @@ -2045,7 +2088,7 @@ int ring_buffer_meta_seq_init(struct file *file, struct trace_buffer *buffer, in static void 
rb_meta_buffer_update(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *bpage) { - struct ring_buffer_meta *meta = cpu_buffer->ring_meta; + struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta; if (meta->head_buffer == (unsigned long)bpage->page) cpu_buffer->head_page = bpage; @@ -2060,7 +2103,7 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, long nr_pages, struct list_head *pages) { struct trace_buffer *buffer = cpu_buffer->buffer; - struct ring_buffer_meta *meta = NULL; + struct ring_buffer_cpu_meta *meta = NULL; struct buffer_page *bpage, *tmp; bool user_thread = current->mm != NULL; gfp_t mflags; @@ -2184,7 +2227,7 @@ static struct ring_buffer_per_cpu * rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; - struct ring_buffer_meta *meta; + struct ring_buffer_cpu_meta *meta; struct buffer_page *bpage; struct page *page; int ret; @@ -2313,6 +2356,7 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, int order, unsigned long start, unsigned long end, + unsigned long scratch_size, struct lock_class_key *key) { struct trace_buffer *buffer; @@ -2355,10 +2399,23 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, /* If start/end are specified, then that overrides size */ if (start && end) { + unsigned long buffers_start; unsigned long ptr; int n; - size = end - start; + /* Make sure that start is word aligned */ + start = ALIGN(start, sizeof(long)); + + /* scratch_size needs to be aligned too */ + scratch_size = ALIGN(scratch_size, sizeof(long)); + + /* Subtract the buffer meta data and word aligned */ + buffers_start = start + sizeof(struct ring_buffer_cpu_meta); + buffers_start = ALIGN(buffers_start, sizeof(long)); + buffers_start += scratch_size; + + /* Calculate the size for the per CPU data */ + size = end - buffers_start; size = size 
/ nr_cpu_ids; /* @@ -2368,7 +2425,7 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, * needed, plus account for the integer array index that * will be appended to the meta data. */ - nr_pages = (size - sizeof(struct ring_buffer_meta)) / + nr_pages = (size - sizeof(struct ring_buffer_cpu_meta)) / (subbuf_size + sizeof(int)); /* Need at least two pages plus the reader page */ if (nr_pages < 3) @@ -2376,8 +2433,8 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, again: /* Make sure that the size fits aligned */ - for (n = 0, ptr = start; n < nr_cpu_ids; n++) { - ptr += sizeof(struct ring_buffer_meta) + + for (n = 0, ptr = buffers_start; n < nr_cpu_ids; n++) { + ptr += sizeof(struct ring_buffer_cpu_meta) + sizeof(int) * nr_pages; ptr = ALIGN(ptr, subbuf_size); ptr += subbuf_size * nr_pages; @@ -2394,7 +2451,7 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, buffer->range_addr_start = start; buffer->range_addr_end = end; - rb_range_meta_init(buffer, nr_pages); + rb_range_meta_init(buffer, nr_pages, scratch_size); } else { /* need at least two pages */ @@ -2447,7 +2504,7 @@ struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, struct lock_class_key *key) { /* Default buffer page size - one system page */ - return alloc_buffer(size, flags, 0, 0, 0,key); + return alloc_buffer(size, flags, 0, 0, 0, 0, key); } EXPORT_SYMBOL_GPL(__ring_buffer_alloc); @@ -2459,6 +2516,7 @@ EXPORT_SYMBOL_GPL(__ring_buffer_alloc); * @order: sub-buffer order * @start: start of allocated range * @range_size: size of allocated range + * @scratch_size: size of scratch area (for preallocated memory buffers) * @key: ring buffer reader_lock_key. 
* * Currently the only flag that is available is the RB_FL_OVERWRITE @@ -2469,32 +2527,29 @@ EXPORT_SYMBOL_GPL(__ring_buffer_alloc); struct trace_buffer *__ring_buffer_alloc_range(unsigned long size, unsigned flags, int order, unsigned long start, unsigned long range_size, + unsigned long scratch_size, struct lock_class_key *key) { - return alloc_buffer(size, flags, order, start, start + range_size, key); + return alloc_buffer(size, flags, order, start, start + range_size, + scratch_size, key); } -/** - * ring_buffer_last_boot_delta - return the delta offset from last boot - * @buffer: The buffer to return the delta from - * @text: Return text delta - * @data: Return data delta - * - * Returns: The true if the delta is non zero - */ -bool ring_buffer_last_boot_delta(struct trace_buffer *buffer, long *text, - long *data) +void *ring_buffer_meta_scratch(struct trace_buffer *buffer, unsigned int *size) { - if (!buffer) - return false; + struct ring_buffer_meta *meta; + void *ptr; - if (!buffer->last_text_delta) - return false; + if (!buffer || !buffer->meta) + return NULL; - *text = buffer->last_text_delta; - *data = buffer->last_data_delta; + meta = buffer->meta; - return true; + ptr = (void *)ALIGN((unsigned long)meta + sizeof(*meta), sizeof(long)); + + if (size) + *size = (void *)meta + meta->buffers_offset - ptr; + + return ptr; } /** @@ -3105,7 +3160,7 @@ static void rb_inc_iter(struct ring_buffer_iter *iter) } /* Return the index into the sub-buffers for a given sub-buffer */ -static int rb_meta_subbuf_idx(struct ring_buffer_meta *meta, void *subbuf) +static int rb_meta_subbuf_idx(struct ring_buffer_cpu_meta *meta, void *subbuf) { void *subbuf_array; @@ -3117,7 +3172,7 @@ static int rb_meta_subbuf_idx(struct ring_buffer_meta *meta, void *subbuf) static void rb_update_meta_head(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *next_page) { - struct ring_buffer_meta *meta = cpu_buffer->ring_meta; + struct ring_buffer_cpu_meta *meta = 
cpu_buffer->ring_meta; unsigned long old_head = (unsigned long)next_page->page; unsigned long new_head; @@ -3134,7 +3189,7 @@ static void rb_update_meta_head(struct ring_buffer_per_cpu *cpu_buffer, static void rb_update_meta_reader(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *reader) { - struct ring_buffer_meta *meta = cpu_buffer->ring_meta; + struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta; void *old_reader = cpu_buffer->reader_page->page; void *new_reader = reader->page; int id; @@ -3763,7 +3818,7 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) rb_page_write(cpu_buffer->commit_page)); rb_inc_page(&cpu_buffer->commit_page); if (cpu_buffer->ring_meta) { - struct ring_buffer_meta *meta = cpu_buffer->ring_meta; + struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta; meta->commit_buffer = (unsigned long)cpu_buffer->commit_page->page; } /* add barrier to keep gcc from optimizing too much */ @@ -5318,7 +5373,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) * moving it. The page before the header page has the * flag bit '1' set if it is pointing to the page we want. * but if the writer is in the process of moving it - * than it will be '2' or already moved '0'. + * then it will be '2' or already moved '0'. 
*/ ret = rb_head_page_replace(reader, cpu_buffer->reader_page); @@ -5963,7 +6018,7 @@ static void rb_update_meta_page(struct ring_buffer_per_cpu *cpu_buffer) meta->read = cpu_buffer->read; /* Some archs do not have data cache coherency between kernel and user-space */ - flush_dcache_folio(virt_to_folio(cpu_buffer->meta_page)); + flush_kernel_vmap_range(cpu_buffer->meta_page, PAGE_SIZE); } static void @@ -6016,7 +6071,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) if (cpu_buffer->mapped) { rb_update_meta_page(cpu_buffer); if (cpu_buffer->ring_meta) { - struct ring_buffer_meta *meta = cpu_buffer->ring_meta; + struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta; meta->commit_buffer = meta->head_buffer; } } @@ -6050,7 +6105,6 @@ static void reset_disabled_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; - struct ring_buffer_meta *meta; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return; @@ -6069,11 +6123,6 @@ void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu) atomic_dec(&cpu_buffer->record_disabled); atomic_dec(&cpu_buffer->resize_disabled); - /* Make sure persistent meta now uses this buffer's addresses */ - meta = rb_range_meta(buffer, 0, cpu_buffer->cpu); - if (meta) - rb_meta_init_text_addr(meta); - mutex_unlock(&buffer->mutex); } EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); @@ -6088,7 +6137,6 @@ EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); void ring_buffer_reset_online_cpus(struct trace_buffer *buffer) { struct ring_buffer_per_cpu *cpu_buffer; - struct ring_buffer_meta *meta; int cpu; /* prevent another thread from changing buffer sizes */ @@ -6116,11 +6164,6 @@ void ring_buffer_reset_online_cpus(struct trace_buffer *buffer) reset_disabled_cpu_buffer(cpu_buffer); - /* Make sure persistent meta now uses this buffer's addresses */ - meta = rb_range_meta(buffer, 0, cpu_buffer->cpu); - if (meta) - 
rb_meta_init_text_addr(meta); - atomic_dec(&cpu_buffer->record_disabled); atomic_sub(RESET_BIT, &cpu_buffer->resize_disabled); } @@ -7278,7 +7321,8 @@ consume: out: /* Some archs do not have data cache coherency between kernel and user-space */ - flush_dcache_folio(virt_to_folio(cpu_buffer->reader_page->page)); + flush_kernel_vmap_range(cpu_buffer->reader_page->page, + buffer->subbuf_size + BUF_PAGE_HDR_SIZE); rb_update_meta_page(cpu_buffer); @@ -7411,9 +7455,9 @@ static __init int rb_write_something(struct rb_test_data *data, bool nested) /* Ignore dropped events before test starts. */ if (started) { if (nested) - data->bytes_dropped += len; - else data->bytes_dropped_nested += len; + else + data->bytes_dropped += len; } return len; } diff --git a/kernel/trace/rv/Kconfig b/kernel/trace/rv/Kconfig index 8226352a0062..b39f36013ef2 100644 --- a/kernel/trace/rv/Kconfig +++ b/kernel/trace/rv/Kconfig @@ -27,6 +27,13 @@ menuconfig RV source "kernel/trace/rv/monitors/wip/Kconfig" source "kernel/trace/rv/monitors/wwnr/Kconfig" +source "kernel/trace/rv/monitors/sched/Kconfig" +source "kernel/trace/rv/monitors/tss/Kconfig" +source "kernel/trace/rv/monitors/sco/Kconfig" +source "kernel/trace/rv/monitors/snroc/Kconfig" +source "kernel/trace/rv/monitors/scpd/Kconfig" +source "kernel/trace/rv/monitors/snep/Kconfig" +source "kernel/trace/rv/monitors/sncid/Kconfig" # Add new monitors here config RV_REACTORS diff --git a/kernel/trace/rv/Makefile b/kernel/trace/rv/Makefile index 188b64668e1f..f9b2cd0483c3 100644 --- a/kernel/trace/rv/Makefile +++ b/kernel/trace/rv/Makefile @@ -5,6 +5,13 @@ ccflags-y += -I $(src) # needed for trace events obj-$(CONFIG_RV) += rv.o obj-$(CONFIG_RV_MON_WIP) += monitors/wip/wip.o obj-$(CONFIG_RV_MON_WWNR) += monitors/wwnr/wwnr.o +obj-$(CONFIG_RV_MON_SCHED) += monitors/sched/sched.o +obj-$(CONFIG_RV_MON_TSS) += monitors/tss/tss.o +obj-$(CONFIG_RV_MON_SCO) += monitors/sco/sco.o +obj-$(CONFIG_RV_MON_SNROC) += monitors/snroc/snroc.o 
+obj-$(CONFIG_RV_MON_SCPD) += monitors/scpd/scpd.o +obj-$(CONFIG_RV_MON_SNEP) += monitors/snep/snep.o +obj-$(CONFIG_RV_MON_SNCID) += monitors/sncid/sncid.o # Add new monitors here obj-$(CONFIG_RV_REACTORS) += rv_reactors.o obj-$(CONFIG_RV_REACT_PRINTK) += reactor_printk.o diff --git a/kernel/trace/rv/monitors/sched/Kconfig b/kernel/trace/rv/monitors/sched/Kconfig new file mode 100644 index 000000000000..ae3eb410abd7 --- /dev/null +++ b/kernel/trace/rv/monitors/sched/Kconfig @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +config RV_MON_SCHED + depends on RV + bool "sched monitor" + help + Collection of monitors to check the scheduler behaves according to specifications. + Enable this to enable all scheduler specification supported by the current kernel. + + For further information, see: + Documentation/trace/rv/monitor_sched.rst diff --git a/kernel/trace/rv/monitors/sched/sched.c b/kernel/trace/rv/monitors/sched/sched.c new file mode 100644 index 000000000000..905e03c3c934 --- /dev/null +++ b/kernel/trace/rv/monitors/sched/sched.c @@ -0,0 +1,38 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/rv.h> + +#define MODULE_NAME "sched" + +#include "sched.h" + +struct rv_monitor rv_sched; + +struct rv_monitor rv_sched = { + .name = "sched", + .description = "container for several scheduler monitor specifications.", + .enable = NULL, + .disable = NULL, + .reset = NULL, + .enabled = 0, +}; + +static int __init register_sched(void) +{ + rv_register_monitor(&rv_sched, NULL); + return 0; +} + +static void __exit unregister_sched(void) +{ + rv_unregister_monitor(&rv_sched); +} + +module_init(register_sched); +module_exit(unregister_sched); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>"); +MODULE_DESCRIPTION("sched: container for several scheduler monitor specifications."); diff --git a/kernel/trace/rv/monitors/sched/sched.h 
b/kernel/trace/rv/monitors/sched/sched.h new file mode 100644 index 000000000000..ba148dd8d48b --- /dev/null +++ b/kernel/trace/rv/monitors/sched/sched.h @@ -0,0 +1,3 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +extern struct rv_monitor rv_sched; diff --git a/kernel/trace/rv/monitors/sco/Kconfig b/kernel/trace/rv/monitors/sco/Kconfig new file mode 100644 index 000000000000..097c96cccdd7 --- /dev/null +++ b/kernel/trace/rv/monitors/sco/Kconfig @@ -0,0 +1,14 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +config RV_MON_SCO + depends on RV + depends on RV_MON_SCHED + default y + select DA_MON_EVENTS_IMPLICIT + bool "sco monitor" + help + Monitor to ensure sched_set_state happens only in thread context. + This monitor is part of the sched monitors collection. + + For further information, see: + Documentation/trace/rv/monitor_sched.rst diff --git a/kernel/trace/rv/monitors/sco/sco.c b/kernel/trace/rv/monitors/sco/sco.c new file mode 100644 index 000000000000..4cff59220bfc --- /dev/null +++ b/kernel/trace/rv/monitors/sco/sco.c @@ -0,0 +1,88 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/ftrace.h> +#include <linux/tracepoint.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/rv.h> +#include <rv/instrumentation.h> +#include <rv/da_monitor.h> + +#define MODULE_NAME "sco" + +#include <trace/events/sched.h> +#include <rv_trace.h> +#include <monitors/sched/sched.h> + +#include "sco.h" + +static struct rv_monitor rv_sco; +DECLARE_DA_MON_PER_CPU(sco, unsigned char); + +static void handle_sched_set_state(void *data, struct task_struct *tsk, int state) +{ + da_handle_start_event_sco(sched_set_state_sco); +} + +static void handle_schedule_entry(void *data, bool preempt, unsigned long ip) +{ + da_handle_event_sco(schedule_entry_sco); +} + +static void handle_schedule_exit(void *data, bool is_switch, unsigned long ip) +{ + da_handle_start_event_sco(schedule_exit_sco); +} + +static int enable_sco(void) +{ + int retval; + + 
retval = da_monitor_init_sco(); + if (retval) + return retval; + + rv_attach_trace_probe("sco", sched_set_state_tp, handle_sched_set_state); + rv_attach_trace_probe("sco", sched_entry_tp, handle_schedule_entry); + rv_attach_trace_probe("sco", sched_exit_tp, handle_schedule_exit); + + return 0; +} + +static void disable_sco(void) +{ + rv_sco.enabled = 0; + + rv_detach_trace_probe("sco", sched_set_state_tp, handle_sched_set_state); + rv_detach_trace_probe("sco", sched_entry_tp, handle_schedule_entry); + rv_detach_trace_probe("sco", sched_exit_tp, handle_schedule_exit); + + da_monitor_destroy_sco(); +} + +static struct rv_monitor rv_sco = { + .name = "sco", + .description = "scheduling context operations.", + .enable = enable_sco, + .disable = disable_sco, + .reset = da_monitor_reset_all_sco, + .enabled = 0, +}; + +static int __init register_sco(void) +{ + rv_register_monitor(&rv_sco, &rv_sched); + return 0; +} + +static void __exit unregister_sco(void) +{ + rv_unregister_monitor(&rv_sco); +} + +module_init(register_sco); +module_exit(unregister_sco); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>"); +MODULE_DESCRIPTION("sco: scheduling context operations."); diff --git a/kernel/trace/rv/monitors/sco/sco.h b/kernel/trace/rv/monitors/sco/sco.h new file mode 100644 index 000000000000..7a4c1f2d5ca1 --- /dev/null +++ b/kernel/trace/rv/monitors/sco/sco.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Automatically generated C representation of sco automaton + * For further information about this format, see kernel documentation: + * Documentation/trace/rv/deterministic_automata.rst + */ + +enum states_sco { + thread_context_sco = 0, + scheduling_context_sco, + state_max_sco +}; + +#define INVALID_STATE state_max_sco + +enum events_sco { + sched_set_state_sco = 0, + schedule_entry_sco, + schedule_exit_sco, + event_max_sco +}; + +struct automaton_sco { + char *state_names[state_max_sco]; + char *event_names[event_max_sco]; + 
unsigned char function[state_max_sco][event_max_sco]; + unsigned char initial_state; + bool final_states[state_max_sco]; +}; + +static const struct automaton_sco automaton_sco = { + .state_names = { + "thread_context", + "scheduling_context" + }, + .event_names = { + "sched_set_state", + "schedule_entry", + "schedule_exit" + }, + .function = { + { thread_context_sco, scheduling_context_sco, INVALID_STATE }, + { INVALID_STATE, INVALID_STATE, thread_context_sco }, + }, + .initial_state = thread_context_sco, + .final_states = { 1, 0 }, +}; diff --git a/kernel/trace/rv/monitors/sco/sco_trace.h b/kernel/trace/rv/monitors/sco/sco_trace.h new file mode 100644 index 000000000000..b711cd9024ec --- /dev/null +++ b/kernel/trace/rv/monitors/sco/sco_trace.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Snippet to be included in rv_trace.h + */ + +#ifdef CONFIG_RV_MON_SCO +DEFINE_EVENT(event_da_monitor, event_sco, + TP_PROTO(char *state, char *event, char *next_state, bool final_state), + TP_ARGS(state, event, next_state, final_state)); + +DEFINE_EVENT(error_da_monitor, error_sco, + TP_PROTO(char *state, char *event), + TP_ARGS(state, event)); +#endif /* CONFIG_RV_MON_SCO */ diff --git a/kernel/trace/rv/monitors/scpd/Kconfig b/kernel/trace/rv/monitors/scpd/Kconfig new file mode 100644 index 000000000000..b9114fbf680f --- /dev/null +++ b/kernel/trace/rv/monitors/scpd/Kconfig @@ -0,0 +1,15 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +config RV_MON_SCPD + depends on RV + depends on PREEMPT_TRACER + depends on RV_MON_SCHED + default y + select DA_MON_EVENTS_IMPLICIT + bool "scpd monitor" + help + Monitor to ensure schedule is called with preemption disabled. + This monitor is part of the sched monitors collection. 
+ + For further information, see: + Documentation/trace/rv/monitor_sched.rst diff --git a/kernel/trace/rv/monitors/scpd/scpd.c b/kernel/trace/rv/monitors/scpd/scpd.c new file mode 100644 index 000000000000..cbdd6a5f8d7f --- /dev/null +++ b/kernel/trace/rv/monitors/scpd/scpd.c @@ -0,0 +1,96 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/ftrace.h> +#include <linux/tracepoint.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/rv.h> +#include <rv/instrumentation.h> +#include <rv/da_monitor.h> + +#define MODULE_NAME "scpd" + +#include <trace/events/sched.h> +#include <trace/events/preemptirq.h> +#include <rv_trace.h> +#include <monitors/sched/sched.h> + +#include "scpd.h" + +static struct rv_monitor rv_scpd; +DECLARE_DA_MON_PER_CPU(scpd, unsigned char); + +static void handle_preempt_disable(void *data, unsigned long ip, unsigned long parent_ip) +{ + da_handle_event_scpd(preempt_disable_scpd); +} + +static void handle_preempt_enable(void *data, unsigned long ip, unsigned long parent_ip) +{ + da_handle_start_event_scpd(preempt_enable_scpd); +} + +static void handle_schedule_entry(void *data, bool preempt, unsigned long ip) +{ + da_handle_event_scpd(schedule_entry_scpd); +} + +static void handle_schedule_exit(void *data, bool is_switch, unsigned long ip) +{ + da_handle_event_scpd(schedule_exit_scpd); +} + +static int enable_scpd(void) +{ + int retval; + + retval = da_monitor_init_scpd(); + if (retval) + return retval; + + rv_attach_trace_probe("scpd", preempt_disable, handle_preempt_disable); + rv_attach_trace_probe("scpd", preempt_enable, handle_preempt_enable); + rv_attach_trace_probe("scpd", sched_entry_tp, handle_schedule_entry); + rv_attach_trace_probe("scpd", sched_exit_tp, handle_schedule_exit); + + return 0; +} + +static void disable_scpd(void) +{ + rv_scpd.enabled = 0; + + rv_detach_trace_probe("scpd", preempt_disable, handle_preempt_disable); + rv_detach_trace_probe("scpd", preempt_enable, 
handle_preempt_enable); + rv_detach_trace_probe("scpd", sched_entry_tp, handle_schedule_entry); + rv_detach_trace_probe("scpd", sched_exit_tp, handle_schedule_exit); + + da_monitor_destroy_scpd(); +} + +static struct rv_monitor rv_scpd = { + .name = "scpd", + .description = "schedule called with preemption disabled.", + .enable = enable_scpd, + .disable = disable_scpd, + .reset = da_monitor_reset_all_scpd, + .enabled = 0, +}; + +static int __init register_scpd(void) +{ + rv_register_monitor(&rv_scpd, &rv_sched); + return 0; +} + +static void __exit unregister_scpd(void) +{ + rv_unregister_monitor(&rv_scpd); +} + +module_init(register_scpd); +module_exit(unregister_scpd); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>"); +MODULE_DESCRIPTION("scpd: schedule called with preemption disabled."); diff --git a/kernel/trace/rv/monitors/scpd/scpd.h b/kernel/trace/rv/monitors/scpd/scpd.h new file mode 100644 index 000000000000..295f735a5811 --- /dev/null +++ b/kernel/trace/rv/monitors/scpd/scpd.h @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Automatically generated C representation of scpd automaton + * For further information about this format, see kernel documentation: + * Documentation/trace/rv/deterministic_automata.rst + */ + +enum states_scpd { + cant_sched_scpd = 0, + can_sched_scpd, + state_max_scpd +}; + +#define INVALID_STATE state_max_scpd + +enum events_scpd { + preempt_disable_scpd = 0, + preempt_enable_scpd, + schedule_entry_scpd, + schedule_exit_scpd, + event_max_scpd +}; + +struct automaton_scpd { + char *state_names[state_max_scpd]; + char *event_names[event_max_scpd]; + unsigned char function[state_max_scpd][event_max_scpd]; + unsigned char initial_state; + bool final_states[state_max_scpd]; +}; + +static const struct automaton_scpd automaton_scpd = { + .state_names = { + "cant_sched", + "can_sched" + }, + .event_names = { + "preempt_disable", + "preempt_enable", + "schedule_entry", + "schedule_exit" + }, + 
.function = { + { can_sched_scpd, INVALID_STATE, INVALID_STATE, INVALID_STATE }, + { INVALID_STATE, cant_sched_scpd, can_sched_scpd, can_sched_scpd }, + }, + .initial_state = cant_sched_scpd, + .final_states = { 1, 0 }, +}; diff --git a/kernel/trace/rv/monitors/scpd/scpd_trace.h b/kernel/trace/rv/monitors/scpd/scpd_trace.h new file mode 100644 index 000000000000..6b0f4aa4732e --- /dev/null +++ b/kernel/trace/rv/monitors/scpd/scpd_trace.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Snippet to be included in rv_trace.h + */ + +#ifdef CONFIG_RV_MON_SCPD +DEFINE_EVENT(event_da_monitor, event_scpd, + TP_PROTO(char *state, char *event, char *next_state, bool final_state), + TP_ARGS(state, event, next_state, final_state)); + +DEFINE_EVENT(error_da_monitor, error_scpd, + TP_PROTO(char *state, char *event), + TP_ARGS(state, event)); +#endif /* CONFIG_RV_MON_SCPD */ diff --git a/kernel/trace/rv/monitors/sncid/Kconfig b/kernel/trace/rv/monitors/sncid/Kconfig new file mode 100644 index 000000000000..76bcfef4fd10 --- /dev/null +++ b/kernel/trace/rv/monitors/sncid/Kconfig @@ -0,0 +1,15 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +config RV_MON_SNCID + depends on RV + depends on IRQSOFF_TRACER + depends on RV_MON_SCHED + default y + select DA_MON_EVENTS_IMPLICIT + bool "sncid monitor" + help + Monitor to ensure schedule is not called with interrupt disabled. + This monitor is part of the sched monitors collection. 
+ + For further information, see: + Documentation/trace/rv/monitor_sched.rst diff --git a/kernel/trace/rv/monitors/sncid/sncid.c b/kernel/trace/rv/monitors/sncid/sncid.c new file mode 100644 index 000000000000..f5037cd6214c --- /dev/null +++ b/kernel/trace/rv/monitors/sncid/sncid.c @@ -0,0 +1,96 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/ftrace.h> +#include <linux/tracepoint.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/rv.h> +#include <rv/instrumentation.h> +#include <rv/da_monitor.h> + +#define MODULE_NAME "sncid" + +#include <trace/events/sched.h> +#include <trace/events/preemptirq.h> +#include <rv_trace.h> +#include <monitors/sched/sched.h> + +#include "sncid.h" + +static struct rv_monitor rv_sncid; +DECLARE_DA_MON_PER_CPU(sncid, unsigned char); + +static void handle_irq_disable(void *data, unsigned long ip, unsigned long parent_ip) +{ + da_handle_event_sncid(irq_disable_sncid); +} + +static void handle_irq_enable(void *data, unsigned long ip, unsigned long parent_ip) +{ + da_handle_start_event_sncid(irq_enable_sncid); +} + +static void handle_schedule_entry(void *data, bool preempt, unsigned long ip) +{ + da_handle_start_event_sncid(schedule_entry_sncid); +} + +static void handle_schedule_exit(void *data, bool is_switch, unsigned long ip) +{ + da_handle_start_event_sncid(schedule_exit_sncid); +} + +static int enable_sncid(void) +{ + int retval; + + retval = da_monitor_init_sncid(); + if (retval) + return retval; + + rv_attach_trace_probe("sncid", irq_disable, handle_irq_disable); + rv_attach_trace_probe("sncid", irq_enable, handle_irq_enable); + rv_attach_trace_probe("sncid", sched_entry_tp, handle_schedule_entry); + rv_attach_trace_probe("sncid", sched_exit_tp, handle_schedule_exit); + + return 0; +} + +static void disable_sncid(void) +{ + rv_sncid.enabled = 0; + + rv_detach_trace_probe("sncid", irq_disable, handle_irq_disable); + rv_detach_trace_probe("sncid", irq_enable, handle_irq_enable); 
+ rv_detach_trace_probe("sncid", sched_entry_tp, handle_schedule_entry); + rv_detach_trace_probe("sncid", sched_exit_tp, handle_schedule_exit); + + da_monitor_destroy_sncid(); +} + +static struct rv_monitor rv_sncid = { + .name = "sncid", + .description = "schedule not called with interrupt disabled.", + .enable = enable_sncid, + .disable = disable_sncid, + .reset = da_monitor_reset_all_sncid, + .enabled = 0, +}; + +static int __init register_sncid(void) +{ + rv_register_monitor(&rv_sncid, &rv_sched); + return 0; +} + +static void __exit unregister_sncid(void) +{ + rv_unregister_monitor(&rv_sncid); +} + +module_init(register_sncid); +module_exit(unregister_sncid); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>"); +MODULE_DESCRIPTION("sncid: schedule not called with interrupt disabled."); diff --git a/kernel/trace/rv/monitors/sncid/sncid.h b/kernel/trace/rv/monitors/sncid/sncid.h new file mode 100644 index 000000000000..21304725142b --- /dev/null +++ b/kernel/trace/rv/monitors/sncid/sncid.h @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Automatically generated C representation of sncid automaton + * For further information about this format, see kernel documentation: + * Documentation/trace/rv/deterministic_automata.rst + */ + +enum states_sncid { + can_sched_sncid = 0, + cant_sched_sncid, + state_max_sncid +}; + +#define INVALID_STATE state_max_sncid + +enum events_sncid { + irq_disable_sncid = 0, + irq_enable_sncid, + schedule_entry_sncid, + schedule_exit_sncid, + event_max_sncid +}; + +struct automaton_sncid { + char *state_names[state_max_sncid]; + char *event_names[event_max_sncid]; + unsigned char function[state_max_sncid][event_max_sncid]; + unsigned char initial_state; + bool final_states[state_max_sncid]; +}; + +static const struct automaton_sncid automaton_sncid = { + .state_names = { + "can_sched", + "cant_sched" + }, + .event_names = { + "irq_disable", + "irq_enable", + "schedule_entry", + "schedule_exit" 
+ }, + .function = { + { cant_sched_sncid, INVALID_STATE, can_sched_sncid, can_sched_sncid }, + { INVALID_STATE, can_sched_sncid, INVALID_STATE, INVALID_STATE }, + }, + .initial_state = can_sched_sncid, + .final_states = { 1, 0 }, +}; diff --git a/kernel/trace/rv/monitors/sncid/sncid_trace.h b/kernel/trace/rv/monitors/sncid/sncid_trace.h new file mode 100644 index 000000000000..3ce42a57671d --- /dev/null +++ b/kernel/trace/rv/monitors/sncid/sncid_trace.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Snippet to be included in rv_trace.h + */ + +#ifdef CONFIG_RV_MON_SNCID +DEFINE_EVENT(event_da_monitor, event_sncid, + TP_PROTO(char *state, char *event, char *next_state, bool final_state), + TP_ARGS(state, event, next_state, final_state)); + +DEFINE_EVENT(error_da_monitor, error_sncid, + TP_PROTO(char *state, char *event), + TP_ARGS(state, event)); +#endif /* CONFIG_RV_MON_SNCID */ diff --git a/kernel/trace/rv/monitors/snep/Kconfig b/kernel/trace/rv/monitors/snep/Kconfig new file mode 100644 index 000000000000..77527f971232 --- /dev/null +++ b/kernel/trace/rv/monitors/snep/Kconfig @@ -0,0 +1,15 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +config RV_MON_SNEP + depends on RV + depends on PREEMPT_TRACER + depends on RV_MON_SCHED + default y + select DA_MON_EVENTS_IMPLICIT + bool "snep monitor" + help + Monitor to ensure schedule does not enable preempt. + This monitor is part of the sched monitors collection. 
+ + For further information, see: + Documentation/trace/rv/monitor_sched.rst diff --git a/kernel/trace/rv/monitors/snep/snep.c b/kernel/trace/rv/monitors/snep/snep.c new file mode 100644 index 000000000000..0076ba6d7ea4 --- /dev/null +++ b/kernel/trace/rv/monitors/snep/snep.c @@ -0,0 +1,96 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/ftrace.h> +#include <linux/tracepoint.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/rv.h> +#include <rv/instrumentation.h> +#include <rv/da_monitor.h> + +#define MODULE_NAME "snep" + +#include <trace/events/sched.h> +#include <trace/events/preemptirq.h> +#include <rv_trace.h> +#include <monitors/sched/sched.h> + +#include "snep.h" + +static struct rv_monitor rv_snep; +DECLARE_DA_MON_PER_CPU(snep, unsigned char); + +static void handle_preempt_disable(void *data, unsigned long ip, unsigned long parent_ip) +{ + da_handle_start_event_snep(preempt_disable_snep); +} + +static void handle_preempt_enable(void *data, unsigned long ip, unsigned long parent_ip) +{ + da_handle_start_event_snep(preempt_enable_snep); +} + +static void handle_schedule_entry(void *data, bool preempt, unsigned long ip) +{ + da_handle_event_snep(schedule_entry_snep); +} + +static void handle_schedule_exit(void *data, bool is_switch, unsigned long ip) +{ + da_handle_start_event_snep(schedule_exit_snep); +} + +static int enable_snep(void) +{ + int retval; + + retval = da_monitor_init_snep(); + if (retval) + return retval; + + rv_attach_trace_probe("snep", preempt_disable, handle_preempt_disable); + rv_attach_trace_probe("snep", preempt_enable, handle_preempt_enable); + rv_attach_trace_probe("snep", sched_entry_tp, handle_schedule_entry); + rv_attach_trace_probe("snep", sched_exit_tp, handle_schedule_exit); + + return 0; +} + +static void disable_snep(void) +{ + rv_snep.enabled = 0; + + rv_detach_trace_probe("snep", preempt_disable, handle_preempt_disable); + rv_detach_trace_probe("snep", preempt_enable, 
handle_preempt_enable); + rv_detach_trace_probe("snep", sched_entry_tp, handle_schedule_entry); + rv_detach_trace_probe("snep", sched_exit_tp, handle_schedule_exit); + + da_monitor_destroy_snep(); +} + +static struct rv_monitor rv_snep = { + .name = "snep", + .description = "schedule does not enable preempt.", + .enable = enable_snep, + .disable = disable_snep, + .reset = da_monitor_reset_all_snep, + .enabled = 0, +}; + +static int __init register_snep(void) +{ + rv_register_monitor(&rv_snep, &rv_sched); + return 0; +} + +static void __exit unregister_snep(void) +{ + rv_unregister_monitor(&rv_snep); +} + +module_init(register_snep); +module_exit(unregister_snep); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>"); +MODULE_DESCRIPTION("snep: schedule does not enable preempt."); diff --git a/kernel/trace/rv/monitors/snep/snep.h b/kernel/trace/rv/monitors/snep/snep.h new file mode 100644 index 000000000000..6d16b9ad931e --- /dev/null +++ b/kernel/trace/rv/monitors/snep/snep.h @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Automatically generated C representation of snep automaton + * For further information about this format, see kernel documentation: + * Documentation/trace/rv/deterministic_automata.rst + */ + +enum states_snep { + non_scheduling_context_snep = 0, + scheduling_contex_snep, + state_max_snep +}; + +#define INVALID_STATE state_max_snep + +enum events_snep { + preempt_disable_snep = 0, + preempt_enable_snep, + schedule_entry_snep, + schedule_exit_snep, + event_max_snep +}; + +struct automaton_snep { + char *state_names[state_max_snep]; + char *event_names[event_max_snep]; + unsigned char function[state_max_snep][event_max_snep]; + unsigned char initial_state; + bool final_states[state_max_snep]; +}; + +static const struct automaton_snep automaton_snep = { + .state_names = { + "non_scheduling_context", + "scheduling_contex" + }, + .event_names = { + "preempt_disable", + "preempt_enable", + "schedule_entry", 
+ "schedule_exit" + }, + .function = { + { non_scheduling_context_snep, non_scheduling_context_snep, scheduling_contex_snep, INVALID_STATE }, + { INVALID_STATE, INVALID_STATE, INVALID_STATE, non_scheduling_context_snep }, + }, + .initial_state = non_scheduling_context_snep, + .final_states = { 1, 0 }, +}; diff --git a/kernel/trace/rv/monitors/snep/snep_trace.h b/kernel/trace/rv/monitors/snep/snep_trace.h new file mode 100644 index 000000000000..01aad49a949a --- /dev/null +++ b/kernel/trace/rv/monitors/snep/snep_trace.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Snippet to be included in rv_trace.h + */ + +#ifdef CONFIG_RV_MON_SNEP +DEFINE_EVENT(event_da_monitor, event_snep, + TP_PROTO(char *state, char *event, char *next_state, bool final_state), + TP_ARGS(state, event, next_state, final_state)); + +DEFINE_EVENT(error_da_monitor, error_snep, + TP_PROTO(char *state, char *event), + TP_ARGS(state, event)); +#endif /* CONFIG_RV_MON_SNEP */ diff --git a/kernel/trace/rv/monitors/snroc/Kconfig b/kernel/trace/rv/monitors/snroc/Kconfig new file mode 100644 index 000000000000..6e4365a2fea3 --- /dev/null +++ b/kernel/trace/rv/monitors/snroc/Kconfig @@ -0,0 +1,14 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +config RV_MON_SNROC + depends on RV + depends on RV_MON_SCHED + default y + select DA_MON_EVENTS_ID + bool "snroc monitor" + help + Monitor to ensure sched_set_state happens only in the respective task's context. + This monitor is part of the sched monitors collection. 
+ + For further information, see: + Documentation/trace/rv/monitor_sched.rst diff --git a/kernel/trace/rv/monitors/snroc/snroc.c b/kernel/trace/rv/monitors/snroc/snroc.c new file mode 100644 index 000000000000..bb1f60d55296 --- /dev/null +++ b/kernel/trace/rv/monitors/snroc/snroc.c @@ -0,0 +1,85 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/ftrace.h> +#include <linux/tracepoint.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/rv.h> +#include <rv/instrumentation.h> +#include <rv/da_monitor.h> + +#define MODULE_NAME "snroc" + +#include <trace/events/sched.h> +#include <rv_trace.h> +#include <monitors/sched/sched.h> + +#include "snroc.h" + +static struct rv_monitor rv_snroc; +DECLARE_DA_MON_PER_TASK(snroc, unsigned char); + +static void handle_sched_set_state(void *data, struct task_struct *tsk, int state) +{ + da_handle_event_snroc(tsk, sched_set_state_snroc); +} + +static void handle_sched_switch(void *data, bool preempt, + struct task_struct *prev, + struct task_struct *next, + unsigned int prev_state) +{ + da_handle_start_event_snroc(prev, sched_switch_out_snroc); + da_handle_event_snroc(next, sched_switch_in_snroc); +} + +static int enable_snroc(void) +{ + int retval; + + retval = da_monitor_init_snroc(); + if (retval) + return retval; + + rv_attach_trace_probe("snroc", sched_set_state_tp, handle_sched_set_state); + rv_attach_trace_probe("snroc", sched_switch, handle_sched_switch); + + return 0; +} + +static void disable_snroc(void) +{ + rv_snroc.enabled = 0; + + rv_detach_trace_probe("snroc", sched_set_state_tp, handle_sched_set_state); + rv_detach_trace_probe("snroc", sched_switch, handle_sched_switch); + + da_monitor_destroy_snroc(); +} + +static struct rv_monitor rv_snroc = { + .name = "snroc", + .description = "set non runnable on its own context.", + .enable = enable_snroc, + .disable = disable_snroc, + .reset = da_monitor_reset_all_snroc, + .enabled = 0, +}; + +static int __init 
register_snroc(void) +{ + rv_register_monitor(&rv_snroc, &rv_sched); + return 0; +} + +static void __exit unregister_snroc(void) +{ + rv_unregister_monitor(&rv_snroc); +} + +module_init(register_snroc); +module_exit(unregister_snroc); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>"); +MODULE_DESCRIPTION("snroc: set non runnable on its own context."); diff --git a/kernel/trace/rv/monitors/snroc/snroc.h b/kernel/trace/rv/monitors/snroc/snroc.h new file mode 100644 index 000000000000..c3650a2b1b10 --- /dev/null +++ b/kernel/trace/rv/monitors/snroc/snroc.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Automatically generated C representation of snroc automaton + * For further information about this format, see kernel documentation: + * Documentation/trace/rv/deterministic_automata.rst + */ + +enum states_snroc { + other_context_snroc = 0, + own_context_snroc, + state_max_snroc +}; + +#define INVALID_STATE state_max_snroc + +enum events_snroc { + sched_set_state_snroc = 0, + sched_switch_in_snroc, + sched_switch_out_snroc, + event_max_snroc +}; + +struct automaton_snroc { + char *state_names[state_max_snroc]; + char *event_names[event_max_snroc]; + unsigned char function[state_max_snroc][event_max_snroc]; + unsigned char initial_state; + bool final_states[state_max_snroc]; +}; + +static const struct automaton_snroc automaton_snroc = { + .state_names = { + "other_context", + "own_context" + }, + .event_names = { + "sched_set_state", + "sched_switch_in", + "sched_switch_out" + }, + .function = { + { INVALID_STATE, own_context_snroc, INVALID_STATE }, + { own_context_snroc, INVALID_STATE, other_context_snroc }, + }, + .initial_state = other_context_snroc, + .final_states = { 1, 0 }, +}; diff --git a/kernel/trace/rv/monitors/snroc/snroc_trace.h b/kernel/trace/rv/monitors/snroc/snroc_trace.h new file mode 100644 index 000000000000..50114cef5122 --- /dev/null +++ b/kernel/trace/rv/monitors/snroc/snroc_trace.h @@ -0,0 +1,15 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Snippet to be included in rv_trace.h + */ + +#ifdef CONFIG_RV_MON_SNROC +DEFINE_EVENT(event_da_monitor_id, event_snroc, + TP_PROTO(int id, char *state, char *event, char *next_state, bool final_state), + TP_ARGS(id, state, event, next_state, final_state)); + +DEFINE_EVENT(error_da_monitor_id, error_snroc, + TP_PROTO(int id, char *state, char *event), + TP_ARGS(id, state, event)); +#endif /* CONFIG_RV_MON_SNROC */ diff --git a/kernel/trace/rv/monitors/tss/Kconfig b/kernel/trace/rv/monitors/tss/Kconfig new file mode 100644 index 000000000000..479f86f52e60 --- /dev/null +++ b/kernel/trace/rv/monitors/tss/Kconfig @@ -0,0 +1,14 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +config RV_MON_TSS + depends on RV + depends on RV_MON_SCHED + default y + select DA_MON_EVENTS_IMPLICIT + bool "tss monitor" + help + Monitor to ensure sched_switch happens only in scheduling context. + This monitor is part of the sched monitors collection. + + For further information, see: + Documentation/trace/rv/monitor_sched.rst diff --git a/kernel/trace/rv/monitors/tss/tss.c b/kernel/trace/rv/monitors/tss/tss.c new file mode 100644 index 000000000000..542787e6524f --- /dev/null +++ b/kernel/trace/rv/monitors/tss/tss.c @@ -0,0 +1,91 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/ftrace.h> +#include <linux/tracepoint.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/rv.h> +#include <rv/instrumentation.h> +#include <rv/da_monitor.h> + +#define MODULE_NAME "tss" + +#include <trace/events/sched.h> +#include <rv_trace.h> +#include <monitors/sched/sched.h> + +#include "tss.h" + +static struct rv_monitor rv_tss; +DECLARE_DA_MON_PER_CPU(tss, unsigned char); + +static void handle_sched_switch(void *data, bool preempt, + struct task_struct *prev, + struct task_struct *next, + unsigned int prev_state) +{ + da_handle_event_tss(sched_switch_tss); +} + +static void handle_schedule_entry(void *data, 
bool preempt, unsigned long ip) +{ + da_handle_event_tss(schedule_entry_tss); +} + +static void handle_schedule_exit(void *data, bool is_switch, unsigned long ip) +{ + da_handle_start_event_tss(schedule_exit_tss); +} + +static int enable_tss(void) +{ + int retval; + + retval = da_monitor_init_tss(); + if (retval) + return retval; + + rv_attach_trace_probe("tss", sched_switch, handle_sched_switch); + rv_attach_trace_probe("tss", sched_entry_tp, handle_schedule_entry); + rv_attach_trace_probe("tss", sched_exit_tp, handle_schedule_exit); + + return 0; +} + +static void disable_tss(void) +{ + rv_tss.enabled = 0; + + rv_detach_trace_probe("tss", sched_switch, handle_sched_switch); + rv_detach_trace_probe("tss", sched_entry_tp, handle_schedule_entry); + rv_detach_trace_probe("tss", sched_exit_tp, handle_schedule_exit); + + da_monitor_destroy_tss(); +} + +static struct rv_monitor rv_tss = { + .name = "tss", + .description = "task switch while scheduling.", + .enable = enable_tss, + .disable = disable_tss, + .reset = da_monitor_reset_all_tss, + .enabled = 0, +}; + +static int __init register_tss(void) +{ + rv_register_monitor(&rv_tss, &rv_sched); + return 0; +} + +static void __exit unregister_tss(void) +{ + rv_unregister_monitor(&rv_tss); +} + +module_init(register_tss); +module_exit(unregister_tss); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>"); +MODULE_DESCRIPTION("tss: task switch while scheduling."); diff --git a/kernel/trace/rv/monitors/tss/tss.h b/kernel/trace/rv/monitors/tss/tss.h new file mode 100644 index 000000000000..f0a36fda1b87 --- /dev/null +++ b/kernel/trace/rv/monitors/tss/tss.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Automatically generated C representation of tss automaton + * For further information about this format, see kernel documentation: + * Documentation/trace/rv/deterministic_automata.rst + */ + +enum states_tss { + thread_tss = 0, + sched_tss, + state_max_tss +}; + +#define 
INVALID_STATE state_max_tss + +enum events_tss { + sched_switch_tss = 0, + schedule_entry_tss, + schedule_exit_tss, + event_max_tss +}; + +struct automaton_tss { + char *state_names[state_max_tss]; + char *event_names[event_max_tss]; + unsigned char function[state_max_tss][event_max_tss]; + unsigned char initial_state; + bool final_states[state_max_tss]; +}; + +static const struct automaton_tss automaton_tss = { + .state_names = { + "thread", + "sched" + }, + .event_names = { + "sched_switch", + "schedule_entry", + "schedule_exit" + }, + .function = { + { INVALID_STATE, sched_tss, INVALID_STATE }, + { sched_tss, INVALID_STATE, thread_tss }, + }, + .initial_state = thread_tss, + .final_states = { 1, 0 }, +}; diff --git a/kernel/trace/rv/monitors/tss/tss_trace.h b/kernel/trace/rv/monitors/tss/tss_trace.h new file mode 100644 index 000000000000..4619dbb50cc0 --- /dev/null +++ b/kernel/trace/rv/monitors/tss/tss_trace.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Snippet to be included in rv_trace.h + */ + +#ifdef CONFIG_RV_MON_TSS +DEFINE_EVENT(event_da_monitor, event_tss, + TP_PROTO(char *state, char *event, char *next_state, bool final_state), + TP_ARGS(state, event, next_state, final_state)); + +DEFINE_EVENT(error_da_monitor, error_tss, + TP_PROTO(char *state, char *event), + TP_ARGS(state, event)); +#endif /* CONFIG_RV_MON_TSS */ diff --git a/kernel/trace/rv/monitors/wip/Kconfig b/kernel/trace/rv/monitors/wip/Kconfig index 3ef664b5cd90..e464b9294865 100644 --- a/kernel/trace/rv/monitors/wip/Kconfig +++ b/kernel/trace/rv/monitors/wip/Kconfig @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0-only +# config RV_MON_WIP depends on RV depends on PREEMPT_TRACER diff --git a/kernel/trace/rv/monitors/wip/wip.c b/kernel/trace/rv/monitors/wip/wip.c index db7389157c87..ed758fec8608 100644 --- a/kernel/trace/rv/monitors/wip/wip.c +++ b/kernel/trace/rv/monitors/wip/wip.c @@ -71,7 +71,7 @@ static struct rv_monitor rv_wip = { static int __init 
register_wip(void) { - rv_register_monitor(&rv_wip); + rv_register_monitor(&rv_wip, NULL); return 0; } diff --git a/kernel/trace/rv/monitors/wip/wip.h b/kernel/trace/rv/monitors/wip/wip.h index 2e373f2c65ed..c7193748bf36 100644 --- a/kernel/trace/rv/monitors/wip/wip.h +++ b/kernel/trace/rv/monitors/wip/wip.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Automatically generated C representation of wip automaton * For further information about this format, see kernel documentation: diff --git a/kernel/trace/rv/monitors/wwnr/Kconfig b/kernel/trace/rv/monitors/wwnr/Kconfig index ee741aa6d6b8..d3bfc20037db 100644 --- a/kernel/trace/rv/monitors/wwnr/Kconfig +++ b/kernel/trace/rv/monitors/wwnr/Kconfig @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0-only +# config RV_MON_WWNR depends on RV select DA_MON_EVENTS_ID diff --git a/kernel/trace/rv/monitors/wwnr/wwnr.c b/kernel/trace/rv/monitors/wwnr/wwnr.c index 3b16994a9984..172f31c4b0f3 100644 --- a/kernel/trace/rv/monitors/wwnr/wwnr.c +++ b/kernel/trace/rv/monitors/wwnr/wwnr.c @@ -70,7 +70,7 @@ static struct rv_monitor rv_wwnr = { static int __init register_wwnr(void) { - rv_register_monitor(&rv_wwnr); + rv_register_monitor(&rv_wwnr, NULL); return 0; } diff --git a/kernel/trace/rv/monitors/wwnr/wwnr.h b/kernel/trace/rv/monitors/wwnr/wwnr.h index d0d9c4b8121b..0a59d23edf61 100644 --- a/kernel/trace/rv/monitors/wwnr/wwnr.h +++ b/kernel/trace/rv/monitors/wwnr/wwnr.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Automatically generated C representation of wwnr automaton * For further information about this format, see kernel documentation: diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c index 8657fc8806e7..e4077500a91d 100644 --- a/kernel/trace/rv/rv.c +++ b/kernel/trace/rv/rv.c @@ -162,7 +162,7 @@ struct dentry *get_monitors_root(void) /* * Interface for the monitor register. 
*/ -static LIST_HEAD(rv_monitors_list); +LIST_HEAD(rv_monitors_list); static int task_monitor_count; static bool task_monitor_slots[RV_PER_TASK_MONITORS]; @@ -207,6 +207,35 @@ void rv_put_task_monitor_slot(int slot) } /* + * Monitors with a parent are nested, + * Monitors without a parent could be standalone or containers. + */ +bool rv_is_nested_monitor(struct rv_monitor_def *mdef) +{ + return mdef->parent != NULL; +} + +/* + * We set our list to have nested monitors listed after their parent + * if a monitor has a child element its a container. + * Containers can be also identified based on their function pointers: + * as they are not real monitors they do not need function definitions + * for enable()/disable(). Use this condition to find empty containers. + * Keep both conditions in case we have some non-compliant containers. + */ +bool rv_is_container_monitor(struct rv_monitor_def *mdef) +{ + struct rv_monitor_def *next; + + if (list_is_last(&mdef->list, &rv_monitors_list)) + return false; + + next = list_next_entry(mdef, list); + + return next->parent == mdef->monitor || !mdef->monitor->enable; +} + +/* * This section collects the monitor/ files and folders. */ static ssize_t monitor_enable_read_data(struct file *filp, char __user *user_buf, size_t count, @@ -229,7 +258,8 @@ static int __rv_disable_monitor(struct rv_monitor_def *mdef, bool sync) if (mdef->monitor->enabled) { mdef->monitor->enabled = 0; - mdef->monitor->disable(); + if (mdef->monitor->disable) + mdef->monitor->disable(); /* * Wait for the execution of all events to finish. 
@@ -243,6 +273,60 @@ static int __rv_disable_monitor(struct rv_monitor_def *mdef, bool sync) return 0; } +static void rv_disable_single(struct rv_monitor_def *mdef) +{ + __rv_disable_monitor(mdef, true); +} + +static int rv_enable_single(struct rv_monitor_def *mdef) +{ + int retval; + + lockdep_assert_held(&rv_interface_lock); + + if (mdef->monitor->enabled) + return 0; + + retval = mdef->monitor->enable(); + + if (!retval) + mdef->monitor->enabled = 1; + + return retval; +} + +static void rv_disable_container(struct rv_monitor_def *mdef) +{ + struct rv_monitor_def *p = mdef; + int enabled = 0; + + list_for_each_entry_continue(p, &rv_monitors_list, list) { + if (p->parent != mdef->monitor) + break; + enabled += __rv_disable_monitor(p, false); + } + if (enabled) + tracepoint_synchronize_unregister(); + mdef->monitor->enabled = 0; +} + +static int rv_enable_container(struct rv_monitor_def *mdef) +{ + struct rv_monitor_def *p = mdef; + int retval = 0; + + list_for_each_entry_continue(p, &rv_monitors_list, list) { + if (retval || p->parent != mdef->monitor) + break; + retval = rv_enable_single(p); + } + if (retval) + rv_disable_container(mdef); + else + mdef->monitor->enabled = 1; + return retval; +} + /** * rv_disable_monitor - disable a given runtime monitor * @mdef: Pointer to the monitor definition structure. 
@@ -251,7 +335,11 @@ static int __rv_disable_monitor(struct rv_monitor_def *mdef, bool sync) */ int rv_disable_monitor(struct rv_monitor_def *mdef) { - __rv_disable_monitor(mdef, true); + if (rv_is_container_monitor(mdef)) + rv_disable_container(mdef); + else + rv_disable_single(mdef); + return 0; } @@ -265,15 +353,10 @@ int rv_enable_monitor(struct rv_monitor_def *mdef) { int retval; - lockdep_assert_held(&rv_interface_lock); - - if (mdef->monitor->enabled) - return 0; - - retval = mdef->monitor->enable(); - - if (!retval) - mdef->monitor->enabled = 1; + if (rv_is_container_monitor(mdef)) + retval = rv_enable_container(mdef); + else + retval = rv_enable_single(mdef); return retval; } @@ -336,9 +419,9 @@ static const struct file_operations interface_desc_fops = { * the monitor dir, where the specific options of the monitor * are exposed. */ -static int create_monitor_dir(struct rv_monitor_def *mdef) +static int create_monitor_dir(struct rv_monitor_def *mdef, struct rv_monitor_def *parent) { - struct dentry *root = get_monitors_root(); + struct dentry *root = parent ? 
parent->root_d : get_monitors_root(); const char *name = mdef->monitor->name; struct dentry *tmp; int retval; @@ -377,7 +460,11 @@ static int monitors_show(struct seq_file *m, void *p) { struct rv_monitor_def *mon_def = p; - seq_printf(m, "%s\n", mon_def->monitor->name); + if (mon_def->parent) + seq_printf(m, "%s:%s\n", mon_def->parent->name, + mon_def->monitor->name); + else + seq_printf(m, "%s\n", mon_def->monitor->name); return 0; } @@ -514,7 +601,7 @@ static ssize_t enabled_monitors_write(struct file *filp, const char __user *user struct rv_monitor_def *mdef; int retval = -EINVAL; bool enable = true; - char *ptr; + char *ptr, *tmp; int len; if (count < 1 || count > MAX_RV_MONITOR_NAME_SIZE + 1) @@ -541,6 +628,11 @@ static ssize_t enabled_monitors_write(struct file *filp, const char __user *user retval = -EINVAL; + /* we support 1 nesting level, trim the parent */ + tmp = strstr(ptr, ":"); + if (tmp) + ptr = tmp+1; + list_for_each_entry(mdef, &rv_monitors_list, list) { if (strcmp(ptr, mdef->monitor->name) != 0) continue; @@ -613,7 +705,7 @@ static void reset_all_monitors(void) struct rv_monitor_def *mdef; list_for_each_entry(mdef, &rv_monitors_list, list) { - if (mdef->monitor->enabled) + if (mdef->monitor->enabled && mdef->monitor->reset) mdef->monitor->reset(); } } @@ -685,18 +777,19 @@ static void destroy_monitor_dir(struct rv_monitor_def *mdef) /** * rv_register_monitor - register a rv monitor. * @monitor: The rv_monitor to be registered. + * @parent: The parent of the monitor to be registered, NULL if not nested. * * Returns 0 if successful, error otherwise. 
*/ -int rv_register_monitor(struct rv_monitor *monitor) +int rv_register_monitor(struct rv_monitor *monitor, struct rv_monitor *parent) { - struct rv_monitor_def *r; + struct rv_monitor_def *r, *p = NULL; int retval = 0; if (strlen(monitor->name) >= MAX_RV_MONITOR_NAME_SIZE) { pr_info("Monitor %s has a name longer than %d\n", monitor->name, MAX_RV_MONITOR_NAME_SIZE); - return -1; + return -EINVAL; } mutex_lock(&rv_interface_lock); @@ -704,11 +797,27 @@ int rv_register_monitor(struct rv_monitor *monitor) list_for_each_entry(r, &rv_monitors_list, list) { if (strcmp(monitor->name, r->monitor->name) == 0) { pr_info("Monitor %s is already registered\n", monitor->name); - retval = -1; + retval = -EEXIST; goto out_unlock; } } + if (parent) { + list_for_each_entry(r, &rv_monitors_list, list) { + if (strcmp(parent->name, r->monitor->name) == 0) { + p = r; + break; + } + } + } + + if (p && rv_is_nested_monitor(p)) { + pr_info("Parent monitor %s is already nested, cannot nest further\n", + parent->name); + retval = -EINVAL; + goto out_unlock; + } + r = kzalloc(sizeof(struct rv_monitor_def), GFP_KERNEL); if (!r) { retval = -ENOMEM; @@ -716,14 +825,19 @@ int rv_register_monitor(struct rv_monitor *monitor) } r->monitor = monitor; + r->parent = parent; - retval = create_monitor_dir(r); + retval = create_monitor_dir(r, p); if (retval) { kfree(r); goto out_unlock; } - list_add_tail(&r->list, &rv_monitors_list); + /* keep children close to the parent for easier visualisation */ + if (p) + list_add(&r->list, &p->list); + else + list_add_tail(&r->list, &rv_monitors_list); out_unlock: mutex_unlock(&rv_interface_lock); diff --git a/kernel/trace/rv/rv.h b/kernel/trace/rv/rv.h index db6cb0913dbd..98fca0a1adbc 100644 --- a/kernel/trace/rv/rv.h +++ b/kernel/trace/rv/rv.h @@ -21,6 +21,7 @@ struct rv_interface { #define MAX_RV_REACTOR_NAME_SIZE 32 extern struct mutex rv_interface_lock; +extern struct list_head rv_monitors_list; #ifdef CONFIG_RV_REACTORS struct rv_reactor_def { @@ -34,6 +35,7 
@@ struct rv_reactor_def { struct rv_monitor_def { struct list_head list; struct rv_monitor *monitor; + struct rv_monitor *parent; struct dentry *root_d; #ifdef CONFIG_RV_REACTORS struct rv_reactor_def *rdef; @@ -45,6 +47,8 @@ struct rv_monitor_def { struct dentry *get_monitors_root(void); int rv_disable_monitor(struct rv_monitor_def *mdef); int rv_enable_monitor(struct rv_monitor_def *mdef); +bool rv_is_container_monitor(struct rv_monitor_def *mdef); +bool rv_is_nested_monitor(struct rv_monitor_def *mdef); #ifdef CONFIG_RV_REACTORS int reactor_populate_monitor(struct rv_monitor_def *mdef); diff --git a/kernel/trace/rv/rv_reactors.c b/kernel/trace/rv/rv_reactors.c index 7b49cbe388d4..9501ca886d83 100644 --- a/kernel/trace/rv/rv_reactors.c +++ b/kernel/trace/rv/rv_reactors.c @@ -158,8 +158,9 @@ static const struct seq_operations monitor_reactors_seq_ops = { .show = monitor_reactor_show }; -static void monitor_swap_reactors(struct rv_monitor_def *mdef, struct rv_reactor_def *rdef, - bool reacting) +static void monitor_swap_reactors_single(struct rv_monitor_def *mdef, + struct rv_reactor_def *rdef, + bool reacting, bool nested) { bool monitor_enabled; @@ -179,10 +180,31 @@ static void monitor_swap_reactors(struct rv_monitor_def *mdef, struct rv_reactor mdef->reacting = reacting; mdef->monitor->react = rdef->reactor->react; - if (monitor_enabled) + /* enable only once if iterating through a container */ + if (monitor_enabled && !nested) rv_enable_monitor(mdef); } +static void monitor_swap_reactors(struct rv_monitor_def *mdef, + struct rv_reactor_def *rdef, bool reacting) +{ + struct rv_monitor_def *p = mdef; + + if (rv_is_container_monitor(mdef)) + list_for_each_entry_continue(p, &rv_monitors_list, list) { + if (p->parent != mdef->monitor) + break; + monitor_swap_reactors_single(p, rdef, reacting, true); + } + /* + * This call enables and disables the monitor if they were active. + * In case of a container, we already disabled all and will enable all. 
+ * All nested monitors are enabled also if they were off, we may refine + * this logic in the future. + */ + monitor_swap_reactors_single(mdef, rdef, reacting, false); +} + static ssize_t monitor_reactors_write(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) diff --git a/kernel/trace/rv/rv_trace.h b/kernel/trace/rv/rv_trace.h index 5e65097423ba..422b75f58891 100644 --- a/kernel/trace/rv/rv_trace.h +++ b/kernel/trace/rv/rv_trace.h @@ -58,6 +58,11 @@ DECLARE_EVENT_CLASS(error_da_monitor, ); #include <monitors/wip/wip_trace.h> +#include <monitors/tss/tss_trace.h> +#include <monitors/sco/sco_trace.h> +#include <monitors/scpd/scpd_trace.h> +#include <monitors/snep/snep_trace.h> +#include <monitors/sncid/sncid_trace.h> // Add new monitors based on CONFIG_DA_MON_EVENTS_IMPLICIT here #endif /* CONFIG_DA_MON_EVENTS_IMPLICIT */ @@ -118,6 +123,7 @@ DECLARE_EVENT_CLASS(error_da_monitor_id, ); #include <monitors/wwnr/wwnr_trace.h> +#include <monitors/snroc/snroc_trace.h> // Add new monitors based on CONFIG_DA_MON_EVENTS_ID here #endif /* CONFIG_DA_MON_EVENTS_ID */ diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index fd3cb2b2ab82..d3459e715fbc 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -49,6 +49,8 @@ #include <linux/fsnotify.h> #include <linux/irq_work.h> #include <linux/workqueue.h> +#include <linux/sort.h> +#include <linux/io.h> /* vmap_page_range() */ #include <asm/setup.h> /* COMMAND_LINE_SIZE */ @@ -87,6 +89,7 @@ void __init disable_tracing_selftest(const char *reason) static struct trace_iterator *tracepoint_print_iter; int tracepoint_printk; static bool tracepoint_printk_stop_on_boot __initdata; +static bool traceoff_after_boot __initdata; static DEFINE_STATIC_KEY_FALSE(tracepoint_printk_key); /* For tracers that don't implement custom flags */ @@ -117,6 +120,7 @@ static int tracing_disabled = 1; cpumask_var_t __read_mostly tracing_buffer_mask; +#define MAX_TRACER_SIZE 100 /* * ftrace_dump_on_oops - variable to 
dump ftrace buffer on oops * @@ -139,7 +143,40 @@ cpumask_var_t __read_mostly tracing_buffer_mask; char ftrace_dump_on_oops[MAX_TRACER_SIZE] = "0"; /* When set, tracing will stop when a WARN*() is hit */ -int __disable_trace_on_warning; +static int __disable_trace_on_warning; + +int tracepoint_printk_sysctl(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); +static const struct ctl_table trace_sysctl_table[] = { + { + .procname = "ftrace_dump_on_oops", + .data = &ftrace_dump_on_oops, + .maxlen = MAX_TRACER_SIZE, + .mode = 0644, + .proc_handler = proc_dostring, + }, + { + .procname = "traceoff_on_warning", + .data = &__disable_trace_on_warning, + .maxlen = sizeof(__disable_trace_on_warning), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "tracepoint_printk", + .data = &tracepoint_printk, + .maxlen = sizeof(tracepoint_printk), + .mode = 0644, + .proc_handler = tracepoint_printk_sysctl, + }, +}; + +static int __init init_trace_sysctls(void) +{ + register_sysctl_init("kernel", trace_sysctl_table); + return 0; +} +subsys_initcall(init_trace_sysctls); #ifdef CONFIG_TRACE_EVAL_MAP_FILE /* Map of enums to their values, for "eval_map" file */ @@ -330,6 +367,13 @@ static int __init set_tracepoint_printk_stop(char *str) } __setup("tp_printk_stop_on_boot", set_tracepoint_printk_stop); +static int __init set_traceoff_after_boot(char *str) +{ + traceoff_after_boot = true; + return 1; +} +__setup("traceoff_after_boot", set_traceoff_after_boot); + unsigned long long ns2usecs(u64 nsec) { nsec += 500; @@ -2878,13 +2922,16 @@ trace_buffer_unlock_commit_nostack(struct trace_buffer *buffer, void trace_function(struct trace_array *tr, unsigned long ip, unsigned long - parent_ip, unsigned int trace_ctx) + parent_ip, unsigned int trace_ctx, struct ftrace_regs *fregs) { struct trace_buffer *buffer = tr->array_buffer.buffer; struct ring_buffer_event *event; struct ftrace_entry *entry; + int size = sizeof(*entry); + + size += 
FTRACE_REGS_MAX_ARGS * !!fregs * sizeof(long); - event = __trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry), + event = __trace_buffer_lock_reserve(buffer, TRACE_FN, size, trace_ctx); if (!event) return; @@ -2892,6 +2939,13 @@ trace_function(struct trace_array *tr, unsigned long ip, unsigned long entry->ip = ip; entry->parent_ip = parent_ip; +#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API + if (fregs) { + for (int i = 0; i < FTRACE_REGS_MAX_ARGS; i++) + entry->args[i] = ftrace_regs_get_argument(fregs, i); + } +#endif + if (static_branch_unlikely(&trace_function_exports_enabled)) ftrace_exports(event, TRACE_EXPORT_FUNCTION); __buffer_unlock_commit(buffer, event); @@ -3322,10 +3376,9 @@ out_nobuffer: } EXPORT_SYMBOL_GPL(trace_vbprintk); -__printf(3, 0) -static int -__trace_array_vprintk(struct trace_buffer *buffer, - unsigned long ip, const char *fmt, va_list args) +static __printf(3, 0) +int __trace_array_vprintk(struct trace_buffer *buffer, + unsigned long ip, const char *fmt, va_list args) { struct ring_buffer_event *event; int len = 0, size; @@ -3375,7 +3428,6 @@ out_nobuffer: return len; } -__printf(3, 0) int trace_array_vprintk(struct trace_array *tr, unsigned long ip, const char *fmt, va_list args) { @@ -3405,7 +3457,6 @@ int trace_array_vprintk(struct trace_array *tr, * Note, trace_array_init_printk() must be called on @tr before this * can be used. */ -__printf(3, 0) int trace_array_printk(struct trace_array *tr, unsigned long ip, const char *fmt, ...) { @@ -3450,7 +3501,6 @@ int trace_array_init_printk(struct trace_array *tr) } EXPORT_SYMBOL_GPL(trace_array_init_printk); -__printf(3, 4) int trace_array_printk_buf(struct trace_buffer *buffer, unsigned long ip, const char *fmt, ...) 
{ @@ -3466,7 +3516,6 @@ int trace_array_printk_buf(struct trace_buffer *buffer, return ret; } -__printf(2, 0) int trace_vprintk(unsigned long ip, const char *fmt, va_list args) { return trace_array_vprintk(printk_trace, ip, fmt, args); @@ -4188,7 +4237,7 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) * safe to use if the array has delta offsets * Force printing via the fields. */ - if ((tr->text_delta || tr->data_delta) && + if ((tr->text_delta) && event->type > __TRACE_LAST_TYPE) return print_event_fields(iter, event); @@ -5983,11 +6032,132 @@ ssize_t tracing_resize_ring_buffer(struct trace_array *tr, return __tracing_resize_ring_buffer(tr, size, cpu_id); } +struct trace_mod_entry { + unsigned long mod_addr; + char mod_name[MODULE_NAME_LEN]; +}; + +struct trace_scratch { + unsigned long text_addr; + unsigned long nr_entries; + struct trace_mod_entry entries[]; +}; + +static DEFINE_MUTEX(scratch_mutex); + +static int cmp_mod_entry(const void *key, const void *pivot) +{ + unsigned long addr = (unsigned long)key; + const struct trace_mod_entry *ent = pivot; + + if (addr >= ent[0].mod_addr && addr < ent[1].mod_addr) + return 0; + else + return addr - ent->mod_addr; +} + +/** + * trace_adjust_address() - Adjust prev boot address to current address. + * @tr: Persistent ring buffer's trace_array. + * @addr: Address in @tr which is adjusted. + */ +unsigned long trace_adjust_address(struct trace_array *tr, unsigned long addr) +{ + struct trace_module_delta *module_delta; + struct trace_scratch *tscratch; + struct trace_mod_entry *entry; + int idx = 0, nr_entries; + + /* If we don't have last boot delta, return the address */ + if (!(tr->flags & TRACE_ARRAY_FL_LAST_BOOT)) + return addr; + + /* tr->module_delta must be protected by rcu. */ + guard(rcu)(); + tscratch = tr->scratch; + /* if there is no tscrach, module_delta must be NULL. 
*/ + module_delta = READ_ONCE(tr->module_delta); + if (!module_delta || !tscratch->nr_entries || + tscratch->entries[0].mod_addr > addr) { + return addr + tr->text_delta; + } + + /* Note that entries must be sorted. */ + nr_entries = tscratch->nr_entries; + if (nr_entries == 1 || + tscratch->entries[nr_entries - 1].mod_addr < addr) + idx = nr_entries - 1; + else { + entry = __inline_bsearch((void *)addr, + tscratch->entries, + nr_entries - 1, + sizeof(tscratch->entries[0]), + cmp_mod_entry); + if (entry) + idx = entry - tscratch->entries; + } + + return addr + module_delta->delta[idx]; +} + +#ifdef CONFIG_MODULES +static int save_mod(struct module *mod, void *data) +{ + struct trace_array *tr = data; + struct trace_scratch *tscratch; + struct trace_mod_entry *entry; + unsigned int size; + + tscratch = tr->scratch; + if (!tscratch) + return -1; + size = tr->scratch_size; + + if (struct_size(tscratch, entries, tscratch->nr_entries + 1) > size) + return -1; + + entry = &tscratch->entries[tscratch->nr_entries]; + + tscratch->nr_entries++; + + entry->mod_addr = (unsigned long)mod->mem[MOD_TEXT].base; + strscpy(entry->mod_name, mod->name); + + return 0; +} +#else +static int save_mod(struct module *mod, void *data) +{ + return 0; +} +#endif + static void update_last_data(struct trace_array *tr) { - if (!tr->text_delta && !tr->data_delta) + struct trace_module_delta *module_delta; + struct trace_scratch *tscratch; + + if (!(tr->flags & TRACE_ARRAY_FL_BOOT)) return; + if (!(tr->flags & TRACE_ARRAY_FL_LAST_BOOT)) + return; + + /* Only if the buffer has previous boot data clear and update it. 
*/ + tr->flags &= ~TRACE_ARRAY_FL_LAST_BOOT; + + /* Reset the module list and reload them */ + if (tr->scratch) { + struct trace_scratch *tscratch = tr->scratch; + + memset(tscratch->entries, 0, + flex_array_size(tscratch, entries, tscratch->nr_entries)); + tscratch->nr_entries = 0; + + guard(mutex)(&scratch_mutex); + module_for_each_mod(save_mod, tr); + } + /* * Need to clear all CPU buffers as there cannot be events * from the previous boot mixed with events with this boot @@ -5998,7 +6168,17 @@ static void update_last_data(struct trace_array *tr) /* Using current data now */ tr->text_delta = 0; - tr->data_delta = 0; + + if (!tr->scratch) + return; + + tscratch = tr->scratch; + module_delta = READ_ONCE(tr->module_delta); + WRITE_ONCE(tr->module_delta, NULL); + kfree_rcu(module_delta, rcu); + + /* Set the persistent ring buffer meta data to this address */ + tscratch->text_addr = (unsigned long)_text; } /** @@ -6677,13 +6857,14 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, /* Copy the data into the page, so we can start over. 
*/ ret = trace_seq_to_buffer(&iter->seq, page_address(spd.pages[i]), - trace_seq_used(&iter->seq)); + min((size_t)trace_seq_used(&iter->seq), + PAGE_SIZE)); if (ret < 0) { __free_page(spd.pages[i]); break; } spd.partial[i].offset = 0; - spd.partial[i].len = trace_seq_used(&iter->seq); + spd.partial[i].len = ret; trace_seq_init(&iter->seq); } @@ -6807,19 +6988,102 @@ tracing_total_entries_read(struct file *filp, char __user *ubuf, return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } -static ssize_t -tracing_last_boot_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) +#define LAST_BOOT_HEADER ((void *)1) + +static void *l_next(struct seq_file *m, void *v, loff_t *pos) { - struct trace_array *tr = filp->private_data; - struct seq_buf seq; - char buf[64]; + struct trace_array *tr = m->private; + struct trace_scratch *tscratch = tr->scratch; + unsigned int index = *pos; - seq_buf_init(&seq, buf, 64); + (*pos)++; - seq_buf_printf(&seq, "text delta:\t%ld\n", tr->text_delta); - seq_buf_printf(&seq, "data delta:\t%ld\n", tr->data_delta); + if (*pos == 1) + return LAST_BOOT_HEADER; - return simple_read_from_buffer(ubuf, cnt, ppos, buf, seq_buf_used(&seq)); + /* Only show offsets of the last boot data */ + if (!tscratch || !(tr->flags & TRACE_ARRAY_FL_LAST_BOOT)) + return NULL; + + /* *pos 0 is for the header, 1 is for the first module */ + index--; + + if (index >= tscratch->nr_entries) + return NULL; + + return &tscratch->entries[index]; +} + +static void *l_start(struct seq_file *m, loff_t *pos) +{ + mutex_lock(&scratch_mutex); + + return l_next(m, NULL, pos); +} + +static void l_stop(struct seq_file *m, void *p) +{ + mutex_unlock(&scratch_mutex); +} + +static void show_last_boot_header(struct seq_file *m, struct trace_array *tr) +{ + struct trace_scratch *tscratch = tr->scratch; + + /* + * Do not leak KASLR address. This only shows the KASLR address of + * the last boot. 
When the ring buffer is started, the LAST_BOOT + * flag gets cleared, and this should only report "current". + * Otherwise it shows the KASLR address from the previous boot which + * should not be the same as the current boot. + */ + if (tscratch && (tr->flags & TRACE_ARRAY_FL_LAST_BOOT)) + seq_printf(m, "%lx\t[kernel]\n", tscratch->text_addr); + else + seq_puts(m, "# Current\n"); +} + +static int l_show(struct seq_file *m, void *v) +{ + struct trace_array *tr = m->private; + struct trace_mod_entry *entry = v; + + if (v == LAST_BOOT_HEADER) { + show_last_boot_header(m, tr); + return 0; + } + + seq_printf(m, "%lx\t%s\n", entry->mod_addr, entry->mod_name); + return 0; +} + +static const struct seq_operations last_boot_seq_ops = { + .start = l_start, + .next = l_next, + .stop = l_stop, + .show = l_show, +}; + +static int tracing_last_boot_open(struct inode *inode, struct file *file) +{ + struct trace_array *tr = inode->i_private; + struct seq_file *m; + int ret; + + ret = tracing_check_open_get_tr(tr); + if (ret) + return ret; + + ret = seq_open(file, &last_boot_seq_ops); + if (ret) { + trace_array_put(tr); + return ret; + } + + m = file->private_data; + m->private = tr; + + return 0; } static int tracing_buffer_meta_open(struct inode *inode, struct file *filp) @@ -7448,10 +7712,10 @@ static const struct file_operations trace_time_stamp_mode_fops = { }; static const struct file_operations last_boot_fops = { - .open = tracing_open_generic_tr, - .read = tracing_last_boot_read, - .llseek = generic_file_llseek, - .release = tracing_release_generic_tr, + .open = tracing_last_boot_open, + .read = seq_read, + .llseek = seq_lseek, + .release = tracing_seq_release, }; #ifdef CONFIG_TRACER_SNAPSHOT @@ -8274,6 +8538,10 @@ static int tracing_buffers_mmap(struct file *filp, struct vm_area_struct *vma) struct trace_iterator *iter = &info->iter; int ret = 0; + /* A memmap'ed buffer is not supported for user space mmap */ + if (iter->tr->flags & TRACE_ARRAY_FL_MEMMAP) + return 
-ENODEV; + /* Currently the boot mapped buffer is not supported for mmap */ if (iter->tr->flags & TRACE_ARRAY_FL_BOOT) return -ENODEV; @@ -9191,22 +9459,125 @@ static struct dentry *trace_instance_dir; static void init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer); +#ifdef CONFIG_MODULES +static int make_mod_delta(struct module *mod, void *data) +{ + struct trace_module_delta *module_delta; + struct trace_scratch *tscratch; + struct trace_mod_entry *entry; + struct trace_array *tr = data; + int i; + + tscratch = tr->scratch; + module_delta = READ_ONCE(tr->module_delta); + for (i = 0; i < tscratch->nr_entries; i++) { + entry = &tscratch->entries[i]; + if (strcmp(mod->name, entry->mod_name)) + continue; + if (mod->state == MODULE_STATE_GOING) + module_delta->delta[i] = 0; + else + module_delta->delta[i] = (unsigned long)mod->mem[MOD_TEXT].base + - entry->mod_addr; + break; + } + return 0; +} +#else +static int make_mod_delta(struct module *mod, void *data) +{ + return 0; +} +#endif + +static int mod_addr_comp(const void *a, const void *b, const void *data) +{ + const struct trace_mod_entry *e1 = a; + const struct trace_mod_entry *e2 = b; + + return e1->mod_addr > e2->mod_addr ? 
1 : -1; +} + +static void setup_trace_scratch(struct trace_array *tr, + struct trace_scratch *tscratch, unsigned int size) +{ + struct trace_module_delta *module_delta; + struct trace_mod_entry *entry; + int i, nr_entries; + + if (!tscratch) + return; + + tr->scratch = tscratch; + tr->scratch_size = size; + + if (tscratch->text_addr) + tr->text_delta = (unsigned long)_text - tscratch->text_addr; + + if (struct_size(tscratch, entries, tscratch->nr_entries) > size) + goto reset; + + /* Check if each module name is a valid string */ + for (i = 0; i < tscratch->nr_entries; i++) { + int n; + + entry = &tscratch->entries[i]; + + for (n = 0; n < MODULE_NAME_LEN; n++) { + if (entry->mod_name[n] == '\0') + break; + if (!isprint(entry->mod_name[n])) + goto reset; + } + if (n == MODULE_NAME_LEN) + goto reset; + } + + /* Sort the entries so that we can find appropriate module from address. */ + nr_entries = tscratch->nr_entries; + sort_r(tscratch->entries, nr_entries, sizeof(struct trace_mod_entry), + mod_addr_comp, NULL, NULL); + + if (IS_ENABLED(CONFIG_MODULES)) { + module_delta = kzalloc(struct_size(module_delta, delta, nr_entries), GFP_KERNEL); + if (!module_delta) { + pr_info("module_delta allocation failed. Not able to decode module address."); + goto reset; + } + init_rcu_head(&module_delta->rcu); + } else + module_delta = NULL; + WRITE_ONCE(tr->module_delta, module_delta); + + /* Scan modules to make text delta for modules. */ + module_for_each_mod(make_mod_delta, tr); + return; + reset: + /* Invalid trace modules */ + memset(tscratch, 0, size); +} + static int allocate_trace_buffer(struct trace_array *tr, struct array_buffer *buf, int size) { enum ring_buffer_flags rb_flags; + struct trace_scratch *tscratch; + unsigned int scratch_size = 0; rb_flags = tr->trace_flags & TRACE_ITER_OVERWRITE ? 
RB_FL_OVERWRITE : 0; buf->tr = tr; if (tr->range_addr_start && tr->range_addr_size) { + /* Add scratch buffer to handle 128 modules */ buf->buffer = ring_buffer_alloc_range(size, rb_flags, 0, tr->range_addr_start, - tr->range_addr_size); + tr->range_addr_size, + struct_size(tscratch, entries, 128)); + + tscratch = ring_buffer_meta_scratch(buf->buffer, &scratch_size); + setup_trace_scratch(tr, tscratch, scratch_size); - ring_buffer_last_boot_delta(buf->buffer, - &tr->text_delta, &tr->data_delta); /* * This is basically the same as a mapped buffer, * with the same restrictions. @@ -9275,6 +9646,7 @@ static void free_trace_buffers(struct trace_array *tr) return; free_trace_buffer(&tr->array_buffer); + kfree(tr->module_delta); #ifdef CONFIG_TRACER_MAX_TRACE free_trace_buffer(&tr->max_buffer); @@ -9440,6 +9812,7 @@ trace_array_create_systems(const char *name, const char *systems, free_cpumask_var(tr->pipe_cpumask); free_cpumask_var(tr->tracing_cpumask); kfree_const(tr->system_names); + kfree(tr->range_name); kfree(tr->name); kfree(tr); @@ -9470,30 +9843,35 @@ static int instance_mkdir(const char *name) return ret; } -static u64 map_pages(u64 start, u64 size) +#ifdef CONFIG_MMU +static u64 map_pages(unsigned long start, unsigned long size) { - struct page **pages; - phys_addr_t page_start; - unsigned int page_count; - unsigned int i; - void *vaddr; - - page_count = DIV_ROUND_UP(size, PAGE_SIZE); + unsigned long vmap_start, vmap_end; + struct vm_struct *area; + int ret; - page_start = start; - pages = kmalloc_array(page_count, sizeof(struct page *), GFP_KERNEL); - if (!pages) + area = get_vm_area(size, VM_IOREMAP); + if (!area) return 0; - for (i = 0; i < page_count; i++) { - phys_addr_t addr = page_start + i * PAGE_SIZE; - pages[i] = pfn_to_page(addr >> PAGE_SHIFT); + vmap_start = (unsigned long) area->addr; + vmap_end = vmap_start + size; + + ret = vmap_page_range(vmap_start, vmap_end, + start, pgprot_nx(PAGE_KERNEL)); + if (ret < 0) { + free_vm_area(area); + return 0; 
} - vaddr = vmap(pages, page_count, VM_MAP, PAGE_KERNEL); - kfree(pages); - return (u64)(unsigned long)vaddr; + return (u64)vmap_start; +} +#else +static inline u64 map_pages(unsigned long start, unsigned long size) +{ + return 0; } +#endif /** * trace_array_get_by_name - Create/Lookup a trace array, given its name. @@ -9566,6 +9944,11 @@ static int __remove_instance(struct trace_array *tr) free_trace_buffers(tr); clear_tracing_err_log(tr); + if (tr->range_name) { + reserve_mem_release_by_name(tr->range_name); + kfree(tr->range_name); + } + for (i = 0; i < tr->nr_topts; i++) { kfree(tr->topts[i].topts); } @@ -9887,6 +10270,24 @@ static void trace_module_remove_evals(struct module *mod) static inline void trace_module_remove_evals(struct module *mod) { } #endif /* CONFIG_TRACE_EVAL_MAP_FILE */ +static void trace_module_record(struct module *mod, bool add) +{ + struct trace_array *tr; + unsigned long flags; + + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + flags = tr->flags & (TRACE_ARRAY_FL_BOOT | TRACE_ARRAY_FL_LAST_BOOT); + /* Update any persistent trace array that has already been started */ + if (flags == TRACE_ARRAY_FL_BOOT && add) { + guard(mutex)(&scratch_mutex); + save_mod(mod, tr); + } else if (flags & TRACE_ARRAY_FL_LAST_BOOT) { + /* Update delta if the module loaded in previous boot */ + make_mod_delta(mod, tr); + } + } +} + static int trace_module_notify(struct notifier_block *self, unsigned long val, void *data) { @@ -9895,9 +10296,11 @@ static int trace_module_notify(struct notifier_block *self, switch (val) { case MODULE_STATE_COMING: trace_module_add_evals(mod); + trace_module_record(mod, true); break; case MODULE_STATE_GOING: trace_module_remove_evals(mod); + trace_module_record(mod, false); break; } @@ -10346,6 +10749,7 @@ static inline void do_allocate_snapshot(const char *name) { } __init static void enable_instances(void) { struct trace_array *tr; + bool memmap_area = false; char *curr_str; char *name; char *str; @@ -10363,6 +10767,7 
@@ __init static void enable_instances(void) bool traceoff = false; char *flag_delim; char *addr_delim; + char *rname __free(kfree) = NULL; tok = strsep(&curr_str, ","); @@ -10413,16 +10818,31 @@ __init static void enable_instances(void) name); continue; } + memmap_area = true; } else if (tok) { if (!reserve_mem_find_by_name(tok, &start, &size)) { start = 0; pr_warn("Failed to map boot instance %s to %s\n", name, tok); continue; } + rname = kstrdup(tok, GFP_KERNEL); } if (start) { - addr = map_pages(start, size); + /* Start and size must be page aligned */ + if (start & ~PAGE_MASK) { + pr_warn("Tracing: mapping start addr %pa is not page aligned\n", &start); + continue; + } + if (size & ~PAGE_MASK) { + pr_warn("Tracing: mapping size %pa is not page aligned\n", &size); + continue; + } + + if (memmap_area) + addr = map_pages(start, size); + else + addr = (unsigned long)phys_to_virt(start); if (addr) { pr_info("Tracing: mapped boot instance %s at physical memory %pa of size 0x%lx\n", name, &start, (unsigned long)size); @@ -10449,15 +10869,18 @@ __init static void enable_instances(void) update_printk_trace(tr); /* - * If start is set, then this is a mapped buffer, and - * cannot be deleted by user space, so keep the reference - * to it. + * memmap'd buffers can not be freed. 
*/ - if (start) { - tr->flags |= TRACE_ARRAY_FL_BOOT; + if (memmap_area) { + tr->flags |= TRACE_ARRAY_FL_MEMMAP; tr->ref++; } + if (start) { + tr->flags |= TRACE_ARRAY_FL_BOOT | TRACE_ARRAY_FL_LAST_BOOT; + tr->range_name = no_free_ptr(rname); + } + while ((tok = strsep(&curr_str, ","))) { early_enable_events(tr, tok, true); } @@ -10699,6 +11122,9 @@ __init static int late_trace_init(void) tracepoint_printk = 0; } + if (traceoff_after_boot) + tracing_off(); + tracing_set_default_clock(); clear_boot_tracer(); return 0; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 9c21ba45b7af..79be1995db44 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -21,6 +21,7 @@ #include <linux/workqueue.h> #include <linux/ctype.h> #include <linux/once_lite.h> +#include <linux/ftrace_regs.h> #include "pid_list.h" @@ -312,6 +313,11 @@ struct trace_func_repeats { u64 ts_last_call; }; +struct trace_module_delta { + struct rcu_head rcu; + long delta[]; +}; + /* * The trace array - an array of per-CPU trace arrays. This is the * highest level data structure that individual tracers deal with. @@ -348,8 +354,13 @@ struct trace_array { unsigned int mapped; unsigned long range_addr_start; unsigned long range_addr_size; + char *range_name; long text_delta; - long data_delta; + struct trace_module_delta *module_delta; + void *scratch; /* pointer in persistent memory */ + int scratch_size; + + int buffer_disabled; struct trace_pid_list __rcu *filtered_pids; struct trace_pid_list __rcu *filtered_no_pids; @@ -367,7 +378,6 @@ struct trace_array { * CONFIG_TRACER_MAX_TRACE. 
*/ arch_spinlock_t max_lock; - int buffer_disabled; #ifdef CONFIG_FTRACE_SYSCALLS int sys_refcount_enter; int sys_refcount_exit; @@ -433,9 +443,11 @@ struct trace_array { }; enum { - TRACE_ARRAY_FL_GLOBAL = BIT(0), - TRACE_ARRAY_FL_BOOT = BIT(1), - TRACE_ARRAY_FL_MOD_INIT = BIT(2), + TRACE_ARRAY_FL_GLOBAL = BIT(0), + TRACE_ARRAY_FL_BOOT = BIT(1), + TRACE_ARRAY_FL_LAST_BOOT = BIT(2), + TRACE_ARRAY_FL_MOD_INIT = BIT(3), + TRACE_ARRAY_FL_MEMMAP = BIT(4), }; #ifdef CONFIG_MODULES @@ -462,6 +474,8 @@ extern int tracing_set_clock(struct trace_array *tr, const char *clockstr); extern bool trace_clock_in_ns(struct trace_array *tr); +extern unsigned long trace_adjust_address(struct trace_array *tr, unsigned long addr); + /* * The global tracer (top) should be the first trace array added, * but we check the flag anyway. @@ -697,7 +711,8 @@ unsigned long trace_total_entries(struct trace_array *tr); void trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, - unsigned int trace_ctx); + unsigned int trace_ctx, + struct ftrace_regs *regs); void trace_graph_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, @@ -783,6 +798,8 @@ extern void trace_find_cmdline(int pid, char comm[]); extern int trace_find_tgid(int pid); extern void trace_event_follow_fork(struct trace_array *tr, bool enable); +extern int trace_events_enabled(struct trace_array *tr, const char *system); + #ifdef CONFIG_DYNAMIC_FTRACE extern unsigned long ftrace_update_tot_cnt; extern unsigned long ftrace_number_of_pages; @@ -836,13 +853,15 @@ static inline void __init disable_tracing_selftest(const char *reason) extern void *head_page(struct trace_array_cpu *data); extern unsigned long long ns2usecs(u64 nsec); -extern int -trace_vbprintk(unsigned long ip, const char *fmt, va_list args); -extern int -trace_vprintk(unsigned long ip, const char *fmt, va_list args); -extern int -trace_array_vprintk(struct trace_array *tr, - unsigned long ip, const char *fmt, va_list 
args); + +__printf(2, 0) +int trace_vbprintk(unsigned long ip, const char *fmt, va_list args); +__printf(2, 0) +int trace_vprintk(unsigned long ip, const char *fmt, va_list args); +__printf(3, 0) +int trace_array_vprintk(struct trace_array *tr, + unsigned long ip, const char *fmt, va_list args); +__printf(3, 4) int trace_array_printk_buf(struct trace_buffer *buffer, unsigned long ip, const char *fmt, ...); void trace_printk_seq(struct trace_seq *s); @@ -897,6 +916,7 @@ static __always_inline bool ftrace_hash_empty(struct ftrace_hash *hash) #define TRACE_GRAPH_PRINT_RETVAL 0x800 #define TRACE_GRAPH_PRINT_RETVAL_HEX 0x1000 #define TRACE_GRAPH_PRINT_RETADDR 0x2000 +#define TRACE_GRAPH_ARGS 0x4000 #define TRACE_GRAPH_PRINT_FILL_SHIFT 28 #define TRACE_GRAPH_PRINT_FILL_MASK (0x3 << TRACE_GRAPH_PRINT_FILL_SHIFT) @@ -1714,7 +1734,7 @@ struct event_trigger_data { unsigned long count; int ref; int flags; - struct event_trigger_ops *ops; + const struct event_trigger_ops *ops; struct event_command *cmd_ops; struct event_filter __rcu *filter; char *filter_str; @@ -1959,7 +1979,7 @@ struct event_command { int (*set_filter)(char *filter_str, struct event_trigger_data *data, struct trace_event_file *file); - struct event_trigger_ops *(*get_trigger_ops)(char *cmd, char *param); + const struct event_trigger_ops *(*get_trigger_ops)(char *cmd, char *param); }; /** diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c index a322e4f249a5..5d64a18cacac 100644 --- a/kernel/trace/trace_dynevent.c +++ b/kernel/trace/trace_dynevent.c @@ -16,7 +16,7 @@ #include "trace_output.h" /* for trace_event_sem */ #include "trace_dynevent.h" -static DEFINE_MUTEX(dyn_event_ops_mutex); +DEFINE_MUTEX(dyn_event_ops_mutex); static LIST_HEAD(dyn_event_ops_list); bool trace_event_dyn_try_get_ref(struct trace_event_call *dyn_call) @@ -116,6 +116,20 @@ int dyn_event_release(const char *raw_command, struct dyn_event_operations *type return ret; } +/* + * Locked version of event creation. 
The event creation must be protected by + * dyn_event_ops_mutex because of protecting trace_probe_log. + */ +int dyn_event_create(const char *raw_command, struct dyn_event_operations *type) +{ + int ret; + + mutex_lock(&dyn_event_ops_mutex); + ret = type->create(raw_command); + mutex_unlock(&dyn_event_ops_mutex); + return ret; +} + static int create_dyn_event(const char *raw_command) { struct dyn_event_operations *ops; diff --git a/kernel/trace/trace_dynevent.h b/kernel/trace/trace_dynevent.h index 936477a111d3..beee3f8d7544 100644 --- a/kernel/trace/trace_dynevent.h +++ b/kernel/trace/trace_dynevent.h @@ -100,6 +100,7 @@ void *dyn_event_seq_next(struct seq_file *m, void *v, loff_t *pos); void dyn_event_seq_stop(struct seq_file *m, void *v); int dyn_events_release_all(struct dyn_event_operations *type); int dyn_event_release(const char *raw_command, struct dyn_event_operations *type); +int dyn_event_create(const char *raw_command, struct dyn_event_operations *type); /* * for_each_dyn_event - iterate over the dyn_event list diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index fbfb396905a6..4ef4df6623a8 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -61,8 +61,9 @@ FTRACE_ENTRY_REG(function, ftrace_entry, TRACE_FN, F_STRUCT( - __field_fn( unsigned long, ip ) - __field_fn( unsigned long, parent_ip ) + __field_fn( unsigned long, ip ) + __field_fn( unsigned long, parent_ip ) + __dynamic_array( unsigned long, args ) ), F_printk(" %ps <-- %ps", @@ -72,17 +73,18 @@ FTRACE_ENTRY_REG(function, ftrace_entry, ); /* Function call entry */ -FTRACE_ENTRY_PACKED(funcgraph_entry, ftrace_graph_ent_entry, +FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry, TRACE_GRAPH_ENT, F_STRUCT( __field_struct( struct ftrace_graph_ent, graph_ent ) __field_packed( unsigned long, graph_ent, func ) - __field_packed( int, graph_ent, depth ) + __field_packed( unsigned int, graph_ent, depth ) + __dynamic_array(unsigned long, args ) ), - 
F_printk("--> %ps (%d)", (void *)__entry->func, __entry->depth) + F_printk("--> %ps (%u)", (void *)__entry->func, __entry->depth) ); #ifdef CONFIG_FUNCTION_GRAPH_RETADDR diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index 82fd637cfc19..916555f0de81 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -478,7 +478,7 @@ static void eprobe_trigger_func(struct event_trigger_data *data, __eprobe_trace_func(edata, rec); } -static struct event_trigger_ops eprobe_trigger_ops = { +static const struct event_trigger_ops eprobe_trigger_ops = { .trigger = eprobe_trigger_func, .print = eprobe_trigger_print, .init = eprobe_trigger_init, @@ -507,8 +507,8 @@ static void eprobe_trigger_unreg_func(char *glob, } -static struct event_trigger_ops *eprobe_trigger_get_ops(char *cmd, - char *param) +static const struct event_trigger_ops *eprobe_trigger_get_ops(char *cmd, + char *param) { return &eprobe_trigger_ops; } @@ -913,6 +913,8 @@ static int __trace_eprobe_create(int argc, const char *argv[]) } if (argc - 2 > MAX_TRACE_ARGS) { + trace_probe_log_set_index(2); + trace_probe_log_err(0, TOO_MANY_ARGS); ret = -E2BIG; goto error; } @@ -967,10 +969,13 @@ static int __trace_eprobe_create(int argc, const char *argv[]) goto error; } } + trace_probe_log_clear(); return ret; + parse_error: ret = -EINVAL; error: + trace_probe_log_clear(); trace_event_probe_cleanup(ep); return ret; } diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 3ff9caa4a71b..a6bb7577e8c5 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -49,7 +49,7 @@ static int perf_trace_event_perm(struct trace_event_call *tp_event, /* The ftrace function trace is allowed only for root. 
*/ if (ftrace_event_is_function(tp_event)) { - ret = perf_allow_tracepoint(&p_event->attr); + ret = perf_allow_tracepoint(); if (ret) return ret; @@ -86,7 +86,7 @@ static int perf_trace_event_perm(struct trace_event_call *tp_event, * ...otherwise raw tracepoint data can be a severe data leak, * only allow root to have these. */ - ret = perf_allow_tracepoint(&p_event->attr); + ret = perf_allow_tracepoint(); if (ret) return ret; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 513de9ceb80e..069e92856bda 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -470,6 +470,7 @@ static void test_event_printk(struct trace_event_call *call) case '%': continue; case 'p': + do_pointer: /* Find dereferencing fields */ switch (fmt[i + 1]) { case 'B': case 'R': case 'r': @@ -498,6 +499,12 @@ static void test_event_printk(struct trace_event_call *call) continue; if (fmt[i + j] == '*') { star = true; + /* Handle %*pbl case */ + if (!j && fmt[i + 1] == 'p') { + arg++; + i++; + goto do_pointer; + } continue; } if ((fmt[i + j] == 's')) { @@ -790,7 +797,9 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, clear_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, &file->flags); } - call->class->reg(call, TRACE_REG_UNREGISTER, file); + ret = call->class->reg(call, TRACE_REG_UNREGISTER, file); + + WARN_ON_ONCE(ret); } /* If in SOFT_MODE, just set the SOFT_DISABLE_BIT, else clear it */ if (file->flags & EVENT_FILE_FL_SOFT_MODE) @@ -1818,28 +1827,28 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, return cnt; } -static ssize_t -system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, - loff_t *ppos) +/* + * Returns: + * 0 : no events exist? 
+ * 1 : all events are disabled + * 2 : all events are enabled + * 3 : some events are enabled and some are disabled + */ +int trace_events_enabled(struct trace_array *tr, const char *system) { - const char set_to_char[4] = { '?', '0', '1', 'X' }; - struct trace_subsystem_dir *dir = filp->private_data; - struct event_subsystem *system = dir->subsystem; struct trace_event_call *call; struct trace_event_file *file; - struct trace_array *tr = dir->tr; - char buf[2]; int set = 0; - int ret; - mutex_lock(&event_mutex); + guard(mutex)(&event_mutex); + list_for_each_entry(file, &tr->events, list) { call = file->event_call; if ((call->flags & TRACE_EVENT_FL_IGNORE_ENABLE) || !trace_event_name(call) || !call->class || !call->class->reg) continue; - if (system && strcmp(call->class->system, system->name) != 0) + if (system && strcmp(call->class->system, system) != 0) continue; /* @@ -1855,7 +1864,23 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, if (set == 3) break; } - mutex_unlock(&event_mutex); + + return set; +} + +static ssize_t +system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + const char set_to_char[4] = { '?', '0', '1', 'X' }; + struct trace_subsystem_dir *dir = filp->private_data; + struct event_subsystem *system = dir->subsystem; + struct trace_array *tr = dir->tr; + char buf[2]; + int set; + int ret; + + set = trace_events_enabled(tr, system ? 
system->name : NULL); buf[0] = set_to_char[set]; buf[1] = '\n'; diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 0993dfc1c5c1..2048560264bb 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -808,7 +808,7 @@ static __always_inline char *test_string(char *str) kstr = ubuf->buffer; /* For safety, do not trust the string pointer */ - if (!strncpy_from_kernel_nofault(kstr, str, USTRING_BUF_SIZE)) + if (strncpy_from_kernel_nofault(kstr, str, USTRING_BUF_SIZE) < 0) return NULL; return kstr; } @@ -827,7 +827,7 @@ static __always_inline char *test_ustring(char *str) /* user space address? */ ustr = (char __user *)str; - if (!strncpy_from_user_nofault(kstr, ustr, USTRING_BUF_SIZE)) + if (strncpy_from_user_nofault(kstr, ustr, USTRING_BUF_SIZE) < 0) return NULL; return kstr; diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 53dc6719181e..1260c23cfa5f 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -6203,7 +6203,7 @@ static void event_hist_trigger_free(struct event_trigger_data *data) } } -static struct event_trigger_ops event_hist_trigger_ops = { +static const struct event_trigger_ops event_hist_trigger_ops = { .trigger = event_hist_trigger, .print = event_hist_trigger_print, .init = event_hist_trigger_init, @@ -6235,15 +6235,15 @@ static void event_hist_trigger_named_free(struct event_trigger_data *data) } } -static struct event_trigger_ops event_hist_trigger_named_ops = { +static const struct event_trigger_ops event_hist_trigger_named_ops = { .trigger = event_hist_trigger, .print = event_hist_trigger_print, .init = event_hist_trigger_named_init, .free = event_hist_trigger_named_free, }; -static struct event_trigger_ops *event_hist_get_trigger_ops(char *cmd, - char *param) +static const struct event_trigger_ops *event_hist_get_trigger_ops(char *cmd, + char *param) { return &event_hist_trigger_ops; } @@ -6838,38 
+6838,38 @@ hist_enable_count_trigger(struct event_trigger_data *data, hist_enable_trigger(data, buffer, rec, event); } -static struct event_trigger_ops hist_enable_trigger_ops = { +static const struct event_trigger_ops hist_enable_trigger_ops = { .trigger = hist_enable_trigger, .print = event_enable_trigger_print, .init = event_trigger_init, .free = event_enable_trigger_free, }; -static struct event_trigger_ops hist_enable_count_trigger_ops = { +static const struct event_trigger_ops hist_enable_count_trigger_ops = { .trigger = hist_enable_count_trigger, .print = event_enable_trigger_print, .init = event_trigger_init, .free = event_enable_trigger_free, }; -static struct event_trigger_ops hist_disable_trigger_ops = { +static const struct event_trigger_ops hist_disable_trigger_ops = { .trigger = hist_enable_trigger, .print = event_enable_trigger_print, .init = event_trigger_init, .free = event_enable_trigger_free, }; -static struct event_trigger_ops hist_disable_count_trigger_ops = { +static const struct event_trigger_ops hist_disable_count_trigger_ops = { .trigger = hist_enable_count_trigger, .print = event_enable_trigger_print, .init = event_trigger_init, .free = event_enable_trigger_free, }; -static struct event_trigger_ops * +static const struct event_trigger_ops * hist_enable_get_trigger_ops(char *cmd, char *param) { - struct event_trigger_ops *ops; + const struct event_trigger_ops *ops; bool enable; enable = (strcmp(cmd, ENABLE_HIST_STR) == 0); diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index e3f7d09e5512..33cfbd4ed76d 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -207,7 +207,7 @@ static int synth_field_string_size(char *type) if (len == 0) return 0; /* variable-length string */ - strncpy(buf, start, len); + memcpy(buf, start, len); buf[len] = '\0'; err = kstrtouint(buf, 0, &size); @@ -305,7 +305,7 @@ static const char *synth_field_fmt(char *type) else if (strcmp(type, "gfp_t") 
== 0) fmt = "%x"; else if (synth_field_is_string(type)) - fmt = "%.*s"; + fmt = "%s"; else if (synth_field_is_stack(type)) fmt = "%s"; @@ -370,7 +370,6 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter, union trace_synth_field *data = &entry->fields[n_u64]; trace_seq_printf(s, print_fmt, se->fields[i]->name, - STR_VAR_LEN_MAX, (char *)entry + data->as_dynamic.offset, i == se->n_fields - 1 ? "" : " "); n_u64++; @@ -612,7 +611,7 @@ static int __set_synth_event_print_fmt(struct synth_event *event, fmt = synth_field_fmt(event->fields[i]->type); pos += snprintf(buf + pos, LEN_OR_ZERO, "%s=%s%s", event->fields[i]->name, fmt, - i == event->n_fields - 1 ? "" : ", "); + i == event->n_fields - 1 ? "" : " "); } pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); @@ -852,6 +851,38 @@ static struct trace_event_fields synth_event_fields_array[] = { {} }; +static int synth_event_reg(struct trace_event_call *call, + enum trace_reg type, void *data) +{ + struct synth_event *event = container_of(call, struct synth_event, call); + + switch (type) { +#ifdef CONFIG_PERF_EVENTS + case TRACE_REG_PERF_REGISTER: +#endif + case TRACE_REG_REGISTER: + if (!try_module_get(event->mod)) + return -EBUSY; + break; + default: + break; + } + + int ret = trace_event_reg(call, type, data); + + switch (type) { +#ifdef CONFIG_PERF_EVENTS + case TRACE_REG_PERF_UNREGISTER: +#endif + case TRACE_REG_UNREGISTER: + module_put(event->mod); + break; + default: + break; + } + return ret; +} + static int register_synth_event(struct synth_event *event) { struct trace_event_call *call = &event->call; @@ -881,7 +912,7 @@ static int register_synth_event(struct synth_event *event) goto out; } call->flags = TRACE_EVENT_FL_TRACEPOINT; - call->class->reg = trace_event_reg; + call->class->reg = synth_event_reg; call->class->probe = trace_event_raw_event_synth; call->data = event; call->tp = event->tp; diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 
d45448947094..6e87ae2a1a66 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -825,7 +825,7 @@ struct event_trigger_data *event_trigger_alloc(struct event_command *cmd_ops, void *private_data) { struct event_trigger_data *trigger_data; - struct event_trigger_ops *trigger_ops; + const struct event_trigger_ops *trigger_ops; trigger_ops = cmd_ops->get_trigger_ops(cmd, param); @@ -1367,38 +1367,38 @@ traceoff_trigger_print(struct seq_file *m, struct event_trigger_data *data) data->filter_str); } -static struct event_trigger_ops traceon_trigger_ops = { +static const struct event_trigger_ops traceon_trigger_ops = { .trigger = traceon_trigger, .print = traceon_trigger_print, .init = event_trigger_init, .free = event_trigger_free, }; -static struct event_trigger_ops traceon_count_trigger_ops = { +static const struct event_trigger_ops traceon_count_trigger_ops = { .trigger = traceon_count_trigger, .print = traceon_trigger_print, .init = event_trigger_init, .free = event_trigger_free, }; -static struct event_trigger_ops traceoff_trigger_ops = { +static const struct event_trigger_ops traceoff_trigger_ops = { .trigger = traceoff_trigger, .print = traceoff_trigger_print, .init = event_trigger_init, .free = event_trigger_free, }; -static struct event_trigger_ops traceoff_count_trigger_ops = { +static const struct event_trigger_ops traceoff_count_trigger_ops = { .trigger = traceoff_count_trigger, .print = traceoff_trigger_print, .init = event_trigger_init, .free = event_trigger_free, }; -static struct event_trigger_ops * +static const struct event_trigger_ops * onoff_get_trigger_ops(char *cmd, char *param) { - struct event_trigger_ops *ops; + const struct event_trigger_ops *ops; /* we register both traceon and traceoff to this callback */ if (strcmp(cmd, "traceon") == 0) @@ -1491,21 +1491,21 @@ snapshot_trigger_print(struct seq_file *m, struct event_trigger_data *data) data->filter_str); } -static struct event_trigger_ops 
snapshot_trigger_ops = { +static const struct event_trigger_ops snapshot_trigger_ops = { .trigger = snapshot_trigger, .print = snapshot_trigger_print, .init = event_trigger_init, .free = event_trigger_free, }; -static struct event_trigger_ops snapshot_count_trigger_ops = { +static const struct event_trigger_ops snapshot_count_trigger_ops = { .trigger = snapshot_count_trigger, .print = snapshot_trigger_print, .init = event_trigger_init, .free = event_trigger_free, }; -static struct event_trigger_ops * +static const struct event_trigger_ops * snapshot_get_trigger_ops(char *cmd, char *param) { return param ? &snapshot_count_trigger_ops : &snapshot_trigger_ops; @@ -1560,7 +1560,7 @@ stacktrace_trigger(struct event_trigger_data *data, struct trace_event_file *file = data->private_data; if (file) - __trace_stack(file->tr, tracing_gen_ctx(), STACK_SKIP); + __trace_stack(file->tr, tracing_gen_ctx_dec(), STACK_SKIP); else trace_dump_stack(STACK_SKIP); } @@ -1586,21 +1586,21 @@ stacktrace_trigger_print(struct seq_file *m, struct event_trigger_data *data) data->filter_str); } -static struct event_trigger_ops stacktrace_trigger_ops = { +static const struct event_trigger_ops stacktrace_trigger_ops = { .trigger = stacktrace_trigger, .print = stacktrace_trigger_print, .init = event_trigger_init, .free = event_trigger_free, }; -static struct event_trigger_ops stacktrace_count_trigger_ops = { +static const struct event_trigger_ops stacktrace_count_trigger_ops = { .trigger = stacktrace_count_trigger, .print = stacktrace_trigger_print, .init = event_trigger_init, .free = event_trigger_free, }; -static struct event_trigger_ops * +static const struct event_trigger_ops * stacktrace_get_trigger_ops(char *cmd, char *param) { return param ? 
&stacktrace_count_trigger_ops : &stacktrace_trigger_ops; @@ -1711,28 +1711,28 @@ void event_enable_trigger_free(struct event_trigger_data *data) } } -static struct event_trigger_ops event_enable_trigger_ops = { +static const struct event_trigger_ops event_enable_trigger_ops = { .trigger = event_enable_trigger, .print = event_enable_trigger_print, .init = event_trigger_init, .free = event_enable_trigger_free, }; -static struct event_trigger_ops event_enable_count_trigger_ops = { +static const struct event_trigger_ops event_enable_count_trigger_ops = { .trigger = event_enable_count_trigger, .print = event_enable_trigger_print, .init = event_trigger_init, .free = event_enable_trigger_free, }; -static struct event_trigger_ops event_disable_trigger_ops = { +static const struct event_trigger_ops event_disable_trigger_ops = { .trigger = event_enable_trigger, .print = event_enable_trigger_print, .init = event_trigger_init, .free = event_enable_trigger_free, }; -static struct event_trigger_ops event_disable_count_trigger_ops = { +static const struct event_trigger_ops event_disable_count_trigger_ops = { .trigger = event_enable_count_trigger, .print = event_enable_trigger_print, .init = event_trigger_init, @@ -1916,10 +1916,10 @@ void event_enable_unregister_trigger(char *glob, data->ops->free(data); } -static struct event_trigger_ops * +static const struct event_trigger_ops * event_enable_get_trigger_ops(char *cmd, char *param) { - struct event_trigger_ops *ops; + const struct event_trigger_ops *ops; bool enable; #ifdef CONFIG_HIST_TRIGGERS diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 97325fbd6283..af42aaa3d172 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -455,7 +455,7 @@ static void user_event_enabler_fault_fixup(struct work_struct *work) if (ret && ret != -ENOENT) { struct user_event *user = enabler->event; - pr_warn("user_events: Fault for mm: 0x%pK @ 0x%llx event: %s\n", + 
pr_warn("user_events: Fault for mm: 0x%p @ 0x%llx event: %s\n", mm->mm, (unsigned long long)uaddr, EVENT_NAME(user)); } @@ -2793,11 +2793,8 @@ static int user_seq_show(struct seq_file *m, void *p) seq_printf(m, "%s", EVENT_TP_NAME(user)); - if (status != 0) - seq_puts(m, " #"); - if (status != 0) { - seq_puts(m, " Used by"); + seq_puts(m, " # Used by"); if (status & EVENT_STATUS_FTRACE) seq_puts(m, " ftrace"); if (status & EVENT_STATUS_PERF) diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c index 985ff98272da..b40fa59159ac 100644 --- a/kernel/trace/trace_fprobe.c +++ b/kernel/trace/trace_fprobe.c @@ -919,9 +919,15 @@ static void __find_tracepoint_module_cb(struct tracepoint *tp, struct module *mo struct __find_tracepoint_cb_data *data = priv; if (!data->tpoint && !strcmp(data->tp_name, tp->name)) { - data->tpoint = tp; - if (!data->mod) + /* If module is not specified, try getting module refcount. */ + if (!data->mod && mod) { + /* If failed to get refcount, ignore this tracepoint. */ + if (!try_module_get(mod)) + return; + data->mod = mod; + } + data->tpoint = tp; } } @@ -933,7 +939,11 @@ static void __find_tracepoint_cb(struct tracepoint *tp, void *priv) data->tpoint = tp; } -/* Find a tracepoint from kernel and module. */ +/* + * Find a tracepoint from kernel and module. If the tracepoint is on the module, + * the module's refcount is incremented and returned as *@tp_mod. Thus, if it is + * not NULL, caller must call module_put(*tp_mod) after used the tracepoint. + */ static struct tracepoint *find_tracepoint(const char *tp_name, struct module **tp_mod) { @@ -962,7 +972,10 @@ static void reenable_trace_fprobe(struct trace_fprobe *tf) } } -/* Find a tracepoint from specified module. */ +/* + * Find a tracepoint from specified module. In this case, this does not get the + * module's refcount. The caller must ensure the module is not freed. 
+ */ static struct tracepoint *find_tracepoint_in_module(struct module *mod, const char *tp_name) { @@ -1169,11 +1182,6 @@ static int trace_fprobe_create_internal(int argc, const char *argv[], if (is_tracepoint) { ctx->flags |= TPARG_FL_TPOINT; tpoint = find_tracepoint(symbol, &tp_mod); - /* lock module until register this tprobe. */ - if (tp_mod && !try_module_get(tp_mod)) { - tpoint = NULL; - tp_mod = NULL; - } if (tpoint) { ctx->funcname = kallsyms_lookup( (unsigned long)tpoint->probestub, @@ -1199,8 +1207,11 @@ static int trace_fprobe_create_internal(int argc, const char *argv[], argc = new_argc; argv = new_argv; } - if (argc > MAX_TRACE_ARGS) + if (argc > MAX_TRACE_ARGS) { + trace_probe_log_set_index(2); + trace_probe_log_err(0, TOO_MANY_ARGS); return -E2BIG; + } ret = traceprobe_expand_dentry_args(argc, argv, &dbuf); if (ret) diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index df56f9b76010..4e37a0f6aaa3 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -25,6 +25,9 @@ static void function_trace_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct ftrace_regs *fregs); static void +function_args_trace_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct ftrace_regs *fregs); +static void function_stack_trace_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct ftrace_regs *fregs); static void @@ -42,9 +45,10 @@ enum { TRACE_FUNC_NO_OPTS = 0x0, /* No flags set. */ TRACE_FUNC_OPT_STACK = 0x1, TRACE_FUNC_OPT_NO_REPEATS = 0x2, + TRACE_FUNC_OPT_ARGS = 0x4, /* Update this to next highest bit. 
*/ - TRACE_FUNC_OPT_HIGHEST_BIT = 0x4 + TRACE_FUNC_OPT_HIGHEST_BIT = 0x8 }; #define TRACE_FUNC_OPT_MASK (TRACE_FUNC_OPT_HIGHEST_BIT - 1) @@ -114,6 +118,8 @@ static ftrace_func_t select_trace_function(u32 flags_val) switch (flags_val & TRACE_FUNC_OPT_MASK) { case TRACE_FUNC_NO_OPTS: return function_trace_call; + case TRACE_FUNC_OPT_ARGS: + return function_args_trace_call; case TRACE_FUNC_OPT_STACK: return function_stack_trace_call; case TRACE_FUNC_OPT_NO_REPEATS: @@ -220,7 +226,34 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, data = this_cpu_ptr(tr->array_buffer.data); if (!atomic_read(&data->disabled)) - trace_function(tr, ip, parent_ip, trace_ctx); + trace_function(tr, ip, parent_ip, trace_ctx, NULL); + + ftrace_test_recursion_unlock(bit); +} + +static void +function_args_trace_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct ftrace_regs *fregs) +{ + struct trace_array *tr = op->private; + struct trace_array_cpu *data; + unsigned int trace_ctx; + int bit; + int cpu; + + if (unlikely(!tr->function_enabled)) + return; + + bit = ftrace_test_recursion_trylock(ip, parent_ip); + if (bit < 0) + return; + + trace_ctx = tracing_gen_ctx(); + + cpu = smp_processor_id(); + data = per_cpu_ptr(tr->array_buffer.data, cpu); + if (!atomic_read(&data->disabled)) + trace_function(tr, ip, parent_ip, trace_ctx, fregs); ftrace_test_recursion_unlock(bit); } @@ -270,7 +303,7 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip, if (likely(disabled == 1)) { trace_ctx = tracing_gen_ctx_flags(flags); - trace_function(tr, ip, parent_ip, trace_ctx); + trace_function(tr, ip, parent_ip, trace_ctx, NULL); #ifdef CONFIG_UNWINDER_FRAME_POINTER if (ftrace_pids_enabled(op)) skip++; @@ -349,7 +382,7 @@ function_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip, trace_ctx = tracing_gen_ctx_dec(); process_repeats(tr, ip, parent_ip, last_info, trace_ctx); - trace_function(tr, ip, parent_ip, trace_ctx); + 
trace_function(tr, ip, parent_ip, trace_ctx, NULL); out: ftrace_test_recursion_unlock(bit); @@ -389,7 +422,7 @@ function_stack_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip, trace_ctx = tracing_gen_ctx_flags(flags); process_repeats(tr, ip, parent_ip, last_info, trace_ctx); - trace_function(tr, ip, parent_ip, trace_ctx); + trace_function(tr, ip, parent_ip, trace_ctx, NULL); __trace_stack(tr, trace_ctx, STACK_SKIP); } @@ -403,6 +436,9 @@ static struct tracer_opt func_opts[] = { { TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) }, #endif { TRACER_OPT(func-no-repeats, TRACE_FUNC_OPT_NO_REPEATS) }, +#ifdef CONFIG_FUNCTION_TRACE_ARGS + { TRACER_OPT(func-args, TRACE_FUNC_OPT_ARGS) }, +#endif { } /* Always set a last empty entry */ }; @@ -597,11 +633,7 @@ ftrace_traceoff(unsigned long ip, unsigned long parent_ip, static __always_inline void trace_stack(struct trace_array *tr) { - unsigned int trace_ctx; - - trace_ctx = tracing_gen_ctx(); - - __trace_stack(tr, trace_ctx, FTRACE_STACK_SKIP); + __trace_stack(tr, tracing_gen_ctx_dec(), FTRACE_STACK_SKIP); } static void diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 136c750b0b4d..0c357a89c58e 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -71,6 +71,10 @@ static struct tracer_opt trace_opts[] = { /* Display function return address ? */ { TRACER_OPT(funcgraph-retaddr, TRACE_GRAPH_PRINT_RETADDR) }, #endif +#ifdef CONFIG_FUNCTION_TRACE_ARGS + /* Display function arguments ? 
*/ + { TRACER_OPT(funcgraph-args, TRACE_GRAPH_ARGS) }, +#endif /* Include sleep time (scheduled out) between entry and return */ { TRACER_OPT(sleep-time, TRACE_GRAPH_SLEEP_TIME) }, @@ -110,25 +114,43 @@ static void print_graph_duration(struct trace_array *tr, unsigned long long duration, struct trace_seq *s, u32 flags); -int __trace_graph_entry(struct trace_array *tr, - struct ftrace_graph_ent *trace, - unsigned int trace_ctx) +static int __graph_entry(struct trace_array *tr, struct ftrace_graph_ent *trace, + unsigned int trace_ctx, struct ftrace_regs *fregs) { struct ring_buffer_event *event; struct trace_buffer *buffer = tr->array_buffer.buffer; struct ftrace_graph_ent_entry *entry; + int size; - event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT, - sizeof(*entry), trace_ctx); + /* If fregs is defined, add FTRACE_REGS_MAX_ARGS long size words */ + size = sizeof(*entry) + (FTRACE_REGS_MAX_ARGS * !!fregs * sizeof(long)); + + event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT, size, trace_ctx); if (!event) return 0; - entry = ring_buffer_event_data(event); - entry->graph_ent = *trace; + + entry = ring_buffer_event_data(event); + entry->graph_ent = *trace; + +#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API + if (fregs) { + for (int i = 0; i < FTRACE_REGS_MAX_ARGS; i++) + entry->args[i] = ftrace_regs_get_argument(fregs, i); + } +#endif + trace_buffer_unlock_commit_nostack(buffer, event); return 1; } +int __trace_graph_entry(struct trace_array *tr, + struct ftrace_graph_ent *trace, + unsigned int trace_ctx) +{ + return __graph_entry(tr, trace, trace_ctx, NULL); +} + #ifdef CONFIG_FUNCTION_GRAPH_RETADDR int __trace_graph_retaddr_entry(struct trace_array *tr, struct ftrace_graph_ent *trace, @@ -174,9 +196,9 @@ struct fgraph_times { unsigned long long sleeptime; /* may be optional! 
*/ }; -int trace_graph_entry(struct ftrace_graph_ent *trace, - struct fgraph_ops *gops, - struct ftrace_regs *fregs) +static int graph_entry(struct ftrace_graph_ent *trace, + struct fgraph_ops *gops, + struct ftrace_regs *fregs) { unsigned long *task_var = fgraph_get_task_var(gops); struct trace_array *tr = gops->private; @@ -246,7 +268,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace, unsigned long retaddr = ftrace_graph_top_ret_addr(current); ret = __trace_graph_retaddr_entry(tr, trace, trace_ctx, retaddr); } else { - ret = __trace_graph_entry(tr, trace, trace_ctx); + ret = __graph_entry(tr, trace, trace_ctx, fregs); } } preempt_enable_notrace(); @@ -254,6 +276,20 @@ int trace_graph_entry(struct ftrace_graph_ent *trace, return ret; } +int trace_graph_entry(struct ftrace_graph_ent *trace, + struct fgraph_ops *gops, + struct ftrace_regs *fregs) +{ + return graph_entry(trace, gops, NULL); +} + +static int trace_graph_entry_args(struct ftrace_graph_ent *trace, + struct fgraph_ops *gops, + struct ftrace_regs *fregs) +{ + return graph_entry(trace, gops, fregs); +} + static void __trace_graph_function(struct trace_array *tr, unsigned long ip, unsigned int trace_ctx) @@ -418,14 +454,17 @@ static int graph_trace_init(struct trace_array *tr) { int ret; - tr->gops->entryfunc = trace_graph_entry; + if (tracer_flags_is_set(TRACE_GRAPH_ARGS)) + tr->gops->entryfunc = trace_graph_entry_args; + else + tr->gops->entryfunc = trace_graph_entry; if (tracing_thresh) tr->gops->retfunc = trace_graph_thresh_return; else tr->gops->retfunc = trace_graph_return; - /* Make gops functions are visible before we start tracing */ + /* Make gops functions visible before we start tracing */ smp_mb(); ret = register_ftrace_graph(tr->gops); @@ -436,6 +475,28 @@ static int graph_trace_init(struct trace_array *tr) return 0; } +static int ftrace_graph_trace_args(struct trace_array *tr, int set) +{ + trace_func_graph_ent_t entry; + + if (set) + entry = trace_graph_entry_args; + else + entry = 
trace_graph_entry; + + /* See if there's any changes */ + if (tr->gops->entryfunc == entry) + return 0; + + unregister_ftrace_graph(tr->gops); + + tr->gops->entryfunc = entry; + + /* Make gops functions visible before we start tracing */ + smp_mb(); + return register_ftrace_graph(tr->gops); +} + static void graph_trace_reset(struct trace_array *tr) { tracing_stop_cmdline_record(); @@ -775,7 +836,7 @@ static void print_graph_retaddr(struct trace_seq *s, struct fgraph_retaddr_ent_e static void print_graph_retval(struct trace_seq *s, struct ftrace_graph_ent_entry *entry, struct ftrace_graph_ret *graph_ret, void *func, - u32 opt_flags, u32 trace_flags) + u32 opt_flags, u32 trace_flags, int args_size) { unsigned long err_code = 0; unsigned long retval = 0; @@ -809,11 +870,16 @@ static void print_graph_retval(struct trace_seq *s, struct ftrace_graph_ent_entr if (entry->ent.type != TRACE_GRAPH_RETADDR_ENT) print_retaddr = false; - trace_seq_printf(s, "%ps();", func); + trace_seq_printf(s, "%ps", func); + + if (args_size >= FTRACE_REGS_MAX_ARGS * sizeof(long)) { + print_function_args(s, entry->args, (unsigned long)func); + trace_seq_putc(s, ';'); + } else + trace_seq_puts(s, "();"); + if (print_retval || print_retaddr) trace_seq_puts(s, " /*"); - else - trace_seq_putc(s, '\n'); } else { print_retaddr = false; trace_seq_printf(s, "} /* %ps", func); @@ -831,12 +897,13 @@ static void print_graph_retval(struct trace_seq *s, struct ftrace_graph_ent_entr } if (!entry || print_retval || print_retaddr) - trace_seq_puts(s, " */\n"); + trace_seq_puts(s, " */"); } #else -#define print_graph_retval(_seq, _ent, _ret, _func, _opt_flags, _trace_flags) do {} while (0) +#define print_graph_retval(_seq, _ent, _ret, _func, _opt_flags, _trace_flags, args_size) \ + do {} while (0) #endif @@ -852,16 +919,17 @@ print_graph_entry_leaf(struct trace_iterator *iter, struct ftrace_graph_ret *graph_ret; struct ftrace_graph_ent *call; unsigned long long duration; - unsigned long func; + unsigned long 
ret_func; + int args_size; int cpu = iter->cpu; int i; + args_size = iter->ent_size - offsetof(struct ftrace_graph_ent_entry, args); + graph_ret = &ret_entry->ret; call = &entry->graph_ent; duration = ret_entry->rettime - ret_entry->calltime; - func = call->func + iter->tr->text_delta; - if (data) { struct fgraph_cpu_data *cpu_data; @@ -887,16 +955,25 @@ print_graph_entry_leaf(struct trace_iterator *iter, for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) trace_seq_putc(s, ' '); + ret_func = graph_ret->func + iter->tr->text_delta; + /* * Write out the function return value or return address */ if (flags & (__TRACE_GRAPH_PRINT_RETVAL | __TRACE_GRAPH_PRINT_RETADDR)) { print_graph_retval(s, entry, graph_ret, (void *)graph_ret->func + iter->tr->text_delta, - flags, tr->trace_flags); + flags, tr->trace_flags, args_size); } else { - trace_seq_printf(s, "%ps();\n", (void *)func); + trace_seq_printf(s, "%ps", (void *)ret_func); + + if (args_size >= FTRACE_REGS_MAX_ARGS * sizeof(long)) { + print_function_args(s, entry->args, ret_func); + trace_seq_putc(s, ';'); + } else + trace_seq_puts(s, "();"); } + trace_seq_putc(s, '\n'); print_graph_irq(iter, graph_ret->func, TRACE_GRAPH_RET, cpu, iter->ent->pid, flags); @@ -913,6 +990,7 @@ print_graph_entry_nested(struct trace_iterator *iter, struct fgraph_data *data = iter->private; struct trace_array *tr = iter->tr; unsigned long func; + int args_size; int i; if (data) { @@ -937,7 +1015,17 @@ print_graph_entry_nested(struct trace_iterator *iter, func = call->func + iter->tr->text_delta; - trace_seq_printf(s, "%ps() {", (void *)func); + trace_seq_printf(s, "%ps", (void *)func); + + args_size = iter->ent_size - offsetof(struct ftrace_graph_ent_entry, args); + + if (args_size >= FTRACE_REGS_MAX_ARGS * sizeof(long)) + print_function_args(s, entry->args, func); + else + trace_seq_puts(s, "()"); + + trace_seq_puts(s, " {"); + if (flags & __TRACE_GRAPH_PRINT_RETADDR && entry->ent.type == TRACE_GRAPH_RETADDR_ENT) print_graph_retaddr(s, 
(struct fgraph_retaddr_ent_entry *)entry, @@ -1107,21 +1195,38 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, struct trace_iterator *iter, u32 flags) { struct fgraph_data *data = iter->private; - struct ftrace_graph_ent *call = &field->graph_ent; + struct ftrace_graph_ent *call; struct ftrace_graph_ret_entry *leaf_ret; static enum print_line_t ret; int cpu = iter->cpu; + /* + * print_graph_entry() may consume the current event, + * thus @field may become invalid, so we need to save it. + * sizeof(struct ftrace_graph_ent_entry) is very small, + * it can be safely saved at the stack. + */ + struct ftrace_graph_ent_entry *entry; + u8 save_buf[sizeof(*entry) + FTRACE_REGS_MAX_ARGS * sizeof(long)]; + + /* The ent_size is expected to be as big as the entry */ + if (iter->ent_size > sizeof(save_buf)) + iter->ent_size = sizeof(save_buf); + + entry = (void *)save_buf; + memcpy(entry, field, iter->ent_size); + + call = &entry->graph_ent; if (check_irq_entry(iter, flags, call->func, call->depth)) return TRACE_TYPE_HANDLED; print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags); - leaf_ret = get_return_for_leaf(iter, field); + leaf_ret = get_return_for_leaf(iter, entry); if (leaf_ret) - ret = print_graph_entry_leaf(iter, field, leaf_ret, s, flags); + ret = print_graph_entry_leaf(iter, entry, leaf_ret, s, flags); else - ret = print_graph_entry_nested(iter, field, s, cpu, flags); + ret = print_graph_entry_nested(iter, entry, s, cpu, flags); if (data) { /* @@ -1195,7 +1300,8 @@ print_graph_return(struct ftrace_graph_ret_entry *retentry, struct trace_seq *s, * funcgraph-retval option is enabled. 
*/ if (flags & __TRACE_GRAPH_PRINT_RETVAL) { - print_graph_retval(s, NULL, trace, (void *)func, flags, tr->trace_flags); + print_graph_retval(s, NULL, trace, (void *)func, flags, + tr->trace_flags, 0); } else { /* * If the return function does not have a matching entry, @@ -1205,10 +1311,11 @@ print_graph_return(struct ftrace_graph_ret_entry *retentry, struct trace_seq *s, * that if the funcgraph-tail option is enabled. */ if (func_match && !(flags & TRACE_GRAPH_PRINT_TAIL)) - trace_seq_puts(s, "}\n"); + trace_seq_puts(s, "}"); else - trace_seq_printf(s, "} /* %ps */\n", (void *)func); + trace_seq_printf(s, "} /* %ps */", (void *)func); } + trace_seq_putc(s, '\n'); /* Overrun */ if (flags & TRACE_GRAPH_PRINT_OVERRUN) @@ -1323,16 +1430,8 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags) switch (entry->type) { case TRACE_GRAPH_ENT: { - /* - * print_graph_entry() may consume the current event, - * thus @field may become invalid, so we need to save it. - * sizeof(struct ftrace_graph_ent_entry) is very small, - * it can be safely saved at the stack. 
- */ - struct ftrace_graph_ent_entry saved; trace_assign_type(field, entry); - saved = *field; - return print_graph_entry(&saved, s, iter, flags); + return print_graph_entry(field, s, iter, flags); } #ifdef CONFIG_FUNCTION_GRAPH_RETADDR case TRACE_GRAPH_RETADDR_ENT: { @@ -1511,6 +1610,7 @@ void graph_trace_close(struct trace_iterator *iter) if (data) { free_percpu(data->cpu_data); kfree(data); + iter->private = NULL; } } @@ -1526,6 +1626,9 @@ func_graph_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) if (bit == TRACE_GRAPH_GRAPH_TIME) ftrace_graph_graph_time_control(set); + if (bit == TRACE_GRAPH_ARGS) + return ftrace_graph_trace_args(tr, set); + return 0; } diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 7294ad676379..40c39e946940 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -150,7 +150,7 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip, trace_ctx = tracing_gen_ctx_flags(flags); - trace_function(tr, ip, parent_ip, trace_ctx); + trace_function(tr, ip, parent_ip, trace_ctx, fregs); atomic_dec(&data->disabled); } @@ -250,8 +250,6 @@ static void irqsoff_trace_open(struct trace_iterator *iter) { if (is_graph(iter->tr)) graph_trace_open(iter); - else - iter->private = NULL; } static void irqsoff_trace_close(struct trace_iterator *iter) @@ -295,11 +293,17 @@ __trace_function(struct trace_array *tr, if (is_graph(tr)) trace_graph_function(tr, ip, parent_ip, trace_ctx); else - trace_function(tr, ip, parent_ip, trace_ctx); + trace_function(tr, ip, parent_ip, trace_ctx, NULL); } #else -#define __trace_function trace_function +static inline void +__trace_function(struct trace_array *tr, + unsigned long ip, unsigned long parent_ip, + unsigned int trace_ctx) +{ + return trace_function(tr, ip, parent_ip, trace_ctx, NULL); +} static enum print_line_t irqsoff_print_line(struct trace_iterator *iter) { diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 
d8d5f18a141a..3e5c47b6d7b2 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -124,9 +124,8 @@ static nokprobe_inline bool trace_kprobe_module_exist(struct trace_kprobe *tk) if (!p) return true; *p = '\0'; - rcu_read_lock_sched(); - ret = !!find_module(tk->symbol); - rcu_read_unlock_sched(); + scoped_guard(rcu) + ret = !!find_module(tk->symbol); *p = ':'; return ret; @@ -796,12 +795,10 @@ static struct module *try_module_get_by_name(const char *name) { struct module *mod; - rcu_read_lock_sched(); + guard(rcu)(); mod = find_module(name); if (mod && !try_module_get(mod)) mod = NULL; - rcu_read_unlock_sched(); - return mod; } #else @@ -1007,8 +1004,11 @@ static int trace_kprobe_create_internal(int argc, const char *argv[], argc = new_argc; argv = new_argv; } - if (argc > MAX_TRACE_ARGS) + if (argc > MAX_TRACE_ARGS) { + trace_probe_log_set_index(2); + trace_probe_log_err(0, TOO_MANY_ARGS); return -E2BIG; + } ret = traceprobe_expand_dentry_args(argc, argv, &dbuf); if (ret) @@ -1089,7 +1089,7 @@ static int create_or_delete_trace_kprobe(const char *raw_command) if (raw_command[0] == '-') return dyn_event_release(raw_command, &trace_kprobe_ops); - ret = trace_kprobe_create(raw_command); + ret = dyn_event_create(raw_command, &trace_kprobe_ops); return ret == -ECANCELED ? -EINVAL : ret; } diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 512034e365ad..e732c9e37e14 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -316,33 +316,6 @@ static inline void osn_var_reset_all(void) bool trace_osnoise_callback_enabled; /* - * osnoise sample structure definition. Used to store the statistics of a - * sample run. - */ -struct osnoise_sample { - u64 runtime; /* runtime */ - u64 noise; /* noise */ - u64 max_sample; /* max single noise sample */ - int hw_count; /* # HW (incl. 
hypervisor) interference */ - int nmi_count; /* # NMIs during this sample */ - int irq_count; /* # IRQs during this sample */ - int softirq_count; /* # softirqs during this sample */ - int thread_count; /* # threads during this sample */ -}; - -#ifdef CONFIG_TIMERLAT_TRACER -/* - * timerlat sample structure definition. Used to store the statistics of - * a sample run. - */ -struct timerlat_sample { - u64 timer_latency; /* timer_latency */ - unsigned int seqnum; /* unique sequence */ - int context; /* timer context */ -}; -#endif - -/* * Tracer data. */ static struct osnoise_data { @@ -497,7 +470,7 @@ static void print_osnoise_headers(struct seq_file *s) * Record an osnoise_sample into the tracer buffer. */ static void -__trace_osnoise_sample(struct osnoise_sample *sample, struct trace_buffer *buffer) +__record_osnoise_sample(struct osnoise_sample *sample, struct trace_buffer *buffer) { struct ring_buffer_event *event; struct osnoise_entry *entry; @@ -520,17 +493,19 @@ __trace_osnoise_sample(struct osnoise_sample *sample, struct trace_buffer *buffe } /* - * Record an osnoise_sample on all osnoise instances. + * Record an osnoise_sample on all osnoise instances and fire trace event. 
*/ -static void trace_osnoise_sample(struct osnoise_sample *sample) +static void record_osnoise_sample(struct osnoise_sample *sample) { struct osnoise_instance *inst; struct trace_buffer *buffer; + trace_osnoise_sample(sample); + rcu_read_lock(); list_for_each_entry_rcu(inst, &osnoise_instances, list) { buffer = inst->tr->array_buffer.buffer; - __trace_osnoise_sample(sample, buffer); + __record_osnoise_sample(sample, buffer); } rcu_read_unlock(); } @@ -574,7 +549,7 @@ static void print_timerlat_headers(struct seq_file *s) #endif /* CONFIG_PREEMPT_RT */ static void -__trace_timerlat_sample(struct timerlat_sample *sample, struct trace_buffer *buffer) +__record_timerlat_sample(struct timerlat_sample *sample, struct trace_buffer *buffer) { struct ring_buffer_event *event; struct timerlat_entry *entry; @@ -594,15 +569,17 @@ __trace_timerlat_sample(struct timerlat_sample *sample, struct trace_buffer *buf /* * Record an timerlat_sample into the tracer buffer. */ -static void trace_timerlat_sample(struct timerlat_sample *sample) +static void record_timerlat_sample(struct timerlat_sample *sample) { struct osnoise_instance *inst; struct trace_buffer *buffer; + trace_timerlat_sample(sample); + rcu_read_lock(); list_for_each_entry_rcu(inst, &osnoise_instances, list) { buffer = inst->tr->array_buffer.buffer; - __trace_timerlat_sample(sample, buffer); + __record_timerlat_sample(sample, buffer); } rcu_read_unlock(); } @@ -1606,7 +1583,7 @@ static int run_osnoise(void) /* Save interference stats info */ diff_osn_sample_stats(osn_var, &s); - trace_osnoise_sample(&s); + record_osnoise_sample(&s); notify_new_max_latency(max_noise); @@ -1801,7 +1778,7 @@ static enum hrtimer_restart timerlat_irq(struct hrtimer *timer) s.timer_latency = diff; s.context = IRQ_CONTEXT; - trace_timerlat_sample(&s); + record_timerlat_sample(&s); if (osnoise_data.stop_tracing) { if (time_to_us(diff) >= osnoise_data.stop_tracing) { @@ -1899,8 +1876,7 @@ static int timerlat_main(void *data) tlat->count = 0; 
tlat->tracing_thread = false; - hrtimer_init(&tlat->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD); - tlat->timer.function = timerlat_irq; + hrtimer_setup(&tlat->timer, timerlat_irq, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD); tlat->kthread = current; osn_var->pid = current->pid; /* @@ -1921,7 +1897,7 @@ static int timerlat_main(void *data) s.timer_latency = diff; s.context = THREAD_CONTEXT; - trace_timerlat_sample(&s); + record_timerlat_sample(&s); notify_new_max_latency(diff); @@ -2030,7 +2006,6 @@ static int start_kthread(unsigned int cpu) if (IS_ERR(kthread)) { pr_err(BANNER "could not start sampling thread\n"); - stop_per_cpu_kthreads(); return -ENOMEM; } @@ -2454,8 +2429,7 @@ static int timerlat_fd_open(struct inode *inode, struct file *file) tlat = this_cpu_tmr_var(); tlat->count = 0; - hrtimer_init(&tlat->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD); - tlat->timer.function = timerlat_irq; + hrtimer_setup(&tlat->timer, timerlat_irq, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD); migrate_enable(); return 0; @@ -2527,7 +2501,7 @@ timerlat_fd_read(struct file *file, char __user *ubuf, size_t count, s.timer_latency = diff; s.context = THREAD_URET; - trace_timerlat_sample(&s); + record_timerlat_sample(&s); notify_new_max_latency(diff); @@ -2562,7 +2536,7 @@ timerlat_fd_read(struct file *file, char __user *ubuf, size_t count, s.timer_latency = diff; s.context = THREAD_CONTEXT; - trace_timerlat_sample(&s); + record_timerlat_sample(&s); if (osnoise_data.stop_tracing_total) { if (time_to_us(diff) >= osnoise_data.stop_tracing_total) { diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 03d56f711ad1..b9ab06c99543 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -5,6 +5,7 @@ * Copyright (C) 2008 Red Hat Inc, Steven Rostedt <srostedt@redhat.com> * */ +#include "trace.h" #include <linux/module.h> #include <linux/mutex.h> #include <linux/ftrace.h> @@ -12,15 +13,19 @@ #include <linux/sched/clock.h> 
#include <linux/sched/mm.h> #include <linux/idr.h> +#include <linux/btf.h> +#include <linux/bpf.h> +#include <linux/hashtable.h> #include "trace_output.h" +#include "trace_btf.h" -/* must be a power of 2 */ -#define EVENT_HASHSIZE 128 +/* 2^7 = 128 */ +#define EVENT_HASH_BITS 7 DECLARE_RWSEM(trace_event_sem); -static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; +static DEFINE_HASHTABLE(event_hash, EVENT_HASH_BITS); enum print_line_t trace_print_bputs_msg_only(struct trace_iterator *iter) { @@ -684,6 +689,88 @@ int trace_print_lat_context(struct trace_iterator *iter) return !trace_seq_has_overflowed(s); } +#ifdef CONFIG_FUNCTION_TRACE_ARGS +void print_function_args(struct trace_seq *s, unsigned long *args, + unsigned long func) +{ + const struct btf_param *param; + const struct btf_type *t; + const char *param_name; + char name[KSYM_NAME_LEN]; + unsigned long arg; + struct btf *btf; + s32 tid, nr = 0; + int a, p, x; + + trace_seq_printf(s, "("); + + if (!args) + goto out; + if (lookup_symbol_name(func, name)) + goto out; + + /* TODO: Pass module name here too */ + t = btf_find_func_proto(name, &btf); + if (IS_ERR_OR_NULL(t)) + goto out; + + param = btf_get_func_param(t, &nr); + if (!param) + goto out_put; + + for (a = 0, p = 0; p < nr; a++, p++) { + if (p) + trace_seq_puts(s, ", "); + + /* This only prints what the arch allows (6 args by default) */ + if (a == FTRACE_REGS_MAX_ARGS) { + trace_seq_puts(s, "..."); + break; + } + + arg = args[a]; + + param_name = btf_name_by_offset(btf, param[p].name_off); + if (param_name) + trace_seq_printf(s, "%s=", param_name); + t = btf_type_skip_modifiers(btf, param[p].type, &tid); + + switch (t ? 
BTF_INFO_KIND(t->info) : BTF_KIND_UNKN) { + case BTF_KIND_UNKN: + trace_seq_putc(s, '?'); + /* Still print unknown type values */ + fallthrough; + case BTF_KIND_PTR: + trace_seq_printf(s, "0x%lx", arg); + break; + case BTF_KIND_INT: + trace_seq_printf(s, "%ld", arg); + break; + case BTF_KIND_ENUM: + trace_seq_printf(s, "%ld", arg); + break; + default: + /* This does not handle complex arguments */ + trace_seq_printf(s, "(%s)[0x%lx", btf_type_str(t), arg); + for (x = sizeof(long); x < t->size; x += sizeof(long)) { + trace_seq_putc(s, ':'); + if (++a == FTRACE_REGS_MAX_ARGS) { + trace_seq_puts(s, "...]"); + goto out_put; + } + trace_seq_printf(s, "0x%lx", args[a]); + } + trace_seq_putc(s, ']'); + break; + } + } +out_put: + btf_put(btf); +out: + trace_seq_printf(s, ")"); +} +#endif + /** * ftrace_find_event - find a registered event * @type: the type of event to look for @@ -694,11 +781,8 @@ int trace_print_lat_context(struct trace_iterator *iter) struct trace_event *ftrace_find_event(int type) { struct trace_event *event; - unsigned key; - - key = type & (EVENT_HASHSIZE - 1); - hlist_for_each_entry(event, &event_hash[key], node) { + hash_for_each_possible(event_hash, event, node, type) { if (event->type == type) return event; } @@ -753,7 +837,6 @@ void trace_event_read_unlock(void) */ int register_trace_event(struct trace_event *event) { - unsigned key; int ret = 0; down_write(&trace_event_sem); @@ -786,9 +869,7 @@ int register_trace_event(struct trace_event *event) if (event->funcs->binary == NULL) event->funcs->binary = trace_nop_print; - key = event->type & (EVENT_HASHSIZE - 1); - - hlist_add_head(&event->node, &event_hash[key]); + hash_add(event_hash, &event->node, event->type); ret = event->type; out: @@ -803,7 +884,7 @@ EXPORT_SYMBOL_GPL(register_trace_event); */ int __unregister_trace_event(struct trace_event *event) { - hlist_del(&event->node); + hash_del(&event->node); free_trace_event_type(event->type); return 0; } @@ -961,11 +1042,12 @@ enum print_line_t 
print_event_fields(struct trace_iterator *iter, struct trace_event_call *call; struct list_head *head; + lockdep_assert_held_read(&trace_event_sem); + /* ftrace defined events have separate call structures */ if (event->type <= __TRACE_LAST_TYPE) { bool found = false; - down_read(&trace_event_sem); list_for_each_entry(call, &ftrace_events, list) { if (call->event.type == event->type) { found = true; @@ -975,7 +1057,6 @@ enum print_line_t print_event_fields(struct trace_iterator *iter, if (call->event.type > __TRACE_LAST_TYPE) break; } - up_read(&trace_event_sem); if (!found) { trace_seq_printf(&iter->seq, "UNKNOWN TYPE %d\n", event->type); goto out; @@ -1005,12 +1086,15 @@ enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags, } static void print_fn_trace(struct trace_seq *s, unsigned long ip, - unsigned long parent_ip, long delta, int flags) + unsigned long parent_ip, long delta, + unsigned long *args, int flags) { ip += delta; parent_ip += delta; seq_print_ip_sym(s, ip, flags); + if (args) + print_function_args(s, args, ip); if ((flags & TRACE_ITER_PRINT_PARENT) && parent_ip) { trace_seq_puts(s, " <-"); @@ -1024,10 +1108,19 @@ static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags, { struct ftrace_entry *field; struct trace_seq *s = &iter->seq; + unsigned long *args; + int args_size; trace_assign_type(field, iter->ent); - print_fn_trace(s, field->ip, field->parent_ip, iter->tr->text_delta, flags); + args_size = iter->ent_size - offsetof(struct ftrace_entry, args); + if (args_size >= FTRACE_REGS_MAX_ARGS * sizeof(long)) + args = field->args; + else + args = NULL; + + print_fn_trace(s, field->ip, field->parent_ip, iter->tr->text_delta, + args, flags); trace_seq_putc(s, '\n'); return trace_handle_return(s); @@ -1248,7 +1341,6 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter, struct trace_seq *s = &iter->seq; unsigned long *p; unsigned long *end; - long delta = iter->tr->text_delta; 
trace_assign_type(field, iter->ent); end = (unsigned long *)((long)iter->ent + iter->ent_size); @@ -1265,7 +1357,7 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter, trace_seq_puts(s, "[FTRACE TRAMPOLINE]\n"); continue; } - seq_print_ip_sym(s, (*p) + delta, flags); + seq_print_ip_sym(s, trace_adjust_address(iter->tr, *p), flags); trace_seq_putc(s, '\n'); } @@ -1700,7 +1792,7 @@ trace_func_repeats_print(struct trace_iterator *iter, int flags, trace_assign_type(field, iter->ent); - print_fn_trace(s, field->ip, field->parent_ip, iter->tr->text_delta, flags); + print_fn_trace(s, field->ip, field->parent_ip, iter->tr->text_delta, NULL, flags); trace_seq_printf(s, " (repeats: %u, last_ts:", field->count); trace_print_time(s, iter, iter->ts - FUNC_REPEATS_GET_DELTA_TS(field)); diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index dca40f1f1da4..2e305364f2a9 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -41,5 +41,14 @@ extern struct rw_semaphore trace_event_sem; #define SEQ_PUT_HEX_FIELD(s, x) \ trace_seq_putmem_hex(s, &(x), sizeof(x)) +#ifdef CONFIG_FUNCTION_TRACE_ARGS +void print_function_args(struct trace_seq *s, unsigned long *args, + unsigned long func); +#else +static inline void print_function_args(struct trace_seq *s, unsigned long *args, + unsigned long func) { + trace_seq_puts(s, "()"); +} +#endif #endif diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 8f58ee1e8858..424751cdf31f 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -154,9 +154,12 @@ fail: } static struct trace_probe_log trace_probe_log; +extern struct mutex dyn_event_ops_mutex; void trace_probe_log_init(const char *subsystem, int argc, const char **argv) { + lockdep_assert_held(&dyn_event_ops_mutex); + trace_probe_log.subsystem = subsystem; trace_probe_log.argc = argc; trace_probe_log.argv = argv; @@ -165,11 +168,15 @@ void trace_probe_log_init(const char *subsystem, int 
argc, const char **argv) void trace_probe_log_clear(void) { + lockdep_assert_held(&dyn_event_ops_mutex); + memset(&trace_probe_log, 0, sizeof(trace_probe_log)); } void trace_probe_log_set_index(int index) { + lockdep_assert_held(&dyn_event_ops_mutex); + trace_probe_log.index = index; } @@ -178,6 +185,8 @@ void __trace_probe_log_err(int offset, int err_type) char *command, *p; int i, len = 0, pos = 0; + lockdep_assert_held(&dyn_event_ops_mutex); + if (!trace_probe_log.argv) return; @@ -770,6 +779,10 @@ static int check_prepare_btf_string_fetch(char *typename, #ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API +/* + * Add the entry code to store the 'argnum'th parameter and return the offset + * in the entry data buffer where the data will be stored. + */ static int __store_entry_arg(struct trace_probe *tp, int argnum) { struct probe_entry_arg *earg = tp->entry_arg; @@ -793,6 +806,20 @@ static int __store_entry_arg(struct trace_probe *tp, int argnum) tp->entry_arg = earg; } + /* + * The entry code array is repeating the pair of + * [FETCH_OP_ARG(argnum)][FETCH_OP_ST_EDATA(offset of entry data buffer)] + * and the rest of entries are filled with [FETCH_OP_END]. + * + * To reduce the redundant function parameter fetching, we scan the entry + * code array to find the FETCH_OP_ARG which already fetches the 'argnum' + * parameter. If it doesn't match, update 'offset' to find the last + * offset. + * If we find the FETCH_OP_END without matching FETCH_OP_ARG entry, we + * will save the entry with FETCH_OP_ARG and FETCH_OP_ST_EDATA, and + * return data offset so that caller can find the data offset in the entry + * data buffer. + */ offset = 0; for (i = 0; i < earg->size - 1; i++) { switch (earg->code[i].op) { @@ -826,6 +853,16 @@ int traceprobe_get_entry_data_size(struct trace_probe *tp) if (!earg) return 0; + /* + * earg->code[] array has an operation sequence which is run in + * the entry handler. 
+ * The sequence stopped by FETCH_OP_END and each data stored in + * the entry data buffer by FETCH_OP_ST_EDATA. The FETCH_OP_ST_EDATA + * stores the data at the data buffer + its offset, and all data are + * "unsigned long" size. The offset must be increased when a data is + * stored. Thus we need to find the last FETCH_OP_ST_EDATA in the + * code array. + */ for (i = 0; i < earg->size; i++) { switch (earg->code[i].op) { case FETCH_OP_END: diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 96792bc4b092..854e5668f5ee 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -545,6 +545,7 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, C(BAD_BTF_TID, "Failed to get BTF type info."),\ C(BAD_TYPE4STR, "This type does not fit for string."),\ C(NEED_STRING_TYPE, "$comm and immediate-string only accepts string type"),\ + C(TOO_MANY_ARGS, "Too many arguments are specified"), \ C(TOO_MANY_EARGS, "Too many entry arguments specified"), #undef C diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index af30586f1aea..a0db3404f7f7 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -188,8 +188,6 @@ static void wakeup_trace_open(struct trace_iterator *iter) { if (is_graph(iter->tr)) graph_trace_open(iter); - else - iter->private = NULL; } static void wakeup_trace_close(struct trace_iterator *iter) @@ -242,7 +240,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip, return; local_irq_save(flags); - trace_function(tr, ip, parent_ip, trace_ctx); + trace_function(tr, ip, parent_ip, trace_ctx, fregs); local_irq_restore(flags); atomic_dec(&data->disabled); @@ -327,7 +325,7 @@ __trace_function(struct trace_array *tr, if (is_graph(tr)) trace_graph_function(tr, ip, parent_ip, trace_ctx); else - trace_function(tr, ip, parent_ip, trace_ctx); + trace_function(tr, ip, parent_ip, trace_ctx, NULL); } static int wakeup_flag_changed(struct 
trace_array *tr, u32 mask, int set) diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 14c6f272c4d8..e34223c8065d 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -32,7 +32,7 @@ static arch_spinlock_t stack_trace_max_lock = DEFINE_PER_CPU(int, disable_stack_tracer); static DEFINE_MUTEX(stack_sysctl_mutex); -int stack_tracer_enabled; +static int stack_tracer_enabled; static void print_max_stack(void) { @@ -578,3 +578,23 @@ static __init int stack_trace_init(void) } device_initcall(stack_trace_init); + + +static const struct ctl_table trace_stack_sysctl_table[] = { + { + .procname = "stack_tracer_enabled", + .data = &stack_tracer_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = stack_trace_sysctl, + }, +}; + +static int __init init_trace_stack_sysctls(void) +{ + register_sysctl_init("kernel", trace_stack_sysctl_table); + return 0; +} +subsys_initcall(init_trace_stack_sysctls); + + diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index ccc762fbb69c..f95a2c3d5b1b 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -562,8 +562,14 @@ static int __trace_uprobe_create(int argc, const char **argv) if (argc < 2) return -ECANCELED; - if (argc - 2 > MAX_TRACE_ARGS) + + trace_probe_log_init("trace_uprobe", argc, argv); + + if (argc - 2 > MAX_TRACE_ARGS) { + trace_probe_log_set_index(2); + trace_probe_log_err(0, TOO_MANY_ARGS); return -E2BIG; + } if (argv[0][1] == ':') event = &argv[0][2]; @@ -582,7 +588,6 @@ static int __trace_uprobe_create(int argc, const char **argv) return -ECANCELED; } - trace_probe_log_init("trace_uprobe", argc, argv); trace_probe_log_set_index(1); /* filename is the 2nd argument */ *arg++ = '\0'; @@ -736,7 +741,7 @@ static int create_or_delete_trace_uprobe(const char *raw_command) if (raw_command[0] == '-') return dyn_event_release(raw_command, &trace_uprobe_ops); - ret = trace_uprobe_create(raw_command); + ret = 
dyn_event_create(raw_command, &trace_uprobe_ops); return ret == -ECANCELED ? -EINVAL : ret; } @@ -1484,7 +1489,7 @@ int bpf_get_uprobe_info(const struct perf_event *event, u32 *fd_type, : BPF_FD_TYPE_UPROBE; *filename = tu->filename; *probe_offset = tu->offset; - *probe_addr = 0; + *probe_addr = tu->ref_ctr_offset; return 0; } #endif /* CONFIG_PERF_EVENTS */ diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 1848ce7e2976..62719d2941c9 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -127,7 +127,7 @@ static void debug_print_probes(struct tracepoint_func *funcs) return; for (i = 0; funcs[i].func; i++) - printk(KERN_DEBUG "Probe %d : %p\n", i, funcs[i].func); + printk(KERN_DEBUG "Probe %d : %pSb\n", i, funcs[i].func); } static struct tracepoint_func * diff --git a/kernel/ucount.c b/kernel/ucount.c index 86c5f1c0bad9..8686e329b8f2 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -11,11 +11,14 @@ struct ucounts init_ucounts = { .ns = &init_user_ns, .uid = GLOBAL_ROOT_UID, - .count = ATOMIC_INIT(1), + .count = RCUREF_INIT(1), }; #define UCOUNTS_HASHTABLE_BITS 10 -static struct hlist_head ucounts_hashtable[(1 << UCOUNTS_HASHTABLE_BITS)]; +#define UCOUNTS_HASHTABLE_ENTRIES (1 << UCOUNTS_HASHTABLE_BITS) +static struct hlist_nulls_head ucounts_hashtable[UCOUNTS_HASHTABLE_ENTRIES] = { + [0 ... 
UCOUNTS_HASHTABLE_ENTRIES - 1] = HLIST_NULLS_HEAD_INIT(0) +}; static DEFINE_SPINLOCK(ucounts_lock); #define ucounts_hashfn(ns, uid) \ @@ -24,7 +27,6 @@ static DEFINE_SPINLOCK(ucounts_lock); #define ucounts_hashentry(ns, uid) \ (ucounts_hashtable + ucounts_hashfn(ns, uid)) - #ifdef CONFIG_SYSCTL static struct ctl_table_set * set_lookup(struct ctl_table_root *root) @@ -127,88 +129,73 @@ void retire_userns_sysctls(struct user_namespace *ns) #endif } -static struct ucounts *find_ucounts(struct user_namespace *ns, kuid_t uid, struct hlist_head *hashent) +static struct ucounts *find_ucounts(struct user_namespace *ns, kuid_t uid, + struct hlist_nulls_head *hashent) { struct ucounts *ucounts; + struct hlist_nulls_node *pos; - hlist_for_each_entry(ucounts, hashent, node) { - if (uid_eq(ucounts->uid, uid) && (ucounts->ns == ns)) - return ucounts; + guard(rcu)(); + hlist_nulls_for_each_entry_rcu(ucounts, pos, hashent, node) { + if (uid_eq(ucounts->uid, uid) && (ucounts->ns == ns)) { + if (rcuref_get(&ucounts->count)) + return ucounts; + } } return NULL; } static void hlist_add_ucounts(struct ucounts *ucounts) { - struct hlist_head *hashent = ucounts_hashentry(ucounts->ns, ucounts->uid); + struct hlist_nulls_head *hashent = ucounts_hashentry(ucounts->ns, ucounts->uid); + spin_lock_irq(&ucounts_lock); - hlist_add_head(&ucounts->node, hashent); + hlist_nulls_add_head_rcu(&ucounts->node, hashent); spin_unlock_irq(&ucounts_lock); } -static inline bool get_ucounts_or_wrap(struct ucounts *ucounts) +struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid) { - /* Returns true on a successful get, false if the count wraps. 
*/ - return !atomic_add_negative(1, &ucounts->count); -} + struct hlist_nulls_head *hashent = ucounts_hashentry(ns, uid); + struct ucounts *ucounts, *new; -struct ucounts *get_ucounts(struct ucounts *ucounts) -{ - if (!get_ucounts_or_wrap(ucounts)) { - put_ucounts(ucounts); - ucounts = NULL; - } - return ucounts; -} + ucounts = find_ucounts(ns, uid, hashent); + if (ucounts) + return ucounts; -struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid) -{ - struct hlist_head *hashent = ucounts_hashentry(ns, uid); - bool wrapped; - struct ucounts *ucounts, *new = NULL; + new = kzalloc(sizeof(*new), GFP_KERNEL); + if (!new) + return NULL; + + new->ns = ns; + new->uid = uid; + rcuref_init(&new->count, 1); spin_lock_irq(&ucounts_lock); ucounts = find_ucounts(ns, uid, hashent); - if (!ucounts) { + if (ucounts) { spin_unlock_irq(&ucounts_lock); - - new = kzalloc(sizeof(*new), GFP_KERNEL); - if (!new) - return NULL; - - new->ns = ns; - new->uid = uid; - atomic_set(&new->count, 1); - - spin_lock_irq(&ucounts_lock); - ucounts = find_ucounts(ns, uid, hashent); - if (!ucounts) { - hlist_add_head(&new->node, hashent); - get_user_ns(new->ns); - spin_unlock_irq(&ucounts_lock); - return new; - } + kfree(new); + return ucounts; } - wrapped = !get_ucounts_or_wrap(ucounts); + hlist_nulls_add_head_rcu(&new->node, hashent); + get_user_ns(new->ns); spin_unlock_irq(&ucounts_lock); - kfree(new); - if (wrapped) { - put_ucounts(ucounts); - return NULL; - } - return ucounts; + return new; } void put_ucounts(struct ucounts *ucounts) { unsigned long flags; - if (atomic_dec_and_lock_irqsave(&ucounts->count, &ucounts_lock, flags)) { - hlist_del_init(&ucounts->node); + if (rcuref_put(&ucounts->count)) { + spin_lock_irqsave(&ucounts_lock, flags); + hlist_nulls_del_rcu(&ucounts->node); spin_unlock_irqrestore(&ucounts_lock, flags); + put_user_ns(ucounts->ns); - kfree(ucounts); + kfree_rcu(ucounts, rcu); } } diff --git a/kernel/vhost_task.c b/kernel/vhost_task.c index 
2ef2e1b80091..2f844c279a3e 100644 --- a/kernel/vhost_task.c +++ b/kernel/vhost_task.c @@ -111,7 +111,7 @@ EXPORT_SYMBOL_GPL(vhost_task_stop); * @arg: data to be passed to fn and handled_kill * @name: the thread's name * - * This returns a specialized task for use by the vhost layer or NULL on + * This returns a specialized task for use by the vhost layer or ERR_PTR() on * failure. The returned task is inactive, and the caller must fire it up * through vhost_task_start(). */ diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 18156023e461..9fa2af9dbf2c 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -795,8 +795,7 @@ static void watchdog_enable(unsigned int cpu) * Start the timer first to prevent the hardlockup watchdog triggering * before the timer has a chance to fire. */ - hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); - hrtimer->function = watchdog_timer_fn; + hrtimer_setup(hrtimer, watchdog_timer_fn, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL_PINNED_HARD); diff --git a/kernel/watchdog_perf.c b/kernel/watchdog_perf.c index a78ff092d636..75af12ff774e 100644 --- a/kernel/watchdog_perf.c +++ b/kernel/watchdog_perf.c @@ -269,12 +269,10 @@ void __init hardlockup_config_perf_event(const char *str) } else { unsigned int len = comma - str; - if (len >= sizeof(buf)) + if (len > sizeof(buf)) return; - if (strscpy(buf, str, sizeof(buf)) < 0) - return; - buf[len] = 0; + strscpy(buf, str, len); if (kstrtoull(buf, 16, &config)) return; } diff --git a/kernel/workqueue.c b/kernel/workqueue.c index bfe030b443e2..3bef0754cf73 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -686,7 +686,7 @@ EXPORT_SYMBOL_GPL(destroy_work_on_stack); void destroy_delayed_work_on_stack(struct delayed_work *work) { - destroy_timer_on_stack(&work->timer); + timer_destroy_on_stack(&work->timer); debug_object_free(&work->work, &work_debug_descr); } EXPORT_SYMBOL_GPL(destroy_delayed_work_on_stack); 
@@ -2057,11 +2057,11 @@ static int try_to_grab_pending(struct work_struct *work, u32 cflags, struct delayed_work *dwork = to_delayed_work(work); /* - * dwork->timer is irqsafe. If del_timer() fails, it's + * dwork->timer is irqsafe. If timer_delete() fails, it's * guaranteed that the timer is not queued anywhere and not * running on the local CPU. */ - if (likely(del_timer(&dwork->timer))) + if (likely(timer_delete(&dwork->timer))) return 1; } @@ -3069,7 +3069,7 @@ restart: break; } - del_timer_sync(&pool->mayday_timer); + timer_delete_sync(&pool->mayday_timer); raw_spin_lock_irq(&pool->lock); /* * This is necessary even after a new worker was just successfully @@ -3241,7 +3241,7 @@ __acquires(&pool->lock) * point will only record its address. */ trace_workqueue_execute_end(work, worker->current_func); - pwq->stats[PWQ_STAT_COMPLETED]++; + lock_map_release(&lockdep_map); if (!bh_draining) lock_map_release(pwq->wq->lockdep_map); @@ -3272,6 +3272,8 @@ __acquires(&pool->lock) raw_spin_lock_irq(&pool->lock); + pwq->stats[PWQ_STAT_COMPLETED]++; + /* * In addition to %WQ_CPU_INTENSIVE, @worker may also have been marked * CPU intensive by wq_worker_tick() if @work hogged CPU longer than @@ -4281,7 +4283,7 @@ EXPORT_SYMBOL_GPL(flush_work); bool flush_delayed_work(struct delayed_work *dwork) { local_irq_disable(); - if (del_timer_sync(&dwork->timer)) + if (timer_delete_sync(&dwork->timer)) __queue_work(dwork->cpu, dwork->wq, &dwork->work); local_irq_enable(); return flush_work(&dwork->work); @@ -4984,9 +4986,9 @@ static void put_unbound_pool(struct worker_pool *pool) reap_dying_workers(&cull_list); /* shut down the timers */ - del_timer_sync(&pool->idle_timer); + timer_delete_sync(&pool->idle_timer); cancel_work_sync(&pool->idle_cull_work); - del_timer_sync(&pool->mayday_timer); + timer_delete_sync(&pool->mayday_timer); /* RCU protected to allow dereferences from get_work_pool() */ call_rcu(&pool->rcu, rcu_free_pool); @@ -5837,6 +5839,17 @@ static bool pwq_busy(struct 
pool_workqueue *pwq) * @wq: target workqueue * * Safely destroy a workqueue. All work currently pending will be done first. + * + * This function does NOT guarantee that non-pending work that has been + * submitted with queue_delayed_work() and similar functions will be done + * before destroying the workqueue. The fundamental problem is that, currently, + * the workqueue has no way of accessing non-pending delayed_work. delayed_work + * is only linked on the timer-side. All delayed_work must, therefore, be + * canceled before calling this function. + * + * TODO: It would be better if the problem described above wouldn't exist and + * destroy_workqueue() would cleanly cancel all pending and non-pending + * delayed_work. */ void destroy_workqueue(struct workqueue_struct *wq) { @@ -7637,7 +7650,7 @@ notrace void wq_watchdog_touch(int cpu) static void wq_watchdog_set_thresh(unsigned long thresh) { wq_watchdog_thresh = 0; - del_timer_sync(&wq_watchdog_timer); + timer_delete_sync(&wq_watchdog_timer); if (thresh) { wq_watchdog_thresh = thresh; |