diff options
Diffstat (limited to 'kernel')
110 files changed, 5200 insertions, 2841 deletions
diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec index 422270d64820..54e581072617 100644 --- a/kernel/Kconfig.kexec +++ b/kernel/Kconfig.kexec @@ -109,6 +109,15 @@ config KEXEC_HANDOVER to keep data or state alive across the kexec. For this to work, both source and target kernels need to have this option enabled. +config KEXEC_HANDOVER_DEBUG + bool "Enable Kexec Handover debug checks" + depends on KEXEC_HANDOVER + help + This option enables extra sanity checks for the Kexec Handover + subsystem. Since, KHO performance is crucial in live update + scenarios and the extra code might be adding overhead it is + only optionally enabled. + config CRASH_DUMP bool "kernel crash dumps" default ARCH_DEFAULT_CRASH_DUMP diff --git a/kernel/Makefile b/kernel/Makefile index df3dd8291bb6..9fe722305c9b 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -83,6 +83,7 @@ obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_KEXEC_FILE) += kexec_file.o obj-$(CONFIG_KEXEC_ELF) += kexec_elf.o obj-$(CONFIG_KEXEC_HANDOVER) += kexec_handover.o +obj-$(CONFIG_KEXEC_HANDOVER_DEBUG) += kexec_handover_debug.o obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_CGROUPS) += cgroup/ diff --git a/kernel/acct.c b/kernel/acct.c index 61630110e29d..2a2b3c874acd 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -520,26 +520,23 @@ static void fill_ac(struct bsd_acct_struct *acct) static void acct_write_process(struct bsd_acct_struct *acct) { struct file *file = acct->file; - const struct cred *cred; acct_t *ac = &acct->ac; /* Perform file operations on behalf of whoever enabled accounting */ - cred = override_creds(file->f_cred); - - /* - * First check to see if there is enough free_space to continue - * the process accounting system. Then get freeze protection. If - * the fs is frozen, just skip the write as we could deadlock - * the system otherwise. - */ - if (check_free_space(acct) && file_start_write_trylock(file)) { - /* it's been opened O_APPEND, so position is irrelevant */ - loff_t pos = 0; - __kernel_write(file, ac, sizeof(acct_t), &pos); - file_end_write(file); + scoped_with_creds(file->f_cred) { + /* + * First check to see if there is enough free_space to continue + * the process accounting system. Then get freeze protection. If + * the fs is frozen, just skip the write as we could deadlock + * the system otherwise. + */ + if (check_free_space(acct) && file_start_write_trylock(file)) { + /* it's been opened O_APPEND, so position is irrelevant */ + loff_t pos = 0; + __kernel_write(file, ac, sizeof(acct_t), &pos); + file_end_write(file); + } } - - revert_creds(cred); } static void do_acct_process(struct bsd_acct_struct *acct) diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 6ac35430c573..eec60b57bd3d 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -634,37 +634,24 @@ release_prog: int bpf_iter_new_fd(struct bpf_link *link) { struct bpf_iter_link *iter_link; - struct file *file; unsigned int flags; - int err, fd; + int err; if (link->ops != &bpf_iter_link_lops) return -EINVAL; flags = O_RDONLY | O_CLOEXEC; - fd = get_unused_fd_flags(flags); - if (fd < 0) - return fd; - - file = anon_inode_getfile("bpf_iter", &bpf_iter_fops, NULL, flags); - if (IS_ERR(file)) { - err = PTR_ERR(file); - goto free_fd; - } + + FD_PREPARE(fdf, flags, anon_inode_getfile("bpf_iter", &bpf_iter_fops, NULL, flags)); + if (fdf.err) + return fdf.err; iter_link = container_of(link, struct bpf_iter_link, link); - err = prepare_seq_file(file, iter_link); + err = prepare_seq_file(fd_prepare_file(fdf), iter_link); if (err) - goto free_file; + return err; /* Automatic cleanup handles fput */ - fd_install(fd, file); - return fd; - -free_file: - fput(file); -free_fd: - put_unused_fd(fd); - return err; + return fd_publish(fdf); } struct bpf_prog *bpf_iter_get_info(struct bpf_iter_meta *meta, bool in_stop) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index eb25e70e0bdc..e4007fea4909 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -4169,7 +4169,8 @@ release_prog: } /** - * bpf_task_work_schedule_signal - Schedule BPF callback using task_work_add with TWA_SIGNAL mode + * bpf_task_work_schedule_signal_impl - Schedule BPF callback using task_work_add with TWA_SIGNAL + * mode * @task: Task struct for which callback should be scheduled * @tw: Pointer to struct bpf_task_work in BPF map value for internal bookkeeping * @map__map: bpf_map that embeds struct bpf_task_work in the values @@ -4178,15 +4179,17 @@ release_prog: * * Return: 0 if task work has been scheduled successfully, negative error code otherwise */ -__bpf_kfunc int bpf_task_work_schedule_signal(struct task_struct *task, struct bpf_task_work *tw, - void *map__map, bpf_task_work_callback_t callback, - void *aux__prog) +__bpf_kfunc int bpf_task_work_schedule_signal_impl(struct task_struct *task, + struct bpf_task_work *tw, void *map__map, + bpf_task_work_callback_t callback, + void *aux__prog) { return bpf_task_work_schedule(task, tw, map__map, callback, aux__prog, TWA_SIGNAL); } /** - * bpf_task_work_schedule_resume - Schedule BPF callback using task_work_add with TWA_RESUME mode + * bpf_task_work_schedule_resume_impl - Schedule BPF callback using task_work_add with TWA_RESUME + * mode * @task: Task struct for which callback should be scheduled * @tw: Pointer to struct bpf_task_work in BPF map value for internal bookkeeping * @map__map: bpf_map that embeds struct bpf_task_work in the values @@ -4195,9 +4198,10 @@ __bpf_kfunc int bpf_task_work_schedule_signal(struct task_struct *task, struct b * * Return: 0 if task work has been scheduled successfully, negative error code otherwise */ -__bpf_kfunc int bpf_task_work_schedule_resume(struct task_struct *task, struct bpf_task_work *tw, - void *map__map, bpf_task_work_callback_t callback, - void *aux__prog) +__bpf_kfunc int bpf_task_work_schedule_resume_impl(struct task_struct *task, + struct bpf_task_work *tw, void *map__map, + bpf_task_work_callback_t callback, + void *aux__prog) { return bpf_task_work_schedule(task, tw, map__map, callback, aux__prog, TWA_RESUME); } @@ -4376,9 +4380,9 @@ BTF_ID_FLAGS(func, bpf_strnstr); #if defined(CONFIG_BPF_LSM) && defined(CONFIG_CGROUPS) BTF_ID_FLAGS(func, bpf_cgroup_read_xattr, KF_RCU) #endif -BTF_ID_FLAGS(func, bpf_stream_vprintk, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_task_work_schedule_signal, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_task_work_schedule_resume, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_stream_vprintk_impl, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_task_work_schedule_signal_impl, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_task_work_schedule_resume_impl, KF_TRUSTED_ARGS) BTF_KFUNCS_END(common_btf_ids) static const struct btf_kfunc_id_set common_kfunc_set = { diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 4d53cdd1374c..8f1dacaf01fe 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -315,7 +315,7 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, max_depth = sysctl_perf_event_max_stack; trace = get_perf_callchain(regs, kernel, user, max_depth, - false, false); + false, false, 0); if (unlikely(!trace)) /* couldn't fetch the stack trace */ @@ -452,7 +452,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, trace = get_callchain_entry_for_task(task, max_depth); else trace = get_perf_callchain(regs, kernel, user, max_depth, - crosstask, false); + crosstask, false, 0); if (unlikely(!trace) || trace->nr < skip) { if (may_fault) diff --git a/kernel/bpf/stream.c b/kernel/bpf/stream.c index eb6c5a21c2ef..ff16c631951b 100644 --- a/kernel/bpf/stream.c +++ b/kernel/bpf/stream.c @@ -355,7 +355,8 @@ __bpf_kfunc_start_defs(); * Avoid using enum bpf_stream_id so that kfunc users don't have to pull in the * enum in headers. */ -__bpf_kfunc int bpf_stream_vprintk(int stream_id, const char *fmt__str, const void *args, u32 len__sz, void *aux__prog) +__bpf_kfunc int bpf_stream_vprintk_impl(int stream_id, const char *fmt__str, const void *args, + u32 len__sz, void *aux__prog) { struct bpf_bprintf_data data = { .get_bin_args = true, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 8a129746bd6c..6cde6a46babf 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2330,7 +2330,7 @@ static void bpf_audit_prog(const struct bpf_prog *prog, unsigned int op) return; if (audit_enabled == AUDIT_OFF) return; - if (!in_irq() && !irqs_disabled()) + if (!in_hardirq() && !irqs_disabled()) ctx = audit_context(); ab = audit_log_start(ctx, GFP_ATOMIC, AUDIT_BPF); if (unlikely(!ab)) @@ -2428,7 +2428,7 @@ static void __bpf_prog_put(struct bpf_prog *prog) struct bpf_prog_aux *aux = prog->aux; if (atomic64_dec_and_test(&aux->refcnt)) { - if (in_irq() || irqs_disabled()) { + if (in_hardirq() || irqs_disabled()) { INIT_WORK(&aux->work, bpf_prog_put_deferred); schedule_work(&aux->work); } else { diff --git a/kernel/bpf/token.c b/kernel/bpf/token.c index 0bbe412f854e..feecd8f4dbf9 100644 --- a/kernel/bpf/token.c +++ b/kernel/bpf/token.c @@ -110,16 +110,15 @@ const struct file_operations bpf_token_fops = { int bpf_token_create(union bpf_attr *attr) { + struct bpf_token *token __free(kfree) = NULL; struct bpf_mount_opts *mnt_opts; - struct bpf_token *token = NULL; struct user_namespace *userns; struct inode *inode; - struct file *file; CLASS(fd, f)(attr->token_create.bpffs_fd); struct path path; struct super_block *sb; umode_t mode; - int err, fd; + int err; if (fd_empty(f)) return -EBADF; @@ -166,23 +165,20 @@ int bpf_token_create(union bpf_attr *attr) inode->i_fop = &bpf_token_fops; clear_nlink(inode); /* make sure it is unlinked */ - file = alloc_file_pseudo(inode, path.mnt, BPF_TOKEN_INODE_NAME, O_RDWR, &bpf_token_fops); - if (IS_ERR(file)) { - iput(inode); - return PTR_ERR(file); - } + FD_PREPARE(fdf, O_CLOEXEC, + alloc_file_pseudo(inode, path.mnt, BPF_TOKEN_INODE_NAME, + O_RDWR, &bpf_token_fops)); + if (fdf.err) + return fdf.err; token = kzalloc(sizeof(*token), GFP_USER); - if (!token) { - err = -ENOMEM; - goto out_file; - } + if (!token) + return -ENOMEM; atomic64_set(&token->refcnt, 1); - /* remember bpffs owning userns for future ns_capable() checks */ - token->userns = get_user_ns(userns); - + /* remember bpffs owning userns for future ns_capable() checks. */ + token->userns = userns; token->allowed_cmds = mnt_opts->delegate_cmds; token->allowed_maps = mnt_opts->delegate_maps; token->allowed_progs = mnt_opts->delegate_progs; @@ -190,24 +186,11 @@ int bpf_token_create(union bpf_attr *attr) err = security_bpf_token_create(token, attr, &path); if (err) - goto out_token; - - fd = get_unused_fd_flags(O_CLOEXEC); - if (fd < 0) { - err = fd; - goto out_token; - } - - file->private_data = token; - fd_install(fd, file); - - return fd; + return err; -out_token: - bpf_token_free(token); -out_file: - fput(file); - return err; + get_user_ns(token->userns); + fd_prepare_file(fdf)->private_data = no_free_ptr(token); + return fd_publish(fdf); } int bpf_token_get_info_by_fd(struct bpf_token *token, diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 5949095e51c3..f2cb0b097093 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -479,11 +479,6 @@ again: * BPF_TRAMP_F_SHARE_IPMODIFY is set, we can generate the * trampoline again, and retry register. */ - /* reset fops->func and fops->trampoline for re-register */ - tr->fops->func = NULL; - tr->fops->trampoline = 0; - - /* free im memory and reallocate later */ bpf_tramp_image_free(im); goto again; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ff40e5e65c43..fbe4bb91c564 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8866,7 +8866,7 @@ static int widen_imprecise_scalars(struct bpf_verifier_env *env, struct bpf_verifier_state *cur) { struct bpf_func_state *fold, *fcur; - int i, fr; + int i, fr, num_slots; reset_idmap_scratch(env); for (fr = old->curframe; fr >= 0; fr--) { @@ -8879,7 +8879,9 @@ static int widen_imprecise_scalars(struct bpf_verifier_env *env, &fcur->regs[i], &env->idmap_scratch); - for (i = 0; i < fold->allocated_stack / BPF_REG_SIZE; i++) { + num_slots = min(fold->allocated_stack / BPF_REG_SIZE, + fcur->allocated_stack / BPF_REG_SIZE); + for (i = 0; i < num_slots; i++) { if (!is_spilled_reg(&fold->stack[i]) || !is_spilled_reg(&fcur->stack[i])) continue; @@ -12259,8 +12261,8 @@ enum special_kfunc_type { KF_bpf_res_spin_lock_irqsave, KF_bpf_res_spin_unlock_irqrestore, KF___bpf_trap, - KF_bpf_task_work_schedule_signal, - KF_bpf_task_work_schedule_resume, + KF_bpf_task_work_schedule_signal_impl, + KF_bpf_task_work_schedule_resume_impl, }; BTF_ID_LIST(special_kfunc_list) @@ -12331,13 +12333,13 @@ BTF_ID(func, bpf_res_spin_unlock) BTF_ID(func, bpf_res_spin_lock_irqsave) BTF_ID(func, bpf_res_spin_unlock_irqrestore) BTF_ID(func, __bpf_trap) -BTF_ID(func, bpf_task_work_schedule_signal) -BTF_ID(func, bpf_task_work_schedule_resume) +BTF_ID(func, bpf_task_work_schedule_signal_impl) +BTF_ID(func, bpf_task_work_schedule_resume_impl) static bool is_task_work_add_kfunc(u32 func_id) { - return func_id == special_kfunc_list[KF_bpf_task_work_schedule_signal] || - func_id == special_kfunc_list[KF_bpf_task_work_schedule_resume]; + return func_id == special_kfunc_list[KF_bpf_task_work_schedule_signal_impl] || + func_id == special_kfunc_list[KF_bpf_task_work_schedule_resume_impl]; } static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index fdee387f0d6b..ae1eb7a85eb4 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -250,12 +250,9 @@ bool cgroup_enable_per_threadgroup_rwsem __read_mostly; /* cgroup namespace for init task */ struct cgroup_namespace init_cgroup_ns = { - .ns.__ns_ref = REFCOUNT_INIT(2), + .ns = NS_COMMON_INIT(init_cgroup_ns), .user_ns = &init_user_ns, - .ns.ops = &cgroupns_operations, - .ns.inum = ns_init_inum(&init_cgroup_ns), .root_cset = &init_css_set, - .ns.ns_type = ns_common_type(&init_cgroup_ns), }; static struct file_system_type cgroup2_fs_type; @@ -1522,9 +1519,9 @@ static struct cgroup *current_cgns_cgroup_dfl(void) } else { /* * NOTE: This function may be called from bpf_cgroup_from_id() - * on a task which has already passed exit_task_namespaces() and - * nsproxy == NULL. Fall back to cgrp_dfl_root which will make all - * cgroups visible for lookups. + * on a task which has already passed exit_nsproxy_namespaces() + * and nsproxy == NULL. Fall back to cgrp_dfl_root which will + * make all cgroups visible for lookups. */ return &cgrp_dfl_root.cgrp; } @@ -5363,7 +5360,6 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, struct cgroup_file_ctx *ctx = of->priv; struct cgroup *src_cgrp, *dst_cgrp; struct task_struct *task; - const struct cred *saved_cred; ssize_t ret; enum cgroup_attach_lock_mode lock_mode; @@ -5386,11 +5382,10 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, * permissions using the credentials from file open to protect against * inherited fd attacks. */ - saved_cred = override_creds(of->file->f_cred); - ret = cgroup_attach_permissions(src_cgrp, dst_cgrp, - of->file->f_path.dentry->d_sb, - threadgroup, ctx->ns); - revert_creds(saved_cred); + scoped_with_creds(of->file->f_cred) + ret = cgroup_attach_permissions(src_cgrp, dst_cgrp, + of->file->f_path.dentry->d_sb, + threadgroup, ctx->ns); if (ret) goto out_finish; diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 52468d2c178a..4aaad07b0bd1 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1391,7 +1391,7 @@ static bool partition_xcpus_del(int old_prs, struct cpuset *parent, return isolcpus_updated; } -static void update_unbound_workqueue_cpumask(bool isolcpus_updated) +static void update_isolation_cpumasks(bool isolcpus_updated) { int ret; @@ -1402,6 +1402,9 @@ static void update_unbound_workqueue_cpumask(bool isolcpus_updated) ret = workqueue_unbound_exclude_cpumask(isolated_cpus); WARN_ON_ONCE(ret < 0); + + ret = tmigr_isolated_exclude_cpumask(isolated_cpus); + WARN_ON_ONCE(ret < 0); } /** @@ -1555,7 +1558,7 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs, list_add(&cs->remote_sibling, &remote_children); cpumask_copy(cs->effective_xcpus, tmp->new_cpus); spin_unlock_irq(&callback_lock); - update_unbound_workqueue_cpumask(isolcpus_updated); + update_isolation_cpumasks(isolcpus_updated); cpuset_force_rebuild(); cs->prs_err = 0; @@ -1596,7 +1599,7 @@ static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp) compute_excpus(cs, cs->effective_xcpus); reset_partition_data(cs); spin_unlock_irq(&callback_lock); - update_unbound_workqueue_cpumask(isolcpus_updated); + update_isolation_cpumasks(isolcpus_updated); cpuset_force_rebuild(); /* @@ -1665,7 +1668,7 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *xcpus, if (xcpus) cpumask_copy(cs->exclusive_cpus, xcpus); spin_unlock_irq(&callback_lock); - update_unbound_workqueue_cpumask(isolcpus_updated); + update_isolation_cpumasks(isolcpus_updated); if (adding || deleting) cpuset_force_rebuild(); @@ -2023,7 +2026,7 @@ write_error: WARN_ON_ONCE(parent->nr_subparts < 0); } spin_unlock_irq(&callback_lock); - update_unbound_workqueue_cpumask(isolcpus_updated); + update_isolation_cpumasks(isolcpus_updated); if ((old_prs != new_prs) && (cmd == partcmd_update)) update_partition_exclusive_flag(cs, new_prs); @@ -3043,7 +3046,7 @@ out: else if (isolcpus_updated) isolated_cpus_update(old_prs, new_prs, cs->effective_xcpus); spin_unlock_irq(&callback_lock); - update_unbound_workqueue_cpumask(isolcpus_updated); + update_isolation_cpumasks(isolcpus_updated); /* Force update if switching back to member & update effective_xcpus */ update_cpumasks_hier(cs, &tmpmask, !new_prs); @@ -4180,7 +4183,7 @@ bool cpuset_cpus_allowed_fallback(struct task_struct *tsk) rcu_read_lock(); cs_mask = task_cs(tsk)->cpus_allowed; if (is_in_v2_mode() && cpumask_subset(cs_mask, possible_mask)) { - do_set_cpus_allowed(tsk, cs_mask); + set_cpus_allowed_force(tsk, cs_mask); changed = true; } rcu_read_unlock(); diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c index dd9417425d92..915b02f65980 100644 --- a/kernel/cgroup/legacy_freezer.c +++ b/kernel/cgroup/legacy_freezer.c @@ -63,7 +63,7 @@ static struct freezer *parent_freezer(struct freezer *freezer) return css_freezer(freezer->css.parent); } -bool cgroup_freezing(struct task_struct *task) +bool cgroup1_freezing(struct task_struct *task) { bool ret; diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c index fdbe57578e68..db9617556dd7 100644 --- a/kernel/cgroup/namespace.c +++ b/kernel/cgroup/namespace.c @@ -30,7 +30,6 @@ static struct cgroup_namespace *alloc_cgroup_ns(void) ret = ns_common_init(new_ns); if (ret) return ERR_PTR(ret); - ns_tree_add(new_ns); return no_free_ptr(new_ns); } @@ -86,6 +85,7 @@ struct cgroup_namespace *copy_cgroup_ns(u64 flags, new_ns->ucounts = ucounts; new_ns->root_cset = cset; + ns_tree_add(new_ns); return new_ns; } diff --git a/kernel/cpu.c b/kernel/cpu.c index db9f6c539b28..b674fdf96208 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -3085,10 +3085,13 @@ EXPORT_SYMBOL(cpu_all_bits); #ifdef CONFIG_INIT_ALL_POSSIBLE struct cpumask __cpu_possible_mask __ro_after_init = {CPU_BITS_ALL}; +unsigned int __num_possible_cpus __ro_after_init = NR_CPUS; #else struct cpumask __cpu_possible_mask __ro_after_init; +unsigned int __num_possible_cpus __ro_after_init; #endif EXPORT_SYMBOL(__cpu_possible_mask); +EXPORT_SYMBOL(__num_possible_cpus); struct cpumask __cpu_online_mask __read_mostly; EXPORT_SYMBOL(__cpu_online_mask); @@ -3116,6 +3119,7 @@ void init_cpu_present(const struct cpumask *src) void init_cpu_possible(const struct cpumask *src) { cpumask_copy(&__cpu_possible_mask, src); + __num_possible_cpus = cpumask_weight(&__cpu_possible_mask); } void set_cpu_online(unsigned int cpu, bool online) @@ -3140,6 +3144,21 @@ void set_cpu_online(unsigned int cpu, bool online) } /* + * This should be marked __init, but there is a boatload of call sites + * which need to be fixed up to do so. Sigh... + */ +void set_cpu_possible(unsigned int cpu, bool possible) +{ + if (possible) { + if (!cpumask_test_and_set_cpu(cpu, &__cpu_possible_mask)) + __num_possible_cpus++; + } else { + if (cpumask_test_and_clear_cpu(cpu, &__cpu_possible_mask)) + __num_possible_cpus--; + } +} + +/* * Activate the first processor. */ void __init boot_cpu_init(void) diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 3b1c43382eec..99dac1aa972a 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -373,7 +373,7 @@ static int __crash_shrink_memory(struct resource *old_res, old_res->start = 0; old_res->end = 0; } else { - crashk_res.end = ram_res->start - 1; + old_res->end = ram_res->start - 1; } crash_free_reserved_phys_range(ram_res->start, ram_res->end); diff --git a/kernel/cred.c b/kernel/cred.c index dbf6b687dc5c..a6f686b30da1 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -35,33 +35,6 @@ do { \ static struct kmem_cache *cred_jar; -/* init to 2 - one for init_task, one to ensure it is never freed */ -static struct group_info init_groups = { .usage = REFCOUNT_INIT(2) }; - -/* - * The initial credentials for the initial task - */ -struct cred init_cred = { - .usage = ATOMIC_INIT(4), - .uid = GLOBAL_ROOT_UID, - .gid = GLOBAL_ROOT_GID, - .suid = GLOBAL_ROOT_UID, - .sgid = GLOBAL_ROOT_GID, - .euid = GLOBAL_ROOT_UID, - .egid = GLOBAL_ROOT_GID, - .fsuid = GLOBAL_ROOT_UID, - .fsgid = GLOBAL_ROOT_GID, - .securebits = SECUREBITS_DEFAULT, - .cap_inheritable = CAP_EMPTY_SET, - .cap_permitted = CAP_FULL_SET, - .cap_effective = CAP_FULL_SET, - .cap_bset = CAP_FULL_SET, - .user = INIT_USER, - .user_ns = &init_user_ns, - .group_info = &init_groups, - .ucounts = &init_ucounts, -}; - /* * The RCU callback to actually dispose of a set of credentials */ @@ -306,6 +279,7 @@ int copy_creds(struct task_struct *p, u64 clone_flags) kdebug("share_creds(%p{%ld})", p->cred, atomic_long_read(&p->cred->usage)); inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); + get_cred_namespaces(p); return 0; } @@ -343,6 +317,8 @@ int copy_creds(struct task_struct *p, u64 clone_flags) p->cred = p->real_cred = get_cred(new); inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); + get_cred_namespaces(p); + return 0; error_put: @@ -435,10 +411,13 @@ int commit_creds(struct cred *new) */ if (new->user != old->user || new->user_ns != old->user_ns) inc_rlimit_ucounts(new->ucounts, UCOUNT_RLIMIT_NPROC, 1); + rcu_assign_pointer(task->real_cred, new); rcu_assign_pointer(task->cred, new); if (new->user != old->user || new->user_ns != old->user_ns) dec_rlimit_ucounts(old->ucounts, UCOUNT_RLIMIT_NPROC, 1); + if (new->user_ns != old->user_ns) + switch_cred_namespaces(old, new); /* send notifications */ if (!uid_eq(new->uid, old->uid) || diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 1f9ee9759426..f973e7e73c90 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -481,6 +481,7 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents, case PCI_P2PDMA_MAP_BUS_ADDR: sg->dma_address = pci_p2pdma_bus_addr_map(&p2pdma_state, sg_phys(sg)); + sg_dma_len(sg) = sg->length; sg_dma_mark_bus_address(sg); continue; default: diff --git a/kernel/entry/common.c b/kernel/entry/common.c index f62e1d1b2063..5c792b30c58a 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -11,19 +11,20 @@ /* Workaround to allow gradual conversion of architecture code */ void __weak arch_do_signal_or_restart(struct pt_regs *regs) { } -/** - * exit_to_user_mode_loop - do any pending work before leaving to user space - * @regs: Pointer to pt_regs on entry stack - * @ti_work: TIF work flags as read by the caller - */ -__always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs, - unsigned long ti_work) +#ifdef CONFIG_HAVE_GENERIC_TIF_BITS +#define EXIT_TO_USER_MODE_WORK_LOOP (EXIT_TO_USER_MODE_WORK & ~_TIF_RSEQ) +#else +#define EXIT_TO_USER_MODE_WORK_LOOP (EXIT_TO_USER_MODE_WORK) +#endif + +static __always_inline unsigned long __exit_to_user_mode_loop(struct pt_regs *regs, + unsigned long ti_work) { /* * Before returning to user space ensure that all pending work * items have been completed. */ - while (ti_work & EXIT_TO_USER_MODE_WORK) { + while (ti_work & EXIT_TO_USER_MODE_WORK_LOOP) { local_irq_enable_exit_to_user(ti_work); @@ -62,17 +63,21 @@ __always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs, return ti_work; } -noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs) +/** + * exit_to_user_mode_loop - do any pending work before leaving to user space + * @regs: Pointer to pt_regs on entry stack + * @ti_work: TIF work flags as read by the caller + */ +__always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs, + unsigned long ti_work) { - enter_from_user_mode(regs); -} + for (;;) { + ti_work = __exit_to_user_mode_loop(regs, ti_work); -noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs) -{ - instrumentation_begin(); - exit_to_user_mode_prepare(regs); - instrumentation_end(); - exit_to_user_mode(); + if (likely(!rseq_exit_to_user_mode_restart(regs, ti_work))) + return ti_work; + ti_work = read_thread_flags(); + } } noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) diff --git a/kernel/entry/syscall-common.c b/kernel/entry/syscall-common.c index 66e6ba7fa80c..940a597ded40 100644 --- a/kernel/entry/syscall-common.c +++ b/kernel/entry/syscall-common.c @@ -63,14 +63,6 @@ long syscall_trace_enter(struct pt_regs *regs, long syscall, return ret ? : syscall; } -noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs) -{ - enter_from_user_mode(regs); - instrumentation_begin(); - local_irq_enable(); - instrumentation_end(); -} - /* * If SYSCALL_EMU is set, then the only reason to report is when * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 808c0d7a31fa..b9c7e00725d6 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -218,7 +218,7 @@ static void fixup_uretprobe_trampoline_entries(struct perf_callchain_entry *entr struct perf_callchain_entry * get_perf_callchain(struct pt_regs *regs, bool kernel, bool user, - u32 max_stack, bool crosstask, bool add_mark) + u32 max_stack, bool crosstask, bool add_mark, u64 defer_cookie) { struct perf_callchain_entry *entry; struct perf_callchain_entry_ctx ctx; @@ -251,6 +251,18 @@ get_perf_callchain(struct pt_regs *regs, bool kernel, bool user, regs = task_pt_regs(current); } + if (defer_cookie) { + /* + * Foretell the coming of PERF_RECORD_CALLCHAIN_DEFERRED + * which can be stitched to this one, and add + * the cookie after it (it will be cut off when the + * user stack is copied to the callchain). + */ + perf_callchain_store_context(&ctx, PERF_CONTEXT_USER_DEFERRED); + perf_callchain_store_context(&ctx, defer_cookie); + goto exit_put; + } + if (add_mark) perf_callchain_store_context(&ctx, PERF_CONTEXT_USER); diff --git a/kernel/events/core.c b/kernel/events/core.c index 1fd347da9026..ece716879cbc 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -56,6 +56,7 @@ #include <linux/buildid.h> #include <linux/task_work.h> #include <linux/percpu-rwsem.h> +#include <linux/unwind_deferred.h> #include "internal.h" @@ -8200,6 +8201,8 @@ static u64 perf_get_page_size(unsigned long addr) static struct perf_callchain_entry __empty_callchain = { .nr = 0, }; +static struct unwind_work perf_unwind_work; + struct perf_callchain_entry * perf_callchain(struct perf_event *event, struct pt_regs *regs) { @@ -8208,8 +8211,11 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs) !(current->flags & (PF_KTHREAD | PF_USER_WORKER)); /* Disallow cross-task user callchains. */ bool crosstask = event->ctx->task && event->ctx->task != current; + bool defer_user = IS_ENABLED(CONFIG_UNWIND_USER) && user && + event->attr.defer_callchain; const u32 max_stack = event->attr.sample_max_stack; struct perf_callchain_entry *callchain; + u64 defer_cookie; if (!current->mm) user = false; @@ -8217,8 +8223,13 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs) if (!kernel && !user) return &__empty_callchain; - callchain = get_perf_callchain(regs, kernel, user, - max_stack, crosstask, true); + if (!(user && defer_user && !crosstask && + unwind_deferred_request(&perf_unwind_work, &defer_cookie) >= 0)) + defer_cookie = 0; + + callchain = get_perf_callchain(regs, kernel, user, max_stack, + crosstask, true, defer_cookie); + return callchain ?: &__empty_callchain; } @@ -10003,6 +10014,66 @@ void perf_event_bpf_event(struct bpf_prog *prog, perf_iterate_sb(perf_event_bpf_output, &bpf_event, NULL); } +struct perf_callchain_deferred_event { + struct unwind_stacktrace *trace; + struct { + struct perf_event_header header; + u64 cookie; + u64 nr; + u64 ips[]; + } event; +}; + +static void perf_callchain_deferred_output(struct perf_event *event, void *data) +{ + struct perf_callchain_deferred_event *deferred_event = data; + struct perf_output_handle handle; + struct perf_sample_data sample; + int ret, size = deferred_event->event.header.size; + + if (!event->attr.defer_output) + return; + + /* XXX do we really need sample_id_all for this ??? */ + perf_event_header__init_id(&deferred_event->event.header, &sample, event); + + ret = perf_output_begin(&handle, &sample, event, + deferred_event->event.header.size); + if (ret) + goto out; + + perf_output_put(&handle, deferred_event->event); + for (int i = 0; i < deferred_event->trace->nr; i++) { + u64 entry = deferred_event->trace->entries[i]; + perf_output_put(&handle, entry); + } + perf_event__output_id_sample(event, &handle, &sample); + + perf_output_end(&handle); +out: + deferred_event->event.header.size = size; +} + +static void perf_unwind_deferred_callback(struct unwind_work *work, + struct unwind_stacktrace *trace, u64 cookie) +{ + struct perf_callchain_deferred_event deferred_event = { + .trace = trace, + .event = { + .header = { + .type = PERF_RECORD_CALLCHAIN_DEFERRED, + .misc = PERF_RECORD_MISC_USER, + .size = sizeof(deferred_event.event) + + (trace->nr * sizeof(u64)), + }, + .cookie = cookie, + .nr = trace->nr, + }, + }; + + perf_iterate_sb(perf_callchain_deferred_output, &deferred_event, NULL); +} + struct perf_text_poke_event { const void *old_bytes; const void *new_bytes; @@ -11901,7 +11972,7 @@ static int cpu_clock_event_add(struct perf_event *event, int flags) static void cpu_clock_event_del(struct perf_event *event, int flags) { - cpu_clock_event_stop(event, flags); + cpu_clock_event_stop(event, PERF_EF_UPDATE); } static void cpu_clock_event_read(struct perf_event *event) @@ -14809,6 +14880,9 @@ void __init perf_event_init(void) idr_init(&pmu_idr); + unwind_deferred_init(&perf_unwind_work, + perf_unwind_deferred_callback); + perf_event_init_all_cpus(); init_srcu_struct(&pmus_srcu); perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE); diff --git a/kernel/exit.c b/kernel/exit.c index 9f74e8f1c431..b9667ffcf7b3 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -291,6 +291,7 @@ repeat: write_unlock_irq(&tasklist_lock); /* @thread_pid can't go away until free_pids() below */ proc_flush_pid(thread_pid); + exit_cred_namespaces(p); add_device_randomness(&p->se.sum_exec_runtime, sizeof(p->se.sum_exec_runtime)); free_pids(post.pids); @@ -910,6 +911,7 @@ void __noreturn do_exit(long code) user_events_exit(tsk); io_uring_files_cancel(); + sched_mm_cid_exit(tsk); exit_signals(tsk); /* sets PF_EXITING */ seccomp_filter_release(tsk); @@ -939,7 +941,6 @@ void __noreturn do_exit(long code) tsk->exit_code = code; taskstats_exit(tsk, group_dead); - unwind_deferred_task_exit(tsk); trace_sched_process_exit(tsk, group_dead); /* @@ -950,6 +951,12 @@ void __noreturn do_exit(long code) * gets woken up by child-exit notifications. */ perf_event_exit_task(tsk); + /* + * PF_EXITING (above) ensures unwind_deferred_request() will no + * longer add new unwinds. While exit_mm() (below) will destroy the + * abaility to do unwinds. So flush any pending unwinds here. + */ + unwind_deferred_task_exit(tsk); exit_mm(); @@ -962,7 +969,7 @@ void __noreturn do_exit(long code) exit_fs(tsk); if (group_dead) disassociate_ctty(1); - exit_task_namespaces(tsk); + exit_nsproxy_namespaces(tsk); exit_task_work(tsk); exit_thread(tsk); diff --git a/kernel/fork.c b/kernel/fork.c index 3da0f08615a9..83e05d6f2307 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -955,10 +955,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) #endif #ifdef CONFIG_SCHED_MM_CID - tsk->mm_cid = -1; - tsk->last_mm_cid = -1; - tsk->mm_cid_active = 0; - tsk->migrate_from_cpu = -1; + tsk->mm_cid.cid = MM_CID_UNSET; + tsk->mm_cid.active = 0; #endif return tsk; @@ -2453,9 +2451,10 @@ bad_fork_cleanup_io: if (p->io_context) exit_io_context(p); bad_fork_cleanup_namespaces: - exit_task_namespaces(p); + exit_nsproxy_namespaces(p); bad_fork_cleanup_mm: if (p->mm) { + sched_mm_cid_exit(p); mm_clear_owner(p->mm, p); mmput(p->mm); } @@ -2487,6 +2486,7 @@ bad_fork_cleanup_delayacct: delayacct_tsk_free(p); bad_fork_cleanup_count: dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); + exit_cred_namespaces(p); exit_creds(p); bad_fork_free: WRITE_ONCE(p->__state, TASK_DEAD); diff --git a/kernel/freezer.c b/kernel/freezer.c index ddc11a8bd2ea..a76bf957fb32 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -44,7 +44,7 @@ bool freezing_slow_path(struct task_struct *p) if (tsk_is_oom_victim(p)) return false; - if (pm_nosig_freezing || cgroup_freezing(p)) + if (pm_nosig_freezing || cgroup1_freezing(p)) return true; if (pm_freezing && !(p->flags & PF_KTHREAD)) diff --git a/kernel/futex/core.c b/kernel/futex/core.c index 2e77a6e5c865..cf7e610eac42 100644 --- a/kernel/futex/core.c +++ b/kernel/futex/core.c @@ -581,7 +581,7 @@ int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key, if (flags & FLAGS_NUMA) { u32 __user *naddr = (void *)uaddr + size / 2; - if (futex_get_value(&node, naddr)) + if (get_user_inline(node, naddr)) return -EFAULT; if ((node != FUTEX_NO_NODE) && @@ -601,7 +601,7 @@ int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key, node = numa_node_id(); node_updated = true; } - if (node_updated && futex_put_value(node, naddr)) + if (node_updated && put_user_inline(node, naddr)) return -EFAULT; } diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h index 2cd57096c38e..30c2afa03889 100644 --- a/kernel/futex/futex.h +++ b/kernel/futex/futex.h @@ -281,63 +281,11 @@ static inline int futex_cmpxchg_value_locked(u32 *curval, u32 __user *uaddr, u32 return ret; } -/* - * This does a plain atomic user space read, and the user pointer has - * already been verified earlier by get_futex_key() to be both aligned - * and actually in user space, just like futex_atomic_cmpxchg_inatomic(). - * - * We still want to avoid any speculation, and while __get_user() is - * the traditional model for this, it's actually slower than doing - * this manually these days. - * - * We could just have a per-architecture special function for it, - * the same way we do futex_atomic_cmpxchg_inatomic(), but rather - * than force everybody to do that, write it out long-hand using - * the low-level user-access infrastructure. - * - * This looks a bit overkill, but generally just results in a couple - * of instructions. - */ -static __always_inline int futex_get_value(u32 *dest, u32 __user *from) -{ - u32 val; - - if (can_do_masked_user_access()) - from = masked_user_access_begin(from); - else if (!user_read_access_begin(from, sizeof(*from))) - return -EFAULT; - unsafe_get_user(val, from, Efault); - user_read_access_end(); - *dest = val; - return 0; -Efault: - user_read_access_end(); - return -EFAULT; -} - -static __always_inline int futex_put_value(u32 val, u32 __user *to) -{ - if (can_do_masked_user_access()) - to = masked_user_access_begin(to); - else if (!user_write_access_begin(to, sizeof(*to))) - return -EFAULT; - unsafe_put_user(val, to, Efault); - user_write_access_end(); - return 0; -Efault: - user_write_access_end(); - return -EFAULT; -} - +/* Read from user memory with pagefaults disabled */ static inline int futex_get_value_locked(u32 *dest, u32 __user *from) { - int ret; - - pagefault_disable(); - ret = futex_get_value(dest, from); - pagefault_enable(); - - return ret; + guard(pagefault)(); + return get_user_inline(*dest, from); } extern void __futex_unqueue(struct futex_q *q); diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c index a08cc076f332..ffde93d051a4 100644 --- a/kernel/gcov/gcc_4_7.c +++ b/kernel/gcov/gcc_4_7.c @@ -18,7 +18,9 @@ #include <linux/mm.h> #include "gcov.h" -#if (__GNUC__ >= 14) +#if (__GNUC__ >= 15) +#define GCOV_COUNTERS 10 +#elif (__GNUC__ >= 14) #define GCOV_COUNTERS 9 #elif (__GNUC__ >= 10) #define GCOV_COUNTERS 8 diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index d1917b28761a..678f094d261a 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -897,8 +897,9 @@ void handle_percpu_irq(struct irq_desc *desc) void handle_percpu_devid_irq(struct irq_desc *desc) { struct irq_chip *chip = irq_desc_get_chip(desc); - struct irqaction *action = desc->action; unsigned int irq = irq_desc_get_irq(desc); + unsigned int cpu = smp_processor_id(); + struct irqaction *action; irqreturn_t res; /* @@ -910,12 +911,15 @@ void handle_percpu_devid_irq(struct irq_desc *desc) if (chip->irq_ack) chip->irq_ack(&desc->irq_data); + for (action = desc->action; action; action = action->next) + if (cpumask_test_cpu(cpu, action->affinity)) + break; + if (likely(action)) { trace_irq_handler_entry(irq, action); res = action->handler(irq, raw_cpu_ptr(action->percpu_dev_id)); trace_irq_handler_exit(irq, action, res); } else { - unsigned int cpu = smp_processor_id(); bool enabled = cpumask_test_cpu(cpu, desc->percpu_enabled); if (enabled) @@ -929,31 +933,6 @@ void handle_percpu_devid_irq(struct irq_desc *desc) chip->irq_eoi(&desc->irq_data); } -/** - * handle_percpu_devid_fasteoi_nmi - Per CPU local NMI handler with per cpu - * dev ids - * @desc: the interrupt description structure for this irq - * - * Similar to handle_fasteoi_nmi, but handling the dev_id cookie - * as a percpu pointer. - */ -void handle_percpu_devid_fasteoi_nmi(struct irq_desc *desc) -{ - struct irq_chip *chip = irq_desc_get_chip(desc); - struct irqaction *action = desc->action; - unsigned int irq = irq_desc_get_irq(desc); - irqreturn_t res; - - __kstat_incr_irqs_this_cpu(desc); - - trace_irq_handler_entry(irq, action); - res = action->handler(irq, raw_cpu_ptr(action->percpu_dev_id)); - trace_irq_handler_exit(irq, action, res); - - if (chip->irq_eoi) - chip->irq_eoi(&desc->irq_data); -} - static void __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle, int is_chained, const char *name) diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index e103451243a0..786f5570a640 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -133,7 +133,15 @@ void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action) */ atomic_inc(&desc->threads_active); - wake_up_process(action->thread); + /* + * This might be a premature wakeup before the thread reached the + * thread function and set the IRQTF_READY bit. It's waiting in + * kthread code with state UNINTERRUPTIBLE. Once it reaches the + * thread function it waits with INTERRUPTIBLE. The wakeup is not + * lost in that case because the thread is guaranteed to observe + * the RUN flag before it goes to sleep in wait_for_interrupt(). + */ + wake_up_state(action->thread, TASK_INTERRUPTIBLE); } static DEFINE_STATIC_KEY_FALSE(irqhandler_duration_check_enabled); diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index db714d3014b5..6acf268f005b 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -879,8 +879,7 @@ void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus) chip_bus_sync_unlock(desc); } -int irq_set_percpu_devid_partition(unsigned int irq, - const struct cpumask *affinity) +int irq_set_percpu_devid(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); @@ -892,31 +891,10 @@ int irq_set_percpu_devid_partition(unsigned int irq, if (!desc->percpu_enabled) return -ENOMEM; - desc->percpu_affinity = affinity ? : cpu_possible_mask; - irq_set_percpu_devid_flags(irq); return 0; } -int irq_set_percpu_devid(unsigned int irq) -{ - return irq_set_percpu_devid_partition(irq, NULL); -} - -int irq_get_percpu_devid_partition(unsigned int irq, struct cpumask *affinity) -{ - struct irq_desc *desc = irq_to_desc(irq); - - if (!desc || !desc->percpu_enabled) - return -EINVAL; - - if (affinity) - cpumask_copy(affinity, desc->percpu_affinity); - - return 0; -} -EXPORT_SYMBOL_GPL(irq_get_percpu_devid_partition); - void kstat_incr_irq_this_cpu(unsigned int irq) { kstat_incr_irqs_this_cpu(irq_to_desc(irq)); diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index dc473faadcc8..2652c4cfd877 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -867,13 +867,9 @@ void of_phandle_args_to_fwspec(struct device_node *np, const u32 *args, } EXPORT_SYMBOL_GPL(of_phandle_args_to_fwspec); -unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec) +static struct irq_domain *fwspec_to_domain(struct irq_fwspec *fwspec) { struct irq_domain *domain; - struct irq_data *irq_data; - irq_hw_number_t hwirq; - unsigned int type = IRQ_TYPE_NONE; - int virq; if (fwspec->fwnode) { domain = irq_find_matching_fwspec(fwspec, DOMAIN_BUS_WIRED); @@ -883,6 +879,32 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec) domain = irq_default_domain; } + return domain; +} + +#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY +int irq_populate_fwspec_info(struct irq_fwspec *fwspec, struct irq_fwspec_info *info) +{ + struct irq_domain *domain = fwspec_to_domain(fwspec); + + memset(info, 0, sizeof(*info)); + + if (!domain || !domain->ops->get_fwspec_info) + return 0; + + return domain->ops->get_fwspec_info(fwspec, info); +} +#endif + +unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec) +{ + unsigned int type = IRQ_TYPE_NONE; + struct irq_domain *domain; + struct irq_data *irq_data; + irq_hw_number_t hwirq; + int virq; + + domain = fwspec_to_domain(fwspec); if (!domain) { pr_warn("no irq domain found for %s !\n", of_node_full_name(to_of_node(fwspec->fwnode))); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 400856abf672..0bb29316b436 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -547,7 +547,7 @@ int irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *noti INIT_WORK(¬ify->work, irq_affinity_notify); } - scoped_guard(raw_spinlock_irqsave, &desc->lock) { + scoped_guard(raw_spinlock_irq, &desc->lock) { old_notify = desc->affinity_notify; desc->affinity_notify = notify; } @@ -1001,7 +1001,6 @@ static irqreturn_t irq_forced_secondary_handler(int irq, void *dev_id) static void irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { cpumask_var_t mask; - bool valid = false; if (!test_and_clear_bit(IRQTF_AFFINITY, &action->thread_flags)) return; @@ -1018,21 +1017,13 @@ static void irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *a } scoped_guard(raw_spinlock_irq, &desc->lock) { - /* - * This code is triggered unconditionally. Check the affinity - * mask pointer. For CPU_MASK_OFFSTACK=n this is optimized out. - */ - if (cpumask_available(desc->irq_common_data.affinity)) { - const struct cpumask *m; + const struct cpumask *m; - m = irq_data_get_effective_affinity_mask(&desc->irq_data); - cpumask_copy(mask, m); - valid = true; - } + m = irq_data_get_effective_affinity_mask(&desc->irq_data); + cpumask_copy(mask, m); } - if (valid) - set_cpus_allowed_ptr(current, mask); + set_cpus_allowed_ptr(current, mask); free_cpumask_var(mask); } #else @@ -1239,7 +1230,10 @@ static int irq_thread(void *data) irq_thread_set_ready(desc, action); - sched_set_fifo(current); + if (action->handler == irq_forced_secondary_handler) + sched_set_fifo_secondary(current); + else + sched_set_fifo(current); if (force_irqthreads() && test_bit(IRQTF_FORCED_THREAD, &action->thread_flags)) @@ -1405,19 +1399,39 @@ setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary) * references an already freed task_struct. */ new->thread = get_task_struct(t); + /* - * Tell the thread to set its affinity. This is - * important for shared interrupt handlers as we do - * not invoke setup_affinity() for the secondary - * handlers as everything is already set up. Even for - * interrupts marked with IRQF_NO_BALANCE this is - * correct as we want the thread to move to the cpu(s) - * on which the requesting code placed the interrupt. + * The affinity can not be established yet, but it will be once the + * interrupt is enabled. Delay and defer the actual setting to the + * thread itself once it is ready to run. In the meantime, prevent + * it from ever being re-affined directly by cpuset or + * housekeeping. The proper way to do it is to re-affine the whole + * vector. */ - set_bit(IRQTF_AFFINITY, &new->thread_flags); + kthread_bind_mask(t, cpu_possible_mask); + + /* + * Ensure the thread adjusts the affinity once it reaches the + * thread function. + */ + new->thread_flags = BIT(IRQTF_AFFINITY); + return 0; } +static bool valid_percpu_irqaction(struct irqaction *old, struct irqaction *new) +{ + do { + if (cpumask_intersects(old->affinity, new->affinity) || + old->percpu_dev_id == new->percpu_dev_id) + return false; + + old = old->next; + } while (old); + + return true; +} + /* * Internal function to register an irqaction - typically used to * allocate special interrupts that are part of the architecture. @@ -1438,6 +1452,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) struct irqaction *old, **old_ptr; unsigned long flags, thread_mask = 0; int ret, nested, shared = 0; + bool per_cpu_devid; if (!desc) return -EINVAL; @@ -1447,6 +1462,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) if (!try_module_get(desc->owner)) return -ENODEV; + per_cpu_devid = irq_settings_is_per_cpu_devid(desc); + new->irq = irq; /* @@ -1554,13 +1571,20 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) */ unsigned int oldtype; - if (irq_is_nmi(desc)) { + if (irq_is_nmi(desc) && !per_cpu_devid) { pr_err("Invalid attempt to share NMI for %s (irq %d) on irqchip %s.\n", new->name, irq, desc->irq_data.chip->name); ret = -EINVAL; goto out_unlock; } + if (per_cpu_devid && !valid_percpu_irqaction(old, new)) { + pr_err("Overlapping affinities for %s (irq %d) on irqchip %s.\n", + new->name, irq, desc->irq_data.chip->name); + ret = -EINVAL; + goto out_unlock; + } + /* * If nobody did set the configuration before, inherit * the one provided by the requester. @@ -1711,7 +1735,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) if (!(new->flags & IRQF_NO_AUTOEN) && irq_settings_can_autoenable(desc)) { irq_startup(desc, IRQ_RESEND, IRQ_START_COND); - } else { + } else if (!per_cpu_devid) { /* * Shared interrupts do not go well with disabling * auto enable. The sharing interrupt might request @@ -2346,7 +2370,7 @@ void disable_percpu_nmi(unsigned int irq) static struct irqaction *__free_percpu_irq(unsigned int irq, void __percpu *dev_id) { struct irq_desc *desc = irq_to_desc(irq); - struct irqaction *action; + struct irqaction *action, **action_ptr; WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq); @@ -2354,21 +2378,33 @@ static struct irqaction *__free_percpu_irq(unsigned int irq, void __percpu *dev_ return NULL; scoped_guard(raw_spinlock_irqsave, &desc->lock) { - action = desc->action; - if (!action || action->percpu_dev_id != dev_id) { - WARN(1, "Trying to free already-free IRQ %d\n", irq); - return NULL; + action_ptr = &desc->action; + for (;;) { + action = *action_ptr; + + if (!action) { + WARN(1, "Trying to free already-free IRQ %d\n", irq); + return NULL; + } + + if (action->percpu_dev_id == dev_id) + break; + + action_ptr = &action->next; } - if (!cpumask_empty(desc->percpu_enabled)) { - WARN(1, "percpu IRQ %d still enabled on CPU%d!\n", - irq, cpumask_first(desc->percpu_enabled)); + if (cpumask_intersects(desc->percpu_enabled, action->affinity)) { + WARN(1, "percpu IRQ %d still enabled on CPU%d!\n", irq, + cpumask_first_and(desc->percpu_enabled, action->affinity)); return NULL; } /* Found it - now remove it from the list of entries: */ - desc->action = NULL; - desc->istate &= ~IRQS_NMI; + *action_ptr = action->next; + + /* Demote from NMI if we killed the last action */ + if (!desc->action) + desc->istate &= ~IRQS_NMI; } unregister_handler_proc(irq, action); @@ -2442,17 +2478,49 @@ int setup_percpu_irq(unsigned int irq, struct irqaction *act) return retval; } +static +struct irqaction *create_percpu_irqaction(irq_handler_t handler, unsigned long flags, + const char *devname, const cpumask_t *affinity, + void __percpu *dev_id) +{ + struct irqaction *action; + + if (!affinity) + affinity = cpu_possible_mask; + + action = kzalloc(sizeof(struct irqaction), GFP_KERNEL); + if (!action) + return NULL; + + action->handler = handler; + action->flags = flags | IRQF_PERCPU | IRQF_NO_SUSPEND; + action->name = devname; + action->percpu_dev_id = dev_id; + action->affinity = affinity; + + /* + * We allow some form of sharing for non-overlapping affinity + * masks. Obviously, covering all CPUs prevents any sharing in + * the first place. + */ + if (!cpumask_equal(affinity, cpu_possible_mask)) + action->flags |= IRQF_SHARED; + + return action; +} + /** * __request_percpu_irq - allocate a percpu interrupt line * @irq: Interrupt line to allocate * @handler: Function to be called when the IRQ occurs. * @flags: Interrupt type flags (IRQF_TIMER only) * @devname: An ascii name for the claiming device + * @affinity: A cpumask describing the target CPUs for this interrupt * @dev_id: A percpu cookie passed back to the handler function * - * This call allocates interrupt resources and enables the interrupt on the - * local CPU. If the interrupt is supposed to be enabled on other CPUs, it - * has to be done on each CPU using enable_percpu_irq(). + * This call allocates interrupt resources, but doesn't enable the interrupt + * on any CPU, as all percpu-devid interrupts are flagged with IRQ_NOAUTOEN. + * It has to be done on each CPU using enable_percpu_irq(). * * @dev_id must be globally unique. It is a per-cpu variable, and * the handler gets called with the interrupted CPU's instance of @@ -2460,7 +2528,7 @@ int setup_percpu_irq(unsigned int irq, struct irqaction *act) */ int __request_percpu_irq(unsigned int irq, irq_handler_t handler, unsigned long flags, const char *devname, - void __percpu *dev_id) + const cpumask_t *affinity, void __percpu *dev_id) { struct irqaction *action; struct irq_desc *desc; @@ -2477,15 +2545,10 @@ int __request_percpu_irq(unsigned int irq, irq_handler_t handler, if (flags && flags != IRQF_TIMER) return -EINVAL; - action = kzalloc(sizeof(struct irqaction), GFP_KERNEL); + action = create_percpu_irqaction(handler, flags, devname, affinity, dev_id); if (!action) return -ENOMEM; - action->handler = handler; - action->flags = flags | IRQF_PERCPU | IRQF_NO_SUSPEND; - action->name = devname; - action->percpu_dev_id = dev_id; - retval = irq_chip_pm_get(&desc->irq_data); if (retval < 0) { kfree(action); @@ -2508,6 +2571,7 @@ EXPORT_SYMBOL_GPL(__request_percpu_irq); * @irq: Interrupt line to allocate * @handler: Function to be called when the IRQ occurs. * @name: An ascii name for the claiming device + * @affinity: A cpumask describing the target CPUs for this interrupt * @dev_id: A percpu cookie passed back to the handler function * * This call allocates interrupt resources for a per CPU NMI. Per CPU NMIs @@ -2524,8 +2588,8 @@ EXPORT_SYMBOL_GPL(__request_percpu_irq); * If the interrupt line cannot be used to deliver NMIs, function * will fail returning a negative value. */ -int request_percpu_nmi(unsigned int irq, irq_handler_t handler, - const char *name, void __percpu *dev_id) +int request_percpu_nmi(unsigned int irq, irq_handler_t handler, const char *name, + const struct cpumask *affinity, void __percpu *dev_id) { struct irqaction *action; struct irq_desc *desc; @@ -2542,20 +2606,16 @@ int request_percpu_nmi(unsigned int irq, irq_handler_t handler, !irq_supports_nmi(desc)) return -EINVAL; - /* The line cannot already be NMI */ - if (irq_is_nmi(desc)) + /* The line cannot be NMI already if the new request covers all CPUs */ + if (irq_is_nmi(desc) && + (!affinity || cpumask_equal(affinity, cpu_possible_mask))) return -EINVAL; - action = kzalloc(sizeof(struct irqaction), GFP_KERNEL); + action = create_percpu_irqaction(handler, IRQF_NO_THREAD | IRQF_NOBALANCING, + name, affinity, dev_id); if (!action) return -ENOMEM; - action->handler = handler; - action->flags = IRQF_PERCPU | IRQF_NO_SUSPEND | IRQF_NO_THREAD - | IRQF_NOBALANCING; - action->name = name; - action->percpu_dev_id = dev_id; - retval = irq_chip_pm_get(&desc->irq_data); if (retval < 0) goto err_out; diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index e7ad99254841..68886881fe10 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -706,7 +706,7 @@ static int msi_domain_alloc(struct irq_domain *domain, unsigned int virq, irq_hw_number_t hwirq = ops->get_hwirq(info, arg); int i, ret; - if (irq_find_mapping(domain, hwirq) > 0) + if (irq_resolve_mapping(domain, hwirq)) return -EEXIST; if (domain->parent) { diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 29c2404e743b..77258eafbf63 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -48,6 +48,8 @@ static int show_irq_affinity(int type, struct seq_file *m) struct irq_desc *desc = irq_to_desc((long)m->private); const struct cpumask *mask; + guard(raw_spinlock_irq)(&desc->lock); + switch (type) { case AFFINITY: case AFFINITY_LIST: diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c index 76f0940fb485..03d12e27189f 100644 --- a/kernel/kexec_handover.c +++ b/kernel/kexec_handover.c @@ -8,6 +8,7 @@ #define pr_fmt(fmt) "KHO: " fmt +#include <linux/cleanup.h> #include <linux/cma.h> #include <linux/count_zeros.h> #include <linux/debugfs.h> @@ -22,6 +23,7 @@ #include <asm/early_ioremap.h> +#include "kexec_handover_internal.h" /* * KHO is tightly coupled with mm init and needs access to some of mm * internal APIs. @@ -67,10 +69,10 @@ early_param("kho", kho_parse_enable); * Keep track of memory that is to be preserved across KHO. * * The serializing side uses two levels of xarrays to manage chunks of per-order - * 512 byte bitmaps. For instance if PAGE_SIZE = 4096, the entire 1G order of a - * 1TB system would fit inside a single 512 byte bitmap. For order 0 allocations - * each bitmap will cover 16M of address space. Thus, for 16G of memory at most - * 512K of bitmap memory will be needed for order 0. + * PAGE_SIZE byte bitmaps. For instance if PAGE_SIZE = 4096, the entire 1G order + * of a 8TB system would fit inside a single 4096 byte bitmap. For order 0 + * allocations each bitmap will cover 128M of address space. Thus, for 16G of + * memory at most 512K of bitmap memory will be needed for order 0. * * This approach is fully incremental, as the serialization progresses folios * can continue be aggregated to the tracker. The final step, immediately prior @@ -78,12 +80,14 @@ early_param("kho", kho_parse_enable); * successor kernel to parse. */ -#define PRESERVE_BITS (512 * 8) +#define PRESERVE_BITS (PAGE_SIZE * 8) struct kho_mem_phys_bits { DECLARE_BITMAP(preserve, PRESERVE_BITS); }; +static_assert(sizeof(struct kho_mem_phys_bits) == PAGE_SIZE); + struct kho_mem_phys { /* * Points to kho_mem_phys_bits, a sparse bitmap array. Each bit is sized @@ -131,28 +135,28 @@ static struct kho_out kho_out = { .finalized = false, }; -static void *xa_load_or_alloc(struct xarray *xa, unsigned long index, size_t sz) +static void *xa_load_or_alloc(struct xarray *xa, unsigned long index) { - void *elm, *res; + void *res = xa_load(xa, index); + + if (res) + return res; - elm = xa_load(xa, index); - if (elm) - return elm; + void *elm __free(free_page) = (void *)get_zeroed_page(GFP_KERNEL); - elm = kzalloc(sz, GFP_KERNEL); if (!elm) return ERR_PTR(-ENOMEM); + if (WARN_ON(kho_scratch_overlap(virt_to_phys(elm), PAGE_SIZE))) + return ERR_PTR(-EINVAL); + res = xa_cmpxchg(xa, index, NULL, elm, GFP_KERNEL); if (xa_is_err(res)) - res = ERR_PTR(xa_err(res)); - - if (res) { - kfree(elm); + return ERR_PTR(xa_err(res)); + else if (res) return res; - } - return elm; + return no_free_ptr(elm); } static void __kho_unpreserve(struct kho_mem_track *track, unsigned long pfn, @@ -167,12 +171,12 @@ static void __kho_unpreserve(struct kho_mem_track *track, unsigned long pfn, const unsigned long pfn_high = pfn >> order; physxa = xa_load(&track->orders, order); - if (!physxa) - continue; + if (WARN_ON_ONCE(!physxa)) + return; bits = xa_load(&physxa->phys_bits, pfn_high / PRESERVE_BITS); - if (!bits) - continue; + if (WARN_ON_ONCE(!bits)) + return; clear_bit(pfn_high % PRESERVE_BITS, bits->preserve); @@ -216,8 +220,7 @@ static int __kho_preserve_order(struct kho_mem_track *track, unsigned long pfn, } } - bits = xa_load_or_alloc(&physxa->phys_bits, pfn_high / PRESERVE_BITS, - sizeof(*bits)); + bits = xa_load_or_alloc(&physxa->phys_bits, pfn_high / PRESERVE_BITS); if (IS_ERR(bits)) return PTR_ERR(bits); @@ -345,15 +348,19 @@ static_assert(sizeof(struct khoser_mem_chunk) == PAGE_SIZE); static struct khoser_mem_chunk *new_chunk(struct khoser_mem_chunk *cur_chunk, unsigned long order) { - struct khoser_mem_chunk *chunk; + struct khoser_mem_chunk *chunk __free(free_page) = NULL; - chunk = kzalloc(PAGE_SIZE, GFP_KERNEL); + chunk = (void *)get_zeroed_page(GFP_KERNEL); if (!chunk) - return NULL; + return ERR_PTR(-ENOMEM); + + if (WARN_ON(kho_scratch_overlap(virt_to_phys(chunk), PAGE_SIZE))) + return ERR_PTR(-EINVAL); + chunk->hdr.order = order; if (cur_chunk) KHOSER_STORE_PTR(cur_chunk->hdr.next, chunk); - return chunk; + return no_free_ptr(chunk); } static void kho_mem_ser_free(struct khoser_mem_chunk *first_chunk) @@ -374,14 +381,17 @@ static int kho_mem_serialize(struct kho_serialization *ser) struct khoser_mem_chunk *chunk = NULL; struct kho_mem_phys *physxa; unsigned long order; + int err = -ENOMEM; xa_for_each(&ser->track.orders, order, physxa) { struct kho_mem_phys_bits *bits; unsigned long phys; chunk = new_chunk(chunk, order); - if (!chunk) + if (IS_ERR(chunk)) { + err = PTR_ERR(chunk); goto err_free; + } if (!first_chunk) first_chunk = chunk; @@ -391,8 +401,10 @@ static int kho_mem_serialize(struct kho_serialization *ser) if (chunk->hdr.num_elms == ARRAY_SIZE(chunk->bitmaps)) { chunk = new_chunk(chunk, order); - if (!chunk) + if (IS_ERR(chunk)) { + err = PTR_ERR(chunk); goto err_free; + } } elm = &chunk->bitmaps[chunk->hdr.num_elms]; @@ -409,7 +421,7 @@ static int kho_mem_serialize(struct kho_serialization *ser) err_free: kho_mem_ser_free(first_chunk); - return -ENOMEM; + return err; } static void __init deserialize_bitmap(unsigned int order, @@ -465,8 +477,8 @@ static void __init kho_mem_deserialize(const void *fdt) * area for early allocations that happen before page allocator is * initialized. */ -static struct kho_scratch *kho_scratch; -static unsigned int kho_scratch_cnt; +struct kho_scratch *kho_scratch; +unsigned int kho_scratch_cnt; /* * The scratch areas are scaled by default as percent of memory allocated from @@ -752,6 +764,9 @@ int kho_preserve_folio(struct folio *folio) const unsigned int order = folio_order(folio); struct kho_mem_track *track = &kho_out.ser.track; + if (WARN_ON(kho_scratch_overlap(pfn << PAGE_SHIFT, PAGE_SIZE << order))) + return -EINVAL; + return __kho_preserve_order(track, pfn, order); } EXPORT_SYMBOL_GPL(kho_preserve_folio); @@ -775,6 +790,11 @@ int kho_preserve_pages(struct page *page, unsigned int nr_pages) unsigned long failed_pfn = 0; int err = 0; + if (WARN_ON(kho_scratch_overlap(start_pfn << PAGE_SHIFT, + nr_pages << PAGE_SHIFT))) { + return -EINVAL; + } + while (pfn < end_pfn) { const unsigned int order = min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn)); @@ -862,16 +882,17 @@ err_free: return NULL; } -static void kho_vmalloc_unpreserve_chunk(struct kho_vmalloc_chunk *chunk) +static void kho_vmalloc_unpreserve_chunk(struct kho_vmalloc_chunk *chunk, + unsigned short order) { struct kho_mem_track *track = &kho_out.ser.track; unsigned long pfn = PHYS_PFN(virt_to_phys(chunk)); __kho_unpreserve(track, pfn, pfn + 1); - for (int i = 0; chunk->phys[i]; i++) { + for (int i = 0; i < ARRAY_SIZE(chunk->phys) && chunk->phys[i]; i++) { pfn = PHYS_PFN(chunk->phys[i]); - __kho_unpreserve(track, pfn, pfn + 1); + __kho_unpreserve(track, pfn, pfn + (1 << order)); } } @@ -882,7 +903,7 @@ static void kho_vmalloc_free_chunks(struct kho_vmalloc *kho_vmalloc) while (chunk) { struct kho_vmalloc_chunk *tmp = chunk; - kho_vmalloc_unpreserve_chunk(chunk); + kho_vmalloc_unpreserve_chunk(chunk, kho_vmalloc->order); chunk = KHOSER_LOAD_PTR(chunk->hdr.next); free_page((unsigned long)tmp); @@ -992,7 +1013,7 @@ void *kho_restore_vmalloc(const struct kho_vmalloc *preservation) while (chunk) { struct page *page; - for (int i = 0; chunk->phys[i]; i++) { + for (int i = 0; i < ARRAY_SIZE(chunk->phys) && chunk->phys[i]; i++) { phys_addr_t phys = chunk->phys[i]; if (idx + contig_pages > total_pages) diff --git a/kernel/kexec_handover_debug.c b/kernel/kexec_handover_debug.c new file mode 100644 index 000000000000..6efb696f5426 --- /dev/null +++ b/kernel/kexec_handover_debug.c @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * kexec_handover_debug.c - kexec handover optional debug functionality + * Copyright (C) 2025 Google LLC, Pasha Tatashin <pasha.tatashin@soleen.com> + */ + +#define pr_fmt(fmt) "KHO: " fmt + +#include "kexec_handover_internal.h" + +bool kho_scratch_overlap(phys_addr_t phys, size_t size) +{ + phys_addr_t scratch_start, scratch_end; + unsigned int i; + + for (i = 0; i < kho_scratch_cnt; i++) { + scratch_start = kho_scratch[i].addr; + scratch_end = kho_scratch[i].addr + kho_scratch[i].size; + + if (phys < scratch_end && (phys + size) > scratch_start) + return true; + } + + return false; +} diff --git a/kernel/kexec_handover_internal.h b/kernel/kexec_handover_internal.h new file mode 100644 index 000000000000..3c3c7148ceed --- /dev/null +++ b/kernel/kexec_handover_internal.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef LINUX_KEXEC_HANDOVER_INTERNAL_H +#define LINUX_KEXEC_HANDOVER_INTERNAL_H + +#include <linux/kexec_handover.h> +#include <linux/types.h> + +extern struct kho_scratch *kho_scratch; +extern unsigned int kho_scratch_cnt; + +#ifdef CONFIG_KEXEC_HANDOVER_DEBUG +bool kho_scratch_overlap(phys_addr_t phys, size_t size); +#else +static inline bool kho_scratch_overlap(phys_addr_t phys, size_t size) +{ + return false; +} +#endif /* CONFIG_KEXEC_HANDOVER_DEBUG */ + +#endif /* LINUX_KEXEC_HANDOVER_INTERNAL_H */ diff --git a/kernel/kthread.c b/kernel/kthread.c index 31b072e8d427..99a3808d086f 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -593,18 +593,16 @@ EXPORT_SYMBOL(kthread_create_on_node); static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, unsigned int state) { - unsigned long flags; - if (!wait_task_inactive(p, state)) { WARN_ON(1); return; } + scoped_guard (raw_spinlock_irqsave, &p->pi_lock) + set_cpus_allowed_force(p, mask); + /* It's safe because the task is inactive. */ - raw_spin_lock_irqsave(&p->pi_lock, flags); - do_set_cpus_allowed(p, mask); p->flags |= PF_NO_SETAFFINITY; - raw_spin_unlock_irqrestore(&p->pi_lock, flags); } static void __kthread_bind(struct task_struct *p, unsigned int cpu, unsigned int state) @@ -857,7 +855,6 @@ int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask) { struct kthread *kthread = to_kthread(p); cpumask_var_t affinity; - unsigned long flags; int ret = 0; if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE) || kthread->started) { @@ -882,10 +879,8 @@ int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask) list_add_tail(&kthread->hotplug_node, &kthreads_hotplug); kthread_fetch_affinity(kthread, affinity); - /* It's safe because the task is inactive. */ - raw_spin_lock_irqsave(&p->pi_lock, flags); - do_set_cpus_allowed(p, affinity); - raw_spin_unlock_irqrestore(&p->pi_lock, flags); + scoped_guard (raw_spinlock_irqsave, &p->pi_lock) + set_cpus_allowed_force(p, affinity); mutex_unlock(&kthreads_hotplug_lock); out: diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c index 949103fd8e9b..2c6b02d4699b 100644 --- a/kernel/locking/mutex-debug.c +++ b/kernel/locking/mutex-debug.c @@ -78,16 +78,8 @@ void debug_mutex_unlock(struct mutex *lock) } } -void debug_mutex_init(struct mutex *lock, const char *name, - struct lock_class_key *key) +void debug_mutex_init(struct mutex *lock) { -#ifdef CONFIG_DEBUG_LOCK_ALLOC - /* - * Make sure we are not reinitializing a held lock: - */ - debug_check_no_locks_freed((void *)lock, sizeof(*lock)); - lockdep_init_map_wait(&lock->dep_map, name, key, 0, LD_WAIT_SLEEP); -#endif lock->magic = lock; } diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index de7d6702cd96..2a1d165b3167 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -43,8 +43,7 @@ # define MUTEX_WARN_ON(cond) #endif -void -__mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) +static void __mutex_init_generic(struct mutex *lock) { atomic_long_set(&lock->owner, 0); raw_spin_lock_init(&lock->wait_lock); @@ -52,10 +51,8 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) #ifdef CONFIG_MUTEX_SPIN_ON_OWNER osq_lock_init(&lock->osq); #endif - - debug_mutex_init(lock, name, key); + debug_mutex_init(lock); } -EXPORT_SYMBOL(__mutex_init); static inline struct task_struct *__owner_task(unsigned long owner) { @@ -142,6 +139,11 @@ static inline bool __mutex_trylock(struct mutex *lock) * There is nothing that would stop spreading the lockdep annotations outwards * except more code. */ +void mutex_init_generic(struct mutex *lock) +{ + __mutex_init_generic(lock); +} +EXPORT_SYMBOL(mutex_init_generic); /* * Optimistic trylock that only works in the uncontended case. Make sure to @@ -166,7 +168,21 @@ static __always_inline bool __mutex_unlock_fast(struct mutex *lock) return atomic_long_try_cmpxchg_release(&lock->owner, &curr, 0UL); } -#endif + +#else /* !CONFIG_DEBUG_LOCK_ALLOC */ + +void mutex_init_lockep(struct mutex *lock, const char *name, struct lock_class_key *key) +{ + __mutex_init_generic(lock); + + /* + * Make sure we are not reinitializing a held lock: + */ + debug_check_no_locks_freed((void *)lock, sizeof(*lock)); + lockdep_init_map_wait(&lock->dep_map, name, key, 0, LD_WAIT_SLEEP); +} +EXPORT_SYMBOL(mutex_init_lockep); +#endif /* !CONFIG_DEBUG_LOCK_ALLOC */ static inline void __mutex_set_flag(struct mutex *lock, unsigned long flag) { diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h index 2e8080a9bee3..9ad4da8cea00 100644 --- a/kernel/locking/mutex.h +++ b/kernel/locking/mutex.h @@ -59,8 +59,7 @@ extern void debug_mutex_add_waiter(struct mutex *lock, extern void debug_mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, struct task_struct *task); extern void debug_mutex_unlock(struct mutex *lock); -extern void debug_mutex_init(struct mutex *lock, const char *name, - struct lock_class_key *key); +extern void debug_mutex_init(struct mutex *lock); #else /* CONFIG_DEBUG_MUTEXES */ # define debug_mutex_lock_common(lock, waiter) do { } while (0) # define debug_mutex_wake_waiter(lock, waiter) do { } while (0) @@ -68,6 +67,6 @@ extern void debug_mutex_init(struct mutex *lock, const char *name, # define debug_mutex_add_waiter(lock, waiter, ti) do { } while (0) # define debug_mutex_remove_waiter(lock, waiter, ti) do { } while (0) # define debug_mutex_unlock(lock) do { } while (0) -# define debug_mutex_init(lock, name, key) do { } while (0) +# define debug_mutex_init(lock) do { } while (0) #endif /* !CONFIG_DEBUG_MUTEXES */ #endif /* CONFIG_PREEMPT_RT */ diff --git a/kernel/locking/rtmutex_api.c b/kernel/locking/rtmutex_api.c index bafd5af98eae..59dbd29cb219 100644 --- a/kernel/locking/rtmutex_api.c +++ b/kernel/locking/rtmutex_api.c @@ -515,13 +515,11 @@ void rt_mutex_debug_task_free(struct task_struct *task) #ifdef CONFIG_PREEMPT_RT /* Mutexes */ -void __mutex_rt_init(struct mutex *mutex, const char *name, - struct lock_class_key *key) +static void __mutex_rt_init_generic(struct mutex *mutex) { + rt_mutex_base_init(&mutex->rtmutex); debug_check_no_locks_freed((void *)mutex, sizeof(*mutex)); - lockdep_init_map_wait(&mutex->dep_map, name, key, 0, LD_WAIT_SLEEP); } -EXPORT_SYMBOL(__mutex_rt_init); static __always_inline int __mutex_lock_common(struct mutex *lock, unsigned int state, @@ -542,6 +540,13 @@ static __always_inline int __mutex_lock_common(struct mutex *lock, } #ifdef CONFIG_DEBUG_LOCK_ALLOC +void mutex_rt_init_lockdep(struct mutex *mutex, const char *name, struct lock_class_key *key) +{ + __mutex_rt_init_generic(mutex); + lockdep_init_map_wait(&mutex->dep_map, name, key, 0, LD_WAIT_SLEEP); +} +EXPORT_SYMBOL(mutex_rt_init_lockdep); + void __sched mutex_lock_nested(struct mutex *lock, unsigned int subclass) { __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_); @@ -598,6 +603,12 @@ int __sched _mutex_trylock_nest_lock(struct mutex *lock, EXPORT_SYMBOL_GPL(_mutex_trylock_nest_lock); #else /* CONFIG_DEBUG_LOCK_ALLOC */ +void mutex_rt_init_generic(struct mutex *mutex) +{ + __mutex_rt_init_generic(mutex); +} +EXPORT_SYMBOL(mutex_rt_init_generic); + void __sched mutex_lock(struct mutex *lock) { __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_); diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c index 87b03d2e41db..2338b3adfb55 100644 --- a/kernel/locking/spinlock_debug.c +++ b/kernel/locking/spinlock_debug.c @@ -184,8 +184,8 @@ void do_raw_read_unlock(rwlock_t *lock) static inline void debug_write_lock_before(rwlock_t *lock) { RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic"); - RWLOCK_BUG_ON(lock->owner == current, lock, "recursion"); - RWLOCK_BUG_ON(lock->owner_cpu == raw_smp_processor_id(), + RWLOCK_BUG_ON(READ_ONCE(lock->owner) == current, lock, "recursion"); + RWLOCK_BUG_ON(READ_ONCE(lock->owner_cpu) == raw_smp_processor_id(), lock, "cpu recursion"); } diff --git a/kernel/nscommon.c b/kernel/nscommon.c index c1fb2bad6d72..bdc3c86231d3 100644 --- a/kernel/nscommon.c +++ b/kernel/nscommon.c @@ -1,7 +1,10 @@ // SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2025 Christian Brauner <brauner@kernel.org> */ #include <linux/ns_common.h> +#include <linux/nstree.h> #include <linux/proc_ns.h> +#include <linux/user_namespace.h> #include <linux/vfsdebug.h> #ifdef CONFIG_DEBUG_VFS @@ -52,26 +55,257 @@ static void ns_debug(struct ns_common *ns, const struct proc_ns_operations *ops) int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum) { + int ret = 0; + refcount_set(&ns->__ns_ref, 1); ns->stashed = NULL; ns->ops = ops; ns->ns_id = 0; ns->ns_type = ns_type; - RB_CLEAR_NODE(&ns->ns_tree_node); - INIT_LIST_HEAD(&ns->ns_list_node); + ns_tree_node_init(&ns->ns_tree_node); + ns_tree_node_init(&ns->ns_unified_node); + ns_tree_node_init(&ns->ns_owner_node); + ns_tree_root_init(&ns->ns_owner_root); #ifdef CONFIG_DEBUG_VFS ns_debug(ns, ops); #endif - if (inum) { + if (inum) ns->inum = inum; - return 0; - } - return proc_alloc_inum(&ns->inum); + else + ret = proc_alloc_inum(&ns->inum); + if (ret) + return ret; + /* + * Tree ref starts at 0. It's incremented when namespace enters + * active use (installed in nsproxy) and decremented when all + * active uses are gone. Initial namespaces are always active. + */ + if (is_ns_init_inum(ns)) + atomic_set(&ns->__ns_ref_active, 1); + else + atomic_set(&ns->__ns_ref_active, 0); + return 0; } void __ns_common_free(struct ns_common *ns) { proc_free_inum(ns->inum); } + +struct ns_common *__must_check ns_owner(struct ns_common *ns) +{ + struct user_namespace *owner; + + if (unlikely(!ns->ops)) + return NULL; + VFS_WARN_ON_ONCE(!ns->ops->owner); + owner = ns->ops->owner(ns); + VFS_WARN_ON_ONCE(!owner && ns != to_ns_common(&init_user_ns)); + if (!owner) + return NULL; + /* Skip init_user_ns as it's always active */ + if (owner == &init_user_ns) + return NULL; + return to_ns_common(owner); +} + +/* + * The active reference count works by having each namespace that gets + * created take a single active reference on its owning user namespace. + * That single reference is only released once the child namespace's + * active count itself goes down. + * + * A regular namespace tree might look as follow: + * Legend: + * + : adding active reference + * - : dropping active reference + * x : always active (initial namespace) + * + * + * net_ns pid_ns + * \ / + * + + + * user_ns1 (2) + * | + * ipc_ns | uts_ns + * \ | / + * + + + + * user_ns2 (3) + * | + * cgroup_ns | mnt_ns + * \ | / + * x x x + * init_user_ns (1) + * + * If both net_ns and pid_ns put their last active reference on + * themselves it will cascade to user_ns1 dropping its own active + * reference and dropping one active reference on user_ns2: + * + * net_ns pid_ns + * \ / + * - - + * user_ns1 (0) + * | + * ipc_ns | uts_ns + * \ | / + * + - + + * user_ns2 (2) + * | + * cgroup_ns | mnt_ns + * \ | / + * x x x + * init_user_ns (1) + * + * The iteration stops once we reach a namespace that still has active + * references. + */ +void __ns_ref_active_put(struct ns_common *ns) +{ + /* Initial namespaces are always active. */ + if (is_ns_init_id(ns)) + return; + + if (!atomic_dec_and_test(&ns->__ns_ref_active)) { + VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) < 0); + return; + } + + VFS_WARN_ON_ONCE(is_ns_init_id(ns)); + VFS_WARN_ON_ONCE(!__ns_ref_read(ns)); + + for (;;) { + ns = ns_owner(ns); + if (!ns) + return; + VFS_WARN_ON_ONCE(is_ns_init_id(ns)); + if (!atomic_dec_and_test(&ns->__ns_ref_active)) { + VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) < 0); + return; + } + } +} + +/* + * The active reference count works by having each namespace that gets + * created take a single active reference on its owning user namespace. + * That single reference is only released once the child namespace's + * active count itself goes down. This makes it possible to efficiently + * resurrect a namespace tree: + * + * A regular namespace tree might look as follow: + * Legend: + * + : adding active reference + * - : dropping active reference + * x : always active (initial namespace) + * + * + * net_ns pid_ns + * \ / + * + + + * user_ns1 (2) + * | + * ipc_ns | uts_ns + * \ | / + * + + + + * user_ns2 (3) + * | + * cgroup_ns | mnt_ns + * \ | / + * x x x + * init_user_ns (1) + * + * If both net_ns and pid_ns put their last active reference on + * themselves it will cascade to user_ns1 dropping its own active + * reference and dropping one active reference on user_ns2: + * + * net_ns pid_ns + * \ / + * - - + * user_ns1 (0) + * | + * ipc_ns | uts_ns + * \ | / + * + - + + * user_ns2 (2) + * | + * cgroup_ns | mnt_ns + * \ | / + * x x x + * init_user_ns (1) + * + * Assume the whole tree is dead but all namespaces are still active: + * + * net_ns pid_ns + * \ / + * - - + * user_ns1 (0) + * | + * ipc_ns | uts_ns + * \ | / + * - - - + * user_ns2 (0) + * | + * cgroup_ns | mnt_ns + * \ | / + * x x x + * init_user_ns (1) + * + * Now assume the net_ns gets resurrected (.e.g., via the SIOCGSKNS ioctl()): + * + * net_ns pid_ns + * \ / + * + - + * user_ns1 (0) + * | + * ipc_ns | uts_ns + * \ | / + * - + - + * user_ns2 (0) + * | + * cgroup_ns | mnt_ns + * \ | / + * x x x + * init_user_ns (1) + * + * If net_ns had a zero reference count and we bumped it we also need to + * take another reference on its owning user namespace. Similarly, if + * pid_ns had a zero reference count it also needs to take another + * reference on its owning user namespace. So both net_ns and pid_ns + * will each have their own reference on the owning user namespace. + * + * If the owning user namespace user_ns1 had a zero reference count then + * it also needs to take another reference on its owning user namespace + * and so on. + */ +void __ns_ref_active_get(struct ns_common *ns) +{ + int prev; + + /* Initial namespaces are always active. */ + if (is_ns_init_id(ns)) + return; + + /* If we didn't resurrect the namespace we're done. */ + prev = atomic_fetch_add(1, &ns->__ns_ref_active); + VFS_WARN_ON_ONCE(prev < 0); + if (likely(prev)) + return; + + /* + * We did resurrect it. Walk the ownership hierarchy upwards + * until we found an owning user namespace that is active. + */ + for (;;) { + ns = ns_owner(ns); + if (!ns) + return; + + VFS_WARN_ON_ONCE(is_ns_init_id(ns)); + prev = atomic_fetch_add(1, &ns->__ns_ref_active); + VFS_WARN_ON_ONCE(prev < 0); + if (likely(prev)) + return; + } +} diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 19aa64ab08c8..259c4b4f1eeb 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -26,6 +26,7 @@ #include <linux/syscalls.h> #include <linux/cgroup.h> #include <linux/perf_event.h> +#include <linux/nstree.h> static struct kmem_cache *nsproxy_cachep; @@ -59,6 +60,25 @@ static inline struct nsproxy *create_nsproxy(void) return nsproxy; } +static inline void nsproxy_free(struct nsproxy *ns) +{ + put_mnt_ns(ns->mnt_ns); + put_uts_ns(ns->uts_ns); + put_ipc_ns(ns->ipc_ns); + put_pid_ns(ns->pid_ns_for_children); + put_time_ns(ns->time_ns); + put_time_ns(ns->time_ns_for_children); + put_cgroup_ns(ns->cgroup_ns); + put_net(ns->net_ns); + kmem_cache_free(nsproxy_cachep, ns); +} + +void deactivate_nsproxy(struct nsproxy *ns) +{ + nsproxy_ns_active_put(ns); + nsproxy_free(ns); +} + /* * Create new nsproxy and all of its the associated namespaces. * Return the newly created nsproxy. Do not attach this to the task, @@ -179,23 +199,11 @@ int copy_namespaces(u64 flags, struct task_struct *tsk) if ((flags & CLONE_VM) == 0) timens_on_fork(new_ns, tsk); + nsproxy_ns_active_get(new_ns); tsk->nsproxy = new_ns; return 0; } -void free_nsproxy(struct nsproxy *ns) -{ - put_mnt_ns(ns->mnt_ns); - put_uts_ns(ns->uts_ns); - put_ipc_ns(ns->ipc_ns); - put_pid_ns(ns->pid_ns_for_children); - put_time_ns(ns->time_ns); - put_time_ns(ns->time_ns_for_children); - put_cgroup_ns(ns->cgroup_ns); - put_net(ns->net_ns); - kmem_cache_free(nsproxy_cachep, ns); -} - /* * Called from unshare. Unshare all the namespaces part of nsproxy. * On success, returns the new nsproxy. @@ -232,6 +240,9 @@ void switch_task_namespaces(struct task_struct *p, struct nsproxy *new) might_sleep(); + if (new) + nsproxy_ns_active_get(new); + task_lock(p); ns = p->nsproxy; p->nsproxy = new; @@ -241,11 +252,27 @@ void switch_task_namespaces(struct task_struct *p, struct nsproxy *new) put_nsproxy(ns); } -void exit_task_namespaces(struct task_struct *p) +void exit_nsproxy_namespaces(struct task_struct *p) { switch_task_namespaces(p, NULL); } +void switch_cred_namespaces(const struct cred *old, const struct cred *new) +{ + ns_ref_active_get(new->user_ns); + ns_ref_active_put(old->user_ns); +} + +void get_cred_namespaces(struct task_struct *tsk) +{ + ns_ref_active_get(tsk->real_cred->user_ns); +} + +void exit_cred_namespaces(struct task_struct *tsk) +{ + ns_ref_active_put(tsk->real_cred->user_ns); +} + int exec_task_namespaces(void) { struct task_struct *tsk = current; @@ -315,7 +342,7 @@ static void put_nsset(struct nsset *nsset) if (nsset->fs && (flags & CLONE_NEWNS) && (flags & ~CLONE_NEWNS)) free_fs_struct(nsset->fs); if (nsset->nsproxy) - free_nsproxy(nsset->nsproxy); + nsproxy_free(nsset->nsproxy); } static int prepare_nsset(unsigned flags, struct nsset *nsset) diff --git a/kernel/nstree.c b/kernel/nstree.c index b24a320a11a6..f36c59e6951d 100644 --- a/kernel/nstree.c +++ b/kernel/nstree.c @@ -1,140 +1,261 @@ // SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2025 Christian Brauner <brauner@kernel.org> */ #include <linux/nstree.h> #include <linux/proc_ns.h> +#include <linux/rculist.h> #include <linux/vfsdebug.h> +#include <linux/syscalls.h> +#include <linux/user_namespace.h> -/** - * struct ns_tree - Namespace tree - * @ns_tree: Rbtree of namespaces of a particular type - * @ns_list: Sequentially walkable list of all namespaces of this type - * @ns_tree_lock: Seqlock to protect the tree and list - * @type: type of namespaces in this tree - */ -struct ns_tree { - struct rb_root ns_tree; - struct list_head ns_list; - seqlock_t ns_tree_lock; - int type; +static __cacheline_aligned_in_smp DEFINE_SEQLOCK(ns_tree_lock); + +DEFINE_LOCK_GUARD_0(ns_tree_writer, + write_seqlock(&ns_tree_lock), + write_sequnlock(&ns_tree_lock)) + +DEFINE_LOCK_GUARD_0(ns_tree_locked_reader, + read_seqlock_excl(&ns_tree_lock), + read_sequnlock_excl(&ns_tree_lock)) + +static struct ns_tree_root ns_unified_root = { /* protected by ns_tree_lock */ + .ns_rb = RB_ROOT, + .ns_list_head = LIST_HEAD_INIT(ns_unified_root.ns_list_head), }; -struct ns_tree mnt_ns_tree = { - .ns_tree = RB_ROOT, - .ns_list = LIST_HEAD_INIT(mnt_ns_tree.ns_list), - .ns_tree_lock = __SEQLOCK_UNLOCKED(mnt_ns_tree.ns_tree_lock), - .type = CLONE_NEWNS, +struct ns_tree_root mnt_ns_tree = { + .ns_rb = RB_ROOT, + .ns_list_head = LIST_HEAD_INIT(mnt_ns_tree.ns_list_head), }; -struct ns_tree net_ns_tree = { - .ns_tree = RB_ROOT, - .ns_list = LIST_HEAD_INIT(net_ns_tree.ns_list), - .ns_tree_lock = __SEQLOCK_UNLOCKED(net_ns_tree.ns_tree_lock), - .type = CLONE_NEWNET, +struct ns_tree_root net_ns_tree = { + .ns_rb = RB_ROOT, + .ns_list_head = LIST_HEAD_INIT(net_ns_tree.ns_list_head), }; EXPORT_SYMBOL_GPL(net_ns_tree); -struct ns_tree uts_ns_tree = { - .ns_tree = RB_ROOT, - .ns_list = LIST_HEAD_INIT(uts_ns_tree.ns_list), - .ns_tree_lock = __SEQLOCK_UNLOCKED(uts_ns_tree.ns_tree_lock), - .type = CLONE_NEWUTS, +struct ns_tree_root uts_ns_tree = { + .ns_rb = RB_ROOT, + .ns_list_head = LIST_HEAD_INIT(uts_ns_tree.ns_list_head), }; -struct ns_tree user_ns_tree = { - .ns_tree = RB_ROOT, - .ns_list = LIST_HEAD_INIT(user_ns_tree.ns_list), - .ns_tree_lock = __SEQLOCK_UNLOCKED(user_ns_tree.ns_tree_lock), - .type = CLONE_NEWUSER, +struct ns_tree_root user_ns_tree = { + .ns_rb = RB_ROOT, + .ns_list_head = LIST_HEAD_INIT(user_ns_tree.ns_list_head), }; -struct ns_tree ipc_ns_tree = { - .ns_tree = RB_ROOT, - .ns_list = LIST_HEAD_INIT(ipc_ns_tree.ns_list), - .ns_tree_lock = __SEQLOCK_UNLOCKED(ipc_ns_tree.ns_tree_lock), - .type = CLONE_NEWIPC, +struct ns_tree_root ipc_ns_tree = { + .ns_rb = RB_ROOT, + .ns_list_head = LIST_HEAD_INIT(ipc_ns_tree.ns_list_head), }; -struct ns_tree pid_ns_tree = { - .ns_tree = RB_ROOT, - .ns_list = LIST_HEAD_INIT(pid_ns_tree.ns_list), - .ns_tree_lock = __SEQLOCK_UNLOCKED(pid_ns_tree.ns_tree_lock), - .type = CLONE_NEWPID, +struct ns_tree_root pid_ns_tree = { + .ns_rb = RB_ROOT, + .ns_list_head = LIST_HEAD_INIT(pid_ns_tree.ns_list_head), }; -struct ns_tree cgroup_ns_tree = { - .ns_tree = RB_ROOT, - .ns_list = LIST_HEAD_INIT(cgroup_ns_tree.ns_list), - .ns_tree_lock = __SEQLOCK_UNLOCKED(cgroup_ns_tree.ns_tree_lock), - .type = CLONE_NEWCGROUP, +struct ns_tree_root cgroup_ns_tree = { + .ns_rb = RB_ROOT, + .ns_list_head = LIST_HEAD_INIT(cgroup_ns_tree.ns_list_head), }; -struct ns_tree time_ns_tree = { - .ns_tree = RB_ROOT, - .ns_list = LIST_HEAD_INIT(time_ns_tree.ns_list), - .ns_tree_lock = __SEQLOCK_UNLOCKED(time_ns_tree.ns_tree_lock), - .type = CLONE_NEWTIME, +struct ns_tree_root time_ns_tree = { + .ns_rb = RB_ROOT, + .ns_list_head = LIST_HEAD_INIT(time_ns_tree.ns_list_head), }; -DEFINE_COOKIE(namespace_cookie); +/** + * ns_tree_node_init - Initialize a namespace tree node + * @node: The node to initialize + * + * Initializes both the rbtree node and list entry. + */ +void ns_tree_node_init(struct ns_tree_node *node) +{ + RB_CLEAR_NODE(&node->ns_node); + INIT_LIST_HEAD(&node->ns_list_entry); +} + +/** + * ns_tree_root_init - Initialize a namespace tree root + * @root: The root to initialize + * + * Initializes both the rbtree root and list head. + */ +void ns_tree_root_init(struct ns_tree_root *root) +{ + root->ns_rb = RB_ROOT; + INIT_LIST_HEAD(&root->ns_list_head); +} + +/** + * ns_tree_node_empty - Check if a namespace tree node is empty + * @node: The node to check + * + * Returns true if the node is not in any tree. + */ +bool ns_tree_node_empty(const struct ns_tree_node *node) +{ + return RB_EMPTY_NODE(&node->ns_node); +} + +/** + * ns_tree_node_add - Add a node to a namespace tree + * @node: The node to add + * @root: The tree root to add to + * @cmp: Comparison function for rbtree insertion + * + * Adds the node to both the rbtree and the list, maintaining sorted order. + * The list is maintained in the same order as the rbtree to enable efficient + * iteration. + * + * Returns: NULL if insertion succeeded, existing node if duplicate found + */ +struct rb_node *ns_tree_node_add(struct ns_tree_node *node, + struct ns_tree_root *root, + int (*cmp)(struct rb_node *, const struct rb_node *)) +{ + struct rb_node *ret, *prev; + + /* Add to rbtree */ + ret = rb_find_add_rcu(&node->ns_node, &root->ns_rb, cmp); + + /* Add to list in sorted order */ + prev = rb_prev(&node->ns_node); + if (!prev) { + /* No previous node, add at head */ + list_add_rcu(&node->ns_list_entry, &root->ns_list_head); + } else { + /* Add after previous node */ + struct ns_tree_node *prev_node; + prev_node = rb_entry(prev, struct ns_tree_node, ns_node); + list_add_rcu(&node->ns_list_entry, &prev_node->ns_list_entry); + } + + return ret; +} + +/** + * ns_tree_node_del - Remove a node from a namespace tree + * @node: The node to remove + * @root: The tree root to remove from + * + * Removes the node from both the rbtree and the list atomically. + */ +void ns_tree_node_del(struct ns_tree_node *node, struct ns_tree_root *root) +{ + rb_erase(&node->ns_node, &root->ns_rb); + RB_CLEAR_NODE(&node->ns_node); + list_bidir_del_rcu(&node->ns_list_entry); +} static inline struct ns_common *node_to_ns(const struct rb_node *node) { if (!node) return NULL; - return rb_entry(node, struct ns_common, ns_tree_node); + return rb_entry(node, struct ns_common, ns_tree_node.ns_node); } -static inline int ns_cmp(struct rb_node *a, const struct rb_node *b) +static inline struct ns_common *node_to_ns_unified(const struct rb_node *node) { - struct ns_common *ns_a = node_to_ns(a); - struct ns_common *ns_b = node_to_ns(b); - u64 ns_id_a = ns_a->ns_id; - u64 ns_id_b = ns_b->ns_id; + if (!node) + return NULL; + return rb_entry(node, struct ns_common, ns_unified_node.ns_node); +} - if (ns_id_a < ns_id_b) +static inline struct ns_common *node_to_ns_owner(const struct rb_node *node) +{ + if (!node) + return NULL; + return rb_entry(node, struct ns_common, ns_owner_node.ns_node); +} + +static int ns_id_cmp(u64 id_a, u64 id_b) +{ + if (id_a < id_b) return -1; - if (ns_id_a > ns_id_b) + if (id_a > id_b) return 1; return 0; } -void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree *ns_tree) +static int ns_cmp(struct rb_node *a, const struct rb_node *b) +{ + return ns_id_cmp(node_to_ns(a)->ns_id, node_to_ns(b)->ns_id); +} + +static int ns_cmp_unified(struct rb_node *a, const struct rb_node *b) +{ + return ns_id_cmp(node_to_ns_unified(a)->ns_id, node_to_ns_unified(b)->ns_id); +} + +static int ns_cmp_owner(struct rb_node *a, const struct rb_node *b) { - struct rb_node *node, *prev; + return ns_id_cmp(node_to_ns_owner(a)->ns_id, node_to_ns_owner(b)->ns_id); +} + +void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree_root *ns_tree) +{ + struct rb_node *node; + const struct proc_ns_operations *ops = ns->ops; VFS_WARN_ON_ONCE(!ns->ns_id); - write_seqlock(&ns_tree->ns_tree_lock); + guard(ns_tree_writer)(); - VFS_WARN_ON_ONCE(ns->ns_type != ns_tree->type); + /* Add to per-type tree and list */ + node = ns_tree_node_add(&ns->ns_tree_node, ns_tree, ns_cmp); - node = rb_find_add_rcu(&ns->ns_tree_node, &ns_tree->ns_tree, ns_cmp); - /* - * If there's no previous entry simply add it after the - * head and if there is add it after the previous entry. - */ - prev = rb_prev(&ns->ns_tree_node); - if (!prev) - list_add_rcu(&ns->ns_list_node, &ns_tree->ns_list); - else - list_add_rcu(&ns->ns_list_node, &node_to_ns(prev)->ns_list_node); + /* Add to unified tree and list */ + ns_tree_node_add(&ns->ns_unified_node, &ns_unified_root, ns_cmp_unified); + + /* Add to owner's tree if applicable */ + if (ops) { + struct user_namespace *user_ns; - write_sequnlock(&ns_tree->ns_tree_lock); + VFS_WARN_ON_ONCE(!ops->owner); + user_ns = ops->owner(ns); + if (user_ns) { + struct ns_common *owner = &user_ns->ns; + VFS_WARN_ON_ONCE(owner->ns_type != CLONE_NEWUSER); + + /* Insert into owner's tree and list */ + ns_tree_node_add(&ns->ns_owner_node, &owner->ns_owner_root, ns_cmp_owner); + } else { + /* Only the initial user namespace doesn't have an owner. */ + VFS_WARN_ON_ONCE(ns != to_ns_common(&init_user_ns)); + } + } VFS_WARN_ON_ONCE(node); } -void __ns_tree_remove(struct ns_common *ns, struct ns_tree *ns_tree) +void __ns_tree_remove(struct ns_common *ns, struct ns_tree_root *ns_tree) { - VFS_WARN_ON_ONCE(RB_EMPTY_NODE(&ns->ns_tree_node)); - VFS_WARN_ON_ONCE(list_empty(&ns->ns_list_node)); - VFS_WARN_ON_ONCE(ns->ns_type != ns_tree->type); + const struct proc_ns_operations *ops = ns->ops; + struct user_namespace *user_ns; + + VFS_WARN_ON_ONCE(ns_tree_node_empty(&ns->ns_tree_node)); + VFS_WARN_ON_ONCE(list_empty(&ns->ns_tree_node.ns_list_entry)); + + write_seqlock(&ns_tree_lock); + + /* Remove from per-type tree and list */ + ns_tree_node_del(&ns->ns_tree_node, ns_tree); + + /* Remove from unified tree and list */ + ns_tree_node_del(&ns->ns_unified_node, &ns_unified_root); - write_seqlock(&ns_tree->ns_tree_lock); - rb_erase(&ns->ns_tree_node, &ns_tree->ns_tree); - list_bidir_del_rcu(&ns->ns_list_node); - RB_CLEAR_NODE(&ns->ns_tree_node); - write_sequnlock(&ns_tree->ns_tree_lock); + /* Remove from owner's tree if applicable */ + if (ops) { + user_ns = ops->owner(ns); + if (user_ns) { + struct ns_common *owner = &user_ns->ns; + ns_tree_node_del(&ns->ns_owner_node, &owner->ns_owner_root); + } + } + + write_sequnlock(&ns_tree_lock); } EXPORT_SYMBOL_GPL(__ns_tree_remove); @@ -150,8 +271,19 @@ static int ns_find(const void *key, const struct rb_node *node) return 0; } +static int ns_find_unified(const void *key, const struct rb_node *node) +{ + const u64 ns_id = *(u64 *)key; + const struct ns_common *ns = node_to_ns_unified(node); -static struct ns_tree *ns_tree_from_type(int ns_type) + if (ns_id < ns->ns_id) + return -1; + if (ns_id > ns->ns_id) + return 1; + return 0; +} + +static struct ns_tree_root *ns_tree_from_type(int ns_type) { switch (ns_type) { case CLONE_NEWCGROUP: @@ -175,73 +307,507 @@ static struct ns_tree *ns_tree_from_type(int ns_type) return NULL; } -struct ns_common *ns_tree_lookup_rcu(u64 ns_id, int ns_type) +static struct ns_common *__ns_unified_tree_lookup_rcu(u64 ns_id) { - struct ns_tree *ns_tree; struct rb_node *node; unsigned int seq; - RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "suspicious ns_tree_lookup_rcu() usage"); + do { + seq = read_seqbegin(&ns_tree_lock); + node = rb_find_rcu(&ns_id, &ns_unified_root.ns_rb, ns_find_unified); + if (node) + break; + } while (read_seqretry(&ns_tree_lock, seq)); + + return node_to_ns_unified(node); +} + +static struct ns_common *__ns_tree_lookup_rcu(u64 ns_id, int ns_type) +{ + struct ns_tree_root *ns_tree; + struct rb_node *node; + unsigned int seq; ns_tree = ns_tree_from_type(ns_type); if (!ns_tree) return NULL; do { - seq = read_seqbegin(&ns_tree->ns_tree_lock); - node = rb_find_rcu(&ns_id, &ns_tree->ns_tree, ns_find); + seq = read_seqbegin(&ns_tree_lock); + node = rb_find_rcu(&ns_id, &ns_tree->ns_rb, ns_find); if (node) break; - } while (read_seqretry(&ns_tree->ns_tree_lock, seq)); + } while (read_seqretry(&ns_tree_lock, seq)); - if (!node) - return NULL; + return node_to_ns(node); +} - VFS_WARN_ON_ONCE(node_to_ns(node)->ns_type != ns_type); +struct ns_common *ns_tree_lookup_rcu(u64 ns_id, int ns_type) +{ + RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "suspicious ns_tree_lookup_rcu() usage"); - return node_to_ns(node); + if (ns_type) + return __ns_tree_lookup_rcu(ns_id, ns_type); + + return __ns_unified_tree_lookup_rcu(ns_id); } /** - * ns_tree_adjoined_rcu - find the next/previous namespace in the same + * __ns_tree_adjoined_rcu - find the next/previous namespace in the same * tree * @ns: namespace to start from + * @ns_tree: namespace tree to search in * @previous: if true find the previous namespace, otherwise the next * * Find the next or previous namespace in the same tree as @ns. If * there is no next/previous namespace, -ENOENT is returned. */ struct ns_common *__ns_tree_adjoined_rcu(struct ns_common *ns, - struct ns_tree *ns_tree, bool previous) + struct ns_tree_root *ns_tree, bool previous) { struct list_head *list; RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "suspicious ns_tree_adjoined_rcu() usage"); if (previous) - list = rcu_dereference(list_bidir_prev_rcu(&ns->ns_list_node)); + list = rcu_dereference(list_bidir_prev_rcu(&ns->ns_tree_node.ns_list_entry)); else - list = rcu_dereference(list_next_rcu(&ns->ns_list_node)); - if (list_is_head(list, &ns_tree->ns_list)) + list = rcu_dereference(list_next_rcu(&ns->ns_tree_node.ns_list_entry)); + if (list_is_head(list, &ns_tree->ns_list_head)) return ERR_PTR(-ENOENT); - VFS_WARN_ON_ONCE(list_entry_rcu(list, struct ns_common, ns_list_node)->ns_type != ns_tree->type); - - return list_entry_rcu(list, struct ns_common, ns_list_node); + return list_entry_rcu(list, struct ns_common, ns_tree_node.ns_list_entry); } /** - * ns_tree_gen_id - generate a new namespace id + * __ns_tree_gen_id - generate a new namespace id * @ns: namespace to generate id for + * @id: if non-zero, this is the initial namespace and this is a fixed id * * Generates a new namespace id and assigns it to the namespace. All * namespaces types share the same id space and thus can be compared * directly. IOW, when two ids of two namespace are equal, they are * identical. */ -u64 ns_tree_gen_id(struct ns_common *ns) +u64 __ns_tree_gen_id(struct ns_common *ns, u64 id) { - guard(preempt)(); - ns->ns_id = gen_cookie_next(&namespace_cookie); + static atomic64_t namespace_cookie = ATOMIC64_INIT(NS_LAST_INIT_ID + 1); + + if (id) + ns->ns_id = id; + else + ns->ns_id = atomic64_inc_return(&namespace_cookie); return ns->ns_id; } + +struct klistns { + u64 __user *uns_ids; + u32 nr_ns_ids; + u64 last_ns_id; + u64 user_ns_id; + u32 ns_type; + struct user_namespace *user_ns; + bool userns_capable; + struct ns_common *first_ns; +}; + +static void __free_klistns_free(const struct klistns *kls) +{ + if (kls->user_ns_id != LISTNS_CURRENT_USER) + put_user_ns(kls->user_ns); + if (kls->first_ns && kls->first_ns->ops) + kls->first_ns->ops->put(kls->first_ns); +} + +#define NS_ALL (PID_NS | USER_NS | MNT_NS | UTS_NS | IPC_NS | NET_NS | CGROUP_NS | TIME_NS) + +static int copy_ns_id_req(const struct ns_id_req __user *req, + struct ns_id_req *kreq) +{ + int ret; + size_t usize; + + BUILD_BUG_ON(sizeof(struct ns_id_req) != NS_ID_REQ_SIZE_VER0); + + ret = get_user(usize, &req->size); + if (ret) + return -EFAULT; + if (unlikely(usize > PAGE_SIZE)) + return -E2BIG; + if (unlikely(usize < NS_ID_REQ_SIZE_VER0)) + return -EINVAL; + memset(kreq, 0, sizeof(*kreq)); + ret = copy_struct_from_user(kreq, sizeof(*kreq), req, usize); + if (ret) + return ret; + if (kreq->spare != 0) + return -EINVAL; + if (kreq->ns_type & ~NS_ALL) + return -EOPNOTSUPP; + return 0; +} + +static inline int prepare_klistns(struct klistns *kls, struct ns_id_req *kreq, + u64 __user *ns_ids, size_t nr_ns_ids) +{ + kls->last_ns_id = kreq->ns_id; + kls->user_ns_id = kreq->user_ns_id; + kls->nr_ns_ids = nr_ns_ids; + kls->ns_type = kreq->ns_type; + kls->uns_ids = ns_ids; + return 0; +} + +/* + * Lookup a namespace owned by owner with id >= ns_id. + * Returns the namespace with the smallest id that is >= ns_id. + */ +static struct ns_common *lookup_ns_owner_at(u64 ns_id, struct ns_common *owner) +{ + struct ns_common *ret = NULL; + struct rb_node *node; + + VFS_WARN_ON_ONCE(owner->ns_type != CLONE_NEWUSER); + + guard(ns_tree_locked_reader)(); + + node = owner->ns_owner_root.ns_rb.rb_node; + while (node) { + struct ns_common *ns; + + ns = node_to_ns_owner(node); + if (ns_id <= ns->ns_id) { + ret = ns; + if (ns_id == ns->ns_id) + break; + node = node->rb_left; + } else { + node = node->rb_right; + } + } + + if (ret) + ret = ns_get_unless_inactive(ret); + return ret; +} + +static struct ns_common *lookup_ns_id(u64 mnt_ns_id, int ns_type) +{ + struct ns_common *ns; + + guard(rcu)(); + ns = ns_tree_lookup_rcu(mnt_ns_id, ns_type); + if (!ns) + return NULL; + + if (!ns_get_unless_inactive(ns)) + return NULL; + + return ns; +} + +static inline bool __must_check ns_requested(const struct klistns *kls, + const struct ns_common *ns) +{ + return !kls->ns_type || (kls->ns_type & ns->ns_type); +} + +static inline bool __must_check may_list_ns(const struct klistns *kls, + struct ns_common *ns) +{ + if (kls->user_ns) { + if (kls->userns_capable) + return true; + } else { + struct ns_common *owner; + struct user_namespace *user_ns; + + owner = ns_owner(ns); + if (owner) + user_ns = to_user_ns(owner); + else + user_ns = &init_user_ns; + if (ns_capable_noaudit(user_ns, CAP_SYS_ADMIN)) + return true; + } + + if (is_current_namespace(ns)) + return true; + + if (ns->ns_type != CLONE_NEWUSER) + return false; + + if (ns_capable_noaudit(to_user_ns(ns), CAP_SYS_ADMIN)) + return true; + + return false; +} + +static inline void ns_put(struct ns_common *ns) +{ + if (ns && ns->ops) + ns->ops->put(ns); +} + +DEFINE_FREE(ns_put, struct ns_common *, if (!IS_ERR_OR_NULL(_T)) ns_put(_T)) + +static inline struct ns_common *__must_check legitimize_ns(const struct klistns *kls, + struct ns_common *candidate) +{ + struct ns_common *ns __free(ns_put) = NULL; + + if (!ns_requested(kls, candidate)) + return NULL; + + ns = ns_get_unless_inactive(candidate); + if (!ns) + return NULL; + + if (!may_list_ns(kls, ns)) + return NULL; + + return no_free_ptr(ns); +} + +static ssize_t do_listns_userns(struct klistns *kls) +{ + u64 __user *ns_ids = kls->uns_ids; + size_t nr_ns_ids = kls->nr_ns_ids; + struct ns_common *ns = NULL, *first_ns = NULL, *prev = NULL; + const struct list_head *head; + ssize_t ret; + + VFS_WARN_ON_ONCE(!kls->user_ns_id); + + if (kls->user_ns_id == LISTNS_CURRENT_USER) + ns = to_ns_common(current_user_ns()); + else if (kls->user_ns_id) + ns = lookup_ns_id(kls->user_ns_id, CLONE_NEWUSER); + if (!ns) + return -EINVAL; + kls->user_ns = to_user_ns(ns); + + /* + * Use the rbtree to find the first namespace we care about and + * then use it's list entry to iterate from there. + */ + if (kls->last_ns_id) { + kls->first_ns = lookup_ns_owner_at(kls->last_ns_id + 1, ns); + if (!kls->first_ns) + return -ENOENT; + first_ns = kls->first_ns; + } + + ret = 0; + head = &to_ns_common(kls->user_ns)->ns_owner_root.ns_list_head; + kls->userns_capable = ns_capable_noaudit(kls->user_ns, CAP_SYS_ADMIN); + + rcu_read_lock(); + + if (!first_ns) + first_ns = list_entry_rcu(head->next, typeof(*first_ns), ns_owner_node.ns_list_entry); + + ns = first_ns; + list_for_each_entry_from_rcu(ns, head, ns_owner_node.ns_list_entry) { + struct ns_common *valid; + + if (!nr_ns_ids) + break; + + valid = legitimize_ns(kls, ns); + if (!valid) + continue; + + rcu_read_unlock(); + + ns_put(prev); + prev = valid; + + if (put_user(valid->ns_id, ns_ids + ret)) { + ns_put(prev); + return -EFAULT; + } + + nr_ns_ids--; + ret++; + + rcu_read_lock(); + } + + rcu_read_unlock(); + ns_put(prev); + return ret; +} + +/* + * Lookup a namespace with id >= ns_id in either the unified tree or a type-specific tree. + * Returns the namespace with the smallest id that is >= ns_id. + */ +static struct ns_common *lookup_ns_id_at(u64 ns_id, int ns_type) +{ + struct ns_common *ret = NULL; + struct ns_tree_root *ns_tree = NULL; + struct rb_node *node; + + if (ns_type) { + ns_tree = ns_tree_from_type(ns_type); + if (!ns_tree) + return NULL; + } + + guard(ns_tree_locked_reader)(); + + if (ns_tree) + node = ns_tree->ns_rb.rb_node; + else + node = ns_unified_root.ns_rb.rb_node; + + while (node) { + struct ns_common *ns; + + if (ns_type) + ns = node_to_ns(node); + else + ns = node_to_ns_unified(node); + + if (ns_id <= ns->ns_id) { + if (ns_type) + ret = node_to_ns(node); + else + ret = node_to_ns_unified(node); + if (ns_id == ns->ns_id) + break; + node = node->rb_left; + } else { + node = node->rb_right; + } + } + + if (ret) + ret = ns_get_unless_inactive(ret); + return ret; +} + +static inline struct ns_common *first_ns_common(const struct list_head *head, + struct ns_tree_root *ns_tree) +{ + if (ns_tree) + return list_entry_rcu(head->next, struct ns_common, ns_tree_node.ns_list_entry); + return list_entry_rcu(head->next, struct ns_common, ns_unified_node.ns_list_entry); +} + +static inline struct ns_common *next_ns_common(struct ns_common *ns, + struct ns_tree_root *ns_tree) +{ + if (ns_tree) + return list_entry_rcu(ns->ns_tree_node.ns_list_entry.next, struct ns_common, ns_tree_node.ns_list_entry); + return list_entry_rcu(ns->ns_unified_node.ns_list_entry.next, struct ns_common, ns_unified_node.ns_list_entry); +} + +static inline bool ns_common_is_head(struct ns_common *ns, + const struct list_head *head, + struct ns_tree_root *ns_tree) +{ + if (ns_tree) + return &ns->ns_tree_node.ns_list_entry == head; + return &ns->ns_unified_node.ns_list_entry == head; +} + +static ssize_t do_listns(struct klistns *kls) +{ + u64 __user *ns_ids = kls->uns_ids; + size_t nr_ns_ids = kls->nr_ns_ids; + struct ns_common *ns, *first_ns = NULL, *prev = NULL; + struct ns_tree_root *ns_tree = NULL; + const struct list_head *head; + u32 ns_type; + ssize_t ret; + + if (hweight32(kls->ns_type) == 1) + ns_type = kls->ns_type; + else + ns_type = 0; + + if (ns_type) { + ns_tree = ns_tree_from_type(ns_type); + if (!ns_tree) + return -EINVAL; + } + + if (kls->last_ns_id) { + kls->first_ns = lookup_ns_id_at(kls->last_ns_id + 1, ns_type); + if (!kls->first_ns) + return -ENOENT; + first_ns = kls->first_ns; + } + + ret = 0; + if (ns_tree) + head = &ns_tree->ns_list_head; + else + head = &ns_unified_root.ns_list_head; + + rcu_read_lock(); + + if (!first_ns) + first_ns = first_ns_common(head, ns_tree); + + for (ns = first_ns; !ns_common_is_head(ns, head, ns_tree) && nr_ns_ids; + ns = next_ns_common(ns, ns_tree)) { + struct ns_common *valid; + + valid = legitimize_ns(kls, ns); + if (!valid) + continue; + + rcu_read_unlock(); + + ns_put(prev); + prev = valid; + + if (put_user(valid->ns_id, ns_ids + ret)) { + ns_put(prev); + return -EFAULT; + } + + nr_ns_ids--; + ret++; + + rcu_read_lock(); + } + + rcu_read_unlock(); + ns_put(prev); + return ret; +} + +SYSCALL_DEFINE4(listns, const struct ns_id_req __user *, req, + u64 __user *, ns_ids, size_t, nr_ns_ids, unsigned int, flags) +{ + struct klistns klns __free(klistns_free) = {}; + const size_t maxcount = 1000000; + struct ns_id_req kreq; + ssize_t ret; + + if (flags) + return -EINVAL; + + if (unlikely(nr_ns_ids > maxcount)) + return -EOVERFLOW; + + if (!access_ok(ns_ids, nr_ns_ids * sizeof(*ns_ids))) + return -EFAULT; + + ret = copy_ns_id_req(req, &kreq); + if (ret) + return ret; + + ret = prepare_klistns(&klns, &kreq, ns_ids, nr_ns_ids); + if (ret) + return ret; + + if (kreq.user_ns_id) + return do_listns_userns(&klns); + + return do_listns(&klns); +} diff --git a/kernel/pid.c b/kernel/pid.c index 4fffec767a63..a31771bc89c1 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -71,21 +71,16 @@ static int pid_max_max = PID_MAX_LIMIT; * the scheme scales to up to 4 million PIDs, runtime. */ struct pid_namespace init_pid_ns = { - .ns.__ns_ref = REFCOUNT_INIT(2), + .ns = NS_COMMON_INIT(init_pid_ns), .idr = IDR_INIT(init_pid_ns.idr), .pid_allocated = PIDNS_ADDING, .level = 0, .child_reaper = &init_task, .user_ns = &init_user_ns, - .ns.inum = ns_init_inum(&init_pid_ns), -#ifdef CONFIG_PID_NS - .ns.ops = &pidns_operations, -#endif .pid_max = PID_MAX_DEFAULT, #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) .memfd_noexec_scope = MEMFD_NOEXEC_SCOPE_EXEC, #endif - .ns.ns_type = ns_common_type(&init_pid_ns), }; EXPORT_SYMBOL_GPL(init_pid_ns); @@ -117,9 +112,13 @@ static void delayed_put_pid(struct rcu_head *rhp) void free_pid(struct pid *pid) { int i; + struct pid_namespace *active_ns; lockdep_assert_not_held(&tasklist_lock); + active_ns = pid->numbers[pid->level].ns; + ns_ref_active_put(active_ns); + spin_lock(&pidmap_lock); for (i = 0; i <= pid->level; i++) { struct upid *upid = pid->numbers + i; @@ -283,6 +282,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, } spin_unlock(&pidmap_lock); idr_preload_end(); + ns_ref_active_get(ns); return pid; diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 650be58d8d18..e48f5de41361 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -184,7 +184,7 @@ struct pid_namespace *copy_pid_ns(u64 flags, void put_pid_ns(struct pid_namespace *ns) { - if (ns && ns != &init_pid_ns && ns_ref_put(ns)) + if (ns && ns_ref_put(ns)) schedule_work(&ns->work); } EXPORT_SYMBOL_GPL(put_pid_ns); diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 54a623680019..05337f437cca 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -202,6 +202,17 @@ config PM_WAKELOCKS_GC depends on PM_WAKELOCKS default y +config PM_QOS_CPU_SYSTEM_WAKEUP + bool "User space interface for CPU system wakeup QoS" + depends on CPU_IDLE + help + Enable this to allow user space via the cpu_wakeup_latency file to + specify a CPU system wakeup latency limit. + + This may be particularly useful for platforms supporting multiple low + power states for CPUs during system-wide suspend and s2idle in + particular. + config PM bool "Device power management core functionality" help diff --git a/kernel/power/Makefile b/kernel/power/Makefile index 874ad834dc8d..773e2789412b 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -21,4 +21,6 @@ obj-$(CONFIG_PM_WAKELOCKS) += wakelock.o obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o -obj-$(CONFIG_ENERGY_MODEL) += energy_model.o +obj-$(CONFIG_ENERGY_MODEL) += em.o +em-y := energy_model.o +em-$(CONFIG_NET) += em_netlink_autogen.o em_netlink.o diff --git a/kernel/power/console.c b/kernel/power/console.c index 19c48aa5355d..a906a0ac0f9b 100644 --- a/kernel/power/console.c +++ b/kernel/power/console.c @@ -44,9 +44,10 @@ static LIST_HEAD(pm_vt_switch_list); * no_console_suspend argument has been passed on the command line, VT * switches will occur. */ -void pm_vt_switch_required(struct device *dev, bool required) +int pm_vt_switch_required(struct device *dev, bool required) { struct pm_vt_switch *entry, *tmp; + int ret = 0; mutex_lock(&vt_switch_mutex); list_for_each_entry(tmp, &pm_vt_switch_list, head) { @@ -58,8 +59,10 @@ void pm_vt_switch_required(struct device *dev, bool required) } entry = kmalloc(sizeof(*entry), GFP_KERNEL); - if (!entry) + if (!entry) { + ret = -ENOMEM; goto out; + } entry->required = required; entry->dev = dev; @@ -67,6 +70,7 @@ void pm_vt_switch_required(struct device *dev, bool required) list_add(&entry->head, &pm_vt_switch_list); out: mutex_unlock(&vt_switch_mutex); + return ret; } EXPORT_SYMBOL(pm_vt_switch_required); diff --git a/kernel/power/em_netlink.c b/kernel/power/em_netlink.c new file mode 100644 index 000000000000..4b85da138a06 --- /dev/null +++ b/kernel/power/em_netlink.c @@ -0,0 +1,308 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * + * Generic netlink for energy model. + * + * Copyright (c) 2025 Valve Corporation. + * Author: Changwoo Min <changwoo@igalia.com> + */ + +#define pr_fmt(fmt) "energy_model: " fmt + +#include <linux/energy_model.h> +#include <net/sock.h> +#include <net/genetlink.h> +#include <uapi/linux/energy_model.h> + +#include "em_netlink.h" +#include "em_netlink_autogen.h" + +#define EM_A_PD_CPUS_LEN 256 + +/*************************** Command encoding ********************************/ +static int __em_nl_get_pd_size(struct em_perf_domain *pd, void *data) +{ + char cpus_buf[EM_A_PD_CPUS_LEN]; + int *tot_msg_sz = data; + int msg_sz, cpus_sz; + + cpus_sz = snprintf(cpus_buf, sizeof(cpus_buf), "%*pb", + cpumask_pr_args(to_cpumask(pd->cpus))); + + msg_sz = nla_total_size(0) + /* EM_A_PDS_PD */ + nla_total_size(sizeof(u32)) + /* EM_A_PD_PD_ID */ + nla_total_size_64bit(sizeof(u64)) + /* EM_A_PD_FLAGS */ + nla_total_size(cpus_sz); /* EM_A_PD_CPUS */ + + *tot_msg_sz += nlmsg_total_size(genlmsg_msg_size(msg_sz)); + return 0; +} + +static int __em_nl_get_pd(struct em_perf_domain *pd, void *data) +{ + char cpus_buf[EM_A_PD_CPUS_LEN]; + struct sk_buff *msg = data; + struct nlattr *entry; + + entry = nla_nest_start(msg, EM_A_PDS_PD); + if (!entry) + goto out_cancel_nest; + + if (nla_put_u32(msg, EM_A_PD_PD_ID, pd->id)) + goto out_cancel_nest; + + if (nla_put_u64_64bit(msg, EM_A_PD_FLAGS, pd->flags, EM_A_PD_PAD)) + goto out_cancel_nest; + + snprintf(cpus_buf, sizeof(cpus_buf), "%*pb", + cpumask_pr_args(to_cpumask(pd->cpus))); + if (nla_put_string(msg, EM_A_PD_CPUS, cpus_buf)) + goto out_cancel_nest; + + nla_nest_end(msg, entry); + + return 0; + +out_cancel_nest: + nla_nest_cancel(msg, entry); + + return -EMSGSIZE; +} + +int em_nl_get_pds_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct sk_buff *msg; + void *hdr; + int cmd = info->genlhdr->cmd; + int ret = -EMSGSIZE, msg_sz = 0; + + for_each_em_perf_domain(__em_nl_get_pd_size, &msg_sz); + + msg = genlmsg_new(msg_sz, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + hdr = genlmsg_put_reply(msg, info, &em_nl_family, 0, cmd); + if (!hdr) + goto out_free_msg; + + ret = for_each_em_perf_domain(__em_nl_get_pd, msg); + if (ret) + goto out_cancel_msg; + + genlmsg_end(msg, hdr); + + return genlmsg_reply(msg, info); + +out_cancel_msg: + genlmsg_cancel(msg, hdr); +out_free_msg: + nlmsg_free(msg); + + return ret; +} + +static struct em_perf_domain *__em_nl_get_pd_table_id(struct nlattr **attrs) +{ + struct em_perf_domain *pd; + int id; + + if (!attrs[EM_A_PD_TABLE_PD_ID]) + return NULL; + + id = nla_get_u32(attrs[EM_A_PD_TABLE_PD_ID]); + pd = em_perf_domain_get_by_id(id); + return pd; +} + +static int __em_nl_get_pd_table_size(const struct em_perf_domain *pd) +{ + int id_sz, ps_sz; + + id_sz = nla_total_size(sizeof(u32)); /* EM_A_PD_TABLE_PD_ID */ + ps_sz = nla_total_size(0) + /* EM_A_PD_TABLE_PS */ + nla_total_size_64bit(sizeof(u64)) + /* EM_A_PS_PERFORMANCE */ + nla_total_size_64bit(sizeof(u64)) + /* EM_A_PS_FREQUENCY */ + nla_total_size_64bit(sizeof(u64)) + /* EM_A_PS_POWER */ + nla_total_size_64bit(sizeof(u64)) + /* EM_A_PS_COST */ + nla_total_size_64bit(sizeof(u64)); /* EM_A_PS_FLAGS */ + ps_sz *= pd->nr_perf_states; + + return nlmsg_total_size(genlmsg_msg_size(id_sz + ps_sz)); +} + +static int __em_nl_get_pd_table(struct sk_buff *msg, const struct em_perf_domain *pd) +{ + struct em_perf_state *table, *ps; + struct nlattr *entry; + int i; + + if (nla_put_u32(msg, EM_A_PD_TABLE_PD_ID, pd->id)) + goto out_err; + + rcu_read_lock(); + table = em_perf_state_from_pd((struct em_perf_domain *)pd); + + for (i = 0; i < pd->nr_perf_states; i++) { + ps = &table[i]; + + entry = nla_nest_start(msg, EM_A_PD_TABLE_PS); + if (!entry) + goto out_unlock_ps; + + if (nla_put_u64_64bit(msg, EM_A_PS_PERFORMANCE, + ps->performance, EM_A_PS_PAD)) + goto out_cancel_ps_nest; + if (nla_put_u64_64bit(msg, EM_A_PS_FREQUENCY, + ps->frequency, EM_A_PS_PAD)) + goto out_cancel_ps_nest; + if (nla_put_u64_64bit(msg, EM_A_PS_POWER, + ps->power, EM_A_PS_PAD)) + goto out_cancel_ps_nest; + if (nla_put_u64_64bit(msg, EM_A_PS_COST, + ps->cost, EM_A_PS_PAD)) + goto out_cancel_ps_nest; + if (nla_put_u64_64bit(msg, EM_A_PS_FLAGS, + ps->flags, EM_A_PS_PAD)) + goto out_cancel_ps_nest; + + nla_nest_end(msg, entry); + } + rcu_read_unlock(); + return 0; + +out_cancel_ps_nest: + nla_nest_cancel(msg, entry); +out_unlock_ps: + rcu_read_unlock(); +out_err: + return -EMSGSIZE; +} + +int em_nl_get_pd_table_doit(struct sk_buff *skb, struct genl_info *info) +{ + int cmd = info->genlhdr->cmd; + int msg_sz, ret = -EMSGSIZE; + struct em_perf_domain *pd; + struct sk_buff *msg; + void *hdr; + + pd = __em_nl_get_pd_table_id(info->attrs); + if (!pd) + return -EINVAL; + + msg_sz = __em_nl_get_pd_table_size(pd); + + msg = genlmsg_new(msg_sz, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + hdr = genlmsg_put_reply(msg, info, &em_nl_family, 0, cmd); + if (!hdr) + goto out_free_msg; + + ret = __em_nl_get_pd_table(msg, pd); + if (ret) + goto out_free_msg; + + genlmsg_end(msg, hdr); + return genlmsg_reply(msg, info); + +out_free_msg: + nlmsg_free(msg); + return ret; +} + + +/**************************** Event encoding *********************************/ +static void __em_notify_pd_table(const struct em_perf_domain *pd, int ntf_type) +{ + struct sk_buff *msg; + int msg_sz, ret = -EMSGSIZE; + void *hdr; + + if (!genl_has_listeners(&em_nl_family, &init_net, EM_NLGRP_EVENT)) + return; + + msg_sz = __em_nl_get_pd_table_size(pd); + + msg = genlmsg_new(msg_sz, GFP_KERNEL); + if (!msg) + return; + + hdr = genlmsg_put(msg, 0, 0, &em_nl_family, 0, ntf_type); + if (!hdr) + goto out_free_msg; + + ret = __em_nl_get_pd_table(msg, pd); + if (ret) + goto out_free_msg; + + genlmsg_end(msg, hdr); + + genlmsg_multicast(&em_nl_family, msg, 0, EM_NLGRP_EVENT, GFP_KERNEL); + + return; + +out_free_msg: + nlmsg_free(msg); + return; +} + +void em_notify_pd_created(const struct em_perf_domain *pd) +{ + __em_notify_pd_table(pd, EM_CMD_PD_CREATED); +} + +void em_notify_pd_updated(const struct em_perf_domain *pd) +{ + __em_notify_pd_table(pd, EM_CMD_PD_UPDATED); +} + +static int __em_notify_pd_deleted_size(const struct em_perf_domain *pd) +{ + int id_sz = nla_total_size(sizeof(u32)); /* EM_A_PD_TABLE_PD_ID */ + + return nlmsg_total_size(genlmsg_msg_size(id_sz)); +} + +void em_notify_pd_deleted(const struct em_perf_domain *pd) +{ + struct sk_buff *msg; + void *hdr; + int msg_sz; + + if (!genl_has_listeners(&em_nl_family, &init_net, EM_NLGRP_EVENT)) + return; + + msg_sz = __em_notify_pd_deleted_size(pd); + + msg = genlmsg_new(msg_sz, GFP_KERNEL); + if (!msg) + return; + + hdr = genlmsg_put(msg, 0, 0, &em_nl_family, 0, EM_CMD_PD_DELETED); + if (!hdr) + goto out_free_msg; + + if (nla_put_u32(msg, EM_A_PD_TABLE_PD_ID, pd->id)) { + goto out_free_msg; + } + + genlmsg_end(msg, hdr); + + genlmsg_multicast(&em_nl_family, msg, 0, EM_NLGRP_EVENT, GFP_KERNEL); + + return; + +out_free_msg: + nlmsg_free(msg); + return; +} + +/**************************** Initialization *********************************/ +static int __init em_netlink_init(void) +{ + return genl_register_family(&em_nl_family); +} +postcore_initcall(em_netlink_init); diff --git a/kernel/power/em_netlink.h b/kernel/power/em_netlink.h new file mode 100644 index 000000000000..583d7f1c3939 --- /dev/null +++ b/kernel/power/em_netlink.h @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * + * Generic netlink for energy model. + * + * Copyright (c) 2025 Valve Corporation. + * Author: Changwoo Min <changwoo@igalia.com> + */ +#ifndef _EM_NETLINK_H +#define _EM_NETLINK_H + +#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_NET) +int for_each_em_perf_domain(int (*cb)(struct em_perf_domain*, void *), + void *data); +struct em_perf_domain *em_perf_domain_get_by_id(int id); +void em_notify_pd_created(const struct em_perf_domain *pd); +void em_notify_pd_deleted(const struct em_perf_domain *pd); +void em_notify_pd_updated(const struct em_perf_domain *pd); +#else +static inline +int for_each_em_perf_domain(int (*cb)(struct em_perf_domain*, void *), + void *data) +{ + return -EINVAL; +} +static inline +struct em_perf_domain *em_perf_domain_get_by_id(int id) +{ + return NULL; +} + +static inline void em_notify_pd_created(const struct em_perf_domain *pd) {} + +static inline void em_notify_pd_deleted(const struct em_perf_domain *pd) {} + +static inline void em_notify_pd_updated(const struct em_perf_domain *pd) {} +#endif + +#endif /* _EM_NETLINK_H */ diff --git a/kernel/power/em_netlink_autogen.c b/kernel/power/em_netlink_autogen.c new file mode 100644 index 000000000000..a7a09ab1d1c2 --- /dev/null +++ b/kernel/power/em_netlink_autogen.c @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/em.yaml */ +/* YNL-GEN kernel source */ + +#include <net/netlink.h> +#include <net/genetlink.h> + +#include "em_netlink_autogen.h" + +#include <uapi/linux/energy_model.h> + +/* EM_CMD_GET_PD_TABLE - do */ +static const struct nla_policy em_get_pd_table_nl_policy[EM_A_PD_TABLE_PD_ID + 1] = { + [EM_A_PD_TABLE_PD_ID] = { .type = NLA_U32, }, +}; + +/* Ops table for em */ +static const struct genl_split_ops em_nl_ops[] = { + { + .cmd = EM_CMD_GET_PDS, + .doit = em_nl_get_pds_doit, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = EM_CMD_GET_PD_TABLE, + .doit = em_nl_get_pd_table_doit, + .policy = em_get_pd_table_nl_policy, + .maxattr = EM_A_PD_TABLE_PD_ID, + .flags = GENL_CMD_CAP_DO, + }, +}; + +static const struct genl_multicast_group em_nl_mcgrps[] = { + [EM_NLGRP_EVENT] = { "event", }, +}; + +struct genl_family em_nl_family __ro_after_init = { + .name = EM_FAMILY_NAME, + .version = EM_FAMILY_VERSION, + .netnsok = true, + .parallel_ops = true, + .module = THIS_MODULE, + .split_ops = em_nl_ops, + .n_split_ops = ARRAY_SIZE(em_nl_ops), + .mcgrps = em_nl_mcgrps, + .n_mcgrps = ARRAY_SIZE(em_nl_mcgrps), +}; diff --git a/kernel/power/em_netlink_autogen.h b/kernel/power/em_netlink_autogen.h new file mode 100644 index 000000000000..78ce609641f1 --- /dev/null +++ b/kernel/power/em_netlink_autogen.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/em.yaml */ +/* YNL-GEN kernel header */ + +#ifndef _LINUX_EM_GEN_H +#define _LINUX_EM_GEN_H + +#include <net/netlink.h> +#include <net/genetlink.h> + +#include <uapi/linux/energy_model.h> + +int em_nl_get_pds_doit(struct sk_buff *skb, struct genl_info *info); +int em_nl_get_pd_table_doit(struct sk_buff *skb, struct genl_info *info); + +enum { + EM_NLGRP_EVENT, +}; + +extern struct genl_family em_nl_family; + +#endif /* _LINUX_EM_GEN_H */ diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index 5f17d2e8e954..11af9f64aa82 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -17,12 +17,24 @@ #include <linux/sched/topology.h> #include <linux/slab.h> +#include "em_netlink.h" + /* * Mutex serializing the registrations of performance domains and letting * callbacks defined by drivers sleep. */ static DEFINE_MUTEX(em_pd_mutex); +/* + * Manage performance domains with IDs. One can iterate the performance domains + * through the list and pick one with their associated ID. The mutex serializes + * the list access. When holding em_pd_list_mutex, em_pd_mutex should not be + * taken to avoid potential deadlock. + */ +static DEFINE_IDA(em_pd_ida); +static LIST_HEAD(em_pd_list); +static DEFINE_MUTEX(em_pd_list_mutex); + static void em_cpufreq_update_efficiencies(struct device *dev, struct em_perf_state *table); static void em_check_capacity_update(void); @@ -116,6 +128,16 @@ static int em_debug_flags_show(struct seq_file *s, void *unused) } DEFINE_SHOW_ATTRIBUTE(em_debug_flags); +static int em_debug_id_show(struct seq_file *s, void *unused) +{ + struct em_perf_domain *pd = s->private; + + seq_printf(s, "%d\n", pd->id); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(em_debug_id); + static void em_debug_create_pd(struct device *dev) { struct em_dbg_info *em_dbg; @@ -132,6 +154,8 @@ static void em_debug_create_pd(struct device *dev) debugfs_create_file("flags", 0444, d, dev->em_pd, &em_debug_flags_fops); + debugfs_create_file("id", 0444, d, dev->em_pd, &em_debug_id_fops); + em_dbg = devm_kcalloc(dev, dev->em_pd->nr_perf_states, sizeof(*em_dbg), GFP_KERNEL); if (!em_dbg) @@ -328,6 +352,8 @@ int em_dev_update_perf_domain(struct device *dev, em_table_free(old_table); mutex_unlock(&em_pd_mutex); + + em_notify_pd_updated(pd); return 0; } EXPORT_SYMBOL_GPL(em_dev_update_perf_domain); @@ -396,7 +422,7 @@ static int em_create_pd(struct device *dev, int nr_states, struct em_perf_table *em_table; struct em_perf_domain *pd; struct device *cpu_dev; - int cpu, ret, num_cpus; + int cpu, ret, num_cpus, id; if (_is_cpu_device(dev)) { num_cpus = cpumask_weight(cpus); @@ -420,6 +446,13 @@ static int em_create_pd(struct device *dev, int nr_states, pd->nr_perf_states = nr_states; + INIT_LIST_HEAD(&pd->node); + + id = ida_alloc(&em_pd_ida, GFP_KERNEL); + if (id < 0) + return -ENOMEM; + pd->id = id; + em_table = em_table_alloc(pd); if (!em_table) goto free_pd; @@ -444,6 +477,7 @@ free_pd_table: kfree(em_table); free_pd: kfree(pd); + ida_free(&em_pd_ida, id); return -EINVAL; } @@ -659,8 +693,16 @@ int em_dev_register_pd_no_update(struct device *dev, unsigned int nr_states, unlock: mutex_unlock(&em_pd_mutex); + if (ret) + return ret; - return ret; + mutex_lock(&em_pd_list_mutex); + list_add_tail(&dev->em_pd->node, &em_pd_list); + mutex_unlock(&em_pd_list_mutex); + + em_notify_pd_created(dev->em_pd); + + return 0; } EXPORT_SYMBOL_GPL(em_dev_register_pd_no_update); @@ -678,6 +720,12 @@ void em_dev_unregister_perf_domain(struct device *dev) if (_is_cpu_device(dev)) return; + mutex_lock(&em_pd_list_mutex); + list_del_init(&dev->em_pd->node); + mutex_unlock(&em_pd_list_mutex); + + em_notify_pd_deleted(dev->em_pd); + /* * The mutex separates all register/unregister requests and protects * from potential clean-up/setup issues in the debugfs directories. @@ -689,6 +737,8 @@ void em_dev_unregister_perf_domain(struct device *dev) em_table_free(rcu_dereference_protected(dev->em_pd->em_table, lockdep_is_held(&em_pd_mutex))); + ida_free(&em_pd_ida, dev->em_pd->id); + kfree(dev->em_pd); dev->em_pd = NULL; mutex_unlock(&em_pd_mutex); @@ -958,3 +1008,39 @@ void em_rebuild_sched_domains(void) */ schedule_work(&rebuild_sd_work); } + +#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_NET) +int for_each_em_perf_domain(int (*cb)(struct em_perf_domain*, void *), + void *data) +{ + struct em_perf_domain *pd; + + lockdep_assert_not_held(&em_pd_mutex); + guard(mutex)(&em_pd_list_mutex); + + list_for_each_entry(pd, &em_pd_list, node) { + int ret; + + ret = cb(pd, data); + if (ret) + return ret; + } + + return 0; +} + +struct em_perf_domain *em_perf_domain_get_by_id(int id) +{ + struct em_perf_domain *pd; + + lockdep_assert_not_held(&em_pd_mutex); + guard(mutex)(&em_pd_list_mutex); + + list_for_each_entry(pd, &em_pd_list, node) { + if (pd->id == id) + return pd; + } + + return NULL; +} +#endif diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 53166ef86ba4..af8d07bafe02 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -820,9 +820,11 @@ int hibernate(void) if (error) goto Restore; - ksys_sync_helper(); - if (filesystem_freeze_enabled) - filesystems_freeze(); + error = pm_sleep_fs_sync(); + if (error) + goto Notify; + + filesystems_freeze(filesystem_freeze_enabled); error = freeze_processes(); if (error) @@ -892,6 +894,7 @@ int hibernate(void) freezer_test_done = false; Exit: filesystems_thaw(); + Notify: pm_notifier_call_chain(PM_POST_HIBERNATION); Restore: pm_restore_console(); @@ -928,8 +931,7 @@ int hibernate_quiet_exec(int (*func)(void *data), void *data) if (error) goto restore; - if (filesystem_freeze_enabled) - filesystems_freeze(); + filesystems_freeze(filesystem_freeze_enabled); error = freeze_processes(); if (error) @@ -1079,8 +1081,7 @@ static int software_resume(void) if (error) goto Restore; - if (filesystem_freeze_enabled) - filesystems_freeze(); + filesystems_freeze(filesystem_freeze_enabled); pm_pr_dbg("Preparing processes for hibernation restore.\n"); error = freeze_processes(); diff --git a/kernel/power/main.c b/kernel/power/main.c index 549f51ca3a1e..03b2c5495c77 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -18,6 +18,8 @@ #include <linux/suspend.h> #include <linux/syscalls.h> #include <linux/pm_runtime.h> +#include <linux/atomic.h> +#include <linux/wait.h> #include "power.h" @@ -92,6 +94,61 @@ void ksys_sync_helper(void) } EXPORT_SYMBOL_GPL(ksys_sync_helper); +#if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATION) +/* Wakeup events handling resolution while syncing file systems in jiffies */ +#define PM_FS_SYNC_WAKEUP_RESOLUTION 5 + +static atomic_t pm_fs_sync_count = ATOMIC_INIT(0); +static struct workqueue_struct *pm_fs_sync_wq; +static DECLARE_WAIT_QUEUE_HEAD(pm_fs_sync_wait); + +static bool pm_fs_sync_completed(void) +{ + return atomic_read(&pm_fs_sync_count) == 0; +} + +static void pm_fs_sync_work_fn(struct work_struct *work) +{ + ksys_sync_helper(); + + if (atomic_dec_and_test(&pm_fs_sync_count)) + wake_up(&pm_fs_sync_wait); +} +static DECLARE_WORK(pm_fs_sync_work, pm_fs_sync_work_fn); + +/** + * pm_sleep_fs_sync() - Sync file systems in an interruptible way + * + * Return: 0 on successful file system sync, or -EBUSY if the file system sync + * was aborted. + */ +int pm_sleep_fs_sync(void) +{ + pm_wakeup_clear(0); + + /* + * Take back-to-back sleeps into account by queuing a subsequent fs sync + * only if the previous fs sync is running or is not queued. Multiple fs + * syncs increase the likelihood of saving the latest files immediately + * before sleep. + */ + if (!work_pending(&pm_fs_sync_work)) { + atomic_inc(&pm_fs_sync_count); + queue_work(pm_fs_sync_wq, &pm_fs_sync_work); + } + + while (!pm_fs_sync_completed()) { + if (pm_wakeup_pending()) + return -EBUSY; + + wait_event_timeout(pm_fs_sync_wait, pm_fs_sync_completed(), + PM_FS_SYNC_WAKEUP_RESOLUTION); + } + + return 0; +} +#endif /* CONFIG_SUSPEND || CONFIG_HIBERNATION */ + /* Routines for PM-transition notifications */ static BLOCKING_NOTIFIER_HEAD(pm_chain_head); @@ -231,10 +288,10 @@ static ssize_t mem_sleep_store(struct kobject *kobj, struct kobj_attribute *attr power_attr(mem_sleep); /* - * sync_on_suspend: invoke ksys_sync_helper() before suspend. + * sync_on_suspend: Sync file systems before suspend. * - * show() returns whether ksys_sync_helper() is invoked before suspend. - * store() accepts 0 or 1. 0 disables ksys_sync_helper() and 1 enables it. + * show() returns whether file systems sync before suspend is enabled. + * store() accepts 0 or 1. 0 disables file systems sync and 1 enables it. */ bool sync_on_suspend_enabled = !IS_ENABLED(CONFIG_SUSPEND_SKIP_SYNC); @@ -1066,16 +1123,26 @@ static const struct attribute_group *attr_groups[] = { struct workqueue_struct *pm_wq; EXPORT_SYMBOL_GPL(pm_wq); -static int __init pm_start_workqueue(void) +static int __init pm_start_workqueues(void) { - pm_wq = alloc_workqueue("pm", WQ_FREEZABLE, 0); + pm_wq = alloc_workqueue("pm", WQ_FREEZABLE | WQ_UNBOUND, 0); + if (!pm_wq) + return -ENOMEM; - return pm_wq ? 0 : -ENOMEM; +#if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATION) + pm_fs_sync_wq = alloc_ordered_workqueue("pm_fs_sync", 0); + if (!pm_fs_sync_wq) { + destroy_workqueue(pm_wq); + return -ENOMEM; + } +#endif + + return 0; } static int __init pm_init(void) { - int error = pm_start_workqueue(); + int error = pm_start_workqueues(); if (error) return error; hibernate_image_size_init(); diff --git a/kernel/power/power.h b/kernel/power/power.h index 7ccd709af93f..75b63843886e 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -19,6 +19,7 @@ struct swsusp_info { } __aligned(PAGE_SIZE); #if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATION) +extern int pm_sleep_fs_sync(void); extern bool filesystem_freeze_enabled; #endif diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 4244b069442e..f7d8064e9adc 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -415,6 +415,105 @@ static struct miscdevice cpu_latency_qos_miscdev = { .fops = &cpu_latency_qos_fops, }; +#ifdef CONFIG_PM_QOS_CPU_SYSTEM_WAKEUP +/* The CPU system wakeup latency QoS. */ +static struct pm_qos_constraints cpu_wakeup_latency_constraints = { + .list = PLIST_HEAD_INIT(cpu_wakeup_latency_constraints.list), + .target_value = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT, + .default_value = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT, + .no_constraint_value = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT, + .type = PM_QOS_MIN, +}; + +/** + * cpu_wakeup_latency_qos_limit - Current CPU system wakeup latency QoS limit. + * + * Returns the current CPU system wakeup latency QoS limit that may have been + * requested by user space. + */ +s32 cpu_wakeup_latency_qos_limit(void) +{ + return pm_qos_read_value(&cpu_wakeup_latency_constraints); +} + +static int cpu_wakeup_latency_qos_open(struct inode *inode, struct file *filp) +{ + struct pm_qos_request *req; + + req = kzalloc(sizeof(*req), GFP_KERNEL); + if (!req) + return -ENOMEM; + + req->qos = &cpu_wakeup_latency_constraints; + pm_qos_update_target(req->qos, &req->node, PM_QOS_ADD_REQ, + PM_QOS_RESUME_LATENCY_NO_CONSTRAINT); + filp->private_data = req; + + return 0; +} + +static int cpu_wakeup_latency_qos_release(struct inode *inode, + struct file *filp) +{ + struct pm_qos_request *req = filp->private_data; + + filp->private_data = NULL; + pm_qos_update_target(req->qos, &req->node, PM_QOS_REMOVE_REQ, + PM_QOS_RESUME_LATENCY_NO_CONSTRAINT); + kfree(req); + + return 0; +} + +static ssize_t cpu_wakeup_latency_qos_read(struct file *filp, char __user *buf, + size_t count, loff_t *f_pos) +{ + s32 value = pm_qos_read_value(&cpu_wakeup_latency_constraints); + + return simple_read_from_buffer(buf, count, f_pos, &value, sizeof(s32)); +} + +static ssize_t cpu_wakeup_latency_qos_write(struct file *filp, + const char __user *buf, + size_t count, loff_t *f_pos) +{ + struct pm_qos_request *req = filp->private_data; + s32 value; + + if (count == sizeof(s32)) { + if (copy_from_user(&value, buf, sizeof(s32))) + return -EFAULT; + } else { + int ret; + + ret = kstrtos32_from_user(buf, count, 16, &value); + if (ret) + return ret; + } + + if (value < 0) + return -EINVAL; + + pm_qos_update_target(req->qos, &req->node, PM_QOS_UPDATE_REQ, value); + + return count; +} + +static const struct file_operations cpu_wakeup_latency_qos_fops = { + .open = cpu_wakeup_latency_qos_open, + .release = cpu_wakeup_latency_qos_release, + .read = cpu_wakeup_latency_qos_read, + .write = cpu_wakeup_latency_qos_write, + .llseek = noop_llseek, +}; + +static struct miscdevice cpu_wakeup_latency_qos_miscdev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "cpu_wakeup_latency", + .fops = &cpu_wakeup_latency_qos_fops, +}; +#endif /* CONFIG_PM_QOS_CPU_SYSTEM_WAKEUP */ + static int __init cpu_latency_qos_init(void) { int ret; @@ -424,6 +523,13 @@ static int __init cpu_latency_qos_init(void) pr_err("%s: %s setup failed\n", __func__, cpu_latency_qos_miscdev.name); +#ifdef CONFIG_PM_QOS_CPU_SYSTEM_WAKEUP + ret = misc_register(&cpu_wakeup_latency_qos_miscdev); + if (ret < 0) + pr_err("%s: %s setup failed\n", __func__, + cpu_wakeup_latency_qos_miscdev.name); +#endif + return ret; } late_initcall(cpu_latency_qos_init); diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 645f42e40478..0a946932d5c1 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -2110,22 +2110,20 @@ asmlinkage __visible int swsusp_save(void) { unsigned int nr_pages, nr_highmem; - pr_info("Creating image:\n"); + pm_deferred_pr_dbg("Creating image\n"); drain_local_pages(NULL); nr_pages = count_data_pages(); nr_highmem = count_highmem_pages(); - pr_info("Need to copy %u pages\n", nr_pages + nr_highmem); + pm_deferred_pr_dbg("Need to copy %u pages\n", nr_pages + nr_highmem); if (!enough_free_mem(nr_pages, nr_highmem)) { - pr_err("Not enough free memory\n"); + pm_deferred_pr_dbg("Not enough free memory for image creation\n"); return -ENOMEM; } - if (swsusp_alloc(©_bm, nr_pages, nr_highmem)) { - pr_err("Memory allocation failed\n"); + if (swsusp_alloc(©_bm, nr_pages, nr_highmem)) return -ENOMEM; - } /* * During allocating of suspend pagedir, new cold pages may appear. @@ -2144,7 +2142,8 @@ asmlinkage __visible int swsusp_save(void) nr_zero_pages = nr_pages - nr_copy_pages; nr_meta_pages = DIV_ROUND_UP(nr_pages * sizeof(long), PAGE_SIZE); - pr_info("Image created (%d pages copied, %d zero pages)\n", nr_copy_pages, nr_zero_pages); + pm_deferred_pr_dbg("Image created (%d pages copied, %d zero pages)\n", + nr_copy_pages, nr_zero_pages); return 0; } diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index b4ca17c2fecf..2da4482bb6eb 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -344,10 +344,14 @@ MODULE_PARM_DESC(pm_test_delay, static int suspend_test(int level) { #ifdef CONFIG_PM_DEBUG + int i; + if (pm_test_level == level) { pr_info("suspend debug: Waiting for %d second(s).\n", pm_test_delay); - mdelay(pm_test_delay * 1000); + for (i = 0; i < pm_test_delay && !pm_wakeup_pending(); i++) + msleep(1000); + return 1; } #endif /* !CONFIG_PM_DEBUG */ @@ -375,8 +379,7 @@ static int suspend_prepare(suspend_state_t state) if (error) goto Restore; - if (filesystem_freeze_enabled) - filesystems_freeze(); + filesystems_freeze(filesystem_freeze_enabled); trace_suspend_resume(TPS("freeze_processes"), 0, true); error = suspend_freeze_processes(); trace_suspend_resume(TPS("freeze_processes"), 0, false); @@ -590,7 +593,11 @@ static int enter_state(suspend_state_t state) if (sync_on_suspend_enabled) { trace_suspend_resume(TPS("sync_filesystems"), 0, true); - ksys_sync_helper(); + + error = pm_sleep_fs_sync(); + if (error) + goto Unlock; + trace_suspend_resume(TPS("sync_filesystems"), 0, false); } diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 0beff7eeaaba..33a186373bef 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -46,19 +46,18 @@ static bool clean_pages_on_read; static bool clean_pages_on_decompress; /* - * The swap map is a data structure used for keeping track of each page - * written to a swap partition. It consists of many swap_map_page - * structures that contain each an array of MAP_PAGE_ENTRIES swap entries. - * These structures are stored on the swap and linked together with the - * help of the .next_swap member. + * The swap map is a data structure used for keeping track of each page + * written to a swap partition. It consists of many swap_map_page structures + * that contain each an array of MAP_PAGE_ENTRIES swap entries. These + * structures are stored on the swap and linked together with the help of the + * .next_swap member. * - * The swap map is created during suspend. The swap map pages are - * allocated and populated one at a time, so we only need one memory - * page to set up the entire structure. + * The swap map is created during suspend. The swap map pages are allocated and + * populated one at a time, so we only need one memory page to set up the entire + * structure. * - * During resume we pick up all swap_map_page structures into a list. + * During resume we pick up all swap_map_page structures into a list. */ - #define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1) /* @@ -89,10 +88,8 @@ struct swap_map_page_list { }; /* - * The swap_map_handle structure is used for handling swap in - * a file-alike way + * The swap_map_handle structure is used for handling swap in a file-alike way. */ - struct swap_map_handle { struct swap_map_page *cur; struct swap_map_page_list *maps; @@ -117,10 +114,9 @@ struct swsusp_header { static struct swsusp_header *swsusp_header; /* - * The following functions are used for tracing the allocated - * swap pages, so that they can be freed in case of an error. + * The following functions are used for tracing the allocated swap pages, so + * that they can be freed in case of an error. */ - struct swsusp_extent { struct rb_node node; unsigned long start; @@ -170,15 +166,14 @@ static int swsusp_extents_insert(unsigned long swap_offset) return 0; } -/* - * alloc_swapdev_block - allocate a swap page and register that it has - * been allocated, so that it can be freed in case of an error. - */ - sector_t alloc_swapdev_block(int swap) { unsigned long offset; + /* + * Allocate a swap page and register that it has been allocated, so that + * it can be freed in case of an error. + */ offset = swp_offset(get_swap_page_of_type(swap)); if (offset) { if (swsusp_extents_insert(offset)) @@ -189,16 +184,14 @@ sector_t alloc_swapdev_block(int swap) return 0; } -/* - * free_all_swap_pages - free swap pages allocated for saving image data. - * It also frees the extents used to register which swap entries had been - * allocated. - */ - void free_all_swap_pages(int swap) { struct rb_node *node; + /* + * Free swap pages allocated for saving image data. It also frees the + * extents used to register which swap entries had been allocated. + */ while ((node = swsusp_extents.rb_node)) { struct swsusp_extent *ext; @@ -303,6 +296,7 @@ static int hib_wait_io(struct hib_bio_batch *hb) /* * Saving part */ + static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags) { int error; @@ -336,16 +330,14 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags) */ unsigned int swsusp_header_flags; -/** - * swsusp_swap_check - check if the resume device is a swap device - * and get its index (if so) - * - * This is called before saving image - */ static int swsusp_swap_check(void) { int res; + /* + * Check if the resume device is a swap device and get its index (if so). + * This is called before saving the image. + */ if (swsusp_resume_device) res = swap_type_of(swsusp_resume_device, swsusp_resume_block); else @@ -362,13 +354,6 @@ static int swsusp_swap_check(void) return 0; } -/** - * write_page - Write one page to given swap location. - * @buf: Address we're writing. - * @offset: Offset of the swap page we're writing to. - * @hb: bio completion batch - */ - static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb) { gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY; @@ -519,17 +504,14 @@ static int swap_writer_finish(struct swap_map_handle *handle, CMP_HEADER, PAGE_SIZE) #define CMP_SIZE (CMP_PAGES * PAGE_SIZE) -/* Maximum number of threads for compression/decompression. */ -#define CMP_THREADS 3 +/* Default number of threads for compression/decompression. */ +#define CMP_THREADS 3 +static unsigned int hibernate_compression_threads = CMP_THREADS; /* Minimum/maximum number of pages for read buffering. */ #define CMP_MIN_RD_PAGES 1024 #define CMP_MAX_RD_PAGES 8192 -/** - * save_image - save the suspend image data - */ - static int save_image(struct swap_map_handle *handle, struct snapshot_handle *snapshot, unsigned int nr_to_write) @@ -585,13 +567,48 @@ struct crc_data { wait_queue_head_t go; /* start crc update */ wait_queue_head_t done; /* crc update done */ u32 *crc32; /* points to handle's crc32 */ - size_t *unc_len[CMP_THREADS]; /* uncompressed lengths */ - unsigned char *unc[CMP_THREADS]; /* uncompressed data */ + size_t **unc_len; /* uncompressed lengths */ + unsigned char **unc; /* uncompressed data */ }; -/* - * CRC32 update function that runs in its own thread. - */ +static struct crc_data *alloc_crc_data(int nr_threads) +{ + struct crc_data *crc; + + crc = kzalloc(sizeof(*crc), GFP_KERNEL); + if (!crc) + return NULL; + + crc->unc = kcalloc(nr_threads, sizeof(*crc->unc), GFP_KERNEL); + if (!crc->unc) + goto err_free_crc; + + crc->unc_len = kcalloc(nr_threads, sizeof(*crc->unc_len), GFP_KERNEL); + if (!crc->unc_len) + goto err_free_unc; + + return crc; + +err_free_unc: + kfree(crc->unc); +err_free_crc: + kfree(crc); + return NULL; +} + +static void free_crc_data(struct crc_data *crc) +{ + if (!crc) + return; + + if (crc->thr) + kthread_stop(crc->thr); + + kfree(crc->unc_len); + kfree(crc->unc); + kfree(crc); +} + static int crc32_threadfn(void *data) { struct crc_data *d = data; @@ -616,6 +633,7 @@ static int crc32_threadfn(void *data) } return 0; } + /* * Structure used for data compression. */ @@ -635,11 +653,8 @@ struct cmp_data { }; /* Indicates the image size after compression */ -static atomic_t compressed_size = ATOMIC_INIT(0); +static atomic64_t compressed_size = ATOMIC_INIT(0); -/* - * Compression function that runs in its own thread. - */ static int compress_threadfn(void *data) { struct cmp_data *d = data; @@ -664,19 +679,13 @@ static int compress_threadfn(void *data) d->ret = crypto_acomp_compress(d->cr); d->cmp_len = d->cr->dlen; - atomic_set(&compressed_size, atomic_read(&compressed_size) + d->cmp_len); + atomic64_add(d->cmp_len, &compressed_size); atomic_set_release(&d->stop, 1); wake_up(&d->done); } return 0; } -/** - * save_compressed_image - Save the suspend image data after compression. - * @handle: Swap map handle to use for saving the image. - * @snapshot: Image to read data from. - * @nr_to_write: Number of pages to save. - */ static int save_compressed_image(struct swap_map_handle *handle, struct snapshot_handle *snapshot, unsigned int nr_to_write) @@ -689,21 +698,21 @@ static int save_compressed_image(struct swap_map_handle *handle, ktime_t start; ktime_t stop; size_t off; - unsigned thr, run_threads, nr_threads; + unsigned int thr, run_threads, nr_threads; unsigned char *page = NULL; struct cmp_data *data = NULL; struct crc_data *crc = NULL; hib_init_batch(&hb); - atomic_set(&compressed_size, 0); + atomic64_set(&compressed_size, 0); /* * We'll limit the number of threads for compression to limit memory * footprint. */ nr_threads = num_online_cpus() - 1; - nr_threads = clamp_val(nr_threads, 1, CMP_THREADS); + nr_threads = clamp_val(nr_threads, 1, hibernate_compression_threads); page = (void *)__get_free_page(GFP_NOIO | __GFP_HIGH); if (!page) { @@ -719,7 +728,7 @@ static int save_compressed_image(struct swap_map_handle *handle, goto out_clean; } - crc = kzalloc(sizeof(*crc), GFP_KERNEL); + crc = alloc_crc_data(nr_threads); if (!crc) { pr_err("Failed to allocate crc\n"); ret = -ENOMEM; @@ -877,19 +886,18 @@ out_finish: stop = ktime_get(); if (!ret) ret = err2; - if (!ret) + if (!ret) { + swsusp_show_speed(start, stop, nr_to_write, "Wrote"); + pr_info("Image size after compression: %lld kbytes\n", + (atomic64_read(&compressed_size) / 1024)); pr_info("Image saving done\n"); - swsusp_show_speed(start, stop, nr_to_write, "Wrote"); - pr_info("Image size after compression: %d kbytes\n", - (atomic_read(&compressed_size) / 1024)); + } else { + pr_err("Image saving failed: %d\n", ret); + } out_clean: hib_finish_batch(&hb); - if (crc) { - if (crc->thr) - kthread_stop(crc->thr); - kfree(crc); - } + free_crc_data(crc); if (data) { for (thr = 0; thr < nr_threads; thr++) { if (data[thr].thr) @@ -899,18 +907,12 @@ out_clean: } vfree(data); } - if (page) free_page((unsigned long)page); + if (page) + free_page((unsigned long)page); return ret; } -/** - * enough_swap - Make sure we have enough swap to save the image. - * - * Returns TRUE or FALSE after checking the total amount of swap - * space available from the resume partition. - */ - static int enough_swap(unsigned int nr_pages) { unsigned int free_swap = count_swap_pages(root_swap, 1); @@ -923,15 +925,16 @@ static int enough_swap(unsigned int nr_pages) } /** - * swsusp_write - Write entire image and metadata. - * @flags: flags to pass to the "boot" kernel in the image header + * swsusp_write - Write entire image and metadata. + * @flags: flags to pass to the "boot" kernel in the image header + * + * It is important _NOT_ to umount filesystems at this point. We want them + * synced (in case something goes wrong) but we DO not want to mark filesystem + * clean: it is not. (And it does not matter, if we resume correctly, we'll mark + * system clean, anyway.) * - * It is important _NOT_ to umount filesystems at this point. We want - * them synced (in case something goes wrong) but we DO not want to mark - * filesystem clean: it is not. (And it does not matter, if we resume - * correctly, we'll mark system clean, anyway.) + * Return: 0 on success, negative error code on failure. */ - int swsusp_write(unsigned int flags) { struct swap_map_handle handle; @@ -974,8 +977,8 @@ out_finish: } /* - * The following functions allow us to read data using a swap map - * in a file-like way. + * The following functions allow us to read data using a swap map in a file-like + * way. */ static void release_swap_reader(struct swap_map_handle *handle) @@ -1077,12 +1080,6 @@ static int swap_reader_finish(struct swap_map_handle *handle) return 0; } -/** - * load_image - load the image using the swap map handle - * @handle and the snapshot handle @snapshot - * (assume there are @nr_pages pages to load) - */ - static int load_image(struct swap_map_handle *handle, struct snapshot_handle *snapshot, unsigned int nr_to_read) @@ -1153,9 +1150,6 @@ struct dec_data { unsigned char cmp[CMP_SIZE]; /* compressed buffer */ }; -/* - * Decompression function that runs in its own thread. - */ static int decompress_threadfn(void *data) { struct dec_data *d = data; @@ -1190,12 +1184,6 @@ static int decompress_threadfn(void *data) return 0; } -/** - * load_compressed_image - Load compressed image data and decompress it. - * @handle: Swap map handle to use for loading data. - * @snapshot: Image to copy uncompressed data into. - * @nr_to_read: Number of pages to load. - */ static int load_compressed_image(struct swap_map_handle *handle, struct snapshot_handle *snapshot, unsigned int nr_to_read) @@ -1223,7 +1211,7 @@ static int load_compressed_image(struct swap_map_handle *handle, * footprint. */ nr_threads = num_online_cpus() - 1; - nr_threads = clamp_val(nr_threads, 1, CMP_THREADS); + nr_threads = clamp_val(nr_threads, 1, hibernate_compression_threads); page = vmalloc_array(CMP_MAX_RD_PAGES, sizeof(*page)); if (!page) { @@ -1239,7 +1227,7 @@ static int load_compressed_image(struct swap_map_handle *handle, goto out_clean; } - crc = kzalloc(sizeof(*crc), GFP_KERNEL); + crc = alloc_crc_data(nr_threads); if (!crc) { pr_err("Failed to allocate crc\n"); ret = -ENOMEM; @@ -1506,11 +1494,7 @@ out_clean: hib_finish_batch(&hb); for (i = 0; i < ring_size; i++) free_page((unsigned long)page[i]); - if (crc) { - if (crc->thr) - kthread_stop(crc->thr); - kfree(crc); - } + free_crc_data(crc); if (data) { for (thr = 0; thr < nr_threads; thr++) { if (data[thr].thr) @@ -1529,8 +1513,9 @@ out_clean: * swsusp_read - read the hibernation image. * @flags_p: flags passed by the "frozen" kernel in the image header should * be written into this memory location + * + * Return: 0 on success, negative error code on failure. */ - int swsusp_read(unsigned int *flags_p) { int error; @@ -1567,8 +1552,9 @@ static void *swsusp_holder; /** * swsusp_check - Open the resume device and check for the swsusp signature. * @exclusive: Open the resume device exclusively. + * + * Return: 0 if a valid image is found, negative error code otherwise. */ - int swsusp_check(bool exclusive) { void *holder = exclusive ? &swsusp_holder : NULL; @@ -1618,7 +1604,6 @@ put: /** * swsusp_close - close resume device. */ - void swsusp_close(void) { if (IS_ERR(hib_resume_bdev_file)) { @@ -1630,9 +1615,10 @@ void swsusp_close(void) } /** - * swsusp_unmark - Unmark swsusp signature in the resume device + * swsusp_unmark - Unmark swsusp signature in the resume device + * + * Return: 0 on success, negative error code on failure. */ - #ifdef CONFIG_SUSPEND int swsusp_unmark(void) { @@ -1658,8 +1644,46 @@ int swsusp_unmark(void) } #endif +static ssize_t hibernate_compression_threads_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%d\n", hibernate_compression_threads); +} + +static ssize_t hibernate_compression_threads_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t n) +{ + unsigned long val; + + if (kstrtoul(buf, 0, &val)) + return -EINVAL; + + if (val < 1) + return -EINVAL; + + hibernate_compression_threads = val; + return n; +} +power_attr(hibernate_compression_threads); + +static struct attribute *g[] = { + &hibernate_compression_threads_attr.attr, + NULL, +}; + +static const struct attribute_group attr_group = { + .attrs = g, +}; + static int __init swsusp_header_init(void) { + int error; + + error = sysfs_create_group(power_kobj, &attr_group); + if (error) + return -ENOMEM; + swsusp_header = (struct swsusp_header*) __get_free_page(GFP_KERNEL); if (!swsusp_header) panic("Could not allocate memory for swsusp_header\n"); @@ -1667,3 +1691,19 @@ static int __init swsusp_header_init(void) } core_initcall(swsusp_header_init); + +static int __init hibernate_compression_threads_setup(char *str) +{ + int rc = kstrtouint(str, 0, &hibernate_compression_threads); + + if (rc) + return rc; + + if (hibernate_compression_threads < 1) + hibernate_compression_threads = CMP_THREADS; + + return 1; + +} + +__setup("hibernate_compression_threads=", hibernate_compression_threads_setup); diff --git a/kernel/power/user.c b/kernel/power/user.c index 3f9e3efb9f6e..4401cfe26e5c 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -278,7 +278,9 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, if (data->frozen) break; - ksys_sync_helper(); + error = pm_sleep_fs_sync(); + if (error) + break; error = freeze_processes(); if (error) diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 75a84efad40f..392ec2f75f01 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -793,9 +793,9 @@ static long ptrace_get_rseq_configuration(struct task_struct *task, unsigned long size, void __user *data) { struct ptrace_rseq_configuration conf = { - .rseq_abi_pointer = (u64)(uintptr_t)task->rseq, - .rseq_abi_size = task->rseq_len, - .signature = task->rseq_sig, + .rseq_abi_pointer = (u64)(uintptr_t)task->rseq.usrptr, + .rseq_abi_size = task->rseq.len, + .signature = task->rseq.sig, .flags = 0, }; diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index c1ebfd51768b..585cade21010 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -70,12 +70,10 @@ void rcu_qs(void) */ void rcu_sched_clock_irq(int user) { - if (user) { + if (user) rcu_qs(); - } else if (rcu_ctrlblk.donetail != rcu_ctrlblk.curtail) { - set_tsk_need_resched(current); - set_preempt_need_resched(); - } + else if (rcu_ctrlblk.donetail != rcu_ctrlblk.curtail) + set_need_resched_current(); } /* diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 8293bae1dec1..85b82a7007b9 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2696,10 +2696,8 @@ void rcu_sched_clock_irq(int user) /* The load-acquire pairs with the store-release setting to true. */ if (smp_load_acquire(this_cpu_ptr(&rcu_data.rcu_urgent_qs))) { /* Idle and userspace execution already are quiescent states. */ - if (!rcu_is_cpu_rrupt_from_idle() && !user) { - set_tsk_need_resched(current); - set_preempt_need_resched(); - } + if (!rcu_is_cpu_rrupt_from_idle() && !user) + set_need_resched_current(); __this_cpu_write(rcu_data.rcu_urgent_qs, false); } rcu_flavor_sched_clock_irq(user); @@ -2824,7 +2822,6 @@ static void strict_work_handler(struct work_struct *work) /* Perform RCU core processing work for the current CPU. */ static __latent_entropy void rcu_core(void) { - unsigned long flags; struct rcu_data *rdp = raw_cpu_ptr(&rcu_data); struct rcu_node *rnp = rdp->mynode; @@ -2837,8 +2834,8 @@ static __latent_entropy void rcu_core(void) if (IS_ENABLED(CONFIG_PREEMPT_COUNT) && (!(preempt_count() & PREEMPT_MASK))) { rcu_preempt_deferred_qs(current); } else if (rcu_preempt_need_deferred_qs(current)) { - set_tsk_need_resched(current); - set_preempt_need_resched(); + guard(irqsave)(); + set_need_resched_current(); } /* Update RCU state based on any recent quiescent states. */ @@ -2847,10 +2844,9 @@ static __latent_entropy void rcu_core(void) /* No grace period and unregistered callbacks? */ if (!rcu_gp_in_progress() && rcu_segcblist_is_enabled(&rdp->cblist) && !rcu_rdp_is_offloaded(rdp)) { - local_irq_save(flags); + guard(irqsave)(); if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) rcu_accelerate_cbs_unlocked(rnp, rdp); - local_irq_restore(flags); } rcu_check_gp_start_stall(rnp, rdp, rcu_jiffies_till_stall_check()); diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 6058a734090c..96c49c56fc14 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -729,8 +729,7 @@ static void rcu_exp_need_qs(void) __this_cpu_write(rcu_data.cpu_no_qs.b.exp, true); /* Store .exp before .rcu_urgent_qs. */ smp_store_release(this_cpu_ptr(&rcu_data.rcu_urgent_qs), true); - set_tsk_need_resched(current); - set_preempt_need_resched(); + set_need_resched_current(); } #ifdef CONFIG_PREEMPT_RCU diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index d85763336b3c..dbe2d02be824 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -753,8 +753,7 @@ static void rcu_read_unlock_special(struct task_struct *t) // Also if no expediting and no possible deboosting, // slow is OK. Plus nohz_full CPUs eventually get // tick enabled. - set_tsk_need_resched(current); - set_preempt_need_resched(); + set_need_resched_current(); if (IS_ENABLED(CONFIG_IRQ_WORK) && irqs_were_disabled && needs_exp && rdp->defer_qs_iw_pending != DEFER_QS_PENDING && cpu_online(rdp->cpu)) { @@ -813,10 +812,8 @@ static void rcu_flavor_sched_clock_irq(int user) if (rcu_preempt_depth() > 0 || (preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK))) { /* No QS, force context switch if deferred. */ - if (rcu_preempt_need_deferred_qs(t)) { - set_tsk_need_resched(t); - set_preempt_need_resched(); - } + if (rcu_preempt_need_deferred_qs(t)) + set_need_resched_current(); } else if (rcu_preempt_need_deferred_qs(t)) { rcu_preempt_deferred_qs(t); /* Report deferred QS. */ return; diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index d16afeb11506..b67532cb8770 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -763,8 +763,7 @@ static void print_cpu_stall(unsigned long gp_seq, unsigned long gps) * progress and it could be we're stuck in kernel space without context * switches for an entirely unreasonable amount of time. */ - set_tsk_need_resched(current); - set_preempt_need_resched(); + set_need_resched_current(); } static bool csd_lock_suppress_rcu_stall; diff --git a/kernel/rseq.c b/kernel/rseq.c index 2452b7366b00..395d8b002350 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -8,98 +8,7 @@ * Mathieu Desnoyers <mathieu.desnoyers@efficios.com> */ -#include <linux/sched.h> -#include <linux/uaccess.h> -#include <linux/syscalls.h> -#include <linux/rseq.h> -#include <linux/types.h> -#include <linux/ratelimit.h> -#include <asm/ptrace.h> - -#define CREATE_TRACE_POINTS -#include <trace/events/rseq.h> - -/* The original rseq structure size (including padding) is 32 bytes. */ -#define ORIG_RSEQ_SIZE 32 - -#define RSEQ_CS_NO_RESTART_FLAGS (RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT | \ - RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL | \ - RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE) - -#ifdef CONFIG_DEBUG_RSEQ -static struct rseq *rseq_kernel_fields(struct task_struct *t) -{ - return (struct rseq *) t->rseq_fields; -} - -static int rseq_validate_ro_fields(struct task_struct *t) -{ - static DEFINE_RATELIMIT_STATE(_rs, - DEFAULT_RATELIMIT_INTERVAL, - DEFAULT_RATELIMIT_BURST); - u32 cpu_id_start, cpu_id, node_id, mm_cid; - struct rseq __user *rseq = t->rseq; - - /* - * Validate fields which are required to be read-only by - * user-space. - */ - if (!user_read_access_begin(rseq, t->rseq_len)) - goto efault; - unsafe_get_user(cpu_id_start, &rseq->cpu_id_start, efault_end); - unsafe_get_user(cpu_id, &rseq->cpu_id, efault_end); - unsafe_get_user(node_id, &rseq->node_id, efault_end); - unsafe_get_user(mm_cid, &rseq->mm_cid, efault_end); - user_read_access_end(); - - if ((cpu_id_start != rseq_kernel_fields(t)->cpu_id_start || - cpu_id != rseq_kernel_fields(t)->cpu_id || - node_id != rseq_kernel_fields(t)->node_id || - mm_cid != rseq_kernel_fields(t)->mm_cid) && __ratelimit(&_rs)) { - - pr_warn("Detected rseq corruption for pid: %d, name: %s\n" - "\tcpu_id_start: %u ?= %u\n" - "\tcpu_id: %u ?= %u\n" - "\tnode_id: %u ?= %u\n" - "\tmm_cid: %u ?= %u\n", - t->pid, t->comm, - cpu_id_start, rseq_kernel_fields(t)->cpu_id_start, - cpu_id, rseq_kernel_fields(t)->cpu_id, - node_id, rseq_kernel_fields(t)->node_id, - mm_cid, rseq_kernel_fields(t)->mm_cid); - } - - /* For now, only print a console warning on mismatch. */ - return 0; - -efault_end: - user_read_access_end(); -efault: - return -EFAULT; -} - -/* - * Update an rseq field and its in-kernel copy in lock-step to keep a coherent - * state. - */ -#define rseq_unsafe_put_user(t, value, field, error_label) \ - do { \ - unsafe_put_user(value, &t->rseq->field, error_label); \ - rseq_kernel_fields(t)->field = value; \ - } while (0) - -#else -static int rseq_validate_ro_fields(struct task_struct *t) -{ - return 0; -} - -#define rseq_unsafe_put_user(t, value, field, error_label) \ - unsafe_put_user(value, &t->rseq->field, error_label) -#endif - /* - * * Restartable sequences are a lightweight interface that allows * user-level code to be executed atomically relative to scheduler * preemption and signal delivery. Typically used for implementing @@ -158,356 +67,356 @@ static int rseq_validate_ro_fields(struct task_struct *t) * F1. <failure> */ -static int rseq_update_cpu_node_id(struct task_struct *t) -{ - struct rseq __user *rseq = t->rseq; - u32 cpu_id = raw_smp_processor_id(); - u32 node_id = cpu_to_node(cpu_id); - u32 mm_cid = task_mm_cid(t); +/* Required to select the proper per_cpu ops for rseq_stats_inc() */ +#define RSEQ_BUILD_SLOW_PATH - /* - * Validate read-only rseq fields. - */ - if (rseq_validate_ro_fields(t)) - goto efault; - WARN_ON_ONCE((int) mm_cid < 0); - if (!user_write_access_begin(rseq, t->rseq_len)) - goto efault; +#include <linux/debugfs.h> +#include <linux/ratelimit.h> +#include <linux/rseq_entry.h> +#include <linux/sched.h> +#include <linux/syscalls.h> +#include <linux/uaccess.h> +#include <linux/types.h> +#include <asm/ptrace.h> - rseq_unsafe_put_user(t, cpu_id, cpu_id_start, efault_end); - rseq_unsafe_put_user(t, cpu_id, cpu_id, efault_end); - rseq_unsafe_put_user(t, node_id, node_id, efault_end); - rseq_unsafe_put_user(t, mm_cid, mm_cid, efault_end); +#define CREATE_TRACE_POINTS +#include <trace/events/rseq.h> - /* - * Additional feature fields added after ORIG_RSEQ_SIZE - * need to be conditionally updated only if - * t->rseq_len != ORIG_RSEQ_SIZE. - */ - user_write_access_end(); - trace_rseq_update(t); - return 0; +DEFINE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled); -efault_end: - user_write_access_end(); -efault: - return -EFAULT; +static inline void rseq_control_debug(bool on) +{ + if (on) + static_branch_enable(&rseq_debug_enabled); + else + static_branch_disable(&rseq_debug_enabled); } -static int rseq_reset_rseq_cpu_node_id(struct task_struct *t) +static int __init rseq_setup_debug(char *str) { - struct rseq __user *rseq = t->rseq; - u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED, node_id = 0, - mm_cid = 0; - - /* - * Validate read-only rseq fields. - */ - if (rseq_validate_ro_fields(t)) - goto efault; + bool on; - if (!user_write_access_begin(rseq, t->rseq_len)) - goto efault; - - /* - * Reset all fields to their initial state. - * - * All fields have an initial state of 0 except cpu_id which is set to - * RSEQ_CPU_ID_UNINITIALIZED, so that any user coming in after - * unregistration can figure out that rseq needs to be registered - * again. - */ - rseq_unsafe_put_user(t, cpu_id_start, cpu_id_start, efault_end); - rseq_unsafe_put_user(t, cpu_id, cpu_id, efault_end); - rseq_unsafe_put_user(t, node_id, node_id, efault_end); - rseq_unsafe_put_user(t, mm_cid, mm_cid, efault_end); - - /* - * Additional feature fields added after ORIG_RSEQ_SIZE - * need to be conditionally reset only if - * t->rseq_len != ORIG_RSEQ_SIZE. - */ - user_write_access_end(); - return 0; - -efault_end: - user_write_access_end(); -efault: - return -EFAULT; + if (kstrtobool(str, &on)) + return -EINVAL; + rseq_control_debug(on); + return 1; } +__setup("rseq_debug=", rseq_setup_debug); +#ifdef CONFIG_TRACEPOINTS /* - * Get the user-space pointer value stored in the 'rseq_cs' field. + * Out of line, so the actual update functions can be in a header to be + * inlined into the exit to user code. */ -static int rseq_get_rseq_cs_ptr_val(struct rseq __user *rseq, u64 *rseq_cs) +void __rseq_trace_update(struct task_struct *t) { - if (!rseq_cs) - return -EFAULT; - -#ifdef CONFIG_64BIT - if (get_user(*rseq_cs, &rseq->rseq_cs)) - return -EFAULT; -#else - if (copy_from_user(rseq_cs, &rseq->rseq_cs, sizeof(*rseq_cs))) - return -EFAULT; -#endif + trace_rseq_update(t); +} - return 0; +void __rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip, + unsigned long offset, unsigned long abort_ip) +{ + trace_rseq_ip_fixup(ip, start_ip, offset, abort_ip); } +#endif /* CONFIG_TRACEPOINTS */ -/* - * If the rseq_cs field of 'struct rseq' contains a valid pointer to - * user-space, copy 'struct rseq_cs' from user-space and validate its fields. - */ -static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs) +#ifdef CONFIG_DEBUG_FS +#ifdef CONFIG_RSEQ_STATS +DEFINE_PER_CPU(struct rseq_stats, rseq_stats); + +static int rseq_stats_show(struct seq_file *m, void *p) { - struct rseq_cs __user *urseq_cs; - u64 ptr; - u32 __user *usig; - u32 sig; - int ret; - - ret = rseq_get_rseq_cs_ptr_val(t->rseq, &ptr); - if (ret) - return ret; - - /* If the rseq_cs pointer is NULL, return a cleared struct rseq_cs. */ - if (!ptr) { - memset(rseq_cs, 0, sizeof(*rseq_cs)); - return 0; + struct rseq_stats stats = { }; + unsigned int cpu; + + for_each_possible_cpu(cpu) { + stats.exit += data_race(per_cpu(rseq_stats.exit, cpu)); + stats.signal += data_race(per_cpu(rseq_stats.signal, cpu)); + stats.slowpath += data_race(per_cpu(rseq_stats.slowpath, cpu)); + stats.fastpath += data_race(per_cpu(rseq_stats.fastpath, cpu)); + stats.ids += data_race(per_cpu(rseq_stats.ids, cpu)); + stats.cs += data_race(per_cpu(rseq_stats.cs, cpu)); + stats.clear += data_race(per_cpu(rseq_stats.clear, cpu)); + stats.fixup += data_race(per_cpu(rseq_stats.fixup, cpu)); } - /* Check that the pointer value fits in the user-space process space. */ - if (ptr >= TASK_SIZE) - return -EINVAL; - urseq_cs = (struct rseq_cs __user *)(unsigned long)ptr; - if (copy_from_user(rseq_cs, urseq_cs, sizeof(*rseq_cs))) - return -EFAULT; - if (rseq_cs->start_ip >= TASK_SIZE || - rseq_cs->start_ip + rseq_cs->post_commit_offset >= TASK_SIZE || - rseq_cs->abort_ip >= TASK_SIZE || - rseq_cs->version > 0) - return -EINVAL; - /* Check for overflow. */ - if (rseq_cs->start_ip + rseq_cs->post_commit_offset < rseq_cs->start_ip) - return -EINVAL; - /* Ensure that abort_ip is not in the critical section. */ - if (rseq_cs->abort_ip - rseq_cs->start_ip < rseq_cs->post_commit_offset) - return -EINVAL; + seq_printf(m, "exit: %16lu\n", stats.exit); + seq_printf(m, "signal: %16lu\n", stats.signal); + seq_printf(m, "slowp: %16lu\n", stats.slowpath); + seq_printf(m, "fastp: %16lu\n", stats.fastpath); + seq_printf(m, "ids: %16lu\n", stats.ids); + seq_printf(m, "cs: %16lu\n", stats.cs); + seq_printf(m, "clear: %16lu\n", stats.clear); + seq_printf(m, "fixup: %16lu\n", stats.fixup); + return 0; +} - usig = (u32 __user *)(unsigned long)(rseq_cs->abort_ip - sizeof(u32)); - ret = get_user(sig, usig); - if (ret) - return ret; +static int rseq_stats_open(struct inode *inode, struct file *file) +{ + return single_open(file, rseq_stats_show, inode->i_private); +} - if (current->rseq_sig != sig) { - printk_ratelimited(KERN_WARNING - "Possible attack attempt. Unexpected rseq signature 0x%x, expecting 0x%x (pid=%d, addr=%p).\n", - sig, current->rseq_sig, current->pid, usig); - return -EINVAL; - } +static const struct file_operations stat_ops = { + .open = rseq_stats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init rseq_stats_init(struct dentry *root_dir) +{ + debugfs_create_file("stats", 0444, root_dir, NULL, &stat_ops); return 0; } +#else +static inline void rseq_stats_init(struct dentry *root_dir) { } +#endif /* CONFIG_RSEQ_STATS */ -static bool rseq_warn_flags(const char *str, u32 flags) +static int rseq_debug_show(struct seq_file *m, void *p) { - u32 test_flags; - - if (!flags) - return false; - test_flags = flags & RSEQ_CS_NO_RESTART_FLAGS; - if (test_flags) - pr_warn_once("Deprecated flags (%u) in %s ABI structure", test_flags, str); - test_flags = flags & ~RSEQ_CS_NO_RESTART_FLAGS; - if (test_flags) - pr_warn_once("Unknown flags (%u) in %s ABI structure", test_flags, str); - return true; + bool on = static_branch_unlikely(&rseq_debug_enabled); + + seq_printf(m, "%d\n", on); + return 0; } -static int rseq_need_restart(struct task_struct *t, u32 cs_flags) +static ssize_t rseq_debug_write(struct file *file, const char __user *ubuf, + size_t count, loff_t *ppos) { - u32 flags, event_mask; - int ret; + bool on; - if (rseq_warn_flags("rseq_cs", cs_flags)) + if (kstrtobool_from_user(ubuf, count, &on)) return -EINVAL; - /* Get thread flags. */ - ret = get_user(flags, &t->rseq->flags); - if (ret) - return ret; + rseq_control_debug(on); + return count; +} - if (rseq_warn_flags("rseq", flags)) - return -EINVAL; +static int rseq_debug_open(struct inode *inode, struct file *file) +{ + return single_open(file, rseq_debug_show, inode->i_private); +} - /* - * Load and clear event mask atomically with respect to - * scheduler preemption and membarrier IPIs. - */ - scoped_guard(RSEQ_EVENT_GUARD) { - event_mask = t->rseq_event_mask; - t->rseq_event_mask = 0; - } +static const struct file_operations debug_ops = { + .open = rseq_debug_open, + .read = seq_read, + .write = rseq_debug_write, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init rseq_debugfs_init(void) +{ + struct dentry *root_dir = debugfs_create_dir("rseq", NULL); - return !!event_mask; + debugfs_create_file("debug", 0644, root_dir, NULL, &debug_ops); + rseq_stats_init(root_dir); + return 0; } +__initcall(rseq_debugfs_init); +#endif /* CONFIG_DEBUG_FS */ -static int clear_rseq_cs(struct rseq __user *rseq) +static bool rseq_set_ids(struct task_struct *t, struct rseq_ids *ids, u32 node_id) { - /* - * The rseq_cs field is set to NULL on preemption or signal - * delivery on top of rseq assembly block, as well as on top - * of code outside of the rseq assembly block. This performs - * a lazy clear of the rseq_cs field. - * - * Set rseq_cs to NULL. - */ -#ifdef CONFIG_64BIT - return put_user(0UL, &rseq->rseq_cs); -#else - if (clear_user(&rseq->rseq_cs, sizeof(rseq->rseq_cs))) - return -EFAULT; - return 0; -#endif + return rseq_set_ids_get_csaddr(t, ids, node_id, NULL); } -/* - * Unsigned comparison will be true when ip >= start_ip, and when - * ip < start_ip + post_commit_offset. - */ -static bool in_rseq_cs(unsigned long ip, struct rseq_cs *rseq_cs) +static bool rseq_handle_cs(struct task_struct *t, struct pt_regs *regs) { - return ip - rseq_cs->start_ip < rseq_cs->post_commit_offset; + struct rseq __user *urseq = t->rseq.usrptr; + u64 csaddr; + + scoped_user_read_access(urseq, efault) + unsafe_get_user(csaddr, &urseq->rseq_cs, efault); + if (likely(!csaddr)) + return true; + return rseq_update_user_cs(t, regs, csaddr); +efault: + return false; } -static int rseq_ip_fixup(struct pt_regs *regs) +static void rseq_slowpath_update_usr(struct pt_regs *regs) { - unsigned long ip = instruction_pointer(regs); + /* + * Preserve rseq state and user_irq state. The generic entry code + * clears user_irq on the way out, the non-generic entry + * architectures are not having user_irq. + */ + const struct rseq_event evt_mask = { .has_rseq = true, .user_irq = true, }; struct task_struct *t = current; - struct rseq_cs rseq_cs; - int ret; + struct rseq_ids ids; + u32 node_id; + bool event; + + if (unlikely(t->flags & PF_EXITING)) + return; - ret = rseq_get_rseq_cs(t, &rseq_cs); - if (ret) - return ret; + rseq_stat_inc(rseq_stats.slowpath); /* - * Handle potentially not being within a critical section. - * If not nested over a rseq critical section, restart is useless. - * Clear the rseq_cs pointer and return. + * Read and clear the event pending bit first. If the task + * was not preempted or migrated or a signal is on the way, + * there is no point in doing any of the heavy lifting here + * on production kernels. In that case TIF_NOTIFY_RESUME + * was raised by some other functionality. + * + * This is correct because the read/clear operation is + * guarded against scheduler preemption, which makes it CPU + * local atomic. If the task is preempted right after + * re-enabling preemption then TIF_NOTIFY_RESUME is set + * again and this function is invoked another time _before_ + * the task is able to return to user mode. + * + * On a debug kernel, invoke the fixup code unconditionally + * with the result handed in to allow the detection of + * inconsistencies. */ - if (!in_rseq_cs(ip, &rseq_cs)) - return clear_rseq_cs(t->rseq); - ret = rseq_need_restart(t, rseq_cs.flags); - if (ret <= 0) - return ret; - ret = clear_rseq_cs(t->rseq); - if (ret) - return ret; - trace_rseq_ip_fixup(ip, rseq_cs.start_ip, rseq_cs.post_commit_offset, - rseq_cs.abort_ip); - instruction_pointer_set(regs, (unsigned long)rseq_cs.abort_ip); - return 0; + scoped_guard(irq) { + event = t->rseq.event.sched_switch; + t->rseq.event.all &= evt_mask.all; + ids.cpu_id = task_cpu(t); + ids.mm_cid = task_mm_cid(t); + } + + if (!event) + return; + + node_id = cpu_to_node(ids.cpu_id); + + if (unlikely(!rseq_update_usr(t, regs, &ids, node_id))) { + /* + * Clear the errors just in case this might survive magically, but + * leave the rest intact. + */ + t->rseq.event.error = 0; + force_sig(SIGSEGV); + } } -/* - * This resume handler must always be executed between any of: - * - preemption, - * - signal delivery, - * and return to user-space. - * - * This is how we can ensure that the entire rseq critical section - * will issue the commit instruction only if executed atomically with - * respect to other threads scheduled on the same CPU, and with respect - * to signal handlers. - */ -void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) +void __rseq_handle_slowpath(struct pt_regs *regs) { - struct task_struct *t = current; - int ret, sig; - - if (unlikely(t->flags & PF_EXITING)) + /* + * If invoked from hypervisors before entering the guest via + * resume_user_mode_work(), then @regs is a NULL pointer. + * + * resume_user_mode_work() clears TIF_NOTIFY_RESUME and re-raises + * it before returning from the ioctl() to user space when + * rseq_event.sched_switch is set. + * + * So it's safe to ignore here instead of pointlessly updating it + * in the vcpu_run() loop. + */ + if (!regs) return; + rseq_slowpath_update_usr(regs); +} + +void __rseq_signal_deliver(int sig, struct pt_regs *regs) +{ + rseq_stat_inc(rseq_stats.signal); /* - * regs is NULL if and only if the caller is in a syscall path. Skip - * fixup and leave rseq_cs as is so that rseq_sycall() will detect and - * kill a misbehaving userspace on debug kernels. + * Don't update IDs, they are handled on exit to user if + * necessary. The important thing is to abort a critical section of + * the interrupted context as after this point the instruction + * pointer in @regs points to the signal handler. */ - if (regs) { - ret = rseq_ip_fixup(regs); - if (unlikely(ret < 0)) - goto error; + if (unlikely(!rseq_handle_cs(current, regs))) { + /* + * Clear the errors just in case this might survive + * magically, but leave the rest intact. + */ + current->rseq.event.error = 0; + force_sigsegv(sig); } - if (unlikely(rseq_update_cpu_node_id(t))) - goto error; - return; - -error: - sig = ksig ? ksig->sig : 0; - force_sigsegv(sig); } -#ifdef CONFIG_DEBUG_RSEQ - /* * Terminate the process if a syscall is issued within a restartable * sequence. */ -void rseq_syscall(struct pt_regs *regs) +void __rseq_debug_syscall_return(struct pt_regs *regs) { - unsigned long ip = instruction_pointer(regs); struct task_struct *t = current; - struct rseq_cs rseq_cs; + u64 csaddr; - if (!t->rseq) + if (!t->rseq.event.has_rseq) return; - if (rseq_get_rseq_cs(t, &rseq_cs) || in_rseq_cs(ip, &rseq_cs)) - force_sig(SIGSEGV); + if (get_user(csaddr, &t->rseq.usrptr->rseq_cs)) + goto fail; + if (likely(!csaddr)) + return; + if (unlikely(csaddr >= TASK_SIZE)) + goto fail; + if (rseq_debug_update_user_cs(t, regs, csaddr)) + return; +fail: + force_sig(SIGSEGV); } +#ifdef CONFIG_DEBUG_RSEQ +/* Kept around to keep GENERIC_ENTRY=n architectures supported. */ +void rseq_syscall(struct pt_regs *regs) +{ + __rseq_debug_syscall_return(regs); +} #endif +static bool rseq_reset_ids(void) +{ + struct rseq_ids ids = { + .cpu_id = RSEQ_CPU_ID_UNINITIALIZED, + .mm_cid = 0, + }; + + /* + * If this fails, terminate it because this leaves the kernel in + * stupid state as exit to user space will try to fixup the ids + * again. + */ + if (rseq_set_ids(current, &ids, 0)) + return true; + + force_sig(SIGSEGV); + return false; +} + +/* The original rseq structure size (including padding) is 32 bytes. */ +#define ORIG_RSEQ_SIZE 32 + /* * sys_rseq - setup restartable sequences for caller thread. */ -SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, - int, flags, u32, sig) +SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32, sig) { - int ret; - u64 rseq_cs; - if (flags & RSEQ_FLAG_UNREGISTER) { if (flags & ~RSEQ_FLAG_UNREGISTER) return -EINVAL; /* Unregister rseq for current thread. */ - if (current->rseq != rseq || !current->rseq) + if (current->rseq.usrptr != rseq || !current->rseq.usrptr) return -EINVAL; - if (rseq_len != current->rseq_len) + if (rseq_len != current->rseq.len) return -EINVAL; - if (current->rseq_sig != sig) + if (current->rseq.sig != sig) return -EPERM; - ret = rseq_reset_rseq_cpu_node_id(current); - if (ret) - return ret; - current->rseq = NULL; - current->rseq_sig = 0; - current->rseq_len = 0; + if (!rseq_reset_ids()) + return -EFAULT; + rseq_reset(current); return 0; } if (unlikely(flags)) return -EINVAL; - if (current->rseq) { + if (current->rseq.usrptr) { /* * If rseq is already registered, check whether * the provided address differs from the prior * one. */ - if (current->rseq != rseq || rseq_len != current->rseq_len) + if (current->rseq.usrptr != rseq || rseq_len != current->rseq.len) return -EINVAL; - if (current->rseq_sig != sig) + if (current->rseq.sig != sig) return -EPERM; /* Already registered. */ return -EBUSY; @@ -531,43 +440,39 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, if (!access_ok(rseq, rseq_len)) return -EFAULT; - /* - * If the rseq_cs pointer is non-NULL on registration, clear it to - * avoid a potential segfault on return to user-space. The proper thing - * to do would have been to fail the registration but this would break - * older libcs that reuse the rseq area for new threads without - * clearing the fields. - */ - if (rseq_get_rseq_cs_ptr_val(rseq, &rseq_cs)) - return -EFAULT; - if (rseq_cs && clear_rseq_cs(rseq)) - return -EFAULT; + scoped_user_write_access(rseq, efault) { + /* + * If the rseq_cs pointer is non-NULL on registration, clear it to + * avoid a potential segfault on return to user-space. The proper thing + * to do would have been to fail the registration but this would break + * older libcs that reuse the rseq area for new threads without + * clearing the fields. Don't bother reading it, just reset it. + */ + unsafe_put_user(0UL, &rseq->rseq_cs, efault); + /* Initialize IDs in user space */ + unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id_start, efault); + unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id, efault); + unsafe_put_user(0U, &rseq->node_id, efault); + unsafe_put_user(0U, &rseq->mm_cid, efault); + } -#ifdef CONFIG_DEBUG_RSEQ - /* - * Initialize the in-kernel rseq fields copy for validation of - * read-only fields. - */ - if (get_user(rseq_kernel_fields(current)->cpu_id_start, &rseq->cpu_id_start) || - get_user(rseq_kernel_fields(current)->cpu_id, &rseq->cpu_id) || - get_user(rseq_kernel_fields(current)->node_id, &rseq->node_id) || - get_user(rseq_kernel_fields(current)->mm_cid, &rseq->mm_cid)) - return -EFAULT; -#endif /* * Activate the registration by setting the rseq area address, length * and signature in the task struct. */ - current->rseq = rseq; - current->rseq_len = rseq_len; - current->rseq_sig = sig; + current->rseq.usrptr = rseq; + current->rseq.len = rseq_len; + current->rseq.sig = sig; /* * If rseq was previously inactive, and has just been * registered, ensure the cpu_id_start and cpu_id fields * are updated before returning to user-space. */ - rseq_set_notify_resume(current); - + current->rseq.event.has_rseq = true; + rseq_force_update(); return 0; + +efault: + return -EFAULT; } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f754a60de848..fc358c1b6ca9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -121,6 +121,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_compute_energy_tp); DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); +DEFINE_PER_CPU(struct rnd_state, sched_rnd_state); #ifdef CONFIG_SCHED_PROXY_EXEC DEFINE_STATIC_KEY_TRUE(__sched_proxy_exec); @@ -583,8 +584,8 @@ EXPORT_SYMBOL(__trace_set_current_state); * * p->on_rq <- { 0, 1 = TASK_ON_RQ_QUEUED, 2 = TASK_ON_RQ_MIGRATING }: * - * is set by activate_task() and cleared by deactivate_task(), under - * rq->lock. Non-zero indicates the task is runnable, the special + * is set by activate_task() and cleared by deactivate_task()/block_task(), + * under rq->lock. Non-zero indicates the task is runnable, the special * ON_RQ_MIGRATING state is used for migration without holding both * rq->locks. It indicates task_cpu() is not stable, see task_rq_lock(). * @@ -2089,6 +2090,7 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags) */ uclamp_rq_inc(rq, p, flags); + rq->queue_mask |= p->sched_class->queue_mask; p->sched_class->enqueue_task(rq, p, flags); psi_enqueue(p, flags); @@ -2121,6 +2123,7 @@ inline bool dequeue_task(struct rq *rq, struct task_struct *p, int flags) * and mark the task ->sched_delayed. */ uclamp_rq_dec(rq, p); + rq->queue_mask |= p->sched_class->queue_mask; return p->sched_class->dequeue_task(rq, p, flags); } @@ -2128,8 +2131,6 @@ void activate_task(struct rq *rq, struct task_struct *p, int flags) { if (task_on_rq_migrating(p)) flags |= ENQUEUE_MIGRATED; - if (flags & ENQUEUE_MIGRATED) - sched_mm_cid_migrate_to(rq, p); enqueue_task(rq, p, flags); @@ -2169,37 +2170,6 @@ inline int task_curr(const struct task_struct *p) return cpu_curr(task_cpu(p)) == p; } -/* - * ->switching_to() is called with the pi_lock and rq_lock held and must not - * mess with locking. - */ -void check_class_changing(struct rq *rq, struct task_struct *p, - const struct sched_class *prev_class) -{ - if (prev_class != p->sched_class && p->sched_class->switching_to) - p->sched_class->switching_to(rq, p); -} - -/* - * switched_from, switched_to and prio_changed must _NOT_ drop rq->lock, - * use the balance_callback list if you want balancing. - * - * this means any call to check_class_changed() must be followed by a call to - * balance_callback(). - */ -void check_class_changed(struct rq *rq, struct task_struct *p, - const struct sched_class *prev_class, - int oldprio) -{ - if (prev_class != p->sched_class) { - if (prev_class->switched_from) - prev_class->switched_from(rq, p); - - p->sched_class->switched_to(rq, p); - } else if (oldprio != p->prio || dl_task(p)) - p->sched_class->prio_changed(rq, p, oldprio); -} - void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags) { struct task_struct *donor = rq->donor; @@ -2362,7 +2332,7 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state } static void -__do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx); +do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx); static void migrate_disable_switch(struct rq *rq, struct task_struct *p) { @@ -2377,10 +2347,8 @@ static void migrate_disable_switch(struct rq *rq, struct task_struct *p) if (p->cpus_ptr != &p->cpus_mask) return; - /* - * Violates locking rules! See comment in __do_set_cpus_allowed(). - */ - __do_set_cpus_allowed(p, &ac); + scoped_guard (task_rq_lock, p) + do_set_cpus_allowed(p, &ac); } void ___migrate_enable(void) @@ -2613,7 +2581,8 @@ static int migration_cpu_stop(void *data) */ WARN_ON_ONCE(!pending->stop_pending); preempt_disable(); - task_rq_unlock(rq, p, &rf); + rq_unlock(rq, &rf); + raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags); stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop, &pending->arg, &pending->stop_work); preempt_enable(); @@ -2622,7 +2591,8 @@ static int migration_cpu_stop(void *data) out: if (pending) pending->stop_pending = false; - task_rq_unlock(rq, p, &rf); + rq_unlock(rq, &rf); + raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags); if (complete) complete_all(&pending->done); @@ -2671,6 +2641,8 @@ out_unlock: return 0; } +static inline void mm_update_cpus_allowed(struct mm_struct *mm, const cpumask_t *affmask); + /* * sched_class::set_cpus_allowed must do the below, but is not required to * actually call this function. @@ -2684,6 +2656,7 @@ void set_cpus_allowed_common(struct task_struct *p, struct affinity_context *ctx cpumask_copy(&p->cpus_mask, ctx->new_mask); p->nr_cpus_allowed = cpumask_weight(ctx->new_mask); + mm_update_cpus_allowed(p->mm, ctx->new_mask); /* * Swap in a new user_cpus_ptr if SCA_USER flag set @@ -2693,56 +2666,17 @@ void set_cpus_allowed_common(struct task_struct *p, struct affinity_context *ctx } static void -__do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx) +do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx) { - struct rq *rq = task_rq(p); - bool queued, running; - - /* - * This here violates the locking rules for affinity, since we're only - * supposed to change these variables while holding both rq->lock and - * p->pi_lock. - * - * HOWEVER, it magically works, because ttwu() is the only code that - * accesses these variables under p->pi_lock and only does so after - * smp_cond_load_acquire(&p->on_cpu, !VAL), and we're in __schedule() - * before finish_task(). - * - * XXX do further audits, this smells like something putrid. - */ - if (ctx->flags & SCA_MIGRATE_DISABLE) - WARN_ON_ONCE(!p->on_cpu); - else - lockdep_assert_held(&p->pi_lock); - - queued = task_on_rq_queued(p); - running = task_current_donor(rq, p); - - if (queued) { - /* - * Because __kthread_bind() calls this on blocked tasks without - * holding rq->lock. - */ - lockdep_assert_rq_held(rq); - dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK); - } - if (running) - put_prev_task(rq, p); - - p->sched_class->set_cpus_allowed(p, ctx); - mm_set_cpus_allowed(p->mm, ctx->new_mask); - - if (queued) - enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); - if (running) - set_next_task(rq, p); + scoped_guard (sched_change, p, DEQUEUE_SAVE) + p->sched_class->set_cpus_allowed(p, ctx); } /* * Used for kthread_bind() and select_fallback_rq(), in both cases the user * affinity (if any) should be destroyed too. */ -void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) +void set_cpus_allowed_force(struct task_struct *p, const struct cpumask *new_mask) { struct affinity_context ac = { .new_mask = new_mask, @@ -2754,7 +2688,8 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) struct rcu_head rcu; }; - __do_set_cpus_allowed(p, &ac); + scoped_guard (__task_rq_lock, p) + do_set_cpus_allowed(p, &ac); /* * Because this is called with p->pi_lock held, it is not possible @@ -2792,7 +2727,7 @@ int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, * Use pi_lock to protect content of user_cpus_ptr * * Though unlikely, user_cpus_ptr can be reset to NULL by a concurrent - * do_set_cpus_allowed(). + * set_cpus_allowed_force(). */ raw_spin_lock_irqsave(&src->pi_lock, flags); if (src->user_cpus_ptr) { @@ -3064,8 +2999,6 @@ static int __set_cpus_allowed_ptr_locked(struct task_struct *p, unsigned int dest_cpu; int ret = 0; - update_rq_clock(rq); - if (kthread || is_migration_disabled(p)) { /* * Kernel threads are allowed on online && !active CPUs, @@ -3120,7 +3053,7 @@ static int __set_cpus_allowed_ptr_locked(struct task_struct *p, goto out; } - __do_set_cpus_allowed(p, ctx); + do_set_cpus_allowed(p, ctx); return affine_move_task(rq, p, rf, dest_cpu, ctx->flags); @@ -3329,8 +3262,6 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) if (p->sched_class->migrate_task_rq) p->sched_class->migrate_task_rq(p, new_cpu); p->se.nr_migrations++; - rseq_migrate(p); - sched_mm_cid_migrate_from(p); perf_event_task_migrate(p); } @@ -3529,13 +3460,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p) } fallthrough; case possible: - /* - * XXX When called from select_task_rq() we only - * hold p->pi_lock and again violate locking order. - * - * More yuck to audit. - */ - do_set_cpus_allowed(p, task_cpu_fallback_mask(p)); + set_cpus_allowed_force(p, task_cpu_fallback_mask(p)); state = fail; break; case fail: @@ -3777,7 +3702,7 @@ static int ttwu_runnable(struct task_struct *p, int wake_flags) ttwu_do_wakeup(p); ret = 1; } - __task_rq_unlock(rq, &rf); + __task_rq_unlock(rq, p, &rf); return ret; } @@ -4231,7 +4156,7 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * __schedule(). See the comment for smp_mb__after_spinlock(). * * Form a control-dep-acquire with p->on_rq == 0 above, to ensure - * schedule()'s deactivate_task() has 'happened' and p will no longer + * schedule()'s block_task() has 'happened' and p will no longer * care about it's own p->state. See the comment in __schedule(). */ smp_acquire__after_ctrl_dep(); @@ -4370,7 +4295,7 @@ int task_call_func(struct task_struct *p, task_call_f func, void *arg) ret = func(p, arg); if (rq) - rq_unlock(rq, &rf); + __task_rq_unlock(rq, p, &rf); raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags); return ret; @@ -4487,7 +4412,6 @@ static void __sched_fork(u64 clone_flags, struct task_struct *p) init_numa_balancing(clone_flags, p); p->wake_entry.u_flags = CSD_TYPE_TTWU; p->migration_pending = NULL; - init_sched_mm_cid(p); } DEFINE_STATIC_KEY_FALSE(sched_numa_balancing); @@ -4763,7 +4687,6 @@ int sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) p->sched_task_group = tg; } #endif - rseq_migrate(p); /* * We're setting the CPU for the first time, we don't migrate, * so use __set_task_cpu(). @@ -4827,7 +4750,6 @@ void wake_up_new_task(struct task_struct *p) * as we're not fully set-up yet. */ p->recent_used_cpu = task_cpu(p); - rseq_migrate(p); __set_task_cpu(p, select_task_rq(p, task_cpu(p), &wake_flags)); rq = __task_rq_lock(p, &rf); update_rq_clock(rq); @@ -5121,7 +5043,6 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, kcov_prepare_switch(prev); sched_info_switch(rq, prev, next); perf_event_task_sched_out(prev, next); - rseq_preempt(prev); fire_sched_out_preempt_notifiers(prev, next); kmap_local_sched_out(); prepare_task(next); @@ -5284,19 +5205,16 @@ context_switch(struct rq *rq, struct task_struct *prev, * * kernel -> user switch + mmdrop_lazy_tlb() active * user -> user switch - * - * switch_mm_cid() needs to be updated if the barriers provided - * by context_switch() are modified. */ - if (!next->mm) { // to kernel + if (!next->mm) { // to kernel enter_lazy_tlb(prev->active_mm, next); next->active_mm = prev->active_mm; - if (prev->mm) // from user + if (prev->mm) // from user mmgrab_lazy_tlb(prev->active_mm); else prev->active_mm = NULL; - } else { // to user + } else { // to user membarrier_switch_mm(rq, prev->active_mm, next->mm); /* * sys_membarrier() requires an smp_mb() between setting @@ -5309,15 +5227,20 @@ context_switch(struct rq *rq, struct task_struct *prev, switch_mm_irqs_off(prev->active_mm, next->mm, next); lru_gen_use_mm(next->mm); - if (!prev->mm) { // from kernel + if (!prev->mm) { // from kernel /* will mmdrop_lazy_tlb() in finish_task_switch(). */ rq->prev_mm = prev->active_mm; prev->active_mm = NULL; } } - /* switch_mm_cid() requires the memory barriers above. */ - switch_mm_cid(rq, prev, next); + mm_cid_switch_to(prev, next); + + /* + * Tell rseq that the task was scheduled in. Must be after + * switch_mm_cid() to get the TIF flag set. + */ + rseq_sched_switch_event(next); prepare_lock_switch(rq, next, rf); @@ -5602,7 +5525,6 @@ void sched_tick(void) resched_latency = cpu_resched_latency(rq); calc_global_load_tick(rq); sched_core_tick(rq); - task_tick_mm_cid(rq, donor); scx_tick(rq); rq_unlock(rq, &rf); @@ -5692,7 +5614,7 @@ static void sched_tick_remote(struct work_struct *work) * reasonable amount of time. */ u64 delta = rq_clock_task(rq) - curr->se.exec_start; - WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); + WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 30); } curr->sched_class->task_tick(rq, curr, 0); @@ -5916,19 +5838,6 @@ static void prev_balance(struct rq *rq, struct task_struct *prev, const struct sched_class *start_class = prev->sched_class; const struct sched_class *class; -#ifdef CONFIG_SCHED_CLASS_EXT - /* - * SCX requires a balance() call before every pick_task() including when - * waking up from SCHED_IDLE. If @start_class is below SCX, start from - * SCX instead. Also, set a flag to detect missing balance() call. - */ - if (scx_enabled()) { - rq->scx.flags |= SCX_RQ_BAL_PENDING; - if (sched_class_above(&ext_sched_class, start_class)) - start_class = &ext_sched_class; - } -#endif - /* * We must do the balancing pass before put_prev_task(), such * that when we release the rq->lock the task is in the same @@ -5972,7 +5881,7 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) /* Assume the next prioritized class is idle_sched_class */ if (!p) { - p = pick_task_idle(rq); + p = pick_task_idle(rq, rf); put_prev_set_next_task(rq, prev, p); } @@ -5984,11 +5893,15 @@ restart: for_each_active_class(class) { if (class->pick_next_task) { - p = class->pick_next_task(rq, prev); + p = class->pick_next_task(rq, prev, rf); + if (unlikely(p == RETRY_TASK)) + goto restart; if (p) return p; } else { - p = class->pick_task(rq); + p = class->pick_task(rq, rf); + if (unlikely(p == RETRY_TASK)) + goto restart; if (p) { put_prev_set_next_task(rq, prev, p); return p; @@ -6018,7 +5931,11 @@ static inline bool cookie_match(struct task_struct *a, struct task_struct *b) return a->core_cookie == b->core_cookie; } -static inline struct task_struct *pick_task(struct rq *rq) +/* + * Careful; this can return RETRY_TASK, it does not include the retry-loop + * itself due to the whole SMT pick retry thing below. + */ +static inline struct task_struct *pick_task(struct rq *rq, struct rq_flags *rf) { const struct sched_class *class; struct task_struct *p; @@ -6026,7 +5943,7 @@ static inline struct task_struct *pick_task(struct rq *rq) rq->dl_server = NULL; for_each_active_class(class) { - p = class->pick_task(rq); + p = class->pick_task(rq, rf); if (p) return p; } @@ -6041,7 +5958,7 @@ static void queue_core_balance(struct rq *rq); static struct task_struct * pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { - struct task_struct *next, *p, *max = NULL; + struct task_struct *next, *p, *max; const struct cpumask *smt_mask; bool fi_before = false; bool core_clock_updated = (rq == rq->core); @@ -6126,7 +6043,10 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) * and there are no cookied tasks running on siblings. */ if (!need_sync) { - next = pick_task(rq); +restart_single: + next = pick_task(rq, rf); + if (unlikely(next == RETRY_TASK)) + goto restart_single; if (!next->core_cookie) { rq->core_pick = NULL; rq->core_dl_server = NULL; @@ -6146,6 +6066,8 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) * * Tie-break prio towards the current CPU */ +restart_multi: + max = NULL; for_each_cpu_wrap(i, smt_mask, cpu) { rq_i = cpu_rq(i); @@ -6157,7 +6079,11 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) if (i != cpu && (rq_i != rq->core || !core_clock_updated)) update_rq_clock(rq_i); - rq_i->core_pick = p = pick_task(rq_i); + p = pick_task(rq_i, rf); + if (unlikely(p == RETRY_TASK)) + goto restart_multi; + + rq_i->core_pick = p; rq_i->core_dl_server = rq_i->dl_server; if (!max || prio_less(max, p, fi_before)) @@ -6179,7 +6105,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) if (cookie) p = sched_core_find(rq_i, cookie); if (!p) - p = idle_sched_class.pick_task(rq_i); + p = idle_sched_class.pick_task(rq_i, rf); } rq_i->core_pick = p; @@ -6812,6 +6738,7 @@ static void __sched notrace __schedule(int sched_mode) local_irq_disable(); rcu_note_context_switch(preempt); + migrate_disable_switch(rq, prev); /* * Make sure that signal_pending_state()->signal_pending() below @@ -6918,7 +6845,6 @@ keep_resched: */ ++*switch_count; - migrate_disable_switch(rq, prev); psi_account_irqtime(rq, prev, next); psi_sched_switch(prev, next, !task_on_rq_queued(prev) || prev->se.sched_delayed); @@ -7326,7 +7252,7 @@ void rt_mutex_post_schedule(void) */ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) { - int prio, oldprio, queued, running, queue_flag = + int prio, oldprio, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; const struct sched_class *prev_class, *next_class; struct rq_flags rf; @@ -7388,64 +7314,51 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) prev_class = p->sched_class; next_class = __setscheduler_class(p->policy, prio); - if (prev_class != next_class && p->se.sched_delayed) - dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK); - - queued = task_on_rq_queued(p); - running = task_current_donor(rq, p); - if (queued) - dequeue_task(rq, p, queue_flag); - if (running) - put_prev_task(rq, p); + if (prev_class != next_class) + queue_flag |= DEQUEUE_CLASS; - /* - * Boosting condition are: - * 1. -rt task is running and holds mutex A - * --> -dl task blocks on mutex A - * - * 2. -dl task is running and holds mutex A - * --> -dl task blocks on mutex A and could preempt the - * running task - */ - if (dl_prio(prio)) { - if (!dl_prio(p->normal_prio) || - (pi_task && dl_prio(pi_task->prio) && - dl_entity_preempt(&pi_task->dl, &p->dl))) { - p->dl.pi_se = pi_task->dl.pi_se; - queue_flag |= ENQUEUE_REPLENISH; + scoped_guard (sched_change, p, queue_flag) { + /* + * Boosting condition are: + * 1. -rt task is running and holds mutex A + * --> -dl task blocks on mutex A + * + * 2. -dl task is running and holds mutex A + * --> -dl task blocks on mutex A and could preempt the + * running task + */ + if (dl_prio(prio)) { + if (!dl_prio(p->normal_prio) || + (pi_task && dl_prio(pi_task->prio) && + dl_entity_preempt(&pi_task->dl, &p->dl))) { + p->dl.pi_se = pi_task->dl.pi_se; + scope->flags |= ENQUEUE_REPLENISH; + } else { + p->dl.pi_se = &p->dl; + } + } else if (rt_prio(prio)) { + if (dl_prio(oldprio)) + p->dl.pi_se = &p->dl; + if (oldprio < prio) + scope->flags |= ENQUEUE_HEAD; } else { - p->dl.pi_se = &p->dl; + if (dl_prio(oldprio)) + p->dl.pi_se = &p->dl; + if (rt_prio(oldprio)) + p->rt.timeout = 0; } - } else if (rt_prio(prio)) { - if (dl_prio(oldprio)) - p->dl.pi_se = &p->dl; - if (oldprio < prio) - queue_flag |= ENQUEUE_HEAD; - } else { - if (dl_prio(oldprio)) - p->dl.pi_se = &p->dl; - if (rt_prio(oldprio)) - p->rt.timeout = 0; - } - - p->sched_class = next_class; - p->prio = prio; - - check_class_changing(rq, p, prev_class); - if (queued) - enqueue_task(rq, p, queue_flag); - if (running) - set_next_task(rq, p); - - check_class_changed(rq, p, prev_class, oldprio); + p->sched_class = next_class; + p->prio = prio; + } out_unlock: /* Avoid rq from going away on us: */ preempt_disable(); rq_unpin_lock(rq, &rf); __balance_callbacks(rq); - raw_spin_rq_unlock(rq); + rq_repin_lock(rq, &rf); + __task_rq_unlock(rq, p, &rf); preempt_enable(); } @@ -8084,26 +7997,9 @@ int migrate_task_to(struct task_struct *p, int target_cpu) */ void sched_setnuma(struct task_struct *p, int nid) { - bool queued, running; - struct rq_flags rf; - struct rq *rq; - - rq = task_rq_lock(p, &rf); - queued = task_on_rq_queued(p); - running = task_current_donor(rq, p); - - if (queued) - dequeue_task(rq, p, DEQUEUE_SAVE); - if (running) - put_prev_task(rq, p); - - p->numa_preferred_nid = nid; - - if (queued) - enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); - if (running) - set_next_task(rq, p); - task_rq_unlock(rq, p, &rf); + guard(task_rq_lock)(p); + scoped_guard (sched_change, p, DEQUEUE_SAVE) + p->numa_preferred_nid = nid; } #endif /* CONFIG_NUMA_BALANCING */ @@ -8141,18 +8037,15 @@ static int __balance_push_cpu_stop(void *arg) struct rq_flags rf; int cpu; - raw_spin_lock_irq(&p->pi_lock); - rq_lock(rq, &rf); - - update_rq_clock(rq); - - if (task_rq(p) == rq && task_on_rq_queued(p)) { + scoped_guard (raw_spinlock_irq, &p->pi_lock) { cpu = select_fallback_rq(rq->cpu, p); - rq = __migrate_task(rq, &rf, p, cpu); - } - rq_unlock(rq, &rf); - raw_spin_unlock_irq(&p->pi_lock); + rq_lock(rq, &rf); + update_rq_clock(rq); + if (task_rq(p) == rq && task_on_rq_queued(p)) + rq = __migrate_task(rq, &rf, p, cpu); + rq_unlock(rq, &rf); + } put_task_struct(p); @@ -8591,6 +8484,8 @@ void __init sched_init_smp(void) { sched_init_numa(NUMA_NO_NODE); + prandom_init_once(&sched_rnd_state); + /* * There's no userspace yet to cause hotplug operations; hence all the * CPU masks are stable and all blatant races in the below code cannot @@ -9207,38 +9102,23 @@ static void sched_change_group(struct task_struct *tsk) */ void sched_move_task(struct task_struct *tsk, bool for_autogroup) { - int queued, running, queue_flags = - DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; + unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE; + bool resched = false; struct rq *rq; CLASS(task_rq_lock, rq_guard)(tsk); rq = rq_guard.rq; - update_rq_clock(rq); - - running = task_current_donor(rq, tsk); - queued = task_on_rq_queued(tsk); - - if (queued) - dequeue_task(rq, tsk, queue_flags); - if (running) - put_prev_task(rq, tsk); - - sched_change_group(tsk); - if (!for_autogroup) - scx_cgroup_move_task(tsk); + scoped_guard (sched_change, tsk, queue_flags) { + sched_change_group(tsk); + if (!for_autogroup) + scx_cgroup_move_task(tsk); + if (scope->running) + resched = true; + } - if (queued) - enqueue_task(rq, tsk, queue_flags); - if (running) { - set_next_task(rq, tsk); - /* - * After changing group, the running task may have joined a - * throttled one but it's still the running task. Trigger a - * resched to make sure that task can still run. - */ + if (resched) resched_curr(rq); - } } static struct cgroup_subsys_state * @@ -10374,557 +10254,571 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count) } #ifdef CONFIG_SCHED_MM_CID - -/* - * @cid_lock: Guarantee forward-progress of cid allocation. - * - * Concurrency ID allocation within a bitmap is mostly lock-free. The cid_lock - * is only used when contention is detected by the lock-free allocation so - * forward progress can be guaranteed. - */ -DEFINE_RAW_SPINLOCK(cid_lock); - -/* - * @use_cid_lock: Select cid allocation behavior: lock-free vs spinlock. - * - * When @use_cid_lock is 0, the cid allocation is lock-free. When contention is - * detected, it is set to 1 to ensure that all newly coming allocations are - * serialized by @cid_lock until the allocation which detected contention - * completes and sets @use_cid_lock back to 0. This guarantees forward progress - * of a cid allocation. - */ -int use_cid_lock; - /* - * mm_cid remote-clear implements a lock-free algorithm to clear per-mm/cpu cid - * concurrently with respect to the execution of the source runqueue context - * switch. - * - * There is one basic properties we want to guarantee here: - * - * (1) Remote-clear should _never_ mark a per-cpu cid UNSET when it is actively - * used by a task. That would lead to concurrent allocation of the cid and - * userspace corruption. + * Concurrency IDentifier management * - * Provide this guarantee by introducing a Dekker memory ordering to guarantee - * that a pair of loads observe at least one of a pair of stores, which can be - * shown as: + * Serialization rules: * - * X = Y = 0 + * mm::mm_cid::mutex: Serializes fork() and exit() and therefore + * protects mm::mm_cid::users. * - * w[X]=1 w[Y]=1 - * MB MB - * r[Y]=y r[X]=x + * mm::mm_cid::lock: Serializes mm_update_max_cids() and + * mm_update_cpus_allowed(). Nests in mm_cid::mutex + * and runqueue lock. * - * Which guarantees that x==0 && y==0 is impossible. But rather than using - * values 0 and 1, this algorithm cares about specific state transitions of the - * runqueue current task (as updated by the scheduler context switch), and the - * per-mm/cpu cid value. + * The mm_cidmask bitmap is not protected by any of the mm::mm_cid locks + * and can only be modified with atomic operations. * - * Let's introduce task (Y) which has task->mm == mm and task (N) which has - * task->mm != mm for the rest of the discussion. There are two scheduler state - * transitions on context switch we care about: + * The mm::mm_cid:pcpu per CPU storage is protected by the CPUs runqueue + * lock. * - * (TSA) Store to rq->curr with transition from (N) to (Y) + * CID ownership: * - * (TSB) Store to rq->curr with transition from (Y) to (N) + * A CID is either owned by a task (stored in task_struct::mm_cid.cid) or + * by a CPU (stored in mm::mm_cid.pcpu::cid). CIDs owned by CPUs have the + * MM_CID_ONCPU bit set. During transition from CPU to task ownership mode, + * MM_CID_TRANSIT is set on the per task CIDs. When this bit is set the + * task needs to drop the CID into the pool when scheduling out. Both bits + * (ONCPU and TRANSIT) are filtered out by task_cid() when the CID is + * actually handed over to user space in the RSEQ memory. * - * On the remote-clear side, there is one transition we care about: + * Mode switching: * - * (TMA) cmpxchg to *pcpu_cid to set the LAZY flag + * Switching to per CPU mode happens when the user count becomes greater + * than the maximum number of CIDs, which is calculated by: * - * There is also a transition to UNSET state which can be performed from all - * sides (scheduler, remote-clear). It is always performed with a cmpxchg which - * guarantees that only a single thread will succeed: + * opt_cids = min(mm_cid::nr_cpus_allowed, mm_cid::users); + * max_cids = min(1.25 * opt_cids, num_possible_cpus()); * - * (TMB) cmpxchg to *pcpu_cid to mark UNSET + * The +25% allowance is useful for tight CPU masks in scenarios where only + * a few threads are created and destroyed to avoid frequent mode + * switches. Though this allowance shrinks, the closer opt_cids becomes to + * num_possible_cpus(), which is the (unfortunate) hard ABI limit. * - * Just to be clear, what we do _not_ want to happen is a transition to UNSET - * when a thread is actively using the cid (property (1)). + * At the point of switching to per CPU mode the new user is not yet + * visible in the system, so the task which initiated the fork() runs the + * fixup function: mm_cid_fixup_tasks_to_cpu() walks the thread list and + * either transfers each tasks owned CID to the CPU the task runs on or + * drops it into the CID pool if a task is not on a CPU at that point in + * time. Tasks which schedule in before the task walk reaches them do the + * handover in mm_cid_schedin(). When mm_cid_fixup_tasks_to_cpus() completes + * it's guaranteed that no task related to that MM owns a CID anymore. * - * Let's looks at the relevant combinations of TSA/TSB, and TMA transitions. + * Switching back to task mode happens when the user count goes below the + * threshold which was recorded on the per CPU mode switch: * - * Scenario A) (TSA)+(TMA) (from next task perspective) + * pcpu_thrs = min(opt_cids - (opt_cids / 4), num_possible_cpus() / 2); * - * CPU0 CPU1 + * This threshold is updated when a affinity change increases the number of + * allowed CPUs for the MM, which might cause a switch back to per task + * mode. * - * Context switch CS-1 Remote-clear - * - store to rq->curr: (N)->(Y) (TSA) - cmpxchg to *pcpu_id to LAZY (TMA) - * (implied barrier after cmpxchg) - * - switch_mm_cid() - * - memory barrier (see switch_mm_cid() - * comment explaining how this barrier - * is combined with other scheduler - * barriers) - * - mm_cid_get (next) - * - READ_ONCE(*pcpu_cid) - rcu_dereference(src_rq->curr) + * If the switch back was initiated by a exiting task, then that task runs + * the fixup function. If it was initiated by a affinity change, then it's + * run either in the deferred update function in context of a workqueue or + * by a task which forks a new one or by a task which exits. Whatever + * happens first. mm_cid_fixup_cpus_to_task() walks through the possible + * CPUs and either transfers the CPU owned CIDs to a related task which + * runs on the CPU or drops it into the pool. Tasks which schedule in on a + * CPU which the walk did not cover yet do the handover themself. * - * This Dekker ensures that either task (Y) is observed by the - * rcu_dereference() or the LAZY flag is observed by READ_ONCE(), or both are - * observed. + * This transition from CPU to per task ownership happens in two phases: * - * If task (Y) store is observed by rcu_dereference(), it means that there is - * still an active task on the cpu. Remote-clear will therefore not transition - * to UNSET, which fulfills property (1). + * 1) mm:mm_cid.transit contains MM_CID_TRANSIT This is OR'ed on the task + * CID and denotes that the CID is only temporarily owned by the + * task. When it schedules out the task drops the CID back into the + * pool if this bit is set. * - * If task (Y) is not observed, but the lazy flag is observed by READ_ONCE(), - * it will move its state to UNSET, which clears the percpu cid perhaps - * uselessly (which is not an issue for correctness). Because task (Y) is not - * observed, CPU1 can move ahead to set the state to UNSET. Because moving - * state to UNSET is done with a cmpxchg expecting that the old state has the - * LAZY flag set, only one thread will successfully UNSET. + * 2) The initiating context walks the per CPU space and after completion + * clears mm:mm_cid.transit. So after that point the CIDs are strictly + * task owned again. * - * If both states (LAZY flag and task (Y)) are observed, the thread on CPU0 - * will observe the LAZY flag and transition to UNSET (perhaps uselessly), and - * CPU1 will observe task (Y) and do nothing more, which is fine. + * This two phase transition is required to prevent CID space exhaustion + * during the transition as a direct transfer of ownership would fail if + * two tasks are scheduled in on the same CPU before the fixup freed per + * CPU CIDs. * - * What we are effectively preventing with this Dekker is a scenario where - * neither LAZY flag nor store (Y) are observed, which would fail property (1) - * because this would UNSET a cid which is actively used. + * When mm_cid_fixup_cpus_to_tasks() completes it's guaranteed that no CID + * related to that MM is owned by a CPU anymore. */ -void sched_mm_cid_migrate_from(struct task_struct *t) -{ - t->migrate_from_cpu = task_cpu(t); -} - -static -int __sched_mm_cid_migrate_from_fetch_cid(struct rq *src_rq, - struct task_struct *t, - struct mm_cid *src_pcpu_cid) +/* + * Update the CID range properties when the constraints change. Invoked via + * fork(), exit() and affinity changes + */ +static void __mm_update_max_cids(struct mm_mm_cid *mc) { - struct mm_struct *mm = t->mm; - struct task_struct *src_task; - int src_cid, last_mm_cid; + unsigned int opt_cids, max_cids; - if (!mm) - return -1; + /* Calculate the new optimal constraint */ + opt_cids = min(mc->nr_cpus_allowed, mc->users); - last_mm_cid = t->last_mm_cid; - /* - * If the migrated task has no last cid, or if the current - * task on src rq uses the cid, it means the source cid does not need - * to be moved to the destination cpu. - */ - if (last_mm_cid == -1) - return -1; - src_cid = READ_ONCE(src_pcpu_cid->cid); - if (!mm_cid_is_valid(src_cid) || last_mm_cid != src_cid) - return -1; + /* Adjust the maximum CIDs to +25% limited by the number of possible CPUs */ + max_cids = min(opt_cids + (opt_cids / 4), num_possible_cpus()); + WRITE_ONCE(mc->max_cids, max_cids); +} - /* - * If we observe an active task using the mm on this rq, it means we - * are not the last task to be migrated from this cpu for this mm, so - * there is no need to move src_cid to the destination cpu. - */ - guard(rcu)(); - src_task = rcu_dereference(src_rq->curr); - if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) { - t->last_mm_cid = -1; - return -1; - } +static inline unsigned int mm_cid_calc_pcpu_thrs(struct mm_mm_cid *mc) +{ + unsigned int opt_cids; - return src_cid; + opt_cids = min(mc->nr_cpus_allowed, mc->users); + /* Has to be at least 1 because 0 indicates PCPU mode off */ + return max(min(opt_cids - opt_cids / 4, num_possible_cpus() / 2), 1); } -static -int __sched_mm_cid_migrate_from_try_steal_cid(struct rq *src_rq, - struct task_struct *t, - struct mm_cid *src_pcpu_cid, - int src_cid) +static bool mm_update_max_cids(struct mm_struct *mm) { - struct task_struct *src_task; - struct mm_struct *mm = t->mm; - int lazy_cid; + struct mm_mm_cid *mc = &mm->mm_cid; - if (src_cid == -1) - return -1; + lockdep_assert_held(&mm->mm_cid.lock); - /* - * Attempt to clear the source cpu cid to move it to the destination - * cpu. - */ - lazy_cid = mm_cid_set_lazy_put(src_cid); - if (!try_cmpxchg(&src_pcpu_cid->cid, &src_cid, lazy_cid)) - return -1; - - /* - * The implicit barrier after cmpxchg per-mm/cpu cid before loading - * rq->curr->mm matches the scheduler barrier in context_switch() - * between store to rq->curr and load of prev and next task's - * per-mm/cpu cid. - * - * The implicit barrier after cmpxchg per-mm/cpu cid before loading - * rq->curr->mm_cid_active matches the barrier in - * sched_mm_cid_exit_signals(), sched_mm_cid_before_execve(), and - * sched_mm_cid_after_execve() between store to t->mm_cid_active and - * load of per-mm/cpu cid. - */ + /* Clear deferred mode switch flag. A change is handled by the caller */ + mc->update_deferred = false; + __mm_update_max_cids(mc); - /* - * If we observe an active task using the mm on this rq after setting - * the lazy-put flag, this task will be responsible for transitioning - * from lazy-put flag set to MM_CID_UNSET. - */ - scoped_guard (rcu) { - src_task = rcu_dereference(src_rq->curr); - if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) { - /* - * We observed an active task for this mm, there is therefore - * no point in moving this cid to the destination cpu. - */ - t->last_mm_cid = -1; - return -1; - } + /* Check whether owner mode must be changed */ + if (!mc->percpu) { + /* Enable per CPU mode when the number of users is above max_cids */ + if (mc->users > mc->max_cids) + mc->pcpu_thrs = mm_cid_calc_pcpu_thrs(mc); + } else { + /* Switch back to per task if user count under threshold */ + if (mc->users < mc->pcpu_thrs) + mc->pcpu_thrs = 0; } - /* - * The src_cid is unused, so it can be unset. - */ - if (!try_cmpxchg(&src_pcpu_cid->cid, &lazy_cid, MM_CID_UNSET)) - return -1; - WRITE_ONCE(src_pcpu_cid->recent_cid, MM_CID_UNSET); - return src_cid; + /* Mode change required? */ + if (!!mc->percpu == !!mc->pcpu_thrs) + return false; + /* When switching back to per TASK mode, set the transition flag */ + if (!mc->pcpu_thrs) + WRITE_ONCE(mc->transit, MM_CID_TRANSIT); + WRITE_ONCE(mc->percpu, !!mc->pcpu_thrs); + return true; } -/* - * Migration to dst cpu. Called with dst_rq lock held. - * Interrupts are disabled, which keeps the window of cid ownership without the - * source rq lock held small. - */ -void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t) +static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpumask *affmsk) { - struct mm_cid *src_pcpu_cid, *dst_pcpu_cid; - struct mm_struct *mm = t->mm; - int src_cid, src_cpu; - bool dst_cid_is_set; - struct rq *src_rq; - - lockdep_assert_rq_held(dst_rq); + struct cpumask *mm_allowed; + struct mm_mm_cid *mc; + unsigned int weight; - if (!mm) + if (!mm || !READ_ONCE(mm->mm_cid.users)) return; - src_cpu = t->migrate_from_cpu; - if (src_cpu == -1) { - t->last_mm_cid = -1; - return; - } /* - * Move the src cid if the dst cid is unset. This keeps id - * allocation closest to 0 in cases where few threads migrate around - * many CPUs. - * - * If destination cid or recent cid is already set, we may have - * to just clear the src cid to ensure compactness in frequent - * migrations scenarios. - * - * It is not useful to clear the src cid when the number of threads is - * greater or equal to the number of allowed CPUs, because user-space - * can expect that the number of allowed cids can reach the number of - * allowed CPUs. - */ - dst_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu_of(dst_rq)); - dst_cid_is_set = !mm_cid_is_unset(READ_ONCE(dst_pcpu_cid->cid)) || - !mm_cid_is_unset(READ_ONCE(dst_pcpu_cid->recent_cid)); - if (dst_cid_is_set && atomic_read(&mm->mm_users) >= READ_ONCE(mm->nr_cpus_allowed)) + * mm::mm_cid::mm_cpus_allowed is the superset of each threads + * allowed CPUs mask which means it can only grow. + */ + mc = &mm->mm_cid; + guard(raw_spinlock)(&mc->lock); + mm_allowed = mm_cpus_allowed(mm); + weight = cpumask_weighted_or(mm_allowed, mm_allowed, affmsk); + if (weight == mc->nr_cpus_allowed) return; - src_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, src_cpu); - src_rq = cpu_rq(src_cpu); - src_cid = __sched_mm_cid_migrate_from_fetch_cid(src_rq, t, src_pcpu_cid); - if (src_cid == -1) + + WRITE_ONCE(mc->nr_cpus_allowed, weight); + __mm_update_max_cids(mc); + if (!mc->percpu) return; - src_cid = __sched_mm_cid_migrate_from_try_steal_cid(src_rq, t, src_pcpu_cid, - src_cid); - if (src_cid == -1) + + /* Adjust the threshold to the wider set */ + mc->pcpu_thrs = mm_cid_calc_pcpu_thrs(mc); + /* Switch back to per task mode? */ + if (mc->users >= mc->pcpu_thrs) return; - if (dst_cid_is_set) { - __mm_cid_put(mm, src_cid); + + /* Don't queue twice */ + if (mc->update_deferred) return; - } - /* Move src_cid to dst cpu. */ - mm_cid_snapshot_time(dst_rq, mm); - WRITE_ONCE(dst_pcpu_cid->cid, src_cid); - WRITE_ONCE(dst_pcpu_cid->recent_cid, src_cid); + + /* Queue the irq work, which schedules the real work */ + mc->update_deferred = true; + irq_work_queue(&mc->irq_work); } -static void sched_mm_cid_remote_clear(struct mm_struct *mm, struct mm_cid *pcpu_cid, - int cpu) +static inline void mm_cid_transit_to_task(struct task_struct *t, struct mm_cid_pcpu *pcp) { - struct rq *rq = cpu_rq(cpu); - struct task_struct *t; - int cid, lazy_cid; + if (cid_on_cpu(t->mm_cid.cid)) { + unsigned int cid = cpu_cid_to_cid(t->mm_cid.cid); - cid = READ_ONCE(pcpu_cid->cid); - if (!mm_cid_is_valid(cid)) - return; + t->mm_cid.cid = cid_to_transit_cid(cid); + pcp->cid = t->mm_cid.cid; + } +} - /* - * Clear the cpu cid if it is set to keep cid allocation compact. If - * there happens to be other tasks left on the source cpu using this - * mm, the next task using this mm will reallocate its cid on context - * switch. - */ - lazy_cid = mm_cid_set_lazy_put(cid); - if (!try_cmpxchg(&pcpu_cid->cid, &cid, lazy_cid)) - return; +static void mm_cid_fixup_cpus_to_tasks(struct mm_struct *mm) +{ + unsigned int cpu; - /* - * The implicit barrier after cmpxchg per-mm/cpu cid before loading - * rq->curr->mm matches the scheduler barrier in context_switch() - * between store to rq->curr and load of prev and next task's - * per-mm/cpu cid. - * - * The implicit barrier after cmpxchg per-mm/cpu cid before loading - * rq->curr->mm_cid_active matches the barrier in - * sched_mm_cid_exit_signals(), sched_mm_cid_before_execve(), and - * sched_mm_cid_after_execve() between store to t->mm_cid_active and - * load of per-mm/cpu cid. - */ + /* Walk the CPUs and fixup all stale CIDs */ + for_each_possible_cpu(cpu) { + struct mm_cid_pcpu *pcp = per_cpu_ptr(mm->mm_cid.pcpu, cpu); + struct rq *rq = cpu_rq(cpu); - /* - * If we observe an active task using the mm on this rq after setting - * the lazy-put flag, that task will be responsible for transitioning - * from lazy-put flag set to MM_CID_UNSET. - */ - scoped_guard (rcu) { - t = rcu_dereference(rq->curr); - if (READ_ONCE(t->mm_cid_active) && t->mm == mm) - return; + /* Remote access to mm::mm_cid::pcpu requires rq_lock */ + guard(rq_lock_irq)(rq); + /* Is the CID still owned by the CPU? */ + if (cid_on_cpu(pcp->cid)) { + /* + * If rq->curr has @mm, transfer it with the + * transition bit set. Otherwise drop it. + */ + if (rq->curr->mm == mm && rq->curr->mm_cid.active) + mm_cid_transit_to_task(rq->curr, pcp); + else + mm_drop_cid_on_cpu(mm, pcp); + + } else if (rq->curr->mm == mm && rq->curr->mm_cid.active) { + unsigned int cid = rq->curr->mm_cid.cid; + + /* Ensure it has the transition bit set */ + if (!cid_in_transit(cid)) { + cid = cid_to_transit_cid(cid); + rq->curr->mm_cid.cid = cid; + pcp->cid = cid; + } + } } + /* Clear the transition bit */ + WRITE_ONCE(mm->mm_cid.transit, 0); +} - /* - * The cid is unused, so it can be unset. - * Disable interrupts to keep the window of cid ownership without rq - * lock small. - */ - scoped_guard (irqsave) { - if (try_cmpxchg(&pcpu_cid->cid, &lazy_cid, MM_CID_UNSET)) - __mm_cid_put(mm, cid); +static inline void mm_cid_transfer_to_cpu(struct task_struct *t, struct mm_cid_pcpu *pcp) +{ + if (cid_on_task(t->mm_cid.cid)) { + t->mm_cid.cid = cid_to_cpu_cid(t->mm_cid.cid); + pcp->cid = t->mm_cid.cid; } } -static void sched_mm_cid_remote_clear_old(struct mm_struct *mm, int cpu) +static bool mm_cid_fixup_task_to_cpu(struct task_struct *t, struct mm_struct *mm) { - struct rq *rq = cpu_rq(cpu); - struct mm_cid *pcpu_cid; - struct task_struct *curr; - u64 rq_clock; + /* Remote access to mm::mm_cid::pcpu requires rq_lock */ + guard(task_rq_lock)(t); + /* If the task is not active it is not in the users count */ + if (!t->mm_cid.active) + return false; + if (cid_on_task(t->mm_cid.cid)) { + /* If running on the CPU, transfer the CID, otherwise drop it */ + if (task_rq(t)->curr == t) + mm_cid_transfer_to_cpu(t, per_cpu_ptr(mm->mm_cid.pcpu, task_cpu(t))); + else + mm_unset_cid_on_task(t); + } + return true; +} - /* - * rq->clock load is racy on 32-bit but one spurious clear once in a - * while is irrelevant. - */ - rq_clock = READ_ONCE(rq->clock); - pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu); +static void mm_cid_fixup_tasks_to_cpus(void) +{ + struct mm_struct *mm = current->mm; + struct task_struct *p, *t; + unsigned int users; /* - * In order to take care of infrequently scheduled tasks, bump the time - * snapshot associated with this cid if an active task using the mm is - * observed on this rq. + * This can obviously race with a concurrent affinity change, which + * increases the number of allowed CPUs for this mm, but that does + * not affect the mode and only changes the CID constraints. A + * possible switch back to per task mode happens either in the + * deferred handler function or in the next fork()/exit(). + * + * The caller has already transferred. The newly incoming task is + * already accounted for, but not yet visible. */ - scoped_guard (rcu) { - curr = rcu_dereference(rq->curr); - if (READ_ONCE(curr->mm_cid_active) && curr->mm == mm) { - WRITE_ONCE(pcpu_cid->time, rq_clock); - return; - } + users = mm->mm_cid.users - 2; + if (!users) + return; + + guard(rcu)(); + for_other_threads(current, t) { + if (mm_cid_fixup_task_to_cpu(t, mm)) + users--; } - if (rq_clock < pcpu_cid->time + SCHED_MM_CID_PERIOD_NS) + if (!users) return; - sched_mm_cid_remote_clear(mm, pcpu_cid, cpu); + + /* Happens only for VM_CLONE processes. */ + for_each_process_thread(p, t) { + if (t == current || t->mm != mm) + continue; + if (mm_cid_fixup_task_to_cpu(t, mm)) { + if (--users == 0) + return; + } + } } -static void sched_mm_cid_remote_clear_weight(struct mm_struct *mm, int cpu, - int weight) +static bool sched_mm_cid_add_user(struct task_struct *t, struct mm_struct *mm) { - struct mm_cid *pcpu_cid; - int cid; - - pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu); - cid = READ_ONCE(pcpu_cid->cid); - if (!mm_cid_is_valid(cid) || cid < weight) - return; - sched_mm_cid_remote_clear(mm, pcpu_cid, cpu); + t->mm_cid.active = 1; + mm->mm_cid.users++; + return mm_update_max_cids(mm); } -static void task_mm_cid_work(struct callback_head *work) +void sched_mm_cid_fork(struct task_struct *t) { - unsigned long now = jiffies, old_scan, next_scan; - struct task_struct *t = current; - struct cpumask *cidmask; - struct mm_struct *mm; - int weight, cpu; + struct mm_struct *mm = t->mm; + bool percpu; - WARN_ON_ONCE(t != container_of(work, struct task_struct, cid_work)); + WARN_ON_ONCE(!mm || t->mm_cid.cid != MM_CID_UNSET); - work->next = work; /* Prevent double-add */ - if (t->flags & PF_EXITING) - return; - mm = t->mm; - if (!mm) - return; - old_scan = READ_ONCE(mm->mm_cid_next_scan); - next_scan = now + msecs_to_jiffies(MM_CID_SCAN_DELAY); - if (!old_scan) { - unsigned long res; - - res = cmpxchg(&mm->mm_cid_next_scan, old_scan, next_scan); - if (res != old_scan) - old_scan = res; + guard(mutex)(&mm->mm_cid.mutex); + scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) { + struct mm_cid_pcpu *pcp = this_cpu_ptr(mm->mm_cid.pcpu); + + /* First user ? */ + if (!mm->mm_cid.users) { + sched_mm_cid_add_user(t, mm); + t->mm_cid.cid = mm_get_cid(mm); + /* Required for execve() */ + pcp->cid = t->mm_cid.cid; + return; + } + + if (!sched_mm_cid_add_user(t, mm)) { + if (!mm->mm_cid.percpu) + t->mm_cid.cid = mm_get_cid(mm); + return; + } + + /* Handle the mode change and transfer current's CID */ + percpu = !!mm->mm_cid.percpu; + if (!percpu) + mm_cid_transit_to_task(current, pcp); else - old_scan = next_scan; + mm_cid_transfer_to_cpu(current, pcp); } - if (time_before(now, old_scan)) - return; - if (!try_cmpxchg(&mm->mm_cid_next_scan, &old_scan, next_scan)) - return; - cidmask = mm_cidmask(mm); - /* Clear cids that were not recently used. */ - for_each_possible_cpu(cpu) - sched_mm_cid_remote_clear_old(mm, cpu); - weight = cpumask_weight(cidmask); - /* - * Clear cids that are greater or equal to the cidmask weight to - * recompact it. - */ - for_each_possible_cpu(cpu) - sched_mm_cid_remote_clear_weight(mm, cpu, weight); -} -void init_sched_mm_cid(struct task_struct *t) -{ - struct mm_struct *mm = t->mm; - int mm_users = 0; - - if (mm) { - mm_users = atomic_read(&mm->mm_users); - if (mm_users == 1) - mm->mm_cid_next_scan = jiffies + msecs_to_jiffies(MM_CID_SCAN_DELAY); + if (percpu) { + mm_cid_fixup_tasks_to_cpus(); + } else { + mm_cid_fixup_cpus_to_tasks(mm); + t->mm_cid.cid = mm_get_cid(mm); } - t->cid_work.next = &t->cid_work; /* Protect against double add */ - init_task_work(&t->cid_work, task_mm_cid_work); } -void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) +static bool sched_mm_cid_remove_user(struct task_struct *t) { - struct callback_head *work = &curr->cid_work; - unsigned long now = jiffies; - - if (!curr->mm || (curr->flags & (PF_EXITING | PF_KTHREAD)) || - work->next != work) - return; - if (time_before(now, READ_ONCE(curr->mm->mm_cid_next_scan))) - return; - - /* No page allocation under rq lock */ - task_work_add(curr, work, TWA_RESUME); + t->mm_cid.active = 0; + scoped_guard(preempt) { + /* Clear the transition bit */ + t->mm_cid.cid = cid_from_transit_cid(t->mm_cid.cid); + mm_unset_cid_on_task(t); + } + t->mm->mm_cid.users--; + return mm_update_max_cids(t->mm); } -void sched_mm_cid_exit_signals(struct task_struct *t) +static bool __sched_mm_cid_exit(struct task_struct *t) { struct mm_struct *mm = t->mm; - struct rq *rq; - if (!mm) - return; - - preempt_disable(); - rq = this_rq(); - guard(rq_lock_irqsave)(rq); - preempt_enable_no_resched(); /* holding spinlock */ - WRITE_ONCE(t->mm_cid_active, 0); + if (!sched_mm_cid_remove_user(t)) + return false; /* - * Store t->mm_cid_active before loading per-mm/cpu cid. - * Matches barrier in sched_mm_cid_remote_clear_old(). + * Contrary to fork() this only deals with a switch back to per + * task mode either because the above decreased users or an + * affinity change increased the number of allowed CPUs and the + * deferred fixup did not run yet. */ - smp_mb(); - mm_cid_put(mm); - t->last_mm_cid = t->mm_cid = -1; + if (WARN_ON_ONCE(mm->mm_cid.percpu)) + return false; + /* + * A failed fork(2) cleanup never gets here, so @current must have + * the same MM as @t. That's true for exit() and the failed + * pthread_create() cleanup case. + */ + if (WARN_ON_ONCE(current->mm != mm)) + return false; + return true; } -void sched_mm_cid_before_execve(struct task_struct *t) +/* + * When a task exits, the MM CID held by the task is not longer required as + * the task cannot return to user space. + */ +void sched_mm_cid_exit(struct task_struct *t) { struct mm_struct *mm = t->mm; - struct rq *rq; - if (!mm) + if (!mm || !t->mm_cid.active) return; + /* + * Ensure that only one instance is doing MM CID operations within + * a MM. The common case is uncontended. The rare fixup case adds + * some overhead. + */ + scoped_guard(mutex, &mm->mm_cid.mutex) { + /* mm_cid::mutex is sufficient to protect mm_cid::users */ + if (likely(mm->mm_cid.users > 1)) { + scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) { + if (!__sched_mm_cid_exit(t)) + return; + /* Mode change required. Transfer currents CID */ + mm_cid_transit_to_task(current, this_cpu_ptr(mm->mm_cid.pcpu)); + } + mm_cid_fixup_cpus_to_tasks(mm); + return; + } + /* Last user */ + scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) { + /* Required across execve() */ + if (t == current) + mm_cid_transit_to_task(t, this_cpu_ptr(mm->mm_cid.pcpu)); + /* Ignore mode change. There is nothing to do. */ + sched_mm_cid_remove_user(t); + } + } - preempt_disable(); - rq = this_rq(); - guard(rq_lock_irqsave)(rq); - preempt_enable_no_resched(); /* holding spinlock */ - WRITE_ONCE(t->mm_cid_active, 0); /* - * Store t->mm_cid_active before loading per-mm/cpu cid. - * Matches barrier in sched_mm_cid_remote_clear_old(). + * As this is the last user (execve(), process exit or failed + * fork(2)) there is no concurrency anymore. + * + * Synchronize eventually pending work to ensure that there are no + * dangling references left. @t->mm_cid.users is zero so nothing + * can queue this work anymore. */ - smp_mb(); - mm_cid_put(mm); - t->last_mm_cid = t->mm_cid = -1; + irq_work_sync(&mm->mm_cid.irq_work); + cancel_work_sync(&mm->mm_cid.work); +} + +/* Deactivate MM CID allocation across execve() */ +void sched_mm_cid_before_execve(struct task_struct *t) +{ + sched_mm_cid_exit(t); } +/* Reactivate MM CID after successful execve() */ void sched_mm_cid_after_execve(struct task_struct *t) { - struct mm_struct *mm = t->mm; - struct rq *rq; + sched_mm_cid_fork(t); +} - if (!mm) +static void mm_cid_work_fn(struct work_struct *work) +{ + struct mm_struct *mm = container_of(work, struct mm_struct, mm_cid.work); + + guard(mutex)(&mm->mm_cid.mutex); + /* Did the last user task exit already? */ + if (!mm->mm_cid.users) return; - preempt_disable(); - rq = this_rq(); - scoped_guard (rq_lock_irqsave, rq) { - preempt_enable_no_resched(); /* holding spinlock */ - WRITE_ONCE(t->mm_cid_active, 1); - /* - * Store t->mm_cid_active before loading per-mm/cpu cid. - * Matches barrier in sched_mm_cid_remote_clear_old(). - */ - smp_mb(); - t->last_mm_cid = t->mm_cid = mm_cid_get(rq, t, mm); + scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) { + /* Have fork() or exit() handled it already? */ + if (!mm->mm_cid.update_deferred) + return; + /* This clears mm_cid::update_deferred */ + if (!mm_update_max_cids(mm)) + return; + /* Affinity changes can only switch back to task mode */ + if (WARN_ON_ONCE(mm->mm_cid.percpu)) + return; } + mm_cid_fixup_cpus_to_tasks(mm); } -void sched_mm_cid_fork(struct task_struct *t) +static void mm_cid_irq_work(struct irq_work *work) { - WARN_ON_ONCE(!t->mm || t->mm_cid != -1); - t->mm_cid_active = 1; + struct mm_struct *mm = container_of(work, struct mm_struct, mm_cid.irq_work); + + /* + * Needs to be unconditional because mm_cid::lock cannot be held + * when scheduling work as mm_update_cpus_allowed() nests inside + * rq::lock and schedule_work() might end up in wakeup... + */ + schedule_work(&mm->mm_cid.work); } -#endif /* CONFIG_SCHED_MM_CID */ -#ifdef CONFIG_SCHED_CLASS_EXT -void sched_deq_and_put_task(struct task_struct *p, int queue_flags, - struct sched_enq_and_set_ctx *ctx) +void mm_init_cid(struct mm_struct *mm, struct task_struct *p) +{ + mm->mm_cid.max_cids = 0; + mm->mm_cid.percpu = 0; + mm->mm_cid.transit = 0; + mm->mm_cid.nr_cpus_allowed = p->nr_cpus_allowed; + mm->mm_cid.users = 0; + mm->mm_cid.pcpu_thrs = 0; + mm->mm_cid.update_deferred = 0; + raw_spin_lock_init(&mm->mm_cid.lock); + mutex_init(&mm->mm_cid.mutex); + mm->mm_cid.irq_work = IRQ_WORK_INIT_HARD(mm_cid_irq_work); + INIT_WORK(&mm->mm_cid.work, mm_cid_work_fn); + cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask); + bitmap_zero(mm_cidmask(mm), num_possible_cpus()); +} +#else /* CONFIG_SCHED_MM_CID */ +static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpumask *affmsk) { } +#endif /* !CONFIG_SCHED_MM_CID */ + +static DEFINE_PER_CPU(struct sched_change_ctx, sched_change_ctx); + +struct sched_change_ctx *sched_change_begin(struct task_struct *p, unsigned int flags) { + struct sched_change_ctx *ctx = this_cpu_ptr(&sched_change_ctx); struct rq *rq = task_rq(p); + /* + * Must exclusively use matched flags since this is both dequeue and + * enqueue. + */ + WARN_ON_ONCE(flags & 0xFFFF0000); + lockdep_assert_rq_held(rq); - *ctx = (struct sched_enq_and_set_ctx){ + if (!(flags & DEQUEUE_NOCLOCK)) { + update_rq_clock(rq); + flags |= DEQUEUE_NOCLOCK; + } + + if (flags & DEQUEUE_CLASS) { + if (p->sched_class->switching_from) + p->sched_class->switching_from(rq, p); + } + + *ctx = (struct sched_change_ctx){ .p = p, - .queue_flags = queue_flags, + .flags = flags, .queued = task_on_rq_queued(p), - .running = task_current(rq, p), + .running = task_current_donor(rq, p), }; - update_rq_clock(rq); + if (!(flags & DEQUEUE_CLASS)) { + if (p->sched_class->get_prio) + ctx->prio = p->sched_class->get_prio(rq, p); + else + ctx->prio = p->prio; + } + if (ctx->queued) - dequeue_task(rq, p, queue_flags | DEQUEUE_NOCLOCK); + dequeue_task(rq, p, flags); if (ctx->running) put_prev_task(rq, p); + + if ((flags & DEQUEUE_CLASS) && p->sched_class->switched_from) + p->sched_class->switched_from(rq, p); + + return ctx; } -void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx) +void sched_change_end(struct sched_change_ctx *ctx) { - struct rq *rq = task_rq(ctx->p); + struct task_struct *p = ctx->p; + struct rq *rq = task_rq(p); lockdep_assert_rq_held(rq); + if ((ctx->flags & ENQUEUE_CLASS) && p->sched_class->switching_to) + p->sched_class->switching_to(rq, p); + if (ctx->queued) - enqueue_task(rq, ctx->p, ctx->queue_flags | ENQUEUE_NOCLOCK); + enqueue_task(rq, p, ctx->flags); if (ctx->running) - set_next_task(rq, ctx->p); + set_next_task(rq, p); + + if (ctx->flags & ENQUEUE_CLASS) { + if (p->sched_class->switched_to) + p->sched_class->switched_to(rq, p); + } else { + p->sched_class->prio_changed(rq, p, ctx->prio); + } } -#endif /* CONFIG_SCHED_CLASS_EXT */ diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index cdd740b3f774..37b572cc8aca 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -166,12 +166,13 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, * cpudl_clear - remove a CPU from the cpudl max-heap * @cp: the cpudl max-heap context * @cpu: the target CPU + * @online: the online state of the deadline runqueue * * Notes: assumes cpu_rq(cpu)->lock is locked * * Returns: (void) */ -void cpudl_clear(struct cpudl *cp, int cpu) +void cpudl_clear(struct cpudl *cp, int cpu, bool online) { int old_idx, new_cpu; unsigned long flags; @@ -184,7 +185,7 @@ void cpudl_clear(struct cpudl *cp, int cpu) if (old_idx == IDX_INVALID) { /* * Nothing to remove if old_idx was invalid. - * This could happen if a rq_offline_dl is + * This could happen if rq_online_dl or rq_offline_dl is * called for a CPU without -dl tasks running. */ } else { @@ -195,9 +196,12 @@ void cpudl_clear(struct cpudl *cp, int cpu) cp->elements[new_cpu].idx = old_idx; cp->elements[cpu].idx = IDX_INVALID; cpudl_heapify(cp, old_idx); - - cpumask_set_cpu(cpu, cp->free_cpus); } + if (likely(online)) + __cpumask_set_cpu(cpu, cp->free_cpus); + else + __cpumask_clear_cpu(cpu, cp->free_cpus); + raw_spin_unlock_irqrestore(&cp->lock, flags); } @@ -228,7 +232,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl) cp->elements[new_idx].cpu = cpu; cp->elements[cpu].idx = new_idx; cpudl_heapify_up(cp, new_idx); - cpumask_clear_cpu(cpu, cp->free_cpus); + __cpumask_clear_cpu(cpu, cp->free_cpus); } else { cp->elements[old_idx].dl = dl; cpudl_heapify(cp, old_idx); @@ -238,26 +242,6 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl) } /* - * cpudl_set_freecpu - Set the cpudl.free_cpus - * @cp: the cpudl max-heap context - * @cpu: rd attached CPU - */ -void cpudl_set_freecpu(struct cpudl *cp, int cpu) -{ - cpumask_set_cpu(cpu, cp->free_cpus); -} - -/* - * cpudl_clear_freecpu - Clear the cpudl.free_cpus - * @cp: the cpudl max-heap context - * @cpu: rd attached CPU - */ -void cpudl_clear_freecpu(struct cpudl *cp, int cpu) -{ - cpumask_clear_cpu(cpu, cp->free_cpus); -} - -/* * cpudl_init - initialize the cpudl structure * @cp: the cpudl max-heap context */ diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h index 11c0f1faa7e1..d7699468eedd 100644 --- a/kernel/sched/cpudeadline.h +++ b/kernel/sched/cpudeadline.h @@ -19,8 +19,6 @@ struct cpudl { int cpudl_find(struct cpudl *cp, struct task_struct *p, struct cpumask *later_mask); void cpudl_set(struct cpudl *cp, int cpu, u64 dl); -void cpudl_clear(struct cpudl *cp, int cpu); +void cpudl_clear(struct cpudl *cp, int cpu, bool online); int cpudl_init(struct cpudl *cp); -void cpudl_set_freecpu(struct cpudl *cp, int cpu); -void cpudl_clear_freecpu(struct cpudl *cp, int cpu); void cpudl_cleanup(struct cpudl *cp); diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 7097de2c8cda..4f97896887ec 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -313,10 +313,8 @@ static u64 read_sum_exec_runtime(struct task_struct *t) void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) { struct signal_struct *sig = tsk->signal; - u64 utime, stime; struct task_struct *t; - unsigned int seq, nextseq; - unsigned long flags; + u64 utime, stime; /* * Update current task runtime to account pending time since last @@ -329,27 +327,19 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) if (same_thread_group(current, tsk)) (void) task_sched_runtime(current); - rcu_read_lock(); - /* Attempt a lockless read on the first round. */ - nextseq = 0; - do { - seq = nextseq; - flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq); + guard(rcu)(); + scoped_seqlock_read (&sig->stats_lock, ss_lock_irqsave) { times->utime = sig->utime; times->stime = sig->stime; times->sum_exec_runtime = sig->sum_sched_runtime; - for_each_thread(tsk, t) { + __for_each_thread(sig, t) { task_cputime(t, &utime, &stime); times->utime += utime; times->stime += stime; times->sum_exec_runtime += read_sum_exec_runtime(t); } - /* If lockless access failed, take the lock. */ - nextseq = 1; - } while (need_seqretry(&sig->stats_lock, seq)); - done_seqretry_irqrestore(&sig->stats_lock, seq, flags); - rcu_read_unlock(); + } } #ifdef CONFIG_IRQ_TIME_ACCOUNTING diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 7b7671060bf9..67f540c23717 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -125,20 +125,11 @@ static inline struct dl_bw *dl_bw_of(int i) static inline int dl_bw_cpus(int i) { struct root_domain *rd = cpu_rq(i)->rd; - int cpus; RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), "sched RCU must be held"); - if (cpumask_subset(rd->span, cpu_active_mask)) - return cpumask_weight(rd->span); - - cpus = 0; - - for_each_cpu_and(i, rd->span, cpu_active_mask) - cpus++; - - return cpus; + return cpumask_weight_and(rd->span, cpu_active_mask); } static inline unsigned long __dl_bw_capacity(const struct cpumask *mask) @@ -405,7 +396,7 @@ static void __dl_clear_params(struct sched_dl_entity *dl_se); * up, and checks if the task is still in the "ACTIVE non contending" * state or not (in the second case, it updates running_bw). */ -static void task_non_contending(struct sched_dl_entity *dl_se) +static void task_non_contending(struct sched_dl_entity *dl_se, bool dl_task) { struct hrtimer *timer = &dl_se->inactive_timer; struct rq *rq = rq_of_dl_se(dl_se); @@ -444,10 +435,10 @@ static void task_non_contending(struct sched_dl_entity *dl_se) } else { struct task_struct *p = dl_task_of(dl_se); - if (dl_task(p)) + if (dl_task) sub_running_bw(dl_se, dl_rq); - if (!dl_task(p) || READ_ONCE(p->__state) == TASK_DEAD) { + if (!dl_task || READ_ONCE(p->__state) == TASK_DEAD) { struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); if (READ_ONCE(p->__state) == TASK_DEAD) @@ -1166,8 +1157,17 @@ static enum hrtimer_restart dl_server_timer(struct hrtimer *timer, struct sched_ sched_clock_tick(); update_rq_clock(rq); - if (!dl_se->dl_runtime) + /* + * Make sure current has propagated its pending runtime into + * any relevant server through calling dl_server_update() and + * friends. + */ + rq->donor->sched_class->update_curr(rq); + + if (dl_se->dl_defer_idle) { + dl_server_stop(dl_se); return HRTIMER_NORESTART; + } if (dl_se->dl_defer_armed) { /* @@ -1416,10 +1416,11 @@ s64 dl_scaled_delta_exec(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta } static inline void -update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se, - int flags); +update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se, int flags); + static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec) { + bool idle = rq->curr == rq->idle; s64 scaled_delta_exec; if (unlikely(delta_exec <= 0)) { @@ -1440,6 +1441,9 @@ static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 dl_se->runtime -= scaled_delta_exec; + if (dl_se->dl_defer_idle && !idle) + dl_se->dl_defer_idle = 0; + /* * The fair server can consume its runtime while throttled (not queued/ * running as regular CFS). @@ -1450,6 +1454,29 @@ static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 */ if (dl_se->dl_defer && dl_se->dl_throttled && dl_runtime_exceeded(dl_se)) { /* + * Non-servers would never get time accounted while throttled. + */ + WARN_ON_ONCE(!dl_server(dl_se)); + + /* + * While the server is marked idle, do not push out the + * activation further, instead wait for the period timer + * to lapse and stop the server. + */ + if (dl_se->dl_defer_idle && idle) { + /* + * The timer is at the zero-laxity point, this means + * dl_server_stop() / dl_server_start() can happen + * while now < deadline. This means update_dl_entity() + * will not replenish. Additionally start_dl_timer() + * will be set for 'deadline - runtime'. Negative + * runtime will not do. + */ + dl_se->runtime = 0; + return; + } + + /* * If the server was previously activated - the starving condition * took place, it this point it went away because the fair scheduler * was able to get runtime in background. So return to the initial @@ -1461,6 +1488,9 @@ static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 replenish_dl_new_period(dl_se, dl_se->rq); + if (idle) + dl_se->dl_defer_idle = 1; + /* * Not being able to start the timer seems problematic. If it could not * be started for whatever reason, we need to "unthrottle" the DL server @@ -1543,38 +1573,213 @@ throttle: * as time available for the fair server, avoiding a penalty for the * rt scheduler that did not consumed that time. */ -void dl_server_update_idle_time(struct rq *rq, struct task_struct *p) +void dl_server_update_idle(struct sched_dl_entity *dl_se, s64 delta_exec) { - s64 delta_exec; - - if (!rq->fair_server.dl_defer) - return; - - /* no need to discount more */ - if (rq->fair_server.runtime < 0) - return; - - delta_exec = rq_clock_task(rq) - p->se.exec_start; - if (delta_exec < 0) - return; - - rq->fair_server.runtime -= delta_exec; - - if (rq->fair_server.runtime < 0) { - rq->fair_server.dl_defer_running = 0; - rq->fair_server.runtime = 0; - } - - p->se.exec_start = rq_clock_task(rq); + if (dl_se->dl_server_active && dl_se->dl_runtime && dl_se->dl_defer) + update_curr_dl_se(dl_se->rq, dl_se, delta_exec); } void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec) { /* 0 runtime = fair server disabled */ - if (dl_se->dl_runtime) + if (dl_se->dl_server_active && dl_se->dl_runtime) update_curr_dl_se(dl_se->rq, dl_se, delta_exec); } +/* + * dl_server && dl_defer: + * + * 6 + * +--------------------+ + * v | + * +-------------+ 4 +-----------+ 5 +------------------+ + * +-> | A:init | <--- | D:running | -----> | E:replenish-wait | + * | +-------------+ +-----------+ +------------------+ + * | | | 1 ^ ^ | + * | | 1 +----------+ | 3 | + * | v | | + * | +--------------------------------+ 2 | + * | | | ----+ | + * | 8 | B:zero_laxity-wait | | | + * | | | <---+ | + * | +--------------------------------+ | + * | | ^ ^ 2 | + * | | 7 | 2 +--------------------+ + * | v | + * | +-------------+ | + * +-- | C:idle-wait | -+ + * +-------------+ + * ^ 7 | + * +---------+ + * + * + * [A] - init + * dl_server_active = 0 + * dl_throttled = 0 + * dl_defer_armed = 0 + * dl_defer_running = 0/1 + * dl_defer_idle = 0 + * + * [B] - zero_laxity-wait + * dl_server_active = 1 + * dl_throttled = 1 + * dl_defer_armed = 1 + * dl_defer_running = 0 + * dl_defer_idle = 0 + * + * [C] - idle-wait + * dl_server_active = 1 + * dl_throttled = 1 + * dl_defer_armed = 1 + * dl_defer_running = 0 + * dl_defer_idle = 1 + * + * [D] - running + * dl_server_active = 1 + * dl_throttled = 0 + * dl_defer_armed = 0 + * dl_defer_running = 1 + * dl_defer_idle = 0 + * + * [E] - replenish-wait + * dl_server_active = 1 + * dl_throttled = 1 + * dl_defer_armed = 0 + * dl_defer_running = 1 + * dl_defer_idle = 0 + * + * + * [1] A->B, A->D + * dl_server_start() + * dl_server_active = 1; + * enqueue_dl_entity() + * update_dl_entity(WAKEUP) + * if (!dl_defer_running) + * dl_defer_armed = 1; + * dl_throttled = 1; + * if (dl_throttled && start_dl_timer()) + * return; // [B] + * __enqueue_dl_entity(); + * // [D] + * + * // deplete server runtime from client-class + * [2] B->B, C->B, E->B + * dl_server_update() + * update_curr_dl_se() // idle = false + * if (dl_defer_idle) + * dl_defer_idle = 0; + * if (dl_defer && dl_throttled && dl_runtime_exceeded()) + * dl_defer_running = 0; + * hrtimer_try_to_cancel(); // stop timer + * replenish_dl_new_period() + * // fwd period + * dl_throttled = 1; + * dl_defer_armed = 1; + * start_dl_timer(); // restart timer + * // [B] + * + * // timer actually fires means we have runtime + * [3] B->D + * dl_server_timer() + * if (dl_defer_armed) + * dl_defer_running = 1; + * enqueue_dl_entity(REPLENISH) + * replenish_dl_entity() + * // fwd period + * if (dl_throttled) + * dl_throttled = 0; + * if (dl_defer_armed) + * dl_defer_armed = 0; + * __enqueue_dl_entity(); + * // [D] + * + * // schedule server + * [4] D->A + * pick_task_dl() + * p = server_pick_task(); + * if (!p) + * dl_server_stop() + * dequeue_dl_entity(); + * hrtimer_try_to_cancel(); + * dl_defer_armed = 0; + * dl_throttled = 0; + * dl_server_active = 0; + * // [A] + * return p; + * + * // server running + * [5] D->E + * update_curr_dl_se() + * if (dl_runtime_exceeded()) + * dl_throttled = 1; + * dequeue_dl_entity(); + * start_dl_timer(); + * // [E] + * + * // server replenished + * [6] E->D + * dl_server_timer() + * enqueue_dl_entity(REPLENISH) + * replenish_dl_entity() + * fwd-period + * if (dl_throttled) + * dl_throttled = 0; + * __enqueue_dl_entity(); + * // [D] + * + * // deplete server runtime from idle + * [7] B->C, C->C + * dl_server_update_idle() + * update_curr_dl_se() // idle = true + * if (dl_defer && dl_throttled && dl_runtime_exceeded()) + * if (dl_defer_idle) + * return; + * dl_defer_running = 0; + * hrtimer_try_to_cancel(); + * replenish_dl_new_period() + * // fwd period + * dl_throttled = 1; + * dl_defer_armed = 1; + * dl_defer_idle = 1; + * start_dl_timer(); // restart timer + * // [C] + * + * // stop idle server + * [8] C->A + * dl_server_timer() + * if (dl_defer_idle) + * dl_server_stop(); + * // [A] + * + * + * digraph dl_server { + * "A:init" -> "B:zero_laxity-wait" [label="1:dl_server_start"] + * "A:init" -> "D:running" [label="1:dl_server_start"] + * "B:zero_laxity-wait" -> "B:zero_laxity-wait" [label="2:dl_server_update"] + * "B:zero_laxity-wait" -> "C:idle-wait" [label="7:dl_server_update_idle"] + * "B:zero_laxity-wait" -> "D:running" [label="3:dl_server_timer"] + * "C:idle-wait" -> "A:init" [label="8:dl_server_timer"] + * "C:idle-wait" -> "B:zero_laxity-wait" [label="2:dl_server_update"] + * "C:idle-wait" -> "C:idle-wait" [label="7:dl_server_update_idle"] + * "D:running" -> "A:init" [label="4:pick_task_dl"] + * "D:running" -> "E:replenish-wait" [label="5:update_curr_dl_se"] + * "E:replenish-wait" -> "B:zero_laxity-wait" [label="2:dl_server_update"] + * "E:replenish-wait" -> "D:running" [label="6:dl_server_timer"] + * } + * + * + * Notes: + * + * - When there are fair tasks running the most likely loop is [2]->[2]. + * the dl_server never actually runs, the timer never fires. + * + * - When there is actual fair starvation; the timer fires and starts the + * dl_server. This will then throttle and replenish like a normal DL + * task. Notably it will not 'defer' again. + * + * - When idle it will push the actication forward once, and then wait + * for the timer to hit or a non-idle update to restart things. + */ void dl_server_start(struct sched_dl_entity *dl_se) { struct rq *rq = dl_se->rq; @@ -1582,6 +1787,11 @@ void dl_server_start(struct sched_dl_entity *dl_se) if (!dl_server(dl_se) || dl_se->dl_server_active) return; + /* + * Update the current task to 'now'. + */ + rq->donor->sched_class->update_curr(rq); + if (WARN_ON_ONCE(!cpu_online(cpu_of(rq)))) return; @@ -1600,6 +1810,7 @@ void dl_server_stop(struct sched_dl_entity *dl_se) hrtimer_try_to_cancel(&dl_se->dl_timer); dl_se->dl_defer_armed = 0; dl_se->dl_throttled = 0; + dl_se->dl_defer_idle = 0; dl_se->dl_server_active = 0; } @@ -1811,7 +2022,7 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) if (!dl_rq->dl_nr_running) { dl_rq->earliest_dl.curr = 0; dl_rq->earliest_dl.next = 0; - cpudl_clear(&rq->rd->cpudl, rq->cpu); + cpudl_clear(&rq->rd->cpudl, rq->cpu, rq->online); cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr); } else { struct rb_node *leftmost = rb_first_cached(&dl_rq->root); @@ -2048,7 +2259,7 @@ static void dequeue_dl_entity(struct sched_dl_entity *dl_se, int flags) * or "inactive") */ if (flags & DEQUEUE_SLEEP) - task_non_contending(dl_se); + task_non_contending(dl_se, true); } static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) @@ -2143,7 +2354,7 @@ static void yield_task_dl(struct rq *rq) * it and the bandwidth timer will wake it up and will give it * new scheduling parameters (thanks to dl_yielded=1). */ - rq->curr->dl.dl_yielded = 1; + rq->donor->dl.dl_yielded = 1; update_rq_clock(rq); update_curr_dl(rq); @@ -2173,7 +2384,7 @@ select_task_rq_dl(struct task_struct *p, int cpu, int flags) struct rq *rq; if (!(flags & WF_TTWU)) - goto out; + return cpu; rq = cpu_rq(cpu); @@ -2211,7 +2422,6 @@ select_task_rq_dl(struct task_struct *p, int cpu, int flags) } rcu_read_unlock(); -out: return cpu; } @@ -2355,7 +2565,7 @@ static struct sched_dl_entity *pick_next_dl_entity(struct dl_rq *dl_rq) * __pick_next_task_dl - Helper to pick the next -deadline task to run. * @rq: The runqueue to pick the next task from. */ -static struct task_struct *__pick_task_dl(struct rq *rq) +static struct task_struct *__pick_task_dl(struct rq *rq, struct rq_flags *rf) { struct sched_dl_entity *dl_se; struct dl_rq *dl_rq = &rq->dl; @@ -2369,7 +2579,7 @@ again: WARN_ON_ONCE(!dl_se); if (dl_server(dl_se)) { - p = dl_se->server_pick_task(dl_se); + p = dl_se->server_pick_task(dl_se, rf); if (!p) { dl_server_stop(dl_se); goto again; @@ -2382,9 +2592,9 @@ again: return p; } -static struct task_struct *pick_task_dl(struct rq *rq) +static struct task_struct *pick_task_dl(struct rq *rq, struct rq_flags *rf) { - return __pick_task_dl(rq); + return __pick_task_dl(rq, rf); } static void put_prev_task_dl(struct rq *rq, struct task_struct *p, struct task_struct *next) @@ -2883,9 +3093,10 @@ static void rq_online_dl(struct rq *rq) if (rq->dl.overloaded) dl_set_overload(rq); - cpudl_set_freecpu(&rq->rd->cpudl, rq->cpu); if (rq->dl.dl_nr_running > 0) cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr); + else + cpudl_clear(&rq->rd->cpudl, rq->cpu, true); } /* Assumes rq->lock is held */ @@ -2894,8 +3105,7 @@ static void rq_offline_dl(struct rq *rq) if (rq->dl.overloaded) dl_clear_overload(rq); - cpudl_clear(&rq->rd->cpudl, rq->cpu); - cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu); + cpudl_clear(&rq->rd->cpudl, rq->cpu, false); } void __init init_sched_dl_class(void) @@ -2973,7 +3183,7 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) * will reset the task parameters. */ if (task_on_rq_queued(p) && p->dl.dl_runtime) - task_non_contending(&p->dl); + task_non_contending(&p->dl, false); /* * In case a task is setscheduled out from SCHED_DEADLINE we need to @@ -3045,23 +3255,24 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) } } +static u64 get_prio_dl(struct rq *rq, struct task_struct *p) +{ + return p->dl.deadline; +} + /* * If the scheduling parameters of a -deadline task changed, * a push or pull operation might be needed. */ -static void prio_changed_dl(struct rq *rq, struct task_struct *p, - int oldprio) +static void prio_changed_dl(struct rq *rq, struct task_struct *p, u64 old_deadline) { if (!task_on_rq_queued(p)) return; - /* - * This might be too much, but unfortunately - * we don't have the old deadline value, and - * we can't argue if the task is increasing - * or lowering its prio, so... - */ - if (!rq->dl.overloaded) + if (p->dl.deadline == old_deadline) + return; + + if (dl_time_before(old_deadline, p->dl.deadline)) deadline_queue_pull_task(rq); if (task_current_donor(rq, p)) { @@ -3094,6 +3305,8 @@ static int task_is_throttled_dl(struct task_struct *p, int cpu) DEFINE_SCHED_CLASS(dl) = { + .queue_mask = 8, + .enqueue_task = enqueue_task_dl, .dequeue_task = dequeue_task_dl, .yield_task = yield_task_dl, @@ -3116,6 +3329,7 @@ DEFINE_SCHED_CLASS(dl) = { .task_tick = task_tick_dl, .task_fork = task_fork_dl, + .get_prio = get_prio_dl, .prio_changed = prio_changed_dl, .switched_from = switched_from_dl, .switched_to = switched_to_dl, diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 02e16b70a790..41caa22e0680 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -796,7 +796,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) { - s64 left_vruntime = -1, min_vruntime, right_vruntime = -1, left_deadline = -1, spread; + s64 left_vruntime = -1, zero_vruntime, right_vruntime = -1, left_deadline = -1, spread; struct sched_entity *last, *first, *root; struct rq *rq = cpu_rq(cpu); unsigned long flags; @@ -819,15 +819,15 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) last = __pick_last_entity(cfs_rq); if (last) right_vruntime = last->vruntime; - min_vruntime = cfs_rq->min_vruntime; + zero_vruntime = cfs_rq->zero_vruntime; raw_spin_rq_unlock_irqrestore(rq, flags); SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "left_deadline", SPLIT_NS(left_deadline)); SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "left_vruntime", SPLIT_NS(left_vruntime)); - SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime", - SPLIT_NS(min_vruntime)); + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "zero_vruntime", + SPLIT_NS(zero_vruntime)); SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "avg_vruntime", SPLIT_NS(avg_vruntime(cfs_rq))); SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "right_vruntime", diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index ecb251e883ea..6827689a0966 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -25,7 +25,7 @@ static struct scx_sched __rcu *scx_root; * guarantee system safety. Maintain a dedicated task list which contains every * task between its fork and eventual free. */ -static DEFINE_SPINLOCK(scx_tasks_lock); +static DEFINE_RAW_SPINLOCK(scx_tasks_lock); static LIST_HEAD(scx_tasks); /* ops enable/disable */ @@ -476,7 +476,7 @@ static void scx_task_iter_start(struct scx_task_iter *iter) BUILD_BUG_ON(__SCX_DSQ_ITER_ALL_FLAGS & ((1U << __SCX_DSQ_LNODE_PRIV_SHIFT) - 1)); - spin_lock_irq(&scx_tasks_lock); + raw_spin_lock_irq(&scx_tasks_lock); iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR }; list_add(&iter->cursor.tasks_node, &scx_tasks); @@ -507,14 +507,14 @@ static void scx_task_iter_unlock(struct scx_task_iter *iter) __scx_task_iter_rq_unlock(iter); if (iter->list_locked) { iter->list_locked = false; - spin_unlock_irq(&scx_tasks_lock); + raw_spin_unlock_irq(&scx_tasks_lock); } } static void __scx_task_iter_maybe_relock(struct scx_task_iter *iter) { if (!iter->list_locked) { - spin_lock_irq(&scx_tasks_lock); + raw_spin_lock_irq(&scx_tasks_lock); iter->list_locked = true; } } @@ -1474,7 +1474,7 @@ static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags static void yield_task_scx(struct rq *rq) { struct scx_sched *sch = scx_root; - struct task_struct *p = rq->curr; + struct task_struct *p = rq->donor; if (SCX_HAS_OP(sch, yield)) SCX_CALL_OP_2TASKS_RET(sch, SCX_KF_REST, yield, rq, p, NULL); @@ -1485,7 +1485,7 @@ static void yield_task_scx(struct rq *rq) static bool yield_to_task_scx(struct rq *rq, struct task_struct *to) { struct scx_sched *sch = scx_root; - struct task_struct *from = rq->curr; + struct task_struct *from = rq->donor; if (SCX_HAS_OP(sch, yield)) return SCX_CALL_OP_2TASKS_RET(sch, SCX_KF_REST, yield, rq, @@ -2047,7 +2047,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev) lockdep_assert_rq_held(rq); rq->scx.flags |= SCX_RQ_IN_BALANCE; - rq->scx.flags &= ~(SCX_RQ_BAL_PENDING | SCX_RQ_BAL_KEEP); + rq->scx.flags &= ~SCX_RQ_BAL_KEEP; if ((sch->ops.flags & SCX_OPS_HAS_CPU_PREEMPT) && unlikely(rq->scx.cpu_released)) { @@ -2153,42 +2153,6 @@ has_tasks: return true; } -static int balance_scx(struct rq *rq, struct task_struct *prev, - struct rq_flags *rf) -{ - int ret; - - rq_unpin_lock(rq, rf); - - ret = balance_one(rq, prev); - -#ifdef CONFIG_SCHED_SMT - /* - * When core-sched is enabled, this ops.balance() call will be followed - * by pick_task_scx() on this CPU and the SMT siblings. Balance the - * siblings too. - */ - if (sched_core_enabled(rq)) { - const struct cpumask *smt_mask = cpu_smt_mask(cpu_of(rq)); - int scpu; - - for_each_cpu_andnot(scpu, smt_mask, cpumask_of(cpu_of(rq))) { - struct rq *srq = cpu_rq(scpu); - struct task_struct *sprev = srq->curr; - - WARN_ON_ONCE(__rq_lockp(rq) != __rq_lockp(srq)); - update_rq_clock(srq); - balance_one(srq, sprev); - } - } -#endif - rq_repin_lock(rq, rf); - - maybe_queue_balance_callback(rq); - - return ret; -} - static void process_ddsp_deferred_locals(struct rq *rq) { struct task_struct *p; @@ -2368,41 +2332,23 @@ static struct task_struct *first_local_task(struct rq *rq) struct task_struct, scx.dsq_list.node); } -static struct task_struct *pick_task_scx(struct rq *rq) +static struct task_struct *pick_task_scx(struct rq *rq, struct rq_flags *rf) { struct task_struct *prev = rq->curr; + bool keep_prev, kick_idle = false; struct task_struct *p; - bool keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP; - bool kick_idle = false; - /* - * WORKAROUND: - * - * %SCX_RQ_BAL_KEEP should be set iff $prev is on SCX as it must just - * have gone through balance_scx(). Unfortunately, there currently is a - * bug where fair could say yes on balance() but no on pick_task(), - * which then ends up calling pick_task_scx() without preceding - * balance_scx(). - * - * Keep running @prev if possible and avoid stalling from entering idle - * without balancing. - * - * Once fair is fixed, remove the workaround and trigger WARN_ON_ONCE() - * if pick_task_scx() is called without preceding balance_scx(). - */ - if (unlikely(rq->scx.flags & SCX_RQ_BAL_PENDING)) { - if (prev->scx.flags & SCX_TASK_QUEUED) { - keep_prev = true; - } else { - keep_prev = false; - kick_idle = true; - } - } else if (unlikely(keep_prev && - prev->sched_class != &ext_sched_class)) { - /* - * Can happen while enabling as SCX_RQ_BAL_PENDING assertion is - * conditional on scx_enabled() and may have been skipped. - */ + rq_modified_clear(rq); + rq_unpin_lock(rq, rf); + balance_one(rq, prev); + rq_repin_lock(rq, rf); + maybe_queue_balance_callback(rq); + if (rq_modified_above(rq, &ext_sched_class)) + return RETRY_TASK; + + keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP; + if (unlikely(keep_prev && + prev->sched_class != &ext_sched_class)) { WARN_ON_ONCE(scx_enable_state() == SCX_ENABLED); keep_prev = false; } @@ -2940,9 +2886,9 @@ void scx_post_fork(struct task_struct *p) } } - spin_lock_irq(&scx_tasks_lock); + raw_spin_lock_irq(&scx_tasks_lock); list_add_tail(&p->scx.tasks_node, &scx_tasks); - spin_unlock_irq(&scx_tasks_lock); + raw_spin_unlock_irq(&scx_tasks_lock); percpu_up_read(&scx_fork_rwsem); } @@ -2966,9 +2912,9 @@ void sched_ext_free(struct task_struct *p) { unsigned long flags; - spin_lock_irqsave(&scx_tasks_lock, flags); + raw_spin_lock_irqsave(&scx_tasks_lock, flags); list_del_init(&p->scx.tasks_node); - spin_unlock_irqrestore(&scx_tasks_lock, flags); + raw_spin_unlock_irqrestore(&scx_tasks_lock, flags); /* * @p is off scx_tasks and wholly ours. scx_enable()'s READY -> ENABLED @@ -2997,7 +2943,7 @@ static void reweight_task_scx(struct rq *rq, struct task_struct *p, p, p->scx.weight); } -static void prio_changed_scx(struct rq *rq, struct task_struct *p, int oldprio) +static void prio_changed_scx(struct rq *rq, struct task_struct *p, u64 oldprio) { } @@ -3270,6 +3216,8 @@ static void scx_cgroup_unlock(void) {} * their current sched_class. Call them directly from sched core instead. */ DEFINE_SCHED_CLASS(ext) = { + .queue_mask = 1, + .enqueue_task = enqueue_task_scx, .dequeue_task = dequeue_task_scx, .yield_task = yield_task_scx, @@ -3277,7 +3225,6 @@ DEFINE_SCHED_CLASS(ext) = { .wakeup_preempt = wakeup_preempt_scx, - .balance = balance_scx, .pick_task = pick_task_scx, .put_prev_task = put_prev_task_scx, @@ -3818,11 +3765,10 @@ static void scx_bypass(bool bypass) */ list_for_each_entry_safe_reverse(p, n, &rq->scx.runnable_list, scx.runnable_node) { - struct sched_enq_and_set_ctx ctx; - /* cycling deq/enq is enough, see the function comment */ - sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); - sched_enq_and_set_task(&ctx); + scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) { + /* nothing */ ; + } } /* resched to restore ticks and idle state */ @@ -3972,22 +3918,20 @@ static void scx_disable_workfn(struct kthread_work *work) scx_task_iter_start(&sti); while ((p = scx_task_iter_next_locked(&sti))) { + unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; const struct sched_class *old_class = p->sched_class; const struct sched_class *new_class = __setscheduler_class(p->policy, p->prio); - struct sched_enq_and_set_ctx ctx; - if (old_class != new_class && p->se.sched_delayed) - dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED); + update_rq_clock(task_rq(p)); - sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); + if (old_class != new_class) + queue_flags |= DEQUEUE_CLASS; - p->sched_class = new_class; - check_class_changing(task_rq(p), p, old_class); - - sched_enq_and_set_task(&ctx); + scoped_guard (sched_change, p, queue_flags) { + p->sched_class = new_class; + } - check_class_changed(task_rq(p), p, old_class, p->prio); scx_exit_task(p); } scx_task_iter_stop(&sti); @@ -4276,7 +4220,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) size_t avail, used; bool idle; - rq_lock(rq, &rf); + rq_lock_irqsave(rq, &rf); idle = list_empty(&rq->scx.runnable_list) && rq->curr->sched_class == &idle_sched_class; @@ -4345,7 +4289,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) scx_dump_task(&s, &dctx, p, ' '); next: - rq_unlock(rq, &rf); + rq_unlock_irqrestore(rq, &rf); } dump_newline(&s); @@ -4479,8 +4423,11 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops) goto err_free_gdsqs; sch->helper = kthread_run_worker(0, "sched_ext_helper"); - if (!sch->helper) + if (IS_ERR(sch->helper)) { + ret = PTR_ERR(sch->helper); goto err_free_pcpu; + } + sched_set_fifo(sch->helper->task); atomic_set(&sch->exit_kind, SCX_EXIT_NONE); @@ -4748,26 +4695,22 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) percpu_down_write(&scx_fork_rwsem); scx_task_iter_start(&sti); while ((p = scx_task_iter_next_locked(&sti))) { + unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE; const struct sched_class *old_class = p->sched_class; const struct sched_class *new_class = __setscheduler_class(p->policy, p->prio); - struct sched_enq_and_set_ctx ctx; if (!tryget_task_struct(p)) continue; - if (old_class != new_class && p->se.sched_delayed) - dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED); - - sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); - - p->scx.slice = SCX_SLICE_DFL; - p->sched_class = new_class; - check_class_changing(task_rq(p), p, old_class); + if (old_class != new_class) + queue_flags |= DEQUEUE_CLASS; - sched_enq_and_set_task(&ctx); + scoped_guard (sched_change, p, queue_flags) { + p->scx.slice = SCX_SLICE_DFL; + p->sched_class = new_class; + } - check_class_changed(task_rq(p), p, old_class, p->prio); put_task_struct(p); } scx_task_iter_stop(&sti); @@ -5321,8 +5264,8 @@ void __init init_sched_ext_class(void) BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL, n)); BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_preempt, GFP_KERNEL, n)); BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_wait, GFP_KERNEL, n)); - init_irq_work(&rq->scx.deferred_irq_work, deferred_irq_workfn); - init_irq_work(&rq->scx.kick_cpus_irq_work, kick_cpus_irq_workfn); + rq->scx.deferred_irq_work = IRQ_WORK_INIT_HARD(deferred_irq_workfn); + rq->scx.kick_cpus_irq_work = IRQ_WORK_INIT_HARD(kick_cpus_irq_workfn); if (cpu_online(cpu)) cpu_rq(cpu)->scx.flags |= SCX_RQ_ONLINE; @@ -6401,7 +6344,7 @@ __bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf) guard(rcu)(); - sch = rcu_dereference(sch); + sch = rcu_dereference(scx_root); if (unlikely(!sch)) return; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5b752324270b..769d7b7990df 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -554,7 +554,7 @@ static inline bool entity_before(const struct sched_entity *a, static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) { - return (s64)(se->vruntime - cfs_rq->min_vruntime); + return (s64)(se->vruntime - cfs_rq->zero_vruntime); } #define __node_2_se(node) \ @@ -606,13 +606,13 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) * * Which we track using: * - * v0 := cfs_rq->min_vruntime + * v0 := cfs_rq->zero_vruntime * \Sum (v_i - v0) * w_i := cfs_rq->avg_vruntime * \Sum w_i := cfs_rq->avg_load * - * Since min_vruntime is a monotonic increasing variable that closely tracks - * the per-task service, these deltas: (v_i - v), will be in the order of the - * maximal (virtual) lag induced in the system due to quantisation. + * Since zero_vruntime closely tracks the per-task service, these + * deltas: (v_i - v), will be in the order of the maximal (virtual) lag + * induced in the system due to quantisation. * * Also, we use scale_load_down() to reduce the size. * @@ -671,7 +671,7 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq) avg = div_s64(avg, load); } - return cfs_rq->min_vruntime + avg; + return cfs_rq->zero_vruntime + avg; } /* @@ -732,7 +732,7 @@ static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime) load += weight; } - return avg >= (s64)(vruntime - cfs_rq->min_vruntime) * load; + return avg >= (s64)(vruntime - cfs_rq->zero_vruntime) * load; } int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -740,42 +740,14 @@ int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se) return vruntime_eligible(cfs_rq, se->vruntime); } -static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime) +static void update_zero_vruntime(struct cfs_rq *cfs_rq) { - u64 min_vruntime = cfs_rq->min_vruntime; - /* - * open coded max_vruntime() to allow updating avg_vruntime - */ - s64 delta = (s64)(vruntime - min_vruntime); - if (delta > 0) { - avg_vruntime_update(cfs_rq, delta); - min_vruntime = vruntime; - } - return min_vruntime; -} + u64 vruntime = avg_vruntime(cfs_rq); + s64 delta = (s64)(vruntime - cfs_rq->zero_vruntime); -static void update_min_vruntime(struct cfs_rq *cfs_rq) -{ - struct sched_entity *se = __pick_root_entity(cfs_rq); - struct sched_entity *curr = cfs_rq->curr; - u64 vruntime = cfs_rq->min_vruntime; - - if (curr) { - if (curr->on_rq) - vruntime = curr->vruntime; - else - curr = NULL; - } - - if (se) { - if (!curr) - vruntime = se->min_vruntime; - else - vruntime = min_vruntime(vruntime, se->min_vruntime); - } + avg_vruntime_update(cfs_rq, delta); - /* ensure we never gain time by being placed backwards. */ - cfs_rq->min_vruntime = __update_min_vruntime(cfs_rq, vruntime); + cfs_rq->zero_vruntime = vruntime; } static inline u64 cfs_rq_min_slice(struct cfs_rq *cfs_rq) @@ -848,6 +820,7 @@ RB_DECLARE_CALLBACKS(static, min_vruntime_cb, struct sched_entity, static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { avg_vruntime_add(cfs_rq, se); + update_zero_vruntime(cfs_rq); se->min_vruntime = se->vruntime; se->min_slice = se->slice; rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline, @@ -859,6 +832,7 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline, &min_vruntime_cb); avg_vruntime_sub(cfs_rq, se); + update_zero_vruntime(cfs_rq); } struct sched_entity *__pick_root_entity(struct cfs_rq *cfs_rq) @@ -955,6 +929,16 @@ static struct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq, bool protect) if (cfs_rq->nr_queued == 1) return curr && curr->on_rq ? curr : se; + /* + * Picking the ->next buddy will affect latency but not fairness. + */ + if (sched_feat(PICK_BUDDY) && + cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) { + /* ->next will never be delayed */ + WARN_ON_ONCE(cfs_rq->next->sched_delayed); + return cfs_rq->next; + } + if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr))) curr = NULL; @@ -1193,6 +1177,8 @@ static s64 update_se(struct rq *rq, struct sched_entity *se) return delta_exec; } +static void set_next_buddy(struct sched_entity *se); + /* * Used by other classes to account runtime. */ @@ -1226,7 +1212,6 @@ static void update_curr(struct cfs_rq *cfs_rq) curr->vruntime += calc_delta_fair(delta_exec, curr); resched = update_deadline(cfs_rq, curr); - update_min_vruntime(cfs_rq); if (entity_is_task(curr)) { /* @@ -1239,8 +1224,7 @@ static void update_curr(struct cfs_rq *cfs_rq) * against fair_server such that it can account for this time * and possibly avoid running this period. */ - if (dl_server_active(&rq->fair_server)) - dl_server_update(&rq->fair_server, delta_exec); + dl_server_update(&rq->fair_server, delta_exec); } account_cfs_rq_runtime(cfs_rq, delta_exec); @@ -3808,15 +3792,6 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, if (!curr) __enqueue_entity(cfs_rq, se); cfs_rq->nr_queued++; - - /* - * The entity's vruntime has been adjusted, so let's check - * whether the rq-wide min_vruntime needs updated too. Since - * the calculations above require stable min_vruntime rather - * than up-to-date one, we do the update at the end of the - * reweight process. - */ - update_min_vruntime(cfs_rq); } } @@ -5429,15 +5404,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) update_cfs_group(se); - /* - * Now advance min_vruntime if @se was the entity holding it back, - * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be - * put back on, and if we advance min_vruntime, we'll be placed back - * further than we started -- i.e. we'll be penalized. - */ - if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE) - update_min_vruntime(cfs_rq); - if (flags & DEQUEUE_DELAYED) finish_delayed_dequeue_entity(se); @@ -5512,16 +5478,6 @@ pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq) { struct sched_entity *se; - /* - * Picking the ->next buddy will affect latency but not fairness. - */ - if (sched_feat(PICK_BUDDY) && - cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) { - /* ->next will never be delayed */ - WARN_ON_ONCE(cfs_rq->next->sched_delayed); - return cfs_rq->next; - } - se = pick_eevdf(cfs_rq); if (se->sched_delayed) { dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED); @@ -7003,12 +6959,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) h_nr_idle = 1; } - if (!rq_h_nr_queued && rq->cfs.h_nr_queued) { - /* Account for idle runtime */ - if (!rq->nr_running) - dl_server_update_idle_time(rq, rq->curr); + if (!rq_h_nr_queued && rq->cfs.h_nr_queued) dl_server_start(&rq->fair_server); - } /* At this point se is NULL and we are at root level*/ add_nr_running(rq, 1); @@ -7035,8 +6987,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) hrtick_update(rq); } -static void set_next_buddy(struct sched_entity *se); - /* * Basically dequeue_task_fair(), except it can deal with dequeue_entity() * failing half-way through and resume the dequeue later. @@ -8712,15 +8662,6 @@ static void set_cpus_allowed_fair(struct task_struct *p, struct affinity_context set_task_max_allowed_capacity(p); } -static int -balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) -{ - if (sched_fair_runnable(rq)) - return 1; - - return sched_balance_newidle(rq, rf) != 0; -} - static void set_next_buddy(struct sched_entity *se) { for_each_sched_entity(se) { @@ -8732,16 +8673,81 @@ static void set_next_buddy(struct sched_entity *se) } } +enum preempt_wakeup_action { + PREEMPT_WAKEUP_NONE, /* No preemption. */ + PREEMPT_WAKEUP_SHORT, /* Ignore slice protection. */ + PREEMPT_WAKEUP_PICK, /* Let __pick_eevdf() decide. */ + PREEMPT_WAKEUP_RESCHED, /* Force reschedule. */ +}; + +static inline bool +set_preempt_buddy(struct cfs_rq *cfs_rq, int wake_flags, + struct sched_entity *pse, struct sched_entity *se) +{ + /* + * Keep existing buddy if the deadline is sooner than pse. + * The older buddy may be cache cold and completely unrelated + * to the current wakeup but that is unpredictable where as + * obeying the deadline is more in line with EEVDF objectives. + */ + if (cfs_rq->next && entity_before(cfs_rq->next, pse)) + return false; + + set_next_buddy(pse); + return true; +} + +/* + * WF_SYNC|WF_TTWU indicates the waker expects to sleep but it is not + * strictly enforced because the hint is either misunderstood or + * multiple tasks must be woken up. + */ +static inline enum preempt_wakeup_action +preempt_sync(struct rq *rq, int wake_flags, + struct sched_entity *pse, struct sched_entity *se) +{ + u64 threshold, delta; + + /* + * WF_SYNC without WF_TTWU is not expected so warn if it happens even + * though it is likely harmless. + */ + WARN_ON_ONCE(!(wake_flags & WF_TTWU)); + + threshold = sysctl_sched_migration_cost; + delta = rq_clock_task(rq) - se->exec_start; + if ((s64)delta < 0) + delta = 0; + + /* + * WF_RQ_SELECTED implies the tasks are stacking on a CPU when they + * could run on other CPUs. Reduce the threshold before preemption is + * allowed to an arbitrary lower value as it is more likely (but not + * guaranteed) the waker requires the wakee to finish. + */ + if (wake_flags & WF_RQ_SELECTED) + threshold >>= 2; + + /* + * As WF_SYNC is not strictly obeyed, allow some runtime for batch + * wakeups to be issued. + */ + if (entity_before(pse, se) && delta >= threshold) + return PREEMPT_WAKEUP_RESCHED; + + return PREEMPT_WAKEUP_NONE; +} + /* * Preempt the current task with a newly woken task if needed: */ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int wake_flags) { + enum preempt_wakeup_action preempt_action = PREEMPT_WAKEUP_PICK; struct task_struct *donor = rq->donor; struct sched_entity *se = &donor->se, *pse = &p->se; struct cfs_rq *cfs_rq = task_cfs_rq(donor); int cse_is_idle, pse_is_idle; - bool do_preempt_short = false; if (unlikely(se == pse)) return; @@ -8755,10 +8761,6 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int if (task_is_throttled(p)) return; - if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK) && !pse->sched_delayed) { - set_next_buddy(pse); - } - /* * We can come here with TIF_NEED_RESCHED already set from new task * wake up path. @@ -8790,7 +8792,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int * When non-idle entity preempt an idle entity, * don't give idle entity slice protection. */ - do_preempt_short = true; + preempt_action = PREEMPT_WAKEUP_SHORT; goto preempt; } @@ -8809,27 +8811,74 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int * If @p has a shorter slice than current and @p is eligible, override * current's slice protection in order to allow preemption. */ - do_preempt_short = sched_feat(PREEMPT_SHORT) && (pse->slice < se->slice); + if (sched_feat(PREEMPT_SHORT) && (pse->slice < se->slice)) { + preempt_action = PREEMPT_WAKEUP_SHORT; + goto pick; + } /* + * Ignore wakee preemption on WF_FORK as it is less likely that + * there is shared data as exec often follow fork. Do not + * preempt for tasks that are sched_delayed as it would violate + * EEVDF to forcibly queue an ineligible task. + */ + if ((wake_flags & WF_FORK) || pse->sched_delayed) + return; + + /* + * If @p potentially is completing work required by current then + * consider preemption. + * + * Reschedule if waker is no longer eligible. */ + if (in_task() && !entity_eligible(cfs_rq, se)) { + preempt_action = PREEMPT_WAKEUP_RESCHED; + goto preempt; + } + + /* Prefer picking wakee soon if appropriate. */ + if (sched_feat(NEXT_BUDDY) && + set_preempt_buddy(cfs_rq, wake_flags, pse, se)) { + + /* + * Decide whether to obey WF_SYNC hint for a new buddy. Old + * buddies are ignored as they may not be relevant to the + * waker and less likely to be cache hot. + */ + if (wake_flags & WF_SYNC) + preempt_action = preempt_sync(rq, wake_flags, pse, se); + } + + switch (preempt_action) { + case PREEMPT_WAKEUP_NONE: + return; + case PREEMPT_WAKEUP_RESCHED: + goto preempt; + case PREEMPT_WAKEUP_SHORT: + fallthrough; + case PREEMPT_WAKEUP_PICK: + break; + } + +pick: + /* * If @p has become the most eligible task, force preemption. */ - if (__pick_eevdf(cfs_rq, !do_preempt_short) == pse) + if (__pick_eevdf(cfs_rq, preempt_action != PREEMPT_WAKEUP_SHORT) == pse) goto preempt; - if (sched_feat(RUN_TO_PARITY) && do_preempt_short) + if (sched_feat(RUN_TO_PARITY)) update_protect_slice(cfs_rq, se); return; preempt: - if (do_preempt_short) + if (preempt_action == PREEMPT_WAKEUP_SHORT) cancel_protect_slice(se); resched_curr_lazy(rq); } -static struct task_struct *pick_task_fair(struct rq *rq) +static struct task_struct *pick_task_fair(struct rq *rq, struct rq_flags *rf) { struct sched_entity *se; struct cfs_rq *cfs_rq; @@ -8873,7 +8922,7 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf int new_tasks; again: - p = pick_task_fair(rq); + p = pick_task_fair(rq, rf); if (!p) goto idle; se = &p->se; @@ -8952,14 +9001,10 @@ idle: return NULL; } -static struct task_struct *__pick_next_task_fair(struct rq *rq, struct task_struct *prev) +static struct task_struct * +fair_server_pick_task(struct sched_dl_entity *dl_se, struct rq_flags *rf) { - return pick_next_task_fair(rq, prev, NULL); -} - -static struct task_struct *fair_server_pick_task(struct sched_dl_entity *dl_se) -{ - return pick_task_fair(dl_se->rq); + return pick_task_fair(dl_se->rq, rf); } void fair_server_init(struct rq *rq) @@ -8990,7 +9035,7 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, struct t */ static void yield_task_fair(struct rq *rq) { - struct task_struct *curr = rq->curr; + struct task_struct *curr = rq->donor; struct cfs_rq *cfs_rq = task_cfs_rq(curr); struct sched_entity *se = &curr->se; @@ -9014,7 +9059,18 @@ static void yield_task_fair(struct rq *rq) */ rq_clock_skip_update(rq); - se->deadline += calc_delta_fair(se->slice, se); + /* + * Forfeit the remaining vruntime, only if the entity is eligible. This + * condition is necessary because in core scheduling we prefer to run + * ineligible tasks rather than force idling. If this happens we may + * end up in a loop where the core scheduler picks the yielding task, + * which yields immediately again; without the condition the vruntime + * ends up quickly running away. + */ + if (entity_eligible(cfs_rq, se)) { + se->vruntime = se->deadline; + se->deadline += calc_delta_fair(se->slice, se); + } } static bool yield_to_task_fair(struct rq *rq, struct task_struct *p) @@ -10678,7 +10734,7 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd, if (sd->flags & SD_ASYM_CPUCAPACITY) sgs->group_misfit_task_load = 1; - for_each_cpu(i, sched_group_span(group)) { + for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) { struct rq *rq = cpu_rq(i); unsigned int local; @@ -11730,6 +11786,21 @@ static void update_lb_imbalance_stat(struct lb_env *env, struct sched_domain *sd } /* + * This flag serializes load-balancing passes over large domains + * (above the NODE topology level) - only one load-balancing instance + * may run at a time, to reduce overhead on very large systems with + * lots of CPUs and large NUMA distances. + * + * - Note that load-balancing passes triggered while another one + * is executing are skipped and not re-tried. + * + * - Also note that this does not serialize rebalance_domains() + * execution, as non-SD_SERIALIZE domains will still be + * load-balanced in parallel. + */ +static atomic_t sched_balance_running = ATOMIC_INIT(0); + +/* * Check this_cpu to ensure it is balanced within domain. Attempt to move * tasks if there is an imbalance. */ @@ -11754,6 +11825,7 @@ static int sched_balance_rq(int this_cpu, struct rq *this_rq, .fbq_type = all, .tasks = LIST_HEAD_INIT(env.tasks), }; + bool need_unlock = false; cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask); @@ -11765,6 +11837,14 @@ redo: goto out_balanced; } + if (!need_unlock && (sd->flags & SD_SERIALIZE)) { + int zero = 0; + if (!atomic_try_cmpxchg_acquire(&sched_balance_running, &zero, 1)) + goto out_balanced; + + need_unlock = true; + } + group = sched_balance_find_src_group(&env); if (!group) { schedstat_inc(sd->lb_nobusyg[idle]); @@ -12005,6 +12085,9 @@ out_one_pinned: sd->balance_interval < sd->max_interval) sd->balance_interval *= 2; out: + if (need_unlock) + atomic_set_release(&sched_balance_running, 0); + return ld_moved; } @@ -12130,21 +12213,6 @@ out_unlock: } /* - * This flag serializes load-balancing passes over large domains - * (above the NODE topology level) - only one load-balancing instance - * may run at a time, to reduce overhead on very large systems with - * lots of CPUs and large NUMA distances. - * - * - Note that load-balancing passes triggered while another one - * is executing are skipped and not re-tried. - * - * - Also note that this does not serialize rebalance_domains() - * execution, as non-SD_SERIALIZE domains will still be - * load-balanced in parallel. - */ -static atomic_t sched_balance_running = ATOMIC_INIT(0); - -/* * Scale the max sched_balance_rq interval with the number of CPUs in the system. * This trades load-balance latency on larger machines for less cross talk. */ @@ -12153,30 +12221,43 @@ void update_max_interval(void) max_load_balance_interval = HZ*num_online_cpus()/10; } -static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost) +static inline void update_newidle_stats(struct sched_domain *sd, unsigned int success) { + sd->newidle_call++; + sd->newidle_success += success; + + if (sd->newidle_call >= 1024) { + sd->newidle_ratio = sd->newidle_success; + sd->newidle_call /= 2; + sd->newidle_success /= 2; + } +} + +static inline bool +update_newidle_cost(struct sched_domain *sd, u64 cost, unsigned int success) +{ + unsigned long next_decay = sd->last_decay_max_lb_cost + HZ; + unsigned long now = jiffies; + + if (cost) + update_newidle_stats(sd, success); + if (cost > sd->max_newidle_lb_cost) { /* * Track max cost of a domain to make sure to not delay the * next wakeup on the CPU. - * - * sched_balance_newidle() bumps the cost whenever newidle - * balance fails, and we don't want things to grow out of - * control. Use the sysctl_sched_migration_cost as the upper - * limit, plus a litle extra to avoid off by ones. */ - sd->max_newidle_lb_cost = - min(cost, sysctl_sched_migration_cost + 200); - sd->last_decay_max_lb_cost = jiffies; - } else if (time_after(jiffies, sd->last_decay_max_lb_cost + HZ)) { + sd->max_newidle_lb_cost = cost; + sd->last_decay_max_lb_cost = now; + + } else if (time_after(now, next_decay)) { /* * Decay the newidle max times by ~1% per second to ensure that * it is not outdated and the current max cost is actually * shorter. */ sd->max_newidle_lb_cost = (sd->max_newidle_lb_cost * 253) / 256; - sd->last_decay_max_lb_cost = jiffies; - + sd->last_decay_max_lb_cost = now; return true; } @@ -12199,7 +12280,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle) /* Earliest time when we have to do rebalance again */ unsigned long next_balance = jiffies + 60*HZ; int update_next_balance = 0; - int need_serialize, need_decay = 0; + int need_decay = 0; u64 max_cost = 0; rcu_read_lock(); @@ -12208,7 +12289,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle) * Decay the newidle max times here because this is a regular * visit to all the domains. */ - need_decay = update_newidle_cost(sd, 0); + need_decay = update_newidle_cost(sd, 0, 0); max_cost += sd->max_newidle_lb_cost; /* @@ -12223,13 +12304,6 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle) } interval = get_sd_balance_interval(sd, busy); - - need_serialize = sd->flags & SD_SERIALIZE; - if (need_serialize) { - if (atomic_cmpxchg_acquire(&sched_balance_running, 0, 1)) - goto out; - } - if (time_after_eq(jiffies, sd->last_balance + interval)) { if (sched_balance_rq(cpu, rq, sd, idle, &continue_balancing)) { /* @@ -12243,9 +12317,6 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle) sd->last_balance = jiffies; interval = get_sd_balance_interval(sd, busy); } - if (need_serialize) - atomic_set_release(&sched_balance_running, 0); -out: if (time_after(next_balance, sd->last_balance + interval)) { next_balance = sd->last_balance + interval; update_next_balance = 1; @@ -12824,18 +12895,21 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) rcu_read_lock(); sd = rcu_dereference_check_sched_domain(this_rq->sd); + if (!sd) { + rcu_read_unlock(); + goto out; + } if (!get_rd_overloaded(this_rq->rd) || - (sd && this_rq->avg_idle < sd->max_newidle_lb_cost)) { + this_rq->avg_idle < sd->max_newidle_lb_cost) { - if (sd) - update_next_balance(sd, &next_balance); + update_next_balance(sd, &next_balance); rcu_read_unlock(); - goto out; } rcu_read_unlock(); + rq_modified_clear(this_rq); raw_spin_rq_unlock(this_rq); t0 = sched_clock_cpu(this_cpu); @@ -12851,6 +12925,22 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) break; if (sd->flags & SD_BALANCE_NEWIDLE) { + unsigned int weight = 1; + + if (sched_feat(NI_RANDOM)) { + /* + * Throw a 1k sided dice; and only run + * newidle_balance according to the success + * rate. + */ + u32 d1k = sched_rng() % 1024; + weight = 1 + sd->newidle_ratio; + if (d1k > weight) { + update_newidle_stats(sd, 0); + continue; + } + weight = (1024 + weight/2) / weight; + } pulled_task = sched_balance_rq(this_cpu, this_rq, sd, CPU_NEWLY_IDLE, @@ -12862,13 +12952,10 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) t0 = t1; /* - * Failing newidle means it is not effective; - * bump the cost so we end up doing less of it. + * Track max cost of a domain to make sure to not delay the + * next wakeup on the CPU. */ - if (!pulled_task) - domain_cost = (3 * sd->max_newidle_lb_cost) / 2; - - update_newidle_cost(sd, domain_cost); + update_newidle_cost(sd, domain_cost, weight * !!pulled_task); } /* @@ -12893,8 +12980,8 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) if (this_rq->cfs.h_nr_queued && !pulled_task) pulled_task = 1; - /* Is there a task of a high priority class? */ - if (this_rq->nr_running != this_rq->cfs.h_nr_queued) + /* If a higher prio class was modified, restart the pick */ + if (rq_modified_above(this_rq, &fair_sched_class)) pulled_task = -1; out: @@ -13012,7 +13099,170 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr) } /* - * se_fi_update - Update the cfs_rq->min_vruntime_fi in a CFS hierarchy if needed. + * Consider any infeasible weight scenario. Take for instance two tasks, + * each bound to their respective sibling, one with weight 1 and one with + * weight 2. Then the lower weight task will run ahead of the higher weight + * task without bound. + * + * This utterly destroys the concept of a shared time base. + * + * Remember; all this is about a proportionally fair scheduling, where each + * tasks receives: + * + * w_i + * dt_i = ---------- dt (1) + * \Sum_j w_j + * + * which we do by tracking a virtual time, s_i: + * + * 1 + * s_i = --- d[t]_i (2) + * w_i + * + * Where d[t] is a delta of discrete time, while dt is an infinitesimal. + * The immediate corollary is that the ideal schedule S, where (2) to use + * an infinitesimal delta, is: + * + * 1 + * S = ---------- dt (3) + * \Sum_i w_i + * + * From which we can define the lag, or deviation from the ideal, as: + * + * lag(i) = S - s_i (4) + * + * And since the one and only purpose is to approximate S, we get that: + * + * \Sum_i w_i lag(i) := 0 (5) + * + * If this were not so, we no longer converge to S, and we can no longer + * claim our scheduler has any of the properties we derive from S. This is + * exactly what you did above, you broke it! + * + * + * Let's continue for a while though; to see if there is anything useful to + * be learned. We can combine (1)-(3) or (4)-(5) and express S in s_i: + * + * \Sum_i w_i s_i + * S = -------------- (6) + * \Sum_i w_i + * + * Which gives us a way to compute S, given our s_i. Now, if you've read + * our code, you know that we do not in fact do this, the reason for this + * is two-fold. Firstly, computing S in that way requires a 64bit division + * for every time we'd use it (see 12), and secondly, this only describes + * the steady-state, it doesn't handle dynamics. + * + * Anyway, in (6): s_i -> x + (s_i - x), to get: + * + * \Sum_i w_i (s_i - x) + * S - x = -------------------- (7) + * \Sum_i w_i + * + * Which shows that S and s_i transform alike (which makes perfect sense + * given that S is basically the (weighted) average of s_i). + * + * So the thing to remember is that the above is strictly UP. It is + * possible to generalize to multiple runqueues -- however it gets really + * yuck when you have to add affinity support, as illustrated by our very + * first counter-example. + * + * Luckily I think we can avoid needing a full multi-queue variant for + * core-scheduling (or load-balancing). The crucial observation is that we + * only actually need this comparison in the presence of forced-idle; only + * then do we need to tell if the stalled rq has higher priority over the + * other. + * + * [XXX assumes SMT2; better consider the more general case, I suspect + * it'll work out because our comparison is always between 2 rqs and the + * answer is only interesting if one of them is forced-idle] + * + * And (under assumption of SMT2) when there is forced-idle, there is only + * a single queue, so everything works like normal. + * + * Let, for our runqueue 'k': + * + * T_k = \Sum_i w_i s_i + * W_k = \Sum_i w_i ; for all i of k (8) + * + * Then we can write (6) like: + * + * T_k + * S_k = --- (9) + * W_k + * + * From which immediately follows that: + * + * T_k + T_l + * S_k+l = --------- (10) + * W_k + W_l + * + * On which we can define a combined lag: + * + * lag_k+l(i) := S_k+l - s_i (11) + * + * And that gives us the tools to compare tasks across a combined runqueue. + * + * + * Combined this gives the following: + * + * a) when a runqueue enters force-idle, sync it against it's sibling rq(s) + * using (7); this only requires storing single 'time'-stamps. + * + * b) when comparing tasks between 2 runqueues of which one is forced-idle, + * compare the combined lag, per (11). + * + * Now, of course cgroups (I so hate them) make this more interesting in + * that a) seems to suggest we need to iterate all cgroup on a CPU at such + * boundaries, but I think we can avoid that. The force-idle is for the + * whole CPU, all it's rqs. So we can mark it in the root and lazily + * propagate downward on demand. + */ + +/* + * So this sync is basically a relative reset of S to 0. + * + * So with 2 queues, when one goes idle, we drop them both to 0 and one + * then increases due to not being idle, and the idle one builds up lag to + * get re-elected. So far so simple, right? + * + * When there's 3, we can have the situation where 2 run and one is idle, + * we sync to 0 and let the idle one build up lag to get re-election. Now + * suppose another one also drops idle. At this point dropping all to 0 + * again would destroy the built-up lag from the queue that was already + * idle, not good. + * + * So instead of syncing everything, we can: + * + * less := !((s64)(s_a - s_b) <= 0) + * + * (v_a - S_a) - (v_b - S_b) == v_a - v_b - S_a + S_b + * == v_a - (v_b - S_a + S_b) + * + * IOW, we can recast the (lag) comparison to a one-sided difference. + * So if then, instead of syncing the whole queue, sync the idle queue + * against the active queue with S_a + S_b at the point where we sync. + * + * (XXX consider the implication of living in a cyclic group: N / 2^n N) + * + * This gives us means of syncing single queues against the active queue, + * and for already idle queues to preserve their build-up lag. + * + * Of course, then we get the situation where there's 2 active and one + * going idle, who do we pick to sync against? Theory would have us sync + * against the combined S, but as we've already demonstrated, there is no + * such thing in infeasible weight scenarios. + * + * One thing I've considered; and this is where that core_active rudiment + * came from, is having active queues sync up between themselves after + * every tick. This limits the observed divergence due to the work + * conservancy. + * + * On top of that, we can improve upon things by employing (10) here. + */ + +/* + * se_fi_update - Update the cfs_rq->zero_vruntime_fi in a CFS hierarchy if needed. */ static void se_fi_update(const struct sched_entity *se, unsigned int fi_seq, bool forceidle) @@ -13026,7 +13276,7 @@ static void se_fi_update(const struct sched_entity *se, unsigned int fi_seq, cfs_rq->forceidle_seq = fi_seq; } - cfs_rq->min_vruntime_fi = cfs_rq->min_vruntime; + cfs_rq->zero_vruntime_fi = cfs_rq->zero_vruntime; } } @@ -13079,11 +13329,11 @@ bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b, /* * Find delta after normalizing se's vruntime with its cfs_rq's - * min_vruntime_fi, which would have been updated in prior calls + * zero_vruntime_fi, which would have been updated in prior calls * to se_fi_update(). */ delta = (s64)(sea->vruntime - seb->vruntime) + - (s64)(cfs_rqb->min_vruntime_fi - cfs_rqa->min_vruntime_fi); + (s64)(cfs_rqb->zero_vruntime_fi - cfs_rqa->zero_vruntime_fi); return delta > 0; } @@ -13145,11 +13395,14 @@ static void task_fork_fair(struct task_struct *p) * the current task. */ static void -prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) +prio_changed_fair(struct rq *rq, struct task_struct *p, u64 oldprio) { if (!task_on_rq_queued(p)) return; + if (p->prio == oldprio) + return; + if (rq->cfs.nr_queued == 1) return; @@ -13161,8 +13414,9 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) if (task_current_donor(rq, p)) { if (p->prio > oldprio) resched_curr(rq); - } else + } else { wakeup_preempt(rq, p, 0); + } } #ifdef CONFIG_FAIR_GROUP_SCHED @@ -13246,6 +13500,12 @@ static void attach_task_cfs_rq(struct task_struct *p) attach_entity_cfs_rq(se); } +static void switching_from_fair(struct rq *rq, struct task_struct *p) +{ + if (p->se.sched_delayed) + dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK); +} + static void switched_from_fair(struct rq *rq, struct task_struct *p) { detach_task_cfs_rq(p); @@ -13319,7 +13579,7 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) void init_cfs_rq(struct cfs_rq *cfs_rq) { cfs_rq->tasks_timeline = RB_ROOT_CACHED; - cfs_rq->min_vruntime = (u64)(-(1LL << 20)); + cfs_rq->zero_vruntime = (u64)(-(1LL << 20)); raw_spin_lock_init(&cfs_rq->removed.lock); } @@ -13620,6 +13880,8 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task */ DEFINE_SCHED_CLASS(fair) = { + .queue_mask = 2, + .enqueue_task = enqueue_task_fair, .dequeue_task = dequeue_task_fair, .yield_task = yield_task_fair, @@ -13628,11 +13890,10 @@ DEFINE_SCHED_CLASS(fair) = { .wakeup_preempt = check_preempt_wakeup_fair, .pick_task = pick_task_fair, - .pick_next_task = __pick_next_task_fair, + .pick_next_task = pick_next_task_fair, .put_prev_task = put_prev_task_fair, .set_next_task = set_next_task_fair, - .balance = balance_fair, .select_task_rq = select_task_rq_fair, .migrate_task_rq = migrate_task_rq_fair, @@ -13647,6 +13908,7 @@ DEFINE_SCHED_CLASS(fair) = { .reweight_task = reweight_task_fair, .prio_changed = prio_changed_fair, + .switching_from = switching_from_fair, .switched_from = switched_from_fair, .switched_to = switched_to_fair, diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 3c12d9f93331..980d92bab8ab 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -29,7 +29,7 @@ SCHED_FEAT(PREEMPT_SHORT, true) * wakeup-preemption), since its likely going to consume data we * touched, increases cache locality. */ -SCHED_FEAT(NEXT_BUDDY, false) +SCHED_FEAT(NEXT_BUDDY, true) /* * Allow completely ignoring cfs_rq->next; which can be set from various @@ -121,3 +121,8 @@ SCHED_FEAT(WA_BIAS, true) SCHED_FEAT(UTIL_EST, true) SCHED_FEAT(LATENCY_WARN, false) + +/* + * Do newidle balancing proportional to its success rate using randomization. + */ +SCHED_FEAT(NI_RANDOM, true) diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index c39b089d4f09..c174afe1dd17 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -131,12 +131,13 @@ void __cpuidle default_idle_call(void) } static int call_cpuidle_s2idle(struct cpuidle_driver *drv, - struct cpuidle_device *dev) + struct cpuidle_device *dev, + u64 max_latency_ns) { if (current_clr_polling_and_test()) return -EBUSY; - return cpuidle_enter_s2idle(drv, dev); + return cpuidle_enter_s2idle(drv, dev, max_latency_ns); } static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev, @@ -205,12 +206,13 @@ static void cpuidle_idle_call(void) u64 max_latency_ns; if (idle_should_enter_s2idle()) { + max_latency_ns = cpu_wakeup_latency_qos_limit() * + NSEC_PER_USEC; - entered_state = call_cpuidle_s2idle(drv, dev); + entered_state = call_cpuidle_s2idle(drv, dev, + max_latency_ns); if (entered_state > 0) goto exit_idle; - - max_latency_ns = U64_MAX; } else { max_latency_ns = dev->forced_idle_latency_limit_ns; } @@ -452,9 +454,11 @@ static void wakeup_preempt_idle(struct rq *rq, struct task_struct *p, int flags) resched_curr(rq); } +static void update_curr_idle(struct rq *rq); + static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, struct task_struct *next) { - dl_server_update_idle_time(rq, prev); + update_curr_idle(rq); scx_update_idle(rq, false, true); } @@ -466,7 +470,7 @@ static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool fir next->se.exec_start = rq_clock_task(rq); } -struct task_struct *pick_task_idle(struct rq *rq) +struct task_struct *pick_task_idle(struct rq *rq, struct rq_flags *rf) { scx_update_idle(rq, true, false); return rq->idle; @@ -496,21 +500,36 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags) */ static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) { + update_curr_idle(rq); } -static void switched_to_idle(struct rq *rq, struct task_struct *p) +static void switching_to_idle(struct rq *rq, struct task_struct *p) { BUG(); } static void -prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio) +prio_changed_idle(struct rq *rq, struct task_struct *p, u64 oldprio) { + if (p->prio == oldprio) + return; + BUG(); } static void update_curr_idle(struct rq *rq) { + struct sched_entity *se = &rq->idle->se; + u64 now = rq_clock_task(rq); + s64 delta_exec; + + delta_exec = now - se->exec_start; + if (unlikely(delta_exec <= 0)) + return; + + se->exec_start = now; + + dl_server_update_idle(&rq->fair_server, delta_exec); } /* @@ -518,6 +537,8 @@ static void update_curr_idle(struct rq *rq) */ DEFINE_SCHED_CLASS(idle) = { + .queue_mask = 0, + /* no enqueue/yield_task for idle tasks */ /* dequeue is not valid, we print a debug message there: */ @@ -536,6 +557,6 @@ DEFINE_SCHED_CLASS(idle) = { .task_tick = task_tick_idle, .prio_changed = prio_changed_idle, - .switched_to = switched_to_idle, + .switching_to = switching_to_idle, .update_curr = update_curr_idle, }; diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index a4cf17b1fab0..3ad0d6df6a0a 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -167,6 +167,29 @@ static int __init housekeeping_setup(char *str, unsigned long flags) } } + /* + * Check the combination of nohz_full and isolcpus=domain, + * necessary to avoid problems with the timer migration + * hierarchy. managed_irq is ignored by this check since it + * isn't considered in the timer migration logic. + */ + iter_flags = housekeeping.flags & (HK_FLAG_KERNEL_NOISE | HK_FLAG_DOMAIN); + type = find_first_bit(&iter_flags, HK_TYPE_MAX); + /* + * Pass the check if none of these flags were previously set or + * are not in the current selection. + */ + iter_flags = flags & (HK_FLAG_KERNEL_NOISE | HK_FLAG_DOMAIN); + first_cpu = (type == HK_TYPE_MAX || !iter_flags) ? 0 : + cpumask_first_and_and(cpu_present_mask, + housekeeping_staging, housekeeping.cpumasks[type]); + if (first_cpu >= min(nr_cpu_ids, setup_max_cpus)) { + pr_warn("Housekeeping: must include one present CPU " + "neither in nohz_full= nor in isolcpus=domain, " + "ignoring setting %s\n", str); + goto free_housekeeping_staging; + } + iter_flags = flags & ~housekeeping.flags; for_each_set_bit(type, &iter_flags, HK_TYPE_MAX) diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index 62fba83b7bb1..623445603725 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -199,7 +199,7 @@ static void ipi_rseq(void *info) * is negligible. */ smp_mb(); - rseq_preempt(current); + rseq_sched_switch_event(current); } static void ipi_sync_rq_state(void *info) @@ -407,9 +407,9 @@ static int membarrier_private_expedited(int flags, int cpu_id) * membarrier, we will end up with some thread in the mm * running without a core sync. * - * For RSEQ, don't rseq_preempt() the caller. User code - * is not supposed to issue syscalls at all from inside an - * rseq critical section. + * For RSEQ, don't invoke rseq_sched_switch_event() on the + * caller. User code is not supposed to issue syscalls at + * all from inside an rseq critical section. */ if (flags != MEMBARRIER_FLAG_SYNC_CORE) { preempt_disable(); diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 7936d4333731..f1867fe8e5c5 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1490,7 +1490,7 @@ static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head) static void yield_task_rt(struct rq *rq) { - requeue_task_rt(rq, rq->curr, 0); + requeue_task_rt(rq, rq->donor, 0); } static int find_lowest_rq(struct task_struct *task); @@ -1695,7 +1695,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq) return rt_task_of(rt_se); } -static struct task_struct *pick_task_rt(struct rq *rq) +static struct task_struct *pick_task_rt(struct rq *rq, struct rq_flags *rf) { struct task_struct *p; @@ -2437,11 +2437,14 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) * us to initiate a push or pull. */ static void -prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) +prio_changed_rt(struct rq *rq, struct task_struct *p, u64 oldprio) { if (!task_on_rq_queued(p)) return; + if (p->prio == oldprio) + return; + if (task_current_donor(rq, p)) { /* * If our priority decreases while running, we @@ -2566,6 +2569,8 @@ static int task_is_throttled_rt(struct task_struct *p, int cpu) DEFINE_SCHED_CLASS(rt) = { + .queue_mask = 4, + .enqueue_task = enqueue_task_rt, .dequeue_task = dequeue_task_rt, .yield_task = yield_task_rt, @@ -2589,8 +2594,8 @@ DEFINE_SCHED_CLASS(rt) = { .get_rr_interval = get_rr_interval_rt, - .prio_changed = prio_changed_rt, .switched_to = switched_to_rt, + .prio_changed = prio_changed_rt, .update_curr = update_curr_rt, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index adfb6e3409d7..8590113e4a60 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -5,6 +5,7 @@ #ifndef _KERNEL_SCHED_SCHED_H #define _KERNEL_SCHED_SCHED_H +#include <linux/prandom.h> #include <linux/sched/affinity.h> #include <linux/sched/autogroup.h> #include <linux/sched/cpufreq.h> @@ -20,7 +21,6 @@ #include <linux/sched/task_flags.h> #include <linux/sched/task.h> #include <linux/sched/topology.h> - #include <linux/atomic.h> #include <linux/bitmap.h> #include <linux/bug.h> @@ -405,6 +405,7 @@ extern s64 dl_scaled_delta_exec(struct rq *rq, struct sched_dl_entity *dl_se, s6 * naturally thottled to once per period, avoiding high context switch * workloads from spamming the hrtimer program/cancel paths. */ +extern void dl_server_update_idle(struct sched_dl_entity *dl_se, s64 delta_exec); extern void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec); extern void dl_server_start(struct sched_dl_entity *dl_se); extern void dl_server_stop(struct sched_dl_entity *dl_se); @@ -412,8 +413,6 @@ extern void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq, dl_server_pick_f pick_task); extern void sched_init_dl_servers(void); -extern void dl_server_update_idle_time(struct rq *rq, - struct task_struct *p); extern void fair_server_init(struct rq *rq); extern void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq); extern int dl_server_apply_params(struct sched_dl_entity *dl_se, @@ -682,10 +681,10 @@ struct cfs_rq { s64 avg_vruntime; u64 avg_load; - u64 min_vruntime; + u64 zero_vruntime; #ifdef CONFIG_SCHED_CORE unsigned int forceidle_seq; - u64 min_vruntime_fi; + u64 zero_vruntime_fi; #endif struct rb_root_cached tasks_timeline; @@ -780,7 +779,6 @@ enum scx_rq_flags { */ SCX_RQ_ONLINE = 1 << 0, SCX_RQ_CAN_STOP_TICK = 1 << 1, - SCX_RQ_BAL_PENDING = 1 << 2, /* balance hasn't run yet */ SCX_RQ_BAL_KEEP = 1 << 3, /* balance decided to keep current */ SCX_RQ_BYPASSING = 1 << 4, SCX_RQ_CLK_VALID = 1 << 5, /* RQ clock is fresh and valid */ @@ -1120,6 +1118,8 @@ struct rq { /* runqueue lock: */ raw_spinlock_t __lock; + /* Per class runqueue modification mask; bits in class order. */ + unsigned int queue_mask; unsigned int nr_running; #ifdef CONFIG_NUMA_BALANCING unsigned int nr_numa_running; @@ -1349,6 +1349,12 @@ static inline bool is_migration_disabled(struct task_struct *p) } DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); +DECLARE_PER_CPU(struct rnd_state, sched_rnd_state); + +static inline u32 sched_rng(void) +{ + return prandom_u32_state(this_cpu_ptr(&sched_rnd_state)); +} #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) #define this_rq() this_cpu_ptr(&runqueues) @@ -1432,6 +1438,9 @@ static inline bool sched_core_cookie_match(struct rq *rq, struct task_struct *p) if (!sched_core_enabled(rq)) return true; + if (rq->core->core_cookie == p->core_cookie) + return true; + for_each_cpu(cpu, cpu_smt_mask(cpu_of(rq))) { if (!available_idle_cpu(cpu)) { idle_core = false; @@ -1443,7 +1452,7 @@ static inline bool sched_core_cookie_match(struct rq *rq, struct task_struct *p) * A CPU in an idle core is always the best choice for tasks with * cookies. */ - return idle_core || rq->core->core_cookie == p->core_cookie; + return idle_core; } static inline bool sched_group_cookie_match(struct rq *rq, @@ -1827,7 +1836,8 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) __acquires(p->pi_lock) __acquires(rq->lock); -static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf) +static inline void +__task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) __releases(rq->lock) { rq_unpin_lock(rq, rf); @@ -1839,8 +1849,7 @@ task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) __releases(rq->lock) __releases(p->pi_lock) { - rq_unpin_lock(rq, rf); - raw_spin_rq_unlock(rq); + __task_rq_unlock(rq, p, rf); raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); } @@ -1849,6 +1858,11 @@ DEFINE_LOCK_GUARD_1(task_rq_lock, struct task_struct, task_rq_unlock(_T->rq, _T->lock, &_T->rf), struct rq *rq; struct rq_flags rf) +DEFINE_LOCK_GUARD_1(__task_rq_lock, struct task_struct, + _T->rq = __task_rq_lock(_T->lock, &_T->rf), + __task_rq_unlock(_T->rq, _T->lock, &_T->rf), + struct rq *rq; struct rq_flags rf) + static inline void rq_lock_irqsave(struct rq *rq, struct rq_flags *rf) __acquires(rq->lock) { @@ -2209,6 +2223,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) smp_wmb(); WRITE_ONCE(task_thread_info(p)->cpu, cpu); p->wake_cpu = cpu; + rseq_sched_set_ids_changed(p); #endif /* CONFIG_SMP */ } @@ -2342,8 +2357,7 @@ extern const u32 sched_prio_to_wmult[40]; /* * {de,en}queue flags: * - * DEQUEUE_SLEEP - task is no longer runnable - * ENQUEUE_WAKEUP - task just became runnable + * SLEEP/WAKEUP - task is no-longer/just-became runnable * * SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks * are in a known state which allows modification. Such pairs @@ -2356,34 +2370,46 @@ extern const u32 sched_prio_to_wmult[40]; * * MIGRATION - p->on_rq == TASK_ON_RQ_MIGRATING (used for DEADLINE) * + * DELAYED - de/re-queue a sched_delayed task + * + * CLASS - going to update p->sched_class; makes sched_change call the + * various switch methods. + * * ENQUEUE_HEAD - place at front of runqueue (tail if not specified) * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline) * ENQUEUE_MIGRATED - the task was migrated during wakeup * ENQUEUE_RQ_SELECTED - ->select_task_rq() was called * + * XXX SAVE/RESTORE in combination with CLASS doesn't really make sense, but + * SCHED_DEADLINE seems to rely on this for now. */ -#define DEQUEUE_SLEEP 0x01 /* Matches ENQUEUE_WAKEUP */ -#define DEQUEUE_SAVE 0x02 /* Matches ENQUEUE_RESTORE */ -#define DEQUEUE_MOVE 0x04 /* Matches ENQUEUE_MOVE */ -#define DEQUEUE_NOCLOCK 0x08 /* Matches ENQUEUE_NOCLOCK */ -#define DEQUEUE_SPECIAL 0x10 -#define DEQUEUE_MIGRATING 0x100 /* Matches ENQUEUE_MIGRATING */ -#define DEQUEUE_DELAYED 0x200 /* Matches ENQUEUE_DELAYED */ -#define DEQUEUE_THROTTLE 0x800 - -#define ENQUEUE_WAKEUP 0x01 -#define ENQUEUE_RESTORE 0x02 -#define ENQUEUE_MOVE 0x04 -#define ENQUEUE_NOCLOCK 0x08 - -#define ENQUEUE_HEAD 0x10 -#define ENQUEUE_REPLENISH 0x20 -#define ENQUEUE_MIGRATED 0x40 -#define ENQUEUE_INITIAL 0x80 -#define ENQUEUE_MIGRATING 0x100 -#define ENQUEUE_DELAYED 0x200 -#define ENQUEUE_RQ_SELECTED 0x400 +#define DEQUEUE_SLEEP 0x0001 /* Matches ENQUEUE_WAKEUP */ +#define DEQUEUE_SAVE 0x0002 /* Matches ENQUEUE_RESTORE */ +#define DEQUEUE_MOVE 0x0004 /* Matches ENQUEUE_MOVE */ +#define DEQUEUE_NOCLOCK 0x0008 /* Matches ENQUEUE_NOCLOCK */ + +#define DEQUEUE_MIGRATING 0x0010 /* Matches ENQUEUE_MIGRATING */ +#define DEQUEUE_DELAYED 0x0020 /* Matches ENQUEUE_DELAYED */ +#define DEQUEUE_CLASS 0x0040 /* Matches ENQUEUE_CLASS */ + +#define DEQUEUE_SPECIAL 0x00010000 +#define DEQUEUE_THROTTLE 0x00020000 + +#define ENQUEUE_WAKEUP 0x0001 +#define ENQUEUE_RESTORE 0x0002 +#define ENQUEUE_MOVE 0x0004 +#define ENQUEUE_NOCLOCK 0x0008 + +#define ENQUEUE_MIGRATING 0x0010 +#define ENQUEUE_DELAYED 0x0020 +#define ENQUEUE_CLASS 0x0040 + +#define ENQUEUE_HEAD 0x00010000 +#define ENQUEUE_REPLENISH 0x00020000 +#define ENQUEUE_MIGRATED 0x00040000 +#define ENQUEUE_INITIAL 0x00080000 +#define ENQUEUE_RQ_SELECTED 0x00100000 #define RETRY_TASK ((void *)-1UL) @@ -2400,16 +2426,61 @@ struct sched_class { #ifdef CONFIG_UCLAMP_TASK int uclamp_enabled; #endif + /* + * idle: 0 + * ext: 1 + * fair: 2 + * rt: 4 + * dl: 8 + * stop: 16 + */ + unsigned int queue_mask; + /* + * move_queued_task/activate_task/enqueue_task: rq->lock + * ttwu_do_activate/activate_task/enqueue_task: rq->lock + * wake_up_new_task/activate_task/enqueue_task: task_rq_lock + * ttwu_runnable/enqueue_task: task_rq_lock + * proxy_task_current: rq->lock + * sched_change_end + */ void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); + /* + * move_queued_task/deactivate_task/dequeue_task: rq->lock + * __schedule/block_task/dequeue_task: rq->lock + * proxy_task_current: rq->lock + * wait_task_inactive: task_rq_lock + * sched_change_begin + */ bool (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); + + /* + * do_sched_yield: rq->lock + */ void (*yield_task) (struct rq *rq); + /* + * yield_to: rq->lock (double) + */ bool (*yield_to_task)(struct rq *rq, struct task_struct *p); + /* + * move_queued_task: rq->lock + * __migrate_swap_task: rq->lock + * ttwu_do_activate: rq->lock + * ttwu_runnable: task_rq_lock + * wake_up_new_task: task_rq_lock + */ void (*wakeup_preempt)(struct rq *rq, struct task_struct *p, int flags); + /* + * schedule/pick_next_task/prev_balance: rq->lock + */ int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); - struct task_struct *(*pick_task)(struct rq *rq); + + /* + * schedule/pick_next_task: rq->lock + */ + struct task_struct *(*pick_task)(struct rq *rq, struct rq_flags *rf); /* * Optional! When implemented pick_next_task() should be equivalent to: * @@ -2419,55 +2490,123 @@ struct sched_class { * set_next_task_first(next); * } */ - struct task_struct *(*pick_next_task)(struct rq *rq, struct task_struct *prev); + struct task_struct *(*pick_next_task)(struct rq *rq, struct task_struct *prev, + struct rq_flags *rf); + /* + * sched_change: + * __schedule: rq->lock + */ void (*put_prev_task)(struct rq *rq, struct task_struct *p, struct task_struct *next); void (*set_next_task)(struct rq *rq, struct task_struct *p, bool first); + /* + * select_task_rq: p->pi_lock + * sched_exec: p->pi_lock + */ int (*select_task_rq)(struct task_struct *p, int task_cpu, int flags); + /* + * set_task_cpu: p->pi_lock || rq->lock (ttwu like) + */ void (*migrate_task_rq)(struct task_struct *p, int new_cpu); + /* + * ttwu_do_activate: rq->lock + * wake_up_new_task: task_rq_lock + */ void (*task_woken)(struct rq *this_rq, struct task_struct *task); + /* + * do_set_cpus_allowed: task_rq_lock + sched_change + */ void (*set_cpus_allowed)(struct task_struct *p, struct affinity_context *ctx); + /* + * sched_set_rq_{on,off}line: rq->lock + */ void (*rq_online)(struct rq *rq); void (*rq_offline)(struct rq *rq); + /* + * push_cpu_stop: p->pi_lock && rq->lock + */ struct rq *(*find_lock_rq)(struct task_struct *p, struct rq *rq); + /* + * hrtick: rq->lock + * sched_tick: rq->lock + * sched_tick_remote: rq->lock + */ void (*task_tick)(struct rq *rq, struct task_struct *p, int queued); + /* + * sched_cgroup_fork: p->pi_lock + */ void (*task_fork)(struct task_struct *p); + /* + * finish_task_switch: no locks + */ void (*task_dead)(struct task_struct *p); /* - * The switched_from() call is allowed to drop rq->lock, therefore we - * cannot assume the switched_from/switched_to pair is serialized by - * rq->lock. They are however serialized by p->pi_lock. + * sched_change + */ + void (*switching_from)(struct rq *this_rq, struct task_struct *task); + void (*switched_from) (struct rq *this_rq, struct task_struct *task); + void (*switching_to) (struct rq *this_rq, struct task_struct *task); + void (*switched_to) (struct rq *this_rq, struct task_struct *task); + u64 (*get_prio) (struct rq *this_rq, struct task_struct *task); + void (*prio_changed) (struct rq *this_rq, struct task_struct *task, + u64 oldprio); + + /* + * set_load_weight: task_rq_lock + sched_change + * __setscheduler_parms: task_rq_lock + sched_change */ - void (*switching_to) (struct rq *this_rq, struct task_struct *task); - void (*switched_from)(struct rq *this_rq, struct task_struct *task); - void (*switched_to) (struct rq *this_rq, struct task_struct *task); void (*reweight_task)(struct rq *this_rq, struct task_struct *task, const struct load_weight *lw); - void (*prio_changed) (struct rq *this_rq, struct task_struct *task, - int oldprio); + /* + * sched_rr_get_interval: task_rq_lock + */ unsigned int (*get_rr_interval)(struct rq *rq, struct task_struct *task); + /* + * task_sched_runtime: task_rq_lock + */ void (*update_curr)(struct rq *rq); #ifdef CONFIG_FAIR_GROUP_SCHED + /* + * sched_change_group: task_rq_lock + sched_change + */ void (*task_change_group)(struct task_struct *p); #endif #ifdef CONFIG_SCHED_CORE + /* + * pick_next_task: rq->lock + * try_steal_cookie: rq->lock (double) + */ int (*task_is_throttled)(struct task_struct *p, int cpu); #endif }; +/* + * Does not nest; only used around sched_class::pick_task() rq-lock-breaks. + */ +static inline void rq_modified_clear(struct rq *rq) +{ + rq->queue_mask = 0; +} + +static inline bool rq_modified_above(struct rq *rq, const struct sched_class * class) +{ + unsigned int mask = class->queue_mask; + return rq->queue_mask & ~((mask << 1) - 1); +} + static inline void put_prev_task(struct rq *rq, struct task_struct *prev) { WARN_ON_ONCE(rq->donor != prev); @@ -2579,8 +2718,9 @@ static inline bool sched_fair_runnable(struct rq *rq) return rq->cfs.nr_queued > 0; } -extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); -extern struct task_struct *pick_task_idle(struct rq *rq); +extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, + struct rq_flags *rf); +extern struct task_struct *pick_task_idle(struct rq *rq, struct rq_flags *rf); #define SCA_CHECK 0x01 #define SCA_MIGRATE_DISABLE 0x02 @@ -2610,7 +2750,7 @@ static inline bool task_allowed_on_cpu(struct task_struct *p, int cpu) static inline cpumask_t *alloc_user_cpus_ptr(int node) { /* - * See do_set_cpus_allowed() above for the rcu_head usage. + * See set_cpus_allowed_force() above for the rcu_head usage. */ int size = max_t(int, cpumask_size(), sizeof(struct rcu_head)); @@ -3540,283 +3680,212 @@ extern const char *preempt_modes[]; #ifdef CONFIG_SCHED_MM_CID -#define SCHED_MM_CID_PERIOD_NS (100ULL * 1000000) /* 100ms */ -#define MM_CID_SCAN_DELAY 100 /* 100ms */ +static __always_inline bool cid_on_cpu(unsigned int cid) +{ + return cid & MM_CID_ONCPU; +} -extern raw_spinlock_t cid_lock; -extern int use_cid_lock; +static __always_inline bool cid_in_transit(unsigned int cid) +{ + return cid & MM_CID_TRANSIT; +} -extern void sched_mm_cid_migrate_from(struct task_struct *t); -extern void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t); -extern void task_tick_mm_cid(struct rq *rq, struct task_struct *curr); -extern void init_sched_mm_cid(struct task_struct *t); +static __always_inline unsigned int cpu_cid_to_cid(unsigned int cid) +{ + return cid & ~MM_CID_ONCPU; +} -static inline void __mm_cid_put(struct mm_struct *mm, int cid) +static __always_inline unsigned int cid_to_cpu_cid(unsigned int cid) { - if (cid < 0) - return; - cpumask_clear_cpu(cid, mm_cidmask(mm)); + return cid | MM_CID_ONCPU; } -/* - * The per-mm/cpu cid can have the MM_CID_LAZY_PUT flag set or transition to - * the MM_CID_UNSET state without holding the rq lock, but the rq lock needs to - * be held to transition to other states. - * - * State transitions synchronized with cmpxchg or try_cmpxchg need to be - * consistent across CPUs, which prevents use of this_cpu_cmpxchg. - */ -static inline void mm_cid_put_lazy(struct task_struct *t) +static __always_inline unsigned int cid_to_transit_cid(unsigned int cid) { - struct mm_struct *mm = t->mm; - struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid; - int cid; + return cid | MM_CID_TRANSIT; +} - lockdep_assert_irqs_disabled(); - cid = __this_cpu_read(pcpu_cid->cid); - if (!mm_cid_is_lazy_put(cid) || - !try_cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, &cid, MM_CID_UNSET)) - return; - __mm_cid_put(mm, mm_cid_clear_lazy_put(cid)); +static __always_inline unsigned int cid_from_transit_cid(unsigned int cid) +{ + return cid & ~MM_CID_TRANSIT; } -static inline int mm_cid_pcpu_unset(struct mm_struct *mm) +static __always_inline bool cid_on_task(unsigned int cid) { - struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid; - int cid, res; + /* True if none of the MM_CID_ONCPU, MM_CID_TRANSIT, MM_CID_UNSET bits is set */ + return cid < MM_CID_TRANSIT; +} - lockdep_assert_irqs_disabled(); - cid = __this_cpu_read(pcpu_cid->cid); - for (;;) { - if (mm_cid_is_unset(cid)) - return MM_CID_UNSET; - /* - * Attempt transition from valid or lazy-put to unset. - */ - res = cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, cid, MM_CID_UNSET); - if (res == cid) - break; - cid = res; - } - return cid; +static __always_inline void mm_drop_cid(struct mm_struct *mm, unsigned int cid) +{ + clear_bit(cid, mm_cidmask(mm)); } -static inline void mm_cid_put(struct mm_struct *mm) +static __always_inline void mm_unset_cid_on_task(struct task_struct *t) { - int cid; + unsigned int cid = t->mm_cid.cid; - lockdep_assert_irqs_disabled(); - cid = mm_cid_pcpu_unset(mm); - if (cid == MM_CID_UNSET) - return; - __mm_cid_put(mm, mm_cid_clear_lazy_put(cid)); + t->mm_cid.cid = MM_CID_UNSET; + if (cid_on_task(cid)) + mm_drop_cid(t->mm, cid); } -static inline int __mm_cid_try_get(struct task_struct *t, struct mm_struct *mm) +static __always_inline void mm_drop_cid_on_cpu(struct mm_struct *mm, struct mm_cid_pcpu *pcp) { - struct cpumask *cidmask = mm_cidmask(mm); - struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid; - int cid, max_nr_cid, allowed_max_nr_cid; + /* Clear the ONCPU bit, but do not set UNSET in the per CPU storage */ + pcp->cid = cpu_cid_to_cid(pcp->cid); + mm_drop_cid(mm, pcp->cid); +} - /* - * After shrinking the number of threads or reducing the number - * of allowed cpus, reduce the value of max_nr_cid so expansion - * of cid allocation will preserve cache locality if the number - * of threads or allowed cpus increase again. - */ - max_nr_cid = atomic_read(&mm->max_nr_cid); - while ((allowed_max_nr_cid = min_t(int, READ_ONCE(mm->nr_cpus_allowed), - atomic_read(&mm->mm_users))), - max_nr_cid > allowed_max_nr_cid) { - /* atomic_try_cmpxchg loads previous mm->max_nr_cid into max_nr_cid. */ - if (atomic_try_cmpxchg(&mm->max_nr_cid, &max_nr_cid, allowed_max_nr_cid)) { - max_nr_cid = allowed_max_nr_cid; - break; - } - } - /* Try to re-use recent cid. This improves cache locality. */ - cid = __this_cpu_read(pcpu_cid->recent_cid); - if (!mm_cid_is_unset(cid) && cid < max_nr_cid && - !cpumask_test_and_set_cpu(cid, cidmask)) - return cid; - /* - * Expand cid allocation if the maximum number of concurrency - * IDs allocated (max_nr_cid) is below the number cpus allowed - * and number of threads. Expanding cid allocation as much as - * possible improves cache locality. - */ - cid = max_nr_cid; - while (cid < READ_ONCE(mm->nr_cpus_allowed) && cid < atomic_read(&mm->mm_users)) { - /* atomic_try_cmpxchg loads previous mm->max_nr_cid into cid. */ - if (!atomic_try_cmpxchg(&mm->max_nr_cid, &cid, cid + 1)) - continue; - if (!cpumask_test_and_set_cpu(cid, cidmask)) - return cid; - } - /* - * Find the first available concurrency id. - * Retry finding first zero bit if the mask is temporarily - * filled. This only happens during concurrent remote-clear - * which owns a cid without holding a rq lock. - */ - for (;;) { - cid = cpumask_first_zero(cidmask); - if (cid < READ_ONCE(mm->nr_cpus_allowed)) - break; - cpu_relax(); - } - if (cpumask_test_and_set_cpu(cid, cidmask)) - return -1; +static inline unsigned int __mm_get_cid(struct mm_struct *mm, unsigned int max_cids) +{ + unsigned int cid = find_first_zero_bit(mm_cidmask(mm), max_cids); + if (cid >= max_cids) + return MM_CID_UNSET; + if (test_and_set_bit(cid, mm_cidmask(mm))) + return MM_CID_UNSET; return cid; } -/* - * Save a snapshot of the current runqueue time of this cpu - * with the per-cpu cid value, allowing to estimate how recently it was used. - */ -static inline void mm_cid_snapshot_time(struct rq *rq, struct mm_struct *mm) +static inline unsigned int mm_get_cid(struct mm_struct *mm) { - struct mm_cid *pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu_of(rq)); + unsigned int cid = __mm_get_cid(mm, READ_ONCE(mm->mm_cid.max_cids)); - lockdep_assert_rq_held(rq); - WRITE_ONCE(pcpu_cid->time, rq->clock); + while (cid == MM_CID_UNSET) { + cpu_relax(); + cid = __mm_get_cid(mm, num_possible_cpus()); + } + return cid; } -static inline int __mm_cid_get(struct rq *rq, struct task_struct *t, - struct mm_struct *mm) +static inline unsigned int mm_cid_converge(struct mm_struct *mm, unsigned int orig_cid, + unsigned int max_cids) { - int cid; + unsigned int new_cid, cid = cpu_cid_to_cid(orig_cid); - /* - * All allocations (even those using the cid_lock) are lock-free. If - * use_cid_lock is set, hold the cid_lock to perform cid allocation to - * guarantee forward progress. - */ - if (!READ_ONCE(use_cid_lock)) { - cid = __mm_cid_try_get(t, mm); - if (cid >= 0) - goto end; - raw_spin_lock(&cid_lock); - } else { - raw_spin_lock(&cid_lock); - cid = __mm_cid_try_get(t, mm); - if (cid >= 0) - goto unlock; + /* Is it in the optimal CID space? */ + if (likely(cid < max_cids)) + return orig_cid; + + /* Try to find one in the optimal space. Otherwise keep the provided. */ + new_cid = __mm_get_cid(mm, max_cids); + if (new_cid != MM_CID_UNSET) { + mm_drop_cid(mm, cid); + /* Preserve the ONCPU mode of the original CID */ + return new_cid | (orig_cid & MM_CID_ONCPU); } + return orig_cid; +} - /* - * cid concurrently allocated. Retry while forcing following - * allocations to use the cid_lock to ensure forward progress. - */ - WRITE_ONCE(use_cid_lock, 1); - /* - * Set use_cid_lock before allocation. Only care about program order - * because this is only required for forward progress. - */ - barrier(); - /* - * Retry until it succeeds. It is guaranteed to eventually succeed once - * all newcoming allocations observe the use_cid_lock flag set. - */ - do { - cid = __mm_cid_try_get(t, mm); - cpu_relax(); - } while (cid < 0); - /* - * Allocate before clearing use_cid_lock. Only care about - * program order because this is for forward progress. - */ - barrier(); - WRITE_ONCE(use_cid_lock, 0); -unlock: - raw_spin_unlock(&cid_lock); -end: - mm_cid_snapshot_time(rq, mm); +static __always_inline void mm_cid_update_task_cid(struct task_struct *t, unsigned int cid) +{ + if (t->mm_cid.cid != cid) { + t->mm_cid.cid = cid; + rseq_sched_set_ids_changed(t); + } +} - return cid; +static __always_inline void mm_cid_update_pcpu_cid(struct mm_struct *mm, unsigned int cid) +{ + __this_cpu_write(mm->mm_cid.pcpu->cid, cid); } -static inline int mm_cid_get(struct rq *rq, struct task_struct *t, - struct mm_struct *mm) +static __always_inline void mm_cid_from_cpu(struct task_struct *t, unsigned int cpu_cid) { - struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid; - int cid; + unsigned int max_cids, tcid = t->mm_cid.cid; + struct mm_struct *mm = t->mm; - lockdep_assert_rq_held(rq); - cid = __this_cpu_read(pcpu_cid->cid); - if (mm_cid_is_valid(cid)) { - mm_cid_snapshot_time(rq, mm); - return cid; - } - if (mm_cid_is_lazy_put(cid)) { - if (try_cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, &cid, MM_CID_UNSET)) - __mm_cid_put(mm, mm_cid_clear_lazy_put(cid)); + max_cids = READ_ONCE(mm->mm_cid.max_cids); + /* Optimize for the common case where both have the ONCPU bit set */ + if (likely(cid_on_cpu(cpu_cid & tcid))) { + if (likely(cpu_cid_to_cid(cpu_cid) < max_cids)) { + mm_cid_update_task_cid(t, cpu_cid); + return; + } + /* Try to converge into the optimal CID space */ + cpu_cid = mm_cid_converge(mm, cpu_cid, max_cids); + } else { + /* Hand over or drop the task owned CID */ + if (cid_on_task(tcid)) { + if (cid_on_cpu(cpu_cid)) + mm_unset_cid_on_task(t); + else + cpu_cid = cid_to_cpu_cid(tcid); + } + /* Still nothing, allocate a new one */ + if (!cid_on_cpu(cpu_cid)) + cpu_cid = cid_to_cpu_cid(mm_get_cid(mm)); } - cid = __mm_cid_get(rq, t, mm); - __this_cpu_write(pcpu_cid->cid, cid); - __this_cpu_write(pcpu_cid->recent_cid, cid); - - return cid; + mm_cid_update_pcpu_cid(mm, cpu_cid); + mm_cid_update_task_cid(t, cpu_cid); } -static inline void switch_mm_cid(struct rq *rq, - struct task_struct *prev, - struct task_struct *next) +static __always_inline void mm_cid_from_task(struct task_struct *t, unsigned int cpu_cid) { - /* - * Provide a memory barrier between rq->curr store and load of - * {prev,next}->mm->pcpu_cid[cpu] on rq->curr->mm transition. - * - * Should be adapted if context_switch() is modified. - */ - if (!next->mm) { // to kernel - /* - * user -> kernel transition does not guarantee a barrier, but - * we can use the fact that it performs an atomic operation in - * mmgrab(). - */ - if (prev->mm) // from user - smp_mb__after_mmgrab(); - /* - * kernel -> kernel transition does not change rq->curr->mm - * state. It stays NULL. - */ - } else { // to user - /* - * kernel -> user transition does not provide a barrier - * between rq->curr store and load of {prev,next}->mm->pcpu_cid[cpu]. - * Provide it here. - */ - if (!prev->mm) { // from kernel - smp_mb(); - } else { // from user - /* - * user->user transition relies on an implicit - * memory barrier in switch_mm() when - * current->mm changes. If the architecture - * switch_mm() does not have an implicit memory - * barrier, it is emitted here. If current->mm - * is unchanged, no barrier is needed. - */ - smp_mb__after_switch_mm(); + unsigned int max_cids, tcid = t->mm_cid.cid; + struct mm_struct *mm = t->mm; + + max_cids = READ_ONCE(mm->mm_cid.max_cids); + /* Optimize for the common case, where both have the ONCPU bit clear */ + if (likely(cid_on_task(tcid | cpu_cid))) { + if (likely(tcid < max_cids)) { + mm_cid_update_pcpu_cid(mm, tcid); + return; } + /* Try to converge into the optimal CID space */ + tcid = mm_cid_converge(mm, tcid, max_cids); + } else { + /* Hand over or drop the CPU owned CID */ + if (cid_on_cpu(cpu_cid)) { + if (cid_on_task(tcid)) + mm_drop_cid_on_cpu(mm, this_cpu_ptr(mm->mm_cid.pcpu)); + else + tcid = cpu_cid_to_cid(cpu_cid); + } + /* Still nothing, allocate a new one */ + if (!cid_on_task(tcid)) + tcid = mm_get_cid(mm); + /* Set the transition mode flag if required */ + tcid |= READ_ONCE(mm->mm_cid.transit); } - if (prev->mm_cid_active) { - mm_cid_snapshot_time(rq, prev->mm); - mm_cid_put_lazy(prev); - prev->mm_cid = -1; - } - if (next->mm_cid_active) - next->last_mm_cid = next->mm_cid = mm_cid_get(rq, next, next->mm); + mm_cid_update_pcpu_cid(mm, tcid); + mm_cid_update_task_cid(t, tcid); +} + +static __always_inline void mm_cid_schedin(struct task_struct *next) +{ + struct mm_struct *mm = next->mm; + unsigned int cpu_cid; + + if (!next->mm_cid.active) + return; + + cpu_cid = __this_cpu_read(mm->mm_cid.pcpu->cid); + if (likely(!READ_ONCE(mm->mm_cid.percpu))) + mm_cid_from_task(next, cpu_cid); + else + mm_cid_from_cpu(next, cpu_cid); +} + +static __always_inline void mm_cid_schedout(struct task_struct *prev) +{ + /* During mode transitions CIDs are temporary and need to be dropped */ + if (likely(!cid_in_transit(prev->mm_cid.cid))) + return; + + mm_drop_cid(prev->mm, cid_from_transit_cid(prev->mm_cid.cid)); + prev->mm_cid.cid = MM_CID_UNSET; +} + +static inline void mm_cid_switch_to(struct task_struct *prev, struct task_struct *next) +{ + mm_cid_schedout(prev); + mm_cid_schedin(next); } #else /* !CONFIG_SCHED_MM_CID: */ -static inline void switch_mm_cid(struct rq *rq, struct task_struct *prev, struct task_struct *next) { } -static inline void sched_mm_cid_migrate_from(struct task_struct *t) { } -static inline void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t) { } -static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) { } -static inline void init_sched_mm_cid(struct task_struct *t) { } +static inline void mm_cid_switch_to(struct task_struct *prev, struct task_struct *next) { } #endif /* !CONFIG_SCHED_MM_CID */ extern u64 avg_vruntime(struct cfs_rq *cfs_rq); @@ -3875,32 +3944,42 @@ extern void set_load_weight(struct task_struct *p, bool update_load); extern void enqueue_task(struct rq *rq, struct task_struct *p, int flags); extern bool dequeue_task(struct rq *rq, struct task_struct *p, int flags); -extern void check_class_changing(struct rq *rq, struct task_struct *p, - const struct sched_class *prev_class); -extern void check_class_changed(struct rq *rq, struct task_struct *p, - const struct sched_class *prev_class, - int oldprio); - extern struct balance_callback *splice_balance_callbacks(struct rq *rq); extern void balance_callbacks(struct rq *rq, struct balance_callback *head); -#ifdef CONFIG_SCHED_CLASS_EXT /* - * Used by SCX in the enable/disable paths to move tasks between sched_classes - * and establish invariants. + * The 'sched_change' pattern is the safe, easy and slow way of changing a + * task's scheduling properties. It dequeues a task, such that the scheduler + * is fully unaware of it; at which point its properties can be modified; + * after which it is enqueued again. + * + * Typically this must be called while holding task_rq_lock, since most/all + * properties are serialized under those locks. There is currently one + * exception to this rule in sched/ext which only holds rq->lock. */ -struct sched_enq_and_set_ctx { + +/* + * This structure is a temporary, used to preserve/convey the queueing state + * of the task between sched_change_begin() and sched_change_end(). Ensuring + * the task's queueing state is idempotent across the operation. + */ +struct sched_change_ctx { + u64 prio; struct task_struct *p; - int queue_flags; + int flags; bool queued; bool running; }; -void sched_deq_and_put_task(struct task_struct *p, int queue_flags, - struct sched_enq_and_set_ctx *ctx); -void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx); +struct sched_change_ctx *sched_change_begin(struct task_struct *p, unsigned int flags); +void sched_change_end(struct sched_change_ctx *ctx); -#endif /* CONFIG_SCHED_CLASS_EXT */ +DEFINE_CLASS(sched_change, struct sched_change_ctx *, + sched_change_end(_T), + sched_change_begin(p, flags), + struct task_struct *p, unsigned int flags) + +DEFINE_CLASS_IS_UNCONDITIONAL(sched_change) #include "ext.h" diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 26f3fd4d34ce..cbf7206b3f9d 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -206,7 +206,7 @@ static inline void psi_ttwu_dequeue(struct task_struct *p) rq = __task_rq_lock(p, &rf); psi_task_change(p, p->psi_flags, 0); - __task_rq_unlock(rq, &rf); + __task_rq_unlock(rq, p, &rf); } } diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 2d4e279f05ee..4f9192be4b5b 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -32,7 +32,7 @@ static void set_next_task_stop(struct rq *rq, struct task_struct *stop, bool fir stop->se.exec_start = rq_clock_task(rq); } -static struct task_struct *pick_task_stop(struct rq *rq) +static struct task_struct *pick_task_stop(struct rq *rq, struct rq_flags *rf) { if (!sched_stop_runnable(rq)) return NULL; @@ -75,14 +75,17 @@ static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued) { } -static void switched_to_stop(struct rq *rq, struct task_struct *p) +static void switching_to_stop(struct rq *rq, struct task_struct *p) { BUG(); /* its impossible to change to this class */ } static void -prio_changed_stop(struct rq *rq, struct task_struct *p, int oldprio) +prio_changed_stop(struct rq *rq, struct task_struct *p, u64 oldprio) { + if (p->prio == oldprio) + return; + BUG(); /* how!?, what priority? */ } @@ -95,6 +98,8 @@ static void update_curr_stop(struct rq *rq) */ DEFINE_SCHED_CLASS(stop) = { + .queue_mask = 16, + .enqueue_task = enqueue_task_stop, .dequeue_task = dequeue_task_stop, .yield_task = yield_task_stop, @@ -112,6 +117,6 @@ DEFINE_SCHED_CLASS(stop) = { .task_tick = task_tick_stop, .prio_changed = prio_changed_stop, - .switched_to = switched_to_stop, + .switching_to = switching_to_stop, .update_curr = update_curr_stop, }; diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c index 77ae87f36e84..0496dc29ed0f 100644 --- a/kernel/sched/syscalls.c +++ b/kernel/sched/syscalls.c @@ -64,8 +64,6 @@ static int effective_prio(struct task_struct *p) void set_user_nice(struct task_struct *p, long nice) { - bool queued, running; - struct rq *rq; int old_prio; if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) @@ -74,10 +72,7 @@ void set_user_nice(struct task_struct *p, long nice) * We have to be careful, if called from sys_setpriority(), * the task might be in the middle of scheduling on another CPU. */ - CLASS(task_rq_lock, rq_guard)(p); - rq = rq_guard.rq; - - update_rq_clock(rq); + guard(task_rq_lock)(p); /* * The RT priorities are set via sched_setscheduler(), but we still @@ -90,28 +85,12 @@ void set_user_nice(struct task_struct *p, long nice) return; } - queued = task_on_rq_queued(p); - running = task_current_donor(rq, p); - if (queued) - dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK); - if (running) - put_prev_task(rq, p); - - p->static_prio = NICE_TO_PRIO(nice); - set_load_weight(p, true); - old_prio = p->prio; - p->prio = effective_prio(p); - - if (queued) - enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); - if (running) - set_next_task(rq, p); - - /* - * If the task increased its priority or is running and - * lowered its priority, then reschedule its CPU: - */ - p->sched_class->prio_changed(rq, p, old_prio); + scoped_guard (sched_change, p, DEQUEUE_SAVE) { + p->static_prio = NICE_TO_PRIO(nice); + set_load_weight(p, true); + old_prio = p->prio; + p->prio = effective_prio(p); + } } EXPORT_SYMBOL(set_user_nice); @@ -515,7 +494,7 @@ int __sched_setscheduler(struct task_struct *p, bool user, bool pi) { int oldpolicy = -1, policy = attr->sched_policy; - int retval, oldprio, newprio, queued, running; + int retval, oldprio, newprio; const struct sched_class *prev_class, *next_class; struct balance_callback *head; struct rq_flags rf; @@ -695,38 +674,27 @@ change: prev_class = p->sched_class; next_class = __setscheduler_class(policy, newprio); - if (prev_class != next_class && p->se.sched_delayed) - dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK); - - queued = task_on_rq_queued(p); - running = task_current_donor(rq, p); - if (queued) - dequeue_task(rq, p, queue_flags); - if (running) - put_prev_task(rq, p); + if (prev_class != next_class) + queue_flags |= DEQUEUE_CLASS; - if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) { - __setscheduler_params(p, attr); - p->sched_class = next_class; - p->prio = newprio; - } - __setscheduler_uclamp(p, attr); - check_class_changing(rq, p, prev_class); + scoped_guard (sched_change, p, queue_flags) { - if (queued) { - /* - * We enqueue to tail when the priority of a task is - * increased (user space view). - */ - if (oldprio < p->prio) - queue_flags |= ENQUEUE_HEAD; + if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) { + __setscheduler_params(p, attr); + p->sched_class = next_class; + p->prio = newprio; + } + __setscheduler_uclamp(p, attr); - enqueue_task(rq, p, queue_flags); + if (scope->queued) { + /* + * We enqueue to tail when the priority of a task is + * increased (user space view). + */ + if (oldprio < p->prio) + scope->flags |= ENQUEUE_HEAD; + } } - if (running) - set_next_task(rq, p); - - check_class_changed(rq, p, prev_class, oldprio); /* Avoid rq from going away on us: */ preempt_disable(); @@ -856,6 +824,19 @@ void sched_set_fifo_low(struct task_struct *p) } EXPORT_SYMBOL_GPL(sched_set_fifo_low); +/* + * Used when the primary interrupt handler is forced into a thread, in addition + * to the (always threaded) secondary handler. The secondary handler gets a + * slightly lower priority so that the primary handler can preempt it, thereby + * emulating the behavior of a non-PREEMPT_RT system where the primary handler + * runs in hard interrupt context. + */ +void sched_set_fifo_secondary(struct task_struct *p) +{ + struct sched_param sp = { .sched_priority = MAX_RT_PRIO / 2 - 1 }; + WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0); +} + void sched_set_normal(struct task_struct *p, int nice) { struct sched_attr attr = { @@ -1351,7 +1332,7 @@ static void do_sched_yield(void) rq = this_rq_lock_irq(&rf); schedstat_inc(rq->yld_count); - current->sched_class->yield_task(rq); + rq->donor->sched_class->yield_task(rq); preempt_disable(); rq_unlock_irq(rq, &rf); @@ -1420,12 +1401,13 @@ EXPORT_SYMBOL(yield); */ int __sched yield_to(struct task_struct *p, bool preempt) { - struct task_struct *curr = current; + struct task_struct *curr; struct rq *rq, *p_rq; int yielded = 0; scoped_guard (raw_spinlock_irqsave, &p->pi_lock) { rq = this_rq(); + curr = rq->donor; again: p_rq = task_rq(p); diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 444bdfdab731..cf643a5ddedd 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1590,10 +1590,17 @@ static void claim_allocations(int cpu, struct sched_domain *sd) #ifdef CONFIG_NUMA enum numa_topology_type sched_numa_topology_type; +/* + * sched_domains_numa_distance is derived from sched_numa_node_distance + * and provides a simplified view of NUMA distances used specifically + * for building NUMA scheduling domains. + */ static int sched_domains_numa_levels; +static int sched_numa_node_levels; int sched_max_numa_distance; static int *sched_domains_numa_distance; +static int *sched_numa_node_distance; static struct cpumask ***sched_domains_numa_masks; #endif /* CONFIG_NUMA */ @@ -1662,6 +1669,12 @@ sd_init(struct sched_domain_topology_level *tl, .last_balance = jiffies, .balance_interval = sd_weight, + + /* 50% success rate */ + .newidle_call = 512, + .newidle_success = 256, + .newidle_ratio = 512, + .max_newidle_lb_cost = 0, .last_decay_max_lb_cost = jiffies, .child = child, @@ -1845,10 +1858,10 @@ bool find_numa_distance(int distance) return true; rcu_read_lock(); - distances = rcu_dereference(sched_domains_numa_distance); + distances = rcu_dereference(sched_numa_node_distance); if (!distances) goto unlock; - for (i = 0; i < sched_domains_numa_levels; i++) { + for (i = 0; i < sched_numa_node_levels; i++) { if (distances[i] == distance) { found = true; break; @@ -1924,14 +1937,34 @@ static void init_numa_topology_type(int offline_node) #define NR_DISTANCE_VALUES (1 << DISTANCE_BITS) -void sched_init_numa(int offline_node) +/* + * An architecture could modify its NUMA distance, to change + * grouping of NUMA nodes and number of NUMA levels when creating + * NUMA level sched domains. + * + * A NUMA level is created for each unique + * arch_sched_node_distance. + */ +static int numa_node_dist(int i, int j) { - struct sched_domain_topology_level *tl; - unsigned long *distance_map; + return node_distance(i, j); +} + +int arch_sched_node_distance(int from, int to) + __weak __alias(numa_node_dist); + +static bool modified_sched_node_distance(void) +{ + return numa_node_dist != arch_sched_node_distance; +} + +static int sched_record_numa_dist(int offline_node, int (*n_dist)(int, int), + int **dist, int *levels) +{ + unsigned long *distance_map __free(bitmap) = NULL; int nr_levels = 0; int i, j; int *distances; - struct cpumask ***masks; /* * O(nr_nodes^2) de-duplicating selection sort -- in order to find the @@ -1939,17 +1972,16 @@ void sched_init_numa(int offline_node) */ distance_map = bitmap_alloc(NR_DISTANCE_VALUES, GFP_KERNEL); if (!distance_map) - return; + return -ENOMEM; bitmap_zero(distance_map, NR_DISTANCE_VALUES); for_each_cpu_node_but(i, offline_node) { for_each_cpu_node_but(j, offline_node) { - int distance = node_distance(i, j); + int distance = n_dist(i, j); if (distance < LOCAL_DISTANCE || distance >= NR_DISTANCE_VALUES) { sched_numa_warn("Invalid distance value range"); - bitmap_free(distance_map); - return; + return -EINVAL; } bitmap_set(distance_map, distance, 1); @@ -1962,18 +1994,46 @@ void sched_init_numa(int offline_node) nr_levels = bitmap_weight(distance_map, NR_DISTANCE_VALUES); distances = kcalloc(nr_levels, sizeof(int), GFP_KERNEL); - if (!distances) { - bitmap_free(distance_map); - return; - } + if (!distances) + return -ENOMEM; for (i = 0, j = 0; i < nr_levels; i++, j++) { j = find_next_bit(distance_map, NR_DISTANCE_VALUES, j); distances[i] = j; } - rcu_assign_pointer(sched_domains_numa_distance, distances); + *dist = distances; + *levels = nr_levels; - bitmap_free(distance_map); + return 0; +} + +void sched_init_numa(int offline_node) +{ + struct sched_domain_topology_level *tl; + int nr_levels, nr_node_levels; + int i, j; + int *distances, *domain_distances; + struct cpumask ***masks; + + /* Record the NUMA distances from SLIT table */ + if (sched_record_numa_dist(offline_node, numa_node_dist, &distances, + &nr_node_levels)) + return; + + /* Record modified NUMA distances for building sched domains */ + if (modified_sched_node_distance()) { + if (sched_record_numa_dist(offline_node, arch_sched_node_distance, + &domain_distances, &nr_levels)) { + kfree(distances); + return; + } + } else { + domain_distances = distances; + nr_levels = nr_node_levels; + } + rcu_assign_pointer(sched_numa_node_distance, distances); + WRITE_ONCE(sched_max_numa_distance, distances[nr_node_levels - 1]); + WRITE_ONCE(sched_numa_node_levels, nr_node_levels); /* * 'nr_levels' contains the number of unique distances @@ -1991,6 +2051,8 @@ void sched_init_numa(int offline_node) * * We reset it to 'nr_levels' at the end of this function. */ + rcu_assign_pointer(sched_domains_numa_distance, domain_distances); + sched_domains_numa_levels = 0; masks = kzalloc(sizeof(void *) * nr_levels, GFP_KERNEL); @@ -2016,10 +2078,13 @@ void sched_init_numa(int offline_node) masks[i][j] = mask; for_each_cpu_node_but(k, offline_node) { - if (sched_debug() && (node_distance(j, k) != node_distance(k, j))) + if (sched_debug() && + (arch_sched_node_distance(j, k) != + arch_sched_node_distance(k, j))) sched_numa_warn("Node-distance not symmetric"); - if (node_distance(j, k) > sched_domains_numa_distance[i]) + if (arch_sched_node_distance(j, k) > + sched_domains_numa_distance[i]) continue; cpumask_or(mask, mask, cpumask_of_node(k)); @@ -2059,7 +2124,6 @@ void sched_init_numa(int offline_node) sched_domain_topology = tl; sched_domains_numa_levels = nr_levels; - WRITE_ONCE(sched_max_numa_distance, sched_domains_numa_distance[nr_levels - 1]); init_numa_topology_type(offline_node); } @@ -2067,14 +2131,18 @@ void sched_init_numa(int offline_node) static void sched_reset_numa(void) { - int nr_levels, *distances; + int nr_levels, *distances, *dom_distances = NULL; struct cpumask ***masks; nr_levels = sched_domains_numa_levels; + sched_numa_node_levels = 0; sched_domains_numa_levels = 0; sched_max_numa_distance = 0; sched_numa_topology_type = NUMA_DIRECT; - distances = sched_domains_numa_distance; + distances = sched_numa_node_distance; + if (sched_numa_node_distance != sched_domains_numa_distance) + dom_distances = sched_domains_numa_distance; + rcu_assign_pointer(sched_numa_node_distance, NULL); rcu_assign_pointer(sched_domains_numa_distance, NULL); masks = sched_domains_numa_masks; rcu_assign_pointer(sched_domains_numa_masks, NULL); @@ -2083,6 +2151,7 @@ static void sched_reset_numa(void) synchronize_rcu(); kfree(distances); + kfree(dom_distances); for (i = 0; i < nr_levels && masks; i++) { if (!masks[i]) continue; @@ -2129,7 +2198,8 @@ void sched_domains_numa_masks_set(unsigned int cpu) continue; /* Set ourselves in the remote node's masks */ - if (node_distance(j, node) <= sched_domains_numa_distance[i]) + if (arch_sched_node_distance(j, node) <= + sched_domains_numa_distance[i]) cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]); } } diff --git a/kernel/signal.c b/kernel/signal.c index fe9190d84f28..e42b8bd6922f 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3125,7 +3125,6 @@ void exit_signals(struct task_struct *tsk) cgroup_threadgroup_change_begin(tsk); if (thread_group_empty(tsk) || (tsk->signal->flags & SIGNAL_GROUP_EXIT)) { - sched_mm_cid_exit_signals(tsk); tsk->flags |= PF_EXITING; cgroup_threadgroup_change_end(tsk); return; @@ -3136,7 +3135,6 @@ void exit_signals(struct task_struct *tsk) * From now this task is not visible for group-wide signals, * see wants_signal(), do_signal_stop(). */ - sched_mm_cid_exit_signals(tsk); tsk->flags |= PF_EXITING; cgroup_threadgroup_change_end(tsk); diff --git a/kernel/task_work.c b/kernel/task_work.c index d1efec571a4a..0f7519f8e7c9 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -9,7 +9,12 @@ static struct callback_head work_exited; /* all we need is ->next == NULL */ #ifdef CONFIG_IRQ_WORK static void task_work_set_notify_irq(struct irq_work *entry) { - test_and_set_tsk_thread_flag(current, TIF_NOTIFY_RESUME); + /* + * no-op IPI + * + * TWA_NMI_CURRENT will already have set the TIF flag, all + * this interrupt does it tickle the return-to-user path. + */ } static DEFINE_PER_CPU(struct irq_work, irq_work_NMI_resume) = IRQ_WORK_INIT_HARD(task_work_set_notify_irq); @@ -86,6 +91,7 @@ int task_work_add(struct task_struct *task, struct callback_head *work, break; #ifdef CONFIG_IRQ_WORK case TWA_NMI_CURRENT: + set_tsk_thread_flag(current, TIF_NOTIFY_RESUME); irq_work_queue(this_cpu_ptr(&irq_work_NMI_resume)); break; #endif diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 88aa062b8a55..f8ea8c8fc895 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -2145,7 +2145,7 @@ static long __sched hrtimer_nanosleep_restart(struct restart_block *restart) int ret; hrtimer_setup_sleeper_on_stack(&t, restart->nanosleep.clockid, HRTIMER_MODE_ABS); - hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires); + hrtimer_set_expires(&t.timer, restart->nanosleep.expires); ret = do_nanosleep(&t, HRTIMER_MODE_ABS); destroy_hrtimer_on_stack(&t.timer); return ret; @@ -2172,7 +2172,7 @@ long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode, restart = ¤t->restart_block; restart->nanosleep.clockid = t.timer.base->clockid; - restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer); + restart->nanosleep.expires = hrtimer_get_expires(&t.timer); set_restart_fn(restart, hrtimer_nanosleep_restart); out: destroy_hrtimer_on_stack(&t.timer); diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 5b6997f4dc3d..e76be24b132c 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -478,11 +478,8 @@ const struct proc_ns_operations timens_for_children_operations = { }; struct time_namespace init_time_ns = { - .ns.ns_type = ns_common_type(&init_time_ns), - .ns.__ns_ref = REFCOUNT_INIT(3), + .ns = NS_COMMON_INIT(init_time_ns), .user_ns = &init_user_ns, - .ns.inum = ns_init_inum(&init_time_ns), - .ns.ops = &timens_operations, .frozen_offsets = true, }; diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 2e5b89d7d866..0de2bb7cbec0 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -1557,7 +1557,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, * Report back to the user the time still remaining. */ restart = ¤t->restart_block; - restart->nanosleep.expires = expires; + restart->nanosleep.expires = ns_to_ktime(expires); if (restart->nanosleep.type != TT_NONE) error = nanosleep_copyout(restart, &it.it_value); } @@ -1599,7 +1599,7 @@ static long posix_cpu_nsleep_restart(struct restart_block *restart_block) clockid_t which_clock = restart_block->nanosleep.clockid; struct timespec64 t; - t = ns_to_timespec64(restart_block->nanosleep.expires); + t = ktime_to_timespec64(restart_block->nanosleep.expires); return do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t); } diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index aa3120104a51..80a8a09a21a0 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -475,12 +475,6 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event, if (!kc->timer_create) return -EOPNOTSUPP; - new_timer = alloc_posix_timer(); - if (unlikely(!new_timer)) - return -EAGAIN; - - spin_lock_init(&new_timer->it_lock); - /* Special case for CRIU to restore timers with a given timer ID. */ if (unlikely(current->signal->timer_create_restore_ids)) { if (copy_from_user(&req_id, created_timer_id, sizeof(req_id))) @@ -490,6 +484,12 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event, return -EINVAL; } + new_timer = alloc_posix_timer(); + if (unlikely(!new_timer)) + return -EAGAIN; + + spin_lock_init(&new_timer->it_lock); + /* * Add the timer to the hash table. The timer is not yet valid * after insertion, but has a unique ID allocated. @@ -1242,7 +1242,7 @@ SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock, * sys_clock_settime(). The kernel internal timekeeping is always using * nanoseconds precision independent of the clocksource device which is * used to read the time from. The resolution of that device only - * affects the presicion of the time returned by sys_clock_gettime(). + * affects the precision of the time returned by sys_clock_gettime(). * * Returns: * 0 Success. @tp contains the resolution diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index 5e2c2c26b3cc..ffee943d796d 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c @@ -19,6 +19,10 @@ /** * tick_program_event - program the CPU local timer device for the next event + * @expires: the time at which the next timer event should occur + * @force: flag to force reprograming even if the event time hasn't changed + * + * Return: 0 on success, negative error code on failure */ int tick_program_event(ktime_t expires, int force) { @@ -57,6 +61,13 @@ void tick_resume_oneshot(void) /** * tick_setup_oneshot - setup the event device for oneshot mode (hres or nohz) + * @newdev: Pointer to the clock event device to configure + * @handler: Function to be called when the event device triggers an interrupt + * @next_event: Initial expiry time for the next event (in ktime) + * + * Configures the specified clock event device for onshot mode, + * assigns the given handler as its event callback, and programs + * the device to trigger at the specified next event time. */ void tick_setup_oneshot(struct clock_event_device *newdev, void (*handler)(struct clock_event_device *), @@ -69,6 +80,10 @@ void tick_setup_oneshot(struct clock_event_device *newdev, /** * tick_switch_to_oneshot - switch to oneshot mode + * @handler: function to call when an event occurs on the tick device + * + * Return: 0 on success, -EINVAL if the tick device is not present, + * not functional, or does not support oneshot mode. */ int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)) { @@ -101,7 +116,7 @@ int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)) /** * tick_oneshot_mode_active - check whether the system is in oneshot mode * - * returns 1 when either nohz or highres are enabled. otherwise 0. + * Return: 1 when either nohz or highres are enabled, otherwise 0. */ int tick_oneshot_mode_active(void) { @@ -120,6 +135,9 @@ int tick_oneshot_mode_active(void) * tick_init_highres - switch to high resolution mode * * Called with interrupts disabled. + * + * Return: 0 on success, -EINVAL if the tick device cannot switch + * to oneshot/high-resolution mode. */ int tick_init_highres(void) { diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index c527b421c865..8ddf74e705d3 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -201,6 +201,27 @@ static inline void tick_sched_flag_clear(struct tick_sched *ts, ts->flags &= ~flag; } +/* + * Allow only one non-timekeeper CPU at a time update jiffies from + * the timer tick. + * + * Returns true if update was run. + */ +static bool tick_limited_update_jiffies64(struct tick_sched *ts, ktime_t now) +{ + static atomic_t in_progress; + int inp; + + inp = atomic_read(&in_progress); + if (inp || !atomic_try_cmpxchg(&in_progress, &inp, 1)) + return false; + + if (ts->last_tick_jiffies == jiffies) + tick_do_update_jiffies64(now); + atomic_set(&in_progress, 0); + return true; +} + #define MAX_STALLED_JIFFIES 5 static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now) @@ -239,10 +260,11 @@ static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now) ts->stalled_jiffies = 0; ts->last_tick_jiffies = READ_ONCE(jiffies); } else { - if (++ts->stalled_jiffies == MAX_STALLED_JIFFIES) { - tick_do_update_jiffies64(now); - ts->stalled_jiffies = 0; - ts->last_tick_jiffies = READ_ONCE(jiffies); + if (++ts->stalled_jiffies >= MAX_STALLED_JIFFIES) { + if (tick_limited_update_jiffies64(ts, now)) { + ts->stalled_jiffies = 0; + ts->last_tick_jiffies = READ_ONCE(jiffies); + } } } @@ -1152,16 +1174,15 @@ static bool report_idle_softirq(void) return false; } - if (ratelimit >= 10) - return false; - /* On RT, softirq handling may be waiting on some lock */ if (local_bh_blocked()) return false; - pr_warn("NOHZ tick-stop error: local softirq work is pending, handler #%02x!!!\n", - pending); - ratelimit++; + if (ratelimit < 10) { + pr_warn("NOHZ tick-stop error: local softirq work is pending, handler #%02x!!!\n", + pending); + ratelimit++; + } return true; } diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 3a4d3b2e3f74..4790da895203 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -3060,29 +3060,34 @@ static const struct attribute_group aux_clock_enable_attr_group = { static int __init tk_aux_sysfs_init(void) { struct kobject *auxo, *tko = kobject_create_and_add("time", kernel_kobj); + int ret = -ENOMEM; if (!tko) - return -ENOMEM; + return ret; auxo = kobject_create_and_add("aux_clocks", tko); - if (!auxo) { - kobject_put(tko); - return -ENOMEM; - } + if (!auxo) + goto err_clean; for (int i = 0; i < MAX_AUX_CLOCKS; i++) { char id[2] = { [0] = '0' + i, }; struct kobject *clk = kobject_create_and_add(id, auxo); - if (!clk) - return -ENOMEM; - - int ret = sysfs_create_group(clk, &aux_clock_enable_attr_group); + if (!clk) { + ret = -ENOMEM; + goto err_clean; + } + ret = sysfs_create_group(clk, &aux_clock_enable_attr_group); if (ret) - return ret; + goto err_clean; } return 0; + +err_clean: + kobject_put(auxo); + kobject_put(tko); + return ret; } late_initcall(tk_aux_sysfs_init); diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 553fa469d7cc..1f2364126894 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1458,10 +1458,11 @@ static int __try_to_del_timer_sync(struct timer_list *timer, bool shutdown) base = lock_timer_base(timer, &flags); - if (base->running_timer != timer) + if (base->running_timer != timer) { ret = detach_if_pending(timer, base, true); - if (shutdown) - timer->function = NULL; + if (shutdown) + timer->function = NULL; + } raw_spin_unlock_irqrestore(&base->lock, flags); @@ -2472,7 +2473,7 @@ void update_process_times(int user_tick) run_local_timers(); rcu_sched_clock_irq(user_tick); #ifdef CONFIG_IRQ_WORK - if (in_irq()) + if (in_hardirq()) irq_work_tick(); #endif sched_tick(); diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c index c0c54dc5314c..18dda1aa782d 100644 --- a/kernel/time/timer_migration.c +++ b/kernel/time/timer_migration.c @@ -10,6 +10,7 @@ #include <linux/spinlock.h> #include <linux/timerqueue.h> #include <trace/events/ipi.h> +#include <linux/sched/isolation.h> #include "timer_migration.h" #include "tick-internal.h" @@ -420,14 +421,54 @@ static struct list_head *tmigr_level_list __read_mostly; static unsigned int tmigr_hierarchy_levels __read_mostly; static unsigned int tmigr_crossnode_level __read_mostly; +static struct tmigr_group *tmigr_root; + static DEFINE_PER_CPU(struct tmigr_cpu, tmigr_cpu); +/* + * CPUs available for timer migration. + * Protected by cpuset_mutex (with cpus_read_lock held) or cpus_write_lock. + * Additionally tmigr_available_mutex serializes set/clear operations with each other. + */ +static cpumask_var_t tmigr_available_cpumask; +static DEFINE_MUTEX(tmigr_available_mutex); + +/* Enabled during late initcall */ +static DEFINE_STATIC_KEY_FALSE(tmigr_exclude_isolated); + #define TMIGR_NONE 0xFF #define BIT_CNT 8 static inline bool tmigr_is_not_available(struct tmigr_cpu *tmc) { - return !(tmc->tmgroup && tmc->online); + return !(tmc->tmgroup && tmc->available); +} + +/* + * Returns true if @cpu should be excluded from the hierarchy as isolated. + * Domain isolated CPUs don't participate in timer migration, nohz_full CPUs + * are still part of the hierarchy but become idle (from a tick and timer + * migration perspective) when they stop their tick. This lets the timekeeping + * CPU handle their global timers. Marking also isolated CPUs as idle would be + * too costly, hence they are completely excluded from the hierarchy. + * This check is necessary, for instance, to prevent offline isolated CPUs from + * being incorrectly marked as available once getting back online. + * + * This function returns false during early boot and the isolation logic is + * enabled only after isolated CPUs are marked as unavailable at late boot. + * The tick CPU can be isolated at boot, however we cannot mark it as + * unavailable to avoid having no global migrator for the nohz_full CPUs. This + * should be ensured by the callers of this function: implicitly from hotplug + * callbacks and explicitly in tmigr_init_isolation() and + * tmigr_isolated_exclude_cpumask(). + */ +static inline bool tmigr_is_isolated(int cpu) +{ + if (!static_branch_unlikely(&tmigr_exclude_isolated)) + return false; + return (!housekeeping_cpu(cpu, HK_TYPE_DOMAIN) || + cpuset_cpu_is_isolated(cpu)) && + housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE); } /* @@ -502,11 +543,6 @@ static bool tmigr_check_lonely(struct tmigr_group *group) * @now: timer base monotonic * @check: is set if there is the need to handle remote timers; * required in tmigr_requires_handle_remote() only - * @tmc_active: this flag indicates, whether the CPU which triggers - * the hierarchy walk is !idle in the timer migration - * hierarchy. When the CPU is idle and the whole hierarchy is - * idle, only the first event of the top level has to be - * considered. */ struct tmigr_walk { u64 nextexp; @@ -517,16 +553,13 @@ struct tmigr_walk { unsigned long basej; u64 now; bool check; - bool tmc_active; }; typedef bool (*up_f)(struct tmigr_group *, struct tmigr_group *, struct tmigr_walk *); -static void __walk_groups(up_f up, struct tmigr_walk *data, - struct tmigr_cpu *tmc) +static void __walk_groups_from(up_f up, struct tmigr_walk *data, + struct tmigr_group *child, struct tmigr_group *group) { - struct tmigr_group *child = NULL, *group = tmc->tmgroup; - do { WARN_ON_ONCE(group->level >= tmigr_hierarchy_levels); @@ -544,6 +577,12 @@ static void __walk_groups(up_f up, struct tmigr_walk *data, } while (group); } +static void __walk_groups(up_f up, struct tmigr_walk *data, + struct tmigr_cpu *tmc) +{ + __walk_groups_from(up, data, NULL, tmc->tmgroup); +} + static void walk_groups(up_f up, struct tmigr_walk *data, struct tmigr_cpu *tmc) { lockdep_assert_held(&tmc->lock); @@ -708,7 +747,7 @@ void tmigr_cpu_activate(void) /* * Returns true, if there is nothing to be propagated to the next level * - * @data->firstexp is set to expiry of first gobal event of the (top level of + * @data->firstexp is set to expiry of first global event of the (top level of * the) hierarchy, but only when hierarchy is completely idle. * * The child and group states need to be read under the lock, to prevent a race @@ -926,7 +965,7 @@ static void tmigr_handle_remote_cpu(unsigned int cpu, u64 now, * updated the event takes care when hierarchy is completely * idle. Otherwise the migrator does it as the event is enqueued. */ - if (!tmc->online || tmc->remote || tmc->cpuevt.ignore || + if (!tmc->available || tmc->remote || tmc->cpuevt.ignore || now < tmc->cpuevt.nextevt.expires) { raw_spin_unlock_irq(&tmc->lock); return; @@ -973,7 +1012,7 @@ static void tmigr_handle_remote_cpu(unsigned int cpu, u64 now, * (See also section "Required event and timerqueue update after a * remote expiry" in the documentation at the top) */ - if (!tmc->online || !tmc->idle) { + if (!tmc->available || !tmc->idle) { timer_unlock_remote_bases(cpu); goto unlock; } @@ -1113,15 +1152,6 @@ static bool tmigr_requires_handle_remote_up(struct tmigr_group *group, */ if (!tmigr_check_migrator(group, childmask)) return true; - - /* - * When there is a parent group and the CPU which triggered the - * hierarchy walk is not active, proceed the walk to reach the top level - * group before reading the next_expiry value. - */ - if (group->parent && !data->tmc_active) - return false; - /* * The lock is required on 32bit architectures to read the variable * consistently with a concurrent writer. On 64bit the lock is not @@ -1166,7 +1196,6 @@ bool tmigr_requires_handle_remote(void) data.now = get_jiffies_update(&jif); data.childmask = tmc->groupmask; data.firstexp = KTIME_MAX; - data.tmc_active = !tmc->idle; data.check = false; /* @@ -1432,38 +1461,43 @@ static long tmigr_trigger_active(void *unused) { struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); - WARN_ON_ONCE(!tmc->online || tmc->idle); + WARN_ON_ONCE(!tmc->available || tmc->idle); return 0; } -static int tmigr_cpu_offline(unsigned int cpu) +static int tmigr_clear_cpu_available(unsigned int cpu) { struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); int migrator; u64 firstexp; - raw_spin_lock_irq(&tmc->lock); - tmc->online = false; - WRITE_ONCE(tmc->wakeup, KTIME_MAX); + guard(mutex)(&tmigr_available_mutex); - /* - * CPU has to handle the local events on his own, when on the way to - * offline; Therefore nextevt value is set to KTIME_MAX - */ - firstexp = __tmigr_cpu_deactivate(tmc, KTIME_MAX); - trace_tmigr_cpu_offline(tmc); - raw_spin_unlock_irq(&tmc->lock); + cpumask_clear_cpu(cpu, tmigr_available_cpumask); + scoped_guard(raw_spinlock_irq, &tmc->lock) { + if (!tmc->available) + return 0; + tmc->available = false; + WRITE_ONCE(tmc->wakeup, KTIME_MAX); + + /* + * CPU has to handle the local events on his own, when on the way to + * offline; Therefore nextevt value is set to KTIME_MAX + */ + firstexp = __tmigr_cpu_deactivate(tmc, KTIME_MAX); + trace_tmigr_cpu_unavailable(tmc); + } if (firstexp != KTIME_MAX) { - migrator = cpumask_any_but(cpu_online_mask, cpu); + migrator = cpumask_any(tmigr_available_cpumask); work_on_cpu(migrator, tmigr_trigger_active, NULL); } return 0; } -static int tmigr_cpu_online(unsigned int cpu) +static int tmigr_set_cpu_available(unsigned int cpu) { struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); @@ -1471,16 +1505,123 @@ static int tmigr_cpu_online(unsigned int cpu) if (WARN_ON_ONCE(!tmc->tmgroup)) return -EINVAL; - raw_spin_lock_irq(&tmc->lock); - trace_tmigr_cpu_online(tmc); - tmc->idle = timer_base_is_idle(); - if (!tmc->idle) - __tmigr_cpu_activate(tmc); - tmc->online = true; - raw_spin_unlock_irq(&tmc->lock); + if (tmigr_is_isolated(cpu)) + return 0; + + guard(mutex)(&tmigr_available_mutex); + + cpumask_set_cpu(cpu, tmigr_available_cpumask); + scoped_guard(raw_spinlock_irq, &tmc->lock) { + if (tmc->available) + return 0; + trace_tmigr_cpu_available(tmc); + tmc->idle = timer_base_is_idle(); + if (!tmc->idle) + __tmigr_cpu_activate(tmc); + tmc->available = true; + } return 0; } +static void tmigr_cpu_isolate(struct work_struct *ignored) +{ + tmigr_clear_cpu_available(smp_processor_id()); +} + +static void tmigr_cpu_unisolate(struct work_struct *ignored) +{ + tmigr_set_cpu_available(smp_processor_id()); +} + +/** + * tmigr_isolated_exclude_cpumask - Exclude given CPUs from hierarchy + * @exclude_cpumask: the cpumask to be excluded from timer migration hierarchy + * + * This function can be called from cpuset code to provide the new set of + * isolated CPUs that should be excluded from the hierarchy. + * Online CPUs not present in exclude_cpumask but already excluded are brought + * back to the hierarchy. + * Functions to isolate/unisolate need to be called locally and can sleep. + */ +int tmigr_isolated_exclude_cpumask(struct cpumask *exclude_cpumask) +{ + struct work_struct __percpu *works __free(free_percpu) = + alloc_percpu(struct work_struct); + cpumask_var_t cpumask __free(free_cpumask_var) = CPUMASK_VAR_NULL; + int cpu; + + lockdep_assert_cpus_held(); + + if (!works) + return -ENOMEM; + if (!alloc_cpumask_var(&cpumask, GFP_KERNEL)) + return -ENOMEM; + + /* + * First set previously isolated CPUs as available (unisolate). + * This cpumask contains only CPUs that switched to available now. + */ + cpumask_andnot(cpumask, cpu_online_mask, exclude_cpumask); + cpumask_andnot(cpumask, cpumask, tmigr_available_cpumask); + + for_each_cpu(cpu, cpumask) { + struct work_struct *work = per_cpu_ptr(works, cpu); + + INIT_WORK(work, tmigr_cpu_unisolate); + schedule_work_on(cpu, work); + } + for_each_cpu(cpu, cpumask) + flush_work(per_cpu_ptr(works, cpu)); + + /* + * Then clear previously available CPUs (isolate). + * This cpumask contains only CPUs that switched to not available now. + * There cannot be overlap with the newly available ones. + */ + cpumask_and(cpumask, exclude_cpumask, tmigr_available_cpumask); + cpumask_and(cpumask, cpumask, housekeeping_cpumask(HK_TYPE_KERNEL_NOISE)); + /* + * Handle this here and not in the cpuset code because exclude_cpumask + * might include also the tick CPU if included in isolcpus. + */ + for_each_cpu(cpu, cpumask) { + if (!tick_nohz_cpu_hotpluggable(cpu)) { + cpumask_clear_cpu(cpu, cpumask); + break; + } + } + + for_each_cpu(cpu, cpumask) { + struct work_struct *work = per_cpu_ptr(works, cpu); + + INIT_WORK(work, tmigr_cpu_isolate); + schedule_work_on(cpu, work); + } + for_each_cpu(cpu, cpumask) + flush_work(per_cpu_ptr(works, cpu)); + + return 0; +} + +static int __init tmigr_init_isolation(void) +{ + cpumask_var_t cpumask __free(free_cpumask_var) = CPUMASK_VAR_NULL; + + static_branch_enable(&tmigr_exclude_isolated); + + if (!housekeeping_enabled(HK_TYPE_DOMAIN)) + return 0; + if (!alloc_cpumask_var(&cpumask, GFP_KERNEL)) + return -ENOMEM; + + cpumask_andnot(cpumask, cpu_possible_mask, housekeeping_cpumask(HK_TYPE_DOMAIN)); + + /* Protect against RCU torture hotplug testing */ + guard(cpus_read_lock)(); + return tmigr_isolated_exclude_cpumask(cpumask); +} +late_initcall(tmigr_init_isolation); + static void tmigr_init_group(struct tmigr_group *group, unsigned int lvl, int node) { @@ -1498,21 +1639,6 @@ static void tmigr_init_group(struct tmigr_group *group, unsigned int lvl, s.seq = 0; atomic_set(&group->migr_state, s.state); - /* - * If this is a new top-level, prepare its groupmask in advance. - * This avoids accidents where yet another new top-level is - * created in the future and made visible before the current groupmask. - */ - if (list_empty(&tmigr_level_list[lvl])) { - group->groupmask = BIT(0); - /* - * The previous top level has prepared its groupmask already, - * simply account it as the first child. - */ - if (lvl > 0) - group->num_children = 1; - } - timerqueue_init_head(&group->events); timerqueue_init(&group->groupevt.nextevt); group->groupevt.nextevt.expires = KTIME_MAX; @@ -1520,8 +1646,7 @@ static void tmigr_init_group(struct tmigr_group *group, unsigned int lvl, group->groupevt.ignore = true; } -static struct tmigr_group *tmigr_get_group(unsigned int cpu, int node, - unsigned int lvl) +static struct tmigr_group *tmigr_get_group(int node, unsigned int lvl) { struct tmigr_group *tmp, *group = NULL; @@ -1567,25 +1692,51 @@ static struct tmigr_group *tmigr_get_group(unsigned int cpu, int node, return group; } +static bool tmigr_init_root(struct tmigr_group *group, bool activate) +{ + if (!group->parent && group != tmigr_root) { + /* + * This is the new top-level, prepare its groupmask in advance + * to avoid accidents where yet another new top-level is + * created in the future and made visible before this groupmask. + */ + group->groupmask = BIT(0); + WARN_ON_ONCE(activate); + + return true; + } + + return false; + +} + static void tmigr_connect_child_parent(struct tmigr_group *child, struct tmigr_group *parent, bool activate) { - struct tmigr_walk data; - - raw_spin_lock_irq(&child->lock); - raw_spin_lock_nested(&parent->lock, SINGLE_DEPTH_NESTING); + if (tmigr_init_root(parent, activate)) { + /* + * The previous top level had prepared its groupmask already, + * simply account it in advance as the first child. If some groups + * have been created between the old and new root due to node + * mismatch, the new root's child will be intialized accordingly. + */ + parent->num_children = 1; + } - if (activate) { + /* Connecting old root to new root ? */ + if (!parent->parent && activate) { /* - * @child is the old top and @parent the new one. In this - * case groupmask is pre-initialized and @child already - * accounted, along with its new sibling corresponding to the - * CPU going up. + * @child is the old top, or in case of node mismatch, some + * intermediate group between the old top and the new one in + * @parent. In this case the @child must be pre-accounted above + * as the first child. Its new inactive sibling corresponding + * to the CPU going up has been accounted as the second child. */ - WARN_ON_ONCE(child->groupmask != BIT(0) || parent->num_children != 2); + WARN_ON_ONCE(parent->num_children != 2); + child->groupmask = BIT(0); } else { - /* Adding @child for the CPU going up to @parent. */ + /* Common case adding @child for the CPU going up to @parent. */ child->groupmask = BIT(parent->num_children++); } @@ -1596,87 +1747,61 @@ static void tmigr_connect_child_parent(struct tmigr_group *child, */ smp_store_release(&child->parent, parent); - raw_spin_unlock(&parent->lock); - raw_spin_unlock_irq(&child->lock); - trace_tmigr_connect_child_parent(child); - - if (!activate) - return; - - /* - * To prevent inconsistent states, active children need to be active in - * the new parent as well. Inactive children are already marked inactive - * in the parent group: - * - * * When new groups were created by tmigr_setup_groups() starting from - * the lowest level (and not higher then one level below the current - * top level), then they are not active. They will be set active when - * the new online CPU comes active. - * - * * But if a new group above the current top level is required, it is - * mandatory to propagate the active state of the already existing - * child to the new parent. So tmigr_connect_child_parent() is - * executed with the formerly top level group (child) and the newly - * created group (parent). - * - * * It is ensured that the child is active, as this setup path is - * executed in hotplug prepare callback. This is exectued by an - * already connected and !idle CPU. Even if all other CPUs go idle, - * the CPU executing the setup will be responsible up to current top - * level group. And the next time it goes inactive, it will release - * the new childmask and parent to subsequent walkers through this - * @child. Therefore propagate active state unconditionally. - */ - data.childmask = child->groupmask; - - /* - * There is only one new level per time (which is protected by - * tmigr_mutex). When connecting the child and the parent and set the - * child active when the parent is inactive, the parent needs to be the - * uppermost level. Otherwise there went something wrong! - */ - WARN_ON(!tmigr_active_up(parent, child, &data) && parent->parent); } -static int tmigr_setup_groups(unsigned int cpu, unsigned int node) +static int tmigr_setup_groups(unsigned int cpu, unsigned int node, + struct tmigr_group *start, bool activate) { struct tmigr_group *group, *child, **stack; - int top = 0, err = 0, i = 0; - struct list_head *lvllist; + int i, top = 0, err = 0, start_lvl = 0; + bool root_mismatch = false; stack = kcalloc(tmigr_hierarchy_levels, sizeof(*stack), GFP_KERNEL); if (!stack) return -ENOMEM; - do { - group = tmigr_get_group(cpu, node, i); + if (start) { + stack[start->level] = start; + start_lvl = start->level + 1; + } + + if (tmigr_root) + root_mismatch = tmigr_root->numa_node != node; + + for (i = start_lvl; i < tmigr_hierarchy_levels; i++) { + group = tmigr_get_group(node, i); if (IS_ERR(group)) { err = PTR_ERR(group); + i--; break; } top = i; - stack[i++] = group; + stack[i] = group; /* * When booting only less CPUs of a system than CPUs are - * available, not all calculated hierarchy levels are required. + * available, not all calculated hierarchy levels are required, + * unless a node mismatch is detected. * * The loop is aborted as soon as the highest level, which might * be different from tmigr_hierarchy_levels, contains only a - * single group. + * single group, unless the nodes mismatch below tmigr_crossnode_level */ - if (group->parent || list_is_singular(&tmigr_level_list[i - 1])) + if (group->parent) break; + if ((!root_mismatch || i >= tmigr_crossnode_level) && + list_is_singular(&tmigr_level_list[i])) + break; + } - } while (i < tmigr_hierarchy_levels); - - /* Assert single root */ - WARN_ON_ONCE(!err && !group->parent && !list_is_singular(&tmigr_level_list[top])); + /* Assert single root without parent */ + if (WARN_ON_ONCE(i >= tmigr_hierarchy_levels)) + return -EINVAL; - while (i > 0) { - group = stack[--i]; + for (; i >= start_lvl; i--) { + group = stack[i]; if (err < 0) { list_del(&group->list); @@ -1692,12 +1817,10 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node) if (i == 0) { struct tmigr_cpu *tmc = per_cpu_ptr(&tmigr_cpu, cpu); - raw_spin_lock_irq(&group->lock); - tmc->tmgroup = group; tmc->groupmask = BIT(group->num_children++); - raw_spin_unlock_irq(&group->lock); + tmigr_init_root(group, activate); trace_tmigr_connect_cpu_parent(tmc); @@ -1705,42 +1828,58 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node) continue; } else { child = stack[i - 1]; - /* Will be activated at online time */ - tmigr_connect_child_parent(child, group, false); + tmigr_connect_child_parent(child, group, activate); } + } - /* check if uppermost level was newly created */ - if (top != i) - continue; - - WARN_ON_ONCE(top == 0); + if (err < 0) + goto out; - lvllist = &tmigr_level_list[top]; + if (activate) { + struct tmigr_walk data; + union tmigr_state state; /* - * Newly created root level should have accounted the upcoming - * CPU's child group and pre-accounted the old root. + * To prevent inconsistent states, active children need to be active in + * the new parent as well. Inactive children are already marked inactive + * in the parent group: + * + * * When new groups were created by tmigr_setup_groups() starting from + * the lowest level, then they are not active. They will be set active + * when the new online CPU comes active. + * + * * But if new groups above the current top level are required, it is + * mandatory to propagate the active state of the already existing + * child to the new parents. So tmigr_active_up() activates the + * new parents while walking up from the old root to the new. + * + * * It is ensured that @start is active, as this setup path is + * executed in hotplug prepare callback. This is executed by an + * already connected and !idle CPU. Even if all other CPUs go idle, + * the CPU executing the setup will be responsible up to current top + * level group. And the next time it goes inactive, it will release + * the new childmask and parent to subsequent walkers through this + * @child. Therefore propagate active state unconditionally. */ - if (group->num_children == 2 && list_is_singular(lvllist)) { - /* - * The target CPU must never do the prepare work, except - * on early boot when the boot CPU is the target. Otherwise - * it may spuriously activate the old top level group inside - * the new one (nevertheless whether old top level group is - * active or not) and/or release an uninitialized childmask. - */ - WARN_ON_ONCE(cpu == raw_smp_processor_id()); - - lvllist = &tmigr_level_list[top - 1]; - list_for_each_entry(child, lvllist, list) { - if (child->parent) - continue; + state.state = atomic_read(&start->migr_state); + WARN_ON_ONCE(!state.active); + WARN_ON_ONCE(!start->parent); + data.childmask = start->groupmask; + __walk_groups_from(tmigr_active_up, &data, start, start->parent); + } - tmigr_connect_child_parent(child, group, true); - } + /* Root update */ + if (list_is_singular(&tmigr_level_list[top])) { + group = list_first_entry(&tmigr_level_list[top], + typeof(*group), list); + WARN_ON_ONCE(group->parent); + if (tmigr_root) { + /* Old root should be the same or below */ + WARN_ON_ONCE(tmigr_root->level > top); } + tmigr_root = group; } - +out: kfree(stack); return err; @@ -1748,12 +1887,31 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node) static int tmigr_add_cpu(unsigned int cpu) { + struct tmigr_group *old_root = tmigr_root; int node = cpu_to_node(cpu); int ret; - mutex_lock(&tmigr_mutex); - ret = tmigr_setup_groups(cpu, node); - mutex_unlock(&tmigr_mutex); + guard(mutex)(&tmigr_mutex); + + ret = tmigr_setup_groups(cpu, node, NULL, false); + + /* Root has changed? Connect the old one to the new */ + if (ret >= 0 && old_root && old_root != tmigr_root) { + /* + * The target CPU must never do the prepare work, except + * on early boot when the boot CPU is the target. Otherwise + * it may spuriously activate the old top level group inside + * the new one (nevertheless whether old top level group is + * active or not) and/or release an uninitialized childmask. + */ + WARN_ON_ONCE(cpu == raw_smp_processor_id()); + /* + * The (likely) current CPU is expected to be online in the hierarchy, + * otherwise the old root may not be active as expected. + */ + WARN_ON_ONCE(!per_cpu_ptr(&tmigr_cpu, raw_smp_processor_id())->available); + ret = tmigr_setup_groups(-1, old_root->numa_node, old_root, true); + } return ret; } @@ -1798,6 +1956,11 @@ static int __init tmigr_init(void) if (ncpus == 1) return 0; + if (!zalloc_cpumask_var(&tmigr_available_cpumask, GFP_KERNEL)) { + ret = -ENOMEM; + goto err; + } + /* * Calculate the required hierarchy levels. Unfortunately there is no * reliable information available, unless all possible CPUs have been @@ -1847,7 +2010,7 @@ static int __init tmigr_init(void) goto err; ret = cpuhp_setup_state(CPUHP_AP_TMIGR_ONLINE, "tmigr:online", - tmigr_cpu_online, tmigr_cpu_offline); + tmigr_set_cpu_available, tmigr_clear_cpu_available); if (ret) goto err; diff --git a/kernel/time/timer_migration.h b/kernel/time/timer_migration.h index ae19f70f8170..70879cde6fdd 100644 --- a/kernel/time/timer_migration.h +++ b/kernel/time/timer_migration.h @@ -97,7 +97,7 @@ struct tmigr_group { */ struct tmigr_cpu { raw_spinlock_t lock; - bool online; + bool available; bool idle; bool remote; struct tmigr_group *tmgroup; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 42bd2ba68a82..59cfacb8a5bb 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1971,7 +1971,8 @@ static void ftrace_hash_rec_enable_modify(struct ftrace_ops *ops) */ static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops, struct ftrace_hash *old_hash, - struct ftrace_hash *new_hash) + struct ftrace_hash *new_hash, + bool update_target) { struct ftrace_page *pg; struct dyn_ftrace *rec, *end = NULL; @@ -2006,10 +2007,13 @@ static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops, if (rec->flags & FTRACE_FL_DISABLED) continue; - /* We need to update only differences of filter_hash */ + /* + * Unless we are updating the target of a direct function, + * we only need to update differences of filter_hash + */ in_old = !!ftrace_lookup_ip(old_hash, rec->ip); in_new = !!ftrace_lookup_ip(new_hash, rec->ip); - if (in_old == in_new) + if (!update_target && (in_old == in_new)) continue; if (in_new) { @@ -2020,7 +2024,16 @@ static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops, if (is_ipmodify) goto rollback; - FTRACE_WARN_ON(rec->flags & FTRACE_FL_DIRECT); + /* + * If this is called by __modify_ftrace_direct() + * then it is only changing where the direct + * pointer is jumping to, and the record already + * points to a direct trampoline. If it isn't, + * then it is a bug to update ipmodify on a direct + * caller. + */ + FTRACE_WARN_ON(!update_target && + (rec->flags & FTRACE_FL_DIRECT)); /* * Another ops with IPMODIFY is already @@ -2076,7 +2089,7 @@ static int ftrace_hash_ipmodify_enable(struct ftrace_ops *ops) if (ftrace_hash_empty(hash)) hash = NULL; - return __ftrace_hash_update_ipmodify(ops, EMPTY_HASH, hash); + return __ftrace_hash_update_ipmodify(ops, EMPTY_HASH, hash, false); } /* Disabling always succeeds */ @@ -2087,7 +2100,7 @@ static void ftrace_hash_ipmodify_disable(struct ftrace_ops *ops) if (ftrace_hash_empty(hash)) hash = NULL; - __ftrace_hash_update_ipmodify(ops, hash, EMPTY_HASH); + __ftrace_hash_update_ipmodify(ops, hash, EMPTY_HASH, false); } static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops, @@ -2101,7 +2114,7 @@ static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops, if (ftrace_hash_empty(new_hash)) new_hash = NULL; - return __ftrace_hash_update_ipmodify(ops, old_hash, new_hash); + return __ftrace_hash_update_ipmodify(ops, old_hash, new_hash, false); } static void print_ip_ins(const char *fmt, const unsigned char *p) @@ -5953,6 +5966,17 @@ static void register_ftrace_direct_cb(struct rcu_head *rhp) free_ftrace_hash(fhp); } +static void reset_direct(struct ftrace_ops *ops, unsigned long addr) +{ + struct ftrace_hash *hash = ops->func_hash->filter_hash; + + remove_direct_functions_hash(hash, addr); + + /* cleanup for possible another register call */ + ops->func = NULL; + ops->trampoline = 0; +} + /** * register_ftrace_direct - Call a custom trampoline directly * for multiple functions registered in @ops @@ -6048,6 +6072,8 @@ int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) ops->direct_call = addr; err = register_ftrace_function_nolock(ops); + if (err) + reset_direct(ops, addr); out_unlock: mutex_unlock(&direct_mutex); @@ -6080,7 +6106,6 @@ EXPORT_SYMBOL_GPL(register_ftrace_direct); int unregister_ftrace_direct(struct ftrace_ops *ops, unsigned long addr, bool free_filters) { - struct ftrace_hash *hash = ops->func_hash->filter_hash; int err; if (check_direct_multi(ops)) @@ -6090,13 +6115,9 @@ int unregister_ftrace_direct(struct ftrace_ops *ops, unsigned long addr, mutex_lock(&direct_mutex); err = unregister_ftrace_function(ops); - remove_direct_functions_hash(hash, addr); + reset_direct(ops, addr); mutex_unlock(&direct_mutex); - /* cleanup for possible another register call */ - ops->func = NULL; - ops->trampoline = 0; - if (free_filters) ftrace_free_filter(ops); return err; @@ -6106,7 +6127,7 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_direct); static int __modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) { - struct ftrace_hash *hash; + struct ftrace_hash *hash = ops->func_hash->filter_hash; struct ftrace_func_entry *entry, *iter; static struct ftrace_ops tmp_ops = { .func = ftrace_stub, @@ -6127,12 +6148,20 @@ __modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) return err; /* + * Call __ftrace_hash_update_ipmodify() here, so that we can call + * ops->ops_func for the ops. This is needed because the above + * register_ftrace_function_nolock() worked on tmp_ops. + */ + err = __ftrace_hash_update_ipmodify(ops, hash, hash, true); + if (err) + goto out; + + /* * Now the ftrace_ops_list_func() is called to do the direct callers. * We can safely change the direct functions attached to each entry. */ mutex_lock(&ftrace_lock); - hash = ops->func_hash->filter_hash; size = 1 << hash->size_bits; for (i = 0; i < size; i++) { hlist_for_each_entry(iter, &hash->buckets[i], hlist) { @@ -6147,6 +6176,7 @@ __modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) mutex_unlock(&ftrace_lock); +out: /* Removing the tmp_ops will add the updated direct callers to the functions */ unregister_ftrace_function(&tmp_ops); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d1e527cf2aae..304e93597126 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -8781,8 +8781,18 @@ static void tracing_buffers_mmap_close(struct vm_area_struct *vma) put_snapshot_map(iter->tr); } +static int tracing_buffers_may_split(struct vm_area_struct *vma, unsigned long addr) +{ + /* + * Trace buffer mappings require the complete buffer including + * the meta page. Partial mappings are not supported. + */ + return -EINVAL; +} + static const struct vm_operations_struct tracing_buffers_vmops = { .close = tracing_buffers_mmap_close, + .may_split = tracing_buffers_may_split, }; static int tracing_buffers_mmap(struct file *filp, struct vm_area_struct *vma) diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index c428dafe7496..b15854c75d4f 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -1449,12 +1449,7 @@ static struct trace_event_functions user_event_funcs = { static int user_event_set_call_visible(struct user_event *user, bool visible) { - int ret; - const struct cred *old_cred; - struct cred *cred; - - cred = prepare_creds(); - + CLASS(prepare_creds, cred)(); if (!cred) return -ENOMEM; @@ -1469,17 +1464,12 @@ static int user_event_set_call_visible(struct user_event *user, bool visible) */ cred->fsuid = GLOBAL_ROOT_UID; - old_cred = override_creds(cred); - - if (visible) - ret = trace_add_event_call(&user->call); - else - ret = trace_remove_event_call(&user->call); - - revert_creds(old_cred); - put_cred(cred); + scoped_with_creds(cred) { + if (visible) + return trace_add_event_call(&user->call); - return ret; + return trace_remove_event_call(&user->call); + } } static int destroy_user_event(struct user_event *user) diff --git a/kernel/unwind/deferred.c b/kernel/unwind/deferred.c index dc6040aae3ee..a88fb481c4a3 100644 --- a/kernel/unwind/deferred.c +++ b/kernel/unwind/deferred.c @@ -53,7 +53,7 @@ DEFINE_STATIC_SRCU(unwind_srcu); static inline bool unwind_pending(struct unwind_task_info *info) { - return test_bit(UNWIND_PENDING_BIT, &info->unwind_mask); + return atomic_long_read(&info->unwind_mask) & UNWIND_PENDING; } /* @@ -79,6 +79,8 @@ static u64 get_cookie(struct unwind_task_info *info) { u32 cnt = 1; + lockdep_assert_irqs_disabled(); + if (info->id.cpu) return info->id.id; @@ -126,23 +128,20 @@ int unwind_user_faultable(struct unwind_stacktrace *trace) cache = info->cache; trace->entries = cache->entries; - - if (cache->nr_entries) { - /* - * The user stack has already been previously unwound in this - * entry context. Skip the unwind and use the cache. - */ - trace->nr = cache->nr_entries; + trace->nr = cache->nr_entries; + /* + * The user stack has already been previously unwound in this + * entry context. Skip the unwind and use the cache. + */ + if (trace->nr) return 0; - } - trace->nr = 0; unwind_user(trace, UNWIND_MAX_ENTRIES); cache->nr_entries = trace->nr; /* Clear nr_entries on way back to user space */ - set_bit(UNWIND_USED_BIT, &info->unwind_mask); + atomic_long_or(UNWIND_USED, &info->unwind_mask); return 0; } @@ -160,7 +159,7 @@ static void process_unwind_deferred(struct task_struct *task) /* Clear pending bit but make sure to have the current bits */ bits = atomic_long_fetch_andnot(UNWIND_PENDING, - (atomic_long_t *)&info->unwind_mask); + &info->unwind_mask); /* * From here on out, the callback must always be called, even if it's * just an empty trace. @@ -231,6 +230,7 @@ void unwind_deferred_task_exit(struct task_struct *task) int unwind_deferred_request(struct unwind_work *work, u64 *cookie) { struct unwind_task_info *info = ¤t->unwind_info; + int twa_mode = TWA_RESUME; unsigned long old, bits; unsigned long bit; int ret; @@ -246,8 +246,11 @@ int unwind_deferred_request(struct unwind_work *work, u64 *cookie) * Trigger a warning to make it obvious that an architecture * is using this in NMI when it should not be. */ - if (WARN_ON_ONCE(!CAN_USE_IN_NMI && in_nmi())) - return -EINVAL; + if (in_nmi()) { + if (WARN_ON_ONCE(!CAN_USE_IN_NMI)) + return -EINVAL; + twa_mode = TWA_NMI_CURRENT; + } /* Do not allow cancelled works to request again */ bit = READ_ONCE(work->bit); @@ -261,7 +264,7 @@ int unwind_deferred_request(struct unwind_work *work, u64 *cookie) *cookie = get_cookie(info); - old = READ_ONCE(info->unwind_mask); + old = atomic_long_read(&info->unwind_mask); /* Is this already queued or executed */ if (old & bit) @@ -274,7 +277,7 @@ int unwind_deferred_request(struct unwind_work *work, u64 *cookie) * to have a callback. */ bits = UNWIND_PENDING | bit; - old = atomic_long_fetch_or(bits, (atomic_long_t *)&info->unwind_mask); + old = atomic_long_fetch_or(bits, &info->unwind_mask); if (old & bits) { /* * If the work's bit was set, whatever set it had better @@ -285,10 +288,10 @@ int unwind_deferred_request(struct unwind_work *work, u64 *cookie) } /* The work has been claimed, now schedule it. */ - ret = task_work_add(current, &info->work, TWA_RESUME); + ret = task_work_add(current, &info->work, twa_mode); if (WARN_ON_ONCE(ret)) - WRITE_ONCE(info->unwind_mask, 0); + atomic_long_set(&info->unwind_mask, 0); return ret; } @@ -320,7 +323,8 @@ void unwind_deferred_cancel(struct unwind_work *work) guard(rcu)(); /* Clear this bit from all threads */ for_each_process_thread(g, t) { - clear_bit(bit, &t->unwind_info.unwind_mask); + atomic_long_andnot(BIT(bit), + &t->unwind_info.unwind_mask); if (t->unwind_info.cache) clear_bit(bit, &t->unwind_info.cache->unwind_completed); } @@ -350,7 +354,7 @@ void unwind_task_init(struct task_struct *task) memset(info, 0, sizeof(*info)); init_task_work(&info->work, unwind_deferred_task_work); - info->unwind_mask = 0; + atomic_long_set(&info->unwind_mask, 0); } void unwind_task_free(struct task_struct *task) diff --git a/kernel/unwind/user.c b/kernel/unwind/user.c index 97a8415e3216..39e270789444 100644 --- a/kernel/unwind/user.c +++ b/kernel/unwind/user.c @@ -8,18 +8,28 @@ #include <linux/unwind_user.h> #include <linux/uaccess.h> -static const struct unwind_user_frame fp_frame = { - ARCH_INIT_USER_FP_FRAME -}; - #define for_each_user_frame(state) \ for (unwind_user_start(state); !(state)->done; unwind_user_next(state)) -static int unwind_user_next_fp(struct unwind_user_state *state) +static inline int +get_user_word(unsigned long *word, unsigned long base, int off, unsigned int ws) +{ + unsigned long __user *addr = (void __user *)base + off; +#ifdef CONFIG_COMPAT + if (ws == sizeof(int)) { + unsigned int data; + int ret = get_user(data, (unsigned int __user *)addr); + *word = data; + return ret; + } +#endif + return get_user(*word, addr); +} + +static int unwind_user_next_common(struct unwind_user_state *state, + const struct unwind_user_frame *frame) { - const struct unwind_user_frame *frame = &fp_frame; unsigned long cfa, fp, ra; - unsigned int shift; if (frame->use_fp) { if (state->fp < state->sp) @@ -37,24 +47,45 @@ static int unwind_user_next_fp(struct unwind_user_state *state) return -EINVAL; /* Make sure that the address is word aligned */ - shift = sizeof(long) == 4 ? 2 : 3; - if (cfa & ((1 << shift) - 1)) + if (cfa & (state->ws - 1)) return -EINVAL; /* Find the Return Address (RA) */ - if (get_user(ra, (unsigned long *)(cfa + frame->ra_off))) + if (get_user_word(&ra, cfa, frame->ra_off, state->ws)) return -EINVAL; - if (frame->fp_off && get_user(fp, (unsigned long __user *)(cfa + frame->fp_off))) + if (frame->fp_off && get_user_word(&fp, cfa, frame->fp_off, state->ws)) return -EINVAL; state->ip = ra; state->sp = cfa; if (frame->fp_off) state->fp = fp; + state->topmost = false; return 0; } +static int unwind_user_next_fp(struct unwind_user_state *state) +{ +#ifdef CONFIG_HAVE_UNWIND_USER_FP + struct pt_regs *regs = task_pt_regs(current); + + if (state->topmost && unwind_user_at_function_start(regs)) { + const struct unwind_user_frame fp_entry_frame = { + ARCH_INIT_USER_FP_ENTRY_FRAME(state->ws) + }; + return unwind_user_next_common(state, &fp_entry_frame); + } + + const struct unwind_user_frame fp_frame = { + ARCH_INIT_USER_FP_FRAME(state->ws) + }; + return unwind_user_next_common(state, &fp_frame); +#else + return -EINVAL; +#endif +} + static int unwind_user_next(struct unwind_user_state *state) { unsigned long iter_mask = state->available_types; @@ -102,6 +133,12 @@ static int unwind_user_start(struct unwind_user_state *state) state->ip = instruction_pointer(regs); state->sp = user_stack_pointer(regs); state->fp = frame_pointer(regs); + state->ws = unwind_user_word_size(regs); + if (!state->ws) { + state->done = true; + return -EINVAL; + } + state->topmost = true; return 0; } diff --git a/kernel/user.c b/kernel/user.c index 0163665914c9..7aef4e679a6a 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -35,6 +35,7 @@ EXPORT_SYMBOL_GPL(init_binfmt_misc); * and 1 for... ? */ struct user_namespace init_user_ns = { + .ns = NS_COMMON_INIT(init_user_ns), .uid_map = { { .extent[0] = { @@ -65,14 +66,8 @@ struct user_namespace init_user_ns = { .nr_extents = 1, }, }, - .ns.ns_type = ns_common_type(&init_user_ns), - .ns.__ns_ref = REFCOUNT_INIT(3), .owner = GLOBAL_ROOT_UID, .group = GLOBAL_ROOT_GID, - .ns.inum = ns_init_inum(&init_user_ns), -#ifdef CONFIG_USER_NS - .ns.ops = &userns_operations, -#endif .flags = USERNS_INIT_FLAGS, #ifdef CONFIG_KEYS .keyring_name_list = LIST_HEAD_INIT(init_user_ns.keyring_name_list), diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index 7e45559521af..52f89f1137da 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c @@ -119,9 +119,9 @@ static bool post_one_notification(struct watch_queue *wqueue, offset = note % WATCH_QUEUE_NOTES_PER_PAGE * WATCH_QUEUE_NOTE_SIZE; get_page(page); len = n->info & WATCH_INFO_LENGTH; - p = kmap_atomic(page); + p = kmap_local_page(page); memcpy(p + offset, n, len); - kunmap_atomic(p); + kunmap_local(p); buf = pipe_buf(pipe, head); buf->page = page; |
