diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/cgroup/rstat.c | 25 | ||||
-rw-r--r-- | kernel/futex/core.c | 2 | ||||
-rw-r--r-- | kernel/locking/mutex.c | 21 | ||||
-rw-r--r-- | kernel/locking/rtmutex_api.c | 33 | ||||
-rw-r--r-- | kernel/module/internal.h | 7 | ||||
-rw-r--r-- | kernel/module/main.c | 27 | ||||
-rw-r--r-- | kernel/module/strict_rwx.c | 47 | ||||
-rw-r--r-- | kernel/sched/core.c | 9 | ||||
-rw-r--r-- | kernel/sched/debug.c | 4 | ||||
-rw-r--r-- | kernel/sched/ext_idle.c | 37 | ||||
-rw-r--r-- | kernel/sched/fair.c | 3 | ||||
-rw-r--r-- | kernel/trace/ftrace.c | 13 |
12 files changed, 152 insertions, 76 deletions
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index ce4752ab9e09..cbeaa499a96a 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -47,8 +47,20 @@ static spinlock_t *ss_rstat_lock(struct cgroup_subsys *ss) static raw_spinlock_t *ss_rstat_cpu_lock(struct cgroup_subsys *ss, int cpu) { - if (ss) + if (ss) { + /* + * Depending on config, the subsystem per-cpu lock type may be an + * empty struct. In environments where this is the case, allocation + * of this field is not performed in ss_rstat_init(). Avoid a + * cpu-based offset relative to NULL by returning early. When the + * lock type is zero in size, the corresponding lock functions are + * no-ops so passing them NULL is acceptable. + */ + if (sizeof(*ss->rstat_ss_cpu_lock) == 0) + return NULL; + return per_cpu_ptr(ss->rstat_ss_cpu_lock, cpu); + } return per_cpu_ptr(&rstat_base_cpu_lock, cpu); } @@ -510,20 +522,15 @@ int __init ss_rstat_init(struct cgroup_subsys *ss) { int cpu; -#ifdef CONFIG_SMP /* - * On uniprocessor machines, arch_spinlock_t is defined as an empty - * struct. Avoid allocating a size of zero by having this block - * excluded in this case. It's acceptable to leave the subsystem locks - * unitialized since the associated lock functions are no-ops in the - * non-smp case. + * Depending on config, the subsystem per-cpu lock type may be an empty + * struct. Avoid allocating a size of zero in this case. 
*/ - if (ss) { + if (ss && sizeof(*ss->rstat_ss_cpu_lock)) { ss->rstat_ss_cpu_lock = alloc_percpu(raw_spinlock_t); if (!ss->rstat_ss_cpu_lock) return -ENOMEM; } -#endif spin_lock_init(ss_rstat_lock(ss)); for_each_possible_cpu(cpu) diff --git a/kernel/futex/core.c b/kernel/futex/core.c index 19a2c65f3d37..565f9717c6ca 100644 --- a/kernel/futex/core.c +++ b/kernel/futex/core.c @@ -531,7 +531,7 @@ static u64 get_inode_sequence_number(struct inode *inode) * * For shared mappings (when @fshared), the key is: * - * ( inode->i_sequence, page->index, offset_within_page ) + * ( inode->i_sequence, page offset within mapping, offset_within_page ) * * [ also see get_inode_sequence_number() ] * diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 61fa97da7989..a39ecccbd106 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -809,11 +809,12 @@ _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest) EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock); int __sched -mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass) +_mutex_lock_killable(struct mutex *lock, unsigned int subclass, + struct lockdep_map *nest) { - return __mutex_lock(lock, TASK_KILLABLE, subclass, NULL, _RET_IP_); + return __mutex_lock(lock, TASK_KILLABLE, subclass, nest, _RET_IP_); } -EXPORT_SYMBOL_GPL(mutex_lock_killable_nested); +EXPORT_SYMBOL_GPL(_mutex_lock_killable); int __sched mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass) @@ -1063,6 +1064,7 @@ __ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock, #endif +#ifndef CONFIG_DEBUG_LOCK_ALLOC /** * mutex_trylock - try to acquire the mutex, without waiting * @lock: the mutex to be acquired @@ -1079,17 +1081,24 @@ __ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock, */ int __sched mutex_trylock(struct mutex *lock) { + MUTEX_WARN_ON(lock->magic != lock); + return __mutex_trylock(lock); +} +EXPORT_SYMBOL(mutex_trylock); +#else +int __sched _mutex_trylock_nest_lock(struct mutex 
*lock, struct lockdep_map *nest_lock) +{ bool locked; MUTEX_WARN_ON(lock->magic != lock); - locked = __mutex_trylock(lock); if (locked) - mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); + mutex_acquire_nest(&lock->dep_map, 0, 1, nest_lock, _RET_IP_); return locked; } -EXPORT_SYMBOL(mutex_trylock); +EXPORT_SYMBOL(_mutex_trylock_nest_lock); +#endif #ifndef CONFIG_DEBUG_LOCK_ALLOC int __sched diff --git a/kernel/locking/rtmutex_api.c b/kernel/locking/rtmutex_api.c index 191e4720e546..2d933528a0fa 100644 --- a/kernel/locking/rtmutex_api.c +++ b/kernel/locking/rtmutex_api.c @@ -544,12 +544,12 @@ int __sched mutex_lock_interruptible_nested(struct mutex *lock, } EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); -int __sched mutex_lock_killable_nested(struct mutex *lock, - unsigned int subclass) +int __sched _mutex_lock_killable(struct mutex *lock, unsigned int subclass, + struct lockdep_map *nest_lock) { - return __mutex_lock_common(lock, TASK_KILLABLE, subclass, NULL, _RET_IP_); + return __mutex_lock_common(lock, TASK_KILLABLE, subclass, nest_lock, _RET_IP_); } -EXPORT_SYMBOL_GPL(mutex_lock_killable_nested); +EXPORT_SYMBOL_GPL(_mutex_lock_killable); void __sched mutex_lock_io_nested(struct mutex *lock, unsigned int subclass) { @@ -563,6 +563,21 @@ void __sched mutex_lock_io_nested(struct mutex *lock, unsigned int subclass) } EXPORT_SYMBOL_GPL(mutex_lock_io_nested); +int __sched _mutex_trylock_nest_lock(struct mutex *lock, + struct lockdep_map *nest_lock) +{ + int ret; + + if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES) && WARN_ON_ONCE(!in_task())) + return 0; + + ret = __rt_mutex_trylock(&lock->rtmutex); + if (ret) + mutex_acquire_nest(&lock->dep_map, 0, 1, nest_lock, _RET_IP_); + + return ret; +} +EXPORT_SYMBOL_GPL(_mutex_trylock_nest_lock); #else /* CONFIG_DEBUG_LOCK_ALLOC */ void __sched mutex_lock(struct mutex *lock) @@ -591,22 +606,16 @@ void __sched mutex_lock_io(struct mutex *lock) io_schedule_finish(token); } EXPORT_SYMBOL(mutex_lock_io); -#endif /* 
!CONFIG_DEBUG_LOCK_ALLOC */ int __sched mutex_trylock(struct mutex *lock) { - int ret; - if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES) && WARN_ON_ONCE(!in_task())) return 0; - ret = __rt_mutex_trylock(&lock->rtmutex); - if (ret) - mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); - - return ret; + return __rt_mutex_trylock(&lock->rtmutex); } EXPORT_SYMBOL(mutex_trylock); +#endif /* !CONFIG_DEBUG_LOCK_ALLOC */ void __sched mutex_unlock(struct mutex *lock) { diff --git a/kernel/module/internal.h b/kernel/module/internal.h index 626cf8668a7e..8d74b0a21c82 100644 --- a/kernel/module/internal.h +++ b/kernel/module/internal.h @@ -322,8 +322,11 @@ int module_enable_rodata_ro(const struct module *mod); int module_enable_rodata_ro_after_init(const struct module *mod); int module_enable_data_nx(const struct module *mod); int module_enable_text_rox(const struct module *mod); -int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, - char *secstrings, struct module *mod); +int module_enforce_rwx_sections(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, + const char *secstrings, + const struct module *mod); +void module_mark_ro_after_init(const Elf_Ehdr *hdr, Elf_Shdr *sechdrs, + const char *secstrings); #ifdef CONFIG_MODULE_SIG int module_sig_check(struct load_info *info, int flags); diff --git a/kernel/module/main.c b/kernel/module/main.c index 5c6ab20240a6..3d64e69cc03e 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -1562,12 +1562,11 @@ static void __layout_sections(struct module *mod, struct load_info *info, bool i { unsigned int m, i; + /* + * { Mask of required section header flags, + * Mask of excluded section header flags } + */ static const unsigned long masks[][2] = { - /* - * NOTE: all executable code must be the first section - * in this array; otherwise modify the text_size - * finder in the two loops below - */ { SHF_EXECINSTR | SHF_ALLOC, ARCH_SHF_SMALL }, { SHF_ALLOC, SHF_WRITE | ARCH_SHF_SMALL }, { SHF_RO_AFTER_INIT | SHF_ALLOC, ARCH_SHF_SMALL }, 
@@ -2768,7 +2767,6 @@ core_param(module_blacklist, module_blacklist, charp, 0400); static struct module *layout_and_allocate(struct load_info *info, int flags) { struct module *mod; - unsigned int ndx; int err; /* Allow arches to frob section contents and sizes. */ @@ -2786,22 +2784,11 @@ static struct module *layout_and_allocate(struct load_info *info, int flags) info->sechdrs[info->index.pcpu].sh_flags &= ~(unsigned long)SHF_ALLOC; /* - * Mark ro_after_init section with SHF_RO_AFTER_INIT so that - * layout_sections() can put it in the right place. + * Mark relevant sections as SHF_RO_AFTER_INIT so layout_sections() can + * put them in the right place. * Note: ro_after_init sections also have SHF_{WRITE,ALLOC} set. */ - ndx = find_sec(info, ".data..ro_after_init"); - if (ndx) - info->sechdrs[ndx].sh_flags |= SHF_RO_AFTER_INIT; - /* - * Mark the __jump_table section as ro_after_init as well: these data - * structures are never modified, with the exception of entries that - * refer to code in the __init section, which are annotated as such - * at module load time. - */ - ndx = find_sec(info, "__jump_table"); - if (ndx) - info->sechdrs[ndx].sh_flags |= SHF_RO_AFTER_INIT; + module_mark_ro_after_init(info->hdr, info->sechdrs, info->secstrings); /* * Determine total sizes, and put offsets in sh_entsize. 
For now diff --git a/kernel/module/strict_rwx.c b/kernel/module/strict_rwx.c index 03f4142cfbf4..8fd438529fbc 100644 --- a/kernel/module/strict_rwx.c +++ b/kernel/module/strict_rwx.c @@ -87,8 +87,9 @@ int module_enable_data_nx(const struct module *mod) return 0; } -int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, - char *secstrings, struct module *mod) +int module_enforce_rwx_sections(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, + const char *secstrings, + const struct module *mod) { const unsigned long shf_wx = SHF_WRITE | SHF_EXECINSTR; int i; @@ -106,3 +107,45 @@ int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, return 0; } + +static const char *const ro_after_init[] = { + /* + * Section .data..ro_after_init holds data explicitly annotated by + * __ro_after_init. + */ + ".data..ro_after_init", + + /* + * Section __jump_table holds data structures that are never modified, + * with the exception of entries that refer to code in the __init + * section, which are marked as such at module load time. + */ + "__jump_table", + +#ifdef CONFIG_HAVE_STATIC_CALL_INLINE + /* + * Section .static_call_sites holds data structures that need to be + * sorted and processed at module load time but are never modified + * afterwards. 
+ */ + ".static_call_sites", +#endif +}; + +void module_mark_ro_after_init(const Elf_Ehdr *hdr, Elf_Shdr *sechdrs, + const char *secstrings) +{ + int i, j; + + for (i = 1; i < hdr->e_shnum; i++) { + Elf_Shdr *shdr = &sechdrs[i]; + + for (j = 0; j < ARRAY_SIZE(ro_after_init); j++) { + if (strcmp(secstrings + shdr->sh_name, + ro_after_init[j]) == 0) { + shdr->sh_flags |= SHF_RO_AFTER_INIT; + break; + } + } + } +} diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 62b3416f5e43..dce50fa57471 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3362,6 +3362,10 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) #ifdef CONFIG_NUMA_BALANCING static void __migrate_swap_task(struct task_struct *p, int cpu) { + __schedstat_inc(p->stats.numa_task_swapped); + count_vm_numa_event(NUMA_TASK_SWAP); + count_memcg_event_mm(p->mm, NUMA_TASK_SWAP); + if (task_on_rq_queued(p)) { struct rq *src_rq, *dst_rq; struct rq_flags srf, drf; @@ -7930,8 +7934,9 @@ int migrate_task_to(struct task_struct *p, int target_cpu) if (!cpumask_test_cpu(target_cpu, p->cpus_ptr)) return -EINVAL; - /* TODO: This is not properly updating schedstats */ - + __schedstat_inc(p->stats.numa_task_migrated); + count_vm_numa_event(NUMA_TASK_MIGRATE); + count_memcg_event_mm(p->mm, NUMA_TASK_MIGRATE); trace_sched_move_numa(p, curr_cpu, target_cpu); return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg); } diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 557246880a7e..9d71baf08075 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -1210,6 +1210,10 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, P_SCHEDSTAT(nr_failed_migrations_running); P_SCHEDSTAT(nr_failed_migrations_hot); P_SCHEDSTAT(nr_forced_migrations); +#ifdef CONFIG_NUMA_BALANCING + P_SCHEDSTAT(numa_task_migrated); + P_SCHEDSTAT(numa_task_swapped); +#endif P_SCHEDSTAT(nr_wakeups); P_SCHEDSTAT(nr_wakeups_sync); P_SCHEDSTAT(nr_wakeups_migrate); diff --git 
a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c index 66da03cc0b33..6d29d3cbc670 100644 --- a/kernel/sched/ext_idle.c +++ b/kernel/sched/ext_idle.c @@ -138,6 +138,7 @@ found: goto retry; } +#ifdef CONFIG_NUMA /* * Tracks nodes that have not yet been visited when searching for an idle * CPU across all available nodes. @@ -186,6 +187,13 @@ static s32 pick_idle_cpu_from_online_nodes(const struct cpumask *cpus_allowed, i return cpu; } +#else +static inline s32 +pick_idle_cpu_from_online_nodes(const struct cpumask *cpus_allowed, int node, u64 flags) +{ + return -EBUSY; +} +#endif /* * Find an idle CPU in the system, starting from @node. @@ -447,11 +455,18 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, const struct cpumask *llc_cpus = NULL, *numa_cpus = NULL; const struct cpumask *allowed = cpus_allowed ?: p->cpus_ptr; int node = scx_cpu_node_if_enabled(prev_cpu); + bool is_prev_allowed; s32 cpu; preempt_disable(); /* + * Check whether @prev_cpu is still within the allowed set. If not, + * we can still try selecting a nearby CPU. + */ + is_prev_allowed = cpumask_test_cpu(prev_cpu, allowed); + + /* * Determine the subset of CPUs usable by @p within @cpus_allowed. */ if (allowed != p->cpus_ptr) { @@ -465,21 +480,6 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, cpu = -EBUSY; goto out_enable; } - - /* - * If @prev_cpu is not in the allowed CPUs, skip topology - * optimizations and try to pick any idle CPU usable by the - * task. - * - * If %SCX_OPS_BUILTIN_IDLE_PER_NODE is enabled, prioritize - * the current node, as it may optimize some waker->wakee - * workloads. - */ - if (!cpumask_test_cpu(prev_cpu, allowed)) { - node = scx_cpu_node_if_enabled(smp_processor_id()); - cpu = scx_pick_idle_cpu(allowed, node, flags); - goto out_enable; - } } /* @@ -525,7 +525,7 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, * then avoid a migration. 
*/ cpu = smp_processor_id(); - if (cpus_share_cache(cpu, prev_cpu) && + if (is_prev_allowed && cpus_share_cache(cpu, prev_cpu) && scx_idle_test_and_clear_cpu(prev_cpu)) { cpu = prev_cpu; goto out_unlock; @@ -562,7 +562,8 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, /* * Keep using @prev_cpu if it's part of a fully idle core. */ - if (cpumask_test_cpu(prev_cpu, idle_cpumask(node)->smt) && + if (is_prev_allowed && + cpumask_test_cpu(prev_cpu, idle_cpumask(node)->smt) && scx_idle_test_and_clear_cpu(prev_cpu)) { cpu = prev_cpu; goto out_unlock; @@ -611,7 +612,7 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, /* * Use @prev_cpu if it's idle. */ - if (scx_idle_test_and_clear_cpu(prev_cpu)) { + if (is_prev_allowed && scx_idle_test_and_clear_cpu(prev_cpu)) { cpu = prev_cpu; goto out_unlock; } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b4326827e326..7a14da5396fb 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2273,7 +2273,8 @@ static bool task_numa_compare(struct task_numa_env *env, rcu_read_lock(); cur = rcu_dereference(dst_rq->curr); - if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur))) + if (cur && ((cur->flags & (PF_EXITING | PF_KTHREAD)) || + !cur->mm)) cur = NULL; /* diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 1af952cba48d..a7291685902e 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -7438,9 +7438,10 @@ void ftrace_release_mod(struct module *mod) mutex_lock(&ftrace_lock); - if (ftrace_disabled) - goto out_unlock; - + /* + * To avoid the UAF problem after the module is unloaded, the + * 'mod_map' resource needs to be released unconditionally. 
+ */ list_for_each_entry_safe(mod_map, n, &ftrace_mod_maps, list) { if (mod_map->mod == mod) { list_del_rcu(&mod_map->list); @@ -7449,6 +7450,9 @@ void ftrace_release_mod(struct module *mod) } } + if (ftrace_disabled) + goto out_unlock; + /* * Each module has its own ftrace_pages, remove * them from the list. @@ -7627,6 +7631,9 @@ allocate_ftrace_mod_map(struct module *mod, { struct ftrace_mod_map *mod_map; + if (ftrace_disabled) + return NULL; + mod_map = kmalloc(sizeof(*mod_map), GFP_KERNEL); if (!mod_map) return NULL; |