From 7318d4cc14c8c8a5dde2b0b72ea50fd2545f0b7a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 21 Apr 2020 12:09:13 +0200 Subject: sched: Provide sched_set_fifo() SCHED_FIFO (or any static priority scheduler) is a broken scheduler model; it is fundamentally incapable of resource management, the one thing an OS is actually supposed to do. It is impossible to compose static priority workloads. One cannot take two well designed and functional static priority workloads and mash them together and still expect them to work. Therefore it doesn't make sense to expose the priority field; the kernel is fundamentally incapable of setting a sensible value, it needs systems knowledge that it doesn't have. Take away sched_setschedule() / sched_setattr() from modules and replace them with: - sched_set_fifo(p); create a FIFO task (at prio 50) - sched_set_fifo_low(p); create a task higher than NORMAL, which ends up being a FIFO task at prio 1. - sched_set_normal(p, nice); (re)set the task to normal This stops the proliferation of randomly chosen, and irrelevant, FIFO priorities that dont't really mean anything anyway. The system administrator/integrator, whoever has insight into the actual system design and requirements (userspace) can set-up appropriate priorities if and when needed. Cc: airlied@redhat.com Cc: alexander.deucher@amd.com Cc: awalls@md.metrocast.net Cc: axboe@kernel.dk Cc: broonie@kernel.org Cc: daniel.lezcano@linaro.org Cc: gregkh@linuxfoundation.org Cc: hannes@cmpxchg.org Cc: herbert@gondor.apana.org.au Cc: hverkuil@xs4all.nl Cc: john.stultz@linaro.org Cc: nico@fluxnic.net Cc: paulmck@kernel.org Cc: rafael.j.wysocki@intel.com Cc: rmk+kernel@arm.linux.org.uk Cc: sudeep.holla@arm.com Cc: tglx@linutronix.de Cc: ulf.hansson@linaro.org Cc: wim@linux-watchdog.org Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ingo Molnar Tested-by: Paul E. McKenney --- include/linux/sched.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index b62e6aaf28f0..b792b8f0f4cf 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1653,6 +1653,9 @@ extern int idle_cpu(int cpu); extern int available_idle_cpu(int cpu); extern int sched_setscheduler(struct task_struct *, int, const struct sched_param *); extern int sched_setscheduler_nocheck(struct task_struct *, int, const struct sched_param *); +extern int sched_set_fifo(struct task_struct *p); +extern int sched_set_fifo_low(struct task_struct *p); +extern int sched_set_normal(struct task_struct *p, int nice); extern int sched_setattr(struct task_struct *, const struct sched_attr *); extern int sched_setattr_nocheck(struct task_struct *, const struct sched_attr *); extern struct task_struct *idle_task(int cpu); -- cgit From 8b700983de82f79e05b2c1136d6513ea4c9b22c4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 22 Apr 2020 13:10:04 +0200 Subject: sched: Remove sched_set_*() return value Ingo suggested that since the new sched_set_*() functions are implemented using the 'nocheck' variants, they really shouldn't ever fail, so remove the return value. Cc: axboe@kernel.dk Cc: daniel.lezcano@linaro.org Cc: sudeep.holla@arm.com Cc: airlied@redhat.com Cc: broonie@kernel.org Cc: paulmck@kernel.org Suggested-by: Ingo Molnar Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ingo Molnar --- include/linux/sched.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index b792b8f0f4cf..ae7664492af2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1653,9 +1653,9 @@ extern int idle_cpu(int cpu); extern int available_idle_cpu(int cpu); extern int sched_setscheduler(struct task_struct *, int, const struct sched_param *); extern int sched_setscheduler_nocheck(struct task_struct *, int, const struct sched_param *); -extern int sched_set_fifo(struct task_struct *p); -extern int sched_set_fifo_low(struct task_struct *p); -extern int sched_set_normal(struct task_struct *p, int nice); +extern void sched_set_fifo(struct task_struct *p); +extern void sched_set_fifo_low(struct task_struct *p); +extern void sched_set_normal(struct task_struct *p, int nice); extern int sched_setattr(struct task_struct *, const struct sched_attr *); extern int sched_setattr_nocheck(struct task_struct *, const struct sched_attr *); extern struct task_struct *idle_task(int cpu); -- cgit From 4f311afc2035f7b33d97e69ffaa2e6cd67fca16d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 10 Jun 2020 12:14:09 +0200 Subject: sched/core: Fix CONFIG_GCC_PLUGIN_RANDSTRUCT build fail As a temporary build fix, the proper cleanup needs more work. Reported-by: Guenter Roeck Reported-by: Eric Biggers Suggested-by: Eric Biggers Suggested-by: Kees Cook Fixes: a148866489fb ("sched: Replace rq::wake_list") Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar --- include/linux/sched.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index b62e6aaf28f0..224b5de568e7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -654,8 +654,10 @@ struct task_struct { unsigned int ptrace; #ifdef CONFIG_SMP - struct llist_node wake_entry; - unsigned int wake_entry_type; + struct { + struct llist_node wake_entry; + unsigned int wake_entry_type; + }; int on_cpu; #ifdef CONFIG_THREAD_INFO_IN_TASK /* Current CPU: */ -- cgit From 8c4890d1c3358fb8023d46e1e554c41d54f02878 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 22 Jun 2020 12:01:25 +0200 Subject: smp, irq_work: Continue smp_call_function*() and irq_work*() integration Instead of relying on BUG_ON() to ensure the various data structures line up, use a bunch of horrible unions to make it all automatic. Much of the union magic is to ensure irq_work and smp_call_function do not (yet) see the members of their respective data structures change name. Suggested-by: Linus Torvalds Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/20200622100825.844455025@infradead.org --- include/linux/sched.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 224b5de568e7..692e327d7455 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -654,11 +654,8 @@ struct task_struct { unsigned int ptrace; #ifdef CONFIG_SMP - struct { - struct llist_node wake_entry; - unsigned int wake_entry_type; - }; int on_cpu; + struct __call_single_node wake_entry; #ifdef CONFIG_THREAD_INFO_IN_TASK /* Current CPU: */ unsigned int cpu; -- cgit From 884c5e683b67dbc52892e24c29eed864f330ec08 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 26 Jun 2020 12:23:00 -0500 Subject: umh: Separate the user mode driver and the user mode helper support This makes it clear which code is part of the core user mode helper support and which code is needed to implement user mode drivers. This makes the kernel smaller for everyone who does not use a usermode driver. v1: https://lkml.kernel.org/r/87tuyyf0ln.fsf_-_@x220.int.ebiederm.org v2: https://lkml.kernel.org/r/87imf963s6.fsf_-_@x220.int.ebiederm.org Link: https://lkml.kernel.org/r/20200702164140.4468-5-ebiederm@xmission.com Reviewed-by: Greg Kroah-Hartman Acked-by: Alexei Starovoitov Tested-by: Alexei Starovoitov Signed-off-by: "Eric W. Biederman" --- include/linux/sched.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index b62e6aaf28f0..59d1e92bb88e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2020,14 +2020,6 @@ static inline void rseq_execve(struct task_struct *t) #endif -void __exit_umh(struct task_struct *tsk); - -static inline void exit_umh(struct task_struct *tsk) -{ - if (unlikely(tsk->flags & PF_UMH)) - __exit_umh(tsk); -} - #ifdef CONFIG_DEBUG_RSEQ void rseq_syscall(struct pt_regs *regs); -- cgit From 8c2f52663973e643c617663d826e2b0daa008b38 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 25 Jun 2020 17:40:40 -0500 Subject: umd: Remove exit_umh The bpfilter code no longer uses the umd_info.cleanup callback. This callback is what exit_umh exists to call. So remove exit_umh and all of it's associated booking. v1: https://lkml.kernel.org/r/87bll6dlte.fsf_-_@x220.int.ebiederm.org v2: https://lkml.kernel.org/r/87y2o53abg.fsf_-_@x220.int.ebiederm.org Link: https://lkml.kernel.org/r/20200702164140.4468-15-ebiederm@xmission.com Reviewed-by: Greg Kroah-Hartman Acked-by: Alexei Starovoitov Tested-by: Alexei Starovoitov Signed-off-by: "Eric W. Biederman" --- include/linux/sched.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 59d1e92bb88e..edb2020875ad 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1511,7 +1511,6 @@ extern struct pid *cad_pid; #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ #define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */ #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ -#define PF_UMH 0x02000000 /* I'm an Usermodehelper process */ #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */ #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ #define PF_MEMALLOC_NOCMA 0x10000000 /* All allocation request will have _GFP_MOVABLE cleared */ -- cgit From dbfb089d360b1cc623c51a2c7cf9b99eff78e0e7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 3 Jul 2020 12:40:33 +0200 Subject: sched: Fix loadavg accounting race The recent commit: c6e7bd7afaeb ("sched/core: Optimize ttwu() spinning on p->on_cpu") moved these lines in ttwu(): p->sched_contributes_to_load = !!task_contributes_to_load(p); p->state = TASK_WAKING; up before: smp_cond_load_acquire(&p->on_cpu, !VAL); into the 'p->on_rq == 0' block, with the thinking that once we hit schedule() the current task cannot change it's ->state anymore. And while this is true, it is both incorrect and flawed. It is incorrect in that we need at least an ACQUIRE on 'p->on_rq == 0' to avoid weak hardware from re-ordering things for us. This can fairly easily be achieved by relying on the control-dependency already in place. The second problem, which makes the flaw in the original argument, is that while schedule() will not change prev->state, it will read it a number of times (arguably too many times since it's marked volatile). The previous condition 'p->on_cpu == 0' was sufficient because that indicates schedule() has completed, and will no longer read prev->state. So now the trick is to make this same true for the (much) earlier 'prev->on_rq == 0' case. Furthermore, in order to make the ordering stick, the 'prev->on_rq = 0' assignment needs to he a RELEASE, but adding additional ordering to schedule() is an unwelcome proposition at the best of times, doubly so for mere accounting. Luckily we can push the prev->state load up before rq->lock, with the only caveat that we then have to re-read the state after. However, we know that if it changed, we no longer have to worry about the blocking path. This gives us the required ordering, if we block, we did the prev->state load before an (effective) smp_mb() and the p->on_rq store needs not change. With this we end up with the effective ordering: LOAD p->state LOAD-ACQUIRE p->on_rq == 0 MB STORE p->on_rq, 0 STORE p->state, TASK_WAKING which ensures the TASK_WAKING store happens after the prev->state load, and all is well again. Fixes: c6e7bd7afaeb ("sched/core: Optimize ttwu() spinning on p->on_cpu") Reported-by: Dave Jones Reported-by: Paul Gortmaker Signed-off-by: Peter Zijlstra (Intel) Tested-by: Dave Jones Tested-by: Paul Gortmaker Link: https://lkml.kernel.org/r/20200707102957.GN117543@hirez.programming.kicks-ass.net --- include/linux/sched.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 692e327d7455..683372943093 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -114,10 +114,6 @@ struct task_group; #define task_is_stopped_or_traced(task) ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0) -#define task_contributes_to_load(task) ((task->state & TASK_UNINTERRUPTIBLE) != 0 && \ - (task->flags & PF_FROZEN) == 0 && \ - (task->state & TASK_NOLOAD) == 0) - #ifdef CONFIG_DEBUG_ATOMIC_SLEEP /* -- cgit From 9d246053a69196c7c27068870e9b4b66ac536f68 Mon Sep 17 00:00:00 2001 From: Phil Auld Date: Mon, 29 Jun 2020 15:23:03 -0400 Subject: sched: Add a tracepoint to track rq->nr_running Add a bare tracepoint trace_sched_update_nr_running_tp which tracks ->nr_running CPU's rq. This is used to accurately trace this data and provide a visualization of scheduler imbalances in, for example, the form of a heat map. The tracepoint is accessed by loading an external kernel module. An example module (forked from Qais' module and including the pelt related tracepoints) can be found at: https://github.com/auldp/tracepoints-helpers.git A script to turn the trace-cmd report output into a heatmap plot can be found at: https://github.com/jirvoz/plot-nr-running The tracepoints are added to add_nr_running() and sub_nr_running() which are in kernel/sched/sched.h. In order to avoid CREATE_TRACE_POINTS in the header a wrapper call is used and the trace/events/sched.h include is moved before sched.h in kernel/sched/core. Signed-off-by: Phil Auld Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200629192303.GC120228@lorien.usersys.redhat.com --- include/linux/sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 683372943093..12b10ce51a08 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2044,6 +2044,7 @@ const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq); const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq); int sched_trace_rq_cpu(struct rq *rq); +int sched_trace_rq_nr_running(struct rq *rq); const struct cpumask *sched_trace_rd_span(struct root_domain *rd); -- cgit From a21ee6055c30ce68c4e201c6496f0ed2a1936230 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 25 May 2020 12:22:41 +0200 Subject: lockdep: Change hardirq{s_enabled,_context} to per-cpu variables Currently all IRQ-tracking state is in task_struct, this means that task_struct needs to be defined before we use it. Especially for lockdep_assert_irq*() this can lead to header-hell. Move the hardirq state into per-cpu variables to avoid the task_struct dependency. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ingo Molnar Link: https://lkml.kernel.org/r/20200623083721.512673481@infradead.org --- include/linux/sched.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 692e327d7455..3903a9500926 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -990,8 +990,6 @@ struct task_struct { unsigned long hardirq_disable_ip; unsigned int hardirq_enable_event; unsigned int hardirq_disable_event; - int hardirqs_enabled; - int hardirq_context; u64 hardirq_chain_key; unsigned long softirq_disable_ip; unsigned long softirq_enable_ip; -- cgit From 58877d347b58c9e971112df5eb311c13bb0acb28 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 2 Jul 2020 14:52:11 +0200 Subject: sched: Better document ttwu() Dave hit the problem fixed by commit: b6e13e85829f ("sched/core: Fix ttwu() race") and failed to understand much of the code involved. Per his request a few comments to (hopefully) clarify things. Requested-by: Dave Chinner Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200702125211.GQ4800@hirez.programming.kicks-ass.net --- include/linux/sched.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 12b10ce51a08..5033813fecd5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -154,24 +154,24 @@ struct task_group; * * for (;;) { * set_current_state(TASK_UNINTERRUPTIBLE); - * if (!need_sleep) - * break; + * if (CONDITION) + * break; * * schedule(); * } * __set_current_state(TASK_RUNNING); * * If the caller does not need such serialisation (because, for instance, the - * condition test and condition change and wakeup are under the same lock) then + * CONDITION test and condition change and wakeup are under the same lock) then * use __set_current_state(). * * The above is typically ordered against the wakeup, which does: * - * need_sleep = false; + * CONDITION = 1; * wake_up_state(p, TASK_UNINTERRUPTIBLE); * - * where wake_up_state() executes a full memory barrier before accessing the - * task state. + * where wake_up_state()/try_to_wake_up() executes a full memory barrier before + * accessing p->state. * * Wakeup will do: if (@state & p->state) p->state = TASK_RUNNING, that is, * once it observes the TASK_UNINTERRUPTIBLE store the waking CPU can issue a -- cgit From c1b7b8d42b5422627b0a8268416a60748f8d000f Mon Sep 17 00:00:00 2001 From: 王文虎 Date: Mon, 27 Jul 2020 21:39:51 +0800 Subject: sched: Fix a typo in a comment Change the comment typo: "direcly" -> "directly". Signed-off-by: Wang Wenhu Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/AAcAXwBTDSpsKN-5iyIOtaqk.1.1595857191899.Hmail.wenhu.wang@vivo.com --- include/linux/sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 5033813fecd5..adf0125190d4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -374,7 +374,7 @@ struct util_est { * For cfs_rq, they are the aggregated values of all runnable and blocked * sched_entities. * - * The load/runnable/util_avg doesn't direcly factor frequency scaling and CPU + * The load/runnable/util_avg doesn't directly factor frequency scaling and CPU * capacity scaling. The scaling is done through the rq_clock_pelt that is used * for computing those signals (see update_rq_clock_pelt()) * -- cgit From 13685c4a08fca9dd76bf53bfcbadc044ab2a08cb Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Thu, 16 Jul 2020 12:03:45 +0100 Subject: sched/uclamp: Add a new sysctl to control RT default boost value RT tasks by default run at the highest capacity/performance level. When uclamp is selected this default behavior is retained by enforcing the requested uclamp.min (p->uclamp_req[UCLAMP_MIN]) of the RT tasks to be uclamp_none(UCLAMP_MAX), which is SCHED_CAPACITY_SCALE; the maximum value. This is also referred to as 'the default boost value of RT tasks'. See commit 1a00d999971c ("sched/uclamp: Set default clamps for RT tasks"). On battery powered devices, it is desired to control this default (currently hardcoded) behavior at runtime to reduce energy consumed by RT tasks. For example, a mobile device manufacturer where big.LITTLE architecture is dominant, the performance of the little cores varies across SoCs, and on high end ones the big cores could be too power hungry. Given the diversity of SoCs, the new knob allows manufactures to tune the best performance/power for RT tasks for the particular hardware they run on. They could opt to further tune the value when the user selects a different power saving mode or when the device is actively charging. The runtime aspect of it further helps in creating a single kernel image that can be run on multiple devices that require different tuning. Keep in mind that a lot of RT tasks in the system are created by the kernel. On Android for instance I can see over 50 RT tasks, only a handful of which created by the Android framework. To control the default behavior globally by system admins and device integrator, introduce the new sysctl_sched_uclamp_util_min_rt_default to change the default boost value of the RT tasks. I anticipate this to be mostly in the form of modifying the init script of a particular device. To avoid polluting the fast path with unnecessary code, the approach taken is to synchronously do the update by traversing all the existing tasks in the system. This could race with a concurrent fork(), which is dealt with by introducing sched_post_fork() function which will ensure the racy fork will get the right update applied. Tested on Juno-r2 in combination with the RT capacity awareness [1]. By default an RT task will go to the highest capacity CPU and run at the maximum frequency, which is particularly energy inefficient on high end mobile devices because the biggest core[s] are 'huge' and power hungry. With this patch the RT task can be controlled to run anywhere by default, and doesn't cause the frequency to be maximum all the time. Yet any task that really needs to be boosted can easily escape this default behavior by modifying its requested uclamp.min value (p->uclamp_req[UCLAMP_MIN]) via sched_setattr() syscall. [1] 804d402fb6f6: ("sched/rt: Make RT capacity-aware") Signed-off-by: Qais Yousef Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200716110347.19553-2-qais.yousef@arm.com --- include/linux/sched.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index adf0125190d4..a6bf77c34687 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -686,9 +686,15 @@ struct task_struct { struct sched_dl_entity dl; #ifdef CONFIG_UCLAMP_TASK - /* Clamp values requested for a scheduling entity */ + /* + * Clamp values requested for a scheduling entity. + * Must be updated with task_rq_lock() held. + */ struct uclamp_se uclamp_req[UCLAMP_CNT]; - /* Effective clamp values used for a scheduling entity */ + /* + * Effective clamp values used for a scheduling entity. + * Must be updated with task_rq_lock() held. + */ struct uclamp_se uclamp[UCLAMP_CNT]; #endif -- cgit From b75058614fdd3140074a640b514f6a0b4d485a2d Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Mon, 20 Jul 2020 17:55:19 +0200 Subject: sched: tasks: Use sequence counter with associated spinlock A sequence counter write side critical section must be protected by some form of locking to serialize writers. A plain seqcount_t does not contain the information of which lock must be held when entering a write side critical section. Use the new seqcount_spinlock_t data type, which allows to associate a spinlock with the sequence counter. This enables lockdep to verify that the spinlock used for writer serialization is held when the write side critical section is entered. If lockdep is disabled this lock association is compiled out and has neither storage size nor runtime overhead. Signed-off-by: Ahmed S. Darwish Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200720155530.1173732-14-a.darwish@linutronix.de --- include/linux/sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 8d1de021b315..9a9d8263962d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1050,7 +1050,7 @@ struct task_struct { /* Protected by ->alloc_lock: */ nodemask_t mems_allowed; /* Seqence number to catch updates: */ - seqcount_t mems_allowed_seq; + seqcount_spinlock_t mems_allowed_seq; int cpuset_mem_spread_rotor; int cpuset_slab_spread_rotor; #endif -- cgit From 0584df9c12f449124d0bfef9899e5365604ee7a9 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Wed, 29 Jul 2020 13:09:15 +0200 Subject: lockdep: Refactor IRQ trace events fields into struct Refactor the IRQ trace events fields, used for printing information about the IRQ trace events, into a separate struct 'irqtrace_events'. This improves readability by separating the information only used in reporting, as well as enables (simplified) storing/restoring of irqtrace_events snapshots. No functional change intended. Signed-off-by: Marco Elver Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20200729110916.3920464-1-elver@google.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 8d1de021b315..52e0fdd6a555 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -980,17 +981,9 @@ struct task_struct { #endif #ifdef CONFIG_TRACE_IRQFLAGS - unsigned int irq_events; + struct irqtrace_events irqtrace; unsigned int hardirq_threaded; - unsigned long hardirq_enable_ip; - unsigned long hardirq_disable_ip; - unsigned int hardirq_enable_event; - unsigned int hardirq_disable_event; u64 hardirq_chain_key; - unsigned long softirq_disable_ip; - unsigned long softirq_enable_ip; - unsigned int softirq_disable_event; - unsigned int softirq_enable_event; int softirqs_enabled; int softirq_context; int irq_config; -- cgit From 92c209ac6d3d35783c16c8a717547183e6e11162 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Wed, 29 Jul 2020 13:09:16 +0200 Subject: kcsan: Improve IRQ state trace reporting To improve the general usefulness of the IRQ state trace events with KCSAN enabled, save and restore the trace information when entering and exiting the KCSAN runtime as well as when generating a KCSAN report. Without this, reporting the IRQ trace events (whether via a KCSAN report or outside of KCSAN via a lockdep report) is rather useless due to continuously being touched by KCSAN. This is because if KCSAN is enabled, every instrumented memory access causes changes to IRQ trace events (either by KCSAN disabling/enabling interrupts or taking report_lock when generating a report). Before "lockdep: Prepare for NMI IRQ state tracking", KCSAN avoided touching the IRQ trace events via raw_local_irq_save/restore() and lockdep_off/on(). Fixes: 248591f5d257 ("kcsan: Make KCSAN compatible with new IRQ state tracking") Signed-off-by: Marco Elver Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20200729110916.3920464-2-elver@google.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 52e0fdd6a555..060e9214c8b5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1184,8 +1184,12 @@ struct task_struct { #ifdef CONFIG_KASAN unsigned int kasan_depth; #endif + #ifdef CONFIG_KCSAN struct kcsan_ctx kcsan_ctx; +#ifdef CONFIG_TRACE_IRQFLAGS + struct irqtrace_events kcsan_save_irqtrace; +#endif #endif #ifdef CONFIG_FUNCTION_GRAPH_TRACER -- cgit From 0cd39f4600ed4de859383018eb10f0f724900e1b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 6 Aug 2020 14:35:11 +0200 Subject: locking/seqlock, headers: Untangle the spaghetti monster By using lockdep_assert_*() from seqlock.h, the spaghetti monster attacked. Attack back by reducing seqlock.h dependencies from two key high level headers: - : -Remove - : -Remove - : +Add The price was to add it to sched.h ... Core header fallout, we add direct header dependencies instead of gaining them parasitically from higher level headers: - : +Add - : +Add - : +Add - : +Add - : +Add - : +Add Arch headers fallout: - PARISC: : +Add - SH: : +Add - SPARC: : +Add - SPARC: : +Add , -Remove - X86: : +Add -Remove There's also a bunch of parasitic header dependency fallout in .c files, not listed separately. [ mingo: Extended the changelog, split up & fixed the original patch. ] Co-developed-by: Ingo Molnar Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20200804133438.GK2674@hirez.programming.kicks-ass.net --- include/linux/sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 9a9d8263962d..7c7a9499d7bc 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -31,6 +31,7 @@ #include #include #include +#include #include /* task_struct member predeclarations (sorted alphabetically): */ -- cgit From 1fb497dd003009be95ce67689ac800c446b7acc5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 30 Jul 2020 12:14:06 +0200 Subject: posix-cpu-timers: Provide mechanisms to defer timer handling to task_work Running posix CPU timers in hard interrupt context has a few downsides: - For PREEMPT_RT it cannot work as the expiry code needs to take sighand lock, which is a 'sleeping spinlock' in RT. The original RT approach of offloading the posix CPU timer handling into a high priority thread was clumsy and provided no real benefit in general. - For fine grained accounting it's just wrong to run this in context of the timer interrupt because that way a process specific CPU time is accounted to the timer interrupt. - Long running timer interrupts caused by a large amount of expiring timers which can be created and armed by unpriviledged user space. There is no hard requirement to expire them in interrupt context. If the signal is targeted at the task itself then it won't be delivered before the task returns to user space anyway. If the signal is targeted at a supervisor process then it might be slightly delayed, but posix CPU timers are inaccurate anyway due to the fact that they are tied to the tick. Provide infrastructure to schedule task work which allows splitting the posix CPU timer code into a quick check in interrupt context and a thread context expiry and signal delivery function. This has to be enabled by architectures as it requires that the architecture specific KVM implementation handles pending task work before exiting to guest mode. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Reviewed-by: Oleg Nesterov Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20200730102337.783470146@linutronix.de --- include/linux/sched.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 06ec60462af0..e9942ce07ef1 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -889,6 +889,10 @@ struct task_struct { /* Empty if CONFIG_POSIX_CPUTIMERS=n */ struct posix_cputimers posix_cputimers; +#ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK + struct posix_cputimers_work posix_cputimers_work; +#endif + /* Process credentials: */ /* Tracer's credentials at attach: */ -- cgit