diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2025-01-19 09:09:07 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2025-01-19 09:09:07 -0800 |
commit | 25144ea31b90af6fa860e1ce3ab735d8bb8deb83 (patch) | |
tree | 4054ddc432c76e0c550baf41d7416f5ccc350e2b | |
parent | b031457ab15dacb47d714ee872e724f1aa6a1b30 (diff) | |
parent | 2f8dea1692eef2b7ba6a256246ed82c365fdc686 (diff) |
Merge tag 'timers_urgent_for_v6.13' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull timer fixes from Borislav Petkov:
- Reset hrtimers correctly when a CPU hotplug state traversal happens
"half-ways" and leaves hrtimers not (re-)initialized properly
- Annotate accesses to a timer group's ignore flag to prevent KCSAN
from raising data_race warnings
- Make sure timer group initialization is visible to timer tree walkers
and avoid a hypothetical race
- Fix another race between CPU hotplug and idle entry/exit where timers
on a fully idle system are getting ignored
- Fix a case where an ignored signal is still being handled which it
shouldn't be
* tag 'timers_urgent_for_v6.13' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
hrtimers: Handle CPU state correctly on hotplug
timers/migration: Annotate accesses to ignore flag
timers/migration: Enforce group initialization visibility to tree walkers
timers/migration: Fix another race between hotplug and idle entry/exit
signal/posixtimers: Handle ignore/blocked sequences correctly
-rw-r--r-- | include/linux/hrtimer.h | 1 | ||||
-rw-r--r-- | kernel/cpu.c | 2 | ||||
-rw-r--r-- | kernel/signal.c | 37 | ||||
-rw-r--r-- | kernel/time/hrtimer.c | 11 | ||||
-rw-r--r-- | kernel/time/timer_migration.c | 64 |
5 files changed, 95 insertions, 20 deletions
diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 7ef5f7ef31a9..f7bfdcf0dda3 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -386,6 +386,7 @@ extern void __init hrtimers_init(void); extern void sysrq_timer_list_show(void); int hrtimers_prepare_cpu(unsigned int cpu); +int hrtimers_cpu_starting(unsigned int cpu); #ifdef CONFIG_HOTPLUG_CPU int hrtimers_cpu_dying(unsigned int cpu); #else diff --git a/kernel/cpu.c b/kernel/cpu.c index b605334f8ee6..0509a9733745 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -2179,7 +2179,7 @@ static struct cpuhp_step cpuhp_hp_states[] = { }, [CPUHP_AP_HRTIMERS_DYING] = { .name = "hrtimers:dying", - .startup.single = NULL, + .startup.single = hrtimers_cpu_starting, .teardown.single = hrtimers_cpu_dying, }, [CPUHP_AP_TICK_DYING] = { diff --git a/kernel/signal.c b/kernel/signal.c index 989b1cc9116a..a2afd54303f0 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2007,11 +2007,22 @@ void posixtimer_send_sigqueue(struct k_itimer *tmr) if (!list_empty(&q->list)) { /* - * If task group is exiting with the signal already pending, - * wait for __exit_signal() to do its job. Otherwise if - * ignored, it's not supposed to be queued. Try to survive. + * The signal was ignored and blocked. The timer + * expiry queued it because blocked signals are + * queued independent of the ignored state. + * + * The unblocking set SIGPENDING, but the signal + * was not yet dequeued from the pending list. + * So prepare_signal() sees unblocked and ignored, + * which ends up here. Leave it queued like a + * regular signal. + * + * The same happens when the task group is exiting + * and the signal is already queued. + * prepare_signal() treats SIGNAL_GROUP_EXIT as + * ignored independent of its queued state. This + * gets cleaned up in __exit_signal(). */ - WARN_ON_ONCE(!(t->signal->flags & SIGNAL_GROUP_EXIT)); goto out; } @@ -2046,17 +2057,25 @@ void posixtimer_send_sigqueue(struct k_itimer *tmr) goto out; } - /* This should never happen and leaks a reference count */ - if (WARN_ON_ONCE(!hlist_unhashed(&tmr->ignored_list))) - hlist_del_init(&tmr->ignored_list); - if (unlikely(!list_empty(&q->list))) { /* This holds a reference count already */ result = TRACE_SIGNAL_ALREADY_PENDING; goto out; } - posixtimer_sigqueue_getref(q); + /* + * If the signal is on the ignore list, it got blocked after it was + * ignored earlier. But nothing lifted the ignore. Move it back to + * the pending list to be consistent with the regular signal + * handling. This already holds a reference count. + * + * If it's not on the ignore list acquire a reference count. + */ + if (likely(hlist_unhashed(&tmr->ignored_list))) + posixtimer_sigqueue_getref(q); + else + hlist_del_init(&tmr->ignored_list); + posixtimer_queue_sigqueue(q, t, tmr->it_pid_type); result = TRACE_SIGNAL_DELIVERED; out: diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 80fe3749d2db..030426c8c944 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -2202,6 +2202,15 @@ int hrtimers_prepare_cpu(unsigned int cpu) } cpu_base->cpu = cpu; + hrtimer_cpu_base_init_expiry_lock(cpu_base); + return 0; +} + +int hrtimers_cpu_starting(unsigned int cpu) +{ + struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); + + /* Clear out any left over state from a CPU down operation */ cpu_base->active_bases = 0; cpu_base->hres_active = 0; cpu_base->hang_detected = 0; @@ -2210,7 +2219,6 @@ int hrtimers_prepare_cpu(unsigned int cpu) cpu_base->expires_next = KTIME_MAX; cpu_base->softirq_expires_next = KTIME_MAX; cpu_base->online = 1; - hrtimer_cpu_base_init_expiry_lock(cpu_base); return 0; } @@ -2286,5 +2294,6 @@ int hrtimers_cpu_dying(unsigned int dying_cpu) void __init hrtimers_init(void) { hrtimers_prepare_cpu(smp_processor_id()); + hrtimers_cpu_starting(smp_processor_id()); open_softirq(HRTIMER_SOFTIRQ, hrtimer_run_softirq); } diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c index 8d57f7686bb0..066c9ddca4ec 100644 --- a/kernel/time/timer_migration.c +++ b/kernel/time/timer_migration.c @@ -534,8 +534,13 @@ static void __walk_groups(up_f up, struct tmigr_walk *data, break; child = group; - group = group->parent; + /* + * Pairs with the store release on group connection + * to make sure group initialization is visible. + */ + group = READ_ONCE(group->parent); data->childmask = child->groupmask; + WARN_ON_ONCE(!data->childmask); } while (group); } @@ -564,7 +569,7 @@ static struct tmigr_event *tmigr_next_groupevt(struct tmigr_group *group) while ((node = timerqueue_getnext(&group->events))) { evt = container_of(node, struct tmigr_event, nextevt); - if (!evt->ignore) { + if (!READ_ONCE(evt->ignore)) { WRITE_ONCE(group->next_expiry, evt->nextevt.expires); return evt; } @@ -660,7 +665,7 @@ static bool tmigr_active_up(struct tmigr_group *group, * lock is held while updating the ignore flag in idle path. So this * state change will not be lost. */ - group->groupevt.ignore = true; + WRITE_ONCE(group->groupevt.ignore, true); return walk_done; } @@ -721,6 +726,7 @@ bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child, union tmigr_state childstate, groupstate; bool remote = data->remote; bool walk_done = false; + bool ignore; u64 nextexp; if (child) { @@ -739,11 +745,19 @@ bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child, nextexp = child->next_expiry; evt = &child->groupevt; - evt->ignore = (nextexp == KTIME_MAX) ? true : false; + /* + * This can race with concurrent idle exit (activate). + * If the current writer wins, a useless remote expiration may + * be scheduled. If the activate wins, the event is properly + * ignored. + */ + ignore = (nextexp == KTIME_MAX) ? true : false; + WRITE_ONCE(evt->ignore, ignore); } else { nextexp = data->nextexp; first_childevt = evt = data->evt; + ignore = evt->ignore; /* * Walking the hierarchy is required in any case when a @@ -769,7 +783,7 @@ bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child, * first event information of the group is updated properly and * also handled properly, so skip this fast return path. */ - if (evt->ignore && !remote && group->parent) + if (ignore && !remote && group->parent) return true; raw_spin_lock(&group->lock); @@ -783,7 +797,7 @@ bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child, * queue when the expiry time changed only or when it could be ignored. */ if (timerqueue_node_queued(&evt->nextevt)) { - if ((evt->nextevt.expires == nextexp) && !evt->ignore) { + if ((evt->nextevt.expires == nextexp) && !ignore) { /* Make sure not to miss a new CPU event with the same expiry */ evt->cpu = first_childevt->cpu; goto check_toplvl; @@ -793,7 +807,7 @@ bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child, WRITE_ONCE(group->next_expiry, KTIME_MAX); } - if (evt->ignore) { + if (ignore) { /* * When the next child event could be ignored (nextexp is * KTIME_MAX) and there was no remote timer handling before or @@ -1487,6 +1501,21 @@ static void tmigr_init_group(struct tmigr_group *group, unsigned int lvl, s.seq = 0; atomic_set(&group->migr_state, s.state); + /* + * If this is a new top-level, prepare its groupmask in advance. + * This avoids accidents where yet another new top-level is + * created in the future and made visible before the current groupmask. + */ + if (list_empty(&tmigr_level_list[lvl])) { + group->groupmask = BIT(0); + /* + * The previous top level has prepared its groupmask already, + * simply account it as the first child. + */ + if (lvl > 0) + group->num_children = 1; + } + timerqueue_init_head(&group->events); timerqueue_init(&group->groupevt.nextevt); group->groupevt.nextevt.expires = KTIME_MAX; @@ -1550,8 +1579,25 @@ static void tmigr_connect_child_parent(struct tmigr_group *child, raw_spin_lock_irq(&child->lock); raw_spin_lock_nested(&parent->lock, SINGLE_DEPTH_NESTING); - child->parent = parent; - child->groupmask = BIT(parent->num_children++); + if (activate) { + /* + * @child is the old top and @parent the new one. In this + * case groupmask is pre-initialized and @child already + * accounted, along with its new sibling corresponding to the + * CPU going up. + */ + WARN_ON_ONCE(child->groupmask != BIT(0) || parent->num_children != 2); + } else { + /* Adding @child for the CPU going up to @parent. */ + child->groupmask = BIT(parent->num_children++); + } + + /* + * Make sure parent initialization is visible before publishing it to a + * racing CPU entering/exiting idle. This RELEASE barrier enforces an + * address dependency that pairs with the READ_ONCE() in __walk_groups(). + */ + smp_store_release(&child->parent, parent); raw_spin_unlock(&parent->lock); raw_spin_unlock_irq(&child->lock); |