summary | refs | log | tree | commit | diff
path: root/kernel/time
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/time')
-rw-r--r-- kernel/time/Kconfig 15
-rw-r--r-- kernel/time/Makefile 2
-rw-r--r-- kernel/time/alarmtimer.c 86
-rw-r--r-- kernel/time/clockevents.c 2
-rw-r--r-- kernel/time/clocksource.c 22
-rw-r--r-- kernel/time/hrtimer.c 55
-rw-r--r-- kernel/time/itimer.c 3
-rw-r--r-- kernel/time/jiffies.c 130
-rw-r--r-- kernel/time/namespace.c 38
-rw-r--r-- kernel/time/ntp.c 72
-rw-r--r-- kernel/time/ntp_internal.h 13
-rw-r--r-- kernel/time/posix-cpu-timers.c 13
-rw-r--r-- kernel/time/posix-timers.c 47
-rw-r--r-- kernel/time/posix-timers.h 1
-rw-r--r-- kernel/time/sched_clock.c 26
-rw-r--r-- kernel/time/sleep_timeout.c 4
-rw-r--r-- kernel/time/tick-common.c 16
-rw-r--r-- kernel/time/tick-internal.h 2
-rw-r--r-- kernel/time/tick-oneshot.c 20
-rw-r--r-- kernel/time/tick-sched.c 41
-rw-r--r-- kernel/time/time.c 1
-rw-r--r-- kernel/time/timecounter.c 2
-rw-r--r-- kernel/time/timekeeping.c 714
-rw-r--r-- kernel/time/timekeeping_internal.h 3
-rw-r--r-- kernel/time/timer.c 87
-rw-r--r-- kernel/time/timer_list.c 2
-rw-r--r-- kernel/time/timer_migration.c 510
-rw-r--r-- kernel/time/timer_migration.h 2
-rw-r--r-- kernel/time/vsyscall.c 74
29 files changed, 1442 insertions, 561 deletions
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index b0b97a60aaa6..7c6a52f7836c 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -82,9 +82,9 @@ config CONTEXT_TRACKING_IDLE
help
Tracks idle state on behalf of RCU.
-if GENERIC_CLOCKEVENTS
menu "Timers subsystem"
+if GENERIC_CLOCKEVENTS
# Core internal switch. Selected by NO_HZ_COMMON / HIGH_RES_TIMERS. This is
# only related to the tick functionality. Oneshot clockevent devices
# are supported independent of this.
@@ -208,6 +208,17 @@ config CLOCKSOURCE_WATCHDOG_MAX_SKEW_US
interval and NTP's maximum frequency drift of 500 parts
per million. If the clocksource is good enough for NTP,
it is good enough for the clocksource watchdog!
+endif
+
+config POSIX_AUX_CLOCKS
+ bool "Enable auxiliary POSIX clocks"
+ depends on POSIX_TIMERS
+ help
+ Auxiliary POSIX clocks are clocks which can be steered
+ independently of the core timekeeper, which controls the
+ MONOTONIC, REALTIME, BOOTTIME and TAI clocks. They are useful to
+ provide e.g. lockless time accessors to independent PTP clocks
+ and other clock domains, which are not correlated to the TAI/NTP
+ notion of time.
endmenu
-endif
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index e6e9b85d4db5..f7d52d9543cc 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -26,7 +26,7 @@ obj-$(CONFIG_LEGACY_TIMER_TICK) += tick-legacy.o
ifeq ($(CONFIG_SMP),y)
obj-$(CONFIG_NO_HZ_COMMON) += timer_migration.o
endif
-obj-$(CONFIG_HAVE_GENERIC_VDSO) += vsyscall.o
+obj-$(CONFIG_GENERIC_GETTIMEOFDAY) += vsyscall.o
obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o
obj-$(CONFIG_TEST_UDELAY) += test_udelay.o
obj-$(CONFIG_TIME_NS) += namespace.o
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 0ddccdff119a..069d93bfb0c7 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -35,7 +35,7 @@
/**
* struct alarm_base - Alarm timer bases
- * @lock: Lock for syncrhonized access to the base
+ * @lock: Lock for synchronized access to the base
* @timerqueue: Timerqueue head managing the list of events
* @get_ktime: Function to read the time correlating to the base
* @get_timespec: Function to read the namespace time correlating to the base
@@ -70,12 +70,10 @@ static DEFINE_SPINLOCK(rtcdev_lock);
*/
struct rtc_device *alarmtimer_get_rtcdev(void)
{
- unsigned long flags;
struct rtc_device *ret;
- spin_lock_irqsave(&rtcdev_lock, flags);
+ guard(spinlock_irqsave)(&rtcdev_lock);
ret = rtcdev;
- spin_unlock_irqrestore(&rtcdev_lock, flags);
return ret;
}
@@ -83,7 +81,6 @@ EXPORT_SYMBOL_GPL(alarmtimer_get_rtcdev);
static int alarmtimer_rtc_add_device(struct device *dev)
{
- unsigned long flags;
struct rtc_device *rtc = to_rtc_device(dev);
struct platform_device *pdev;
int ret = 0;
@@ -101,25 +98,18 @@ static int alarmtimer_rtc_add_device(struct device *dev)
if (!IS_ERR(pdev))
device_init_wakeup(&pdev->dev, true);
- spin_lock_irqsave(&rtcdev_lock, flags);
- if (!IS_ERR(pdev) && !rtcdev) {
- if (!try_module_get(rtc->owner)) {
+ scoped_guard(spinlock_irqsave, &rtcdev_lock) {
+ if (!IS_ERR(pdev) && !rtcdev && try_module_get(rtc->owner)) {
+ rtcdev = rtc;
+ /* hold a reference so it doesn't go away */
+ get_device(dev);
+ pdev = NULL;
+ } else {
ret = -1;
- goto unlock;
}
-
- rtcdev = rtc;
- /* hold a reference so it doesn't go away */
- get_device(dev);
- pdev = NULL;
- } else {
- ret = -1;
}
-unlock:
- spin_unlock_irqrestore(&rtcdev_lock, flags);
platform_device_unregister(pdev);
-
return ret;
}
@@ -198,7 +188,7 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer)
struct alarm *alarm = container_of(timer, struct alarm, timer);
struct alarm_base *base = &alarm_bases[alarm->type];
- scoped_guard (spinlock_irqsave, &base->lock)
+ scoped_guard(spinlock_irqsave, &base->lock)
alarmtimer_dequeue(base, alarm);
if (alarm->function)
@@ -228,17 +218,16 @@ EXPORT_SYMBOL_GPL(alarm_expires_remaining);
static int alarmtimer_suspend(struct device *dev)
{
ktime_t min, now, expires;
- int i, ret, type;
struct rtc_device *rtc;
- unsigned long flags;
struct rtc_time tm;
+ int i, ret, type;
- spin_lock_irqsave(&freezer_delta_lock, flags);
- min = freezer_delta;
- expires = freezer_expires;
- type = freezer_alarmtype;
- freezer_delta = 0;
- spin_unlock_irqrestore(&freezer_delta_lock, flags);
+ scoped_guard(spinlock_irqsave, &freezer_delta_lock) {
+ min = freezer_delta;
+ expires = freezer_expires;
+ type = freezer_alarmtype;
+ freezer_delta = 0;
+ }
rtc = alarmtimer_get_rtcdev();
/* If we have no rtcdev, just return */
@@ -251,9 +240,8 @@ static int alarmtimer_suspend(struct device *dev)
struct timerqueue_node *next;
ktime_t delta;
- spin_lock_irqsave(&base->lock, flags);
- next = timerqueue_getnext(&base->timerqueue);
- spin_unlock_irqrestore(&base->lock, flags);
+ scoped_guard(spinlock_irqsave, &base->lock)
+ next = timerqueue_getnext(&base->timerqueue);
if (!next)
continue;
delta = ktime_sub(next->expires, base->get_ktime());
@@ -352,13 +340,12 @@ EXPORT_SYMBOL_GPL(alarm_init);
void alarm_start(struct alarm *alarm, ktime_t start)
{
struct alarm_base *base = &alarm_bases[alarm->type];
- unsigned long flags;
- spin_lock_irqsave(&base->lock, flags);
- alarm->node.expires = start;
- alarmtimer_enqueue(base, alarm);
- hrtimer_start(&alarm->timer, alarm->node.expires, HRTIMER_MODE_ABS);
- spin_unlock_irqrestore(&base->lock, flags);
+ scoped_guard(spinlock_irqsave, &base->lock) {
+ alarm->node.expires = start;
+ alarmtimer_enqueue(base, alarm);
+ hrtimer_start(&alarm->timer, alarm->node.expires, HRTIMER_MODE_ABS);
+ }
trace_alarmtimer_start(alarm, base->get_ktime());
}
@@ -381,13 +368,11 @@ EXPORT_SYMBOL_GPL(alarm_start_relative);
void alarm_restart(struct alarm *alarm)
{
struct alarm_base *base = &alarm_bases[alarm->type];
- unsigned long flags;
- spin_lock_irqsave(&base->lock, flags);
+ guard(spinlock_irqsave)(&base->lock);
hrtimer_set_expires(&alarm->timer, alarm->node.expires);
hrtimer_restart(&alarm->timer);
alarmtimer_enqueue(base, alarm);
- spin_unlock_irqrestore(&base->lock, flags);
}
EXPORT_SYMBOL_GPL(alarm_restart);
@@ -401,14 +386,13 @@ EXPORT_SYMBOL_GPL(alarm_restart);
int alarm_try_to_cancel(struct alarm *alarm)
{
struct alarm_base *base = &alarm_bases[alarm->type];
- unsigned long flags;
int ret;
- spin_lock_irqsave(&base->lock, flags);
- ret = hrtimer_try_to_cancel(&alarm->timer);
- if (ret >= 0)
- alarmtimer_dequeue(base, alarm);
- spin_unlock_irqrestore(&base->lock, flags);
+ scoped_guard(spinlock_irqsave, &base->lock) {
+ ret = hrtimer_try_to_cancel(&alarm->timer);
+ if (ret >= 0)
+ alarmtimer_dequeue(base, alarm);
+ }
trace_alarmtimer_cancel(alarm, base->get_ktime());
return ret;
@@ -479,7 +463,6 @@ EXPORT_SYMBOL_GPL(alarm_forward_now);
static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type)
{
struct alarm_base *base;
- unsigned long flags;
ktime_t delta;
switch(type) {
@@ -498,13 +481,12 @@ static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type)
delta = ktime_sub(absexp, base->get_ktime());
- spin_lock_irqsave(&freezer_delta_lock, flags);
+ guard(spinlock_irqsave)(&freezer_delta_lock);
if (!freezer_delta || (delta < freezer_delta)) {
freezer_delta = delta;
freezer_expires = absexp;
freezer_alarmtype = type;
}
- spin_unlock_irqrestore(&freezer_delta_lock, flags);
}
/**
@@ -515,9 +497,9 @@ static enum alarmtimer_type clock2alarm(clockid_t clockid)
{
if (clockid == CLOCK_REALTIME_ALARM)
return ALARM_REALTIME;
- if (clockid == CLOCK_BOOTTIME_ALARM)
- return ALARM_BOOTTIME;
- return -1;
+
+ WARN_ON_ONCE(clockid != CLOCK_BOOTTIME_ALARM);
+ return ALARM_BOOTTIME;
}
/**
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index f3e831f62906..a59bc75ab7c5 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -633,7 +633,7 @@ void tick_offline_cpu(unsigned int cpu)
raw_spin_lock(&clockevents_lock);
tick_broadcast_offline(cpu);
- tick_shutdown(cpu);
+ tick_shutdown();
/*
* Unregister the clock event devices which were
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index bb48498ebb5a..a1890a073196 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -144,7 +144,7 @@ static u64 suspend_start;
* Default for maximum permissible skew when cs->uncertainty_margin is
* not specified, and the lower bound even when cs->uncertainty_margin
* is specified. This is also the default that is used when registering
- * clocks with unspecifed cs->uncertainty_margin, so this macro is used
+ * clocks with unspecified cs->uncertainty_margin, so this macro is used
* even in CONFIG_CLOCKSOURCE_WATCHDOG=n kernels.
*/
#define WATCHDOG_MAX_SKEW (MAX_SKEW_USEC * NSEC_PER_USEC)
@@ -310,7 +310,7 @@ static void clocksource_verify_choose_cpus(void)
{
int cpu, i, n = verify_n_cpus;
- if (n < 0) {
+ if (n < 0 || n >= num_online_cpus()) {
/* Check all of the CPUs. */
cpumask_copy(&cpus_chosen, cpu_online_mask);
cpumask_clear_cpu(smp_processor_id(), &cpus_chosen);
@@ -323,9 +323,7 @@ static void clocksource_verify_choose_cpus(void)
return;
/* Make sure to select at least one CPU other than the current CPU. */
- cpu = cpumask_first(cpu_online_mask);
- if (cpu == smp_processor_id())
- cpu = cpumask_next(cpu, cpu_online_mask);
+ cpu = cpumask_any_but(cpu_online_mask, smp_processor_id());
if (WARN_ON_ONCE(cpu >= nr_cpu_ids))
return;
cpumask_set_cpu(cpu, &cpus_chosen);
@@ -342,10 +340,7 @@ static void clocksource_verify_choose_cpus(void)
* CPUs that are currently online.
*/
for (i = 1; i < n; i++) {
- cpu = get_random_u32_below(nr_cpu_ids);
- cpu = cpumask_next(cpu - 1, cpu_online_mask);
- if (cpu >= nr_cpu_ids)
- cpu = cpumask_first(cpu_online_mask);
+ cpu = cpumask_random(cpu_online_mask);
if (!WARN_ON_ONCE(cpu >= nr_cpu_ids))
cpumask_set_cpu(cpu, &cpus_chosen);
}
@@ -412,9 +407,8 @@ void clocksource_verify_percpu(struct clocksource *cs)
if (!cpumask_empty(&cpus_behind))
pr_warn(" CPUs %*pbl behind CPU %d for clocksource %s.\n",
cpumask_pr_args(&cpus_behind), testcpu, cs->name);
- if (!cpumask_empty(&cpus_ahead) || !cpumask_empty(&cpus_behind))
- pr_warn(" CPU %d check durations %lldns - %lldns for clocksource %s.\n",
- testcpu, cs_nsec_min, cs_nsec_max, cs->name);
+ pr_info(" CPU %d check durations %lldns - %lldns for clocksource %s.\n",
+ testcpu, cs_nsec_min, cs_nsec_max, cs->name);
}
EXPORT_SYMBOL_GPL(clocksource_verify_percpu);
@@ -589,9 +583,7 @@ static void clocksource_watchdog(struct timer_list *unused)
* Cycle through CPUs to check if the CPUs stay synchronized
* to each other.
*/
- next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask);
- if (next_cpu >= nr_cpu_ids)
- next_cpu = cpumask_first(cpu_online_mask);
+ next_cpu = cpumask_next_wrap(raw_smp_processor_id(), cpu_online_mask);
/*
* Arm timer if not already pending: could race with concurrent
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 30899a8cc52c..f8ea8c8fc895 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -59,6 +59,7 @@
#define HRTIMER_ACTIVE_ALL (HRTIMER_ACTIVE_SOFT | HRTIMER_ACTIVE_HARD)
static void retrigger_next_event(void *arg);
+static ktime_t __hrtimer_cb_get_time(clockid_t clock_id);
/*
* The timer bases:
@@ -76,42 +77,34 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
{
.index = HRTIMER_BASE_MONOTONIC,
.clockid = CLOCK_MONOTONIC,
- .get_time = &ktime_get,
},
{
.index = HRTIMER_BASE_REALTIME,
.clockid = CLOCK_REALTIME,
- .get_time = &ktime_get_real,
},
{
.index = HRTIMER_BASE_BOOTTIME,
.clockid = CLOCK_BOOTTIME,
- .get_time = &ktime_get_boottime,
},
{
.index = HRTIMER_BASE_TAI,
.clockid = CLOCK_TAI,
- .get_time = &ktime_get_clocktai,
},
{
.index = HRTIMER_BASE_MONOTONIC_SOFT,
.clockid = CLOCK_MONOTONIC,
- .get_time = &ktime_get,
},
{
.index = HRTIMER_BASE_REALTIME_SOFT,
.clockid = CLOCK_REALTIME,
- .get_time = &ktime_get_real,
},
{
.index = HRTIMER_BASE_BOOTTIME_SOFT,
.clockid = CLOCK_BOOTTIME,
- .get_time = &ktime_get_boottime,
},
{
.index = HRTIMER_BASE_TAI_SOFT,
.clockid = CLOCK_TAI,
- .get_time = &ktime_get_clocktai,
},
},
.csd = CSD_INIT(retrigger_next_event, NULL)
@@ -208,7 +201,7 @@ static bool hrtimer_suitable_target(struct hrtimer *timer, struct hrtimer_clock_
/*
* The offline local CPU can't be the default target if the
* next remote target event is after this timer. Keep the
- * elected new base. An IPI will we issued to reprogram
+ * elected new base. An IPI will be issued to reprogram
* it as a last resort.
*/
if (!hrtimer_base_is_online(this_cpu_base))
@@ -787,10 +780,10 @@ static void retrigger_next_event(void *arg)
* of the next expiring timer is enough. The return from the SMP
* function call will take care of the reprogramming in case the
* CPU was in a NOHZ idle sleep.
+ *
+ * In periodic low resolution mode, the next softirq expiration
+ * must also be updated.
*/
- if (!hrtimer_hres_active(base) && !tick_nohz_active)
- return;
-
raw_spin_lock(&base->lock);
hrtimer_update_base(base);
if (hrtimer_hres_active(base))
@@ -1253,7 +1246,7 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
remove_hrtimer(timer, base, true, force_local);
if (mode & HRTIMER_MODE_REL)
- tim = ktime_add_safe(tim, base->get_time());
+ tim = ktime_add_safe(tim, __hrtimer_cb_get_time(base->clockid));
tim = hrtimer_update_lowres(timer, tim, mode);
@@ -1574,10 +1567,10 @@ u64 hrtimer_next_event_without(const struct hrtimer *exclude)
static inline int hrtimer_clockid_to_base(clockid_t clock_id)
{
switch (clock_id) {
- case CLOCK_REALTIME:
- return HRTIMER_BASE_REALTIME;
case CLOCK_MONOTONIC:
return HRTIMER_BASE_MONOTONIC;
+ case CLOCK_REALTIME:
+ return HRTIMER_BASE_REALTIME;
case CLOCK_BOOTTIME:
return HRTIMER_BASE_BOOTTIME;
case CLOCK_TAI:
@@ -1588,6 +1581,29 @@ static inline int hrtimer_clockid_to_base(clockid_t clock_id)
}
}
+static ktime_t __hrtimer_cb_get_time(clockid_t clock_id)
+{
+ switch (clock_id) {
+ case CLOCK_MONOTONIC:
+ return ktime_get();
+ case CLOCK_REALTIME:
+ return ktime_get_real();
+ case CLOCK_BOOTTIME:
+ return ktime_get_boottime();
+ case CLOCK_TAI:
+ return ktime_get_clocktai();
+ default:
+ WARN(1, "Invalid clockid %d. Using MONOTONIC\n", clock_id);
+ return ktime_get();
+ }
+}
+
+ktime_t hrtimer_cb_get_time(const struct hrtimer *timer)
+{
+ return __hrtimer_cb_get_time(timer->base->clockid);
+}
+EXPORT_SYMBOL_GPL(hrtimer_cb_get_time);
+
static void __hrtimer_setup(struct hrtimer *timer,
enum hrtimer_restart (*function)(struct hrtimer *),
clockid_t clock_id, enum hrtimer_mode mode)
@@ -2129,7 +2145,7 @@ static long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
int ret;
hrtimer_setup_sleeper_on_stack(&t, restart->nanosleep.clockid, HRTIMER_MODE_ABS);
- hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires);
+ hrtimer_set_expires(&t.timer, restart->nanosleep.expires);
ret = do_nanosleep(&t, HRTIMER_MODE_ABS);
destroy_hrtimer_on_stack(&t.timer);
return ret;
@@ -2156,7 +2172,7 @@ long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode,
restart = &current->restart_block;
restart->nanosleep.clockid = t.timer.base->clockid;
- restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer);
+ restart->nanosleep.expires = hrtimer_get_expires(&t.timer);
set_restart_fn(restart, hrtimer_nanosleep_restart);
out:
destroy_hrtimer_on_stack(&t.timer);
@@ -2295,11 +2311,6 @@ int hrtimers_cpu_dying(unsigned int dying_cpu)
&new_base->clock_base[i]);
}
- /*
- * The migration might have changed the first expiring softirq
- * timer on this CPU. Update it.
- */
- __hrtimer_get_next_event(new_base, HRTIMER_ACTIVE_SOFT);
/* Tell the other CPU to retrigger the next event */
smp_call_function_single(ncpu, retrigger_next_event, NULL, 0);
diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c
index 876d389b2e21..7c6110e964e7 100644
--- a/kernel/time/itimer.c
+++ b/kernel/time/itimer.c
@@ -163,8 +163,7 @@ void posixtimer_rearm_itimer(struct task_struct *tsk)
struct hrtimer *tmr = &tsk->signal->real_timer;
if (!hrtimer_is_queued(tmr) && tsk->signal->it_real_incr != 0) {
- hrtimer_forward(tmr, tmr->base->get_time(),
- tsk->signal->it_real_incr);
+ hrtimer_forward_now(tmr, tsk->signal->it_real_incr);
hrtimer_restart(tmr);
}
}
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index bc4db9e5ab70..d31a6d40d38d 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -75,13 +75,11 @@ struct clocksource * __init __weak clocksource_default_clock(void)
static struct clocksource refined_jiffies;
-int register_refined_jiffies(long cycles_per_second)
+void __init register_refined_jiffies(long cycles_per_second)
{
u64 nsec_per_tick, shift_hz;
long cycles_per_tick;
-
-
refined_jiffies = clocksource_jiffies;
refined_jiffies.name = "refined-jiffies";
refined_jiffies.rating++;
@@ -100,5 +98,129 @@ int register_refined_jiffies(long cycles_per_second)
refined_jiffies.mult = ((u32)nsec_per_tick) << JIFFIES_SHIFT;
__clocksource_register(&refined_jiffies);
- return 0;
}
+
+#define SYSCTL_CONV_MULT_HZ(val) ((val) * HZ)
+#define SYSCTL_CONV_DIV_HZ(val) ((val) / HZ)
+
+static SYSCTL_USER_TO_KERN_INT_CONV(_hz, SYSCTL_CONV_MULT_HZ)
+static SYSCTL_KERN_TO_USER_INT_CONV(_hz, SYSCTL_CONV_DIV_HZ)
+static SYSCTL_USER_TO_KERN_INT_CONV(_userhz, clock_t_to_jiffies)
+static SYSCTL_KERN_TO_USER_INT_CONV(_userhz, jiffies_to_clock_t)
+static SYSCTL_USER_TO_KERN_INT_CONV(_ms, msecs_to_jiffies)
+static SYSCTL_KERN_TO_USER_INT_CONV(_ms, jiffies_to_msecs)
+
+static SYSCTL_INT_CONV_CUSTOM(_jiffies, sysctl_user_to_kern_int_conv_hz,
+ sysctl_kern_to_user_int_conv_hz, false)
+static SYSCTL_INT_CONV_CUSTOM(_userhz_jiffies,
+ sysctl_user_to_kern_int_conv_userhz,
+ sysctl_kern_to_user_int_conv_userhz, false)
+static SYSCTL_INT_CONV_CUSTOM(_ms_jiffies, sysctl_user_to_kern_int_conv_ms,
+ sysctl_kern_to_user_int_conv_ms, false)
+static SYSCTL_INT_CONV_CUSTOM(_ms_jiffies_minmax,
+ sysctl_user_to_kern_int_conv_ms,
+ sysctl_kern_to_user_int_conv_ms, true)
+
+/**
+ * proc_dointvec_jiffies - read a vector of integers as seconds
+ * @table: the sysctl table
+ * @dir: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
+ * values from/to the user buffer, treated as an ASCII string.
+ * The values read are assumed to be in seconds, and are converted into
+ * jiffies.
+ *
+ * Returns 0 on success.
+ */
+int proc_dointvec_jiffies(const struct ctl_table *table, int dir,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return proc_dointvec_conv(table, dir, buffer, lenp, ppos,
+ do_proc_int_conv_jiffies);
+}
+EXPORT_SYMBOL(proc_dointvec_jiffies);
+
+/**
+ * proc_dointvec_userhz_jiffies - read a vector of integers as 1/USER_HZ seconds
+ * @table: the sysctl table
+ * @dir: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: pointer to the file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
+ * values from/to the user buffer, treated as an ASCII string.
+ * The values read are assumed to be in 1/USER_HZ seconds, and
+ * are converted into jiffies.
+ *
+ * Returns 0 on success.
+ */
+int proc_dointvec_userhz_jiffies(const struct ctl_table *table, int dir,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ if (SYSCTL_USER_TO_KERN(dir) && USER_HZ < HZ)
+ return -EINVAL;
+ return proc_dointvec_conv(table, dir, buffer, lenp, ppos,
+ do_proc_int_conv_userhz_jiffies);
+}
+EXPORT_SYMBOL(proc_dointvec_userhz_jiffies);
+
+/**
+ * proc_dointvec_ms_jiffies - read a vector of integers as milliseconds
+ * @table: the sysctl table
+ * @dir: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: the current position in the file
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
+ * values from/to the user buffer, treated as an ASCII string.
+ * The values read are assumed to be in 1/1000 seconds, and
+ * are converted into jiffies.
+ *
+ * Returns 0 on success.
+ */
+int proc_dointvec_ms_jiffies(const struct ctl_table *table, int dir, void *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ return proc_dointvec_conv(table, dir, buffer, lenp, ppos,
+ do_proc_int_conv_ms_jiffies);
+}
+EXPORT_SYMBOL(proc_dointvec_ms_jiffies);
+
+int proc_dointvec_ms_jiffies_minmax(const struct ctl_table *table, int dir,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return proc_dointvec_conv(table, dir, buffer, lenp, ppos,
+ do_proc_int_conv_ms_jiffies_minmax);
+}
+
+/**
+ * proc_doulongvec_ms_jiffies_minmax - read a vector of millisecond values with min/max values
+ * @table: the sysctl table
+ * @dir: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned long) unsigned long
+ * values from/to the user buffer, treated as an ASCII string. The values
+ * are treated as milliseconds, and converted to jiffies when they are stored.
+ *
+ * This routine will ensure the values are within the range specified by
+ * table->extra1 (min) and table->extra2 (max).
+ *
+ * Returns 0 on success.
+ */
+int proc_doulongvec_ms_jiffies_minmax(const struct ctl_table *table, int dir,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return proc_doulongvec_minmax_conv(table, dir, buffer, lenp, ppos,
+ HZ, 1000l);
+}
+EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax);
+
diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c
index e3642278df43..e76be24b132c 100644
--- a/kernel/time/namespace.c
+++ b/kernel/time/namespace.c
@@ -12,6 +12,7 @@
#include <linux/seq_file.h>
#include <linux/proc_ns.h>
#include <linux/export.h>
+#include <linux/nstree.h>
#include <linux/time.h>
#include <linux/slab.h>
#include <linux/cred.h>
@@ -88,25 +89,23 @@ static struct time_namespace *clone_time_ns(struct user_namespace *user_ns,
goto fail;
err = -ENOMEM;
- ns = kmalloc(sizeof(*ns), GFP_KERNEL_ACCOUNT);
+ ns = kzalloc(sizeof(*ns), GFP_KERNEL_ACCOUNT);
if (!ns)
goto fail_dec;
- refcount_set(&ns->ns.count, 1);
-
ns->vvar_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
if (!ns->vvar_page)
goto fail_free;
- err = ns_alloc_inum(&ns->ns);
+ err = ns_common_init(ns);
if (err)
goto fail_free_page;
ns->ucounts = ucounts;
- ns->ns.ops = &timens_operations;
ns->user_ns = get_user_ns(user_ns);
ns->offsets = old_ns->offsets;
ns->frozen_offsets = false;
+ ns_tree_add(ns);
return ns;
fail_free_page:
@@ -130,7 +129,7 @@ fail:
*
* Return: timens_for_children namespace or ERR_PTR.
*/
-struct time_namespace *copy_time_ns(unsigned long flags,
+struct time_namespace *copy_time_ns(u64 flags,
struct user_namespace *user_ns, struct time_namespace *old_ns)
{
if (!(flags & CLONE_NEWTIME))
@@ -242,22 +241,24 @@ static void timens_set_vvar_page(struct task_struct *task,
for (i = 0; i < CS_BASES; i++)
timens_setup_vdso_clock_data(&vc[i], ns);
+ if (IS_ENABLED(CONFIG_POSIX_AUX_CLOCKS)) {
+ for (i = 0; i < ARRAY_SIZE(vdata->aux_clock_data); i++)
+ timens_setup_vdso_clock_data(&vdata->aux_clock_data[i], ns);
+ }
+
out:
mutex_unlock(&offset_lock);
}
void free_time_ns(struct time_namespace *ns)
{
+ ns_tree_remove(ns);
dec_time_namespaces(ns->ucounts);
put_user_ns(ns->user_ns);
- ns_free_inum(&ns->ns);
+ ns_common_free(ns);
__free_page(ns->vvar_page);
- kfree(ns);
-}
-
-static struct time_namespace *to_time_ns(struct ns_common *ns)
-{
- return container_of(ns, struct time_namespace, ns);
+ /* Concurrent nstree traversal depends on a grace period. */
+ kfree_rcu(ns, ns.ns_rcu);
}
static struct ns_common *timens_get(struct task_struct *task)
@@ -461,7 +462,6 @@ out:
const struct proc_ns_operations timens_operations = {
.name = "time",
- .type = CLONE_NEWTIME,
.get = timens_get,
.put = timens_put,
.install = timens_install,
@@ -471,7 +471,6 @@ const struct proc_ns_operations timens_operations = {
const struct proc_ns_operations timens_for_children_operations = {
.name = "time_for_children",
.real_ns_name = "time",
- .type = CLONE_NEWTIME,
.get = timens_for_children_get,
.put = timens_put,
.install = timens_install,
@@ -479,9 +478,12 @@ const struct proc_ns_operations timens_for_children_operations = {
};
struct time_namespace init_time_ns = {
- .ns.count = REFCOUNT_INIT(3),
+ .ns = NS_COMMON_INIT(init_time_ns),
.user_ns = &init_user_ns,
- .ns.inum = PROC_TIME_INIT_INO,
- .ns.ops = &timens_operations,
.frozen_offsets = true,
};
+
+void __init time_ns_init(void)
+{
+ ns_tree_add(&init_time_ns);
+}
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index b837d3d9d325..97fa99b96dd0 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -18,6 +18,7 @@
#include <linux/module.h>
#include <linux/rtc.h>
#include <linux/audit.h>
+#include <linux/timekeeper_internal.h>
#include "ntp_internal.h"
#include "timekeeping_internal.h"
@@ -86,14 +87,16 @@ struct ntp_data {
#endif
};
-static struct ntp_data tk_ntp_data = {
- .tick_usec = USER_TICK_USEC,
- .time_state = TIME_OK,
- .time_status = STA_UNSYNC,
- .time_constant = 2,
- .time_maxerror = NTP_PHASE_LIMIT,
- .time_esterror = NTP_PHASE_LIMIT,
- .ntp_next_leap_sec = TIME64_MAX,
+static struct ntp_data tk_ntp_data[TIMEKEEPERS_MAX] = {
+ [ 0 ... TIMEKEEPERS_MAX - 1 ] = {
+ .tick_usec = USER_TICK_USEC,
+ .time_state = TIME_OK,
+ .time_status = STA_UNSYNC,
+ .time_constant = 2,
+ .time_maxerror = NTP_PHASE_LIMIT,
+ .time_esterror = NTP_PHASE_LIMIT,
+ .ntp_next_leap_sec = TIME64_MAX,
+ },
};
#define SECS_PER_DAY 86400
@@ -300,7 +303,7 @@ static void ntp_update_offset(struct ntp_data *ntpdata, long offset)
* Select how the frequency is to be controlled
* and in which mode (PLL or FLL).
*/
- real_secs = __ktime_get_real_seconds();
+ real_secs = ktime_get_ntp_seconds(ntpdata - tk_ntp_data);
secs = (long)(real_secs - ntpdata->time_reftime);
if (unlikely(ntpdata->time_status & STA_FREQHOLD))
secs = 0;
@@ -348,33 +351,38 @@ static void __ntp_clear(struct ntp_data *ntpdata)
/**
* ntp_clear - Clears the NTP state variables
+ * @tkid: Timekeeper ID to be able to select proper ntp data array member
*/
-void ntp_clear(void)
+void ntp_clear(unsigned int tkid)
{
- __ntp_clear(&tk_ntp_data);
+ __ntp_clear(&tk_ntp_data[tkid]);
}
-u64 ntp_tick_length(void)
+u64 ntp_tick_length(unsigned int tkid)
{
- return tk_ntp_data.tick_length;
+ return tk_ntp_data[tkid].tick_length;
}
/**
* ntp_get_next_leap - Returns the next leapsecond in CLOCK_REALTIME ktime_t
+ * @tkid: Timekeeper ID
*
- * Provides the time of the next leapsecond against CLOCK_REALTIME in
- * a ktime_t format. Returns KTIME_MAX if no leapsecond is pending.
+ * Returns: For @tkid == TIMEKEEPER_CORE this provides the time of the next
+ * leap second against CLOCK_REALTIME in a ktime_t format if a
+ * leap second is pending. KTIME_MAX otherwise.
*/
-ktime_t ntp_get_next_leap(void)
+ktime_t ntp_get_next_leap(unsigned int tkid)
{
- struct ntp_data *ntpdata = &tk_ntp_data;
- ktime_t ret;
+ struct ntp_data *ntpdata = &tk_ntp_data[TIMEKEEPER_CORE];
+
+ if (tkid != TIMEKEEPER_CORE)
+ return KTIME_MAX;
if ((ntpdata->time_state == TIME_INS) && (ntpdata->time_status & STA_INS))
return ktime_set(ntpdata->ntp_next_leap_sec, 0);
- ret = KTIME_MAX;
- return ret;
+
+ return KTIME_MAX;
}
/*
@@ -387,9 +395,9 @@ ktime_t ntp_get_next_leap(void)
*
* Also handles leap second processing, and returns leap offset
*/
-int second_overflow(time64_t secs)
+int second_overflow(unsigned int tkid, time64_t secs)
{
- struct ntp_data *ntpdata = &tk_ntp_data;
+ struct ntp_data *ntpdata = &tk_ntp_data[tkid];
s64 delta;
int leap = 0;
s32 rem;
@@ -605,7 +613,7 @@ static inline int update_rtc(struct timespec64 *to_set, unsigned long *offset_ns
*/
static inline bool ntp_synced(void)
{
- return !(tk_ntp_data.time_status & STA_UNSYNC);
+ return !(tk_ntp_data[TIMEKEEPER_CORE].time_status & STA_UNSYNC);
}
/*
@@ -702,7 +710,7 @@ static inline void process_adj_status(struct ntp_data *ntpdata, const struct __k
* reference time to current time.
*/
if (!(ntpdata->time_status & STA_PLL) && (txc->status & STA_PLL))
- ntpdata->time_reftime = __ktime_get_real_seconds();
+ ntpdata->time_reftime = ktime_get_ntp_seconds(ntpdata - tk_ntp_data);
/* only set allowed bits */
ntpdata->time_status &= STA_RONLY;
@@ -759,10 +767,10 @@ static inline void process_adjtimex_modes(struct ntp_data *ntpdata, const struct
* adjtimex() mainly allows reading (and writing, if superuser) of
* kernel time-keeping variables. used by xntpd.
*/
-int __do_adjtimex(struct __kernel_timex *txc, const struct timespec64 *ts,
- s32 *time_tai, struct audit_ntp_data *ad)
+int ntp_adjtimex(unsigned int tkid, struct __kernel_timex *txc, const struct timespec64 *ts,
+ s32 *time_tai, struct audit_ntp_data *ad)
{
- struct ntp_data *ntpdata = &tk_ntp_data;
+ struct ntp_data *ntpdata = &tk_ntp_data[tkid];
int result;
if (txc->modes & ADJ_ADJTIME) {
@@ -1031,8 +1039,8 @@ static void hardpps_update_phase(struct ntp_data *ntpdata, long error)
*/
void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts)
{
+ struct ntp_data *ntpdata = &tk_ntp_data[TIMEKEEPER_CORE];
struct pps_normtime pts_norm, freq_norm;
- struct ntp_data *ntpdata = &tk_ntp_data;
pts_norm = pps_normalize_ts(*phase_ts);
@@ -1083,18 +1091,18 @@ void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_t
static int __init ntp_tick_adj_setup(char *str)
{
- int rc = kstrtos64(str, 0, &tk_ntp_data.ntp_tick_adj);
+ int rc = kstrtos64(str, 0, &tk_ntp_data[TIMEKEEPER_CORE].ntp_tick_adj);
if (rc)
return rc;
- tk_ntp_data.ntp_tick_adj <<= NTP_SCALE_SHIFT;
+ tk_ntp_data[TIMEKEEPER_CORE].ntp_tick_adj <<= NTP_SCALE_SHIFT;
return 1;
}
-
__setup("ntp_tick_adj=", ntp_tick_adj_setup);
void __init ntp_init(void)
{
- ntp_clear();
+ for (int id = 0; id < TIMEKEEPERS_MAX; id++)
+ __ntp_clear(tk_ntp_data + id);
ntp_init_cmos_sync();
}
diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h
index 5a633dce9057..7084d839c207 100644
--- a/kernel/time/ntp_internal.h
+++ b/kernel/time/ntp_internal.h
@@ -3,14 +3,13 @@
#define _LINUX_NTP_INTERNAL_H
extern void ntp_init(void);
-extern void ntp_clear(void);
+extern void ntp_clear(unsigned int tkid);
/* Returns how long ticks are at present, in ns / 2^NTP_SCALE_SHIFT. */
-extern u64 ntp_tick_length(void);
-extern ktime_t ntp_get_next_leap(void);
-extern int second_overflow(time64_t secs);
-extern int __do_adjtimex(struct __kernel_timex *txc,
- const struct timespec64 *ts,
- s32 *time_tai, struct audit_ntp_data *ad);
+extern u64 ntp_tick_length(unsigned int tkid);
+extern ktime_t ntp_get_next_leap(unsigned int tkid);
+extern int second_overflow(unsigned int tkid, time64_t secs);
+extern int ntp_adjtimex(unsigned int tkid, struct __kernel_timex *txc, const struct timespec64 *ts,
+ s32 *time_tai, struct audit_ntp_data *ad);
extern void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts);
#if defined(CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC)
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 50e8d04ab661..0de2bb7cbec0 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -1406,6 +1406,15 @@ void run_posix_cpu_timers(void)
lockdep_assert_irqs_disabled();
/*
+ * Ensure that release_task(tsk) can't happen while
+ * handle_posix_cpu_timers() is running. Otherwise, a concurrent
+ * posix_cpu_timer_del() may fail to lock_task_sighand(tsk) and
+ * miss timer->it.cpu.firing != 0.
+ */
+ if (tsk->exit_state)
+ return;
+
+ /*
* If the actual expiry is deferred to task work context and the
* work is already scheduled there is no point to do anything here.
*/
@@ -1548,7 +1557,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
* Report back to the user the time still remaining.
*/
restart = &current->restart_block;
- restart->nanosleep.expires = expires;
+ restart->nanosleep.expires = ns_to_ktime(expires);
if (restart->nanosleep.type != TT_NONE)
error = nanosleep_copyout(restart, &it.it_value);
}
@@ -1590,7 +1599,7 @@ static long posix_cpu_nsleep_restart(struct restart_block *restart_block)
clockid_t which_clock = restart_block->nanosleep.clockid;
struct timespec64 t;
- t = ns_to_timespec64(restart_block->nanosleep.expires);
+ t = ktime_to_timespec64(restart_block->nanosleep.expires);
return do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t);
}
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 6222112533a7..80a8a09a21a0 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -30,8 +30,6 @@
#include "timekeeping.h"
#include "posix-timers.h"
-static struct kmem_cache *posix_timers_cache;
-
/*
* Timers are managed in a hash table for lockless lookup. The hash key is
* constructed from current::signal and the timer ID and the timer is
@@ -49,10 +47,12 @@ struct timer_hash_bucket {
static struct {
struct timer_hash_bucket *buckets;
unsigned long mask;
-} __timer_data __ro_after_init __aligned(2*sizeof(long));
+ struct kmem_cache *cache;
+} __timer_data __ro_after_init __aligned(4*sizeof(long));
-#define timer_buckets (__timer_data.buckets)
-#define timer_hashmask (__timer_data.mask)
+#define timer_buckets (__timer_data.buckets)
+#define timer_hashmask (__timer_data.mask)
+#define posix_timers_cache (__timer_data.cache)
static const struct k_clock * const posix_clocks[];
static const struct k_clock *clockid_to_kclock(const clockid_t id);
@@ -283,14 +283,6 @@ static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec64 *tp)
return 0;
}
-static __init int init_posix_timers(void)
-{
- posix_timers_cache = kmem_cache_create("posix_timers_cache", sizeof(struct k_itimer),
- __alignof__(struct k_itimer), SLAB_ACCOUNT, NULL);
- return 0;
-}
-__initcall(init_posix_timers);
-
/*
* The siginfo si_overrun field and the return value of timer_getoverrun(2)
* are of type int. Clamp the overrun value to INT_MAX
@@ -307,8 +299,7 @@ static void common_hrtimer_rearm(struct k_itimer *timr)
{
struct hrtimer *timer = &timr->it.real.timer;
- timr->it_overrun += hrtimer_forward(timer, timer->base->get_time(),
- timr->it_interval);
+ timr->it_overrun += hrtimer_forward_now(timer, timr->it_interval);
hrtimer_restart(timer);
}
@@ -484,12 +475,6 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event,
if (!kc->timer_create)
return -EOPNOTSUPP;
- new_timer = alloc_posix_timer();
- if (unlikely(!new_timer))
- return -EAGAIN;
-
- spin_lock_init(&new_timer->it_lock);
-
/* Special case for CRIU to restore timers with a given timer ID. */
if (unlikely(current->signal->timer_create_restore_ids)) {
if (copy_from_user(&req_id, created_timer_id, sizeof(req_id)))
@@ -499,6 +484,12 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event,
return -EINVAL;
}
+ new_timer = alloc_posix_timer();
+ if (unlikely(!new_timer))
+ return -EAGAIN;
+
+ spin_lock_init(&new_timer->it_lock);
+
/*
* Add the timer to the hash table. The timer is not yet valid
* after insertion, but has a unique ID allocated.
@@ -543,7 +534,7 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event,
goto out;
}
/*
- * After succesful copy out, the timer ID is visible to user space
+ * After successful copy out, the timer ID is visible to user space
* now but not yet valid because new_timer::signal low order bit is 1.
*
* Complete the initialization with the clock specific create
@@ -833,7 +824,7 @@ static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires,
hrtimer_setup(&timr->it.real.timer, posix_timer_fn, timr->it_clock, mode);
if (!absolute)
- expires = ktime_add_safe(expires, timer->base->get_time());
+ expires = ktime_add_safe(expires, hrtimer_cb_get_time(timer));
hrtimer_set_expires(timer, expires);
if (!sigev_none)
@@ -1251,7 +1242,7 @@ SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock,
* sys_clock_settime(). The kernel internal timekeeping is always using
* nanoseconds precision independent of the clocksource device which is
* used to read the time from. The resolution of that device only
- * affects the presicion of the time returned by sys_clock_gettime().
+ * affects the precision of the time returned by sys_clock_gettime().
*
* Returns:
* 0 Success. @tp contains the resolution
@@ -1534,6 +1525,9 @@ static const struct k_clock * const posix_clocks[] = {
[CLOCK_REALTIME_ALARM] = &alarm_clock,
[CLOCK_BOOTTIME_ALARM] = &alarm_clock,
[CLOCK_TAI] = &clock_tai,
+#ifdef CONFIG_POSIX_AUX_CLOCKS
+ [CLOCK_AUX ... CLOCK_AUX_LAST] = &clock_aux,
+#endif
};
static const struct k_clock *clockid_to_kclock(const clockid_t id)
@@ -1556,6 +1550,11 @@ static int __init posixtimer_init(void)
unsigned long i, size;
unsigned int shift;
+ posix_timers_cache = kmem_cache_create("posix_timers_cache",
+ sizeof(struct k_itimer),
+ __alignof__(struct k_itimer),
+ SLAB_ACCOUNT, NULL);
+
if (IS_ENABLED(CONFIG_BASE_SMALL))
size = 512;
else
diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h
index 61906f0688c1..7f259e845d24 100644
--- a/kernel/time/posix-timers.h
+++ b/kernel/time/posix-timers.h
@@ -41,6 +41,7 @@ extern const struct k_clock clock_posix_dynamic;
extern const struct k_clock clock_process;
extern const struct k_clock clock_thread;
extern const struct k_clock alarm_clock;
+extern const struct k_clock clock_aux;
void posix_timer_queue_signal(struct k_itimer *timr);
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index cc15fe293719..f39111830ca3 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -174,8 +174,7 @@ static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt)
return HRTIMER_RESTART;
}
-void __init
-sched_clock_register(u64 (*read)(void), int bits, unsigned long rate)
+void sched_clock_register(u64 (*read)(void), int bits, unsigned long rate)
{
u64 res, wrap, new_mask, new_epoch, cyc, ns;
u32 new_mult, new_shift;
@@ -247,6 +246,7 @@ sched_clock_register(u64 (*read)(void), int bits, unsigned long rate)
pr_debug("Registered %pS as sched_clock source\n", read);
}
+EXPORT_SYMBOL_GPL(sched_clock_register);
void __init generic_sched_clock_init(void)
{
@@ -296,6 +296,11 @@ int sched_clock_suspend(void)
return 0;
}
+static int sched_clock_syscore_suspend(void *data)
+{
+ return sched_clock_suspend();
+}
+
void sched_clock_resume(void)
{
struct clock_read_data *rd = &cd.read_data[0];
@@ -305,14 +310,23 @@ void sched_clock_resume(void)
rd->read_sched_clock = cd.actual_read_sched_clock;
}
-static struct syscore_ops sched_clock_ops = {
- .suspend = sched_clock_suspend,
- .resume = sched_clock_resume,
+static void sched_clock_syscore_resume(void *data)
+{
+ sched_clock_resume();
+}
+
+static const struct syscore_ops sched_clock_syscore_ops = {
+ .suspend = sched_clock_syscore_suspend,
+ .resume = sched_clock_syscore_resume,
+};
+
+static struct syscore sched_clock_syscore = {
+ .ops = &sched_clock_syscore_ops,
};
static int __init sched_clock_syscore_init(void)
{
- register_syscore_ops(&sched_clock_ops);
+ register_syscore(&sched_clock_syscore);
return 0;
}
diff --git a/kernel/time/sleep_timeout.c b/kernel/time/sleep_timeout.c
index c0e960a5de39..3c90574bd904 100644
--- a/kernel/time/sleep_timeout.c
+++ b/kernel/time/sleep_timeout.c
@@ -22,7 +22,7 @@ struct process_timer {
static void process_timeout(struct timer_list *t)
{
- struct process_timer *timeout = from_timer(timeout, t, timer);
+ struct process_timer *timeout = timer_container_of(timeout, t, timer);
wake_up_process(timeout->task);
}
@@ -100,7 +100,7 @@ signed long __sched schedule_timeout(signed long timeout)
timer_delete_sync(&timer.timer);
/* Remove the timer from the object tracker */
- destroy_timer_on_stack(&timer.timer);
+ timer_destroy_on_stack(&timer.timer);
timeout = expire - jiffies;
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 9a3859443c04..7e33d3f2e889 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -411,24 +411,18 @@ int tick_cpu_dying(unsigned int dying_cpu)
}
/*
- * Shutdown an event device on a given cpu:
+ * Shutdown an event device on the outgoing CPU:
*
- * This is called on a life CPU, when a CPU is dead. So we cannot
- * access the hardware device itself.
- * We just set the mode and remove it from the lists.
+ * Called by the dying CPU during teardown, with clockevents_lock held
+ * and interrupts disabled.
*/
-void tick_shutdown(unsigned int cpu)
+void tick_shutdown(void)
{
- struct tick_device *td = &per_cpu(tick_cpu_device, cpu);
+ struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
struct clock_event_device *dev = td->evtdev;
td->mode = TICKDEV_MODE_PERIODIC;
if (dev) {
- /*
- * Prevent that the clock events layer tries to call
- * the set mode function!
- */
- clockevent_set_state(dev, CLOCK_EVT_STATE_DETACHED);
clockevents_exchange_device(dev, NULL);
dev->event_handler = clockevents_handle_noop;
td->evtdev = NULL;
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index faac36de35b9..4e4f7bbe2a64 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -26,7 +26,7 @@ extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast);
extern void tick_handle_periodic(struct clock_event_device *dev);
extern void tick_check_new_device(struct clock_event_device *dev);
extern void tick_offline_cpu(unsigned int cpu);
-extern void tick_shutdown(unsigned int cpu);
+extern void tick_shutdown(void);
extern void tick_suspend(void);
extern void tick_resume(void);
extern bool tick_check_replacement(struct clock_event_device *curdev,
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index 5e2c2c26b3cc..ffee943d796d 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -19,6 +19,10 @@
/**
* tick_program_event - program the CPU local timer device for the next event
+ * @expires: the time at which the next timer event should occur
+ * @force: flag to force reprogramming even if the event time hasn't changed
+ *
+ * Return: 0 on success, negative error code on failure
*/
int tick_program_event(ktime_t expires, int force)
{
@@ -57,6 +61,13 @@ void tick_resume_oneshot(void)
/**
* tick_setup_oneshot - setup the event device for oneshot mode (hres or nohz)
+ * @newdev: Pointer to the clock event device to configure
+ * @handler: Function to be called when the event device triggers an interrupt
+ * @next_event: Initial expiry time for the next event (in ktime)
+ *
+ * Configures the specified clock event device for oneshot mode,
+ * assigns the given handler as its event callback, and programs
+ * the device to trigger at the specified next event time.
*/
void tick_setup_oneshot(struct clock_event_device *newdev,
void (*handler)(struct clock_event_device *),
@@ -69,6 +80,10 @@ void tick_setup_oneshot(struct clock_event_device *newdev,
/**
* tick_switch_to_oneshot - switch to oneshot mode
+ * @handler: function to call when an event occurs on the tick device
+ *
+ * Return: 0 on success, -EINVAL if the tick device is not present,
+ * not functional, or does not support oneshot mode.
*/
int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *))
{
@@ -101,7 +116,7 @@ int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *))
/**
* tick_oneshot_mode_active - check whether the system is in oneshot mode
*
- * returns 1 when either nohz or highres are enabled. otherwise 0.
+ * Return: 1 when either nohz or highres are enabled, otherwise 0.
*/
int tick_oneshot_mode_active(void)
{
@@ -120,6 +135,9 @@ int tick_oneshot_mode_active(void)
* tick_init_highres - switch to high resolution mode
*
* Called with interrupts disabled.
+ *
+ * Return: 0 on success, -EINVAL if the tick device cannot switch
+ * to oneshot/high-resolution mode.
*/
int tick_init_highres(void)
{
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index c527b421c865..8ddf74e705d3 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -201,6 +201,27 @@ static inline void tick_sched_flag_clear(struct tick_sched *ts,
ts->flags &= ~flag;
}
+/*
+ * Allow only one non-timekeeper CPU at a time to update jiffies from
+ * the timer tick.
+ *
+ * Returns true if update was run.
+ */
+static bool tick_limited_update_jiffies64(struct tick_sched *ts, ktime_t now)
+{
+ static atomic_t in_progress;
+ int inp;
+
+ inp = atomic_read(&in_progress);
+ if (inp || !atomic_try_cmpxchg(&in_progress, &inp, 1))
+ return false;
+
+ if (ts->last_tick_jiffies == jiffies)
+ tick_do_update_jiffies64(now);
+ atomic_set(&in_progress, 0);
+ return true;
+}
+
#define MAX_STALLED_JIFFIES 5
static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now)
@@ -239,10 +260,11 @@ static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now)
ts->stalled_jiffies = 0;
ts->last_tick_jiffies = READ_ONCE(jiffies);
} else {
- if (++ts->stalled_jiffies == MAX_STALLED_JIFFIES) {
- tick_do_update_jiffies64(now);
- ts->stalled_jiffies = 0;
- ts->last_tick_jiffies = READ_ONCE(jiffies);
+ if (++ts->stalled_jiffies >= MAX_STALLED_JIFFIES) {
+ if (tick_limited_update_jiffies64(ts, now)) {
+ ts->stalled_jiffies = 0;
+ ts->last_tick_jiffies = READ_ONCE(jiffies);
+ }
}
}
@@ -1152,16 +1174,15 @@ static bool report_idle_softirq(void)
return false;
}
- if (ratelimit >= 10)
- return false;
-
/* On RT, softirq handling may be waiting on some lock */
if (local_bh_blocked())
return false;
- pr_warn("NOHZ tick-stop error: local softirq work is pending, handler #%02x!!!\n",
- pending);
- ratelimit++;
+ if (ratelimit < 10) {
+ pr_warn("NOHZ tick-stop error: local softirq work is pending, handler #%02x!!!\n",
+ pending);
+ ratelimit++;
+ }
return true;
}
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 1b69caa87480..0ba8e3c50d62 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -858,6 +858,7 @@ struct timespec64 timespec64_add_safe(const struct timespec64 lhs,
return res;
}
+EXPORT_SYMBOL_GPL(timespec64_add_safe);
/**
* get_timespec64 - get user's time value into kernel space
diff --git a/kernel/time/timecounter.c b/kernel/time/timecounter.c
index e6285288d765..3d2a354cfe1c 100644
--- a/kernel/time/timecounter.c
+++ b/kernel/time/timecounter.c
@@ -6,7 +6,7 @@
#include <linux/timecounter.h>
void timecounter_init(struct timecounter *tc,
- const struct cyclecounter *cc,
+ struct cyclecounter *cc,
u64 start_tstamp)
{
tc->cc = cc;
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 1e67d076f195..3ec3daa4acab 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -6,6 +6,7 @@
#include <linux/timekeeper_internal.h>
#include <linux/module.h>
#include <linux/interrupt.h>
+#include <linux/kobject.h>
#include <linux/percpu.h>
#include <linux/init.h>
#include <linux/mm.h>
@@ -25,6 +26,8 @@
#include <linux/audit.h>
#include <linux/random.h>
+#include <vdso/auxclock.h>
+
#include "tick-internal.h"
#include "ntp_internal.h"
#include "timekeeping_internal.h"
@@ -53,7 +56,38 @@ struct tk_data {
raw_spinlock_t lock;
} ____cacheline_aligned;
-static struct tk_data tk_core;
+static struct tk_data timekeeper_data[TIMEKEEPERS_MAX];
+
+/* The core timekeeper */
+#define tk_core (timekeeper_data[TIMEKEEPER_CORE])
+
+#ifdef CONFIG_POSIX_AUX_CLOCKS
+static inline bool tk_get_aux_ts64(unsigned int tkid, struct timespec64 *ts)
+{
+ return ktime_get_aux_ts64(CLOCK_AUX + tkid - TIMEKEEPER_AUX_FIRST, ts);
+}
+
+static inline bool tk_is_aux(const struct timekeeper *tk)
+{
+ return tk->id >= TIMEKEEPER_AUX_FIRST && tk->id <= TIMEKEEPER_AUX_LAST;
+}
+#else
+static inline bool tk_get_aux_ts64(unsigned int tkid, struct timespec64 *ts)
+{
+ return false;
+}
+
+static inline bool tk_is_aux(const struct timekeeper *tk)
+{
+ return false;
+}
+#endif
+
+static inline void tk_update_aux_offs(struct timekeeper *tk, ktime_t offs)
+{
+ tk->offs_aux = offs;
+ tk->monotonic_to_aux = ktime_to_timespec64(offs);
+}
/* flag for if timekeeping is suspended */
int __read_mostly timekeeping_suspended;
@@ -113,6 +147,16 @@ static struct tk_fast tk_fast_raw ____cacheline_aligned = {
.base[1] = FAST_TK_INIT,
};
+#ifdef CONFIG_POSIX_AUX_CLOCKS
+static __init void tk_aux_setup(void);
+static void tk_aux_update_clocksource(void);
+static void tk_aux_advance(void);
+#else
+static inline void tk_aux_setup(void) { }
+static inline void tk_aux_update_clocksource(void) { }
+static inline void tk_aux_advance(void) { }
+#endif
+
unsigned long timekeeper_lock_irqsave(void)
{
unsigned long flags;
@@ -164,10 +208,34 @@ static inline struct timespec64 tk_xtime(const struct timekeeper *tk)
return ts;
}
+static inline struct timespec64 tk_xtime_coarse(const struct timekeeper *tk)
+{
+ struct timespec64 ts;
+
+ ts.tv_sec = tk->xtime_sec;
+ ts.tv_nsec = tk->coarse_nsec;
+ return ts;
+}
+
+/*
+ * Update the nanoseconds part for the coarse time keepers. They can't rely
+ * on xtime_nsec because xtime_nsec could be adjusted by a small negative
+ * amount when the multiplication factor of the clock is adjusted, which
+ * could cause the coarse clocks to go slightly backwards. See
+ * timekeeping_apply_adjustment(). Thus we keep a separate copy for the coarse
+ * clockids which only is updated when the clock has been set or we have
+ * accumulated time.
+ */
+static inline void tk_update_coarse_nsecs(struct timekeeper *tk)
+{
+ tk->coarse_nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
+}
+
static void tk_set_xtime(struct timekeeper *tk, const struct timespec64 *ts)
{
tk->xtime_sec = ts->tv_sec;
tk->tkr_mono.xtime_nsec = (u64)ts->tv_nsec << tk->tkr_mono.shift;
+ tk_update_coarse_nsecs(tk);
}
static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts)
@@ -175,6 +243,7 @@ static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts)
tk->xtime_sec += ts->tv_sec;
tk->tkr_mono.xtime_nsec += (u64)ts->tv_nsec << tk->tkr_mono.shift;
tk_normalize_xtime(tk);
+ tk_update_coarse_nsecs(tk);
}
static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec64 wtm)
@@ -576,7 +645,7 @@ EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier);
*/
static inline void tk_update_leap_state(struct timekeeper *tk)
{
- tk->next_leap_ktime = ntp_get_next_leap();
+ tk->next_leap_ktime = ntp_get_next_leap(tk->id);
if (tk->next_leap_ktime != KTIME_MAX)
/* Convert to monotonic time */
tk->next_leap_ktime = ktime_sub(tk->next_leap_ktime, tk->offs_real);
@@ -638,7 +707,7 @@ static void timekeeping_restore_shadow(struct tk_data *tkd)
static void timekeeping_update_from_shadow(struct tk_data *tkd, unsigned int action)
{
- struct timekeeper *tk = &tk_core.shadow_timekeeper;
+ struct timekeeper *tk = &tkd->shadow_timekeeper;
lockdep_assert_held(&tkd->lock);
@@ -653,18 +722,22 @@ static void timekeeping_update_from_shadow(struct tk_data *tkd, unsigned int act
if (action & TK_CLEAR_NTP) {
tk->ntp_error = 0;
- ntp_clear();
+ ntp_clear(tk->id);
}
tk_update_leap_state(tk);
tk_update_ktime_data(tk);
+ tk->tkr_mono.base_real = tk->tkr_mono.base + tk->offs_real;
- update_vsyscall(tk);
- update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET);
+ if (tk->id == TIMEKEEPER_CORE) {
+ update_vsyscall(tk);
+ update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET);
- tk->tkr_mono.base_real = tk->tkr_mono.base + tk->offs_real;
- update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono);
- update_fast_timekeeper(&tk->tkr_raw, &tk_fast_raw);
+ update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono);
+ update_fast_timekeeper(&tk->tkr_raw, &tk_fast_raw);
+ } else if (tk_is_aux(tk)) {
+ vdso_time_update_aux(tk);
+ }
if (action & TK_CLOCK_WAS_SET)
tk->clock_was_set_seq++;
@@ -708,6 +781,7 @@ static void timekeeping_forward_now(struct timekeeper *tk)
tk_normalize_xtime(tk);
delta -= incr;
}
+ tk_update_coarse_nsecs(tk);
}
/**
@@ -804,8 +878,8 @@ EXPORT_SYMBOL_GPL(ktime_get_with_offset);
ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs)
{
struct timekeeper *tk = &tk_core.timekeeper;
- unsigned int seq;
ktime_t base, *offset = offsets[offs];
+ unsigned int seq;
u64 nsecs;
WARN_ON(timekeeping_suspended);
@@ -813,7 +887,7 @@ ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs)
do {
seq = read_seqcount_begin(&tk_core.seq);
base = ktime_add(tk->tkr_mono.base, *offset);
- nsecs = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
+ nsecs = tk->coarse_nsec;
} while (read_seqcount_retry(&tk_core.seq, seq));
@@ -949,9 +1023,14 @@ time64_t ktime_get_real_seconds(void)
EXPORT_SYMBOL_GPL(ktime_get_real_seconds);
/**
- * __ktime_get_real_seconds - The same as ktime_get_real_seconds
- * but without the sequence counter protect. This internal function
- * is called just when timekeeping lock is already held.
+ * __ktime_get_real_seconds - Unprotected access to CLOCK_REALTIME seconds
+ *
+ * The same as ktime_get_real_seconds() but without the sequence counter
+ * protection. This function is used in restricted contexts like the x86 MCE
+ * handler and in KGDB. It's unprotected on 32-bit vs. concurrent half
+ * completed modification and only to be used for such critical contexts.
+ *
+ * Returns: Racy snapshot of the CLOCK_REALTIME seconds value
*/
noinstr time64_t __ktime_get_real_seconds(void)
{
@@ -1230,7 +1309,7 @@ int get_device_system_crosststamp(int (*get_time_fn)
struct system_time_snapshot *history_begin,
struct system_device_crosststamp *xtstamp)
{
- struct system_counterval_t system_counterval;
+ struct system_counterval_t system_counterval = {};
struct timekeeper *tk = &tk_core.timekeeper;
u64 cycles, now, interval_start;
unsigned int clock_was_set_seq = 0;
@@ -1386,41 +1465,73 @@ int do_settimeofday64(const struct timespec64 *ts)
}
EXPORT_SYMBOL(do_settimeofday64);
+static inline bool timekeeper_is_core_tk(struct timekeeper *tk)
+{
+ return !IS_ENABLED(CONFIG_POSIX_AUX_CLOCKS) || tk->id == TIMEKEEPER_CORE;
+}
+
/**
- * timekeeping_inject_offset - Adds or subtracts from the current time.
+ * __timekeeping_inject_offset - Adds or subtracts from the current time.
+ * @tkd: Pointer to the timekeeper to modify
* @ts: Pointer to the timespec variable containing the offset
*
* Adds or subtracts an offset value from the current time.
*/
-static int timekeeping_inject_offset(const struct timespec64 *ts)
+static int __timekeeping_inject_offset(struct tk_data *tkd, const struct timespec64 *ts)
{
+ struct timekeeper *tks = &tkd->shadow_timekeeper;
+ struct timespec64 tmp;
+
if (ts->tv_nsec < 0 || ts->tv_nsec >= NSEC_PER_SEC)
return -EINVAL;
- scoped_guard (raw_spinlock_irqsave, &tk_core.lock) {
- struct timekeeper *tks = &tk_core.shadow_timekeeper;
- struct timespec64 tmp;
-
- timekeeping_forward_now(tks);
+ timekeeping_forward_now(tks);
+ if (timekeeper_is_core_tk(tks)) {
/* Make sure the proposed value is valid */
tmp = timespec64_add(tk_xtime(tks), *ts);
if (timespec64_compare(&tks->wall_to_monotonic, ts) > 0 ||
!timespec64_valid_settod(&tmp)) {
- timekeeping_restore_shadow(&tk_core);
+ timekeeping_restore_shadow(tkd);
return -EINVAL;
}
tk_xtime_add(tks, ts);
tk_set_wall_to_mono(tks, timespec64_sub(tks->wall_to_monotonic, *ts));
- timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL);
+ } else {
+ struct tk_read_base *tkr_mono = &tks->tkr_mono;
+ ktime_t now, offs;
+
+ /* Get the current time */
+ now = ktime_add_ns(tkr_mono->base, timekeeping_get_ns(tkr_mono));
+ /* Add the relative offset change */
+ offs = ktime_add(tks->offs_aux, timespec64_to_ktime(*ts));
+
+ /* Prevent that the resulting time becomes negative */
+ if (ktime_add(now, offs) < 0) {
+ timekeeping_restore_shadow(tkd);
+ return -EINVAL;
+ }
+ tk_update_aux_offs(tks, offs);
}
- /* Signal hrtimers about time change */
- clock_was_set(CLOCK_SET_WALL);
+ timekeeping_update_from_shadow(tkd, TK_UPDATE_ALL);
return 0;
}
+static int timekeeping_inject_offset(const struct timespec64 *ts)
+{
+ int ret;
+
+ scoped_guard (raw_spinlock_irqsave, &tk_core.lock)
+ ret = __timekeeping_inject_offset(&tk_core, ts);
+
+ /* Signal hrtimers about time change */
+ if (!ret)
+ clock_was_set(CLOCK_SET_WALL);
+ return ret;
+}
+
/*
* Indicates if there is an offset between the system clock and the hardware
* clock/persistent clock/rtc.
@@ -1496,6 +1607,8 @@ static int change_clocksource(void *data)
timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL);
}
+ tk_aux_update_clocksource();
+
if (old) {
if (old->disable)
old->disable(old);
@@ -1547,6 +1660,39 @@ void ktime_get_raw_ts64(struct timespec64 *ts)
}
EXPORT_SYMBOL(ktime_get_raw_ts64);
+/**
+ * ktime_get_clock_ts64 - Returns time of a clock in a timespec
+ * @id: POSIX clock ID of the clock to read
+ * @ts: Pointer to the timespec64 to be set
+ *
+ * The timestamp is invalidated (@ts->tv_sec is set to -1) if the
+ * clock @id is not available.
+ */
+void ktime_get_clock_ts64(clockid_t id, struct timespec64 *ts)
+{
+ /* Invalidate time stamp */
+ ts->tv_sec = -1;
+ ts->tv_nsec = 0;
+
+ switch (id) {
+ case CLOCK_REALTIME:
+ ktime_get_real_ts64(ts);
+ return;
+ case CLOCK_MONOTONIC:
+ ktime_get_ts64(ts);
+ return;
+ case CLOCK_MONOTONIC_RAW:
+ ktime_get_raw_ts64(ts);
+ return;
+ case CLOCK_AUX ... CLOCK_AUX_LAST:
+ if (IS_ENABLED(CONFIG_POSIX_AUX_CLOCKS))
+ ktime_get_aux_ts64(id, ts);
+ return;
+ default:
+ WARN_ON_ONCE(1);
+ }
+}
+EXPORT_SYMBOL_GPL(ktime_get_clock_ts64);
/**
* timekeeping_valid_for_hres - Check if timekeeping is suitable for hres
@@ -1623,10 +1769,12 @@ read_persistent_wall_and_boot_offset(struct timespec64 *wall_time,
*boot_offset = ns_to_timespec64(local_clock());
}
-static __init void tkd_basic_setup(struct tk_data *tkd)
+static __init void tkd_basic_setup(struct tk_data *tkd, enum timekeeper_ids tk_id, bool valid)
{
raw_spin_lock_init(&tkd->lock);
seqcount_raw_spinlock_init(&tkd->seq, &tkd->lock);
+ tkd->timekeeper.id = tkd->shadow_timekeeper.id = tk_id;
+ tkd->timekeeper.clock_valid = tkd->shadow_timekeeper.clock_valid = valid;
}
/*
@@ -1656,7 +1804,8 @@ void __init timekeeping_init(void)
struct timekeeper *tks = &tk_core.shadow_timekeeper;
struct clocksource *clock;
- tkd_basic_setup(&tk_core);
+ tkd_basic_setup(&tk_core, TIMEKEEPER_CORE, true);
+ tk_aux_setup();
read_persistent_wall_and_boot_offset(&wall_time, &boot_offset);
if (timespec64_valid_settod(&wall_time) &&
@@ -1845,6 +1994,11 @@ void timekeeping_resume(void)
timerfd_resume();
}
+static void timekeeping_syscore_resume(void *data)
+{
+ timekeeping_resume();
+}
+
int timekeeping_suspend(void)
{
struct timekeeper *tks = &tk_core.shadow_timekeeper;
@@ -1912,15 +2066,24 @@ int timekeeping_suspend(void)
return 0;
}
+static int timekeeping_syscore_suspend(void *data)
+{
+ return timekeeping_suspend();
+}
+
/* sysfs resume/suspend bits for timekeeping */
-static struct syscore_ops timekeeping_syscore_ops = {
- .resume = timekeeping_resume,
- .suspend = timekeeping_suspend,
+static const struct syscore_ops timekeeping_syscore_ops = {
+ .resume = timekeeping_syscore_resume,
+ .suspend = timekeeping_syscore_suspend,
+};
+
+static struct syscore timekeeping_syscore = {
+ .ops = &timekeeping_syscore_ops,
};
static int __init timekeeping_init_ops(void)
{
- register_syscore_ops(&timekeeping_syscore_ops);
+ register_syscore(&timekeeping_syscore);
return 0;
}
device_initcall(timekeeping_init_ops);
@@ -2008,7 +2171,7 @@ static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk,
*/
static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
{
- u64 ntp_tl = ntp_tick_length();
+ u64 ntp_tl = ntp_tick_length(tk->id);
u32 mult;
/*
@@ -2089,7 +2252,7 @@ static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk)
}
/* Figure out if its a leap sec and apply if needed */
- leap = second_overflow(tk->xtime_sec);
+ leap = second_overflow(tk->id, tk->xtime_sec);
if (unlikely(leap)) {
struct timespec64 ts;
@@ -2155,15 +2318,13 @@ static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset,
* timekeeping_advance - Updates the timekeeper to the current time and
* current NTP tick length
*/
-static bool timekeeping_advance(enum timekeeping_adv_mode mode)
+static bool __timekeeping_advance(struct tk_data *tkd, enum timekeeping_adv_mode mode)
{
- struct timekeeper *tk = &tk_core.shadow_timekeeper;
- struct timekeeper *real_tk = &tk_core.timekeeper;
+ struct timekeeper *tk = &tkd->shadow_timekeeper;
+ struct timekeeper *real_tk = &tkd->timekeeper;
unsigned int clock_set = 0;
int shift = 0, maxshift;
- u64 offset;
-
- guard(raw_spinlock_irqsave)(&tk_core.lock);
+ u64 offset, orig_offset;
/* Make sure we're fully resumed: */
if (unlikely(timekeeping_suspended))
@@ -2172,7 +2333,7 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode)
offset = clocksource_delta(tk_clock_read(&tk->tkr_mono),
tk->tkr_mono.cycle_last, tk->tkr_mono.mask,
tk->tkr_mono.clock->max_raw_delta);
-
+ orig_offset = offset;
/* Check if there's really nothing to do */
if (offset < real_tk->cycle_interval && mode == TK_ADV_TICK)
return false;
@@ -2188,7 +2349,7 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode)
shift = ilog2(offset) - ilog2(tk->cycle_interval);
shift = max(0, shift);
/* Bound shift to one less than what overflows tick_length */
- maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1;
+ maxshift = (64 - (ilog2(ntp_tick_length(tk->id)) + 1)) - 1;
shift = min(shift, maxshift);
while (offset >= tk->cycle_interval) {
offset = logarithmic_accumulation(tk, offset, shift, &clock_set);
@@ -2205,19 +2366,35 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode)
*/
clock_set |= accumulate_nsecs_to_secs(tk);
- timekeeping_update_from_shadow(&tk_core, clock_set);
+ /*
+ * To avoid inconsistencies caused by adjtimex TK_ADV_FREQ calls
+ * making small negative adjustments to the base xtime_nsec
+ * value, only update the coarse clocks if we accumulated time
+ */
+ if (orig_offset != offset)
+ tk_update_coarse_nsecs(tk);
+
+ timekeeping_update_from_shadow(tkd, clock_set);
return !!clock_set;
}
+static bool timekeeping_advance(enum timekeeping_adv_mode mode)
+{
+ guard(raw_spinlock_irqsave)(&tk_core.lock);
+ return __timekeeping_advance(&tk_core, mode);
+}
+
/**
* update_wall_time - Uses the current clocksource to increment the wall time
*
+ * It also updates the enabled auxiliary clock timekeepers
*/
void update_wall_time(void)
{
if (timekeeping_advance(TK_ADV_TICK))
clock_was_set_delayed();
+ tk_aux_advance();
}
/**
@@ -2248,7 +2425,7 @@ void ktime_get_coarse_real_ts64(struct timespec64 *ts)
do {
seq = read_seqcount_begin(&tk_core.seq);
- *ts = tk_xtime(tk);
+ *ts = tk_xtime_coarse(tk);
} while (read_seqcount_retry(&tk_core.seq, seq));
}
EXPORT_SYMBOL(ktime_get_coarse_real_ts64);
@@ -2271,7 +2448,7 @@ void ktime_get_coarse_real_ts64_mg(struct timespec64 *ts)
do {
seq = read_seqcount_begin(&tk_core.seq);
- *ts = tk_xtime(tk);
+ *ts = tk_xtime_coarse(tk);
offset = tk_core.timekeeper.offs_real;
} while (read_seqcount_retry(&tk_core.seq, seq));
@@ -2350,12 +2527,12 @@ void ktime_get_coarse_ts64(struct timespec64 *ts)
do {
seq = read_seqcount_begin(&tk_core.seq);
- now = tk_xtime(tk);
+ now = tk_xtime_coarse(tk);
mono = tk->wall_to_monotonic;
} while (read_seqcount_retry(&tk_core.seq, seq));
set_normalized_timespec64(ts, now.tv_sec + mono.tv_sec,
- now.tv_nsec + mono.tv_nsec);
+ now.tv_nsec + mono.tv_nsec);
}
EXPORT_SYMBOL(ktime_get_coarse_ts64);
@@ -2415,7 +2592,7 @@ ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real,
/*
* timekeeping_validate_timex - Ensures the timex is ok for use in do_adjtimex
*/
-static int timekeeping_validate_timex(const struct __kernel_timex *txc)
+static int timekeeping_validate_timex(const struct __kernel_timex *txc, bool aux_clock)
{
if (txc->modes & ADJ_ADJTIME) {
/* singleshot must not be used with any other mode bits */
@@ -2474,6 +2651,20 @@ static int timekeeping_validate_timex(const struct __kernel_timex *txc)
return -EINVAL;
}
+ if (aux_clock) {
+ /* Auxiliary clocks are similar to TAI and do not have leap seconds */
+ if (txc->status & (STA_INS | STA_DEL))
+ return -EINVAL;
+
+ /* No TAI offset setting */
+ if (txc->modes & ADJ_TAI)
+ return -EINVAL;
+
+ /* No PPS support either */
+ if (txc->status & (STA_PPSFREQ | STA_PPSTIME))
+ return -EINVAL;
+ }
+
return 0;
}
@@ -2492,74 +2683,103 @@ unsigned long random_get_entropy_fallback(void)
}
EXPORT_SYMBOL_GPL(random_get_entropy_fallback);
-/**
- * do_adjtimex() - Accessor function to NTP __do_adjtimex function
- * @txc: Pointer to kernel_timex structure containing NTP parameters
- */
-int do_adjtimex(struct __kernel_timex *txc)
+struct adjtimex_result {
+ struct audit_ntp_data ad;
+ struct timespec64 delta;
+ bool clock_set;
+};
+
+static int __do_adjtimex(struct tk_data *tkd, struct __kernel_timex *txc,
+ struct adjtimex_result *result)
{
- struct audit_ntp_data ad;
- bool offset_set = false;
- bool clock_set = false;
+ struct timekeeper *tks = &tkd->shadow_timekeeper;
+ bool aux_clock = !timekeeper_is_core_tk(tks);
struct timespec64 ts;
+ s32 orig_tai, tai;
int ret;
/* Validate the data before disabling interrupts */
- ret = timekeeping_validate_timex(txc);
+ ret = timekeeping_validate_timex(txc, aux_clock);
if (ret)
return ret;
add_device_randomness(txc, sizeof(*txc));
- if (txc->modes & ADJ_SETOFFSET) {
- struct timespec64 delta;
+ if (!aux_clock)
+ ktime_get_real_ts64(&ts);
+ else
+ tk_get_aux_ts64(tkd->timekeeper.id, &ts);
+
+ add_device_randomness(&ts, sizeof(ts));
+
+ guard(raw_spinlock_irqsave)(&tkd->lock);
+
+ if (!tks->clock_valid)
+ return -ENODEV;
- delta.tv_sec = txc->time.tv_sec;
- delta.tv_nsec = txc->time.tv_usec;
+ if (txc->modes & ADJ_SETOFFSET) {
+ result->delta.tv_sec = txc->time.tv_sec;
+ result->delta.tv_nsec = txc->time.tv_usec;
if (!(txc->modes & ADJ_NANO))
- delta.tv_nsec *= 1000;
- ret = timekeeping_inject_offset(&delta);
+ result->delta.tv_nsec *= 1000;
+ ret = __timekeeping_inject_offset(tkd, &result->delta);
if (ret)
return ret;
-
- offset_set = delta.tv_sec != 0;
- audit_tk_injoffset(delta);
+ result->clock_set = true;
}
- audit_ntp_init(&ad);
+ orig_tai = tai = tks->tai_offset;
+ ret = ntp_adjtimex(tks->id, txc, &ts, &tai, &result->ad);
- ktime_get_real_ts64(&ts);
- add_device_randomness(&ts, sizeof(ts));
+ if (tai != orig_tai) {
+ __timekeeping_set_tai_offset(tks, tai);
+ timekeeping_update_from_shadow(tkd, TK_CLOCK_WAS_SET);
+ result->clock_set = true;
+ } else {
+ tk_update_leap_state_all(&tk_core);
+ }
- scoped_guard (raw_spinlock_irqsave, &tk_core.lock) {
- struct timekeeper *tks = &tk_core.shadow_timekeeper;
- s32 orig_tai, tai;
+ /* Update the multiplier immediately if frequency was set directly */
+ if (txc->modes & (ADJ_FREQUENCY | ADJ_TICK))
+ result->clock_set |= __timekeeping_advance(tkd, TK_ADV_FREQ);
- orig_tai = tai = tks->tai_offset;
- ret = __do_adjtimex(txc, &ts, &tai, &ad);
+ return ret;
+}
- if (tai != orig_tai) {
- __timekeeping_set_tai_offset(tks, tai);
- timekeeping_update_from_shadow(&tk_core, TK_CLOCK_WAS_SET);
- clock_set = true;
- } else {
- tk_update_leap_state_all(&tk_core);
- }
- }
+/**
+ * do_adjtimex() - Accessor function to the NTP ntp_adjtimex() function
+ * @txc: Pointer to kernel_timex structure containing NTP parameters
+ */
+int do_adjtimex(struct __kernel_timex *txc)
+{
+ struct adjtimex_result result = { };
+ int ret;
- audit_ntp_log(&ad);
+ ret = __do_adjtimex(&tk_core, txc, &result);
+ if (ret < 0)
+ return ret;
- /* Update the multiplier immediately if frequency was set directly */
- if (txc->modes & (ADJ_FREQUENCY | ADJ_TICK))
- clock_set |= timekeeping_advance(TK_ADV_FREQ);
+ if (txc->modes & ADJ_SETOFFSET)
+ audit_tk_injoffset(result.delta);
+
+ audit_ntp_log(&result.ad);
- if (clock_set)
+ if (result.clock_set)
clock_was_set(CLOCK_SET_WALL);
- ntp_notify_cmos_timer(offset_set);
+ ntp_notify_cmos_timer(result.delta.tv_sec != 0);
return ret;
}
+/*
+ * Invoked from NTP with the time keeper lock held, so lockless access is
+ * fine.
+ */
+long ktime_get_ntp_seconds(unsigned int id)
+{
+ return timekeeper_data[id].timekeeper.xtime_sec;
+}
+
#ifdef CONFIG_NTP_PPS
/**
* hardpps() - Accessor function to NTP __hardpps function
@@ -2573,3 +2793,321 @@ void hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts)
}
EXPORT_SYMBOL(hardpps);
#endif /* CONFIG_NTP_PPS */
+
+#ifdef CONFIG_POSIX_AUX_CLOCKS
+#include "posix-timers.h"
+
+/*
+ * Bitmap for the activated auxiliary timekeepers to allow lockless quick
+ * checks in the hot paths without touching extra cache lines. If set, then
+ * the state of the corresponding timekeeper has to be re-checked under
+ * tk_data::lock.
+ */
+static unsigned long aux_timekeepers;
+
+static inline unsigned int clockid_to_tkid(unsigned int id)
+{
+ return TIMEKEEPER_AUX_FIRST + id - CLOCK_AUX;
+}
+
+static inline struct tk_data *aux_get_tk_data(clockid_t id)
+{
+ if (!clockid_aux_valid(id))
+ return NULL;
+ return &timekeeper_data[clockid_to_tkid(id)];
+}
+
+/* Invoked from timekeeping after a clocksource change */
+static void tk_aux_update_clocksource(void)
+{
+ unsigned long active = READ_ONCE(aux_timekeepers);
+ unsigned int id;
+
+ for_each_set_bit(id, &active, BITS_PER_LONG) {
+ struct tk_data *tkd = &timekeeper_data[id + TIMEKEEPER_AUX_FIRST];
+ struct timekeeper *tks = &tkd->shadow_timekeeper;
+
+ guard(raw_spinlock_irqsave)(&tkd->lock);
+ if (!tks->clock_valid)
+ continue;
+
+ timekeeping_forward_now(tks);
+ tk_setup_internals(tks, tk_core.timekeeper.tkr_mono.clock);
+ timekeeping_update_from_shadow(tkd, TK_UPDATE_ALL);
+ }
+}
+
+static void tk_aux_advance(void)
+{
+ unsigned long active = READ_ONCE(aux_timekeepers);
+ unsigned int id;
+
+ /* Lockless quick check to avoid extra cache lines */
+ for_each_set_bit(id, &active, BITS_PER_LONG) {
+ struct tk_data *aux_tkd = &timekeeper_data[id + TIMEKEEPER_AUX_FIRST];
+
+ guard(raw_spinlock)(&aux_tkd->lock);
+ if (aux_tkd->shadow_timekeeper.clock_valid)
+ __timekeeping_advance(aux_tkd, TK_ADV_TICK);
+ }
+}
+
+/**
+ * ktime_get_aux - Get time for an AUX clock
+ * @id: ID of the clock to read (CLOCK_AUX...)
+ * @kt: Pointer to ktime_t to store the time stamp
+ *
+ * Returns: True if the timestamp is valid, false otherwise
+ */
+bool ktime_get_aux(clockid_t id, ktime_t *kt)
+{
+ struct tk_data *aux_tkd = aux_get_tk_data(id);
+ struct timekeeper *aux_tk;
+ unsigned int seq;
+ ktime_t base;
+ u64 nsecs;
+
+ WARN_ON(timekeeping_suspended);
+
+ if (!aux_tkd)
+ return false;
+
+ aux_tk = &aux_tkd->timekeeper;
+ do {
+ seq = read_seqcount_begin(&aux_tkd->seq);
+ if (!aux_tk->clock_valid)
+ return false;
+
+ base = ktime_add(aux_tk->tkr_mono.base, aux_tk->offs_aux);
+ nsecs = timekeeping_get_ns(&aux_tk->tkr_mono);
+ } while (read_seqcount_retry(&aux_tkd->seq, seq));
+
+ *kt = ktime_add_ns(base, nsecs);
+ return true;
+}
+EXPORT_SYMBOL_GPL(ktime_get_aux);
+
+/**
+ * ktime_get_aux_ts64 - Get time for an AUX clock
+ * @id: ID of the clock to read (CLOCK_AUX...)
+ * @ts: Pointer to timespec64 to store the time stamp
+ *
+ * Returns: True if the timestamp is valid, false otherwise
+ */
+bool ktime_get_aux_ts64(clockid_t id, struct timespec64 *ts)
+{
+ ktime_t now;
+
+ if (!ktime_get_aux(id, &now))
+ return false;
+ *ts = ktime_to_timespec64(now);
+ return true;
+}
+EXPORT_SYMBOL_GPL(ktime_get_aux_ts64);
+
+static int aux_get_res(clockid_t id, struct timespec64 *tp)
+{
+ if (!clockid_aux_valid(id))
+ return -ENODEV;
+
+ tp->tv_sec = aux_clock_resolution_ns() / NSEC_PER_SEC;
+ tp->tv_nsec = aux_clock_resolution_ns() % NSEC_PER_SEC;
+ return 0;
+}
+
+static int aux_get_timespec(clockid_t id, struct timespec64 *tp)
+{
+ return ktime_get_aux_ts64(id, tp) ? 0 : -ENODEV;
+}
+
+static int aux_clock_set(const clockid_t id, const struct timespec64 *tnew)
+{
+ struct tk_data *aux_tkd = aux_get_tk_data(id);
+ struct timekeeper *aux_tks;
+ ktime_t tnow, nsecs;
+
+ if (!timespec64_valid_settod(tnew))
+ return -EINVAL;
+ if (!aux_tkd)
+ return -ENODEV;
+
+ aux_tks = &aux_tkd->shadow_timekeeper;
+
+ guard(raw_spinlock_irq)(&aux_tkd->lock);
+ if (!aux_tks->clock_valid)
+ return -ENODEV;
+
+ /* Forward the timekeeper base time */
+ timekeeping_forward_now(aux_tks);
+ /*
+ * Get the updated base time. tkr_mono.base has not been
+ * updated yet, so do that first. That makes the update
+ * in timekeeping_update_from_shadow() redundant, but
+ * that's harmless. After that @tnow can be calculated
+ * by using tkr_mono::cycle_last, which has been set
+ * by timekeeping_forward_now().
+ */
+ tk_update_ktime_data(aux_tks);
+ nsecs = timekeeping_cycles_to_ns(&aux_tks->tkr_mono, aux_tks->tkr_mono.cycle_last);
+ tnow = ktime_add(aux_tks->tkr_mono.base, nsecs);
+
+ /*
+ * Calculate the new AUX offset as delta to @tnow ("monotonic").
+ * That avoids all the tk::xtime back and forth conversions as
+ * xtime ("realtime") is not applicable for auxiliary clocks and
+ * kept in sync with "monotonic".
+ */
+ tk_update_aux_offs(aux_tks, ktime_sub(timespec64_to_ktime(*tnew), tnow));
+
+ timekeeping_update_from_shadow(aux_tkd, TK_UPDATE_ALL);
+ return 0;
+}
+
+static int aux_clock_adj(const clockid_t id, struct __kernel_timex *txc)
+{
+ struct tk_data *aux_tkd = aux_get_tk_data(id);
+ struct adjtimex_result result = { };
+
+ if (!aux_tkd)
+ return -ENODEV;
+
+ /*
+ * @result is ignored for now as there are neither hrtimers nor an
+ * RTC related to auxiliary clocks.
+ */
+ return __do_adjtimex(aux_tkd, txc, &result);
+}
+
+const struct k_clock clock_aux = {
+ .clock_getres = aux_get_res,
+ .clock_get_timespec = aux_get_timespec,
+ .clock_set = aux_clock_set,
+ .clock_adj = aux_clock_adj,
+};
+
+static void aux_clock_enable(clockid_t id)
+{
+ struct tk_read_base *tkr_raw = &tk_core.timekeeper.tkr_raw;
+ struct tk_data *aux_tkd = aux_get_tk_data(id);
+ struct timekeeper *aux_tks = &aux_tkd->shadow_timekeeper;
+
+ /* Prevent the core timekeeper from changing. */
+ guard(raw_spinlock_irq)(&tk_core.lock);
+
+ /*
+ * Setup the auxiliary clock assuming that the raw core timekeeper
+ * clock frequency conversion is close enough. Userspace has to
+ * adjust for the deviation via clock_adjtime(2).
+ */
+ guard(raw_spinlock_nested)(&aux_tkd->lock);
+
+ /* Remove leftovers of a previous registration */
+ memset(aux_tks, 0, sizeof(*aux_tks));
+ /* Restore the timekeeper id */
+ aux_tks->id = aux_tkd->timekeeper.id;
+ /* Setup the timekeeper based on the current system clocksource */
+ tk_setup_internals(aux_tks, tkr_raw->clock);
+
+ /* Mark it valid and set it live */
+ aux_tks->clock_valid = true;
+ timekeeping_update_from_shadow(aux_tkd, TK_UPDATE_ALL);
+}
+
+static void aux_clock_disable(clockid_t id)
+{
+ struct tk_data *aux_tkd = aux_get_tk_data(id);
+
+ guard(raw_spinlock_irq)(&aux_tkd->lock);
+ aux_tkd->shadow_timekeeper.clock_valid = false;
+ timekeeping_update_from_shadow(aux_tkd, TK_UPDATE_ALL);
+}
+
+static DEFINE_MUTEX(aux_clock_mutex);
+
+static ssize_t aux_clock_enable_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ /* Lazy atoi() as name is "0..7" */
+ int id = kobj->name[0] & 0x7;
+ bool enable;
+
+ if (!capable(CAP_SYS_TIME))
+ return -EPERM;
+
+ if (kstrtobool(buf, &enable) < 0)
+ return -EINVAL;
+
+ guard(mutex)(&aux_clock_mutex);
+ if (enable == test_bit(id, &aux_timekeepers))
+ return count;
+
+ if (enable) {
+ aux_clock_enable(CLOCK_AUX + id);
+ set_bit(id, &aux_timekeepers);
+ } else {
+ aux_clock_disable(CLOCK_AUX + id);
+ clear_bit(id, &aux_timekeepers);
+ }
+ return count;
+}
+
+static ssize_t aux_clock_enable_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+{
+ unsigned long active = READ_ONCE(aux_timekeepers);
+ /* Lazy atoi() as name is "0..7" */
+ int id = kobj->name[0] & 0x7;
+
+ return sysfs_emit(buf, "%d\n", test_bit(id, &active));
+}
+
+static struct kobj_attribute aux_clock_enable_attr = __ATTR_RW(aux_clock_enable);
+
+static struct attribute *aux_clock_enable_attrs[] = {
+ &aux_clock_enable_attr.attr,
+ NULL
+};
+
+static const struct attribute_group aux_clock_enable_attr_group = {
+ .attrs = aux_clock_enable_attrs,
+};
+
+static int __init tk_aux_sysfs_init(void)
+{
+ struct kobject *auxo, *tko = kobject_create_and_add("time", kernel_kobj);
+ int ret = -ENOMEM;
+
+ if (!tko)
+ return ret;
+
+ auxo = kobject_create_and_add("aux_clocks", tko);
+ if (!auxo)
+ goto err_clean;
+
+ for (int i = 0; i < MAX_AUX_CLOCKS; i++) {
+ char id[2] = { [0] = '0' + i, };
+ struct kobject *clk = kobject_create_and_add(id, auxo);
+
+ if (!clk) {
+ ret = -ENOMEM;
+ goto err_clean;
+ }
+
+ ret = sysfs_create_group(clk, &aux_clock_enable_attr_group);
+ if (ret)
+ goto err_clean;
+ }
+ return 0;
+
+err_clean:
+ kobject_put(auxo);
+ kobject_put(tko);
+ return ret;
+}
+late_initcall(tk_aux_sysfs_init);
+
+static __init void tk_aux_setup(void)
+{
+ for (int i = TIMEKEEPER_AUX_FIRST; i <= TIMEKEEPER_AUX_LAST; i++)
+ tkd_basic_setup(&timekeeper_data[i], i, false);
+}
+#endif /* CONFIG_POSIX_AUX_CLOCKS */
diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h
index 8c9079108ffb..973ede670a36 100644
--- a/kernel/time/timekeeping_internal.h
+++ b/kernel/time/timekeeping_internal.h
@@ -45,4 +45,7 @@ static inline u64 clocksource_delta(u64 now, u64 last, u64 mask, u64 max_delta)
unsigned long timekeeper_lock_irqsave(void);
void timekeeper_unlock_irqrestore(unsigned long flags);
+/* NTP specific interface to access the current seconds value */
+long ktime_get_ntp_seconds(unsigned int id);
+
#endif /* _TIMEKEEPING_INTERNAL_H */
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 4d915c0a263c..1f2364126894 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -386,32 +386,6 @@ static unsigned long round_jiffies_common(unsigned long j, int cpu,
}
/**
- * __round_jiffies - function to round jiffies to a full second
- * @j: the time in (absolute) jiffies that should be rounded
- * @cpu: the processor number on which the timeout will happen
- *
- * __round_jiffies() rounds an absolute time in the future (in jiffies)
- * up or down to (approximately) full seconds. This is useful for timers
- * for which the exact time they fire does not matter too much, as long as
- * they fire approximately every X seconds.
- *
- * By rounding these timers to whole seconds, all such timers will fire
- * at the same time, rather than at various times spread out. The goal
- * of this is to have the CPU wake up less, which saves power.
- *
- * The exact rounding is skewed for each processor to avoid all
- * processors firing at the exact same time, which could lead
- * to lock contention or spurious cache line bouncing.
- *
- * The return value is the rounded version of the @j parameter.
- */
-unsigned long __round_jiffies(unsigned long j, int cpu)
-{
- return round_jiffies_common(j, cpu, false);
-}
-EXPORT_SYMBOL_GPL(__round_jiffies);
-
-/**
* __round_jiffies_relative - function to round jiffies to a full second
* @j: the time in (relative) jiffies that should be rounded
* @cpu: the processor number on which the timeout will happen
@@ -483,22 +457,6 @@ unsigned long round_jiffies_relative(unsigned long j)
EXPORT_SYMBOL_GPL(round_jiffies_relative);
/**
- * __round_jiffies_up - function to round jiffies up to a full second
- * @j: the time in (absolute) jiffies that should be rounded
- * @cpu: the processor number on which the timeout will happen
- *
- * This is the same as __round_jiffies() except that it will never
- * round down. This is useful for timeouts for which the exact time
- * of firing does not matter too much, as long as they don't fire too
- * early.
- */
-unsigned long __round_jiffies_up(unsigned long j, int cpu)
-{
- return round_jiffies_common(j, cpu, true);
-}
-EXPORT_SYMBOL_GPL(__round_jiffies_up);
-
-/**
* __round_jiffies_up_relative - function to round jiffies up to a full second
* @j: the time in (relative) jiffies that should be rounded
* @cpu: the processor number on which the timeout will happen
@@ -850,7 +808,7 @@ static void do_init_timer(struct timer_list *timer,
unsigned int flags,
const char *name, struct lock_class_key *key);
-void init_timer_on_stack_key(struct timer_list *timer,
+void timer_init_key_on_stack(struct timer_list *timer,
void (*func)(struct timer_list *),
unsigned int flags,
const char *name, struct lock_class_key *key)
@@ -858,13 +816,13 @@ void init_timer_on_stack_key(struct timer_list *timer,
debug_object_init_on_stack(timer, &timer_debug_descr);
do_init_timer(timer, func, flags, name, key);
}
-EXPORT_SYMBOL_GPL(init_timer_on_stack_key);
+EXPORT_SYMBOL_GPL(timer_init_key_on_stack);
-void destroy_timer_on_stack(struct timer_list *timer)
+void timer_destroy_on_stack(struct timer_list *timer)
{
debug_object_free(timer, &timer_debug_descr);
}
-EXPORT_SYMBOL_GPL(destroy_timer_on_stack);
+EXPORT_SYMBOL_GPL(timer_destroy_on_stack);
#else
static inline void debug_timer_init(struct timer_list *timer) { }
@@ -904,7 +862,7 @@ static void do_init_timer(struct timer_list *timer,
}
/**
- * init_timer_key - initialize a timer
+ * timer_init_key - initialize a timer
* @timer: the timer to be initialized
* @func: timer callback function
* @flags: timer flags
@@ -912,17 +870,17 @@ static void do_init_timer(struct timer_list *timer,
* @key: lockdep class key of the fake lock used for tracking timer
* sync lock dependencies
*
- * init_timer_key() must be done to a timer prior to calling *any* of the
+ * timer_init_key() must be done to a timer prior to calling *any* of the
* other timer functions.
*/
-void init_timer_key(struct timer_list *timer,
+void timer_init_key(struct timer_list *timer,
void (*func)(struct timer_list *), unsigned int flags,
const char *name, struct lock_class_key *key)
{
debug_init(timer);
do_init_timer(timer, func, flags, name, key);
}
-EXPORT_SYMBOL(init_timer_key);
+EXPORT_SYMBOL(timer_init_key);
static inline void detach_timer(struct timer_list *timer, bool clear_pending)
{
@@ -1500,10 +1458,11 @@ static int __try_to_del_timer_sync(struct timer_list *timer, bool shutdown)
base = lock_timer_base(timer, &flags);
- if (base->running_timer != timer)
+ if (base->running_timer != timer) {
ret = detach_if_pending(timer, base, true);
- if (shutdown)
- timer->function = NULL;
+ if (shutdown)
+ timer->function = NULL;
+ }
raw_spin_unlock_irqrestore(&base->lock, flags);
@@ -1511,7 +1470,7 @@ static int __try_to_del_timer_sync(struct timer_list *timer, bool shutdown)
}
/**
- * try_to_del_timer_sync - Try to deactivate a timer
+ * timer_delete_sync_try - Try to deactivate a timer
* @timer: Timer to deactivate
*
* This function tries to deactivate a timer. On success the timer is not
@@ -1526,11 +1485,11 @@ static int __try_to_del_timer_sync(struct timer_list *timer, bool shutdown)
* * %1 - The timer was pending and deactivated
* * %-1 - The timer callback function is running on a different CPU
*/
-int try_to_del_timer_sync(struct timer_list *timer)
+int timer_delete_sync_try(struct timer_list *timer)
{
return __try_to_del_timer_sync(timer, false);
}
-EXPORT_SYMBOL(try_to_del_timer_sync);
+EXPORT_SYMBOL(timer_delete_sync_try);
#ifdef CONFIG_PREEMPT_RT
static __init void timer_base_init_expiry_lock(struct timer_base *base)
@@ -1900,7 +1859,7 @@ static void timer_recalc_next_expiry(struct timer_base *base)
unsigned long clk, next, adj;
unsigned lvl, offset = 0;
- next = base->clk + NEXT_TIMER_MAX_DELTA;
+ next = base->clk + TIMER_NEXT_MAX_DELTA;
clk = base->clk;
for (lvl = 0; lvl < LVL_DEPTH; lvl++, offset += LVL_SIZE) {
int pos = next_pending_bucket(base, offset, clk & LVL_MASK);
@@ -1963,7 +1922,7 @@ static void timer_recalc_next_expiry(struct timer_base *base)
WRITE_ONCE(base->next_expiry, next);
base->next_expiry_recalc = false;
- base->timers_pending = !(next == base->clk + NEXT_TIMER_MAX_DELTA);
+ base->timers_pending = !(next == base->clk + TIMER_NEXT_MAX_DELTA);
}
#ifdef CONFIG_NO_HZ_COMMON
@@ -2015,7 +1974,7 @@ static unsigned long next_timer_interrupt(struct timer_base *base,
* easy comparable to find out which base holds the first pending timer.
*/
if (!base->timers_pending)
- WRITE_ONCE(base->next_expiry, basej + NEXT_TIMER_MAX_DELTA);
+ WRITE_ONCE(base->next_expiry, basej + TIMER_NEXT_MAX_DELTA);
return base->next_expiry;
}
@@ -2399,7 +2358,7 @@ static inline void __run_timers(struct timer_base *base)
* timer at this clk are that all matching timers have been
* dequeued or no timer has been queued since
* base::next_expiry was set to base::clk +
- * NEXT_TIMER_MAX_DELTA.
+ * TIMER_NEXT_MAX_DELTA.
*/
WARN_ON_ONCE(!levels && !base->next_expiry_recalc
&& base->timers_pending);
@@ -2514,7 +2473,7 @@ void update_process_times(int user_tick)
run_local_timers();
rcu_sched_clock_irq(user_tick);
#ifdef CONFIG_IRQ_WORK
- if (in_irq())
+ if (in_hardirq())
irq_work_tick();
#endif
sched_tick();
@@ -2544,7 +2503,7 @@ int timers_prepare_cpu(unsigned int cpu)
for (b = 0; b < NR_BASES; b++) {
base = per_cpu_ptr(&timer_bases[b], cpu);
base->clk = jiffies;
- base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA;
+ base->next_expiry = base->clk + TIMER_NEXT_MAX_DELTA;
base->next_expiry_recalc = false;
base->timers_pending = false;
base->is_idle = false;
@@ -2599,7 +2558,7 @@ static void __init init_timer_cpu(int cpu)
base->cpu = cpu;
raw_spin_lock_init(&base->lock);
base->clk = jiffies;
- base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA;
+ base->next_expiry = base->clk + TIMER_NEXT_MAX_DELTA;
timer_base_init_expiry_lock(base);
}
}
@@ -2612,7 +2571,7 @@ static void __init init_timer_cpus(void)
init_timer_cpu(cpu);
}
-void __init init_timers(void)
+void __init timers_init(void)
{
init_timer_cpus();
posix_cputimers_init_work();
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index b03d0ada6469..488e47e96e93 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -102,8 +102,6 @@ print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now)
SEQ_printf(m, " .index: %d\n", base->index);
SEQ_printf(m, " .resolution: %u nsecs\n", hrtimer_resolution);
-
- SEQ_printf(m, " .get_time: %ps\n", base->get_time);
#ifdef CONFIG_HIGH_RES_TIMERS
SEQ_printf(m, " .offset: %Lu nsecs\n",
(unsigned long long) ktime_to_ns(base->offset));
diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c
index 2f6330831f08..18dda1aa782d 100644
--- a/kernel/time/timer_migration.c
+++ b/kernel/time/timer_migration.c
@@ -10,6 +10,7 @@
#include <linux/spinlock.h>
#include <linux/timerqueue.h>
#include <trace/events/ipi.h>
+#include <linux/sched/isolation.h>
#include "timer_migration.h"
#include "tick-internal.h"
@@ -420,14 +421,54 @@ static struct list_head *tmigr_level_list __read_mostly;
static unsigned int tmigr_hierarchy_levels __read_mostly;
static unsigned int tmigr_crossnode_level __read_mostly;
+static struct tmigr_group *tmigr_root;
+
static DEFINE_PER_CPU(struct tmigr_cpu, tmigr_cpu);
+/*
+ * CPUs available for timer migration.
+ * Protected by cpuset_mutex (with cpus_read_lock held) or cpus_write_lock.
+ * Additionally tmigr_available_mutex serializes set/clear operations with each other.
+ */
+static cpumask_var_t tmigr_available_cpumask;
+static DEFINE_MUTEX(tmigr_available_mutex);
+
+/* Enabled during late initcall */
+static DEFINE_STATIC_KEY_FALSE(tmigr_exclude_isolated);
+
#define TMIGR_NONE 0xFF
#define BIT_CNT 8
static inline bool tmigr_is_not_available(struct tmigr_cpu *tmc)
{
- return !(tmc->tmgroup && tmc->online);
+ return !(tmc->tmgroup && tmc->available);
+}
+
+/*
+ * Returns true if @cpu should be excluded from the hierarchy as isolated.
+ * Domain isolated CPUs don't participate in timer migration, nohz_full CPUs
+ * are still part of the hierarchy but become idle (from a tick and timer
+ * migration perspective) when they stop their tick. This lets the timekeeping
+ * CPU handle their global timers. Marking also isolated CPUs as idle would be
+ * too costly, hence they are completely excluded from the hierarchy.
+ * This check is necessary, for instance, to prevent offline isolated CPUs from
+ * being incorrectly marked as available once getting back online.
+ *
+ * This function returns false during early boot and the isolation logic is
+ * enabled only after isolated CPUs are marked as unavailable at late boot.
+ * The tick CPU can be isolated at boot, however we cannot mark it as
+ * unavailable to avoid having no global migrator for the nohz_full CPUs. This
+ * should be ensured by the callers of this function: implicitly from hotplug
+ * callbacks and explicitly in tmigr_init_isolation() and
+ * tmigr_isolated_exclude_cpumask().
+ */
+static inline bool tmigr_is_isolated(int cpu)
+{
+ if (!static_branch_unlikely(&tmigr_exclude_isolated))
+ return false;
+ return (!housekeeping_cpu(cpu, HK_TYPE_DOMAIN) ||
+ cpuset_cpu_is_isolated(cpu)) &&
+ housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE);
}
/*
@@ -502,11 +543,6 @@ static bool tmigr_check_lonely(struct tmigr_group *group)
* @now: timer base monotonic
* @check: is set if there is the need to handle remote timers;
* required in tmigr_requires_handle_remote() only
- * @tmc_active: this flag indicates, whether the CPU which triggers
- * the hierarchy walk is !idle in the timer migration
- * hierarchy. When the CPU is idle and the whole hierarchy is
- * idle, only the first event of the top level has to be
- * considered.
*/
struct tmigr_walk {
u64 nextexp;
@@ -517,16 +553,13 @@ struct tmigr_walk {
unsigned long basej;
u64 now;
bool check;
- bool tmc_active;
};
typedef bool (*up_f)(struct tmigr_group *, struct tmigr_group *, struct tmigr_walk *);
-static void __walk_groups(up_f up, struct tmigr_walk *data,
- struct tmigr_cpu *tmc)
+static void __walk_groups_from(up_f up, struct tmigr_walk *data,
+ struct tmigr_group *child, struct tmigr_group *group)
{
- struct tmigr_group *child = NULL, *group = tmc->tmgroup;
-
do {
WARN_ON_ONCE(group->level >= tmigr_hierarchy_levels);
@@ -544,6 +577,12 @@ static void __walk_groups(up_f up, struct tmigr_walk *data,
} while (group);
}
+static void __walk_groups(up_f up, struct tmigr_walk *data,
+ struct tmigr_cpu *tmc)
+{
+ __walk_groups_from(up, data, NULL, tmc->tmgroup);
+}
+
static void walk_groups(up_f up, struct tmigr_walk *data, struct tmigr_cpu *tmc)
{
lockdep_assert_held(&tmc->lock);
@@ -708,7 +747,7 @@ void tmigr_cpu_activate(void)
/*
* Returns true, if there is nothing to be propagated to the next level
*
- * @data->firstexp is set to expiry of first gobal event of the (top level of
+ * @data->firstexp is set to expiry of first global event of the (top level of
* the) hierarchy, but only when hierarchy is completely idle.
*
* The child and group states need to be read under the lock, to prevent a race
@@ -926,7 +965,7 @@ static void tmigr_handle_remote_cpu(unsigned int cpu, u64 now,
* updated the event takes care when hierarchy is completely
* idle. Otherwise the migrator does it as the event is enqueued.
*/
- if (!tmc->online || tmc->remote || tmc->cpuevt.ignore ||
+ if (!tmc->available || tmc->remote || tmc->cpuevt.ignore ||
now < tmc->cpuevt.nextevt.expires) {
raw_spin_unlock_irq(&tmc->lock);
return;
@@ -973,7 +1012,7 @@ static void tmigr_handle_remote_cpu(unsigned int cpu, u64 now,
* (See also section "Required event and timerqueue update after a
* remote expiry" in the documentation at the top)
*/
- if (!tmc->online || !tmc->idle) {
+ if (!tmc->available || !tmc->idle) {
timer_unlock_remote_bases(cpu);
goto unlock;
}
@@ -1113,15 +1152,6 @@ static bool tmigr_requires_handle_remote_up(struct tmigr_group *group,
*/
if (!tmigr_check_migrator(group, childmask))
return true;
-
- /*
- * When there is a parent group and the CPU which triggered the
- * hierarchy walk is not active, proceed the walk to reach the top level
- * group before reading the next_expiry value.
- */
- if (group->parent && !data->tmc_active)
- return false;
-
/*
* The lock is required on 32bit architectures to read the variable
* consistently with a concurrent writer. On 64bit the lock is not
@@ -1166,7 +1196,6 @@ bool tmigr_requires_handle_remote(void)
data.now = get_jiffies_update(&jif);
data.childmask = tmc->groupmask;
data.firstexp = KTIME_MAX;
- data.tmc_active = !tmc->idle;
data.check = false;
/*
@@ -1405,23 +1434,20 @@ u64 tmigr_quick_check(u64 nextevt)
return KTIME_MAX;
do {
- if (!tmigr_check_lonely(group)) {
+ if (!tmigr_check_lonely(group))
return KTIME_MAX;
- } else {
- /*
- * Since current CPU is active, events may not be sorted
- * from bottom to the top because the CPU's event is ignored
- * up to the top and its sibling's events not propagated upwards.
- * Thus keep track of the lowest observed expiry.
- */
- nextevt = min_t(u64, nextevt, READ_ONCE(group->next_expiry));
- if (!group->parent)
- return nextevt;
- }
+
+ /*
+ * Since current CPU is active, events may not be sorted
+ * from bottom to the top because the CPU's event is ignored
+ * up to the top and its sibling's events not propagated upwards.
+ * Thus keep track of the lowest observed expiry.
+ */
+ nextevt = min_t(u64, nextevt, READ_ONCE(group->next_expiry));
group = group->parent;
} while (group);
- return KTIME_MAX;
+ return nextevt;
}
/*
@@ -1435,38 +1461,43 @@ static long tmigr_trigger_active(void *unused)
{
struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu);
- WARN_ON_ONCE(!tmc->online || tmc->idle);
+ WARN_ON_ONCE(!tmc->available || tmc->idle);
return 0;
}
-static int tmigr_cpu_offline(unsigned int cpu)
+static int tmigr_clear_cpu_available(unsigned int cpu)
{
struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu);
int migrator;
u64 firstexp;
- raw_spin_lock_irq(&tmc->lock);
- tmc->online = false;
- WRITE_ONCE(tmc->wakeup, KTIME_MAX);
+ guard(mutex)(&tmigr_available_mutex);
- /*
- * CPU has to handle the local events on his own, when on the way to
- * offline; Therefore nextevt value is set to KTIME_MAX
- */
- firstexp = __tmigr_cpu_deactivate(tmc, KTIME_MAX);
- trace_tmigr_cpu_offline(tmc);
- raw_spin_unlock_irq(&tmc->lock);
+ cpumask_clear_cpu(cpu, tmigr_available_cpumask);
+ scoped_guard(raw_spinlock_irq, &tmc->lock) {
+ if (!tmc->available)
+ return 0;
+ tmc->available = false;
+ WRITE_ONCE(tmc->wakeup, KTIME_MAX);
+
+ /*
+ * CPU has to handle the local events on its own, when on the way to
+ * offline; therefore the nextevt value is set to KTIME_MAX
+ */
+ firstexp = __tmigr_cpu_deactivate(tmc, KTIME_MAX);
+ trace_tmigr_cpu_unavailable(tmc);
+ }
if (firstexp != KTIME_MAX) {
- migrator = cpumask_any_but(cpu_online_mask, cpu);
+ migrator = cpumask_any(tmigr_available_cpumask);
work_on_cpu(migrator, tmigr_trigger_active, NULL);
}
return 0;
}
-static int tmigr_cpu_online(unsigned int cpu)
+static int tmigr_set_cpu_available(unsigned int cpu)
{
struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu);
@@ -1474,16 +1505,123 @@ static int tmigr_cpu_online(unsigned int cpu)
if (WARN_ON_ONCE(!tmc->tmgroup))
return -EINVAL;
- raw_spin_lock_irq(&tmc->lock);
- trace_tmigr_cpu_online(tmc);
- tmc->idle = timer_base_is_idle();
- if (!tmc->idle)
- __tmigr_cpu_activate(tmc);
- tmc->online = true;
- raw_spin_unlock_irq(&tmc->lock);
+ if (tmigr_is_isolated(cpu))
+ return 0;
+
+ guard(mutex)(&tmigr_available_mutex);
+
+ cpumask_set_cpu(cpu, tmigr_available_cpumask);
+ scoped_guard(raw_spinlock_irq, &tmc->lock) {
+ if (tmc->available)
+ return 0;
+ trace_tmigr_cpu_available(tmc);
+ tmc->idle = timer_base_is_idle();
+ if (!tmc->idle)
+ __tmigr_cpu_activate(tmc);
+ tmc->available = true;
+ }
+ return 0;
+}
+
+static void tmigr_cpu_isolate(struct work_struct *ignored)
+{
+ tmigr_clear_cpu_available(smp_processor_id());
+}
+
+static void tmigr_cpu_unisolate(struct work_struct *ignored)
+{
+ tmigr_set_cpu_available(smp_processor_id());
+}
+
+/**
+ * tmigr_isolated_exclude_cpumask - Exclude given CPUs from hierarchy
+ * @exclude_cpumask: the cpumask to be excluded from timer migration hierarchy
+ *
+ * This function can be called from cpuset code to provide the new set of
+ * isolated CPUs that should be excluded from the hierarchy.
+ * Online CPUs not present in exclude_cpumask but already excluded are brought
+ * back to the hierarchy.
+ * Functions to isolate/unisolate need to be called locally and can sleep.
+ */
+int tmigr_isolated_exclude_cpumask(struct cpumask *exclude_cpumask)
+{
+ struct work_struct __percpu *works __free(free_percpu) =
+ alloc_percpu(struct work_struct);
+ cpumask_var_t cpumask __free(free_cpumask_var) = CPUMASK_VAR_NULL;
+ int cpu;
+
+ lockdep_assert_cpus_held();
+
+ if (!works)
+ return -ENOMEM;
+ if (!alloc_cpumask_var(&cpumask, GFP_KERNEL))
+ return -ENOMEM;
+
+ /*
+ * First set previously isolated CPUs as available (unisolate).
+ * This cpumask contains only CPUs that switched to available now.
+ */
+ cpumask_andnot(cpumask, cpu_online_mask, exclude_cpumask);
+ cpumask_andnot(cpumask, cpumask, tmigr_available_cpumask);
+
+ for_each_cpu(cpu, cpumask) {
+ struct work_struct *work = per_cpu_ptr(works, cpu);
+
+ INIT_WORK(work, tmigr_cpu_unisolate);
+ schedule_work_on(cpu, work);
+ }
+ for_each_cpu(cpu, cpumask)
+ flush_work(per_cpu_ptr(works, cpu));
+
+ /*
+ * Then clear previously available CPUs (isolate).
+ * This cpumask contains only CPUs that switched to not available now.
+ * There cannot be overlap with the newly available ones.
+ */
+ cpumask_and(cpumask, exclude_cpumask, tmigr_available_cpumask);
+ cpumask_and(cpumask, cpumask, housekeeping_cpumask(HK_TYPE_KERNEL_NOISE));
+ /*
+ * Handle this here and not in the cpuset code because exclude_cpumask
+ * might include also the tick CPU if included in isolcpus.
+ */
+ for_each_cpu(cpu, cpumask) {
+ if (!tick_nohz_cpu_hotpluggable(cpu)) {
+ cpumask_clear_cpu(cpu, cpumask);
+ break;
+ }
+ }
+
+ for_each_cpu(cpu, cpumask) {
+ struct work_struct *work = per_cpu_ptr(works, cpu);
+
+ INIT_WORK(work, tmigr_cpu_isolate);
+ schedule_work_on(cpu, work);
+ }
+ for_each_cpu(cpu, cpumask)
+ flush_work(per_cpu_ptr(works, cpu));
+
return 0;
}
+static int __init tmigr_init_isolation(void)
+{
+ cpumask_var_t cpumask __free(free_cpumask_var) = CPUMASK_VAR_NULL;
+
+ static_branch_enable(&tmigr_exclude_isolated);
+
+ if (!housekeeping_enabled(HK_TYPE_DOMAIN))
+ return 0;
+ if (!alloc_cpumask_var(&cpumask, GFP_KERNEL))
+ return -ENOMEM;
+
+ cpumask_andnot(cpumask, cpu_possible_mask, housekeeping_cpumask(HK_TYPE_DOMAIN));
+
+ /* Protect against RCU torture hotplug testing */
+ guard(cpus_read_lock)();
+ return tmigr_isolated_exclude_cpumask(cpumask);
+}
+late_initcall(tmigr_init_isolation);
+
static void tmigr_init_group(struct tmigr_group *group, unsigned int lvl,
int node)
{
@@ -1501,21 +1639,6 @@ static void tmigr_init_group(struct tmigr_group *group, unsigned int lvl,
s.seq = 0;
atomic_set(&group->migr_state, s.state);
- /*
- * If this is a new top-level, prepare its groupmask in advance.
- * This avoids accidents where yet another new top-level is
- * created in the future and made visible before the current groupmask.
- */
- if (list_empty(&tmigr_level_list[lvl])) {
- group->groupmask = BIT(0);
- /*
- * The previous top level has prepared its groupmask already,
- * simply account it as the first child.
- */
- if (lvl > 0)
- group->num_children = 1;
- }
-
timerqueue_init_head(&group->events);
timerqueue_init(&group->groupevt.nextevt);
group->groupevt.nextevt.expires = KTIME_MAX;
@@ -1523,8 +1646,7 @@ static void tmigr_init_group(struct tmigr_group *group, unsigned int lvl,
group->groupevt.ignore = true;
}
-static struct tmigr_group *tmigr_get_group(unsigned int cpu, int node,
- unsigned int lvl)
+static struct tmigr_group *tmigr_get_group(int node, unsigned int lvl)
{
struct tmigr_group *tmp, *group = NULL;
@@ -1570,25 +1692,51 @@ static struct tmigr_group *tmigr_get_group(unsigned int cpu, int node,
return group;
}
+static bool tmigr_init_root(struct tmigr_group *group, bool activate)
+{
+ if (!group->parent && group != tmigr_root) {
+ /*
+ * This is the new top-level, prepare its groupmask in advance
+ * to avoid accidents where yet another new top-level is
+ * created in the future and made visible before this groupmask.
+ */
+ group->groupmask = BIT(0);
+ WARN_ON_ONCE(activate);
+
+ return true;
+ }
+
+ return false;
+
+}
+
static void tmigr_connect_child_parent(struct tmigr_group *child,
struct tmigr_group *parent,
bool activate)
{
- struct tmigr_walk data;
-
- raw_spin_lock_irq(&child->lock);
- raw_spin_lock_nested(&parent->lock, SINGLE_DEPTH_NESTING);
+ if (tmigr_init_root(parent, activate)) {
+ /*
+ * The previous top level had prepared its groupmask already,
+ * simply account it in advance as the first child. If some groups
+ * have been created between the old and new root due to node
+ * mismatch, the new root's child will be initialized accordingly.
+ */
+ parent->num_children = 1;
+ }
- if (activate) {
+ /* Connecting old root to new root ? */
+ if (!parent->parent && activate) {
/*
- * @child is the old top and @parent the new one. In this
- * case groupmask is pre-initialized and @child already
- * accounted, along with its new sibling corresponding to the
- * CPU going up.
+ * @child is the old top, or in case of node mismatch, some
+ * intermediate group between the old top and the new one in
+ * @parent. In this case the @child must be pre-accounted above
+ * as the first child. Its new inactive sibling corresponding
+ * to the CPU going up has been accounted as the second child.
*/
- WARN_ON_ONCE(child->groupmask != BIT(0) || parent->num_children != 2);
+ WARN_ON_ONCE(parent->num_children != 2);
+ child->groupmask = BIT(0);
} else {
- /* Adding @child for the CPU going up to @parent. */
+ /* Common case adding @child for the CPU going up to @parent. */
child->groupmask = BIT(parent->num_children++);
}
@@ -1599,87 +1747,61 @@ static void tmigr_connect_child_parent(struct tmigr_group *child,
*/
smp_store_release(&child->parent, parent);
- raw_spin_unlock(&parent->lock);
- raw_spin_unlock_irq(&child->lock);
-
trace_tmigr_connect_child_parent(child);
-
- if (!activate)
- return;
-
- /*
- * To prevent inconsistent states, active children need to be active in
- * the new parent as well. Inactive children are already marked inactive
- * in the parent group:
- *
- * * When new groups were created by tmigr_setup_groups() starting from
- * the lowest level (and not higher then one level below the current
- * top level), then they are not active. They will be set active when
- * the new online CPU comes active.
- *
- * * But if a new group above the current top level is required, it is
- * mandatory to propagate the active state of the already existing
- * child to the new parent. So tmigr_connect_child_parent() is
- * executed with the formerly top level group (child) and the newly
- * created group (parent).
- *
- * * It is ensured that the child is active, as this setup path is
- * executed in hotplug prepare callback. This is exectued by an
- * already connected and !idle CPU. Even if all other CPUs go idle,
- * the CPU executing the setup will be responsible up to current top
- * level group. And the next time it goes inactive, it will release
- * the new childmask and parent to subsequent walkers through this
- * @child. Therefore propagate active state unconditionally.
- */
- data.childmask = child->groupmask;
-
- /*
- * There is only one new level per time (which is protected by
- * tmigr_mutex). When connecting the child and the parent and set the
- * child active when the parent is inactive, the parent needs to be the
- * uppermost level. Otherwise there went something wrong!
- */
- WARN_ON(!tmigr_active_up(parent, child, &data) && parent->parent);
}
-static int tmigr_setup_groups(unsigned int cpu, unsigned int node)
+static int tmigr_setup_groups(unsigned int cpu, unsigned int node,
+ struct tmigr_group *start, bool activate)
{
struct tmigr_group *group, *child, **stack;
- int top = 0, err = 0, i = 0;
- struct list_head *lvllist;
+ int i, top = 0, err = 0, start_lvl = 0;
+ bool root_mismatch = false;
stack = kcalloc(tmigr_hierarchy_levels, sizeof(*stack), GFP_KERNEL);
if (!stack)
return -ENOMEM;
- do {
- group = tmigr_get_group(cpu, node, i);
+ if (start) {
+ stack[start->level] = start;
+ start_lvl = start->level + 1;
+ }
+
+ if (tmigr_root)
+ root_mismatch = tmigr_root->numa_node != node;
+
+ for (i = start_lvl; i < tmigr_hierarchy_levels; i++) {
+ group = tmigr_get_group(node, i);
if (IS_ERR(group)) {
err = PTR_ERR(group);
+ i--;
break;
}
top = i;
- stack[i++] = group;
+ stack[i] = group;
/*
* When booting only less CPUs of a system than CPUs are
- * available, not all calculated hierarchy levels are required.
+ * available, not all calculated hierarchy levels are required,
+ * unless a node mismatch is detected.
*
* The loop is aborted as soon as the highest level, which might
* be different from tmigr_hierarchy_levels, contains only a
- * single group.
+ * single group, unless the nodes mismatch below tmigr_crossnode_level
*/
- if (group->parent || list_is_singular(&tmigr_level_list[i - 1]))
+ if (group->parent)
break;
+ if ((!root_mismatch || i >= tmigr_crossnode_level) &&
+ list_is_singular(&tmigr_level_list[i]))
+ break;
+ }
- } while (i < tmigr_hierarchy_levels);
-
- /* Assert single root */
- WARN_ON_ONCE(!err && !group->parent && !list_is_singular(&tmigr_level_list[top]));
+ /* Assert single root without parent */
+ if (WARN_ON_ONCE(i >= tmigr_hierarchy_levels))
+ return -EINVAL;
- while (i > 0) {
- group = stack[--i];
+ for (; i >= start_lvl; i--) {
+ group = stack[i];
if (err < 0) {
list_del(&group->list);
@@ -1695,12 +1817,10 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node)
if (i == 0) {
struct tmigr_cpu *tmc = per_cpu_ptr(&tmigr_cpu, cpu);
- raw_spin_lock_irq(&group->lock);
-
tmc->tmgroup = group;
tmc->groupmask = BIT(group->num_children++);
- raw_spin_unlock_irq(&group->lock);
+ tmigr_init_root(group, activate);
trace_tmigr_connect_cpu_parent(tmc);
@@ -1708,42 +1828,58 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node)
continue;
} else {
child = stack[i - 1];
- /* Will be activated at online time */
- tmigr_connect_child_parent(child, group, false);
+ tmigr_connect_child_parent(child, group, activate);
}
+ }
- /* check if uppermost level was newly created */
- if (top != i)
- continue;
-
- WARN_ON_ONCE(top == 0);
+ if (err < 0)
+ goto out;
- lvllist = &tmigr_level_list[top];
+ if (activate) {
+ struct tmigr_walk data;
+ union tmigr_state state;
/*
- * Newly created root level should have accounted the upcoming
- * CPU's child group and pre-accounted the old root.
+ * To prevent inconsistent states, active children need to be active in
+ * the new parent as well. Inactive children are already marked inactive
+ * in the parent group:
+ *
+ * * When new groups were created by tmigr_setup_groups() starting from
+ * the lowest level, then they are not active. They will be set active
+ * when the new online CPU comes active.
+ *
+ * * But if new groups above the current top level are required, it is
+ * mandatory to propagate the active state of the already existing
+ * child to the new parents. So tmigr_active_up() activates the
+ * new parents while walking up from the old root to the new.
+ *
+ * * It is ensured that @start is active, as this setup path is
+ * executed in hotplug prepare callback. This is executed by an
+ * already connected and !idle CPU. Even if all other CPUs go idle,
+ * the CPU executing the setup will be responsible up to current top
+ * level group. And the next time it goes inactive, it will release
+ * the new childmask and parent to subsequent walkers through this
+ * @child. Therefore propagate active state unconditionally.
*/
- if (group->num_children == 2 && list_is_singular(lvllist)) {
- /*
- * The target CPU must never do the prepare work, except
- * on early boot when the boot CPU is the target. Otherwise
- * it may spuriously activate the old top level group inside
- * the new one (nevertheless whether old top level group is
- * active or not) and/or release an uninitialized childmask.
- */
- WARN_ON_ONCE(cpu == raw_smp_processor_id());
-
- lvllist = &tmigr_level_list[top - 1];
- list_for_each_entry(child, lvllist, list) {
- if (child->parent)
- continue;
+ state.state = atomic_read(&start->migr_state);
+ WARN_ON_ONCE(!state.active);
+ WARN_ON_ONCE(!start->parent);
+ data.childmask = start->groupmask;
+ __walk_groups_from(tmigr_active_up, &data, start, start->parent);
+ }
- tmigr_connect_child_parent(child, group, true);
- }
+ /* Root update */
+ if (list_is_singular(&tmigr_level_list[top])) {
+ group = list_first_entry(&tmigr_level_list[top],
+ typeof(*group), list);
+ WARN_ON_ONCE(group->parent);
+ if (tmigr_root) {
+ /* Old root should be the same or below */
+ WARN_ON_ONCE(tmigr_root->level > top);
}
+ tmigr_root = group;
}
-
+out:
kfree(stack);
return err;
@@ -1751,12 +1887,31 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node)
static int tmigr_add_cpu(unsigned int cpu)
{
+ struct tmigr_group *old_root = tmigr_root;
int node = cpu_to_node(cpu);
int ret;
- mutex_lock(&tmigr_mutex);
- ret = tmigr_setup_groups(cpu, node);
- mutex_unlock(&tmigr_mutex);
+ guard(mutex)(&tmigr_mutex);
+
+ ret = tmigr_setup_groups(cpu, node, NULL, false);
+
+ /* Root has changed? Connect the old one to the new */
+ if (ret >= 0 && old_root && old_root != tmigr_root) {
+ /*
+ * The target CPU must never do the prepare work, except
+ * on early boot when the boot CPU is the target. Otherwise
+ * it may spuriously activate the old top level group inside
+ * the new one (regardless of whether the old top level group is
+ * active or not) and/or release an uninitialized childmask.
+ */
+ WARN_ON_ONCE(cpu == raw_smp_processor_id());
+ /*
+ * The (likely) current CPU is expected to be online in the hierarchy,
+ * otherwise the old root may not be active as expected.
+ */
+ WARN_ON_ONCE(!per_cpu_ptr(&tmigr_cpu, raw_smp_processor_id())->available);
+ ret = tmigr_setup_groups(-1, old_root->numa_node, old_root, true);
+ }
return ret;
}
@@ -1801,6 +1956,11 @@ static int __init tmigr_init(void)
if (ncpus == 1)
return 0;
+ if (!zalloc_cpumask_var(&tmigr_available_cpumask, GFP_KERNEL)) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
/*
* Calculate the required hierarchy levels. Unfortunately there is no
* reliable information available, unless all possible CPUs have been
@@ -1850,7 +2010,7 @@ static int __init tmigr_init(void)
goto err;
ret = cpuhp_setup_state(CPUHP_AP_TMIGR_ONLINE, "tmigr:online",
- tmigr_cpu_online, tmigr_cpu_offline);
+ tmigr_set_cpu_available, tmigr_clear_cpu_available);
if (ret)
goto err;
diff --git a/kernel/time/timer_migration.h b/kernel/time/timer_migration.h
index ae19f70f8170..70879cde6fdd 100644
--- a/kernel/time/timer_migration.h
+++ b/kernel/time/timer_migration.h
@@ -97,7 +97,7 @@ struct tmigr_group {
*/
struct tmigr_cpu {
raw_spinlock_t lock;
- bool online;
+ bool available;
bool idle;
bool remote;
struct tmigr_group *tmgroup;
diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c
index 01c2ab1e8971..aa59919b8f2c 100644
--- a/kernel/time/vsyscall.c
+++ b/kernel/time/vsyscall.c
@@ -15,26 +15,25 @@
#include "timekeeping_internal.h"
+static inline void fill_clock_configuration(struct vdso_clock *vc, const struct tk_read_base *base)
+{
+ vc->cycle_last = base->cycle_last;
+#ifdef CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT
+ vc->max_cycles = base->clock->max_cycles;
+#endif
+ vc->mask = base->mask;
+ vc->mult = base->mult;
+ vc->shift = base->shift;
+}
+
static inline void update_vdso_time_data(struct vdso_time_data *vdata, struct timekeeper *tk)
{
struct vdso_clock *vc = vdata->clock_data;
struct vdso_timestamp *vdso_ts;
u64 nsec, sec;
- vc[CS_HRES_COARSE].cycle_last = tk->tkr_mono.cycle_last;
-#ifdef CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT
- vc[CS_HRES_COARSE].max_cycles = tk->tkr_mono.clock->max_cycles;
-#endif
- vc[CS_HRES_COARSE].mask = tk->tkr_mono.mask;
- vc[CS_HRES_COARSE].mult = tk->tkr_mono.mult;
- vc[CS_HRES_COARSE].shift = tk->tkr_mono.shift;
- vc[CS_RAW].cycle_last = tk->tkr_raw.cycle_last;
-#ifdef CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT
- vc[CS_RAW].max_cycles = tk->tkr_raw.clock->max_cycles;
-#endif
- vc[CS_RAW].mask = tk->tkr_raw.mask;
- vc[CS_RAW].mult = tk->tkr_raw.mult;
- vc[CS_RAW].shift = tk->tkr_raw.shift;
+ fill_clock_configuration(&vc[CS_HRES_COARSE], &tk->tkr_mono);
+ fill_clock_configuration(&vc[CS_RAW], &tk->tkr_raw);
/* CLOCK_MONOTONIC */
vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC];
@@ -98,12 +97,12 @@ void update_vsyscall(struct timekeeper *tk)
/* CLOCK_REALTIME_COARSE */
vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_REALTIME_COARSE];
vdso_ts->sec = tk->xtime_sec;
- vdso_ts->nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
+ vdso_ts->nsec = tk->coarse_nsec;
/* CLOCK_MONOTONIC_COARSE */
vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC_COARSE];
vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec;
- nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
+ nsec = tk->coarse_nsec;
nsec = nsec + tk->wall_to_monotonic.tv_nsec;
vdso_ts->sec += __iter_div_u64_rem(nsec, NSEC_PER_SEC, &vdso_ts->nsec);
@@ -119,7 +118,8 @@ void update_vsyscall(struct timekeeper *tk)
if (clock_mode != VDSO_CLOCKMODE_NONE)
update_vdso_time_data(vdata, tk);
- __arch_update_vsyscall(vdata);
+ __arch_update_vdso_clock(&vc[CS_HRES_COARSE]);
+ __arch_update_vdso_clock(&vc[CS_RAW]);
vdso_write_end(vdata);
@@ -136,6 +136,46 @@ void update_vsyscall_tz(void)
__arch_sync_vdso_time_data(vdata);
}
+#ifdef CONFIG_POSIX_AUX_CLOCKS
+void vdso_time_update_aux(struct timekeeper *tk)
+{
+ struct vdso_time_data *vdata = vdso_k_time_data;
+ struct vdso_timestamp *vdso_ts;
+ struct vdso_clock *vc;
+ s32 clock_mode;
+ u64 nsec;
+
+ vc = &vdata->aux_clock_data[tk->id - TIMEKEEPER_AUX_FIRST];
+ vdso_ts = &vc->basetime[VDSO_BASE_AUX];
+ clock_mode = tk->tkr_mono.clock->vdso_clock_mode;
+ if (!tk->clock_valid)
+ clock_mode = VDSO_CLOCKMODE_NONE;
+
+ /* copy vsyscall data */
+ vdso_write_begin_clock(vc);
+
+ vc->clock_mode = clock_mode;
+
+ if (clock_mode != VDSO_CLOCKMODE_NONE) {
+ fill_clock_configuration(vc, &tk->tkr_mono);
+
+ vdso_ts->sec = tk->xtime_sec + tk->monotonic_to_aux.tv_sec;
+
+ nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
+ nsec += tk->monotonic_to_aux.tv_nsec;
+ vdso_ts->sec += __iter_div_u64_rem(nsec, NSEC_PER_SEC, &nsec);
+ nsec = nsec << tk->tkr_mono.shift;
+ vdso_ts->nsec = nsec;
+ }
+
+ __arch_update_vdso_clock(vc);
+
+ vdso_write_end_clock(vc);
+
+ __arch_sync_vdso_time_data(vdata);
+}
+#endif
+
/**
* vdso_update_begin - Start of a VDSO update section
*