diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2025-04-18 13:06:12 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2025-04-18 13:06:12 -0700 |
commit | cb64c513b5fbc5a3290d89cbafcc8f9b07a29a46 (patch) | |
tree | 499cee4ca1fccb61a6eb544956cae7dda8b27abe | |
parent | 4b828867b3949d8e9dd698b906e2be5b7eaad4a5 (diff) | |
parent | f3b25a1b48191048e2f190d878fc3175fc08ffaa (diff) |
Merge tag 'pm-6.15-rc3' of git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm
Pull power management fixes from Rafael Wysocki:
"These are mostly cpufreq fixes, some of which address recent
regressions and some address older issues that have come to light
during the last two weeks, and a runtime PM documentation correction:
- Fix the performance-to-frequency scaling factor computation on
systems using HWP in the intel_pstate driver after a recent
incorrect update of it (Rafael Wysocki)
- Fix the usage of the CPUFREQ_NEED_UPDATE_LIMITS cpufreq driver flag
in the schedutil cpufreq governor after a recent update of it that
has caused frequency limits changes to be missed sometimes (Rafael
Wysocki)
- Address some recently discovered synchronization issues related to
frequency limits changes in the schedutil cpufreq governor and in
the cpufreq core (Rafael Wysocki)
- Fix ITMT support in the amd-pstate cpufreq driver so that it is
enabled after asym priorities have been correctly initialized for
all CPUs (K Prateek Nayak)
- Fix changing min/max limits in the amd-pstate cpufreq driver while
on the performance governor (Dhananjay Ugwekar)
- Fix a function name in the runtime PM documentation that was
previously incorrectly updated by mistake (Sakari Ailus)"
* tag 'pm-6.15-rc3' of git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm:
cpufreq: Avoid using inconsistent policy->min and policy->max
cpufreq/sched: Set need_freq_update in ignore_dl_rate_limit()
cpufreq/sched: Explicitly synchronize limits_changed flag handling
cpufreq/sched: Fix the usage of CPUFREQ_NEED_UPDATE_LIMITS
Documentation: PM: runtime: Fix a reference to pm_runtime_autosuspend()
cpufreq: intel_pstate: Fix hwp_get_cpu_scaling()
cpufreq/amd-pstate: Enable ITMT support after initializing core rankings
cpufreq/amd-pstate: Fix min_limit perf and freq updation for performance governor
-rw-r--r-- | Documentation/power/runtime_pm.rst | 2 | ||||
-rw-r--r-- | drivers/cpufreq/amd-pstate.c | 36 | ||||
-rw-r--r-- | drivers/cpufreq/cpufreq.c | 32 | ||||
-rw-r--r-- | drivers/cpufreq/intel_pstate.c | 2 | ||||
-rw-r--r-- | kernel/sched/cpufreq_schedutil.c | 49 |
5 files changed, 84 insertions, 37 deletions
diff --git a/Documentation/power/runtime_pm.rst b/Documentation/power/runtime_pm.rst index 12f429359a82..63344bea8393 100644 --- a/Documentation/power/runtime_pm.rst +++ b/Documentation/power/runtime_pm.rst @@ -154,7 +154,7 @@ suspending the device are satisfied) and to queue up a suspend request for the device in that case. If there is no idle callback, or if the callback returns 0, then the PM core will attempt to carry out a runtime suspend of the device, also respecting devices configured for autosuspend. In essence this means a -call to __pm_runtime_autosuspend() (do note that drivers needs to update the +call to pm_runtime_autosuspend() (do note that drivers needs to update the device last busy mark, pm_runtime_mark_last_busy(), to control the delay under this circumstance). To prevent this (for example, if the callback routine has started a delayed suspend), the routine must return a non-zero value. Negative diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 6789eed1bb5b..b961f3a3b580 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -607,13 +607,16 @@ static void amd_pstate_update_min_max_limit(struct cpufreq_policy *policy) union perf_cached perf = READ_ONCE(cpudata->perf); perf.max_limit_perf = freq_to_perf(perf, cpudata->nominal_freq, policy->max); - perf.min_limit_perf = freq_to_perf(perf, cpudata->nominal_freq, policy->min); + WRITE_ONCE(cpudata->max_limit_freq, policy->max); - if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) + if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) { perf.min_limit_perf = min(perf.nominal_perf, perf.max_limit_perf); + WRITE_ONCE(cpudata->min_limit_freq, min(cpudata->nominal_freq, cpudata->max_limit_freq)); + } else { + perf.min_limit_perf = freq_to_perf(perf, cpudata->nominal_freq, policy->min); + WRITE_ONCE(cpudata->min_limit_freq, policy->min); + } - WRITE_ONCE(cpudata->max_limit_freq, policy->max); - WRITE_ONCE(cpudata->min_limit_freq, policy->min); WRITE_ONCE(cpudata->perf, perf); } @@ -791,16 +794,6 @@ static void amd_perf_ctl_reset(unsigned int cpu) wrmsrl_on_cpu(cpu, MSR_AMD_PERF_CTL, 0); } -/* - * Set amd-pstate preferred core enable can't be done directly from cpufreq callbacks - * due to locking, so queue the work for later. - */ -static void amd_pstste_sched_prefcore_workfn(struct work_struct *work) -{ - sched_set_itmt_support(); -} -static DECLARE_WORK(sched_prefcore_work, amd_pstste_sched_prefcore_workfn); - #define CPPC_MAX_PERF U8_MAX static void amd_pstate_init_prefcore(struct amd_cpudata *cpudata) @@ -811,14 +804,8 @@ static void amd_pstate_init_prefcore(struct amd_cpudata *cpudata) cpudata->hw_prefcore = true; - /* - * The priorities can be set regardless of whether or not - * sched_set_itmt_support(true) has been called and it is valid to - * update them at any time after it has been called. - */ + /* Priorities must be initialized before ITMT support can be toggled on. */ sched_set_itmt_core_prio((int)READ_ONCE(cpudata->prefcore_ranking), cpudata->cpu); - - schedule_work(&sched_prefcore_work); } static void amd_pstate_update_limits(unsigned int cpu) @@ -1193,6 +1180,9 @@ static ssize_t show_energy_performance_preference( static void amd_pstate_driver_cleanup(void) { + if (amd_pstate_prefcore) + sched_clear_itmt_support(); + cppc_state = AMD_PSTATE_DISABLE; current_pstate_driver = NULL; } @@ -1235,6 +1225,10 @@ static int amd_pstate_register_driver(int mode) return ret; } + /* Enable ITMT support once all CPUs have initialized their asym priorities. */ + if (amd_pstate_prefcore) + sched_set_itmt_support(); + return 0; } diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 3841c9da6cac..acf19b0042bb 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -540,8 +540,6 @@ static unsigned int __resolve_freq(struct cpufreq_policy *policy, { unsigned int idx; - target_freq = clamp_val(target_freq, policy->min, policy->max); - if (!policy->freq_table) return target_freq; @@ -565,7 +563,22 @@ static unsigned int __resolve_freq(struct cpufreq_policy *policy, unsigned int cpufreq_driver_resolve_freq(struct cpufreq_policy *policy, unsigned int target_freq) { - return __resolve_freq(policy, target_freq, CPUFREQ_RELATION_LE); + unsigned int min = READ_ONCE(policy->min); + unsigned int max = READ_ONCE(policy->max); + + /* + * If this function runs in parallel with cpufreq_set_policy(), it may + * read policy->min before the update and policy->max after the update + * or the other way around, so there is no ordering guarantee. + * + * Resolve this by always honoring the max (in case it comes from + * thermal throttling or similar). + */ + if (unlikely(min > max)) + min = max; + + return __resolve_freq(policy, clamp_val(target_freq, min, max), + CPUFREQ_RELATION_LE); } EXPORT_SYMBOL_GPL(cpufreq_driver_resolve_freq); @@ -2384,6 +2397,7 @@ int __cpufreq_driver_target(struct cpufreq_policy *policy, if (cpufreq_disabled()) return -ENODEV; + target_freq = clamp_val(target_freq, policy->min, policy->max); target_freq = __resolve_freq(policy, target_freq, relation); pr_debug("target for CPU %u: %u kHz, relation %u, requested %u kHz\n", @@ -2708,11 +2722,15 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy, * Resolve policy min/max to available frequencies. It ensures * no frequency resolution will neither overshoot the requested maximum * nor undershoot the requested minimum. + * + * Avoid storing intermediate values in policy->max or policy->min and + * compiler optimizations around them because they may be accessed + * concurrently by cpufreq_driver_resolve_freq() during the update. */ - policy->min = new_data.min; - policy->max = new_data.max; - policy->min = __resolve_freq(policy, policy->min, CPUFREQ_RELATION_L); - policy->max = __resolve_freq(policy, policy->max, CPUFREQ_RELATION_H); + WRITE_ONCE(policy->max, __resolve_freq(policy, new_data.max, CPUFREQ_RELATION_H)); + new_data.min = __resolve_freq(policy, new_data.min, CPUFREQ_RELATION_L); + WRITE_ONCE(policy->min, new_data.min > policy->max ? policy->max : new_data.min); + trace_cpu_frequency_limits(policy); cpufreq_update_pressure(policy); diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 4aad79d26c64..f41ed0b9e610 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -2209,7 +2209,7 @@ static int knl_get_turbo_pstate(int cpu) static int hwp_get_cpu_scaling(int cpu) { if (hybrid_scaling_factor) { - struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); + struct cpuinfo_x86 *c = &cpu_data(cpu); u8 cpu_type = c->topo.intel_type; /* diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 1a19d69b91ed..816f07f9d30f 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -81,9 +81,23 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) if (!cpufreq_this_cpu_can_update(sg_policy->policy)) return false; - if (unlikely(sg_policy->limits_changed)) { - sg_policy->limits_changed = false; - sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS); + if (unlikely(READ_ONCE(sg_policy->limits_changed))) { + WRITE_ONCE(sg_policy->limits_changed, false); + sg_policy->need_freq_update = true; + + /* + * The above limits_changed update must occur before the reads + * of policy limits in cpufreq_driver_resolve_freq() or a policy + * limits update might be missed, so use a memory barrier to + * ensure it. + * + * This pairs with the write memory barrier in sugov_limits(). + */ + smp_mb(); + + return true; + } else if (sg_policy->need_freq_update) { + /* ignore_dl_rate_limit() wants a new frequency to be found. */ return true; } @@ -95,10 +109,22 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time, unsigned int next_freq) { - if (sg_policy->need_freq_update) + if (sg_policy->need_freq_update) { sg_policy->need_freq_update = false; - else if (sg_policy->next_freq == next_freq) + /* + * The policy limits have changed, but if the return value of + * cpufreq_driver_resolve_freq() after applying the new limits + * is still equal to the previously selected frequency, the + * driver callback need not be invoked unless the driver + * specifically wants that to happen on every update of the + * policy limits. + */ + if (sg_policy->next_freq == next_freq && + !cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS)) + return false; + } else if (sg_policy->next_freq == next_freq) { return false; + } sg_policy->next_freq = next_freq; sg_policy->last_freq_update_time = time; @@ -365,7 +391,7 @@ static inline bool sugov_hold_freq(struct sugov_cpu *sg_cpu) { return false; } static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu) { if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_min) - sg_cpu->sg_policy->limits_changed = true; + sg_cpu->sg_policy->need_freq_update = true; } static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu, @@ -871,7 +897,16 @@ static void sugov_limits(struct cpufreq_policy *policy) mutex_unlock(&sg_policy->work_lock); } - sg_policy->limits_changed = true; + /* + * The limits_changed update below must take place before the updates + * of policy limits in cpufreq_set_policy() or a policy limits update + * might be missed, so use a memory barrier to ensure it. + * + * This pairs with the memory barrier in sugov_should_update_freq(). + */ + smp_wmb(); + + WRITE_ONCE(sg_policy->limits_changed, true); } struct cpufreq_governor schedutil_gov = { |