From c9ff3637386c6eb72eac55a8b4c9a4972215dbcb Mon Sep 17 00:00:00 2001
From: Marco Crivellari
Date: Fri, 19 Sep 2025 17:30:08 +0200
Subject: [PATCH 01/96] PM: WQ_UNBOUND added to pm_wq workqueue
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently, if a user enqueues a work item using schedule_delayed_work(),
the workqueue used is "system_wq" (a per-CPU wq), while
queue_delayed_work() uses WORK_CPU_UNBOUND (used when a CPU is not
specified). The same applies to schedule_work(), which uses system_wq,
and queue_work(), which again uses WORK_CPU_UNBOUND.

This lack of consistency cannot be addressed without refactoring the
API.

alloc_workqueue() treats all queues as per-CPU by default, while unbound
workqueues must opt-in via WQ_UNBOUND. This default is suboptimal: most
workloads benefit from unbound queues, allowing the scheduler to place
worker threads where they're needed and reducing noise when CPUs are
isolated.

This change adds the WQ_UNBOUND flag to pm_wq, to make it explicit that
this workqueue can be unbound and that it does not benefit from per-CPU
work. Once the migration is complete, WQ_UNBOUND can be removed and
unbound will become the implicit default.

Suggested-by: Tejun Heo
Signed-off-by: Marco Crivellari
Signed-off-by: Rafael J. Wysocki
---
 kernel/power/main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/power/main.c b/kernel/power/main.c
index 3cf2d7e72567..33a47ed15994 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -1056,7 +1056,7 @@ EXPORT_SYMBOL_GPL(pm_wq);
 
 static int __init pm_start_workqueue(void)
 {
-	pm_wq = alloc_workqueue("pm", WQ_FREEZABLE, 0);
+	pm_wq = alloc_workqueue("pm", WQ_FREEZABLE | WQ_UNBOUND, 0);
 
 	return pm_wq ? 0 : -ENOMEM;
 }

From 67434ce57c7eb9a250125e159cb7ef8a3f764d3f Mon Sep 17 00:00:00 2001
From: Kaushlendra Kumar
Date: Mon, 22 Sep 2025 11:22:31 +0530
Subject: [PATCH 02/96] PM: sleep: Replace snprintf() with scnprintf() in
 show_trace_dev_match()

Replace snprintf() with scnprintf() in show_trace_dev_match() to
simplify buffer length handling. The scnprintf() function returns the
number of characters actually written (excluding the null terminator),
which eliminates the need for manual length checking and clamping.

This change removes the redundant size check since scnprintf()
guarantees that the return value will never exceed the buffer size,
making the code cleaner and less error-prone.

Signed-off-by: Kaushlendra Kumar
Link: https://patch.msgid.link/20250922055231.3523680-1-kaushlendra.kumar@intel.com
[ rjw: Subject adjustment ]
Signed-off-by: Rafael J.
Wysocki --- drivers/base/power/trace.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/base/power/trace.c b/drivers/base/power/trace.c index cd6e559648b2..d8da7195bb00 100644 --- a/drivers/base/power/trace.c +++ b/drivers/base/power/trace.c @@ -238,10 +238,8 @@ int show_trace_dev_match(char *buf, size_t size) unsigned int hash = hash_string(DEVSEED, dev_name(dev), DEVHASH); if (hash == value) { - int len = snprintf(buf, size, "%s\n", + int len = scnprintf(buf, size, "%s\n", dev_driver_string(dev)); - if (len > size) - len = size; buf += len; ret += len; size -= len; From b57100a3d9ced8c2b78e87d313f514a3338d016e Mon Sep 17 00:00:00 2001 From: Malaya Kumar Rout Date: Tue, 14 Oct 2025 01:00:27 +0530 Subject: [PATCH 03/96] PM: console: Fix memory allocation error handling in pm_vt_switch_required() The pm_vt_switch_required() function fails silently when memory allocation fails, offering no indication to callers that the operation was unsuccessful. This behavior prevents drivers from handling allocation errors correctly or implementing retry mechanisms. By ensuring that failures are reported back to the caller, drivers can make informed decisions, improve robustness, and avoid unexpected behavior during critical power management operations. Change the function signature to return an integer error code and modify the implementation to return -ENOMEM when kmalloc() fails. Update both the function declaration and the inline stub in include/linux/pm.h to maintain consistency across CONFIG_VT_CONSOLE_SLEEP configurations. The function now returns: - 0 on success (including when updating existing entries) - -ENOMEM when memory allocation fails This change improves error reporting without breaking existing callers, as the current callers in drivers/video/fbdev/core/fbmem.c already ignore the return value, making this a backward-compatible improvement. Reviewed-by: Lyude Paul Signed-off-by: Malaya Kumar Rout Reviewed-by: Dhruva Gole Reviewed-by: Lyude Paul Link: https://patch.msgid.link/20251013193028.89570-1-mrout@redhat.com Signed-off-by: Rafael J. Wysocki --- include/linux/pm.h | 5 +++-- kernel/power/console.c | 8 ++++++-- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/include/linux/pm.h b/include/linux/pm.h index cc7b2dc28574..a72e42eec130 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -25,11 +25,12 @@ extern void (*pm_power_off)(void); struct device; /* we have a circular dep with device.h */ #ifdef CONFIG_VT_CONSOLE_SLEEP -extern void pm_vt_switch_required(struct device *dev, bool required); +extern int pm_vt_switch_required(struct device *dev, bool required); extern void pm_vt_switch_unregister(struct device *dev); #else -static inline void pm_vt_switch_required(struct device *dev, bool required) +static inline int pm_vt_switch_required(struct device *dev, bool required) { + return 0; } static inline void pm_vt_switch_unregister(struct device *dev) { diff --git a/kernel/power/console.c b/kernel/power/console.c index 19c48aa5355d..a906a0ac0f9b 100644 --- a/kernel/power/console.c +++ b/kernel/power/console.c @@ -44,9 +44,10 @@ static LIST_HEAD(pm_vt_switch_list); * no_console_suspend argument has been passed on the command line, VT * switches will occur. 
*/ -void pm_vt_switch_required(struct device *dev, bool required) +int pm_vt_switch_required(struct device *dev, bool required) { struct pm_vt_switch *entry, *tmp; + int ret = 0; mutex_lock(&vt_switch_mutex); list_for_each_entry(tmp, &pm_vt_switch_list, head) { @@ -58,8 +59,10 @@ void pm_vt_switch_required(struct device *dev, bool required) } entry = kmalloc(sizeof(*entry), GFP_KERNEL); - if (!entry) + if (!entry) { + ret = -ENOMEM; goto out; + } entry->required = required; entry->dev = dev; @@ -67,6 +70,7 @@ void pm_vt_switch_required(struct device *dev, bool required) list_add(&entry->head, &pm_vt_switch_list); out: mutex_unlock(&vt_switch_mutex); + return ret; } EXPORT_SYMBOL(pm_vt_switch_required); From 5a151c2328a78aa0949a393f5c475a64b93a89ca Mon Sep 17 00:00:00 2001 From: Kaushlendra Kumar Date: Fri, 19 Sep 2025 18:14:37 +0530 Subject: [PATCH 04/96] PM: sleep: Introduce CALL_PM_OP() macro to simplify code Add CALL_PM_OP() macro to eliminate a repetitive code pattern in power management generic operations. Replace analogous driver PM callback invocation logic across all pm_generic_*() functions with a single macro that handles the NULL pointer checks and function calls. This reduces code size while maintaining the same functionality and improving code maintainability. Signed-off-by: Kaushlendra Kumar Reviewed-by: Dhruva Gole Link: https://patch.msgid.link/20250919124437.3075016-1-kaushlendra.kumar@intel.com [ rjw: Subject and changelog edits, adjust white space ] Signed-off-by: Rafael J. Wysocki --- drivers/base/power/generic_ops.c | 85 ++++++++++---------------------- 1 file changed, 25 insertions(+), 60 deletions(-) diff --git a/drivers/base/power/generic_ops.c b/drivers/base/power/generic_ops.c index 6502720bb564..af99bbcf281c 100644 --- a/drivers/base/power/generic_ops.c +++ b/drivers/base/power/generic_ops.c @@ -8,6 +8,13 @@ #include #include +#define CALL_PM_OP(dev, op) \ +({ \ + struct device *_dev = (dev); \ + const struct dev_pm_ops *pm = _dev->driver ? _dev->driver->pm : NULL; \ + pm && pm->op ? pm->op(_dev) : 0; \ +}) + #ifdef CONFIG_PM /** * pm_generic_runtime_suspend - Generic runtime suspend callback for subsystems. @@ -19,12 +26,7 @@ */ int pm_generic_runtime_suspend(struct device *dev) { - const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; - int ret; - - ret = pm && pm->runtime_suspend ? pm->runtime_suspend(dev) : 0; - - return ret; + return CALL_PM_OP(dev, runtime_suspend); } EXPORT_SYMBOL_GPL(pm_generic_runtime_suspend); @@ -38,12 +40,7 @@ EXPORT_SYMBOL_GPL(pm_generic_runtime_suspend); */ int pm_generic_runtime_resume(struct device *dev) { - const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; - int ret; - - ret = pm && pm->runtime_resume ? pm->runtime_resume(dev) : 0; - - return ret; + return CALL_PM_OP(dev, runtime_resume); } EXPORT_SYMBOL_GPL(pm_generic_runtime_resume); #endif /* CONFIG_PM */ @@ -72,9 +69,7 @@ int pm_generic_prepare(struct device *dev) */ int pm_generic_suspend_noirq(struct device *dev) { - const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; - - return pm && pm->suspend_noirq ? pm->suspend_noirq(dev) : 0; + return CALL_PM_OP(dev, suspend_noirq); } EXPORT_SYMBOL_GPL(pm_generic_suspend_noirq); @@ -84,9 +79,7 @@ EXPORT_SYMBOL_GPL(pm_generic_suspend_noirq); */ int pm_generic_suspend_late(struct device *dev) { - const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; - - return pm && pm->suspend_late ? 
pm->suspend_late(dev) : 0; + return CALL_PM_OP(dev, suspend_late); } EXPORT_SYMBOL_GPL(pm_generic_suspend_late); @@ -96,9 +89,7 @@ EXPORT_SYMBOL_GPL(pm_generic_suspend_late); */ int pm_generic_suspend(struct device *dev) { - const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; - - return pm && pm->suspend ? pm->suspend(dev) : 0; + return CALL_PM_OP(dev, suspend); } EXPORT_SYMBOL_GPL(pm_generic_suspend); @@ -108,9 +99,7 @@ EXPORT_SYMBOL_GPL(pm_generic_suspend); */ int pm_generic_freeze_noirq(struct device *dev) { - const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; - - return pm && pm->freeze_noirq ? pm->freeze_noirq(dev) : 0; + return CALL_PM_OP(dev, freeze_noirq); } EXPORT_SYMBOL_GPL(pm_generic_freeze_noirq); @@ -120,9 +109,7 @@ EXPORT_SYMBOL_GPL(pm_generic_freeze_noirq); */ int pm_generic_freeze(struct device *dev) { - const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; - - return pm && pm->freeze ? pm->freeze(dev) : 0; + return CALL_PM_OP(dev, freeze); } EXPORT_SYMBOL_GPL(pm_generic_freeze); @@ -132,9 +119,7 @@ EXPORT_SYMBOL_GPL(pm_generic_freeze); */ int pm_generic_poweroff_noirq(struct device *dev) { - const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; - - return pm && pm->poweroff_noirq ? pm->poweroff_noirq(dev) : 0; + return CALL_PM_OP(dev, poweroff_noirq); } EXPORT_SYMBOL_GPL(pm_generic_poweroff_noirq); @@ -144,9 +129,7 @@ EXPORT_SYMBOL_GPL(pm_generic_poweroff_noirq); */ int pm_generic_poweroff_late(struct device *dev) { - const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; - - return pm && pm->poweroff_late ? pm->poweroff_late(dev) : 0; + return CALL_PM_OP(dev, poweroff_late); } EXPORT_SYMBOL_GPL(pm_generic_poweroff_late); @@ -156,9 +139,7 @@ EXPORT_SYMBOL_GPL(pm_generic_poweroff_late); */ int pm_generic_poweroff(struct device *dev) { - const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; - - return pm && pm->poweroff ? pm->poweroff(dev) : 0; + return CALL_PM_OP(dev, poweroff); } EXPORT_SYMBOL_GPL(pm_generic_poweroff); @@ -168,9 +149,7 @@ EXPORT_SYMBOL_GPL(pm_generic_poweroff); */ int pm_generic_thaw_noirq(struct device *dev) { - const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; - - return pm && pm->thaw_noirq ? pm->thaw_noirq(dev) : 0; + return CALL_PM_OP(dev, thaw_noirq); } EXPORT_SYMBOL_GPL(pm_generic_thaw_noirq); @@ -180,9 +159,7 @@ EXPORT_SYMBOL_GPL(pm_generic_thaw_noirq); */ int pm_generic_thaw(struct device *dev) { - const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; - - return pm && pm->thaw ? pm->thaw(dev) : 0; + return CALL_PM_OP(dev, thaw); } EXPORT_SYMBOL_GPL(pm_generic_thaw); @@ -192,9 +169,7 @@ EXPORT_SYMBOL_GPL(pm_generic_thaw); */ int pm_generic_resume_noirq(struct device *dev) { - const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; - - return pm && pm->resume_noirq ? pm->resume_noirq(dev) : 0; + return CALL_PM_OP(dev, resume_noirq); } EXPORT_SYMBOL_GPL(pm_generic_resume_noirq); @@ -204,9 +179,7 @@ EXPORT_SYMBOL_GPL(pm_generic_resume_noirq); */ int pm_generic_resume_early(struct device *dev) { - const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; - - return pm && pm->resume_early ? pm->resume_early(dev) : 0; + return CALL_PM_OP(dev, resume_early); } EXPORT_SYMBOL_GPL(pm_generic_resume_early); @@ -216,9 +189,7 @@ EXPORT_SYMBOL_GPL(pm_generic_resume_early); */ int pm_generic_resume(struct device *dev) { - const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; - - return pm && pm->resume ? 
pm->resume(dev) : 0;
+	return CALL_PM_OP(dev, resume);
 }
 EXPORT_SYMBOL_GPL(pm_generic_resume);
 
@@ -228,9 +199,7 @@ EXPORT_SYMBOL_GPL(pm_generic_resume);
  */
 int pm_generic_restore_noirq(struct device *dev)
 {
-	const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL;
-
-	return pm && pm->restore_noirq ? pm->restore_noirq(dev) : 0;
+	return CALL_PM_OP(dev, restore_noirq);
 }
 EXPORT_SYMBOL_GPL(pm_generic_restore_noirq);
 
@@ -240,9 +209,7 @@ EXPORT_SYMBOL_GPL(pm_generic_restore_noirq);
  */
 int pm_generic_restore_early(struct device *dev)
 {
-	const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL;
-
-	return pm && pm->restore_early ? pm->restore_early(dev) : 0;
+	return CALL_PM_OP(dev, restore_early);
 }
 EXPORT_SYMBOL_GPL(pm_generic_restore_early);
 
@@ -252,9 +219,7 @@ EXPORT_SYMBOL_GPL(pm_generic_restore_early);
  */
 int pm_generic_restore(struct device *dev)
 {
-	const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL;
-
-	return pm && pm->restore ? pm->restore(dev) : 0;
+	return CALL_PM_OP(dev, restore);
 }
 EXPORT_SYMBOL_GPL(pm_generic_restore);

From a67818f74512452e9d99a98d990ea9d9b7c91791 Mon Sep 17 00:00:00 2001
From: Sergey Senozhatsky
Date: Tue, 7 Oct 2025 15:35:40 +0900
Subject: [PATCH 05/96] PM: dpm_watchdog: add module param to backtrace all
 CPUs

Add a dpm_watchdog_all_cpu_backtrace module parameter that controls
whether a backtrace of all CPUs is dumped before the DPM watchdog panics
the system. This is expected to help understand what might have caused a
device timeout.

Signed-off-by: Sergey Senozhatsky
Reviewed-by: Tomasz Figa
Reviewed-by: Dhruva Gole
Link: https://patch.msgid.link/20251007063551.3147937-1-senozhatsky@chromium.org
[ rjw: Subject and changelog edits ]
Signed-off-by: Rafael J. Wysocki
---
 drivers/base/power/main.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
index e83503bdc1fd..7a8807ec9a5d 100644
--- a/drivers/base/power/main.c
+++ b/drivers/base/power/main.c
@@ -34,6 +34,7 @@
 #include
 #include
 #include
+#include
 
 #include "../base.h"
 #include "power.h"
@@ -515,6 +516,11 @@ struct dpm_watchdog {
 #define DECLARE_DPM_WATCHDOG_ON_STACK(wd) \
 	struct dpm_watchdog wd
 
+static bool __read_mostly dpm_watchdog_all_cpu_backtrace;
+module_param(dpm_watchdog_all_cpu_backtrace, bool, 0644);
+MODULE_PARM_DESC(dpm_watchdog_all_cpu_backtrace,
+		 "Backtrace all CPUs on DPM watchdog timeout");
+
 /**
  * dpm_watchdog_handler - Driver suspend / resume watchdog handler.
  * @t: The timer that PM watchdog depends on.
@@ -530,8 +536,12 @@ static void dpm_watchdog_handler(struct timer_list *t)
 	unsigned int time_left;
 
 	if (wd->fatal) {
+		unsigned int this_cpu = smp_processor_id();
+
 		dev_emerg(wd->dev, "**** DPM device timeout ****\n");
 		show_stack(wd->tsk, NULL, KERN_EMERG);
+		if (dpm_watchdog_all_cpu_backtrace)
+			trigger_allbutcpu_cpu_backtrace(this_cpu);
 		panic("%s %s: unrecoverable failure\n",
 			dev_driver_string(wd->dev), dev_name(wd->dev));
 	}

From d3db87f89c71c34f4a6a0ee9de3dfab5eee18b22 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki"
Date: Thu, 9 Oct 2025 21:34:16 +0200
Subject: [PATCH 06/96] PM: hibernate: Rework message printing in swsusp_save()

The messages printed by swsusp_save() are basically only useful for
debug, so printing them at the "info" log level every time a hibernation
image is created is not particularly useful. Also, printing a message on
a failing memory allocation is redundant.
Use pm_deferred_pr_dbg() for printing those messages so they will only
be printed when requested; the "deferred" variant is used because this
code runs in a deeply atomic context (one CPU with interrupts off, no
functional devices).

Also, drop the useless message printed when a memory allocation fails.

While at it, extend one of the messages in question so it is less
cryptic.

Signed-off-by: Rafael J. Wysocki
[ rjw: Dropped a useless colon at the end of one of the messages ]
Link: https://patch.msgid.link/10750389.nUPlyArG6x@rafael.j.wysocki
---
 kernel/power/snapshot.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 645f42e40478..0a946932d5c1 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -2110,22 +2110,20 @@ asmlinkage __visible int swsusp_save(void)
 {
 	unsigned int nr_pages, nr_highmem;
 
-	pr_info("Creating image:\n");
+	pm_deferred_pr_dbg("Creating image\n");
 
 	drain_local_pages(NULL);
 	nr_pages = count_data_pages();
 	nr_highmem = count_highmem_pages();
-	pr_info("Need to copy %u pages\n", nr_pages + nr_highmem);
+	pm_deferred_pr_dbg("Need to copy %u pages\n", nr_pages + nr_highmem);
 
 	if (!enough_free_mem(nr_pages, nr_highmem)) {
-		pr_err("Not enough free memory\n");
+		pm_deferred_pr_dbg("Not enough free memory for image creation\n");
 		return -ENOMEM;
 	}
 
-	if (swsusp_alloc(&copy_bm, nr_pages, nr_highmem)) {
-		pr_err("Memory allocation failed\n");
+	if (swsusp_alloc(&copy_bm, nr_pages, nr_highmem))
 		return -ENOMEM;
-	}
 
 	/*
	 * During allocating of suspend pagedir, new cold pages may appear.
@@ -2144,7 +2142,8 @@ asmlinkage __visible int swsusp_save(void)
 	nr_zero_pages = nr_pages - nr_copy_pages;
 	nr_meta_pages = DIV_ROUND_UP(nr_pages * sizeof(long), PAGE_SIZE);
 
-	pr_info("Image created (%d pages copied, %d zero pages)\n", nr_copy_pages, nr_zero_pages);
+	pm_deferred_pr_dbg("Image created (%d pages copied, %d zero pages)\n",
+			   nr_copy_pages, nr_zero_pages);
 
 	return 0;
 }

From 6db0f533d320fab54154b0207e9df108427dd939 Mon Sep 17 00:00:00 2001
From: Zihuan Zhang
Date: Sat, 11 Oct 2025 15:24:20 +0800
Subject: [PATCH 07/96] cpufreq: preserve freq_table_sorted across
 suspend/hibernate

During S3/S4 suspend and resume, cpufreq policies are not freed or
recreated; the freq_table and policy structure remain intact. However,
set_freq_table_sorted() currently resets policy->freq_table_sorted to
UNSORTED unconditionally, which is unnecessary since the table order
does not change across suspend/resume.

This patch adds a check to skip validation if policy->freq_table_sorted
is already ASCENDING or DESCENDING. This avoids unnecessary traversal of
the frequency table on S3/S4 resume or repeated online events, reducing
overhead while preserving correctness.

Signed-off-by: Zihuan Zhang
Link: https://patch.msgid.link/20251011072420.11495-1-zhangzihuan@kylinos.cn
Signed-off-by: Rafael J. Wysocki
---
 drivers/cpufreq/cpufreq.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 852e024facc3..4a27f6cb07d3 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -1421,9 +1421,12 @@ static int cpufreq_policy_online(struct cpufreq_policy *policy,
	 * If there is a problem with its frequency table, take it
	 * offline and drop it.
	 */
-	ret = cpufreq_table_validate_and_sort(policy);
-	if (ret)
-		goto out_offline_policy;
+	if (policy->freq_table_sorted != CPUFREQ_TABLE_SORTED_ASCENDING &&
+	    policy->freq_table_sorted != CPUFREQ_TABLE_SORTED_DESCENDING) {
+		ret = cpufreq_table_validate_and_sort(policy);
+		if (ret)
+			goto out_offline_policy;
+	}
 
 	/* related_cpus should at least include policy->cpus. */
 	cpumask_copy(policy->related_cpus, policy->cpus);

From 528dde6619677ac6dc26d9dda1e3c9014b4a08c8 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki"
Date: Wed, 15 Oct 2025 15:46:40 +0200
Subject: [PATCH 08/96] cpufreq: intel_pstate: Add and use hybrid_get_cpu_type()

Introduce a function for identifying the type of a given CPU in a hybrid
system, called hybrid_get_cpu_type(), and use it for hybrid scaling
factor determination in hwp_get_cpu_scaling().

Signed-off-by: Rafael J. Wysocki
Link: https://patch.msgid.link/1954386.tdWV9SEqCh@rafael.j.wysocki
---
 drivers/cpufreq/intel_pstate.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 38897bb14a2c..22316c930864 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -912,6 +912,11 @@ static struct freq_attr *hwp_cpufreq_attrs[] = {
 	[HWP_CPUFREQ_ATTR_COUNT] = NULL,
 };
 
+static u8 hybrid_get_cpu_type(unsigned int cpu)
+{
+	return cpu_data(cpu).topo.intel_type;
+}
+
 static bool no_cas __ro_after_init;
 
 static struct cpudata *hybrid_max_perf_cpu __read_mostly;
@@ -2298,18 +2303,14 @@ static int knl_get_turbo_pstate(int cpu)
 static int hwp_get_cpu_scaling(int cpu)
 {
 	if (hybrid_scaling_factor) {
-		struct cpuinfo_x86 *c = &cpu_data(cpu);
-		u8 cpu_type = c->topo.intel_type;
-
 		/*
 		 * Return the hybrid scaling factor for P-cores and use the
 		 * default core scaling for E-cores.
 		 */
-		if (cpu_type == INTEL_CPU_TYPE_CORE)
+		if (hybrid_get_cpu_type(cpu) == INTEL_CPU_TYPE_CORE)
 			return hybrid_scaling_factor;
 
-		if (cpu_type == INTEL_CPU_TYPE_ATOM)
-			return core_get_scaling();
+		return core_get_scaling();
 	}
 
 	/* Use core scaling on non-hybrid systems. */

From c17add73498245bd94cb8a05345c73366606e671 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki"
Date: Wed, 15 Oct 2025 15:47:45 +0200
Subject: [PATCH 09/96] cpufreq: intel_pstate: Add and use hybrid_has_l3()

Introduce a function for checking whether or not a given CPU has an L3
cache, called hybrid_has_l3(), and use it in hybrid_get_cost() for
computing cost coefficients associated with a given perf domain.

Signed-off-by: Rafael J.
Wysocki Link: https://patch.msgid.link/13884343.uLZWGnKmhe@rafael.j.wysocki --- drivers/cpufreq/intel_pstate.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 22316c930864..f85056ee6e61 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -951,11 +951,26 @@ static int hybrid_active_power(struct device *dev, unsigned long *power, return 0; } +static bool hybrid_has_l3(unsigned int cpu) +{ + struct cpu_cacheinfo *cacheinfo = get_cpu_cacheinfo(cpu); + unsigned int i; + + if (!cacheinfo) + return false; + + for (i = 0; i < cacheinfo->num_leaves; i++) { + if (cacheinfo->info_list[i].level == 3) + return true; + } + + return false; +} + static int hybrid_get_cost(struct device *dev, unsigned long freq, unsigned long *cost) { struct pstate_data *pstate = &all_cpu_data[dev->id]->pstate; - struct cpu_cacheinfo *cacheinfo = get_cpu_cacheinfo(dev->id); /* * The smaller the perf-to-frequency scaling factor, the larger the IPC @@ -973,17 +988,8 @@ static int hybrid_get_cost(struct device *dev, unsigned long freq, * touching it in case some other CPUs of the same type can do the work * without it. */ - if (cacheinfo) { - unsigned int i; - - /* Check if L3 cache is there. */ - for (i = 0; i < cacheinfo->num_leaves; i++) { - if (cacheinfo->info_list[i].level == 3) { - *cost += 2; - break; - } - } - } + if (hybrid_has_l3(dev->id)) + *cost += 2; return 0; } From d852b6f67b71dd22cd2af8ee29306eccbd6c06bf Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 16 Oct 2025 18:22:13 +0200 Subject: [PATCH 10/96] cpufreq: intel_pstate: hybrid: Adjust energy model rules Instead of using HWP-to-frequency scaling factors for computing cost coefficients in the energy model used on hybrid systems, which is fragile, rely on CPU type information that is easily accessible now and the information on whether or not L3 cache is present for this purpose. This also allows the cost coefficients for P-cores to be adjusted so that they start to be populated somewhat earlier (that is, before E-cores are loaded up to their full capacity). In addition to the above, replace an inaccurate comment regarding the reason why the freq value is added to the cost in hybrid_get_cost(). Signed-off-by: Rafael J. Wysocki Reviewed-by: Dietmar Eggemann Reviewed-by: Yaxiong Tian Link: https://patch.msgid.link/5932894.DvuYhMxLoT@rafael.j.wysocki --- drivers/cpufreq/intel_pstate.c | 35 ++++++++++++++-------------------- 1 file changed, 14 insertions(+), 21 deletions(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index f85056ee6e61..3ed8a0001b2f 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -933,11 +933,8 @@ static int hybrid_active_power(struct device *dev, unsigned long *power, unsigned long *freq) { /* - * Create "utilization bins" of 0-40%, 40%-60%, 60%-80%, and 80%-100% - * of the maximum capacity such that two CPUs of the same type will be - * regarded as equally attractive if the utilization of each of them - * falls into the same bin, which should prevent tasks from being - * migrated between them too often. + * Create four "states" corresponding to 40%, 60%, 80%, and 100% of the + * full capacity. * * For this purpose, return the "frequency" of 2 for the first * performance level and otherwise leave the value set by the caller. 
@@ -970,26 +967,22 @@ static bool hybrid_has_l3(unsigned int cpu) static int hybrid_get_cost(struct device *dev, unsigned long freq, unsigned long *cost) { - struct pstate_data *pstate = &all_cpu_data[dev->id]->pstate; - + /* Facilitate load balancing between CPUs of the same type. */ + *cost = freq; /* - * The smaller the perf-to-frequency scaling factor, the larger the IPC - * ratio between the given CPU and the least capable CPU in the system. - * Regard that IPC ratio as the primary cost component and assume that - * the scaling factors for different CPU types will differ by at least - * 5% and they will not be above INTEL_PSTATE_CORE_SCALING. + * Adjust the cost depending on CPU type. * - * Add the freq value to the cost, so that the cost of running on CPUs - * of the same type in different "utilization bins" is different. + * The idea is to start loading up LPE-cores before E-cores and start + * to populate E-cores when LPE-cores are utilized above 60% of the + * capacity. Similarly, P-cores start to be populated when E-cores are + * utilized above 60% of the capacity. */ - *cost = div_u64(100ULL * INTEL_PSTATE_CORE_SCALING, pstate->scaling) + freq; - /* - * Increase the cost slightly for CPUs able to access L3 to avoid - * touching it in case some other CPUs of the same type can do the work - * without it. - */ - if (hybrid_has_l3(dev->id)) + if (hybrid_get_cpu_type(dev->id) == INTEL_CPU_TYPE_ATOM) { + if (hybrid_has_l3(dev->id)) /* E-core */ + *cost += 1; + } else { /* P-core */ *cost += 2; + } return 0; } From 5313ec4a215a0c4af8fd927b103d31c2c93e961f Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 15 Oct 2025 15:50:56 +0200 Subject: [PATCH 11/96] cpufreq: intel_pstate: Improve printing of debug messages Some debug messages generated by intel_pstate on a given hybrid system are only printed for some CPUs which is confusing, so modify the driver to print them for all CPUs. Also change those messages to avoid printing local variable names in them. Moreover, some debug messages printed by intel_pstate are quite hard to understand without looking at the code printing them, so make them somewhat clearer while at it. Signed-off-by: Rafael J. 
Wysocki Link: https://patch.msgid.link/8609836.T7Z3S40VBb@rafael.j.wysocki --- drivers/cpufreq/intel_pstate.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 3ed8a0001b2f..7d2a1aec3a61 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -575,13 +575,18 @@ static void intel_pstate_hybrid_hwp_adjust(struct cpudata *cpu) int scaling = cpu->pstate.scaling; int freq; - pr_debug("CPU%d: perf_ctl_max_phys = %d\n", cpu->cpu, perf_ctl_max_phys); - pr_debug("CPU%d: perf_ctl_turbo = %d\n", cpu->cpu, perf_ctl_turbo); - pr_debug("CPU%d: perf_ctl_scaling = %d\n", cpu->cpu, perf_ctl_scaling); + pr_debug("CPU%d: PERF_CTL max_phys = %d\n", cpu->cpu, perf_ctl_max_phys); + pr_debug("CPU%d: PERF_CTL turbo = %d\n", cpu->cpu, perf_ctl_turbo); + pr_debug("CPU%d: PERF_CTL scaling = %d\n", cpu->cpu, perf_ctl_scaling); pr_debug("CPU%d: HWP_CAP guaranteed = %d\n", cpu->cpu, cpu->pstate.max_pstate); pr_debug("CPU%d: HWP_CAP highest = %d\n", cpu->cpu, cpu->pstate.turbo_pstate); pr_debug("CPU%d: HWP-to-frequency scaling factor: %d\n", cpu->cpu, scaling); + if (scaling == perf_ctl_scaling) + return; + + hwp_is_hybrid = true; + cpu->pstate.turbo_freq = rounddown(cpu->pstate.turbo_pstate * scaling, perf_ctl_scaling); cpu->pstate.max_freq = rounddown(cpu->pstate.max_pstate * scaling, @@ -1044,9 +1049,9 @@ static void hybrid_set_cpu_capacity(struct cpudata *cpu) topology_set_cpu_scale(cpu->cpu, arch_scale_cpu_capacity(cpu->cpu)); - pr_debug("CPU%d: perf = %u, max. perf = %u, base perf = %d\n", cpu->cpu, - cpu->capacity_perf, hybrid_max_perf_cpu->capacity_perf, - cpu->pstate.max_pstate_physical); + pr_debug("CPU%d: capacity perf = %u, base perf = %u, sys max perf = %u\n", + cpu->cpu, cpu->capacity_perf, cpu->pstate.max_pstate_physical, + hybrid_max_perf_cpu->capacity_perf); } static void hybrid_clear_cpu_capacity(unsigned int cpunum) @@ -2344,11 +2349,10 @@ static void intel_pstate_set_min_pstate(struct cpudata *cpu) static void intel_pstate_get_cpu_pstates(struct cpudata *cpu) { - int perf_ctl_max_phys = pstate_funcs.get_max_physical(cpu->cpu); int perf_ctl_scaling = pstate_funcs.get_scaling(); + cpu->pstate.max_pstate_physical = pstate_funcs.get_max_physical(cpu->cpu); cpu->pstate.min_pstate = pstate_funcs.get_min(cpu->cpu); - cpu->pstate.max_pstate_physical = perf_ctl_max_phys; cpu->pstate.perf_ctl_scaling = perf_ctl_scaling; if (hwp_active && !hwp_mode_bdw) { @@ -2356,10 +2360,7 @@ static void intel_pstate_get_cpu_pstates(struct cpudata *cpu) if (pstate_funcs.get_cpu_scaling) { cpu->pstate.scaling = pstate_funcs.get_cpu_scaling(cpu->cpu); - if (cpu->pstate.scaling != perf_ctl_scaling) { - intel_pstate_hybrid_hwp_adjust(cpu); - hwp_is_hybrid = true; - } + intel_pstate_hybrid_hwp_adjust(cpu); } else { cpu->pstate.scaling = perf_ctl_scaling; } From ace04717749d20e34dac9f78c5ac772168232b67 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Fri, 17 Oct 2025 17:33:54 +0200 Subject: [PATCH 12/96] cpufreq: Replace deprecated strcpy() in cpufreq_unregister_governor() strcpy() is deprecated; assign the NUL terminator directly instead. Link: https://github.com/KSPP/linux/issues/88 Signed-off-by: Thorsten Blum [ rjw: Subject tweaks ] Link: https://patch.msgid.link/20251017153354.82009-2-thorsten.blum@linux.dev Signed-off-by: Rafael J. 
Wysocki
---
 drivers/cpufreq/cpufreq.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 4a27f6cb07d3..4472bb1ec83c 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -2553,7 +2553,7 @@ void cpufreq_unregister_governor(struct cpufreq_governor *governor)
 	for_each_inactive_policy(policy) {
 		if (!strcmp(policy->last_governor, governor->name)) {
 			policy->governor = NULL;
-			strcpy(policy->last_governor, "\0");
+			policy->last_governor[0] = '\0';
 		}
 	}
 	read_unlock_irqrestore(&cpufreq_driver_lock, flags);

From e6fdbe8feace22ba54ebcf20d6e200fc97c8e065 Mon Sep 17 00:00:00 2001
From: Tamir Duberstein
Date: Sat, 18 Oct 2025 15:16:32 -0400
Subject: [PATCH 13/96] rust: opp: fix broken rustdoc link

Correct the spelling of "CString" to make the link work.

Fixes: ce32e2d47ce6 ("rust: opp: Add abstractions for the configuration options")
Signed-off-by: Tamir Duberstein
Signed-off-by: Viresh Kumar
---
 rust/kernel/opp.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rust/kernel/opp.rs b/rust/kernel/opp.rs
index 2c763fa9276d..04472a8de3ff 100644
--- a/rust/kernel/opp.rs
+++ b/rust/kernel/opp.rs
@@ -87,7 +87,7 @@ fn drop(&mut self) {
 
 use macros::vtable;
 
-/// Creates a null-terminated slice of pointers to [`Cstring`]s.
+/// Creates a null-terminated slice of pointers to [`CString`]s.
 fn to_c_str_array(names: &[CString]) -> Result> {
     // Allocated a null-terminated vector of pointers.
     let mut list = KVec::with_capacity(names.len() + 1, GFP_KERNEL)?;

From cbe5aeedecc72314c3a8fd0d41d9b270f576aee1 Mon Sep 17 00:00:00 2001
From: Changwoo Min
Date: Tue, 21 Oct 2025 07:09:05 +0900
Subject: [PATCH 14/96] PM: EM: Assign a unique ID when creating a performance
 domain

It is necessary to refer to a specific performance domain from
userspace, for example when the energy model of a particular performance
domain is updated. To this end, assign a unique ID to each performance
domain to address it, and manage the domains in a global linked list so
that a specific one can be looked up by matching its ID.

IDA is used for ID assignment, and a mutex is used to protect the global
list from concurrent access. Note that em_pd_mutex is not supposed to be
taken while holding em_pd_list_mutex, to avoid an ABBA deadlock.

Signed-off-by: Changwoo Min
Reviewed-by: Lukasz Luba
Link: https://patch.msgid.link/20251020220914.320832-2-changwoo@igalia.com
Signed-off-by: Rafael J.
Wysocki --- include/linux/energy_model.h | 4 ++++ kernel/power/energy_model.c | 30 +++++++++++++++++++++++++++++- 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index 61d50571ad88..43aa6153dc57 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -54,6 +54,8 @@ struct em_perf_table { /** * struct em_perf_domain - Performance domain * @em_table: Pointer to the runtime modifiable em_perf_table + * @node: node in em_pd_list (in energy_model.c) + * @id: A unique ID number for each performance domain * @nr_perf_states: Number of performance states * @min_perf_state: Minimum allowed Performance State index * @max_perf_state: Maximum allowed Performance State index @@ -71,6 +73,8 @@ struct em_perf_table { */ struct em_perf_domain { struct em_perf_table __rcu *em_table; + struct list_head node; + int id; int nr_perf_states; int min_perf_state; int max_perf_state; diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index 5f17d2e8e954..2047b546ad11 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -23,6 +23,16 @@ */ static DEFINE_MUTEX(em_pd_mutex); +/* + * Manage performance domains with IDs. One can iterate the performance domains + * through the list and pick one with their associated ID. The mutex serializes + * the list access. When holding em_pd_list_mutex, em_pd_mutex should not be + * taken to avoid potential deadlock. + */ +static DEFINE_IDA(em_pd_ida); +static LIST_HEAD(em_pd_list); +static DEFINE_MUTEX(em_pd_list_mutex); + static void em_cpufreq_update_efficiencies(struct device *dev, struct em_perf_state *table); static void em_check_capacity_update(void); @@ -396,7 +406,7 @@ static int em_create_pd(struct device *dev, int nr_states, struct em_perf_table *em_table; struct em_perf_domain *pd; struct device *cpu_dev; - int cpu, ret, num_cpus; + int cpu, ret, num_cpus, id; if (_is_cpu_device(dev)) { num_cpus = cpumask_weight(cpus); @@ -420,6 +430,13 @@ static int em_create_pd(struct device *dev, int nr_states, pd->nr_perf_states = nr_states; + INIT_LIST_HEAD(&pd->node); + + id = ida_alloc(&em_pd_ida, GFP_KERNEL); + if (id < 0) + return -ENOMEM; + pd->id = id; + em_table = em_table_alloc(pd); if (!em_table) goto free_pd; @@ -444,6 +461,7 @@ static int em_create_pd(struct device *dev, int nr_states, kfree(em_table); free_pd: kfree(pd); + ida_free(&em_pd_ida, id); return -EINVAL; } @@ -660,6 +678,10 @@ int em_dev_register_pd_no_update(struct device *dev, unsigned int nr_states, unlock: mutex_unlock(&em_pd_mutex); + mutex_lock(&em_pd_list_mutex); + list_add_tail(&dev->em_pd->node, &em_pd_list); + mutex_unlock(&em_pd_list_mutex); + return ret; } EXPORT_SYMBOL_GPL(em_dev_register_pd_no_update); @@ -678,6 +700,10 @@ void em_dev_unregister_perf_domain(struct device *dev) if (_is_cpu_device(dev)) return; + mutex_lock(&em_pd_list_mutex); + list_del_init(&dev->em_pd->node); + mutex_unlock(&em_pd_list_mutex); + /* * The mutex separates all register/unregister requests and protects * from potential clean-up/setup issues in the debugfs directories. 
@@ -689,6 +715,8 @@ void em_dev_unregister_perf_domain(struct device *dev)
 	em_table_free(rcu_dereference_protected(dev->em_pd->em_table,
 						lockdep_is_held(&em_pd_mutex)));
 
+	ida_free(&em_pd_ida, dev->em_pd->id);
+
 	kfree(dev->em_pd);
 	dev->em_pd = NULL;
 	mutex_unlock(&em_pd_mutex);

From ee50b8bb6b5d62fc2ebff872ee7ecb3a9380ec64 Mon Sep 17 00:00:00 2001
From: Changwoo Min
Date: Tue, 21 Oct 2025 07:09:06 +0900
Subject: [PATCH 15/96] PM: EM: Expose the ID of a performance domain via
 debugfs

For ease of debugging, let's expose the assigned ID of a performance
domain through debugfs (e.g., /sys/kernel/debug/energy_model/cpu0/id).

Signed-off-by: Changwoo Min
Reviewed-by: Lukasz Luba
Link: https://patch.msgid.link/20251020220914.320832-3-changwoo@igalia.com
Signed-off-by: Rafael J. Wysocki
---
 kernel/power/energy_model.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
index 2047b546ad11..756debf5406a 100644
--- a/kernel/power/energy_model.c
+++ b/kernel/power/energy_model.c
@@ -126,6 +126,16 @@ static int em_debug_flags_show(struct seq_file *s, void *unused)
 }
 DEFINE_SHOW_ATTRIBUTE(em_debug_flags);
 
+static int em_debug_id_show(struct seq_file *s, void *unused)
+{
+	struct em_perf_domain *pd = s->private;
+
+	seq_printf(s, "%d\n", pd->id);
+
+	return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(em_debug_id);
+
 static void em_debug_create_pd(struct device *dev)
 {
 	struct em_dbg_info *em_dbg;
@@ -142,6 +152,8 @@ static void em_debug_create_pd(struct device *dev)
 	debugfs_create_file("flags", 0444, d, dev->em_pd,
 			    &em_debug_flags_fops);
 
+	debugfs_create_file("id", 0444, d, dev->em_pd, &em_debug_id_fops);
+
 	em_dbg = devm_kcalloc(dev, dev->em_pd->nr_perf_states,
 			      sizeof(*em_dbg), GFP_KERNEL);
 	if (!em_dbg)

From bd26631ccdfd11701fa29e665a7f041875ba9423 Mon Sep 17 00:00:00 2001
From: Changwoo Min
Date: Tue, 21 Oct 2025 07:09:07 +0900
Subject: [PATCH 16/96] PM: EM: Add em.yaml and autogen files

Add a generic netlink spec in YAML format and autogenerate boilerplate
code using ynl-regen.sh to introduce a generic netlink family for the
energy model. It allows a userspace program to read the performance
domains and their energy models. Through a multicast interface, it
notifies the userspace program when a performance domain is created or
deleted, or when its energy model is updated.

Specifically, it supports two commands:

- EM_CMD_GET_PDS: Get the list of information for all performance
  domains.
- EM_CMD_GET_PD_TABLE: Get the energy model table of a performance
  domain.

Also, it supports three notification events:

- EM_CMD_PD_CREATED: When a performance domain is created.
- EM_CMD_PD_DELETED: When a performance domain is deleted.
- EM_CMD_PD_UPDATED: When the energy model table of a performance
  domain is updated.

Finally, update MAINTAINERS to include the new files.

Signed-off-by: Changwoo Min
Reviewed-by: Lukasz Luba
Link: https://patch.msgid.link/20251020220914.320832-4-changwoo@igalia.com
Signed-off-by: Rafael J.
Wysocki --- Documentation/netlink/specs/em.yaml | 113 ++++++++++++++++++++++++++++ MAINTAINERS | 3 + include/uapi/linux/energy_model.h | 62 +++++++++++++++ kernel/power/em_netlink_autogen.c | 48 ++++++++++++ kernel/power/em_netlink_autogen.h | 23 ++++++ 5 files changed, 249 insertions(+) create mode 100644 Documentation/netlink/specs/em.yaml create mode 100644 include/uapi/linux/energy_model.h create mode 100644 kernel/power/em_netlink_autogen.c create mode 100644 kernel/power/em_netlink_autogen.h diff --git a/Documentation/netlink/specs/em.yaml b/Documentation/netlink/specs/em.yaml new file mode 100644 index 000000000000..9905ca482325 --- /dev/null +++ b/Documentation/netlink/specs/em.yaml @@ -0,0 +1,113 @@ +# SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) + +name: em + +doc: | + Energy model netlink interface to notify its changes. + +protocol: genetlink + +uapi-header: linux/energy_model.h + +attribute-sets: + - + name: pds + attributes: + - + name: pd + type: nest + nested-attributes: pd + multi-attr: true + - + name: pd + attributes: + - + name: pad + type: pad + - + name: pd-id + type: u32 + - + name: flags + type: u64 + - + name: cpus + type: string + - + name: pd-table + attributes: + - + name: pd-id + type: u32 + - + name: ps + type: nest + nested-attributes: ps + multi-attr: true + - + name: ps + attributes: + - + name: pad + type: pad + - + name: performance + type: u64 + - + name: frequency + type: u64 + - + name: power + type: u64 + - + name: cost + type: u64 + - + name: flags + type: u64 + +operations: + list: + - + name: get-pds + attribute-set: pds + doc: Get the list of information for all performance domains. + do: + reply: + attributes: + - pd + - + name: get-pd-table + attribute-set: pd-table + doc: Get the energy model table of a performance domain. + do: + request: + attributes: + - pd-id + reply: + attributes: + - pd-id + - ps + - + name: pd-created + doc: A performance domain is created. + notify: get-pd-table + mcgrp: event + - + name: pd-updated + doc: A performance domain is updated. + notify: get-pd-table + mcgrp: event + - + name: pd-deleted + doc: A performance domain is deleted. 
+ attribute-set: pd-table + event: + attributes: + - pd-id + mcgrp: event + +mcast-groups: + list: + - + name: event diff --git a/MAINTAINERS b/MAINTAINERS index 545a4776795e..e6b3bab9dbeb 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9181,6 +9181,9 @@ S: Maintained F: kernel/power/energy_model.c F: include/linux/energy_model.h F: Documentation/power/energy-model.rst +F: Documentation/netlink/specs/em.yaml +F: include/uapi/linux/energy_model.h +F: kernel/power/em_netlink_autogen.* EPAPR HYPERVISOR BYTE CHANNEL DEVICE DRIVER M: Laurentiu Tudor diff --git a/include/uapi/linux/energy_model.h b/include/uapi/linux/energy_model.h new file mode 100644 index 000000000000..4ec4c0eabbbb --- /dev/null +++ b/include/uapi/linux/energy_model.h @@ -0,0 +1,62 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/em.yaml */ +/* YNL-GEN uapi header */ + +#ifndef _UAPI_LINUX_ENERGY_MODEL_H +#define _UAPI_LINUX_ENERGY_MODEL_H + +#define EM_FAMILY_NAME "em" +#define EM_FAMILY_VERSION 1 + +enum { + EM_A_PDS_PD = 1, + + __EM_A_PDS_MAX, + EM_A_PDS_MAX = (__EM_A_PDS_MAX - 1) +}; + +enum { + EM_A_PD_PAD = 1, + EM_A_PD_PD_ID, + EM_A_PD_FLAGS, + EM_A_PD_CPUS, + + __EM_A_PD_MAX, + EM_A_PD_MAX = (__EM_A_PD_MAX - 1) +}; + +enum { + EM_A_PD_TABLE_PD_ID = 1, + EM_A_PD_TABLE_PS, + + __EM_A_PD_TABLE_MAX, + EM_A_PD_TABLE_MAX = (__EM_A_PD_TABLE_MAX - 1) +}; + +enum { + EM_A_PS_PAD = 1, + EM_A_PS_PERFORMANCE, + EM_A_PS_FREQUENCY, + EM_A_PS_POWER, + EM_A_PS_COST, + EM_A_PS_FLAGS, + + __EM_A_PS_MAX, + EM_A_PS_MAX = (__EM_A_PS_MAX - 1) +}; + +enum { + EM_CMD_GET_PDS = 1, + EM_CMD_GET_PD_TABLE, + EM_CMD_PD_CREATED, + EM_CMD_PD_UPDATED, + EM_CMD_PD_DELETED, + + __EM_CMD_MAX, + EM_CMD_MAX = (__EM_CMD_MAX - 1) +}; + +#define EM_MCGRP_EVENT "event" + +#endif /* _UAPI_LINUX_ENERGY_MODEL_H */ diff --git a/kernel/power/em_netlink_autogen.c b/kernel/power/em_netlink_autogen.c new file mode 100644 index 000000000000..a7a09ab1d1c2 --- /dev/null +++ b/kernel/power/em_netlink_autogen.c @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/em.yaml */ +/* YNL-GEN kernel source */ + +#include +#include + +#include "em_netlink_autogen.h" + +#include + +/* EM_CMD_GET_PD_TABLE - do */ +static const struct nla_policy em_get_pd_table_nl_policy[EM_A_PD_TABLE_PD_ID + 1] = { + [EM_A_PD_TABLE_PD_ID] = { .type = NLA_U32, }, +}; + +/* Ops table for em */ +static const struct genl_split_ops em_nl_ops[] = { + { + .cmd = EM_CMD_GET_PDS, + .doit = em_nl_get_pds_doit, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = EM_CMD_GET_PD_TABLE, + .doit = em_nl_get_pd_table_doit, + .policy = em_get_pd_table_nl_policy, + .maxattr = EM_A_PD_TABLE_PD_ID, + .flags = GENL_CMD_CAP_DO, + }, +}; + +static const struct genl_multicast_group em_nl_mcgrps[] = { + [EM_NLGRP_EVENT] = { "event", }, +}; + +struct genl_family em_nl_family __ro_after_init = { + .name = EM_FAMILY_NAME, + .version = EM_FAMILY_VERSION, + .netnsok = true, + .parallel_ops = true, + .module = THIS_MODULE, + .split_ops = em_nl_ops, + .n_split_ops = ARRAY_SIZE(em_nl_ops), + .mcgrps = em_nl_mcgrps, + .n_mcgrps = ARRAY_SIZE(em_nl_mcgrps), +}; diff --git a/kernel/power/em_netlink_autogen.h b/kernel/power/em_netlink_autogen.h new file mode 100644 index 000000000000..78ce609641f1 --- /dev/null +++ b/kernel/power/em_netlink_autogen.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: 
((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */
/* Do not edit directly, auto-generated from: */
/* Documentation/netlink/specs/em.yaml */
/* YNL-GEN kernel header */

#ifndef _LINUX_EM_GEN_H
#define _LINUX_EM_GEN_H

#include
#include

#include

int em_nl_get_pds_doit(struct sk_buff *skb, struct genl_info *info);
int em_nl_get_pd_table_doit(struct sk_buff *skb, struct genl_info *info);

enum {
	EM_NLGRP_EVENT,
};

extern struct genl_family em_nl_family;

#endif /* _LINUX_EM_GEN_H */

From e4ed8d26c5d320d9b9a6ee013a94b7ad73b4d243 Mon Sep 17 00:00:00 2001
From: Changwoo Min
Date: Tue, 21 Oct 2025 07:09:08 +0900
Subject: [PATCH 17/96] PM: EM: Add skeleton code for netlink notification

Add boilerplate code for netlink notification to register the new
protocol family. Also, initialize and register the netlink family during
boot. The initialization is called at the postcore level, which is late
enough since it runs after the generic netlink subsystem has been
initialized.

Finally, update MAINTAINERS to include the new files.

Signed-off-by: Changwoo Min
Reviewed-by: Lukasz Luba
Link: https://patch.msgid.link/20251020220914.320832-5-changwoo@igalia.com
Signed-off-by: Rafael J. Wysocki
---
 MAINTAINERS               |  2 +-
 kernel/power/Makefile     |  4 +++-
 kernel/power/em_netlink.c | 34 ++++++++++++++++++++++++++++++++++
 kernel/power/em_netlink.h | 16 ++++++++++++++++
 4 files changed, 54 insertions(+), 2 deletions(-)
 create mode 100644 kernel/power/em_netlink.c
 create mode 100644 kernel/power/em_netlink.h

diff --git a/MAINTAINERS b/MAINTAINERS
index e6b3bab9dbeb..0d96aadb0d86 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9183,7 +9183,7 @@ F: include/linux/energy_model.h
 F: Documentation/power/energy-model.rst
 F: Documentation/netlink/specs/em.yaml
 F: include/uapi/linux/energy_model.h
-F: kernel/power/em_netlink_autogen.*
+F: kernel/power/em_netlink*.*
 
 EPAPR HYPERVISOR BYTE CHANNEL DEVICE DRIVER
 M: Laurentiu Tudor

diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index 874ad834dc8d..773e2789412b 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -21,4 +21,6 @@ obj-$(CONFIG_PM_WAKELOCKS) += wakelock.o
 
 obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o
 
-obj-$(CONFIG_ENERGY_MODEL) += energy_model.o
+obj-$(CONFIG_ENERGY_MODEL) += em.o
+em-y := energy_model.o
+em-$(CONFIG_NET) += em_netlink_autogen.o em_netlink.o

diff --git a/kernel/power/em_netlink.c b/kernel/power/em_netlink.c
new file mode 100644
index 000000000000..f8c98ae96aca
--- /dev/null
+++ b/kernel/power/em_netlink.c
@@ -0,0 +1,34 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ *
+ * Generic netlink for energy model.
+ *
+ * Copyright (c) 2025 Valve Corporation.
+ * Author: Changwoo Min
+ */
+
+#define pr_fmt(fmt) "energy_model: " fmt
+
+#include
+#include
+#include
+#include
+
+#include "em_netlink.h"
+#include "em_netlink_autogen.h"
+
+int em_nl_get_pds_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	return -EOPNOTSUPP;
+}
+
+int em_nl_get_pd_table_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	return -EOPNOTSUPP;
+}
+
+static int __init em_netlink_init(void)
+{
+	return genl_register_family(&em_nl_family);
+}
+postcore_initcall(em_netlink_init);

diff --git a/kernel/power/em_netlink.h b/kernel/power/em_netlink.h
new file mode 100644
index 000000000000..acd186c92d6b
--- /dev/null
+++ b/kernel/power/em_netlink.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ *
+ * Generic netlink for energy model.
+ *
+ * Copyright (c) 2025 Valve Corporation.
+ * Author: Changwoo Min
+ */
+#ifndef _EM_NETLINK_H
+#define _EM_NETLINK_H
+
+#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_NET)
+#else
+#endif
+
+#endif /* _EM_NETLINK_H */

From 7928339cfe7d80f35bca905f0a2358271a4e6aa7 Mon Sep 17 00:00:00 2001
From: Changwoo Min
Date: Tue, 21 Oct 2025 07:09:09 +0900
Subject: [PATCH 18/96] PM: EM: Add an iterator and accessor for the
 performance domain

Add an iterator function (for_each_em_perf_domain) that iterates over
all the performance domains in the global list. A callback function
(cb), passed in by the caller, is called for each performance domain.

Additionally, add a lookup function (em_perf_domain_get_by_id) that
searches the global list for a performance domain with a matching ID.

Signed-off-by: Changwoo Min
Reviewed-by: Lukasz Luba
Link: https://patch.msgid.link/20251020220914.320832-6-changwoo@igalia.com
Signed-off-by: Rafael J. Wysocki
---
 kernel/power/em_netlink.h   | 14 ++++++++++++++
 kernel/power/energy_model.c | 38 +++++++++++++++++++++++++++++++++++++
 2 files changed, 52 insertions(+)

diff --git a/kernel/power/em_netlink.h b/kernel/power/em_netlink.h
index acd186c92d6b..8114b018c73b 100644
--- a/kernel/power/em_netlink.h
+++ b/kernel/power/em_netlink.h
@@ -10,7 +10,21 @@
 #define _EM_NETLINK_H
 
 #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_NET)
+int for_each_em_perf_domain(int (*cb)(struct em_perf_domain*, void *),
+			    void *data);
+struct em_perf_domain *em_perf_domain_get_by_id(int id);
 #else
+static inline
+int for_each_em_perf_domain(int (*cb)(struct em_perf_domain*, void *),
+			    void *data)
+{
+	return -EINVAL;
+}
+static inline
+struct em_perf_domain *em_perf_domain_get_by_id(int id)
+{
+	return NULL;
+}
 #endif
 
 #endif /* _EM_NETLINK_H */

diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
index 756debf5406a..9e35aba4b113 100644
--- a/kernel/power/energy_model.c
+++ b/kernel/power/energy_model.c
@@ -17,6 +17,8 @@
 #include
 #include
+#include "em_netlink.h"
+
 /*
  * Mutex serializing the registrations of performance domains and letting
  * callbacks defined by drivers sleep.
@@ -998,3 +1000,39 @@ void em_rebuild_sched_domains(void)
	 */
 	schedule_work(&rebuild_sd_work);
 }
+
+#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_NET)
+int for_each_em_perf_domain(int (*cb)(struct em_perf_domain*, void *),
+			    void *data)
+{
+	struct em_perf_domain *pd;
+
+	lockdep_assert_not_held(&em_pd_mutex);
+	guard(mutex)(&em_pd_list_mutex);
+
+	list_for_each_entry(pd, &em_pd_list, node) {
+		int ret;
+
+		ret = cb(pd, data);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+struct em_perf_domain *em_perf_domain_get_by_id(int id)
+{
+	struct em_perf_domain *pd;
+
+	lockdep_assert_not_held(&em_pd_mutex);
+	guard(mutex)(&em_pd_list_mutex);
+
+	list_for_each_entry(pd, &em_pd_list, node) {
+		if (pd->id == id)
+			return pd;
+	}
+
+	return NULL;
+}
+#endif

From d8eef0453132dc95354e4c7ae839815e679179c6 Mon Sep 17 00:00:00 2001
From: Changwoo Min
Date: Tue, 21 Oct 2025 07:09:10 +0900
Subject: [PATCH 19/96] PM: EM: Implement em_nl_get_pds_doit()

When userspace requests EM_CMD_GET_PDS, the kernel responds with
information on all performance domains. The message format of the
response is as follows:

  EM_A_PDS_PD (NLA_NESTED)*
    EM_A_PD_PD_ID (NLA_U32)
    EM_A_PD_FLAGS (NLA_U64)
    EM_A_PD_CPUS (NLA_STRING)

where EM_A_PDS_PD can be repeated as many times as there are performance
domains, and EM_A_PD_CPUS is a hexadecimal string representing a CPU
bitmask.
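As a rough illustration (an untested sketch using libnl, not part of
this change, with error handling omitted), a userspace client could dump
the performance domains along these lines:

  #include <stdio.h>
  #include <netlink/netlink.h>
  #include <netlink/genl/genl.h>
  #include <netlink/genl/ctrl.h>
  #include <linux/energy_model.h>

  /* Print each EM_A_PDS_PD nest in the EM_CMD_GET_PDS reply. */
  static int get_pds_cb(struct nl_msg *msg, void *arg)
  {
          struct genlmsghdr *gnlh = nlmsg_data(nlmsg_hdr(msg));
          struct nlattr *attr, *tb[EM_A_PD_MAX + 1];
          int rem;

          nla_for_each_attr(attr, genlmsg_attrdata(gnlh, 0),
                            genlmsg_attrlen(gnlh, 0), rem) {
                  if (nla_type(attr) != EM_A_PDS_PD)
                          continue;
                  if (nla_parse_nested(tb, EM_A_PD_MAX, attr, NULL) < 0)
                          continue;
                  if (!tb[EM_A_PD_PD_ID] || !tb[EM_A_PD_CPUS])
                          continue;
                  printf("pd %u: cpus %s\n",
                         nla_get_u32(tb[EM_A_PD_PD_ID]),
                         nla_get_string(tb[EM_A_PD_CPUS]));
          }
          return NL_OK;
  }

  int main(void)
  {
          struct nl_sock *sk = nl_socket_alloc();
          struct nl_msg *msg = nlmsg_alloc();
          int fam;

          genl_connect(sk);
          fam = genl_ctrl_resolve(sk, EM_FAMILY_NAME);
          nl_socket_modify_cb(sk, NL_CB_VALID, NL_CB_CUSTOM,
                              get_pds_cb, NULL);

          genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, fam, 0, 0,
                      EM_CMD_GET_PDS, EM_FAMILY_VERSION);
          nl_send_auto(sk, msg);
          nl_recvmsgs_default(sk); /* runs get_pds_cb on the reply */

          nlmsg_free(msg);
          nl_socket_free(sk);
          return 0;
  }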
Signed-off-by: Changwoo Min
Reviewed-by: Lukasz Luba
Link: https://patch.msgid.link/20251020220914.320832-7-changwoo@igalia.com
Signed-off-by: Rafael J. Wysocki
---
 kernel/power/em_netlink.c | 82 ++++++++++++++++++++++++++++++++++++++-
 1 file changed, 81 insertions(+), 1 deletion(-)

diff --git a/kernel/power/em_netlink.c b/kernel/power/em_netlink.c
index f8c98ae96aca..16bdcf47f4d8 100644
--- a/kernel/power/em_netlink.c
+++ b/kernel/power/em_netlink.c
@@ -17,9 +17,89 @@
 #include "em_netlink.h"
 #include "em_netlink_autogen.h"
 
+#define EM_A_PD_CPUS_LEN	256
+
+/*************************** Command encoding ********************************/
+static int __em_nl_get_pd_size(struct em_perf_domain *pd, void *data)
+{
+	char cpus_buf[EM_A_PD_CPUS_LEN];
+	int *tot_msg_sz = data;
+	int msg_sz, cpus_sz;
+
+	cpus_sz = snprintf(cpus_buf, sizeof(cpus_buf), "%*pb",
+			   cpumask_pr_args(to_cpumask(pd->cpus)));
+
+	msg_sz = nla_total_size(0) +			/* EM_A_PDS_PD */
+		 nla_total_size(sizeof(u32)) +		/* EM_A_PD_PD_ID */
+		 nla_total_size_64bit(sizeof(u64)) +	/* EM_A_PD_FLAGS */
+		 nla_total_size(cpus_sz);		/* EM_A_PD_CPUS */
+
+	*tot_msg_sz += nlmsg_total_size(genlmsg_msg_size(msg_sz));
+	return 0;
+}
+
+static int __em_nl_get_pd(struct em_perf_domain *pd, void *data)
+{
+	char cpus_buf[EM_A_PD_CPUS_LEN];
+	struct sk_buff *msg = data;
+	struct nlattr *entry;
+
+	entry = nla_nest_start(msg, EM_A_PDS_PD);
+	if (!entry)
+		goto out_cancel_nest;
+
+	if (nla_put_u32(msg, EM_A_PD_PD_ID, pd->id))
+		goto out_cancel_nest;
+
+	if (nla_put_u64_64bit(msg, EM_A_PD_FLAGS, pd->flags, EM_A_PD_PAD))
+		goto out_cancel_nest;
+
+	snprintf(cpus_buf, sizeof(cpus_buf), "%*pb",
+		 cpumask_pr_args(to_cpumask(pd->cpus)));
+	if (nla_put_string(msg, EM_A_PD_CPUS, cpus_buf))
+		goto out_cancel_nest;
+
+	nla_nest_end(msg, entry);
+
+	return 0;
+
+out_cancel_nest:
+	nla_nest_cancel(msg, entry);
+
+	return -EMSGSIZE;
+}
+
 int em_nl_get_pds_doit(struct sk_buff *skb, struct genl_info *info)
 {
-	return -EOPNOTSUPP;
+	struct sk_buff *msg;
+	void *hdr;
+	int cmd = info->genlhdr->cmd;
+	int ret = -EMSGSIZE, msg_sz = 0;
+
+	for_each_em_perf_domain(__em_nl_get_pd_size, &msg_sz);
+
+	msg = genlmsg_new(msg_sz, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+
+	hdr = genlmsg_put_reply(msg, info, &em_nl_family, 0, cmd);
+	if (!hdr)
+		goto out_free_msg;
+
+	ret = for_each_em_perf_domain(__em_nl_get_pd, msg);
+	if (ret)
+		goto out_cancel_msg;
+
+	genlmsg_end(msg, hdr);
+
+	return genlmsg_reply(msg, info);
+
+out_cancel_msg:
+	genlmsg_cancel(msg, hdr);
+out_free_msg:
+	nlmsg_free(msg);
+
+	return ret;
 }
 
 int em_nl_get_pd_table_doit(struct sk_buff *skb, struct genl_info *info)

From f2d2946eaa5c9277e5eb565796ea5d86b13f4854 Mon Sep 17 00:00:00 2001
From: Changwoo Min
Date: Tue, 21 Oct 2025 07:09:11 +0900
Subject: [PATCH 20/96] PM: EM: Implement em_nl_get_pd_table_doit()

When userspace requests EM_CMD_GET_PD_TABLE with the ID of a performance
domain, the kernel reports back the energy model table of the specified
performance domain. The message format of the response is as follows:

  EM_A_PD_TABLE_PD_ID (NLA_U32)
  EM_A_PD_TABLE_PS (NLA_NESTED)*
    EM_A_PS_PERFORMANCE (NLA_U64)
    EM_A_PS_FREQUENCY (NLA_U64)
    EM_A_PS_POWER (NLA_U64)
    EM_A_PS_COST (NLA_U64)
    EM_A_PS_FLAGS (NLA_U64)

where EM_A_PD_TABLE_PS can be repeated as many times as there are
performance states (struct em_perf_state).

Signed-off-by: Changwoo Min
Reviewed-by: Lukasz Luba
Link: https://patch.msgid.link/20251020220914.320832-8-changwoo@igalia.com
Signed-off-by: Rafael J.
Wysocki --- kernel/power/em_netlink.c | 108 +++++++++++++++++++++++++++++++++++++- 1 file changed, 107 insertions(+), 1 deletion(-) diff --git a/kernel/power/em_netlink.c b/kernel/power/em_netlink.c index 16bdcf47f4d8..e144624f0335 100644 --- a/kernel/power/em_netlink.c +++ b/kernel/power/em_netlink.c @@ -102,9 +102,115 @@ int em_nl_get_pds_doit(struct sk_buff *skb, struct genl_info *info) return ret; } +static struct em_perf_domain *__em_nl_get_pd_table_id(struct nlattr **attrs) +{ + struct em_perf_domain *pd; + int id; + + if (!attrs[EM_A_PD_TABLE_PD_ID]) + return NULL; + + id = nla_get_u32(attrs[EM_A_PD_TABLE_PD_ID]); + pd = em_perf_domain_get_by_id(id); + return pd; +} + +static int __em_nl_get_pd_table_size(const struct em_perf_domain *pd) +{ + int id_sz, ps_sz; + + id_sz = nla_total_size(sizeof(u32)); /* EM_A_PD_TABLE_PD_ID */ + ps_sz = nla_total_size(0) + /* EM_A_PD_TABLE_PS */ + nla_total_size_64bit(sizeof(u64)) + /* EM_A_PS_PERFORMANCE */ + nla_total_size_64bit(sizeof(u64)) + /* EM_A_PS_FREQUENCY */ + nla_total_size_64bit(sizeof(u64)) + /* EM_A_PS_POWER */ + nla_total_size_64bit(sizeof(u64)) + /* EM_A_PS_COST */ + nla_total_size_64bit(sizeof(u64)); /* EM_A_PS_FLAGS */ + ps_sz *= pd->nr_perf_states; + + return nlmsg_total_size(genlmsg_msg_size(id_sz + ps_sz)); +} + +static int __em_nl_get_pd_table(struct sk_buff *msg, const struct em_perf_domain *pd) +{ + struct em_perf_state *table, *ps; + struct nlattr *entry; + int i; + + if (nla_put_u32(msg, EM_A_PD_TABLE_PD_ID, pd->id)) + goto out_err; + + rcu_read_lock(); + table = em_perf_state_from_pd((struct em_perf_domain *)pd); + + for (i = 0; i < pd->nr_perf_states; i++) { + ps = &table[i]; + + entry = nla_nest_start(msg, EM_A_PD_TABLE_PS); + if (!entry) + goto out_unlock_ps; + + if (nla_put_u64_64bit(msg, EM_A_PS_PERFORMANCE, + ps->performance, EM_A_PS_PAD)) + goto out_cancel_ps_nest; + if (nla_put_u64_64bit(msg, EM_A_PS_FREQUENCY, + ps->frequency, EM_A_PS_PAD)) + goto out_cancel_ps_nest; + if (nla_put_u64_64bit(msg, EM_A_PS_POWER, + ps->power, EM_A_PS_PAD)) + goto out_cancel_ps_nest; + if (nla_put_u64_64bit(msg, EM_A_PS_COST, + ps->cost, EM_A_PS_PAD)) + goto out_cancel_ps_nest; + if (nla_put_u64_64bit(msg, EM_A_PS_FLAGS, + ps->flags, EM_A_PS_PAD)) + goto out_cancel_ps_nest; + + nla_nest_end(msg, entry); + } + rcu_read_unlock(); + return 0; + +out_cancel_ps_nest: + nla_nest_cancel(msg, entry); +out_unlock_ps: + rcu_read_unlock(); +out_err: + return -EMSGSIZE; +} + int em_nl_get_pd_table_doit(struct sk_buff *skb, struct genl_info *info) { - return -EOPNOTSUPP; + int cmd = info->genlhdr->cmd; + int msg_sz, ret = -EMSGSIZE; + struct em_perf_domain *pd; + struct sk_buff *msg; + void *hdr; + + pd = __em_nl_get_pd_table_id(info->attrs); + if (!pd) + return -EINVAL; + + msg_sz = __em_nl_get_pd_table_size(pd); + + msg = genlmsg_new(msg_sz, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + hdr = genlmsg_put_reply(msg, info, &em_nl_family, 0, cmd); + if (!hdr) + goto out_free_msg; + + ret = __em_nl_get_pd_table(msg, pd); + if (ret) + goto out_free_msg; + + genlmsg_end(msg, hdr); + return genlmsg_reply(msg, info); + +out_free_msg: + nlmsg_free(msg); + return ret; } static int __init em_netlink_init(void) From b2b1bbcac758798e27ad9c29a88340fcb13c8321 Mon Sep 17 00:00:00 2001 From: Changwoo Min Date: Tue, 21 Oct 2025 07:09:12 +0900 Subject: [PATCH 21/96] PM: EM: Implement em_notify_pd_deleted() Add the event notification infrastructure and implement the event notification for when a performance domain is deleted (EM_CMD_PD_DELETED). 
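For instance (an untested sketch using libnl, not part of this change;
event_cb stands for a hypothetical handler and error handling is
omitted), a monitor could subscribe to the multicast group like this:

  #include <netlink/netlink.h>
  #include <netlink/genl/genl.h>
  #include <netlink/genl/ctrl.h>
  #include <linux/energy_model.h>

  static void em_listen_events(nl_recvmsg_msg_cb_t event_cb)
  {
          struct nl_sock *sk = nl_socket_alloc();
          int grp;

          genl_connect(sk);
          grp = genl_ctrl_resolve_grp(sk, EM_FAMILY_NAME, EM_MCGRP_EVENT);
          nl_socket_add_membership(sk, grp);
          /* Notifications are unsolicited; sequence checks must be off. */
          nl_socket_disable_seq_check(sk);
          nl_socket_modify_cb(sk, NL_CB_VALID, NL_CB_CUSTOM,
                              event_cb, NULL);
          for (;;)
                  nl_recvmsgs_default(sk); /* event_cb sees EM_CMD_PD_* */
  }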
The event contains the ID of the performance domain (EM_A_PD_TABLE_PD_ID) so the userspace can identify the changed performance domain for further processing. Signed-off-by: Changwoo Min Reviewed-by: Lukasz Luba Link: https://patch.msgid.link/20251020220914.320832-9-changwoo@igalia.com Signed-off-by: Rafael J. Wysocki --- kernel/power/em_netlink.c | 44 +++++++++++++++++++++++++++++++++++++++ kernel/power/em_netlink.h | 3 +++ 2 files changed, 47 insertions(+) diff --git a/kernel/power/em_netlink.c b/kernel/power/em_netlink.c index e144624f0335..43118b028bb6 100644 --- a/kernel/power/em_netlink.c +++ b/kernel/power/em_netlink.c @@ -213,6 +213,50 @@ int em_nl_get_pd_table_doit(struct sk_buff *skb, struct genl_info *info) return ret; } + +/**************************** Event encoding *********************************/ +static int __em_notify_pd_deleted_size(const struct em_perf_domain *pd) +{ + int id_sz = nla_total_size(sizeof(u32)); /* EM_A_PD_TABLE_PD_ID */ + + return nlmsg_total_size(genlmsg_msg_size(id_sz)); +} + +void em_notify_pd_deleted(const struct em_perf_domain *pd) +{ + struct sk_buff *msg; + void *hdr; + int msg_sz; + + if (!genl_has_listeners(&em_nl_family, &init_net, EM_NLGRP_EVENT)) + return; + + msg_sz = __em_notify_pd_deleted_size(pd); + + msg = genlmsg_new(msg_sz, GFP_KERNEL); + if (!msg) + return; + + hdr = genlmsg_put(msg, 0, 0, &em_nl_family, 0, EM_CMD_PD_DELETED); + if (!hdr) + goto out_free_msg; + + if (nla_put_u32(msg, EM_A_PD_TABLE_PD_ID, pd->id)) { + goto out_free_msg; + } + + genlmsg_end(msg, hdr); + + genlmsg_multicast(&em_nl_family, msg, 0, EM_NLGRP_EVENT, GFP_KERNEL); + + return; + +out_free_msg: + nlmsg_free(msg); + return; +} + +/**************************** Initialization *********************************/ static int __init em_netlink_init(void) { return genl_register_family(&em_nl_family); diff --git a/kernel/power/em_netlink.h b/kernel/power/em_netlink.h index 8114b018c73b..d56e5865e1ed 100644 --- a/kernel/power/em_netlink.h +++ b/kernel/power/em_netlink.h @@ -13,6 +13,7 @@ int for_each_em_perf_domain(int (*cb)(struct em_perf_domain*, void *), void *data); struct em_perf_domain *em_perf_domain_get_by_id(int id); +void em_notify_pd_deleted(const struct em_perf_domain *pd); #else static inline int for_each_em_perf_domain(int (*cb)(struct em_perf_domain*, void *), @@ -25,6 +26,8 @@ struct em_perf_domain *em_perf_domain_get_by_id(int id) { return NULL; } + +static inline void em_notify_pd_deleted(const struct em_perf_domain *pd) {} #endif #endif /* _EM_NETLINK_H */ From b95a0c02ada527b7cfc2e329d262324661fe30ce Mon Sep 17 00:00:00 2001 From: Changwoo Min Date: Tue, 21 Oct 2025 07:09:13 +0900 Subject: [PATCH 22/96] PM: EM: Implement em_notify_pd_created/updated() Implement two event notifications when a performance domain is created (EM_CMD_PD_CREATED) and updated (EM_CMD_PD_UPDATED). The message format of these two event notifications is the same as EM_CMD_GET_PD_TABLE -- containing the performance domain's ID and its energy model table. Signed-off-by: Changwoo Min Reviewed-by: Lukasz Luba Link: https://patch.msgid.link/20251020220914.320832-10-changwoo@igalia.com Signed-off-by: Rafael J. 
Wysocki --- kernel/power/em_netlink.c | 44 +++++++++++++++++++++++++++++++++++++++ kernel/power/em_netlink.h | 6 ++++++ 2 files changed, 50 insertions(+) diff --git a/kernel/power/em_netlink.c b/kernel/power/em_netlink.c index 43118b028bb6..4b85da138a06 100644 --- a/kernel/power/em_netlink.c +++ b/kernel/power/em_netlink.c @@ -215,6 +215,50 @@ int em_nl_get_pd_table_doit(struct sk_buff *skb, struct genl_info *info) /**************************** Event encoding *********************************/ +static void __em_notify_pd_table(const struct em_perf_domain *pd, int ntf_type) +{ + struct sk_buff *msg; + int msg_sz, ret = -EMSGSIZE; + void *hdr; + + if (!genl_has_listeners(&em_nl_family, &init_net, EM_NLGRP_EVENT)) + return; + + msg_sz = __em_nl_get_pd_table_size(pd); + + msg = genlmsg_new(msg_sz, GFP_KERNEL); + if (!msg) + return; + + hdr = genlmsg_put(msg, 0, 0, &em_nl_family, 0, ntf_type); + if (!hdr) + goto out_free_msg; + + ret = __em_nl_get_pd_table(msg, pd); + if (ret) + goto out_free_msg; + + genlmsg_end(msg, hdr); + + genlmsg_multicast(&em_nl_family, msg, 0, EM_NLGRP_EVENT, GFP_KERNEL); + + return; + +out_free_msg: + nlmsg_free(msg); + return; +} + +void em_notify_pd_created(const struct em_perf_domain *pd) +{ + __em_notify_pd_table(pd, EM_CMD_PD_CREATED); +} + +void em_notify_pd_updated(const struct em_perf_domain *pd) +{ + __em_notify_pd_table(pd, EM_CMD_PD_UPDATED); +} + static int __em_notify_pd_deleted_size(const struct em_perf_domain *pd) { int id_sz = nla_total_size(sizeof(u32)); /* EM_A_PD_TABLE_PD_ID */ diff --git a/kernel/power/em_netlink.h b/kernel/power/em_netlink.h index d56e5865e1ed..583d7f1c3939 100644 --- a/kernel/power/em_netlink.h +++ b/kernel/power/em_netlink.h @@ -13,7 +13,9 @@ int for_each_em_perf_domain(int (*cb)(struct em_perf_domain*, void *), void *data); struct em_perf_domain *em_perf_domain_get_by_id(int id); +void em_notify_pd_created(const struct em_perf_domain *pd); void em_notify_pd_deleted(const struct em_perf_domain *pd); +void em_notify_pd_updated(const struct em_perf_domain *pd); #else static inline int for_each_em_perf_domain(int (*cb)(struct em_perf_domain*, void *), @@ -27,7 +29,11 @@ struct em_perf_domain *em_perf_domain_get_by_id(int id) return NULL; } +static inline void em_notify_pd_created(const struct em_perf_domain *pd) {} + static inline void em_notify_pd_deleted(const struct em_perf_domain *pd) {} + +static inline void em_notify_pd_updated(const struct em_perf_domain *pd) {} #endif #endif /* _EM_NETLINK_H */ From a1b17c9ac87a21b677077bc47d7579a2897a13f3 Mon Sep 17 00:00:00 2001 From: Changwoo Min Date: Tue, 21 Oct 2025 07:09:14 +0900 Subject: [PATCH 23/96] PM: EM: Notify an event when the performance domain changes Send an event to userspace when a performance domain is created or deleted, or its energy model is updated. Signed-off-by: Changwoo Min Reviewed-by: Lukasz Luba Link: https://patch.msgid.link/20251020220914.320832-11-changwoo@igalia.com Signed-off-by: Rafael J. 
Wysocki --- kernel/power/energy_model.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index 9e35aba4b113..e669d5057fca 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -352,6 +352,8 @@ int em_dev_update_perf_domain(struct device *dev, em_table_free(old_table); mutex_unlock(&em_pd_mutex); + + em_notify_pd_updated(pd); return 0; } EXPORT_SYMBOL_GPL(em_dev_update_perf_domain); @@ -696,6 +698,7 @@ int em_dev_register_pd_no_update(struct device *dev, unsigned int nr_states, list_add_tail(&dev->em_pd->node, &em_pd_list); mutex_unlock(&em_pd_list_mutex); + em_notify_pd_created(dev->em_pd); return ret; } EXPORT_SYMBOL_GPL(em_dev_register_pd_no_update); @@ -718,6 +721,8 @@ void em_dev_unregister_perf_domain(struct device *dev) list_del_init(&dev->em_pd->node); mutex_unlock(&em_pd_list_mutex); + em_notify_pd_deleted(dev->em_pd); + /* * The mutex separates all register/unregister requests and protects * from potential clean-up/setup issues in the debugfs directories. From 173e02d674946ff3ef8da7f44a9d5b820b9af21c Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 22 Oct 2025 12:57:41 +0530 Subject: [PATCH 24/96] OPP: Initialize scope-based pointers inline Uninitialized pointers with `__free` attribute can cause undefined behaviour as the memory allocated to the pointer is freed automatically when the pointer goes out of scope. The OPP core doesn't have any bugs related to this as of now, but it is better to initialize pointers marked with `__free` attribute at declaration to simplify the code and ensure proper scope-based cleanup. Reported-by: Joe Perches Reported-by: Dan Carpenter Signed-off-by: Viresh Kumar --- drivers/opp/core.c | 69 ++++++++++++++----------- drivers/opp/cpu.c | 16 +++--- drivers/opp/of.c | 125 +++++++++++++++++++++++++-------------------- 3 files changed, 117 insertions(+), 93 deletions(-) diff --git a/drivers/opp/core.c b/drivers/opp/core.c index bba4f7daff8c..dbebb8c829bc 100644 --- a/drivers/opp/core.c +++ b/drivers/opp/core.c @@ -309,9 +309,9 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_is_turbo); */ unsigned long dev_pm_opp_get_max_clock_latency(struct device *dev) { - struct opp_table *opp_table __free(put_opp_table); + struct opp_table *opp_table __free(put_opp_table) = + _find_opp_table(dev); - opp_table = _find_opp_table(dev); if (IS_ERR(opp_table)) return 0; @@ -327,7 +327,6 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_get_max_clock_latency); */ unsigned long dev_pm_opp_get_max_volt_latency(struct device *dev) { - struct opp_table *opp_table __free(put_opp_table); struct dev_pm_opp *opp; struct regulator *reg; unsigned long latency_ns = 0; @@ -337,7 +336,9 @@ unsigned long dev_pm_opp_get_max_volt_latency(struct device *dev) unsigned long max; } *uV; - opp_table = _find_opp_table(dev); + struct opp_table *opp_table __free(put_opp_table) = + _find_opp_table(dev); + if (IS_ERR(opp_table)) return 0; @@ -409,10 +410,11 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_get_max_transition_latency); */ unsigned long dev_pm_opp_get_suspend_opp_freq(struct device *dev) { - struct opp_table *opp_table __free(put_opp_table); unsigned long freq = 0; - opp_table = _find_opp_table(dev); + struct opp_table *opp_table __free(put_opp_table) = + _find_opp_table(dev); + if (IS_ERR(opp_table)) return 0; @@ -447,9 +449,9 @@ int _get_opp_count(struct opp_table *opp_table) */ int dev_pm_opp_get_opp_count(struct device *dev) { - struct opp_table *opp_table __free(put_opp_table); + struct opp_table *opp_table __free(put_opp_table) = + 
_find_opp_table(dev); - opp_table = _find_opp_table(dev); if (IS_ERR(opp_table)) { dev_dbg(dev, "%s: OPP table not found (%ld)\n", __func__, PTR_ERR(opp_table)); @@ -605,9 +607,9 @@ _find_key(struct device *dev, unsigned long *key, int index, bool available, unsigned long opp_key, unsigned long key), bool (*assert)(struct opp_table *opp_table, unsigned int index)) { - struct opp_table *opp_table __free(put_opp_table); + struct opp_table *opp_table __free(put_opp_table) = + _find_opp_table(dev); - opp_table = _find_opp_table(dev); if (IS_ERR(opp_table)) { dev_err(dev, "%s: OPP table not found (%ld)\n", __func__, PTR_ERR(opp_table)); @@ -1410,12 +1412,13 @@ static int _set_opp(struct device *dev, struct opp_table *opp_table, */ int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq) { - struct opp_table *opp_table __free(put_opp_table); struct dev_pm_opp *opp __free(put_opp) = NULL; unsigned long freq = 0, temp_freq; bool forced = false; - opp_table = _find_opp_table(dev); + struct opp_table *opp_table __free(put_opp_table) = + _find_opp_table(dev); + if (IS_ERR(opp_table)) { dev_err(dev, "%s: device's opp table doesn't exist\n", __func__); return PTR_ERR(opp_table); @@ -1477,9 +1480,9 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_set_rate); */ int dev_pm_opp_set_opp(struct device *dev, struct dev_pm_opp *opp) { - struct opp_table *opp_table __free(put_opp_table); + struct opp_table *opp_table __free(put_opp_table) = + _find_opp_table(dev); - opp_table = _find_opp_table(dev); if (IS_ERR(opp_table)) { dev_err(dev, "%s: device opp doesn't exist\n", __func__); return PTR_ERR(opp_table); @@ -1794,10 +1797,11 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_put); */ void dev_pm_opp_remove(struct device *dev, unsigned long freq) { - struct opp_table *opp_table __free(put_opp_table); struct dev_pm_opp *opp = NULL, *iter; - opp_table = _find_opp_table(dev); + struct opp_table *opp_table __free(put_opp_table) = + _find_opp_table(dev); + if (IS_ERR(opp_table)) return; @@ -1885,9 +1889,9 @@ bool _opp_remove_all_static(struct opp_table *opp_table) */ void dev_pm_opp_remove_all_dynamic(struct device *dev) { - struct opp_table *opp_table __free(put_opp_table); + struct opp_table *opp_table __free(put_opp_table) = + _find_opp_table(dev); - opp_table = _find_opp_table(dev); if (IS_ERR(opp_table)) return; @@ -2871,10 +2875,11 @@ static int _opp_set_availability(struct device *dev, unsigned long freq, bool availability_req) { struct dev_pm_opp *opp __free(put_opp) = ERR_PTR(-ENODEV), *tmp_opp; - struct opp_table *opp_table __free(put_opp_table); /* Find the opp_table */ - opp_table = _find_opp_table(dev); + struct opp_table *opp_table __free(put_opp_table) = + _find_opp_table(dev); + if (IS_ERR(opp_table)) { dev_warn(dev, "%s: Device OPP not found (%ld)\n", __func__, PTR_ERR(opp_table)); @@ -2932,11 +2937,12 @@ int dev_pm_opp_adjust_voltage(struct device *dev, unsigned long freq, { struct dev_pm_opp *opp __free(put_opp) = ERR_PTR(-ENODEV), *tmp_opp; - struct opp_table *opp_table __free(put_opp_table); int r; /* Find the opp_table */ - opp_table = _find_opp_table(dev); + struct opp_table *opp_table __free(put_opp_table) = + _find_opp_table(dev); + if (IS_ERR(opp_table)) { r = PTR_ERR(opp_table); dev_warn(dev, "%s: Device OPP not found (%d)\n", __func__, r); @@ -2986,12 +2992,13 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_adjust_voltage); */ int dev_pm_opp_sync_regulators(struct device *dev) { - struct opp_table *opp_table __free(put_opp_table); struct regulator *reg; int ret, i; /* Device may not have OPP table */ - opp_table = 
_find_opp_table(dev); + struct opp_table *opp_table __free(put_opp_table) = + _find_opp_table(dev); + if (IS_ERR(opp_table)) return 0; @@ -3062,9 +3069,9 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_disable); */ int dev_pm_opp_register_notifier(struct device *dev, struct notifier_block *nb) { - struct opp_table *opp_table __free(put_opp_table); + struct opp_table *opp_table __free(put_opp_table) = + _find_opp_table(dev); - opp_table = _find_opp_table(dev); if (IS_ERR(opp_table)) return PTR_ERR(opp_table); @@ -3082,9 +3089,9 @@ EXPORT_SYMBOL(dev_pm_opp_register_notifier); int dev_pm_opp_unregister_notifier(struct device *dev, struct notifier_block *nb) { - struct opp_table *opp_table __free(put_opp_table); + struct opp_table *opp_table __free(put_opp_table) = + _find_opp_table(dev); - opp_table = _find_opp_table(dev); if (IS_ERR(opp_table)) return PTR_ERR(opp_table); @@ -3101,10 +3108,10 @@ EXPORT_SYMBOL(dev_pm_opp_unregister_notifier); */ void dev_pm_opp_remove_table(struct device *dev) { - struct opp_table *opp_table __free(put_opp_table); - /* Check for existing table for 'dev' */ - opp_table = _find_opp_table(dev); + struct opp_table *opp_table __free(put_opp_table) = + _find_opp_table(dev); + if (IS_ERR(opp_table)) { int error = PTR_ERR(opp_table); diff --git a/drivers/opp/cpu.c b/drivers/opp/cpu.c index 97989d4fe336..a6da7ee3ec76 100644 --- a/drivers/opp/cpu.c +++ b/drivers/opp/cpu.c @@ -56,10 +56,10 @@ int dev_pm_opp_init_cpufreq_table(struct device *dev, return -ENOMEM; for (i = 0, rate = 0; i < max_opps; i++, rate++) { - struct dev_pm_opp *opp __free(put_opp); - /* find next rate */ - opp = dev_pm_opp_find_freq_ceil(dev, &rate); + struct dev_pm_opp *opp __free(put_opp) = + dev_pm_opp_find_freq_ceil(dev, &rate); + if (IS_ERR(opp)) { ret = PTR_ERR(opp); goto out; @@ -154,12 +154,13 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_cpumask_remove_table); int dev_pm_opp_set_sharing_cpus(struct device *cpu_dev, const struct cpumask *cpumask) { - struct opp_table *opp_table __free(put_opp_table); struct opp_device *opp_dev; struct device *dev; int cpu; - opp_table = _find_opp_table(cpu_dev); + struct opp_table *opp_table __free(put_opp_table) = + _find_opp_table(cpu_dev); + if (IS_ERR(opp_table)) return PTR_ERR(opp_table); @@ -201,10 +202,11 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_set_sharing_cpus); */ int dev_pm_opp_get_sharing_cpus(struct device *cpu_dev, struct cpumask *cpumask) { - struct opp_table *opp_table __free(put_opp_table); struct opp_device *opp_dev; - opp_table = _find_opp_table(cpu_dev); + struct opp_table *opp_table __free(put_opp_table) = + _find_opp_table(cpu_dev); + if (IS_ERR(opp_table)) return PTR_ERR(opp_table); diff --git a/drivers/opp/of.c b/drivers/opp/of.c index 505d79821584..1e0d0adb18e1 100644 --- a/drivers/opp/of.c +++ b/drivers/opp/of.c @@ -45,9 +45,10 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_of_get_opp_desc_node); struct opp_table *_managed_opp(struct device *dev, int index) { struct opp_table *opp_table, *managed_table = NULL; - struct device_node *np __free(device_node); - np = _opp_of_get_opp_desc_node(dev->of_node, index); + struct device_node *np __free(device_node) = + _opp_of_get_opp_desc_node(dev->of_node, index); + if (!np) return NULL; @@ -95,10 +96,11 @@ static struct device_node *of_parse_required_opp(struct device_node *np, /* The caller must call dev_pm_opp_put_opp_table() after the table is used */ static struct opp_table *_find_table_of_opp_np(struct device_node *opp_np) { - struct device_node *opp_table_np __free(device_node); struct opp_table *opp_table; - opp_table_np = 
of_get_parent(opp_np); + struct device_node *opp_table_np __free(device_node) = + of_get_parent(opp_np); + if (!opp_table_np) return ERR_PTR(-ENODEV); @@ -146,12 +148,13 @@ static void _opp_table_alloc_required_tables(struct opp_table *opp_table, struct device_node *opp_np) { struct opp_table **required_opp_tables; - struct device_node *np __free(device_node); bool lazy = false; int count, i, size; /* Traversing the first OPP node is all we need */ - np = of_get_next_available_child(opp_np, NULL); + struct device_node *np __free(device_node) = + of_get_next_available_child(opp_np, NULL); + if (!np) { dev_warn(dev, "Empty OPP table\n"); return; @@ -171,9 +174,9 @@ static void _opp_table_alloc_required_tables(struct opp_table *opp_table, opp_table->required_opp_count = count; for (i = 0; i < count; i++) { - struct device_node *required_np __free(device_node); + struct device_node *required_np __free(device_node) = + of_parse_required_opp(np, i); - required_np = of_parse_required_opp(np, i); if (!required_np) { _opp_table_free_required_tables(opp_table); return; @@ -199,14 +202,15 @@ static void _opp_table_alloc_required_tables(struct opp_table *opp_table, void _of_init_opp_table(struct opp_table *opp_table, struct device *dev, int index) { - struct device_node *np __free(device_node), *opp_np; + struct device_node *opp_np; u32 val; /* * Only required for backward compatibility with v1 bindings, but isn't * harmful for other cases. And so we do it unconditionally. */ - np = of_node_get(dev->of_node); + struct device_node *np __free(device_node) = of_node_get(dev->of_node); + if (!np) return; @@ -273,9 +277,9 @@ void _of_clear_opp(struct opp_table *opp_table, struct dev_pm_opp *opp) static int _link_required_opps(struct dev_pm_opp *opp, struct opp_table *required_table, int index) { - struct device_node *np __free(device_node); + struct device_node *np __free(device_node) = + of_parse_required_opp(opp->np, index); - np = of_parse_required_opp(opp->np, index); if (unlikely(!np)) return -ENODEV; @@ -349,16 +353,13 @@ static void lazy_link_required_opp_table(struct opp_table *new_table) guard(mutex)(&opp_table_lock); list_for_each_entry_safe(opp_table, temp, &lazy_opp_tables, lazy) { - struct device_node *opp_np __free(device_node); bool lazy = false; /* opp_np can't be invalid here */ - opp_np = of_get_next_available_child(opp_table->np, NULL); + struct device_node *opp_np __free(device_node) = + of_get_next_available_child(opp_table->np, NULL); for (i = 0; i < opp_table->required_opp_count; i++) { - struct device_node *required_np __free(device_node) = NULL; - struct device_node *required_table_np __free(device_node) = NULL; - required_opp_tables = opp_table->required_opp_tables; /* Required opp-table is already parsed */ @@ -366,8 +367,10 @@ static void lazy_link_required_opp_table(struct opp_table *new_table) continue; /* required_np can't be invalid here */ - required_np = of_parse_required_opp(opp_np, i); - required_table_np = of_get_parent(required_np); + struct device_node *required_np __free(device_node) = + of_parse_required_opp(opp_np, i); + struct device_node *required_table_np __free(device_node) = + of_get_parent(required_np); /* * Newly added table isn't the required opp-table for @@ -402,13 +405,12 @@ static void lazy_link_required_opp_table(struct opp_table *new_table) static int _bandwidth_supported(struct device *dev, struct opp_table *opp_table) { struct device_node *opp_np __free(device_node) = NULL; - struct device_node *np __free(device_node) = NULL; struct property *prop; 
if (!opp_table) { - struct device_node *np __free(device_node); + struct device_node *np __free(device_node) = + of_node_get(dev->of_node); - np = of_node_get(dev->of_node); if (!np) return -ENODEV; @@ -422,7 +424,9 @@ static int _bandwidth_supported(struct device *dev, struct opp_table *opp_table) return 0; /* Checking only first OPP is sufficient */ - np = of_get_next_available_child(opp_np, NULL); + struct device_node *np __free(device_node) = + of_get_next_available_child(opp_np, NULL); + if (!np) { dev_err(dev, "OPP table empty\n"); return -EINVAL; @@ -1269,11 +1273,12 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_of_cpumask_add_table); int dev_pm_opp_of_get_sharing_cpus(struct device *cpu_dev, struct cpumask *cpumask) { - struct device_node *np __free(device_node); int cpu; /* Get OPP descriptor node */ - np = dev_pm_opp_of_get_opp_desc_node(cpu_dev); + struct device_node *np __free(device_node) = + dev_pm_opp_of_get_opp_desc_node(cpu_dev); + if (!np) { dev_dbg(cpu_dev, "%s: Couldn't find opp node.\n", __func__); return -ENOENT; @@ -1286,13 +1291,12 @@ int dev_pm_opp_of_get_sharing_cpus(struct device *cpu_dev, return 0; for_each_possible_cpu(cpu) { - struct device_node *cpu_np __free(device_node) = NULL; - struct device_node *tmp_np __free(device_node) = NULL; - if (cpu == cpu_dev->id) continue; - cpu_np = of_cpu_device_node_get(cpu); + struct device_node *cpu_np __free(device_node) = + of_cpu_device_node_get(cpu); + if (!cpu_np) { dev_err(cpu_dev, "%s: failed to get cpu%d node\n", __func__, cpu); @@ -1300,7 +1304,9 @@ int dev_pm_opp_of_get_sharing_cpus(struct device *cpu_dev, } /* Get OPP descriptor node */ - tmp_np = _opp_of_get_opp_desc_node(cpu_np, 0); + struct device_node *tmp_np __free(device_node) = + _opp_of_get_opp_desc_node(cpu_np, 0); + if (!tmp_np) { pr_err("%pOF: Couldn't find opp node\n", cpu_np); return -ENOENT; @@ -1328,16 +1334,17 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_of_get_sharing_cpus); */ int of_get_required_opp_performance_state(struct device_node *np, int index) { - struct device_node *required_np __free(device_node); - struct opp_table *opp_table __free(put_opp_table) = NULL; - struct dev_pm_opp *opp __free(put_opp) = NULL; int pstate = -EINVAL; - required_np = of_parse_required_opp(np, index); + struct device_node *required_np __free(device_node) = + of_parse_required_opp(np, index); + if (!required_np) return -ENODEV; - opp_table = _find_table_of_opp_np(required_np); + struct opp_table *opp_table __free(put_opp_table) = + _find_table_of_opp_np(required_np); + if (IS_ERR(opp_table)) { pr_err("%s: Failed to find required OPP table %pOF: %ld\n", __func__, np, PTR_ERR(opp_table)); @@ -1350,7 +1357,9 @@ int of_get_required_opp_performance_state(struct device_node *np, int index) return -EINVAL; } - opp = _find_opp_of_np(opp_table, required_np); + struct dev_pm_opp *opp __free(put_opp) = + _find_opp_of_np(opp_table, required_np); + if (opp) { if (opp->level == OPP_LEVEL_UNSET) { pr_err("%s: OPP levels aren't available for %pOF\n", @@ -1376,14 +1385,17 @@ EXPORT_SYMBOL_GPL(of_get_required_opp_performance_state); */ bool dev_pm_opp_of_has_required_opp(struct device *dev) { - struct device_node *np __free(device_node) = NULL, *opp_np __free(device_node); int count; - opp_np = _opp_of_get_opp_desc_node(dev->of_node, 0); + struct device_node *opp_np __free(device_node) = + _opp_of_get_opp_desc_node(dev->of_node, 0); + if (!opp_np) return false; - np = of_get_next_available_child(opp_np, NULL); + struct device_node *np __free(device_node) = + of_get_next_available_child(opp_np, NULL); + if 
(!np) { dev_warn(dev, "Empty OPP table\n"); return false; @@ -1425,12 +1437,14 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_get_of_node); static int __maybe_unused _get_dt_power(struct device *dev, unsigned long *uW, unsigned long *kHz) { - struct dev_pm_opp *opp __free(put_opp); unsigned long opp_freq, opp_power; /* Find the right frequency and related OPP */ opp_freq = *kHz * 1000; - opp = dev_pm_opp_find_freq_ceil(dev, &opp_freq); + + struct dev_pm_opp *opp __free(put_opp) = + dev_pm_opp_find_freq_ceil(dev, &opp_freq); + if (IS_ERR(opp)) return -EINVAL; @@ -1465,14 +1479,13 @@ _get_dt_power(struct device *dev, unsigned long *uW, unsigned long *kHz) int dev_pm_opp_calc_power(struct device *dev, unsigned long *uW, unsigned long *kHz) { - struct dev_pm_opp *opp __free(put_opp) = NULL; - struct device_node *np __free(device_node); unsigned long mV, Hz; u32 cap; u64 tmp; int ret; - np = of_node_get(dev->of_node); + struct device_node *np __free(device_node) = of_node_get(dev->of_node); + if (!np) return -EINVAL; @@ -1481,7 +1494,10 @@ int dev_pm_opp_calc_power(struct device *dev, unsigned long *uW, return -EINVAL; Hz = *kHz * 1000; - opp = dev_pm_opp_find_freq_ceil(dev, &Hz); + + struct dev_pm_opp *opp __free(put_opp) = + dev_pm_opp_find_freq_ceil(dev, &Hz); + if (IS_ERR(opp)) return -EINVAL; @@ -1502,11 +1518,12 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_calc_power); static bool _of_has_opp_microwatt_property(struct device *dev) { - struct dev_pm_opp *opp __free(put_opp); unsigned long freq = 0; /* Check if at least one OPP has needed property */ - opp = dev_pm_opp_find_freq_ceil(dev, &freq); + struct dev_pm_opp *opp __free(put_opp) = + dev_pm_opp_find_freq_ceil(dev, &freq); + if (IS_ERR(opp)) return false; @@ -1526,12 +1543,16 @@ static bool _of_has_opp_microwatt_property(struct device *dev) */ int dev_pm_opp_of_register_em(struct device *dev, struct cpumask *cpus) { - struct device_node *np __free(device_node) = NULL; struct em_data_callback em_cb; int ret, nr_opp; u32 cap; - if (IS_ERR_OR_NULL(dev)) { + if (IS_ERR_OR_NULL(dev)) + return -EINVAL; + + struct device_node *np __free(device_node) = of_node_get(dev->of_node); + + if (!np) { ret = -EINVAL; goto failed; } @@ -1548,12 +1569,6 @@ int dev_pm_opp_of_register_em(struct device *dev, struct cpumask *cpus) goto register_em; } - np = of_node_get(dev->of_node); - if (!np) { - ret = -EINVAL; - goto failed; - } - /* * Register an EM only if the 'dynamic-power-coefficient' property is * set in devicetree. It is assumed the voltage values are known if that From 2de5cb96060a1664880d65b120e59485a73588a8 Mon Sep 17 00:00:00 2001 From: Shuhao Fu Date: Mon, 6 Oct 2025 03:31:17 +0800 Subject: [PATCH 25/96] cpufreq: s5pv210: fix refcount leak In the function `s5pv210_cpu_init()`, a refcount imbalance has been identified that causes a resource leak. Why it is a bug: 1. For every clk_get, there should be a matching clk_put on every subsequent error handling path. 2. After `dmc1_clk` has been obtained with clk_get(), it is not released when a later error occurs. How it is fixed: An extra goto label is added so that `dmc1_clk` is released on every failure path.
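Schematically, the unwind ladder this restores looks as follows (illustrative C with made-up names, not the driver's exact code):

    struct clk *a, *b;
    int ret = 0;

    a = clk_get(dev, "first");
    if (IS_ERR(a))
            return PTR_ERR(a);

    b = clk_get(dev, "second");
    if (IS_ERR(b)) {
            ret = PTR_ERR(b);
            goto out_put_a;         /* only 'a' is held here */
    }

    if (later_check_fails) {
            ret = -EINVAL;
            goto out_put_b;         /* both held: 'b' must be released too */
    }

    return 0;

    out_put_b:
            clk_put(b);
    out_put_a:
            clk_put(a);
            return ret;

Each label releases exactly what was acquired before the failure point it names, so a bail-out taken after the second clk_get() must enter the ladder one rung higher.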
Signed-off-by: Shuhao Fu Signed-off-by: Viresh Kumar --- drivers/cpufreq/s5pv210-cpufreq.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/s5pv210-cpufreq.c b/drivers/cpufreq/s5pv210-cpufreq.c index 4215621deb3f..ba8a1c96427a 100644 --- a/drivers/cpufreq/s5pv210-cpufreq.c +++ b/drivers/cpufreq/s5pv210-cpufreq.c @@ -518,7 +518,7 @@ static int s5pv210_cpu_init(struct cpufreq_policy *policy) if (policy->cpu != 0) { ret = -EINVAL; - goto out_dmc1; + goto out; } /* @@ -530,7 +530,7 @@ static int s5pv210_cpu_init(struct cpufreq_policy *policy) if ((mem_type != LPDDR) && (mem_type != LPDDR2)) { pr_err("CPUFreq doesn't support this memory type\n"); ret = -EINVAL; - goto out_dmc1; + goto out; } /* Find current refresh counter and frequency each DMC */ @@ -544,6 +544,8 @@ static int s5pv210_cpu_init(struct cpufreq_policy *policy) cpufreq_generic_init(policy, s5pv210_freq_table, 40000); return 0; +out: + clk_put(dmc1_clk); out_dmc1: clk_put(dmc0_clk); out_dmc0: From 6e7970cab51d01b8f7c56f120486c571c22e1b80 Mon Sep 17 00:00:00 2001 From: Hal Feng Date: Thu, 16 Oct 2025 16:00:48 +0800 Subject: [PATCH 26/96] cpufreq: dt-platdev: Add JH7110S SoC to the allowlist Add the compatible string to support the generic cpufreq driver on the StarFive JH7110S SoC. Signed-off-by: Hal Feng Reviewed-by: Heinrich Schuchardt Signed-off-by: Viresh Kumar --- drivers/cpufreq/cpufreq-dt-platdev.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/cpufreq/cpufreq-dt-platdev.c b/drivers/cpufreq/cpufreq-dt-platdev.c index cd1816a12bb9..dc11b62399ad 100644 --- a/drivers/cpufreq/cpufreq-dt-platdev.c +++ b/drivers/cpufreq/cpufreq-dt-platdev.c @@ -87,6 +87,7 @@ static const struct of_device_id allowlist[] __initconst = { { .compatible = "st-ericsson,u9540", }, { .compatible = "starfive,jh7110", }, + { .compatible = "starfive,jh7110s", }, { .compatible = "ti,omap2", }, { .compatible = "ti,omap4", }, From 85976d3774be8fe290eb0468c1b0a0c36f40cbfe Mon Sep 17 00:00:00 2001 From: Aaron Kling Date: Tue, 21 Oct 2025 20:08:36 -0500 Subject: [PATCH 27/96] cpufreq: tegra186: add OPP support and set bandwidth Add support for using the OPP table from DT in the Tegra186 cpufreq driver. Tegra SoCs receive the frequency lookup table (LUT) from BPMP-FW. Cross-check the OPPs present in DT against the LUT from BPMP-FW and enable only those DT OPPs that are also present in the LUT. The OPP table in DT maps CPU frequency to bandwidth, where the bandwidth value is per MC channel. DRAM bandwidth depends on the number of MC channels, which can vary with the boot configuration. This per-channel bandwidth from the OPP table is later converted by the MC driver to the final bandwidth value, by multiplying it by the number of channels, before being handled in the EMC driver. If the OPP table is not present in DT, use the LUT from BPMP-FW directly as the CPU frequency table and skip DRAM frequency scaling, which is the same as the current behavior.
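For reference, the DT shape this relies on is roughly the following operating-points-v2 fragment (illustrative node names and values; opp-peak-kBps carries the per-MC-channel bandwidth):

    cpu0_opp_table: opp-table-cpu0 {
            compatible = "operating-points-v2";
            opp-shared;

            opp-2035200000 {
                    opp-hz = /bits/ 64 <2035200000>;
                    opp-peak-kBps = <8533000>;
            };
    };

The CPU nodes would point at such a table via operating-points-v2 and carry an interconnects path to the memory controller for dev_pm_opp_of_find_icc_paths() to pick up; absent either piece, the driver falls back to the BPMP-FW LUT as described above.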
Signed-off-by: Aaron Kling [ Viresh: Fix _free() definitions ] Signed-off-by: Viresh Kumar --- drivers/cpufreq/tegra186-cpufreq.c | 150 +++++++++++++++++++++++++++-- 1 file changed, 143 insertions(+), 7 deletions(-) diff --git a/drivers/cpufreq/tegra186-cpufreq.c b/drivers/cpufreq/tegra186-cpufreq.c index 136ab102f636..34ed943c5f34 100644 --- a/drivers/cpufreq/tegra186-cpufreq.c +++ b/drivers/cpufreq/tegra186-cpufreq.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -58,7 +59,7 @@ static const struct tegra186_cpufreq_cpu tegra186_cpus[] = { }; struct tegra186_cpufreq_cluster { - struct cpufreq_frequency_table *table; + struct cpufreq_frequency_table *bpmp_lut; u32 ref_clk_khz; u32 div; }; @@ -66,16 +67,119 @@ struct tegra186_cpufreq_cluster { struct tegra186_cpufreq_data { void __iomem *regs; const struct tegra186_cpufreq_cpu *cpus; + bool icc_dram_bw_scaling; struct tegra186_cpufreq_cluster clusters[]; }; +static int tegra_cpufreq_set_bw(struct cpufreq_policy *policy, unsigned long freq_khz) +{ + struct tegra186_cpufreq_data *data = cpufreq_get_driver_data(); + struct device *dev; + int ret; + + dev = get_cpu_device(policy->cpu); + if (!dev) + return -ENODEV; + + struct dev_pm_opp *opp __free(put_opp) = + dev_pm_opp_find_freq_exact(dev, freq_khz * HZ_PER_KHZ, true); + if (IS_ERR(opp)) + return PTR_ERR(opp); + + ret = dev_pm_opp_set_opp(dev, opp); + if (ret) + data->icc_dram_bw_scaling = false; + + return ret; +} + +static int tegra_cpufreq_init_cpufreq_table(struct cpufreq_policy *policy, + struct cpufreq_frequency_table *bpmp_lut, + struct cpufreq_frequency_table **opp_table) +{ + struct tegra186_cpufreq_data *data = cpufreq_get_driver_data(); + struct cpufreq_frequency_table *freq_table = NULL; + struct cpufreq_frequency_table *pos; + struct device *cpu_dev; + unsigned long rate; + int ret, max_opps; + int j = 0; + + cpu_dev = get_cpu_device(policy->cpu); + if (!cpu_dev) { + pr_err("%s: failed to get cpu%d device\n", __func__, policy->cpu); + return -ENODEV; + } + + /* Initialize OPP table mentioned in operating-points-v2 property in DT */ + ret = dev_pm_opp_of_add_table_indexed(cpu_dev, 0); + if (ret) { + dev_err(cpu_dev, "Invalid or empty opp table in device tree\n"); + data->icc_dram_bw_scaling = false; + return ret; + } + + max_opps = dev_pm_opp_get_opp_count(cpu_dev); + if (max_opps <= 0) { + dev_err(cpu_dev, "Failed to add OPPs\n"); + return max_opps; + } + + /* Disable all opps and cross-validate against LUT later */ + for (rate = 0; ; rate++) { + struct dev_pm_opp *opp __free(put_opp) = + dev_pm_opp_find_freq_ceil(cpu_dev, &rate); + if (IS_ERR(opp)) + break; + + dev_pm_opp_disable(cpu_dev, rate); + } + + freq_table = kcalloc((max_opps + 1), sizeof(*freq_table), GFP_KERNEL); + if (!freq_table) + return -ENOMEM; + + /* + * Cross check the frequencies from BPMP-FW LUT against the OPP's present in DT. + * Enable only those DT OPP's which are present in LUT also. 
+ */ + cpufreq_for_each_valid_entry(pos, bpmp_lut) { + struct dev_pm_opp *opp __free(put_opp) = + dev_pm_opp_find_freq_exact(cpu_dev, pos->frequency * HZ_PER_KHZ, false); + if (IS_ERR(opp)) + continue; + + ret = dev_pm_opp_enable(cpu_dev, pos->frequency * HZ_PER_KHZ); + if (ret < 0) + return ret; + + freq_table[j].driver_data = pos->driver_data; + freq_table[j].frequency = pos->frequency; + j++; + } + + freq_table[j].driver_data = pos->driver_data; + freq_table[j].frequency = CPUFREQ_TABLE_END; + + *opp_table = &freq_table[0]; + + dev_pm_opp_set_sharing_cpus(cpu_dev, policy->cpus); + + /* Prime interconnect data */ + tegra_cpufreq_set_bw(policy, freq_table[j - 1].frequency); + + return ret; +} + static int tegra186_cpufreq_init(struct cpufreq_policy *policy) { struct tegra186_cpufreq_data *data = cpufreq_get_driver_data(); unsigned int cluster = data->cpus[policy->cpu].bpmp_cluster_id; + struct cpufreq_frequency_table *freq_table; + struct cpufreq_frequency_table *bpmp_lut; u32 cpu; + int ret; - policy->freq_table = data->clusters[cluster].table; policy->cpuinfo.transition_latency = 300 * 1000; policy->driver_data = NULL; @@ -85,6 +189,20 @@ static int tegra186_cpufreq_init(struct cpufreq_policy *policy) cpumask_set_cpu(cpu, policy->cpus); } + bpmp_lut = data->clusters[cluster].bpmp_lut; + + if (data->icc_dram_bw_scaling) { + ret = tegra_cpufreq_init_cpufreq_table(policy, bpmp_lut, &freq_table); + if (!ret) { + policy->freq_table = freq_table; + return 0; + } + } + + data->icc_dram_bw_scaling = false; + policy->freq_table = bpmp_lut; + pr_info("OPP tables missing from DT, EMC frequency scaling disabled\n"); + return 0; } @@ -102,6 +220,10 @@ static int tegra186_cpufreq_set_target(struct cpufreq_policy *policy, writel(edvd_val, data->regs + edvd_offset); } + if (data->icc_dram_bw_scaling) + tegra_cpufreq_set_bw(policy, tbl->frequency); + + return 0; } @@ -134,7 +256,7 @@ static struct cpufreq_driver tegra186_cpufreq_driver = { .init = tegra186_cpufreq_init, }; -static struct cpufreq_frequency_table *init_vhint_table( +static struct cpufreq_frequency_table *tegra_cpufreq_bpmp_read_lut( struct platform_device *pdev, struct tegra_bpmp *bpmp, struct tegra186_cpufreq_cluster *cluster, unsigned int cluster_id, int *num_rates) @@ -229,6 +351,7 @@ static int tegra186_cpufreq_probe(struct platform_device *pdev) { struct tegra186_cpufreq_data *data; struct tegra_bpmp *bpmp; + struct device *cpu_dev; unsigned int i = 0, err, edvd_offset; int num_rates = 0; u32 edvd_val, cpu; @@ -254,9 +377,9 @@ static int tegra186_cpufreq_probe(struct platform_device *pdev) for (i = 0; i < TEGRA186_NUM_CLUSTERS; i++) { struct tegra186_cpufreq_cluster *cluster = &data->clusters[i]; - cluster->table = init_vhint_table(pdev, bpmp, cluster, i, &num_rates); - if (IS_ERR(cluster->table)) { - err = PTR_ERR(cluster->table); + cluster->bpmp_lut = tegra_cpufreq_bpmp_read_lut(pdev, bpmp, cluster, i, &num_rates); + if (IS_ERR(cluster->bpmp_lut)) { + err = PTR_ERR(cluster->bpmp_lut); goto put_bpmp; } else if (!num_rates) { err = -EINVAL; @@ -265,7 +388,7 @@ static int tegra186_cpufreq_probe(struct platform_device *pdev) for (cpu = 0; cpu < ARRAY_SIZE(tegra186_cpus); cpu++) { if (data->cpus[cpu].bpmp_cluster_id == i) { - edvd_val = cluster->table[num_rates - 1].driver_data; + edvd_val = cluster->bpmp_lut[num_rates - 1].driver_data; edvd_offset = data->cpus[cpu].edvd_offset; writel(edvd_val, data->regs + edvd_offset); } @@ -274,6 +397,19 @@ static int tegra186_cpufreq_probe(struct platform_device *pdev) 
tegra186_cpufreq_driver.driver_data = data; + /* Check for optional OPPv2 and interconnect paths on CPU0 to enable ICC scaling */ + cpu_dev = get_cpu_device(0); + if (!cpu_dev) { + err = -EPROBE_DEFER; + goto put_bpmp; + } + + if (dev_pm_opp_of_get_opp_desc_node(cpu_dev)) { + err = dev_pm_opp_of_find_icc_paths(cpu_dev, NULL); + if (!err) + data->icc_dram_bw_scaling = true; + } + err = cpufreq_register_driver(&tegra186_cpufreq_driver); put_bpmp: From cea54f8e3423a3c5d88377e15b1138a398a7a3a6 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 22 Oct 2025 22:26:23 +0200 Subject: [PATCH 28/96] PM: runtime: docs: Update pm_runtime_allow/forbid() documentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drop confusing descriptions of pm_runtime_allow() and pm_runtime_forbid() from Documentation/power/runtime_pm.rst and update the kerneldoc comments of these functions to better explain their purpose. Link: https://lore.kernel.org/linux-pm/08976178-298f-79d9-1d63-cff5a4e56cc3@linux.intel.com/ Signed-off-by: Rafael J. Wysocki Reviewed-by: Brian Norris Reviewed-by: Ulf Hansson Reviewed-by: Ilpo Järvinen Link: https://patch.msgid.link/12780841.O9o76ZdvQC@rafael.j.wysocki --- Documentation/power/runtime_pm.rst | 10 ---------- drivers/base/power/runtime.c | 17 +++++++++++++---- 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/Documentation/power/runtime_pm.rst b/Documentation/power/runtime_pm.rst index c8dbdb8595e5..8246df3cecd7 100644 --- a/Documentation/power/runtime_pm.rst +++ b/Documentation/power/runtime_pm.rst @@ -480,16 +480,6 @@ drivers/base/power/runtime.c and include/linux/pm_runtime.h: `bool pm_runtime_status_suspended(struct device *dev);` - return true if the device's runtime PM status is 'suspended' - `void pm_runtime_allow(struct device *dev);` - - set the power.runtime_auto flag for the device and decrease its usage - counter (used by the /sys/devices/.../power/control interface to - effectively allow the device to be power managed at run time) - - `void pm_runtime_forbid(struct device *dev);` - - unset the power.runtime_auto flag for the device and increase its usage - counter (used by the /sys/devices/.../power/control interface to - effectively prevent the device from being power managed at run time) - `void pm_runtime_no_callbacks(struct device *dev);` - set the power.no_callbacks flag for the device and remove the runtime PM attributes from /sys/devices/.../power (or prevent them from being diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c index 1b11a3cd4acc..82bc4e9d8539 100644 --- a/drivers/base/power/runtime.c +++ b/drivers/base/power/runtime.c @@ -1664,9 +1664,12 @@ EXPORT_SYMBOL_GPL(devm_pm_runtime_get_noresume); * pm_runtime_forbid - Block runtime PM of a device. * @dev: Device to handle. * - * Increase the device's usage count and clear its power.runtime_auto flag, - * so that it cannot be suspended at run time until pm_runtime_allow() is called - * for it. + * Resume @dev if already suspended and block runtime suspend of @dev in such + * a way that it can be unblocked via the /sys/devices/.../power/control + * interface, or otherwise by calling pm_runtime_allow(). + * + * Calling this function many times in a row has the same effect as calling it + * once. */ void pm_runtime_forbid(struct device *dev) { @@ -1687,7 +1690,13 @@ EXPORT_SYMBOL_GPL(pm_runtime_forbid); * pm_runtime_allow - Unblock runtime PM of a device. * @dev: Device to handle. 
* - * Decrease the device's usage count and set its power.runtime_auto flag. + * Unblock runtime suspend of @dev after it has been blocked by + * pm_runtime_forbid() (for instance, if it has been blocked via the + * /sys/devices/.../power/control interface), check if @dev can be + * suspended and suspend it in that case. + * + * Calling this function many times in a row has the same effect as calling it + * once. */ void pm_runtime_allow(struct device *dev) { From 33ffb0aa8ce8b18aaa65e0f9346d52b4e314ad7d Mon Sep 17 00:00:00 2001 From: Tamir Duberstein Date: Thu, 23 Oct 2025 09:59:39 -0400 Subject: [PATCH 29/96] rust: opp: simplify callers of `to_c_str_array` Use `Option` combinators to make this a bit less noisy. Wrap the `dev_pm_opp_set_config` operation in a closure and use type ascription to leverage the compiler to check for use after free. Signed-off-by: Tamir Duberstein Tested-by: Viresh Kumar Signed-off-by: Viresh Kumar --- rust/kernel/opp.rs | 118 +++++++++++++++++++++++---------------------- 1 file changed, 61 insertions(+), 57 deletions(-) diff --git a/rust/kernel/opp.rs b/rust/kernel/opp.rs index 04472a8de3ff..f9641c639fff 100644 --- a/rust/kernel/opp.rs +++ b/rust/kernel/opp.rs @@ -443,66 +443,70 @@ pub fn set_supported_hw(mut self, hw: KVec) -> Result { /// /// The returned [`ConfigToken`] will remove the configuration when dropped. pub fn set(self, dev: &Device) -> Result { - let (_clk_list, clk_names) = match &self.clk_names { - Some(x) => { - let list = to_c_str_array(x)?; - let ptr = list.as_ptr(); - (Some(list), ptr) - } - None => (None, ptr::null()), + let clk_names = self.clk_names.as_deref().map(to_c_str_array).transpose()?; + let regulator_names = self + .regulator_names + .as_deref() + .map(to_c_str_array) + .transpose()?; + + let set_config = || { + let clk_names = clk_names.as_ref().map_or(ptr::null(), |c| c.as_ptr()); + let regulator_names = regulator_names.as_ref().map_or(ptr::null(), |c| c.as_ptr()); + + let prop_name = self + .prop_name + .as_ref() + .map_or(ptr::null(), |p| p.as_char_ptr()); + + let (supported_hw, supported_hw_count) = self + .supported_hw + .as_ref() + .map_or((ptr::null(), 0), |hw| (hw.as_ptr(), hw.len() as u32)); + + let (required_dev, required_dev_index) = self + .required_dev + .as_ref() + .map_or((ptr::null_mut(), 0), |(dev, idx)| (dev.as_raw(), *idx)); + + let mut config = bindings::dev_pm_opp_config { + clk_names, + config_clks: if T::HAS_CONFIG_CLKS { + Some(Self::config_clks) + } else { + None + }, + prop_name, + regulator_names, + config_regulators: if T::HAS_CONFIG_REGULATORS { + Some(Self::config_regulators) + } else { + None + }, + supported_hw, + supported_hw_count, + + required_dev, + required_dev_index, + }; + + // SAFETY: The requirements are satisfied by the existence of [`Device`] and its safety + // requirements. The OPP core guarantees not to access fields of [`Config`] after this + // call and so we don't need to save a copy of them for future use. 
+ let ret = unsafe { bindings::dev_pm_opp_set_config(dev.as_raw(), &mut config) }; + + to_result(ret).map(|()| ConfigToken(ret)) }; - let (_regulator_list, regulator_names) = match &self.regulator_names { - Some(x) => { - let list = to_c_str_array(x)?; - let ptr = list.as_ptr(); - (Some(list), ptr) - } - None => (None, ptr::null()), - }; + // Ensure the closure does not accidentally drop owned data; if violated, the compiler + // produces E0525 with e.g.: + // + // ``` + // closure is `FnOnce` because it moves the variable `clk_names` out of its environment + // ``` + let _: &dyn Fn() -> _ = &set_config; - let prop_name = self - .prop_name - .as_ref() - .map_or(ptr::null(), |p| p.as_char_ptr()); - - let (supported_hw, supported_hw_count) = self - .supported_hw - .as_ref() - .map_or((ptr::null(), 0), |hw| (hw.as_ptr(), hw.len() as u32)); - - let (required_dev, required_dev_index) = self - .required_dev - .as_ref() - .map_or((ptr::null_mut(), 0), |(dev, idx)| (dev.as_raw(), *idx)); - - let mut config = bindings::dev_pm_opp_config { - clk_names, - config_clks: if T::HAS_CONFIG_CLKS { - Some(Self::config_clks) - } else { - None - }, - prop_name, - regulator_names, - config_regulators: if T::HAS_CONFIG_REGULATORS { - Some(Self::config_regulators) - } else { - None - }, - supported_hw, - supported_hw_count, - - required_dev, - required_dev_index, - }; - - // SAFETY: The requirements are satisfied by the existence of [`Device`] and its safety - // requirements. The OPP core guarantees not to access fields of [`Config`] after this call - // and so we don't need to save a copy of them for future use. - let ret = unsafe { bindings::dev_pm_opp_set_config(dev.as_raw(), &mut config) }; - - to_result(ret).map(|()| ConfigToken(ret)) + set_config() } /// Config's clk callback. From 07d815701274d156ad8c7c088a52e01642156fb8 Mon Sep 17 00:00:00 2001 From: Aboorva Devarajan Date: Mon, 6 Oct 2025 07:09:54 +0530 Subject: [PATCH 30/96] cpuidle: menu: Use residency threshold in polling state override decisions On virtualized PowerPC (pseries) systems, where only one polling state (Snooze) and one deep state (CEDE) are available, selecting CEDE when the predicted idle duration is less than the target residency of CEDE state can hurt performance. In such cases, the entry/exit overhead of CEDE outweighs the power savings, leading to unnecessary state transitions and higher latency. Menu governor currently contains a special-case rule that prioritizes the first non-polling state over polling, even when its target residency is much longer than the predicted idle duration. On PowerPC/pseries, where the gap between the polling state (Snooze) and the first non-polling state (CEDE) is large, this behavior causes performance regressions. Refine that special case by adding an extra requirement: the first non-polling state can only be chosen if its target residency is below the defined RESIDENCY_THRESHOLD_NS. If this condition is not satisfied, polling is allowed instead, avoiding suboptimal non-polling state entries. This change is limited to the single special-case rule for the first non-polling state. The general non-polling state selection logic in the menu governor remains unchanged. 
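As a worked illustration with made-up numbers: take Snooze as the polling state, a first non-polling state with a 100 us target residency and a 5 us exit latency, a residency threshold of 15 us, and a predicted idle duration of 20 us with the next timer 1 ms away. The old rule promoted the non-polling state, since its target residency fit below the next timer and its exit latency fit within the prediction; with the residency gate, 100 us is not below the 15 us threshold, so polling is kept and a state whose target residency far exceeds the expected idle time is avoided.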
Performance improvement observed with pgbench on PowerPC (pseries) system: +---------------------------+------------+------------+------------+ | Metric | Baseline | Patched | Change (%) | +---------------------------+------------+------------+------------+ | Transactions/sec (TPS) | 495,210 | 536,982 | +8.45% | | Avg latency (ms) | 0.163 | 0.150 | -7.98% | +---------------------------+------------+------------+------------+ CPUIdle state usage: +--------------+--------------+-------------+ | Metric | Baseline | Patched | +--------------+--------------+-------------+ | Total usage | 12,735,820 | 13,918,442 | | Above usage | 11,401,520 | 1,598,210 | | Below usage | 20,145 | 702,395 | +--------------+--------------+-------------+ Above/Total and Below/Total usage percentages: +------------------------+-----------+---------+ | Metric | Baseline | Patched | +------------------------+-----------+---------+ | Above % (Above/Total) | 89.56% | 11.49% | | Below % (Below/Total) | 0.16% | 5.05% | | Total cpuidle miss (%) | 89.72% | 16.54% | +------------------------+-----------+---------+ The results indicate that restricting CEDE selection to cases where its residency matches the predicted idle time reduces mispredictions, lowers unnecessary state transitions, and improves overall throughput. Reviewed-by: Christian Loehle Signed-off-by: Aboorva Devarajan [ rjw: Changelog edits, rebase ] Link: https://patch.msgid.link/20251006013954.17972-1-aboorvad@linux.ibm.com Signed-off-by: Rafael J. Wysocki --- drivers/cpuidle/governors/menu.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index 23239b0c04f9..64d6f7a1c776 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -317,12 +317,13 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, } /* - * Use a physical idle state, not busy polling, unless a timer - * is going to trigger soon enough or the exit latency of the - * idle state in question is greater than the predicted idle - * duration. + * Use a physical idle state instead of busy polling so long as + * its target residency is below the residency threshold, its + * exit latency is not greater than the predicted idle duration, + * and the next timer doesn't expire soon. */ if ((drv->states[idx].flags & CPUIDLE_FLAG_POLLING) && + s->target_residency_ns < RESIDENCY_THRESHOLD_NS && s->target_residency_ns <= data->next_timer_ns && s->exit_latency_ns <= predicted_ns) { predicted_ns = s->target_residency_ns; From 9600156bb99852c216a2128cdf9f114eb67c350f Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Mon, 27 Oct 2025 23:04:45 +0800 Subject: [PATCH 31/96] cpufreq: nforce2: fix reference count leak in nforce2 There are two reference count leaks in this driver: 1. In nforce2_fsb_read(): pci_get_subsys() increases the reference count of the PCI device, but pci_dev_put() is never called to release it, thus leaking the reference. 2. In nforce2_detect_chipset(): pci_get_subsys() gets a reference to the nforce2_dev which is stored in a global variable, but the reference is never released when the module is unloaded. Fix both by: - Adding pci_dev_put(nforce2_sub5) in nforce2_fsb_read() after reading the configuration. - Adding pci_dev_put(nforce2_dev) in nforce2_exit() to release the global device reference. Found via static analysis. 
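The underlying rule, as a schematic (illustrative C, not the driver's code; the IDs are placeholders):

    struct pci_dev *dev;

    dev = pci_get_subsys(vendor_id, device_id, PCI_ANY_ID, PCI_ANY_ID, NULL);
    if (!dev)
            return -ENODEV;

    /* ... read config space, cache whatever is needed ... */

    pci_dev_put(dev);       /* drop the reference taken by pci_get_subsys() */

For a reference kept for the module's lifetime, as with nforce2_dev, the matching pci_dev_put() belongs in the module exit path instead, which is what the second hunk below does.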
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Cc: stable@vger.kernel.org Signed-off-by: Miaoqian Lin Signed-off-by: Viresh Kumar --- drivers/cpufreq/cpufreq-nforce2.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/cpufreq/cpufreq-nforce2.c b/drivers/cpufreq/cpufreq-nforce2.c index fedad1081973..fbbbe501cf2d 100644 --- a/drivers/cpufreq/cpufreq-nforce2.c +++ b/drivers/cpufreq/cpufreq-nforce2.c @@ -145,6 +145,8 @@ static unsigned int nforce2_fsb_read(int bootfsb) pci_read_config_dword(nforce2_sub5, NFORCE2_BOOTFSB, &fsb); fsb /= 1000000; + pci_dev_put(nforce2_sub5); + /* Check if PLL register is already set */ pci_read_config_byte(nforce2_dev, NFORCE2_PLLENABLE, (u8 *)&temp); @@ -426,6 +428,7 @@ static int __init nforce2_init(void) static void __exit nforce2_exit(void) { cpufreq_unregister_driver(&nforce2_driver); + pci_dev_put(nforce2_dev); } module_init(nforce2_init); From 1971b18785d198ae5adbb861136ae5c0f195c14d Mon Sep 17 00:00:00 2001 From: Jie Zhan Date: Thu, 28 Aug 2025 19:02:11 +0800 Subject: [PATCH 32/96] cpufreq: CPPC: Don't warn if FIE init fails to read counters During the CPPC FIE initialization, reading perf counters on offline CPUs is expected to fail. Don't warn in this case. Also, change the error log level to debug since FIE is optional. Co-developed-by: Bowen Yu Signed-off-by: Bowen Yu # Changing loglevel to debug Signed-off-by: Jie Zhan [ Viresh: Added back the dropped comment. ] Signed-off-by: Viresh Kumar --- drivers/cpufreq/cppc_cpufreq.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index e23d9abea135..9eac77c4f294 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -142,16 +142,15 @@ static void cppc_cpufreq_cpu_fie_init(struct cpufreq_policy *policy) init_irq_work(&cppc_fi->irq_work, cppc_irq_work); ret = cppc_get_perf_ctrs(cpu, &cppc_fi->prev_perf_fb_ctrs); - if (ret) { - pr_warn("%s: failed to read perf counters for cpu:%d: %d\n", - __func__, cpu, ret); - /* - * Don't abort if the CPU was offline while the driver - * was getting registered. - */ - if (cpu_online(cpu)) - return; + /* + * Don't abort as the CPU was offline while the driver was + * getting registered. + */ + if (ret && cpu_online(cpu)) { + pr_debug("%s: failed to read perf counters for cpu:%d: %d\n", + __func__, cpu, ret); + return; } } From 65df3a9629c10d70593bc90b2ca6b235b7a24909 Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Tue, 28 Oct 2025 22:23:32 +0800 Subject: [PATCH 33/96] PM: EM: Add to em_pd_list only when no failure When em_create_perf_table() fails, pd is freed, so dev->em_pd is no longer valid. Accessing dev->em_pd->node then triggers a kernel panic in em_dev_register_pd_no_update(). So return early if 'ret' is non-zero.
Kernel dump: cpu cpu0: EM: invalid power: 0 Unable to handle kernel NULL pointer dereference at virtual address 0000000000000008 Mem abort info: pc : em_dev_register_pd_no_update+0xb4/0x79c lr : em_dev_register_pd_no_update+0x9c/0x79c Call trace: em_dev_register_pd_no_update+0xb4/0x79c (P) em_dev_register_perf_domain+0x18/0x58 scmi_cpufreq_register_em+0x84/0xb8 cpufreq_online+0x48c/0xb74 cpufreq_add_dev+0x80/0x98 subsys_interface_register+0x100/0x11c cpufreq_register_driver+0x158/0x278 scmi_cpufreq_probe+0x1f8/0x2e0 scmi_dev_probe+0x28/0x3c really_probe+0xbc/0x29c __driver_probe_device+0x78/0x12c driver_probe_device+0x3c/0x15c __device_attach_driver+0xb8/0x134 bus_for_each_drv+0x84/0xe4 Fixes: cbe5aeedecc7 ("PM: EM: Assign a unique ID when creating a performance domain") Signed-off-by: Peng Fan Reviewed-by: Changwoo Min Reviewed-by: Lukasz Luba Link: https://patch.msgid.link/20251028-fix-energy-v1-1-ab854fd6a97c@nxp.com Signed-off-by: Rafael J. Wysocki --- kernel/power/energy_model.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index e669d5057fca..11af9f64aa82 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -693,13 +693,16 @@ int em_dev_register_pd_no_update(struct device *dev, unsigned int nr_states, unlock: mutex_unlock(&em_pd_mutex); + if (ret) + return ret; mutex_lock(&em_pd_list_mutex); list_add_tail(&dev->em_pd->node, &em_pd_list); mutex_unlock(&em_pd_list_mutex); em_notify_pd_created(dev->em_pd); - return ret; + + return 0; } EXPORT_SYMBOL_GPL(em_dev_register_pd_no_update); From 4e48e7baa3a1ac6f21f2fde78e9ca8778f9aa14c Mon Sep 17 00:00:00 2001 From: Malaya Kumar Rout Date: Sun, 26 Oct 2025 22:35:27 +0530 Subject: [PATCH 34/96] PM: runtime: fix typos in runtime.c comments Fix several typos in comments: - "timesptamp" -> "timestamp" - "involed" -> "involved" - "nonero" -> "nonzero" Fix typos in comments to improve code documentation clarity. Signed-off-by: Malaya Kumar Rout Link: https://patch.msgid.link/20251026170527.262003-1-mrout@redhat.com Signed-off-by: Rafael J. Wysocki --- drivers/base/power/runtime.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c index 82bc4e9d8539..62707738caa4 100644 --- a/drivers/base/power/runtime.c +++ b/drivers/base/power/runtime.c @@ -90,7 +90,7 @@ static void update_pm_runtime_accounting(struct device *dev) /* * Because ktime_get_mono_fast_ns() is not monotonic during * timekeeping updates, ensure that 'now' is after the last saved - * timesptamp. + * timestamp. */ if (now < last) return; @@ -217,7 +217,7 @@ static int dev_memalloc_noio(struct device *dev, void *data) * resume/suspend callback of any one of its ancestors(or the * block device itself), the deadlock may be triggered inside the * memory allocation since it might not complete until the block - * device becomes active and the involed page I/O finishes. The + * device becomes active and the involved page I/O finishes. The * situation is pointed out first by Alan Stern. Network device * are involved in iSCSI kind of situation. 
* @@ -1210,7 +1210,7 @@ EXPORT_SYMBOL_GPL(__pm_runtime_resume); * * Otherwise, if its runtime PM status is %RPM_ACTIVE and (1) @ign_usage_count * is set, or (2) @dev is not ignoring children and its active child count is - * nonero, or (3) the runtime PM usage counter of @dev is not zero, increment + * nonzero, or (3) the runtime PM usage counter of @dev is not zero, increment * the usage counter of @dev and return 1. * * Otherwise, return 0 without changing the usage counter. From cb908f8b0acc7e28b93e653c2a521dd090d8b99e Mon Sep 17 00:00:00 2001 From: Swaraj Gaikwad Date: Wed, 29 Oct 2025 13:47:37 +0000 Subject: [PATCH 35/96] Documentation: intel_pstate: fix duplicate hyperlink target errors Fix reST warnings in Documentation/admin-guide/pm/intel_pstate.rst caused by missing explicit hyperlink labels for section titles. Before this change, the following errors were printed during `make htmldocs`: Documentation/admin-guide/pm/intel_pstate.rst:401: ERROR: Indirect hyperlink target (id="id6") refers to target "passive mode", which is a duplicate, and cannot be used as a unique reference. Documentation/admin-guide/pm/intel_pstate.rst:517: ERROR: Indirect hyperlink target (id="id9") refers to target "active mode", which is a duplicate, and cannot be used as a unique reference. Documentation/admin-guide/pm/intel_pstate.rst:611: ERROR: Indirect hyperlink target (id="id15") refers to target "global attributes", which is a duplicate, and cannot be used as a unique reference. ERROR: Duplicate target name, cannot be used as a unique reference: "passive mode", "active mode", "global attributes". These errors occurred because the sections "Active Mode", "Active Mode With HWP", "Passive Mode", and "Global Attributes" did not define explicit hyperlink labels. As a result, Sphinx auto-generated duplicate anchors when the same titles appeared multiple times within the document. Because of this, the generated HTML documentation contained broken references such as: `active mode `_ `passive mode `_ `global attributes `_ This patch adds explicit hyperlink labels for the affected sections, ensuring all references are unique and correctly resolved. After applying this patch, `make htmldocs` completes without any warnings, and all hyperlinks in intel_pstate.html render properly. Signed-off-by: Swaraj Gaikwad Reviewed-by: Bagas Sanjaya Acked-by: Randy Dunlap Tested-by: Randy Dunlap [ rjw: Subject adjustment ] Link: https://patch.msgid.link/20251029134737.42229-1-swarajgaikwad1925@gmail.com Signed-off-by: Rafael J. Wysocki --- Documentation/admin-guide/pm/intel_pstate.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Documentation/admin-guide/pm/intel_pstate.rst b/Documentation/admin-guide/pm/intel_pstate.rst index 26e702c7016e..9cdd9dad6516 100644 --- a/Documentation/admin-guide/pm/intel_pstate.rst +++ b/Documentation/admin-guide/pm/intel_pstate.rst @@ -62,6 +62,8 @@ a certain performance scaling algorithm. Which of them will be in effect depends on what kernel command line options are used and on the capabilities of the processor. +.. _Active Mode: + Active Mode ----------- @@ -94,6 +96,8 @@ Which of the P-state selection algorithms is used by default depends on the Namely, if that option is set, the ``performance`` algorithm will be used by default, and the other one will be used by default if it is not set. +.. 
_Active Mode With HWP: + Active Mode With HWP ~~~~~~~~~~~~~~~~~~~~ @@ -192,6 +196,8 @@ This is the default P-state selection algorithm if the :c:macro:`CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE` kernel configuration option is not set. +.. _Passive Mode: + Passive Mode ------------ @@ -432,6 +438,8 @@ the ``energy_model`` directory in ``debugfs`` (typlically mounted on User Space Interface in ``sysfs`` ================================= +.. _Global Attributes: + Global Attributes ----------------- From e114e2eb7e85b5cc737ff5286cfe68a58caffeba Mon Sep 17 00:00:00 2001 From: Xueqin Luo Date: Tue, 21 Oct 2025 19:37:26 +0800 Subject: [PATCH 36/96] PM: hibernate: dynamically allocate crc->unc_len/unc for configurable threads Convert crc->unc_len and crc->unc from fixed-size arrays to dynamically allocated arrays, sized according to the actual number of threads selected at runtime. This removes the fixed limit imposed by CMP_THREADS. Signed-off-by: Xueqin Luo Link: https://patch.msgid.link/b5db63bb95729482d2649b12d3a11cb7547b7fcc.1761046167.git.luoxueqin@kylinos.cn Signed-off-by: Rafael J. Wysocki --- kernel/power/swap.c | 58 ++++++++++++++++++++++++++++++++++----------- 1 file changed, 44 insertions(+), 14 deletions(-) diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 0beff7eeaaba..f8c13f5672ec 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -585,10 +585,48 @@ struct crc_data { wait_queue_head_t go; /* start crc update */ wait_queue_head_t done; /* crc update done */ u32 *crc32; /* points to handle's crc32 */ - size_t *unc_len[CMP_THREADS]; /* uncompressed lengths */ - unsigned char *unc[CMP_THREADS]; /* uncompressed data */ + size_t **unc_len; /* uncompressed lengths */ + unsigned char **unc; /* uncompressed data */ }; +static struct crc_data *alloc_crc_data(int nr_threads) +{ + struct crc_data *crc; + + crc = kzalloc(sizeof(*crc), GFP_KERNEL); + if (!crc) + return NULL; + + crc->unc = kcalloc(nr_threads, sizeof(*crc->unc), GFP_KERNEL); + if (!crc->unc) + goto err_free_crc; + + crc->unc_len = kcalloc(nr_threads, sizeof(*crc->unc_len), GFP_KERNEL); + if (!crc->unc_len) + goto err_free_unc; + + return crc; + +err_free_unc: + kfree(crc->unc); +err_free_crc: + kfree(crc); + return NULL; +} + +static void free_crc_data(struct crc_data *crc) +{ + if (!crc) + return; + + if (crc->thr) + kthread_stop(crc->thr); + + kfree(crc->unc_len); + kfree(crc->unc); + kfree(crc); +} + /* * CRC32 update function that runs in its own thread. 
*/ @@ -719,7 +757,7 @@ static int save_compressed_image(struct swap_map_handle *handle, goto out_clean; } - crc = kzalloc(sizeof(*crc), GFP_KERNEL); + crc = alloc_crc_data(nr_threads); if (!crc) { pr_err("Failed to allocate crc\n"); ret = -ENOMEM; @@ -885,11 +923,7 @@ static int save_compressed_image(struct swap_map_handle *handle, out_clean: hib_finish_batch(&hb); - if (crc) { - if (crc->thr) - kthread_stop(crc->thr); - kfree(crc); - } + free_crc_data(crc); if (data) { for (thr = 0; thr < nr_threads; thr++) { if (data[thr].thr) @@ -1239,7 +1273,7 @@ static int load_compressed_image(struct swap_map_handle *handle, goto out_clean; } - crc = kzalloc(sizeof(*crc), GFP_KERNEL); + crc = alloc_crc_data(nr_threads); if (!crc) { pr_err("Failed to allocate crc\n"); ret = -ENOMEM; @@ -1506,11 +1540,7 @@ static int load_compressed_image(struct swap_map_handle *handle, hib_finish_batch(&hb); for (i = 0; i < ring_size; i++) free_page((unsigned long)page[i]); - if (crc) { - if (crc->thr) - kthread_stop(crc->thr); - kfree(crc); - } + free_crc_data(crc); if (data) { for (thr = 0; thr < nr_threads; thr++) { if (data[thr].thr) From 090bf5a0f40030f4ef294a3edb84c5e99b843c7f Mon Sep 17 00:00:00 2001 From: Xueqin Luo Date: Tue, 21 Oct 2025 19:37:27 +0800 Subject: [PATCH 37/96] PM: hibernate: make compression threads configurable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The number of compression/decompression threads has a direct impact on hibernate image generation and resume latency. Using more threads can reduce overall resume time, but on systems with fewer CPU cores it may also introduce contention and reduce efficiency. Performance was evaluated on an 8-core ARM system, averaged over 10 runs:

Threads  Hibernate(s)  Resume(s)
--------------------------------
3        12.14         18.86
4        12.28         17.48
5        11.09         16.77
6        11.08         16.44

With 5–6 threads, resume latency improves by approximately 12% compared to the default 3-thread configuration, with negligible impact on hibernate time. Introduce a new kernel parameter `hibernate_compression_threads=` that allows users and integrators to tune the number of compression/decompression threads at boot. This provides a way to balance performance and CPU utilization across a wide range of hardware without recompiling the kernel. Signed-off-by: Xueqin Luo Link: https://patch.msgid.link/f24b3ca6416e230a515a154ed4c121d72a7e05a6.1761046167.git.luoxueqin@kylinos.cn Signed-off-by: Rafael J. Wysocki --- .../admin-guide/kernel-parameters.txt | 10 ++++++++ kernel/power/swap.c | 25 ++++++++++++++++--- 2 files changed, 31 insertions(+), 4 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 6c42061ca20e..46db3cbb838f 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1907,6 +1907,16 @@ /sys/power/pm_test). Only available when CONFIG_PM_DEBUG is set. Default value is 5. + hibernate_compression_threads= + [HIBERNATION] + Set the number of threads used for compressing or decompressing + hibernation images. + + Format: + Default: 3 + Minimum: 1 + Example: hibernate_compression_threads=4 + highmem=nn[KMG] [KNL,BOOT,EARLY] forces the highmem zone to have an exact size of . This works even on boxes that have no highmem otherwise.
This also works to reduce highmem diff --git a/kernel/power/swap.c b/kernel/power/swap.c index f8c13f5672ec..aa11576e92a9 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -519,8 +519,9 @@ static int swap_writer_finish(struct swap_map_handle *handle, CMP_HEADER, PAGE_SIZE) #define CMP_SIZE (CMP_PAGES * PAGE_SIZE) -/* Maximum number of threads for compression/decompression. */ -#define CMP_THREADS 3 +/* Default number of threads for compression/decompression. */ +#define CMP_THREADS 3 +static unsigned int hibernate_compression_threads = CMP_THREADS; /* Minimum/maximum number of pages for read buffering. */ #define CMP_MIN_RD_PAGES 1024 @@ -741,7 +742,7 @@ static int save_compressed_image(struct swap_map_handle *handle, * footprint. */ nr_threads = num_online_cpus() - 1; - nr_threads = clamp_val(nr_threads, 1, CMP_THREADS); + nr_threads = clamp_val(nr_threads, 1, hibernate_compression_threads); page = (void *)__get_free_page(GFP_NOIO | __GFP_HIGH); if (!page) { @@ -1257,7 +1258,7 @@ static int load_compressed_image(struct swap_map_handle *handle, * footprint. */ nr_threads = num_online_cpus() - 1; - nr_threads = clamp_val(nr_threads, 1, CMP_THREADS); + nr_threads = clamp_val(nr_threads, 1, hibernate_compression_threads); page = vmalloc_array(CMP_MAX_RD_PAGES, sizeof(*page)); if (!page) { @@ -1697,3 +1698,19 @@ static int __init swsusp_header_init(void) } core_initcall(swsusp_header_init); + +static int __init hibernate_compression_threads_setup(char *str) +{ + int rc = kstrtouint(str, 0, &hibernate_compression_threads); + + if (rc) + return rc; + + if (hibernate_compression_threads < 1) + hibernate_compression_threads = CMP_THREADS; + + return 1; + +} + +__setup("hibernate_compression_threads=", hibernate_compression_threads_setup); From ea358066ded351036cb8b69c424a05e223472a03 Mon Sep 17 00:00:00 2001 From: Xueqin Luo Date: Tue, 21 Oct 2025 19:37:28 +0800 Subject: [PATCH 38/96] PM: hibernate: add sysfs interface for hibernate_compression_threads Add a sysfs attribute `/sys/power/hibernate_compression_threads` to allow runtime configuration of the number of threads used for compressing and decompressing hibernation images. The new sysfs interface enables dynamic adjustment at runtime: # cat /sys/power/hibernate_compression_threads 3 # echo 4 > /sys/power/hibernate_compression_threads This change provides greater flexibility for debugging and performance tuning of hibernation without requiring a reboot. Signed-off-by: Xueqin Luo Link: https://patch.msgid.link/c68c62f97fabf32507b8794ad8c16cd22ee656ac.1761046167.git.luoxueqin@kylinos.cn Signed-off-by: Rafael J. Wysocki --- Documentation/ABI/testing/sysfs-power | 16 +++++++++++ kernel/power/swap.c | 38 +++++++++++++++++++++++++++ 2 files changed, 54 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-power b/Documentation/ABI/testing/sysfs-power index 4d8e1ad020f0..d38da077905a 100644 --- a/Documentation/ABI/testing/sysfs-power +++ b/Documentation/ABI/testing/sysfs-power @@ -454,3 +454,19 @@ Description: disables it. Reads from the file return the current value. The default is "1" if the build-time "SUSPEND_SKIP_SYNC" config flag is unset, or "0" otherwise. + +What: /sys/power/hibernate_compression_threads +Date: October 2025 +Contact: +Description: + Controls the number of threads used for compression + and decompression of hibernation images. + + The value can be adjusted at runtime to balance + performance and CPU utilization. + + The change takes effect on the next hibernation or + resume operation. 
+ + Minimum value: 1 + Default value: 3 diff --git a/kernel/power/swap.c b/kernel/power/swap.c index aa11576e92a9..d173e276b494 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -1689,8 +1689,46 @@ int swsusp_unmark(void) } #endif +static ssize_t hibernate_compression_threads_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%d\n", hibernate_compression_threads); +} + +static ssize_t hibernate_compression_threads_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t n) +{ + unsigned long val; + + if (kstrtoul(buf, 0, &val)) + return -EINVAL; + + if (val < 1) + return -EINVAL; + + hibernate_compression_threads = val; + return n; +} +power_attr(hibernate_compression_threads); + +static struct attribute *g[] = { + &hibernate_compression_threads_attr.attr, + NULL, +}; + +static const struct attribute_group attr_group = { + .attrs = g, +}; + static int __init swsusp_header_init(void) { + int error; + + error = sysfs_create_group(power_kobj, &attr_group); + if (error) + return -ENOMEM; + swsusp_header = (struct swsusp_header*) __get_free_page(GFP_KERNEL); if (!swsusp_header) panic("Could not allocate memory for swsusp_header\n"); From 8e4ec90701efec7f2814c89b398d6d4272636814 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 20 Oct 2025 07:55:55 -1000 Subject: [PATCH 39/96] freezer: Clarify that only cgroup1 freezer uses PM freezer cgroup1 freezer piggybacks on the PM freezer, which inadvertently allowed userspace to produce uninterruptible tasks at will. To avoid the issue, cgroup2 freezer switched to a separate job control based mechanism. While this happened a long time ago, the code and comment haven't been updated making it confusing to people who aren't familiar with the history. Rename cgroup_freezing() to cgroup1_freezing() and update comments on top of freezing() and frozen() to clarify that cgroup2 freezer isn't covered by the PM freezer mechanism. Signed-off-by: Tejun Heo Suggested-by: Qu Wenruo Link: https://patch.msgid.link/aPZ3q6Hm865NicBC@slm.duckdns.org Signed-off-by: Rafael J. Wysocki --- include/linux/freezer.h | 12 ++++++++---- kernel/cgroup/legacy_freezer.c | 2 +- kernel/freezer.c | 2 +- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/include/linux/freezer.h b/include/linux/freezer.h index 32884c9721e5..0a8c6c4d1a82 100644 --- a/include/linux/freezer.h +++ b/include/linux/freezer.h @@ -22,14 +22,18 @@ extern bool pm_nosig_freezing; /* PM nosig freezing in effect */ extern unsigned int freeze_timeout_msecs; /* - * Check if a process has been frozen + * Check if a process has been frozen for PM or cgroup1 freezer. Note that + * cgroup2 freezer uses the job control mechanism and does not interact with + * the PM freezer. */ extern bool frozen(struct task_struct *p); extern bool freezing_slow_path(struct task_struct *p); /* - * Check if there is a request to freeze a process + * Check if there is a request to freeze a task from PM or cgroup1 freezer. + * Note that cgroup2 freezer uses the job control mechanism and does not + * interact with the PM freezer. 
*/ static inline bool freezing(struct task_struct *p) { @@ -63,9 +67,9 @@ extern bool freeze_task(struct task_struct *p); extern bool set_freezable(void); #ifdef CONFIG_CGROUP_FREEZER -extern bool cgroup_freezing(struct task_struct *task); +extern bool cgroup1_freezing(struct task_struct *task); #else /* !CONFIG_CGROUP_FREEZER */ -static inline bool cgroup_freezing(struct task_struct *task) +static inline bool cgroup1_freezing(struct task_struct *task) { return false; } diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c index dd9417425d92..915b02f65980 100644 --- a/kernel/cgroup/legacy_freezer.c +++ b/kernel/cgroup/legacy_freezer.c @@ -63,7 +63,7 @@ static struct freezer *parent_freezer(struct freezer *freezer) return css_freezer(freezer->css.parent); } -bool cgroup_freezing(struct task_struct *task) +bool cgroup1_freezing(struct task_struct *task) { bool ret; diff --git a/kernel/freezer.c b/kernel/freezer.c index ddc11a8bd2ea..a76bf957fb32 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -44,7 +44,7 @@ bool freezing_slow_path(struct task_struct *p) if (tsk_is_oom_victim(p)) return false; - if (pm_nosig_freezing || cgroup_freezing(p)) + if (pm_nosig_freezing || cgroup1_freezing(p)) return true; if (pm_freezing && !(p->flags & PF_KTHREAD)) From 790e826be8994d4510146458ebf5eee6f3267a3a Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Wed, 22 Oct 2025 14:54:25 -0700 Subject: [PATCH 40/96] cpufreq: intel_pstate: Add Diamond Rapids OOB mode support Prevent intel_pstate from loading when Out-of-Band (OOB) P-states mode is enabled. The OOB identification mechanism for Diamond Rapids servers is the same as for prior generation CPUs such as Granite Rapids. Add the Diamond Rapids CPU model to intel_pstate_cpu_oob_ids[] to ensure correct OOB handling. Signed-off-by: Kuppuswamy Sathyanarayanan Link: https://patch.msgid.link/20251022215425.3566218-1-sathyanarayanan.kuppuswamy@linux.intel.com Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/intel_pstate.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 7d2a1aec3a61..56c28411e130 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -2762,6 +2762,7 @@ static const struct x86_cpu_id intel_pstate_cpu_oob_ids[] __initconst = { X86_MATCH(INTEL_ATOM_CRESTMONT, core_funcs), X86_MATCH(INTEL_ATOM_CRESTMONT_X, core_funcs), X86_MATCH(INTEL_ATOM_DARKMONT_X, core_funcs), + X86_MATCH(INTEL_DIAMONDRAPIDS_X, core_funcs), {} }; #endif From 39f421f2e301f995c17c35b783e2863155b3f647 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Thu, 23 Oct 2025 10:45:32 -0700 Subject: [PATCH 41/96] powercap: intel_rapl: Add support for Wildcat Lake platform Add Wildcat Lake to the list of supported processors for RAPL. Signed-off-by: Srinivas Pandruvada Link: https://patch.msgid.link/20251023174532.1882008-1-srinivas.pandruvada@linux.intel.com Signed-off-by: Rafael J. 
Wysocki --- drivers/powercap/intel_rapl_common.c | 1 + drivers/powercap/intel_rapl_msr.c | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c index c7e7f9bf5313..cdb4363589e9 100644 --- a/drivers/powercap/intel_rapl_common.c +++ b/drivers/powercap/intel_rapl_common.c @@ -1284,6 +1284,7 @@ static const struct x86_cpu_id rapl_ids[] __initconst = { X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, &rapl_defaults_spr_server), X86_MATCH_VFM(INTEL_LUNARLAKE_M, &rapl_defaults_core), X86_MATCH_VFM(INTEL_PANTHERLAKE_L, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_WILDCATLAKE_L, &rapl_defaults_core), X86_MATCH_VFM(INTEL_ARROWLAKE_H, &rapl_defaults_core), X86_MATCH_VFM(INTEL_ARROWLAKE, &rapl_defaults_core), X86_MATCH_VFM(INTEL_ARROWLAKE_U, &rapl_defaults_core), diff --git a/drivers/powercap/intel_rapl_msr.c b/drivers/powercap/intel_rapl_msr.c index 4ed06c71a3ac..c4d536c2f989 100644 --- a/drivers/powercap/intel_rapl_msr.c +++ b/drivers/powercap/intel_rapl_msr.c @@ -151,6 +151,7 @@ static const struct x86_cpu_id pl4_support_ids[] = { X86_MATCH_VFM(INTEL_ARROWLAKE_U, NULL), X86_MATCH_VFM(INTEL_ARROWLAKE_H, NULL), X86_MATCH_VFM(INTEL_PANTHERLAKE_L, NULL), + X86_MATCH_VFM(INTEL_WILDCATLAKE_L, NULL), {} }; From 4ab25c92147663a7ce3187bd9075eeb2709a415b Mon Sep 17 00:00:00 2001 From: Bagas Sanjaya Date: Sat, 1 Nov 2025 12:56:14 +0700 Subject: [PATCH 42/96] Documentation: intel-pstate: Use :ref: directive for internal linking intel_pstate docs use the standard reST construct (`Section title`_) for cross-referencing sections (internal linking), rather than for external links. Incorrect cross-references are not caught when these are written in that syntax, however (fortunately, docutils 0.22 raises duplicate target warnings, which were fixed in cb908f8b0acc7e ("Documentation: intel_pstate: fix duplicate hyperlink target errors")). Convert the cross-references to use the :ref: directive, which doesn't exhibit this problem. Signed-off-by: Bagas Sanjaya Reviewed-by: Randy Dunlap Tested-by: Randy Dunlap [ rjw: Changelog tweak ] Link: https://patch.msgid.link/20251101055614.32270-1-bagasdotme@gmail.com Signed-off-by: Rafael J. Wysocki --- Documentation/admin-guide/pm/intel_pstate.rst | 133 +++++++++--------- 1 file changed, 70 insertions(+), 63 deletions(-) diff --git a/Documentation/admin-guide/pm/intel_pstate.rst b/Documentation/admin-guide/pm/intel_pstate.rst index 9cdd9dad6516..fde967b0c2e0 100644 --- a/Documentation/admin-guide/pm/intel_pstate.rst +++ b/Documentation/admin-guide/pm/intel_pstate.rst @@ -48,8 +48,9 @@ only way to pass early-configuration-time parameters to it is via the kernel command line. However, its configuration can be adjusted via ``sysfs`` to a great extent. In some configurations it even is possible to unregister it via ``sysfs`` which allows another ``CPUFreq`` scaling driver to be loaded and -registered (see `below `_). +registered (see :ref:`below `). +.. _operation_modes: Operation Modes =============== @@ -62,7 +63,7 @@ a certain performance scaling algorithm. Which of them will be in effect depends on what kernel command line options are used and on the capabilities of the processor. -.. _Active Mode: +.. _active_mode: Active Mode ----------- @@ -96,7 +97,7 @@ Which of the P-state selection algorithms is used by default depends on the Namely, if that option is set, the ``performance`` algorithm will be used by default, and the other one will be used by default if it is not set. -.. _Active Mode With HWP: +.. 
_active_mode_hwp: Active Mode With HWP ~~~~~~~~~~~~~~~~~~~~ @@ -127,7 +128,7 @@ Energy-Performance Bias (EPB) knob (otherwise), which means that the processor's internal P-state selection logic is expected to focus entirely on performance. This will override the EPP/EPB setting coming from the ``sysfs`` interface -(see `Energy vs Performance Hints`_ below). Moreover, any attempts to change +(see :ref:`energy_performance_hints` below). Moreover, any attempts to change the EPP/EPB to a value different from 0 ("performance") via ``sysfs`` in this configuration will be rejected. @@ -196,7 +197,7 @@ This is the default P-state selection algorithm if the :c:macro:`CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE` kernel configuration option is not set. -.. _Passive Mode: +.. _passive_mode: Passive Mode ------------ @@ -295,12 +296,12 @@ Unlike ``_PSS`` objects in the ACPI tables, ``intel_pstate`` always exposes the entire range of available P-states, including the whole turbo range, to the ``CPUFreq`` core and (in the passive mode) to generic scaling governors. This generally causes turbo P-states to be set more often when ``intel_pstate`` is -used relative to ACPI-based CPU performance scaling (see `below `_ -for more information). +used relative to ACPI-based CPU performance scaling (see +:ref:`below ` for more information). Moreover, since ``intel_pstate`` always knows what the real turbo threshold is (even if the Configurable TDP feature is enabled in the processor), its -``no_turbo`` attribute in ``sysfs`` (described `below `_) should +``no_turbo`` attribute in ``sysfs`` (described :ref:`below `) should work as expected in all cases (that is, if set to disable turbo P-states, it always should prevent ``intel_pstate`` from using them). @@ -313,12 +314,12 @@ pieces of information on it to be known, including: * The minimum supported P-state. - * The maximum supported `non-turbo P-state `_. + * The maximum supported :ref:`non-turbo P-state `. * Whether or not turbo P-states are supported at all. - * The maximum supported `one-core turbo P-state `_ (if turbo P-states - are supported). + * The maximum supported :ref:`one-core turbo P-state ` (if turbo + P-states are supported). * The scaling formula to translate the driver's internal representation of P-states into frequencies and the other way around. @@ -406,10 +407,10 @@ Energy-Aware Scheduling Support If ``CONFIG_ENERGY_MODEL`` has been set during kernel configuration and ``intel_pstate`` runs on a hybrid processor without SMT, in addition to enabling -`CAS `_ it registers an Energy Model for the processor. This allows the +:ref:`CAS` it registers an Energy Model for the processor. This allows the Energy-Aware Scheduling (EAS) support to be enabled in the CPU scheduler if ``schedutil`` is used as the ``CPUFreq`` governor which requires ``intel_pstate`` -to operate in the `passive mode `_. +to operate in the :ref:`passive mode `. The Energy Model registered by ``intel_pstate`` is artificial (that is, it is based on abstract cost values and it does not include any real power numbers) @@ -438,7 +439,7 @@ the ``energy_model`` directory in ``debugfs`` (typlically mounted on User Space Interface in ``sysfs`` ================================= -.. _Global Attributes: +.. _global_attributes: Global Attributes ----------------- @@ -452,8 +453,8 @@ argument is passed to the kernel in the command line. ``max_perf_pct`` Maximum P-state the driver is allowed to set in percent of the - maximum supported performance level (the highest supported `turbo - P-state `_). 
+ maximum supported performance level (the highest supported :ref:`turbo + P-state `). This attribute will not be exposed if the ``intel_pstate=per_cpu_perf_limits`` argument is present in the kernel @@ -461,8 +462,8 @@ argument is passed to the kernel in the command line. ``min_perf_pct`` Minimum P-state the driver is allowed to set in percent of the - maximum supported performance level (the highest supported `turbo - P-state `_). + maximum supported performance level (the highest supported :ref:`turbo + P-state `). This attribute will not be exposed if the ``intel_pstate=per_cpu_perf_limits`` argument is present in the kernel @@ -471,18 +472,18 @@ argument is passed to the kernel in the command line. ``num_pstates`` Number of P-states supported by the processor (between 0 and 255 inclusive) including both turbo and non-turbo P-states (see - `Turbo P-states Support`_). + :ref:`turbo`). This attribute is present only if the value exposed by it is the same for all of the CPUs in the system. The value of this attribute is not affected by the ``no_turbo`` - setting described `below `_. + setting described :ref:`below `. This attribute is read-only. ``turbo_pct`` - Ratio of the `turbo range `_ size to the size of the entire + Ratio of the :ref:`turbo range ` size to the size of the entire range of supported P-states, in percent. This attribute is present only if the value exposed by it is the same @@ -494,7 +495,7 @@ argument is passed to the kernel in the command line. ``no_turbo`` If set (equal to 1), the driver is not allowed to set any turbo P-states - (see `Turbo P-states Support`_). If unset (equal to 0, which is the + (see :ref:`turbo`). If unset (equal to 0, which is the default), turbo P-states can be set by the driver. [Note that ``intel_pstate`` does not support the general ``boost`` attribute (supported by some other scaling drivers) which is replaced @@ -503,11 +504,11 @@ argument is passed to the kernel in the command line. This attribute does not affect the maximum supported frequency value supplied to the ``CPUFreq`` core and exposed via the policy interface, but it affects the maximum possible value of per-policy P-state limits - (see `Interpretation of Policy Attributes`_ below for details). + (see :ref:`policy_attributes_interpretation` below for details). ``hwp_dynamic_boost`` This attribute is only present if ``intel_pstate`` works in the - `active mode with the HWP feature enabled `_ in + :ref:`active mode with the HWP feature enabled ` in the processor. If set (equal to 1), it causes the minimum P-state limit to be increased dynamically for a short time whenever a task previously waiting on I/O is selected to run on a given logical CPU (the purpose @@ -522,12 +523,12 @@ argument is passed to the kernel in the command line. Operation mode of the driver: "active", "passive" or "off". "active" - The driver is functional and in the `active mode - `_. + The driver is functional and in the :ref:`active mode + `. "passive" - The driver is functional and in the `passive mode - `_. + The driver is functional and in the :ref:`passive mode + `. "off" The driver is not functional (it is not registered as a scaling @@ -555,13 +556,15 @@ argument is passed to the kernel in the command line. attribute to "1" enables the energy-efficiency optimizations and setting to "0" disables them. +.. 
_policy_attributes_interpretation: + Interpretation of Policy Attributes ----------------------------------- The interpretation of some ``CPUFreq`` policy attributes described in Documentation/admin-guide/pm/cpufreq.rst is special with ``intel_pstate`` as the current scaling driver and it generally depends on the driver's -`operation mode `_. +:ref:`operation mode `. First of all, the values of the ``cpuinfo_max_freq``, ``cpuinfo_min_freq`` and ``scaling_cur_freq`` attributes are produced by applying a processor-specific @@ -570,9 +573,10 @@ Also, the values of the ``scaling_max_freq`` and ``scaling_min_freq`` attributes are capped by the frequency corresponding to the maximum P-state that the driver is allowed to set. -If the ``no_turbo`` `global attribute `_ is set, the driver is -not allowed to use turbo P-states, so the maximum value of ``scaling_max_freq`` -and ``scaling_min_freq`` is limited to the maximum non-turbo P-state frequency. +If the ``no_turbo`` :ref:`global attribute ` is set, the driver +is not allowed to use turbo P-states, so the maximum value of +``scaling_max_freq`` and ``scaling_min_freq`` is limited to the maximum +non-turbo P-state frequency. Accordingly, setting ``no_turbo`` causes ``scaling_max_freq`` and ``scaling_min_freq`` to go down to that value if they were above it before. However, the old values of ``scaling_max_freq`` and ``scaling_min_freq`` will be @@ -584,7 +588,7 @@ and ``scaling_min_freq`` corresponds to the maximum supported turbo P-state, which also is the value of ``cpuinfo_max_freq`` in either case. Next, the following policy attributes have special meaning if -``intel_pstate`` works in the `active mode `_: +``intel_pstate`` works in the :ref:`active mode `: ``scaling_available_governors`` List of P-state selection algorithms provided by ``intel_pstate``. @@ -605,20 +609,22 @@ processor: Shows the base frequency of the CPU. Any frequency above this will be in the turbo frequency range. -The meaning of these attributes in the `passive mode `_ is the +The meaning of these attributes in the :ref:`passive mode ` is the same as for other scaling drivers. Additionally, the value of the ``scaling_driver`` attribute for ``intel_pstate`` depends on the operation mode of the driver. Namely, it is either -"intel_pstate" (in the `active mode `_) or "intel_cpufreq" (in the -`passive mode `_). +"intel_pstate" (in the :ref:`active mode `) or "intel_cpufreq" +(in the :ref:`passive mode `). + +.. _pstate_limits_coordination: Coordination of P-State Limits ------------------------------ ``intel_pstate`` allows P-state limits to be set in two ways: with the help of -the ``max_perf_pct`` and ``min_perf_pct`` `global attributes -`_ or via the ``scaling_max_freq`` and ``scaling_min_freq`` +the ``max_perf_pct`` and ``min_perf_pct`` :ref:`global attributes +` or via the ``scaling_max_freq`` and ``scaling_min_freq`` ``CPUFreq`` policy attributes. The coordination between those limits is based on the following rules, regardless of the current operation mode of the driver: @@ -640,17 +646,18 @@ on the following rules, regardless of the current operation mode of the driver: 3. The global and per-policy limits can be set independently. -In the `active mode with the HWP feature enabled `_, the +In the :ref:`active mode with the HWP feature enabled `, the resulting effective values are written into hardware registers whenever the limits change in order to request its internal P-state selection logic to always set P-states within these limits. 
Otherwise, the limits are taken into account -by scaling governors (in the `passive mode `_) and by the driver -every time before setting a new P-state for a CPU. +by scaling governors (in the :ref:`passive mode `) and by the +driver every time before setting a new P-state for a CPU. Additionally, if the ``intel_pstate=per_cpu_perf_limits`` command line argument is passed to the kernel, ``max_perf_pct`` and ``min_perf_pct`` are not exposed at all and the only way to set the limits is by using the policy attributes. +.. _energy_performance_hints: Energy vs Performance Hints --------------------------- @@ -710,9 +717,9 @@ output. On those systems each ``_PSS`` object returns a list of P-states supported by the corresponding CPU which basically is a subset of the P-states range that can be used by ``intel_pstate`` on the same system, with one exception: the whole -`turbo range `_ is represented by one item in it (the topmost one). By -convention, the frequency returned by ``_PSS`` for that item is greater by 1 MHz -than the frequency of the highest non-turbo P-state listed by it, but the +:ref:`turbo range ` is represented by one item in it (the topmost one). +By convention, the frequency returned by ``_PSS`` for that item is greater by +1 MHz than the frequency of the highest non-turbo P-state listed by it, but the corresponding P-state representation (following the hardware specification) returned for it matches the maximum supported turbo P-state (or is the special value 255 meaning essentially "go as high as you can get"). @@ -738,18 +745,18 @@ benefit from running at turbo frequencies will be given non-turbo P-states instead. One more issue related to that may appear on systems supporting the -`Configurable TDP feature `_ allowing the platform firmware to set the -turbo threshold. Namely, if that is not coordinated with the lists of P-states -returned by ``_PSS`` properly, there may be more than one item corresponding to -a turbo P-state in those lists and there may be a problem with avoiding the -turbo range (if desirable or necessary). Usually, to avoid using turbo -P-states overall, ``acpi-cpufreq`` simply avoids using the topmost state listed -by ``_PSS``, but that is not sufficient when there are other turbo P-states in -the list returned by it. +:ref:`Configurable TDP feature ` allowing the platform firmware to set +the turbo threshold. Namely, if that is not coordinated with the lists of +P-states returned by ``_PSS`` properly, there may be more than one item +corresponding to a turbo P-state in those lists and there may be a problem with +avoiding the turbo range (if desirable or necessary). Usually, to avoid using +turbo P-states overall, ``acpi-cpufreq`` simply avoids using the topmost state +listed by ``_PSS``, but that is not sufficient when there are other turbo +P-states in the list returned by it. Apart from the above, ``acpi-cpufreq`` works like ``intel_pstate`` in the -`passive mode `_, except that the number of P-states it can set -is limited to the ones listed by the ACPI ``_PSS`` objects. +:ref:`passive mode `, except that the number of P-states it can +set is limited to the ones listed by the ACPI ``_PSS`` objects. Kernel Command Line Options for ``intel_pstate`` @@ -764,11 +771,11 @@ of them have to be prepended with the ``intel_pstate=`` prefix. processor is supported by it. ``active`` - Register ``intel_pstate`` in the `active mode `_ to start - with. + Register ``intel_pstate`` in the :ref:`active mode ` to + start with. 
``passive`` - Register ``intel_pstate`` in the `passive mode `_ to + Register ``intel_pstate`` in the :ref:`passive mode ` to start with. ``force`` @@ -801,12 +808,12 @@ of them have to be prepended with the ``intel_pstate=`` prefix. and this option has no effect. ``per_cpu_perf_limits`` - Use per-logical-CPU P-State limits (see `Coordination of P-state - Limits`_ for details). + Use per-logical-CPU P-State limits (see + :ref:`pstate_limits_coordination` for details). ``no_cas`` - Do not enable `capacity-aware scheduling `_ which is enabled by - default on hybrid systems without SMT. + Do not enable :ref:`capacity-aware scheduling ` which is enabled + by default on hybrid systems without SMT. Diagnostics and Tuning ====================== @@ -818,7 +825,7 @@ There are two static trace events that can be used for ``intel_pstate`` diagnostics. One of them is the ``cpu_frequency`` trace event generally used by ``CPUFreq``, and the other one is the ``pstate_sample`` trace event specific to ``intel_pstate``. Both of them are triggered by ``intel_pstate`` only if -it works in the `active mode `_. +it works in the :ref:`active mode `. The following sequence of shell commands can be used to enable them and see their output (if the kernel is generally configured to support event tracing):: @@ -830,7 +837,7 @@ their output (if the kernel is generally configured to support event tracing):: gnome-terminal--4510 [001] ..s. 1177.680733: pstate_sample: core_busy=107 scaled=94 from=26 to=26 mperf=1143818 aperf=1230607 tsc=29838618 freq=2474476 cat-5235 [002] ..s. 1177.681723: cpu_frequency: state=2900000 cpu_id=2 -If ``intel_pstate`` works in the `passive mode `_, the +If ``intel_pstate`` works in the :ref:`passive mode `, the ``cpu_frequency`` trace event will be triggered either by the ``schedutil`` scaling governor (for the policies it is attached to), or by the ``CPUFreq`` core (for the policies with other scaling governors). From b1f02f005a2e01287cdb627e1c03c3deb73c5163 Mon Sep 17 00:00:00 2001 From: "Mario Limonciello (AMD)" Date: Fri, 24 Oct 2025 19:43:34 -0500 Subject: [PATCH 43/96] Documentation: power: Add document on debugging shutdown hangs If the kernel hangs while shutting down, ideally a UART log should be captured to debug the problem. However if one isn't available, users can use the pstore functionality to retrieve logs. Add a document explaining how this works to make it more accessible to users. Tested-by: Harry Wentland Signed-off-by: Mario Limonciello (AMD) Link: https://patch.msgid.link/20251025004341.2386868-1-superm1@kernel.org Signed-off-by: Rafael J. Wysocki --- Documentation/power/index.rst | 1 + Documentation/power/shutdown-debugging.rst | 53 ++++++++++++++++++++++ 2 files changed, 54 insertions(+) create mode 100644 Documentation/power/shutdown-debugging.rst diff --git a/Documentation/power/index.rst b/Documentation/power/index.rst index a0f5244fb427..ea70633d9ce6 100644 --- a/Documentation/power/index.rst +++ b/Documentation/power/index.rst @@ -19,6 +19,7 @@ Power Management power_supply_class runtime_pm s2ram + shutdown-debugging suspend-and-cpuhotplug suspend-and-interrupts swsusp-and-swap-files diff --git a/Documentation/power/shutdown-debugging.rst b/Documentation/power/shutdown-debugging.rst new file mode 100644 index 000000000000..cdfa2cd90e5c --- /dev/null +++ b/Documentation/power/shutdown-debugging.rst @@ -0,0 +1,53 @@ +.. 
SPDX-License-Identifier: GPL-2.0 + +Debugging Kernel Shutdown Hangs with pstore ++++++++++++++++++++++++++++++++++++++++++++ + +Overview +======== +If the system hangs while shutting down, the kernel logs may need to be +retrieved to debug the issue. + +On systems that have a UART available, it is best to configure the kernel to use +this UART for kernel console output. + +If a UART isn't available, the ``pstore`` subsystem provides a mechanism to +persist this data across a system reset, allowing it to be retrieved on the next +boot. + +Kernel Configuration +==================== +To enable ``pstore`` and enable saving kernel ring buffer logs, set the +following kernel configuration options: + +* ``CONFIG_PSTORE=y`` +* ``CONFIG_PSTORE_CONSOLE=y`` + +Additionally, enable a backend to store the data. Depending upon your platform +some options include: + +* ``CONFIG_EFI_VARS_PSTORE=y`` +* ``CONFIG_PSTORE_RAM=y`` +* ``CONFIG_PSTORE_FIRMWARE=y`` +* ``CONFIG_PSTORE_BLK=y`` + +Kernel Command-line Parameters +============================== +Add these parameters to your kernel command line: + +* ``printk.always_kmsg_dump=Y`` + * Forces the kernel to dump the entire message buffer to pstore during + shutdown +* ``efi_pstore.pstore_disable=N`` + * For EFI-based systems, ensures the EFI backend is active + +Userspace Interaction and Log Retrieval +======================================= +On the next boot after a hang, pstore logs will be available in the pstore +filesystem (``/sys/fs/pstore``) and can be retrieved by userspace. + +On systemd systems, the ``systemd-pstore`` service will help do the following: + +#. Locate pstore data in ``/sys/fs/pstore`` +#. Read and save it to ``/var/lib/systemd/pstore`` +#. Clear pstore data for the next event From 059835bbfa282918a1e8e5e2d9628aa600093052 Mon Sep 17 00:00:00 2001 From: Zuo An Date: Fri, 24 Oct 2025 05:46:47 +0000 Subject: [PATCH 44/96] tools/power/cpupower: Support building libcpupower statically The cpupower Makefile built and installed libcpupower as a shared library (libcpupower.so) without passing `STATIC=true`, but did not build a static version of the library even with `STATIC=true`. (Only the programs were static). Thus, out-of-tree programs using libcpupower were unable to link statically against the library without having access to intermediate object files produced during the build. This fixes that situation by ensuring that libcpupower.a is built and installed when `STATIC=true` is specified. Link: https://lore.kernel.org/r/x7geegquiks3zndiavw2arihdc2rk7e2dx3lk7yxkewqii6zpg@tzjijqxyzwmu Signed-off-by: Zuo An Signed-off-by: Shuah Khan --- tools/power/cpupower/Makefile | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/tools/power/cpupower/Makefile b/tools/power/cpupower/Makefile index c43db1c41205..a1df9196dc45 100644 --- a/tools/power/cpupower/Makefile +++ b/tools/power/cpupower/Makefile @@ -37,9 +37,7 @@ NLS ?= true # cpufreq-bench benchmarking tool CPUFREQ_BENCH ?= true -# Do not build libraries, but build the code in statically -# Libraries are still built, otherwise the Makefile code would -# be rather ugly. +# Build the code, including libraries, statically. 
export STATIC ?= false # Prefix to the directories we're installing to @@ -207,14 +205,25 @@ $(OUTPUT)lib/%.o: $(LIB_SRC) $(LIB_HEADERS) $(ECHO) " CC " $@ $(QUIET) $(CC) $(CFLAGS) -fPIC -o $@ -c lib/$*.c -$(OUTPUT)libcpupower.so.$(LIB_VER): $(LIB_OBJS) +ifeq ($(strip $(STATIC)),true) +LIBCPUPOWER := libcpupower.a +else +LIBCPUPOWER := libcpupower.so.$(LIB_VER) +endif + +$(OUTPUT)$(LIBCPUPOWER): $(LIB_OBJS) +ifeq ($(strip $(STATIC)),true) + $(ECHO) " AR " $@ + $(QUIET) $(AR) rcs $@ $(LIB_OBJS) +else $(ECHO) " LD " $@ $(QUIET) $(CC) -shared $(CFLAGS) $(LDFLAGS) -o $@ \ -Wl,-soname,libcpupower.so.$(LIB_MAJ) $(LIB_OBJS) @ln -sf $(@F) $(OUTPUT)libcpupower.so @ln -sf $(@F) $(OUTPUT)libcpupower.so.$(LIB_MAJ) +endif -libcpupower: $(OUTPUT)libcpupower.so.$(LIB_VER) +libcpupower: $(OUTPUT)$(LIBCPUPOWER) # Let all .o files depend on its .c file and all headers # Might be worth to put this into utils/Makefile at some point of time @@ -224,7 +233,7 @@ $(OUTPUT)%.o: %.c $(ECHO) " CC " $@ $(QUIET) $(CC) $(CFLAGS) -I./lib -I ./utils -o $@ -c $*.c -$(OUTPUT)cpupower: $(UTIL_OBJS) $(OUTPUT)libcpupower.so.$(LIB_VER) +$(OUTPUT)cpupower: $(UTIL_OBJS) $(OUTPUT)$(LIBCPUPOWER) $(ECHO) " CC " $@ ifeq ($(strip $(STATIC)),true) $(QUIET) $(CC) $(CFLAGS) $(LDFLAGS) $(UTIL_OBJS) -lrt -lpci -L$(OUTPUT) -o $@ @@ -269,7 +278,7 @@ update-po: $(OUTPUT)po/$(PACKAGE).pot done; endif -compile-bench: $(OUTPUT)libcpupower.so.$(LIB_VER) +compile-bench: $(OUTPUT)$(LIBCPUPOWER) @V=$(V) confdir=$(confdir) $(MAKE) -C bench O=$(OUTPUT) # we compile into subdirectories. if the target directory is not the @@ -287,6 +296,7 @@ clean: -find $(OUTPUT) \( -not -type d \) -and \( -name '*~' -o -name '*.[oas]' \) -type f -print \ | xargs rm -f -rm -f $(OUTPUT)cpupower + -rm -f $(OUTPUT)libcpupower.a -rm -f $(OUTPUT)libcpupower.so* -rm -rf $(OUTPUT)po/*.gmo -rm -rf $(OUTPUT)po/*.pot @@ -295,7 +305,11 @@ clean: install-lib: libcpupower $(INSTALL) -d $(DESTDIR)${libdir} +ifeq ($(strip $(STATIC)),true) + $(CP) $(OUTPUT)libcpupower.a $(DESTDIR)${libdir}/ +else $(CP) $(OUTPUT)libcpupower.so* $(DESTDIR)${libdir}/ +endif $(INSTALL) -d $(DESTDIR)${includedir} $(INSTALL_DATA) lib/cpufreq.h $(DESTDIR)${includedir}/cpufreq.h $(INSTALL_DATA) lib/cpuidle.h $(DESTDIR)${includedir}/cpuidle.h @@ -336,11 +350,7 @@ install-bench: compile-bench @#DESTDIR must be set from outside to survive @sbindir=$(sbindir) bindir=$(bindir) docdir=$(docdir) confdir=$(confdir) $(MAKE) -C bench O=$(OUTPUT) install -ifeq ($(strip $(STATIC)),true) -install: all install-tools install-man $(INSTALL_NLS) $(INSTALL_BENCH) -else install: all install-lib install-tools install-man $(INSTALL_NLS) $(INSTALL_BENCH) -endif uninstall: - rm -f $(DESTDIR)${libdir}/libcpupower.* From 39ce15a48f6730c8e53cc8fd0f63995a5e4bb239 Mon Sep 17 00:00:00 2001 From: "Mario Limonciello (AMD)" Date: Thu, 6 Nov 2025 08:25:10 -0600 Subject: [PATCH 45/96] Documentation: power: Correct a mistaken configuration option Somehow CONFIG_PSTORE_FIRMWARE ended up in this document when I intended it to be CONFIG_CHROMEOS_PSTORE. Correct the configuration option and make it clear that not all options are required. Fixes: b1f02f005a2e ("Documentation: power: Add document on debugging shutdown hangs") Reported-by: Rodrigo Siqueira Signed-off-by: Mario Limonciello (AMD) [ rjw: Fixes: tag ] Link: https://patch.msgid.link/20251106142524.3841343-1-superm1@kernel.org Signed-off-by: Rafael J. 
Wysocki --- Documentation/power/shutdown-debugging.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/power/shutdown-debugging.rst b/Documentation/power/shutdown-debugging.rst index cdfa2cd90e5c..c510122e0bbc 100644 --- a/Documentation/power/shutdown-debugging.rst +++ b/Documentation/power/shutdown-debugging.rst @@ -24,11 +24,11 @@ following kernel configuration options: * ``CONFIG_PSTORE_CONSOLE=y`` Additionally, enable a backend to store the data. Depending upon your platform -some options include: +some potential options include: * ``CONFIG_EFI_VARS_PSTORE=y`` * ``CONFIG_PSTORE_RAM=y`` -* ``CONFIG_PSTORE_FIRMWARE=y`` +* ``CONFIG_CHROMEOS_PSTORE=y`` * ``CONFIG_PSTORE_BLK=y`` Kernel Command-line Parameters From 352899fd911cafd16b3f41bb5c8585124dbd7f4b Mon Sep 17 00:00:00 2001 From: Kaushlendra Kumar Date: Mon, 27 Oct 2025 10:11:27 +0530 Subject: [PATCH 46/96] PM: wakeup: Delete timer before removing wakeup source from list Replace timer_delete_sync() with timer_shutdown_sync() and move it before list_del_rcu() in wakeup_source_remove() to improve the cleanup ordering and code clarity. This ensures that the timer is stopped before removing the wakeup source from the events list, providing a more logical cleanup sequence. While the current ordering is functionally correct, stopping the timer first makes the cleanup flow more intuitive and follows the general pattern of disabling active components before removing data structures. Signed-off-by: Kaushlendra Kumar [ rjw: Subject and changelog edits ] Link: https://patch.msgid.link/20251027044127.2456365-1-kaushlendra.kumar@intel.com Signed-off-by: Rafael J. Wysocki --- drivers/base/power/wakeup.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c index d1283ff1080b..ab3eee23a52d 100644 --- a/drivers/base/power/wakeup.c +++ b/drivers/base/power/wakeup.c @@ -189,17 +189,11 @@ static void wakeup_source_remove(struct wakeup_source *ws) if (WARN_ON(!ws)) return; + timer_shutdown_sync(&ws->timer); raw_spin_lock_irqsave(&events_lock, flags); list_del_rcu(&ws->entry); raw_spin_unlock_irqrestore(&events_lock, flags); synchronize_srcu(&wakeup_srcu); - - timer_delete_sync(&ws->timer); - /* - * Clear timer.function to make wakeup_source_not_registered() treat - * this wakeup source as not registered. - */ - ws->timer.function = NULL; } /** From 58f5d39d5ed8f2e43f230389ea0d59791afdcd55 Mon Sep 17 00:00:00 2001 From: Christian Marangi Date: Wed, 5 Nov 2025 12:21:34 +0100 Subject: [PATCH 47/96] cpufreq: qcom-nvmem: add compatible fallback for ipq806x for no SMEM On some IPQ806x SoCs, SMEM might not be initialized by the SBL. This is the case for some Google devices (the OnHub family) that can't make use of SMEM to detect the SoC ID (and socinfo can't be used either, as it depends on SMEM presence). To handle this specific case, check whether SMEM is not initialized (by checking if qcom_smem_get_soc_id() returns -ENODEV) and fall back to OF machine compatible matching to identify the SoC variant.
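As a minimal illustration of this fallback idiom (not part of this patch; the my_soc_ids table, the SoC ID values and my_get_soc_id() are hypothetical, while of_find_node_by_path(), of_match_node() and of_node_put() are the real OF APIs):

#include <linux/errno.h>
#include <linux/of.h>
#include <linux/types.h>

/*
 * Hypothetical table mapping the machine (root node) compatible to a
 * SoC ID. of_match_node() walks entries until it hits an empty one, so
 * the table must be terminated with a sentinel.
 */
static const struct of_device_id my_soc_ids[] = {
	{ .compatible = "vendor,soc-a", .data = (const void *)1 },
	{ .compatible = "vendor,soc-b", .data = (const void *)2 },
	{ /* sentinel */ }
};

static int my_get_soc_id(void)
{
	const struct of_device_id *match;
	struct device_node *root;

	/* The root node's compatible identifies the machine/SoC. */
	root = of_find_node_by_path("/");
	if (!root)
		return -ENODEV;

	match = of_match_node(my_soc_ids, root);
	of_node_put(root);
	if (!match)
		return -ENODEV;

	return (int)(uintptr_t)match->data;
}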
Suggested-by: Dmitry Baryshkov Reviewed-by: Konrad Dybcio Signed-off-by: Christian Marangi Signed-off-by: Viresh Kumar --- drivers/cpufreq/qcom-cpufreq-nvmem.c | 35 ++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/qcom-cpufreq-nvmem.c b/drivers/cpufreq/qcom-cpufreq-nvmem.c index 765a5bb81829..d5af74bf71c6 100644 --- a/drivers/cpufreq/qcom-cpufreq-nvmem.c +++ b/drivers/cpufreq/qcom-cpufreq-nvmem.c @@ -256,13 +256,22 @@ static int qcom_cpufreq_krait_name_version(struct device *cpu_dev, return ret; } +static const struct of_device_id qcom_cpufreq_ipq806x_match_list[] = { + { .compatible = "qcom,ipq8062", .data = (const void *)QCOM_ID_IPQ8062 }, + { .compatible = "qcom,ipq8064", .data = (const void *)QCOM_ID_IPQ8064 }, + { .compatible = "qcom,ipq8065", .data = (const void *)QCOM_ID_IPQ8065 }, + { .compatible = "qcom,ipq8066", .data = (const void *)QCOM_ID_IPQ8066 }, + { .compatible = "qcom,ipq8068", .data = (const void *)QCOM_ID_IPQ8068 }, + { .compatible = "qcom,ipq8069", .data = (const void *)QCOM_ID_IPQ8069 }, +}; + static int qcom_cpufreq_ipq8064_name_version(struct device *cpu_dev, struct nvmem_cell *speedbin_nvmem, char **pvs_name, struct qcom_cpufreq_drv *drv) { + int msm_id = -1, ret = 0; int speed = 0, pvs = 0; - int msm_id, ret = 0; u8 *speedbin; size_t len; @@ -279,8 +288,30 @@ static int qcom_cpufreq_ipq8064_name_version(struct device *cpu_dev, get_krait_bin_format_a(cpu_dev, &speed, &pvs, speedbin); ret = qcom_smem_get_soc_id(&msm_id); - if (ret) + if (ret == -ENODEV) { + const struct of_device_id *match; + struct device_node *root; + + root = of_find_node_by_path("/"); + if (!root) { + ret = -ENODEV; + goto exit; + } + + /* Fallback to compatible match with no SMEM initialized */ + match = of_match_node(qcom_cpufreq_ipq806x_match_list, root); + of_node_put(root); + if (!match) { + ret = -ENODEV; + goto exit; + } + + /* We found a matching device, get the msm_id from the data entry */ + msm_id = (int)(uintptr_t)match->data; + ret = 0; + } else if (ret) { goto exit; + } switch (msm_id) { case QCOM_ID_IPQ8062: From 47c303ba6e8090f5941cc264bf207ccbda13586c Mon Sep 17 00:00:00 2001 From: Marco Crivellari Date: Thu, 6 Nov 2025 17:33:41 +0100 Subject: [PATCH 48/96] cpufreq: tegra194: add WQ_PERCPU to alloc_workqueue users MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, if a user enqueues a work item using schedule_delayed_work(), the wq used is "system_wq" (a per-cpu wq), while queue_delayed_work() uses WORK_CPU_UNBOUND (used when a cpu is not specified). The same applies to schedule_work(), which uses system_wq, and queue_work(), which again makes use of WORK_CPU_UNBOUND. This lack of consistency cannot be addressed without refactoring the API. alloc_workqueue() treats all queues as per-CPU by default, while unbound workqueues must opt-in via WQ_UNBOUND. This default is suboptimal: most workloads benefit from unbound queues, allowing the scheduler to place worker threads where they’re needed and reducing noise when CPUs are isolated. This continues the effort to refactor workqueue APIs, which began with the introduction of new workqueues and a new alloc_workqueue flag in: commit 128ea9f6ccfb ("workqueue: Add system_percpu_wq and system_dfl_wq") commit 930c2ea566af ("workqueue: Add new WQ_PERCPU flag") This change adds a new WQ_PERCPU flag to explicitly request alloc_workqueue() to be per-cpu when WQ_UNBOUND has not been specified.
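As an illustrative sketch of the new convention (the "my_wq" workqueue and helper are made-up examples, not taken from this patch):

#include <linux/errno.h>
#include <linux/workqueue.h>

static struct workqueue_struct *my_wq;

static int my_wq_setup(void)
{
	/*
	 * Explicitly request per-CPU behavior rather than relying on the
	 * historical default; a queue that does not need CPU locality
	 * would pass WQ_UNBOUND instead.
	 */
	my_wq = alloc_workqueue("my_wq", WQ_PERCPU, 0);
	return my_wq ? 0 : -ENOMEM;
}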
With the introduction of the WQ_PERCPU flag (equivalent to !WQ_UNBOUND), any alloc_workqueue() caller that doesn’t explicitly specify WQ_UNBOUND must now use WQ_PERCPU. Once migration is complete, WQ_UNBOUND can be removed and unbound will become the implicit default. Suggested-by: Tejun Heo Signed-off-by: Marco Crivellari [ Viresh: Fixed Subject ] Signed-off-by: Viresh Kumar --- drivers/cpufreq/tegra194-cpufreq.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/cpufreq/tegra194-cpufreq.c b/drivers/cpufreq/tegra194-cpufreq.c index 9b4f516f313e..695599e1001f 100644 --- a/drivers/cpufreq/tegra194-cpufreq.c +++ b/drivers/cpufreq/tegra194-cpufreq.c @@ -750,7 +750,8 @@ static int tegra194_cpufreq_probe(struct platform_device *pdev) if (IS_ERR(bpmp)) return PTR_ERR(bpmp); - read_counters_wq = alloc_workqueue("read_counters_wq", __WQ_LEGACY, 1); + read_counters_wq = alloc_workqueue("read_counters_wq", + __WQ_LEGACY | WQ_PERCPU, 1); if (!read_counters_wq) { dev_err(&pdev->dev, "fail to create_workqueue\n"); err = -EINVAL; From 7e17f48667b6707593fc215cbe025157920934f1 Mon Sep 17 00:00:00 2001 From: "Mario Limonciello (AMD)" Date: Thu, 9 Oct 2025 11:17:51 -0500 Subject: [PATCH 49/96] cpufreq/amd-pstate: Use sysfs_match_string() for epp Rather than scanning the buffer and manually matching the string, use the sysfs macros. Reviewed-by: Gautham R. Shenoy Signed-off-by: Mario Limonciello (AMD) --- drivers/cpufreq/amd-pstate.c | 15 ++++-----------1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index b44f0f7a5ba1..0bc501344887 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -118,7 +118,6 @@ static const char * const energy_perf_strings[] = { [EPP_INDEX_BALANCE_PERFORMANCE] = "balance_performance", [EPP_INDEX_BALANCE_POWERSAVE] = "balance_power", [EPP_INDEX_POWERSAVE] = "power", - NULL } static unsigned int epp_values[] = { @@ -1137,16 +1136,15 @@ static ssize_t show_amd_pstate_hw_prefcore(struct cpufreq_policy *policy, static ssize_t show_energy_performance_available_preferences( struct cpufreq_policy *policy, char *buf) { - int i = 0; - int offset = 0; + int offset = 0, i; struct amd_cpudata *cpudata = policy->driver_data; if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) return sysfs_emit_at(buf, offset, "%s\n", energy_perf_strings[EPP_INDEX_PERFORMANCE]); - while (energy_perf_strings[i] != NULL) - offset += sysfs_emit_at(buf, offset, "%s ", energy_perf_strings[i++]); + for (i = 0; i < ARRAY_SIZE(energy_perf_strings); i++) + offset += sysfs_emit_at(buf, offset, "%s ", energy_perf_strings[i]); offset += sysfs_emit_at(buf, offset, "\n"); @@ -1157,15 +1155,10 @@ static ssize_t store_energy_performance_preference( struct cpufreq_policy *policy, const char *buf, size_t count) { struct amd_cpudata *cpudata = policy->driver_data; - char str_preference[21]; ssize_t ret; u8 epp; - ret = sscanf(buf, "%20s", str_preference); - if (ret != 1) - return -EINVAL; - - ret = match_string(energy_perf_strings, -1, str_preference); + ret = sysfs_match_string(energy_perf_strings, buf); if (ret < 0) return -EINVAL; From 06791bc017ea0793dae71553163c9107e91c415b Mon Sep 17 00:00:00 2001 From: "Mario Limonciello (AMD)" Date: Thu, 9 Oct 2025 11:17:52 -0500 Subject: [PATCH 50/96] cpufreq/amd-pstate: Drop NULL value from amd_pstate_mode_string None of the users actually look for the NULL value.
To avoid the risk of a regression when introducing a new value but forgetting to add its string, add a static assert to verify that AMD_PSTATE_MAX matches the array size. Reviewed-by: Gautham R. Shenoy Signed-off-by: Mario Limonciello (AMD) --- drivers/cpufreq/amd-pstate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 0bc501344887..a5b9e5baf423 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -65,8 +65,8 @@ static const char * const amd_pstate_mode_string[] = { [AMD_PSTATE_PASSIVE] = "passive", [AMD_PSTATE_ACTIVE] = "active", [AMD_PSTATE_GUIDED] = "guided", - NULL, }; +static_assert(ARRAY_SIZE(amd_pstate_mode_string) == AMD_PSTATE_MAX); const char *amd_pstate_get_mode_string(enum amd_pstate_mode mode) { From baf106f3a7ba8bf317e1f9d32ee88955723cbc71 Mon Sep 17 00:00:00 2001 From: "Mario Limonciello (AMD)" Date: Thu, 9 Oct 2025 11:17:53 -0500 Subject: [PATCH 51/96] cpufreq/amd-pstate: Make amd_pstate_get_mode_string() never return NULL amd_pstate_get_mode_string() is only used by amd-pstate-ut. Set the failure path to use AMD_PSTATE_UNDEFINED ("undefined") to avoid showing "(null)" as a string when running the test suite. Reviewed-by: Gautham R. Shenoy Signed-off-by: Mario Limonciello (AMD) --- drivers/cpufreq/amd-pstate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index a5b9e5baf423..5feb9f5e3a49 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -70,8 +70,8 @@ static_assert(ARRAY_SIZE(amd_pstate_mode_string) == AMD_PSTATE_MAX); const char *amd_pstate_get_mode_string(enum amd_pstate_mode mode) { - if (mode < 0 || mode >= AMD_PSTATE_MAX) - return NULL; + if (mode < AMD_PSTATE_UNDEFINED || mode >= AMD_PSTATE_MAX) + mode = AMD_PSTATE_UNDEFINED; return amd_pstate_mode_string[mode]; } EXPORT_SYMBOL_GPL(amd_pstate_get_mode_string); From 92d6146a40b2d5ce0b9dcc7b3ff28d57b4757ed1 Mon Sep 17 00:00:00 2001 From: "Mario Limonciello (AMD)" Date: Thu, 9 Oct 2025 11:17:54 -0500 Subject: [PATCH 52/96] cpufreq/amd-pstate: Adjust return values in amd_pstate_update_status() get_mode_idx_from_str() already checks the upper boundary for the string passed to it. Drop the extra check in amd_pstate_update_status() and pass the return code if there is a failure. Reviewed-by: Gautham R. Shenoy Signed-off-by: Mario Limonciello (AMD) --- drivers/cpufreq/amd-pstate.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 5feb9f5e3a49..2d2ef53d1244 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -1346,9 +1346,8 @@ int amd_pstate_update_status(const char *buf, size_t size) return -EINVAL; mode_idx = get_mode_idx_from_str(buf, size); - - if (mode_idx < 0 || mode_idx >= AMD_PSTATE_MAX) - return -EINVAL; + if (mode_idx < 0) + return mode_idx; if (mode_state_machine[cppc_state][mode_idx]) { guard(mutex)(&amd_pstate_driver_lock); From e9d62ca86a5525a742742fe69e9aa316cfd4f471 Mon Sep 17 00:00:00 2001 From: "Mario Limonciello (AMD)" Date: Thu, 9 Oct 2025 11:17:55 -0500 Subject: [PATCH 53/96] cpufreq/amd-pstate: Fix some whitespace issues Add whitespace around the equals sign and remove a leading space. Reviewed-by: Gautham R.
Shenoy Signed-off-by: Mario Limonciello (AMD) --- drivers/cpufreq/amd-pstate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 2d2ef53d1244..a0f21ac1205a 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -126,7 +126,7 @@ static unsigned int epp_values[] = { [EPP_INDEX_BALANCE_PERFORMANCE] = AMD_CPPC_EPP_BALANCE_PERFORMANCE, [EPP_INDEX_BALANCE_POWERSAVE] = AMD_CPPC_EPP_BALANCE_POWERSAVE, [EPP_INDEX_POWERSAVE] = AMD_CPPC_EPP_POWERSAVE, - }; +}; typedef int (*cppc_mode_transition_fn)(int); @@ -182,7 +182,7 @@ static inline int get_mode_idx_from_str(const char *str, size_t size) { int i; - for (i=0; i < AMD_PSTATE_MAX; i++) { + for (i = 0; i < AMD_PSTATE_MAX; i++) { if (!strncmp(str, amd_pstate_mode_string[i], size)) return i; } From 077f23573d29d063a950e90aa77c8e1f79580147 Mon Sep 17 00:00:00 2001 From: "Mario Limonciello (AMD)" Date: Thu, 9 Oct 2025 11:17:56 -0500 Subject: [PATCH 54/96] cpufreq/amd-pstate: Add static asserts for EPP indices In case a new index is introduced, add a static assert to make sure that the strings and values are updated. Reviewed-by: Gautham R. Shenoy Signed-off-by: Mario Limonciello (AMD) --- drivers/cpufreq/amd-pstate.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index a0f21ac1205a..b3dad7cde46f 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -110,6 +110,7 @@ enum energy_perf_value_index { EPP_INDEX_BALANCE_PERFORMANCE, EPP_INDEX_BALANCE_POWERSAVE, EPP_INDEX_POWERSAVE, + EPP_INDEX_MAX, }; static const char * const energy_perf_strings[] = { @@ -119,6 +120,7 @@ static const char * const energy_perf_strings[] = { [EPP_INDEX_BALANCE_POWERSAVE] = "balance_power", [EPP_INDEX_POWERSAVE] = "power", }; +static_assert(ARRAY_SIZE(energy_perf_strings) == EPP_INDEX_MAX); static unsigned int epp_values[] = { [EPP_INDEX_DEFAULT] = 0, @@ -127,6 +129,7 @@ static unsigned int epp_values[] = { [EPP_INDEX_BALANCE_POWERSAVE] = AMD_CPPC_EPP_BALANCE_POWERSAVE, [EPP_INDEX_POWERSAVE] = AMD_CPPC_EPP_POWERSAVE, }; +static_assert(ARRAY_SIZE(epp_values) == EPP_INDEX_MAX); typedef int (*cppc_mode_transition_fn)(int); From bb31fef0d03ed17d587b40e3458786be408fb9df Mon Sep 17 00:00:00 2001 From: "Gautham R. Shenoy" Date: Fri, 7 Nov 2025 13:11:45 +0530 Subject: [PATCH 55/96] cpufreq/amd-pstate: Call cppc_set_auto_sel() only for online CPUs amd_pstate_change_mode_without_dvr_change() calls cppc_set_auto_sel() for all the present CPUs. However, this callpath eventually calls cppc_set_reg_val() which accesses the per-cpu cpc_desc_ptr object. This object is initialized only for online CPUs via acpi_soft_cpu_online() --> __acpi_processor_start() --> acpi_cppc_processor_probe(). Hence, restrict calling cppc_set_auto_sel() to only the online CPUs. Fixes: 3ca7bc818d8c ("cpufreq: amd-pstate: Add guided mode control support via sysfs") Suggested-by: Mario Limonciello (AMD) (kernel.org) Signed-off-by: Gautham R.
Shenoy Signed-off-by: Mario Limonciello (AMD) --- drivers/cpufreq/amd-pstate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index b3dad7cde46f..c45bc98721d2 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -1278,7 +1278,7 @@ static int amd_pstate_change_mode_without_dvr_change(int mode) if (cpu_feature_enabled(X86_FEATURE_CPPC) || cppc_state == AMD_PSTATE_ACTIVE) return 0; - for_each_present_cpu(cpu) { + for_each_online_cpu(cpu) { cppc_set_auto_sel(cpu, (cppc_state == AMD_PSTATE_PASSIVE) ? 0 : 1); } From 62c95ea763915aebb8755185d5cdf72966b27cd3 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 7 Nov 2025 19:18:26 +0100 Subject: [PATCH 56/96] cpufreq: intel_pstate: Use mutex guard for driver locking Use guard(mutex)(&intel_pstate_driver_lock), or the scoped variant of it, wherever intel_pstate_driver_lock needs to be held. This allows some local variables and goto statements to be dropped as they are not necessary any more. Signed-off-by: Rafael J. Wysocki Reviewed-by: Muhammad Usama Anjum Link: https://patch.msgid.link/2807232.mvXUDI8C0e@rafael.j.wysocki --- drivers/cpufreq/intel_pstate.c | 101 +++++++++++---------------------- 1 file changed, 34 insertions(+), 67 deletions(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index e05bd9c8ab85..2a126d7dae01 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -1393,7 +1393,8 @@ static void set_power_ctl_ee_state(bool input) { u64 power_ctl; - mutex_lock(&intel_pstate_driver_lock); + guard(mutex)(&intel_pstate_driver_lock); + rdmsrq(MSR_IA32_POWER_CTL, power_ctl); if (input) { power_ctl &= ~BIT(MSR_IA32_POWER_CTL_BIT_EE); @@ -1403,7 +1404,6 @@ static void set_power_ctl_ee_state(bool input) power_ctl_ee_state = POWER_CTL_EE_DISABLE; } wrmsrq(MSR_IA32_POWER_CTL, power_ctl); - mutex_unlock(&intel_pstate_driver_lock); } static void intel_pstate_hwp_enable(struct cpudata *cpudata); @@ -1525,13 +1525,9 @@ static int intel_pstate_update_status(const char *buf, size_t size); static ssize_t show_status(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - ssize_t ret; + guard(mutex)(&intel_pstate_driver_lock); - mutex_lock(&intel_pstate_driver_lock); - ret = intel_pstate_show_status(buf); - mutex_unlock(&intel_pstate_driver_lock); - - return ret; + return intel_pstate_show_status(buf); } static ssize_t store_status(struct kobject *a, struct kobj_attribute *b, @@ -1540,11 +1536,13 @@ static ssize_t store_status(struct kobject *a, struct kobj_attribute *b, char *p = memchr(buf, '\n', count); int ret; - mutex_lock(&intel_pstate_driver_lock); - ret = intel_pstate_update_status(buf, p ? p - buf : count); - mutex_unlock(&intel_pstate_driver_lock); + guard(mutex)(&intel_pstate_driver_lock); - return ret < 0 ? ret : count; + ret = intel_pstate_update_status(buf, p ? 
p - buf : count); + if (ret < 0) + return ret; + + return count; } static ssize_t show_turbo_pct(struct kobject *kobj, @@ -1554,12 +1552,10 @@ static ssize_t show_turbo_pct(struct kobject *kobj, int total, no_turbo, turbo_pct; uint32_t turbo_fp; - mutex_lock(&intel_pstate_driver_lock); + guard(mutex)(&intel_pstate_driver_lock); - if (!intel_pstate_driver) { - mutex_unlock(&intel_pstate_driver_lock); + if (!intel_pstate_driver) return -EAGAIN; - } cpu = all_cpu_data[0]; @@ -1568,8 +1564,6 @@ static ssize_t show_turbo_pct(struct kobject *kobj, turbo_fp = div_fp(no_turbo, total); turbo_pct = 100 - fp_toint(mul_fp(turbo_fp, int_tofp(100))); - mutex_unlock(&intel_pstate_driver_lock); - return sprintf(buf, "%u\n", turbo_pct); } @@ -1579,38 +1573,26 @@ static ssize_t show_num_pstates(struct kobject *kobj, struct cpudata *cpu; int total; - mutex_lock(&intel_pstate_driver_lock); + guard(mutex)(&intel_pstate_driver_lock); - if (!intel_pstate_driver) { - mutex_unlock(&intel_pstate_driver_lock); + if (!intel_pstate_driver) return -EAGAIN; - } cpu = all_cpu_data[0]; total = cpu->pstate.turbo_pstate - cpu->pstate.min_pstate + 1; - mutex_unlock(&intel_pstate_driver_lock); - return sprintf(buf, "%u\n", total); } static ssize_t show_no_turbo(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - ssize_t ret; + guard(mutex)(&intel_pstate_driver_lock); - mutex_lock(&intel_pstate_driver_lock); - - if (!intel_pstate_driver) { - mutex_unlock(&intel_pstate_driver_lock); + if (!intel_pstate_driver) return -EAGAIN; - } - ret = sprintf(buf, "%u\n", global.no_turbo); - - mutex_unlock(&intel_pstate_driver_lock); - - return ret; + return sprintf(buf, "%u\n", global.no_turbo); } static ssize_t store_no_turbo(struct kobject *a, struct kobj_attribute *b, @@ -1622,28 +1604,24 @@ static ssize_t store_no_turbo(struct kobject *a, struct kobj_attribute *b, if (sscanf(buf, "%u", &input) != 1) return -EINVAL; - mutex_lock(&intel_pstate_driver_lock); + guard(mutex)(&intel_pstate_driver_lock); - if (!intel_pstate_driver) { - count = -EAGAIN; - goto unlock_driver; - } + if (!intel_pstate_driver) + return -EAGAIN; no_turbo = !!clamp_t(int, input, 0, 1); WRITE_ONCE(global.turbo_disabled, turbo_is_disabled()); if (global.turbo_disabled && !no_turbo) { pr_notice("Turbo disabled by BIOS or unavailable on processor\n"); - count = -EPERM; if (global.no_turbo) - goto unlock_driver; - else - no_turbo = 1; + return -EPERM; + + no_turbo = 1; } - if (no_turbo == global.no_turbo) { - goto unlock_driver; - } + if (no_turbo == global.no_turbo) + return count; WRITE_ONCE(global.no_turbo, no_turbo); @@ -1663,9 +1641,6 @@ static ssize_t store_no_turbo(struct kobject *a, struct kobj_attribute *b, intel_pstate_update_limits_for_all(); arch_set_max_freq_ratio(no_turbo); -unlock_driver: - mutex_unlock(&intel_pstate_driver_lock); - return count; } @@ -1715,12 +1690,10 @@ static ssize_t store_max_perf_pct(struct kobject *a, struct kobj_attribute *b, if (ret != 1) return -EINVAL; - mutex_lock(&intel_pstate_driver_lock); + guard(mutex)(&intel_pstate_driver_lock); - if (!intel_pstate_driver) { - mutex_unlock(&intel_pstate_driver_lock); + if (!intel_pstate_driver) return -EAGAIN; - } mutex_lock(&intel_pstate_limits_lock); @@ -1733,8 +1706,6 @@ static ssize_t store_max_perf_pct(struct kobject *a, struct kobj_attribute *b, else update_qos_requests(FREQ_QOS_MAX); - mutex_unlock(&intel_pstate_driver_lock); - return count; } @@ -1748,12 +1719,10 @@ static ssize_t store_min_perf_pct(struct kobject *a, struct kobj_attribute *b, if (ret != 1) return 
-EINVAL; - mutex_lock(&intel_pstate_driver_lock); + guard(mutex)(&intel_pstate_driver_lock); - if (!intel_pstate_driver) { - mutex_unlock(&intel_pstate_driver_lock); + if (!intel_pstate_driver) return -EAGAIN; - } mutex_lock(&intel_pstate_limits_lock); @@ -1767,8 +1736,6 @@ static ssize_t store_min_perf_pct(struct kobject *a, struct kobj_attribute *b, else update_qos_requests(FREQ_QOS_MIN); - mutex_unlock(&intel_pstate_driver_lock); - return count; } @@ -1789,10 +1756,10 @@ static ssize_t store_hwp_dynamic_boost(struct kobject *a, if (ret) return ret; - mutex_lock(&intel_pstate_driver_lock); + guard(mutex)(&intel_pstate_driver_lock); + hwp_boost = !!input; intel_pstate_update_policies(); - mutex_unlock(&intel_pstate_driver_lock); return count; } @@ -3914,9 +3881,9 @@ static int __init intel_pstate_init(void) } - mutex_lock(&intel_pstate_driver_lock); - rc = intel_pstate_register_driver(default_driver); - mutex_unlock(&intel_pstate_driver_lock); + scoped_guard(mutex, &intel_pstate_driver_lock) { + rc = intel_pstate_register_driver(default_driver); + } if (rc) { intel_pstate_sysfs_remove(); return rc; From 9cf02802d60af2ab52fc5f6d015baae946c85072 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 12 Nov 2025 17:14:26 +0100 Subject: [PATCH 57/96] PM: wakeup: Update after recent wakeup source removal ordering change After a recent change, wakeup_source_activate() will warn that the given wakeup source is "unregistered" after its timer has been shut down in wakeup_source_remove() which may be somewhat confusing, so change the warning message to say that the wakeup source is "unusable". Accordingly, rename wakeup_source_not_registered() to wakeup_source_not_usable() and update the comment in it to also mention the removal of the wakeup source. Also restore the comment in wakeup_source_remove() regarding the warning in wakeup_source_activate() that may trigger after shutting down the wakeup source timer. Signed-off-by: Rafael J. Wysocki Link: https://patch.msgid.link/12788103.O9o76ZdvQC@rafael.j.wysocki --- drivers/base/power/wakeup.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c index ab3eee23a52d..1e1a0e7eeac5 100644 --- a/drivers/base/power/wakeup.c +++ b/drivers/base/power/wakeup.c @@ -189,7 +189,12 @@ static void wakeup_source_remove(struct wakeup_source *ws) if (WARN_ON(!ws)) return; + /* + * After shutting down the timer, wakeup_source_activate() will warn if + * the given wakeup source is passed to it. + */ timer_shutdown_sync(&ws->timer); + raw_spin_lock_irqsave(&events_lock, flags); list_del_rcu(&ws->entry); raw_spin_unlock_irqrestore(&events_lock, flags); @@ -500,14 +505,14 @@ int device_set_wakeup_enable(struct device *dev, bool enable) EXPORT_SYMBOL_GPL(device_set_wakeup_enable); /** - * wakeup_source_not_registered - validate the given wakeup source. + * wakeup_source_not_usable - validate the given wakeup source. * @ws: Wakeup source to be validated. */ -static bool wakeup_source_not_registered(struct wakeup_source *ws) +static bool wakeup_source_not_usable(struct wakeup_source *ws) { /* - * Use timer struct to check if the given source is initialized - * by wakeup_source_add. + * Use the timer struct to check if the given wakeup source has been + * initialized by wakeup_source_add() and it is not going away. 
*/ return ws->timer.function != pm_wakeup_timer_fn; } @@ -552,8 +557,7 @@ static void wakeup_source_activate(struct wakeup_source *ws) { unsigned int cec; - if (WARN_ONCE(wakeup_source_not_registered(ws), - "unregistered wakeup source\n")) + if (WARN_ONCE(wakeup_source_not_usable(ws), "unusable wakeup source\n")) return; ws->active = true; From 76934e495cdc31942b53b513cee4290750578a9a Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 7 Nov 2025 20:07:28 +0100 Subject: [PATCH 58/96] cpuidle: Add sanity check for exit latency and target residency Make __cpuidle_driver_init() fail if the exit latency of one of the driver's idle states is greater than its target residency, which would break cpuidle assumptions. Signed-off-by: Rafael J. Wysocki Reviewed-by: Artem Bityutskiy Reviewed-by: Christian Loehle [ rjw: Changelog fix ] Link: https://patch.msgid.link/12779486.O9o76ZdvQC@rafael.j.wysocki Signed-off-by: Rafael J. Wysocki --- drivers/cpuidle/driver.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/drivers/cpuidle/driver.c b/drivers/cpuidle/driver.c index 9bbfa594c442..1c295a93d582 100644 --- a/drivers/cpuidle/driver.c +++ b/drivers/cpuidle/driver.c @@ -152,7 +152,7 @@ static void cpuidle_setup_broadcast_timer(void *arg) * __cpuidle_driver_init - initialize the driver's internal data * @drv: a valid pointer to a struct cpuidle_driver */ -static void __cpuidle_driver_init(struct cpuidle_driver *drv) +static int __cpuidle_driver_init(struct cpuidle_driver *drv) { int i; @@ -193,7 +193,17 @@ static void __cpuidle_driver_init(struct cpuidle_driver *drv) s->exit_latency_ns = 0; else s->exit_latency = div_u64(s->exit_latency_ns, NSEC_PER_USEC); + + /* + * Ensure that the exit latency of a CPU idle state does not + * exceed its target residency which is assumed in cpuidle in + * multiple places. + */ + if (s->exit_latency_ns > s->target_residency_ns) + return -EINVAL; } + + return 0; } /** @@ -223,7 +233,9 @@ static int __cpuidle_register_driver(struct cpuidle_driver *drv) if (cpuidle_disabled()) return -ENODEV; - __cpuidle_driver_init(drv); + ret = __cpuidle_driver_init(drv); + if (ret) + return ret; ret = __cpuidle_set_driver(drv); if (ret) From 0796ddf4a7f0d15b0cf1ef6f265671f2e5174c1f Mon Sep 17 00:00:00 2001 From: Christian Loehle Date: Mon, 10 Nov 2025 12:08:19 +0000 Subject: [PATCH 59/96] cpuidle: teo: Use this_cpu_ptr() where possible The cpuidle governor callbacks for update, select and reflect are always running on the actual idle entering/exiting CPU, so use the more optimized this_cpu_ptr() to access the internal teo data. This brings down the latency-critical teo_reflect() from static void teo_reflect(struct cpuidle_device *dev, int state) { ffffffc080ffcff0: hint #0x19 ffffffc080ffcff4: stp x29, x30, [sp, #-48]! struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); ffffffc080ffcff8: adrp x2, ffffffc0848c0000 { ffffffc080ffcffc: add x29, sp, #0x0 ffffffc080ffd000: stp x19, x20, [sp, #16] ffffffc080ffd004: orr x20, xzr, x0 struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); ffffffc080ffd008: add x0, x2, #0xc20 { ffffffc080ffd00c: stp x21, x22, [sp, #32] struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); ffffffc080ffd010: adrp x19, ffffffc083eb5000 ffffffc080ffd014: add x19, x19, #0xbb0 ffffffc080ffd018: ldr w3, [x20, #4] dev->last_state_idx = state; to static void teo_reflect(struct cpuidle_device *dev, int state) { ffffffc080ffd034: hint #0x19 ffffffc080ffd038: stp x29, x30, [sp, #-48]! 
ffffffc080ffd03c: add x29, sp, #0x0 ffffffc080ffd040: stp x19, x20, [sp, #16] ffffffc080ffd044: orr x20, xzr, x0 struct teo_cpu *cpu_data = this_cpu_ptr(&teo_cpus); ffffffc080ffd048: adrp x19, ffffffc083eb5000 { ffffffc080ffd04c: stp x21, x22, [sp, #32] struct teo_cpu *cpu_data = this_cpu_ptr(&teo_cpus); ffffffc080ffd050: add x19, x19, #0xbb0 dev->last_state_idx = state; This saves us: adrp x2, ffffffc0848c0000 add x0, x2, #0xc20 ldr w3, [x20, #4] Signed-off-by: Christian Loehle [ rjw: Subject tweak ] Link: https://patch.msgid.link/20251110120819.714560-1-christian.loehle@arm.com Signed-off-by: Rafael J. Wysocki --- drivers/cpuidle/governors/teo.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c index bfa55c1eab5b..a3ebc2cda093 100644 --- a/drivers/cpuidle/governors/teo.c +++ b/drivers/cpuidle/governors/teo.c @@ -155,7 +155,7 @@ static DEFINE_PER_CPU(struct teo_cpu, teo_cpus); */ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) { - struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); + struct teo_cpu *cpu_data = this_cpu_ptr(&teo_cpus); int i, idx_timer = 0, idx_duration = 0; s64 target_residency_ns; u64 measured_ns; @@ -268,7 +268,7 @@ static int teo_find_shallower_state(struct cpuidle_driver *drv, static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, bool *stop_tick) { - struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); + struct teo_cpu *cpu_data = this_cpu_ptr(&teo_cpus); s64 latency_req = cpuidle_governor_latency_req(dev->cpu); ktime_t delta_tick = TICK_NSEC / 2; unsigned int idx_intercept_sum = 0; @@ -504,7 +504,7 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, */ static void teo_reflect(struct cpuidle_device *dev, int state) { - struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); + struct teo_cpu *cpu_data = this_cpu_ptr(&teo_cpus); dev->last_state_idx = state; if (dev->poll_time_limit || From a03b2011808ab02ccb7ab6b573b013b77fbb5921 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 13 Nov 2025 14:24:31 +0100 Subject: [PATCH 60/96] cpuidle: governors: teo: Drop misguided target residency check When the target residency of the current candidate idle state is greater than the expected time till the closest timer (the sleep length), it does not matter whether or not the tick has already been stopped or if it is going to be stopped. The closest timer will trigger anyway at its due time, so if an idle state with target residency above the sleep length is selected, energy will be wasted and there may be excess latency. Of course, if the closest timer were canceled before it could trigger, a deeper idle state would be more suitable, but this is not expected to happen (generally speaking, hrtimers are not expected to be canceled as a rule). Accordingly, the teo_state_ok() check done in that case causes energy to be wasted more often than it allows any energy to be saved (if it allows any energy to be saved at all), so drop it and let the governor use the teo_find_shallower_state() return value as the new candidate idle state index. Fixes: 21d28cd2fa5f ("cpuidle: teo: Do not call tick_nohz_get_sleep_length() upfront") Cc: All applicable Signed-off-by: Rafael J. 
Wysocki Reviewed-by: Christian Loehle Tested-by: Christian Loehle Link: https://patch.msgid.link/5955081.DvuYhMxLoT@rafael.j.wysocki --- drivers/cpuidle/governors/teo.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c index a3ebc2cda093..cc74cecbea7f 100644 --- a/drivers/cpuidle/governors/teo.c +++ b/drivers/cpuidle/governors/teo.c @@ -458,11 +458,8 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * If the closest expected timer is before the target residency of the * candidate state, a shallower one needs to be found. */ - if (drv->states[idx].target_residency_ns > duration_ns) { - i = teo_find_shallower_state(drv, dev, idx, duration_ns, false); - if (teo_state_ok(i, drv)) - idx = i; - } + if (drv->states[idx].target_residency_ns > duration_ns) + idx = teo_find_shallower_state(drv, dev, idx, duration_ns, false); /* * If the selected state's target residency is below the tick length From 17673f64a002fa7bd8f688f45b12ed32b59dba26 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 12 Nov 2025 17:23:24 +0100 Subject: [PATCH 61/96] cpuidle: governors: teo: Drop redundant function parameter The last no_poll parameter of teo_find_shallower_state() is always false, so drop it. No intentional functional impact. Signed-off-by: Rafael J. Wysocki Reviewed-by: Christian Loehle Tested-by: Christian Loehle Link: https://patch.msgid.link/2253109.irdbgypaU6@rafael.j.wysocki --- drivers/cpuidle/governors/teo.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c index cc74cecbea7f..ada42e2ca759 100644 --- a/drivers/cpuidle/governors/teo.c +++ b/drivers/cpuidle/governors/teo.c @@ -239,17 +239,15 @@ static bool teo_state_ok(int i, struct cpuidle_driver *drv) * @dev: Target CPU. * @state_idx: Index of the capping idle state. * @duration_ns: Idle duration value to match. - * @no_poll: Don't consider polling states. */ static int teo_find_shallower_state(struct cpuidle_driver *drv, struct cpuidle_device *dev, int state_idx, - s64 duration_ns, bool no_poll) + s64 duration_ns) { int i; for (i = state_idx - 1; i >= 0; i--) { - if (dev->states_usage[i].disable || - (no_poll && drv->states[i].flags & CPUIDLE_FLAG_POLLING)) + if (dev->states_usage[i].disable) continue; state_idx = i; @@ -459,7 +457,7 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * candidate state, a shallower one needs to be found. */ if (drv->states[idx].target_residency_ns > duration_ns) - idx = teo_find_shallower_state(drv, dev, idx, duration_ns, false); + idx = teo_find_shallower_state(drv, dev, idx, duration_ns); /* * If the selected state's target residency is below the tick length @@ -487,7 +485,7 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, */ if (idx > idx0 && drv->states[idx].target_residency_ns > delta_tick) - idx = teo_find_shallower_state(drv, dev, idx, delta_tick, false); + idx = teo_find_shallower_state(drv, dev, idx, delta_tick); out_tick: *stop_tick = false; From 8f3f01082d7ab334706c7d96c9271cd99e68aabc Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 12 Nov 2025 17:24:40 +0100 Subject: [PATCH 62/96] cpuidle: governors: teo: Use s64 consistently in teo_update() Two local variables in teo_update() are defined as u64, but their values are then compared with s64 values, so it is more consistent to use s64 as their data type. 
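A tiny standalone illustration of the pitfall that mixing the two types invites (hypothetical user-space C, not taken from the governor): when one operand is u64, the usual arithmetic conversions turn a negative s64 operand into a huge unsigned value before the comparison happens.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t measured = 100;
	int64_t deadline = -1;	/* e.g. an already-expired time value */

	/* The implicit conversion (made explicit here) turns -1 into
	 * 0xffffffffffffffff, so this branch is taken. */
	if (measured < (uint64_t)deadline)
		printf("mixed-sign compare: 100 < -1\n");

	/* With both operands signed 64-bit the result is the expected one. */
	if ((int64_t)measured > deadline)
		printf("signed compare: 100 > -1\n");

	return 0;
}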
No intentional functional impact. Signed-off-by: Rafael J. Wysocki Reviewed-by: Christian Loehle Tested-by: Christian Loehle Link: https://patch.msgid.link/3026616.e9J7NaK4W3@rafael.j.wysocki --- drivers/cpuidle/governors/teo.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c index ada42e2ca759..88ed47e868b9 100644 --- a/drivers/cpuidle/governors/teo.c +++ b/drivers/cpuidle/governors/teo.c @@ -157,8 +157,7 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) { struct teo_cpu *cpu_data = this_cpu_ptr(&teo_cpus); int i, idx_timer = 0, idx_duration = 0; - s64 target_residency_ns; - u64 measured_ns; + s64 target_residency_ns, measured_ns; cpu_data->short_idles -= cpu_data->short_idles >> DECAY_SHIFT; @@ -167,9 +166,9 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) * If one of the safety nets has triggered, assume that this * might have been a long sleep. */ - measured_ns = U64_MAX; + measured_ns = S64_MAX; } else { - u64 lat_ns = drv->states[dev->last_state_idx].exit_latency_ns; + s64 lat_ns = drv->states[dev->last_state_idx].exit_latency_ns; measured_ns = dev->last_residency_ns; /* From b54df61c7428ff50b21a03a53e3d580c6e84d1bf Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 12 Nov 2025 19:03:08 +0100 Subject: [PATCH 63/96] cpuidle: governors: teo: Decay metrics below DECAY_SHIFT threshold If a given governor metric falls below a certain value (8 for DECAY_SHIFT equal to 3), it will not decay any more due to the simplistic decay implementation. This may in some cases lead to subtle inconsistencies in the governor behavior, so change the decay implementation to take it into account and set the metric at hand to 0 in that case. Suggested-by: Christian Loehle Signed-off-by: Rafael J. Wysocki Reviewed-by: Christian Loehle Tested-by: Christian Loehle Link: https://patch.msgid.link/2819353.mvXUDI8C0e@rafael.j.wysocki --- drivers/cpuidle/governors/teo.c | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c index 88ed47e868b9..8b80d73e518e 100644 --- a/drivers/cpuidle/governors/teo.c +++ b/drivers/cpuidle/governors/teo.c @@ -148,6 +148,16 @@ struct teo_cpu { static DEFINE_PER_CPU(struct teo_cpu, teo_cpus); +static void teo_decay(unsigned int *metric) +{ + unsigned int delta = *metric >> DECAY_SHIFT; + + if (delta) + *metric -= delta; + else + *metric = 0; +} + /** * teo_update - Update CPU metrics after wakeup. * @drv: cpuidle driver containing state data. 
@@ -158,8 +168,9 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) struct teo_cpu *cpu_data = this_cpu_ptr(&teo_cpus); int i, idx_timer = 0, idx_duration = 0; s64 target_residency_ns, measured_ns; + unsigned int total = 0; - cpu_data->short_idles -= cpu_data->short_idles >> DECAY_SHIFT; + teo_decay(&cpu_data->short_idles); if (cpu_data->artificial_wakeup) { /* @@ -195,8 +206,10 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) for (i = 0; i < drv->state_count; i++) { struct teo_bin *bin = &cpu_data->state_bins[i]; - bin->hits -= bin->hits >> DECAY_SHIFT; - bin->intercepts -= bin->intercepts >> DECAY_SHIFT; + teo_decay(&bin->hits); + total += bin->hits; + teo_decay(&bin->intercepts); + total += bin->intercepts; target_residency_ns = drv->states[i].target_residency_ns; @@ -207,7 +220,9 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) } } - cpu_data->tick_intercepts -= cpu_data->tick_intercepts >> DECAY_SHIFT; + cpu_data->total = total + PULSE; + + teo_decay(&cpu_data->tick_intercepts); /* * If the measured idle duration falls into the same bin as the sleep * length, this is a "hit", so update the "hits" metric for that bin. @@ -221,9 +236,6 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) if (TICK_NSEC <= measured_ns) cpu_data->tick_intercepts += PULSE; } - - cpu_data->total -= cpu_data->total >> DECAY_SHIFT; - cpu_data->total += PULSE; } From 0ca04993dac9b0d21ffbfd22bf54cc43ec2c49f2 Mon Sep 17 00:00:00 2001 From: "Mario Limonciello (AMD)" Date: Wed, 12 Nov 2025 16:40:23 -0600 Subject: [PATCH 64/96] PM: Introduce new PMSG_POWEROFF event PMSG_POWEROFF will be used by the PM core to allow differentiating between a hibernation and a shutdown sequence when re-using callbacks for common code. Hibernation is started by writing the hibernation method to use (such as 'platform', 'shutdown', or 'reboot') into /sys/power/disk and writing 'disk' to /sys/power/state. Shutdown is initiated with the reboot() syscall with arguments on whether to halt the system or power it off. Tested-by: Eric Naim Signed-off-by: Mario Limonciello (AMD) Link: https://patch.msgid.link/20251112224025.2051702-2-superm1@kernel.org Signed-off-by: Rafael J. 
Wysocki --- drivers/base/power/main.c | 5 +++++ include/linux/pm.h | 3 +++ include/trace/events/power.h | 3 ++- 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 7a8807ec9a5d..38fc8a978b88 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -96,6 +96,8 @@ static const char *pm_verb(int event) return "restore"; case PM_EVENT_RECOVER: return "recover"; + case PM_EVENT_POWEROFF: + return "poweroff"; default: return "(unknown PM event)"; } @@ -368,6 +370,7 @@ static pm_callback_t pm_op(const struct dev_pm_ops *ops, pm_message_t state) case PM_EVENT_FREEZE: case PM_EVENT_QUIESCE: return ops->freeze; + case PM_EVENT_POWEROFF: case PM_EVENT_HIBERNATE: return ops->poweroff; case PM_EVENT_THAW: @@ -402,6 +405,7 @@ static pm_callback_t pm_late_early_op(const struct dev_pm_ops *ops, case PM_EVENT_FREEZE: case PM_EVENT_QUIESCE: return ops->freeze_late; + case PM_EVENT_POWEROFF: case PM_EVENT_HIBERNATE: return ops->poweroff_late; case PM_EVENT_THAW: @@ -436,6 +440,7 @@ static pm_callback_t pm_noirq_op(const struct dev_pm_ops *ops, pm_message_t stat case PM_EVENT_FREEZE: case PM_EVENT_QUIESCE: return ops->freeze_noirq; + case PM_EVENT_POWEROFF: case PM_EVENT_HIBERNATE: return ops->poweroff_noirq; case PM_EVENT_THAW: diff --git a/include/linux/pm.h b/include/linux/pm.h index a72e42eec130..7f69f739f613 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -508,6 +508,7 @@ const struct dev_pm_ops name = { \ * RECOVER Creation of a hibernation image or restoration of the main * memory contents from a hibernation image has failed, call * ->thaw() and ->complete() for all devices. + * POWEROFF System will poweroff, call ->poweroff() for all devices. * * The following PM_EVENT_ messages are defined for internal use by * kernel subsystems. They are never issued by the PM core. 
@@ -538,6 +539,7 @@ const struct dev_pm_ops name = { \ #define PM_EVENT_USER 0x0100 #define PM_EVENT_REMOTE 0x0200 #define PM_EVENT_AUTO 0x0400 +#define PM_EVENT_POWEROFF 0x0800 #define PM_EVENT_SLEEP (PM_EVENT_SUSPEND | PM_EVENT_HIBERNATE) #define PM_EVENT_USER_SUSPEND (PM_EVENT_USER | PM_EVENT_SUSPEND) @@ -552,6 +554,7 @@ const struct dev_pm_ops name = { \ #define PMSG_QUIESCE ((struct pm_message){ .event = PM_EVENT_QUIESCE, }) #define PMSG_SUSPEND ((struct pm_message){ .event = PM_EVENT_SUSPEND, }) #define PMSG_HIBERNATE ((struct pm_message){ .event = PM_EVENT_HIBERNATE, }) +#define PMSG_POWEROFF ((struct pm_message){ .event = PM_EVENT_POWEROFF, }) #define PMSG_RESUME ((struct pm_message){ .event = PM_EVENT_RESUME, }) #define PMSG_THAW ((struct pm_message){ .event = PM_EVENT_THAW, }) #define PMSG_RESTORE ((struct pm_message){ .event = PM_EVENT_RESTORE, }) diff --git a/include/trace/events/power.h b/include/trace/events/power.h index 82904291c2b8..370f8df2fdb4 100644 --- a/include/trace/events/power.h +++ b/include/trace/events/power.h @@ -179,7 +179,8 @@ TRACE_EVENT(pstate_sample, { PM_EVENT_HIBERNATE, "hibernate" }, \ { PM_EVENT_THAW, "thaw" }, \ { PM_EVENT_RESTORE, "restore" }, \ - { PM_EVENT_RECOVER, "recover" }) + { PM_EVENT_RECOVER, "recover" }, \ + { PM_EVENT_POWEROFF, "poweroff" }) DEFINE_EVENT(cpu, cpu_frequency, From 988dd0bd914d034860b969214e7f6b2217978565 Mon Sep 17 00:00:00 2001 From: "Mario Limonciello (AMD)" Date: Wed, 12 Nov 2025 16:40:24 -0600 Subject: [PATCH 65/96] scsi: Add PM_EVENT_POWEROFF into suspend callbacks If the PM core uses hibernation callbacks for powering off the system, drivers will receive PM_EVENT_POWEROFF and should handle it the same as they previously handled PM_EVENT_HIBERNATE. Support this case in the scsi driver. No functional changes. Reviewed-by: Martin K. Petersen Tested-by: Eric Naim Signed-off-by: Mario Limonciello (AMD) Link: https://patch.msgid.link/20251112224025.2051702-3-superm1@kernel.org Signed-off-by: Rafael J. Wysocki --- drivers/scsi/mesh.c | 1 + drivers/scsi/stex.c | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/scsi/mesh.c b/drivers/scsi/mesh.c index 1c15cac41d80..768b85eecc8f 100644 --- a/drivers/scsi/mesh.c +++ b/drivers/scsi/mesh.c @@ -1762,6 +1762,7 @@ static int mesh_suspend(struct macio_dev *mdev, pm_message_t mesg) case PM_EVENT_SUSPEND: case PM_EVENT_HIBERNATE: case PM_EVENT_FREEZE: + case PM_EVENT_POWEROFF: break; default: return 0; diff --git a/drivers/scsi/stex.c b/drivers/scsi/stex.c index d8ad02c29320..e6357bc301cb 100644 --- a/drivers/scsi/stex.c +++ b/drivers/scsi/stex.c @@ -1965,6 +1965,7 @@ static int stex_choice_sleep_mic(struct st_hba *hba, pm_message_t state) case PM_EVENT_SUSPEND: return ST_S3; case PM_EVENT_HIBERNATE: + case PM_EVENT_POWEROFF: hba->msi_lock = 0; return ST_S4; default: From 7b9725b3d1222c60571e8117f15fd8057b38ee83 Mon Sep 17 00:00:00 2001 From: "Mario Limonciello (AMD)" Date: Wed, 12 Nov 2025 16:40:25 -0600 Subject: [PATCH 66/96] usb: sl811-hcd: Add PM_EVENT_POWEROFF into suspend callbacks When the PM core uses hibernation callbacks for shutdown, drivers will receive PM_EVENT_POWEROFF and should handle it the same way they would handle PM_EVENT_HIBERNATE.
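The conversion recipe is the same in each affected driver: wherever a pm_message_t switch already treats PM_EVENT_HIBERNATE as "quiesce the hardware", PM_EVENT_POWEROFF joins that case. A minimal sketch with a made-up driver (foo_quiesce_hw() is hypothetical, shown only to illustrate the shape of the change):

#include <linux/platform_device.h>
#include <linux/pm.h>

static int foo_suspend(struct platform_device *pdev, pm_message_t state)
{
	switch (state.event) {
	case PM_EVENT_SUSPEND:
	case PM_EVENT_HIBERNATE:
	case PM_EVENT_POWEROFF:	/* new: treat shutdown like hibernate */
		foo_quiesce_hw(pdev);
		break;
	default:
		break;
	}
	return 0;
}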
Tested-by: Eric Naim Signed-off-by: Mario Limonciello (AMD) [ rjw: Changelog adjustment ] Link: https://patch.msgid.link/20251112224025.2051702-4-superm1@kernel.org Signed-off-by: Rafael J. Wysocki --- drivers/usb/host/sl811-hcd.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/host/sl811-hcd.c b/drivers/usb/host/sl811-hcd.c index ea3cab99c5d4..5d6dba681e50 100644 --- a/drivers/usb/host/sl811-hcd.c +++ b/drivers/usb/host/sl811-hcd.c @@ -1748,6 +1748,7 @@ sl811h_suspend(struct platform_device *dev, pm_message_t state) break; case PM_EVENT_SUSPEND: case PM_EVENT_HIBERNATE: + case PM_EVENT_POWEROFF: case PM_EVENT_PRETHAW: /* explicitly discard hw state */ port_power(sl811, 0); break; From a10ad1b104024efe0a01d21ce7c08002cf4034c4 Mon Sep 17 00:00:00 2001 From: Riwen Lu Date: Thu, 13 Nov 2025 09:26:38 +0800 Subject: [PATCH 67/96] PM: suspend: Make pm_test delay interruptible by wakeup events Modify the suspend_test() function to allow the test delay to be interrupted by wakeup events. This improves the responsiveness of the system during suspend testing when wakeup events occur, allowing the suspend process to proceed without waiting for the full test delay to complete when wakeup events are detected. Additionally, using msleep() instead of mdelay() avoids potential soft lockup "CPU stuck" issues when long test delays are configured. Co-developed-by: xiongxin Signed-off-by: xiongxin Signed-off-by: Riwen Lu [ rjw: Changelog edits ] Link: https://patch.msgid.link/20251113012638.1362013-1-luriwen@kylinos.cn Signed-off-by: Rafael J. Wysocki --- kernel/power/suspend.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index b4ca17c2fecf..1c2f777da367 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -344,10 +344,14 @@ MODULE_PARM_DESC(pm_test_delay, static int suspend_test(int level) { #ifdef CONFIG_PM_DEBUG + int i; + if (pm_test_level == level) { pr_info("suspend debug: Waiting for %d second(s).\n", pm_test_delay); - mdelay(pm_test_delay * 1000); + for (i = 0; i < pm_test_delay && !pm_wakeup_pending(); i++) + msleep(1000); + return 1; } #endif /* !CONFIG_PM_DEBUG */ From ef8057b07c72a817537856b98d6e7493b9404eaf Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 13 Nov 2025 20:33:33 +0100 Subject: [PATCH 68/96] PM: runtime: Wrapper macros for ACQUIRE()/ACQUIRE_ERR() Add wrapper macros for ACQUIRE()/ACQUIRE_ERR() and runtime PM usage counter guards introduced recently: pm_runtime_active_try, pm_runtime_active_auto_try, pm_runtime_active_try_enabled, and pm_runtime_active_auto_try_enabled. The new macros should be more straightforward to use. For example, they can be used for rewriting a piece of code like below: ACQUIRE(pm_runtime_active_try, pm)(dev); if ((ret = ACQUIRE_ERR(pm_runtime_active_try, &pm))) return ret; in the following way: PM_RUNTIME_ACQUIRE(dev, pm); if ((ret = PM_RUNTIME_ACQUIRE_ERR(&pm))) return ret; If the original code does not care about the specific error code returned when attempting to resume the device: ACQUIRE(pm_runtime_active_try, pm)(dev); if (ACQUIRE_ERR(pm_runtime_active_try, &pm)) return -ENXIO; it may be changed like this: PM_RUNTIME_ACQUIRE(dev, pm); if (PM_RUNTIME_ACQUIRE_ERR(&pm)) return -ENXIO; Link: https://lore.kernel.org/linux-pm/5068916.31r3eYUQgx@rafael.j.wysocki/ Signed-off-by: Rafael J. 
Wysocki Reviewed-by: Dan Williams Reviewed-by: Dhruva Gole Reviewed-by: Jonathan Cameron Reviewed-by: Frank Li Link: https://patch.msgid.link/3400866.aeNJFYEL58@rafael.j.wysocki --- include/linux/pm_runtime.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index 0b436e15f4cd..911d7a4d32c1 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -637,6 +637,30 @@ DEFINE_GUARD_COND(pm_runtime_active_auto, _try, DEFINE_GUARD_COND(pm_runtime_active_auto, _try_enabled, pm_runtime_resume_and_get(_T), _RET == 0) +/* ACQUIRE() wrapper macros for the guards defined above. */ + +#define PM_RUNTIME_ACQUIRE(_dev, _var) \ + ACQUIRE(pm_runtime_active_try, _var)(_dev) + +#define PM_RUNTIME_ACQUIRE_AUTOSUSPEND(_dev, _var) \ + ACQUIRE(pm_runtime_active_auto_try, _var)(_dev) + +#define PM_RUNTIME_ACQUIRE_IF_ENABLED(_dev, _var) \ + ACQUIRE(pm_runtime_active_try_enabled, _var)(_dev) + +#define PM_RUNTIME_ACQUIRE_IF_ENABLED_AUTOSUSPEND(_dev, _var) \ + ACQUIRE(pm_runtime_active_auto_try_enabled, _var)(_dev) + +/* + * ACQUIRE_ERR() wrapper macro for guard pm_runtime_active. + * + * Always check PM_RUNTIME_ACQUIRE_ERR() after using one of the + * PM_RUNTIME_ACQUIRE*() macros defined above (yes, it can be used with + * any of them) and if it is nonzero, avoid accessing the given device. + */ +#define PM_RUNTIME_ACQUIRE_ERR(_var_ptr) \ + ACQUIRE_ERR(pm_runtime_active, _var_ptr) + /** * pm_runtime_put_sync - Drop device usage counter and run "idle check" if 0. * @dev: Target device. From 70dcad34009e00523b50818eda082958986ebc0b Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 13 Nov 2025 20:34:53 +0100 Subject: [PATCH 69/96] ACPI: TAD: Use PM_RUNTIME_ACQUIRE()/PM_RUNTIME_ACQUIRE_ERR() Use new PM_RUNTIME_ACQUIRE() and PM_RUNTIME_ACQUIRE_ERR() wrapper macros to make the code look more straightforward. No intentional functional impact. Signed-off-by: Rafael J. Wysocki Reviewed-by: Dhruva Gole Reviewed-by: Jonathan Cameron [ rjw: Typo fix in the changelog ] Link: https://patch.msgid.link/2040585.PYKUYFuaPT@rafael.j.wysocki Signed-off-by: Rafael J. 
Wysocki --- drivers/acpi/acpi_tad.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/acpi/acpi_tad.c b/drivers/acpi/acpi_tad.c index c9487c5bb7b3..6d870d97ada6 100644 --- a/drivers/acpi/acpi_tad.c +++ b/drivers/acpi/acpi_tad.c @@ -90,8 +90,8 @@ static int acpi_tad_set_real_time(struct device *dev, struct acpi_tad_rt *rt) args[0].buffer.pointer = (u8 *)rt; args[0].buffer.length = sizeof(*rt); - ACQUIRE(pm_runtime_active_try, pm)(dev); - if (ACQUIRE_ERR(pm_runtime_active_try, &pm)) + PM_RUNTIME_ACQUIRE(dev, pm); + if (PM_RUNTIME_ACQUIRE_ERR(&pm)) return -ENXIO; status = acpi_evaluate_integer(handle, "_SRT", &arg_list, &retval); @@ -137,8 +137,8 @@ static int acpi_tad_get_real_time(struct device *dev, struct acpi_tad_rt *rt) { int ret; - ACQUIRE(pm_runtime_active_try, pm)(dev); - if (ACQUIRE_ERR(pm_runtime_active_try, &pm)) + PM_RUNTIME_ACQUIRE(dev, pm); + if (PM_RUNTIME_ACQUIRE_ERR(&pm)) return -ENXIO; ret = acpi_tad_evaluate_grt(dev, rt); @@ -275,8 +275,8 @@ static int acpi_tad_wake_set(struct device *dev, char *method, u32 timer_id, args[0].integer.value = timer_id; args[1].integer.value = value; - ACQUIRE(pm_runtime_active_try, pm)(dev); - if (ACQUIRE_ERR(pm_runtime_active_try, &pm)) + PM_RUNTIME_ACQUIRE(dev, pm); + if (PM_RUNTIME_ACQUIRE_ERR(&pm)) return -ENXIO; status = acpi_evaluate_integer(handle, method, &arg_list, &retval); @@ -322,8 +322,8 @@ static ssize_t acpi_tad_wake_read(struct device *dev, char *buf, char *method, args[0].integer.value = timer_id; - ACQUIRE(pm_runtime_active_try, pm)(dev); - if (ACQUIRE_ERR(pm_runtime_active_try, &pm)) + PM_RUNTIME_ACQUIRE(dev, pm); + if (PM_RUNTIME_ACQUIRE_ERR(&pm)) return -ENXIO; status = acpi_evaluate_integer(handle, method, &arg_list, &retval); @@ -377,8 +377,8 @@ static int acpi_tad_clear_status(struct device *dev, u32 timer_id) args[0].integer.value = timer_id; - ACQUIRE(pm_runtime_active_try, pm)(dev); - if (ACQUIRE_ERR(pm_runtime_active_try, &pm)) + PM_RUNTIME_ACQUIRE(dev, pm); + if (PM_RUNTIME_ACQUIRE_ERR(&pm)) return -ENXIO; status = acpi_evaluate_integer(handle, "_CWS", &arg_list, &retval); @@ -417,8 +417,8 @@ static ssize_t acpi_tad_status_read(struct device *dev, char *buf, u32 timer_id) args[0].integer.value = timer_id; - ACQUIRE(pm_runtime_active_try, pm)(dev); - if (ACQUIRE_ERR(pm_runtime_active_try, &pm)) + PM_RUNTIME_ACQUIRE(dev, pm); + if (PM_RUNTIME_ACQUIRE_ERR(&pm)) return -ENXIO; status = acpi_evaluate_integer(handle, "_GWS", &arg_list, &retval); From 07f42f8290e927a38ee4248505fc39ed0518519e Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 13 Nov 2025 20:35:27 +0100 Subject: [PATCH 70/96] PCI/sysfs: Use PM_RUNTIME_ACQUIRE()/PM_RUNTIME_ACQUIRE_ERR() Use new PM_RUNTIME_ACQUIRE() and PM_RUNTIME_ACQUIRE_ERR() wrapper macros to make the code look more straightforward. No intentional functional impact. Signed-off-by: Rafael J. Wysocki Reviewed-by: Dhruva Gole Reviewed-by: Jonathan Cameron [ rjw: Typo fix in the changelog ] Link: https://patch.msgid.link/3932581.kQq0lBPeGt@rafael.j.wysocki Signed-off-by: Rafael J. 
Wysocki --- drivers/pci/pci-sysfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index 9d6f74bd95f8..3881359440b1 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -1517,8 +1517,8 @@ static ssize_t reset_method_store(struct device *dev, return count; } - ACQUIRE(pm_runtime_active_try, pm)(dev); - if (ACQUIRE_ERR(pm_runtime_active_try, &pm)) + PM_RUNTIME_ACQUIRE(dev, pm); + if (PM_RUNTIME_ACQUIRE_ERR(&pm)) return -ENXIO; if (sysfs_streq(buf, "default")) { From 46fc75a29b7034d1971afcbdf47b88926a46b1ea Mon Sep 17 00:00:00 2001 From: Sunday Adelodun Date: Fri, 14 Nov 2025 23:04:38 +0100 Subject: [PATCH 71/96] PM: hibernate: Clean up kernel-doc comment style usage Several static functions in kernel/power/swap.c were described using the kernel-doc comment style (/** ... */) even though they are not exported or referenced by generated documentation. This led to kernel-doc warnings and stylistic inconsistencies. Convert these unnecessary kernel-doc blocks to regular C comments, remove comment blocks that are no longer useful, relocate comments to more appropriate positions where needed, and fix a few "Return:" descriptions that were either missing or incorrectly formatted. No functional changes. Signed-off-by: Sunday Adelodun [ rjw: Subject adjustment, changelog edits, comment edits ] Link: https://patch.msgid.link/20251114220438.52448-1-adelodunolaoluwa@yahoo.com Signed-off-by: Rafael J. Wysocki --- kernel/power/swap.c | 58 ++++++++++----------------------------------- 1 file changed, 12 insertions(+), 46 deletions(-) diff --git a/kernel/power/swap.c b/kernel/power/swap.c index f532f49d82ac..c7c2b89c9b2b 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -336,16 +336,14 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags) */ unsigned int swsusp_header_flags; -/** - * swsusp_swap_check - check if the resume device is a swap device - * and get its index (if so) - * - * This is called before saving image - */ static int swsusp_swap_check(void) { int res; + /* + * Check if the resume device is a swap device and get its index (if so). + * This is called before saving the image. + */ if (swsusp_resume_device) res = swap_type_of(swsusp_resume_device, swsusp_resume_block); else @@ -362,13 +360,6 @@ static int swsusp_swap_check(void) return 0; } -/** - * write_page - Write one page to given swap location. - * @buf: Address we're writing. - * @offset: Offset of the swap page we're writing to. - * @hb: bio completion batch - */ - static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb) { gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY; @@ -527,10 +518,6 @@ static unsigned int hibernate_compression_threads = CMP_THREADS; #define CMP_MIN_RD_PAGES 1024 #define CMP_MAX_RD_PAGES 8192 -/** - * save_image - save the suspend image data - */ - static int save_image(struct swap_map_handle *handle, struct snapshot_handle *snapshot, unsigned int nr_to_write) @@ -710,12 +697,6 @@ static int compress_threadfn(void *data) return 0; } -/** - * save_compressed_image - Save the suspend image data after compression. - * @handle: Swap map handle to use for saving the image. - * @snapshot: Image to read data from. - * @nr_to_write: Number of pages to save. 
- */ static int save_compressed_image(struct swap_map_handle *handle, struct snapshot_handle *snapshot, unsigned int nr_to_write) @@ -943,13 +924,6 @@ static int save_compressed_image(struct swap_map_handle *handle, return ret; } -/** - * enough_swap - Make sure we have enough swap to save the image. - * - * Returns TRUE or FALSE after checking the total amount of swap - * space available from the resume partition. - */ - static int enough_swap(unsigned int nr_pages) { unsigned int free_swap = count_swap_pages(root_swap, 1); @@ -969,8 +943,9 @@ static int enough_swap(unsigned int nr_pages) * them synced (in case something goes wrong) but we DO not want to mark * filesystem clean: it is not. (And it does not matter, if we resume * correctly, we'll mark system clean, anyway.) + * + * Return: 0 on success, negative error code on failure. */ - int swsusp_write(unsigned int flags) { struct swap_map_handle handle; @@ -1116,12 +1091,6 @@ static int swap_reader_finish(struct swap_map_handle *handle) return 0; } -/** - * load_image - load the image using the swap map handle - * @handle and the snapshot handle @snapshot - * (assume there are @nr_pages pages to load) - */ - static int load_image(struct swap_map_handle *handle, struct snapshot_handle *snapshot, unsigned int nr_to_read) @@ -1229,12 +1198,6 @@ static int decompress_threadfn(void *data) return 0; } -/** - * load_compressed_image - Load compressed image data and decompress it. - * @handle: Swap map handle to use for loading data. - * @snapshot: Image to copy uncompressed data into. - * @nr_to_read: Number of pages to load. - */ static int load_compressed_image(struct swap_map_handle *handle, struct snapshot_handle *snapshot, unsigned int nr_to_read) @@ -1564,8 +1527,9 @@ static int load_compressed_image(struct swap_map_handle *handle, * swsusp_read - read the hibernation image. * @flags_p: flags passed by the "frozen" kernel in the image header should * be written into this memory location + * + * Return: 0 on success, negative error code on failure. */ - int swsusp_read(unsigned int *flags_p) { int error; @@ -1602,8 +1566,9 @@ static void *swsusp_holder; /** * swsusp_check - Open the resume device and check for the swsusp signature. * @exclusive: Open the resume device exclusively. + * + * Return: 0 if a valid image is found, negative error code otherwise. */ - int swsusp_check(bool exclusive) { void *holder = exclusive ? &swsusp_holder : NULL; @@ -1666,8 +1631,9 @@ void swsusp_close(void) /** * swsusp_unmark - Unmark swsusp signature in the resume device + * + * Return: 0 on success, negative error code on failure. */ - #ifdef CONFIG_SUSPEND int swsusp_unmark(void) { From 58075aec92a8141fd7f42e1c36d1bc54552c015e Mon Sep 17 00:00:00 2001 From: Kaushlendra Kumar Date: Tue, 28 Oct 2025 15:48:14 +0530 Subject: [PATCH 72/96] powercap: intel_rapl: Add support for Nova Lake processors Add RAPL support for Intel Nova Lake and Nova Lake L processors using the core defaults configuration. Signed-off-by: Kaushlendra Kumar [ rjw: Subject and changelog edits, rebase ] Link: https://patch.msgid.link/20251028101814.3482508-1-kaushlendra.kumar@intel.com Signed-off-by: Rafael J. 
Wysocki --- drivers/powercap/intel_rapl_common.c | 2 ++ drivers/powercap/intel_rapl_msr.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c index cdb4363589e9..57bebd07c7d0 100644 --- a/drivers/powercap/intel_rapl_common.c +++ b/drivers/powercap/intel_rapl_common.c @@ -1285,6 +1285,8 @@ static const struct x86_cpu_id rapl_ids[] __initconst = { X86_MATCH_VFM(INTEL_LUNARLAKE_M, &rapl_defaults_core), X86_MATCH_VFM(INTEL_PANTHERLAKE_L, &rapl_defaults_core), X86_MATCH_VFM(INTEL_WILDCATLAKE_L, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_NOVALAKE, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_NOVALAKE_L, &rapl_defaults_core), X86_MATCH_VFM(INTEL_ARROWLAKE_H, &rapl_defaults_core), X86_MATCH_VFM(INTEL_ARROWLAKE, &rapl_defaults_core), X86_MATCH_VFM(INTEL_ARROWLAKE_U, &rapl_defaults_core), diff --git a/drivers/powercap/intel_rapl_msr.c b/drivers/powercap/intel_rapl_msr.c index c4d536c2f989..c6b9a7debc35 100644 --- a/drivers/powercap/intel_rapl_msr.c +++ b/drivers/powercap/intel_rapl_msr.c @@ -152,6 +152,8 @@ static const struct x86_cpu_id pl4_support_ids[] = { X86_MATCH_VFM(INTEL_ARROWLAKE_H, NULL), X86_MATCH_VFM(INTEL_PANTHERLAKE_L, NULL), X86_MATCH_VFM(INTEL_WILDCATLAKE_L, NULL), + X86_MATCH_VFM(INTEL_NOVALAKE, NULL), + X86_MATCH_VFM(INTEL_NOVALAKE_L, NULL), {} }; From b20a374902bbb647b87e874bb2c9d708abc0109f Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 14 Nov 2025 19:48:34 +0100 Subject: [PATCH 73/96] cpufreq: intel_pstate: Eliminate some code duplication To eliminate some code duplication from the intel_pstate driver, move the core_get_val() function body to a new function called get_perf_ctl_val() and make both core_get_val() and atom_get_val() invoke it to carry out the same computation. No intentional functional impact. Signed-off-by: Rafael J. 
Wysocki Acked-by: Srinivas Pandruvada Link: https://patch.msgid.link/2829273.mvXUDI8C0e@rafael.j.wysocki --- drivers/cpufreq/intel_pstate.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 2a126d7dae01..ec4abe374573 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -2048,6 +2048,18 @@ static void intel_pstate_hwp_enable(struct cpudata *cpudata) intel_pstate_update_epp_defaults(cpudata); } +static u64 get_perf_ctl_val(int pstate) +{ + u64 val; + + val = (u64)pstate << 8; + if (READ_ONCE(global.no_turbo) && !READ_ONCE(global.turbo_disabled) && + cpu_feature_enabled(X86_FEATURE_IDA)) + val |= (u64)1 << 32; + + return val; +} + static int atom_get_min_pstate(int not_used) { u64 value; @@ -2074,15 +2086,10 @@ static int atom_get_turbo_pstate(int not_used) static u64 atom_get_val(struct cpudata *cpudata, int pstate) { - u64 val; + u64 val = get_perf_ctl_val(pstate); int32_t vid_fp; u32 vid; - val = (u64)pstate << 8; - if (READ_ONCE(global.no_turbo) && !READ_ONCE(global.turbo_disabled) && - cpu_feature_enabled(X86_FEATURE_IDA)) - val |= (u64)1 << 32; - vid_fp = cpudata->vid.min + mul_fp( int_tofp(pstate - cpudata->pstate.min_pstate), cpudata->vid.ratio); @@ -2242,14 +2249,7 @@ static int core_get_turbo_pstate(int cpu) static u64 core_get_val(struct cpudata *cpudata, int pstate) { - u64 val; - - val = (u64)pstate << 8; - if (READ_ONCE(global.no_turbo) && !READ_ONCE(global.turbo_disabled) && - cpu_feature_enabled(X86_FEATURE_IDA)) - val |= (u64)1 << 32; - - return val; + return get_perf_ctl_val(pstate); } static int knl_get_aperf_mperf_shift(void) From 083654ded547238c70e0d4f57115cd1c91245b6e Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 13 Nov 2025 17:56:27 +0100 Subject: [PATCH 74/96] cpuidle: governors: teo: Rework the handling of tick wakeups If the wakeup pattern is clearly dominated by tick wakeups, count those wakeups as hits on the deepest available idle state to increase the likelihood of stopping the tick, especially on systems where there are only 2 usable idle states and the tick can only be stopped when the deeper state is selected. This change is expected to reduce power on some systems where state 0 is selected relatively often even though they are almost idle. Without it, the governor may end up selecting the shallowest idle state all the time even if the system is almost completely idle due to all tick wakeups being counted as hits on that state and preventing the tick from being stopped at all.
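Numerically, the new dominance test fires once more than two thirds of the decayed wakeup count is attributable to the tick. A standalone sketch of just that decision (simplified from the diff below; the counters stand in for the struct teo_cpu fields of the same names):

/* Decayed event counters, mirroring struct teo_cpu. */
static unsigned int total_tick;	/* wakeups caused by the scheduler tick */
static unsigned int total;	/* all wakeups: "hits" plus "intercepts" */

/* True when total_tick / total > 2/3, computed in integers only. */
static bool tick_wakeups_dominate(void)
{
	return 3 * total_tick > 2 * total;
}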
Fixes: 4b20b07ce72f ("cpuidle: teo: Don't count non-existent intercepts") Reported-by: Reka Norman Closes: https://lore.kernel.org/linux-pm/CAEmPcwsNMNnNXuxgvHTQ93Mx-q3Oz9U57THQsU_qdcCx1m4w5g@mail.gmail.com/ Tested-by: Reka Norman Tested-by: Christian Loehle Cc: 6.11+ # 6.11+: 92ce5c07b7a1: cpuidle: teo: Reorder candidate state index checks Cc: 6.11+ # 6.11+: ea185406d1ed: cpuidle: teo: Combine candidate state index checks against 0 Cc: 6.11+ # 6.11+: b9a6af26bd83: cpuidle: teo: Drop local variable prev_intercept_idx Cc: 6.11+ # 6.11+: e24f8a55de50: cpuidle: teo: Clarify two code comments Cc: 6.11+ # 6.11+: d619b5cc6780: cpuidle: teo: Simplify counting events used for tick management Cc: 6.11+ # 6.11+: 13ed5c4a6d9c: cpuidle: teo: Skip getting the sleep length if wakeups are very frequent Cc: 6.11+ # 6.11+: ddcfa7964677: cpuidle: teo: Simplify handling of total events count Cc: 6.11+ # 6.11+: 65e18e654475: cpuidle: teo: Replace time_span_ns with a flag Cc: 6.11+ # 6.11+: 0796ddf4a7f0: cpuidle: teo: Use this_cpu_ptr() where possible Cc: 6.11+ # 6.11+: 8f3f01082d7a: cpuidle: governors: teo: Use s64 consistently in teo_update() Cc: 6.11+ # 6.11+: b54df61c7428: cpuidle: governors: teo: Decay metrics below DECAY_SHIFT threshold Cc: 6.11+ # 6.11+ Signed-off-by: Rafael J. Wysocki [ rjw: Rebase on commit 0796ddf4a7f0, changelog update ] Link: https://patch.msgid.link/6228387.lOV4Wx5bFT@rafael.j.wysocki Signed-off-by: Rafael J. Wysocki --- drivers/cpuidle/governors/teo.c | 39 ++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 15 deletions(-) diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c index 8b80d73e518e..94ba00b7617d 100644 --- a/drivers/cpuidle/governors/teo.c +++ b/drivers/cpuidle/governors/teo.c @@ -133,17 +133,19 @@ struct teo_bin { * @sleep_length_ns: Time till the closest timer event (at the selection time). * @state_bins: Idle state data bins for this CPU. * @total: Grand total of the "intercepts" and "hits" metrics for all bins. + * @total_tick: Wakeups by the scheduler tick. * @tick_intercepts: "Intercepts" before TICK_NSEC. * @short_idles: Wakeups after short idle periods. - * @artificial_wakeup: Set if the wakeup has been triggered by a safety net. + * @tick_wakeup: Set if the last wakeup was by the scheduler tick. */ struct teo_cpu { s64 sleep_length_ns; struct teo_bin state_bins[CPUIDLE_STATE_MAX]; unsigned int total; + unsigned int total_tick; unsigned int tick_intercepts; unsigned int short_idles; - bool artificial_wakeup; + bool tick_wakeup; }; static DEFINE_PER_CPU(struct teo_cpu, teo_cpus); @@ -172,9 +174,10 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) teo_decay(&cpu_data->short_idles); - if (cpu_data->artificial_wakeup) { + if (dev->poll_time_limit) { + dev->poll_time_limit = false; /* - * If one of the safety nets has triggered, assume that this + * Polling state timeout has triggered, so assume that this * might have been a long sleep. */ measured_ns = S64_MAX; @@ -223,6 +226,21 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) cpu_data->total = total + PULSE; teo_decay(&cpu_data->tick_intercepts); + + teo_decay(&cpu_data->total_tick); + if (cpu_data->tick_wakeup) { + cpu_data->total_tick += PULSE; + /* + * If tick wakeups dominate the wakeup pattern, count this one + * as a hit on the deepest available idle state to increase the + * likelihood of stopping the tick. 
+ */ + if (3 * cpu_data->total_tick > 2 * cpu_data->total) { + cpu_data->state_bins[drv->state_count-1].hits += PULSE; + return; + } + } + /* * If the measured idle duration falls into the same bin as the sleep * length, this is a "hit", so update the "hits" metric for that bin. @@ -512,18 +530,9 @@ static void teo_reflect(struct cpuidle_device *dev, int state) { struct teo_cpu *cpu_data = this_cpu_ptr(&teo_cpus); + cpu_data->tick_wakeup = tick_nohz_idle_got_tick(); + dev->last_state_idx = state; - if (dev->poll_time_limit || - (tick_nohz_idle_got_tick() && cpu_data->sleep_length_ns > TICK_NSEC)) { - /* - * The wakeup was not "genuine", but triggered by one of the - * safety nets. - */ - dev->poll_time_limit = false; - cpu_data->artificial_wakeup = true; - } else { - cpu_data->artificial_wakeup = false; - } } /** From 50db438231dcf7ceac187a6a9c68a1d757b8d883 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 16 Nov 2025 13:34:29 +0100 Subject: [PATCH 75/96] cpuidle: governors: teo: Fix tick_intercepts handling in teo_update() The condition deciding whether or not to increase cpu_data->tick_intercepts in teo_update() is reversed, so fix it. Fixes: d619b5cc6780 ("cpuidle: teo: Simplify counting events used for tick management") Cc: 6.14+ # 6.14+: 0796ddf4a7f0: cpuidle: teo: Use this_cpu_ptr() where possible Cc: 6.14+ # 6.14+: 8f3f01082d7a: cpuidle: governors: teo: Use s64 consistently in teo_update() Cc: 6.14+ # 6.14+: b54df61c7428: cpuidle: governors: teo: Decay metrics below DECAY_SHIFT threshold Cc: 6.14+ # 6.14+: 083654ded547: cpuidle: governors: teo: Rework the handling of tick wakeups Cc: 6.14+ Signed-off-by: Rafael J. Wysocki Reviewed-by: Christian Loehle Link: https://patch.msgid.link/5085160.31r3eYUQgx@rafael.j.wysocki --- drivers/cpuidle/governors/teo.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c index 94ba00b7617d..85b5517067d1 100644 --- a/drivers/cpuidle/governors/teo.c +++ b/drivers/cpuidle/governors/teo.c @@ -251,7 +251,7 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) cpu_data->state_bins[idx_timer].hits += PULSE; } else { cpu_data->state_bins[idx_duration].intercepts += PULSE; - if (TICK_NSEC <= measured_ns) + if (measured_ns <= TICK_NSEC) cpu_data->tick_intercepts += PULSE; } } From d834e68a0e8b4a3c673eb96d4d53e48f3c19a81e Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 16 Nov 2025 13:35:14 +0100 Subject: [PATCH 76/96] cpuidle: governors: teo: Simplify intercepts-based state lookup Simplify the loop looking up a candidate idle state in the case when an intercept is likely to occur by adding a search for the state index limit if the tick is stopped before it. First, call tick_nohz_tick_stopped() just once and if it returns true, look for the shallowest state index below the current candidate one with target residency at least equal to the tick period length. Next, simply look for a state that is not shallower than the one found in the previous step and satisfies the intercepts majority condition (if there are no such states, the shallowest state that is not shallower than the one found in the previous step becomes the new candidate). Since teo_state_ok() has no callers any more after the above changes, drop it. No intentional functional impact.
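The reworked lookup thus decomposes into two small scans; a simplified sketch (illustrative C mirroring the diff below, where target_residency_ns(), intercepts() and state_disabled() are hypothetical stand-ins for the driver/device accessors, and idx, idx0 and idx_intercept_sum come from the surrounding teo_select() context):

/* Step 1: with the tick stopped, never pick a state whose target
 * residency is below the tick period length. */
int min_idx = idx0;

if (tick_nohz_tick_stopped()) {
	while (min_idx < idx &&
	       target_residency_ns(min_idx) < TICK_NSEC)
		min_idx++;
}

/* Step 2: walk down from the candidate, accumulating intercepts, and
 * stop at the first enabled state below which the majority of the
 * intercepts occurred; otherwise settle on the shallowest enabled
 * state that is not shallower than min_idx. */
unsigned int intercept_sum = 0;
int i;

for (i = idx - 1; i >= min_idx; i--) {
	intercept_sum += intercepts(i);

	if (state_disabled(i))
		continue;

	idx = i;
	if (2 * intercept_sum > idx_intercept_sum)
		break;
}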
Signed-off-by: Rafael J. Wysocki Reviewed-by: Christian Loehle [ rjw: Changelog clarification and code comment edit ] Link: https://patch.msgid.link/2418792.ElGaqSPkdT@rafael.j.wysocki Signed-off-by: Rafael J. Wysocki --- drivers/cpuidle/governors/teo.c | 62 +++++++++------------------ 1 file changed, 16 insertions(+), 46 deletions(-) diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c index 85b5517067d1..bab186336bf4 100644 --- a/drivers/cpuidle/governors/teo.c +++ b/drivers/cpuidle/governors/teo.c @@ -256,12 +256,6 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) } } -static bool teo_state_ok(int i, struct cpuidle_driver *drv) -{ - return !tick_nohz_tick_stopped() || - drv->states[i].target_residency_ns >= TICK_NSEC; -} - /** * teo_find_shallower_state - Find shallower idle state matching given duration. * @drv: cpuidle driver containing state data. @@ -383,7 +377,18 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * better choice. */ if (2 * idx_intercept_sum > cpu_data->total - idx_hit_sum) { - int first_suitable_idx = idx; + int min_idx = idx0; + + if (tick_nohz_tick_stopped()) { + /* + * Look for the shallowest idle state below the current + * candidate one whose target residency is at least + * equal to the tick period length. + */ + while (min_idx < idx && + drv->states[min_idx].target_residency_ns < TICK_NSEC) + min_idx++; + } /* * Look for the deepest idle state whose target residency had * intercepts occurring frequently enough relative to the candidate one. * * Take the possible duration limitation present if the tick * has been stopped already into account. */ - intercept_sum = 0; - - for (i = idx - 1; i >= 0; i--) { - struct teo_bin *bin = &cpu_data->state_bins[i]; - - intercept_sum += bin->intercepts; - - if (2 * intercept_sum > idx_intercept_sum) { - /* - * Use the current state unless it is too - * shallow or disabled, in which case take the - * first enabled state that is deep enough. - */ - if (teo_state_ok(i, drv) && - !dev->states_usage[i].disable) { - idx = i; - break; - } - idx = first_suitable_idx; - break; - } + for (i = idx - 1, intercept_sum = 0; i >= min_idx; i--) { + intercept_sum += cpu_data->state_bins[i].intercepts; if (dev->states_usage[i].disable) continue; - if (teo_state_ok(i, drv)) { - /* - * The current state is deep enough, but still - * there may be a better one. - */ - first_suitable_idx = i; - continue; - } - - /* - * The current state is too shallow, so if no suitable - * states other than the initial candidate have been - * found, give up (the remaining states to check are - * shallower still), but otherwise the first suitable - * state other than the initial candidate may turn out - * to be preferable. - */ - if (first_suitable_idx == idx) + idx = i; + if (2 * intercept_sum > idx_intercept_sum) break; } } From 1b541e10eea6ecea84431dd69d9052b12ed1f729 Mon Sep 17 00:00:00 2001 From: Kaushlendra Kumar Date: Wed, 19 Nov 2025 08:41:09 +0530 Subject: [PATCH 77/96] cpufreq: ACPI: Replace udelay() with usleep_range() Replace udelay() with usleep_range() in check_freqs() to allow CPU scheduling during frequency polling. Signed-off-by: Kaushlendra Kumar [ rjw: Changelog edits ] Link: https://patch.msgid.link/20251119031109.134583-1-kaushlendra.kumar@intel.com Signed-off-by: Rafael J. 
Wysocki --- drivers/cpufreq/acpi-cpufreq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c index 083d8369a591..e73a66785d69 100644 --- a/drivers/cpufreq/acpi-cpufreq.c +++ b/drivers/cpufreq/acpi-cpufreq.c @@ -395,7 +395,7 @@ static unsigned int check_freqs(struct cpufreq_policy *policy, cur_freq = extract_freq(policy, get_cur_val(mask, data)); if (cur_freq == freq) return 1; - udelay(10); + usleep_range(10, 15); } return 0; } From bf8867eae17fde94d7081545cf90ca8d5aba690a Mon Sep 17 00:00:00 2001 From: Samuel Wu Date: Wed, 19 Nov 2025 09:14:24 -0800 Subject: [PATCH 78/96] PM: sleep: Add support for wakeup during filesystem sync Add helper function pm_sleep_fs_sync() and related data structures as a preparation for allowing system suspend and hibernation to be aborted by wakeup events while syncing file systems. The new function, to be called by the suspend process in order to sync file systems, uses a dedicated ordered workqueue to run ksys_sync_helper() in parallel with the calling process. Next, it waits for the completion of the filesystem sync and periodically checks if any system wakeup events are pending, in which case it will return an error. If that happens while the filesystem sync is still in progress, it will continue, possibly after pm_sleep_fs_sync() has returned, and if that function is called again before the sync is complete, a new work item to run ksys_sync_helper() again will be queued (and waited for) to increase the likelihood of writing all of the dirty pages in memory back to persistent storage. Suggested-by: Saravana Kannan Signed-off-by: Samuel Wu Co-developed-by: Rafael J. Wysocki [ rjw: Subject and changelog rewrite, tags adjustment ] Link: https://patch.msgid.link/20251119171426.4086783-2-wusamuel@google.com Signed-off-by: Rafael J. Wysocki --- kernel/power/main.c | 79 ++++++++++++++++++++++++++++++++++++++++---- kernel/power/power.h | 1 + 2 files changed, 74 insertions(+), 6 deletions(-) diff --git a/kernel/power/main.c b/kernel/power/main.c index 549f51ca3a1e..e76a55583ec6 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -18,6 +18,8 @@ #include #include #include +#include +#include #include "power.h" @@ -92,6 +94,61 @@ void ksys_sync_helper(void) } EXPORT_SYMBOL_GPL(ksys_sync_helper); +#if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATION) +/* Wakeup events handling resolution while syncing file systems in jiffies */ +#define PM_FS_SYNC_WAKEUP_RESOLUTION 5 + +static atomic_t pm_fs_sync_count = ATOMIC_INIT(0); +static struct workqueue_struct *pm_fs_sync_wq; +static DECLARE_WAIT_QUEUE_HEAD(pm_fs_sync_wait); + +static bool pm_fs_sync_completed(void) +{ + return atomic_read(&pm_fs_sync_count) == 0; +} + +static void pm_fs_sync_work_fn(struct work_struct *work) +{ + ksys_sync_helper(); + + if (atomic_dec_and_test(&pm_fs_sync_count)) + wake_up(&pm_fs_sync_wait); +} +static DECLARE_WORK(pm_fs_sync_work, pm_fs_sync_work_fn); + +/** + * pm_sleep_fs_sync() - Sync file systems in an interruptible way + * + * Return: 0 on successful file system sync, or -EBUSY if the file system sync + * was aborted. + */ +int pm_sleep_fs_sync(void) +{ + pm_wakeup_clear(0); + + /* + * Take back-to-back sleeps into account by queuing a subsequent fs sync + * only if the previous fs sync is running or is not queued. Multiple fs + * syncs increase the likelihood of saving the latest files immediately + * before sleep. 
+ */ + if (!work_pending(&pm_fs_sync_work)) { + atomic_inc(&pm_fs_sync_count); + queue_work(pm_fs_sync_wq, &pm_fs_sync_work); + } + + while (!pm_fs_sync_completed()) { + if (pm_wakeup_pending()) + return -EBUSY; + + wait_event_timeout(pm_fs_sync_wait, pm_fs_sync_completed(), + PM_FS_SYNC_WAKEUP_RESOLUTION); + } + + return 0; +} +#endif /* CONFIG_SUSPEND || CONFIG_HIBERNATION */ + /* Routines for PM-transition notifications */ static BLOCKING_NOTIFIER_HEAD(pm_chain_head); @@ -231,10 +288,10 @@ static ssize_t mem_sleep_store(struct kobject *kobj, struct kobj_attribute *attr power_attr(mem_sleep); /* - * sync_on_suspend: invoke ksys_sync_helper() before suspend. + * sync_on_suspend: Sync file systems before suspend. * - * show() returns whether ksys_sync_helper() is invoked before suspend. - * store() accepts 0 or 1. 0 disables ksys_sync_helper() and 1 enables it. + * show() returns whether file systems sync before suspend is enabled. + * store() accepts 0 or 1. 0 disables file systems sync and 1 enables it. */ bool sync_on_suspend_enabled = !IS_ENABLED(CONFIG_SUSPEND_SKIP_SYNC); @@ -1066,16 +1123,26 @@ static const struct attribute_group *attr_groups[] = { struct workqueue_struct *pm_wq; EXPORT_SYMBOL_GPL(pm_wq); -static int __init pm_start_workqueue(void) +static int __init pm_start_workqueues(void) { pm_wq = alloc_workqueue("pm", WQ_FREEZABLE, 0); + if (!pm_wq) + return -ENOMEM; - return pm_wq ? 0 : -ENOMEM; +#if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATION) + pm_fs_sync_wq = alloc_ordered_workqueue("pm_fs_sync", 0); + if (!pm_fs_sync_wq) { + destroy_workqueue(pm_wq); + return -ENOMEM; + } +#endif + + return 0; } static int __init pm_init(void) { - int error = pm_start_workqueue(); + int error = pm_start_workqueues(); if (error) return error; hibernate_image_size_init(); diff --git a/kernel/power/power.h b/kernel/power/power.h index 7ccd709af93f..75b63843886e 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -19,6 +19,7 @@ struct swsusp_info { } __aligned(PAGE_SIZE); #if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATION) +extern int pm_sleep_fs_sync(void); extern bool filesystem_freeze_enabled; #endif From 8e2d57e6539b1c2c9b76bc1726ac49384a96c04f Mon Sep 17 00:00:00 2001 From: Samuel Wu Date: Wed, 19 Nov 2025 09:14:25 -0800 Subject: [PATCH 79/96] PM: sleep: Call pm_sleep_fs_sync() instead of ksys_sync_helper() Replace the direct calls to ksys_sync_helper() with the new pm_sleep_fs_sync() in suspend and hibernation code paths. This enables the new mechanism allowing the filesystem sync phase to be interrupted. Suggested-by: Saravana Kannan Signed-off-by: Samuel Wu Co-developed-by: Rafael J. Wysocki [ rjw: Subject and changelog edits, tags adjustment ] Link: https://patch.msgid.link/20251119171426.4086783-3-wusamuel@google.com Signed-off-by: Rafael J. 
Wysocki --- kernel/power/hibernate.c | 6 +++++- kernel/power/suspend.c | 6 +++++- kernel/power/user.c | 4 +++- 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 53166ef86ba4..7fed1cd36e4d 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -820,7 +820,10 @@ int hibernate(void) if (error) goto Restore; - ksys_sync_helper(); + error = pm_sleep_fs_sync(); + if (error) + goto Notify; + if (filesystem_freeze_enabled) filesystems_freeze(); @@ -892,6 +895,7 @@ int hibernate(void) freezer_test_done = false; Exit: filesystems_thaw(); + Notify: pm_notifier_call_chain(PM_POST_HIBERNATION); Restore: pm_restore_console(); diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 1c2f777da367..02f50afaa927 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -594,7 +594,11 @@ static int enter_state(suspend_state_t state) if (sync_on_suspend_enabled) { trace_suspend_resume(TPS("sync_filesystems"), 0, true); - ksys_sync_helper(); + + error = pm_sleep_fs_sync(); + if (error) + goto Unlock; + trace_suspend_resume(TPS("sync_filesystems"), 0, false); } diff --git a/kernel/power/user.c b/kernel/power/user.c index 3f9e3efb9f6e..4401cfe26e5c 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -278,7 +278,9 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, if (data->frozen) break; - ksys_sync_helper(); + error = pm_sleep_fs_sync(); + if (error) + break; error = freeze_processes(); if (error) From c3852d2ca46503c00866d8eea5e18bb67d981f9b Mon Sep 17 00:00:00 2001 From: Christian Marangi Date: Thu, 20 Nov 2025 14:35:02 +0100 Subject: [PATCH 80/96] cpufreq: qcom-nvmem: fix compilation warning for qcom_cpufreq_ipq806x_match_list If CONFIG_OF is not enabled, of_match_node() is set as NULL and qcom_cpufreq_ipq806x_match_list won't be used, causing a compilation warning. Flag qcom_cpufreq_ipq806x_match_list as __maybe_unused to fix the compilation warning. While at it, also flag it as __initconst as it's used only in probe context and can be freed after probe. This follows the pattern of the usual of_device_id variables.
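
As a minimal illustration only (not part of the change; the table name and compatible string below are made up), the resulting pattern is:

    #include <linux/mod_devicetable.h>
    #include <linux/of.h>

    /*
     * Referenced only via of_match_node(), which is defined as NULL when
     * CONFIG_OF is disabled, so in that configuration nothing uses the table
     * and the compiler warns about an unused const variable; __maybe_unused
     * marks that as intentional.
     */
    static const struct of_device_id example_match_list[] __maybe_unused = {
            { .compatible = "vendor,example-soc" },
            { /* sentinel */ }
    };
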
Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202511202119.6zvvFMup-lkp@intel.com/ Fixes: 58f5d39d5ed8 ("cpufreq: qcom-nvmem: add compatible fallback for ipq806x for no SMEM") Signed-off-by: Christian Marangi [ Viresh: Drop __initconst ] Signed-off-by: Viresh Kumar --- drivers/cpufreq/qcom-cpufreq-nvmem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cpufreq/qcom-cpufreq-nvmem.c b/drivers/cpufreq/qcom-cpufreq-nvmem.c index d5af74bf71c6..81e16b5a0245 100644 --- a/drivers/cpufreq/qcom-cpufreq-nvmem.c +++ b/drivers/cpufreq/qcom-cpufreq-nvmem.c @@ -256,7 +256,7 @@ static int qcom_cpufreq_krait_name_version(struct device *cpu_dev, return ret; } -static const struct of_device_id qcom_cpufreq_ipq806x_match_list[] = { +static const struct of_device_id qcom_cpufreq_ipq806x_match_list[] __maybe_unused = { { .compatible = "qcom,ipq8062", .data = (const void *)QCOM_ID_IPQ8062 }, { .compatible = "qcom,ipq8064", .data = (const void *)QCOM_ID_IPQ8064 }, { .compatible = "qcom,ipq8065", .data = (const void *)QCOM_ID_IPQ8065 }, From 1d6c915819f5b805c35487b6ce5923e31a28266b Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Thu, 20 Nov 2025 16:05:38 -0800 Subject: [PATCH 81/96] powercap: intel_rapl: Prepare read_raw() interface for atomic-context callers The current read_raw() implementation of the TPMI, MMIO and MSR interfaces does not distinguish between atomic and non-atomic callers. rapl_msr_read_raw() uses rdmsrq_safe_on_cpu(), which can sleep and issue cross CPU calls. When MSR-based RAPL PMU support is enabled, PMU event handlers can invoke this function from atomic context where sleeping or rescheduling is not allowed. In atomic context, the caller is already executing on the target CPU, so a direct rdmsrq() is sufficient. To support such usage, introduce an atomic flag to the read_raw() interface to allow callers pass the context information. Modify the common RAPL code to propagate this flag, and set the flag to reflect the calling contexts. Utilize the atomic flag in rapl_msr_read_raw() to perform direct MSR read with rdmsrq() when running in atomic context, and a sanity check to ensure target CPU matches the current CPU for such use cases. The TPMI and MMIO implementations do not require special atomic handling, so the flag is ignored in those paths. This is a preparatory patch for adding MSR-based RAPL PMU support. Signed-off-by: Kuppuswamy Sathyanarayanan Reviewed-by: Srinivas Pandruvada [ rjw: Subject tweak ] Link: https://patch.msgid.link/20251121000539.386069-2-sathyanarayanan.kuppuswamy@linux.intel.com Signed-off-by: Rafael J. 
Wysocki --- drivers/powercap/intel_rapl_common.c | 24 ++++++++++--------- drivers/powercap/intel_rapl_msr.c | 16 ++++++++++++- drivers/powercap/intel_rapl_tpmi.c | 2 +- .../int340x_thermal/processor_thermal_rapl.c | 2 +- include/linux/intel_rapl.h | 2 +- 5 files changed, 31 insertions(+), 15 deletions(-) diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c index 57bebd07c7d0..47ec34d4c099 100644 --- a/drivers/powercap/intel_rapl_common.c +++ b/drivers/powercap/intel_rapl_common.c @@ -253,7 +253,8 @@ struct rapl_primitive_info { static void rapl_init_domains(struct rapl_package *rp); static int rapl_read_data_raw(struct rapl_domain *rd, enum rapl_primitives prim, - bool xlate, u64 *data); + bool xlate, u64 *data, + bool atomic); static int rapl_write_data_raw(struct rapl_domain *rd, enum rapl_primitives prim, unsigned long long value); @@ -289,7 +290,7 @@ static int get_energy_counter(struct powercap_zone *power_zone, cpus_read_lock(); rd = power_zone_to_rapl_domain(power_zone); - if (!rapl_read_data_raw(rd, ENERGY_COUNTER, true, &energy_now)) { + if (!rapl_read_data_raw(rd, ENERGY_COUNTER, true, &energy_now, false)) { *energy_raw = energy_now; cpus_read_unlock(); @@ -830,7 +831,8 @@ prim_fixups(struct rapl_domain *rd, enum rapl_primitives prim) * 63-------------------------- 31--------------------------- 0 */ static int rapl_read_data_raw(struct rapl_domain *rd, - enum rapl_primitives prim, bool xlate, u64 *data) + enum rapl_primitives prim, bool xlate, u64 *data, + bool atomic) { u64 value; enum rapl_primitives prim_fixed = prim_fixups(rd, prim); @@ -852,7 +854,7 @@ static int rapl_read_data_raw(struct rapl_domain *rd, ra.mask = rpi->mask; - if (rd->rp->priv->read_raw(get_rid(rd->rp), &ra)) { + if (rd->rp->priv->read_raw(get_rid(rd->rp), &ra, atomic)) { pr_debug("failed to read reg 0x%llx for %s:%s\n", ra.reg.val, rd->rp->name, rd->name); return -EIO; } @@ -904,7 +906,7 @@ static int rapl_read_pl_data(struct rapl_domain *rd, int pl, if (!is_pl_valid(rd, pl)) return -EINVAL; - return rapl_read_data_raw(rd, prim, xlate, data); + return rapl_read_data_raw(rd, prim, xlate, data, false); } static int rapl_write_pl_data(struct rapl_domain *rd, int pl, @@ -941,7 +943,7 @@ static int rapl_check_unit_core(struct rapl_domain *rd) ra.reg = rd->regs[RAPL_DOMAIN_REG_UNIT]; ra.mask = ~0; - if (rd->rp->priv->read_raw(get_rid(rd->rp), &ra)) { + if (rd->rp->priv->read_raw(get_rid(rd->rp), &ra, false)) { pr_err("Failed to read power unit REG 0x%llx on %s:%s, exit.\n", ra.reg.val, rd->rp->name, rd->name); return -ENODEV; @@ -969,7 +971,7 @@ static int rapl_check_unit_atom(struct rapl_domain *rd) ra.reg = rd->regs[RAPL_DOMAIN_REG_UNIT]; ra.mask = ~0; - if (rd->rp->priv->read_raw(get_rid(rd->rp), &ra)) { + if (rd->rp->priv->read_raw(get_rid(rd->rp), &ra, false)) { pr_err("Failed to read power unit REG 0x%llx on %s:%s, exit.\n", ra.reg.val, rd->rp->name, rd->name); return -ENODEV; @@ -1156,7 +1158,7 @@ static int rapl_check_unit_tpmi(struct rapl_domain *rd) ra.reg = rd->regs[RAPL_DOMAIN_REG_UNIT]; ra.mask = ~0; - if (rd->rp->priv->read_raw(get_rid(rd->rp), &ra)) { + if (rd->rp->priv->read_raw(get_rid(rd->rp), &ra, false)) { pr_err("Failed to read power unit REG 0x%llx on %s:%s, exit.\n", ra.reg.val, rd->rp->name, rd->name); return -ENODEV; @@ -1328,7 +1330,7 @@ static void rapl_update_domain_data(struct rapl_package *rp) struct rapl_primitive_info *rpi = get_rpi(rp, prim); if (!rapl_read_data_raw(&rp->domains[dmn], prim, - rpi->unit, &val)) + rpi->unit, &val, false)) 
rp->domains[dmn].rdd.primitives[prim] = val; } } @@ -1428,7 +1430,7 @@ static int rapl_check_domain(int domain, struct rapl_package *rp) */ ra.mask = ENERGY_STATUS_MASK; - if (rp->priv->read_raw(get_rid(rp), &ra) || !ra.value) + if (rp->priv->read_raw(get_rid(rp), &ra, false) || !ra.value) return -ENODEV; return 0; @@ -1639,7 +1641,7 @@ static u64 event_read_counter(struct perf_event *event) if (event->hw.idx < 0) return 0; - ret = rapl_read_data_raw(&rp->domains[event->hw.idx], ENERGY_COUNTER, false, &val); + ret = rapl_read_data_raw(&rp->domains[event->hw.idx], ENERGY_COUNTER, false, &val, true); /* Return 0 for failed read */ if (ret) diff --git a/drivers/powercap/intel_rapl_msr.c b/drivers/powercap/intel_rapl_msr.c index c6b9a7debc35..6e3c50af0912 100644 --- a/drivers/powercap/intel_rapl_msr.c +++ b/drivers/powercap/intel_rapl_msr.c @@ -102,12 +102,26 @@ static int rapl_cpu_down_prep(unsigned int cpu) return 0; } -static int rapl_msr_read_raw(int cpu, struct reg_action *ra) +static int rapl_msr_read_raw(int cpu, struct reg_action *ra, bool atomic) { + /* + * When called from atomic-context (eg PMU event handler) + * perform MSR read directly using rdmsrq(). + */ + if (atomic) { + if (unlikely(smp_processor_id() != cpu)) + return -EIO; + + rdmsrq(ra->reg.msr, ra->value); + goto out; + } + if (rdmsrq_safe_on_cpu(cpu, ra->reg.msr, &ra->value)) { pr_debug("failed to read msr 0x%x on cpu %d\n", ra->reg.msr, cpu); return -EIO; } + +out: ra->value &= ra->mask; return 0; } diff --git a/drivers/powercap/intel_rapl_tpmi.c b/drivers/powercap/intel_rapl_tpmi.c index 82201bf4685d..0a0b85f4528b 100644 --- a/drivers/powercap/intel_rapl_tpmi.c +++ b/drivers/powercap/intel_rapl_tpmi.c @@ -60,7 +60,7 @@ static DEFINE_MUTEX(tpmi_rapl_lock); static struct powercap_control_type *tpmi_control_type; -static int tpmi_rapl_read_raw(int id, struct reg_action *ra) +static int tpmi_rapl_read_raw(int id, struct reg_action *ra, bool atomic) { if (!ra->reg.mmio) return -EINVAL; diff --git a/drivers/thermal/intel/int340x_thermal/processor_thermal_rapl.c b/drivers/thermal/intel/int340x_thermal/processor_thermal_rapl.c index bde2cc386afd..bf51a17c5be6 100644 --- a/drivers/thermal/intel/int340x_thermal/processor_thermal_rapl.c +++ b/drivers/thermal/intel/int340x_thermal/processor_thermal_rapl.c @@ -19,7 +19,7 @@ static const struct rapl_mmio_regs rapl_mmio_default = { .limits[RAPL_DOMAIN_DRAM] = BIT(POWER_LIMIT2), }; -static int rapl_mmio_read_raw(int cpu, struct reg_action *ra) +static int rapl_mmio_read_raw(int cpu, struct reg_action *ra, bool atomic) { if (!ra->reg.mmio) return -EINVAL; diff --git a/include/linux/intel_rapl.h b/include/linux/intel_rapl.h index c0397423d3a8..e9ade2ff4af6 100644 --- a/include/linux/intel_rapl.h +++ b/include/linux/intel_rapl.h @@ -152,7 +152,7 @@ struct rapl_if_priv { union rapl_reg reg_unit; union rapl_reg regs[RAPL_DOMAIN_MAX][RAPL_DOMAIN_REG_MAX]; int limits[RAPL_DOMAIN_MAX]; - int (*read_raw)(int id, struct reg_action *ra); + int (*read_raw)(int id, struct reg_action *ra, bool atomic); int (*write_raw)(int id, struct reg_action *ra); void *defaults; void *rpi; From 748d6ba43afde7e9ac27443233203995cc15d235 Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Thu, 20 Nov 2025 16:05:39 -0800 Subject: [PATCH 82/96] powercap: intel_rapl: Enable MSR-based RAPL PMU support Currently, RAPL PMU support requires adding CPU model entries to arch/x86/events/rapl.c for each new generation. 
However, RAPL MSRs are not architectural and require platform-specific customization, making arch/x86 an inappropriate location for this functionality. The powercap subsystem already handles RAPL functionality and is the natural place to consolidate all RAPL features. The powercap RAPL driver already includes PMU support for TPMI-based RAPL interfaces, making it straightforward to extend this support to MSR-based RAPL interfaces as well. This consolidation eliminates the need to maintain RAPL support in multiple subsystems and provides a unified approach for both TPMI and MSR-based RAPL implementations. The MSR-based PMU support includes the following updates: 1. Register MSR-based PMU support for the supported platforms and unregister it when no online CPUs remain in the package. 2. Remove existing checks that restrict RAPL PMU support to TPMI-based interfaces and extend the logic to allow MSR-based RAPL interfaces. 3. Define a CPU model list to determine which processors should register RAPL PMU interface through the powercap driver for MSR-based RAPL, excluding those that support TPMI interface. This list prevents conflicts with existing arch/x86 PMU code that already registers RAPL PMU for some processors. Add Panther Lake & Wildcat Lake to the CPU models list. Signed-off-by: Kuppuswamy Sathyanarayanan Reviewed-by: Srinivas Pandruvada [ rjw: Changelog edits ] Link: https://patch.msgid.link/20251121000539.386069-3-sathyanarayanan.kuppuswamy@linux.intel.com Signed-off-by: Rafael J. Wysocki --- drivers/powercap/intel_rapl_common.c | 12 ++++++------ drivers/powercap/intel_rapl_msr.c | 24 ++++++++++++++++++++++-- 2 files changed, 28 insertions(+), 8 deletions(-) diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c index 47ec34d4c099..b9d87e56cbbc 100644 --- a/drivers/powercap/intel_rapl_common.c +++ b/drivers/powercap/intel_rapl_common.c @@ -1597,11 +1597,11 @@ static int get_pmu_cpu(struct rapl_package *rp) if (!rp->has_pmu) return nr_cpu_ids; - /* Only TPMI RAPL is supported for now */ - if (rp->priv->type != RAPL_IF_TPMI) + /* Only TPMI & MSR RAPL are supported for now */ + if (rp->priv->type != RAPL_IF_TPMI && rp->priv->type != RAPL_IF_MSR) return nr_cpu_ids; - /* TPMI RAPL uses any CPU in the package for PMU */ + /* TPMI/MSR RAPL uses any CPU in the package for PMU */ for_each_online_cpu(cpu) if (topology_physical_package_id(cpu) == rp->id) return cpu; @@ -1614,11 +1614,11 @@ static bool is_rp_pmu_cpu(struct rapl_package *rp, int cpu) if (!rp->has_pmu) return false; - /* Only TPMI RAPL is supported for now */ - if (rp->priv->type != RAPL_IF_TPMI) + /* Only TPMI & MSR RAPL are supported for now */ + if (rp->priv->type != RAPL_IF_TPMI && rp->priv->type != RAPL_IF_MSR) return false; - /* TPMI RAPL uses any CPU in the package for PMU */ + /* TPMI/MSR RAPL uses any CPU in the package for PMU */ return topology_physical_package_id(cpu) == rp->id; } diff --git a/drivers/powercap/intel_rapl_msr.c b/drivers/powercap/intel_rapl_msr.c index 6e3c50af0912..0ce1096b6314 100644 --- a/drivers/powercap/intel_rapl_msr.c +++ b/drivers/powercap/intel_rapl_msr.c @@ -33,6 +33,8 @@ /* private data for RAPL MSR Interface */ static struct rapl_if_priv *rapl_msr_priv; +static bool rapl_msr_pmu __ro_after_init; + static struct rapl_if_priv rapl_msr_priv_intel = { .type = RAPL_IF_MSR, .reg_unit.msr = MSR_RAPL_POWER_UNIT, @@ -79,6 +81,8 @@ static int rapl_cpu_online(unsigned int cpu) rp = rapl_add_package_cpuslocked(cpu, rapl_msr_priv, true); if (IS_ERR(rp)) return PTR_ERR(rp); 
+ if (rapl_msr_pmu) + rapl_package_add_pmu(rp); } cpumask_set_cpu(cpu, &rp->cpumask); return 0; @@ -95,10 +99,14 @@ static int rapl_cpu_down_prep(unsigned int cpu) cpumask_clear_cpu(cpu, &rp->cpumask); lead_cpu = cpumask_first(&rp->cpumask); - if (lead_cpu >= nr_cpu_ids) + if (lead_cpu >= nr_cpu_ids) { + if (rapl_msr_pmu) + rapl_package_remove_pmu(rp); rapl_remove_package_cpuslocked(rp); - else if (rp->lead_cpu == cpu) + } else if (rp->lead_cpu == cpu) { rp->lead_cpu = lead_cpu; + } + return 0; } @@ -171,6 +179,13 @@ static const struct x86_cpu_id pl4_support_ids[] = { {} }; +/* List of MSR-based RAPL PMU support CPUs */ +static const struct x86_cpu_id pmu_support_ids[] = { + X86_MATCH_VFM(INTEL_PANTHERLAKE_L, NULL), + X86_MATCH_VFM(INTEL_WILDCATLAKE_L, NULL), + {} +}; + static int rapl_msr_probe(struct platform_device *pdev) { const struct x86_cpu_id *id = x86_match_cpu(pl4_support_ids); @@ -198,6 +213,11 @@ static int rapl_msr_probe(struct platform_device *pdev) pr_info("PL4 support detected.\n"); } + if (x86_match_cpu(pmu_support_ids)) { + rapl_msr_pmu = true; + pr_info("MSR-based RAPL PMU support enabled\n"); + } + rapl_msr_priv->control_type = powercap_register_control_type(NULL, "intel-rapl", NULL); if (IS_ERR(rapl_msr_priv->control_type)) { pr_debug("failed to register powercap control_type.\n"); From 447c4e8338dbfad517769d26b53d633b88d51184 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Thu, 30 Oct 2025 20:26:28 +0200 Subject: [PATCH 83/96] PM / devfreq: Move governor.h to a public header location Some device drivers (and out-of-tree modules) might want to define device-specific device governors. Rather than restricting all of them to be a part of drivers/devfreq/ (which is not possible for out-of-tree drivers anyway) move governor.h to include/linux/devfreq-governor.h and update all drivers to use it. The devfreq_cpu_data is only used internally, by the passive governor, so it is moved to the driver source rather than being a part of the public interface. 
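
Not part of this patch, but to illustrate what the move enables: a device-specific governor can now be built against the public header alone. The sketch below is hypothetical (all names invented) and simply mirrors the in-tree performance governor:

    #include <linux/devfreq.h>
    #include <linux/devfreq-governor.h>
    #include <linux/module.h>

    /* Always request the maximum frequency, like the performance governor. */
    static int example_get_target_freq(struct devfreq *df, unsigned long *freq)
    {
            *freq = DEVFREQ_MAX_FREQ;
            return 0;
    }

    static int example_event_handler(struct devfreq *devfreq,
                                     unsigned int event, void *data)
    {
            return 0;
    }

    static struct devfreq_governor example_governor = {
            .name = "example",
            .get_target_freq = example_get_target_freq,
            .event_handler = example_event_handler,
    };

    static int __init example_governor_init(void)
    {
            return devfreq_add_governor(&example_governor);
    }
    module_init(example_governor_init);

    static void __exit example_governor_exit(void)
    {
            /* Return value ignored for brevity; real code should check it. */
            devfreq_remove_governor(&example_governor);
    }
    module_exit(example_governor_exit);

    MODULE_LICENSE("GPL");
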
Reported-by: Robie Basak Acked-by: Jon Hunter Signed-off-by: Dmitry Baryshkov Reviewed-by: Bjorn Andersson Acked-by: MyungJoo Ham Signed-off-by: Chanwoo Choi Link: https://patchwork.kernel.org/project/linux-pm/patch/20251030-governor-public-v2-1-432a11a9975a@oss.qualcomm.com/ --- drivers/devfreq/devfreq.c | 2 +- drivers/devfreq/governor_passive.c | 27 ++++++++++++++- drivers/devfreq/governor_performance.c | 2 +- drivers/devfreq/governor_powersave.c | 2 +- drivers/devfreq/governor_simpleondemand.c | 2 +- drivers/devfreq/governor_userspace.c | 2 +- drivers/devfreq/hisi_uncore_freq.c | 3 +- drivers/devfreq/tegra30-devfreq.c | 3 +- .../linux/devfreq-governor.h | 33 +++---------------- 9 files changed, 37 insertions(+), 39 deletions(-) rename drivers/devfreq/governor.h => include/linux/devfreq-governor.h (80%) diff --git a/drivers/devfreq/devfreq.c b/drivers/devfreq/devfreq.c index 2e8d01d47f69..00979f2e0e27 100644 --- a/drivers/devfreq/devfreq.c +++ b/drivers/devfreq/devfreq.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -28,7 +29,6 @@ #include #include #include -#include "governor.h" #define CREATE_TRACE_POINTS #include diff --git a/drivers/devfreq/governor_passive.c b/drivers/devfreq/governor_passive.c index 953cf9a1e9f7..8cd6f9a59f64 100644 --- a/drivers/devfreq/governor_passive.c +++ b/drivers/devfreq/governor_passive.c @@ -14,8 +14,33 @@ #include #include #include +#include #include -#include "governor.h" + +/** + * struct devfreq_cpu_data - Hold the per-cpu data + * @node: list node + * @dev: reference to cpu device. + * @first_cpu: the cpumask of the first cpu of a policy. + * @opp_table: reference to cpu opp table. + * @cur_freq: the current frequency of the cpu. + * @min_freq: the min frequency of the cpu. + * @max_freq: the max frequency of the cpu. + * + * This structure stores the required cpu_data of a cpu. + * This is auto-populated by the governor. 
+ */ +struct devfreq_cpu_data { + struct list_head node; + + struct device *dev; + unsigned int first_cpu; + + struct opp_table *opp_table; + unsigned int cur_freq; + unsigned int min_freq; + unsigned int max_freq; +}; static struct devfreq_cpu_data * get_parent_cpu_data(struct devfreq_passive_data *p_data, diff --git a/drivers/devfreq/governor_performance.c b/drivers/devfreq/governor_performance.c index 2e4e981446fa..fdb22bf512cf 100644 --- a/drivers/devfreq/governor_performance.c +++ b/drivers/devfreq/governor_performance.c @@ -7,8 +7,8 @@ */ #include +#include #include -#include "governor.h" static int devfreq_performance_func(struct devfreq *df, unsigned long *freq) diff --git a/drivers/devfreq/governor_powersave.c b/drivers/devfreq/governor_powersave.c index f059e8814804..ee2d6ec8a512 100644 --- a/drivers/devfreq/governor_powersave.c +++ b/drivers/devfreq/governor_powersave.c @@ -7,8 +7,8 @@ */ #include +#include #include -#include "governor.h" static int devfreq_powersave_func(struct devfreq *df, unsigned long *freq) diff --git a/drivers/devfreq/governor_simpleondemand.c b/drivers/devfreq/governor_simpleondemand.c index c23435736367..9c69b96df5f9 100644 --- a/drivers/devfreq/governor_simpleondemand.c +++ b/drivers/devfreq/governor_simpleondemand.c @@ -9,8 +9,8 @@ #include #include #include +#include #include -#include "governor.h" /* Default constants for DevFreq-Simple-Ondemand (DFSO) */ #define DFSO_UPTHRESHOLD (90) diff --git a/drivers/devfreq/governor_userspace.c b/drivers/devfreq/governor_userspace.c index 175de0c0b50e..395174f93960 100644 --- a/drivers/devfreq/governor_userspace.c +++ b/drivers/devfreq/governor_userspace.c @@ -9,11 +9,11 @@ #include #include #include +#include #include #include #include #include -#include "governor.h" struct userspace_data { unsigned long user_frequency; diff --git a/drivers/devfreq/hisi_uncore_freq.c b/drivers/devfreq/hisi_uncore_freq.c index 96d1815059e3..b8e4621c57eb 100644 --- a/drivers/devfreq/hisi_uncore_freq.c +++ b/drivers/devfreq/hisi_uncore_freq.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -26,8 +27,6 @@ #include #include -#include "governor.h" - struct hisi_uncore_pcc_data { u16 status; u16 resv; diff --git a/drivers/devfreq/tegra30-devfreq.c b/drivers/devfreq/tegra30-devfreq.c index 4a4f0106ab9d..77cbb204087c 100644 --- a/drivers/devfreq/tegra30-devfreq.c +++ b/drivers/devfreq/tegra30-devfreq.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -21,8 +22,6 @@ #include -#include "governor.h" - #define ACTMON_GLB_STATUS 0x0 #define ACTMON_GLB_PERIOD_CTRL 0x4 diff --git a/drivers/devfreq/governor.h b/include/linux/devfreq-governor.h similarity index 80% rename from drivers/devfreq/governor.h rename to include/linux/devfreq-governor.h index 0adfebc0467a..dfdd0160a29f 100644 --- a/drivers/devfreq/governor.h +++ b/include/linux/devfreq-governor.h @@ -5,11 +5,11 @@ * Copyright (C) 2011 Samsung Electronics * MyungJoo Ham * - * This header is for devfreq governors in drivers/devfreq/ + * This header is for devfreq governors */ -#ifndef _GOVERNOR_H -#define _GOVERNOR_H +#ifndef __LINUX_DEVFREQ_DEVFREQ_H__ +#define __LINUX_DEVFREQ_DEVFREQ_H__ #include @@ -47,31 +47,6 @@ #define DEVFREQ_GOV_ATTR_POLLING_INTERVAL BIT(0) #define DEVFREQ_GOV_ATTR_TIMER BIT(1) -/** - * struct devfreq_cpu_data - Hold the per-cpu data - * @node: list node - * @dev: reference to cpu device. - * @first_cpu: the cpumask of the first cpu of a policy. - * @opp_table: reference to cpu opp table. 
- * @cur_freq: the current frequency of the cpu. - * @min_freq: the min frequency of the cpu. - * @max_freq: the max frequency of the cpu. - * - * This structure stores the required cpu_data of a cpu. - * This is auto-populated by the governor. - */ -struct devfreq_cpu_data { - struct list_head node; - - struct device *dev; - unsigned int first_cpu; - - struct opp_table *opp_table; - unsigned int cur_freq; - unsigned int min_freq; - unsigned int max_freq; -}; - /** * struct devfreq_governor - Devfreq policy governor * @node: list node - contains registered devfreq governors @@ -124,4 +99,4 @@ static inline int devfreq_update_stats(struct devfreq *df) return df->profile->get_dev_status(df->dev.parent, &df->last_status); } -#endif /* _GOVERNOR_H */ +#endif /* __LINUX_DEVFREQ_DEVFREQ_H__ */ From 26dd44a40096468396b6438985d8e44e0743f64c Mon Sep 17 00:00:00 2001 From: Pengjie Zhang Date: Mon, 15 Sep 2025 14:21:35 +0800 Subject: [PATCH 84/96] PM / devfreq: hisi: Fix potential UAF in OPP handling Ensure all required data is acquired before calling dev_pm_opp_put(opp) to maintain correct resource acquisition and release order. Fixes: 7da2fdaaa1e6 ("PM / devfreq: Add HiSilicon uncore frequency scaling driver") Signed-off-by: Pengjie Zhang Reviewed-by: Jie Zhan Acked-by: Chanwoo Choi Signed-off-by: Chanwoo Choi Link: https://patchwork.kernel.org/project/linux-pm/patch/20250915062135.748653-1-zhangpengjie2@huawei.com/ --- drivers/devfreq/hisi_uncore_freq.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/devfreq/hisi_uncore_freq.c b/drivers/devfreq/hisi_uncore_freq.c index b8e4621c57eb..4d00d813c8ac 100644 --- a/drivers/devfreq/hisi_uncore_freq.c +++ b/drivers/devfreq/hisi_uncore_freq.c @@ -264,10 +264,11 @@ static int hisi_uncore_target(struct device *dev, unsigned long *freq, dev_err(dev, "Failed to get opp for freq %lu hz\n", *freq); return PTR_ERR(opp); } - dev_pm_opp_put(opp); data = (u32)(dev_pm_opp_get_freq(opp) / HZ_PER_MHZ); + dev_pm_opp_put(opp); + return hisi_uncore_cmd_send(uncore, HUCF_PCC_CMD_SET_FREQ, &data); } From dc30fe7a0a850a88b930581d837e9a668dbcb206 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Wed, 12 Nov 2025 18:21:21 +0100 Subject: [PATCH 85/96] PM / devfreq: tegra30: use min to simplify actmon_cpu_to_emc_rate Use min() to improve the readability of actmon_cpu_to_emc_rate() and remove any unnecessary curly braces. 
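
Illustration only (not part of the change): the two forms below are equivalent for the unsigned long operands involved; min() comes from <linux/minmax.h>, hence the new include in the diff, and unlike an open-coded conditional it type-checks its arguments at build time:

    /* Open-coded form removed by this patch: */
    if (ratio->emc_freq >= tegra->max_freq)
            return tegra->max_freq;
    else
            return ratio->emc_freq;

    /* Equivalent replacement: */
    return min(ratio->emc_freq, tegra->max_freq);
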
Reviewed-by: Dmitry Osipenko Signed-off-by: Thorsten Blum Acked-by: Thierry Reding Signed-off-by: Chanwoo Choi Link: https://patchwork.kernel.org/project/linux-pm/patch/20251112172121.3741-2-thorsten.blum@linux.dev/ --- drivers/devfreq/tegra30-devfreq.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/drivers/devfreq/tegra30-devfreq.c b/drivers/devfreq/tegra30-devfreq.c index 77cbb204087c..8b57194ac698 100644 --- a/drivers/devfreq/tegra30-devfreq.c +++ b/drivers/devfreq/tegra30-devfreq.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -325,14 +326,9 @@ static unsigned long actmon_cpu_to_emc_rate(struct tegra_devfreq *tegra, unsigned int i; const struct tegra_actmon_emc_ratio *ratio = actmon_emc_ratios; - for (i = 0; i < ARRAY_SIZE(actmon_emc_ratios); i++, ratio++) { - if (cpu_freq >= ratio->cpu_freq) { - if (ratio->emc_freq >= tegra->max_freq) - return tegra->max_freq; - else - return ratio->emc_freq; - } - } + for (i = 0; i < ARRAY_SIZE(actmon_emc_ratios); i++, ratio++) + if (cpu_freq >= ratio->cpu_freq) + return min(ratio->emc_freq, tegra->max_freq); return 0; } From c03aef8833597f184cc3439d2d336596f63bd709 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 21 Nov 2025 21:09:42 +0100 Subject: [PATCH 86/96] PM: hibernate: Extra cleanup of comments in swap handling code Continue recent cleanups of comments in the swap handling code. Unify the use of white space in the comments, drop some unuseful comments outside function bodies, and move some other comments into function bodies. No functional impact. Signed-off-by: Rafael J. Wysocki Link: https://patch.msgid.link/5943864.DvuYhMxLoT@rafael.j.wysocki --- kernel/power/swap.c | 81 ++++++++++++++++++--------------------------- 1 file changed, 33 insertions(+), 48 deletions(-) diff --git a/kernel/power/swap.c b/kernel/power/swap.c index c7c2b89c9b2b..33a186373bef 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -46,19 +46,18 @@ static bool clean_pages_on_read; static bool clean_pages_on_decompress; /* - * The swap map is a data structure used for keeping track of each page - * written to a swap partition. It consists of many swap_map_page - * structures that contain each an array of MAP_PAGE_ENTRIES swap entries. - * These structures are stored on the swap and linked together with the - * help of the .next_swap member. + * The swap map is a data structure used for keeping track of each page + * written to a swap partition. It consists of many swap_map_page structures + * that contain each an array of MAP_PAGE_ENTRIES swap entries. These + * structures are stored on the swap and linked together with the help of the + * .next_swap member. * - * The swap map is created during suspend. The swap map pages are - * allocated and populated one at a time, so we only need one memory - * page to set up the entire structure. + * The swap map is created during suspend. The swap map pages are allocated and + * populated one at a time, so we only need one memory page to set up the entire + * structure. * - * During resume we pick up all swap_map_page structures into a list. + * During resume we pick up all swap_map_page structures into a list. */ - #define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1) /* @@ -89,10 +88,8 @@ struct swap_map_page_list { }; /* - * The swap_map_handle structure is used for handling swap in - * a file-alike way + * The swap_map_handle structure is used for handling swap in a file-alike way. 
*/ - struct swap_map_handle { struct swap_map_page *cur; struct swap_map_page_list *maps; @@ -117,10 +114,9 @@ struct swsusp_header { static struct swsusp_header *swsusp_header; /* - * The following functions are used for tracing the allocated - * swap pages, so that they can be freed in case of an error. + * The following functions are used for tracing the allocated swap pages, so + * that they can be freed in case of an error. */ - struct swsusp_extent { struct rb_node node; unsigned long start; @@ -170,15 +166,14 @@ static int swsusp_extents_insert(unsigned long swap_offset) return 0; } -/* - * alloc_swapdev_block - allocate a swap page and register that it has - * been allocated, so that it can be freed in case of an error. - */ - sector_t alloc_swapdev_block(int swap) { unsigned long offset; + /* + * Allocate a swap page and register that it has been allocated, so that + * it can be freed in case of an error. + */ offset = swp_offset(get_swap_page_of_type(swap)); if (offset) { if (swsusp_extents_insert(offset)) @@ -189,16 +184,14 @@ sector_t alloc_swapdev_block(int swap) return 0; } -/* - * free_all_swap_pages - free swap pages allocated for saving image data. - * It also frees the extents used to register which swap entries had been - * allocated. - */ - void free_all_swap_pages(int swap) { struct rb_node *node; + /* + * Free swap pages allocated for saving image data. It also frees the + * extents used to register which swap entries had been allocated. + */ while ((node = swsusp_extents.rb_node)) { struct swsusp_extent *ext; @@ -303,6 +296,7 @@ static int hib_wait_io(struct hib_bio_batch *hb) /* * Saving part */ + static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags) { int error; @@ -615,9 +609,6 @@ static void free_crc_data(struct crc_data *crc) kfree(crc); } -/* - * CRC32 update function that runs in its own thread. - */ static int crc32_threadfn(void *data) { struct crc_data *d = data; @@ -642,6 +633,7 @@ static int crc32_threadfn(void *data) } return 0; } + /* * Structure used for data compression. */ @@ -663,9 +655,6 @@ struct cmp_data { /* Indicates the image size after compression */ static atomic64_t compressed_size = ATOMIC_INIT(0); -/* - * Compression function that runs in its own thread. - */ static int compress_threadfn(void *data) { struct cmp_data *d = data; @@ -936,15 +925,15 @@ static int enough_swap(unsigned int nr_pages) } /** - * swsusp_write - Write entire image and metadata. - * @flags: flags to pass to the "boot" kernel in the image header + * swsusp_write - Write entire image and metadata. + * @flags: flags to pass to the "boot" kernel in the image header * - * It is important _NOT_ to umount filesystems at this point. We want - * them synced (in case something goes wrong) but we DO not want to mark - * filesystem clean: it is not. (And it does not matter, if we resume - * correctly, we'll mark system clean, anyway.) + * It is important _NOT_ to umount filesystems at this point. We want them + * synced (in case something goes wrong) but we DO not want to mark filesystem + * clean: it is not. (And it does not matter, if we resume correctly, we'll mark + * system clean, anyway.) * - * Return: 0 on success, negative error code on failure. + * Return: 0 on success, negative error code on failure. */ int swsusp_write(unsigned int flags) { @@ -988,8 +977,8 @@ int swsusp_write(unsigned int flags) } /* - * The following functions allow us to read data using a swap map - * in a file-like way. 
+ * The following functions allow us to read data using a swap map in a file-like + * way. */ static void release_swap_reader(struct swap_map_handle *handle) @@ -1161,9 +1150,6 @@ struct dec_data { unsigned char cmp[CMP_SIZE]; /* compressed buffer */ }; -/* - * Decompression function that runs in its own thread. - */ static int decompress_threadfn(void *data) { struct dec_data *d = data; @@ -1618,7 +1604,6 @@ int swsusp_check(bool exclusive) /** * swsusp_close - close resume device. */ - void swsusp_close(void) { if (IS_ERR(hib_resume_bdev_file)) { @@ -1630,9 +1615,9 @@ void swsusp_close(void) } /** - * swsusp_unmark - Unmark swsusp signature in the resume device + * swsusp_unmark - Unmark swsusp signature in the resume device * - * Return: 0 on success, negative error code on failure. + * Return: 0 on success, negative error code on failure. */ #ifdef CONFIG_SUSPEND int swsusp_unmark(void) From 15bfdadd617ec5363802f7cb6a0385b6569f374e Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 21 Nov 2025 21:11:16 +0100 Subject: [PATCH 87/96] cpuidle: governors: teo: Add missing space to the description There is a missing space in the governor description comment, so add it. No functional impact. Signed-off-by: Rafael J. Wysocki Link: https://patch.msgid.link/5059034.31r3eYUQgx@rafael.j.wysocki --- drivers/cpuidle/governors/teo.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c index bab186336bf4..81ac5fd58a1c 100644 --- a/drivers/cpuidle/governors/teo.c +++ b/drivers/cpuidle/governors/teo.c @@ -76,7 +76,7 @@ * likely woken up by a non-timer wakeup source). * * 2. If the second sum computed in step 1 is greater than a half of the sum of - * both metrics for the candidate state bin and all subsequent bins(if any), + * both metrics for the candidate state bin and all subsequent bins (if any), * a shallower idle state is likely to be more suitable, so look for it. * * - Traverse the enabled idle states shallower than the candidate one in the From a4e6512a79d8486dccf3e8b066e5d6bd5ff95446 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Tue, 25 Nov 2025 12:26:42 +0100 Subject: [PATCH 88/96] PM: QoS: Introduce a CPU system wakeup QoS limit Some platforms support multiple low power states for CPUs that can be used when entering system-wide suspend. Currently we are always selecting the deepest possible state for the CPUs, which can break the system wakeup latency constraint that may be required for a use case. Let's take the first step towards addressing this problem by introducing an interface for user space that allows us to specify the CPU system wakeup QoS limit. Subsequent changes will start taking into account the new QoS limit. Reviewed-by: Dhruva Gole Reviewed-by: Kevin Hilman (TI) Tested-by: Kevin Hilman (TI) Signed-off-by: Ulf Hansson Link: https://patch.msgid.link/20251125112650.329269-2-ulf.hansson@linaro.org Signed-off-by: Rafael J.
Wysocki --- include/linux/pm_qos.h | 9 ++++ kernel/power/Kconfig | 11 +++++ kernel/power/qos.c | 106 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 126 insertions(+) diff --git a/include/linux/pm_qos.h b/include/linux/pm_qos.h index 4a69d4af3ff8..6cea4455f867 100644 --- a/include/linux/pm_qos.h +++ b/include/linux/pm_qos.h @@ -162,6 +162,15 @@ static inline void cpu_latency_qos_update_request(struct pm_qos_request *req, static inline void cpu_latency_qos_remove_request(struct pm_qos_request *req) {} #endif +#ifdef CONFIG_PM_QOS_CPU_SYSTEM_WAKEUP +s32 cpu_wakeup_latency_qos_limit(void); +#else +static inline s32 cpu_wakeup_latency_qos_limit(void) +{ + return PM_QOS_RESUME_LATENCY_NO_CONSTRAINT; +} +#endif + #ifdef CONFIG_PM enum pm_qos_flags_status __dev_pm_qos_flags(struct device *dev, s32 mask); enum pm_qos_flags_status dev_pm_qos_flags(struct device *dev, s32 mask); diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 54a623680019..05337f437cca 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -202,6 +202,17 @@ config PM_WAKELOCKS_GC depends on PM_WAKELOCKS default y +config PM_QOS_CPU_SYSTEM_WAKEUP + bool "User space interface for CPU system wakeup QoS" + depends on CPU_IDLE + help + Enable this to allow user space via the cpu_wakeup_latency file to + specify a CPU system wakeup latency limit. + + This may be particularly useful for platforms supporting multiple low + power states for CPUs during system-wide suspend and s2idle in + particular. + config PM bool "Device power management core functionality" help diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 4244b069442e..f7d8064e9adc 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -415,6 +415,105 @@ static struct miscdevice cpu_latency_qos_miscdev = { .fops = &cpu_latency_qos_fops, }; +#ifdef CONFIG_PM_QOS_CPU_SYSTEM_WAKEUP +/* The CPU system wakeup latency QoS. */ +static struct pm_qos_constraints cpu_wakeup_latency_constraints = { + .list = PLIST_HEAD_INIT(cpu_wakeup_latency_constraints.list), + .target_value = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT, + .default_value = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT, + .no_constraint_value = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT, + .type = PM_QOS_MIN, +}; + +/** + * cpu_wakeup_latency_qos_limit - Current CPU system wakeup latency QoS limit. + * + * Returns the current CPU system wakeup latency QoS limit that may have been + * requested by user space. 
+ */ +s32 cpu_wakeup_latency_qos_limit(void) +{ + return pm_qos_read_value(&cpu_wakeup_latency_constraints); +} + +static int cpu_wakeup_latency_qos_open(struct inode *inode, struct file *filp) +{ + struct pm_qos_request *req; + + req = kzalloc(sizeof(*req), GFP_KERNEL); + if (!req) + return -ENOMEM; + + req->qos = &cpu_wakeup_latency_constraints; + pm_qos_update_target(req->qos, &req->node, PM_QOS_ADD_REQ, + PM_QOS_RESUME_LATENCY_NO_CONSTRAINT); + filp->private_data = req; + + return 0; +} + +static int cpu_wakeup_latency_qos_release(struct inode *inode, + struct file *filp) +{ + struct pm_qos_request *req = filp->private_data; + + filp->private_data = NULL; + pm_qos_update_target(req->qos, &req->node, PM_QOS_REMOVE_REQ, + PM_QOS_RESUME_LATENCY_NO_CONSTRAINT); + kfree(req); + + return 0; +} + +static ssize_t cpu_wakeup_latency_qos_read(struct file *filp, char __user *buf, + size_t count, loff_t *f_pos) +{ + s32 value = pm_qos_read_value(&cpu_wakeup_latency_constraints); + + return simple_read_from_buffer(buf, count, f_pos, &value, sizeof(s32)); +} + +static ssize_t cpu_wakeup_latency_qos_write(struct file *filp, + const char __user *buf, + size_t count, loff_t *f_pos) +{ + struct pm_qos_request *req = filp->private_data; + s32 value; + + if (count == sizeof(s32)) { + if (copy_from_user(&value, buf, sizeof(s32))) + return -EFAULT; + } else { + int ret; + + ret = kstrtos32_from_user(buf, count, 16, &value); + if (ret) + return ret; + } + + if (value < 0) + return -EINVAL; + + pm_qos_update_target(req->qos, &req->node, PM_QOS_UPDATE_REQ, value); + + return count; +} + +static const struct file_operations cpu_wakeup_latency_qos_fops = { + .open = cpu_wakeup_latency_qos_open, + .release = cpu_wakeup_latency_qos_release, + .read = cpu_wakeup_latency_qos_read, + .write = cpu_wakeup_latency_qos_write, + .llseek = noop_llseek, +}; + +static struct miscdevice cpu_wakeup_latency_qos_miscdev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "cpu_wakeup_latency", + .fops = &cpu_wakeup_latency_qos_fops, +}; +#endif /* CONFIG_PM_QOS_CPU_SYSTEM_WAKEUP */ + static int __init cpu_latency_qos_init(void) { int ret; @@ -424,6 +523,13 @@ static int __init cpu_latency_qos_init(void) pr_err("%s: %s setup failed\n", __func__, cpu_latency_qos_miscdev.name); +#ifdef CONFIG_PM_QOS_CPU_SYSTEM_WAKEUP + ret = misc_register(&cpu_wakeup_latency_qos_miscdev); + if (ret < 0) + pr_err("%s: %s setup failed\n", __func__, + cpu_wakeup_latency_qos_miscdev.name); +#endif + return ret; } late_initcall(cpu_latency_qos_init); From 8e7de6dc420979f4e4443807b71dcc8b72d8c4a9 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Tue, 25 Nov 2025 12:26:43 +0100 Subject: [PATCH 89/96] pmdomain: Respect the CPU system wakeup QoS limit for s2idle A CPU system wakeup QoS limit may have been requested by user space. To avoid breaking this constraint when entering a low power state during s2idle through genpd, let's extend the corresponding genpd governor for CPUs. More precisely, during s2idle let the genpd governor select a suitable domain idle state, by taking into account the QoS limit. Reviewed-by: Dhruva Gole Reviewed-by: Kevin Hilman (TI) Tested-by: Kevin Hilman (TI) Signed-off-by: Ulf Hansson Link: https://patch.msgid.link/20251125112650.329269-3-ulf.hansson@linaro.org Signed-off-by: Rafael J. 
Wysocki --- drivers/pmdomain/core.c | 10 ++++++++-- drivers/pmdomain/governor.c | 27 +++++++++++++++++++++++++++ include/linux/pm_domain.h | 1 + 3 files changed, 36 insertions(+), 2 deletions(-) diff --git a/drivers/pmdomain/core.c b/drivers/pmdomain/core.c index 61c2277c9ce3..4fd546ef0448 100644 --- a/drivers/pmdomain/core.c +++ b/drivers/pmdomain/core.c @@ -1425,8 +1425,14 @@ static void genpd_sync_power_off(struct generic_pm_domain *genpd, bool use_lock, return; } - /* Choose the deepest state when suspending */ - genpd->state_idx = genpd->state_count - 1; + if (genpd->gov && genpd->gov->system_power_down_ok) { + if (!genpd->gov->system_power_down_ok(&genpd->domain)) + return; + } else { + /* Default to the deepest state. */ + genpd->state_idx = genpd->state_count - 1; + } + if (_genpd_power_off(genpd, false)) { genpd->states[genpd->state_idx].rejected++; return; diff --git a/drivers/pmdomain/governor.c b/drivers/pmdomain/governor.c index 39359811a930..bd1b9d66d4a5 100644 --- a/drivers/pmdomain/governor.c +++ b/drivers/pmdomain/governor.c @@ -415,9 +415,36 @@ static bool cpu_power_down_ok(struct dev_pm_domain *pd) return false; } +static bool cpu_system_power_down_ok(struct dev_pm_domain *pd) +{ + s64 constraint_ns = cpu_wakeup_latency_qos_limit() * NSEC_PER_USEC; + struct generic_pm_domain *genpd = pd_to_genpd(pd); + int state_idx = genpd->state_count - 1; + + if (!(genpd->flags & GENPD_FLAG_CPU_DOMAIN)) { + genpd->state_idx = state_idx; + return true; + } + + /* Find the deepest state for the latency constraint. */ + while (state_idx >= 0) { + s64 latency_ns = genpd->states[state_idx].power_off_latency_ns + + genpd->states[state_idx].power_on_latency_ns; + + if (latency_ns <= constraint_ns) { + genpd->state_idx = state_idx; + return true; + } + state_idx--; + } + + return false; +} + struct dev_power_governor pm_domain_cpu_gov = { .suspend_ok = default_suspend_ok, .power_down_ok = cpu_power_down_ok, + .system_power_down_ok = cpu_system_power_down_ok, }; #endif diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h index f67a2cb7d781..93ba0143ca47 100644 --- a/include/linux/pm_domain.h +++ b/include/linux/pm_domain.h @@ -153,6 +153,7 @@ enum genpd_sync_state { }; struct dev_power_governor { + bool (*system_power_down_ok)(struct dev_pm_domain *domain); bool (*power_down_ok)(struct dev_pm_domain *domain); bool (*suspend_ok)(struct device *dev); }; From e2e4695f015eacbe11178540524438f631ba9413 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Tue, 25 Nov 2025 12:26:44 +0100 Subject: [PATCH 90/96] pmdomain: Respect the CPU system wakeup QoS limit for cpuidle The CPU system wakeup QoS limit must be respected for the regular cpuidle state selection. Therefore, let's extend the genpd governor for CPUs to take the constraint into account when it selects a domain idle state for the corresponding PM domain. Reviewed-by: Dhruva Gole Reviewed-by: Kevin Hilman (TI) Tested-by: Kevin Hilman (TI) Signed-off-by: Ulf Hansson Link: https://patch.msgid.link/20251125112650.329269-4-ulf.hansson@linaro.org Signed-off-by: Rafael J. 
Wysocki --- drivers/pmdomain/governor.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/pmdomain/governor.c b/drivers/pmdomain/governor.c index bd1b9d66d4a5..05e68680f34b 100644 --- a/drivers/pmdomain/governor.c +++ b/drivers/pmdomain/governor.c @@ -351,7 +351,7 @@ static bool cpu_power_down_ok(struct dev_pm_domain *pd) ktime_t domain_wakeup, next_hrtimer; ktime_t now = ktime_get(); struct device *cpu_dev; - s64 cpu_constraint, global_constraint; + s64 cpu_constraint, global_constraint, wakeup_constraint; s64 idle_duration_ns; int cpu, i; @@ -362,7 +362,11 @@ static bool cpu_power_down_ok(struct dev_pm_domain *pd) if (!(genpd->flags & GENPD_FLAG_CPU_DOMAIN)) return true; + wakeup_constraint = cpu_wakeup_latency_qos_limit(); global_constraint = cpu_latency_qos_limit(); + if (global_constraint > wakeup_constraint) + global_constraint = wakeup_constraint; + /* * Find the next wakeup for any of the online CPUs within the PM domain * and its subdomains. Note, we only need the genpd->cpus, as it already From 99b42445f4a4aaff75eca24dfc9e6e376292dd48 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Tue, 25 Nov 2025 12:26:45 +0100 Subject: [PATCH 91/96] sched: idle: Respect the CPU system wakeup QoS limit for s2idle A CPU system wakeup QoS limit may have been requested by user space. To avoid breaking this constraint when entering a low power state during s2idle, let's start to take into account the QoS limit. Acked-by: Peter Zijlstra (Intel) Reviewed-by: Dhruva Gole Reviewed-by: Kevin Hilman (TI) Tested-by: Kevin Hilman (TI) Signed-off-by: Ulf Hansson Link: https://patch.msgid.link/20251125112650.329269-5-ulf.hansson@linaro.org Signed-off-by: Rafael J. Wysocki --- drivers/cpuidle/cpuidle.c | 12 +++++++----- include/linux/cpuidle.h | 6 ++++-- kernel/sched/idle.c | 12 +++++++----- 3 files changed, 18 insertions(+), 12 deletions(-) diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index 56132e843c99..c7876e9e024f 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -184,20 +184,22 @@ static noinstr void enter_s2idle_proper(struct cpuidle_driver *drv, * cpuidle_enter_s2idle - Enter an idle state suitable for suspend-to-idle. * @drv: cpuidle driver for the given CPU. * @dev: cpuidle device for the given CPU. + * @latency_limit_ns: Idle state exit latency limit * * If there are states with the ->enter_s2idle callback, find the deepest of * them and enter it with frozen tick. */ -int cpuidle_enter_s2idle(struct cpuidle_driver *drv, struct cpuidle_device *dev) +int cpuidle_enter_s2idle(struct cpuidle_driver *drv, struct cpuidle_device *dev, + u64 latency_limit_ns) { int index; /* - * Find the deepest state with ->enter_s2idle present, which guarantees - * that interrupts won't be enabled when it exits and allows the tick to - * be frozen safely. + * Find the deepest state with ->enter_s2idle present that meets the + * specified latency limit, which guarantees that interrupts won't be + * enabled when it exits and allows the tick to be frozen safely. 
*/ - index = find_deepest_state(drv, dev, U64_MAX, 0, true); + index = find_deepest_state(drv, dev, latency_limit_ns, 0, true); if (index > 0) { enter_s2idle_proper(drv, dev, index); local_irq_enable(); diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index a9ee4fe55dcf..4073690504a7 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -248,7 +248,8 @@ extern int cpuidle_find_deepest_state(struct cpuidle_driver *drv, struct cpuidle_device *dev, u64 latency_limit_ns); extern int cpuidle_enter_s2idle(struct cpuidle_driver *drv, - struct cpuidle_device *dev); + struct cpuidle_device *dev, + u64 latency_limit_ns); extern void cpuidle_use_deepest_state(u64 latency_limit_ns); #else static inline int cpuidle_find_deepest_state(struct cpuidle_driver *drv, @@ -256,7 +257,8 @@ static inline int cpuidle_find_deepest_state(struct cpuidle_driver *drv, u64 latency_limit_ns) {return -ENODEV; } static inline int cpuidle_enter_s2idle(struct cpuidle_driver *drv, - struct cpuidle_device *dev) + struct cpuidle_device *dev, + u64 latency_limit_ns) {return -ENODEV; } static inline void cpuidle_use_deepest_state(u64 latency_limit_ns) { diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index c39b089d4f09..c1c3d0166610 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -131,12 +131,13 @@ void __cpuidle default_idle_call(void) } static int call_cpuidle_s2idle(struct cpuidle_driver *drv, - struct cpuidle_device *dev) + struct cpuidle_device *dev, + u64 max_latency_ns) { if (current_clr_polling_and_test()) return -EBUSY; - return cpuidle_enter_s2idle(drv, dev); + return cpuidle_enter_s2idle(drv, dev, max_latency_ns); } static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev, @@ -205,12 +206,13 @@ static void cpuidle_idle_call(void) u64 max_latency_ns; if (idle_should_enter_s2idle()) { + max_latency_ns = cpu_wakeup_latency_qos_limit() * + NSEC_PER_USEC; - entered_state = call_cpuidle_s2idle(drv, dev); + entered_state = call_cpuidle_s2idle(drv, dev, + max_latency_ns); if (entered_state > 0) goto exit_idle; - - max_latency_ns = U64_MAX; } else { max_latency_ns = dev->forced_idle_latency_limit_ns; } From 2b8d594742398cdbf40012c0b3c8b71ca160e22d Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Tue, 25 Nov 2025 12:26:46 +0100 Subject: [PATCH 92/96] cpuidle: Respect the CPU system wakeup QoS limit for cpuidle The CPU system wakeup QoS limit must be respected for the regular cpuidle state selection. Therefore, let's extend the common governor helper cpuidle_governor_latency_req(), to take the constraint into account. Reviewed-by: Dhruva Gole Reviewed-by: Kevin Hilman (TI) Tested-by: Kevin Hilman (TI) Signed-off-by: Ulf Hansson Link: https://patch.msgid.link/20251125112650.329269-6-ulf.hansson@linaro.org Signed-off-by: Rafael J. 
Wysocki --- drivers/cpuidle/governor.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/cpuidle/governor.c b/drivers/cpuidle/governor.c index 0d0f9751ff8f..5d0e7f78c6c5 100644 --- a/drivers/cpuidle/governor.c +++ b/drivers/cpuidle/governor.c @@ -111,6 +111,10 @@ s64 cpuidle_governor_latency_req(unsigned int cpu) struct device *device = get_cpu_device(cpu); int device_req = dev_pm_qos_raw_resume_latency(device); int global_req = cpu_latency_qos_limit(); + int global_wake_req = cpu_wakeup_latency_qos_limit(); + + if (global_req > global_wake_req) + global_req = global_wake_req; if (device_req > global_req) device_req = global_req; From c19dfb267c28032293515a635eaefbf9194629ac Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Tue, 25 Nov 2025 12:26:47 +0100 Subject: [PATCH 93/96] Documentation: power/cpuidle: Document the CPU system wakeup latency QoS Let's document how the new CPU system wakeup latency QoS limit can be used from user space, along with how the constraint is taken into account for s2idle and cpuidle. Reviewed-by: Dhruva Gole Reviewed-by: Kevin Hilman (TI) Tested-by: Kevin Hilman (TI) Signed-off-by: Ulf Hansson Link: https://patch.msgid.link/20251125112650.329269-7-ulf.hansson@linaro.org Signed-off-by: Rafael J. Wysocki --- Documentation/admin-guide/pm/cpuidle.rst | 9 +++++++++ Documentation/power/pm_qos_interface.rst | 9 +++++---- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/Documentation/admin-guide/pm/cpuidle.rst b/Documentation/admin-guide/pm/cpuidle.rst index 0c090b076224..be4c1120e3f0 100644 --- a/Documentation/admin-guide/pm/cpuidle.rst +++ b/Documentation/admin-guide/pm/cpuidle.rst @@ -580,6 +580,15 @@ the given CPU as the upper limit for the exit latency of the idle states that they are allowed to select for that CPU. They should never select any idle states with exit latency beyond that limit. +While the above CPU QoS constraints apply to CPU idle time management, user +space may also request a CPU system wakeup latency QoS limit, via the +`cpu_wakeup_latency` file. This QoS constraint is respected when selecting a +suitable idle state for the CPUs, while entering the system-wide suspend-to-idle +sleep state, but also to the regular CPU idle time management. + +Note that, the management of the `cpu_wakeup_latency` file works according to +the 'cpu_dma_latency' file from user space point of view. Moreover, the unit +is also microseconds. Idle States Control Via Kernel Command Line =========================================== diff --git a/Documentation/power/pm_qos_interface.rst b/Documentation/power/pm_qos_interface.rst index 5019c79c7710..4c008e2202f0 100644 --- a/Documentation/power/pm_qos_interface.rst +++ b/Documentation/power/pm_qos_interface.rst @@ -55,7 +55,8 @@ int cpu_latency_qos_request_active(handle): From user space: -The infrastructure exposes one device node, /dev/cpu_dma_latency, for the CPU +The infrastructure exposes two separate device nodes, /dev/cpu_dma_latency for +the CPU latency QoS and /dev/cpu_wakeup_latency for the CPU system wakeup latency QoS. Only processes can register a PM QoS request. To provide for automatic @@ -63,15 +64,15 @@ cleanup of a process, the interface requires the process to register its parameter requests as follows. To register the default PM QoS target for the CPU latency QoS, the process must -open /dev/cpu_dma_latency. +open /dev/cpu_dma_latency. To register a CPU system wakeup QoS limit, the +process must open /dev/cpu_wakeup_latency. 
From c19dfb267c28032293515a635eaefbf9194629ac Mon Sep 17 00:00:00 2001
From: Ulf Hansson
Date: Tue, 25 Nov 2025 12:26:47 +0100
Subject: [PATCH 93/96] Documentation: power/cpuidle: Document the CPU system wakeup latency QoS

Let's document how the new CPU system wakeup latency QoS limit can be
used from user space, along with how the constraint is taken into
account for s2idle and cpuidle.

Reviewed-by: Dhruva Gole
Reviewed-by: Kevin Hilman (TI)
Tested-by: Kevin Hilman (TI)
Signed-off-by: Ulf Hansson
Link: https://patch.msgid.link/20251125112650.329269-7-ulf.hansson@linaro.org
Signed-off-by: Rafael J. Wysocki
---
 Documentation/admin-guide/pm/cpuidle.rst | 9 +++++++++
 Documentation/power/pm_qos_interface.rst | 9 +++++----
 2 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/Documentation/admin-guide/pm/cpuidle.rst b/Documentation/admin-guide/pm/cpuidle.rst
index 0c090b076224..be4c1120e3f0 100644
--- a/Documentation/admin-guide/pm/cpuidle.rst
+++ b/Documentation/admin-guide/pm/cpuidle.rst
@@ -580,6 +580,15 @@ the given CPU as the upper limit for the exit latency of the idle states that
 they are allowed to select for that CPU. They should never select any idle
 states with exit latency beyond that limit.
 
+While the above CPU QoS constraints apply to CPU idle time management, user
+space may also request a CPU system wakeup latency QoS limit via the
+`cpu_wakeup_latency` file. This QoS constraint is taken into account both when
+selecting a suitable idle state for the CPUs while entering the system-wide
+suspend-to-idle sleep state and by the regular CPU idle time management.
+
+Note that the `cpu_wakeup_latency` file is managed in the same way as the
+`cpu_dma_latency` file from the user space point of view, and its unit is also
+microseconds.
 
 Idle States Control Via Kernel Command Line
 ===========================================
diff --git a/Documentation/power/pm_qos_interface.rst b/Documentation/power/pm_qos_interface.rst
index 5019c79c7710..4c008e2202f0 100644
--- a/Documentation/power/pm_qos_interface.rst
+++ b/Documentation/power/pm_qos_interface.rst
@@ -55,7 +55,8 @@ int cpu_latency_qos_request_active(handle):
 
 From user space:
 
-The infrastructure exposes one device node, /dev/cpu_dma_latency, for the CPU
+The infrastructure exposes two separate device nodes, /dev/cpu_dma_latency for
+the CPU latency QoS and /dev/cpu_wakeup_latency for the CPU system wakeup
 latency QoS.
 
 Only processes can register a PM QoS request. To provide for automatic
@@ -63,15 +64,15 @@ cleanup of a process, the interface requires the process to register its
 parameter requests as follows.
 
 To register the default PM QoS target for the CPU latency QoS, the process must
-open /dev/cpu_dma_latency.
+open /dev/cpu_dma_latency. To register a CPU system wakeup QoS limit, the
+process must open /dev/cpu_wakeup_latency.
 
 As long as the device node is held open that process has a registered
 request on the parameter.
 
 To change the requested target value, the process needs to write an s32
 value to the open device node. Alternatively, it can write a hex string
-using the 10 char long format e.g. "0x12345678". This translates to a
-cpu_latency_qos_update_request() call.
+using the 10 char long format e.g. "0x12345678".
 
 To remove the user mode request for a target value simply close the device
 node.
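Putting the documented interface together, a user-space program registers a
CPU system wakeup latency request roughly as follows. This is a sketch based
on the documentation text above, not a tested tool; the open mode and the
100 us value are assumptions, and per the documentation the value could
equally be written as a 10-character hex string:

    /* Sketch: hold /dev/cpu_wakeup_latency open to keep a request active. */
    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            int32_t limit_us = 100;  /* request <= 100 us system wakeup latency */
            int fd = open("/dev/cpu_wakeup_latency", O_RDWR);

            if (fd < 0) {
                    perror("open /dev/cpu_wakeup_latency");
                    return 1;
            }
            /* Writing an s32 updates the requested target value. */
            if (write(fd, &limit_us, sizeof(limit_us)) != sizeof(limit_us)) {
                    perror("write");
                    close(fd);
                    return 1;
            }

            /* The request stays registered while the node is held open. */
            pause();  /* ...do latency-sensitive work here instead */

            close(fd);  /* closing the node removes the request */
            return 0;
    }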
From 6d96ceff9aeb7e7a1713faaccf472f363cc6d48f Mon Sep 17 00:00:00 2001
From: Andy Shevchenko
Date: Mon, 24 Nov 2025 21:57:52 +0100
Subject: [PATCH 94/96] cpuidle: Update header inclusion

While cleaning up some headers, I got a build error on this file:

drivers/cpuidle/poll_state.c:52:2: error: call to undeclared library function 'snprintf' with type 'int (char *restrict, unsigned long, const char *restrict, ...)'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]

Update the header inclusions to follow the IWYU (Include What You Use)
principle.

Signed-off-by: Andy Shevchenko
Link: https://patch.msgid.link/20251124205752.1328701-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Rafael J. Wysocki
---
 drivers/cpuidle/poll_state.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/cpuidle/poll_state.c b/drivers/cpuidle/poll_state.c
index 9b6d90a72601..c7524e4c522a 100644
--- a/drivers/cpuidle/poll_state.c
+++ b/drivers/cpuidle/poll_state.c
@@ -4,9 +4,13 @@
  */
 
 #include
+#include
+#include
 #include
 #include
 #include
+#include
+#include
 
 #define POLL_IDLE_RELAX_COUNT 200

From 4bf944f3fcb6c192af1ea73e3d183b6364458b25 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki"
Date: Tue, 25 Nov 2025 17:23:12 +0100
Subject: [PATCH 95/96] cpuidle: Warn instead of bailing out if target residency check fails

It turns out that the change made by commit 76934e495cdc ("cpuidle: Add
sanity check for exit latency and target residency") goes too far,
because there are systems in the field on which the check introduced by
that commit does not pass.

For this reason, change the __cpuidle_driver_init() return type back to
void and make it print a warning when the check mentioned above does
not pass.

Fixes: 76934e495cdc ("cpuidle: Add sanity check for exit latency and target residency")
Reported-by: Val Packett
Closes: https://lore.kernel.org/linux-pm/20251121010756.6687-1-val@packett.cool/
Signed-off-by: Rafael J. Wysocki
Reviewed-by: Christian Loehle
Link: https://patch.msgid.link/2808566.mvXUDI8C0e@rafael.j.wysocki
---
 drivers/cpuidle/driver.c | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/drivers/cpuidle/driver.c b/drivers/cpuidle/driver.c
index 1c295a93d582..370664c47e65 100644
--- a/drivers/cpuidle/driver.c
+++ b/drivers/cpuidle/driver.c
@@ -8,6 +8,8 @@
  * This code is licenced under the GPL.
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include
 #include
 #include
@@ -152,7 +154,7 @@ static void cpuidle_setup_broadcast_timer(void *arg)
  * __cpuidle_driver_init - initialize the driver's internal data
  * @drv: a valid pointer to a struct cpuidle_driver
  */
-static int __cpuidle_driver_init(struct cpuidle_driver *drv)
+static void __cpuidle_driver_init(struct cpuidle_driver *drv)
 {
        int i;
 
@@ -195,15 +197,13 @@ static int __cpuidle_driver_init(struct cpuidle_driver *drv)
                s->exit_latency = div_u64(s->exit_latency_ns, NSEC_PER_USEC);
 
                /*
-                * Ensure that the exit latency of a CPU idle state does not
-                * exceed its target residency which is assumed in cpuidle in
-                * multiple places.
+                * Warn if the exit latency of a CPU idle state exceeds its
+                * target residency which is assumed to never happen in cpuidle
+                * in multiple places.
                 */
                if (s->exit_latency_ns > s->target_residency_ns)
-                       return -EINVAL;
+                       pr_warn("Idle state %d target residency too low\n", i);
        }
-
-       return 0;
 }
 
 /**
@@ -233,9 +233,7 @@ static int __cpuidle_register_driver(struct cpuidle_driver *drv)
        if (cpuidle_disabled())
                return -ENODEV;
 
-       ret = __cpuidle_driver_init(drv);
-       if (ret)
-               return ret;
+       __cpuidle_driver_init(drv);
 
        ret = __cpuidle_set_driver(drv);
        if (ret)

From d9600d57668c49308f705a660c5ad17fa3a53f73 Mon Sep 17 00:00:00 2001
From: Riwen Lu
Date: Tue, 18 Nov 2025 11:23:38 +0800
Subject: [PATCH 96/96] PM / devfreq: Fix typo in DFSO_DOWNDIFFERENTIAL macro name

Correct the spelling error in the DFSO_DOWNDIFFERENTIAL macro
definition and update the corresponding variable assignment. The macro
was previously misspelled as DFSO_DOWNDIFFERENCTIAL.

This change ensures consistent and correct spelling throughout the
simpleondemand governor implementation.

Signed-off-by: Riwen Lu
Signed-off-by: Chanwoo Choi
Link: https://patchwork.kernel.org/project/linux-pm/patch/20251118032339.2799230-1-luriwen@kylinos.cn/
---
 drivers/devfreq/governor_simpleondemand.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/devfreq/governor_simpleondemand.c b/drivers/devfreq/governor_simpleondemand.c
index 9c69b96df5f9..ac9c5e9e51a4 100644
--- a/drivers/devfreq/governor_simpleondemand.c
+++ b/drivers/devfreq/governor_simpleondemand.c
@@ -14,7 +14,7 @@
 /* Default constants for DevFreq-Simple-Ondemand (DFSO) */
 #define DFSO_UPTHRESHOLD       (90)
-#define DFSO_DOWNDIFFERENCTIAL (5)
+#define DFSO_DOWNDIFFERENTIAL  (5)
 
 static int devfreq_simple_ondemand_func(struct devfreq *df,
                                        unsigned long *freq)
 {
@@ -22,7 +22,7 @@ static int devfreq_simple_ondemand_func(struct devfreq *df,
        struct devfreq_dev_status *stat;
        unsigned long long a, b;
        unsigned int dfso_upthreshold = DFSO_UPTHRESHOLD;
-       unsigned int dfso_downdifferential = DFSO_DOWNDIFFERENCTIAL;
+       unsigned int dfso_downdifferential = DFSO_DOWNDIFFERENTIAL;
        struct devfreq_simple_ondemand_data *data = df->data;
 
        err = devfreq_update_stats(df);
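As context for the two constants touched by this fix, the simple-ondemand
governor uses DFSO_UPTHRESHOLD and DFSO_DOWNDIFFERENTIAL to form a hysteresis
band around the load. The simplified model below is illustration only; it
omits the governor's proportional frequency computation and devfreq
bookkeeping, and only reflects the decisions the band produces:

    /* Illustration only: the hysteresis band the two DFSO constants define. */
    #include <stdio.h>

    #define DFSO_UPTHRESHOLD       (90)
    #define DFSO_DOWNDIFFERENTIAL  (5)

    static const char *dfso_decision(unsigned int load_pct)
    {
            if (load_pct > DFSO_UPTHRESHOLD)
                    return "go to max frequency";
            if (load_pct < DFSO_UPTHRESHOLD - DFSO_DOWNDIFFERENTIAL)
                    return "scale frequency down";
            return "keep current frequency";  /* band between 85% and 90% */
    }

    int main(void)
    {
            unsigned int loads[] = { 95, 88, 40 };

            for (int i = 0; i < 3; i++)
                    printf("load %u%% -> %s\n", loads[i], dfso_decision(loads[i]));
            return 0;
    }

The band keeps the device from oscillating between frequencies when the load
hovers near the up-threshold, which is why both constants must stay in sync
with their uses, as this patch restores.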