From beda3b363546a423e4e29a7395e04c0ac4ff677e Mon Sep 17 00:00:00 2001 From: "Gautham R. Shenoy" Date: Thu, 26 Mar 2026 17:17:45 +0530 Subject: [PATCH 01/22] amd-pstate: Fix memory leak in amd_pstate_epp_cpu_init() On failure to set the epp, the function amd_pstate_epp_cpu_init() returns with an error code without freeing the cpudata object that was allocated at the beginning of the function. Ensure that the cpudata object is freed before returning from the function. This memory leak was discovered by Claude Opus 4.6 with the aid of Chris Mason's AI review-prompts (https://github.com/masoncl/review-prompts/tree/main/kernel). Assisted-by: Claude:claude-opus-4.6 review-prompts/linux Fixes: f9a378ff6443 ("cpufreq/amd-pstate: Set different default EPP policy for Epyc and Ryzen") Reviewed-by: Mario Limonciello (AMD) Signed-off-by: Gautham R. Shenoy Signed-off-by: Mario Limonciello (AMD) --- drivers/cpufreq/amd-pstate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 5aa9fcd80cf5..d57969c72c9d 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -1533,7 +1533,7 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) ret = amd_pstate_set_epp(policy, cpudata->epp_default); if (ret) - return ret; + goto free_cpudata1; current_pstate_driver->adjust_perf = NULL; From fcc25a291fbdca2c06c2c6602532050873f0c9de Mon Sep 17 00:00:00 2001 From: "Gautham R. Shenoy" Date: Thu, 26 Mar 2026 17:17:46 +0530 Subject: [PATCH 02/22] amd-pstate: Update cppc_req_cached in fast_switch case The function msr_update_perf() does not cache the new value that is written to MSR_AMD_CPPC_REQ into the variable cpudata->cppc_req_cached when the update is happening from the fast path. Fix that by caching the value every time the MSR_AMD_CPPC_REQ gets updated. 
This issue was discovered by Claude Opus 4.6 with the aid of Chris Mason's AI review-prompts (https://github.com/masoncl/review-prompts/tree/main/kernel). Assisted-by: Claude:claude-opus-4.6 review-prompts/linux Reviewed-by: Mario Limonciello (AMD) Fixes: fff395796917 ("cpufreq/amd-pstate: Always write EPP value when updating perf") Signed-off-by: Gautham R. Shenoy Signed-off-by: Mario Limonciello (AMD) --- drivers/cpufreq/amd-pstate.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index d57969c72c9d..24cdeffbcd40 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -261,7 +261,6 @@ static int msr_update_perf(struct cpufreq_policy *policy, u8 min_perf, if (fast_switch) { wrmsrq(MSR_AMD_CPPC_REQ, value); - return 0; } else { int ret = wrmsrq_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, value); From e67a5b6541831bbf1c40b6042a867a4594ec6b55 Mon Sep 17 00:00:00 2001 From: "Gautham R. Shenoy" Date: Thu, 26 Mar 2026 17:17:47 +0530 Subject: [PATCH 03/22] amd-pstate: Make certain freq_attrs conditionally visible Certain amd_pstate freq_attrs such as amd_pstate_hw_prefcore and amd_pstate_prefcore_ranking are enabled even when preferred core is not supported on the platform. Similarly there are common freq_attrs between the amd-pstate and the amd-pstate-epp drivers (eg: amd_pstate_max_freq, amd_pstate_lowest_nonlinear_freq, etc.) but are duplicated in two different freq_attr structs. Unify all the attributes in a single place and associate each of them with a visibility function that determines whether the attribute should be visible based on the underlying platform support and the current amd_pstate mode. Reviewed-by: Mario Limonciello (AMD) Signed-off-by: Gautham R. 
Shenoy Signed-off-by: Mario Limonciello (AMD) --- drivers/cpufreq/amd-pstate.c | 124 ++++++++++++++++++++++++++--------- 1 file changed, 93 insertions(+), 31 deletions(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 24cdeffbcd40..4de2037a414c 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -1220,12 +1220,87 @@ static ssize_t show_energy_performance_preference( return sysfs_emit(buf, "%s\n", energy_perf_strings[preference]); } +cpufreq_freq_attr_ro(amd_pstate_max_freq); +cpufreq_freq_attr_ro(amd_pstate_lowest_nonlinear_freq); + +cpufreq_freq_attr_ro(amd_pstate_highest_perf); +cpufreq_freq_attr_ro(amd_pstate_prefcore_ranking); +cpufreq_freq_attr_ro(amd_pstate_hw_prefcore); +cpufreq_freq_attr_rw(energy_performance_preference); +cpufreq_freq_attr_ro(energy_performance_available_preferences); + +struct freq_attr_visibility { + struct freq_attr *attr; + bool (*visibility_fn)(void); +}; + +/* For attributes which are always visible */ +static bool always_visible(void) +{ + return true; +} + +/* Determines whether prefcore related attributes should be visible */ +static bool prefcore_visibility(void) +{ + return amd_pstate_prefcore; +} + +/* Determines whether energy performance preference should be visible */ +static bool epp_visibility(void) +{ + return cppc_state == AMD_PSTATE_ACTIVE; +} + +static struct freq_attr_visibility amd_pstate_attr_visibility[] = { + {&amd_pstate_max_freq, always_visible}, + {&amd_pstate_lowest_nonlinear_freq, always_visible}, + {&amd_pstate_highest_perf, always_visible}, + {&amd_pstate_prefcore_ranking, prefcore_visibility}, + {&amd_pstate_hw_prefcore, prefcore_visibility}, + {&energy_performance_preference, epp_visibility}, + {&energy_performance_available_preferences, epp_visibility}, +}; + +static struct freq_attr **get_freq_attrs(void) +{ + bool attr_visible[ARRAY_SIZE(amd_pstate_attr_visibility)]; + struct freq_attr **attrs; + int i, j, count; + + for (i = 0, count = 0; i < 
ARRAY_SIZE(amd_pstate_attr_visibility); i++) { + struct freq_attr_visibility *v = &amd_pstate_attr_visibility[i]; + + attr_visible[i] = v->visibility_fn(); + if (attr_visible[i]) + count++; + } + + /* amd_pstate_{max_freq, lowest_nonlinear_freq, highest_perf} should always be visible */ + BUG_ON(!count); + + attrs = kcalloc(count + 1, sizeof(struct freq_attr *), GFP_KERNEL); + if (!attrs) + return ERR_PTR(-ENOMEM); + + for (i = 0, j = 0; i < ARRAY_SIZE(amd_pstate_attr_visibility); i++) { + if (!attr_visible[i]) + continue; + + attrs[j++] = amd_pstate_attr_visibility[i].attr; + } + + return attrs; +} + static void amd_pstate_driver_cleanup(void) { if (amd_pstate_prefcore) sched_clear_itmt_support(); cppc_state = AMD_PSTATE_DISABLE; + kfree(current_pstate_driver->attr); + current_pstate_driver->attr = NULL; current_pstate_driver = NULL; } @@ -1250,6 +1325,7 @@ static int amd_pstate_set_driver(int mode_idx) static int amd_pstate_register_driver(int mode) { + struct freq_attr **attr = NULL; int ret; ret = amd_pstate_set_driver(mode); @@ -1258,6 +1334,22 @@ static int amd_pstate_register_driver(int mode) cppc_state = mode; + /* + * Note: It is important to compute the attrs _after_ + * re-initializing the cppc_state. Some attributes become + * visible only when cppc_state is AMD_PSTATE_ACTIVE. 
+ */ + attr = get_freq_attrs(); + if (IS_ERR(attr)) { + ret = (int) PTR_ERR(attr); + pr_err("Couldn't compute freq_attrs for current mode %s [%d]\n", + amd_pstate_get_mode_string(cppc_state), ret); + amd_pstate_driver_cleanup(); + return ret; + } + + current_pstate_driver->attr = attr; + /* at least one CPU supports CPB */ current_pstate_driver->boost_enabled = cpu_feature_enabled(X86_FEATURE_CPB); @@ -1399,37 +1491,9 @@ static ssize_t prefcore_show(struct device *dev, return sysfs_emit(buf, "%s\n", str_enabled_disabled(amd_pstate_prefcore)); } -cpufreq_freq_attr_ro(amd_pstate_max_freq); -cpufreq_freq_attr_ro(amd_pstate_lowest_nonlinear_freq); - -cpufreq_freq_attr_ro(amd_pstate_highest_perf); -cpufreq_freq_attr_ro(amd_pstate_prefcore_ranking); -cpufreq_freq_attr_ro(amd_pstate_hw_prefcore); -cpufreq_freq_attr_rw(energy_performance_preference); -cpufreq_freq_attr_ro(energy_performance_available_preferences); static DEVICE_ATTR_RW(status); static DEVICE_ATTR_RO(prefcore); -static struct freq_attr *amd_pstate_attr[] = { - &amd_pstate_max_freq, - &amd_pstate_lowest_nonlinear_freq, - &amd_pstate_highest_perf, - &amd_pstate_prefcore_ranking, - &amd_pstate_hw_prefcore, - NULL, -}; - -static struct freq_attr *amd_pstate_epp_attr[] = { - &amd_pstate_max_freq, - &amd_pstate_lowest_nonlinear_freq, - &amd_pstate_highest_perf, - &amd_pstate_prefcore_ranking, - &amd_pstate_hw_prefcore, - &energy_performance_preference, - &energy_performance_available_preferences, - NULL, -}; - static struct attribute *pstate_global_attributes[] = { &dev_attr_status.attr, &dev_attr_prefcore.attr, @@ -1696,7 +1760,6 @@ static struct cpufreq_driver amd_pstate_driver = { .set_boost = amd_pstate_set_boost, .update_limits = amd_pstate_update_limits, .name = "amd-pstate", - .attr = amd_pstate_attr, }; static struct cpufreq_driver amd_pstate_epp_driver = { @@ -1712,7 +1775,6 @@ static struct cpufreq_driver amd_pstate_epp_driver = { .update_limits = amd_pstate_update_limits, .set_boost = 
amd_pstate_set_boost, .name = "amd-pstate-epp", - .attr = amd_pstate_epp_attr, }; /* @@ -1858,7 +1920,7 @@ static int __init amd_pstate_init(void) return ret; global_attr_free: - cpufreq_unregister_driver(current_pstate_driver); + amd_pstate_unregister_driver(0); return ret; } device_initcall(amd_pstate_init); From 172100088f9b131b88bcde70724485470c20e7d2 Mon Sep 17 00:00:00 2001 From: "Gautham R. Shenoy" Date: Thu, 26 Mar 2026 17:17:48 +0530 Subject: [PATCH 04/22] x86/cpufeatures: Add AMD CPPC Performance Priority feature. Some future AMD processors have feature named "CPPC Performance Priority" which lets userspace specify different floor performance levels for different CPUs. The platform firmware takes these different floor performance levels into consideration while throttling the CPUs under power/thermal constraints. The presence of this feature is indicated by bit 16 of the EDX register for CPUID leaf 0x80000007. More details can be found in AMD Publication titled "AMD64 Collaborative Processor Performance Control (CPPC) Performance Priority" Revision 1.10. Define a new feature bit named X86_FEATURE_CPPC_PERF_PRIO to map to CPUID 0x80000007.EDX[16]. Reviewed-by: Borislav Petkov (AMD) Signed-off-by: Gautham R. 
Shenoy Signed-off-by: Mario Limonciello (AMD) --- arch/x86/include/asm/cpufeatures.h | 2 +- arch/x86/kernel/cpu/scattered.c | 1 + tools/arch/x86/include/asm/cpufeatures.h | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index dbe104df339b..86d17b195e79 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -415,7 +415,7 @@ */ #define X86_FEATURE_OVERFLOW_RECOV (17*32+ 0) /* "overflow_recov" MCA overflow recovery support */ #define X86_FEATURE_SUCCOR (17*32+ 1) /* "succor" Uncorrectable error containment and recovery */ - +#define X86_FEATURE_CPPC_PERF_PRIO (17*32+ 2) /* CPPC Floor Perf support */ #define X86_FEATURE_SMCA (17*32+ 3) /* "smca" Scalable MCA */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (EDX), word 18 */ diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 42c7eac0c387..837d6a4b0c28 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -52,6 +52,7 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, { X86_FEATURE_AMD_FAST_CPPC, CPUID_EDX, 15, 0x80000007, 0 }, + { X86_FEATURE_CPPC_PERF_PRIO, CPUID_EDX, 16, 0x80000007, 0 }, { X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 }, { X86_FEATURE_X2AVIC_EXT, CPUID_ECX, 6, 0x8000000a, 0 }, { X86_FEATURE_COHERENCY_SFW_NO, CPUID_EBX, 31, 0x8000001f, 0 }, diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h index dbe104df339b..86d17b195e79 100644 --- a/tools/arch/x86/include/asm/cpufeatures.h +++ b/tools/arch/x86/include/asm/cpufeatures.h @@ -415,7 +415,7 @@ */ #define X86_FEATURE_OVERFLOW_RECOV (17*32+ 0) /* "overflow_recov" MCA overflow recovery support */ #define X86_FEATURE_SUCCOR (17*32+ 1) /* "succor" Uncorrectable error containment and recovery */ - +#define 
X86_FEATURE_CPPC_PERF_PRIO (17*32+ 2) /* CPPC Floor Perf support */ #define X86_FEATURE_SMCA (17*32+ 3) /* "smca" Scalable MCA */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (EDX), word 18 */ From 97838281f587a9e98e74b913201f7408214b5999 Mon Sep 17 00:00:00 2001 From: "Gautham R. Shenoy" Date: Thu, 26 Mar 2026 17:17:49 +0530 Subject: [PATCH 05/22] amd-pstate: Add support for CPPC_REQ2 and FLOOR_PERF Some future AMD processors have feature named "CPPC Performance Priority" which lets userspace specify different floor performance levels for different CPUs. The platform firmware takes these different floor performance levels into consideration while throttling the CPUs under power/thermal constraints. The presence of this feature is indicated by bit 16 of the EDX register for CPUID leaf 0x80000007. More details can be found in AMD Publication titled "AMD64 Collaborative Processor Performance Control (CPPC) Performance Priority" Revision 1.10. The number of distinct floor performance levels supported on the platform will be advertised through the bits 32:39 of the MSR_AMD_CPPC_CAP1. Bits 0:7 of a new MSR MSR_AMD_CPPC_REQ2 (0xc00102b5) will be used to specify the desired floor performance level for that CPU. Add support for the aforementioned MSR_AMD_CPPC_REQ2, and macros for parsing and updating the relevant bits from MSR_AMD_CPPC_CAP1 and MSR_AMD_CPPC_REQ2. On boot if the default value of the MSR_AMD_CPPC_REQ2[7:0] (Floor Perf) is lower than CPPC.lowest_perf, and thus invalid, initialize it to MSR_AMD_CPPC_CAP1.nominal_perf which is a sane default value. Save the boot-time floor_perf during amd_pstate_init_floor_perf(). In a subsequent patch it will be restored in the suspend, offline, and exit paths, mirroring how bios_min_perf is handled for MSR_AMD_CPPC_REQ. Link: https://docs.amd.com/v/u/en-US/69206_1.10_AMD64_CPPC_PUB Reviewed-by: Mario Limonciello (AMD) Signed-off-by: Gautham R. 
Shenoy Signed-off-by: Mario Limonciello (AMD) --- arch/x86/include/asm/msr-index.h | 5 ++ drivers/cpufreq/amd-pstate.c | 78 +++++++++++++++++++++++++++++++- drivers/cpufreq/amd-pstate.h | 8 ++++ 3 files changed, 90 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 6673601246b3..e126c7fb69cf 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -765,12 +765,14 @@ #define MSR_AMD_CPPC_CAP2 0xc00102b2 #define MSR_AMD_CPPC_REQ 0xc00102b3 #define MSR_AMD_CPPC_STATUS 0xc00102b4 +#define MSR_AMD_CPPC_REQ2 0xc00102b5 /* Masks for use with MSR_AMD_CPPC_CAP1 */ #define AMD_CPPC_LOWEST_PERF_MASK GENMASK(7, 0) #define AMD_CPPC_LOWNONLIN_PERF_MASK GENMASK(15, 8) #define AMD_CPPC_NOMINAL_PERF_MASK GENMASK(23, 16) #define AMD_CPPC_HIGHEST_PERF_MASK GENMASK(31, 24) +#define AMD_CPPC_FLOOR_PERF_CNT_MASK GENMASK_ULL(39, 32) /* Masks for use with MSR_AMD_CPPC_REQ */ #define AMD_CPPC_MAX_PERF_MASK GENMASK(7, 0) @@ -778,6 +780,9 @@ #define AMD_CPPC_DES_PERF_MASK GENMASK(23, 16) #define AMD_CPPC_EPP_PERF_MASK GENMASK(31, 24) +/* Masks for use with MSR_AMD_CPPC_REQ2 */ +#define AMD_CPPC_FLOOR_PERF_MASK GENMASK(7, 0) + /* AMD Performance Counter Global Status and Control MSRs */ #define MSR_AMD64_PERF_CNTR_GLOBAL_STATUS 0xc0000300 #define MSR_AMD64_PERF_CNTR_GLOBAL_CTL 0xc0000301 diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 4de2037a414c..53b8173ff183 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -329,6 +329,65 @@ static inline int amd_pstate_set_epp(struct cpufreq_policy *policy, u8 epp) return static_call(amd_pstate_set_epp)(policy, epp); } +static int amd_pstate_set_floor_perf(struct cpufreq_policy *policy, u8 perf) +{ + struct amd_cpudata *cpudata = policy->driver_data; + u64 value, prev; + int ret; + + if (!cpu_feature_enabled(X86_FEATURE_CPPC_PERF_PRIO)) + return 0; + + value = prev = 
READ_ONCE(cpudata->cppc_req2_cached); + FIELD_MODIFY(AMD_CPPC_FLOOR_PERF_MASK, &value, perf); + + if (value == prev) + return 0; + + ret = wrmsrq_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ2, value); + if (ret) { + pr_err("failed to set CPPC REQ2 value. Error (%d)\n", ret); + return ret; + } + + WRITE_ONCE(cpudata->cppc_req2_cached, value); + + return ret; +} + +static int amd_pstate_init_floor_perf(struct cpufreq_policy *policy) +{ + struct amd_cpudata *cpudata = policy->driver_data; + u8 floor_perf; + u64 value; + int ret; + + if (!cpu_feature_enabled(X86_FEATURE_CPPC_PERF_PRIO)) + return 0; + + ret = rdmsrq_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ2, &value); + if (ret) { + pr_err("failed to read CPPC REQ2 value. Error (%d)\n", ret); + return ret; + } + + WRITE_ONCE(cpudata->cppc_req2_cached, value); + floor_perf = FIELD_GET(AMD_CPPC_FLOOR_PERF_MASK, + cpudata->cppc_req2_cached); + + /* Set a sane value for floor_perf if the default value is invalid */ + if (floor_perf < cpudata->perf.lowest_perf) { + floor_perf = cpudata->perf.nominal_perf; + ret = amd_pstate_set_floor_perf(policy, floor_perf); + if (ret) + return ret; + } + + cpudata->bios_floor_perf = floor_perf; + + return 0; +} + static int shmem_set_epp(struct cpufreq_policy *policy, u8 epp) { struct amd_cpudata *cpudata = policy->driver_data; @@ -426,6 +485,7 @@ static int msr_init_perf(struct amd_cpudata *cpudata) perf.lowest_perf = FIELD_GET(AMD_CPPC_LOWEST_PERF_MASK, cap1); WRITE_ONCE(cpudata->perf, perf); WRITE_ONCE(cpudata->prefcore_ranking, FIELD_GET(AMD_CPPC_HIGHEST_PERF_MASK, cap1)); + WRITE_ONCE(cpudata->floor_perf_cnt, FIELD_GET(AMD_CPPC_FLOOR_PERF_CNT_MASK, cap1)); return 0; } @@ -1024,6 +1084,7 @@ static int amd_pstate_cpu_init(struct cpufreq_policy *policy) cpudata->nominal_freq, perf.highest_perf); + policy->driver_data = cpudata; ret = amd_pstate_cppc_enable(policy); if (ret) goto free_cpudata1; @@ -1036,6 +1097,12 @@ static int amd_pstate_cpu_init(struct cpufreq_policy *policy) if 
(cpu_feature_enabled(X86_FEATURE_CPPC)) policy->fast_switch_possible = true; + ret = amd_pstate_init_floor_perf(policy); + if (ret) { + dev_err(dev, "Failed to initialize Floor Perf (%d)\n", ret); + goto free_cpudata1; + } + ret = freq_qos_add_request(&policy->constraints, &cpudata->req[0], FREQ_QOS_MIN, FREQ_QOS_MIN_DEFAULT_VALUE); if (ret < 0) { @@ -1050,7 +1117,6 @@ static int amd_pstate_cpu_init(struct cpufreq_policy *policy) goto free_cpudata2; } - policy->driver_data = cpudata; if (!current_pstate_driver->adjust_perf) current_pstate_driver->adjust_perf = amd_pstate_adjust_perf; @@ -1062,6 +1128,7 @@ static int amd_pstate_cpu_init(struct cpufreq_policy *policy) free_cpudata1: pr_warn("Failed to initialize CPU %d: %d\n", policy->cpu, ret); kfree(cpudata); + policy->driver_data = NULL; return ret; } @@ -1072,6 +1139,7 @@ static void amd_pstate_cpu_exit(struct cpufreq_policy *policy) /* Reset CPPC_REQ MSR to the BIOS value */ amd_pstate_update_perf(policy, perf.bios_min_perf, 0U, 0U, 0U, false); + amd_pstate_set_floor_perf(policy, cpudata->bios_floor_perf); freq_qos_remove_request(&cpudata->req[1]); freq_qos_remove_request(&cpudata->req[0]); @@ -1598,6 +1666,12 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) if (ret) goto free_cpudata1; + ret = amd_pstate_init_floor_perf(policy); + if (ret) { + dev_err(dev, "Failed to initialize Floor Perf (%d)\n", ret); + goto free_cpudata1; + } + current_pstate_driver->adjust_perf = NULL; return 0; @@ -1605,6 +1679,7 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) free_cpudata1: pr_warn("Failed to initialize CPU %d: %d\n", policy->cpu, ret); kfree(cpudata); + policy->driver_data = NULL; return ret; } @@ -1617,6 +1692,7 @@ static void amd_pstate_epp_cpu_exit(struct cpufreq_policy *policy) /* Reset CPPC_REQ MSR to the BIOS value */ amd_pstate_update_perf(policy, perf.bios_min_perf, 0U, 0U, 0U, false); + amd_pstate_set_floor_perf(policy, cpudata->bios_floor_perf); kfree(cpudata); 
policy->driver_data = NULL; diff --git a/drivers/cpufreq/amd-pstate.h b/drivers/cpufreq/amd-pstate.h index cb45fdca27a6..303da70b0afa 100644 --- a/drivers/cpufreq/amd-pstate.h +++ b/drivers/cpufreq/amd-pstate.h @@ -62,9 +62,14 @@ struct amd_aperf_mperf { * @cpu: CPU number * @req: constraint request to apply * @cppc_req_cached: cached performance request hints + * @cppc_req2_cached: cached value of MSR_AMD_CPPC_REQ2 * @perf: cached performance-related data * @prefcore_ranking: the preferred core ranking, the higher value indicates a higher * priority. + * @floor_perf_cnt: Cached value of the number of distinct floor + * performance levels supported + * @bios_floor_perf: Cached value of the boot-time floor performance level from + * MSR_AMD_CPPC_REQ2 * @min_limit_freq: Cached value of policy->min (in khz) * @max_limit_freq: Cached value of policy->max (in khz) * @nominal_freq: the frequency (in khz) that mapped to nominal_perf @@ -87,10 +92,13 @@ struct amd_cpudata { struct freq_qos_request req[2]; u64 cppc_req_cached; + u64 cppc_req2_cached; union perf_cached perf; u8 prefcore_ranking; + u8 floor_perf_cnt; + u8 bios_floor_perf; u32 min_limit_freq; u32 max_limit_freq; u32 nominal_freq; From b9f103d0968bc5b33bff1b1eb11c756b2ac07c6c Mon Sep 17 00:00:00 2001 From: "Gautham R. Shenoy" Date: Thu, 26 Mar 2026 17:17:50 +0530 Subject: [PATCH 06/22] amd-pstate: Add sysfs support for floor_freq and floor_count When Floor Performance feature is supported by the platform, expose two sysfs files: * amd_pstate_floor_freq to allow userspace to request the floor frequency for each CPU. * amd_pstate_floor_count which advertises the number of distinct levels of floor frequencies supported on this platform. Reset the floor_perf to bios_floor_perf in the suspend, offline, and exit paths, and restore the value to the cached user-request floor_freq on the resume and online paths mirroring how bios_min_perf is handled for MSR_AMD_CPPC_REQ. 
Reviewed-by: Mario Limonciello (AMD) Signed-off-by: Gautham R. Shenoy Signed-off-by: Mario Limonciello (AMD) --- drivers/cpufreq/amd-pstate.c | 93 +++++++++++++++++++++++++++++++++--- drivers/cpufreq/amd-pstate.h | 2 + 2 files changed, 89 insertions(+), 6 deletions(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 53b8173ff183..a068c4457a8f 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -383,8 +383,10 @@ static int amd_pstate_init_floor_perf(struct cpufreq_policy *policy) return ret; } - cpudata->bios_floor_perf = floor_perf; + cpudata->bios_floor_perf = floor_perf; + cpudata->floor_freq = perf_to_freq(cpudata->perf, cpudata->nominal_freq, + floor_perf); return 0; } @@ -1288,6 +1290,46 @@ static ssize_t show_energy_performance_preference( return sysfs_emit(buf, "%s\n", energy_perf_strings[preference]); } +static ssize_t store_amd_pstate_floor_freq(struct cpufreq_policy *policy, + const char *buf, size_t count) +{ + struct amd_cpudata *cpudata = policy->driver_data; + union perf_cached perf = READ_ONCE(cpudata->perf); + unsigned int freq; + u8 floor_perf; + int ret; + + ret = kstrtouint(buf, 0, &freq); + if (ret) + return ret; + + if (freq < policy->cpuinfo.min_freq || freq > policy->max) + return -EINVAL; + + floor_perf = freq_to_perf(perf, cpudata->nominal_freq, freq); + ret = amd_pstate_set_floor_perf(policy, floor_perf); + + if (!ret) + cpudata->floor_freq = freq; + + return ret ?: count; +} + +static ssize_t show_amd_pstate_floor_freq(struct cpufreq_policy *policy, char *buf) +{ + struct amd_cpudata *cpudata = policy->driver_data; + + return sysfs_emit(buf, "%u\n", cpudata->floor_freq); +} + +static ssize_t show_amd_pstate_floor_count(struct cpufreq_policy *policy, char *buf) +{ + struct amd_cpudata *cpudata = policy->driver_data; + u8 count = cpudata->floor_perf_cnt; + + return sysfs_emit(buf, "%u\n", count); +} + cpufreq_freq_attr_ro(amd_pstate_max_freq); 
cpufreq_freq_attr_ro(amd_pstate_lowest_nonlinear_freq); @@ -1296,6 +1338,8 @@ cpufreq_freq_attr_ro(amd_pstate_prefcore_ranking); cpufreq_freq_attr_ro(amd_pstate_hw_prefcore); cpufreq_freq_attr_rw(energy_performance_preference); cpufreq_freq_attr_ro(energy_performance_available_preferences); +cpufreq_freq_attr_rw(amd_pstate_floor_freq); +cpufreq_freq_attr_ro(amd_pstate_floor_count); struct freq_attr_visibility { struct freq_attr *attr; @@ -1320,6 +1364,12 @@ static bool epp_visibility(void) return cppc_state == AMD_PSTATE_ACTIVE; } +/* Determines whether amd_pstate_floor_freq related attributes should be visible */ +static bool floor_freq_visibility(void) +{ + return cpu_feature_enabled(X86_FEATURE_CPPC_PERF_PRIO); +} + static struct freq_attr_visibility amd_pstate_attr_visibility[] = { {&amd_pstate_max_freq, always_visible}, {&amd_pstate_lowest_nonlinear_freq, always_visible}, @@ -1328,6 +1378,8 @@ static struct freq_attr_visibility amd_pstate_attr_visibility[] = { {&amd_pstate_hw_prefcore, prefcore_visibility}, {&energy_performance_preference, epp_visibility}, {&energy_performance_available_preferences, epp_visibility}, + {&amd_pstate_floor_freq, floor_freq_visibility}, + {&amd_pstate_floor_count, floor_freq_visibility}, }; static struct freq_attr **get_freq_attrs(void) @@ -1748,24 +1800,39 @@ static int amd_pstate_epp_set_policy(struct cpufreq_policy *policy) static int amd_pstate_cpu_online(struct cpufreq_policy *policy) { - return amd_pstate_cppc_enable(policy); + struct amd_cpudata *cpudata = policy->driver_data; + union perf_cached perf = READ_ONCE(cpudata->perf); + u8 cached_floor_perf; + int ret; + + ret = amd_pstate_cppc_enable(policy); + if (ret) + return ret; + + cached_floor_perf = freq_to_perf(perf, cpudata->nominal_freq, cpudata->floor_freq); + return amd_pstate_set_floor_perf(policy, cached_floor_perf); } static int amd_pstate_cpu_offline(struct cpufreq_policy *policy) { struct amd_cpudata *cpudata = policy->driver_data; union perf_cached perf = 
READ_ONCE(cpudata->perf); + int ret; /* * Reset CPPC_REQ MSR to the BIOS value, this will allow us to retain the BIOS specified * min_perf value across kexec reboots. If this CPU is just onlined normally after this, the * limits, epp and desired perf will get reset to the cached values in cpudata struct */ - return amd_pstate_update_perf(policy, perf.bios_min_perf, + ret = amd_pstate_update_perf(policy, perf.bios_min_perf, FIELD_GET(AMD_CPPC_DES_PERF_MASK, cpudata->cppc_req_cached), FIELD_GET(AMD_CPPC_MAX_PERF_MASK, cpudata->cppc_req_cached), FIELD_GET(AMD_CPPC_EPP_PERF_MASK, cpudata->cppc_req_cached), false); + if (ret) + return ret; + + return amd_pstate_set_floor_perf(policy, cpudata->bios_floor_perf); } static int amd_pstate_suspend(struct cpufreq_policy *policy) @@ -1787,6 +1854,10 @@ static int amd_pstate_suspend(struct cpufreq_policy *policy) if (ret) return ret; + ret = amd_pstate_set_floor_perf(policy, cpudata->bios_floor_perf); + if (ret) + return ret; + /* set this flag to avoid setting core offline*/ cpudata->suspended = true; @@ -1798,15 +1869,24 @@ static int amd_pstate_resume(struct cpufreq_policy *policy) struct amd_cpudata *cpudata = policy->driver_data; union perf_cached perf = READ_ONCE(cpudata->perf); int cur_perf = freq_to_perf(perf, cpudata->nominal_freq, policy->cur); + u8 cached_floor_perf; + int ret; /* Set CPPC_REQ to last sane value until the governor updates it */ - return amd_pstate_update_perf(policy, perf.min_limit_perf, cur_perf, perf.max_limit_perf, - 0U, false); + ret = amd_pstate_update_perf(policy, perf.min_limit_perf, cur_perf, perf.max_limit_perf, + 0U, false); + if (ret) + return ret; + + cached_floor_perf = freq_to_perf(perf, cpudata->nominal_freq, cpudata->floor_freq); + return amd_pstate_set_floor_perf(policy, cached_floor_perf); } static int amd_pstate_epp_resume(struct cpufreq_policy *policy) { struct amd_cpudata *cpudata = policy->driver_data; + union perf_cached perf = READ_ONCE(cpudata->perf); + u8 cached_floor_perf; 
if (cpudata->suspended) { int ret; @@ -1819,7 +1899,8 @@ static int amd_pstate_epp_resume(struct cpufreq_policy *policy) cpudata->suspended = false; } - return 0; + cached_floor_perf = freq_to_perf(perf, cpudata->nominal_freq, cpudata->floor_freq); + return amd_pstate_set_floor_perf(policy, cached_floor_perf); } static struct cpufreq_driver amd_pstate_driver = { diff --git a/drivers/cpufreq/amd-pstate.h b/drivers/cpufreq/amd-pstate.h index 303da70b0afa..453adfb445f8 100644 --- a/drivers/cpufreq/amd-pstate.h +++ b/drivers/cpufreq/amd-pstate.h @@ -74,6 +74,7 @@ struct amd_aperf_mperf { * @max_limit_freq: Cached value of policy->max (in khz) * @nominal_freq: the frequency (in khz) that mapped to nominal_perf * @lowest_nonlinear_freq: the frequency (in khz) that mapped to lowest_nonlinear_perf + * @floor_freq: Cached value of the user requested floor_freq * @cur: Difference of Aperf/Mperf/tsc count between last and current sample * @prev: Last Aperf/Mperf/tsc count value read from register * @freq: current cpu frequency value (in khz) @@ -103,6 +104,7 @@ struct amd_cpudata { u32 max_limit_freq; u32 nominal_freq; u32 lowest_nonlinear_freq; + u32 floor_freq; struct amd_aperf_mperf cur; struct amd_aperf_mperf prev; From 30c63f723440f12626a19da5a93e094da29af51e Mon Sep 17 00:00:00 2001 From: "Gautham R. Shenoy" Date: Thu, 26 Mar 2026 17:17:51 +0530 Subject: [PATCH 07/22] amd-pstate: Introduce a tracepoint trace_amd_pstate_cppc_req2() Introduce a new tracepoint trace_amd_pstate_cppc_req2() to track updates to MSR_AMD_CPPC_REQ2. Invoke this while changing the Floor Perf. Reviewed-by: Mario Limonciello Signed-off-by: Gautham R. 
Shenoy Signed-off-by: Mario Limonciello (AMD) --- drivers/cpufreq/amd-pstate-trace.h | 35 ++++++++++++++++++++++++++++++ drivers/cpufreq/amd-pstate.c | 14 +++++++++--- 2 files changed, 46 insertions(+), 3 deletions(-) diff --git a/drivers/cpufreq/amd-pstate-trace.h b/drivers/cpufreq/amd-pstate-trace.h index 32e1bdc588c5..91fa073b2be4 100644 --- a/drivers/cpufreq/amd-pstate-trace.h +++ b/drivers/cpufreq/amd-pstate-trace.h @@ -133,6 +133,41 @@ TRACE_EVENT(amd_pstate_epp_perf, ) ); +TRACE_EVENT(amd_pstate_cppc_req2, + + TP_PROTO(unsigned int cpu_id, + u8 floor_perf, + bool changed, + int err_code + ), + + TP_ARGS(cpu_id, + floor_perf, + changed, + err_code), + + TP_STRUCT__entry( + __field(unsigned int, cpu_id) + __field(u8, floor_perf) + __field(bool, changed) + __field(int, err_code) + ), + + TP_fast_assign( + __entry->cpu_id = cpu_id; + __entry->floor_perf = floor_perf; + __entry->changed = changed; + __entry->err_code = err_code; + ), + + TP_printk("cpu%u: floor_perf=%u, changed=%u (error = %d)", + __entry->cpu_id, + __entry->floor_perf, + __entry->changed, + __entry->err_code + ) +); + #endif /* _AMD_PSTATE_TRACE_H */ /* This part must be outside protection */ diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index a068c4457a8f..5eae74a67aeb 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -333,6 +333,7 @@ static int amd_pstate_set_floor_perf(struct cpufreq_policy *policy, u8 perf) { struct amd_cpudata *cpudata = policy->driver_data; u64 value, prev; + bool changed; int ret; if (!cpu_feature_enabled(X86_FEATURE_CPPC_PERF_PRIO)) @@ -341,17 +342,24 @@ static int amd_pstate_set_floor_perf(struct cpufreq_policy *policy, u8 perf) value = prev = READ_ONCE(cpudata->cppc_req2_cached); FIELD_MODIFY(AMD_CPPC_FLOOR_PERF_MASK, &value, perf); - if (value == prev) - return 0; + changed = value != prev; + if (!changed) { + ret = 0; + goto out_trace; + } ret = wrmsrq_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ2, value); if (ret) { 
+ changed = false; pr_err("failed to set CPPC REQ2 value. Error (%d)\n", ret); - return ret; + goto out_trace; } WRITE_ONCE(cpudata->cppc_req2_cached, value); +out_trace: + if (trace_amd_pstate_cppc_req2_enabled()) + trace_amd_pstate_cppc_req2(cpudata->cpu, perf, changed, ret); return ret; } From c6a2b750de13db9103db29c64927bec3919232b5 Mon Sep 17 00:00:00 2001 From: "Gautham R. Shenoy" Date: Thu, 26 Mar 2026 17:17:52 +0530 Subject: [PATCH 08/22] amd-pstate-ut: Add module parameter to select testcases Currently when amd-pstate-ut test module is loaded, it runs all the tests from amd_pstate_ut_cases[] array. Add a module parameter named "test_list" that accepts a comma-delimited list of test names, allowing users to run a selected subset of tests. When the parameter is omitted or empty, all tests are run as before. Signed-off-by: Gautham R. Shenoy Reviewed-by: Mario Limonciello (AMD) Signed-off-by: Mario Limonciello (AMD) --- drivers/cpufreq/amd-pstate-ut.c | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/drivers/cpufreq/amd-pstate-ut.c b/drivers/cpufreq/amd-pstate-ut.c index 447b9aa5ce40..3dcdf56883a6 100644 --- a/drivers/cpufreq/amd-pstate-ut.c +++ b/drivers/cpufreq/amd-pstate-ut.c @@ -35,6 +35,10 @@ #include "amd-pstate.h" +static char *test_list; +module_param(test_list, charp, 0444); +MODULE_PARM_DESC(test_list, + "Comma-delimited list of tests to run (empty means run all tests)"); struct amd_pstate_ut_struct { const char *name; @@ -58,6 +62,25 @@ static struct amd_pstate_ut_struct amd_pstate_ut_cases[] = { {"amd_pstate_ut_check_driver", amd_pstate_ut_check_driver } }; +static bool test_in_list(const char *list, const char *name) +{ + size_t name_len = strlen(name); + const char *p = list; + + while (*p) { + const char *sep = strchr(p, ','); + size_t token_len = sep ? 
sep - p : strlen(p); + + if (token_len == name_len && !strncmp(p, name, token_len)) + return true; + if (!sep) + break; + p = sep + 1; + } + + return false; +} + static bool get_shared_mem(void) { bool result = false; @@ -275,7 +298,13 @@ static int __init amd_pstate_ut_init(void) u32 i = 0, arr_size = ARRAY_SIZE(amd_pstate_ut_cases); for (i = 0; i < arr_size; i++) { - int ret = amd_pstate_ut_cases[i].func(i); + int ret; + + if (test_list && *test_list && + !test_in_list(test_list, amd_pstate_ut_cases[i].name)) + continue; + + ret = amd_pstate_ut_cases[i].func(i); if (ret) pr_err("%-4d %-20s\t fail: %d!\n", i+1, amd_pstate_ut_cases[i].name, ret); From 3b90e5a4176acacc6781b9ac84cdc5ac53671eee Mon Sep 17 00:00:00 2001 From: "Gautham R. Shenoy" Date: Thu, 26 Mar 2026 17:17:53 +0530 Subject: [PATCH 09/22] amd-pstate-ut: Add a testcase to validate the visibility of driver attributes amd-pstate driver has per-attribute visibility functions to dynamically control which sysfs freq_attrs are exposed based on the platform capabilities and the current amd_pstate mode. However, there is no test coverage to validate that the driver's live attribute list matches the expected visibility for each mode. Add amd_pstate_ut_check_freq_attrs() to the amd-pstate unit test module. For each enabled mode (passive, active, guided), the test independently derives the expected visibility of each attribute: - Core attributes (max_freq, lowest_nonlinear_freq, highest_perf) are always expected. - Prefcore attributes (prefcore_ranking, hw_prefcore) are expected only when cpudata->hw_prefcore indicates platform support. - EPP attributes (energy_performance_preference, energy_performance_available_preferences) are expected only in active mode. - Floor frequency attributes (floor_freq, floor_count) are expected only when X86_FEATURE_CPPC_PERF_PRIO is present. 
Compare these independent expectations against the live driver's attr array, catching bugs such as attributes leaking into wrong modes or visibility functions checking incorrect conditions. Signed-off-by: Gautham R. Shenoy Signed-off-by: Mario Limonciello (AMD) --- drivers/cpufreq/amd-pstate-ut.c | 139 ++++++++++++++++++++++++++++++-- drivers/cpufreq/amd-pstate.c | 8 ++ drivers/cpufreq/amd-pstate.h | 4 + 3 files changed, 146 insertions(+), 5 deletions(-) diff --git a/drivers/cpufreq/amd-pstate-ut.c b/drivers/cpufreq/amd-pstate-ut.c index 3dcdf56883a6..1f62ab6438b4 100644 --- a/drivers/cpufreq/amd-pstate-ut.c +++ b/drivers/cpufreq/amd-pstate-ut.c @@ -23,6 +23,8 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include +#include +#include #include #include #include @@ -53,13 +55,15 @@ static int amd_pstate_ut_check_enabled(u32 index); static int amd_pstate_ut_check_perf(u32 index); static int amd_pstate_ut_check_freq(u32 index); static int amd_pstate_ut_check_driver(u32 index); +static int amd_pstate_ut_check_freq_attrs(u32 index); static struct amd_pstate_ut_struct amd_pstate_ut_cases[] = { - {"amd_pstate_ut_acpi_cpc_valid", amd_pstate_ut_acpi_cpc_valid }, - {"amd_pstate_ut_check_enabled", amd_pstate_ut_check_enabled }, - {"amd_pstate_ut_check_perf", amd_pstate_ut_check_perf }, - {"amd_pstate_ut_check_freq", amd_pstate_ut_check_freq }, - {"amd_pstate_ut_check_driver", amd_pstate_ut_check_driver } + {"amd_pstate_ut_acpi_cpc_valid", amd_pstate_ut_acpi_cpc_valid }, + {"amd_pstate_ut_check_enabled", amd_pstate_ut_check_enabled }, + {"amd_pstate_ut_check_perf", amd_pstate_ut_check_perf }, + {"amd_pstate_ut_check_freq", amd_pstate_ut_check_freq }, + {"amd_pstate_ut_check_driver", amd_pstate_ut_check_driver }, + {"amd_pstate_ut_check_freq_attrs", amd_pstate_ut_check_freq_attrs }, }; static bool test_in_list(const char *list, const char *name) @@ -293,6 +297,131 @@ static int amd_pstate_ut_check_driver(u32 index) return ret; } +enum attr_category { + ATTR_ALWAYS, + 
ATTR_PREFCORE, + ATTR_EPP, + ATTR_FLOOR_FREQ, +}; + +static const struct { + const char *name; + enum attr_category category; +} expected_freq_attrs[] = { + {"amd_pstate_max_freq", ATTR_ALWAYS}, + {"amd_pstate_lowest_nonlinear_freq", ATTR_ALWAYS}, + {"amd_pstate_highest_perf", ATTR_ALWAYS}, + {"amd_pstate_prefcore_ranking", ATTR_PREFCORE}, + {"amd_pstate_hw_prefcore", ATTR_PREFCORE}, + {"energy_performance_preference", ATTR_EPP}, + {"energy_performance_available_preferences", ATTR_EPP}, + {"amd_pstate_floor_freq", ATTR_FLOOR_FREQ}, + {"amd_pstate_floor_count", ATTR_FLOOR_FREQ}, +}; + +static bool attr_in_driver(struct freq_attr **driver_attrs, const char *name) +{ + int j; + + for (j = 0; driver_attrs[j]; j++) { + if (!strcmp(driver_attrs[j]->attr.name, name)) + return true; + } + return false; +} + +/* + * Verify that for each mode the driver's live ->attr array contains exactly + * the attributes that should be visible. Expected visibility is derived + * independently from hw_prefcore, cpu features, and the current mode — + * not from the driver's own visibility functions. + */ +static int amd_pstate_ut_check_freq_attrs(u32 index) +{ + enum amd_pstate_mode orig_mode = amd_pstate_get_status(); + static const enum amd_pstate_mode modes[] = { + AMD_PSTATE_PASSIVE, AMD_PSTATE_ACTIVE, AMD_PSTATE_GUIDED, + }; + bool has_prefcore, has_floor_freq; + int m, i, ret; + + has_floor_freq = cpu_feature_enabled(X86_FEATURE_CPPC_PERF_PRIO); + + /* + * Determine prefcore support from any online CPU's cpudata. + * hw_prefcore reflects the platform-wide decision made at init. 
+ */ + has_prefcore = false; + for_each_online_cpu(i) { + struct cpufreq_policy *policy __free(put_cpufreq_policy) = NULL; + struct amd_cpudata *cpudata; + + policy = cpufreq_cpu_get(i); + if (!policy) + continue; + cpudata = policy->driver_data; + has_prefcore = cpudata->hw_prefcore; + break; + } + + for (m = 0; m < ARRAY_SIZE(modes); m++) { + struct freq_attr **driver_attrs; + + ret = amd_pstate_set_mode(modes[m]); + if (ret) + goto out; + + driver_attrs = amd_pstate_get_current_attrs(); + if (!driver_attrs) { + pr_err("%s: no driver attrs in mode %s\n", + __func__, amd_pstate_get_mode_string(modes[m])); + ret = -EINVAL; + goto out; + } + + for (i = 0; i < ARRAY_SIZE(expected_freq_attrs); i++) { + bool expected, found; + + switch (expected_freq_attrs[i].category) { + case ATTR_ALWAYS: + expected = true; + break; + case ATTR_PREFCORE: + expected = has_prefcore; + break; + case ATTR_EPP: + expected = (modes[m] == AMD_PSTATE_ACTIVE); + break; + case ATTR_FLOOR_FREQ: + expected = has_floor_freq; + break; + default: + expected = false; + break; + } + + found = attr_in_driver(driver_attrs, + expected_freq_attrs[i].name); + + if (expected != found) { + pr_err("%s: mode %s: attr %s expected %s but is %s\n", + __func__, + amd_pstate_get_mode_string(modes[m]), + expected_freq_attrs[i].name, + expected ? "visible" : "hidden", + found ? 
"visible" : "hidden"); + ret = -EINVAL; + goto out; + } + } + } + + ret = 0; +out: + amd_pstate_set_mode(orig_mode); + return ret; +} + static int __init amd_pstate_ut_init(void) { u32 i = 0, arr_size = ARRAY_SIZE(amd_pstate_ut_cases); diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 5eae74a67aeb..ed9fd4155a25 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -1390,6 +1390,14 @@ static struct freq_attr_visibility amd_pstate_attr_visibility[] = { {&amd_pstate_floor_count, floor_freq_visibility}, }; +struct freq_attr **amd_pstate_get_current_attrs(void) +{ + if (!current_pstate_driver) + return NULL; + return current_pstate_driver->attr; +} +EXPORT_SYMBOL_GPL(amd_pstate_get_current_attrs); + static struct freq_attr **get_freq_attrs(void) { bool attr_visible[ARRAY_SIZE(amd_pstate_attr_visibility)]; diff --git a/drivers/cpufreq/amd-pstate.h b/drivers/cpufreq/amd-pstate.h index 453adfb445f8..faead0b19a8a 100644 --- a/drivers/cpufreq/amd-pstate.h +++ b/drivers/cpufreq/amd-pstate.h @@ -134,4 +134,8 @@ const char *amd_pstate_get_mode_string(enum amd_pstate_mode mode); int amd_pstate_get_status(void); int amd_pstate_update_status(const char *buf, size_t size); +struct freq_attr; + +struct freq_attr **amd_pstate_get_current_attrs(void); + #endif /* _LINUX_AMD_PSTATE_H */ From 7e1cf24efba4b59a275e87372267dadcb7fd1850 Mon Sep 17 00:00:00 2001 From: "Gautham R. Shenoy" Date: Thu, 26 Mar 2026 17:17:54 +0530 Subject: [PATCH 10/22] Documentation/amd-pstate: List amd_pstate_hw_prefcore sysfs file Add the missing amd_pstate_hw_prefcore filenames in the sysfs listing example leading to the descriptions of these parameters. Clarify when will the file be visible. Fixes: b96b82d1af7f ("cpufreq: amd-pstate: Add documentation for `amd_pstate_hw_prefcore`") Reviewed-by: Mario Limonciello (AMD) Signed-off-by: Gautham R. 
Shenoy Signed-off-by: Mario Limonciello (AMD) --- Documentation/admin-guide/pm/amd-pstate.rst | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/pm/amd-pstate.rst b/Documentation/admin-guide/pm/amd-pstate.rst index e1771f2225d5..b8c846cbf301 100644 --- a/Documentation/admin-guide/pm/amd-pstate.rst +++ b/Documentation/admin-guide/pm/amd-pstate.rst @@ -239,6 +239,7 @@ control its functionality at the system level. They are located in the root@hr-test1:/home/ray# ls /sys/devices/system/cpu/cpufreq/policy0/*amd* /sys/devices/system/cpu/cpufreq/policy0/amd_pstate_highest_perf + /sys/devices/system/cpu/cpufreq/policy0/amd_pstate_hw_prefcore /sys/devices/system/cpu/cpufreq/policy0/amd_pstate_lowest_nonlinear_freq /sys/devices/system/cpu/cpufreq/policy0/amd_pstate_max_freq @@ -264,8 +265,9 @@ This attribute is read-only. ``amd_pstate_hw_prefcore`` -Whether the platform supports the preferred core feature and it has been -enabled. This attribute is read-only. +Whether the platform supports the preferred core feature and it has +been enabled. This attribute is read-only. This file is only visible +on platforms which support the preferred core feature. ``amd_pstate_prefcore_ranking`` From a5bc4c44aeec2920931e17db7f93965fcd69ee2f Mon Sep 17 00:00:00 2001 From: "Gautham R. Shenoy" Date: Thu, 26 Mar 2026 17:17:55 +0530 Subject: [PATCH 11/22] Documentation/amd-pstate: List amd_pstate_prefcore_ranking sysfs file Add the missing amd_pstate_prefcore_ranking filenames in the sysfs listing example leading to the descriptions of these parameters. Clarify when will the file be visible. Fixes: 15a2b764ea7c ("amd-pstate: Add missing documentation for `amd_pstate_prefcore_ranking`") Reviewed-by: Mario Limonciello (AMD) Signed-off-by: Gautham R. 
Shenoy Signed-off-by: Mario Limonciello (AMD) --- Documentation/admin-guide/pm/amd-pstate.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Documentation/admin-guide/pm/amd-pstate.rst b/Documentation/admin-guide/pm/amd-pstate.rst index b8c846cbf301..b31a478c28ba 100644 --- a/Documentation/admin-guide/pm/amd-pstate.rst +++ b/Documentation/admin-guide/pm/amd-pstate.rst @@ -242,6 +242,7 @@ control its functionality at the system level. They are located in the /sys/devices/system/cpu/cpufreq/policy0/amd_pstate_hw_prefcore /sys/devices/system/cpu/cpufreq/policy0/amd_pstate_lowest_nonlinear_freq /sys/devices/system/cpu/cpufreq/policy0/amd_pstate_max_freq + /sys/devices/system/cpu/cpufreq/policy0/amd_pstate_prefcore_ranking ``amd_pstate_highest_perf / amd_pstate_max_freq`` @@ -273,7 +274,8 @@ on platforms which support the preferred core feature. The performance ranking of the core. This number doesn't have any unit, but larger numbers are preferred at the time of reading. This can change at -runtime based on platform conditions. This attribute is read-only. +runtime based on platform conditions. This attribute is read-only. This file +is only visible on platforms which support the preferred core feature. ``energy_performance_available_preferences`` From 88d2ca6a68fcc43d65b3b056cb8c481804b100b0 Mon Sep 17 00:00:00 2001 From: "Gautham R. Shenoy" Date: Thu, 26 Mar 2026 17:17:56 +0530 Subject: [PATCH 12/22] Documentation/amd-pstate: Add documentation for amd_pstate_floor_{freq,count} Add documentation for the sysfs files /sys/devices/system/cpu/cpufreq/policy*/amd_pstate_floor_freq and /sys/devices/system/cpu/cpufreq/policy*/amd_pstate_floor_count. Reviewed-by: Mario Limonciello (AMD) Signed-off-by: Gautham R. 
Shenoy Signed-off-by: Mario Limonciello (AMD) --- Documentation/admin-guide/pm/amd-pstate.rst | 32 +++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/Documentation/admin-guide/pm/amd-pstate.rst b/Documentation/admin-guide/pm/amd-pstate.rst index b31a478c28ba..d6c2f233ab23 100644 --- a/Documentation/admin-guide/pm/amd-pstate.rst +++ b/Documentation/admin-guide/pm/amd-pstate.rst @@ -242,6 +242,8 @@ control its functionality at the system level. They are located in the /sys/devices/system/cpu/cpufreq/policy0/amd_pstate_hw_prefcore /sys/devices/system/cpu/cpufreq/policy0/amd_pstate_lowest_nonlinear_freq /sys/devices/system/cpu/cpufreq/policy0/amd_pstate_max_freq + /sys/devices/system/cpu/cpufreq/policy0/amd_pstate_floor_freq + /sys/devices/system/cpu/cpufreq/policy0/amd_pstate_floor_count /sys/devices/system/cpu/cpufreq/policy0/amd_pstate_prefcore_ranking @@ -277,6 +279,36 @@ larger numbers are preferred at the time of reading. This can change at runtime based on platform conditions. This attribute is read-only. This file is only visible on platforms which support the preferred core feature. +``amd_pstate_floor_freq`` + +The floor frequency associated with each CPU. Userspace can write any +value between ``cpuinfo_min_freq`` and ``scaling_max_freq`` into this +file. When the system is under power or thermal constraints, the +platform firmware will attempt to throttle the CPU frequency to the +value specified in ``amd_pstate_floor_freq`` before throttling it +further. This allows userspace to specify different floor frequencies +to different CPUs. For optimal results, threads of the same core +should have the same floor frequency value. This file is only visible +on platforms that support the CPPC Performance Priority feature. + + +``amd_pstate_floor_count`` + +The number of distinct Floor Performance levels supported by the +platform. 
For example, if this value is 2, then the number of unique +values obtained from the command ``cat +/sys/devices/system/cpu/cpufreq/policy*/amd_pstate_floor_freq | +sort -n | uniq`` should be at most this number for the behavior +described in ``amd_pstate_floor_freq`` to take effect. A zero value +implies that the platform supports unlimited floor performance levels. +This file is only visible on platforms that support the CPPC +Performance Priority feature. + +**Note**: When ``amd_pstate_floor_count`` is non-zero, the frequency to +which the CPU is throttled under power or thermal constraints is +undefined when the number of unique values of ``amd_pstate_floor_freq`` +across all CPUs in the system exceeds ``amd_pstate_floor_count``. + ``energy_performance_available_preferences`` A list of all the supported EPP preferences that could be used for From 8cdc494013dfcd48f31eafe19b18fd67c224dd8a Mon Sep 17 00:00:00 2001 From: "Mario Limonciello (AMD)" Date: Thu, 26 Mar 2026 14:36:20 -0500 Subject: [PATCH 13/22] cpufreq/amd-pstate: Cache the max frequency in cpudata The value of maximum frequency is fixed and never changes. Doing calculations every time based off of perf is unnecessary. Reviewed-by: Gautham R. 
Shenoy Link: https://lore.kernel.org/r/20260326193620.649441-1-mario.limonciello@amd.com Signed-off-by: Mario Limonciello (AMD) --- drivers/cpufreq/amd-pstate.c | 27 +++++++++------------------ drivers/cpufreq/amd-pstate.h | 2 ++ 2 files changed, 11 insertions(+), 18 deletions(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index ed9fd4155a25..f207252eb5f5 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -826,15 +826,13 @@ static void amd_pstate_adjust_perf(unsigned int cpu, static int amd_pstate_cpu_boost_update(struct cpufreq_policy *policy, bool on) { struct amd_cpudata *cpudata = policy->driver_data; - union perf_cached perf = READ_ONCE(cpudata->perf); - u32 nominal_freq, max_freq; + u32 nominal_freq; int ret = 0; nominal_freq = READ_ONCE(cpudata->nominal_freq); - max_freq = perf_to_freq(perf, cpudata->nominal_freq, perf.highest_perf); if (on) - policy->cpuinfo.max_freq = max_freq; + policy->cpuinfo.max_freq = cpudata->max_freq; else if (policy->cpuinfo.max_freq > nominal_freq) policy->cpuinfo.max_freq = nominal_freq; @@ -1021,13 +1019,15 @@ static int amd_pstate_init_freq(struct amd_cpudata *cpudata) WRITE_ONCE(cpudata->nominal_freq, nominal_freq); + /* max_freq is calculated according to (nominal_freq * highest_perf)/nominal_perf */ max_freq = perf_to_freq(perf, nominal_freq, perf.highest_perf); + WRITE_ONCE(cpudata->max_freq, max_freq); + lowest_nonlinear_freq = perf_to_freq(perf, nominal_freq, perf.lowest_nonlinear_perf); WRITE_ONCE(cpudata->lowest_nonlinear_freq, lowest_nonlinear_freq); /** * Below values need to be initialized correctly, otherwise driver will fail to load - * max_freq is calculated according to (nominal_freq * highest_perf)/nominal_perf * lowest_nonlinear_freq is a value between [min_freq, nominal_freq] * Check _CPC in ACPI table objects if any values are incorrect */ @@ -1090,9 +1090,7 @@ static int amd_pstate_cpu_init(struct cpufreq_policy *policy) policy->cpuinfo.min_freq = 
policy->min = perf_to_freq(perf, cpudata->nominal_freq, perf.lowest_perf); - policy->cpuinfo.max_freq = policy->max = perf_to_freq(perf, - cpudata->nominal_freq, - perf.highest_perf); + policy->cpuinfo.max_freq = policy->max = cpudata->max_freq; policy->driver_data = cpudata; ret = amd_pstate_cppc_enable(policy); @@ -1167,14 +1165,9 @@ static void amd_pstate_cpu_exit(struct cpufreq_policy *policy) static ssize_t show_amd_pstate_max_freq(struct cpufreq_policy *policy, char *buf) { - struct amd_cpudata *cpudata; - union perf_cached perf; + struct amd_cpudata *cpudata = policy->driver_data; - cpudata = policy->driver_data; - perf = READ_ONCE(cpudata->perf); - - return sysfs_emit(buf, "%u\n", - perf_to_freq(perf, cpudata->nominal_freq, perf.highest_perf)); + return sysfs_emit(buf, "%u\n", cpudata->max_freq); } static ssize_t show_amd_pstate_lowest_nonlinear_freq(struct cpufreq_policy *policy, @@ -1702,9 +1695,7 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) policy->cpuinfo.min_freq = policy->min = perf_to_freq(perf, cpudata->nominal_freq, perf.lowest_perf); - policy->cpuinfo.max_freq = policy->max = perf_to_freq(perf, - cpudata->nominal_freq, - perf.highest_perf); + policy->cpuinfo.max_freq = policy->max = cpudata->max_freq; policy->driver_data = cpudata; ret = amd_pstate_cppc_enable(policy); diff --git a/drivers/cpufreq/amd-pstate.h b/drivers/cpufreq/amd-pstate.h index faead0b19a8a..32b8b26ce388 100644 --- a/drivers/cpufreq/amd-pstate.h +++ b/drivers/cpufreq/amd-pstate.h @@ -73,6 +73,7 @@ struct amd_aperf_mperf { * @min_limit_freq: Cached value of policy->min (in khz) * @max_limit_freq: Cached value of policy->max (in khz) * @nominal_freq: the frequency (in khz) that mapped to nominal_perf + * @max_freq: in ideal conditions the maximum frequency (in khz) possible * @lowest_nonlinear_freq: the frequency (in khz) that mapped to lowest_nonlinear_perf * @floor_freq: Cached value of the user requested floor_freq * @cur: Difference of
Aperf/Mperf/tsc count between last and current sample @@ -103,6 +104,7 @@ struct amd_cpudata { u32 min_limit_freq; u32 max_limit_freq; u32 nominal_freq; + u32 max_freq; u32 lowest_nonlinear_freq; u32 floor_freq; From a362ae6e7e85bca4c870c37085d7793c4beec360 Mon Sep 17 00:00:00 2001 From: Ninad Naik Date: Tue, 31 Mar 2026 00:38:55 +0530 Subject: [PATCH 14/22] Documentation: amd-pstate: fix dead links in the reference section The links for AMD64 Architecture Programmer's Manual and PPR for AMD Family 19h Model 51h, Revision A1 Processors redirect to a generic page. Update the links to the working ones. Signed-off-by: Ninad Naik Link: https://lore.kernel.org/r/20260330190855.1115304-1-ninadnaik07@gmail.com Acked-by: Mario Limonciello (AMD) Signed-off-by: Mario Limonciello (AMD) --- Documentation/admin-guide/pm/amd-pstate.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/pm/amd-pstate.rst b/Documentation/admin-guide/pm/amd-pstate.rst index d6c2f233ab23..b43675b7f739 100644 --- a/Documentation/admin-guide/pm/amd-pstate.rst +++ b/Documentation/admin-guide/pm/amd-pstate.rst @@ -826,13 +826,13 @@ Reference =========== .. [1] AMD64 Architecture Programmer's Manual Volume 2: System Programming, - https://www.amd.com/system/files/TechDocs/24593.pdf + https://docs.amd.com/v/u/en-US/24593_3.44_APM_Vol2 .. [2] Advanced Configuration and Power Interface Specification, https://uefi.org/sites/default/files/resources/ACPI_Spec_6_4_Jan22.pdf .. [3] Processor Programming Reference (PPR) for AMD Family 19h Model 51h, Revision A1 Processors - https://www.amd.com/system/files/TechDocs/56569-A1-PUB.zip + https://docs.amd.com/v/u/en-US/56569-A1-PUB_3.03 .. 
[4] Linux Kernel Selftests, https://www.kernel.org/doc/html/latest/dev-tools/kselftest.html From e30ca6dd5345c5b8ba05f346a8e81105352fe571 Mon Sep 17 00:00:00 2001 From: "Mario Limonciello (AMD)" Date: Sun, 29 Mar 2026 15:38:07 -0500 Subject: [PATCH 15/22] cpufreq/amd-pstate: Add dynamic energy performance preference Dynamic energy performance preference changes the EPP profile based on whether the machine is running on AC or DC power. A notification chain from the power supply core is used to adjust EPP values on plug in or plug out events. When enabled, the driver exposes a sysfs toggle for dynamic EPP, blocks manual writes to energy_performance_preference while it "owns" the EPP updates. For non-server systems: * the default EPP for AC mode is `performance`. * the default EPP for DC mode is `balance_performance`. For server systems dynamic EPP is mostly a no-op. Reviewed-by: Gautham R. Shenoy Signed-off-by: Mario Limonciello (AMD) --- Documentation/admin-guide/pm/amd-pstate.rst | 18 ++- drivers/cpufreq/Kconfig.x86 | 12 ++ drivers/cpufreq/amd-pstate.c | 128 +++++++++++++++++++- drivers/cpufreq/amd-pstate.h | 10 +- 4 files changed, 160 insertions(+), 8 deletions(-) diff --git a/Documentation/admin-guide/pm/amd-pstate.rst b/Documentation/admin-guide/pm/amd-pstate.rst index b43675b7f739..bb1341763882 100644 --- a/Documentation/admin-guide/pm/amd-pstate.rst +++ b/Documentation/admin-guide/pm/amd-pstate.rst @@ -325,7 +325,7 @@ and user can change current preference according to energy or performance needs Please get all support profiles list from ``energy_performance_available_preferences`` attribute, all the profiles are integer values defined between 0 to 255 when EPP feature is enabled by platform -firmware, if EPP feature is disabled, driver will ignore the written value +firmware, but if the dynamic EPP feature is enabled, driver will block writes. This attribute is read-write. 
``boost`` @@ -347,6 +347,22 @@ boost or `1` to enable it, for the respective CPU using the sysfs path Other performance and frequency values can be read back from ``/sys/devices/system/cpu/cpuX/acpi_cppc/``, see :ref:`cppc_sysfs`. +Dynamic energy performance profile +================================== +The amd-pstate driver supports dynamically selecting the energy performance +profile based on whether the machine is running on AC or DC power. + +Whether this behavior is enabled by default depends on the kernel +config option `CONFIG_X86_AMD_PSTATE_DYNAMIC_EPP`. This behavior can also be overridden +at runtime by the sysfs file ``/sys/devices/system/cpu/cpufreq/policyX/dynamic_epp``. + +When set to enabled, the driver will select a different energy performance +profile when the machine is running on battery or AC power. +When set to disabled, the driver will not change the energy performance profile +based on the power source and will not react to user desired power state. + +Attempting to manually write to the ``energy_performance_preference`` sysfs +file will fail when ``dynamic_epp`` is enabled. ``amd-pstate`` vs ``acpi-cpufreq`` ====================================== diff --git a/drivers/cpufreq/Kconfig.x86 b/drivers/cpufreq/Kconfig.x86 index 2c5c228408bf..cdaa8d858045 100644 --- a/drivers/cpufreq/Kconfig.x86 +++ b/drivers/cpufreq/Kconfig.x86 @@ -68,6 +68,18 @@ config X86_AMD_PSTATE_DEFAULT_MODE For details, take a look at: . +config X86_AMD_PSTATE_DYNAMIC_EPP + bool "AMD Processor P-State dynamic EPP support" + depends on X86_AMD_PSTATE + default n + help + Allow the kernel to dynamically change the energy performance + value from events like ACPI platform profile and AC adapter plug + events. + + This feature can also be changed at runtime, this configuration + option only sets the kernel default value behavior. 
+ config X86_AMD_PSTATE_UT tristate "selftest for AMD Processor P-State driver" depends on X86 && ACPI_PROCESSOR diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index f207252eb5f5..379e7dd44252 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include @@ -86,6 +87,11 @@ static struct cpufreq_driver amd_pstate_driver; static struct cpufreq_driver amd_pstate_epp_driver; static int cppc_state = AMD_PSTATE_UNDEFINED; static bool amd_pstate_prefcore = true; +#ifdef CONFIG_X86_AMD_PSTATE_DYNAMIC_EPP +static bool dynamic_epp = CONFIG_X86_AMD_PSTATE_DYNAMIC_EPP; +#else +static bool dynamic_epp; +#endif static struct quirk_entry *quirks; /* @@ -1155,6 +1161,73 @@ static void amd_pstate_cpu_exit(struct cpufreq_policy *policy) kfree(cpudata); } +static int amd_pstate_get_balanced_epp(struct cpufreq_policy *policy) +{ + struct amd_cpudata *cpudata = policy->driver_data; + + if (power_supply_is_system_supplied()) + return cpudata->epp_default_ac; + else + return cpudata->epp_default_dc; +} + +static int amd_pstate_power_supply_notifier(struct notifier_block *nb, + unsigned long event, void *data) +{ + struct amd_cpudata *cpudata = container_of(nb, struct amd_cpudata, power_nb); + struct cpufreq_policy *policy __free(put_cpufreq_policy) = cpufreq_cpu_get(cpudata->cpu); + u8 epp; + int ret; + + if (event != PSY_EVENT_PROP_CHANGED) + return NOTIFY_OK; + + epp = amd_pstate_get_balanced_epp(policy); + + ret = amd_pstate_set_epp(policy, epp); + if (ret) + pr_warn("Failed to set CPU %d EPP %u: %d\n", cpudata->cpu, epp, ret); + + return NOTIFY_OK; +} +static void amd_pstate_clear_dynamic_epp(struct cpufreq_policy *policy) +{ + struct amd_cpudata *cpudata = policy->driver_data; + + if (cpudata->power_nb.notifier_call) + power_supply_unreg_notifier(&cpudata->power_nb); + cpudata->dynamic_epp = false; +} + +static int amd_pstate_set_dynamic_epp(struct cpufreq_policy 
*policy) +{ + struct amd_cpudata *cpudata = policy->driver_data; + int ret; + u8 epp; + + epp = amd_pstate_get_balanced_epp(policy); + ret = amd_pstate_set_epp(policy, epp); + if (ret) + return ret; + + /* only enable notifier if things will actually change */ + if (cpudata->epp_default_ac != cpudata->epp_default_dc) { + cpudata->power_nb.notifier_call = amd_pstate_power_supply_notifier; + ret = power_supply_reg_notifier(&cpudata->power_nb); + if (ret) + goto cleanup; + } + + cpudata->dynamic_epp = true; + + return 0; + +cleanup: + amd_pstate_clear_dynamic_epp(policy); + + return ret; +} + /* Sysfs attributes */ /* @@ -1244,14 +1317,19 @@ static ssize_t store_energy_performance_preference( ssize_t ret; u8 epp; + if (cpudata->dynamic_epp) { + pr_debug("EPP cannot be set when dynamic EPP is enabled\n"); + return -EBUSY; + } + ret = sysfs_match_string(energy_perf_strings, buf); if (ret < 0) return -EINVAL; - if (!ret) - epp = cpudata->epp_default; - else + if (ret) epp = epp_values[ret]; + else + epp = amd_pstate_get_balanced_epp(policy); if (epp > 0 && policy->policy == CPUFREQ_POLICY_PERFORMANCE) { pr_debug("EPP cannot be set under performance policy\n"); @@ -1259,6 +1337,8 @@ static ssize_t store_energy_performance_preference( } ret = amd_pstate_set_epp(policy, epp); + if (ret) + return ret; return ret ? 
ret : count; } @@ -1620,12 +1700,42 @@ static ssize_t prefcore_show(struct device *dev, return sysfs_emit(buf, "%s\n", str_enabled_disabled(amd_pstate_prefcore)); } +static ssize_t dynamic_epp_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%s\n", str_enabled_disabled(dynamic_epp)); +} + +static ssize_t dynamic_epp_store(struct device *a, struct device_attribute *b, + const char *buf, size_t count) +{ + bool enabled; + int ret; + + ret = kstrtobool(buf, &enabled); + if (ret) + return ret; + + if (dynamic_epp == enabled) + return -EINVAL; + + /* reinitialize with desired dynamic EPP value */ + dynamic_epp = enabled; + ret = amd_pstate_change_driver_mode(cppc_state); + if (ret) + dynamic_epp = false; + + return ret ? ret : count; +} + static DEVICE_ATTR_RW(status); static DEVICE_ATTR_RO(prefcore); +static DEVICE_ATTR_RW(dynamic_epp); static struct attribute *pstate_global_attributes[] = { &dev_attr_status.attr, &dev_attr_prefcore.attr, + &dev_attr_dynamic_epp.attr, NULL }; @@ -1715,13 +1825,17 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) if (amd_pstate_acpi_pm_profile_server() || amd_pstate_acpi_pm_profile_undefined()) { policy->policy = CPUFREQ_POLICY_PERFORMANCE; - cpudata->epp_default = amd_pstate_get_epp(cpudata); + cpudata->epp_default_ac = cpudata->epp_default_dc = amd_pstate_get_epp(cpudata); } else { policy->policy = CPUFREQ_POLICY_POWERSAVE; - cpudata->epp_default = AMD_CPPC_EPP_BALANCE_PERFORMANCE; + cpudata->epp_default_ac = AMD_CPPC_EPP_PERFORMANCE; + cpudata->epp_default_dc = AMD_CPPC_EPP_BALANCE_PERFORMANCE; } - ret = amd_pstate_set_epp(policy, cpudata->epp_default); + if (dynamic_epp) + ret = amd_pstate_set_dynamic_epp(policy); + else + ret = amd_pstate_set_epp(policy, amd_pstate_get_balanced_epp(policy)); if (ret) goto free_cpudata1; @@ -1753,6 +1867,8 @@ static void amd_pstate_epp_cpu_exit(struct cpufreq_policy *policy) amd_pstate_update_perf(policy, perf.bios_min_perf, 0U, 0U, 
0U, false); amd_pstate_set_floor_perf(policy, cpudata->bios_floor_perf); + if (cpudata->dynamic_epp) + amd_pstate_clear_dynamic_epp(policy); kfree(cpudata); policy->driver_data = NULL; } diff --git a/drivers/cpufreq/amd-pstate.h b/drivers/cpufreq/amd-pstate.h index 32b8b26ce388..d929ae3163b3 100644 --- a/drivers/cpufreq/amd-pstate.h +++ b/drivers/cpufreq/amd-pstate.h @@ -85,6 +85,11 @@ struct amd_aperf_mperf { * AMD P-State driver supports preferred core featue. * @epp_cached: Cached CPPC energy-performance preference value * @policy: Cpufreq policy value + * @suspended: If CPU core is offlined + * @epp_default_ac: Default EPP value for AC power source + * @epp_default_dc: Default EPP value for DC power source + * @dynamic_epp: Whether dynamic EPP is enabled + * @power_nb: Notifier block for power events * * The amd_cpudata is key private data for each CPU thread in AMD P-State, and * represents all the attributes and goals that AMD P-State requests at runtime. @@ -118,7 +123,10 @@ struct amd_cpudata { /* EPP feature related attributes*/ u32 policy; bool suspended; - u8 epp_default; + u8 epp_default_ac; + u8 epp_default_dc; + bool dynamic_epp; + struct notifier_block power_nb; }; /* From da8afb1c666a4a966f0ab91dc336df4c855bc7b2 Mon Sep 17 00:00:00 2001 From: "Mario Limonciello (AMD)" Date: Sun, 29 Mar 2026 15:38:08 -0500 Subject: [PATCH 16/22] cpufreq/amd-pstate: add kernel command line to override dynamic epp Add `amd_dynamic_epp=enable` and `amd_dynamic_epp=disable` to override the kernel configuration option `CONFIG_X86_AMD_PSTATE_DYNAMIC_EPP` locally. Signed-off-by: Mario Limonciello (AMD) Reviewed-by: Gautham R.
Shenoy Signed-off-by: Mario Limonciello (AMD) --- Documentation/admin-guide/kernel-parameters.txt | 7 +++++++ Documentation/admin-guide/pm/amd-pstate.rst | 7 +++++++ drivers/cpufreq/amd-pstate.c | 11 +++++++++++ 3 files changed, 25 insertions(+) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 03a550630644..9552819051cd 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -493,6 +493,13 @@ Kernel parameters disable Disable amd-pstate preferred core. + amd_dynamic_epp= + [X86] + disable + Disable amd-pstate dynamic EPP. + enable + Enable amd-pstate dynamic EPP. + amijoy.map= [HW,JOY] Amiga joystick support Map of devices attached to JOY0DAT and JOY1DAT Format: , diff --git a/Documentation/admin-guide/pm/amd-pstate.rst b/Documentation/admin-guide/pm/amd-pstate.rst index bb1341763882..01e6ab10f996 100644 --- a/Documentation/admin-guide/pm/amd-pstate.rst +++ b/Documentation/admin-guide/pm/amd-pstate.rst @@ -474,6 +474,13 @@ For systems that support ``amd-pstate`` preferred core, the core rankings will always be advertised by the platform. But OS can choose to ignore that via the kernel parameter ``amd_prefcore=disable``. +``amd_dynamic_epp`` + +When AMD pstate is in auto mode, dynamic EPP will control whether the kernel +autonomously changes the EPP mode. The default is configured by +``CONFIG_X86_AMD_PSTATE_DYNAMIC_EPP`` but can be explicitly enabled with +``amd_dynamic_epp=enable`` or disabled with ``amd_dynamic_epp=disable``. 
+ User Space Interface in ``sysfs`` - General =========================================== diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 379e7dd44252..301e603e4966 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -2227,8 +2227,19 @@ static int __init amd_prefcore_param(char *str) return 0; } +static int __init amd_dynamic_epp_param(char *str) +{ + if (!strcmp(str, "disable")) + dynamic_epp = false; + if (!strcmp(str, "enable")) + dynamic_epp = true; + + return 0; +} + early_param("amd_pstate", amd_pstate_param); early_param("amd_prefcore", amd_prefcore_param); +early_param("amd_dynamic_epp", amd_dynamic_epp_param); MODULE_AUTHOR("Huang Rui "); MODULE_DESCRIPTION("AMD Processor P-state Frequency Driver"); From 798c47593ccae7dd36c033e557f3f364a2056b9e Mon Sep 17 00:00:00 2001 From: "Mario Limonciello (AMD)" Date: Sun, 29 Mar 2026 15:38:09 -0500 Subject: [PATCH 17/22] cpufreq/amd-pstate: Add support for platform profile class The platform profile core allows multiple drivers and devices to register platform profile support. When the legacy platform profile interface is used all drivers will adjust the platform profile as well. Add support for registering every CPU with the platform profile handler when dynamic EPP is enabled. The end result will be that changing the platform profile will modify EPP accordingly. Reviewed-by: Gautham R. 
Shenoy Signed-off-by: Mario Limonciello (AMD) --- Documentation/admin-guide/pm/amd-pstate.rst | 4 +- drivers/cpufreq/Kconfig.x86 | 1 + drivers/cpufreq/amd-pstate.c | 106 ++++++++++++++++++-- drivers/cpufreq/amd-pstate.h | 6 ++ 4 files changed, 110 insertions(+), 7 deletions(-) diff --git a/Documentation/admin-guide/pm/amd-pstate.rst b/Documentation/admin-guide/pm/amd-pstate.rst index 01e6ab10f996..d68ddfea6a9d 100644 --- a/Documentation/admin-guide/pm/amd-pstate.rst +++ b/Documentation/admin-guide/pm/amd-pstate.rst @@ -357,7 +357,9 @@ config option `CONFIG_X86_AMD_PSTATE_DYNAMIC_EPP`. This behavior can also be ove at runtime by the sysfs file ``/sys/devices/system/cpu/cpufreq/policyX/dynamic_epp``. When set to enabled, the driver will select a different energy performance -profile when the machine is running on battery or AC power. +profile when the machine is running on battery or AC power. The driver will +also register with the platform profile handler to receive notifications of +user desired power state and react to those. When set to disabled, the driver will not change the energy performance profile based on the power source and will not react to user desired power state. 
diff --git a/drivers/cpufreq/Kconfig.x86 b/drivers/cpufreq/Kconfig.x86 index cdaa8d858045..a0dbb9808ae9 100644 --- a/drivers/cpufreq/Kconfig.x86 +++ b/drivers/cpufreq/Kconfig.x86 @@ -40,6 +40,7 @@ config X86_AMD_PSTATE select ACPI_PROCESSOR select ACPI_CPPC_LIB if X86_64 select CPU_FREQ_GOV_SCHEDUTIL if SMP + select ACPI_PLATFORM_PROFILE help This driver adds a CPUFreq driver which utilizes a fine grain processor performance frequency control range instead of legacy diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 301e603e4966..d553bbd3fecc 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -1182,6 +1182,10 @@ static int amd_pstate_power_supply_notifier(struct notifier_block *nb, if (event != PSY_EVENT_PROP_CHANGED) return NOTIFY_OK; + /* dynamic actions are only applied while platform profile is in balanced */ + if (cpudata->current_profile != PLATFORM_PROFILE_BALANCED) + return 0; + epp = amd_pstate_get_balanced_epp(policy); ret = amd_pstate_set_epp(policy, epp); @@ -1190,12 +1194,77 @@ static int amd_pstate_power_supply_notifier(struct notifier_block *nb, return NOTIFY_OK; } + +static int amd_pstate_profile_probe(void *drvdata, unsigned long *choices) +{ + set_bit(PLATFORM_PROFILE_LOW_POWER, choices); + set_bit(PLATFORM_PROFILE_BALANCED, choices); + set_bit(PLATFORM_PROFILE_PERFORMANCE, choices); + + return 0; +} + +static int amd_pstate_profile_get(struct device *dev, + enum platform_profile_option *profile) +{ + struct amd_cpudata *cpudata = dev_get_drvdata(dev); + + *profile = cpudata->current_profile; + + return 0; +} + +static int amd_pstate_profile_set(struct device *dev, + enum platform_profile_option profile) +{ + struct amd_cpudata *cpudata = dev_get_drvdata(dev); + struct cpufreq_policy *policy __free(put_cpufreq_policy) = cpufreq_cpu_get(cpudata->cpu); + int ret; + + switch (profile) { + case PLATFORM_PROFILE_LOW_POWER: + ret = amd_pstate_set_epp(policy, AMD_CPPC_EPP_POWERSAVE); + if (ret) + 
return ret; + break; + case PLATFORM_PROFILE_BALANCED: + ret = amd_pstate_set_epp(policy, + amd_pstate_get_balanced_epp(policy)); + if (ret) + return ret; + break; + case PLATFORM_PROFILE_PERFORMANCE: + ret = amd_pstate_set_epp(policy, AMD_CPPC_EPP_PERFORMANCE); + if (ret) + return ret; + break; + default: + pr_err("Unknown Platform Profile %d\n", profile); + return -EOPNOTSUPP; + } + + cpudata->current_profile = profile; + + return 0; +} + +static const struct platform_profile_ops amd_pstate_profile_ops = { + .probe = amd_pstate_profile_probe, + .profile_set = amd_pstate_profile_set, + .profile_get = amd_pstate_profile_get, +}; + static void amd_pstate_clear_dynamic_epp(struct cpufreq_policy *policy) { struct amd_cpudata *cpudata = policy->driver_data; if (cpudata->power_nb.notifier_call) power_supply_unreg_notifier(&cpudata->power_nb); + if (cpudata->ppdev) { + platform_profile_remove(cpudata->ppdev); + cpudata->ppdev = NULL; + } + kfree(cpudata->profile_name); cpudata->dynamic_epp = false; } @@ -1205,11 +1274,35 @@ static int amd_pstate_set_dynamic_epp(struct cpufreq_policy *policy) int ret; u8 epp; - epp = amd_pstate_get_balanced_epp(policy); + switch (cpudata->current_profile) { + case PLATFORM_PROFILE_PERFORMANCE: + epp = AMD_CPPC_EPP_PERFORMANCE; + break; + case PLATFORM_PROFILE_LOW_POWER: + epp = AMD_CPPC_EPP_POWERSAVE; + break; + case PLATFORM_PROFILE_BALANCED: + epp = amd_pstate_get_balanced_epp(policy); + break; + default: + pr_err("Unknown Platform Profile %d\n", cpudata->current_profile); + return -EOPNOTSUPP; + } ret = amd_pstate_set_epp(policy, epp); if (ret) return ret; + cpudata->profile_name = kasprintf(GFP_KERNEL, "amd-pstate-epp-cpu%d", cpudata->cpu); + + cpudata->ppdev = platform_profile_register(get_cpu_device(policy->cpu), + cpudata->profile_name, + policy->driver_data, + &amd_pstate_profile_ops); + if (IS_ERR(cpudata->ppdev)) { + ret = PTR_ERR(cpudata->ppdev); + goto cleanup; + } + /* only enable notifier if things will actually change */ if 
(cpudata->epp_default_ac != cpudata->epp_default_dc) { cpudata->power_nb.notifier_call = amd_pstate_power_supply_notifier; @@ -1310,8 +1403,8 @@ static ssize_t show_energy_performance_available_preferences( return offset; } -static ssize_t store_energy_performance_preference( - struct cpufreq_policy *policy, const char *buf, size_t count) +static ssize_t store_energy_performance_preference(struct cpufreq_policy *policy, + const char *buf, size_t count) { struct amd_cpudata *cpudata = policy->driver_data; ssize_t ret; @@ -1331,7 +1424,7 @@ static ssize_t store_energy_performance_preference( else epp = amd_pstate_get_balanced_epp(policy); - if (epp > 0 && policy->policy == CPUFREQ_POLICY_PERFORMANCE) { + if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) { pr_debug("EPP cannot be set under performance policy\n"); return -EBUSY; } @@ -1343,8 +1436,7 @@ static ssize_t store_energy_performance_preference( return ret ? ret : count; } -static ssize_t show_energy_performance_preference( - struct cpufreq_policy *policy, char *buf) +static ssize_t show_energy_performance_preference(struct cpufreq_policy *policy, char *buf) { struct amd_cpudata *cpudata = policy->driver_data; u8 preference, epp; @@ -1826,10 +1918,12 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) amd_pstate_acpi_pm_profile_undefined()) { policy->policy = CPUFREQ_POLICY_PERFORMANCE; cpudata->epp_default_ac = cpudata->epp_default_dc = amd_pstate_get_epp(cpudata); + cpudata->current_profile = PLATFORM_PROFILE_PERFORMANCE; } else { policy->policy = CPUFREQ_POLICY_POWERSAVE; cpudata->epp_default_ac = AMD_CPPC_EPP_PERFORMANCE; cpudata->epp_default_dc = AMD_CPPC_EPP_BALANCE_PERFORMANCE; + cpudata->current_profile = PLATFORM_PROFILE_BALANCED; } if (dynamic_epp) diff --git a/drivers/cpufreq/amd-pstate.h b/drivers/cpufreq/amd-pstate.h index d929ae3163b3..a7e52f79a802 100644 --- a/drivers/cpufreq/amd-pstate.h +++ b/drivers/cpufreq/amd-pstate.h @@ -9,6 +9,7 @@ #define _LINUX_AMD_PSTATE_H #include 
+#include /********************************************************************* * AMD P-state INTERFACE * @@ -127,6 +128,11 @@ struct amd_cpudata { u8 epp_default_dc; bool dynamic_epp; struct notifier_block power_nb; + + /* platform profile */ + enum platform_profile_option current_profile; + struct device *ppdev; + char *profile_name; }; /* From 6927f21852f38db2975b5d5539cbe5241c25a99b Mon Sep 17 00:00:00 2001 From: "Mario Limonciello (AMD)" Date: Sun, 29 Mar 2026 15:38:10 -0500 Subject: [PATCH 18/22] cpufreq/amd-pstate: Add support for raw EPP writes The energy performance preference field of the CPPC request MSR supports values from 0 to 255, but the strings only offer 4 values. The other values are useful for tuning the performance of some workloads. Add support for writing the raw energy performance preference value to the sysfs file. If the last value written was an integer then an integer will be returned. If the last value written was a string then a string will be returned. Reviewed-by: Gautham R. Shenoy Signed-off-by: Mario Limonciello (AMD) --- Documentation/admin-guide/pm/amd-pstate.rst | 16 ++++++--- drivers/cpufreq/amd-pstate.c | 36 +++++++++++++++------ drivers/cpufreq/amd-pstate.h | 1 + 3 files changed, 38 insertions(+), 15 deletions(-) diff --git a/Documentation/admin-guide/pm/amd-pstate.rst b/Documentation/admin-guide/pm/amd-pstate.rst index d68ddfea6a9d..f8e7050fc762 100644 --- a/Documentation/admin-guide/pm/amd-pstate.rst +++ b/Documentation/admin-guide/pm/amd-pstate.rst @@ -316,16 +316,22 @@ A list of all the supported EPP preferences that could be used for These profiles represent different hints that are provided to the low-level firmware about the user's desired energy vs efficiency tradeoff. ``default`` represents the epp value is set by platform -firmware. This attribute is read-only. +firmware. ``custom`` designates that integer values 0-255 may be written +as well. This attribute is read-only. 
``energy_performance_preference`` The current energy performance preference can be read from this attribute. and user can change current preference according to energy or performance needs Coarse named profiles are available in the attribute +``energy_performance_available_preferences``. +Users can also write individual integer values between 0 and 255. +When dynamic EPP is enabled, writes to energy_performance_preference are blocked +even when EPP feature is enabled by platform firmware. Lower epp values shift the bias +towards improved performance while a higher epp value shifts the bias towards +power-savings. The exact impact can change from one platform to the other. +If a valid integer was last written, then a number will be returned on future reads. +If a valid string was last written then a string will be returned on future reads. This attribute is read-write. 
``boost`` diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index d553bbd3fecc..83e937764fab 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -109,6 +109,7 @@ static struct quirk_entry *quirks; * 2 balance_performance * 3 balance_power * 4 power + * 5 custom (for raw EPP values) */ enum energy_perf_value_index { EPP_INDEX_DEFAULT = 0, @@ -116,6 +117,7 @@ enum energy_perf_value_index { EPP_INDEX_BALANCE_PERFORMANCE, EPP_INDEX_BALANCE_POWERSAVE, EPP_INDEX_POWERSAVE, + EPP_INDEX_CUSTOM, EPP_INDEX_MAX, }; @@ -125,6 +127,7 @@ static const char * const energy_perf_strings[] = { [EPP_INDEX_BALANCE_PERFORMANCE] = "balance_performance", [EPP_INDEX_BALANCE_POWERSAVE] = "balance_power", [EPP_INDEX_POWERSAVE] = "power", + [EPP_INDEX_CUSTOM] = "custom", }; static_assert(ARRAY_SIZE(energy_perf_strings) == EPP_INDEX_MAX); @@ -135,7 +138,7 @@ static unsigned int epp_values[] = { [EPP_INDEX_BALANCE_POWERSAVE] = AMD_CPPC_EPP_BALANCE_POWERSAVE, [EPP_INDEX_POWERSAVE] = AMD_CPPC_EPP_POWERSAVE, }; -static_assert(ARRAY_SIZE(epp_values) == EPP_INDEX_MAX); +static_assert(ARRAY_SIZE(epp_values) == EPP_INDEX_MAX - 1); typedef int (*cppc_mode_transition_fn)(int); @@ -1408,6 +1411,7 @@ static ssize_t store_energy_performance_preference(struct cpufreq_policy *policy { struct amd_cpudata *cpudata = policy->driver_data; ssize_t ret; + bool raw_epp = false; u8 epp; if (cpudata->dynamic_epp) { @@ -1415,14 +1419,21 @@ static ssize_t store_energy_performance_preference(struct cpufreq_policy *policy return -EBUSY; } - ret = sysfs_match_string(energy_perf_strings, buf); - if (ret < 0) - return -EINVAL; - - if (ret) - epp = epp_values[ret]; - else - epp = amd_pstate_get_balanced_epp(policy); + /* + * if the value matches a number, use that, otherwise see if + * matches an index in the energy_perf_strings array + */ + ret = kstrtou8(buf, 0, &epp); + raw_epp = !ret; + if (ret) { + ret = sysfs_match_string(energy_perf_strings, buf); + if (ret < 0 || ret 
== EPP_INDEX_CUSTOM) + return -EINVAL; + if (ret) + epp = epp_values[ret]; + else + epp = amd_pstate_get_balanced_epp(policy); + } if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) { pr_debug("EPP cannot be set under performance policy\n"); @@ -1433,7 +1444,9 @@ static ssize_t store_energy_performance_preference(struct cpufreq_policy *policy if (ret) return ret; - return ret ? ret : count; + cpudata->raw_epp = raw_epp; + + return count; } static ssize_t show_energy_performance_preference(struct cpufreq_policy *policy, char *buf) @@ -1443,6 +1456,9 @@ static ssize_t show_energy_performance_preference(struct cpufreq_policy *policy, epp = FIELD_GET(AMD_CPPC_EPP_PERF_MASK, cpudata->cppc_req_cached); + if (cpudata->raw_epp) + return sysfs_emit(buf, "%u\n", epp); + switch (epp) { case AMD_CPPC_EPP_PERFORMANCE: preference = EPP_INDEX_PERFORMANCE; diff --git a/drivers/cpufreq/amd-pstate.h b/drivers/cpufreq/amd-pstate.h index a7e52f79a802..f7461d1b6bf3 100644 --- a/drivers/cpufreq/amd-pstate.h +++ b/drivers/cpufreq/amd-pstate.h @@ -127,6 +127,7 @@ struct amd_cpudata { u8 epp_default_ac; u8 epp_default_dc; bool dynamic_epp; + bool raw_epp; struct notifier_block power_nb; /* platform profile */ From 7e173bc310d2b1df018edc66334a5304305889a2 Mon Sep 17 00:00:00 2001 From: "Mario Limonciello (AMD)" Date: Sun, 29 Mar 2026 15:38:11 -0500 Subject: [PATCH 19/22] cpufreq/amd-pstate-ut: Add a unit test for raw EPP Ensure that all supported raw EPP values work properly. Export the driver helpers used by the test module so the test can drive raw EPP writes and temporarily disable dynamic EPP while it runs. Reviewed-by: Gautham R. 
Shenoy Signed-off-by: Mario Limonciello (AMD) --- drivers/cpufreq/amd-pstate-ut.c | 109 ++++++++++++++++++++++++++++++++ drivers/cpufreq/amd-pstate.c | 11 ++-- drivers/cpufreq/amd-pstate.h | 4 ++ 3 files changed, 120 insertions(+), 4 deletions(-) diff --git a/drivers/cpufreq/amd-pstate-ut.c b/drivers/cpufreq/amd-pstate-ut.c index 1f62ab6438b4..aa8a464fab47 100644 --- a/drivers/cpufreq/amd-pstate-ut.c +++ b/drivers/cpufreq/amd-pstate-ut.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -41,6 +42,7 @@ static char *test_list; module_param(test_list, charp, 0444); MODULE_PARM_DESC(test_list, "Comma-delimited list of tests to run (empty means run all tests)"); +DEFINE_FREE(cleanup_page, void *, if (_T) free_page((unsigned long)_T)) struct amd_pstate_ut_struct { const char *name; @@ -54,6 +56,7 @@ static int amd_pstate_ut_acpi_cpc_valid(u32 index); static int amd_pstate_ut_check_enabled(u32 index); static int amd_pstate_ut_check_perf(u32 index); static int amd_pstate_ut_check_freq(u32 index); +static int amd_pstate_ut_epp(u32 index); static int amd_pstate_ut_check_driver(u32 index); static int amd_pstate_ut_check_freq_attrs(u32 index); @@ -62,6 +65,7 @@ static struct amd_pstate_ut_struct amd_pstate_ut_cases[] = { {"amd_pstate_ut_check_enabled", amd_pstate_ut_check_enabled }, {"amd_pstate_ut_check_perf", amd_pstate_ut_check_perf }, {"amd_pstate_ut_check_freq", amd_pstate_ut_check_freq }, + {"amd_pstate_ut_epp", amd_pstate_ut_epp }, {"amd_pstate_ut_check_driver", amd_pstate_ut_check_driver }, {"amd_pstate_ut_check_freq_attrs", amd_pstate_ut_check_freq_attrs }, }; @@ -268,6 +272,111 @@ static int amd_pstate_set_mode(enum amd_pstate_mode mode) return amd_pstate_update_status(mode_str, strlen(mode_str)); } +static int amd_pstate_ut_epp(u32 index) +{ + struct cpufreq_policy *policy __free(put_cpufreq_policy) = NULL; + char *buf __free(cleanup_page) = NULL; + static const char * const epp_strings[] = { + "performance", + "balance_performance", + 
"balance_power", + "power", + }; + struct amd_cpudata *cpudata; + enum amd_pstate_mode orig_mode; + bool orig_dynamic_epp; + int ret, cpu = 0; + int i; + u16 epp; + + policy = cpufreq_cpu_get(cpu); + if (!policy) + return -ENODEV; + + cpudata = policy->driver_data; + orig_mode = amd_pstate_get_status(); + orig_dynamic_epp = cpudata->dynamic_epp; + + /* disable dynamic EPP before running test */ + if (cpudata->dynamic_epp) { + pr_debug("Dynamic EPP is enabled, disabling it\n"); + amd_pstate_clear_dynamic_epp(policy); + } + + buf = (char *)__get_free_page(GFP_KERNEL); + if (!buf) + return -ENOMEM; + + ret = amd_pstate_set_mode(AMD_PSTATE_ACTIVE); + if (ret) + goto out; + + for (epp = 0; epp <= U8_MAX; epp++) { + u8 val; + + /* write all EPP values */ + memset(buf, 0, PAGE_SIZE); + snprintf(buf, PAGE_SIZE, "%d", epp); + ret = store_energy_performance_preference(policy, buf, strlen(buf)); + if (ret < 0) + goto out; + + /* check if the EPP value reads back correctly for raw numbers */ + memset(buf, 0, PAGE_SIZE); + ret = show_energy_performance_preference(policy, buf); + if (ret < 0) + goto out; + strreplace(buf, '\n', '\0'); + ret = kstrtou8(buf, 0, &val); + if (!ret && epp != val) { + pr_err("Raw EPP value mismatch: %d != %d\n", epp, val); + ret = -EINVAL; + goto out; + } + } + + for (i = 0; i < ARRAY_SIZE(epp_strings); i++) { + memset(buf, 0, PAGE_SIZE); + snprintf(buf, PAGE_SIZE, "%s", epp_strings[i]); + ret = store_energy_performance_preference(policy, buf, strlen(buf)); + if (ret < 0) + goto out; + + memset(buf, 0, PAGE_SIZE); + ret = show_energy_performance_preference(policy, buf); + if (ret < 0) + goto out; + strreplace(buf, '\n', '\0'); + + if (strcmp(buf, epp_strings[i])) { + pr_err("String EPP value mismatch: %s != %s\n", buf, epp_strings[i]); + ret = -EINVAL; + goto out; + } + } + + ret = 0; + +out: + if (orig_dynamic_epp) { + int ret2; + + ret2 = amd_pstate_set_mode(AMD_PSTATE_DISABLE); + if (!ret && ret2) + ret = ret2; + } + + if (orig_mode != 
amd_pstate_get_status()) { + int ret2; + + ret2 = amd_pstate_set_mode(orig_mode); + if (!ret && ret2) + ret = ret2; + } + + return ret; +} + static int amd_pstate_ut_check_driver(u32 index) { enum amd_pstate_mode mode1, mode2 = AMD_PSTATE_DISABLE; diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 83e937764fab..ca593c209111 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -1257,7 +1257,7 @@ static const struct platform_profile_ops amd_pstate_profile_ops = { .profile_get = amd_pstate_profile_get, }; -static void amd_pstate_clear_dynamic_epp(struct cpufreq_policy *policy) +void amd_pstate_clear_dynamic_epp(struct cpufreq_policy *policy) { struct amd_cpudata *cpudata = policy->driver_data; @@ -1270,6 +1270,7 @@ static void amd_pstate_clear_dynamic_epp(struct cpufreq_policy *policy) kfree(cpudata->profile_name); cpudata->dynamic_epp = false; } +EXPORT_SYMBOL_GPL(amd_pstate_clear_dynamic_epp); static int amd_pstate_set_dynamic_epp(struct cpufreq_policy *policy) { @@ -1406,8 +1407,8 @@ static ssize_t show_energy_performance_available_preferences( return offset; } -static ssize_t store_energy_performance_preference(struct cpufreq_policy *policy, - const char *buf, size_t count) +ssize_t store_energy_performance_preference(struct cpufreq_policy *policy, + const char *buf, size_t count) { struct amd_cpudata *cpudata = policy->driver_data; ssize_t ret; @@ -1448,8 +1449,9 @@ static ssize_t store_energy_performance_preference(struct cpufreq_policy *policy return count; } +EXPORT_SYMBOL_GPL(store_energy_performance_preference); -static ssize_t show_energy_performance_preference(struct cpufreq_policy *policy, char *buf) +ssize_t show_energy_performance_preference(struct cpufreq_policy *policy, char *buf) { struct amd_cpudata *cpudata = policy->driver_data; u8 preference, epp; @@ -1478,6 +1480,7 @@ static ssize_t show_energy_performance_preference(struct cpufreq_policy *policy, return sysfs_emit(buf, "%s\n", 
energy_perf_strings[preference]); } +EXPORT_SYMBOL_GPL(show_energy_performance_preference); static ssize_t store_amd_pstate_floor_freq(struct cpufreq_policy *policy, const char *buf, size_t count) diff --git a/drivers/cpufreq/amd-pstate.h b/drivers/cpufreq/amd-pstate.h index f7461d1b6bf3..e4722e54387b 100644 --- a/drivers/cpufreq/amd-pstate.h +++ b/drivers/cpufreq/amd-pstate.h @@ -150,6 +150,10 @@ enum amd_pstate_mode { const char *amd_pstate_get_mode_string(enum amd_pstate_mode mode); int amd_pstate_get_status(void); int amd_pstate_update_status(const char *buf, size_t size); +ssize_t store_energy_performance_preference(struct cpufreq_policy *policy, + const char *buf, size_t count); +ssize_t show_energy_performance_preference(struct cpufreq_policy *policy, char *buf); +void amd_pstate_clear_dynamic_epp(struct cpufreq_policy *policy); struct freq_attr; From 86d71f1d7686cecebbafb371ad58c6ad7f80a93a Mon Sep 17 00:00:00 2001 From: K Prateek Nayak Date: Mon, 16 Mar 2026 08:18:48 +0000 Subject: [PATCH 20/22] cpufreq/amd-pstate: Pass the policy to amd_pstate_update() All callers of amd_pstate_update() already have a reference to the cpufreq_policy object. Pass the entire policy object and grab the cpudata using "policy->driver_data" instead of passing the cpudata and unnecessarily grabbing another read-side reference to the cpufreq policy object when it is already available in the caller. No functional changes intended. Reviewed-by: Mario Limonciello (AMD) Acked-by: Viresh Kumar Signed-off-by: K Prateek Nayak Reviewed-by: Gautham R. 
Shenoy Link: https://lore.kernel.org/r/20260316081849.19368-2-kprateek.nayak@amd.com Signed-off-by: Mario Limonciello (AMD) --- drivers/cpufreq/amd-pstate.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index ca593c209111..2ea4d27fe020 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -643,15 +643,12 @@ static inline bool amd_pstate_sample(struct amd_cpudata *cpudata) return true; } -static void amd_pstate_update(struct amd_cpudata *cpudata, u8 min_perf, +static void amd_pstate_update(struct cpufreq_policy *policy, u8 min_perf, u8 des_perf, u8 max_perf, bool fast_switch, int gov_flags) { - struct cpufreq_policy *policy __free(put_cpufreq_policy) = cpufreq_cpu_get(cpudata->cpu); + struct amd_cpudata *cpudata = policy->driver_data; union perf_cached perf = READ_ONCE(cpudata->perf); - if (!policy) - return; - /* limit the max perf when core performance boost feature is disabled */ if (!cpudata->boost_supported) max_perf = min_t(u8, perf.nominal_perf, max_perf); @@ -766,7 +763,7 @@ static int amd_pstate_update_freq(struct cpufreq_policy *policy, if (!fast_switch) cpufreq_freq_transition_begin(policy, &freqs); - amd_pstate_update(cpudata, perf.min_limit_perf, des_perf, + amd_pstate_update(policy, perf.min_limit_perf, des_perf, perf.max_limit_perf, fast_switch, policy->governor->flags); @@ -828,7 +825,7 @@ static void amd_pstate_adjust_perf(unsigned int cpu, if (max_perf < min_perf) max_perf = min_perf; - amd_pstate_update(cpudata, min_perf, des_perf, max_perf, true, + amd_pstate_update(policy, min_perf, des_perf, max_perf, true, policy->governor->flags); } From c03791085adcd61fa9b766ab303c7d0941d7378d Mon Sep 17 00:00:00 2001 From: K Prateek Nayak Date: Mon, 16 Mar 2026 08:18:49 +0000 Subject: [PATCH 21/22] cpufreq: Pass the policy to cpufreq_driver->adjust_perf() cpufreq_cpu_get() can sleep on PREEMPT_RT in presence of concurrent writer(s), however 
amd-pstate depends on fetching the cpudata via the policy's driver data which necessitates grabbing the reference. Since schedutil governor can call "cpufreq_driver->update_perf()" during sched_tick/enqueue/dequeue with rq_lock held and IRQs disabled, fetching the policy object using the cpufreq_cpu_get() helper in the scheduler fast-path leads to "BUG: scheduling while atomic" on PREEMPT_RT [1]. Pass the cached cpufreq policy object in sg_policy to the update_perf() instead of just the CPU. The CPU can be inferred using "policy->cpu". The lifetime of cpufreq_policy object outlasts that of the governor and the cpufreq driver (allocated when the CPU is onlined and only reclaimed when the CPU is offlined / the CPU device is removed) which makes it safe to be referenced throughout the governor's lifetime. Closes: https://lore.kernel.org/all/20250731092316.3191-1-spasswolf@web.de/ [1] Fixes: 1d215f0319c2 ("cpufreq: amd-pstate: Add fast switch function for AMD P-State") Reported-by: Bert Karwatzki Acked-by: Viresh Kumar Signed-off-by: K Prateek Nayak Acked-by: Gary Guo # Rust Reviewed-by: Gautham R. 
Shenoy Reviewed-by: Zhongqiu Han Link: https://lore.kernel.org/r/20260316081849.19368-3-kprateek.nayak@amd.com Signed-off-by: Mario Limonciello (AMD) --- drivers/cpufreq/amd-pstate.c | 3 +-- drivers/cpufreq/cpufreq.c | 6 +++--- drivers/cpufreq/intel_pstate.c | 4 ++-- include/linux/cpufreq.h | 4 ++-- kernel/sched/cpufreq_schedutil.c | 5 +++-- rust/kernel/cpufreq.rs | 13 ++++++------- 6 files changed, 17 insertions(+), 18 deletions(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 2ea4d27fe020..c825fab0bf5c 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -788,13 +788,12 @@ static unsigned int amd_pstate_fast_switch(struct cpufreq_policy *policy, return policy->cur; } -static void amd_pstate_adjust_perf(unsigned int cpu, +static void amd_pstate_adjust_perf(struct cpufreq_policy *policy, unsigned long _min_perf, unsigned long target_perf, unsigned long capacity) { u8 max_perf, min_perf, des_perf, cap_perf; - struct cpufreq_policy *policy __free(put_cpufreq_policy) = cpufreq_cpu_get(cpu); struct amd_cpudata *cpudata; union perf_cached perf; diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 277884d91913..90e939069cde 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -2231,7 +2231,7 @@ EXPORT_SYMBOL_GPL(cpufreq_driver_fast_switch); /** * cpufreq_driver_adjust_perf - Adjust CPU performance level in one go. - * @cpu: Target CPU. + * @policy: cpufreq policy object of the target CPU. * @min_perf: Minimum (required) performance level (units of @capacity). * @target_perf: Target (desired) performance level (units of @capacity). * @capacity: Capacity of the target CPU. @@ -2250,12 +2250,12 @@ EXPORT_SYMBOL_GPL(cpufreq_driver_fast_switch); * parallel with either ->target() or ->target_index() or ->fast_switch() for * the same CPU. 
*/ -void cpufreq_driver_adjust_perf(unsigned int cpu, +void cpufreq_driver_adjust_perf(struct cpufreq_policy *policy, unsigned long min_perf, unsigned long target_perf, unsigned long capacity) { - cpufreq_driver->adjust_perf(cpu, min_perf, target_perf, capacity); + cpufreq_driver->adjust_perf(policy, min_perf, target_perf, capacity); } /** diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 11c58af41900..0f50034e4b68 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -3239,12 +3239,12 @@ static unsigned int intel_cpufreq_fast_switch(struct cpufreq_policy *policy, return target_pstate * cpu->pstate.scaling; } -static void intel_cpufreq_adjust_perf(unsigned int cpunum, +static void intel_cpufreq_adjust_perf(struct cpufreq_policy *policy, unsigned long min_perf, unsigned long target_perf, unsigned long capacity) { - struct cpudata *cpu = all_cpu_data[cpunum]; + struct cpudata *cpu = all_cpu_data[policy->cpu]; u64 hwp_cap = READ_ONCE(cpu->hwp_cap_cached); int old_pstate = cpu->pstate.current_pstate; int cap_pstate, min_pstate, max_pstate, target_pstate; diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index cc894fc38971..4317c5a312bd 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -372,7 +372,7 @@ struct cpufreq_driver { * conditions) scale invariance can be disabled, which causes the * schedutil governor to fall back to the latter. 
*/ - void (*adjust_perf)(unsigned int cpu, + void (*adjust_perf)(struct cpufreq_policy *policy, unsigned long min_perf, unsigned long target_perf, unsigned long capacity); @@ -617,7 +617,7 @@ struct cpufreq_governor { /* Pass a target to the cpufreq driver */ unsigned int cpufreq_driver_fast_switch(struct cpufreq_policy *policy, unsigned int target_freq); -void cpufreq_driver_adjust_perf(unsigned int cpu, +void cpufreq_driver_adjust_perf(struct cpufreq_policy *policy, unsigned long min_perf, unsigned long target_perf, unsigned long capacity); diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 153232dd8276..ae9fd211cec1 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -461,6 +461,7 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time, unsigned int flags) { struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); + struct sugov_policy *sg_policy = sg_cpu->sg_policy; unsigned long prev_util = sg_cpu->util; unsigned long max_cap; @@ -482,10 +483,10 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time, if (sugov_hold_freq(sg_cpu) && sg_cpu->util < prev_util) sg_cpu->util = prev_util; - cpufreq_driver_adjust_perf(sg_cpu->cpu, sg_cpu->bw_min, + cpufreq_driver_adjust_perf(sg_policy->policy, sg_cpu->bw_min, sg_cpu->util, max_cap); - sg_cpu->sg_policy->last_freq_update_time = time; + sg_policy->last_freq_update_time = time; } static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) diff --git a/rust/kernel/cpufreq.rs b/rust/kernel/cpufreq.rs index f5adee48d40c..d8d26870bea2 100644 --- a/rust/kernel/cpufreq.rs +++ b/rust/kernel/cpufreq.rs @@ -1257,18 +1257,17 @@ impl Registration { /// # Safety /// /// - This function may only be called from the cpufreq C infrastructure. + /// - The pointer arguments must be valid pointers. 
unsafe extern "C" fn adjust_perf_callback( - cpu: c_uint, + ptr: *mut bindings::cpufreq_policy, min_perf: c_ulong, target_perf: c_ulong, capacity: c_ulong, ) { - // SAFETY: The C API guarantees that `cpu` refers to a valid CPU number. - let cpu_id = unsafe { CpuId::from_u32_unchecked(cpu) }; - - if let Ok(mut policy) = PolicyCpu::from_cpu(cpu_id) { - T::adjust_perf(&mut policy, min_perf, target_perf, capacity); - } + // SAFETY: The `ptr` is guaranteed to be valid by the contract with the C code for the + // lifetime of `policy`. + let policy = unsafe { Policy::from_raw_mut(ptr) }; + T::adjust_perf(policy, min_perf, target_perf, capacity); } /// Driver's `get_intermediate` callback. From 9487e2a00e7b3c6f258c5c99953f470eba6fb61d Mon Sep 17 00:00:00 2001 From: "Gautham R. Shenoy" Date: Thu, 2 Apr 2026 15:56:11 +0530 Subject: [PATCH 22/22] MAINTAINERS: amd-pstate: Step down as maintainer, add Prateek as reviewer Mario Limonciello has led amd-pstate maintenance in recent years and has done excellent work. The amd-pstate driver is in good hands with him. I am stepping down as co-maintainer as I move on to other things. Add K Prateek Nayak as a reviewer. He has been actively contributing to the driver including preferred-core and ITMT improvements, and has been helping review amd-pstate patches for a while now. Signed-off-by: Gautham R. Shenoy Acked-by: K Prateek Nayak Acked-by: Mario Limonciello (AMD) Link: https://lore.kernel.org/r/20260402102611.16519-1-gautham.shenoy@amd.com Signed-off-by: Mario Limonciello (AMD) --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 7d10988cbc62..50723bc3e69e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1234,9 +1234,9 @@ F: drivers/gpu/drm/amd/pm/ AMD PSTATE DRIVER M: Huang Rui -M: Gautham R. Shenoy M: Mario Limonciello R: Perry Yuan +R: K Prateek Nayak L: linux-pm@vger.kernel.org S: Supported F: Documentation/admin-guide/pm/amd-pstate.rst