From 6d4b8d052ff22742c3980fa45f26b0969a9b6163 Mon Sep 17 00:00:00 2001 From: Zide Chen Date: Mon, 15 Dec 2025 10:25:18 -0800 Subject: [PATCH 01/58] perf/x86/intel/cstate: Add Wildcat Lake support Wildcat Lake (WCL) is a low-power variant of Panther Lake. From a C-state profiling perspective, it supports the same residency counters: CC1/CC6/CC7 and PC2/PC6/PC10. Signed-off-by: Zide Chen Signed-off-by: Ingo Molnar Reviewed-by: Dapeng Mi Link: https://patch.msgid.link/20251215182520.115822-1-zide.chen@intel.com --- arch/x86/events/intel/cstate.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index fa67fda6e45b..b719b0a68a2a 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -41,7 +41,7 @@ * MSR_CORE_C1_RES: CORE C1 Residency Counter * perf code: 0x00 * Available model: SLM,AMT,GLM,CNL,ICX,TNT,ADL,RPL - * MTL,SRF,GRR,ARL,LNL,PTL + * MTL,SRF,GRR,ARL,LNL,PTL,WCL * Scope: Core (each processor core has a MSR) * MSR_CORE_C3_RESIDENCY: CORE C3 Residency Counter * perf code: 0x01 @@ -53,19 +53,19 @@ * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW, * SKL,KNL,GLM,CNL,KBL,CML,ICL,ICX, * TGL,TNT,RKL,ADL,RPL,SPR,MTL,SRF, - * GRR,ARL,LNL,PTL + * GRR,ARL,LNL,PTL,WCL * Scope: Core * MSR_CORE_C7_RESIDENCY: CORE C7 Residency Counter * perf code: 0x03 * Available model: SNB,IVB,HSW,BDW,SKL,CNL,KBL,CML, * ICL,TGL,RKL,ADL,RPL,MTL,ARL,LNL, - * PTL + * PTL,WCL * Scope: Core * MSR_PKG_C2_RESIDENCY: Package C2 Residency Counter. * perf code: 0x00 * Available model: SNB,IVB,HSW,BDW,SKL,KNL,GLM,CNL, * KBL,CML,ICL,ICX,TGL,TNT,RKL,ADL, - * RPL,SPR,MTL,ARL,LNL,SRF,PTL + * RPL,SPR,MTL,ARL,LNL,SRF,PTL,WCL * Scope: Package (physical package) * MSR_PKG_C3_RESIDENCY: Package C3 Residency Counter. * perf code: 0x01 @@ -78,7 +78,7 @@ * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW, * SKL,KNL,GLM,CNL,KBL,CML,ICL,ICX, * TGL,TNT,RKL,ADL,RPL,SPR,MTL,SRF, - * ARL,LNL,PTL + * ARL,LNL,PTL,WCL * Scope: Package (physical package) * MSR_PKG_C7_RESIDENCY: Package C7 Residency Counter. * perf code: 0x03 @@ -97,7 +97,8 @@ * MSR_PKG_C10_RESIDENCY: Package C10 Residency Counter. * perf code: 0x06 * Available model: HSW ULT,KBL,GLM,CNL,CML,ICL,TGL, - * TNT,RKL,ADL,RPL,MTL,ARL,LNL,PTL + * TNT,RKL,ADL,RPL,MTL,ARL,LNL,PTL, + * WCL * Scope: Package (physical package) * MSR_MODULE_C6_RES_MS: Module C6 Residency Counter. * perf code: 0x00 @@ -654,6 +655,7 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = { X86_MATCH_VFM(INTEL_ARROWLAKE_U, &adl_cstates), X86_MATCH_VFM(INTEL_LUNARLAKE_M, &lnl_cstates), X86_MATCH_VFM(INTEL_PANTHERLAKE_L, &lnl_cstates), + X86_MATCH_VFM(INTEL_WILDCATLAKE_L, &lnl_cstates), { }, }; MODULE_DEVICE_TABLE(x86cpu, intel_cstates_match); From 7e760ac4617b63628edd55a96be2fc85b7eaa435 Mon Sep 17 00:00:00 2001 From: Zide Chen Date: Mon, 15 Dec 2025 10:25:19 -0800 Subject: [PATCH 02/58] perf/x86/intel/cstate: Add Nova Lake support Similar to Lunar Lake and Panther Lake, Nova Lake supports CC1/CC6/CC7 and PC2/PC6/PC10 residency counters; it also adds support for MC6. 
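As a concrete illustration of how the "perf code" values listed in the cstate.c comment block above are consumed, below is a minimal user-space sketch that opens the core C6 residency event via perf_event_open(). The helper name is hypothetical, the 0x02 config assumes the core C6 "perf code", and the cstate_core PMU type must be read from /sys/bus/event_source/devices/cstate_core/type at runtime.

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Open the core C6 residency event (perf code 0x02) on one CPU. */
static int open_core_c6_residency(int cpu, int cstate_core_pmu_type)
{
	struct perf_event_attr attr = {
		.type   = cstate_core_pmu_type, /* dynamic PMU type from sysfs */
		.size   = sizeof(attr),
		.config = 0x02,                 /* core C6 residency */
	};

	/* System-wide counting on one CPU: pid == -1, cpu >= 0. */
	return syscall(__NR_perf_event_open, &attr, -1, cpu, -1, 0);
}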
Signed-off-by: Zide Chen Signed-off-by: Ingo Molnar Reviewed-by: Dapeng Mi Link: https://patch.msgid.link/20251215182520.115822-2-zide.chen@intel.com --- arch/x86/events/intel/cstate.c | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index b719b0a68a2a..008f8ea59315 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -41,7 +41,7 @@ * MSR_CORE_C1_RES: CORE C1 Residency Counter * perf code: 0x00 * Available model: SLM,AMT,GLM,CNL,ICX,TNT,ADL,RPL - * MTL,SRF,GRR,ARL,LNL,PTL,WCL + * MTL,SRF,GRR,ARL,LNL,PTL,WCL,NVL * Scope: Core (each processor core has a MSR) * MSR_CORE_C3_RESIDENCY: CORE C3 Residency Counter * perf code: 0x01 @@ -53,19 +53,20 @@ * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW, * SKL,KNL,GLM,CNL,KBL,CML,ICL,ICX, * TGL,TNT,RKL,ADL,RPL,SPR,MTL,SRF, - * GRR,ARL,LNL,PTL,WCL + * GRR,ARL,LNL,PTL,WCL,NVL * Scope: Core * MSR_CORE_C7_RESIDENCY: CORE C7 Residency Counter * perf code: 0x03 * Available model: SNB,IVB,HSW,BDW,SKL,CNL,KBL,CML, * ICL,TGL,RKL,ADL,RPL,MTL,ARL,LNL, - * PTL,WCL + * PTL,WCL,NVL * Scope: Core * MSR_PKG_C2_RESIDENCY: Package C2 Residency Counter. * perf code: 0x00 * Available model: SNB,IVB,HSW,BDW,SKL,KNL,GLM,CNL, * KBL,CML,ICL,ICX,TGL,TNT,RKL,ADL, - * RPL,SPR,MTL,ARL,LNL,SRF,PTL,WCL + * RPL,SPR,MTL,ARL,LNL,SRF,PTL,WCL, + * NVL * Scope: Package (physical package) * MSR_PKG_C3_RESIDENCY: Package C3 Residency Counter. * perf code: 0x01 @@ -78,7 +79,7 @@ * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW, * SKL,KNL,GLM,CNL,KBL,CML,ICL,ICX, * TGL,TNT,RKL,ADL,RPL,SPR,MTL,SRF, - * ARL,LNL,PTL,WCL + * ARL,LNL,PTL,WCL,NVL * Scope: Package (physical package) * MSR_PKG_C7_RESIDENCY: Package C7 Residency Counter. * perf code: 0x03 @@ -98,11 +99,11 @@ * perf code: 0x06 * Available model: HSW ULT,KBL,GLM,CNL,CML,ICL,TGL, * TNT,RKL,ADL,RPL,MTL,ARL,LNL,PTL, - * WCL + * WCL,NVL * Scope: Package (physical package) * MSR_MODULE_C6_RES_MS: Module C6 Residency Counter. * perf code: 0x00 - * Available model: SRF,GRR + * Available model: SRF,GRR,NVL * Scope: A cluster of cores shared L2 cache * */ @@ -528,6 +529,18 @@ static const struct cstate_model lnl_cstates __initconst = { BIT(PERF_CSTATE_PKG_C10_RES), }; +static const struct cstate_model nvl_cstates __initconst = { + .core_events = BIT(PERF_CSTATE_CORE_C1_RES) | + BIT(PERF_CSTATE_CORE_C6_RES) | + BIT(PERF_CSTATE_CORE_C7_RES), + + .module_events = BIT(PERF_CSTATE_MODULE_C6_RES), + + .pkg_events = BIT(PERF_CSTATE_PKG_C2_RES) | + BIT(PERF_CSTATE_PKG_C6_RES) | + BIT(PERF_CSTATE_PKG_C10_RES), +}; + static const struct cstate_model slm_cstates __initconst = { .core_events = BIT(PERF_CSTATE_CORE_C1_RES) | BIT(PERF_CSTATE_CORE_C6_RES), @@ -656,6 +669,8 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = { X86_MATCH_VFM(INTEL_LUNARLAKE_M, &lnl_cstates), X86_MATCH_VFM(INTEL_PANTHERLAKE_L, &lnl_cstates), X86_MATCH_VFM(INTEL_WILDCATLAKE_L, &lnl_cstates), + X86_MATCH_VFM(INTEL_NOVALAKE, &nvl_cstates), + X86_MATCH_VFM(INTEL_NOVALAKE_L, &nvl_cstates), { }, }; MODULE_DEVICE_TABLE(x86cpu, intel_cstates_match); From 7ac422cf7b16ec524bcd8e017459e328a4103f63 Mon Sep 17 00:00:00 2001 From: Zide Chen Date: Mon, 15 Dec 2025 10:25:20 -0800 Subject: [PATCH 03/58] perf/x86/intel/cstate: Add Diamond Rapids support From a C-state residency profiling perspective, Diamond Rapids is similar to SRF and GNR, supporting core C1/C6, module C6, and package C2/C6 residency counters. 
Similar to CWF, the C1E residency can be accessed via PMT only. Signed-off-by: Zide Chen Signed-off-by: Ingo Molnar Reviewed-by: Dapeng Mi Link: https://patch.msgid.link/20251215182520.115822-3-zide.chen@intel.com --- arch/x86/events/intel/cstate.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index 008f8ea59315..1e2658b60d91 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -652,6 +652,7 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = { X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, &icx_cstates), X86_MATCH_VFM(INTEL_GRANITERAPIDS_X, &icx_cstates), X86_MATCH_VFM(INTEL_GRANITERAPIDS_D, &icx_cstates), + X86_MATCH_VFM(INTEL_DIAMONDRAPIDS_X, &srf_cstates), X86_MATCH_VFM(INTEL_TIGERLAKE_L, &icl_cstates), X86_MATCH_VFM(INTEL_TIGERLAKE, &icl_cstates), From b825444b6179eb071e66ca3da5ac12d4dbd808d5 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 5 Dec 2025 16:16:37 -0800 Subject: [PATCH 04/58] perf: Skip pmu_ctx based on event_type To optimize the cgroup context switch, the perf_event_pmu_context iteration skips the PMUs without cgroup events. A bool cgroup was introduced to indicate the case. It can work, but this way is hard to extend for other cases, e.g. skipping non-mediated PMUs. It doesn't make sense to keep adding bool variables. Pass the event_type instead of the specific bool variable. Check both the event_type and related pmu_ctx variables to decide whether skipping a PMU. Event flags, e.g., EVENT_CGROUP, should be cleard in the ctx->is_active. Add EVENT_FLAGS to indicate such event flags. No functional change. Signed-off-by: Kan Liang Signed-off-by: Mingwei Zhang Signed-off-by: Sean Christopherson Signed-off-by: Peter Zijlstra (Intel) Tested-by: Yongwei Ma Tested-by: Xudong Hao Link: https://patch.msgid.link/20251206001720.468579-2-seanjc@google.com --- kernel/events/core.c | 74 ++++++++++++++++++++++++-------------------- 1 file changed, 40 insertions(+), 34 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index dad0d3d2e85f..406371ce45f2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -165,7 +165,7 @@ enum event_type_t { /* see ctx_resched() for details */ EVENT_CPU = 0x10, EVENT_CGROUP = 0x20, - + EVENT_FLAGS = EVENT_CGROUP, /* compound helpers */ EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, EVENT_TIME_FROZEN = EVENT_TIME | EVENT_FROZEN, @@ -779,27 +779,37 @@ do { \ ___p; \ }) -#define for_each_epc(_epc, _ctx, _pmu, _cgroup) \ +static bool perf_skip_pmu_ctx(struct perf_event_pmu_context *pmu_ctx, + enum event_type_t event_type) +{ + if ((event_type & EVENT_CGROUP) && !pmu_ctx->nr_cgroups) + return true; + return false; +} + +#define for_each_epc(_epc, _ctx, _pmu, _event_type) \ list_for_each_entry(_epc, &((_ctx)->pmu_ctx_list), pmu_ctx_entry) \ - if (_cgroup && !_epc->nr_cgroups) \ + if (perf_skip_pmu_ctx(_epc, _event_type)) \ continue; \ else if (_pmu && _epc->pmu != _pmu) \ continue; \ else -static void perf_ctx_disable(struct perf_event_context *ctx, bool cgroup) +static void perf_ctx_disable(struct perf_event_context *ctx, + enum event_type_t event_type) { struct perf_event_pmu_context *pmu_ctx; - for_each_epc(pmu_ctx, ctx, NULL, cgroup) + for_each_epc(pmu_ctx, ctx, NULL, event_type) perf_pmu_disable(pmu_ctx->pmu); } -static void perf_ctx_enable(struct perf_event_context *ctx, bool cgroup) +static void perf_ctx_enable(struct perf_event_context *ctx, + enum event_type_t event_type) { struct perf_event_pmu_context *pmu_ctx; - 
for_each_epc(pmu_ctx, ctx, NULL, cgroup) + for_each_epc(pmu_ctx, ctx, NULL, event_type) perf_pmu_enable(pmu_ctx->pmu); } @@ -964,8 +974,7 @@ static void perf_cgroup_switch(struct task_struct *task) return; WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0); - - perf_ctx_disable(&cpuctx->ctx, true); + perf_ctx_disable(&cpuctx->ctx, EVENT_CGROUP); ctx_sched_out(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP); /* @@ -981,7 +990,7 @@ static void perf_cgroup_switch(struct task_struct *task) */ ctx_sched_in(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP); - perf_ctx_enable(&cpuctx->ctx, true); + perf_ctx_enable(&cpuctx->ctx, EVENT_CGROUP); } static int perf_cgroup_ensure_storage(struct perf_event *event, @@ -2902,11 +2911,11 @@ static void ctx_resched(struct perf_cpu_context *cpuctx, event_type &= EVENT_ALL; - for_each_epc(epc, &cpuctx->ctx, pmu, false) + for_each_epc(epc, &cpuctx->ctx, pmu, 0) perf_pmu_disable(epc->pmu); if (task_ctx) { - for_each_epc(epc, task_ctx, pmu, false) + for_each_epc(epc, task_ctx, pmu, 0) perf_pmu_disable(epc->pmu); task_ctx_sched_out(task_ctx, pmu, event_type); @@ -2926,11 +2935,11 @@ static void ctx_resched(struct perf_cpu_context *cpuctx, perf_event_sched_in(cpuctx, task_ctx, pmu); - for_each_epc(epc, &cpuctx->ctx, pmu, false) + for_each_epc(epc, &cpuctx->ctx, pmu, 0) perf_pmu_enable(epc->pmu); if (task_ctx) { - for_each_epc(epc, task_ctx, pmu, false) + for_each_epc(epc, task_ctx, pmu, 0) perf_pmu_enable(epc->pmu); } } @@ -3479,11 +3488,10 @@ static void ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); + enum event_type_t active_type = event_type & ~EVENT_FLAGS; struct perf_event_pmu_context *pmu_ctx; int is_active = ctx->is_active; - bool cgroup = event_type & EVENT_CGROUP; - event_type &= ~EVENT_CGROUP; lockdep_assert_held(&ctx->lock); @@ -3514,7 +3522,7 @@ ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t * see __load_acquire() in perf_event_time_now() */ barrier(); - ctx->is_active &= ~event_type; + ctx->is_active &= ~active_type; if (!(ctx->is_active & EVENT_ALL)) { /* @@ -3535,7 +3543,7 @@ ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t is_active ^= ctx->is_active; /* changed bits */ - for_each_epc(pmu_ctx, ctx, pmu, cgroup) + for_each_epc(pmu_ctx, ctx, pmu, event_type) __pmu_ctx_sched_out(pmu_ctx, is_active); } @@ -3691,7 +3699,7 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next) raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); if (context_equiv(ctx, next_ctx)) { - perf_ctx_disable(ctx, false); + perf_ctx_disable(ctx, 0); /* PMIs are disabled; ctx->nr_no_switch_fast is stable. 
*/ if (local_read(&ctx->nr_no_switch_fast) || @@ -3715,7 +3723,7 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next) perf_ctx_sched_task_cb(ctx, task, false); - perf_ctx_enable(ctx, false); + perf_ctx_enable(ctx, 0); /* * RCU_INIT_POINTER here is safe because we've not @@ -3739,13 +3747,13 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next) if (do_switch) { raw_spin_lock(&ctx->lock); - perf_ctx_disable(ctx, false); + perf_ctx_disable(ctx, 0); inside_switch: perf_ctx_sched_task_cb(ctx, task, false); task_ctx_sched_out(ctx, NULL, EVENT_ALL); - perf_ctx_enable(ctx, false); + perf_ctx_enable(ctx, 0); raw_spin_unlock(&ctx->lock); } } @@ -4054,11 +4062,9 @@ static void ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); + enum event_type_t active_type = event_type & ~EVENT_FLAGS; struct perf_event_pmu_context *pmu_ctx; int is_active = ctx->is_active; - bool cgroup = event_type & EVENT_CGROUP; - - event_type &= ~EVENT_CGROUP; lockdep_assert_held(&ctx->lock); @@ -4076,7 +4082,7 @@ ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t barrier(); } - ctx->is_active |= (event_type | EVENT_TIME); + ctx->is_active |= active_type | EVENT_TIME; if (ctx->task) { if (!(is_active & EVENT_ALL)) cpuctx->task_ctx = ctx; @@ -4091,13 +4097,13 @@ ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t * in order to give them the best chance of going on. */ if (is_active & EVENT_PINNED) { - for_each_epc(pmu_ctx, ctx, pmu, cgroup) + for_each_epc(pmu_ctx, ctx, pmu, event_type) __pmu_ctx_sched_in(pmu_ctx, EVENT_PINNED); } /* Then walk through the lower prio flexible groups */ if (is_active & EVENT_FLEXIBLE) { - for_each_epc(pmu_ctx, ctx, pmu, cgroup) + for_each_epc(pmu_ctx, ctx, pmu, event_type) __pmu_ctx_sched_in(pmu_ctx, EVENT_FLEXIBLE); } } @@ -4114,11 +4120,11 @@ static void perf_event_context_sched_in(struct task_struct *task) if (cpuctx->task_ctx == ctx) { perf_ctx_lock(cpuctx, ctx); - perf_ctx_disable(ctx, false); + perf_ctx_disable(ctx, 0); perf_ctx_sched_task_cb(ctx, task, true); - perf_ctx_enable(ctx, false); + perf_ctx_enable(ctx, 0); perf_ctx_unlock(cpuctx, ctx); goto rcu_unlock; } @@ -4131,7 +4137,7 @@ static void perf_event_context_sched_in(struct task_struct *task) if (!ctx->nr_events) goto unlock; - perf_ctx_disable(ctx, false); + perf_ctx_disable(ctx, 0); /* * We want to keep the following priority order: * cpu pinned (that don't need to move), task pinned, @@ -4141,7 +4147,7 @@ static void perf_event_context_sched_in(struct task_struct *task) * events, no need to flip the cpuctx's events around. */ if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) { - perf_ctx_disable(&cpuctx->ctx, false); + perf_ctx_disable(&cpuctx->ctx, 0); ctx_sched_out(&cpuctx->ctx, NULL, EVENT_FLEXIBLE); } @@ -4150,9 +4156,9 @@ static void perf_event_context_sched_in(struct task_struct *task) perf_ctx_sched_task_cb(cpuctx->task_ctx, task, true); if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) - perf_ctx_enable(&cpuctx->ctx, false); + perf_ctx_enable(&cpuctx->ctx, 0); - perf_ctx_enable(ctx, false); + perf_ctx_enable(ctx, 0); unlock: perf_ctx_unlock(cpuctx, ctx); From b9e52b11d2e5e403afaf69a7f8d6b29f8380ed38 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 5 Dec 2025 16:16:38 -0800 Subject: [PATCH 05/58] perf: Add generic exclude_guest support Only KVM knows the exact time when a guest is entering/exiting. 
Expose two interfaces to KVM to switch the ownership of the PMU resources. All the pinned events must be scheduled in first. Extend the perf_event_sched_in() helper to support extra flag, e.g., EVENT_GUEST. Signed-off-by: Kan Liang Signed-off-by: Mingwei Zhang Signed-off-by: Sean Christopherson Signed-off-by: Peter Zijlstra (Intel) Tested-by: Xudong Hao Link: https://patch.msgid.link/20251206001720.468579-3-seanjc@google.com --- kernel/events/core.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 406371ce45f2..fab358daa42e 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2870,14 +2870,15 @@ static void task_ctx_sched_out(struct perf_event_context *ctx, static void perf_event_sched_in(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, - struct pmu *pmu) + struct pmu *pmu, + enum event_type_t event_type) { - ctx_sched_in(&cpuctx->ctx, pmu, EVENT_PINNED); + ctx_sched_in(&cpuctx->ctx, pmu, EVENT_PINNED | event_type); if (ctx) - ctx_sched_in(ctx, pmu, EVENT_PINNED); - ctx_sched_in(&cpuctx->ctx, pmu, EVENT_FLEXIBLE); + ctx_sched_in(ctx, pmu, EVENT_PINNED | event_type); + ctx_sched_in(&cpuctx->ctx, pmu, EVENT_FLEXIBLE | event_type); if (ctx) - ctx_sched_in(ctx, pmu, EVENT_FLEXIBLE); + ctx_sched_in(ctx, pmu, EVENT_FLEXIBLE | event_type); } /* @@ -2933,7 +2934,7 @@ static void ctx_resched(struct perf_cpu_context *cpuctx, else if (event_type & EVENT_PINNED) ctx_sched_out(&cpuctx->ctx, pmu, EVENT_FLEXIBLE); - perf_event_sched_in(cpuctx, task_ctx, pmu); + perf_event_sched_in(cpuctx, task_ctx, pmu, 0); for_each_epc(epc, &cpuctx->ctx, pmu, 0) perf_pmu_enable(epc->pmu); @@ -4151,7 +4152,7 @@ static void perf_event_context_sched_in(struct task_struct *task) ctx_sched_out(&cpuctx->ctx, NULL, EVENT_FLEXIBLE); } - perf_event_sched_in(cpuctx, ctx, NULL); + perf_event_sched_in(cpuctx, ctx, NULL, 0); perf_ctx_sched_task_cb(cpuctx->task_ctx, task, true); From 991bdf7e9d6cc74c1de215d1a05c23ff61076bf0 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 5 Dec 2025 16:16:39 -0800 Subject: [PATCH 06/58] perf: Move security_perf_event_free() call to __free_event() Move the freeing of any security state associated with a perf event from _free_event() to __free_event(), i.e. invoke security_perf_event_free() in the error paths for perf_event_alloc(). This will allow adding potential error paths in perf_event_alloc() that can occur after allocating security state. Note, kfree() and thus security_perf_event_free() is a nop if event->security is NULL, i.e. calling security_perf_event_free() even if security_perf_event_alloc() fails or is never reached is functionality ok. Signed-off-by: Sean Christopherson Signed-off-by: Peter Zijlstra (Intel) Tested-by: Xudong Hao Link: https://patch.msgid.link/20251206001720.468579-4-seanjc@google.com --- kernel/events/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index fab358daa42e..6973483d0dfa 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5601,6 +5601,8 @@ static void __free_event(struct perf_event *event) { struct pmu *pmu = event->pmu; + security_perf_event_free(event); + if (event->attach_state & PERF_ATTACH_CALLCHAIN) put_callchain_buffers(); @@ -5664,8 +5666,6 @@ static void _free_event(struct perf_event *event) unaccount_event(event); - security_perf_event_free(event); - if (event->rb) { /* * Can happen when we close an event with re-directed output. 
From eff95e170275d9e80b968f335cd03d0ac250d2d1 Mon Sep 17 00:00:00 2001
From: Kan Liang
Date: Fri, 5 Dec 2025 16:16:40 -0800
Subject: [PATCH 07/58] perf: Add APIs to create/release mediated guest vPMUs

Currently, exposing PMU capabilities to a KVM guest is done by emulating
guest PMCs via host perf events, i.e. by having KVM be "just" another user
of perf. As a result, the guest and host are effectively competing for
resources, and emulating guest accesses to vPMU resources requires
expensive actions (expensive relative to the native instruction). The
overhead and resource competition result in degraded guest performance and
ultimately very poor vPMU accuracy.

To address the issues with the perf-emulated vPMU, introduce a "mediated
vPMU", where the data plane (PMCs and enable/disable knobs) is exposed
directly to the guest, but the control plane (event selectors and access
to fixed counters) is managed by KVM (via MSR interceptions).

To allow host perf usage of the PMU to (partially) co-exist with KVM/guest
usage of the PMU, KVM and perf will coordinate a world switch between host
perf context and guest vPMU context near VM-Enter/VM-Exit.

Add two exported APIs, perf_{create,release}_mediated_pmu(), to allow KVM
to create and release a mediated PMU instance (per VM).

Because host perf context will be deactivated while the guest is running,
mediated PMU usage will be mutually exclusive with perf analysis of the
guest, i.e. perf events that do NOT exclude the guest will not behave as
expected. To avoid silent failure of !exclude_guest perf events, disallow
creating a mediated PMU if there are active !exclude_guest events, and on
the perf side, disallow creating new !exclude_guest perf events while
there is at least one active mediated PMU.

Exempt PMU resources that do not support mediated PMU usage, i.e. that are
outside the scope/view of KVM's vPMU and will not be swapped out while the
guest is running.

Guard mediated PMU with a new kconfig to help readers identify code paths
that are unique to mediated PMU support, and to allow for adding
arch-specific hooks without stubs. KVM x86 is expected to be the only KVM
architecture to support a mediated PMU in the near future (e.g. arm64 is
trending toward a partitioned PMU implementation), and KVM x86 will select
PERF_GUEST_MEDIATED_PMU unconditionally, i.e. won't need stubs.

Immediately select PERF_GUEST_MEDIATED_PMU when KVM x86 is enabled so that
all paths are compile tested. Full KVM support is on its way...
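To make the intended usage concrete, here is a minimal sketch, assuming a KVM-like caller, of how the two new APIs pair up at VM creation and teardown; example_vm_init()/example_vm_destroy() are placeholder call sites, not the actual KVM wiring (which lands later in the series).

#include <linux/perf_event.h>

static int example_vm_init(void)
{
	/*
	 * Fails (-EBUSY) if !exclude_guest events are already active on a
	 * PMU with PERF_PMU_CAP_MEDIATED_VPMU; on success, creating new
	 * such events is rejected until the mediated PMU is released.
	 */
	return perf_create_mediated_pmu();
}

static void example_vm_destroy(void)
{
	/* Must pair with a successful perf_create_mediated_pmu(). */
	perf_release_mediated_pmu();
}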
[sean: add kconfig and WARNing, rewrite changelog, swizzle patch ordering] Suggested-by: Sean Christopherson Signed-off-by: Kan Liang Signed-off-by: Mingwei Zhang Signed-off-by: Sean Christopherson Signed-off-by: Peter Zijlstra (Intel) Tested-by: Xudong Hao Link: https://patch.msgid.link/20251206001720.468579-5-seanjc@google.com --- arch/x86/kvm/Kconfig | 1 + include/linux/perf_event.h | 6 +++ init/Kconfig | 4 ++ kernel/events/core.c | 82 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 93 insertions(+) diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 278f08194ec8..d916bd766c94 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -37,6 +37,7 @@ config KVM_X86 select SCHED_INFO select PERF_EVENTS select GUEST_PERF_EVENTS + select PERF_GUEST_MEDIATED_PMU select HAVE_KVM_MSI select HAVE_KVM_CPU_RELAX_INTERCEPT select HAVE_KVM_NO_POLL diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 9870d768db4c..31929da6e711 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -305,6 +305,7 @@ struct perf_event_pmu_context; #define PERF_PMU_CAP_EXTENDED_HW_TYPE 0x0100 #define PERF_PMU_CAP_AUX_PAUSE 0x0200 #define PERF_PMU_CAP_AUX_PREFER_LARGE 0x0400 +#define PERF_PMU_CAP_MEDIATED_VPMU 0x0800 /** * pmu::scope @@ -1914,6 +1915,11 @@ extern int perf_event_account_interrupt(struct perf_event *event); extern int perf_event_period(struct perf_event *event, u64 value); extern u64 perf_event_pause(struct perf_event *event, bool reset); +#ifdef CONFIG_PERF_GUEST_MEDIATED_PMU +int perf_create_mediated_pmu(void); +void perf_release_mediated_pmu(void); +#endif + #else /* !CONFIG_PERF_EVENTS: */ static inline void * diff --git a/init/Kconfig b/init/Kconfig index fa79feb8fe57..6628ff295cb8 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -2061,6 +2061,10 @@ config GUEST_PERF_EVENTS bool depends on HAVE_PERF_EVENTS +config PERF_GUEST_MEDIATED_PMU + bool + depends on GUEST_PERF_EVENTS + config PERF_USE_VMALLOC bool help diff --git a/kernel/events/core.c b/kernel/events/core.c index 6973483d0dfa..5a2166ba6138 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5656,6 +5656,8 @@ static void __free_event(struct perf_event *event) call_rcu(&event->rcu_head, free_event_rcu); } +static void mediated_pmu_unaccount_event(struct perf_event *event); + DEFINE_FREE(__free_event, struct perf_event *, if (_T) __free_event(_T)) /* vs perf_event_alloc() success */ @@ -5665,6 +5667,7 @@ static void _free_event(struct perf_event *event) irq_work_sync(&event->pending_disable_irq); unaccount_event(event); + mediated_pmu_unaccount_event(event); if (event->rb) { /* @@ -6187,6 +6190,81 @@ u64 perf_event_pause(struct perf_event *event, bool reset) } EXPORT_SYMBOL_GPL(perf_event_pause); +#ifdef CONFIG_PERF_GUEST_MEDIATED_PMU +static atomic_t nr_include_guest_events __read_mostly; + +static atomic_t nr_mediated_pmu_vms __read_mostly; +static DEFINE_MUTEX(perf_mediated_pmu_mutex); + +/* !exclude_guest event of PMU with PERF_PMU_CAP_MEDIATED_VPMU */ +static inline bool is_include_guest_event(struct perf_event *event) +{ + if ((event->pmu->capabilities & PERF_PMU_CAP_MEDIATED_VPMU) && + !event->attr.exclude_guest) + return true; + + return false; +} + +static int mediated_pmu_account_event(struct perf_event *event) +{ + if (!is_include_guest_event(event)) + return 0; + + guard(mutex)(&perf_mediated_pmu_mutex); + + if (atomic_read(&nr_mediated_pmu_vms)) + return -EOPNOTSUPP; + + atomic_inc(&nr_include_guest_events); + return 0; +} + +static void 
mediated_pmu_unaccount_event(struct perf_event *event) +{ + if (!is_include_guest_event(event)) + return; + + atomic_dec(&nr_include_guest_events); +} + +/* + * Currently invoked at VM creation to + * - Check whether there are existing !exclude_guest events of PMU with + * PERF_PMU_CAP_MEDIATED_VPMU + * - Set nr_mediated_pmu_vms to prevent !exclude_guest event creation on + * PMUs with PERF_PMU_CAP_MEDIATED_VPMU + * + * No impact for the PMU without PERF_PMU_CAP_MEDIATED_VPMU. The perf + * still owns all the PMU resources. + */ +int perf_create_mediated_pmu(void) +{ + guard(mutex)(&perf_mediated_pmu_mutex); + if (atomic_inc_not_zero(&nr_mediated_pmu_vms)) + return 0; + + if (atomic_read(&nr_include_guest_events)) + return -EBUSY; + + atomic_inc(&nr_mediated_pmu_vms); + return 0; +} +EXPORT_SYMBOL_GPL(perf_create_mediated_pmu); + +void perf_release_mediated_pmu(void) +{ + if (WARN_ON_ONCE(!atomic_read(&nr_mediated_pmu_vms))) + return; + + atomic_dec(&nr_mediated_pmu_vms); +} +EXPORT_SYMBOL_GPL(perf_release_mediated_pmu); +#else +static int mediated_pmu_account_event(struct perf_event *event) { return 0; } +static void mediated_pmu_unaccount_event(struct perf_event *event) {} +#endif + /* * Holding the top-level event's child_mutex means that any * descendant process that has inherited this event will block @@ -13147,6 +13225,10 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, if (err) return ERR_PTR(err); + err = mediated_pmu_account_event(event); + if (err) + return ERR_PTR(err); + /* symmetric to unaccount_event() in _free_event() */ account_event(event); From f5c7de8f84a152d559256aa4d0fc953118b73ca4 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 5 Dec 2025 16:16:41 -0800 Subject: [PATCH 08/58] perf: Clean up perf ctx time The current perf tracks two timestamps for the normal ctx and cgroup. The same type of variables and similar codes are used to track the timestamps. In the following patch, the third timestamp to track the guest time will be introduced. To avoid the code duplication, add a new struct perf_time_ctx and factor out a generic function update_perf_time_ctx(). No functional change. Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Kan Liang Signed-off-by: Mingwei Zhang Signed-off-by: Sean Christopherson Signed-off-by: Peter Zijlstra (Intel) Tested-by: Xudong Hao Link: https://patch.msgid.link/20251206001720.468579-6-seanjc@google.com --- include/linux/perf_event.h | 13 +++---- kernel/events/core.c | 70 +++++++++++++++++--------------------- 2 files changed, 39 insertions(+), 44 deletions(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 31929da6e711..d5aa1bc3f088 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -999,6 +999,11 @@ struct perf_event_groups { u64 index; }; +struct perf_time_ctx { + u64 time; + u64 stamp; + u64 offset; +}; /** * struct perf_event_context - event context structure @@ -1037,9 +1042,7 @@ struct perf_event_context { /* * Context clock, runs when context enabled. */ - u64 time; - u64 timestamp; - u64 timeoffset; + struct perf_time_ctx time; /* * These fields let us detect when two contexts have both @@ -1172,9 +1175,7 @@ struct bpf_perf_event_data_kern { * This is a per-cpu dynamically allocated data structure. 
*/ struct perf_cgroup_info { - u64 time; - u64 timestamp; - u64 timeoffset; + struct perf_time_ctx time; int active; }; diff --git a/kernel/events/core.c b/kernel/events/core.c index 5a2166ba6138..95f118230ff5 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -816,6 +816,24 @@ static void perf_ctx_enable(struct perf_event_context *ctx, static void ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type); static void ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type); +static inline void update_perf_time_ctx(struct perf_time_ctx *time, u64 now, bool adv) +{ + if (adv) + time->time += now - time->stamp; + time->stamp = now; + + /* + * The above: time' = time + (now - timestamp), can be re-arranged + * into: time` = now + (time - timestamp), which gives a single value + * offset to compute future time without locks on. + * + * See perf_event_time_now(), which can be used from NMI context where + * it's (obviously) not possible to acquire ctx->lock in order to read + * both the above values in a consistent manner. + */ + WRITE_ONCE(time->offset, time->time - time->stamp); +} + #ifdef CONFIG_CGROUP_PERF static inline bool @@ -857,7 +875,7 @@ static inline u64 perf_cgroup_event_time(struct perf_event *event) struct perf_cgroup_info *t; t = per_cpu_ptr(event->cgrp->info, event->cpu); - return t->time; + return t->time.time; } static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now) @@ -866,22 +884,11 @@ static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now) t = per_cpu_ptr(event->cgrp->info, event->cpu); if (!__load_acquire(&t->active)) - return t->time; - now += READ_ONCE(t->timeoffset); + return t->time.time; + now += READ_ONCE(t->time.offset); return now; } -static inline void __update_cgrp_time(struct perf_cgroup_info *info, u64 now, bool adv) -{ - if (adv) - info->time += now - info->timestamp; - info->timestamp = now; - /* - * see update_context_time() - */ - WRITE_ONCE(info->timeoffset, info->time - info->timestamp); -} - static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, bool final) { struct perf_cgroup *cgrp = cpuctx->cgrp; @@ -895,7 +902,7 @@ static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, cgrp = container_of(css, struct perf_cgroup, css); info = this_cpu_ptr(cgrp->info); - __update_cgrp_time(info, now, true); + update_perf_time_ctx(&info->time, now, true); if (final) __store_release(&info->active, 0); } @@ -918,7 +925,7 @@ static inline void update_cgrp_time_from_event(struct perf_event *event) * Do not update time when cgroup is not active */ if (info->active) - __update_cgrp_time(info, perf_clock(), true); + update_perf_time_ctx(&info->time, perf_clock(), true); } static inline void @@ -942,7 +949,7 @@ perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx) for (css = &cgrp->css; css; css = css->parent) { cgrp = container_of(css, struct perf_cgroup, css); info = this_cpu_ptr(cgrp->info); - __update_cgrp_time(info, ctx->timestamp, false); + update_perf_time_ctx(&info->time, ctx->time.stamp, false); __store_release(&info->active, 1); } } @@ -1563,20 +1570,7 @@ static void __update_context_time(struct perf_event_context *ctx, bool adv) lockdep_assert_held(&ctx->lock); - if (adv) - ctx->time += now - ctx->timestamp; - ctx->timestamp = now; - - /* - * The above: time' = time + (now - timestamp), can be re-arranged - * into: time` = now + (time - timestamp), which gives a single value - * 
offset to compute future time without locks on. - * - * See perf_event_time_now(), which can be used from NMI context where - * it's (obviously) not possible to acquire ctx->lock in order to read - * both the above values in a consistent manner. - */ - WRITE_ONCE(ctx->timeoffset, ctx->time - ctx->timestamp); + update_perf_time_ctx(&ctx->time, now, adv); } static void update_context_time(struct perf_event_context *ctx) @@ -1594,7 +1588,7 @@ static u64 perf_event_time(struct perf_event *event) if (is_cgroup_event(event)) return perf_cgroup_event_time(event); - return ctx->time; + return ctx->time.time; } static u64 perf_event_time_now(struct perf_event *event, u64 now) @@ -1608,9 +1602,9 @@ static u64 perf_event_time_now(struct perf_event *event, u64 now) return perf_cgroup_event_time_now(event, now); if (!(__load_acquire(&ctx->is_active) & EVENT_TIME)) - return ctx->time; + return ctx->time.time; - now += READ_ONCE(ctx->timeoffset); + now += READ_ONCE(ctx->time.offset); return now; } @@ -12113,7 +12107,7 @@ static void task_clock_event_update(struct perf_event *event, u64 now) static void task_clock_event_start(struct perf_event *event, int flags) { event->hw.state = 0; - local64_set(&event->hw.prev_count, event->ctx->time); + local64_set(&event->hw.prev_count, event->ctx->time.time); perf_swevent_start_hrtimer(event); } @@ -12122,7 +12116,7 @@ static void task_clock_event_stop(struct perf_event *event, int flags) event->hw.state = PERF_HES_STOPPED; perf_swevent_cancel_hrtimer(event); if (flags & PERF_EF_UPDATE) - task_clock_event_update(event, event->ctx->time); + task_clock_event_update(event, event->ctx->time.time); } static int task_clock_event_add(struct perf_event *event, int flags) @@ -12142,8 +12136,8 @@ static void task_clock_event_del(struct perf_event *event, int flags) static void task_clock_event_read(struct perf_event *event) { u64 now = perf_clock(); - u64 delta = now - event->ctx->timestamp; - u64 time = event->ctx->time + delta; + u64 delta = now - event->ctx->time.stamp; + u64 time = event->ctx->time.time + delta; task_clock_event_update(event, time); } From 4593b4b6e218a0f21afbacc8124cf469d2d04094 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 5 Dec 2025 16:16:42 -0800 Subject: [PATCH 09/58] perf: Add a EVENT_GUEST flag Current perf doesn't explicitly schedule out all exclude_guest events while the guest is running. There is no problem with the current emulated vPMU. Because perf owns all the PMU counters. It can mask the counter which is assigned to an exclude_guest event when a guest is running (Intel way), or set the corresponding HOSTONLY bit in evsentsel (AMD way). The counter doesn't count when a guest is running. However, either way doesn't work with the introduced mediated vPMU. A guest owns all the PMU counters when it's running. The host should not mask any counters. The counter may be used by the guest. The evsentsel may be overwritten. Perf should explicitly schedule out all exclude_guest events to release the PMU resources when entering a guest, and resume the counting when exiting the guest. It's possible that an exclude_guest event is created when a guest is running. The new event should not be scheduled in as well. The ctx time is shared among different PMUs. The time cannot be stopped when a guest is running. It is required to calculate the time for events from other PMUs, e.g., uncore events. Add timeguest to track the guest run time. For an exclude_guest event, the elapsed time equals the ctx time - guest time. Cgroup has dedicated times. 
Use the same method to deduct the guest time from the cgroup time as well. [sean: massage comments] Co-developed-by: Peter Zijlstra (Intel) Signed-off-by: Kan Liang Signed-off-by: Mingwei Zhang Signed-off-by: Sean Christopherson Signed-off-by: Peter Zijlstra (Intel) Tested-by: Xudong Hao Link: https://patch.msgid.link/20251206001720.468579-7-seanjc@google.com --- include/linux/perf_event.h | 6 + kernel/events/core.c | 234 ++++++++++++++++++++++++++++--------- 2 files changed, 187 insertions(+), 53 deletions(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index d5aa1bc3f088..d9988e3fd557 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1044,6 +1044,11 @@ struct perf_event_context { */ struct perf_time_ctx time; + /* + * Context clock, runs when in the guest mode. + */ + struct perf_time_ctx timeguest; + /* * These fields let us detect when two contexts have both * been cloned (inherited) from a common ancestor. @@ -1176,6 +1181,7 @@ struct bpf_perf_event_data_kern { */ struct perf_cgroup_info { struct perf_time_ctx time; + struct perf_time_ctx timeguest; int active; }; diff --git a/kernel/events/core.c b/kernel/events/core.c index 95f118230ff5..6781d39f3158 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -165,7 +165,19 @@ enum event_type_t { /* see ctx_resched() for details */ EVENT_CPU = 0x10, EVENT_CGROUP = 0x20, - EVENT_FLAGS = EVENT_CGROUP, + + /* + * EVENT_GUEST is set when scheduling in/out events between the host + * and a guest with a mediated vPMU. Among other things, EVENT_GUEST + * is used: + * + * - In for_each_epc() to skip PMUs that don't support events in a + * MEDIATED_VPMU guest, i.e. don't need to be context switched. + * - To indicate the start/end point of the events in a guest. Guest + * running time is deducted for host-only (exclude_guest) events. 
+ */ + EVENT_GUEST = 0x40, + EVENT_FLAGS = EVENT_CGROUP | EVENT_GUEST, /* compound helpers */ EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, EVENT_TIME_FROZEN = EVENT_TIME | EVENT_FROZEN, @@ -458,6 +470,11 @@ static cpumask_var_t perf_online_pkg_mask; static cpumask_var_t perf_online_sys_mask; static struct kmem_cache *perf_event_cache; +static __always_inline bool is_guest_mediated_pmu_loaded(void) +{ + return false; +} + /* * perf event paranoia level: * -1 - not paranoid at all @@ -784,6 +801,9 @@ static bool perf_skip_pmu_ctx(struct perf_event_pmu_context *pmu_ctx, { if ((event_type & EVENT_CGROUP) && !pmu_ctx->nr_cgroups) return true; + if ((event_type & EVENT_GUEST) && + !(pmu_ctx->pmu->capabilities & PERF_PMU_CAP_MEDIATED_VPMU)) + return true; return false; } @@ -834,6 +854,39 @@ static inline void update_perf_time_ctx(struct perf_time_ctx *time, u64 now, boo WRITE_ONCE(time->offset, time->time - time->stamp); } +static_assert(offsetof(struct perf_event_context, timeguest) - + offsetof(struct perf_event_context, time) == + sizeof(struct perf_time_ctx)); + +#define T_TOTAL 0 +#define T_GUEST 1 + +static inline u64 __perf_event_time_ctx(struct perf_event *event, + struct perf_time_ctx *times) +{ + u64 time = times[T_TOTAL].time; + + if (event->attr.exclude_guest) + time -= times[T_GUEST].time; + + return time; +} + +static inline u64 __perf_event_time_ctx_now(struct perf_event *event, + struct perf_time_ctx *times, + u64 now) +{ + if (is_guest_mediated_pmu_loaded() && event->attr.exclude_guest) { + /* + * (now + times[total].offset) - (now + times[guest].offset) := + * times[total].offset - times[guest].offset + */ + return READ_ONCE(times[T_TOTAL].offset) - READ_ONCE(times[T_GUEST].offset); + } + + return now + READ_ONCE(times[T_TOTAL].offset); +} + #ifdef CONFIG_CGROUP_PERF static inline bool @@ -870,12 +923,16 @@ static inline int is_cgroup_event(struct perf_event *event) return event->cgrp != NULL; } +static_assert(offsetof(struct perf_cgroup_info, timeguest) - + offsetof(struct perf_cgroup_info, time) == + sizeof(struct perf_time_ctx)); + static inline u64 perf_cgroup_event_time(struct perf_event *event) { struct perf_cgroup_info *t; t = per_cpu_ptr(event->cgrp->info, event->cpu); - return t->time.time; + return __perf_event_time_ctx(event, &t->time); } static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now) @@ -884,9 +941,21 @@ static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now) t = per_cpu_ptr(event->cgrp->info, event->cpu); if (!__load_acquire(&t->active)) - return t->time.time; - now += READ_ONCE(t->time.offset); - return now; + return __perf_event_time_ctx(event, &t->time); + + return __perf_event_time_ctx_now(event, &t->time, now); +} + +static inline void __update_cgrp_guest_time(struct perf_cgroup_info *info, u64 now, bool adv) +{ + update_perf_time_ctx(&info->timeguest, now, adv); +} + +static inline void update_cgrp_time(struct perf_cgroup_info *info, u64 now) +{ + update_perf_time_ctx(&info->time, now, true); + if (is_guest_mediated_pmu_loaded()) + __update_cgrp_guest_time(info, now, true); } static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, bool final) @@ -902,7 +971,7 @@ static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, cgrp = container_of(css, struct perf_cgroup, css); info = this_cpu_ptr(cgrp->info); - update_perf_time_ctx(&info->time, now, true); + update_cgrp_time(info, now); if (final) __store_release(&info->active, 0); } @@ -925,11 +994,11 @@ static 
inline void update_cgrp_time_from_event(struct perf_event *event) * Do not update time when cgroup is not active */ if (info->active) - update_perf_time_ctx(&info->time, perf_clock(), true); + update_cgrp_time(info, perf_clock()); } static inline void -perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx) +perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx, bool guest) { struct perf_event_context *ctx = &cpuctx->ctx; struct perf_cgroup *cgrp = cpuctx->cgrp; @@ -949,8 +1018,12 @@ perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx) for (css = &cgrp->css; css; css = css->parent) { cgrp = container_of(css, struct perf_cgroup, css); info = this_cpu_ptr(cgrp->info); - update_perf_time_ctx(&info->time, ctx->time.stamp, false); - __store_release(&info->active, 1); + if (guest) { + __update_cgrp_guest_time(info, ctx->time.stamp, false); + } else { + update_perf_time_ctx(&info->time, ctx->time.stamp, false); + __store_release(&info->active, 1); + } } } @@ -1154,7 +1227,7 @@ static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event, } static inline void -perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx) +perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx, bool guest) { } @@ -1566,16 +1639,24 @@ static void perf_unpin_context(struct perf_event_context *ctx) */ static void __update_context_time(struct perf_event_context *ctx, bool adv) { - u64 now = perf_clock(); - lockdep_assert_held(&ctx->lock); - update_perf_time_ctx(&ctx->time, now, adv); + update_perf_time_ctx(&ctx->time, perf_clock(), adv); +} + +static void __update_context_guest_time(struct perf_event_context *ctx, bool adv) +{ + lockdep_assert_held(&ctx->lock); + + /* must be called after __update_context_time(); */ + update_perf_time_ctx(&ctx->timeguest, ctx->time.stamp, adv); } static void update_context_time(struct perf_event_context *ctx) { __update_context_time(ctx, true); + if (is_guest_mediated_pmu_loaded()) + __update_context_guest_time(ctx, true); } static u64 perf_event_time(struct perf_event *event) @@ -1588,7 +1669,7 @@ static u64 perf_event_time(struct perf_event *event) if (is_cgroup_event(event)) return perf_cgroup_event_time(event); - return ctx->time.time; + return __perf_event_time_ctx(event, &ctx->time); } static u64 perf_event_time_now(struct perf_event *event, u64 now) @@ -1602,10 +1683,9 @@ static u64 perf_event_time_now(struct perf_event *event, u64 now) return perf_cgroup_event_time_now(event, now); if (!(__load_acquire(&ctx->is_active) & EVENT_TIME)) - return ctx->time.time; + return __perf_event_time_ctx(event, &ctx->time); - now += READ_ONCE(ctx->time.offset); - return now; + return __perf_event_time_ctx_now(event, &ctx->time, now); } static enum event_type_t get_event_type(struct perf_event *event) @@ -2425,20 +2505,23 @@ group_sched_out(struct perf_event *group_event, struct perf_event_context *ctx) } static inline void -__ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, bool final) +__ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, + bool final, enum event_type_t event_type) { if (ctx->is_active & EVENT_TIME) { if (ctx->is_active & EVENT_FROZEN) return; + update_context_time(ctx); - update_cgrp_time_from_cpuctx(cpuctx, final); + /* vPMU should not stop time */ + update_cgrp_time_from_cpuctx(cpuctx, !(event_type & EVENT_GUEST) && final); } } static inline void ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { - __ctx_time_update(cpuctx, ctx, false); + 
__ctx_time_update(cpuctx, ctx, false, 0); } /* @@ -3510,7 +3593,7 @@ ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t * * would only update time for the pinned events. */ - __ctx_time_update(cpuctx, ctx, ctx == &cpuctx->ctx); + __ctx_time_update(cpuctx, ctx, ctx == &cpuctx->ctx, event_type); /* * CPU-release for the below ->is_active store, @@ -3536,7 +3619,18 @@ ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t cpuctx->task_ctx = NULL; } - is_active ^= ctx->is_active; /* changed bits */ + if (event_type & EVENT_GUEST) { + /* + * Schedule out all exclude_guest events of PMU + * with PERF_PMU_CAP_MEDIATED_VPMU. + */ + is_active = EVENT_ALL; + __update_context_guest_time(ctx, false); + perf_cgroup_set_timestamp(cpuctx, true); + barrier(); + } else { + is_active ^= ctx->is_active; /* changed bits */ + } for_each_epc(pmu_ctx, ctx, pmu, event_type) __pmu_ctx_sched_out(pmu_ctx, is_active); @@ -3995,10 +4089,15 @@ static inline void group_update_userpage(struct perf_event *group_event) event_update_userpage(event); } +struct merge_sched_data { + int can_add_hw; + enum event_type_t event_type; +}; + static int merge_sched_in(struct perf_event *event, void *data) { struct perf_event_context *ctx = event->ctx; - int *can_add_hw = data; + struct merge_sched_data *msd = data; if (event->state <= PERF_EVENT_STATE_OFF) return 0; @@ -4006,13 +4105,22 @@ static int merge_sched_in(struct perf_event *event, void *data) if (!event_filter_match(event)) return 0; - if (group_can_go_on(event, *can_add_hw)) { + /* + * Don't schedule in any host events from PMU with + * PERF_PMU_CAP_MEDIATED_VPMU, while a guest is running. + */ + if (is_guest_mediated_pmu_loaded() && + event->pmu_ctx->pmu->capabilities & PERF_PMU_CAP_MEDIATED_VPMU && + !(msd->event_type & EVENT_GUEST)) + return 0; + + if (group_can_go_on(event, msd->can_add_hw)) { if (!group_sched_in(event, ctx)) list_add_tail(&event->active_list, get_event_list(event)); } if (event->state == PERF_EVENT_STATE_INACTIVE) { - *can_add_hw = 0; + msd->can_add_hw = 0; if (event->attr.pinned) { perf_cgroup_event_disable(event, ctx); perf_event_set_state(event, PERF_EVENT_STATE_ERROR); @@ -4035,11 +4143,15 @@ static int merge_sched_in(struct perf_event *event, void *data) static void pmu_groups_sched_in(struct perf_event_context *ctx, struct perf_event_groups *groups, - struct pmu *pmu) + struct pmu *pmu, + enum event_type_t event_type) { - int can_add_hw = 1; + struct merge_sched_data msd = { + .can_add_hw = 1, + .event_type = event_type, + }; visit_groups_merge(ctx, groups, smp_processor_id(), pmu, - merge_sched_in, &can_add_hw); + merge_sched_in, &msd); } static void __pmu_ctx_sched_in(struct perf_event_pmu_context *pmu_ctx, @@ -4048,9 +4160,9 @@ static void __pmu_ctx_sched_in(struct perf_event_pmu_context *pmu_ctx, struct perf_event_context *ctx = pmu_ctx->ctx; if (event_type & EVENT_PINNED) - pmu_groups_sched_in(ctx, &ctx->pinned_groups, pmu_ctx->pmu); + pmu_groups_sched_in(ctx, &ctx->pinned_groups, pmu_ctx->pmu, event_type); if (event_type & EVENT_FLEXIBLE) - pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu_ctx->pmu); + pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu_ctx->pmu, event_type); } static void @@ -4067,9 +4179,11 @@ ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t return; if (!(is_active & EVENT_TIME)) { + /* EVENT_TIME should be active while the guest runs */ + WARN_ON_ONCE(event_type & EVENT_GUEST); /* start ctx time */ __update_context_time(ctx, false); 
- perf_cgroup_set_timestamp(cpuctx); + perf_cgroup_set_timestamp(cpuctx, false); /* * CPU-release for the below ->is_active store, * see __load_acquire() in perf_event_time_now() @@ -4085,7 +4199,23 @@ ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t WARN_ON_ONCE(cpuctx->task_ctx != ctx); } - is_active ^= ctx->is_active; /* changed bits */ + if (event_type & EVENT_GUEST) { + /* + * Schedule in the required exclude_guest events of PMU + * with PERF_PMU_CAP_MEDIATED_VPMU. + */ + is_active = event_type & EVENT_ALL; + + /* + * Update ctx time to set the new start time for + * the exclude_guest events. + */ + update_context_time(ctx); + update_cgrp_time_from_cpuctx(cpuctx, false); + barrier(); + } else { + is_active ^= ctx->is_active; /* changed bits */ + } /* * First go through the list and put on any pinned groups @@ -4093,13 +4223,13 @@ ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t */ if (is_active & EVENT_PINNED) { for_each_epc(pmu_ctx, ctx, pmu, event_type) - __pmu_ctx_sched_in(pmu_ctx, EVENT_PINNED); + __pmu_ctx_sched_in(pmu_ctx, EVENT_PINNED | (event_type & EVENT_GUEST)); } /* Then walk through the lower prio flexible groups */ if (is_active & EVENT_FLEXIBLE) { for_each_epc(pmu_ctx, ctx, pmu, event_type) - __pmu_ctx_sched_in(pmu_ctx, EVENT_FLEXIBLE); + __pmu_ctx_sched_in(pmu_ctx, EVENT_FLEXIBLE | (event_type & EVENT_GUEST)); } } @@ -6626,23 +6756,23 @@ void perf_event_update_userpage(struct perf_event *event) if (!rb) goto unlock; - /* - * compute total_time_enabled, total_time_running - * based on snapshot values taken when the event - * was last scheduled in. - * - * we cannot simply called update_context_time() - * because of locking issue as we can be called in - * NMI context - */ - calc_timer_values(event, &now, &enabled, &running); - - userpg = rb->user_page; /* * Disable preemption to guarantee consistent time stamps are stored to * the user page. */ preempt_disable(); + + /* + * Compute total_time_enabled, total_time_running based on snapshot + * values taken when the event was last scheduled in. + * + * We cannot simply call update_context_time() because doing so would + * lead to deadlock when called from NMI context. + */ + calc_timer_values(event, &now, &enabled, &running); + + userpg = rb->user_page; + ++userpg->lock; barrier(); userpg->index = perf_event_index(event); @@ -7939,13 +8069,11 @@ static void perf_output_read(struct perf_output_handle *handle, u64 read_format = event->attr.read_format; /* - * compute total_time_enabled, total_time_running - * based on snapshot values taken when the event - * was last scheduled in. + * Compute total_time_enabled, total_time_running based on snapshot + * values taken when the event was last scheduled in. * - * we cannot simply called update_context_time() - * because of locking issue as we are called in - * NMI context + * We cannot simply call update_context_time() because doing so would + * lead to deadlock when called from NMI context. */ if (read_format & PERF_FORMAT_TOTAL_TIMES) calc_timer_values(event, &now, &enabled, &running); From 42457a7fb6cacca83be4deaf202ac3e45830daf2 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 5 Dec 2025 16:16:43 -0800 Subject: [PATCH 10/58] perf: Add APIs to load/put guest mediated PMU context Add exported APIs to load/put a guest mediated PMU context. KVM will load the guest PMU shortly before VM-Enter, and put the guest PMU shortly after VM-Exit. 
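For illustration, a minimal sketch of the expected call pattern, assuming a simplified vCPU run loop; example_vcpu_run() is hypothetical, while the two APIs and the IRQs-disabled requirement come from the patch below.

#include <linux/irqflags.h>
#include <linux/perf_event.h>

static void example_vcpu_run(void)
{
	local_irq_disable();

	/* Host exclude_guest events are scheduled out here. */
	perf_load_guest_context();

	/* ... load guest PMU state, enter the guest, handle VM-Exit ... */

	/* Host exclude_guest events are scheduled back in. */
	perf_put_guest_context();

	local_irq_enable();
}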
On the perf side of things, schedule out all exclude_guest events when the guest context is loaded, and schedule them back in when the guest context is put. I.e. yield the hardware PMU resources to the guest, by way of KVM. Note, perf is only responsible for managing host context. KVM is responsible for loading/storing guest state to/from hardware. [sean: shuffle patches around, write changelog] Suggested-by: Sean Christopherson Signed-off-by: Kan Liang Signed-off-by: Mingwei Zhang Signed-off-by: Sean Christopherson Signed-off-by: Peter Zijlstra (Intel) Tested-by: Xudong Hao Link: https://patch.msgid.link/20251206001720.468579-8-seanjc@google.com --- include/linux/perf_event.h | 2 ++ kernel/events/core.c | 61 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 63 insertions(+) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index d9988e3fd557..322cfa9f3d48 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1925,6 +1925,8 @@ extern u64 perf_event_pause(struct perf_event *event, bool reset); #ifdef CONFIG_PERF_GUEST_MEDIATED_PMU int perf_create_mediated_pmu(void); void perf_release_mediated_pmu(void); +void perf_load_guest_context(void); +void perf_put_guest_context(void); #endif #else /* !CONFIG_PERF_EVENTS: */ diff --git a/kernel/events/core.c b/kernel/events/core.c index 6781d39f3158..bbb81a4a3196 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -470,10 +470,19 @@ static cpumask_var_t perf_online_pkg_mask; static cpumask_var_t perf_online_sys_mask; static struct kmem_cache *perf_event_cache; +#ifdef CONFIG_PERF_GUEST_MEDIATED_PMU +static DEFINE_PER_CPU(bool, guest_ctx_loaded); + +static __always_inline bool is_guest_mediated_pmu_loaded(void) +{ + return __this_cpu_read(guest_ctx_loaded); +} +#else static __always_inline bool is_guest_mediated_pmu_loaded(void) { return false; } +#endif /* * perf event paranoia level: @@ -6384,6 +6393,58 @@ void perf_release_mediated_pmu(void) atomic_dec(&nr_mediated_pmu_vms); } EXPORT_SYMBOL_GPL(perf_release_mediated_pmu); + +/* When loading a guest's mediated PMU, schedule out all exclude_guest events. 
*/ +void perf_load_guest_context(void) +{ + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); + + lockdep_assert_irqs_disabled(); + + guard(perf_ctx_lock)(cpuctx, cpuctx->task_ctx); + + if (WARN_ON_ONCE(__this_cpu_read(guest_ctx_loaded))) + return; + + perf_ctx_disable(&cpuctx->ctx, EVENT_GUEST); + ctx_sched_out(&cpuctx->ctx, NULL, EVENT_GUEST); + if (cpuctx->task_ctx) { + perf_ctx_disable(cpuctx->task_ctx, EVENT_GUEST); + task_ctx_sched_out(cpuctx->task_ctx, NULL, EVENT_GUEST); + } + + perf_ctx_enable(&cpuctx->ctx, EVENT_GUEST); + if (cpuctx->task_ctx) + perf_ctx_enable(cpuctx->task_ctx, EVENT_GUEST); + + __this_cpu_write(guest_ctx_loaded, true); +} +EXPORT_SYMBOL_GPL(perf_load_guest_context); + +void perf_put_guest_context(void) +{ + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); + + lockdep_assert_irqs_disabled(); + + guard(perf_ctx_lock)(cpuctx, cpuctx->task_ctx); + + if (WARN_ON_ONCE(!__this_cpu_read(guest_ctx_loaded))) + return; + + perf_ctx_disable(&cpuctx->ctx, EVENT_GUEST); + if (cpuctx->task_ctx) + perf_ctx_disable(cpuctx->task_ctx, EVENT_GUEST); + + perf_event_sched_in(cpuctx, cpuctx->task_ctx, NULL, EVENT_GUEST); + + if (cpuctx->task_ctx) + perf_ctx_enable(cpuctx->task_ctx, EVENT_GUEST); + perf_ctx_enable(&cpuctx->ctx, EVENT_GUEST); + + __this_cpu_write(guest_ctx_loaded, false); +} +EXPORT_SYMBOL_GPL(perf_put_guest_context); #else static int mediated_pmu_account_event(struct perf_event *event) { return 0; } static void mediated_pmu_unaccount_event(struct perf_event *event) {} From a05385d84b2af64600fc84b027bea481e8f6261d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 5 Dec 2025 16:16:44 -0800 Subject: [PATCH 11/58] perf/x86/core: Register a new vector for handling mediated guest PMIs Wire up system vector 0xf5 for handling PMIs (i.e. interrupts delivered through the LVTPC) while running KVM guests with a mediated PMU. Perf currently delivers all PMIs as NMIs, e.g. so that events that trigger while IRQs are disabled aren't delayed and generate useless records, but due to the multiplexing of NMIs throughout the system, correctly identifying NMIs for a mediated PMU is practically infeasible. To (greatly) simplify identifying guest mediated PMU PMIs, perf will switch the CPU's LVTPC between PERF_GUEST_MEDIATED_PMI_VECTOR and NMI when guest PMU context is loaded/put. I.e. PMIs that are generated by the CPU while the guest is active will be identified purely based on the IRQ vector. Route the vector through perf, e.g. as opposed to letting KVM attach a handler directly a la posted interrupt notification vectors, as perf owns the LVTPC and thus is the rightful owner of PERF_GUEST_MEDIATED_PMI_VECTOR. Functionally, having KVM directly own the vector would be fine (both KVM and perf will be completely aware of when a mediated PMU is active), but would lead to an undesirable split in ownership: perf would be responsible for installing the vector, but not handling the resulting IRQs. Add a new perf_guest_info_callbacks hook (and static call) to allow KVM to register its handler with perf when running guests with mediated PMUs. Note, because KVM always runs guests with host IRQs enabled, there is no danger of a PMI being delayed from the guest's perspective due to using a regular IRQ instead of an NMI. 
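As a rough sketch of how the new hook is expected to be consumed, assuming a KVM-like user; the handler body and the struct instance below are placeholders, not the actual KVM change.

#include <linux/perf_event.h>

static void example_handle_mediated_pmi(void)
{
	/* e.g. forward the PMI to the vCPU that was running on this CPU */
}

static struct perf_guest_info_callbacks example_guest_cbs = {
	/* .state, .get_ip, .handle_intel_pt_intr omitted for brevity */
	.handle_mediated_pmi = example_handle_mediated_pmi,
};

static void example_init(void)
{
	/* Installs the static-call targets, including the new PMI hook. */
	perf_register_guest_info_callbacks(&example_guest_cbs);
}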
Signed-off-by: Sean Christopherson Signed-off-by: Peter Zijlstra (Intel) Tested-by: Xudong Hao Link: https://patch.msgid.link/20251206001720.468579-9-seanjc@google.com --- arch/x86/entry/entry_fred.c | 1 + arch/x86/include/asm/hardirq.h | 3 +++ arch/x86/include/asm/idtentry.h | 6 ++++++ arch/x86/include/asm/irq_vectors.h | 4 +++- arch/x86/kernel/idt.c | 3 +++ arch/x86/kernel/irq.c | 19 +++++++++++++++++++ include/linux/perf_event.h | 8 ++++++++ kernel/events/core.c | 9 +++++++-- .../beauty/arch/x86/include/asm/irq_vectors.h | 3 ++- virt/kvm/kvm_main.c | 3 +++ 10 files changed, 55 insertions(+), 4 deletions(-) diff --git a/arch/x86/entry/entry_fred.c b/arch/x86/entry/entry_fred.c index 94e626cc6a07..a9b72997103d 100644 --- a/arch/x86/entry/entry_fred.c +++ b/arch/x86/entry/entry_fred.c @@ -114,6 +114,7 @@ static idtentry_t sysvec_table[NR_SYSTEM_VECTORS] __ro_after_init = { SYSVEC(IRQ_WORK_VECTOR, irq_work), + SYSVEC(PERF_GUEST_MEDIATED_PMI_VECTOR, perf_guest_mediated_pmi_handler), SYSVEC(POSTED_INTR_VECTOR, kvm_posted_intr_ipi), SYSVEC(POSTED_INTR_WAKEUP_VECTOR, kvm_posted_intr_wakeup_ipi), SYSVEC(POSTED_INTR_NESTED_VECTOR, kvm_posted_intr_nested_ipi), diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index 6b6d472baa0b..9314642ae93c 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -18,6 +18,9 @@ typedef struct { unsigned int kvm_posted_intr_ipis; unsigned int kvm_posted_intr_wakeup_ipis; unsigned int kvm_posted_intr_nested_ipis; +#endif +#ifdef CONFIG_GUEST_PERF_EVENTS + unsigned int perf_guest_mediated_pmis; #endif unsigned int x86_platform_ipis; /* arch dependent */ unsigned int apic_perf_irqs; diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index 3218770670d3..42bf6a58ec36 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -746,6 +746,12 @@ DECLARE_IDTENTRY_SYSVEC(POSTED_INTR_NESTED_VECTOR, sysvec_kvm_posted_intr_nested # define fred_sysvec_kvm_posted_intr_nested_ipi NULL #endif +# ifdef CONFIG_GUEST_PERF_EVENTS +DECLARE_IDTENTRY_SYSVEC(PERF_GUEST_MEDIATED_PMI_VECTOR, sysvec_perf_guest_mediated_pmi_handler); +#else +# define fred_sysvec_perf_guest_mediated_pmi_handler NULL +#endif + # ifdef CONFIG_X86_POSTED_MSI DECLARE_IDTENTRY_SYSVEC(POSTED_MSI_NOTIFICATION_VECTOR, sysvec_posted_msi_notification); #else diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 47051871b436..85253fc8e384 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -77,7 +77,9 @@ */ #define IRQ_WORK_VECTOR 0xf6 -/* 0xf5 - unused, was UV_BAU_MESSAGE */ +/* IRQ vector for PMIs when running a guest with a mediated PMU. 
*/ +#define PERF_GUEST_MEDIATED_PMI_VECTOR 0xf5 + #define DEFERRED_ERROR_VECTOR 0xf4 /* Vector on which hypervisor callbacks will be delivered */ diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index f445bec516a0..260456588756 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -158,6 +158,9 @@ static const __initconst struct idt_data apic_idts[] = { INTG(POSTED_INTR_WAKEUP_VECTOR, asm_sysvec_kvm_posted_intr_wakeup_ipi), INTG(POSTED_INTR_NESTED_VECTOR, asm_sysvec_kvm_posted_intr_nested_ipi), # endif +#ifdef CONFIG_GUEST_PERF_EVENTS + INTG(PERF_GUEST_MEDIATED_PMI_VECTOR, asm_sysvec_perf_guest_mediated_pmi_handler), +#endif # ifdef CONFIG_IRQ_WORK INTG(IRQ_WORK_VECTOR, asm_sysvec_irq_work), # endif diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 86f4e574de02..d56185b49a0e 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -192,6 +192,13 @@ int arch_show_interrupts(struct seq_file *p, int prec) irq_stats(j)->kvm_posted_intr_wakeup_ipis); seq_puts(p, " Posted-interrupt wakeup event\n"); #endif +#ifdef CONFIG_GUEST_PERF_EVENTS + seq_printf(p, "%*s: ", prec, "VPMI"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", + irq_stats(j)->perf_guest_mediated_pmis); + seq_puts(p, " Perf Guest Mediated PMI\n"); +#endif #ifdef CONFIG_X86_POSTED_MSI seq_printf(p, "%*s: ", prec, "PMN"); for_each_online_cpu(j) @@ -349,6 +356,18 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_x86_platform_ipi) } #endif +#ifdef CONFIG_GUEST_PERF_EVENTS +/* + * Handler for PERF_GUEST_MEDIATED_PMI_VECTOR. + */ +DEFINE_IDTENTRY_SYSVEC(sysvec_perf_guest_mediated_pmi_handler) +{ + apic_eoi(); + inc_irq_stat(perf_guest_mediated_pmis); + perf_guest_handle_mediated_pmi(); +} +#endif + #if IS_ENABLED(CONFIG_KVM) static void dummy_handler(void) {} static void (*kvm_posted_intr_wakeup_handler)(void) = dummy_handler; diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 322cfa9f3d48..82e617fad165 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1677,6 +1677,8 @@ struct perf_guest_info_callbacks { unsigned int (*state)(void); unsigned long (*get_ip)(void); unsigned int (*handle_intel_pt_intr)(void); + + void (*handle_mediated_pmi)(void); }; #ifdef CONFIG_GUEST_PERF_EVENTS @@ -1686,6 +1688,7 @@ extern struct perf_guest_info_callbacks __rcu *perf_guest_cbs; DECLARE_STATIC_CALL(__perf_guest_state, *perf_guest_cbs->state); DECLARE_STATIC_CALL(__perf_guest_get_ip, *perf_guest_cbs->get_ip); DECLARE_STATIC_CALL(__perf_guest_handle_intel_pt_intr, *perf_guest_cbs->handle_intel_pt_intr); +DECLARE_STATIC_CALL(__perf_guest_handle_mediated_pmi, *perf_guest_cbs->handle_mediated_pmi); static inline unsigned int perf_guest_state(void) { @@ -1702,6 +1705,11 @@ static inline unsigned int perf_guest_handle_intel_pt_intr(void) return static_call(__perf_guest_handle_intel_pt_intr)(); } +static inline void perf_guest_handle_mediated_pmi(void) +{ + static_call(__perf_guest_handle_mediated_pmi)(); +} + extern void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs); extern void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs); diff --git a/kernel/events/core.c b/kernel/events/core.c index bbb81a4a3196..dd842a4ca789 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7644,6 +7644,7 @@ struct perf_guest_info_callbacks __rcu *perf_guest_cbs; DEFINE_STATIC_CALL_RET0(__perf_guest_state, *perf_guest_cbs->state); DEFINE_STATIC_CALL_RET0(__perf_guest_get_ip, *perf_guest_cbs->get_ip); 
DEFINE_STATIC_CALL_RET0(__perf_guest_handle_intel_pt_intr, *perf_guest_cbs->handle_intel_pt_intr); +DEFINE_STATIC_CALL_RET0(__perf_guest_handle_mediated_pmi, *perf_guest_cbs->handle_mediated_pmi); void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) { @@ -7658,6 +7659,10 @@ void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) if (cbs->handle_intel_pt_intr) static_call_update(__perf_guest_handle_intel_pt_intr, cbs->handle_intel_pt_intr); + + if (cbs->handle_mediated_pmi) + static_call_update(__perf_guest_handle_mediated_pmi, + cbs->handle_mediated_pmi); } EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks); @@ -7669,8 +7674,8 @@ void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) rcu_assign_pointer(perf_guest_cbs, NULL); static_call_update(__perf_guest_state, (void *)&__static_call_return0); static_call_update(__perf_guest_get_ip, (void *)&__static_call_return0); - static_call_update(__perf_guest_handle_intel_pt_intr, - (void *)&__static_call_return0); + static_call_update(__perf_guest_handle_intel_pt_intr, (void *)&__static_call_return0); + static_call_update(__perf_guest_handle_mediated_pmi, (void *)&__static_call_return0); synchronize_rcu(); } EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); diff --git a/tools/perf/trace/beauty/arch/x86/include/asm/irq_vectors.h b/tools/perf/trace/beauty/arch/x86/include/asm/irq_vectors.h index 47051871b436..6e1d5b955aae 100644 --- a/tools/perf/trace/beauty/arch/x86/include/asm/irq_vectors.h +++ b/tools/perf/trace/beauty/arch/x86/include/asm/irq_vectors.h @@ -77,7 +77,8 @@ */ #define IRQ_WORK_VECTOR 0xf6 -/* 0xf5 - unused, was UV_BAU_MESSAGE */ +#define PERF_GUEST_MEDIATED_PMI_VECTOR 0xf5 + #define DEFERRED_ERROR_VECTOR 0xf4 /* Vector on which hypervisor callbacks will be delivered */ diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 5fcd401a5897..21a0d226d63f 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -6467,11 +6467,14 @@ static struct perf_guest_info_callbacks kvm_guest_cbs = { .state = kvm_guest_state, .get_ip = kvm_guest_get_ip, .handle_intel_pt_intr = NULL, + .handle_mediated_pmi = NULL, }; void kvm_register_perf_callbacks(unsigned int (*pt_intr_handler)(void)) { kvm_guest_cbs.handle_intel_pt_intr = pt_intr_handler; + kvm_guest_cbs.handle_mediated_pmi = NULL; + perf_register_guest_info_callbacks(&kvm_guest_cbs); } void kvm_unregister_perf_callbacks(void) From 560ac136f25da2da44a8b68d581adfdc8230b7e2 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 5 Dec 2025 16:16:45 -0800 Subject: [PATCH 12/58] perf/x86/core: Add APIs to switch to/from mediated PMI vector (for KVM) Add APIs (exported only for KVM) to switch PMIs to the dedicated mediated PMU IRQ vector when loading guest context, and back to perf's standard NMI when the guest context is put. I.e. route PMIs to PERF_GUEST_MEDIATED_PMI_VECTOR when the guest context is active, and to NMIs while the host context is active. While running with guest context loaded, ignore all NMIs (in perf). Any NMI that arrives while the LVTPC points at the mediated PMU IRQ vector can't possibly be due to a host perf event. 
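To make the intended usage concrete, here is a rough sketch of the expected ordering on the KVM side; the exact sequencing is established by later KVM patches, so treat this as an assumption rather than the final call sites (guest_lvtpc stands for the guest's virtual LVTPC value, of which only the mask bit is carried over, as the implementation below shows):

    local_irq_disable();
    perf_load_guest_context();              /* quiesce host exclude_guest events */
    perf_load_guest_lvtpc(guest_lvtpc);     /* LVTPC -> PERF_GUEST_MEDIATED_PMI_VECTOR */

    /* ... enter the guest with its mediated PMU context loaded ... */

    perf_put_guest_lvtpc();                 /* LVTPC -> NMI */
    perf_put_guest_context();               /* reschedule host exclude_guest events */
    local_irq_enable();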
Signed-off-by: Sean Christopherson Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20251206001720.468579-10-seanjc@google.com --- arch/x86/events/core.c | 32 +++++++++++++++++++++++++++++++ arch/x86/include/asm/perf_event.h | 5 +++++ 2 files changed, 37 insertions(+) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 0c38a31d5fc7..3ad5c658e286 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -56,6 +56,8 @@ DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { .pmu = &pmu, }; +static DEFINE_PER_CPU(bool, guest_lvtpc_loaded); + DEFINE_STATIC_KEY_FALSE(rdpmc_never_available_key); DEFINE_STATIC_KEY_FALSE(rdpmc_always_available_key); DEFINE_STATIC_KEY_FALSE(perf_is_hybrid); @@ -1760,6 +1762,25 @@ void perf_events_lapic_init(void) apic_write(APIC_LVTPC, APIC_DM_NMI); } +#ifdef CONFIG_PERF_GUEST_MEDIATED_PMU +void perf_load_guest_lvtpc(u32 guest_lvtpc) +{ + u32 masked = guest_lvtpc & APIC_LVT_MASKED; + + apic_write(APIC_LVTPC, + APIC_DM_FIXED | PERF_GUEST_MEDIATED_PMI_VECTOR | masked); + this_cpu_write(guest_lvtpc_loaded, true); +} +EXPORT_SYMBOL_FOR_MODULES(perf_load_guest_lvtpc, "kvm"); + +void perf_put_guest_lvtpc(void) +{ + this_cpu_write(guest_lvtpc_loaded, false); + apic_write(APIC_LVTPC, APIC_DM_NMI); +} +EXPORT_SYMBOL_FOR_MODULES(perf_put_guest_lvtpc, "kvm"); +#endif /* CONFIG_PERF_GUEST_MEDIATED_PMU */ + static int perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs) { @@ -1767,6 +1788,17 @@ perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs) u64 finish_clock; int ret; + /* + * Ignore all NMIs when the CPU's LVTPC is configured to route PMIs to + * PERF_GUEST_MEDIATED_PMI_VECTOR, i.e. when an NMI time can't be due + * to a PMI. Attempting to handle a PMI while the guest's context is + * loaded will generate false positives and clobber guest state. Note, + * the LVTPC is switched to/from the dedicated mediated PMI IRQ vector + * while host events are quiesced. + */ + if (this_cpu_read(guest_lvtpc_loaded)) + return NMI_DONE; + /* * All PMUs/events that share this PMI handler should make sure to * increment active_events for their events. diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 7276ba70c88a..fb7b261357bf 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -759,6 +759,11 @@ static inline void perf_events_lapic_init(void) { } static inline void perf_check_microcode(void) { } #endif +#ifdef CONFIG_PERF_GUEST_MEDIATED_PMU +extern void perf_load_guest_lvtpc(u32 guest_lvtpc); +extern void perf_put_guest_lvtpc(void); +#endif + #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL) extern struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr, void *data); extern void x86_perf_get_lbr(struct x86_pmu_lbr *lbr); From b456a6ba5756b6fb7e651775343e713bd08418e7 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Fri, 5 Dec 2025 16:16:46 -0800 Subject: [PATCH 13/58] perf/x86/core: Do not set bit width for unavailable counters Not all x86 processors have fixed counters. It may also be the case that a processor has only fixed counters and no general-purpose counters. Set the bit widths corresponding to each counter type only if such counters are available. 
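As a quick illustration of the intended behaviour (sketch only, not part of the patch), a caller on a hypothetical fixed-counter-only PMU now sees a zero general-purpose bit width instead of a stale value:

    struct x86_pmu_capability cap;

    perf_get_x86_pmu_capability(&cap);
    /*
     * On a PMU with no general-purpose counters, cap.num_counters_gp is 0
     * and, with this fix, cap.bit_width_gp is 0 as well, instead of
     * reporting x86_pmu.cntval_bits for counters that do not exist.
     */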
Fixes: b3d9468a8bd2 ("perf, x86: Expose perf capability to other modules") Signed-off-by: Sandipan Das Co-developed-by: Dapeng Mi Signed-off-by: Dapeng Mi Signed-off-by: Mingwei Zhang Signed-off-by: Sean Christopherson Signed-off-by: Peter Zijlstra (Intel) Tested-by: Xudong Hao Link: https://patch.msgid.link/20251206001720.468579-11-seanjc@google.com --- arch/x86/events/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 3ad5c658e286..3f7838810cc5 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -3105,8 +3105,8 @@ void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap) cap->version = x86_pmu.version; cap->num_counters_gp = x86_pmu_num_counters(NULL); cap->num_counters_fixed = x86_pmu_num_counters_fixed(NULL); - cap->bit_width_gp = x86_pmu.cntval_bits; - cap->bit_width_fixed = x86_pmu.cntval_bits; + cap->bit_width_gp = cap->num_counters_gp ? x86_pmu.cntval_bits : 0; + cap->bit_width_fixed = cap->num_counters_fixed ? x86_pmu.cntval_bits : 0; cap->events_mask = (unsigned int)x86_pmu.events_maskl; cap->events_mask_len = x86_pmu.events_mask_len; cap->pebs_ept = x86_pmu.pebs_ept; From c8824a95d9673763a0a9d642f8c79b2162296923 Mon Sep 17 00:00:00 2001 From: Mingwei Zhang Date: Fri, 5 Dec 2025 16:16:47 -0800 Subject: [PATCH 14/58] perf/x86/core: Plumb mediated PMU capability from x86_pmu to x86_pmu_cap Plumb the mediated PMU capability to x86_pmu_cap in order to let any kernel entity such as KVM know that the host PMU supports the mediated PMU mode and has the implementation. Signed-off-by: Mingwei Zhang Signed-off-by: Sean Christopherson Signed-off-by: Peter Zijlstra (Intel) Tested-by: Xudong Hao Link: https://patch.msgid.link/20251206001720.468579-12-seanjc@google.com --- arch/x86/events/core.c | 1 + arch/x86/include/asm/perf_event.h | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 3f7838810cc5..df7a32be9914 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -3110,6 +3110,7 @@ void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap) cap->events_mask = (unsigned int)x86_pmu.events_maskl; cap->events_mask_len = x86_pmu.events_mask_len; cap->pebs_ept = x86_pmu.pebs_ept; + cap->mediated = !!(pmu.capabilities & PERF_PMU_CAP_MEDIATED_VPMU); } EXPORT_SYMBOL_FOR_KVM(perf_get_x86_pmu_capability); diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index fb7b261357bf..0d9af4135e0a 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -301,6 +301,7 @@ struct x86_pmu_capability { unsigned int events_mask; int events_mask_len; unsigned int pebs_ept :1; + unsigned int mediated :1; }; /* From 4280d79587a3fd4bf9415705536fe385467c5f44 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 5 Dec 2025 16:16:48 -0800 Subject: [PATCH 15/58] perf/x86/intel: Support PERF_PMU_CAP_MEDIATED_VPMU Apply the PERF_PMU_CAP_MEDIATED_VPMU flag for the Intel core PMU. It only indicates that the perf side of the core PMU is ready to support the mediated vPMU. Besides the capability, the hypervisor, a.k.a. KVM, still needs to check the PMU version and other PMU features/capabilities to decide whether to enable support for mediated vPMUs.
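As an illustration of the kind of check the hypervisor is expected to make (a sketch under assumptions, not KVM's actual code; the minimum version of 2 is a placeholder):

    /* Hypothetical helper on the KVM side. */
    static bool mediated_vpmu_usable(void)
    {
    	struct x86_pmu_capability cap;

    	perf_get_x86_pmu_capability(&cap);

    	/* Require the mediated capability plus a minimum architectural PMU version. */
    	return cap.mediated && cap.version >= 2;
    }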
[sean: massage changelog] Signed-off-by: Kan Liang Signed-off-by: Mingwei Zhang Signed-off-by: Sean Christopherson Signed-off-by: Peter Zijlstra (Intel) Tested-by: Xudong Hao Link: https://patch.msgid.link/20251206001720.468579-13-seanjc@google.com --- arch/x86/events/intel/core.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index bdf3f0d0fe21..0553c1160f15 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -5695,6 +5695,8 @@ static void intel_pmu_check_hybrid_pmus(struct x86_hybrid_pmu *pmu) else pmu->intel_ctrl &= ~GLOBAL_CTRL_EN_PERF_METRICS; + pmu->pmu.capabilities |= PERF_PMU_CAP_MEDIATED_VPMU; + intel_pmu_check_event_constraints_all(&pmu->pmu); intel_pmu_check_extra_regs(pmu->extra_regs); @@ -7314,6 +7316,9 @@ __init int intel_pmu_init(void) pr_cont(" AnyThread deprecated, "); } + /* The perf side of core PMU is ready to support the mediated vPMU. */ + x86_get_pmu(smp_processor_id())->capabilities |= PERF_PMU_CAP_MEDIATED_VPMU; + /* * Many features on and after V6 require dynamic constraint, * e.g., Arch PEBS, ACR. From 65eb3a9a8a34fa9188e0ab5e657d84ce4fa242a7 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Fri, 5 Dec 2025 16:16:49 -0800 Subject: [PATCH 16/58] perf/x86/amd: Support PERF_PMU_CAP_MEDIATED_VPMU for AMD host Apply the PERF_PMU_CAP_MEDIATED_VPMU flag for version 2 and later implementations of the core PMU. Aside from having Global Control and Status registers, virtualizing the PMU using the mediated model requires an interface to set or clear the overflow bits in the Global Status MSRs while restoring or saving the PMU context of a vCPU. PerfMonV2-capable hardware has additional MSRs for this purpose, namely PerfCntrGlobalStatusSet and PerfCntrGlobalStatusClr, thereby making it suitable for use with mediated vPMU. Signed-off-by: Sandipan Das Signed-off-by: Mingwei Zhang Signed-off-by: Sean Christopherson Signed-off-by: Peter Zijlstra (Intel) Tested-by: Xudong Hao Link: https://patch.msgid.link/20251206001720.468579-14-seanjc@google.com --- arch/x86/events/amd/core.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index 44656d2fb555..0c92ed5f464b 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -1439,6 +1439,8 @@ static int __init amd_core_pmu_init(void) amd_pmu_global_cntr_mask = x86_pmu.cntr_mask64; + x86_get_pmu(smp_processor_id())->capabilities |= PERF_PMU_CAP_MEDIATED_VPMU; + /* Update PMC handling functions */ x86_pmu.enable_all = amd_pmu_v2_enable_all; x86_pmu.disable_all = amd_pmu_v2_disable_all; From 2d6ad925fb2386f3ee1d26f5022f7ea71bbc1541 Mon Sep 17 00:00:00 2001 From: Jens Remus Date: Mon, 8 Dec 2025 17:03:49 +0100 Subject: [PATCH 17/58] unwind_user: Enhance comments on get CFA, FP, and RA Move the comment "Get the Canonical Frame Address (CFA)" to the top of the sequence of statements that actually get the CFA. Reword the comment "Find the Return Address (RA)" to "Get ...", as the statements actually get the RA. Add a respective comment to the statements that get the FP. This will be useful once future commits extend the logic to get the RA and FP. While at it align the comment on the "stack going in wrong direction" check to the following one on the "address is word aligned" check. 
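For reference, a worked example of those three quantities with the x86-64 frame-pointer frame used elsewhere in this series (word size ws = 8; the concrete offsets are illustrative):

    CFA = FP + 2*ws = FP + 16         /* cfa_off =  2*(ws) */
    RA  is loaded from CFA - 1*ws     /* ra_off  = -1*(ws) */
    FP  is loaded from CFA - 2*ws     /* fp_off  = -2*(ws) */

i.e. the saved return address sits just below the CFA and the caller's frame pointer just below that.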
Signed-off-by: Jens Remus Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20251208160352.1363040-2-jremus@linux.ibm.com --- kernel/unwind/user.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/unwind/user.c b/kernel/unwind/user.c index 39e270789444..0ca434f86e73 100644 --- a/kernel/unwind/user.c +++ b/kernel/unwind/user.c @@ -31,6 +31,7 @@ static int unwind_user_next_common(struct unwind_user_state *state, { unsigned long cfa, fp, ra; + /* Get the Canonical Frame Address (CFA) */ if (frame->use_fp) { if (state->fp < state->sp) return -EINVAL; @@ -38,11 +39,9 @@ static int unwind_user_next_common(struct unwind_user_state *state, } else { cfa = state->sp; } - - /* Get the Canonical Frame Address (CFA) */ cfa += frame->cfa_off; - /* stack going in wrong direction? */ + /* Make sure that stack is not going in wrong direction */ if (cfa <= state->sp) return -EINVAL; @@ -50,10 +49,11 @@ static int unwind_user_next_common(struct unwind_user_state *state, if (cfa & (state->ws - 1)) return -EINVAL; - /* Find the Return Address (RA) */ + /* Get the Return Address (RA) */ if (get_user_word(&ra, cfa, frame->ra_off, state->ws)) return -EINVAL; + /* Get the Frame Pointer (FP) */ if (frame->fp_off && get_user_word(&fp, cfa, frame->fp_off, state->ws)) return -EINVAL; From 2652f9a4b019e34fbbde8dcd1396f1f00ec4844f Mon Sep 17 00:00:00 2001 From: Jens Remus Date: Mon, 8 Dec 2025 17:03:50 +0100 Subject: [PATCH 18/58] unwind_user/fp: Use dummies instead of ifdef This simplifies the code. unwind_user_next_fp() does not need to return -EINVAL if config option HAVE_UNWIND_USER_FP is disabled, as unwind_user_start() will then not select this unwind method and unwind_user_next() will therefore not call it. Provide (1) a dummy definition of ARCH_INIT_USER_FP_FRAME, if the unwind user method HAVE_UNWIND_USER_FP is not enabled, (2) a common fallback definition of unwind_user_at_function_start() which returns false, and (3) a common dummy definition of ARCH_INIT_USER_FP_ENTRY_FRAME. Note that enabling the config option HAVE_UNWIND_USER_FP without defining ARCH_INIT_USER_FP_FRAME triggers a compile error, which is helpful when implementing support for this unwind user method in an architecture. Enabling the config option when providing an arch- specific unwind_user_at_function_start() definition makes it necessary to also provide an arch-specific ARCH_INIT_USER_FP_ENTRY_FRAME definition. 
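To illustrate the override pattern (sketch only; "foo" is a made-up architecture), an arch that provides its own unwind_user_at_function_start() defines both the inline function and the self-referencing macro so the generic fallback is compiled out, just as x86 does in the hunk below:

    /* Hypothetical arch/foo/include/asm/unwind_user.h */
    static inline bool unwind_user_at_function_start(struct pt_regs *regs)
    {
    	/* arch-specific "are we at the first instruction of a function?" test */
    	return false;
    }
    #define unwind_user_at_function_start unwind_user_at_function_start

    /* A matching ARCH_INIT_USER_FP_ENTRY_FRAME(ws) definition is then required as well. */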
Signed-off-by: Jens Remus Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20251208160352.1363040-3-jremus@linux.ibm.com --- arch/x86/include/asm/unwind_user.h | 1 + include/linux/unwind_user.h | 18 ++++++++++++++++-- kernel/unwind/user.c | 4 ---- 3 files changed, 17 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/unwind_user.h b/arch/x86/include/asm/unwind_user.h index 12064284bc4e..971ffe937d50 100644 --- a/arch/x86/include/asm/unwind_user.h +++ b/arch/x86/include/asm/unwind_user.h @@ -35,6 +35,7 @@ static inline bool unwind_user_at_function_start(struct pt_regs *regs) { return is_uprobe_at_func_entry(regs); } +#define unwind_user_at_function_start unwind_user_at_function_start #endif /* CONFIG_HAVE_UNWIND_USER_FP */ diff --git a/include/linux/unwind_user.h b/include/linux/unwind_user.h index 7f7282516bf5..64618618febd 100644 --- a/include/linux/unwind_user.h +++ b/include/linux/unwind_user.h @@ -5,8 +5,22 @@ #include #include -#ifndef ARCH_INIT_USER_FP_FRAME - #define ARCH_INIT_USER_FP_FRAME +#ifndef CONFIG_HAVE_UNWIND_USER_FP + +#define ARCH_INIT_USER_FP_FRAME(ws) + +#endif + +#ifndef ARCH_INIT_USER_FP_ENTRY_FRAME +#define ARCH_INIT_USER_FP_ENTRY_FRAME(ws) +#endif + +#ifndef unwind_user_at_function_start +static inline bool unwind_user_at_function_start(struct pt_regs *regs) +{ + return false; +} +#define unwind_user_at_function_start unwind_user_at_function_start #endif int unwind_user(struct unwind_stacktrace *trace, unsigned int max_entries); diff --git a/kernel/unwind/user.c b/kernel/unwind/user.c index 0ca434f86e73..90ab3c1a205e 100644 --- a/kernel/unwind/user.c +++ b/kernel/unwind/user.c @@ -67,7 +67,6 @@ static int unwind_user_next_common(struct unwind_user_state *state, static int unwind_user_next_fp(struct unwind_user_state *state) { -#ifdef CONFIG_HAVE_UNWIND_USER_FP struct pt_regs *regs = task_pt_regs(current); if (state->topmost && unwind_user_at_function_start(regs)) { @@ -81,9 +80,6 @@ static int unwind_user_next_fp(struct unwind_user_state *state) ARCH_INIT_USER_FP_FRAME(state->ws) }; return unwind_user_next_common(state, &fp_frame); -#else - return -EINVAL; -#endif } static int unwind_user_next(struct unwind_user_state *state) From aa6047ef7204ea1faa346b9123439abed0546f7e Mon Sep 17 00:00:00 2001 From: Jens Remus Date: Mon, 8 Dec 2025 17:03:51 +0100 Subject: [PATCH 19/58] x86/unwind_user: Guard unwind_user_word_size() by UNWIND_USER The unwind user framework in general requires an architecture-specific implementation of unwind_user_word_size() to be present for any unwind method, whether that is fp or a potential future method such as sframe. Guard unwind_user_word_size() by the availability of the UNWIND_USER framework instead of the specific HAVE_UNWIND_USER_FP method. This makes it possible to selectively disable HAVE_UNWIND_USER_FP on x86 (e.g. for test purposes) once a new unwind method is added to unwind user.
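For a hypothetical architecture with a single native word size and no compat or VM86 special cases, the definition required under CONFIG_UNWIND_USER could be as small as (sketch only):

    static inline int unwind_user_word_size(struct pt_regs *regs)
    {
    	return sizeof(long);
    }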
Signed-off-by: Jens Remus Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20251208160352.1363040-4-jremus@linux.ibm.com --- arch/x86/include/asm/unwind_user.h | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/arch/x86/include/asm/unwind_user.h b/arch/x86/include/asm/unwind_user.h index 971ffe937d50..7f1229b33d06 100644 --- a/arch/x86/include/asm/unwind_user.h +++ b/arch/x86/include/asm/unwind_user.h @@ -2,23 +2,11 @@ #ifndef _ASM_X86_UNWIND_USER_H #define _ASM_X86_UNWIND_USER_H -#ifdef CONFIG_HAVE_UNWIND_USER_FP +#ifdef CONFIG_UNWIND_USER #include #include -#define ARCH_INIT_USER_FP_FRAME(ws) \ - .cfa_off = 2*(ws), \ - .ra_off = -1*(ws), \ - .fp_off = -2*(ws), \ - .use_fp = true, - -#define ARCH_INIT_USER_FP_ENTRY_FRAME(ws) \ - .cfa_off = 1*(ws), \ - .ra_off = -1*(ws), \ - .fp_off = 0, \ - .use_fp = false, - static inline int unwind_user_word_size(struct pt_regs *regs) { /* We can't unwind VM86 stacks */ @@ -31,6 +19,22 @@ static inline int unwind_user_word_size(struct pt_regs *regs) return sizeof(long); } +#endif /* CONFIG_UNWIND_USER */ + +#ifdef CONFIG_HAVE_UNWIND_USER_FP + +#define ARCH_INIT_USER_FP_FRAME(ws) \ + .cfa_off = 2*(ws), \ + .ra_off = -1*(ws), \ + .fp_off = -2*(ws), \ + .use_fp = true, + +#define ARCH_INIT_USER_FP_ENTRY_FRAME(ws) \ + .cfa_off = 1*(ws), \ + .ra_off = -1*(ws), \ + .fp_off = 0, \ + .use_fp = false, + static inline bool unwind_user_at_function_start(struct pt_regs *regs) { return is_uprobe_at_func_entry(regs); From 3c48808408af11d6f173c65eee9bd5ca4c53667c Mon Sep 17 00:00:00 2001 From: Jens Remus Date: Mon, 8 Dec 2025 17:03:52 +0100 Subject: [PATCH 20/58] x86/unwind_user: Simplify unwind_user_word_size() Get rid of superfluous ifdef and return explicit word size depending on 32-bit or 64-bit mode. Suggested-by: Linus Torvalds Signed-off-by: Jens Remus Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20251208160352.1363040-5-jremus@linux.ibm.com --- arch/x86/include/asm/unwind_user.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/arch/x86/include/asm/unwind_user.h b/arch/x86/include/asm/unwind_user.h index 7f1229b33d06..6e469044e4de 100644 --- a/arch/x86/include/asm/unwind_user.h +++ b/arch/x86/include/asm/unwind_user.h @@ -12,11 +12,7 @@ static inline int unwind_user_word_size(struct pt_regs *regs) /* We can't unwind VM86 stacks */ if (regs->flags & X86_VM_MASK) return 0; -#ifdef CONFIG_X86_64 - if (!user_64bit_mode(regs)) - return sizeof(int); -#endif - return sizeof(long); + return user_64bit_mode(regs) ? 8 : 4; } #endif /* CONFIG_UNWIND_USER */ From 63dbadcafc1f4d1da796a8e2c0aea1e561f79ece Mon Sep 17 00:00:00 2001 From: Martin Schiller Date: Mon, 24 Nov 2025 08:48:44 +0100 Subject: [PATCH 21/58] perf/x86/msr: Add Airmont NP Like Airmont, the Airmont NP (aka Intel / MaxLinear Lightning Mountain) supports SMI_COUNT MSR. 
Signed-off-by: Martin Schiller Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dapeng Mi Link: https://patch.msgid.link/20251124074846.9653-2-ms@dev.tdt.de --- arch/x86/events/msr.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c index 7f5007a4752a..8052596b8503 100644 --- a/arch/x86/events/msr.c +++ b/arch/x86/events/msr.c @@ -78,6 +78,7 @@ static bool test_intel(int idx, void *data) case INTEL_ATOM_SILVERMONT: case INTEL_ATOM_SILVERMONT_D: case INTEL_ATOM_AIRMONT: + case INTEL_ATOM_AIRMONT_NP: case INTEL_ATOM_GOLDMONT: case INTEL_ATOM_GOLDMONT_D: From a08340fd291671c54d379d285b2325490ce90ddd Mon Sep 17 00:00:00 2001 From: Martin Schiller Date: Mon, 24 Nov 2025 08:48:45 +0100 Subject: [PATCH 22/58] perf/x86/intel: Add Airmont NP The Intel / MaxLinear Airmont NP (aka Lightning Mountain) supports the same architectural and non-architectural events as Airmont. Signed-off-by: Martin Schiller Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dapeng Mi Link: https://patch.msgid.link/20251124074846.9653-3-ms@dev.tdt.de --- arch/x86/events/intel/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 0553c1160f15..1840ca1918d1 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -7410,6 +7410,7 @@ __init int intel_pmu_init(void) case INTEL_ATOM_SILVERMONT_D: case INTEL_ATOM_SILVERMONT_MID: case INTEL_ATOM_AIRMONT: + case INTEL_ATOM_AIRMONT_NP: case INTEL_ATOM_SILVERMONT_MID2: memcpy(hw_cache_event_ids, slm_hw_cache_event_ids, sizeof(hw_cache_event_ids)); From 3006911f284d769b0f66c12b39da130325ef1440 Mon Sep 17 00:00:00 2001 From: Martin Schiller Date: Mon, 24 Nov 2025 08:48:46 +0100 Subject: [PATCH 23/58] perf/x86/cstate: Add Airmont NP From the perspective of Intel cstate residency counters, the Airmont NP (aka Lightning Mountain) is identical to the Airmont. Signed-off-by: Martin Schiller Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dapeng Mi Link: https://patch.msgid.link/20251124074846.9653-4-ms@dev.tdt.de --- arch/x86/events/intel/cstate.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index 1e2658b60d91..f3d5ee07f8f2 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -613,6 +613,7 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = { X86_MATCH_VFM(INTEL_ATOM_SILVERMONT, &slm_cstates), X86_MATCH_VFM(INTEL_ATOM_SILVERMONT_D, &slm_cstates), X86_MATCH_VFM(INTEL_ATOM_AIRMONT, &slm_cstates), + X86_MATCH_VFM(INTEL_ATOM_AIRMONT_NP, &slm_cstates), X86_MATCH_VFM(INTEL_BROADWELL, &snb_cstates), X86_MATCH_VFM(INTEL_BROADWELL_D, &snb_cstates), From 3cb3c2f6886f9489df13de8efe7a1e803a3f21ea Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 17 Dec 2025 12:08:01 +0100 Subject: [PATCH 24/58] perf: Clean up mediated vPMU accounting The mediated_pmu_account_event() and perf_create_mediated_pmu() functions implement the exclusion between '!exclude_guest' counters and mediated vPMUs. Their implementation is basically identical, except mirrored in what they count/check. Make sure the actual implementations reflect this similarity. Notably: - while perf_release_mediated_pmu() has an underflow check, mediated_pmu_unaccount_event() did not. - while perf_create_mediated_pmu() has an inc_not_zero() path, mediated_pmu_account_event() did not. Also, the inc_not_zero() path can be outside of perf_mediated_pmu_mutex.
The mutex must guard the 0->1 (of either nr_include_guest_events or nr_mediated_pmu_vms) transition, but once a counter is already non-zero, it can safely be incremented further. Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20251208115156.GE3707891@noisy.programming.kicks-ass.net --- kernel/events/core.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index dd842a4ca789..e6a4b1e34f84 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6344,8 +6344,10 @@ static int mediated_pmu_account_event(struct perf_event *event) if (!is_include_guest_event(event)) return 0; - guard(mutex)(&perf_mediated_pmu_mutex); + if (atomic_inc_not_zero(&nr_include_guest_events)) + return 0; + guard(mutex)(&perf_mediated_pmu_mutex); if (atomic_read(&nr_mediated_pmu_vms)) return -EOPNOTSUPP; @@ -6358,6 +6360,9 @@ static void mediated_pmu_unaccount_event(struct perf_event *event) if (!is_include_guest_event(event)) return; + if (WARN_ON_ONCE(!atomic_read(&nr_include_guest_events))) + return; + atomic_dec(&nr_include_guest_events); } @@ -6373,10 +6378,10 @@ static void mediated_pmu_unaccount_event(struct perf_event *event) */ int perf_create_mediated_pmu(void) { - guard(mutex)(&perf_mediated_pmu_mutex); if (atomic_inc_not_zero(&nr_mediated_pmu_vms)) return 0; + guard(mutex)(&perf_mediated_pmu_mutex); if (atomic_read(&nr_include_guest_events)) return -EBUSY; From 01122b89361e565b3c88b9fbebe92dc5c7420cb7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 17 Dec 2025 13:23:59 +0100 Subject: [PATCH 25/58] perf: Use EXPORT_SYMBOL_FOR_KVM() for the mediated APIs Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20251208115156.GE3707891@noisy.programming.kicks-ass.net --- arch/x86/events/core.c | 5 +++-- include/asm-generic/Kbuild | 1 + kernel/events/core.c | 5 +++-- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index df7a32be9914..0ecac9495d74 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -1771,14 +1772,14 @@ void perf_load_guest_lvtpc(u32 guest_lvtpc) APIC_DM_FIXED | PERF_GUEST_MEDIATED_PMI_VECTOR | masked); this_cpu_write(guest_lvtpc_loaded, true); } -EXPORT_SYMBOL_FOR_MODULES(perf_load_guest_lvtpc, "kvm"); +EXPORT_SYMBOL_FOR_KVM(perf_load_guest_lvtpc); void perf_put_guest_lvtpc(void) { this_cpu_write(guest_lvtpc_loaded, false); apic_write(APIC_LVTPC, APIC_DM_NMI); } -EXPORT_SYMBOL_FOR_MODULES(perf_put_guest_lvtpc, "kvm"); +EXPORT_SYMBOL_FOR_KVM(perf_put_guest_lvtpc); #endif /* CONFIG_PERF_GUEST_MEDIATED_PMU */ static int diff --git a/include/asm-generic/Kbuild b/include/asm-generic/Kbuild index 295c94a3ccc1..9aff61e7b8f2 100644 --- a/include/asm-generic/Kbuild +++ b/include/asm-generic/Kbuild @@ -32,6 +32,7 @@ mandatory-y += irq_work.h mandatory-y += kdebug.h mandatory-y += kmap_size.h mandatory-y += kprobes.h +mandatory-y += kvm_types.h mandatory-y += linkage.h mandatory-y += local.h mandatory-y += local64.h diff --git a/kernel/events/core.c b/kernel/events/core.c index e6a4b1e34f84..376fb07d869b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -57,6 +57,7 @@ #include #include #include +#include #include "internal.h" @@ -6388,7 +6389,7 @@ int perf_create_mediated_pmu(void) atomic_inc(&nr_mediated_pmu_vms); return 0; } -EXPORT_SYMBOL_GPL(perf_create_mediated_pmu); +EXPORT_SYMBOL_FOR_KVM(perf_create_mediated_pmu); void 
perf_release_mediated_pmu(void) { @@ -6397,7 +6398,7 @@ void perf_release_mediated_pmu(void) atomic_dec(&nr_mediated_pmu_vms); } -EXPORT_SYMBOL_GPL(perf_release_mediated_pmu); +EXPORT_SYMBOL_FOR_KVM(perf_release_mediated_pmu); /* When loading a guest's mediated PMU, schedule out all exclude_guest events. */ void perf_load_guest_context(void) From 632d89b030f1dac8d91875cd56a08adededba349 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 17 Dec 2025 13:42:41 +0100 Subject: [PATCH 26/58] perf/x86/uncore: clean up const mismatch In some cmp functions, a const pointer is cast out to a non-const pointer by using container_of() which is not correct. Fix this up by properly marking the pointers as const, which preserves the correct type of the pointer passed into the functions. Signed-off-by: Greg Kroah-Hartman Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/2025121741-headstand-stratus-f5eb@gregkh --- arch/x86/events/intel/uncore_discovery.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/events/intel/uncore_discovery.c b/arch/x86/events/intel/uncore_discovery.c index 7d57ce706feb..330bca2f14b3 100644 --- a/arch/x86/events/intel/uncore_discovery.c +++ b/arch/x86/events/intel/uncore_discovery.c @@ -52,7 +52,7 @@ static int get_device_die_id(struct pci_dev *dev) static inline int __type_cmp(const void *key, const struct rb_node *b) { - struct intel_uncore_discovery_type *type_b = __node_2_type(b); + const struct intel_uncore_discovery_type *type_b = __node_2_type(b); const u16 *type_id = key; if (type_b->type > *type_id) @@ -115,7 +115,7 @@ get_uncore_discovery_type(struct uncore_unit_discovery *unit) static inline int pmu_idx_cmp(const void *key, const struct rb_node *b) { - struct intel_uncore_discovery_unit *unit; + const struct intel_uncore_discovery_unit *unit; const unsigned int *id = key; unit = rb_entry(b, struct intel_uncore_discovery_unit, node); @@ -173,7 +173,7 @@ int intel_uncore_find_discovery_unit_id(struct rb_root *units, int die, static inline bool unit_less(struct rb_node *a, const struct rb_node *b) { - struct intel_uncore_discovery_unit *a_node, *b_node; + const struct intel_uncore_discovery_unit *a_node, *b_node; a_node = rb_entry(a, struct intel_uncore_discovery_unit, node); b_node = rb_entry(b, struct intel_uncore_discovery_unit, node); From 098fe55a450b280d8a8584b2511634e1236ba96d Mon Sep 17 00:00:00 2001 From: Zide Chen Date: Wed, 31 Dec 2025 14:42:18 -0800 Subject: [PATCH 27/58] perf/x86/intel/uncore: Move uncore discovery init struct to header The discovery base MSR or PCI device is platform-specific and must be defined statically in the per-platform init table and passed to the discovery code. Move the definition of struct intel_uncore_init_fun to uncore.h so it can be accessed by discovery code, and rename it to reflect that it now carries more than just init callbacks. Shorten intel_uncore_has_discovery_tables[_pci/msr] to uncore_discovery[_pci/msr] for improved readability and alignment. Drop the `intel_` prefix from new names since the code is under the intel directory and long identifiers make alignment harder. Further cleanups will continue removing `intel_` prefixes. No functional change intended. 
Signed-off-by: Zide Chen Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dapeng Mi Link: https://patch.msgid.link/20251231224233.113839-2-zide.chen@intel.com --- arch/x86/events/intel/uncore.c | 72 ++++++++++-------------- arch/x86/events/intel/uncore.h | 10 ++++ arch/x86/events/intel/uncore_discovery.c | 12 ++-- arch/x86/events/intel/uncore_discovery.h | 2 +- 4 files changed, 49 insertions(+), 47 deletions(-) diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index e228e564b15e..cd561290be8c 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -1697,133 +1697,123 @@ static int __init uncore_mmio_init(void) return ret; } -struct intel_uncore_init_fun { - void (*cpu_init)(void); - int (*pci_init)(void); - void (*mmio_init)(void); - /* Discovery table is required */ - bool use_discovery; - /* The units in the discovery table should be ignored. */ - int *uncore_units_ignore; -}; - -static const struct intel_uncore_init_fun nhm_uncore_init __initconst = { +static const struct uncore_plat_init nhm_uncore_init __initconst = { .cpu_init = nhm_uncore_cpu_init, }; -static const struct intel_uncore_init_fun snb_uncore_init __initconst = { +static const struct uncore_plat_init snb_uncore_init __initconst = { .cpu_init = snb_uncore_cpu_init, .pci_init = snb_uncore_pci_init, }; -static const struct intel_uncore_init_fun ivb_uncore_init __initconst = { +static const struct uncore_plat_init ivb_uncore_init __initconst = { .cpu_init = snb_uncore_cpu_init, .pci_init = ivb_uncore_pci_init, }; -static const struct intel_uncore_init_fun hsw_uncore_init __initconst = { +static const struct uncore_plat_init hsw_uncore_init __initconst = { .cpu_init = snb_uncore_cpu_init, .pci_init = hsw_uncore_pci_init, }; -static const struct intel_uncore_init_fun bdw_uncore_init __initconst = { +static const struct uncore_plat_init bdw_uncore_init __initconst = { .cpu_init = snb_uncore_cpu_init, .pci_init = bdw_uncore_pci_init, }; -static const struct intel_uncore_init_fun snbep_uncore_init __initconst = { +static const struct uncore_plat_init snbep_uncore_init __initconst = { .cpu_init = snbep_uncore_cpu_init, .pci_init = snbep_uncore_pci_init, }; -static const struct intel_uncore_init_fun nhmex_uncore_init __initconst = { +static const struct uncore_plat_init nhmex_uncore_init __initconst = { .cpu_init = nhmex_uncore_cpu_init, }; -static const struct intel_uncore_init_fun ivbep_uncore_init __initconst = { +static const struct uncore_plat_init ivbep_uncore_init __initconst = { .cpu_init = ivbep_uncore_cpu_init, .pci_init = ivbep_uncore_pci_init, }; -static const struct intel_uncore_init_fun hswep_uncore_init __initconst = { +static const struct uncore_plat_init hswep_uncore_init __initconst = { .cpu_init = hswep_uncore_cpu_init, .pci_init = hswep_uncore_pci_init, }; -static const struct intel_uncore_init_fun bdx_uncore_init __initconst = { +static const struct uncore_plat_init bdx_uncore_init __initconst = { .cpu_init = bdx_uncore_cpu_init, .pci_init = bdx_uncore_pci_init, }; -static const struct intel_uncore_init_fun knl_uncore_init __initconst = { +static const struct uncore_plat_init knl_uncore_init __initconst = { .cpu_init = knl_uncore_cpu_init, .pci_init = knl_uncore_pci_init, }; -static const struct intel_uncore_init_fun skl_uncore_init __initconst = { +static const struct uncore_plat_init skl_uncore_init __initconst = { .cpu_init = skl_uncore_cpu_init, .pci_init = skl_uncore_pci_init, }; -static const struct intel_uncore_init_fun skx_uncore_init __initconst 
= { +static const struct uncore_plat_init skx_uncore_init __initconst = { .cpu_init = skx_uncore_cpu_init, .pci_init = skx_uncore_pci_init, }; -static const struct intel_uncore_init_fun icl_uncore_init __initconst = { +static const struct uncore_plat_init icl_uncore_init __initconst = { .cpu_init = icl_uncore_cpu_init, .pci_init = skl_uncore_pci_init, }; -static const struct intel_uncore_init_fun tgl_uncore_init __initconst = { +static const struct uncore_plat_init tgl_uncore_init __initconst = { .cpu_init = tgl_uncore_cpu_init, .mmio_init = tgl_uncore_mmio_init, }; -static const struct intel_uncore_init_fun tgl_l_uncore_init __initconst = { +static const struct uncore_plat_init tgl_l_uncore_init __initconst = { .cpu_init = tgl_uncore_cpu_init, .mmio_init = tgl_l_uncore_mmio_init, }; -static const struct intel_uncore_init_fun rkl_uncore_init __initconst = { +static const struct uncore_plat_init rkl_uncore_init __initconst = { .cpu_init = tgl_uncore_cpu_init, .pci_init = skl_uncore_pci_init, }; -static const struct intel_uncore_init_fun adl_uncore_init __initconst = { +static const struct uncore_plat_init adl_uncore_init __initconst = { .cpu_init = adl_uncore_cpu_init, .mmio_init = adl_uncore_mmio_init, }; -static const struct intel_uncore_init_fun mtl_uncore_init __initconst = { +static const struct uncore_plat_init mtl_uncore_init __initconst = { .cpu_init = mtl_uncore_cpu_init, .mmio_init = adl_uncore_mmio_init, }; -static const struct intel_uncore_init_fun lnl_uncore_init __initconst = { +static const struct uncore_plat_init lnl_uncore_init __initconst = { .cpu_init = lnl_uncore_cpu_init, .mmio_init = lnl_uncore_mmio_init, }; -static const struct intel_uncore_init_fun ptl_uncore_init __initconst = { +static const struct uncore_plat_init ptl_uncore_init __initconst = { .cpu_init = ptl_uncore_cpu_init, .mmio_init = ptl_uncore_mmio_init, .use_discovery = true, }; -static const struct intel_uncore_init_fun icx_uncore_init __initconst = { +static const struct uncore_plat_init icx_uncore_init __initconst = { .cpu_init = icx_uncore_cpu_init, .pci_init = icx_uncore_pci_init, .mmio_init = icx_uncore_mmio_init, }; -static const struct intel_uncore_init_fun snr_uncore_init __initconst = { +static const struct uncore_plat_init snr_uncore_init __initconst = { .cpu_init = snr_uncore_cpu_init, .pci_init = snr_uncore_pci_init, .mmio_init = snr_uncore_mmio_init, }; -static const struct intel_uncore_init_fun spr_uncore_init __initconst = { +static const struct uncore_plat_init spr_uncore_init __initconst = { .cpu_init = spr_uncore_cpu_init, .pci_init = spr_uncore_pci_init, .mmio_init = spr_uncore_mmio_init, @@ -1831,7 +1821,7 @@ static const struct intel_uncore_init_fun spr_uncore_init __initconst = { .uncore_units_ignore = spr_uncore_units_ignore, }; -static const struct intel_uncore_init_fun gnr_uncore_init __initconst = { +static const struct uncore_plat_init gnr_uncore_init __initconst = { .cpu_init = gnr_uncore_cpu_init, .pci_init = gnr_uncore_pci_init, .mmio_init = gnr_uncore_mmio_init, @@ -1839,7 +1829,7 @@ static const struct intel_uncore_init_fun gnr_uncore_init __initconst = { .uncore_units_ignore = gnr_uncore_units_ignore, }; -static const struct intel_uncore_init_fun generic_uncore_init __initconst = { +static const struct uncore_plat_init generic_uncore_init __initconst = { .cpu_init = intel_uncore_generic_uncore_cpu_init, .pci_init = intel_uncore_generic_uncore_pci_init, .mmio_init = intel_uncore_generic_uncore_mmio_init, @@ -1910,7 +1900,7 @@ MODULE_DEVICE_TABLE(x86cpu, 
intel_uncore_match); static int __init intel_uncore_init(void) { const struct x86_cpu_id *id; - struct intel_uncore_init_fun *uncore_init; + struct uncore_plat_init *uncore_init; int pret = 0, cret = 0, mret = 0, ret; if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) @@ -1921,16 +1911,16 @@ static int __init intel_uncore_init(void) id = x86_match_cpu(intel_uncore_match); if (!id) { - if (!uncore_no_discover && intel_uncore_has_discovery_tables(NULL)) - uncore_init = (struct intel_uncore_init_fun *)&generic_uncore_init; + if (!uncore_no_discover && uncore_discovery(NULL)) + uncore_init = (struct uncore_plat_init *)&generic_uncore_init; else return -ENODEV; } else { - uncore_init = (struct intel_uncore_init_fun *)id->driver_data; + uncore_init = (struct uncore_plat_init *)id->driver_data; if (uncore_no_discover && uncore_init->use_discovery) return -ENODEV; if (uncore_init->use_discovery && - !intel_uncore_has_discovery_tables(uncore_init->uncore_units_ignore)) + !uncore_discovery(uncore_init)) return -ENODEV; } diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index d8815fff7588..568536ef28ee 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -47,6 +47,16 @@ struct uncore_event_desc; struct freerunning_counters; struct intel_uncore_topology; +struct uncore_plat_init { + void (*cpu_init)(void); + int (*pci_init)(void); + void (*mmio_init)(void); + /* Discovery table is required */ + bool use_discovery; + /* The units in the discovery table should be ignored. */ + int *uncore_units_ignore; +}; + struct intel_uncore_type { const char *name; int num_counters; diff --git a/arch/x86/events/intel/uncore_discovery.c b/arch/x86/events/intel/uncore_discovery.c index 330bca2f14b3..97f14436d318 100644 --- a/arch/x86/events/intel/uncore_discovery.c +++ b/arch/x86/events/intel/uncore_discovery.c @@ -350,7 +350,7 @@ static int parse_discovery_table(struct pci_dev *dev, int die, return __parse_discovery_table(addr, die, parsed, ignore); } -static bool intel_uncore_has_discovery_tables_pci(int *ignore) +static bool uncore_discovery_pci(int *ignore) { u32 device, val, entry_id, bar_offset; int die, dvsec = 0, ret = true; @@ -399,7 +399,7 @@ static bool intel_uncore_has_discovery_tables_pci(int *ignore) return ret; } -static bool intel_uncore_has_discovery_tables_msr(int *ignore) +static bool uncore_discovery_msr(int *ignore) { unsigned long *die_mask; bool parsed = false; @@ -432,10 +432,12 @@ static bool intel_uncore_has_discovery_tables_msr(int *ignore) return parsed; } -bool intel_uncore_has_discovery_tables(int *ignore) +bool uncore_discovery(struct uncore_plat_init *init) { - return intel_uncore_has_discovery_tables_msr(ignore) || - intel_uncore_has_discovery_tables_pci(ignore); + int *ignore = init ? 
init->uncore_units_ignore : NULL; + + return uncore_discovery_msr(ignore) || + uncore_discovery_pci(ignore); } void intel_uncore_clear_discovery_tables(void) diff --git a/arch/x86/events/intel/uncore_discovery.h b/arch/x86/events/intel/uncore_discovery.h index dff75c98e22f..dfc237a2b6df 100644 --- a/arch/x86/events/intel/uncore_discovery.h +++ b/arch/x86/events/intel/uncore_discovery.h @@ -136,7 +136,7 @@ struct intel_uncore_discovery_type { u16 num_units; /* number of units */ }; -bool intel_uncore_has_discovery_tables(int *ignore); +bool uncore_discovery(struct uncore_plat_init *init); void intel_uncore_clear_discovery_tables(void); void intel_uncore_generic_uncore_cpu_init(void); int intel_uncore_generic_uncore_pci_init(void); From e75462f6c7eaa5affd922c9a14591cdd5e3ab63d Mon Sep 17 00:00:00 2001 From: Zide Chen Date: Wed, 31 Dec 2025 14:42:19 -0800 Subject: [PATCH 28/58] perf/x86/intel/uncore: Support per-platform discovery base devices On DMR platforms, IMH discovery tables are enumerated via PCI, while CBB domains use MSRs, unlike earlier platforms which relied on either PCI or MSR exclusively. DMR also uses different MSRs and PCI devices, requiring support for multiple, platform-specific discovery bases. Introduce struct uncore_discovery_domain to hold the discovery base and other domain-specific configuration. Move uncore_units_ignore into uncore_discovery_domain so a single structure can be passed to uncore_discovery_[pci/msr]. No functional change intended. Co-developed-by: Dapeng Mi Signed-off-by: Dapeng Mi Signed-off-by: Zide Chen Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dapeng Mi Link: https://patch.msgid.link/20251231224233.113839-3-zide.chen@intel.com --- arch/x86/events/intel/uncore.c | 31 ++++++++----- arch/x86/events/intel/uncore.h | 15 +++++-- arch/x86/events/intel/uncore_discovery.c | 57 +++++++++++++++--------- 3 files changed, 68 insertions(+), 35 deletions(-) diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index cd561290be8c..78a03af87204 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -1798,7 +1798,7 @@ static const struct uncore_plat_init lnl_uncore_init __initconst = { static const struct uncore_plat_init ptl_uncore_init __initconst = { .cpu_init = ptl_uncore_cpu_init, .mmio_init = ptl_uncore_mmio_init, - .use_discovery = true, + .domain[0].discovery_base = UNCORE_DISCOVERY_MSR, }; static const struct uncore_plat_init icx_uncore_init __initconst = { @@ -1817,16 +1817,18 @@ static const struct uncore_plat_init spr_uncore_init __initconst = { .cpu_init = spr_uncore_cpu_init, .pci_init = spr_uncore_pci_init, .mmio_init = spr_uncore_mmio_init, - .use_discovery = true, - .uncore_units_ignore = spr_uncore_units_ignore, + .domain[0].base_is_pci = true, + .domain[0].discovery_base = UNCORE_DISCOVERY_TABLE_DEVICE, + .domain[0].units_ignore = spr_uncore_units_ignore, }; static const struct uncore_plat_init gnr_uncore_init __initconst = { .cpu_init = gnr_uncore_cpu_init, .pci_init = gnr_uncore_pci_init, .mmio_init = gnr_uncore_mmio_init, - .use_discovery = true, - .uncore_units_ignore = gnr_uncore_units_ignore, + .domain[0].base_is_pci = true, + .domain[0].discovery_base = UNCORE_DISCOVERY_TABLE_DEVICE, + .domain[0].units_ignore = gnr_uncore_units_ignore, }; static const struct uncore_plat_init generic_uncore_init __initconst = { @@ -1897,6 +1899,16 @@ static const struct x86_cpu_id intel_uncore_match[] __initconst = { }; MODULE_DEVICE_TABLE(x86cpu, intel_uncore_match); +static bool 
uncore_use_discovery(struct uncore_plat_init *config) +{ + for (int i = 0; i < UNCORE_DISCOVERY_DOMAINS; i++) { + if (config->domain[i].discovery_base) + return true; + } + + return false; +} + static int __init intel_uncore_init(void) { const struct x86_cpu_id *id; @@ -1911,15 +1923,14 @@ static int __init intel_uncore_init(void) id = x86_match_cpu(intel_uncore_match); if (!id) { - if (!uncore_no_discover && uncore_discovery(NULL)) - uncore_init = (struct uncore_plat_init *)&generic_uncore_init; - else + uncore_init = (struct uncore_plat_init *)&generic_uncore_init; + if (uncore_no_discover || !uncore_discovery(uncore_init)) return -ENODEV; } else { uncore_init = (struct uncore_plat_init *)id->driver_data; - if (uncore_no_discover && uncore_init->use_discovery) + if (uncore_no_discover && uncore_use_discovery(uncore_init)) return -ENODEV; - if (uncore_init->use_discovery && + if (uncore_use_discovery(uncore_init) && !uncore_discovery(uncore_init)) return -ENODEV; } diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index 568536ef28ee..1574ffc7ee05 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -47,14 +47,21 @@ struct uncore_event_desc; struct freerunning_counters; struct intel_uncore_topology; +struct uncore_discovery_domain { + /* MSR address or PCI device used as the discovery base */ + u32 discovery_base; + bool base_is_pci; + /* The units in the discovery table should be ignored. */ + int *units_ignore; +}; + +#define UNCORE_DISCOVERY_DOMAINS 2 struct uncore_plat_init { void (*cpu_init)(void); int (*pci_init)(void); void (*mmio_init)(void); - /* Discovery table is required */ - bool use_discovery; - /* The units in the discovery table should be ignored. */ - int *uncore_units_ignore; + + struct uncore_discovery_domain domain[UNCORE_DISCOVERY_DOMAINS]; }; struct intel_uncore_type { diff --git a/arch/x86/events/intel/uncore_discovery.c b/arch/x86/events/intel/uncore_discovery.c index 97f14436d318..25b6a65c9614 100644 --- a/arch/x86/events/intel/uncore_discovery.c +++ b/arch/x86/events/intel/uncore_discovery.c @@ -259,23 +259,24 @@ uncore_insert_box_info(struct uncore_unit_discovery *unit, } static bool -uncore_ignore_unit(struct uncore_unit_discovery *unit, int *ignore) +uncore_ignore_unit(struct uncore_unit_discovery *unit, + struct uncore_discovery_domain *domain) { int i; - if (!ignore) + if (!domain || !domain->units_ignore) return false; - for (i = 0; ignore[i] != UNCORE_IGNORE_END ; i++) { - if (unit->box_type == ignore[i]) + for (i = 0; domain->units_ignore[i] != UNCORE_IGNORE_END ; i++) { + if (unit->box_type == domain->units_ignore[i]) return true; } return false; } -static int __parse_discovery_table(resource_size_t addr, int die, - bool *parsed, int *ignore) +static int __parse_discovery_table(struct uncore_discovery_domain *domain, + resource_size_t addr, int die, bool *parsed) { struct uncore_global_discovery global; struct uncore_unit_discovery unit; @@ -314,7 +315,7 @@ static int __parse_discovery_table(resource_size_t addr, int die, if (unit.access_type >= UNCORE_ACCESS_MAX) continue; - if (uncore_ignore_unit(&unit, ignore)) + if (uncore_ignore_unit(&unit, domain)) continue; uncore_insert_box_info(&unit, die); @@ -325,9 +326,9 @@ static int __parse_discovery_table(resource_size_t addr, int die, return 0; } -static int parse_discovery_table(struct pci_dev *dev, int die, - u32 bar_offset, bool *parsed, - int *ignore) +static int parse_discovery_table(struct uncore_discovery_domain *domain, + struct pci_dev 
*dev, int die, + u32 bar_offset, bool *parsed) { resource_size_t addr; u32 val; @@ -347,17 +348,19 @@ static int parse_discovery_table(struct pci_dev *dev, int die, } #endif - return __parse_discovery_table(addr, die, parsed, ignore); + return __parse_discovery_table(domain, addr, die, parsed); } -static bool uncore_discovery_pci(int *ignore) +static bool uncore_discovery_pci(struct uncore_discovery_domain *domain) { u32 device, val, entry_id, bar_offset; int die, dvsec = 0, ret = true; struct pci_dev *dev = NULL; bool parsed = false; - if (has_generic_discovery_table()) + if (domain->discovery_base) + device = domain->discovery_base; + else if (has_generic_discovery_table()) device = UNCORE_DISCOVERY_TABLE_DEVICE; else device = PCI_ANY_ID; @@ -386,7 +389,7 @@ static bool uncore_discovery_pci(int *ignore) if (die < 0) continue; - parse_discovery_table(dev, die, bar_offset, &parsed, ignore); + parse_discovery_table(domain, dev, die, bar_offset, &parsed); } } @@ -399,11 +402,11 @@ static bool uncore_discovery_pci(int *ignore) return ret; } -static bool uncore_discovery_msr(int *ignore) +static bool uncore_discovery_msr(struct uncore_discovery_domain *domain) { unsigned long *die_mask; bool parsed = false; - int cpu, die; + int cpu, die, msr; u64 base; die_mask = kcalloc(BITS_TO_LONGS(uncore_max_dies()), @@ -411,19 +414,22 @@ static bool uncore_discovery_msr(int *ignore) if (!die_mask) return false; + msr = domain->discovery_base ? + domain->discovery_base : UNCORE_DISCOVERY_MSR; + cpus_read_lock(); for_each_online_cpu(cpu) { die = topology_logical_die_id(cpu); if (__test_and_set_bit(die, die_mask)) continue; - if (rdmsrq_safe_on_cpu(cpu, UNCORE_DISCOVERY_MSR, &base)) + if (rdmsrq_safe_on_cpu(cpu, msr, &base)) continue; if (!base) continue; - __parse_discovery_table(base, die, &parsed, ignore); + __parse_discovery_table(domain, base, die, &parsed); } cpus_read_unlock(); @@ -434,10 +440,19 @@ static bool uncore_discovery_msr(int *ignore) bool uncore_discovery(struct uncore_plat_init *init) { - int *ignore = init ? init->uncore_units_ignore : NULL; + struct uncore_discovery_domain *domain; + bool ret = false; + int i; - return uncore_discovery_msr(ignore) || - uncore_discovery_pci(ignore); + for (i = 0; i < UNCORE_DISCOVERY_DOMAINS; i++) { + domain = &init->domain[i]; + if (!domain->base_is_pci) + ret |= uncore_discovery_msr(domain); + else + ret |= uncore_discovery_pci(domain); + } + + return ret; } void intel_uncore_clear_discovery_tables(void) From 1897336728b4ab0229fb73bb6f1e94cfe914afa9 Mon Sep 17 00:00:00 2001 From: Zide Chen Date: Wed, 31 Dec 2025 14:42:20 -0800 Subject: [PATCH 29/58] perf/x86/intel/uncore: Remove has_generic_discovery_table() In the !x86_match_cpu() fallback path, has_generic_discovery_table() is removed because it does not handle multiple PCI devices. Instead, use PCI_ANY_ID in generic_uncore_init[] to probe all PCI devices. For MSR portals, only probe MSR 0x201e to keep the fallback simple, as this path is best-effort only. 
Signed-off-by: Zide Chen Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dapeng Mi Link: https://patch.msgid.link/20251231224233.113839-4-zide.chen@intel.com --- arch/x86/events/intel/uncore.c | 3 ++ arch/x86/events/intel/uncore_discovery.c | 42 +++++------------------- 2 files changed, 12 insertions(+), 33 deletions(-) diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 78a03af87204..2387b1a80ca0 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -1835,6 +1835,9 @@ static const struct uncore_plat_init generic_uncore_init __initconst = { .cpu_init = intel_uncore_generic_uncore_cpu_init, .pci_init = intel_uncore_generic_uncore_pci_init, .mmio_init = intel_uncore_generic_uncore_mmio_init, + .domain[0].base_is_pci = true, + .domain[0].discovery_base = PCI_ANY_ID, + .domain[1].discovery_base = UNCORE_DISCOVERY_MSR, }; static const struct x86_cpu_id intel_uncore_match[] __initconst = { diff --git a/arch/x86/events/intel/uncore_discovery.c b/arch/x86/events/intel/uncore_discovery.c index 25b6a65c9614..aaa081011fd6 100644 --- a/arch/x86/events/intel/uncore_discovery.c +++ b/arch/x86/events/intel/uncore_discovery.c @@ -12,24 +12,6 @@ static struct rb_root discovery_tables = RB_ROOT; static int num_discovered_types[UNCORE_ACCESS_MAX]; -static bool has_generic_discovery_table(void) -{ - struct pci_dev *dev; - int dvsec; - - dev = pci_get_device(PCI_VENDOR_ID_INTEL, UNCORE_DISCOVERY_TABLE_DEVICE, NULL); - if (!dev) - return false; - - /* A discovery table device has the unique capability ID. */ - dvsec = pci_find_next_ext_capability(dev, 0, UNCORE_EXT_CAP_ID_DISCOVERY); - pci_dev_put(dev); - if (dvsec) - return true; - - return false; -} - static int logical_die_id; static int get_device_die_id(struct pci_dev *dev) @@ -358,12 +340,7 @@ static bool uncore_discovery_pci(struct uncore_discovery_domain *domain) struct pci_dev *dev = NULL; bool parsed = false; - if (domain->discovery_base) - device = domain->discovery_base; - else if (has_generic_discovery_table()) - device = UNCORE_DISCOVERY_TABLE_DEVICE; - else - device = PCI_ANY_ID; + device = domain->discovery_base; /* * Start a new search and iterates through the list of @@ -406,7 +383,7 @@ static bool uncore_discovery_msr(struct uncore_discovery_domain *domain) { unsigned long *die_mask; bool parsed = false; - int cpu, die, msr; + int cpu, die; u64 base; die_mask = kcalloc(BITS_TO_LONGS(uncore_max_dies()), @@ -414,16 +391,13 @@ static bool uncore_discovery_msr(struct uncore_discovery_domain *domain) if (!die_mask) return false; - msr = domain->discovery_base ? 
- domain->discovery_base : UNCORE_DISCOVERY_MSR; - cpus_read_lock(); for_each_online_cpu(cpu) { die = topology_logical_die_id(cpu); if (__test_and_set_bit(die, die_mask)) continue; - if (rdmsrq_safe_on_cpu(cpu, msr, &base)) + if (rdmsrq_safe_on_cpu(cpu, domain->discovery_base, &base)) continue; if (!base) @@ -446,10 +420,12 @@ bool uncore_discovery(struct uncore_plat_init *init) for (i = 0; i < UNCORE_DISCOVERY_DOMAINS; i++) { domain = &init->domain[i]; - if (!domain->base_is_pci) - ret |= uncore_discovery_msr(domain); - else - ret |= uncore_discovery_pci(domain); + if (domain->discovery_base) { + if (!domain->base_is_pci) + ret |= uncore_discovery_msr(domain); + else + ret |= uncore_discovery_pci(domain); + } } return ret; From 6daf2c35b835da211bf70606e9f74d1af98613a9 Mon Sep 17 00:00:00 2001 From: Zide Chen Date: Wed, 31 Dec 2025 14:42:21 -0800 Subject: [PATCH 30/58] perf/x86/intel/uncore: Add IMH PMON support for Diamond Rapids DMR supports IMH PMON units for PCU, UBox, iMC, and CXL: - PCU and UBox are same with SPR. - iMC is similar to SPR but uses different offsets for fixed registers. - CXL introduces a new port_enable field and changes the position of the threshold field. DMR also introduces additional PMON units: SCA, HAMVF, D2D_ULA, UBR, PCIE4, CRS, CPC, ITC, OTC, CMS, and PCIE6. Among these, PCIE4 and PCIE6 use different unit types, but share the same config register layout, and the generic PCIe PMON events apply to both. Additionally, ignore the broken MSE unit. Signed-off-by: Zide Chen Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20251231224233.113839-5-zide.chen@intel.com --- arch/x86/events/intel/uncore.c | 9 + arch/x86/events/intel/uncore.h | 3 + arch/x86/events/intel/uncore_discovery.h | 2 + arch/x86/events/intel/uncore_snbep.c | 229 +++++++++++++++++++++++ 4 files changed, 243 insertions(+) diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 2387b1a80ca0..40cf9bf79b81 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -1831,6 +1831,14 @@ static const struct uncore_plat_init gnr_uncore_init __initconst = { .domain[0].units_ignore = gnr_uncore_units_ignore, }; +static const struct uncore_plat_init dmr_uncore_init __initconst = { + .pci_init = dmr_uncore_pci_init, + .mmio_init = dmr_uncore_mmio_init, + .domain[0].base_is_pci = true, + .domain[0].discovery_base = DMR_UNCORE_DISCOVERY_TABLE_DEVICE, + .domain[0].units_ignore = dmr_uncore_imh_units_ignore, +}; + static const struct uncore_plat_init generic_uncore_init __initconst = { .cpu_init = intel_uncore_generic_uncore_cpu_init, .pci_init = intel_uncore_generic_uncore_pci_init, @@ -1898,6 +1906,7 @@ static const struct x86_cpu_id intel_uncore_match[] __initconst = { X86_MATCH_VFM(INTEL_ATOM_CRESTMONT_X, &gnr_uncore_init), X86_MATCH_VFM(INTEL_ATOM_CRESTMONT, &gnr_uncore_init), X86_MATCH_VFM(INTEL_ATOM_DARKMONT_X, &gnr_uncore_init), + X86_MATCH_VFM(INTEL_DIAMONDRAPIDS_X, &dmr_uncore_init), {}, }; MODULE_DEVICE_TABLE(x86cpu, intel_uncore_match); diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index 1574ffc7ee05..1e4b3a22403c 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -614,6 +614,7 @@ extern struct pci_extra_dev *uncore_extra_pci_dev; extern struct event_constraint uncore_constraint_empty; extern int spr_uncore_units_ignore[]; extern int gnr_uncore_units_ignore[]; +extern int dmr_uncore_imh_units_ignore[]; /* uncore_snb.c */ int snb_uncore_pci_init(void); @@ -662,6 +663,8 @@ 
void spr_uncore_mmio_init(void); int gnr_uncore_pci_init(void); void gnr_uncore_cpu_init(void); void gnr_uncore_mmio_init(void); +int dmr_uncore_pci_init(void); +void dmr_uncore_mmio_init(void); /* uncore_nhmex.c */ void nhmex_uncore_cpu_init(void); diff --git a/arch/x86/events/intel/uncore_discovery.h b/arch/x86/events/intel/uncore_discovery.h index dfc237a2b6df..618788c30ac6 100644 --- a/arch/x86/events/intel/uncore_discovery.h +++ b/arch/x86/events/intel/uncore_discovery.h @@ -5,6 +5,8 @@ /* Generic device ID of a discovery table device */ #define UNCORE_DISCOVERY_TABLE_DEVICE 0x09a7 +/* Device ID used on DMR */ +#define DMR_UNCORE_DISCOVERY_TABLE_DEVICE 0x09a1 /* Capability ID for a discovery table device */ #define UNCORE_EXT_CAP_ID_DISCOVERY 0x23 /* First DVSEC offset */ diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index e1f370b8d065..4b72560dc13f 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -471,6 +471,14 @@ #define SPR_C0_MSR_PMON_BOX_FILTER0 0x200e +/* DMR */ +#define DMR_CXLCM_EVENT_MASK_EXT 0xf +#define DMR_HAMVF_EVENT_MASK_EXT 0xffffffff +#define DMR_PCIE4_EVENT_MASK_EXT 0xffffff + +#define DMR_IMC_PMON_FIXED_CTR 0x18 +#define DMR_IMC_PMON_FIXED_CTL 0x10 + DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7"); DEFINE_UNCORE_FORMAT_ATTR(event2, event, "config:0-6"); DEFINE_UNCORE_FORMAT_ATTR(event_ext, event, "config:0-7,21"); @@ -486,6 +494,10 @@ DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18"); DEFINE_UNCORE_FORMAT_ATTR(tid_en, tid_en, "config:19"); DEFINE_UNCORE_FORMAT_ATTR(tid_en2, tid_en, "config:16"); DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23"); +DEFINE_UNCORE_FORMAT_ATTR(inv2, inv, "config:21"); +DEFINE_UNCORE_FORMAT_ATTR(thresh_ext, thresh_ext, "config:32-35"); +DEFINE_UNCORE_FORMAT_ATTR(thresh10, thresh, "config:23-32"); +DEFINE_UNCORE_FORMAT_ATTR(thresh9_2, thresh, "config:23-31"); DEFINE_UNCORE_FORMAT_ATTR(thresh9, thresh, "config:24-35"); DEFINE_UNCORE_FORMAT_ATTR(thresh8, thresh, "config:24-31"); DEFINE_UNCORE_FORMAT_ATTR(thresh6, thresh, "config:24-29"); @@ -494,6 +506,13 @@ DEFINE_UNCORE_FORMAT_ATTR(occ_sel, occ_sel, "config:14-15"); DEFINE_UNCORE_FORMAT_ATTR(occ_invert, occ_invert, "config:30"); DEFINE_UNCORE_FORMAT_ATTR(occ_edge, occ_edge, "config:14-51"); DEFINE_UNCORE_FORMAT_ATTR(occ_edge_det, occ_edge_det, "config:31"); +DEFINE_UNCORE_FORMAT_ATTR(port_en, port_en, "config:32-35"); +DEFINE_UNCORE_FORMAT_ATTR(rs3_sel, rs3_sel, "config:36"); +DEFINE_UNCORE_FORMAT_ATTR(rx_sel, rx_sel, "config:37"); +DEFINE_UNCORE_FORMAT_ATTR(tx_sel, tx_sel, "config:38"); +DEFINE_UNCORE_FORMAT_ATTR(iep_sel, iep_sel, "config:39"); +DEFINE_UNCORE_FORMAT_ATTR(vc_sel, vc_sel, "config:40-47"); +DEFINE_UNCORE_FORMAT_ATTR(port_sel, port_sel, "config:48-55"); DEFINE_UNCORE_FORMAT_ATTR(ch_mask, ch_mask, "config:36-43"); DEFINE_UNCORE_FORMAT_ATTR(ch_mask2, ch_mask, "config:36-47"); DEFINE_UNCORE_FORMAT_ATTR(fc_mask, fc_mask, "config:44-46"); @@ -6709,3 +6728,213 @@ void gnr_uncore_mmio_init(void) } /* end of GNR uncore support */ + +/* DMR uncore support */ +#define UNCORE_DMR_NUM_UNCORE_TYPES 52 + +static struct attribute *dmr_imc_uncore_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh10.attr, + NULL, +}; + +static const struct attribute_group dmr_imc_uncore_format_group = { + .name = "format", + .attrs = dmr_imc_uncore_formats_attr, +}; + +static struct intel_uncore_type dmr_uncore_imc = { + 
.name = "imc", + .fixed_ctr_bits = 48, + .fixed_ctr = DMR_IMC_PMON_FIXED_CTR, + .fixed_ctl = DMR_IMC_PMON_FIXED_CTL, + .ops = &spr_uncore_mmio_ops, + .format_group = &dmr_imc_uncore_format_group, + .attr_update = uncore_alias_groups, +}; + +static struct attribute *dmr_sca_uncore_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask_ext5.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + NULL, +}; + +static const struct attribute_group dmr_sca_uncore_format_group = { + .name = "format", + .attrs = dmr_sca_uncore_formats_attr, +}; + +static struct intel_uncore_type dmr_uncore_sca = { + .name = "sca", + .event_mask_ext = DMR_HAMVF_EVENT_MASK_EXT, + .format_group = &dmr_sca_uncore_format_group, + .attr_update = uncore_alias_groups, +}; + +static struct attribute *dmr_cxlcm_uncore_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv2.attr, + &format_attr_thresh9_2.attr, + &format_attr_port_en.attr, + NULL, +}; + +static const struct attribute_group dmr_cxlcm_uncore_format_group = { + .name = "format", + .attrs = dmr_cxlcm_uncore_formats_attr, +}; + +static struct intel_uncore_type dmr_uncore_cxlcm = { + .name = "cxlcm", + .event_mask = GENERIC_PMON_RAW_EVENT_MASK, + .event_mask_ext = DMR_CXLCM_EVENT_MASK_EXT, + .format_group = &dmr_cxlcm_uncore_format_group, + .attr_update = uncore_alias_groups, +}; + +static struct intel_uncore_type dmr_uncore_hamvf = { + .name = "hamvf", + .event_mask_ext = DMR_HAMVF_EVENT_MASK_EXT, + .format_group = &dmr_sca_uncore_format_group, + .attr_update = uncore_alias_groups, +}; + +static struct intel_uncore_type dmr_uncore_ula = { + .name = "ula", + .event_mask_ext = DMR_HAMVF_EVENT_MASK_EXT, + .format_group = &dmr_sca_uncore_format_group, + .attr_update = uncore_alias_groups, +}; + +static struct intel_uncore_type dmr_uncore_ubr = { + .name = "ubr", + .event_mask_ext = DMR_HAMVF_EVENT_MASK_EXT, + .format_group = &dmr_sca_uncore_format_group, + .attr_update = uncore_alias_groups, +}; + +static struct attribute *dmr_pcie4_uncore_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + &format_attr_thresh_ext.attr, + &format_attr_rs3_sel.attr, + &format_attr_rx_sel.attr, + &format_attr_tx_sel.attr, + &format_attr_iep_sel.attr, + &format_attr_vc_sel.attr, + &format_attr_port_sel.attr, + NULL, +}; + +static const struct attribute_group dmr_pcie4_uncore_format_group = { + .name = "format", + .attrs = dmr_pcie4_uncore_formats_attr, +}; + +static struct intel_uncore_type dmr_uncore_pcie4 = { + .name = "pcie4", + .event_mask_ext = DMR_PCIE4_EVENT_MASK_EXT, + .format_group = &dmr_pcie4_uncore_format_group, + .attr_update = uncore_alias_groups, +}; + +static struct intel_uncore_type dmr_uncore_crs = { + .name = "crs", + .attr_update = uncore_alias_groups, +}; + +static struct intel_uncore_type dmr_uncore_cpc = { + .name = "cpc", + .event_mask_ext = DMR_HAMVF_EVENT_MASK_EXT, + .format_group = &dmr_sca_uncore_format_group, + .attr_update = uncore_alias_groups, +}; + +static struct intel_uncore_type dmr_uncore_itc = { + .name = "itc", + .event_mask_ext = DMR_HAMVF_EVENT_MASK_EXT, + .format_group = &dmr_sca_uncore_format_group, + .attr_update = uncore_alias_groups, +}; + +static struct intel_uncore_type dmr_uncore_otc = { + .name = "otc", + .event_mask_ext = DMR_HAMVF_EVENT_MASK_EXT, + .format_group = &dmr_sca_uncore_format_group, + .attr_update = uncore_alias_groups, 
+}; + +static struct intel_uncore_type dmr_uncore_cms = { + .name = "cms", + .attr_update = uncore_alias_groups, +}; + +static struct intel_uncore_type dmr_uncore_pcie6 = { + .name = "pcie6", + .event_mask_ext = DMR_PCIE4_EVENT_MASK_EXT, + .format_group = &dmr_pcie4_uncore_format_group, + .attr_update = uncore_alias_groups, +}; + +static struct intel_uncore_type *dmr_uncores[UNCORE_DMR_NUM_UNCORE_TYPES] = { + NULL, NULL, NULL, NULL, + &spr_uncore_pcu, + &gnr_uncore_ubox, + &dmr_uncore_imc, + NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, + &dmr_uncore_sca, + &dmr_uncore_cxlcm, + NULL, NULL, NULL, + NULL, NULL, + &dmr_uncore_hamvf, + NULL, + NULL, NULL, NULL, + &dmr_uncore_ula, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, + &dmr_uncore_ubr, + NULL, + &dmr_uncore_pcie4, + &dmr_uncore_crs, + &dmr_uncore_cpc, + &dmr_uncore_itc, + &dmr_uncore_otc, + &dmr_uncore_cms, + &dmr_uncore_pcie6, +}; + +int dmr_uncore_imh_units_ignore[] = { + 0x13, /* MSE */ + UNCORE_IGNORE_END +}; + +int dmr_uncore_pci_init(void) +{ + uncore_pci_uncores = uncore_get_uncores(UNCORE_ACCESS_PCI, 0, NULL, + UNCORE_DMR_NUM_UNCORE_TYPES, + dmr_uncores); + return 0; +} +void dmr_uncore_mmio_init(void) +{ + uncore_mmio_uncores = uncore_get_uncores(UNCORE_ACCESS_MMIO, 0, NULL, + UNCORE_DMR_NUM_UNCORE_TYPES, + dmr_uncores); +} + +/* end of DMR uncore support */ From 66e2075426f3220857eb3987c803764c82cef851 Mon Sep 17 00:00:00 2001 From: Zide Chen Date: Wed, 31 Dec 2025 14:42:22 -0800 Subject: [PATCH 31/58] perf/x86/intel/uncore: Add CBB PMON support for Diamond Rapids MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On DMR, PMON units inside the Core Building Block (CBB) are enumerated separately from those in the Integrated Memory and I/O Hub (IMH). A new per-CBB MSR (0x710) is introduced for discovery table enumeration. For counter control registers, the tid_en bit (bit 16) exists on CBO, SBO, and Santa, but it is not used by any events. Mark this bit as reserved. Similarly, disallow extended umask (bits 32–63) on Santa and sNCU. Additionally, ignore broken SB2UCIE unit. 
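For clarity, the way such a per-domain ignore list is consumed can be shown with a small standalone program; it mirrors the sentinel-terminated walk done by uncore_ignore_unit() earlier in this series (the main() driver and the range of box types printed are purely illustrative):

#include <stdbool.h>
#include <stdio.h>

#define UNCORE_IGNORE_END -1

static const int dmr_cbb_ignore[] = {
	0x25,			/* SB2UCIE, broken on the DMR CBB domain */
	UNCORE_IGNORE_END
};

static bool unit_ignored(int box_type, const int *ignore)
{
	if (!ignore)
		return false;

	for (int i = 0; ignore[i] != UNCORE_IGNORE_END; i++) {
		if (box_type == ignore[i])
			return true;
	}
	return false;
}

int main(void)
{
	for (int type = 0x23; type <= 0x26; type++)
		printf("box_type 0x%x: %s\n", type,
		       unit_ignored(type, dmr_cbb_ignore) ? "skipped" : "parsed");
	return 0;
}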
Signed-off-by: Zide Chen Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dapeng Mi Link: https://patch.msgid.link/20251231224233.113839-6-zide.chen@intel.com --- arch/x86/events/intel/uncore.c | 2 + arch/x86/events/intel/uncore.h | 1 + arch/x86/events/intel/uncore_discovery.h | 2 + arch/x86/events/intel/uncore_snbep.c | 52 ++++++++++++++++++++++-- 4 files changed, 54 insertions(+), 3 deletions(-) diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 40cf9bf79b81..08e5dd4a5fb8 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -1837,6 +1837,8 @@ static const struct uncore_plat_init dmr_uncore_init __initconst = { .domain[0].base_is_pci = true, .domain[0].discovery_base = DMR_UNCORE_DISCOVERY_TABLE_DEVICE, .domain[0].units_ignore = dmr_uncore_imh_units_ignore, + .domain[1].discovery_base = CBB_UNCORE_DISCOVERY_MSR, + .domain[1].units_ignore = dmr_uncore_cbb_units_ignore, }; static const struct uncore_plat_init generic_uncore_init __initconst = { diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index 1e4b3a22403c..83d01a9cefc0 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -615,6 +615,7 @@ extern struct event_constraint uncore_constraint_empty; extern int spr_uncore_units_ignore[]; extern int gnr_uncore_units_ignore[]; extern int dmr_uncore_imh_units_ignore[]; +extern int dmr_uncore_cbb_units_ignore[]; /* uncore_snb.c */ int snb_uncore_pci_init(void); diff --git a/arch/x86/events/intel/uncore_discovery.h b/arch/x86/events/intel/uncore_discovery.h index 618788c30ac6..63b8f7634e42 100644 --- a/arch/x86/events/intel/uncore_discovery.h +++ b/arch/x86/events/intel/uncore_discovery.h @@ -2,6 +2,8 @@ /* Store the full address of the global discovery table */ #define UNCORE_DISCOVERY_MSR 0x201e +/* Base address of uncore perfmon discovery table for CBB domain */ +#define CBB_UNCORE_DISCOVERY_MSR 0x710 /* Generic device ID of a discovery table device */ #define UNCORE_DISCOVERY_TABLE_DEVICE 0x09a7 diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 4b72560dc13f..df173534637a 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -6807,6 +6807,28 @@ static struct intel_uncore_type dmr_uncore_hamvf = { .attr_update = uncore_alias_groups, }; +static struct intel_uncore_type dmr_uncore_cbo = { + .name = "cbo", + .event_mask_ext = DMR_HAMVF_EVENT_MASK_EXT, + .format_group = &dmr_sca_uncore_format_group, + .attr_update = uncore_alias_groups, +}; + +static struct intel_uncore_type dmr_uncore_santa = { + .name = "santa", + .attr_update = uncore_alias_groups, +}; + +static struct intel_uncore_type dmr_uncore_cncu = { + .name = "cncu", + .attr_update = uncore_alias_groups, +}; + +static struct intel_uncore_type dmr_uncore_sncu = { + .name = "sncu", + .attr_update = uncore_alias_groups, +}; + static struct intel_uncore_type dmr_uncore_ula = { .name = "ula", .event_mask_ext = DMR_HAMVF_EVENT_MASK_EXT, @@ -6814,6 +6836,20 @@ static struct intel_uncore_type dmr_uncore_ula = { .attr_update = uncore_alias_groups, }; +static struct intel_uncore_type dmr_uncore_dda = { + .name = "dda", + .event_mask_ext = DMR_HAMVF_EVENT_MASK_EXT, + .format_group = &dmr_sca_uncore_format_group, + .attr_update = uncore_alias_groups, +}; + +static struct intel_uncore_type dmr_uncore_sbo = { + .name = "sbo", + .event_mask_ext = DMR_HAMVF_EVENT_MASK_EXT, + .format_group = &dmr_sca_uncore_format_group, + .attr_update = 
uncore_alias_groups, +}; + static struct intel_uncore_type dmr_uncore_ubr = { .name = "ubr", .event_mask_ext = DMR_HAMVF_EVENT_MASK_EXT, @@ -6902,10 +6938,15 @@ static struct intel_uncore_type *dmr_uncores[UNCORE_DMR_NUM_UNCORE_TYPES] = { NULL, NULL, NULL, NULL, NULL, &dmr_uncore_hamvf, - NULL, - NULL, NULL, NULL, + &dmr_uncore_cbo, + &dmr_uncore_santa, + &dmr_uncore_cncu, + &dmr_uncore_sncu, &dmr_uncore_ula, - NULL, NULL, NULL, NULL, + &dmr_uncore_dda, + NULL, + &dmr_uncore_sbo, + NULL, NULL, NULL, NULL, &dmr_uncore_ubr, NULL, @@ -6923,6 +6964,11 @@ int dmr_uncore_imh_units_ignore[] = { UNCORE_IGNORE_END }; +int dmr_uncore_cbb_units_ignore[] = { + 0x25, /* SB2UCIE */ + UNCORE_IGNORE_END +}; + int dmr_uncore_pci_init(void) { uncore_pci_uncores = uncore_get_uncores(UNCORE_ACCESS_PCI, 0, NULL, From b575fc0e33574f3a476b68057e340ebe32d7b750 Mon Sep 17 00:00:00 2001 From: Zide Chen Date: Wed, 31 Dec 2025 14:42:23 -0800 Subject: [PATCH 32/58] perf/x86/intel/uncore: Add domain global init callback In the Intel uncore self-describing mechanism, the Global Control Register freeze_all bit is SoC-wide and propagates to all uncore PMUs. On Diamond Rapids, this bit is set at power-on, unlike some prior platforms. Add a global_init callback to unfreeze all PMON units. Signed-off-by: Zide Chen Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dapeng Mi Link: https://patch.msgid.link/20251231224233.113839-7-zide.chen@intel.com --- arch/x86/events/intel/uncore.c | 16 ++++++++++++++++ arch/x86/events/intel/uncore.h | 2 ++ arch/x86/events/intel/uncore_discovery.c | 3 +++ 3 files changed, 21 insertions(+) diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 08e5dd4a5fb8..25a678b4ca43 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -1697,6 +1697,21 @@ static int __init uncore_mmio_init(void) return ret; } +static int uncore_mmio_global_init(u64 ctl) +{ + void __iomem *io_addr; + + io_addr = ioremap(ctl, sizeof(ctl)); + if (!io_addr) + return -ENOMEM; + + /* Clear freeze bit (0) to enable all counters. */ + writel(0, io_addr); + + iounmap(io_addr); + return 0; +} + static const struct uncore_plat_init nhm_uncore_init __initconst = { .cpu_init = nhm_uncore_cpu_init, }; @@ -1839,6 +1854,7 @@ static const struct uncore_plat_init dmr_uncore_init __initconst = { .domain[0].units_ignore = dmr_uncore_imh_units_ignore, .domain[1].discovery_base = CBB_UNCORE_DISCOVERY_MSR, .domain[1].units_ignore = dmr_uncore_cbb_units_ignore, + .domain[1].global_init = uncore_mmio_global_init, }; static const struct uncore_plat_init generic_uncore_init __initconst = { diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index 83d01a9cefc0..55e3aebf4b5e 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -51,6 +51,8 @@ struct uncore_discovery_domain { /* MSR address or PCI device used as the discovery base */ u32 discovery_base; bool base_is_pci; + int (*global_init)(u64 ctl); + /* The units in the discovery table should be ignored. 
*/ int *units_ignore; }; diff --git a/arch/x86/events/intel/uncore_discovery.c b/arch/x86/events/intel/uncore_discovery.c index aaa081011fd6..b46575254dbe 100644 --- a/arch/x86/events/intel/uncore_discovery.c +++ b/arch/x86/events/intel/uncore_discovery.c @@ -286,6 +286,9 @@ static int __parse_discovery_table(struct uncore_discovery_domain *domain, if (!io_addr) return -ENOMEM; + if (domain->global_init && domain->global_init(global.ctl)) + return -ENODEV; + /* Parsing Unit Discovery State */ for (i = 0; i < global.max_units; i++) { memcpy_fromio(&unit, io_addr + (i + 1) * (global.stride * 8), From 8a4bd1c0d6bb64ab4d9e94d83c40326356421a73 Mon Sep 17 00:00:00 2001 From: Zide Chen Date: Wed, 31 Dec 2025 14:42:24 -0800 Subject: [PATCH 33/58] perf/x86/intel/uncore: Add freerunning event descriptor helper macro Freerunning counter events are repetitive: the event code is fixed to 0xff, the unit is always "MiB", and the scale is identical across all counters on a given PMON unit. Introduce a new helper macro, INTEL_UNCORE_FR_EVENT_DESC(), to populate the event, scale, and unit descriptor triplet. This reduces duplicated lines and improves readability. No functional change intended. Signed-off-by: Zide Chen Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dapeng Mi Link: https://patch.msgid.link/20251231224233.113839-8-zide.chen@intel.com --- arch/x86/events/intel/uncore_snbep.c | 95 ++++++++-------------------- 1 file changed, 28 insertions(+), 67 deletions(-) diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index df173534637a..cfb4ce325bb6 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -4068,34 +4068,24 @@ static struct freerunning_counters skx_iio_freerunning[] = { [SKX_IIO_MSR_UTIL] = { 0xb08, 0x1, 0x10, 8, 36 }, }; +#define INTEL_UNCORE_FR_EVENT_DESC(name, umask, scl) \ + INTEL_UNCORE_EVENT_DESC(name, \ + "event=0xff,umask=" __stringify(umask)),\ + INTEL_UNCORE_EVENT_DESC(name.scale, __stringify(scl)), \ + INTEL_UNCORE_EVENT_DESC(name.unit, "MiB") + static struct uncore_event_desc skx_uncore_iio_freerunning_events[] = { /* Free-Running IO CLOCKS Counter */ INTEL_UNCORE_EVENT_DESC(ioclk, "event=0xff,umask=0x10"), /* Free-Running IIO BANDWIDTH Counters */ - INTEL_UNCORE_EVENT_DESC(bw_in_port0, "event=0xff,umask=0x20"), - INTEL_UNCORE_EVENT_DESC(bw_in_port0.scale, "3.814697266e-6"), - INTEL_UNCORE_EVENT_DESC(bw_in_port0.unit, "MiB"), - INTEL_UNCORE_EVENT_DESC(bw_in_port1, "event=0xff,umask=0x21"), - INTEL_UNCORE_EVENT_DESC(bw_in_port1.scale, "3.814697266e-6"), - INTEL_UNCORE_EVENT_DESC(bw_in_port1.unit, "MiB"), - INTEL_UNCORE_EVENT_DESC(bw_in_port2, "event=0xff,umask=0x22"), - INTEL_UNCORE_EVENT_DESC(bw_in_port2.scale, "3.814697266e-6"), - INTEL_UNCORE_EVENT_DESC(bw_in_port2.unit, "MiB"), - INTEL_UNCORE_EVENT_DESC(bw_in_port3, "event=0xff,umask=0x23"), - INTEL_UNCORE_EVENT_DESC(bw_in_port3.scale, "3.814697266e-6"), - INTEL_UNCORE_EVENT_DESC(bw_in_port3.unit, "MiB"), - INTEL_UNCORE_EVENT_DESC(bw_out_port0, "event=0xff,umask=0x24"), - INTEL_UNCORE_EVENT_DESC(bw_out_port0.scale, "3.814697266e-6"), - INTEL_UNCORE_EVENT_DESC(bw_out_port0.unit, "MiB"), - INTEL_UNCORE_EVENT_DESC(bw_out_port1, "event=0xff,umask=0x25"), - INTEL_UNCORE_EVENT_DESC(bw_out_port1.scale, "3.814697266e-6"), - INTEL_UNCORE_EVENT_DESC(bw_out_port1.unit, "MiB"), - INTEL_UNCORE_EVENT_DESC(bw_out_port2, "event=0xff,umask=0x26"), - INTEL_UNCORE_EVENT_DESC(bw_out_port2.scale, "3.814697266e-6"), - INTEL_UNCORE_EVENT_DESC(bw_out_port2.unit, 
"MiB"), - INTEL_UNCORE_EVENT_DESC(bw_out_port3, "event=0xff,umask=0x27"), - INTEL_UNCORE_EVENT_DESC(bw_out_port3.scale, "3.814697266e-6"), - INTEL_UNCORE_EVENT_DESC(bw_out_port3.unit, "MiB"), + INTEL_UNCORE_FR_EVENT_DESC(bw_in_port0, 0x20, 3.814697266e-6), + INTEL_UNCORE_FR_EVENT_DESC(bw_in_port1, 0x21, 3.814697266e-6), + INTEL_UNCORE_FR_EVENT_DESC(bw_in_port2, 0x22, 3.814697266e-6), + INTEL_UNCORE_FR_EVENT_DESC(bw_in_port3, 0x23, 3.814697266e-6), + INTEL_UNCORE_FR_EVENT_DESC(bw_out_port0, 0x24, 3.814697266e-6), + INTEL_UNCORE_FR_EVENT_DESC(bw_out_port1, 0x25, 3.814697266e-6), + INTEL_UNCORE_FR_EVENT_DESC(bw_out_port2, 0x26, 3.814697266e-6), + INTEL_UNCORE_FR_EVENT_DESC(bw_out_port3, 0x27, 3.814697266e-6), /* Free-running IIO UTILIZATION Counters */ INTEL_UNCORE_EVENT_DESC(util_in_port0, "event=0xff,umask=0x30"), INTEL_UNCORE_EVENT_DESC(util_out_port0, "event=0xff,umask=0x31"), @@ -4910,30 +4900,14 @@ static struct uncore_event_desc snr_uncore_iio_freerunning_events[] = { /* Free-Running IIO CLOCKS Counter */ INTEL_UNCORE_EVENT_DESC(ioclk, "event=0xff,umask=0x10"), /* Free-Running IIO BANDWIDTH IN Counters */ - INTEL_UNCORE_EVENT_DESC(bw_in_port0, "event=0xff,umask=0x20"), - INTEL_UNCORE_EVENT_DESC(bw_in_port0.scale, "3.0517578125e-5"), - INTEL_UNCORE_EVENT_DESC(bw_in_port0.unit, "MiB"), - INTEL_UNCORE_EVENT_DESC(bw_in_port1, "event=0xff,umask=0x21"), - INTEL_UNCORE_EVENT_DESC(bw_in_port1.scale, "3.0517578125e-5"), - INTEL_UNCORE_EVENT_DESC(bw_in_port1.unit, "MiB"), - INTEL_UNCORE_EVENT_DESC(bw_in_port2, "event=0xff,umask=0x22"), - INTEL_UNCORE_EVENT_DESC(bw_in_port2.scale, "3.0517578125e-5"), - INTEL_UNCORE_EVENT_DESC(bw_in_port2.unit, "MiB"), - INTEL_UNCORE_EVENT_DESC(bw_in_port3, "event=0xff,umask=0x23"), - INTEL_UNCORE_EVENT_DESC(bw_in_port3.scale, "3.0517578125e-5"), - INTEL_UNCORE_EVENT_DESC(bw_in_port3.unit, "MiB"), - INTEL_UNCORE_EVENT_DESC(bw_in_port4, "event=0xff,umask=0x24"), - INTEL_UNCORE_EVENT_DESC(bw_in_port4.scale, "3.0517578125e-5"), - INTEL_UNCORE_EVENT_DESC(bw_in_port4.unit, "MiB"), - INTEL_UNCORE_EVENT_DESC(bw_in_port5, "event=0xff,umask=0x25"), - INTEL_UNCORE_EVENT_DESC(bw_in_port5.scale, "3.0517578125e-5"), - INTEL_UNCORE_EVENT_DESC(bw_in_port5.unit, "MiB"), - INTEL_UNCORE_EVENT_DESC(bw_in_port6, "event=0xff,umask=0x26"), - INTEL_UNCORE_EVENT_DESC(bw_in_port6.scale, "3.0517578125e-5"), - INTEL_UNCORE_EVENT_DESC(bw_in_port6.unit, "MiB"), - INTEL_UNCORE_EVENT_DESC(bw_in_port7, "event=0xff,umask=0x27"), - INTEL_UNCORE_EVENT_DESC(bw_in_port7.scale, "3.0517578125e-5"), - INTEL_UNCORE_EVENT_DESC(bw_in_port7.unit, "MiB"), + INTEL_UNCORE_FR_EVENT_DESC(bw_in_port0, 0x20, 3.0517578125e-5), + INTEL_UNCORE_FR_EVENT_DESC(bw_in_port1, 0x21, 3.0517578125e-5), + INTEL_UNCORE_FR_EVENT_DESC(bw_in_port2, 0x22, 3.0517578125e-5), + INTEL_UNCORE_FR_EVENT_DESC(bw_in_port3, 0x23, 3.0517578125e-5), + INTEL_UNCORE_FR_EVENT_DESC(bw_in_port4, 0x24, 3.0517578125e-5), + INTEL_UNCORE_FR_EVENT_DESC(bw_in_port5, 0x25, 3.0517578125e-5), + INTEL_UNCORE_FR_EVENT_DESC(bw_in_port6, 0x26, 3.0517578125e-5), + INTEL_UNCORE_FR_EVENT_DESC(bw_in_port7, 0x27, 3.0517578125e-5), { /* end: all zeroes */ }, }; @@ -5266,12 +5240,8 @@ static struct freerunning_counters snr_imc_freerunning[] = { static struct uncore_event_desc snr_uncore_imc_freerunning_events[] = { INTEL_UNCORE_EVENT_DESC(dclk, "event=0xff,umask=0x10"), - INTEL_UNCORE_EVENT_DESC(read, "event=0xff,umask=0x20"), - INTEL_UNCORE_EVENT_DESC(read.scale, "6.103515625e-5"), - INTEL_UNCORE_EVENT_DESC(read.unit, "MiB"), - INTEL_UNCORE_EVENT_DESC(write, 
"event=0xff,umask=0x21"), - INTEL_UNCORE_EVENT_DESC(write.scale, "6.103515625e-5"), - INTEL_UNCORE_EVENT_DESC(write.unit, "MiB"), + INTEL_UNCORE_FR_EVENT_DESC(read, 0x20, 6.103515625e-5), + INTEL_UNCORE_FR_EVENT_DESC(write, 0x21, 6.103515625e-5), { /* end: all zeroes */ }, }; @@ -5836,19 +5806,10 @@ static struct freerunning_counters icx_imc_freerunning[] = { static struct uncore_event_desc icx_uncore_imc_freerunning_events[] = { INTEL_UNCORE_EVENT_DESC(dclk, "event=0xff,umask=0x10"), - INTEL_UNCORE_EVENT_DESC(read, "event=0xff,umask=0x20"), - INTEL_UNCORE_EVENT_DESC(read.scale, "6.103515625e-5"), - INTEL_UNCORE_EVENT_DESC(read.unit, "MiB"), - INTEL_UNCORE_EVENT_DESC(write, "event=0xff,umask=0x21"), - INTEL_UNCORE_EVENT_DESC(write.scale, "6.103515625e-5"), - INTEL_UNCORE_EVENT_DESC(write.unit, "MiB"), - - INTEL_UNCORE_EVENT_DESC(ddrt_read, "event=0xff,umask=0x30"), - INTEL_UNCORE_EVENT_DESC(ddrt_read.scale, "6.103515625e-5"), - INTEL_UNCORE_EVENT_DESC(ddrt_read.unit, "MiB"), - INTEL_UNCORE_EVENT_DESC(ddrt_write, "event=0xff,umask=0x31"), - INTEL_UNCORE_EVENT_DESC(ddrt_write.scale, "6.103515625e-5"), - INTEL_UNCORE_EVENT_DESC(ddrt_write.unit, "MiB"), + INTEL_UNCORE_FR_EVENT_DESC(read, 0x20, 6.103515625e-5), + INTEL_UNCORE_FR_EVENT_DESC(write, 0x21, 6.103515625e-5), + INTEL_UNCORE_FR_EVENT_DESC(ddrt_read, 0x30, 6.103515625e-5), + INTEL_UNCORE_FR_EVENT_DESC(ddrt_write, 0x31, 6.103515625e-5), { /* end: all zeroes */ }, }; From d8987048f6655b38453d00782a256179f082b79c Mon Sep 17 00:00:00 2001 From: Zide Chen Date: Wed, 31 Dec 2025 14:42:25 -0800 Subject: [PATCH 34/58] perf/x86/intel/uncore: Support IIO free-running counters on DMR The free-running counters for IIO uncore blocks on Diamond Rapids are similar to Sapphire Rapids IMC freecounters, with the following differences: - The counters are MMIO based. - Only a subset of IP blocks implement free-running counters: HIOP0 (IP Base Addr: 2E7000h) HIOP1 (IP Base Addr: 2EF000h) HIOP3 (IP Base Addr: 2FF000h) HIOP4 (IP Base Addr: 307000h) - IMH2 (Secondary IMH) does not provide free-running counters. Signed-off-by: Zide Chen Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dapeng Mi Link: https://patch.msgid.link/20251231224233.113839-9-zide.chen@intel.com --- arch/x86/events/intel/uncore_snbep.c | 118 +++++++++++++++++++++++++-- 1 file changed, 113 insertions(+), 5 deletions(-) diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index cfb4ce325bb6..cc8145e7030e 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -472,10 +472,14 @@ #define SPR_C0_MSR_PMON_BOX_FILTER0 0x200e /* DMR */ +#define DMR_IMH1_HIOP_MMIO_BASE 0x1ffff6ae7000 +#define DMR_HIOP_MMIO_SIZE 0x8000 #define DMR_CXLCM_EVENT_MASK_EXT 0xf #define DMR_HAMVF_EVENT_MASK_EXT 0xffffffff #define DMR_PCIE4_EVENT_MASK_EXT 0xffffff +#define UNCORE_DMR_ITC 0x30 + #define DMR_IMC_PMON_FIXED_CTR 0x18 #define DMR_IMC_PMON_FIXED_CTL 0x10 @@ -6442,7 +6446,11 @@ static int uncore_type_max_boxes(struct intel_uncore_type **types, for (node = rb_first(type->boxes); node; node = rb_next(node)) { unit = rb_entry(node, struct intel_uncore_discovery_unit, node); - if (unit->id > max) + /* + * on DMR IMH2, the unit id starts from 0x8000, + * and we don't need to count it. 
+ */ + if ((unit->id > max) && (unit->id < 0x8000)) max = unit->id; } return max + 1; @@ -6930,6 +6938,101 @@ int dmr_uncore_cbb_units_ignore[] = { UNCORE_IGNORE_END }; +static unsigned int dmr_iio_freerunning_box_offsets[] = { + 0x0, 0x8000, 0x18000, 0x20000 +}; + +static void dmr_uncore_freerunning_init_box(struct intel_uncore_box *box) +{ + struct intel_uncore_type *type = box->pmu->type; + u64 mmio_base; + + if (box->pmu->pmu_idx >= type->num_boxes) + return; + + mmio_base = DMR_IMH1_HIOP_MMIO_BASE; + mmio_base += dmr_iio_freerunning_box_offsets[box->pmu->pmu_idx]; + + box->io_addr = ioremap(mmio_base, type->mmio_map_size); + if (!box->io_addr) + pr_warn("perf uncore: Failed to ioremap for %s.\n", type->name); +} + +static struct intel_uncore_ops dmr_uncore_freerunning_ops = { + .init_box = dmr_uncore_freerunning_init_box, + .exit_box = uncore_mmio_exit_box, + .read_counter = uncore_mmio_read_counter, + .hw_config = uncore_freerunning_hw_config, +}; + +enum perf_uncore_dmr_iio_freerunning_type_id { + DMR_ITC_INB_DATA_BW, + DMR_ITC_BW_IN, + DMR_OTC_BW_OUT, + DMR_OTC_CLOCK_TICKS, + + DMR_IIO_FREERUNNING_TYPE_MAX, +}; + +static struct freerunning_counters dmr_iio_freerunning[] = { + [DMR_ITC_INB_DATA_BW] = { 0x4d40, 0x8, 0, 8, 48}, + [DMR_ITC_BW_IN] = { 0x6b00, 0x8, 0, 8, 48}, + [DMR_OTC_BW_OUT] = { 0x6b60, 0x8, 0, 8, 48}, + [DMR_OTC_CLOCK_TICKS] = { 0x6bb0, 0x8, 0, 1, 48}, +}; + +static struct uncore_event_desc dmr_uncore_iio_freerunning_events[] = { + /* ITC Free Running Data BW counter for inbound traffic */ + INTEL_UNCORE_FR_EVENT_DESC(inb_data_port0, 0x10, "3.814697266e-6"), + INTEL_UNCORE_FR_EVENT_DESC(inb_data_port1, 0x11, "3.814697266e-6"), + INTEL_UNCORE_FR_EVENT_DESC(inb_data_port2, 0x12, "3.814697266e-6"), + INTEL_UNCORE_FR_EVENT_DESC(inb_data_port3, 0x13, "3.814697266e-6"), + INTEL_UNCORE_FR_EVENT_DESC(inb_data_port4, 0x14, "3.814697266e-6"), + INTEL_UNCORE_FR_EVENT_DESC(inb_data_port5, 0x15, "3.814697266e-6"), + INTEL_UNCORE_FR_EVENT_DESC(inb_data_port6, 0x16, "3.814697266e-6"), + INTEL_UNCORE_FR_EVENT_DESC(inb_data_port7, 0x17, "3.814697266e-6"), + + /* ITC Free Running BW IN counters */ + INTEL_UNCORE_FR_EVENT_DESC(bw_in_port0, 0x20, "3.814697266e-6"), + INTEL_UNCORE_FR_EVENT_DESC(bw_in_port1, 0x21, "3.814697266e-6"), + INTEL_UNCORE_FR_EVENT_DESC(bw_in_port2, 0x22, "3.814697266e-6"), + INTEL_UNCORE_FR_EVENT_DESC(bw_in_port3, 0x23, "3.814697266e-6"), + INTEL_UNCORE_FR_EVENT_DESC(bw_in_port4, 0x24, "3.814697266e-6"), + INTEL_UNCORE_FR_EVENT_DESC(bw_in_port5, 0x25, "3.814697266e-6"), + INTEL_UNCORE_FR_EVENT_DESC(bw_in_port6, 0x26, "3.814697266e-6"), + INTEL_UNCORE_FR_EVENT_DESC(bw_in_port7, 0x27, "3.814697266e-6"), + + /* ITC Free Running BW OUT counters */ + INTEL_UNCORE_FR_EVENT_DESC(bw_out_port0, 0x30, "3.814697266e-6"), + INTEL_UNCORE_FR_EVENT_DESC(bw_out_port1, 0x31, "3.814697266e-6"), + INTEL_UNCORE_FR_EVENT_DESC(bw_out_port2, 0x32, "3.814697266e-6"), + INTEL_UNCORE_FR_EVENT_DESC(bw_out_port3, 0x33, "3.814697266e-6"), + INTEL_UNCORE_FR_EVENT_DESC(bw_out_port4, 0x34, "3.814697266e-6"), + INTEL_UNCORE_FR_EVENT_DESC(bw_out_port5, 0x35, "3.814697266e-6"), + INTEL_UNCORE_FR_EVENT_DESC(bw_out_port6, 0x36, "3.814697266e-6"), + INTEL_UNCORE_FR_EVENT_DESC(bw_out_port7, 0x37, "3.814697266e-6"), + + /* Free Running Clock Counter */ + INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x40"), + { /* end: all zeroes */ }, +}; + +static struct intel_uncore_type dmr_uncore_iio_free_running = { + .name = "iio_free_running", + .num_counters = 25, + .mmio_map_size = DMR_HIOP_MMIO_SIZE, 
+ .num_freerunning_types = DMR_IIO_FREERUNNING_TYPE_MAX, + .freerunning = dmr_iio_freerunning, + .ops = &dmr_uncore_freerunning_ops, + .event_descs = dmr_uncore_iio_freerunning_events, + .format_group = &skx_uncore_iio_freerunning_format_group, +}; + +#define UNCORE_DMR_MMIO_EXTRA_UNCORES 1 +static struct intel_uncore_type *dmr_mmio_uncores[UNCORE_DMR_MMIO_EXTRA_UNCORES] = { + &dmr_uncore_iio_free_running, +}; + int dmr_uncore_pci_init(void) { uncore_pci_uncores = uncore_get_uncores(UNCORE_ACCESS_PCI, 0, NULL, @@ -6937,11 +7040,16 @@ int dmr_uncore_pci_init(void) dmr_uncores); return 0; } + void dmr_uncore_mmio_init(void) { - uncore_mmio_uncores = uncore_get_uncores(UNCORE_ACCESS_MMIO, 0, NULL, - UNCORE_DMR_NUM_UNCORE_TYPES, - dmr_uncores); -} + uncore_mmio_uncores = uncore_get_uncores(UNCORE_ACCESS_MMIO, + UNCORE_DMR_MMIO_EXTRA_UNCORES, + dmr_mmio_uncores, + UNCORE_DMR_NUM_UNCORE_TYPES, + dmr_uncores); + dmr_uncore_iio_free_running.num_boxes = + uncore_type_max_boxes(uncore_mmio_uncores, UNCORE_DMR_ITC); +} /* end of DMR uncore support */ From aacb0718fddfe7060576c82e47bbda559c2f2d0d Mon Sep 17 00:00:00 2001 From: Zide Chen Date: Wed, 31 Dec 2025 14:42:26 -0800 Subject: [PATCH 35/58] perf/x86/intel/uncore: Support uncore constraint ranges Add UNCORE_EVENT_CONSTRAINT_RANGE macro for uncore constraints, similar to INTEL_EVENT_CONSTRAINT_RANGE, to reduce duplication when defining consecutive uncore event constraints. No functional change intended. Suggested-by: Dapeng Mi Signed-off-by: Zide Chen Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dapeng Mi Link: https://patch.msgid.link/20251231224233.113839-10-zide.chen@intel.com --- arch/x86/events/intel/uncore.c | 2 +- arch/x86/events/intel/uncore.h | 2 + arch/x86/events/intel/uncore_snbep.c | 183 ++++++--------------------- 3 files changed, 44 insertions(+), 143 deletions(-) diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 25a678b4ca43..19ff8db4ddac 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -436,7 +436,7 @@ uncore_get_event_constraint(struct intel_uncore_box *box, struct perf_event *eve if (type->constraints) { for_each_event_constraint(c, type->constraints) { - if ((event->hw.config & c->cmask) == c->code) + if (constraint_match(c, event->hw.config)) return c; } } diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index 55e3aebf4b5e..564cb26c4468 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -33,6 +33,8 @@ #define UNCORE_EXTRA_PCI_DEV_MAX 4 #define UNCORE_EVENT_CONSTRAINT(c, n) EVENT_CONSTRAINT(c, n, 0xff) +#define UNCORE_EVENT_CONSTRAINT_RANGE(c, e, n) \ + EVENT_CONSTRAINT_RANGE(c, e, n, 0xff) #define UNCORE_IGNORE_END -1 diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index cc8145e7030e..eaeb4e97cd9c 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -836,76 +836,37 @@ static struct intel_uncore_ops snbep_uncore_pci_ops = { static struct event_constraint snbep_uncore_cbox_constraints[] = { UNCORE_EVENT_CONSTRAINT(0x01, 0x1), UNCORE_EVENT_CONSTRAINT(0x02, 0x3), - UNCORE_EVENT_CONSTRAINT(0x04, 0x3), - UNCORE_EVENT_CONSTRAINT(0x05, 0x3), + UNCORE_EVENT_CONSTRAINT_RANGE(0x04, 0x5, 0x3), UNCORE_EVENT_CONSTRAINT(0x07, 0x3), UNCORE_EVENT_CONSTRAINT(0x09, 0x3), UNCORE_EVENT_CONSTRAINT(0x11, 0x1), - UNCORE_EVENT_CONSTRAINT(0x12, 0x3), - UNCORE_EVENT_CONSTRAINT(0x13, 0x3), - UNCORE_EVENT_CONSTRAINT(0x1b, 0xc), - 
UNCORE_EVENT_CONSTRAINT(0x1c, 0xc), - UNCORE_EVENT_CONSTRAINT(0x1d, 0xc), - UNCORE_EVENT_CONSTRAINT(0x1e, 0xc), + UNCORE_EVENT_CONSTRAINT_RANGE(0x12, 0x13, 0x3), + UNCORE_EVENT_CONSTRAINT_RANGE(0x1b, 0x1e, 0xc), UNCORE_EVENT_CONSTRAINT(0x1f, 0xe), UNCORE_EVENT_CONSTRAINT(0x21, 0x3), UNCORE_EVENT_CONSTRAINT(0x23, 0x3), - UNCORE_EVENT_CONSTRAINT(0x31, 0x3), - UNCORE_EVENT_CONSTRAINT(0x32, 0x3), - UNCORE_EVENT_CONSTRAINT(0x33, 0x3), - UNCORE_EVENT_CONSTRAINT(0x34, 0x3), - UNCORE_EVENT_CONSTRAINT(0x35, 0x3), + UNCORE_EVENT_CONSTRAINT_RANGE(0x31, 0x35, 0x3), UNCORE_EVENT_CONSTRAINT(0x36, 0x1), - UNCORE_EVENT_CONSTRAINT(0x37, 0x3), - UNCORE_EVENT_CONSTRAINT(0x38, 0x3), - UNCORE_EVENT_CONSTRAINT(0x39, 0x3), + UNCORE_EVENT_CONSTRAINT_RANGE(0x37, 0x39, 0x3), UNCORE_EVENT_CONSTRAINT(0x3b, 0x1), EVENT_CONSTRAINT_END }; static struct event_constraint snbep_uncore_r2pcie_constraints[] = { - UNCORE_EVENT_CONSTRAINT(0x10, 0x3), - UNCORE_EVENT_CONSTRAINT(0x11, 0x3), + UNCORE_EVENT_CONSTRAINT_RANGE(0x10, 0x11, 0x3), UNCORE_EVENT_CONSTRAINT(0x12, 0x1), UNCORE_EVENT_CONSTRAINT(0x23, 0x3), - UNCORE_EVENT_CONSTRAINT(0x24, 0x3), - UNCORE_EVENT_CONSTRAINT(0x25, 0x3), - UNCORE_EVENT_CONSTRAINT(0x26, 0x3), - UNCORE_EVENT_CONSTRAINT(0x32, 0x3), - UNCORE_EVENT_CONSTRAINT(0x33, 0x3), - UNCORE_EVENT_CONSTRAINT(0x34, 0x3), + UNCORE_EVENT_CONSTRAINT_RANGE(0x24, 0x26, 0x3), + UNCORE_EVENT_CONSTRAINT_RANGE(0x32, 0x34, 0x3), EVENT_CONSTRAINT_END }; static struct event_constraint snbep_uncore_r3qpi_constraints[] = { - UNCORE_EVENT_CONSTRAINT(0x10, 0x3), - UNCORE_EVENT_CONSTRAINT(0x11, 0x3), - UNCORE_EVENT_CONSTRAINT(0x12, 0x3), + UNCORE_EVENT_CONSTRAINT_RANGE(0x10, 0x12, 0x3), UNCORE_EVENT_CONSTRAINT(0x13, 0x1), - UNCORE_EVENT_CONSTRAINT(0x20, 0x3), - UNCORE_EVENT_CONSTRAINT(0x21, 0x3), - UNCORE_EVENT_CONSTRAINT(0x22, 0x3), - UNCORE_EVENT_CONSTRAINT(0x23, 0x3), - UNCORE_EVENT_CONSTRAINT(0x24, 0x3), - UNCORE_EVENT_CONSTRAINT(0x25, 0x3), - UNCORE_EVENT_CONSTRAINT(0x26, 0x3), - UNCORE_EVENT_CONSTRAINT(0x28, 0x3), - UNCORE_EVENT_CONSTRAINT(0x29, 0x3), - UNCORE_EVENT_CONSTRAINT(0x2a, 0x3), - UNCORE_EVENT_CONSTRAINT(0x2b, 0x3), - UNCORE_EVENT_CONSTRAINT(0x2c, 0x3), - UNCORE_EVENT_CONSTRAINT(0x2d, 0x3), - UNCORE_EVENT_CONSTRAINT(0x2e, 0x3), - UNCORE_EVENT_CONSTRAINT(0x2f, 0x3), - UNCORE_EVENT_CONSTRAINT(0x30, 0x3), - UNCORE_EVENT_CONSTRAINT(0x31, 0x3), - UNCORE_EVENT_CONSTRAINT(0x32, 0x3), - UNCORE_EVENT_CONSTRAINT(0x33, 0x3), - UNCORE_EVENT_CONSTRAINT(0x34, 0x3), - UNCORE_EVENT_CONSTRAINT(0x36, 0x3), - UNCORE_EVENT_CONSTRAINT(0x37, 0x3), - UNCORE_EVENT_CONSTRAINT(0x38, 0x3), - UNCORE_EVENT_CONSTRAINT(0x39, 0x3), + UNCORE_EVENT_CONSTRAINT_RANGE(0x20, 0x26, 0x3), + UNCORE_EVENT_CONSTRAINT_RANGE(0x28, 0x34, 0x3), + UNCORE_EVENT_CONSTRAINT_RANGE(0x36, 0x39, 0x3), EVENT_CONSTRAINT_END }; @@ -3034,24 +2995,15 @@ static struct intel_uncore_type hswep_uncore_qpi = { }; static struct event_constraint hswep_uncore_r2pcie_constraints[] = { - UNCORE_EVENT_CONSTRAINT(0x10, 0x3), - UNCORE_EVENT_CONSTRAINT(0x11, 0x3), + UNCORE_EVENT_CONSTRAINT_RANGE(0x10, 0x11, 0x3), UNCORE_EVENT_CONSTRAINT(0x13, 0x1), - UNCORE_EVENT_CONSTRAINT(0x23, 0x1), - UNCORE_EVENT_CONSTRAINT(0x24, 0x1), - UNCORE_EVENT_CONSTRAINT(0x25, 0x1), + UNCORE_EVENT_CONSTRAINT_RANGE(0x23, 0x25, 0x1), UNCORE_EVENT_CONSTRAINT(0x26, 0x3), UNCORE_EVENT_CONSTRAINT(0x27, 0x1), - UNCORE_EVENT_CONSTRAINT(0x28, 0x3), - UNCORE_EVENT_CONSTRAINT(0x29, 0x3), + UNCORE_EVENT_CONSTRAINT_RANGE(0x28, 0x29, 0x3), UNCORE_EVENT_CONSTRAINT(0x2a, 0x1), - UNCORE_EVENT_CONSTRAINT(0x2b, 0x3), - 
UNCORE_EVENT_CONSTRAINT(0x2c, 0x3), - UNCORE_EVENT_CONSTRAINT(0x2d, 0x3), - UNCORE_EVENT_CONSTRAINT(0x32, 0x3), - UNCORE_EVENT_CONSTRAINT(0x33, 0x3), - UNCORE_EVENT_CONSTRAINT(0x34, 0x3), - UNCORE_EVENT_CONSTRAINT(0x35, 0x3), + UNCORE_EVENT_CONSTRAINT_RANGE(0x2b, 0x2d, 0x3), + UNCORE_EVENT_CONSTRAINT_RANGE(0x32, 0x35, 0x3), EVENT_CONSTRAINT_END }; @@ -3066,38 +3018,17 @@ static struct intel_uncore_type hswep_uncore_r2pcie = { static struct event_constraint hswep_uncore_r3qpi_constraints[] = { UNCORE_EVENT_CONSTRAINT(0x01, 0x3), - UNCORE_EVENT_CONSTRAINT(0x07, 0x7), - UNCORE_EVENT_CONSTRAINT(0x08, 0x7), - UNCORE_EVENT_CONSTRAINT(0x09, 0x7), - UNCORE_EVENT_CONSTRAINT(0x0a, 0x7), + UNCORE_EVENT_CONSTRAINT_RANGE(0x7, 0x0a, 0x7), UNCORE_EVENT_CONSTRAINT(0x0e, 0x7), - UNCORE_EVENT_CONSTRAINT(0x10, 0x3), - UNCORE_EVENT_CONSTRAINT(0x11, 0x3), - UNCORE_EVENT_CONSTRAINT(0x12, 0x3), + UNCORE_EVENT_CONSTRAINT_RANGE(0x10, 0x12, 0x3), UNCORE_EVENT_CONSTRAINT(0x13, 0x1), - UNCORE_EVENT_CONSTRAINT(0x14, 0x3), - UNCORE_EVENT_CONSTRAINT(0x15, 0x3), - UNCORE_EVENT_CONSTRAINT(0x1f, 0x3), - UNCORE_EVENT_CONSTRAINT(0x20, 0x3), - UNCORE_EVENT_CONSTRAINT(0x21, 0x3), - UNCORE_EVENT_CONSTRAINT(0x22, 0x3), - UNCORE_EVENT_CONSTRAINT(0x23, 0x3), - UNCORE_EVENT_CONSTRAINT(0x25, 0x3), - UNCORE_EVENT_CONSTRAINT(0x26, 0x3), - UNCORE_EVENT_CONSTRAINT(0x28, 0x3), - UNCORE_EVENT_CONSTRAINT(0x29, 0x3), - UNCORE_EVENT_CONSTRAINT(0x2c, 0x3), - UNCORE_EVENT_CONSTRAINT(0x2d, 0x3), - UNCORE_EVENT_CONSTRAINT(0x2e, 0x3), - UNCORE_EVENT_CONSTRAINT(0x2f, 0x3), - UNCORE_EVENT_CONSTRAINT(0x31, 0x3), - UNCORE_EVENT_CONSTRAINT(0x32, 0x3), - UNCORE_EVENT_CONSTRAINT(0x33, 0x3), - UNCORE_EVENT_CONSTRAINT(0x34, 0x3), - UNCORE_EVENT_CONSTRAINT(0x36, 0x3), - UNCORE_EVENT_CONSTRAINT(0x37, 0x3), - UNCORE_EVENT_CONSTRAINT(0x38, 0x3), - UNCORE_EVENT_CONSTRAINT(0x39, 0x3), + UNCORE_EVENT_CONSTRAINT_RANGE(0x14, 0x15, 0x3), + UNCORE_EVENT_CONSTRAINT_RANGE(0x1f, 0x23, 0x3), + UNCORE_EVENT_CONSTRAINT_RANGE(0x25, 0x26, 0x3), + UNCORE_EVENT_CONSTRAINT_RANGE(0x28, 0x29, 0x3), + UNCORE_EVENT_CONSTRAINT_RANGE(0x2c, 0x2f, 0x3), + UNCORE_EVENT_CONSTRAINT_RANGE(0x31, 0x34, 0x3), + UNCORE_EVENT_CONSTRAINT_RANGE(0x36, 0x39, 0x3), EVENT_CONSTRAINT_END }; @@ -3371,8 +3302,7 @@ static struct event_constraint bdx_uncore_r2pcie_constraints[] = { UNCORE_EVENT_CONSTRAINT(0x25, 0x1), UNCORE_EVENT_CONSTRAINT(0x26, 0x3), UNCORE_EVENT_CONSTRAINT(0x28, 0x3), - UNCORE_EVENT_CONSTRAINT(0x2c, 0x3), - UNCORE_EVENT_CONSTRAINT(0x2d, 0x3), + UNCORE_EVENT_CONSTRAINT_RANGE(0x2c, 0x2d, 0x3), EVENT_CONSTRAINT_END }; @@ -3387,35 +3317,18 @@ static struct intel_uncore_type bdx_uncore_r2pcie = { static struct event_constraint bdx_uncore_r3qpi_constraints[] = { UNCORE_EVENT_CONSTRAINT(0x01, 0x7), - UNCORE_EVENT_CONSTRAINT(0x07, 0x7), - UNCORE_EVENT_CONSTRAINT(0x08, 0x7), - UNCORE_EVENT_CONSTRAINT(0x09, 0x7), - UNCORE_EVENT_CONSTRAINT(0x0a, 0x7), + UNCORE_EVENT_CONSTRAINT_RANGE(0x07, 0x0a, 0x7), UNCORE_EVENT_CONSTRAINT(0x0e, 0x7), - UNCORE_EVENT_CONSTRAINT(0x10, 0x3), - UNCORE_EVENT_CONSTRAINT(0x11, 0x3), + UNCORE_EVENT_CONSTRAINT_RANGE(0x10, 0x11, 0x3), UNCORE_EVENT_CONSTRAINT(0x13, 0x1), - UNCORE_EVENT_CONSTRAINT(0x14, 0x3), - UNCORE_EVENT_CONSTRAINT(0x15, 0x3), - UNCORE_EVENT_CONSTRAINT(0x1f, 0x3), - UNCORE_EVENT_CONSTRAINT(0x20, 0x3), - UNCORE_EVENT_CONSTRAINT(0x21, 0x3), - UNCORE_EVENT_CONSTRAINT(0x22, 0x3), - UNCORE_EVENT_CONSTRAINT(0x23, 0x3), + UNCORE_EVENT_CONSTRAINT_RANGE(0x14, 0x15, 0x3), + UNCORE_EVENT_CONSTRAINT_RANGE(0x1f, 0x23, 0x3), UNCORE_EVENT_CONSTRAINT(0x25, 0x3), 
UNCORE_EVENT_CONSTRAINT(0x26, 0x3), - UNCORE_EVENT_CONSTRAINT(0x28, 0x3), - UNCORE_EVENT_CONSTRAINT(0x29, 0x3), - UNCORE_EVENT_CONSTRAINT(0x2c, 0x3), - UNCORE_EVENT_CONSTRAINT(0x2d, 0x3), - UNCORE_EVENT_CONSTRAINT(0x2e, 0x3), - UNCORE_EVENT_CONSTRAINT(0x2f, 0x3), - UNCORE_EVENT_CONSTRAINT(0x33, 0x3), - UNCORE_EVENT_CONSTRAINT(0x34, 0x3), - UNCORE_EVENT_CONSTRAINT(0x36, 0x3), - UNCORE_EVENT_CONSTRAINT(0x37, 0x3), - UNCORE_EVENT_CONSTRAINT(0x38, 0x3), - UNCORE_EVENT_CONSTRAINT(0x39, 0x3), + UNCORE_EVENT_CONSTRAINT_RANGE(0x28, 0x29, 0x3), + UNCORE_EVENT_CONSTRAINT_RANGE(0x2c, 0x2f, 0x3), + UNCORE_EVENT_CONSTRAINT_RANGE(0x33, 0x34, 0x3), + UNCORE_EVENT_CONSTRAINT_RANGE(0x36, 0x39, 0x3), EVENT_CONSTRAINT_END }; @@ -3722,8 +3635,7 @@ static struct event_constraint skx_uncore_iio_constraints[] = { UNCORE_EVENT_CONSTRAINT(0x95, 0xc), UNCORE_EVENT_CONSTRAINT(0xc0, 0xc), UNCORE_EVENT_CONSTRAINT(0xc5, 0xc), - UNCORE_EVENT_CONSTRAINT(0xd4, 0xc), - UNCORE_EVENT_CONSTRAINT(0xd5, 0xc), + UNCORE_EVENT_CONSTRAINT_RANGE(0xd4, 0xd5, 0xc), EVENT_CONSTRAINT_END }; @@ -4479,14 +4391,9 @@ static struct intel_uncore_type skx_uncore_m2pcie = { }; static struct event_constraint skx_uncore_m3upi_constraints[] = { - UNCORE_EVENT_CONSTRAINT(0x1d, 0x1), - UNCORE_EVENT_CONSTRAINT(0x1e, 0x1), + UNCORE_EVENT_CONSTRAINT_RANGE(0x1d, 0x1e, 0x1), UNCORE_EVENT_CONSTRAINT(0x40, 0x7), - UNCORE_EVENT_CONSTRAINT(0x4e, 0x7), - UNCORE_EVENT_CONSTRAINT(0x4f, 0x7), - UNCORE_EVENT_CONSTRAINT(0x50, 0x7), - UNCORE_EVENT_CONSTRAINT(0x51, 0x7), - UNCORE_EVENT_CONSTRAINT(0x52, 0x7), + UNCORE_EVENT_CONSTRAINT_RANGE(0x4e, 0x52, 0x7), EVENT_CONSTRAINT_END }; @@ -5652,14 +5559,9 @@ static struct intel_uncore_type icx_uncore_upi = { }; static struct event_constraint icx_uncore_m3upi_constraints[] = { - UNCORE_EVENT_CONSTRAINT(0x1c, 0x1), - UNCORE_EVENT_CONSTRAINT(0x1d, 0x1), - UNCORE_EVENT_CONSTRAINT(0x1e, 0x1), - UNCORE_EVENT_CONSTRAINT(0x1f, 0x1), + UNCORE_EVENT_CONSTRAINT_RANGE(0x1c, 0x1f, 0x1), UNCORE_EVENT_CONSTRAINT(0x40, 0x7), - UNCORE_EVENT_CONSTRAINT(0x4e, 0x7), - UNCORE_EVENT_CONSTRAINT(0x4f, 0x7), - UNCORE_EVENT_CONSTRAINT(0x50, 0x7), + UNCORE_EVENT_CONSTRAINT_RANGE(0x4e, 0x50, 0x7), EVENT_CONSTRAINT_END }; @@ -6142,10 +6044,7 @@ static struct intel_uncore_ops spr_uncore_mmio_offs8_ops = { static struct event_constraint spr_uncore_cxlcm_constraints[] = { UNCORE_EVENT_CONSTRAINT(0x02, 0x0f), UNCORE_EVENT_CONSTRAINT(0x05, 0x0f), - UNCORE_EVENT_CONSTRAINT(0x40, 0xf0), - UNCORE_EVENT_CONSTRAINT(0x41, 0xf0), - UNCORE_EVENT_CONSTRAINT(0x42, 0xf0), - UNCORE_EVENT_CONSTRAINT(0x43, 0xf0), + UNCORE_EVENT_CONSTRAINT_RANGE(0x40, 0x43, 0xf0), UNCORE_EVENT_CONSTRAINT(0x4b, 0xf0), UNCORE_EVENT_CONSTRAINT(0x52, 0xf0), EVENT_CONSTRAINT_END From 171b5292a82d04e6692f1b19573d15753f21e7fd Mon Sep 17 00:00:00 2001 From: Zide Chen Date: Wed, 31 Dec 2025 14:42:27 -0800 Subject: [PATCH 36/58] perf/x86/intel/uncore: Update DMR uncore constraints preliminarily Update event constraints base on the latest DMR uncore event list. 
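Since several of the new DMR constraints are expressed as ranges, a compact userspace model of range matching may help review. It assumes the usual unsigned-subtraction trick for [code, code + size] matching; the struct below is a simplified stand-in, with the 0x01-0x24 cxlcm range from this patch as the example:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct constraint {
	uint64_t code;		/* first event code in the range */
	uint64_t size;		/* range length minus one (0 for a single code) */
	uint64_t cmask;		/* config bits that carry the event code */
	uint64_t idxmsk;	/* allowed counters, unused by the match itself */
};

static bool range_match(const struct constraint *c, uint64_t config)
{
	/* Wraps around for codes below c->code, so those fail the compare. */
	return ((config & c->cmask) - c->code) <= c->size;
}

int main(void)
{
	/* The cxlcm 0x01-0x24 range from this patch, counters 0-3. */
	const struct constraint range = {
		.code = 0x01, .size = 0x24 - 0x01, .cmask = 0xff, .idxmsk = 0x0f,
	};
	const uint64_t samples[] = { 0x00, 0x01, 0x12, 0x24, 0x25 };

	for (size_t i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("event 0x%02llx: %s\n",
		       (unsigned long long)samples[i],
		       range_match(&range, samples[i]) ? "in range" : "no match");
	return 0;
}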
Signed-off-by: Zide Chen Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dapeng Mi Link: https://patch.msgid.link/20251231224233.113839-11-zide.chen@intel.com --- arch/x86/events/intel/uncore_snbep.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index eaeb4e97cd9c..7ca0429c4004 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -6660,10 +6660,19 @@ static const struct attribute_group dmr_cxlcm_uncore_format_group = { .attrs = dmr_cxlcm_uncore_formats_attr, }; +static struct event_constraint dmr_uncore_cxlcm_constraints[] = { + UNCORE_EVENT_CONSTRAINT_RANGE(0x1, 0x24, 0x0f), + UNCORE_EVENT_CONSTRAINT_RANGE(0x41, 0x41, 0xf0), + UNCORE_EVENT_CONSTRAINT_RANGE(0x50, 0x5e, 0xf0), + UNCORE_EVENT_CONSTRAINT_RANGE(0x60, 0x61, 0xf0), + EVENT_CONSTRAINT_END +}; + static struct intel_uncore_type dmr_uncore_cxlcm = { .name = "cxlcm", .event_mask = GENERIC_PMON_RAW_EVENT_MASK, .event_mask_ext = DMR_CXLCM_EVENT_MASK_EXT, + .constraints = dmr_uncore_cxlcm_constraints, .format_group = &dmr_cxlcm_uncore_format_group, .attr_update = uncore_alias_groups, }; @@ -6675,9 +6684,20 @@ static struct intel_uncore_type dmr_uncore_hamvf = { .attr_update = uncore_alias_groups, }; +static struct event_constraint dmr_uncore_cbo_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x11, 0x1), + UNCORE_EVENT_CONSTRAINT_RANGE(0x19, 0x1a, 0x1), + UNCORE_EVENT_CONSTRAINT(0x1f, 0x1), + UNCORE_EVENT_CONSTRAINT(0x21, 0x1), + UNCORE_EVENT_CONSTRAINT(0x25, 0x1), + UNCORE_EVENT_CONSTRAINT(0x36, 0x1), + EVENT_CONSTRAINT_END +}; + static struct intel_uncore_type dmr_uncore_cbo = { .name = "cbo", .event_mask_ext = DMR_HAMVF_EVENT_MASK_EXT, + .constraints = dmr_uncore_cbo_constraints, .format_group = &dmr_sca_uncore_format_group, .attr_update = uncore_alias_groups, }; @@ -6711,9 +6731,16 @@ static struct intel_uncore_type dmr_uncore_dda = { .attr_update = uncore_alias_groups, }; +static struct event_constraint dmr_uncore_sbo_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x1f, 0x01), + UNCORE_EVENT_CONSTRAINT(0x25, 0x01), + EVENT_CONSTRAINT_END +}; + static struct intel_uncore_type dmr_uncore_sbo = { .name = "sbo", .event_mask_ext = DMR_HAMVF_EVENT_MASK_EXT, + .constraints = dmr_uncore_sbo_constraints, .format_group = &dmr_sca_uncore_format_group, .attr_update = uncore_alias_groups, }; From 2246c24426fbc1069cb2a47e0624ccffe5f2627b Mon Sep 17 00:00:00 2001 From: Zide Chen Date: Wed, 31 Dec 2025 14:42:28 -0800 Subject: [PATCH 37/58] perf pmu: Relax uncore wildcard matching to allow numeric suffix Diamond Rapids introduces two types of PCIe related uncore PMUs: "uncore_pcie4_*" and "uncore_pcie6_*". To ensure that generic PCIe events (e.g., UNC_PCIE_CLOCKTICKS) can match and collect events from both PMU types, slightly relax the wildcard matching logic in perf_pmu__match_wildcard(). This change allows a wildcard such as "pcie" to match PMU names that include a numeric suffix, such as "pcie4_*" and "pcie6_*". 
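A simplified, standalone model of the relaxed rule shows the intended behaviour; it deliberately omits perf_pmu__match_wildcard()'s hex-suffix length handling and accepts only decimal digits, so treat it as a sketch rather than the real matcher:

#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

static bool wildcard_match(const char *pmu_name, const char *tok)
{
	size_t tok_len = strlen(tok);
	bool has_underscore = false;

	/* The literal prefix must match first. */
	if (strncmp(pmu_name, tok, tok_len))
		return false;

	/* Then allow a suffix of digits with at most one embedded '_'. */
	for (const char *p = pmu_name + tok_len; *p; p++) {
		if (!has_underscore && *p == '_') {
			has_underscore = true;
			continue;
		}
		if (!isdigit((unsigned char)*p))
			return false;
	}
	return true;
}

int main(void)
{
	const char *names[] = { "pcie4_0", "pcie6_12", "pcie_3", "pcieport" };

	for (size_t i = 0; i < sizeof(names) / sizeof(names[0]); i++)
		printf("\"pcie\" vs \"%s\": %s\n", names[i],
		       wildcard_match(names[i], "pcie") ? "match" : "no match");
	return 0;
}

So "pcie" now matches both "pcie4_0" and "pcie6_12", while an unrelated name such as "pcieport" is still rejected.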
Co-developed-by: Dapeng Mi Signed-off-by: Dapeng Mi Signed-off-by: Zide Chen Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dapeng Mi Link: https://patch.msgid.link/20251231224233.113839-12-zide.chen@intel.com --- tools/perf/util/pmu.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index 956ea273c2c7..01a21b6aa031 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -939,6 +939,7 @@ static bool perf_pmu__match_wildcard(const char *pmu_name, const char *tok) { const char *p, *suffix; bool has_hex = false; + bool has_underscore = false; size_t tok_len = strlen(tok); /* Check start of pmu_name for equality. */ @@ -949,13 +950,14 @@ static bool perf_pmu__match_wildcard(const char *pmu_name, const char *tok) if (*p == 0) return true; - if (*p == '_') { - ++p; - ++suffix; - } - - /* Ensure we end in a number */ + /* Ensure we end in a number or a mix of number and "_". */ while (1) { + if (!has_underscore && (*p == '_')) { + has_underscore = true; + ++p; + ++suffix; + } + if (!isxdigit(*p)) return false; if (!has_hex) From 46da08a2bb4d07874990579235ff87b41911a412 Mon Sep 17 00:00:00 2001 From: Zide Chen Date: Wed, 31 Dec 2025 14:42:29 -0800 Subject: [PATCH 38/58] perf/x86/intel/uncore: Add missing PMON units for Panther Lake Besides CBOX, Panther Lake includes several legacy uncore PMON units not enumerated via discovery tables, including cNCU, SANTA, and ia_core_bridge. The cNCU PMON is similar to Meteor Lake but has two boxes with two counters each. SANTA and IA Core Bridge PMON units follow the legacy model used on Lunar Lake, Meteor Lake, and others. Panther Lake implements the Global Control Register; the freeze_all bit must be cleared before programming counters. 
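For reference, the legacy (non-discovery) layout these units use can be sketched as base-plus-stride arithmetic: box N's counter i lives at the CTR/CTL base plus N times the per-box MSR offset plus i. The bases below are the SANTA values from this patch; the stride is an illustrative assumption, not taken from the patch:

#include <stdio.h>

#define PTL_UNC_SANTA_CTR0	0x2418
#define PTL_UNC_SANTA_CTRL0	0x2412
#define SANTA_MSR_OFFSET	0x10	/* illustrative stride, assumption only */

static unsigned int santa_ctr_msr(int box, int ctr)
{
	return PTL_UNC_SANTA_CTR0 + box * SANTA_MSR_OFFSET + ctr;
}

static unsigned int santa_ctl_msr(int box, int ctr)
{
	return PTL_UNC_SANTA_CTRL0 + box * SANTA_MSR_OFFSET + ctr;
}

int main(void)
{
	for (int box = 0; box < 2; box++)
		for (int ctr = 0; ctr < 2; ctr++)
			printf("santa box %d ctr %d: CTR 0x%x, CTL 0x%x\n",
			       box, ctr, santa_ctr_msr(box, ctr),
			       santa_ctl_msr(box, ctr));
	return 0;
}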
Signed-off-by: Zide Chen Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dapeng Mi Link: https://patch.msgid.link/20251231224233.113839-13-zide.chen@intel.com --- arch/x86/events/intel/uncore.c | 1 + arch/x86/events/intel/uncore_snb.c | 45 ++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+) diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 19ff8db4ddac..704591a41e6b 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -1814,6 +1814,7 @@ static const struct uncore_plat_init ptl_uncore_init __initconst = { .cpu_init = ptl_uncore_cpu_init, .mmio_init = ptl_uncore_mmio_init, .domain[0].discovery_base = UNCORE_DISCOVERY_MSR, + .domain[0].global_init = uncore_mmio_global_init, }; static const struct uncore_plat_init icx_uncore_init __initconst = { diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c index 807e582b8f17..c663b00b68fe 100644 --- a/arch/x86/events/intel/uncore_snb.c +++ b/arch/x86/events/intel/uncore_snb.c @@ -245,6 +245,17 @@ #define MTL_UNC_HBO_CTR 0x2048 #define MTL_UNC_HBO_CTRL 0x2042 +/* PTL Low Power Bridge register */ +#define PTL_UNC_IA_CORE_BRIDGE_PER_CTR0 0x2028 +#define PTL_UNC_IA_CORE_BRIDGE_PERFEVTSEL0 0x2022 + +/* PTL Santa register */ +#define PTL_UNC_SANTA_CTR0 0x2418 +#define PTL_UNC_SANTA_CTRL0 0x2412 + +/* PTL cNCU register */ +#define PTL_UNC_CNCU_MSR_OFFSET 0x140 + DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7"); DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15"); DEFINE_UNCORE_FORMAT_ATTR(chmask, chmask, "config:8-11"); @@ -1921,8 +1932,36 @@ void ptl_uncore_mmio_init(void) ptl_uncores); } +static struct intel_uncore_type ptl_uncore_ia_core_bridge = { + .name = "ia_core_bridge", + .num_counters = 2, + .num_boxes = 1, + .perf_ctr_bits = 48, + .perf_ctr = PTL_UNC_IA_CORE_BRIDGE_PER_CTR0, + .event_ctl = PTL_UNC_IA_CORE_BRIDGE_PERFEVTSEL0, + .event_mask = ADL_UNC_RAW_EVENT_MASK, + .ops = &icl_uncore_msr_ops, + .format_group = &adl_uncore_format_group, +}; + +static struct intel_uncore_type ptl_uncore_santa = { + .name = "santa", + .num_counters = 2, + .num_boxes = 2, + .perf_ctr_bits = 48, + .perf_ctr = PTL_UNC_SANTA_CTR0, + .event_ctl = PTL_UNC_SANTA_CTRL0, + .event_mask = ADL_UNC_RAW_EVENT_MASK, + .msr_offset = SNB_UNC_CBO_MSR_OFFSET, + .ops = &icl_uncore_msr_ops, + .format_group = &adl_uncore_format_group, +}; + static struct intel_uncore_type *ptl_msr_uncores[] = { &mtl_uncore_cbox, + &ptl_uncore_ia_core_bridge, + &ptl_uncore_santa, + &mtl_uncore_cncu, NULL }; @@ -1930,6 +1969,12 @@ void ptl_uncore_cpu_init(void) { mtl_uncore_cbox.num_boxes = 6; mtl_uncore_cbox.ops = &lnl_uncore_msr_ops; + + mtl_uncore_cncu.num_counters = 2; + mtl_uncore_cncu.num_boxes = 2; + mtl_uncore_cncu.msr_offset = PTL_UNC_CNCU_MSR_OFFSET; + mtl_uncore_cncu.single_fixed = 0; + uncore_msr_uncores = ptl_msr_uncores; } From e7d5f2ea0923c5e49ccf94cdaab74a08c865115e Mon Sep 17 00:00:00 2001 From: Zide Chen Date: Wed, 31 Dec 2025 14:42:30 -0800 Subject: [PATCH 39/58] perf/x86/intel/uncore: Add Nova Lake support Nova Lake uncore PMON largely follows Panther Lake and supports CBOX, iMC, cNCU, SANTA, sNCU, and HBO units. As with Panther Lake, CBOX, cNCU, and SANTA are not enumerated via discovery tables. Their programming model matches Panther Lake, with differences limited to MSR addresses and the number of boxes or counters per box. The remaining units are enumerated via discovery tables using a new base MSR (0x711) and otherwise reuse the Panther Lake implementation. 
Nova Lake also supports iMC free-running counters. Signed-off-by: Zide Chen Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dapeng Mi Link: https://patch.msgid.link/20251231224233.113839-14-zide.chen@intel.com --- arch/x86/events/intel/uncore.c | 9 ++++++ arch/x86/events/intel/uncore.h | 1 + arch/x86/events/intel/uncore_discovery.h | 2 ++ arch/x86/events/intel/uncore_snb.c | 40 ++++++++++++++++++++++++ 4 files changed, 52 insertions(+) diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 704591a41e6b..4684649109d9 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -1817,6 +1817,13 @@ static const struct uncore_plat_init ptl_uncore_init __initconst = { .domain[0].global_init = uncore_mmio_global_init, }; +static const struct uncore_plat_init nvl_uncore_init __initconst = { + .cpu_init = nvl_uncore_cpu_init, + .mmio_init = ptl_uncore_mmio_init, + .domain[0].discovery_base = PACKAGE_UNCORE_DISCOVERY_MSR, + .domain[0].global_init = uncore_mmio_global_init, +}; + static const struct uncore_plat_init icx_uncore_init __initconst = { .cpu_init = icx_uncore_cpu_init, .pci_init = icx_uncore_pci_init, @@ -1916,6 +1923,8 @@ static const struct x86_cpu_id intel_uncore_match[] __initconst = { X86_MATCH_VFM(INTEL_LUNARLAKE_M, &lnl_uncore_init), X86_MATCH_VFM(INTEL_PANTHERLAKE_L, &ptl_uncore_init), X86_MATCH_VFM(INTEL_WILDCATLAKE_L, &ptl_uncore_init), + X86_MATCH_VFM(INTEL_NOVALAKE, &nvl_uncore_init), + X86_MATCH_VFM(INTEL_NOVALAKE_L, &nvl_uncore_init), X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, &spr_uncore_init), X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, &spr_uncore_init), X86_MATCH_VFM(INTEL_GRANITERAPIDS_X, &gnr_uncore_init), diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index 564cb26c4468..c35918c01afa 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -636,6 +636,7 @@ void adl_uncore_cpu_init(void); void lnl_uncore_cpu_init(void); void mtl_uncore_cpu_init(void); void ptl_uncore_cpu_init(void); +void nvl_uncore_cpu_init(void); void tgl_uncore_mmio_init(void); void tgl_l_uncore_mmio_init(void); void adl_uncore_mmio_init(void); diff --git a/arch/x86/events/intel/uncore_discovery.h b/arch/x86/events/intel/uncore_discovery.h index 63b8f7634e42..e1330342b92e 100644 --- a/arch/x86/events/intel/uncore_discovery.h +++ b/arch/x86/events/intel/uncore_discovery.h @@ -4,6 +4,8 @@ #define UNCORE_DISCOVERY_MSR 0x201e /* Base address of uncore perfmon discovery table for CBB domain */ #define CBB_UNCORE_DISCOVERY_MSR 0x710 +/* Base address of uncore perfmon discovery table for the package */ +#define PACKAGE_UNCORE_DISCOVERY_MSR 0x711 /* Generic device ID of a discovery table device */ #define UNCORE_DISCOVERY_TABLE_DEVICE 0x09a7 diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c index c663b00b68fe..e8e44741200e 100644 --- a/arch/x86/events/intel/uncore_snb.c +++ b/arch/x86/events/intel/uncore_snb.c @@ -256,6 +256,19 @@ /* PTL cNCU register */ #define PTL_UNC_CNCU_MSR_OFFSET 0x140 +/* NVL cNCU register */ +#define NVL_UNC_CNCU_BOX_CTL 0x202e +#define NVL_UNC_CNCU_FIXED_CTR 0x2028 +#define NVL_UNC_CNCU_FIXED_CTRL 0x2022 + +/* NVL SANTA register */ +#define NVL_UNC_SANTA_CTR0 0x2048 +#define NVL_UNC_SANTA_CTRL0 0x2042 + +/* NVL CBOX register */ +#define NVL_UNC_CBOX_PER_CTR0 0x2108 +#define NVL_UNC_CBOX_PERFEVTSEL0 0x2102 + DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7"); DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15"); DEFINE_UNCORE_FORMAT_ATTR(chmask, 
chmask, "config:8-11"); @@ -1979,3 +1992,30 @@ void ptl_uncore_cpu_init(void) } /* end of Panther Lake uncore support */ + +/* Nova Lake uncore support */ + +static struct intel_uncore_type *nvl_msr_uncores[] = { + &mtl_uncore_cbox, + &ptl_uncore_santa, + &mtl_uncore_cncu, + NULL +}; + +void nvl_uncore_cpu_init(void) +{ + mtl_uncore_cbox.num_boxes = 12; + mtl_uncore_cbox.perf_ctr = NVL_UNC_CBOX_PER_CTR0, + mtl_uncore_cbox.event_ctl = NVL_UNC_CBOX_PERFEVTSEL0, + + ptl_uncore_santa.perf_ctr = NVL_UNC_SANTA_CTR0, + ptl_uncore_santa.event_ctl = NVL_UNC_SANTA_CTRL0, + + mtl_uncore_cncu.box_ctl = NVL_UNC_CNCU_BOX_CTL; + mtl_uncore_cncu.fixed_ctr = NVL_UNC_CNCU_FIXED_CTR; + mtl_uncore_cncu.fixed_ctl = NVL_UNC_CNCU_FIXED_CTRL; + + uncore_msr_uncores = nvl_msr_uncores; +} + +/* end of Nova Lake uncore support */ From a18dfb5dd33247679a0cc4ff208ad7340d1e08a5 Mon Sep 17 00:00:00 2001 From: Keke Ming Date: Sat, 3 Jan 2026 16:42:39 +0800 Subject: [PATCH 40/58] riscv/uprobes: use kmap_local_page() in arch_uprobe_copy_ixol() Replace deprecated kmap_atomic() with kmap_local_page(). Signed-off-by: Keke Ming Signed-off-by: Peter Zijlstra (Intel) Acked-by: Oleg Nesterov Link: https://patch.msgid.link/20260103084243.195125-2-ming.jvle@gmail.com --- arch/riscv/kernel/probes/uprobes.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/riscv/kernel/probes/uprobes.c b/arch/riscv/kernel/probes/uprobes.c index cc15f7ca6cc1..f0d0691a8688 100644 --- a/arch/riscv/kernel/probes/uprobes.c +++ b/arch/riscv/kernel/probes/uprobes.c @@ -165,7 +165,7 @@ void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, void *src, unsigned long len) { /* Initialize the slot */ - void *kaddr = kmap_atomic(page); + void *kaddr = kmap_local_page(page); void *dst = kaddr + (vaddr & ~PAGE_MASK); unsigned long start = (unsigned long)dst; @@ -178,5 +178,5 @@ void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, } flush_icache_range(start, start + len); - kunmap_atomic(kaddr); + kunmap_local(kaddr); } From 094cc7bb5fc3b484614417b4578233a38e3df942 Mon Sep 17 00:00:00 2001 From: Keke Ming Date: Sat, 3 Jan 2026 16:42:40 +0800 Subject: [PATCH 41/58] arm64/uprobes: use kmap_local_page() in arch_uprobe_copy_ixol() Replace deprecated kmap_atomic() with kmap_local_page(). 
Signed-off-by: Keke Ming Signed-off-by: Peter Zijlstra (Intel) Acked-by: Oleg Nesterov Link: https://patch.msgid.link/20260103084243.195125-3-ming.jvle@gmail.com --- arch/arm64/kernel/probes/uprobes.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kernel/probes/uprobes.c b/arch/arm64/kernel/probes/uprobes.c index 941668800aea..4c55bf832ec3 100644 --- a/arch/arm64/kernel/probes/uprobes.c +++ b/arch/arm64/kernel/probes/uprobes.c @@ -15,7 +15,7 @@ void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, void *src, unsigned long len) { - void *xol_page_kaddr = kmap_atomic(page); + void *xol_page_kaddr = kmap_local_page(page); void *dst = xol_page_kaddr + (vaddr & ~PAGE_MASK); /* @@ -32,7 +32,7 @@ void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, sync_icache_aliases((unsigned long)dst, (unsigned long)dst + len); done: - kunmap_atomic(xol_page_kaddr); + kunmap_local(xol_page_kaddr); } unsigned long uprobe_get_swbp_addr(struct pt_regs *regs) From e6eb9acc024cdad7d1cbb7693ac39afdf9c5193f Mon Sep 17 00:00:00 2001 From: Keke Ming Date: Sat, 3 Jan 2026 16:42:41 +0800 Subject: [PATCH 42/58] mips/uprobes: use kmap_local_page() in arch_uprobe_copy_ixol() Replace deprecated kmap_atomic() with kmap_local_page(). Signed-off-by: Keke Ming Signed-off-by: Peter Zijlstra (Intel) Acked-by: Oleg Nesterov Link: https://patch.msgid.link/20260103084243.195125-4-ming.jvle@gmail.com --- arch/mips/kernel/uprobes.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/mips/kernel/uprobes.c b/arch/mips/kernel/uprobes.c index 401b148f8917..05cfc320992b 100644 --- a/arch/mips/kernel/uprobes.c +++ b/arch/mips/kernel/uprobes.c @@ -214,11 +214,11 @@ void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, unsigned long kaddr, kstart; /* Initialize the slot */ - kaddr = (unsigned long)kmap_atomic(page); + kaddr = (unsigned long)kmap_local_page(page); kstart = kaddr + (vaddr & ~PAGE_MASK); memcpy((void *)kstart, src, len); flush_icache_range(kstart, kstart + len); - kunmap_atomic((void *)kaddr); + kunmap_local((void *)kaddr); } /** From 1752a1ad43a1d4fd450a8ef5f8d240f9e228e0a3 Mon Sep 17 00:00:00 2001 From: Keke Ming Date: Sat, 3 Jan 2026 16:42:42 +0800 Subject: [PATCH 43/58] arm/uprobes: use kmap_local_page() in arch_uprobe_copy_ixol() Replace deprecated kmap_atomic() with kmap_local_page(). 
Signed-off-by: Keke Ming Signed-off-by: Peter Zijlstra (Intel) Acked-by: Oleg Nesterov Link: https://patch.msgid.link/20260103084243.195125-5-ming.jvle@gmail.com --- arch/arm/probes/uprobes/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm/probes/uprobes/core.c b/arch/arm/probes/uprobes/core.c index 3d96fb41d624..0e1c6b9e7e54 100644 --- a/arch/arm/probes/uprobes/core.c +++ b/arch/arm/probes/uprobes/core.c @@ -113,7 +113,7 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, void *src, unsigned long len) { - void *xol_page_kaddr = kmap_atomic(page); + void *xol_page_kaddr = kmap_local_page(page); void *dst = xol_page_kaddr + (vaddr & ~PAGE_MASK); preempt_disable(); @@ -126,7 +126,7 @@ void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, preempt_enable(); - kunmap_atomic(xol_page_kaddr); + kunmap_local(xol_page_kaddr); } From a491c02c2770c9c2d02b96fad7e3a176d77bb737 Mon Sep 17 00:00:00 2001 From: Keke Ming Date: Sat, 3 Jan 2026 16:42:43 +0800 Subject: [PATCH 44/58] uprobes: use kmap_local_page() for temporary page mappings Replace deprecated kmap_atomic() with kmap_local_page(). Signed-off-by: Keke Ming Signed-off-by: Peter Zijlstra (Intel) Acked-by: Oleg Nesterov Link: https://patch.msgid.link/20260103084243.195125-6-ming.jvle@gmail.com --- kernel/events/uprobes.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index d546d32390a8..a7d7d83ca1d7 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -179,16 +179,16 @@ bool __weak is_trap_insn(uprobe_opcode_t *insn) void uprobe_copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len) { - void *kaddr = kmap_atomic(page); + void *kaddr = kmap_local_page(page); memcpy(dst, kaddr + (vaddr & ~PAGE_MASK), len); - kunmap_atomic(kaddr); + kunmap_local(kaddr); } static void copy_to_page(struct page *page, unsigned long vaddr, const void *src, int len) { - void *kaddr = kmap_atomic(page); + void *kaddr = kmap_local_page(page); memcpy(kaddr + (vaddr & ~PAGE_MASK), src, len); - kunmap_atomic(kaddr); + kunmap_local(kaddr); } static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *insn, @@ -323,7 +323,7 @@ __update_ref_ctr(struct mm_struct *mm, unsigned long vaddr, short d) return ret == 0 ? -EBUSY : ret; } - kaddr = kmap_atomic(page); + kaddr = kmap_local_page(page); ptr = kaddr + (vaddr & ~PAGE_MASK); if (unlikely(*ptr + d < 0)) { @@ -336,7 +336,7 @@ __update_ref_ctr(struct mm_struct *mm, unsigned long vaddr, short d) *ptr += d; ret = 0; out: - kunmap_atomic(kaddr); + kunmap_local(kaddr); put_page(page); return ret; } From eebe6446ccb75ecb36cb145ab1cbc3db06cbc8d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=2E=20Sch=C3=B6nherr?= Date: Tue, 13 Jan 2026 21:34:46 +0100 Subject: [PATCH 45/58] perf/core: Speed up kexec shutdown by avoiding unnecessary cross CPU calls MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are typically a lot of PMUs registered, but in many cases only few of them have an event registered (like the "cpu" PMU in the presence of the watchdog). As the mutex is already held, it's safe to just check for existing events before doing the cross CPU call. 
This change saves tens of milliseconds from kexec time (perceived as steal time during a hypervisor host update), with <2ms remaining for this step in the shutdown. There might be additional potential for parallelization or we could just disable performance monitoring during the actual shutdown and be less graceful about it. Signed-off-by: Jan H. Schönherr Signed-off-by: David Woodhouse Signed-off-by: Peter Zijlstra (Intel) --- kernel/events/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 376fb07d869b..101da5c1ab50 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -15066,7 +15066,8 @@ static void perf_event_exit_cpu_context(int cpu) ctx = &cpuctx->ctx; mutex_lock(&ctx->mutex); - smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1); + if (ctx->nr_events) + smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1); cpuctx->online = 0; mutex_unlock(&ctx->mutex); mutex_unlock(&pmus_lock); From 4960626f956d63dce57f099016c2ecbe637a8229 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 12 Jan 2026 08:51:57 -0800 Subject: [PATCH 46/58] perf/core: Fix slow perf_event_task_exit() with LBR callstacks I got a report that a task is stuck in perf_event_exit_task() waiting for global_ctx_data_rwsem. On large systems with lots threads, it'd have performance issues when it grabs the lock to iterate all threads in the system to allocate the context data. And it'd block task exit path which is problematic especially under memory pressure. perf_event_open perf_event_alloc attach_perf_ctx_data attach_global_ctx_data percpu_down_write (global_ctx_data_rwsem) for_each_process_thread alloc_task_ctx_data do_exit perf_event_exit_task percpu_down_read (global_ctx_data_rwsem) It should not hold the global_ctx_data_rwsem on the exit path. Let's skip allocation for exiting tasks and free the data carefully. Reported-by: Rosalie Fang Suggested-by: Peter Zijlstra Signed-off-by: Namhyung Kim Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260112165157.1919624-1-namhyung@kernel.org --- kernel/events/core.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 101da5c1ab50..da013b9a595f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5421,9 +5421,20 @@ attach_task_ctx_data(struct task_struct *task, struct kmem_cache *ctx_cache, return -ENOMEM; for (;;) { - if (try_cmpxchg((struct perf_ctx_data **)&task->perf_ctx_data, &old, cd)) { + if (try_cmpxchg(&task->perf_ctx_data, &old, cd)) { if (old) perf_free_ctx_data_rcu(old); + /* + * Above try_cmpxchg() pairs with try_cmpxchg() from + * detach_task_ctx_data() such that + * if we race with perf_event_exit_task(), we must + * observe PF_EXITING. + */ + if (task->flags & PF_EXITING) { + /* detach_task_ctx_data() may free it already */ + if (try_cmpxchg(&task->perf_ctx_data, &cd, NULL)) + perf_free_ctx_data_rcu(cd); + } return 0; } @@ -5469,6 +5480,8 @@ attach_global_ctx_data(struct kmem_cache *ctx_cache) /* Allocate everything */ scoped_guard (rcu) { for_each_process_thread(g, p) { + if (p->flags & PF_EXITING) + continue; cd = rcu_dereference(p->perf_ctx_data); if (cd && !cd->global) { cd->global = 1; @@ -14562,8 +14575,11 @@ void perf_event_exit_task(struct task_struct *task) /* * Detach the perf_ctx_data for the system-wide event. 
+ * + * Done without holding global_ctx_data_rwsem; typically + * attach_global_ctx_data() will skip over this task, but otherwise + * attach_task_ctx_data() will observe PF_EXITING. */ - guard(percpu_read)(&global_ctx_data_rwsem); detach_task_ctx_data(task); } From 4e955c08d6dc76fb60cda9af955ddcebedaa7f69 Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Wed, 14 Jan 2026 09:17:44 +0800 Subject: [PATCH 47/58] perf/x86/intel: Support the 4 new OMR MSRs introduced in DMR and NVL Diamond Rapids (DMR) and Nova Lake (NVL) introduce an enhanced Off-Module Response (OMR) facility, replacing the Off-Core Response (OCR) Performance Monitoring of previous processors. Legacy microarchitectures used the OCR facility to evaluate off-core and multi-core off-module transactions. The newly named OMR facility improves OCR capabilities for scalable coverage of new memory systems in multi-core module systems. Similar to OCR, 4 additional off-module configuration MSRs (OFFMODULE_RSP_0 to OFFMODULE_RSP_3) are introduced to specify attributes of off-module transactions. When multiple identical OMR events are created, they need to occupy the same OFFMODULE_RSP_x MSR. To ensure these multiple identical OMR events can work simultaneously, the intel_alt_er() and intel_fixup_er() helpers are enhanced to rotate these OMR events across different OFFMODULE_RSP_* MSRs, similar to previous OCR events. For more details about OMR, please refer to section 16.1 "OFF-MODULE RESPONSE (OMR) FACILITY" in ISE documentation. Signed-off-by: Dapeng Mi Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260114011750.350569-2-dapeng1.mi@linux.intel.com --- arch/x86/events/intel/core.c | 59 +++++++++++++++++++++++--------- arch/x86/events/perf_event.h | 5 +++ arch/x86/include/asm/msr-index.h | 5 +++ 3 files changed, 52 insertions(+), 17 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 1840ca1918d1..3578c660a904 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3532,17 +3532,32 @@ static int intel_alt_er(struct cpu_hw_events *cpuc, struct extra_reg *extra_regs = hybrid(cpuc->pmu, extra_regs); int alt_idx = idx; - if (!(x86_pmu.flags & PMU_FL_HAS_RSP_1)) - return idx; + switch (idx) { + case EXTRA_REG_RSP_0 ... EXTRA_REG_RSP_1: + if (!(x86_pmu.flags & PMU_FL_HAS_RSP_1)) + return idx; + if (++alt_idx > EXTRA_REG_RSP_1) + alt_idx = EXTRA_REG_RSP_0; + if (config & ~extra_regs[alt_idx].valid_mask) + return idx; + break; - if (idx == EXTRA_REG_RSP_0) - alt_idx = EXTRA_REG_RSP_1; + case EXTRA_REG_OMR_0 ... EXTRA_REG_OMR_3: + if (!(x86_pmu.flags & PMU_FL_HAS_OMR)) + return idx; + if (++alt_idx > EXTRA_REG_OMR_3) + alt_idx = EXTRA_REG_OMR_0; + /* + * Subtracting EXTRA_REG_OMR_0 ensures to get correct + * OMR extra_reg entries which start from 0. + */ + if (config & ~extra_regs[alt_idx - EXTRA_REG_OMR_0].valid_mask) + return idx; + break; - if (idx == EXTRA_REG_RSP_1) - alt_idx = EXTRA_REG_RSP_0; - - if (config & ~extra_regs[alt_idx].valid_mask) - return idx; + default: + break; + } return alt_idx; } @@ -3550,16 +3565,26 @@ static int intel_alt_er(struct cpu_hw_events *cpuc, static void intel_fixup_er(struct perf_event *event, int idx) { struct extra_reg *extra_regs = hybrid(event->pmu, extra_regs); - event->hw.extra_reg.idx = idx; + int er_idx; - if (idx == EXTRA_REG_RSP_0) { + event->hw.extra_reg.idx = idx; + switch (idx) { + case EXTRA_REG_RSP_0 ... 
EXTRA_REG_RSP_1: + er_idx = idx - EXTRA_REG_RSP_0; event->hw.config &= ~INTEL_ARCH_EVENT_MASK; - event->hw.config |= extra_regs[EXTRA_REG_RSP_0].event; - event->hw.extra_reg.reg = MSR_OFFCORE_RSP_0; - } else if (idx == EXTRA_REG_RSP_1) { - event->hw.config &= ~INTEL_ARCH_EVENT_MASK; - event->hw.config |= extra_regs[EXTRA_REG_RSP_1].event; - event->hw.extra_reg.reg = MSR_OFFCORE_RSP_1; + event->hw.config |= extra_regs[er_idx].event; + event->hw.extra_reg.reg = MSR_OFFCORE_RSP_0 + er_idx; + break; + + case EXTRA_REG_OMR_0 ... EXTRA_REG_OMR_3: + er_idx = idx - EXTRA_REG_OMR_0; + event->hw.config &= ~ARCH_PERFMON_EVENTSEL_UMASK; + event->hw.config |= 1ULL << (8 + er_idx); + event->hw.extra_reg.reg = MSR_OMR_0 + er_idx; + break; + + default: + pr_warn("The extra reg idx %d is not supported.\n", idx); } } diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 3161ec0a3416..586e3fdfe6d8 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -45,6 +45,10 @@ enum extra_reg_type { EXTRA_REG_FE = 4, /* fe_* */ EXTRA_REG_SNOOP_0 = 5, /* snoop response 0 */ EXTRA_REG_SNOOP_1 = 6, /* snoop response 1 */ + EXTRA_REG_OMR_0 = 7, /* OMR 0 */ + EXTRA_REG_OMR_1 = 8, /* OMR 1 */ + EXTRA_REG_OMR_2 = 9, /* OMR 2 */ + EXTRA_REG_OMR_3 = 10, /* OMR 3 */ EXTRA_REG_MAX /* number of entries needed */ }; @@ -1099,6 +1103,7 @@ do { \ #define PMU_FL_RETIRE_LATENCY 0x200 /* Support Retire Latency in PEBS */ #define PMU_FL_BR_CNTR 0x400 /* Support branch counter logging */ #define PMU_FL_DYN_CONSTRAINT 0x800 /* Needs dynamic constraint */ +#define PMU_FL_HAS_OMR 0x1000 /* has 4 equivalent OMR regs */ #define EVENT_VAR(_id) event_attr_##_id #define EVENT_PTR(_id) &event_attr_##_id.attr.attr diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 3d0a0950d20a..6d1b69ea01c2 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -263,6 +263,11 @@ #define MSR_SNOOP_RSP_0 0x00001328 #define MSR_SNOOP_RSP_1 0x00001329 +#define MSR_OMR_0 0x000003e0 +#define MSR_OMR_1 0x000003e1 +#define MSR_OMR_2 0x000003e2 +#define MSR_OMR_3 0x000003e3 + #define MSR_LBR_SELECT 0x000001c8 #define MSR_LBR_TOS 0x000001c9 From d2bdcde9626cbea0c44a6aaa33b440c8adf81e09 Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Wed, 14 Jan 2026 09:17:45 +0800 Subject: [PATCH 48/58] perf/x86/intel: Add support for PEBS memory auxiliary info field in DMR With the introduction of the OMR feature, the PEBS memory auxiliary info field for load and store latency events has been restructured for DMR. The memory auxiliary info field's bit[8] indicates whether a L2 cache miss occurred for a memory load or store instruction. If bit[8] is 0, it signifies no L2 cache miss, and bits[7:0] specify the exact cache data source (up to the L2 cache level). If bit[8] is 1, bits[7:0] represent the OMR encoding, indicating the specific L3 cache or memory region involved in the memory access. A significant enhancement is OMR encoding provides up to 8 fine-grained memory regions besides the cache region. A significant enhancement for OMR encoding is the ability to provide up to 8 fine-grained memory regions in addition to the cache region, offering more detailed insights into memory access regions. For detailed information on the memory auxiliary info encoding, please refer to section 16.2 "PEBS LOAD LATENCY AND STORE LATENCY FACILITY" in the ISE documentation. This patch ensures that the PEBS memory auxiliary info field is correctly interpreted and utilized in DMR. 
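A minimal decode sketch of the layout described above (the helper is illustrative; the in-kernel decode is the pnc_latency_data()/parse_omr_data_source() pair added below): bit[8] selects whether bits[7:0] index the L2-hit data-source table or carry an OMR encoding.

	/* True if bits[7:0] of the memory auxiliary info are an OMR encoding. */
	static inline bool pebs_aux_is_omr(u64 aux)
	{
		return aux & (1ULL << 8);	/* bit[8]: L2 miss */
	}

	/*
	 * Examples, per the tables in this patch: aux = 0x004 decodes as an
	 * L2 hit (clean); aux = 0x108 is an L2 miss whose OMR source 0x8
	 * names memory region 0.
	 */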
Signed-off-by: Dapeng Mi Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260114011750.350569-3-dapeng1.mi@linux.intel.com --- arch/x86/events/intel/ds.c | 140 ++++++++++++++++++++++++++ arch/x86/events/perf_event.h | 2 + include/uapi/linux/perf_event.h | 27 ++++- tools/include/uapi/linux/perf_event.h | 27 ++++- 4 files changed, 190 insertions(+), 6 deletions(-) diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index feb1c3cf63e4..272e652f25fc 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -34,6 +34,17 @@ struct pebs_record_32 { */ +union omr_encoding { + struct { + u8 omr_source : 4; + u8 omr_remote : 1; + u8 omr_hitm : 1; + u8 omr_snoop : 1; + u8 omr_promoted : 1; + }; + u8 omr_full; +}; + union intel_x86_pebs_dse { u64 val; struct { @@ -73,6 +84,18 @@ union intel_x86_pebs_dse { unsigned int lnc_addr_blk:1; unsigned int ld_reserved6:18; }; + struct { + unsigned int pnc_dse: 8; + unsigned int pnc_l2_miss:1; + unsigned int pnc_stlb_clean_hit:1; + unsigned int pnc_stlb_any_hit:1; + unsigned int pnc_stlb_miss:1; + unsigned int pnc_locked:1; + unsigned int pnc_data_blk:1; + unsigned int pnc_addr_blk:1; + unsigned int pnc_fb_full:1; + unsigned int ld_reserved8:16; + }; }; @@ -228,6 +251,85 @@ void __init intel_pmu_pebs_data_source_lnl(void) __intel_pmu_pebs_data_source_cmt(data_source); } +/* Version for Panthercove and later */ + +/* L2 hit */ +#define PNC_PEBS_DATA_SOURCE_MAX 16 +static u64 pnc_pebs_l2_hit_data_source[PNC_PEBS_DATA_SOURCE_MAX] = { + P(OP, LOAD) | P(LVL, NA) | LEVEL(NA) | P(SNOOP, NA), /* 0x00: non-cache access */ + OP_LH | LEVEL(L0) | P(SNOOP, NONE), /* 0x01: L0 hit */ + OP_LH | P(LVL, L1) | LEVEL(L1) | P(SNOOP, NONE), /* 0x02: L1 hit */ + OP_LH | P(LVL, LFB) | LEVEL(LFB) | P(SNOOP, NONE), /* 0x03: L1 Miss Handling Buffer hit */ + OP_LH | P(LVL, L2) | LEVEL(L2) | P(SNOOP, NONE), /* 0x04: L2 Hit Clean */ + 0, /* 0x05: Reserved */ + 0, /* 0x06: Reserved */ + OP_LH | P(LVL, L2) | LEVEL(L2) | P(SNOOP, HIT), /* 0x07: L2 Hit Snoop HIT */ + OP_LH | P(LVL, L2) | LEVEL(L2) | P(SNOOP, HITM), /* 0x08: L2 Hit Snoop Hit Modified */ + OP_LH | P(LVL, L2) | LEVEL(L2) | P(SNOOP, MISS), /* 0x09: Prefetch Promotion */ + OP_LH | P(LVL, L2) | LEVEL(L2) | P(SNOOP, MISS), /* 0x0a: Cross Core Prefetch Promotion */ + 0, /* 0x0b: Reserved */ + 0, /* 0x0c: Reserved */ + 0, /* 0x0d: Reserved */ + 0, /* 0x0e: Reserved */ + OP_LH | P(LVL, UNC) | LEVEL(NA) | P(SNOOP, NONE), /* 0x0f: uncached */ +}; + +/* L2 miss */ +#define OMR_DATA_SOURCE_MAX 16 +static u64 omr_data_source[OMR_DATA_SOURCE_MAX] = { + P(OP, LOAD) | P(LVL, NA) | LEVEL(NA) | P(SNOOP, NA), /* 0x00: invalid */ + 0, /* 0x01: Reserved */ + OP_LH | P(LVL, L3) | LEVEL(L3) | P(REGION, L_SHARE), /* 0x02: local CA shared cache */ + OP_LH | P(LVL, L3) | LEVEL(L3) | P(REGION, L_NON_SHARE),/* 0x03: local CA non-shared cache */ + OP_LH | P(LVL, L3) | LEVEL(L3) | P(REGION, O_IO), /* 0x04: other CA IO agent */ + OP_LH | P(LVL, L3) | LEVEL(L3) | P(REGION, O_SHARE), /* 0x05: other CA shared cache */ + OP_LH | P(LVL, L3) | LEVEL(L3) | P(REGION, O_NON_SHARE),/* 0x06: other CA non-shared cache */ + OP_LH | LEVEL(RAM) | P(REGION, MMIO), /* 0x07: MMIO */ + OP_LH | LEVEL(RAM) | P(REGION, MEM0), /* 0x08: Memory region 0 */ + OP_LH | LEVEL(RAM) | P(REGION, MEM1), /* 0x09: Memory region 1 */ + OP_LH | LEVEL(RAM) | P(REGION, MEM2), /* 0x0a: Memory region 2 */ + OP_LH | LEVEL(RAM) | P(REGION, MEM3), /* 0x0b: Memory region 3 */ + OP_LH | LEVEL(RAM) | P(REGION, MEM4), /* 0x0c: Memory region 4 */ + 
OP_LH | LEVEL(RAM) | P(REGION, MEM5), /* 0x0d: Memory region 5 */ + OP_LH | LEVEL(RAM) | P(REGION, MEM6), /* 0x0e: Memory region 6 */ + OP_LH | LEVEL(RAM) | P(REGION, MEM7), /* 0x0f: Memory region 7 */ +}; + +static u64 parse_omr_data_source(u8 dse) +{ + union omr_encoding omr; + u64 val = 0; + + omr.omr_full = dse; + val = omr_data_source[omr.omr_source]; + if (omr.omr_source > 0x1 && omr.omr_source < 0x7) + val |= omr.omr_remote ? P(LVL, REM_CCE1) : 0; + else if (omr.omr_source > 0x7) + val |= omr.omr_remote ? P(LVL, REM_RAM1) : P(LVL, LOC_RAM); + + if (omr.omr_remote) + val |= REM; + + val |= omr.omr_hitm ? P(SNOOP, HITM) : P(SNOOP, HIT); + + if (omr.omr_source == 0x2) { + u8 snoop = omr.omr_snoop | omr.omr_promoted; + + if (snoop == 0x0) + val |= P(SNOOP, NA); + else if (snoop == 0x1) + val |= P(SNOOP, MISS); + else if (snoop == 0x2) + val |= P(SNOOP, HIT); + else if (snoop == 0x3) + val |= P(SNOOP, NONE); + } else if (omr.omr_source > 0x2 && omr.omr_source < 0x7) { + val |= omr.omr_snoop ? P(SNOOPX, FWD) : 0; + } + + return val; +} + static u64 precise_store_data(u64 status) { union intel_x86_pebs_dse dse; @@ -411,6 +513,44 @@ u64 arl_h_latency_data(struct perf_event *event, u64 status) return lnl_latency_data(event, status); } +u64 pnc_latency_data(struct perf_event *event, u64 status) +{ + union intel_x86_pebs_dse dse; + union perf_mem_data_src src; + u64 val; + + dse.val = status; + + if (!dse.pnc_l2_miss) + val = pnc_pebs_l2_hit_data_source[dse.pnc_dse & 0xf]; + else + val = parse_omr_data_source(dse.pnc_dse); + + if (!val) + val = P(OP, LOAD) | LEVEL(NA) | P(SNOOP, NA); + + if (dse.pnc_stlb_miss) + val |= P(TLB, MISS) | P(TLB, L2); + else + val |= P(TLB, HIT) | P(TLB, L1) | P(TLB, L2); + + if (dse.pnc_locked) + val |= P(LOCK, LOCKED); + + if (dse.pnc_data_blk) + val |= P(BLK, DATA); + if (dse.pnc_addr_blk) + val |= P(BLK, ADDR); + if (!dse.pnc_data_blk && !dse.pnc_addr_blk) + val |= P(BLK, NA); + + src.val = val; + if (event->hw.flags & PERF_X86_EVENT_PEBS_ST_HSW) + src.mem_op = P(OP, STORE); + + return src.val; +} + static u64 load_latency_data(struct perf_event *event, u64 status) { union intel_x86_pebs_dse dse; diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 586e3fdfe6d8..bd501c2a0f73 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -1664,6 +1664,8 @@ u64 lnl_latency_data(struct perf_event *event, u64 status); u64 arl_h_latency_data(struct perf_event *event, u64 status); +u64 pnc_latency_data(struct perf_event *event, u64 status); + extern struct event_constraint intel_core2_pebs_event_constraints[]; extern struct event_constraint intel_atom_pebs_event_constraints[]; diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index c44a8fb3e418..533393ec94d0 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -1330,14 +1330,16 @@ union perf_mem_data_src { mem_snoopx : 2, /* Snoop mode, ext */ mem_blk : 3, /* Access blocked */ mem_hops : 3, /* Hop level */ - mem_rsvd : 18; + mem_region : 5, /* cache/memory regions */ + mem_rsvd : 13; }; }; #elif defined(__BIG_ENDIAN_BITFIELD) union perf_mem_data_src { __u64 val; struct { - __u64 mem_rsvd : 18, + __u64 mem_rsvd : 13, + mem_region : 5, /* cache/memory regions */ mem_hops : 3, /* Hop level */ mem_blk : 3, /* Access blocked */ mem_snoopx : 2, /* Snoop mode, ext */ @@ -1394,7 +1396,7 @@ union perf_mem_data_src { #define PERF_MEM_LVLNUM_L4 0x0004 /* L4 */ #define PERF_MEM_LVLNUM_L2_MHB 0x0005 /* L2 Miss Handling Buffer 
*/ #define PERF_MEM_LVLNUM_MSC 0x0006 /* Memory-side Cache */ -/* 0x007 available */ +#define PERF_MEM_LVLNUM_L0 0x0007 /* L0 */ #define PERF_MEM_LVLNUM_UNC 0x0008 /* Uncached */ #define PERF_MEM_LVLNUM_CXL 0x0009 /* CXL */ #define PERF_MEM_LVLNUM_IO 0x000a /* I/O */ @@ -1447,6 +1449,25 @@ union perf_mem_data_src { /* 5-7 available */ #define PERF_MEM_HOPS_SHIFT 43 +/* Cache/Memory region */ +#define PERF_MEM_REGION_NA 0x0 /* Invalid */ +#define PERF_MEM_REGION_RSVD 0x01 /* Reserved */ +#define PERF_MEM_REGION_L_SHARE 0x02 /* Local CA shared cache */ +#define PERF_MEM_REGION_L_NON_SHARE 0x03 /* Local CA non-shared cache */ +#define PERF_MEM_REGION_O_IO 0x04 /* Other CA IO agent */ +#define PERF_MEM_REGION_O_SHARE 0x05 /* Other CA shared cache */ +#define PERF_MEM_REGION_O_NON_SHARE 0x06 /* Other CA non-shared cache */ +#define PERF_MEM_REGION_MMIO 0x07 /* MMIO */ +#define PERF_MEM_REGION_MEM0 0x08 /* Memory region 0 */ +#define PERF_MEM_REGION_MEM1 0x09 /* Memory region 1 */ +#define PERF_MEM_REGION_MEM2 0x0a /* Memory region 2 */ +#define PERF_MEM_REGION_MEM3 0x0b /* Memory region 3 */ +#define PERF_MEM_REGION_MEM4 0x0c /* Memory region 4 */ +#define PERF_MEM_REGION_MEM5 0x0d /* Memory region 5 */ +#define PERF_MEM_REGION_MEM6 0x0e /* Memory region 6 */ +#define PERF_MEM_REGION_MEM7 0x0f /* Memory region 7 */ +#define PERF_MEM_REGION_SHIFT 46 + #define PERF_MEM_S(a, s) \ (((__u64)PERF_MEM_##a##_##s) << PERF_MEM_##a##_SHIFT) diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h index c44a8fb3e418..d4b99610a3b0 100644 --- a/tools/include/uapi/linux/perf_event.h +++ b/tools/include/uapi/linux/perf_event.h @@ -1330,14 +1330,16 @@ union perf_mem_data_src { mem_snoopx : 2, /* Snoop mode, ext */ mem_blk : 3, /* Access blocked */ mem_hops : 3, /* Hop level */ - mem_rsvd : 18; + mem_region : 5, /* cache/memory regions */ + mem_rsvd : 13; }; }; #elif defined(__BIG_ENDIAN_BITFIELD) union perf_mem_data_src { __u64 val; struct { - __u64 mem_rsvd : 18, + __u64 mem_rsvd : 13, + mem_region : 5, /* cache/memory regions */ mem_hops : 3, /* Hop level */ mem_blk : 3, /* Access blocked */ mem_snoopx : 2, /* Snoop mode, ext */ @@ -1394,7 +1396,7 @@ union perf_mem_data_src { #define PERF_MEM_LVLNUM_L4 0x0004 /* L4 */ #define PERF_MEM_LVLNUM_L2_MHB 0x0005 /* L2 Miss Handling Buffer */ #define PERF_MEM_LVLNUM_MSC 0x0006 /* Memory-side Cache */ -/* 0x007 available */ +#define PERF_MEM_LVLNUM_L0 0x0007 /* L0 */ #define PERF_MEM_LVLNUM_UNC 0x0008 /* Uncached */ #define PERF_MEM_LVLNUM_CXL 0x0009 /* CXL */ #define PERF_MEM_LVLNUM_IO 0x000a /* I/O */ @@ -1447,6 +1449,25 @@ union perf_mem_data_src { /* 5-7 available */ #define PERF_MEM_HOPS_SHIFT 43 +/* Cache/Memory region */ +#define PERF_MEM_REGION_NA 0x0 /* Invalid */ +#define PERF_MEM_REGION_RSVD 0x01 /* Reserved */ +#define PERF_MEM_REGION_L_SHARE 0x02 /* Local CA shared cache */ +#define PERF_MEM_REGION_L_NON_SHARE 0x03 /* Local CA non-shared cache */ +#define PERF_MEM_REGION_O_IO 0x04 /* Other CA IO agent */ +#define PERF_MEM_REGION_O_SHARE 0x05 /* Other CA shared cache */ +#define PERF_MEM_REGION_O_NON_SHARE 0x06 /* Other CA non-shared cache */ +#define PERF_MEM_REGION_MMIO 0x07 /* MMIO */ +#define PERF_MEM_REGION_MEM0 0x08 /* Memory region 0 */ +#define PERF_MEM_REGION_MEM1 0x09 /* Memory region 1 */ +#define PERF_MEM_REGION_MEM2 0x0a /* Memory region 2 */ +#define PERF_MEM_REGION_MEM3 0x0b /* Memory region 3 */ +#define PERF_MEM_REGION_MEM4 0x0c /* Memory region 4 */ +#define PERF_MEM_REGION_MEM5 0x0d /* Memory region 5 */ 
+#define PERF_MEM_REGION_MEM6 0x0e /* Memory region 6 */ +#define PERF_MEM_REGION_MEM7 0x0f /* Memory region 7 */ +#define PERF_MEM_REGION_SHIFT 46 + #define PERF_MEM_S(a, s) \ (((__u64)PERF_MEM_##a##_##s) << PERF_MEM_##a##_SHIFT) From d345b6bb886004ac1018da0348b5da7d9906071b Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Wed, 14 Jan 2026 09:17:46 +0800 Subject: [PATCH 49/58] perf/x86/intel: Add core PMU support for DMR This patch enables core PMU features for Diamond Rapids (Panther Cove microarchitecture), including Panther Cove specific counter and PEBS constraints, a new cache events ID table, and the model-specific OMR events extra registers table. For detailed information about counter constraints, please refer to section 16.3 "COUNTER RESTRICTIONS" in the ISE documentation. Signed-off-by: Dapeng Mi Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260114011750.350569-4-dapeng1.mi@linux.intel.com --- arch/x86/events/intel/core.c | 179 ++++++++++++++++++++++++++++++++++- arch/x86/events/intel/ds.c | 27 ++++++ arch/x86/events/perf_event.h | 2 + 3 files changed, 207 insertions(+), 1 deletion(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 3578c660a904..b2f99d47292b 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -435,6 +435,62 @@ static struct extra_reg intel_lnc_extra_regs[] __read_mostly = { EVENT_EXTRA_END }; +static struct event_constraint intel_pnc_event_constraints[] = { + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x0100, 0), /* INST_RETIRED.PREC_DIST */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ + FIXED_EVENT_CONSTRAINT(0x013c, 2), /* CPU_CLK_UNHALTED.REF_TSC_P */ + FIXED_EVENT_CONSTRAINT(0x0400, 3), /* SLOTS */ + METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_RETIRING, 0), + METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_BAD_SPEC, 1), + METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_FE_BOUND, 2), + METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_BE_BOUND, 3), + METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_HEAVY_OPS, 4), + METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_BR_MISPREDICT, 5), + METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_FETCH_LAT, 6), + METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_MEM_BOUND, 7), + + INTEL_EVENT_CONSTRAINT(0x20, 0xf), + INTEL_EVENT_CONSTRAINT(0x79, 0xf), + + INTEL_UEVENT_CONSTRAINT(0x0275, 0xf), + INTEL_UEVENT_CONSTRAINT(0x0176, 0xf), + INTEL_UEVENT_CONSTRAINT(0x04a4, 0x1), + INTEL_UEVENT_CONSTRAINT(0x08a4, 0x1), + INTEL_UEVENT_CONSTRAINT(0x01cd, 0xfc), + INTEL_UEVENT_CONSTRAINT(0x02cd, 0x3), + + INTEL_EVENT_CONSTRAINT(0xd0, 0xf), + INTEL_EVENT_CONSTRAINT(0xd1, 0xf), + INTEL_EVENT_CONSTRAINT(0xd4, 0xf), + INTEL_EVENT_CONSTRAINT(0xd6, 0xf), + INTEL_EVENT_CONSTRAINT(0xdf, 0xf), + INTEL_EVENT_CONSTRAINT(0xce, 0x1), + + INTEL_UEVENT_CONSTRAINT(0x01b1, 0x8), + INTEL_UEVENT_CONSTRAINT(0x0847, 0xf), + INTEL_UEVENT_CONSTRAINT(0x0446, 0xf), + INTEL_UEVENT_CONSTRAINT(0x0846, 0xf), + INTEL_UEVENT_CONSTRAINT(0x0148, 0xf), + + EVENT_CONSTRAINT_END +}; + +static struct extra_reg intel_pnc_extra_regs[] __read_mostly = { + /* must define OMR_X first, see intel_alt_er() */ + INTEL_UEVENT_EXTRA_REG(0x012a, MSR_OMR_0, 0x40ffffff0000ffffull, OMR_0), + INTEL_UEVENT_EXTRA_REG(0x022a, MSR_OMR_1, 0x40ffffff0000ffffull, OMR_1), + INTEL_UEVENT_EXTRA_REG(0x042a, MSR_OMR_2, 0x40ffffff0000ffffull, OMR_2), + INTEL_UEVENT_EXTRA_REG(0x082a, MSR_OMR_3, 0x40ffffff0000ffffull, OMR_3), + 
INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd), + INTEL_UEVENT_EXTRA_REG(0x02c6, MSR_PEBS_FRONTEND, 0x9, FE), + INTEL_UEVENT_EXTRA_REG(0x03c6, MSR_PEBS_FRONTEND, 0x7fff1f, FE), + INTEL_UEVENT_EXTRA_REG(0x40ad, MSR_PEBS_FRONTEND, 0xf, FE), + INTEL_UEVENT_EXTRA_REG(0x04c2, MSR_PEBS_FRONTEND, 0x8, FE), + EVENT_EXTRA_END +}; + EVENT_ATTR_STR(mem-loads, mem_ld_nhm, "event=0x0b,umask=0x10,ldlat=3"); EVENT_ATTR_STR(mem-loads, mem_ld_snb, "event=0xcd,umask=0x1,ldlat=3"); EVENT_ATTR_STR(mem-stores, mem_st_snb, "event=0xcd,umask=0x2"); @@ -650,6 +706,102 @@ static __initconst const u64 glc_hw_cache_extra_regs }, }; +static __initconst const u64 pnc_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x81d0, + [ C(RESULT_MISS) ] = 0xe124, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x82d0, + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_MISS) ] = 0xe424, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x12a, + [ C(RESULT_MISS) ] = 0x12a, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x12a, + [ C(RESULT_MISS) ] = 0x12a, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x81d0, + [ C(RESULT_MISS) ] = 0xe12, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x82d0, + [ C(RESULT_MISS) ] = 0xe13, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = 0xe11, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x4c4, + [ C(RESULT_MISS) ] = 0x4c5, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + +static __initconst const u64 pnc_hw_cache_extra_regs + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x4000000000000001, + [ C(RESULT_MISS) ] = 0xFFFFF000000001, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x4000000000000002, + [ C(RESULT_MISS) ] = 0xFFFFF000000002, + }, + }, +}; + /* * Notes on the events: * - data reads do not include code reads (comparable to earlier tables) @@ -7236,6 +7388,20 @@ static __always_inline void intel_pmu_init_lnc(struct pmu *pmu) hybrid(pmu, extra_regs) = intel_lnc_extra_regs; } +static __always_inline void intel_pmu_init_pnc(struct pmu *pmu) +{ + intel_pmu_init_glc(pmu); + x86_pmu.flags &= ~PMU_FL_HAS_RSP_1; + x86_pmu.flags |= PMU_FL_HAS_OMR; + memcpy(hybrid_var(pmu, hw_cache_event_ids), + pnc_hw_cache_event_ids, sizeof(hw_cache_event_ids)); + memcpy(hybrid_var(pmu, hw_cache_extra_regs), + pnc_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); + hybrid(pmu, event_constraints) = intel_pnc_event_constraints; + hybrid(pmu, pebs_constraints) = intel_pnc_pebs_event_constraints; + hybrid(pmu, extra_regs) = intel_pnc_extra_regs; +} + static __always_inline void intel_pmu_init_skt(struct pmu *pmu) { intel_pmu_init_grt(pmu); @@ -7897,9 +8063,21 @@ __init int intel_pmu_init(void) x86_pmu.extra_regs = intel_rwc_extra_regs; 
pr_cont("Granite Rapids events, "); name = "granite_rapids"; + goto glc_common; + + case INTEL_DIAMONDRAPIDS_X: + intel_pmu_init_pnc(NULL); + x86_pmu.pebs_latency_data = pnc_latency_data; + + pr_cont("Panthercove events, "); + name = "panthercove"; + goto glc_base; glc_common: intel_pmu_init_glc(NULL); + intel_pmu_pebs_data_source_skl(true); + + glc_base: x86_pmu.pebs_ept = 1; x86_pmu.hw_config = hsw_hw_config; x86_pmu.get_event_constraints = glc_get_event_constraints; @@ -7909,7 +8087,6 @@ __init int intel_pmu_init(void) mem_attr = glc_events_attrs; td_attr = glc_td_events_attrs; tsx_attr = glc_tsx_events_attrs; - intel_pmu_pebs_data_source_skl(true); break; case INTEL_ALDERLAKE: diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 272e652f25fc..06e42ac33749 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1425,6 +1425,33 @@ struct event_constraint intel_lnc_pebs_event_constraints[] = { EVENT_CONSTRAINT_END }; +struct event_constraint intel_pnc_pebs_event_constraints[] = { + INTEL_FLAGS_UEVENT_CONSTRAINT(0x100, 0x100000000ULL), /* INST_RETIRED.PREC_DIST */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x0400, 0x800000000ULL), + + INTEL_HYBRID_LDLAT_CONSTRAINT(0x1cd, 0xfc), + INTEL_HYBRID_STLAT_CONSTRAINT(0x2cd, 0x3), + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_STORES */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x21d0, 0xf), /* MEM_INST_RETIRED.LOCK_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x41d0, 0xf), /* MEM_INST_RETIRED.SPLIT_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x42d0, 0xf), /* MEM_INST_RETIRED.SPLIT_STORES */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x81d0, 0xf), /* MEM_INST_RETIRED.ALL_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x82d0, 0xf), /* MEM_INST_RETIRED.ALL_STORES */ + + INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD_RANGE(0xd1, 0xd4, 0xf), + + INTEL_FLAGS_EVENT_CONSTRAINT(0xd0, 0xf), + INTEL_FLAGS_EVENT_CONSTRAINT(0xd6, 0xf), + + /* + * Everything else is handled by PMU_FL_PEBS_ALL, because we + * need the full constraints from the main table. + */ + + EVENT_CONSTRAINT_END +}; + struct event_constraint *intel_pebs_constraints(struct perf_event *event) { struct event_constraint *pebs_constraints = hybrid(event->pmu, pebs_constraints); diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index bd501c2a0f73..cbca1888e8f7 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -1698,6 +1698,8 @@ extern struct event_constraint intel_glc_pebs_event_constraints[]; extern struct event_constraint intel_lnc_pebs_event_constraints[]; +extern struct event_constraint intel_pnc_pebs_event_constraints[]; + struct event_constraint *intel_pebs_constraints(struct perf_event *event); void intel_pmu_pebs_add(struct perf_event *event); From 7cd264d1972d13177acc1ac9fb11ee0a7003e2e6 Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Wed, 14 Jan 2026 09:17:47 +0800 Subject: [PATCH 50/58] perf/x86/intel: Add support for PEBS memory auxiliary info field in NVL Similar to DMR (Panther Cove uarch), both P-core (Coyote Cove uarch) and E-core (Arctic Wolf uarch) of NVL adopt the new PEBS memory auxiliary info layout. Coyote Cove microarchitecture shares the same PMU capabilities, including the memory auxiliary info layout, with Panther Cove. 
Arctic Wolf microarchitecture has a similar layout to Panther Cove, with the only difference being specific data source encoding for L2 hit cases (up to the L2 cache level). The OMR encoding remains the same as in Panther Cove. For detailed information on the memory auxiliary info encoding, please refer to section 16.2 "PEBS LOAD LATENCY AND STORE LATENCY FACILITY" in the latest ISE documentation. This patch defines Arctic Wolf specific data source encoding and then supports PEBS memory auxiliary info field for NVL. Signed-off-by: Dapeng Mi Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260114011750.350569-5-dapeng1.mi@linux.intel.com --- arch/x86/events/intel/ds.c | 83 ++++++++++++++++++++++++++++++++++++ arch/x86/events/perf_event.h | 2 + 2 files changed, 85 insertions(+) diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 06e42ac33749..a47f173d411b 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -96,6 +96,18 @@ union intel_x86_pebs_dse { unsigned int pnc_fb_full:1; unsigned int ld_reserved8:16; }; + struct { + unsigned int arw_dse:8; + unsigned int arw_l2_miss:1; + unsigned int arw_xq_promotion:1; + unsigned int arw_reissue:1; + unsigned int arw_stlb_miss:1; + unsigned int arw_locked:1; + unsigned int arw_data_blk:1; + unsigned int arw_addr_blk:1; + unsigned int arw_fb_full:1; + unsigned int ld_reserved9:16; + }; }; @@ -274,6 +286,29 @@ static u64 pnc_pebs_l2_hit_data_source[PNC_PEBS_DATA_SOURCE_MAX] = { OP_LH | P(LVL, UNC) | LEVEL(NA) | P(SNOOP, NONE), /* 0x0f: uncached */ }; +/* Version for Arctic Wolf and later */ + +/* L2 hit */ +#define ARW_PEBS_DATA_SOURCE_MAX 16 +static u64 arw_pebs_l2_hit_data_source[ARW_PEBS_DATA_SOURCE_MAX] = { + P(OP, LOAD) | P(LVL, NA) | LEVEL(NA) | P(SNOOP, NA), /* 0x00: non-cache access */ + OP_LH | P(LVL, L1) | LEVEL(L1) | P(SNOOP, NONE), /* 0x01: L1 hit */ + OP_LH | P(LVL, LFB) | LEVEL(LFB) | P(SNOOP, NONE), /* 0x02: WCB Hit */ + OP_LH | P(LVL, L2) | LEVEL(L2) | P(SNOOP, NONE), /* 0x03: L2 Hit Clean */ + OP_LH | P(LVL, L2) | LEVEL(L2) | P(SNOOP, HIT), /* 0x04: L2 Hit Snoop HIT */ + OP_LH | P(LVL, L2) | LEVEL(L2) | P(SNOOP, HITM), /* 0x05: L2 Hit Snoop Hit Modified */ + OP_LH | P(LVL, UNC) | LEVEL(NA) | P(SNOOP, NONE), /* 0x06: uncached */ + 0, /* 0x07: Reserved */ + 0, /* 0x08: Reserved */ + 0, /* 0x09: Reserved */ + 0, /* 0x0a: Reserved */ + 0, /* 0x0b: Reserved */ + 0, /* 0x0c: Reserved */ + 0, /* 0x0d: Reserved */ + 0, /* 0x0e: Reserved */ + 0, /* 0x0f: Reserved */ +}; + /* L2 miss */ #define OMR_DATA_SOURCE_MAX 16 static u64 omr_data_source[OMR_DATA_SOURCE_MAX] = { @@ -458,6 +493,44 @@ u64 cmt_latency_data(struct perf_event *event, u64 status) dse.mtl_fwd_blk); } +static u64 arw_latency_data(struct perf_event *event, u64 status) +{ + union intel_x86_pebs_dse dse; + union perf_mem_data_src src; + u64 val; + + dse.val = status; + + if (!dse.arw_l2_miss) + val = arw_pebs_l2_hit_data_source[dse.arw_dse & 0xf]; + else + val = parse_omr_data_source(dse.arw_dse); + + if (!val) + val = P(OP, LOAD) | LEVEL(NA) | P(SNOOP, NA); + + if (dse.arw_stlb_miss) + val |= P(TLB, MISS) | P(TLB, L2); + else + val |= P(TLB, HIT) | P(TLB, L1) | P(TLB, L2); + + if (dse.arw_locked) + val |= P(LOCK, LOCKED); + + if (dse.arw_data_blk) + val |= P(BLK, DATA); + if (dse.arw_addr_blk) + val |= P(BLK, ADDR); + if (!dse.arw_data_blk && !dse.arw_addr_blk) + val |= P(BLK, NA); + + src.val = val; + if (event->hw.flags & PERF_X86_EVENT_PEBS_ST_HSW) + src.mem_op = P(OP, STORE); + + return src.val; +} + static u64 
lnc_latency_data(struct perf_event *event, u64 status) { union intel_x86_pebs_dse dse; @@ -551,6 +624,16 @@ u64 pnc_latency_data(struct perf_event *event, u64 status) return src.val; } +u64 nvl_latency_data(struct perf_event *event, u64 status) +{ + struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu); + + if (pmu->pmu_type == hybrid_small) + return arw_latency_data(event, status); + + return pnc_latency_data(event, status); +} + static u64 load_latency_data(struct perf_event *event, u64 status) { union intel_x86_pebs_dse dse; diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index cbca1888e8f7..aedc1a7762c2 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -1666,6 +1666,8 @@ u64 arl_h_latency_data(struct perf_event *event, u64 status); u64 pnc_latency_data(struct perf_event *event, u64 status); +u64 nvl_latency_data(struct perf_event *event, u64 status); + extern struct event_constraint intel_core2_pebs_event_constraints[]; extern struct event_constraint intel_atom_pebs_event_constraints[]; From c847a208f43bfeb56943f2ca6fe2baf1db9dee7a Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Wed, 14 Jan 2026 09:17:48 +0800 Subject: [PATCH 51/58] perf/x86/intel: Add core PMU support for Novalake This patch enables core PMU support for Novalake, covering both P-core and E-core. It includes Arctic Wolf-specific counters and PEBS constraints, and the model-specific OMR extra registers table. Since Coyote Cove shares the same PMU capabilities as Panther Cove, the existing Panther Cove PMU enabling functions are reused for Coyote Cove. For detailed information about counter constraints, please refer to section 16.3 "COUNTER RESTRICTIONS" in the ISE documentation. Signed-off-by: Dapeng Mi Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260114011750.350569-6-dapeng1.mi@linux.intel.com --- arch/x86/events/intel/core.c | 99 ++++++++++++++++++++++++++++++++++++ arch/x86/events/intel/ds.c | 11 ++++ arch/x86/events/perf_event.h | 2 + 3 files changed, 112 insertions(+) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index b2f99d47292b..d6bdbb7e449a 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -232,6 +232,29 @@ static struct event_constraint intel_skt_event_constraints[] __read_mostly = { EVENT_CONSTRAINT_END }; +static struct event_constraint intel_arw_event_constraints[] __read_mostly = { + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* pseudo CPU_CLK_UNHALTED.REF */ + FIXED_EVENT_CONSTRAINT(0x013c, 2), /* CPU_CLK_UNHALTED.REF_TSC_P */ + FIXED_EVENT_CONSTRAINT(0x0073, 4), /* TOPDOWN_BAD_SPECULATION.ALL */ + FIXED_EVENT_CONSTRAINT(0x019c, 5), /* TOPDOWN_FE_BOUND.ALL */ + FIXED_EVENT_CONSTRAINT(0x02c2, 6), /* TOPDOWN_RETIRING.ALL */ + INTEL_UEVENT_CONSTRAINT(0x01b7, 0x1), + INTEL_UEVENT_CONSTRAINT(0x02b7, 0x2), + INTEL_UEVENT_CONSTRAINT(0x04b7, 0x4), + INTEL_UEVENT_CONSTRAINT(0x08b7, 0x8), + INTEL_UEVENT_CONSTRAINT(0x01d4, 0x1), + INTEL_UEVENT_CONSTRAINT(0x02d4, 0x2), + INTEL_UEVENT_CONSTRAINT(0x04d4, 0x4), + INTEL_UEVENT_CONSTRAINT(0x08d4, 0x8), + INTEL_UEVENT_CONSTRAINT(0x0175, 0x1), + INTEL_UEVENT_CONSTRAINT(0x0275, 0x2), + INTEL_UEVENT_CONSTRAINT(0x21d3, 0x1), + INTEL_UEVENT_CONSTRAINT(0x22d3, 0x1), + EVENT_CONSTRAINT_END +}; + static struct event_constraint intel_skl_event_constraints[] = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ 
FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ @@ -2319,6 +2342,26 @@ static __initconst const u64 tnt_hw_cache_extra_regs }, }; +static __initconst const u64 arw_hw_cache_extra_regs + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = { + [C(LL)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x4000000000000001, + [C(RESULT_MISS)] = 0xFFFFF000000001, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = 0x4000000000000002, + [C(RESULT_MISS)] = 0xFFFFF000000002, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = 0x0, + [C(RESULT_MISS)] = 0x0, + }, + }, +}; + EVENT_ATTR_STR(topdown-fe-bound, td_fe_bound_tnt, "event=0x71,umask=0x0"); EVENT_ATTR_STR(topdown-retiring, td_retiring_tnt, "event=0xc2,umask=0x0"); EVENT_ATTR_STR(topdown-bad-spec, td_bad_spec_tnt, "event=0x73,umask=0x6"); @@ -2377,6 +2420,22 @@ static struct extra_reg intel_cmt_extra_regs[] __read_mostly = { EVENT_EXTRA_END }; +static struct extra_reg intel_arw_extra_regs[] __read_mostly = { + /* must define OMR_X first, see intel_alt_er() */ + INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OMR_0, 0xc0ffffffffffffffull, OMR_0), + INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OMR_1, 0xc0ffffffffffffffull, OMR_1), + INTEL_UEVENT_EXTRA_REG(0x04b7, MSR_OMR_2, 0xc0ffffffffffffffull, OMR_2), + INTEL_UEVENT_EXTRA_REG(0x08b7, MSR_OMR_3, 0xc0ffffffffffffffull, OMR_3), + INTEL_UEVENT_EXTRA_REG(0x01d4, MSR_OMR_0, 0xc0ffffffffffffffull, OMR_0), + INTEL_UEVENT_EXTRA_REG(0x02d4, MSR_OMR_1, 0xc0ffffffffffffffull, OMR_1), + INTEL_UEVENT_EXTRA_REG(0x04d4, MSR_OMR_2, 0xc0ffffffffffffffull, OMR_2), + INTEL_UEVENT_EXTRA_REG(0x08d4, MSR_OMR_3, 0xc0ffffffffffffffull, OMR_3), + INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x5d0), + INTEL_UEVENT_EXTRA_REG(0x0127, MSR_SNOOP_RSP_0, 0xffffffffffffffffull, SNOOP_0), + INTEL_UEVENT_EXTRA_REG(0x0227, MSR_SNOOP_RSP_1, 0xffffffffffffffffull, SNOOP_1), + EVENT_EXTRA_END +}; + EVENT_ATTR_STR(topdown-fe-bound, td_fe_bound_skt, "event=0x9c,umask=0x01"); EVENT_ATTR_STR(topdown-retiring, td_retiring_skt, "event=0xc2,umask=0x02"); EVENT_ATTR_STR(topdown-be-bound, td_be_bound_skt, "event=0xa4,umask=0x02"); @@ -7410,6 +7469,19 @@ static __always_inline void intel_pmu_init_skt(struct pmu *pmu) static_call_update(intel_pmu_enable_acr_event, intel_pmu_enable_acr); } +static __always_inline void intel_pmu_init_arw(struct pmu *pmu) +{ + intel_pmu_init_grt(pmu); + x86_pmu.flags &= ~PMU_FL_HAS_RSP_1; + x86_pmu.flags |= PMU_FL_HAS_OMR; + memcpy(hybrid_var(pmu, hw_cache_extra_regs), + arw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); + hybrid(pmu, event_constraints) = intel_arw_event_constraints; + hybrid(pmu, pebs_constraints) = intel_arw_pebs_event_constraints; + hybrid(pmu, extra_regs) = intel_arw_extra_regs; + static_call_update(intel_pmu_enable_acr_event, intel_pmu_enable_acr); +} + __init int intel_pmu_init(void) { struct attribute **extra_skl_attr = &empty_attrs; @@ -8250,6 +8322,33 @@ __init int intel_pmu_init(void) name = "arrowlake_h_hybrid"; break; + case INTEL_NOVALAKE: + case INTEL_NOVALAKE_L: + pr_cont("Novalake Hybrid events, "); + name = "novalake_hybrid"; + intel_pmu_init_hybrid(hybrid_big_small); + + x86_pmu.pebs_latency_data = nvl_latency_data; + x86_pmu.get_event_constraints = mtl_get_event_constraints; + x86_pmu.hw_config = adl_hw_config; + + td_attr = lnl_hybrid_events_attrs; + mem_attr = mtl_hybrid_mem_attrs; + tsx_attr = adl_hybrid_tsx_attrs; + extra_attr = boot_cpu_has(X86_FEATURE_RTM) ? 
+ mtl_hybrid_extra_attr_rtm : mtl_hybrid_extra_attr; + + /* Initialize big core specific PerfMon capabilities.*/ + pmu = &x86_pmu.hybrid_pmu[X86_HYBRID_PMU_CORE_IDX]; + intel_pmu_init_pnc(&pmu->pmu); + + /* Initialize Atom core specific PerfMon capabilities.*/ + pmu = &x86_pmu.hybrid_pmu[X86_HYBRID_PMU_ATOM_IDX]; + intel_pmu_init_arw(&pmu->pmu); + + intel_pmu_pebs_data_source_lnl(); + break; + default: switch (x86_pmu.version) { case 1: diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index a47f173d411b..5027afc97b65 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1293,6 +1293,17 @@ struct event_constraint intel_grt_pebs_event_constraints[] = { EVENT_CONSTRAINT_END }; +struct event_constraint intel_arw_pebs_event_constraints[] = { + /* Allow all events as PEBS with no flags */ + INTEL_HYBRID_LAT_CONSTRAINT(0x5d0, 0xff), + INTEL_HYBRID_LAT_CONSTRAINT(0x6d0, 0xff), + INTEL_FLAGS_UEVENT_CONSTRAINT(0x01d4, 0x1), + INTEL_FLAGS_UEVENT_CONSTRAINT(0x02d4, 0x2), + INTEL_FLAGS_UEVENT_CONSTRAINT(0x04d4, 0x4), + INTEL_FLAGS_UEVENT_CONSTRAINT(0x08d4, 0x8), + EVENT_CONSTRAINT_END +}; + struct event_constraint intel_nehalem_pebs_event_constraints[] = { INTEL_PLD_CONSTRAINT(0x100b, 0xf), /* MEM_INST_RETIRED.* */ INTEL_FLAGS_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */ diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index aedc1a7762c2..f7caabc5d487 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -1680,6 +1680,8 @@ extern struct event_constraint intel_glp_pebs_event_constraints[]; extern struct event_constraint intel_grt_pebs_event_constraints[]; +extern struct event_constraint intel_arw_pebs_event_constraints[]; + extern struct event_constraint intel_nehalem_pebs_event_constraints[]; extern struct event_constraint intel_westmere_pebs_event_constraints[]; From 8c74e4e3e0596950554962229582260f1501d899 Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Wed, 14 Jan 2026 09:17:49 +0800 Subject: [PATCH 52/58] perf/x86: Use macros to replace magic numbers in attr_rdpmc Replace magic numbers in attr_rdpmc with macros to improve readability and make their meanings clearer for users. 
Signed-off-by: Dapeng Mi Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260114011750.350569-7-dapeng1.mi@linux.intel.com --- arch/x86/events/core.c | 7 ++++--- arch/x86/events/intel/p6.c | 2 +- arch/x86/events/perf_event.h | 7 +++++++ 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 0ecac9495d74..c2717cb5034f 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2163,7 +2163,8 @@ static int __init init_hw_perf_events(void) pr_cont("%s PMU driver.\n", x86_pmu.name); - x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */ + /* enable userspace RDPMC usage by default */ + x86_pmu.attr_rdpmc = X86_USER_RDPMC_CONDITIONAL_ENABLE; for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next) quirk->func(); @@ -2643,12 +2644,12 @@ static ssize_t set_attr_rdpmc(struct device *cdev, */ if (val == 0) static_branch_inc(&rdpmc_never_available_key); - else if (x86_pmu.attr_rdpmc == 0) + else if (x86_pmu.attr_rdpmc == X86_USER_RDPMC_NEVER_ENABLE) static_branch_dec(&rdpmc_never_available_key); if (val == 2) static_branch_inc(&rdpmc_always_available_key); - else if (x86_pmu.attr_rdpmc == 2) + else if (x86_pmu.attr_rdpmc == X86_USER_RDPMC_ALWAYS_ENABLE) static_branch_dec(&rdpmc_always_available_key); on_each_cpu(cr4_update_pce, NULL, 1); diff --git a/arch/x86/events/intel/p6.c b/arch/x86/events/intel/p6.c index 6e41de355bd8..fb991e0ac614 100644 --- a/arch/x86/events/intel/p6.c +++ b/arch/x86/events/intel/p6.c @@ -243,7 +243,7 @@ static __init void p6_pmu_rdpmc_quirk(void) */ pr_warn("Userspace RDPMC support disabled due to a CPU erratum\n"); x86_pmu.attr_rdpmc_broken = 1; - x86_pmu.attr_rdpmc = 0; + x86_pmu.attr_rdpmc = X86_USER_RDPMC_NEVER_ENABLE; } } diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index f7caabc5d487..24a81d2916e9 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -187,6 +187,13 @@ struct amd_nb { (1ULL << PERF_REG_X86_R14) | \ (1ULL << PERF_REG_X86_R15)) +/* user space rdpmc control values */ +enum { + X86_USER_RDPMC_NEVER_ENABLE = 0, + X86_USER_RDPMC_CONDITIONAL_ENABLE = 1, + X86_USER_RDPMC_ALWAYS_ENABLE = 2, +}; + /* * Per register state. */ From 59af95e028d4114991b9bd96a39ad855b399cc07 Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Wed, 14 Jan 2026 09:17:50 +0800 Subject: [PATCH 53/58] perf/x86/intel: Add support for rdpmc user disable feature Starting with Panther Cove, the rdpmc user disable feature is supported. This feature allows the perf system to disable user space rdpmc reads at the counter level. Currently, when a global counter is active, any user with rdpmc rights can read it, even if perf access permissions forbid it (e.g., disallow reading ring 0 counters). The rdpmc user disable feature mitigates this security concern. Details: - A new RDPMC_USR_DISABLE bit (bit 37) in each EVNTSELx MSR indicates that the GP counter cannot be read by RDPMC in ring 3. - New RDPMC_USR_DISABLE bits in IA32_FIXED_CTR_CTRL MSR (bits 33, 37, 41, 45, etc.) for fixed counters 0, 1, 2, 3, etc. - When calling rdpmc instruction for counter x, the following pseudo code demonstrates how the counter value is obtained: If (!CPL0 && RDPMC_USR_DISABLE[x] == 1) ? 0 : counter_value; - RDPMC_USR_DISABLE is enumerated by CPUID.0x23.0.EBX[2]. 
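The fixed-counter bit positions listed above follow a simple stride; a compact way to express them (macro names are illustrative, not taken from the patch):

	/* GP counter: RDPMC_USR_DISABLE is bit 37 of its EVNTSELx MSR. */
	#define GP_RDPMC_USR_DISABLE_BIT	37
	/* Fixed counter i: bit 33 + 4*i of IA32_FIXED_CTR_CTRL (33, 37, 41, 45, ...). */
	#define FIXED_RDPMC_USR_DISABLE_BIT(i)	(33 + 4 * (i))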
This patch extends the current global user space rdpmc control logic via the sysfs interface (/sys/devices/cpu/rdpmc) as follows: - rdpmc = 0: Global user space rdpmc and counter-level user space rdpmc for all counters are both disabled. - rdpmc = 1: Global user space rdpmc is enabled during the mmap-enabled time window, and counter-level user space rdpmc is enabled only for non-system-wide events. This prevents counter data leaks as count data is cleared during context switches. - rdpmc = 2: Global user space rdpmc and counter-level user space rdpmc for all counters are enabled unconditionally. The new rdpmc settings only affect newly activated perf events; currently active perf events remain unaffected. This simplifies and cleans up the code. The default value of rdpmc remains unchanged at 1. For more details about rdpmc user disable, please refer to chapter 15 "RDPMC USER DISABLE" in ISE documentation. Signed-off-by: Dapeng Mi Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260114011750.350569-8-dapeng1.mi@linux.intel.com --- .../sysfs-bus-event_source-devices-rdpmc | 44 +++++++++++++++++++ arch/x86/events/core.c | 21 +++++++++ arch/x86/events/intel/core.c | 27 ++++++++++++ arch/x86/events/perf_event.h | 6 +++ arch/x86/include/asm/perf_event.h | 8 +++- 5 files changed, 104 insertions(+), 2 deletions(-) create mode 100644 Documentation/ABI/testing/sysfs-bus-event_source-devices-rdpmc diff --git a/Documentation/ABI/testing/sysfs-bus-event_source-devices-rdpmc b/Documentation/ABI/testing/sysfs-bus-event_source-devices-rdpmc new file mode 100644 index 000000000000..59ec18bbb418 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-bus-event_source-devices-rdpmc @@ -0,0 +1,44 @@ +What: /sys/bus/event_source/devices/cpu.../rdpmc +Date: November 2011 +KernelVersion: 3.10 +Contact: Linux kernel mailing list linux-kernel@vger.kernel.org +Description: The /sys/bus/event_source/devices/cpu.../rdpmc attribute + is used to show/manage if rdpmc instruction can be + executed in user space. This attribute supports 3 numbers. + - rdpmc = 0 + user space rdpmc is globally disabled for all PMU + counters. + - rdpmc = 1 + user space rdpmc is globally enabled only in event mmap + ioctl called time window. If the mmap region is unmapped, + user space rdpmc is disabled again. + - rdpmc = 2 + user space rdpmc is globally enabled for all PMU + counters. + + In the Intel platforms supporting counter level's user + space rdpmc disable feature (CPUID.23H.EBX[2] = 1), the + meaning of 3 numbers is extended to + - rdpmc = 0 + global user space rdpmc and counter level's user space + rdpmc of all counters are both disabled. + - rdpmc = 1 + No changes on behavior of global user space rdpmc. + counter level's rdpmc of system-wide events is disabled + but counter level's rdpmc of non-system-wide events is + enabled. + - rdpmc = 2 + global user space rdpmc and counter level's user space + rdpmc of all counters are both enabled unconditionally. + + The default value of rdpmc is 1. + + Please notice: + - global user space rdpmc's behavior would change + immediately along with the rdpmc value's change, + but the behavior of counter level's user space rdpmc + won't take effect immediately until the event is + reactivated or recreated. + - The rdpmc attribute is global, even for x86 hybrid + platforms. For example, changing cpu_core/rdpmc will + also change cpu_atom/rdpmc. 
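For reference only (not part of this patch or its diff): the "mmap enabled time window" that rdpmc = 1 refers to is the usual self-monitoring pattern sketched below. This is a minimal user-space example built on the documented perf_event_mmap_page fields; the event choice is arbitrary and most error handling is omitted.

	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>
	#include <sys/mman.h>
	#include <sys/syscall.h>
	#include <linux/perf_event.h>

	/* raw RDPMC; only works in user space while the rdpmc sysfs setting permits it */
	static uint64_t rdpmc(uint32_t counter)
	{
		uint32_t lo, hi;

		asm volatile("rdpmc" : "=a" (lo), "=d" (hi) : "c" (counter));
		return ((uint64_t)hi << 32) | lo;
	}

	int main(void)
	{
		struct perf_event_attr attr = { 0 };
		struct perf_event_mmap_page *pc;
		uint64_t count;
		int64_t pmc;
		uint32_t seq, idx;
		int fd;

		attr.size = sizeof(attr);
		attr.type = PERF_TYPE_HARDWARE;
		attr.config = PERF_COUNT_HW_INSTRUCTIONS;
		attr.exclude_kernel = 1;

		/* per-task (non system-wide) event: pid = 0, cpu = -1 */
		fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
		if (fd < 0)
			return 1;

		/* mmapping the first page is what opens the rdpmc window in mode 1 */
		pc = mmap(NULL, sysconf(_SC_PAGESIZE), PROT_READ, MAP_SHARED, fd, 0);
		if (pc == MAP_FAILED)
			return 1;

		do {
			seq = pc->lock;
			asm volatile("" ::: "memory");
			idx = pc->index;	/* 0 means no counter assigned; fall back to read() */
			count = pc->offset;
			if (pc->cap_user_rdpmc && idx) {
				pmc = rdpmc(idx - 1);
				/* sign-extend the raw value to the advertised counter width */
				pmc <<= 64 - pc->pmc_width;
				pmc >>= 64 - pc->pmc_width;
				count += pmc;
			}
			asm volatile("" ::: "memory");
		} while (pc->lock != seq);

		printf("instructions so far: %llu\n", (unsigned long long)count);
		return 0;
	}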
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index c2717cb5034f..6df73e8398cd 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2616,6 +2616,27 @@ static ssize_t get_attr_rdpmc(struct device *cdev, return snprintf(buf, 40, "%d\n", x86_pmu.attr_rdpmc); } +/* + * Behaviors of rdpmc value: + * - rdpmc = 0 + * global user space rdpmc and counter level's user space rdpmc of all + * counters are both disabled. + * - rdpmc = 1 + * global user space rdpmc is enabled in mmap enabled time window and + * counter level's user space rdpmc is enabled for only non system-wide + * events. Counter level's user space rdpmc of system-wide events is + * still disabled by default. This won't introduce counter data leak for + * non system-wide events since their count data would be cleared when + * context switches. + * - rdpmc = 2 + * global user space rdpmc and counter level's user space rdpmc of all + * counters are enabled unconditionally. + * + * Suppose the rdpmc value won't be changed frequently, don't dynamically + * reschedule events to make the new rpdmc value take effect on active perf + * events immediately, the new rdpmc value would only impact the new + * activated perf events. This makes code simpler and cleaner. + */ static ssize_t set_attr_rdpmc(struct device *cdev, struct device_attribute *attr, const char *buf, size_t count) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index d6bdbb7e449a..f3ae1f8ee3cd 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3128,6 +3128,8 @@ static void intel_pmu_enable_fixed(struct perf_event *event) bits |= INTEL_FIXED_0_USER; if (hwc->config & ARCH_PERFMON_EVENTSEL_OS) bits |= INTEL_FIXED_0_KERNEL; + if (hwc->config & ARCH_PERFMON_EVENTSEL_RDPMC_USER_DISABLE) + bits |= INTEL_FIXED_0_RDPMC_USER_DISABLE; /* * ANY bit is supported in v3 and up @@ -3263,6 +3265,27 @@ static void intel_pmu_enable_event_ext(struct perf_event *event) __intel_pmu_update_event_ext(hwc->idx, ext); } +static void intel_pmu_update_rdpmc_user_disable(struct perf_event *event) +{ + if (!x86_pmu_has_rdpmc_user_disable(event->pmu)) + return; + + /* + * Counter scope's user-space rdpmc is disabled by default + * except two cases. + * a. rdpmc = 2 (user space rdpmc enabled unconditionally) + * b. rdpmc = 1 and the event is not a system-wide event. + * The count of non-system-wide events would be cleared when + * context switches, so no count data is leaked. 
+ */ + if (x86_pmu.attr_rdpmc == X86_USER_RDPMC_ALWAYS_ENABLE || + (x86_pmu.attr_rdpmc == X86_USER_RDPMC_CONDITIONAL_ENABLE && + event->ctx->task)) + event->hw.config &= ~ARCH_PERFMON_EVENTSEL_RDPMC_USER_DISABLE; + else + event->hw.config |= ARCH_PERFMON_EVENTSEL_RDPMC_USER_DISABLE; +} + DEFINE_STATIC_CALL_NULL(intel_pmu_enable_event_ext, intel_pmu_enable_event_ext); static void intel_pmu_enable_event(struct perf_event *event) @@ -3271,6 +3294,8 @@ static void intel_pmu_enable_event(struct perf_event *event) struct hw_perf_event *hwc = &event->hw; int idx = hwc->idx; + intel_pmu_update_rdpmc_user_disable(event); + if (unlikely(event->attr.precise_ip)) static_call(x86_pmu_pebs_enable)(event); @@ -5869,6 +5894,8 @@ static void update_pmu_cap(struct pmu *pmu) hybrid(pmu, config_mask) |= ARCH_PERFMON_EVENTSEL_UMASK2; if (ebx_0.split.eq) hybrid(pmu, config_mask) |= ARCH_PERFMON_EVENTSEL_EQ; + if (ebx_0.split.rdpmc_user_disable) + hybrid(pmu, config_mask) |= ARCH_PERFMON_EVENTSEL_RDPMC_USER_DISABLE; if (eax_0.split.cntr_subleaf) { cpuid_count(ARCH_PERFMON_EXT_LEAF, ARCH_PERFMON_NUM_COUNTER_LEAF, diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 24a81d2916e9..cd337f3ffd01 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -1333,6 +1333,12 @@ static inline u64 x86_pmu_get_event_config(struct perf_event *event) return event->attr.config & hybrid(event->pmu, config_mask); } +static inline bool x86_pmu_has_rdpmc_user_disable(struct pmu *pmu) +{ + return !!(hybrid(pmu, config_mask) & + ARCH_PERFMON_EVENTSEL_RDPMC_USER_DISABLE); +} + extern struct event_constraint emptyconstraint; extern struct event_constraint unconstrained; diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 0d9af4135e0a..ff5acb8b199b 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -33,6 +33,7 @@ #define ARCH_PERFMON_EVENTSEL_CMASK 0xFF000000ULL #define ARCH_PERFMON_EVENTSEL_BR_CNTR (1ULL << 35) #define ARCH_PERFMON_EVENTSEL_EQ (1ULL << 36) +#define ARCH_PERFMON_EVENTSEL_RDPMC_USER_DISABLE (1ULL << 37) #define ARCH_PERFMON_EVENTSEL_UMASK2 (0xFFULL << 40) #define INTEL_FIXED_BITS_STRIDE 4 @@ -40,6 +41,7 @@ #define INTEL_FIXED_0_USER (1ULL << 1) #define INTEL_FIXED_0_ANYTHREAD (1ULL << 2) #define INTEL_FIXED_0_ENABLE_PMI (1ULL << 3) +#define INTEL_FIXED_0_RDPMC_USER_DISABLE (1ULL << 33) #define INTEL_FIXED_3_METRICS_CLEAR (1ULL << 2) #define HSW_IN_TX (1ULL << 32) @@ -50,7 +52,7 @@ #define INTEL_FIXED_BITS_MASK \ (INTEL_FIXED_0_KERNEL | INTEL_FIXED_0_USER | \ INTEL_FIXED_0_ANYTHREAD | INTEL_FIXED_0_ENABLE_PMI | \ - ICL_FIXED_0_ADAPTIVE) + ICL_FIXED_0_ADAPTIVE | INTEL_FIXED_0_RDPMC_USER_DISABLE) #define intel_fixed_bits_by_idx(_idx, _bits) \ ((_bits) << ((_idx) * INTEL_FIXED_BITS_STRIDE)) @@ -226,7 +228,9 @@ union cpuid35_ebx { unsigned int umask2:1; /* EQ-bit Supported */ unsigned int eq:1; - unsigned int reserved:30; + /* rdpmc user disable Supported */ + unsigned int rdpmc_user_disable:1; + unsigned int reserved:29; } split; unsigned int full; }; From 10d6d2416db2137a5a0ef9162662e5b7fee56dd4 Mon Sep 17 00:00:00 2001 From: Chen Ni Date: Wed, 14 Jan 2026 10:36:52 +0800 Subject: [PATCH 54/58] perf/x86/intel/uncore: Convert comma to semicolon Replace comma between expressions with semicolons. Using a ',' in place of a ';' can have unintended side effects. Although that is not the case here, it is seems best to use ';' unless ',' is intended. Found by inspection. No functional change intended. 
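For illustration only (not code from this patch or from uncore_snb.c), one case where a ',' in place of a ';' does change behaviour is plain assignment, because '=' binds tighter than the comma operator:

	#include <stdio.h>

	int main(void)
	{
		int a;

		a = 1, 2;	/* parses as (a = 1), 2 — the 2 is evaluated and discarded, a stays 1 */
		printf("%d\n", a);	/* prints 1 */

		a = (1, 2);	/* parenthesised comma expression — a becomes 2 */
		printf("%d\n", a);	/* prints 2 */

		return 0;
	}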
Compile tested only. Fixes: e7d5f2ea0923 ("perf/x86/intel/uncore: Add Nova Lake support") Signed-off-by: Chen Ni Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dapeng Mi Link: https://patch.msgid.link/20260114023652.3926117-1-nichen@iscas.ac.cn --- arch/x86/events/intel/uncore_snb.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c index e8e44741200e..3dbc6bacbd9d 100644 --- a/arch/x86/events/intel/uncore_snb.c +++ b/arch/x86/events/intel/uncore_snb.c @@ -2005,11 +2005,11 @@ static struct intel_uncore_type *nvl_msr_uncores[] = { void nvl_uncore_cpu_init(void) { mtl_uncore_cbox.num_boxes = 12; - mtl_uncore_cbox.perf_ctr = NVL_UNC_CBOX_PER_CTR0, - mtl_uncore_cbox.event_ctl = NVL_UNC_CBOX_PERFEVTSEL0, + mtl_uncore_cbox.perf_ctr = NVL_UNC_CBOX_PER_CTR0; + mtl_uncore_cbox.event_ctl = NVL_UNC_CBOX_PERFEVTSEL0; - ptl_uncore_santa.perf_ctr = NVL_UNC_SANTA_CTR0, - ptl_uncore_santa.event_ctl = NVL_UNC_SANTA_CTRL0, + ptl_uncore_santa.perf_ctr = NVL_UNC_SANTA_CTR0; + ptl_uncore_santa.event_ctl = NVL_UNC_SANTA_CTRL0; mtl_uncore_cncu.box_ctl = NVL_UNC_CNCU_BOX_CTL; mtl_uncore_cncu.fixed_ctr = NVL_UNC_CNCU_FIXED_CTR; From d55c571e4333fac71826e8db3b9753fadfbead6a Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 11 Jan 2026 16:00:37 +0100 Subject: [PATCH 55/58] x86/uprobes: Fix XOL allocation failure for 32-bit tasks This script #!/usr/bin/bash echo 0 > /proc/sys/kernel/randomize_va_space echo 'void main(void) {}' > TEST.c # -fcf-protection to ensure that the 1st endbr32 insn can't be emulated gcc -m32 -fcf-protection=branch TEST.c -o test bpftrace -e 'uprobe:./test:main {}' -c ./test "hangs", the probed ./test task enters an endless loop. The problem is that with randomize_va_space == 0 get_unmapped_area(TASK_SIZE - PAGE_SIZE) called by xol_add_vma() can not just return the "addr == TASK_SIZE - PAGE_SIZE" hint, this addr is used by the stack vma. arch_get_unmapped_area_topdown() doesn't take TIF_ADDR32 into account and in_32bit_syscall() is false, this leads to info.high_limit > TASK_SIZE. vm_unmapped_area() happily returns the high address > TASK_SIZE and then get_unmapped_area() returns -ENOMEM after the "if (addr > TASK_SIZE - len)" check. handle_swbp() doesn't report this failure (probably it should) and silently restarts the probed insn. Endless loop. I think that the right fix should change the x86 get_unmapped_area() paths to rely on TIF_ADDR32 rather than in_32bit_syscall(). Note also that if CONFIG_X86_X32_ABI=y, in_x32_syscall() falsely returns true in this case because ->orig_ax = -1. But we need a simple fix for -stable, so this patch just sets TS_COMPAT if the probed task is 32-bit to make in_ia32_syscall() true. 
Fixes: 1b028f784e8c ("x86/mm: Introduce mmap_compat_base() for 32-bit mmap()") Reported-by: Paulo Andrade Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/all/aV5uldEvV7pb4RA8@redhat.com/ Cc: stable@vger.kernel.org Link: https://patch.msgid.link/aWO7Fdxn39piQnxu@redhat.com --- arch/x86/kernel/uprobes.c | 24 ++++++++++++++++++++++++ include/linux/uprobes.h | 1 + kernel/events/uprobes.c | 10 +++++++--- 3 files changed, 32 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 7be8e361ca55..619dddf54424 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -1823,3 +1823,27 @@ bool is_uprobe_at_func_entry(struct pt_regs *regs) return false; } + +#ifdef CONFIG_IA32_EMULATION +unsigned long arch_uprobe_get_xol_area(void) +{ + struct thread_info *ti = current_thread_info(); + unsigned long vaddr; + + /* + * HACK: we are not in a syscall, but x86 get_unmapped_area() paths + * ignore TIF_ADDR32 and rely on in_32bit_syscall() to calculate + * vm_unmapped_area_info.high_limit. + * + * The #ifdef above doesn't cover the CONFIG_X86_X32_ABI=y case, + * but in this case in_32bit_syscall() -> in_x32_syscall() always + * (falsely) returns true because ->orig_ax == -1. + */ + if (test_thread_flag(TIF_ADDR32)) + ti->status |= TS_COMPAT; + vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, PAGE_SIZE, 0, 0); + ti->status &= ~TS_COMPAT; + + return vaddr; +} +#endif diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index ee3d36eda45d..f548fea2adec 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -242,6 +242,7 @@ extern void arch_uprobe_clear_state(struct mm_struct *mm); extern void arch_uprobe_init_state(struct mm_struct *mm); extern void handle_syscall_uprobe(struct pt_regs *regs, unsigned long bp_vaddr); extern void arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr); +extern unsigned long arch_uprobe_get_xol_area(void); #else /* !CONFIG_UPROBES */ struct uprobes_state { }; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index a7d7d83ca1d7..dfbce021fb02 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1694,6 +1694,12 @@ static const struct vm_special_mapping xol_mapping = { .mremap = xol_mremap, }; +unsigned long __weak arch_uprobe_get_xol_area(void) +{ + /* Try to map as high as possible, this is only a hint. */ + return get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, PAGE_SIZE, 0, 0); +} + /* Slot allocation for XOL */ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) { @@ -1709,9 +1715,7 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) } if (!area->vaddr) { - /* Try to map as high as possible, this is only a hint. */ - area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, - PAGE_SIZE, 0, 0); + area->vaddr = arch_uprobe_get_xol_area(); if (IS_ERR_VALUE(area->vaddr)) { ret = area->vaddr; goto fail; From 171efc70097a9f5f207461bed03478a1b0a3dfa6 Mon Sep 17 00:00:00 2001 From: Xiang-Bin Shi Date: Fri, 23 Jan 2026 15:57:19 +0800 Subject: [PATCH 56/58] x86/ibs: Fix typo in dc_l2tlb_miss comment The comment for dc_l2tlb_miss incorrectly describes it as a "hit". This contradicts the field name and the actual bit definition. Fix the comment to correctly describe it as a "miss". 
Signed-off-by: Xiang-Bin Shi Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260123075719.160734-1-eric91102091@gmail.com --- arch/x86/include/asm/amd/ibs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/amd/ibs.h b/arch/x86/include/asm/amd/ibs.h index 3ee5903982c2..fcc8a5abe54e 100644 --- a/arch/x86/include/asm/amd/ibs.h +++ b/arch/x86/include/asm/amd/ibs.h @@ -110,7 +110,7 @@ union ibs_op_data3 { __u64 ld_op:1, /* 0: load op */ st_op:1, /* 1: store op */ dc_l1tlb_miss:1, /* 2: data cache L1TLB miss */ - dc_l2tlb_miss:1, /* 3: data cache L2TLB hit in 2M page */ + dc_l2tlb_miss:1, /* 3: data cache L2TLB miss in 2M page */ dc_l1tlb_hit_2m:1, /* 4: data cache L1TLB hit in 2M page */ dc_l1tlb_hit_1g:1, /* 5: data cache L1TLB hit in 1G page */ dc_l2tlb_hit_2m:1, /* 6: data cache L2TLB hit in 2M page */ From a56a38fd9196fc89401e498d70b7aa9c9679fa6e Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Wed, 28 Jan 2026 10:16:11 -0800 Subject: [PATCH 57/58] uprobes: Fix incorrect lockdep condition in filter_chain() The list_for_each_entry_rcu() in filter_chain() uses rcu_read_lock_trace_held() as the lockdep condition, but the function holds consumer_rwsem, not the RCU trace lock. This gives me the following output when running with some locking debug option enabled: kernel/events/uprobes.c:1141 RCU-list traversed in non-reader section!! filter_chain register_for_each_vma uprobe_unregister_nosync __probe_event_disable Remove the incorrect lockdep condition since the rwsem provides sufficient protection for the list traversal. Fixes: cc01bd044e6a ("uprobes: travers uprobe's consumer list locklessly under SRCU protection") Signed-off-by: Breno Leitao Signed-off-by: Peter Zijlstra (Intel) Acked-by: Oleg Nesterov Acked-by: Andrii Nakryiko Acked-by: Masami Hiramatsu (Google) Cc: stable@vger.kernel.org Link: https://patch.msgid.link/20260128-uprobe_rcu-v2-1-994ea6d32730@debian.org --- kernel/events/uprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index dfbce021fb02..424ef2235b07 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1138,7 +1138,7 @@ static bool filter_chain(struct uprobe *uprobe, struct mm_struct *mm) bool ret = false; down_read(&uprobe->consumer_rwsem); - list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) { + list_for_each_entry(uc, &uprobe->consumers, cons_node) { ret = consumer_filter(uc, mm); if (ret) break; From 7db06e329af30dcb170a6782c1714217ad65033d Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 3 Feb 2026 10:42:04 -0800 Subject: [PATCH 58/58] s390: remove kvm_types.h from Kbuild kvm_types.h is mandatory in include/asm-generic/Kbuild so having it in another Kbuild file causes a warning. Remove it from the arch/ Kbuild file to fix the warning. 
../scripts/Makefile.asm-headers:39: redundant generic-y found in ../arch/s390/include/asm/Kbuild: kvm_types.h Signed-off-by: Randy Dunlap Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260203184204.1329414-1-rdunlap@infradead.org --- arch/s390/include/asm/Kbuild | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/s390/include/asm/Kbuild b/arch/s390/include/asm/Kbuild index 297bf7157968..80bad7de7a04 100644 --- a/arch/s390/include/asm/Kbuild +++ b/arch/s390/include/asm/Kbuild @@ -5,6 +5,5 @@ generated-y += syscall_table.h generated-y += unistd_nr.h generic-y += asm-offsets.h -generic-y += kvm_types.h generic-y += mcs_spinlock.h generic-y += mmzone.h