From 88aea41b9bc5f6cb325396c3311f581c5a1aad65 Mon Sep 17 00:00:00 2001
From: Marc Zyngier
Date: Thu, 20 Feb 2025 13:48:54 +0000
Subject: [PATCH 01/76] arm64: cpufeature: Handle NV_frac as a synonym of NV2

With ARMv9.5, an implementation supporting Nested Virtualization is
allowed to only support NV2, and to avoid supporting the old (and
useless) ARMv8.3 variant. This is indicated by ID_AA64MMFR2_EL1.NV
being 0 (as if NV wasn't implemented) and ID_AA64MMFR4_EL1.NV_frac
being 1 (indicating that NV2 is actually supported).

Given that KVM only deals with NV2 and refuses to use the old NV,
detecting either NV2 or NV_frac is enough to enable it.

Signed-off-by: Marc Zyngier
Reviewed-by: Joey Gouly
Link: https://lore.kernel.org/r/20250220134907.554085-2-maz@kernel.org
Signed-off-by: Oliver Upton
---
 arch/arm64/kernel/cpufeature.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index d561cf3b8ac7..2c198cd4f940 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -497,6 +497,7 @@ static const struct arm64_ftr_bits ftr_id_aa64mmfr3[] = {
 
 static const struct arm64_ftr_bits ftr_id_aa64mmfr4[] = {
 	S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR4_EL1_E2H0_SHIFT, 4, 0),
+	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR4_EL1_NV_frac_SHIFT, 4, 0),
 	ARM64_FTR_END,
 };
 
@@ -2162,7 +2163,7 @@ static bool has_nested_virt_support(const struct arm64_cpu_capabilities *cap,
 	if (kvm_get_mode() != KVM_MODE_NV)
 		return false;
 
-	if (!has_cpuid_feature(cap, scope)) {
+	if (!cpucap_multi_entry_cap_matches(cap, scope)) {
 		pr_warn("unavailable: %s\n", cap->desc);
 		return false;
 	}
@@ -2519,7 +2520,17 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
 		.capability = ARM64_HAS_NESTED_VIRT,
 		.type = ARM64_CPUCAP_SYSTEM_FEATURE,
 		.matches = has_nested_virt_support,
-		ARM64_CPUID_FIELDS(ID_AA64MMFR2_EL1, NV, NV2)
+		.match_list = (const struct arm64_cpu_capabilities []){
+			{
+				.matches = has_cpuid_feature,
+				ARM64_CPUID_FIELDS(ID_AA64MMFR2_EL1, NV, NV2)
+			},
+			{
+				.matches = has_cpuid_feature,
+				ARM64_CPUID_FIELDS(ID_AA64MMFR4_EL1, NV_frac, NV2_ONLY)
+			},
+			{ /* Sentinel */ }
+		},
 	},
 	{
 		.capability = ARM64_HAS_32BIT_EL0_DO_NOT_USE,

From 9d6745572899599966fb76a868acca1cae9518af Mon Sep 17 00:00:00 2001
From: Marc Zyngier
Date: Thu, 20 Feb 2025 13:48:55 +0000
Subject: [PATCH 02/76] KVM: arm64: Hide ID_AA64MMFR2_EL1.NV from guest and userspace

Since our take on FEAT_NV is to only support FEAT_NV2, we should never
expose ID_AA64MMFR2_EL1.NV to a guest or userspace. Make sure we mask
this field for good.
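For illustration only (not part of the patch), the two moves above
reduce to the following standalone sketch; the field offset and
helper names are local stand-ins, not the kernel's generated sysreg
definitions:

  #include <stdint.h>
  #include <stdio.h>

  /* Local stand-in for the 4-bit ID_AA64MMFR2_EL1.NV field mask. */
  #define NV_MASK (0xfULL << 24)

  /* Sanitised view: the NV field is unconditionally hidden. */
  static uint64_t sanitise_mmfr2(uint64_t val)
  {
          return val & ~NV_MASK;
  }

  /*
   * A userspace write that merely mirrors the host's NV value is
   * tolerated but discarded; everything else goes through the usual
   * writable-mask checks.
   */
  static uint64_t filter_mmfr2_write(uint64_t hw_val, uint64_t user_val)
  {
          if ((hw_val & NV_MASK) == (user_val & NV_MASK))
                  user_val &= ~NV_MASK;
          return user_val;
  }

  int main(void)
  {
          uint64_t hw = 0x3ULL << 24; /* host advertises some NV level */

          printf("sanitised=%#llx filtered=%#llx\n",
                 (unsigned long long)sanitise_mmfr2(hw),
                 (unsigned long long)filter_mmfr2_write(hw, hw));
          return 0;
  }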
Signed-off-by: Marc Zyngier
Reviewed-by: Joey Gouly
Link: https://lore.kernel.org/r/20250220134907.554085-3-maz@kernel.org
[oliver: squash diff for NV field]
Signed-off-by: Oliver Upton
---
 arch/arm64/kvm/sys_regs.c | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 82430c1e1dd0..4f675f4ae536 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -1627,6 +1627,7 @@ static u64 __kvm_read_sanitised_id_reg(const struct kvm_vcpu *vcpu,
 		break;
 	case SYS_ID_AA64MMFR2_EL1:
 		val &= ~ID_AA64MMFR2_EL1_CCIDX_MASK;
+		val &= ~ID_AA64MMFR2_EL1_NV;
 		break;
 	case SYS_ID_AA64MMFR3_EL1:
 		val &= ID_AA64MMFR3_EL1_TCRX | ID_AA64MMFR3_EL1_S1POE |
@@ -1945,6 +1946,22 @@ static int set_id_aa64pfr1_el1(struct kvm_vcpu *vcpu,
 	return set_id_reg(vcpu, rd, user_val);
 }
 
+static int set_id_aa64mmfr2_el1(struct kvm_vcpu *vcpu,
+				const struct sys_reg_desc *rd, u64 user_val)
+{
+	u64 hw_val = read_sanitised_ftr_reg(SYS_ID_AA64MMFR2_EL1);
+	u64 nv_mask = ID_AA64MMFR2_EL1_NV_MASK;
+
+	/*
+	 * We made the mistake of exposing the now deprecated NV field,
+	 * so allow userspace to write it, but silently ignore it.
+	 */
+	if ((hw_val & nv_mask) == (user_val & nv_mask))
+		user_val &= ~nv_mask;
+
+	return set_id_reg(vcpu, rd, user_val);
+}
+
 static int set_ctr_el0(struct kvm_vcpu *vcpu,
 		       const struct sys_reg_desc *rd, u64 user_val)
 {
@@ -2671,7 +2688,8 @@ static const struct sys_reg_desc sys_reg_descs[] = {
 			   ID_AA64MMFR1_EL1_XNX |
 			   ID_AA64MMFR1_EL1_VH |
 			   ID_AA64MMFR1_EL1_VMIDBits)),
-	ID_WRITABLE(ID_AA64MMFR2_EL1, ~(ID_AA64MMFR2_EL1_RES0 |
+	ID_FILTERED(ID_AA64MMFR2_EL1,
+		    id_aa64mmfr2_el1, ~(ID_AA64MMFR2_EL1_RES0 |
 					ID_AA64MMFR2_EL1_EVT |
 					ID_AA64MMFR2_EL1_FWB |
 					ID_AA64MMFR2_EL1_IDS |

From d9f943f76506f60b9b74ce04caead6ce81b12fe0 Mon Sep 17 00:00:00 2001
From: Marc Zyngier
Date: Thu, 20 Feb 2025 13:48:56 +0000
Subject: [PATCH 03/76] KVM: arm64: Mark HCR_EL2.E2H RES0 when ID_AA64MMFR1_EL1.VH is zero

Enforce HCR_EL2.E2H being RES0 when VHE is disabled, so that we can
actually rely on that bit never being flipped behind our back.

Signed-off-by: Marc Zyngier
Reviewed-by: Joey Gouly
Link: https://lore.kernel.org/r/20250220134907.554085-4-maz@kernel.org
Signed-off-by: Oliver Upton
---
 arch/arm64/kvm/nested.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c
index 0c9387d2f507..ed3add7d32f6 100644
--- a/arch/arm64/kvm/nested.c
+++ b/arch/arm64/kvm/nested.c
@@ -1034,6 +1034,8 @@ int kvm_init_nv_sysregs(struct kvm_vcpu *vcpu)
 		res0 |= (HCR_TEA | HCR_TERR);
 	if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, LO, IMP))
 		res0 |= HCR_TLOR;
+	if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, VH, IMP))
+		res0 |= HCR_E2H;
 	if (!kvm_has_feat(kvm, ID_AA64MMFR4_EL1, E2H0, IMP))
 		res1 |= HCR_E2H;
 	set_sysreg_masks(kvm, HCR_EL2, res0, res1);

From 8f8d6084f5b5df7e29b58ff17b6c735fb349b1a9 Mon Sep 17 00:00:00 2001
From: Marc Zyngier
Date: Thu, 20 Feb 2025 13:48:57 +0000
Subject: [PATCH 04/76] KVM: arm64: Mark HCR_EL2.{NV*,AT} RES0 when ID_AA64MMFR4_EL1.NV_frac is 0

Enforce HCR_EL2.{NV*,AT} being RES0 when NV2 is disabled, so that we
can actually rely on these bits never being flipped behind our back.
This of course relies on our earlier ID reg sanitising.
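For illustration, the effect of the RES0/RES1 masks registered with
set_sysreg_masks() (and later applied by kvm_vcpu_apply_reg_masks())
can be modelled in a few standalone lines; the bit positions below
are made up, the real ones are the HCR_EL2 bits listed above:

  #include <assert.h>
  #include <stdint.h>

  /* RES0 bits are forced to 0, RES1 bits are forced to 1. */
  static uint64_t apply_masks(uint64_t v, uint64_t res0, uint64_t res1)
  {
          return (v & ~res0) | res1;
  }

  int main(void)
  {
          uint64_t res0 = 1ULL << 42; /* e.g. an NV* bit without NV2 */
          uint64_t res1 = 1ULL << 34; /* e.g. E2H when FEAT_E2H0 is absent */

          assert(apply_masks(~0ULL, res0, res1) == ~res0);
          assert(apply_masks(0, res0, res1) == res1);
          return 0;
  }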
Signed-off-by: Marc Zyngier Reviewed-by: Joey Gouly Link: https://lore.kernel.org/r/20250220134907.554085-5-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/nested.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index ed3add7d32f6..9f140560a6f5 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -1021,10 +1021,11 @@ int kvm_init_nv_sysregs(struct kvm_vcpu *vcpu) res0 |= HCR_FIEN; if (!kvm_has_feat(kvm, ID_AA64MMFR2_EL1, FWB, IMP)) res0 |= HCR_FWB; - if (!kvm_has_feat(kvm, ID_AA64MMFR2_EL1, NV, NV2)) - res0 |= HCR_NV2; - if (!kvm_has_feat(kvm, ID_AA64MMFR2_EL1, NV, IMP)) - res0 |= (HCR_AT | HCR_NV1 | HCR_NV); + /* Implementation choice: NV2 is the only supported config */ + if (!kvm_has_feat(kvm, ID_AA64MMFR4_EL1, NV_frac, NV2_ONLY)) + res0 |= (HCR_NV2 | HCR_NV | HCR_AT); + if (!kvm_has_feat(kvm, ID_AA64MMFR4_EL1, E2H0, NI)) + res0 |= HCR_NV1; if (!(kvm_vcpu_has_feature(kvm, KVM_ARM_VCPU_PTRAUTH_ADDRESS) && kvm_vcpu_has_feature(kvm, KVM_ARM_VCPU_PTRAUTH_GENERIC))) res0 |= (HCR_API | HCR_APK); From 2cd9542a375a78d3465f51ff792d711fabf854a2 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Feb 2025 13:48:58 +0000 Subject: [PATCH 05/76] KVM: arm64: Advertise NV2 in the boot messages Make it a bit easier to understand what people are running by adding a +NV2 string to the successful KVM initialisation. Signed-off-by: Marc Zyngier Reviewed-by: Joey Gouly Link: https://lore.kernel.org/r/20250220134907.554085-6-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/arm.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index b8e55a441282..5ea09f13c7bc 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -2806,11 +2806,12 @@ static __init int kvm_arm_init(void) if (err) goto out_hyp; - kvm_info("%s%sVHE mode initialized successfully\n", + kvm_info("%s%sVHE%s mode initialized successfully\n", in_hyp_mode ? "" : (is_protected_kvm_enabled() ? "Protected " : "Hyp "), in_hyp_mode ? "" : (cpus_have_final_cap(ARM64_KVM_HVHE) ? - "h" : "n")); + "h" : "n"), + cpus_have_final_cap(ARM64_HAS_NESTED_VIRT) ? "+NV2": ""); /* * FIXME: Do something reasonable if kvm_init() fails after pKVM From 57e7de2650c86e6cfb4ce0c809c785f69ae5c536 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Feb 2025 13:48:59 +0000 Subject: [PATCH 06/76] KVM: arm64: Consolidate idreg callbacks Most of the ID_DESC() users use the same callbacks, with only a few overrides. Consolidate the common callbacks in a macro, and consistently use it everywhere. Whilst we're at it, give ID_UNALLOCATED() a .name string, so that we can easily decode traces. Signed-off-by: Marc Zyngier Reviewed-by: Ganapatrao Kulkarni Reviewed-by: Joey Gouly Link: https://lore.kernel.org/r/20250220134907.554085-7-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/sys_regs.c | 28 ++++++++++------------------ 1 file changed, 10 insertions(+), 18 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 4f675f4ae536..a30c48a71f50 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -2283,35 +2283,33 @@ static bool bad_redir_trap(struct kvm_vcpu *vcpu, * from userspace. 
*/ +#define ID_DESC_DEFAULT_CALLBACKS \ + .access = access_id_reg, \ + .get_user = get_id_reg, \ + .set_user = set_id_reg, \ + .visibility = id_visibility, \ + .reset = kvm_read_sanitised_id_reg + #define ID_DESC(name) \ SYS_DESC(SYS_##name), \ - .access = access_id_reg, \ - .get_user = get_id_reg \ + ID_DESC_DEFAULT_CALLBACKS /* sys_reg_desc initialiser for known cpufeature ID registers */ #define ID_SANITISED(name) { \ ID_DESC(name), \ - .set_user = set_id_reg, \ - .visibility = id_visibility, \ - .reset = kvm_read_sanitised_id_reg, \ .val = 0, \ } /* sys_reg_desc initialiser for known cpufeature ID registers */ #define AA32_ID_SANITISED(name) { \ ID_DESC(name), \ - .set_user = set_id_reg, \ .visibility = aa32_id_visibility, \ - .reset = kvm_read_sanitised_id_reg, \ .val = 0, \ } /* sys_reg_desc initialiser for writable ID registers */ #define ID_WRITABLE(name, mask) { \ ID_DESC(name), \ - .set_user = set_id_reg, \ - .visibility = id_visibility, \ - .reset = kvm_read_sanitised_id_reg, \ .val = mask, \ } @@ -2319,8 +2317,6 @@ static bool bad_redir_trap(struct kvm_vcpu *vcpu, #define ID_FILTERED(sysreg, name, mask) { \ ID_DESC(sysreg), \ .set_user = set_##name, \ - .visibility = id_visibility, \ - .reset = kvm_read_sanitised_id_reg, \ .val = (mask), \ } @@ -2330,12 +2326,10 @@ static bool bad_redir_trap(struct kvm_vcpu *vcpu, * (1 <= crm < 8, 0 <= Op2 < 8). */ #define ID_UNALLOCATED(crm, op2) { \ + .name = "S3_0_0_" #crm "_" #op2, \ Op0(3), Op1(0), CRn(0), CRm(crm), Op2(op2), \ - .access = access_id_reg, \ - .get_user = get_id_reg, \ - .set_user = set_id_reg, \ + ID_DESC_DEFAULT_CALLBACKS, \ .visibility = raz_visibility, \ - .reset = kvm_read_sanitised_id_reg, \ .val = 0, \ } @@ -2346,9 +2340,7 @@ static bool bad_redir_trap(struct kvm_vcpu *vcpu, */ #define ID_HIDDEN(name) { \ ID_DESC(name), \ - .set_user = set_id_reg, \ .visibility = raz_visibility, \ - .reset = kvm_read_sanitised_id_reg, \ .val = 0, \ } From 179fd7e30f0455249fd9d1fd0041da4d141a2b97 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Feb 2025 13:49:00 +0000 Subject: [PATCH 07/76] KVM: arm64: Make ID_REG_LIMIT_FIELD_ENUM() more widely available ID_REG_LIMIT_FIELD_ENUM() is a useful macro to limit the idreg features exposed to guest and userspace, and the NV code can make use of it. 
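As a standalone model of what the macro does, with a made-up field
layout instead of the generated <reg>_<field>_MASK definitions: an
unsigned ID register field is clamped to an upper limit, leaving the
other fields untouched:

  #include <stdint.h>
  #include <stdio.h>

  #define FIELD_SHIFT 8
  #define FIELD_MASK  (0xfULL << FIELD_SHIFT)

  static uint64_t limit_field_enum(uint64_t val, uint64_t limit)
  {
          uint64_t f = (val & FIELD_MASK) >> FIELD_SHIFT;

          if (f > limit)
                  f = limit;

          return (val & ~FIELD_MASK) | (f << FIELD_SHIFT);
  }

  int main(void)
  {
          /* A field reading 0b1001 gets capped to 0b0111. */
          printf("%#llx\n", (unsigned long long)
                 limit_field_enum(0x9ULL << FIELD_SHIFT, 0x7));
          return 0;
  }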
Signed-off-by: Marc Zyngier Reviewed-by: Joey Gouly Link: https://lore.kernel.org/r/20250220134907.554085-8-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/sys_regs.c | 10 ---------- arch/arm64/kvm/sys_regs.h | 10 ++++++++++ 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index a30c48a71f50..db94d8a38033 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1803,16 +1803,6 @@ static u64 sanitise_id_aa64pfr0_el1(const struct kvm_vcpu *vcpu, u64 val) return val; } -#define ID_REG_LIMIT_FIELD_ENUM(val, reg, field, limit) \ -({ \ - u64 __f_val = FIELD_GET(reg##_##field##_MASK, val); \ - (val) &= ~reg##_##field##_MASK; \ - (val) |= FIELD_PREP(reg##_##field##_MASK, \ - min(__f_val, \ - (u64)SYS_FIELD_VALUE(reg, field, limit))); \ - (val); \ -}) - static u64 sanitise_id_aa64dfr0_el1(const struct kvm_vcpu *vcpu, u64 val) { val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64DFR0_EL1, DebugVer, V8P8); diff --git a/arch/arm64/kvm/sys_regs.h b/arch/arm64/kvm/sys_regs.h index 1d94ed6efad2..cc6338d38766 100644 --- a/arch/arm64/kvm/sys_regs.h +++ b/arch/arm64/kvm/sys_regs.h @@ -247,4 +247,14 @@ int kvm_finalize_sys_regs(struct kvm_vcpu *vcpu); CRn(sys_reg_CRn(reg)), CRm(sys_reg_CRm(reg)), \ Op2(sys_reg_Op2(reg)) +#define ID_REG_LIMIT_FIELD_ENUM(val, reg, field, limit) \ +({ \ + u64 __f_val = FIELD_GET(reg##_##field##_MASK, val); \ + (val) &= ~reg##_##field##_MASK; \ + (val) |= FIELD_PREP(reg##_##field##_MASK, \ + min(__f_val, \ + (u64)SYS_FIELD_VALUE(reg, field, limit))); \ + (val); \ +}) + #endif /* __ARM64_KVM_SYS_REGS_LOCAL_H__ */ From e7ef6ed4583ea3b49911015a6591d7a1cbe7436a Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Feb 2025 13:49:01 +0000 Subject: [PATCH 08/76] KVM: arm64: Enforce NV limits on a per-idregs basis As we are about to change the way the idreg reset values are computed, move all the NV limits into a function that initialises one register at a time. This will be most useful in the upcoming patches. We take this opportunity to remove the NV_FTR() macro and rely on the generated names instead. Signed-off-by: Marc Zyngier Reviewed-by: Joey Gouly Link: https://lore.kernel.org/r/20250220134907.554085-9-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/nested.c | 239 +++++++++++++++++++++++----------------- 1 file changed, 136 insertions(+), 103 deletions(-) diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index 9f140560a6f5..f0d7dd3615bb 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -16,9 +16,6 @@ #include "sys_regs.h" -/* Protection against the sysreg repainting madness... */ -#define NV_FTR(r, f) ID_AA64##r##_EL1_##f - /* * Ratio of live shadow S2 MMU per vcpu. This is a trade-off between * memory usage and potential number of different sets of S2 PTs in @@ -807,133 +804,169 @@ void kvm_arch_flush_shadow_all(struct kvm *kvm) * This list should get updated as new features get added to the NV * support, and new extension to the architecture. 
*/ +static u64 limit_nv_id_reg(struct kvm *kvm, u32 reg, u64 val) +{ + switch (reg) { + case SYS_ID_AA64ISAR0_EL1: + /* Support everything but TME */ + val &= ~ID_AA64ISAR0_EL1_TME; + break; + + case SYS_ID_AA64ISAR1_EL1: + /* Support everything but LS64 and Spec Invalidation */ + val &= ~(ID_AA64ISAR1_EL1_LS64 | + ID_AA64ISAR1_EL1_SPECRES); + break; + + case SYS_ID_AA64PFR0_EL1: + /* No RME, AMU, MPAM, S-EL2, or RAS */ + val &= ~(ID_AA64PFR0_EL1_RME | + ID_AA64PFR0_EL1_AMU | + ID_AA64PFR0_EL1_MPAM | + ID_AA64PFR0_EL1_SEL2 | + ID_AA64PFR0_EL1_RAS | + ID_AA64PFR0_EL1_EL3 | + ID_AA64PFR0_EL1_EL2 | + ID_AA64PFR0_EL1_EL1 | + ID_AA64PFR0_EL1_EL0); + /* 64bit only at any EL */ + val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, EL0, IMP); + val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, EL1, IMP); + val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, EL2, IMP); + val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, EL3, IMP); + break; + + case SYS_ID_AA64PFR1_EL1: + /* Only support BTI, SSBS, CSV2_frac */ + val &= (ID_AA64PFR1_EL1_BT | + ID_AA64PFR1_EL1_SSBS | + ID_AA64PFR1_EL1_CSV2_frac); + break; + + case SYS_ID_AA64MMFR0_EL1: + /* Hide ECV, ExS, Secure Memory */ + val &= ~(ID_AA64MMFR0_EL1_ECV | + ID_AA64MMFR0_EL1_EXS | + ID_AA64MMFR0_EL1_TGRAN4_2 | + ID_AA64MMFR0_EL1_TGRAN16_2 | + ID_AA64MMFR0_EL1_TGRAN64_2 | + ID_AA64MMFR0_EL1_SNSMEM); + + /* Disallow unsupported S2 page sizes */ + switch (PAGE_SIZE) { + case SZ_64K: + val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR0_EL1, TGRAN16_2, NI); + fallthrough; + case SZ_16K: + val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR0_EL1, TGRAN4_2, NI); + fallthrough; + case SZ_4K: + /* Support everything */ + break; + } + + /* + * Since we can't support a guest S2 page size smaller + * than the host's own page size (due to KVM only + * populating its own S2 using the kernel's page + * size), advertise the limitation using FEAT_GTG. 
+ */ + switch (PAGE_SIZE) { + case SZ_4K: + val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR0_EL1, TGRAN4_2, IMP); + fallthrough; + case SZ_16K: + val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR0_EL1, TGRAN16_2, IMP); + fallthrough; + case SZ_64K: + val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR0_EL1, TGRAN64_2, IMP); + break; + } + + /* Cap PARange to 48bits */ + val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64MMFR0_EL1, PARANGE, 48); + break; + + case SYS_ID_AA64MMFR1_EL1: + val &= (ID_AA64MMFR1_EL1_HCX | + ID_AA64MMFR1_EL1_PAN | + ID_AA64MMFR1_EL1_LO | + ID_AA64MMFR1_EL1_HPDS | + ID_AA64MMFR1_EL1_VH | + ID_AA64MMFR1_EL1_VMIDBits); + break; + + case SYS_ID_AA64MMFR2_EL1: + val &= ~(ID_AA64MMFR2_EL1_BBM | + ID_AA64MMFR2_EL1_TTL | + GENMASK_ULL(47, 44) | + ID_AA64MMFR2_EL1_ST | + ID_AA64MMFR2_EL1_CCIDX | + ID_AA64MMFR2_EL1_VARange); + + /* Force TTL support */ + val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR2_EL1, TTL, IMP); + break; + + case SYS_ID_AA64MMFR4_EL1: + val = SYS_FIELD_PREP_ENUM(ID_AA64MMFR4_EL1, NV_frac, NV2_ONLY); + val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR4_EL1, E2H0, NI_NV1); + break; + + case SYS_ID_AA64DFR0_EL1: + /* Only limited support for PMU, Debug, BPs, WPs, and HPMN0 */ + val &= (ID_AA64DFR0_EL1_PMUVer | + ID_AA64DFR0_EL1_WRPs | + ID_AA64DFR0_EL1_BRPs | + ID_AA64DFR0_EL1_DebugVer| + ID_AA64DFR0_EL1_HPMN0); + + /* Cap Debug to ARMv8.1 */ + val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64DFR0_EL1, DebugVer, VHE); + break; + } + + return val; +} + static void limit_nv_id_regs(struct kvm *kvm) { - u64 val, tmp; + u64 val; - /* Support everything but TME */ val = kvm_read_vm_id_reg(kvm, SYS_ID_AA64ISAR0_EL1); - val &= ~NV_FTR(ISAR0, TME); + val = limit_nv_id_reg(kvm, SYS_ID_AA64ISAR0_EL1, val); kvm_set_vm_id_reg(kvm, SYS_ID_AA64ISAR0_EL1, val); - /* Support everything but Spec Invalidation and LS64 */ val = kvm_read_vm_id_reg(kvm, SYS_ID_AA64ISAR1_EL1); - val &= ~(NV_FTR(ISAR1, LS64) | - NV_FTR(ISAR1, SPECRES)); + val = limit_nv_id_reg(kvm, SYS_ID_AA64ISAR1_EL1, val); kvm_set_vm_id_reg(kvm, SYS_ID_AA64ISAR1_EL1, val); - /* No AMU, MPAM, S-EL2, or RAS */ val = kvm_read_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1); - val &= ~(GENMASK_ULL(55, 52) | - NV_FTR(PFR0, AMU) | - NV_FTR(PFR0, MPAM) | - NV_FTR(PFR0, SEL2) | - NV_FTR(PFR0, RAS) | - NV_FTR(PFR0, EL3) | - NV_FTR(PFR0, EL2) | - NV_FTR(PFR0, EL1) | - NV_FTR(PFR0, EL0)); - /* 64bit only at any EL */ - val |= FIELD_PREP(NV_FTR(PFR0, EL0), 0b0001); - val |= FIELD_PREP(NV_FTR(PFR0, EL1), 0b0001); - val |= FIELD_PREP(NV_FTR(PFR0, EL2), 0b0001); - val |= FIELD_PREP(NV_FTR(PFR0, EL3), 0b0001); + val = limit_nv_id_reg(kvm, SYS_ID_AA64PFR0_EL1, val); kvm_set_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1, val); - /* Only support BTI, SSBS, CSV2_frac */ val = kvm_read_vm_id_reg(kvm, SYS_ID_AA64PFR1_EL1); - val &= (NV_FTR(PFR1, BT) | - NV_FTR(PFR1, SSBS) | - NV_FTR(PFR1, CSV2_frac)); + val = limit_nv_id_reg(kvm, SYS_ID_AA64PFR1_EL1, val); kvm_set_vm_id_reg(kvm, SYS_ID_AA64PFR1_EL1, val); - /* Hide ECV, ExS, Secure Memory */ val = kvm_read_vm_id_reg(kvm, SYS_ID_AA64MMFR0_EL1); - val &= ~(NV_FTR(MMFR0, ECV) | - NV_FTR(MMFR0, EXS) | - NV_FTR(MMFR0, TGRAN4_2) | - NV_FTR(MMFR0, TGRAN16_2) | - NV_FTR(MMFR0, TGRAN64_2) | - NV_FTR(MMFR0, SNSMEM)); - - /* Disallow unsupported S2 page sizes */ - switch (PAGE_SIZE) { - case SZ_64K: - val |= FIELD_PREP(NV_FTR(MMFR0, TGRAN16_2), 0b0001); - fallthrough; - case SZ_16K: - val |= FIELD_PREP(NV_FTR(MMFR0, TGRAN4_2), 0b0001); - fallthrough; - case SZ_4K: - /* Support everything */ - break; - } - /* - * Since we can't support a guest S2 page size smaller than - * the 
host's own page size (due to KVM only populating its - * own S2 using the kernel's page size), advertise the - * limitation using FEAT_GTG. - */ - switch (PAGE_SIZE) { - case SZ_4K: - val |= FIELD_PREP(NV_FTR(MMFR0, TGRAN4_2), 0b0010); - fallthrough; - case SZ_16K: - val |= FIELD_PREP(NV_FTR(MMFR0, TGRAN16_2), 0b0010); - fallthrough; - case SZ_64K: - val |= FIELD_PREP(NV_FTR(MMFR0, TGRAN64_2), 0b0010); - break; - } - /* Cap PARange to 48bits */ - tmp = FIELD_GET(NV_FTR(MMFR0, PARANGE), val); - if (tmp > 0b0101) { - val &= ~NV_FTR(MMFR0, PARANGE); - val |= FIELD_PREP(NV_FTR(MMFR0, PARANGE), 0b0101); - } + val = limit_nv_id_reg(kvm, SYS_ID_AA64MMFR0_EL1, val); kvm_set_vm_id_reg(kvm, SYS_ID_AA64MMFR0_EL1, val); val = kvm_read_vm_id_reg(kvm, SYS_ID_AA64MMFR1_EL1); - val &= (NV_FTR(MMFR1, HCX) | - NV_FTR(MMFR1, PAN) | - NV_FTR(MMFR1, LO) | - NV_FTR(MMFR1, HPDS) | - NV_FTR(MMFR1, VH) | - NV_FTR(MMFR1, VMIDBits)); + val = limit_nv_id_reg(kvm, SYS_ID_AA64MMFR1_EL1, val); kvm_set_vm_id_reg(kvm, SYS_ID_AA64MMFR1_EL1, val); val = kvm_read_vm_id_reg(kvm, SYS_ID_AA64MMFR2_EL1); - val &= ~(NV_FTR(MMFR2, BBM) | - NV_FTR(MMFR2, TTL) | - GENMASK_ULL(47, 44) | - NV_FTR(MMFR2, ST) | - NV_FTR(MMFR2, CCIDX) | - NV_FTR(MMFR2, VARange)); - - /* Force TTL support */ - val |= FIELD_PREP(NV_FTR(MMFR2, TTL), 0b0001); + val = limit_nv_id_reg(kvm, SYS_ID_AA64MMFR2_EL1, val); kvm_set_vm_id_reg(kvm, SYS_ID_AA64MMFR2_EL1, val); - val = 0; - if (!cpus_have_final_cap(ARM64_HAS_HCR_NV1)) - val |= FIELD_PREP(NV_FTR(MMFR4, E2H0), - ID_AA64MMFR4_EL1_E2H0_NI_NV1); + val = kvm_read_vm_id_reg(kvm, SYS_ID_AA64MMFR4_EL1); + val = limit_nv_id_reg(kvm, SYS_ID_AA64MMFR4_EL1, val); kvm_set_vm_id_reg(kvm, SYS_ID_AA64MMFR4_EL1, val); - /* Only limited support for PMU, Debug, BPs, WPs, and HPMN0 */ val = kvm_read_vm_id_reg(kvm, SYS_ID_AA64DFR0_EL1); - val &= (NV_FTR(DFR0, PMUVer) | - NV_FTR(DFR0, WRPs) | - NV_FTR(DFR0, BRPs) | - NV_FTR(DFR0, DebugVer) | - NV_FTR(DFR0, HPMN0)); - - /* Cap Debug to ARMv8.1 */ - tmp = FIELD_GET(NV_FTR(DFR0, DebugVer), val); - if (tmp > 0b0111) { - val &= ~NV_FTR(DFR0, DebugVer); - val |= FIELD_PREP(NV_FTR(DFR0, DebugVer), 0b0111); - } + val = limit_nv_id_reg(kvm, SYS_ID_AA64DFR0_EL1, val); kvm_set_vm_id_reg(kvm, SYS_ID_AA64DFR0_EL1, val); } From 94f296dcd6d937371dd83df048b9a2d723d357c9 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Feb 2025 13:49:02 +0000 Subject: [PATCH 09/76] KVM: arm64: Move NV-specific capping to idreg sanitisation Instead of applying the NV idreg limits at run time, switch to doing it at the same time as the reset of the VM initialisation. This will make things much simpler once we introduce vcpu-driven variants of NV. 
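The resulting read path can be sketched as follows (a standalone toy
model; only the call shape mirrors __kvm_read_sanitised_id_reg() and
limit_nv_id_reg(), the bodies are stand-ins):

  #include <stdint.h>

  /* Pretend NV limit: strip one illustrative feature field. */
  static uint64_t limit_nv_id_reg(uint32_t reg, uint64_t val)
  {
          (void)reg;
          return val & ~(0xfULL << 60);
  }

  /*
   * The sanitised per-VM value is computed in one place, with the NV
   * capping applied as the final step, instead of being re-applied
   * at run time.
   */
  static uint64_t read_sanitised_id_reg(uint32_t reg, uint64_t hw_val,
                                        int vcpu_has_nv)
  {
          uint64_t val = hw_val; /* baseline sanitisation elided */

          if (vcpu_has_nv)
                  val = limit_nv_id_reg(reg, val);

          return val;
  }

  int main(void)
  {
          return read_sanitised_id_reg(0, ~0ULL, 1) ==
                 0x0fffffffffffffffULL ? 0 : 1;
  }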
Signed-off-by: Marc Zyngier Reviewed-by: Joey Gouly Link: https://lore.kernel.org/r/20250220134907.554085-10-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/include/asm/kvm_nested.h | 1 + arch/arm64/kvm/nested.c | 45 +---------------------------- arch/arm64/kvm/sys_regs.c | 3 ++ 3 files changed, 5 insertions(+), 44 deletions(-) diff --git a/arch/arm64/include/asm/kvm_nested.h b/arch/arm64/include/asm/kvm_nested.h index 56c4bcd35e2e..692f403c1896 100644 --- a/arch/arm64/include/asm/kvm_nested.h +++ b/arch/arm64/include/asm/kvm_nested.h @@ -188,6 +188,7 @@ static inline bool kvm_supported_tlbi_s1e2_op(struct kvm_vcpu *vpcu, u32 instr) } int kvm_init_nv_sysregs(struct kvm_vcpu *vcpu); +u64 limit_nv_id_reg(struct kvm *kvm, u32 reg, u64 val); #ifdef CONFIG_ARM64_PTR_AUTH bool kvm_auth_eretax(struct kvm_vcpu *vcpu, u64 *elr); diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index f0d7dd3615bb..409e5e67ae1e 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -804,7 +804,7 @@ void kvm_arch_flush_shadow_all(struct kvm *kvm) * This list should get updated as new features get added to the NV * support, and new extension to the architecture. */ -static u64 limit_nv_id_reg(struct kvm *kvm, u32 reg, u64 val) +u64 limit_nv_id_reg(struct kvm *kvm, u32 reg, u64 val) { switch (reg) { case SYS_ID_AA64ISAR0_EL1: @@ -929,47 +929,6 @@ static u64 limit_nv_id_reg(struct kvm *kvm, u32 reg, u64 val) return val; } -static void limit_nv_id_regs(struct kvm *kvm) -{ - u64 val; - - val = kvm_read_vm_id_reg(kvm, SYS_ID_AA64ISAR0_EL1); - val = limit_nv_id_reg(kvm, SYS_ID_AA64ISAR0_EL1, val); - kvm_set_vm_id_reg(kvm, SYS_ID_AA64ISAR0_EL1, val); - - val = kvm_read_vm_id_reg(kvm, SYS_ID_AA64ISAR1_EL1); - val = limit_nv_id_reg(kvm, SYS_ID_AA64ISAR1_EL1, val); - kvm_set_vm_id_reg(kvm, SYS_ID_AA64ISAR1_EL1, val); - - val = kvm_read_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1); - val = limit_nv_id_reg(kvm, SYS_ID_AA64PFR0_EL1, val); - kvm_set_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1, val); - - val = kvm_read_vm_id_reg(kvm, SYS_ID_AA64PFR1_EL1); - val = limit_nv_id_reg(kvm, SYS_ID_AA64PFR1_EL1, val); - kvm_set_vm_id_reg(kvm, SYS_ID_AA64PFR1_EL1, val); - - val = kvm_read_vm_id_reg(kvm, SYS_ID_AA64MMFR0_EL1); - val = limit_nv_id_reg(kvm, SYS_ID_AA64MMFR0_EL1, val); - kvm_set_vm_id_reg(kvm, SYS_ID_AA64MMFR0_EL1, val); - - val = kvm_read_vm_id_reg(kvm, SYS_ID_AA64MMFR1_EL1); - val = limit_nv_id_reg(kvm, SYS_ID_AA64MMFR1_EL1, val); - kvm_set_vm_id_reg(kvm, SYS_ID_AA64MMFR1_EL1, val); - - val = kvm_read_vm_id_reg(kvm, SYS_ID_AA64MMFR2_EL1); - val = limit_nv_id_reg(kvm, SYS_ID_AA64MMFR2_EL1, val); - kvm_set_vm_id_reg(kvm, SYS_ID_AA64MMFR2_EL1, val); - - val = kvm_read_vm_id_reg(kvm, SYS_ID_AA64MMFR4_EL1); - val = limit_nv_id_reg(kvm, SYS_ID_AA64MMFR4_EL1, val); - kvm_set_vm_id_reg(kvm, SYS_ID_AA64MMFR4_EL1, val); - - val = kvm_read_vm_id_reg(kvm, SYS_ID_AA64DFR0_EL1); - val = limit_nv_id_reg(kvm, SYS_ID_AA64DFR0_EL1, val); - kvm_set_vm_id_reg(kvm, SYS_ID_AA64DFR0_EL1, val); -} - u64 kvm_vcpu_apply_reg_masks(const struct kvm_vcpu *vcpu, enum vcpu_sysreg sr, u64 v) { @@ -1014,8 +973,6 @@ int kvm_init_nv_sysregs(struct kvm_vcpu *vcpu) if (!kvm->arch.sysreg_masks) return -ENOMEM; - limit_nv_id_regs(kvm); - /* VTTBR_EL2 */ res0 = res1 = 0; if (!kvm_has_feat_enum(kvm, ID_AA64MMFR1_EL1, VMIDBits, 16)) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index db94d8a38033..dacccc35a3bd 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1638,6 +1638,9 @@ static u64 
__kvm_read_sanitised_id_reg(const struct kvm_vcpu *vcpu, break; } + if (vcpu_has_nv(vcpu)) + val = limit_nv_id_reg(vcpu->kvm, id, val); + return val; } From f83c41fb3dddbf47881249335a9718d2cdce0bd0 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Feb 2025 13:49:03 +0000 Subject: [PATCH 10/76] KVM: arm64: Allow userspace to limit NV support to nVHE NV is hard. No kidding. In order to make things simpler, we have established that NV would support two mutually exclusive configurations: - VHE-only, and supporting recursive virtualisation - nVHE-only, and not supporting recursive virtualisation For that purpose, introduce a new vcpu feature flag that denotes the second configuration. We use this flag to limit the idregs further. Signed-off-by: Marc Zyngier Reviewed-by: Joey Gouly Link: https://lore.kernel.org/r/20250220134907.554085-11-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/include/uapi/asm/kvm.h | 1 + arch/arm64/kvm/nested.c | 28 ++++++++++++++++++++++++++-- 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h index 568bf858f319..3bcab2a106c9 100644 --- a/arch/arm64/include/uapi/asm/kvm.h +++ b/arch/arm64/include/uapi/asm/kvm.h @@ -105,6 +105,7 @@ struct kvm_regs { #define KVM_ARM_VCPU_PTRAUTH_ADDRESS 5 /* VCPU uses address authentication */ #define KVM_ARM_VCPU_PTRAUTH_GENERIC 6 /* VCPU uses generic authentication */ #define KVM_ARM_VCPU_HAS_EL2 7 /* Support nested virtualization */ +#define KVM_ARM_VCPU_HAS_EL2_E2H0 8 /* Limit NV support to E2H RES0 */ struct kvm_vcpu_init { __u32 target; diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index 409e5e67ae1e..933dc3acac5f 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -51,6 +51,10 @@ int kvm_vcpu_init_nested(struct kvm_vcpu *vcpu) struct kvm_s2_mmu *tmp; int num_mmus, ret = 0; + if (test_bit(KVM_ARM_VCPU_HAS_EL2_E2H0, kvm->arch.vcpu_features) && + !cpus_have_final_cap(ARM64_HAS_HCR_NV1)) + return -EINVAL; + /* * Let's treat memory allocation failures as benign: If we fail to * allocate anything, return an error and keep the allocated array @@ -894,6 +898,9 @@ u64 limit_nv_id_reg(struct kvm *kvm, u32 reg, u64 val) ID_AA64MMFR1_EL1_HPDS | ID_AA64MMFR1_EL1_VH | ID_AA64MMFR1_EL1_VMIDBits); + /* FEAT_E2H0 implies no VHE */ + if (test_bit(KVM_ARM_VCPU_HAS_EL2_E2H0, kvm->arch.vcpu_features)) + val &= ~ID_AA64MMFR1_EL1_VH; break; case SYS_ID_AA64MMFR2_EL1: @@ -909,8 +916,25 @@ u64 limit_nv_id_reg(struct kvm *kvm, u32 reg, u64 val) break; case SYS_ID_AA64MMFR4_EL1: - val = SYS_FIELD_PREP_ENUM(ID_AA64MMFR4_EL1, NV_frac, NV2_ONLY); - val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR4_EL1, E2H0, NI_NV1); + /* + * You get EITHER + * + * - FEAT_VHE without FEAT_E2H0 + * - FEAT_NV limited to FEAT_NV2 + * - HCR_EL2.NV1 being RES0 + * + * OR + * + * - FEAT_E2H0 without FEAT_VHE nor FEAT_NV + * + * Life is too short for anything else. + */ + if (test_bit(KVM_ARM_VCPU_HAS_EL2_E2H0, kvm->arch.vcpu_features)) { + val = 0; + } else { + val = SYS_FIELD_PREP_ENUM(ID_AA64MMFR4_EL1, NV_frac, NV2_ONLY); + val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR4_EL1, E2H0, NI_NV1); + } break; case SYS_ID_AA64DFR0_EL1: From 642c23ea8b45b673d8a256e01c04ef5b3c819f11 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Feb 2025 13:49:04 +0000 Subject: [PATCH 11/76] KVM: arm64: Make ID_AA64MMFR4_EL1.NV_frac writable We want to make sure that it is possible for userspace to configure whether recursive NV is possible. Make NV_frac writable for that purpose. 
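The writable-mask semantics this builds on can be modelled in a few
standalone lines (a sketch only; the real check lives in the common
set_id_reg() path, keyed off the .val mask of the register
descriptor):

  #include <stdint.h>

  /* Only a 4-bit NV_frac-like field is writable in this model. */
  #define NV_FRAC_MASK 0xfULL

  /* A write may only differ from the current value in writable bits. */
  static int check_user_write(uint64_t cur, uint64_t new_val,
                              uint64_t writable)
  {
          return ((cur ^ new_val) & ~writable) ? -1 : 0;
  }

  int main(void)
  {
          return check_user_write(0x1, 0x2, NV_FRAC_MASK); /* 0: allowed */
  }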
Signed-off-by: Marc Zyngier Reviewed-by: Joey Gouly Link: https://lore.kernel.org/r/20250220134907.554085-12-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/sys_regs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index dacccc35a3bd..1a18a0324d9f 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -2683,7 +2683,7 @@ static const struct sys_reg_desc sys_reg_descs[] = { ID_WRITABLE(ID_AA64MMFR3_EL1, (ID_AA64MMFR3_EL1_TCRX | ID_AA64MMFR3_EL1_S1PIE | ID_AA64MMFR3_EL1_S1POE)), - ID_SANITISED(ID_AA64MMFR4_EL1), + ID_WRITABLE(ID_AA64MMFR4_EL1, ID_AA64MMFR4_EL1_NV_frac), ID_UNALLOCATED(7,5), ID_UNALLOCATED(7,6), ID_UNALLOCATED(7,7), From 8b0b98ebf34d6e6ad68a27109f78f2e153187737 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Feb 2025 13:49:05 +0000 Subject: [PATCH 12/76] KVM: arm64: Advertise FEAT_ECV when possible We can advertise support for FEAT_ECV if supported on the HW as long as we limit it to the basic trap bits, and not advertise CNTPOFF_EL2 support, even if the host has it (the short story being that CNTPOFF_EL2 is not virtualisable). Signed-off-by: Marc Zyngier Reviewed-by: Joey Gouly Link: https://lore.kernel.org/r/20250220134907.554085-13-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/nested.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index 933dc3acac5f..d55c296fcb27 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -848,14 +848,16 @@ u64 limit_nv_id_reg(struct kvm *kvm, u32 reg, u64 val) break; case SYS_ID_AA64MMFR0_EL1: - /* Hide ECV, ExS, Secure Memory */ - val &= ~(ID_AA64MMFR0_EL1_ECV | - ID_AA64MMFR0_EL1_EXS | + /* Hide ExS, Secure Memory */ + val &= ~(ID_AA64MMFR0_EL1_EXS | ID_AA64MMFR0_EL1_TGRAN4_2 | ID_AA64MMFR0_EL1_TGRAN16_2 | ID_AA64MMFR0_EL1_TGRAN64_2 | ID_AA64MMFR0_EL1_SNSMEM); + /* Hide CNTPOFF if present */ + val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64MMFR0_EL1, ECV, IMP); + /* Disallow unsupported S2 page sizes */ switch (PAGE_SIZE) { case SZ_64K: From 4cd48565b0e5df398e7253c0d2d8c0403d69e7bf Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Mon, 24 Feb 2025 16:53:57 -0800 Subject: [PATCH 13/76] KVM: arm64: Set HCR_EL2.TID1 unconditionally commit 90807748ca3a ("KVM: arm64: Hide SME system registers from guests") added trap handling for SMIDR_EL1, treating it as UNDEFINED as KVM does not support SME. This is right for the most part, however KVM needs to set HCR_EL2.TID1 to _actually_ trap the register. Unfortunately, this comes with some collateral damage as TID1 forces REVIDR_EL1 and AIDR_EL1 to trap as well. KVM has long treated these registers as "invariant" which is an awful term for the following: - Userspace sees the boot CPU values on all vCPUs - The guest sees the hardware values of the CPU on which a vCPU is scheduled Keep the plates spinning by adding trap handling for the affected registers and repaint all of the "invariant" crud into terms of identifying an implementation. Yes, at this point we only need to set TID1 on SME hardware, but REVIDR_EL1 and AIDR_EL1 are about to become mutable anyway. 
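The "invariant" behaviour described above amounts to the following
toy model (standalone, with made-up AIDR values per CPU):

  #include <stdint.h>
  #include <stdio.h>

  struct cpu { uint64_t aidr; };

  static struct cpu boot_cpu = { .aidr = 0x1 };

  /* Userspace always sees the boot CPU's value... */
  static uint64_t user_read_aidr(void)
  {
          return boot_cpu.aidr;
  }

  /* ...while the guest sees whichever CPU the vCPU runs on. */
  static uint64_t guest_read_aidr(const struct cpu *this_cpu)
  {
          return this_cpu->aidr;
  }

  int main(void)
  {
          struct cpu big = { .aidr = 0x2 };

          printf("user=%#llx guest=%#llx\n",
                 (unsigned long long)user_read_aidr(),
                 (unsigned long long)guest_read_aidr(&big));
          return 0;
  }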
Cc: Mark Brown Cc: stable@vger.kernel.org Fixes: 90807748ca3a ("KVM: arm64: Hide SME system registers from guests") [maz: handle traps from 32bit] Co-developed-by: Marc Zyngier Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20250225005401.679536-2-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- arch/arm64/include/asm/kvm_arm.h | 4 +- arch/arm64/kvm/sys_regs.c | 183 +++++++++++++++++-------------- 2 files changed, 100 insertions(+), 87 deletions(-) diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index 8d94a6c0ed5c..b01c01e55de5 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -92,12 +92,12 @@ * SWIO: Turn set/way invalidates into set/way clean+invalidate * PTW: Take a stage2 fault if a stage1 walk steps in device memory * TID3: Trap EL1 reads of group 3 ID registers - * TID2: Trap CTR_EL0, CCSIDR2_EL1, CLIDR_EL1, and CSSELR_EL1 + * TID1: Trap REVIDR_EL1, AIDR_EL1, and SMIDR_EL1 */ #define HCR_GUEST_FLAGS (HCR_TSC | HCR_TSW | HCR_TWE | HCR_TWI | HCR_VM | \ HCR_BSU_IS | HCR_FB | HCR_TACR | \ HCR_AMO | HCR_SWIO | HCR_TIDCP | HCR_RW | HCR_TLOR | \ - HCR_FMO | HCR_IMO | HCR_PTW | HCR_TID3) + HCR_FMO | HCR_IMO | HCR_PTW | HCR_TID3 | HCR_TID1) #define HCR_HOST_NVHE_FLAGS (HCR_RW | HCR_API | HCR_APK | HCR_ATA) #define HCR_HOST_NVHE_PROTECTED_FLAGS (HCR_HOST_NVHE_FLAGS | HCR_TSC) #define HCR_HOST_VHE_FLAGS (HCR_RW | HCR_TGE | HCR_E2H) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 82430c1e1dd0..57e945d6b715 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -2493,6 +2493,93 @@ static bool access_mdcr(struct kvm_vcpu *vcpu, return true; } +/* + * For historical (ahem ABI) reasons, KVM treated MIDR_EL1, REVIDR_EL1, and + * AIDR_EL1 as "invariant" registers, meaning userspace cannot change them. + * The values made visible to userspace were the register values of the boot + * CPU. + * + * At the same time, reads from these registers at EL1 previously were not + * trapped, allowing the guest to read the actual hardware value. On big-little + * machines, this means the VM can see different values depending on where a + * given vCPU got scheduled. + * + * These registers are now trapped as collateral damage from SME, and what + * follows attempts to give a user / guest view consistent with the existing + * ABI. 
+ */ +static bool access_imp_id_reg(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (p->is_write) + return write_to_read_only(vcpu, p, r); + + switch (reg_to_encoding(r)) { + case SYS_REVIDR_EL1: + p->regval = read_sysreg(revidr_el1); + break; + case SYS_AIDR_EL1: + p->regval = read_sysreg(aidr_el1); + break; + default: + WARN_ON_ONCE(1); + } + + return true; +} + +static u64 __ro_after_init boot_cpu_midr_val; +static u64 __ro_after_init boot_cpu_revidr_val; +static u64 __ro_after_init boot_cpu_aidr_val; + +static void init_imp_id_regs(void) +{ + boot_cpu_midr_val = read_sysreg(midr_el1); + boot_cpu_revidr_val = read_sysreg(revidr_el1); + boot_cpu_aidr_val = read_sysreg(aidr_el1); +} + +static int get_imp_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 *val) +{ + switch (reg_to_encoding(r)) { + case SYS_MIDR_EL1: + *val = boot_cpu_midr_val; + break; + case SYS_REVIDR_EL1: + *val = boot_cpu_revidr_val; + break; + case SYS_AIDR_EL1: + *val = boot_cpu_aidr_val; + break; + default: + WARN_ON_ONCE(1); + return -EINVAL; + } + + return 0; +} + +static int set_imp_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 val) +{ + u64 expected; + int ret; + + ret = get_imp_id_reg(vcpu, r, &expected); + if (ret) + return ret; + + return (expected == val) ? 0 : -EINVAL; +} + +#define IMPLEMENTATION_ID(reg) { \ + SYS_DESC(SYS_##reg), \ + .access = access_imp_id_reg, \ + .get_user = get_imp_id_reg, \ + .set_user = set_imp_id_reg, \ +} /* * Architected system registers. @@ -2542,7 +2629,9 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_DBGVCR32_EL2), undef_access, reset_val, DBGVCR32_EL2, 0 }, + IMPLEMENTATION_ID(MIDR_EL1), { SYS_DESC(SYS_MPIDR_EL1), NULL, reset_mpidr, MPIDR_EL1 }, + IMPLEMENTATION_ID(REVIDR_EL1), /* * ID regs: all ID_SANITISED() entries here must have corresponding @@ -2814,6 +2903,7 @@ static const struct sys_reg_desc sys_reg_descs[] = { .set_user = set_clidr, .val = ~CLIDR_EL1_RES0 }, { SYS_DESC(SYS_CCSIDR2_EL1), undef_access }, { SYS_DESC(SYS_SMIDR_EL1), undef_access }, + IMPLEMENTATION_ID(AIDR_EL1), { SYS_DESC(SYS_CSSELR_EL1), access_csselr, reset_unknown, CSSELR_EL1 }, ID_FILTERED(CTR_EL0, ctr_el0, CTR_EL0_DIC_MASK | @@ -4272,9 +4362,13 @@ int kvm_handle_cp15_32(struct kvm_vcpu *vcpu) * Certain AArch32 ID registers are handled by rerouting to the AArch64 * system register table. Registers in the ID range where CRm=0 are * excluded from this scheme as they do not trivially map into AArch64 - * system register encodings. + * system register encodings, except for AIDR/REVIDR. */ - if (params.Op1 == 0 && params.CRn == 0 && params.CRm) + if (params.Op1 == 0 && params.CRn == 0 && + (params.CRm || params.Op2 == 6 /* REVIDR */)) + return kvm_emulate_cp15_id_reg(vcpu, ¶ms); + if (params.Op1 == 1 && params.CRn == 0 && + params.CRm == 0 && params.Op2 == 7 /* AIDR */) return kvm_emulate_cp15_id_reg(vcpu, ¶ms); return kvm_handle_cp_32(vcpu, ¶ms, cp15_regs, ARRAY_SIZE(cp15_regs)); @@ -4578,65 +4672,6 @@ id_to_sys_reg_desc(struct kvm_vcpu *vcpu, u64 id, return r; } -/* - * These are the invariant sys_reg registers: we let the guest see the - * host versions of these, so they're part of the guest state. - * - * A future CPU may provide a mechanism to present different values to - * the guest, or a future kvm may trap them. 
- */ - -#define FUNCTION_INVARIANT(reg) \ - static u64 reset_##reg(struct kvm_vcpu *v, \ - const struct sys_reg_desc *r) \ - { \ - ((struct sys_reg_desc *)r)->val = read_sysreg(reg); \ - return ((struct sys_reg_desc *)r)->val; \ - } - -FUNCTION_INVARIANT(midr_el1) -FUNCTION_INVARIANT(revidr_el1) -FUNCTION_INVARIANT(aidr_el1) - -/* ->val is filled in by kvm_sys_reg_table_init() */ -static struct sys_reg_desc invariant_sys_regs[] __ro_after_init = { - { SYS_DESC(SYS_MIDR_EL1), NULL, reset_midr_el1 }, - { SYS_DESC(SYS_REVIDR_EL1), NULL, reset_revidr_el1 }, - { SYS_DESC(SYS_AIDR_EL1), NULL, reset_aidr_el1 }, -}; - -static int get_invariant_sys_reg(u64 id, u64 __user *uaddr) -{ - const struct sys_reg_desc *r; - - r = get_reg_by_id(id, invariant_sys_regs, - ARRAY_SIZE(invariant_sys_regs)); - if (!r) - return -ENOENT; - - return put_user(r->val, uaddr); -} - -static int set_invariant_sys_reg(u64 id, u64 __user *uaddr) -{ - const struct sys_reg_desc *r; - u64 val; - - r = get_reg_by_id(id, invariant_sys_regs, - ARRAY_SIZE(invariant_sys_regs)); - if (!r) - return -ENOENT; - - if (get_user(val, uaddr)) - return -EFAULT; - - /* This is what we mean by invariant: you can't change it. */ - if (r->val != val) - return -EINVAL; - - return 0; -} - static int demux_c15_get(struct kvm_vcpu *vcpu, u64 id, void __user *uaddr) { u32 val; @@ -4718,15 +4753,10 @@ int kvm_sys_reg_get_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, int kvm_arm_sys_reg_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) { void __user *uaddr = (void __user *)(unsigned long)reg->addr; - int err; if ((reg->id & KVM_REG_ARM_COPROC_MASK) == KVM_REG_ARM_DEMUX) return demux_c15_get(vcpu, reg->id, uaddr); - err = get_invariant_sys_reg(reg->id, uaddr); - if (err != -ENOENT) - return err; - return kvm_sys_reg_get_user(vcpu, reg, sys_reg_descs, ARRAY_SIZE(sys_reg_descs)); } @@ -4762,15 +4792,10 @@ int kvm_sys_reg_set_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, int kvm_arm_sys_reg_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) { void __user *uaddr = (void __user *)(unsigned long)reg->addr; - int err; if ((reg->id & KVM_REG_ARM_COPROC_MASK) == KVM_REG_ARM_DEMUX) return demux_c15_set(vcpu, reg->id, uaddr); - err = set_invariant_sys_reg(reg->id, uaddr); - if (err != -ENOENT) - return err; - return kvm_sys_reg_set_user(vcpu, reg, sys_reg_descs, ARRAY_SIZE(sys_reg_descs)); } @@ -4859,23 +4884,14 @@ static int walk_sys_regs(struct kvm_vcpu *vcpu, u64 __user *uind) unsigned long kvm_arm_num_sys_reg_descs(struct kvm_vcpu *vcpu) { - return ARRAY_SIZE(invariant_sys_regs) - + num_demux_regs() + return num_demux_regs() + walk_sys_regs(vcpu, (u64 __user *)NULL); } int kvm_arm_copy_sys_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) { - unsigned int i; int err; - /* Then give them all the invariant registers' indices. 
*/ - for (i = 0; i < ARRAY_SIZE(invariant_sys_regs); i++) { - if (put_user(sys_reg_to_index(&invariant_sys_regs[i]), uindices)) - return -EFAULT; - uindices++; - } - err = walk_sys_regs(vcpu, uindices); if (err < 0) return err; @@ -5101,15 +5117,12 @@ int __init kvm_sys_reg_table_init(void) valid &= check_sysreg_table(cp14_64_regs, ARRAY_SIZE(cp14_64_regs), true); valid &= check_sysreg_table(cp15_regs, ARRAY_SIZE(cp15_regs), true); valid &= check_sysreg_table(cp15_64_regs, ARRAY_SIZE(cp15_64_regs), true); - valid &= check_sysreg_table(invariant_sys_regs, ARRAY_SIZE(invariant_sys_regs), false); valid &= check_sysreg_table(sys_insn_descs, ARRAY_SIZE(sys_insn_descs), false); if (!valid) return -EINVAL; - /* We abuse the reset function to overwrite the table itself. */ - for (i = 0; i < ARRAY_SIZE(invariant_sys_regs); i++) - invariant_sys_regs[i].reset(NULL, &invariant_sys_regs[i]); + init_imp_id_regs(); ret = populate_nv_trap_config(); From b4043e7cb78b96dae07c62d3ac527f84010828cd Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Mon, 24 Feb 2025 16:53:58 -0800 Subject: [PATCH 14/76] KVM: arm64: Maintain per-VM copy of implementation ID regs Get ready to allow changes to the implementation ID registers by tracking the VM-wide values. Signed-off-by: Sebastian Ott Link: https://lore.kernel.org/r/20250225005401.679536-3-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- arch/arm64/include/asm/kvm_host.h | 9 +++++++ arch/arm64/kvm/sys_regs.c | 43 ++++++++++++++++--------------- 2 files changed, 31 insertions(+), 21 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 3a7ec98ef123..b849fda62fc5 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -373,6 +373,9 @@ struct kvm_arch { #define KVM_ARM_ID_REG_NUM (IDREG_IDX(sys_reg(3, 0, 0, 7, 7)) + 1) u64 id_regs[KVM_ARM_ID_REG_NUM]; + u64 midr_el1; + u64 revidr_el1; + u64 aidr_el1; u64 ctr_el0; /* Masks for VNCR-backed and general EL2 sysregs */ @@ -1459,6 +1462,12 @@ static inline u64 *__vm_id_reg(struct kvm_arch *ka, u32 reg) return &ka->id_regs[IDREG_IDX(reg)]; case SYS_CTR_EL0: return &ka->ctr_el0; + case SYS_MIDR_EL1: + return &ka->midr_el1; + case SYS_REVIDR_EL1: + return &ka->revidr_el1; + case SYS_AIDR_EL1: + return &ka->aidr_el1; default: WARN_ON_ONCE(1); return NULL; diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 57e945d6b715..64bbf38efed4 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1663,15 +1663,24 @@ static bool is_feature_id_reg(u32 encoding) * Return true if the register's (Op0, Op1, CRn, CRm, Op2) is * (3, 0, 0, crm, op2), where 1<=crm<8, 0<=op2<8, which is the range of ID * registers KVM maintains on a per-VM basis. + * + * Additionally, the implementation ID registers and CTR_EL0 are handled as + * per-VM registers. 
*/ static inline bool is_vm_ftr_id_reg(u32 id) { - if (id == SYS_CTR_EL0) + switch (id) { + case SYS_CTR_EL0: + case SYS_MIDR_EL1: + case SYS_REVIDR_EL1: + case SYS_AIDR_EL1: return true; + default: + return (sys_reg_Op0(id) == 3 && sys_reg_Op1(id) == 0 && + sys_reg_CRn(id) == 0 && sys_reg_CRm(id) >= 1 && + sys_reg_CRm(id) < 8); - return (sys_reg_Op0(id) == 3 && sys_reg_Op1(id) == 0 && - sys_reg_CRn(id) == 0 && sys_reg_CRm(id) >= 1 && - sys_reg_CRm(id) < 8); + } } static inline bool is_vcpu_ftr_id_reg(u32 id) @@ -2540,36 +2549,27 @@ static void init_imp_id_regs(void) boot_cpu_aidr_val = read_sysreg(aidr_el1); } -static int get_imp_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, - u64 *val) +static u64 reset_imp_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { switch (reg_to_encoding(r)) { case SYS_MIDR_EL1: - *val = boot_cpu_midr_val; - break; + return boot_cpu_midr_val; case SYS_REVIDR_EL1: - *val = boot_cpu_revidr_val; - break; + return boot_cpu_revidr_val; case SYS_AIDR_EL1: - *val = boot_cpu_aidr_val; - break; + return boot_cpu_aidr_val; default: - WARN_ON_ONCE(1); - return -EINVAL; + KVM_BUG_ON(1, vcpu->kvm); + return 0; } - - return 0; } static int set_imp_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, u64 val) { u64 expected; - int ret; - ret = get_imp_id_reg(vcpu, r, &expected); - if (ret) - return ret; + expected = read_id_reg(vcpu, r); return (expected == val) ? 0 : -EINVAL; } @@ -2577,8 +2577,9 @@ static int set_imp_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, #define IMPLEMENTATION_ID(reg) { \ SYS_DESC(SYS_##reg), \ .access = access_imp_id_reg, \ - .get_user = get_imp_id_reg, \ + .get_user = get_id_reg, \ .set_user = set_imp_id_reg, \ + .reset = reset_imp_id_reg, \ } /* From d0d81e03e6292b411452501ef0b3583b5ea884f7 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Mon, 24 Feb 2025 16:53:59 -0800 Subject: [PATCH 15/76] KVM: arm64: Load VPIDR_EL2 with the VM's MIDR_EL1 value Userspace will soon be able to change the value of MIDR_EL1. Prepare by loading VPIDR_EL2 with the guest value for non-nested VMs. Since VPIDR_EL2 is set for any VM, get rid of the NV-specific cleanup of reloading the hardware value on vcpu_put(). And for nVHE, load the hardware value before switching to the host. 
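The selection logic amounts to this standalone sketch (the struct and
values are illustrative; the real code is in the sysreg-sr.c hunks
below):

  #include <stdint.h>

  struct vcpu_model {
          int      nested;     /* restoring a nested guest state? */
          uint64_t vpidr_el2;  /* guest hypervisor's VPIDR_EL2 */
          uint64_t midr_el1;   /* the VM's MIDR_EL1 view */
  };

  /* Value to program into VPIDR_EL2 on vcpu load. */
  static uint64_t midr_to_load(const struct vcpu_model *v)
  {
          return v->nested ? v->vpidr_el2 : v->midr_el1;
  }

  int main(void)
  {
          struct vcpu_model v = { .nested = 0, .midr_el1 = 0x410fd4f0 };

          return midr_to_load(&v) == 0x410fd4f0 ? 0 : 1;
  }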
Link: https://lore.kernel.org/r/20250225005401.679536-4-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h | 8 ++++++- arch/arm64/kvm/hyp/nvhe/sysreg-sr.c | 4 +++- arch/arm64/kvm/hyp/vhe/sysreg-sr.c | 28 ++++++++-------------- 3 files changed, 20 insertions(+), 20 deletions(-) diff --git a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h index 76ff095c6b6e..6e6d13580377 100644 --- a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h +++ b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h @@ -43,6 +43,11 @@ static inline u64 *ctxt_mdscr_el1(struct kvm_cpu_context *ctxt) return &ctxt_sys_reg(ctxt, MDSCR_EL1); } +static inline u64 ctxt_midr_el1(struct kvm_cpu_context *ctxt) +{ + return read_cpuid_id(); +} + static inline void __sysreg_save_common_state(struct kvm_cpu_context *ctxt) { *ctxt_mdscr_el1(ctxt) = read_sysreg(mdscr_el1); @@ -168,8 +173,9 @@ static inline void __sysreg_restore_user_state(struct kvm_cpu_context *ctxt) } static inline void __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt, - u64 mpidr) + u64 midr, u64 mpidr) { + write_sysreg(midr, vpidr_el2); write_sysreg(mpidr, vmpidr_el2); if (has_vhe() || diff --git a/arch/arm64/kvm/hyp/nvhe/sysreg-sr.c b/arch/arm64/kvm/hyp/nvhe/sysreg-sr.c index dba101565de3..3cc613cce5f5 100644 --- a/arch/arm64/kvm/hyp/nvhe/sysreg-sr.c +++ b/arch/arm64/kvm/hyp/nvhe/sysreg-sr.c @@ -28,7 +28,9 @@ void __sysreg_save_state_nvhe(struct kvm_cpu_context *ctxt) void __sysreg_restore_state_nvhe(struct kvm_cpu_context *ctxt) { - __sysreg_restore_el1_state(ctxt, ctxt_sys_reg(ctxt, MPIDR_EL1)); + u64 midr = ctxt_midr_el1(ctxt); + + __sysreg_restore_el1_state(ctxt, midr, ctxt_sys_reg(ctxt, MPIDR_EL1)); __sysreg_restore_common_state(ctxt); __sysreg_restore_user_state(ctxt); __sysreg_restore_el2_return_state(ctxt); diff --git a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c index 90b018e06f2c..3814b0b2c937 100644 --- a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c +++ b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c @@ -87,11 +87,12 @@ static void __sysreg_restore_vel2_state(struct kvm_vcpu *vcpu) write_sysreg(__vcpu_sys_reg(vcpu, PAR_EL1), par_el1); write_sysreg(__vcpu_sys_reg(vcpu, TPIDR_EL1), tpidr_el1); - write_sysreg(__vcpu_sys_reg(vcpu, MPIDR_EL1), vmpidr_el2); - write_sysreg_el1(__vcpu_sys_reg(vcpu, MAIR_EL2), SYS_MAIR); - write_sysreg_el1(__vcpu_sys_reg(vcpu, VBAR_EL2), SYS_VBAR); - write_sysreg_el1(__vcpu_sys_reg(vcpu, CONTEXTIDR_EL2), SYS_CONTEXTIDR); - write_sysreg_el1(__vcpu_sys_reg(vcpu, AMAIR_EL2), SYS_AMAIR); + write_sysreg(ctxt_midr_el1(&vcpu->arch.ctxt), vpidr_el2); + write_sysreg(__vcpu_sys_reg(vcpu, MPIDR_EL1), vmpidr_el2); + write_sysreg_el1(__vcpu_sys_reg(vcpu, MAIR_EL2), SYS_MAIR); + write_sysreg_el1(__vcpu_sys_reg(vcpu, VBAR_EL2), SYS_VBAR); + write_sysreg_el1(__vcpu_sys_reg(vcpu, CONTEXTIDR_EL2), SYS_CONTEXTIDR); + write_sysreg_el1(__vcpu_sys_reg(vcpu, AMAIR_EL2), SYS_AMAIR); if (vcpu_el2_e2h_is_set(vcpu)) { /* @@ -191,7 +192,7 @@ void __vcpu_load_switch_sysregs(struct kvm_vcpu *vcpu) { struct kvm_cpu_context *guest_ctxt = &vcpu->arch.ctxt; struct kvm_cpu_context *host_ctxt; - u64 mpidr; + u64 midr, mpidr; host_ctxt = host_data_ptr(host_ctxt); __sysreg_save_user_state(host_ctxt); @@ -220,23 +221,18 @@ void __vcpu_load_switch_sysregs(struct kvm_vcpu *vcpu) __sysreg_restore_vel2_state(vcpu); } else { if (vcpu_has_nv(vcpu)) { - /* - * Use the guest hypervisor's VPIDR_EL2 when in a - * nested state. The hardware value of MIDR_EL1 gets - * restored on put. 
- */ - write_sysreg(ctxt_sys_reg(guest_ctxt, VPIDR_EL2), vpidr_el2); - /* * As we're restoring a nested guest, set the value * provided by the guest hypervisor. */ + midr = ctxt_sys_reg(guest_ctxt, VPIDR_EL2); mpidr = ctxt_sys_reg(guest_ctxt, VMPIDR_EL2); } else { + midr = ctxt_midr_el1(guest_ctxt); mpidr = ctxt_sys_reg(guest_ctxt, MPIDR_EL1); } - __sysreg_restore_el1_state(guest_ctxt, mpidr); + __sysreg_restore_el1_state(guest_ctxt, midr, mpidr); } vcpu_set_flag(vcpu, SYSREGS_ON_CPU); @@ -271,9 +267,5 @@ void __vcpu_put_switch_sysregs(struct kvm_vcpu *vcpu) /* Restore host user state */ __sysreg_restore_user_state(host_ctxt); - /* If leaving a nesting guest, restore MIDR_EL1 default view */ - if (vcpu_has_nv(vcpu)) - write_sysreg(read_cpuid_id(), vpidr_el2); - vcpu_clear_flag(vcpu, SYSREGS_ON_CPU); } From 3adaee78306148da5df5d3d655e9a90bf18e9513 Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Mon, 24 Feb 2025 16:54:00 -0800 Subject: [PATCH 16/76] KVM: arm64: Allow userspace to change the implementation ID registers KVM's treatment of the ID registers that describe the implementation (MIDR, REVIDR, and AIDR) is interesting, to say the least. On the userspace-facing end of it, KVM presents the values of the boot CPU on all vCPUs and treats them as invariant. On the guest side of things KVM presents the hardware values of the local CPU, which can change during CPU migration in a big-little system. While one may call this fragile, there is at least some degree of predictability around it. For example, if a VMM wanted to present big-little to a guest, it could affine vCPUs accordingly to the correct clusters. All of this makes a giant mess out of adding support for making these implementation ID registers writable. Avoid breaking the rather subtle ABI around the old way of doing things by requiring opt-in from userspace to make the registers writable. When the cap is enabled, allow userspace to set MIDR, REVIDR, and AIDR to any non-reserved value and present those values consistently across all vCPUs. Signed-off-by: Sebastian Ott [oliver: changelog, capability] Link: https://lore.kernel.org/r/20250225005401.679536-5-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- Documentation/virt/kvm/api.rst | 18 +++++++++ arch/arm64/include/asm/kvm_host.h | 2 + arch/arm64/kvm/arm.c | 9 +++++ arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h | 8 +++- arch/arm64/kvm/sys_regs.c | 47 +++++++++++++++++++--- include/uapi/linux/kvm.h | 1 + 6 files changed, 78 insertions(+), 7 deletions(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 2b52eb77e29c..56f3dcdc4477 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -8258,6 +8258,24 @@ KVM exits with the register state of either the L1 or L2 guest depending on which executed at the time of an exit. Userspace must take care to differentiate between these cases. +7.37 KVM_CAP_ARM_WRITABLE_IMP_ID_REGS +------------------------------------- + +:Architectures: arm64 +:Target: VM +:Parameters: None +:Returns: 0 on success, -EBUSY if vCPUs have been created before enabling this + capability. + +This capability changes the behavior of the registers that identify a PE +implementation of the Arm architecture: MIDR_EL1, REVIDR_EL1, and AIDR_EL1. +By default, these registers are visible to userspace but treated as invariant. + +When this capability is enabled, KVM allows userspace to change the +aforementioned registers before the first KVM_RUN. 
These registers are VM +scoped, meaning that the same set of values are presented on all vCPUs in a +given VM. + 8. Other capabilities. ====================== diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index b849fda62fc5..c654c90c98dc 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -334,6 +334,8 @@ struct kvm_arch { #define KVM_ARCH_FLAG_FGU_INITIALIZED 8 /* SVE exposed to guest */ #define KVM_ARCH_FLAG_GUEST_HAS_SVE 9 + /* MIDR_EL1, REVIDR_EL1, and AIDR_EL1 are writable from userspace */ +#define KVM_ARCH_FLAG_WRITABLE_IMP_ID_REGS 10 unsigned long flags; /* VM-wide vCPU feature set */ diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index b8e55a441282..413824d9935d 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -125,6 +125,14 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, } mutex_unlock(&kvm->slots_lock); break; + case KVM_CAP_ARM_WRITABLE_IMP_ID_REGS: + mutex_lock(&kvm->lock); + if (!kvm->created_vcpus) { + r = 0; + set_bit(KVM_ARCH_FLAG_WRITABLE_IMP_ID_REGS, &kvm->arch.flags); + } + mutex_unlock(&kvm->lock); + break; default: break; } @@ -313,6 +321,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_ARM_SYSTEM_SUSPEND: case KVM_CAP_IRQFD_RESAMPLE: case KVM_CAP_COUNTER_OFFSET: + case KVM_CAP_ARM_WRITABLE_IMP_ID_REGS: r = 1; break; case KVM_CAP_SET_GUEST_DEBUG2: diff --git a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h index 6e6d13580377..b9cff893bbe0 100644 --- a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h +++ b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h @@ -45,7 +45,13 @@ static inline u64 *ctxt_mdscr_el1(struct kvm_cpu_context *ctxt) static inline u64 ctxt_midr_el1(struct kvm_cpu_context *ctxt) { - return read_cpuid_id(); + struct kvm *kvm = kern_hyp_va(ctxt_to_vcpu(ctxt)->kvm); + + if (!(ctxt_is_guest(ctxt) && + test_bit(KVM_ARCH_FLAG_WRITABLE_IMP_ID_REGS, &kvm->arch.flags))) + return read_cpuid_id(); + + return kvm_read_vm_id_reg(kvm, SYS_MIDR_EL1); } static inline void __sysreg_save_common_state(struct kvm_cpu_context *ctxt) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 64bbf38efed4..2cb89429f4f7 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -2524,6 +2524,17 @@ static bool access_imp_id_reg(struct kvm_vcpu *vcpu, if (p->is_write) return write_to_read_only(vcpu, p, r); + /* + * Return the VM-scoped implementation ID register values if userspace + * has made them writable. + */ + if (test_bit(KVM_ARCH_FLAG_WRITABLE_IMP_ID_REGS, &vcpu->kvm->arch.flags)) + return access_id_reg(vcpu, p, r); + + /* + * Otherwise, fall back to the old behavior of returning the value of + * the current CPU. + */ switch (reg_to_encoding(r)) { case SYS_REVIDR_EL1: p->regval = read_sysreg(revidr_el1); @@ -2567,19 +2578,43 @@ static u64 reset_imp_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) static int set_imp_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, u64 val) { + struct kvm *kvm = vcpu->kvm; u64 expected; - expected = read_id_reg(vcpu, r); + guard(mutex)(&kvm->arch.config_lock); - return (expected == val) ? 0 : -EINVAL; + expected = read_id_reg(vcpu, r); + if (expected == val) + return 0; + + if (!test_bit(KVM_ARCH_FLAG_WRITABLE_IMP_ID_REGS, &kvm->arch.flags)) + return -EINVAL; + + /* + * Once the VM has started the ID registers are immutable. Reject the + * write if userspace tries to change it. 
+ */ + if (kvm_vm_has_ran_once(kvm)) + return -EBUSY; + + /* + * Any value is allowed for the implementation ID registers so long as + * it is within the writable mask. + */ + if ((val & r->val) != val) + return -EINVAL; + + kvm_set_vm_id_reg(kvm, reg_to_encoding(r), val); + return 0; } -#define IMPLEMENTATION_ID(reg) { \ +#define IMPLEMENTATION_ID(reg, mask) { \ SYS_DESC(SYS_##reg), \ .access = access_imp_id_reg, \ .get_user = get_id_reg, \ .set_user = set_imp_id_reg, \ .reset = reset_imp_id_reg, \ + .val = mask, \ } /* @@ -2630,9 +2665,9 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_DBGVCR32_EL2), undef_access, reset_val, DBGVCR32_EL2, 0 }, - IMPLEMENTATION_ID(MIDR_EL1), + IMPLEMENTATION_ID(MIDR_EL1, GENMASK_ULL(31, 0)), { SYS_DESC(SYS_MPIDR_EL1), NULL, reset_mpidr, MPIDR_EL1 }, - IMPLEMENTATION_ID(REVIDR_EL1), + IMPLEMENTATION_ID(REVIDR_EL1, GENMASK_ULL(63, 0)), /* * ID regs: all ID_SANITISED() entries here must have corresponding @@ -2904,7 +2939,7 @@ static const struct sys_reg_desc sys_reg_descs[] = { .set_user = set_clidr, .val = ~CLIDR_EL1_RES0 }, { SYS_DESC(SYS_CCSIDR2_EL1), undef_access }, { SYS_DESC(SYS_SMIDR_EL1), undef_access }, - IMPLEMENTATION_ID(AIDR_EL1), + IMPLEMENTATION_ID(AIDR_EL1, GENMASK_ULL(63, 0)), { SYS_DESC(SYS_CSSELR_EL1), access_csselr, reset_unknown, CSSELR_EL1 }, ID_FILTERED(CTR_EL0, ctr_el0, CTR_EL0_DIC_MASK | diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 45e6d8fca9b9..b6ae8ad8934b 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -929,6 +929,7 @@ struct kvm_enable_cap { #define KVM_CAP_PRE_FAULT_MEMORY 236 #define KVM_CAP_X86_APIC_BUS_CYCLES_NS 237 #define KVM_CAP_X86_GUEST_MODE 238 +#define KVM_CAP_ARM_WRITABLE_IMP_ID_REGS 239 struct kvm_irq_routing_irqchip { __u32 irqchip; From a88c7c22447932c9d5e45cd5b45224a8eb8efa40 Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Mon, 24 Feb 2025 16:54:01 -0800 Subject: [PATCH 17/76] KVM: selftests: arm64: Test writes to MIDR,REVIDR,AIDR Assert that MIDR_EL1, REVIDR_EL1, AIDR_EL1 are writable from userspace, that the changed values are visible to guests, and that they are preserved across a vCPU reset. 
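As a rough illustration of the intended usage (not part of this patch; the vm_fd/vcpu_fd plumbing and the MIDR value are made up):

	struct kvm_enable_cap cap = { .cap = KVM_CAP_ARM_WRITABLE_IMP_ID_REGS };
	struct kvm_one_reg reg;
	__u64 midr = 0x410fd490;	/* hypothetical implementer/part value */

	/* Opt in before any vCPU is created, otherwise the ioctl fails. */
	ioctl(vm_fd, KVM_ENABLE_CAP, &cap);

	reg.id   = ARM64_SYS_REG(3, 0, 0, 0, 0);	/* MIDR_EL1 */
	reg.addr = (__u64)&midr;
	/* Writes must land before the first KVM_RUN; the value is VM-scoped. */
	ioctl(vcpu_fd, KVM_SET_ONE_REG, &reg);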
Signed-off-by: Sebastian Ott Link: https://lore.kernel.org/r/20250225005401.679536-6-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- .../testing/selftests/kvm/arm64/set_id_regs.c | 37 +++++++++++++++---- 1 file changed, 30 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/kvm/arm64/set_id_regs.c b/tools/testing/selftests/kvm/arm64/set_id_regs.c index 217541fe6536..1d65f4a09e6f 100644 --- a/tools/testing/selftests/kvm/arm64/set_id_regs.c +++ b/tools/testing/selftests/kvm/arm64/set_id_regs.c @@ -230,6 +230,9 @@ static void guest_code(void) GUEST_REG_SYNC(SYS_ID_AA64MMFR2_EL1); GUEST_REG_SYNC(SYS_ID_AA64ZFR0_EL1); GUEST_REG_SYNC(SYS_CTR_EL0); + GUEST_REG_SYNC(SYS_MIDR_EL1); + GUEST_REG_SYNC(SYS_REVIDR_EL1); + GUEST_REG_SYNC(SYS_AIDR_EL1); GUEST_DONE(); } @@ -609,18 +612,31 @@ static void test_ctr(struct kvm_vcpu *vcpu) test_reg_vals[encoding_to_range_idx(SYS_CTR_EL0)] = ctr; } -static void test_vcpu_ftr_id_regs(struct kvm_vcpu *vcpu) +static void test_id_reg(struct kvm_vcpu *vcpu, u32 id) { u64 val; + val = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(id)); + val++; + vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(id), val); + test_reg_vals[encoding_to_range_idx(id)] = val; +} + +static void test_vcpu_ftr_id_regs(struct kvm_vcpu *vcpu) +{ test_clidr(vcpu); test_ctr(vcpu); - val = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_MPIDR_EL1)); - val++; - vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_MPIDR_EL1), val); + test_id_reg(vcpu, SYS_MPIDR_EL1); + ksft_test_result_pass("%s\n", __func__); +} + +static void test_vcpu_non_ftr_id_regs(struct kvm_vcpu *vcpu) +{ + test_id_reg(vcpu, SYS_MIDR_EL1); + test_id_reg(vcpu, SYS_REVIDR_EL1); + test_id_reg(vcpu, SYS_AIDR_EL1); - test_reg_vals[encoding_to_range_idx(SYS_MPIDR_EL1)] = val; ksft_test_result_pass("%s\n", __func__); } @@ -647,6 +663,9 @@ static void test_reset_preserves_id_regs(struct kvm_vcpu *vcpu) test_assert_id_reg_unchanged(vcpu, SYS_MPIDR_EL1); test_assert_id_reg_unchanged(vcpu, SYS_CLIDR_EL1); test_assert_id_reg_unchanged(vcpu, SYS_CTR_EL0); + test_assert_id_reg_unchanged(vcpu, SYS_MIDR_EL1); + test_assert_id_reg_unchanged(vcpu, SYS_REVIDR_EL1); + test_assert_id_reg_unchanged(vcpu, SYS_AIDR_EL1); ksft_test_result_pass("%s\n", __func__); } @@ -660,8 +679,11 @@ int main(void) int test_cnt; TEST_REQUIRE(kvm_has_cap(KVM_CAP_ARM_SUPPORTED_REG_MASK_RANGES)); + TEST_REQUIRE(kvm_has_cap(KVM_CAP_ARM_WRITABLE_IMP_ID_REGS)); - vm = vm_create_with_one_vcpu(&vcpu, guest_code); + vm = vm_create(1); + vm_enable_cap(vm, KVM_CAP_ARM_WRITABLE_IMP_ID_REGS, 0); + vcpu = vm_vcpu_add(vm, 0, guest_code); /* Check for AARCH64 only system */ val = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR0_EL1)); @@ -675,13 +697,14 @@ int main(void) ARRAY_SIZE(ftr_id_aa64isar2_el1) + ARRAY_SIZE(ftr_id_aa64pfr0_el1) + ARRAY_SIZE(ftr_id_aa64pfr1_el1) + ARRAY_SIZE(ftr_id_aa64mmfr0_el1) + ARRAY_SIZE(ftr_id_aa64mmfr1_el1) + ARRAY_SIZE(ftr_id_aa64mmfr2_el1) + - ARRAY_SIZE(ftr_id_aa64zfr0_el1) - ARRAY_SIZE(test_regs) + 2 + + ARRAY_SIZE(ftr_id_aa64zfr0_el1) - ARRAY_SIZE(test_regs) + 3 + MPAM_IDREG_TEST; ksft_set_plan(test_cnt); test_vm_ftr_id_regs(vcpu, aarch64_only); test_vcpu_ftr_id_regs(vcpu); + test_vcpu_non_ftr_id_regs(vcpu); test_user_set_mpam_reg(vcpu); test_guest_reg_read(vcpu); From a0d7e2fc61ab54f1be817c9300231a1b48432628 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Wed, 26 Feb 2025 10:31:21 -0800 Subject: [PATCH 18/76] KVM: arm64: vgic-v4: Only attempt vLPI mapping for actual MSIs Some 'creative' VMMs out there may assign a VFIO MSI eventfd to an SPI routing entry. 
And yes, I can already hear you shouting about possibly driving a level interrupt with an edge-sensitive one. You know who you are. This works for the most part, and interrupt injection winds up taking the software path. However, when running on GICv4-enabled hardware, KVM erroneously attempts to setup LPI forwarding, even though the KVM routing isn't an MSI. Thanks to misuse of a union, the MSI destination is unlikely to match any ITS in the VM and kvm_vgic_v4_set_forwarding() bails early. Later on when the VM is being torn down, this half-configured state triggers the WARN_ON() in kvm_vgic_v4_unset_forwarding() due to the fact that no HW IRQ was ever assigned. Avoid the whole mess by preventing SPI routing entries from getting into the LPI forwarding helpers. Reported-by: Sudheer Dantuluri Tested-by: Sudheer Dantuluri Fixes: 196b136498b3 ("KVM: arm/arm64: GICv4: Wire mapping/unmapping of VLPIs in VFIO irq bypass") Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20250226183124.82094-2-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- arch/arm64/kvm/arm.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index b8e55a441282..36d8dd80b431 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -2717,6 +2717,14 @@ int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons, { struct kvm_kernel_irqfd *irqfd = container_of(cons, struct kvm_kernel_irqfd, consumer); + struct kvm_kernel_irq_routing_entry *irq_entry = &irqfd->irq_entry; + + /* + * The only thing we have a chance of directly-injecting is LPIs. Maybe + * one day... + */ + if (irq_entry->type != KVM_IRQ_ROUTING_MSI) + return 0; return kvm_vgic_v4_set_forwarding(irqfd->kvm, prod->irq, &irqfd->irq_entry); @@ -2726,6 +2734,10 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, { struct kvm_kernel_irqfd *irqfd = container_of(cons, struct kvm_kernel_irqfd, consumer); + struct kvm_kernel_irq_routing_entry *irq_entry = &irqfd->irq_entry; + + if (irq_entry->type != KVM_IRQ_ROUTING_MSI) + return; kvm_vgic_v4_unset_forwarding(irqfd->kvm, prod->irq, &irqfd->irq_entry); From d0b79563fd60c063ee4ad2a76024e60338ed3881 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Wed, 26 Feb 2025 10:31:22 -0800 Subject: [PATCH 19/76] KVM: arm64: vgic-v4: Only WARN for HW IRQ mismatch when unmapping vLPI The VMM or guest can easily screw up GICv4 vLPI injection by misconfiguring the MSI or the virtual ITS. Don't fuss over it; limit the WARN in vgic_v4_unset_forwarding() to fire in the worrying case where an unrelated HW IRQ was mapped to a vLPI. 
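Spelling out the predicate change for clarity (this is exactly the hunk below, annotated):

	/* Before: also fired when no vLPI was ever mapped (irq->hw == false) */
	WARN_ON(!(irq->hw && irq->host_irq == virq));

	/* After: only fires if a different host IRQ got mapped to this vLPI */
	WARN_ON(irq->hw && irq->host_irq != virq);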
Reported-by: Sudheer Dantuluri Tested-by: Sudheer Dantuluri Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20250226183124.82094-3-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- arch/arm64/kvm/vgic/vgic-v4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kvm/vgic/vgic-v4.c b/arch/arm64/kvm/vgic/vgic-v4.c index eedecbbbcf31..3e1de40720e0 100644 --- a/arch/arm64/kvm/vgic/vgic-v4.c +++ b/arch/arm64/kvm/vgic/vgic-v4.c @@ -512,7 +512,7 @@ int kvm_vgic_v4_unset_forwarding(struct kvm *kvm, int virq, if (ret) goto out; - WARN_ON(!(irq->hw && irq->host_irq == virq)); + WARN_ON(irq->hw && irq->host_irq != virq); if (irq->hw) { atomic_dec(&irq->target_vcpu->arch.vgic_cpu.vgic_v3.its_vpe.vlpi_count); irq->hw = false; From 5c57533eb8c1efeeab4e2e417fa983d65707b925 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Wed, 26 Feb 2025 10:31:23 -0800 Subject: [PATCH 20/76] KVM: arm64: vgic-v4: Fall back to software irqbypass if LPI not found Continuing with the theme of broken VMMs and guests, irqbypass registration can fail if the virtual ITS lacks a translation for the MSI. Either the guest hasn't mapped it or userspace may have forgotten to restore the ITS. Exit silently and allow irqbypass configuration to succeed. As a reward for ingenuity, LPIs are demoted to software injection. Tested-by: Sudheer Dantuluri Fixes: 196b136498b3 ("KVM: arm/arm64: GICv4: Wire mapping/unmapping of VLPIs in VFIO irq bypass") Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20250226183124.82094-4-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- arch/arm64/kvm/vgic/vgic-v4.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic-v4.c b/arch/arm64/kvm/vgic/vgic-v4.c index 3e1de40720e0..9f51d58b5f8a 100644 --- a/arch/arm64/kvm/vgic/vgic-v4.c +++ b/arch/arm64/kvm/vgic/vgic-v4.c @@ -415,7 +415,7 @@ int kvm_vgic_v4_set_forwarding(struct kvm *kvm, int virq, struct vgic_irq *irq; struct its_vlpi_map map; unsigned long flags; - int ret; + int ret = 0; if (!vgic_supports_direct_msis(kvm)) return 0; @@ -430,10 +430,15 @@ int kvm_vgic_v4_set_forwarding(struct kvm *kvm, int virq, mutex_lock(&its->its_lock); - /* Perform the actual DevID/EventID -> LPI translation. */ - ret = vgic_its_resolve_lpi(kvm, its, irq_entry->msi.devid, - irq_entry->msi.data, &irq); - if (ret) + /* + * Perform the actual DevID/EventID -> LPI translation. + * + * Silently exit if translation fails as the guest (or userspace!) has + * managed to do something stupid. Emulated LPI injection will still + * work if the guest figures itself out at a later time. + */ + if (vgic_its_resolve_lpi(kvm, its, irq_entry->msi.devid, + irq_entry->msi.data, &irq)) goto out; /* Silently exit if the vLPI is already mapped */ From d766d87cf4a0a415a45bbb8fc2e3a4cc1995654d Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Wed, 26 Feb 2025 10:31:24 -0800 Subject: [PATCH 21/76] KVM: arm64: Document ordering requirements for irqbypass One of the not-so-obvious requirements for restoring a VM is ensuring that the vITS has been restored _before_ creating any irqbypass mappings. This is because KVM needs to get the guest translation for MSIs to correctly assemble the vLPI mapping getting installed in the physical ITS. Document the restore ordering requirements necessary for GICv4 vLPI injection to work. 
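A condensed sketch of the documented ordering, with invented helper names (not a real VMM API, just the sequence):

	restore_guest_memory_and_vcpus();
	restore_redistributors();
	restore_its_registers();	/* GITS_CTLR restored last */
	restore_its_tables();		/* KVM_DEV_ARM_ITS_RESTORE_TABLES */
	register_msi_irqfds();		/* KVM_IRQFD: only now can KVM resolve
					 * DevID/EventID -> vLPI for irqbypass */
	run_vcpus();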
Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20250226183124.82094-5-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- Documentation/virt/kvm/devices/arm-vgic-its.rst | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Documentation/virt/kvm/devices/arm-vgic-its.rst b/Documentation/virt/kvm/devices/arm-vgic-its.rst index e053124f77c4..0f971f83205a 100644 --- a/Documentation/virt/kvm/devices/arm-vgic-its.rst +++ b/Documentation/virt/kvm/devices/arm-vgic-its.rst @@ -126,7 +126,8 @@ KVM_DEV_ARM_VGIC_GRP_ITS_REGS ITS Restore Sequence: --------------------- -The following ordering must be followed when restoring the GIC and the ITS: +The following ordering must be followed when restoring the GIC, ITS, and +KVM_IRQFD assignments: a) restore all guest memory and create vcpus b) restore all redistributors @@ -139,6 +140,8 @@ d) restore the ITS in the following order: 3. Load the ITS table data (KVM_DEV_ARM_ITS_RESTORE_TABLES) 4. Restore GITS_CTLR +e) restore KVM_IRQFD assignments for MSIs + Then vcpus can be started. ITS Table ABI REV0: From e3121298c7fcaf488df8e61f50fced9a741c4c44 Mon Sep 17 00:00:00 2001 From: Shameer Kolothum Date: Fri, 21 Feb 2025 14:02:24 +0000 Subject: [PATCH 22/76] arm64: Modify _midr_range() functions to read MIDR/REVIDR internally These changes lay the groundwork for adding support for guest kernels, allowing them to leverage target CPU implementations provided by the VMM. No functional changes intended. Suggested-by: Oliver Upton Reviewed-by: Sebastian Ott Reviewed-by: Cornelia Huck Signed-off-by: Shameer Kolothum Acked-by: Catalin Marinas Link: https://lore.kernel.org/r/20250221140229.12588-2-shameerali.kolothum.thodi@huawei.com Signed-off-by: Oliver Upton --- arch/arm64/include/asm/cputype.h | 28 +++++++++---------- arch/arm64/include/asm/mmu.h | 3 +- arch/arm64/kernel/cpu_errata.c | 23 ++++++++------- arch/arm64/kernel/cpufeature.c | 6 ++-- arch/arm64/kernel/proton-pack.c | 17 ++++++----- arch/arm64/kvm/vgic/vgic-v3.c | 2 +- drivers/clocksource/arm_arch_timer.c | 2 +- .../coresight/coresight-etm4x-core.c | 2 +- 8 files changed, 42 insertions(+), 41 deletions(-) diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h index 6f3f4142e214..2a76f0e30006 100644 --- a/arch/arm64/include/asm/cputype.h +++ b/arch/arm64/include/asm/cputype.h @@ -231,6 +231,16 @@ #define read_cpuid(reg) read_sysreg_s(SYS_ ## reg) +/* + * The CPU ID never changes at run time, so we might as well tell the + * compiler that it's constant. Use this function to read the CPU ID + * rather than directly reading processor_id or read_cpuid() directly. + */ +static inline u32 __attribute_const__ read_cpuid_id(void) +{ + return read_cpuid(MIDR_EL1); +} + /* * Represent a range of MIDR values for a given CPU model and a * range of variant/revision values. 
@@ -266,31 +276,21 @@ static inline bool midr_is_cpu_model_range(u32 midr, u32 model, u32 rv_min, return _model == model && rv >= rv_min && rv <= rv_max; } -static inline bool is_midr_in_range(u32 midr, struct midr_range const *range) +static inline bool is_midr_in_range(struct midr_range const *range) { - return midr_is_cpu_model_range(midr, range->model, + return midr_is_cpu_model_range(read_cpuid_id(), range->model, range->rv_min, range->rv_max); } static inline bool -is_midr_in_range_list(u32 midr, struct midr_range const *ranges) +is_midr_in_range_list(struct midr_range const *ranges) { while (ranges->model) - if (is_midr_in_range(midr, ranges++)) + if (is_midr_in_range(ranges++)) return true; return false; } -/* - * The CPU ID never changes at run time, so we might as well tell the - * compiler that it's constant. Use this function to read the CPU ID - * rather than directly reading processor_id or read_cpuid() directly. - */ -static inline u32 __attribute_const__ read_cpuid_id(void) -{ - return read_cpuid(MIDR_EL1); -} - static inline u64 __attribute_const__ read_cpuid_mpidr(void) { return read_cpuid(MPIDR_EL1); diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h index 662471cfc536..30a29e88994b 100644 --- a/arch/arm64/include/asm/mmu.h +++ b/arch/arm64/include/asm/mmu.h @@ -101,8 +101,7 @@ static inline bool kaslr_requires_kpti(void) if (IS_ENABLED(CONFIG_CAVIUM_ERRATUM_27456)) { extern const struct midr_range cavium_erratum_27456_cpus[]; - if (is_midr_in_range_list(read_cpuid_id(), - cavium_erratum_27456_cpus)) + if (is_midr_in_range_list(cavium_erratum_27456_cpus)) return false; } diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index 7ce555862895..99b55893fc4e 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -15,30 +15,34 @@ #include static bool __maybe_unused -is_affected_midr_range(const struct arm64_cpu_capabilities *entry, int scope) +__is_affected_midr_range(const struct arm64_cpu_capabilities *entry, + u32 midr, u32 revidr) { const struct arm64_midr_revidr *fix; - u32 midr = read_cpuid_id(), revidr; - - WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible()); - if (!is_midr_in_range(midr, &entry->midr_range)) + if (!is_midr_in_range(&entry->midr_range)) return false; midr &= MIDR_REVISION_MASK | MIDR_VARIANT_MASK; - revidr = read_cpuid(REVIDR_EL1); for (fix = entry->fixed_revs; fix && fix->revidr_mask; fix++) if (midr == fix->midr_rv && (revidr & fix->revidr_mask)) return false; - return true; } +static bool __maybe_unused +is_affected_midr_range(const struct arm64_cpu_capabilities *entry, int scope) +{ + WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible()); + return __is_affected_midr_range(entry, read_cpuid_id(), + read_cpuid(REVIDR_EL1)); +} + static bool __maybe_unused is_affected_midr_range_list(const struct arm64_cpu_capabilities *entry, int scope) { WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible()); - return is_midr_in_range_list(read_cpuid_id(), entry->midr_range_list); + return is_midr_in_range_list(entry->midr_range_list); } static bool __maybe_unused @@ -186,12 +190,11 @@ static bool __maybe_unused has_neoverse_n1_erratum_1542419(const struct arm64_cpu_capabilities *entry, int scope) { - u32 midr = read_cpuid_id(); bool has_dic = read_cpuid_cachetype() & BIT(CTR_EL0_DIC_SHIFT); const struct midr_range range = MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N1); WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible()); - return is_midr_in_range(midr, &range) && has_dic; + return is_midr_in_range(&range) && 
has_dic; } #ifdef CONFIG_ARM64_WORKAROUND_REPEAT_TLBI diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index d561cf3b8ac7..aafa3fbdd5ff 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -1792,7 +1792,7 @@ static bool unmap_kernel_at_el0(const struct arm64_cpu_capabilities *entry, char const *str = "kpti command line option"; bool meltdown_safe; - meltdown_safe = is_midr_in_range_list(read_cpuid_id(), kpti_safe_list); + meltdown_safe = is_midr_in_range_list(kpti_safe_list); /* Defer to CPU feature registers */ if (has_cpuid_feature(entry, scope)) @@ -1862,7 +1862,7 @@ static bool has_nv1(const struct arm64_cpu_capabilities *entry, int scope) return (__system_matches_cap(ARM64_HAS_NESTED_VIRT) && !(has_cpuid_feature(entry, scope) || - is_midr_in_range_list(read_cpuid_id(), nv1_ni_list))); + is_midr_in_range_list(nv1_ni_list))); } #if defined(ID_AA64MMFR0_EL1_TGRAN_LPA2) && defined(ID_AA64MMFR0_EL1_TGRAN_2_SUPPORTED_LPA2) @@ -2045,7 +2045,7 @@ static bool cpu_has_broken_dbm(void) {}, }; - return is_midr_in_range_list(read_cpuid_id(), cpus); + return is_midr_in_range_list(cpus); } static bool cpu_can_use_dbm(const struct arm64_cpu_capabilities *cap) diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c index da53722f95d4..a573fa40d4b6 100644 --- a/arch/arm64/kernel/proton-pack.c +++ b/arch/arm64/kernel/proton-pack.c @@ -172,7 +172,7 @@ static enum mitigation_state spectre_v2_get_cpu_hw_mitigation_state(void) return SPECTRE_UNAFFECTED; /* Alternatively, we have a list of unaffected CPUs */ - if (is_midr_in_range_list(read_cpuid_id(), spectre_v2_safe_list)) + if (is_midr_in_range_list(spectre_v2_safe_list)) return SPECTRE_UNAFFECTED; return SPECTRE_VULNERABLE; @@ -331,7 +331,7 @@ bool has_spectre_v3a(const struct arm64_cpu_capabilities *entry, int scope) }; WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible()); - return is_midr_in_range_list(read_cpuid_id(), spectre_v3a_unsafe_list); + return is_midr_in_range_list(spectre_v3a_unsafe_list); } void spectre_v3a_enable_mitigation(const struct arm64_cpu_capabilities *__unused) @@ -475,7 +475,7 @@ static enum mitigation_state spectre_v4_get_cpu_hw_mitigation_state(void) { /* sentinel */ }, }; - if (is_midr_in_range_list(read_cpuid_id(), spectre_v4_safe_list)) + if (is_midr_in_range_list(spectre_v4_safe_list)) return SPECTRE_UNAFFECTED; /* CPU features are detected first */ @@ -878,13 +878,13 @@ u8 spectre_bhb_loop_affected(int scope) {}, }; - if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k32_list)) + if (is_midr_in_range_list(spectre_bhb_k32_list)) k = 32; - else if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k24_list)) + else if (is_midr_in_range_list(spectre_bhb_k24_list)) k = 24; - else if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k11_list)) + else if (is_midr_in_range_list(spectre_bhb_k11_list)) k = 11; - else if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k8_list)) + else if (is_midr_in_range_list(spectre_bhb_k8_list)) k = 8; max_bhb_k = max(max_bhb_k, k); @@ -926,8 +926,7 @@ static bool is_spectre_bhb_fw_affected(int scope) MIDR_ALL_VERSIONS(MIDR_CORTEX_A75), {}, }; - bool cpu_in_list = is_midr_in_range_list(read_cpuid_id(), - spectre_bhb_firmware_mitigated_list); + bool cpu_in_list = is_midr_in_range_list(spectre_bhb_firmware_mitigated_list); if (scope != SCOPE_LOCAL_CPU) return system_affected; diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index d7233ab982d0..87a8a96c7a41 100644 --- 
a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -633,7 +633,7 @@ static const struct midr_range broken_seis[] = { static bool vgic_v3_broken_seis(void) { return ((kvm_vgic_global_state.ich_vtr_el2 & ICH_VTR_SEIS_MASK) && - is_midr_in_range_list(read_cpuid_id(), broken_seis)); + is_midr_in_range_list(broken_seis)); } /** diff --git a/drivers/clocksource/arm_arch_timer.c b/drivers/clocksource/arm_arch_timer.c index 808f259781fd..981a578043a5 100644 --- a/drivers/clocksource/arm_arch_timer.c +++ b/drivers/clocksource/arm_arch_timer.c @@ -842,7 +842,7 @@ static u64 __arch_timer_check_delta(void) {}, }; - if (is_midr_in_range_list(read_cpuid_id(), broken_cval_midrs)) { + if (is_midr_in_range_list(broken_cval_midrs)) { pr_warn_once("Broken CNTx_CVAL_EL1, using 31 bit TVAL instead.\n"); return CLOCKSOURCE_MASK(31); } diff --git a/drivers/hwtracing/coresight/coresight-etm4x-core.c b/drivers/hwtracing/coresight/coresight-etm4x-core.c index 2c1a60577728..3d98e3371fff 100644 --- a/drivers/hwtracing/coresight/coresight-etm4x-core.c +++ b/drivers/hwtracing/coresight/coresight-etm4x-core.c @@ -1216,7 +1216,7 @@ static void etm4_fixup_wrong_ccitmin(struct etmv4_drvdata *drvdata) * recorded value for 'drvdata->ccitmin' to workaround * this problem. */ - if (is_midr_in_range_list(read_cpuid_id(), etm_wrong_ccitmin_cpus)) { + if (is_midr_in_range_list(etm_wrong_ccitmin_cpus)) { if (drvdata->ccitmin == 256) drvdata->ccitmin = 4; } From 57e5cc9b8a399f85c95b6249ecb423553aad209c Mon Sep 17 00:00:00 2001 From: Shameer Kolothum Date: Fri, 21 Feb 2025 14:02:25 +0000 Subject: [PATCH 23/76] KVM: arm64: Specify hypercall ABI for retrieving target implementations If the Guest requires migration to multiple targets, these hypercalls will provide a way to retrieve the target CPU implementations from the user space VMM. Subsequent patch will use this to enable the associated errata. Suggested-by: Oliver Upton Suggested-by: Marc Zyngier Reviewed-by: Cornelia Huck Reviewed-by: Sebastian Ott Signed-off-by: Shameer Kolothum Link: https://lore.kernel.org/r/20250221140229.12588-3-shameerali.kolothum.thodi@huawei.com Signed-off-by: Oliver Upton --- Documentation/virt/kvm/arm/hypercalls.rst | 59 +++++++++++++++++++++++ include/linux/arm-smccc.h | 15 ++++++ 2 files changed, 74 insertions(+) diff --git a/Documentation/virt/kvm/arm/hypercalls.rst b/Documentation/virt/kvm/arm/hypercalls.rst index af7bc2c2e0cb..7400a89aabf8 100644 --- a/Documentation/virt/kvm/arm/hypercalls.rst +++ b/Documentation/virt/kvm/arm/hypercalls.rst @@ -142,3 +142,62 @@ region is equal to the memory protection granule advertised by | | | +---------------------------------------------+ | | | | ``INVALID_PARAMETER (-3)`` | +---------------------+----------+----+---------------------------------------------+ + +``ARM_SMCCC_VENDOR_HYP_KVM_DISCOVER_IMPL_VER_FUNC_ID`` +------------------------------------------------------- +Request the target CPU implementation version information and the number of target +implementations for the Guest VM. 
+ ++---------------------+-------------------------------------------------------------+ +| Presence: | Optional; KVM/ARM64 Guests only | ++---------------------+-------------------------------------------------------------+ +| Calling convention: | HVC64 | ++---------------------+----------+--------------------------------------------------+ +| Function ID: | (uint32) | 0xC6000040 | ++---------------------+----------+--------------------------------------------------+ +| Arguments: | None | ++---------------------+----------+----+---------------------------------------------+ +| Return Values: | (int64) | R0 | ``SUCCESS (0)`` | +| | | +---------------------------------------------+ +| | | | ``NOT_SUPPORTED (-1)`` | +| +----------+----+---------------------------------------------+ +| | (uint64) | R1 | Bits [63:32] Reserved/Must be zero | +| | | +---------------------------------------------+ +| | | | Bits [31:16] Major version | +| | | +---------------------------------------------+ +| | | | Bits [15:0] Minor version | +| +----------+----+---------------------------------------------+ +| | (uint64) | R2 | Number of target implementations | +| +----------+----+---------------------------------------------+ +| | (uint64) | R3 | Reserved / Must be zero | ++---------------------+----------+----+---------------------------------------------+ + +``ARM_SMCCC_VENDOR_HYP_KVM_DISCOVER_IMPL_CPUS_FUNC_ID`` +------------------------------------------------------- + +Request the target CPU implementation information for the Guest VM. The Guest kernel +will use this information to enable the associated errata. + ++---------------------+-------------------------------------------------------------+ +| Presence: | Optional; KVM/ARM64 Guests only | ++---------------------+-------------------------------------------------------------+ +| Calling convention: | HVC64 | ++---------------------+----------+--------------------------------------------------+ +| Function ID: | (uint32) | 0xC6000041 | ++---------------------+----------+----+---------------------------------------------+ +| Arguments: | (uint64) | R1 | selected implementation index | +| +----------+----+---------------------------------------------+ +| | (uint64) | R2 | Reserved / Must be zero | +| +----------+----+---------------------------------------------+ +| | (uint64) | R3 | Reserved / Must be zero | ++---------------------+----------+----+---------------------------------------------+ +| Return Values: | (int64) | R0 | ``SUCCESS (0)`` | +| | | +---------------------------------------------+ +| | | | ``INVALID_PARAMETER (-3)`` | +| +----------+----+---------------------------------------------+ +| | (uint64) | R1 | MIDR_EL1 of the selected implementation | +| +----------+----+---------------------------------------------+ +| | (uint64) | R2 | REVIDR_EL1 of the selected implementation | +| +----------+----+---------------------------------------------+ +| | (uint64) | R3 | AIDR_EL1 of the selected implementation | ++---------------------+----------+----+---------------------------------------------+ diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h index 67f6fdf2e7cd..f19be5754090 100644 --- a/include/linux/arm-smccc.h +++ b/include/linux/arm-smccc.h @@ -179,6 +179,9 @@ #define ARM_SMCCC_KVM_FUNC_PKVM_RESV_62 62 #define ARM_SMCCC_KVM_FUNC_PKVM_RESV_63 63 /* End of pKVM hypercall range */ +#define ARM_SMCCC_KVM_FUNC_DISCOVER_IMPL_VER 64 +#define ARM_SMCCC_KVM_FUNC_DISCOVER_IMPL_CPUS 65 + #define ARM_SMCCC_KVM_FUNC_FEATURES_2 
127 #define ARM_SMCCC_KVM_NUM_FUNCS 128 @@ -225,6 +228,18 @@ ARM_SMCCC_OWNER_VENDOR_HYP, \ ARM_SMCCC_KVM_FUNC_MMIO_GUARD) +#define ARM_SMCCC_VENDOR_HYP_KVM_DISCOVER_IMPL_VER_FUNC_ID \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_64, \ + ARM_SMCCC_OWNER_VENDOR_HYP, \ + ARM_SMCCC_KVM_FUNC_DISCOVER_IMPL_VER) + +#define ARM_SMCCC_VENDOR_HYP_KVM_DISCOVER_IMPL_CPUS_FUNC_ID \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_64, \ + ARM_SMCCC_OWNER_VENDOR_HYP, \ + ARM_SMCCC_KVM_FUNC_DISCOVER_IMPL_CPUS) + /* ptp_kvm counter type ID */ #define KVM_PTP_VIRT_COUNTER 0 #define KVM_PTP_PHYS_COUNTER 1 From c0000e58c74eed0702b8271e8475378c3c17cf0f Mon Sep 17 00:00:00 2001 From: Shameer Kolothum Date: Fri, 21 Feb 2025 14:02:26 +0000 Subject: [PATCH 24/76] KVM: arm64: Introduce KVM_REG_ARM_VENDOR_HYP_BMAP_2 The vendor_hyp_bmap bitmap holds the information about the Vendor Hyp services available to userspace and can be read and written using the {G, S}ET_ONE_REG interfaces. This is done using the pseudo-firmware bitmap register KVM_REG_ARM_VENDOR_HYP_BMAP. At present, this bitmap is a 64-bit one and since the function numbers for the newly added DISCOVER_IMPL_* hypercalls are 64-65, introduce another pseudo-firmware bitmap register KVM_REG_ARM_VENDOR_HYP_BMAP_2. Reviewed-by: Sebastian Ott Signed-off-by: Shameer Kolothum Link: https://lore.kernel.org/r/20250221140229.12588-4-shameerali.kolothum.thodi@huawei.com Signed-off-by: Oliver Upton --- .../virt/kvm/arm/fw-pseudo-registers.rst | 15 ++++++++++++++- arch/arm64/include/asm/kvm_host.h | 3 ++- arch/arm64/include/uapi/asm/kvm.h | 12 ++++++++++++ arch/arm64/kvm/hypercalls.c | 13 +++++++++++++ 4 files changed, 41 insertions(+), 2 deletions(-) diff --git a/Documentation/virt/kvm/arm/fw-pseudo-registers.rst b/Documentation/virt/kvm/arm/fw-pseudo-registers.rst index b90fd0b0fa66..d78b53b05dfc 100644 --- a/Documentation/virt/kvm/arm/fw-pseudo-registers.rst +++ b/Documentation/virt/kvm/arm/fw-pseudo-registers.rst @@ -116,7 +116,7 @@ The pseudo-firmware bitmap register are as follows: ARM DEN0057A. * KVM_REG_ARM_VENDOR_HYP_BMAP: - Controls the bitmap of the Vendor specific Hypervisor Service Calls. + Controls the bitmap of the Vendor specific Hypervisor Service Calls[0-63]. The following bits are accepted: @@ -127,6 +127,19 @@ The pseudo-firmware bitmap register are as follows: Bit-1: KVM_REG_ARM_VENDOR_HYP_BIT_PTP: The bit represents the Precision Time Protocol KVM service. +* KVM_REG_ARM_VENDOR_HYP_BMAP_2: + Controls the bitmap of the Vendor specific Hypervisor Service Calls[64-127]. + + The following bits are accepted: + + Bit-0: KVM_REG_ARM_VENDOR_HYP_BIT_DISCOVER_IMPL_VER + This represents the ARM_SMCCC_VENDOR_HYP_KVM_DISCOVER_IMPL_VER_FUNC_ID + function-id. This is reset to 0. + + Bit-1: KVM_REG_ARM_VENDOR_HYP_BIT_DISCOVER_IMPL_CPUS + This represents the ARM_SMCCC_VENDOR_HYP_KVM_DISCOVER_IMPL_CPUS_FUNC_ID + function-id. This is reset to 0.
+ Errors: ======= ============================================================= diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 3a7ec98ef123..6c700c789173 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -237,7 +237,8 @@ struct kvm_arch_memory_slot { struct kvm_smccc_features { unsigned long std_bmap; unsigned long std_hyp_bmap; - unsigned long vendor_hyp_bmap; + unsigned long vendor_hyp_bmap; /* Function numbers 0-63 */ + unsigned long vendor_hyp_bmap_2; /* Function numbers 64-127 */ }; typedef unsigned int pkvm_handle_t; diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h index 568bf858f319..072feeff74d0 100644 --- a/arch/arm64/include/uapi/asm/kvm.h +++ b/arch/arm64/include/uapi/asm/kvm.h @@ -371,6 +371,7 @@ enum { #endif }; +/* Vendor hyper call function numbers 0-63 */ #define KVM_REG_ARM_VENDOR_HYP_BMAP KVM_REG_ARM_FW_FEAT_BMAP_REG(2) enum { @@ -381,6 +382,17 @@ enum { #endif }; +/* Vendor hyper call function numbers 64-127 */ +#define KVM_REG_ARM_VENDOR_HYP_BMAP_2 KVM_REG_ARM_FW_FEAT_BMAP_REG(3) + +enum { + KVM_REG_ARM_VENDOR_HYP_BIT_DISCOVER_IMPL_VER = 0, + KVM_REG_ARM_VENDOR_HYP_BIT_DISCOVER_IMPL_CPUS = 1, +#ifdef __KERNEL__ + KVM_REG_ARM_VENDOR_HYP_BMAP_2_BIT_COUNT, +#endif +}; + /* Device Control API on vm fd */ #define KVM_ARM_VM_SMCCC_CTRL 0 #define KVM_ARM_VM_SMCCC_FILTER 0 diff --git a/arch/arm64/kvm/hypercalls.c b/arch/arm64/kvm/hypercalls.c index 27ce4cb44904..569941eeb3fe 100644 --- a/arch/arm64/kvm/hypercalls.c +++ b/arch/arm64/kvm/hypercalls.c @@ -15,6 +15,8 @@ GENMASK(KVM_REG_ARM_STD_HYP_BMAP_BIT_COUNT - 1, 0) #define KVM_ARM_SMCCC_VENDOR_HYP_FEATURES \ GENMASK(KVM_REG_ARM_VENDOR_HYP_BMAP_BIT_COUNT - 1, 0) +#define KVM_ARM_SMCCC_VENDOR_HYP_FEATURES_2 \ + GENMASK(KVM_REG_ARM_VENDOR_HYP_BMAP_2_BIT_COUNT - 1, 0) static void kvm_ptp_get_time(struct kvm_vcpu *vcpu, u64 *val) { @@ -360,6 +362,8 @@ int kvm_smccc_call_handler(struct kvm_vcpu *vcpu) break; case ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID: val[0] = smccc_feat->vendor_hyp_bmap; + /* Function numbers 2-63 are reserved for pKVM for now */ + val[2] = smccc_feat->vendor_hyp_bmap_2; break; case ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID: kvm_ptp_get_time(vcpu, val); @@ -387,6 +391,7 @@ static const u64 kvm_arm_fw_reg_ids[] = { KVM_REG_ARM_STD_BMAP, KVM_REG_ARM_STD_HYP_BMAP, KVM_REG_ARM_VENDOR_HYP_BMAP, + KVM_REG_ARM_VENDOR_HYP_BMAP_2, }; void kvm_arm_init_hypercalls(struct kvm *kvm) @@ -497,6 +502,9 @@ int kvm_arm_get_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) case KVM_REG_ARM_VENDOR_HYP_BMAP: val = READ_ONCE(smccc_feat->vendor_hyp_bmap); break; + case KVM_REG_ARM_VENDOR_HYP_BMAP_2: + val = READ_ONCE(smccc_feat->vendor_hyp_bmap_2); + break; default: return -ENOENT; } @@ -527,6 +535,10 @@ static int kvm_arm_set_fw_reg_bmap(struct kvm_vcpu *vcpu, u64 reg_id, u64 val) fw_reg_bmap = &smccc_feat->vendor_hyp_bmap; fw_reg_features = KVM_ARM_SMCCC_VENDOR_HYP_FEATURES; break; + case KVM_REG_ARM_VENDOR_HYP_BMAP_2: + fw_reg_bmap = &smccc_feat->vendor_hyp_bmap_2; + fw_reg_features = KVM_ARM_SMCCC_VENDOR_HYP_FEATURES_2; + break; default: return -ENOENT; } @@ -633,6 +645,7 @@ int kvm_arm_set_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) case KVM_REG_ARM_STD_BMAP: case KVM_REG_ARM_STD_HYP_BMAP: case KVM_REG_ARM_VENDOR_HYP_BMAP: + case KVM_REG_ARM_VENDOR_HYP_BMAP_2: return kvm_arm_set_fw_reg_bmap(vcpu, reg->id, val); default: return -ENOENT; From c8c2647e69bedf803244eabc80e2e0757f9c33d6 Mon Sep 17 
00:00:00 2001 From: Shameer Kolothum Date: Fri, 21 Feb 2025 14:02:27 +0000 Subject: [PATCH 25/76] arm64: Make is_midr_in_range_list() an exported function A subsequent patch will add target implementation CPU support, and that will require is_midr_in_range_list() to access new data. To avoid exporting the data, make is_midr_in_range_list() an out-of-line function and export it. No functional changes intended. Signed-off-by: Shameer Kolothum Acked-by: Catalin Marinas Link: https://lore.kernel.org/r/20250221140229.12588-5-shameerali.kolothum.thodi@huawei.com Signed-off-by: Oliver Upton --- arch/arm64/include/asm/cputype.h | 15 +-------------- arch/arm64/kernel/cpu_errata.c | 15 +++++++++++++++ arch/arm64/kernel/image-vars.h | 1 + 3 files changed, 17 insertions(+), 14 deletions(-) diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h index 2a76f0e30006..ccb4a155d118 100644 --- a/arch/arm64/include/asm/cputype.h +++ b/arch/arm64/include/asm/cputype.h @@ -276,20 +276,7 @@ static inline bool midr_is_cpu_model_range(u32 midr, u32 model, u32 rv_min, return _model == model && rv >= rv_min && rv <= rv_max; } -static inline bool is_midr_in_range(struct midr_range const *range) -{ - return midr_is_cpu_model_range(read_cpuid_id(), range->model, - range->rv_min, range->rv_max); -} - -static inline bool -is_midr_in_range_list(struct midr_range const *ranges) -{ - while (ranges->model) - if (is_midr_in_range(ranges++)) - return true; - return false; -} +bool is_midr_in_range_list(struct midr_range const *ranges); static inline u64 __attribute_const__ read_cpuid_mpidr(void) { diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index 99b55893fc4e..1f51cf6378c5 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -14,6 +14,21 @@ #include #include +static inline bool is_midr_in_range(struct midr_range const *range) +{ + return midr_is_cpu_model_range(read_cpuid_id(), range->model, + range->rv_min, range->rv_max); +} + +bool is_midr_in_range_list(struct midr_range const *ranges) +{ + while (ranges->model) + if (is_midr_in_range(ranges++)) + return true; + return false; +} +EXPORT_SYMBOL_GPL(is_midr_in_range_list); + static bool __maybe_unused __is_affected_midr_range(const struct arm64_cpu_capabilities *entry, u32 midr, u32 revidr) diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index ef3a69cc398e..de3d081e5a57 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -49,6 +49,7 @@ PROVIDE(__pi_arm64_sw_feature_override = arm64_sw_feature_override); PROVIDE(__pi_arm64_use_ng_mappings = arm64_use_ng_mappings); #ifdef CONFIG_CAVIUM_ERRATUM_27456 PROVIDE(__pi_cavium_erratum_27456_cpus = cavium_erratum_27456_cpus); +PROVIDE(__pi_is_midr_in_range_list = is_midr_in_range_list); #endif PROVIDE(__pi__ctype = _ctype); PROVIDE(__pi_memstart_offset_seed = memstart_offset_seed); From 86edf6bdcf0571c07103b8751e9d792a4b808e97 Mon Sep 17 00:00:00 2001 From: Shameer Kolothum Date: Fri, 21 Feb 2025 14:02:28 +0000 Subject: [PATCH 26/76] smccc/kvm_guest: Enable errata based on implementation CPUs Retrieve any migration target implementation CPUs using the hypercalls and enable the associated errata workarounds.
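For reference, a condensed guest-side sketch of the discovery sequence this enables (error handling elided; see the hypercall ABI documented earlier in the series):

	struct arm_smccc_res res;
	u64 i, ncpus;

	arm_smccc_1_1_invoke(ARM_SMCCC_VENDOR_HYP_KVM_DISCOVER_IMPL_VER_FUNC_ID,
			     0, &res);
	ncpus = res.a2;		/* res.a1 carries the major/minor ABI version */

	for (i = 0; i < ncpus; i++) {
		arm_smccc_1_1_invoke(ARM_SMCCC_VENDOR_HYP_KVM_DISCOVER_IMPL_CPUS_FUNC_ID,
				     i, &res);
		/* res.a1/a2/a3: MIDR/REVIDR/AIDR of target implementation i */
	}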
Reviewed-by: Cornelia Huck Reviewed-by: Sebastian Ott Signed-off-by: Shameer Kolothum Acked-by: Catalin Marinas Link: https://lore.kernel.org/r/20250221140229.12588-6-shameerali.kolothum.thodi@huawei.com Signed-off-by: Oliver Upton --- arch/arm64/include/asm/cputype.h | 7 ++++ arch/arm64/include/asm/hypervisor.h | 1 + arch/arm64/kernel/cpu_errata.c | 45 +++++++++++++++++--- arch/arm64/kernel/cpufeature.c | 2 + drivers/firmware/smccc/kvm_guest.c | 64 +++++++++++++++++++++++++++++ 5 files changed, 114 insertions(+), 5 deletions(-) diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h index ccb4a155d118..cc2d58141be1 100644 --- a/arch/arm64/include/asm/cputype.h +++ b/arch/arm64/include/asm/cputype.h @@ -276,6 +276,13 @@ static inline bool midr_is_cpu_model_range(u32 midr, u32 model, u32 rv_min, return _model == model && rv >= rv_min && rv <= rv_max; } +struct target_impl_cpu { + u64 midr; + u64 revidr; + u64 aidr; +}; + +bool cpu_errata_set_target_impl(u64 num, void *impl_cpus); bool is_midr_in_range_list(struct midr_range const *ranges); static inline u64 __attribute_const__ read_cpuid_mpidr(void) diff --git a/arch/arm64/include/asm/hypervisor.h b/arch/arm64/include/asm/hypervisor.h index 409e239834d1..a12fd897c877 100644 --- a/arch/arm64/include/asm/hypervisor.h +++ b/arch/arm64/include/asm/hypervisor.h @@ -6,6 +6,7 @@ void kvm_init_hyp_services(void); bool kvm_arm_hyp_service_available(u32 func_id); +void kvm_arm_target_impl_cpu_init(void); #ifdef CONFIG_ARM_PKVM_GUEST void pkvm_init_hyp_services(void); diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index 1f51cf6378c5..66869d81c3d5 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -14,10 +14,34 @@ #include #include +static u64 target_impl_cpu_num; +static struct target_impl_cpu *target_impl_cpus; + +bool cpu_errata_set_target_impl(u64 num, void *impl_cpus) +{ + if (target_impl_cpu_num || !num || !impl_cpus) + return false; + + target_impl_cpu_num = num; + target_impl_cpus = impl_cpus; + return true; +} + static inline bool is_midr_in_range(struct midr_range const *range) { - return midr_is_cpu_model_range(read_cpuid_id(), range->model, - range->rv_min, range->rv_max); + int i; + + if (!target_impl_cpu_num) + return midr_is_cpu_model_range(read_cpuid_id(), range->model, + range->rv_min, range->rv_max); + + for (i = 0; i < target_impl_cpu_num; i++) { + if (midr_is_cpu_model_range(target_impl_cpus[i].midr, + range->model, + range->rv_min, range->rv_max)) + return true; + } + return false; } bool is_midr_in_range_list(struct midr_range const *ranges) @@ -47,9 +71,20 @@ __is_affected_midr_range(const struct arm64_cpu_capabilities *entry, static bool __maybe_unused is_affected_midr_range(const struct arm64_cpu_capabilities *entry, int scope) { - WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible()); - return __is_affected_midr_range(entry, read_cpuid_id(), - read_cpuid(REVIDR_EL1)); + int i; + + if (!target_impl_cpu_num) { + WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible()); + return __is_affected_midr_range(entry, read_cpuid_id(), + read_cpuid(REVIDR_EL1)); + } + + for (i = 0; i < target_impl_cpu_num; i++) { + if (__is_affected_midr_range(entry, target_impl_cpus[i].midr, + target_impl_cpus[i].revidr)) + return true; + } + return false; } static bool __maybe_unused diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index aafa3fbdd5ff..c615eb6ac7e9 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ 
-86,6 +86,7 @@ #include #include #include +#include #include #include #include @@ -3680,6 +3681,7 @@ unsigned long cpu_get_elf_hwcap3(void) static void __init setup_boot_cpu_capabilities(void) { + kvm_arm_target_impl_cpu_init(); /* * The boot CPU's feature register values have been recorded. Detect * boot cpucaps and local cpucaps for the boot CPU, then enable and diff --git a/drivers/firmware/smccc/kvm_guest.c b/drivers/firmware/smccc/kvm_guest.c index f3319be20b36..2f03b582c298 100644 --- a/drivers/firmware/smccc/kvm_guest.c +++ b/drivers/firmware/smccc/kvm_guest.c @@ -6,8 +6,11 @@ #include #include #include +#include #include +#include + #include static DECLARE_BITMAP(__kvm_arm_hyp_services, ARM_SMCCC_KVM_NUM_FUNCS) __ro_after_init = { }; @@ -51,3 +54,64 @@ bool kvm_arm_hyp_service_available(u32 func_id) return test_bit(func_id, __kvm_arm_hyp_services); } EXPORT_SYMBOL_GPL(kvm_arm_hyp_service_available); + +void __init kvm_arm_target_impl_cpu_init(void) +{ + int i; + u32 ver; + u64 max_cpus; + struct arm_smccc_res res; + struct target_impl_cpu *target; + + if (!kvm_arm_hyp_service_available(ARM_SMCCC_KVM_FUNC_DISCOVER_IMPL_VER) || + !kvm_arm_hyp_service_available(ARM_SMCCC_KVM_FUNC_DISCOVER_IMPL_CPUS)) + return; + + arm_smccc_1_1_invoke(ARM_SMCCC_VENDOR_HYP_KVM_DISCOVER_IMPL_VER_FUNC_ID, + 0, &res); + if (res.a0 != SMCCC_RET_SUCCESS) + return; + + /* Version info is in lower 32 bits and is in SMCCC_VERSION format */ + ver = lower_32_bits(res.a1); + if (PSCI_VERSION_MAJOR(ver) != 1) { + pr_warn("Unsupported target CPU implementation version v%d.%d\n", + PSCI_VERSION_MAJOR(ver), PSCI_VERSION_MINOR(ver)); + return; + } + + if (!res.a2) { + pr_warn("No target implementation CPUs specified\n"); + return; + } + + max_cpus = res.a2; + target = memblock_alloc(sizeof(*target) * max_cpus, __alignof__(*target)); + if (!target) { + pr_warn("Not enough memory for struct target_impl_cpu\n"); + return; + } + + for (i = 0; i < max_cpus; i++) { + arm_smccc_1_1_invoke(ARM_SMCCC_VENDOR_HYP_KVM_DISCOVER_IMPL_CPUS_FUNC_ID, + i, &res); + if (res.a0 != SMCCC_RET_SUCCESS) { + pr_warn("Discovering target implementation CPUs failed\n"); + goto mem_free; + } + target[i].midr = res.a1; + target[i].revidr = res.a2; + target[i].aidr = res.a3; + } + + if (!cpu_errata_set_target_impl(max_cpus, target)) { + pr_warn("Failed to set target implementation CPUs\n"); + goto mem_free; + } + + pr_info("Number of target implementation CPUs is %lld\n", max_cpus); + return; + +mem_free: + memblock_free(target, sizeof(*target) * max_cpus); +} From f69656656fa7030eff42fcabe7316ff96fc2a5cd Mon Sep 17 00:00:00 2001 From: Shameer Kolothum Date: Fri, 21 Feb 2025 14:02:29 +0000 Subject: [PATCH 27/76] KVM: selftests: Add test for KVM_REG_ARM_VENDOR_HYP_BMAP_2 One difference from the other pseudo-firmware bitmap registers is that the default/reset value for the supported hypercall function-ids is 0 at present. Hence, modify the test accordingly.
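Roughly how userspace flips the new bits on, for illustration (the vcpu_fd handling is assumed, not shown in this patch):

	struct kvm_one_reg reg = { .id = KVM_REG_ARM_VENDOR_HYP_BMAP_2 };
	__u64 bmap;

	reg.addr = (__u64)&bmap;
	ioctl(vcpu_fd, KVM_GET_ONE_REG, &reg);	/* reads 0: both bits start clear */

	bmap = (1ULL << KVM_REG_ARM_VENDOR_HYP_BIT_DISCOVER_IMPL_VER) |
	       (1ULL << KVM_REG_ARM_VENDOR_HYP_BIT_DISCOVER_IMPL_CPUS);
	ioctl(vcpu_fd, KVM_SET_ONE_REG, &reg);	/* expose the hypercalls to the guest */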
Reviewed-by: Sebastian Ott Signed-off-by: Shameer Kolothum Link: https://lore.kernel.org/r/20250221140229.12588-7-shameerali.kolothum.thodi@huawei.com Signed-off-by: Oliver Upton --- tools/arch/arm64/include/uapi/asm/kvm.h | 12 +++++ .../selftests/kvm/arm64/get-reg-list.c | 1 + .../testing/selftests/kvm/arm64/hypercalls.c | 46 +++++++++++++++---- 3 files changed, 49 insertions(+), 10 deletions(-) diff --git a/tools/arch/arm64/include/uapi/asm/kvm.h b/tools/arch/arm64/include/uapi/asm/kvm.h index 66736ff04011..6d44f8c8a18f 100644 --- a/tools/arch/arm64/include/uapi/asm/kvm.h +++ b/tools/arch/arm64/include/uapi/asm/kvm.h @@ -374,6 +374,7 @@ enum { #endif }; +/* Vendor hyper call function numbers 0-63 */ #define KVM_REG_ARM_VENDOR_HYP_BMAP KVM_REG_ARM_FW_FEAT_BMAP_REG(2) enum { @@ -384,6 +385,17 @@ enum { #endif }; +/* Vendor hyper call function numbers 64-127 */ +#define KVM_REG_ARM_VENDOR_HYP_BMAP_2 KVM_REG_ARM_FW_FEAT_BMAP_REG(3) + +enum { + KVM_REG_ARM_VENDOR_HYP_BIT_DISCOVER_IMPL_VER = 0, + KVM_REG_ARM_VENDOR_HYP_BIT_DISCOVER_IMPL_CPUS = 1, +#ifdef __KERNEL__ + KVM_REG_ARM_VENDOR_HYP_BMAP_2_BIT_COUNT, +#endif +}; + /* Device Control API on vm fd */ #define KVM_ARM_VM_SMCCC_CTRL 0 #define KVM_ARM_VM_SMCCC_FILTER 0 diff --git a/tools/testing/selftests/kvm/arm64/get-reg-list.c b/tools/testing/selftests/kvm/arm64/get-reg-list.c index d43fb3f49050..d01798b6b3b4 100644 --- a/tools/testing/selftests/kvm/arm64/get-reg-list.c +++ b/tools/testing/selftests/kvm/arm64/get-reg-list.c @@ -332,6 +332,7 @@ static __u64 base_regs[] = { KVM_REG_ARM_FW_FEAT_BMAP_REG(0), /* KVM_REG_ARM_STD_BMAP */ KVM_REG_ARM_FW_FEAT_BMAP_REG(1), /* KVM_REG_ARM_STD_HYP_BMAP */ KVM_REG_ARM_FW_FEAT_BMAP_REG(2), /* KVM_REG_ARM_VENDOR_HYP_BMAP */ + KVM_REG_ARM_FW_FEAT_BMAP_REG(3), /* KVM_REG_ARM_VENDOR_HYP_BMAP_2 */ ARM64_SYS_REG(3, 3, 14, 3, 1), /* CNTV_CTL_EL0 */ ARM64_SYS_REG(3, 3, 14, 3, 2), /* CNTV_CVAL_EL0 */ ARM64_SYS_REG(3, 3, 14, 0, 2), diff --git a/tools/testing/selftests/kvm/arm64/hypercalls.c b/tools/testing/selftests/kvm/arm64/hypercalls.c index ec54ec7726e9..44cfcf8a7f46 100644 --- a/tools/testing/selftests/kvm/arm64/hypercalls.c +++ b/tools/testing/selftests/kvm/arm64/hypercalls.c @@ -21,22 +21,31 @@ #define KVM_REG_ARM_STD_BMAP_BIT_MAX 0 #define KVM_REG_ARM_STD_HYP_BMAP_BIT_MAX 0 #define KVM_REG_ARM_VENDOR_HYP_BMAP_BIT_MAX 1 +#define KVM_REG_ARM_VENDOR_HYP_BMAP_2_BIT_MAX 1 + +#define KVM_REG_ARM_STD_BMAP_RESET_VAL FW_REG_ULIMIT_VAL(KVM_REG_ARM_STD_BMAP_BIT_MAX) +#define KVM_REG_ARM_STD_HYP_BMAP_RESET_VAL FW_REG_ULIMIT_VAL(KVM_REG_ARM_STD_HYP_BMAP_BIT_MAX) +#define KVM_REG_ARM_VENDOR_HYP_BMAP_RESET_VAL FW_REG_ULIMIT_VAL(KVM_REG_ARM_VENDOR_HYP_BMAP_BIT_MAX) +#define KVM_REG_ARM_VENDOR_HYP_BMAP_2_RESET_VAL 0 struct kvm_fw_reg_info { uint64_t reg; /* Register definition */ uint64_t max_feat_bit; /* Bit that represents the upper limit of the feature-map */ + uint64_t reset_val; /* Reset value for the register */ }; #define FW_REG_INFO(r) \ { \ .reg = r, \ .max_feat_bit = r##_BIT_MAX, \ + .reset_val = r##_RESET_VAL \ } static const struct kvm_fw_reg_info fw_reg_info[] = { FW_REG_INFO(KVM_REG_ARM_STD_BMAP), FW_REG_INFO(KVM_REG_ARM_STD_HYP_BMAP), FW_REG_INFO(KVM_REG_ARM_VENDOR_HYP_BMAP), + FW_REG_INFO(KVM_REG_ARM_VENDOR_HYP_BMAP_2), }; enum test_stage { @@ -171,22 +180,39 @@ static void test_fw_regs_before_vm_start(struct kvm_vcpu *vcpu) for (i = 0; i < ARRAY_SIZE(fw_reg_info); i++) { const struct kvm_fw_reg_info *reg_info = &fw_reg_info[i]; + uint64_t set_val; - /* First 'read' should be an upper limit of the features 
supported */ + /* First 'read' should be the reset value for the reg */ val = vcpu_get_reg(vcpu, reg_info->reg); - TEST_ASSERT(val == FW_REG_ULIMIT_VAL(reg_info->max_feat_bit), - "Expected all the features to be set for reg: 0x%lx; expected: 0x%lx; read: 0x%lx", - reg_info->reg, FW_REG_ULIMIT_VAL(reg_info->max_feat_bit), val); + TEST_ASSERT(val == reg_info->reset_val, + "Unexpected reset value for reg: 0x%lx; expected: 0x%lx; read: 0x%lx", + reg_info->reg, reg_info->reset_val, val); - /* Test a 'write' by disabling all the features of the register map */ - ret = __vcpu_set_reg(vcpu, reg_info->reg, 0); + if (reg_info->reset_val) + set_val = 0; + else + set_val = FW_REG_ULIMIT_VAL(reg_info->max_feat_bit); + + ret = __vcpu_set_reg(vcpu, reg_info->reg, set_val); TEST_ASSERT(ret == 0, + "Failed to %s all the features of reg: 0x%lx; ret: %d", + (set_val ? "set" : "clear"), reg_info->reg, errno); + + val = vcpu_get_reg(vcpu, reg_info->reg); + TEST_ASSERT(val == set_val, + "Expected all the features to be %s for reg: 0x%lx", + (set_val ? "set" : "cleared"), reg_info->reg); + + /* + * If the reg has been set, clear it as test_fw_regs_after_vm_start() + * expects it to be cleared. + */ + if (set_val) { + ret = __vcpu_set_reg(vcpu, reg_info->reg, 0); + TEST_ASSERT(ret == 0, "Failed to clear all the features of reg: 0x%lx; ret: %d", reg_info->reg, errno); - - val = vcpu_get_reg(vcpu, reg_info->reg); - TEST_ASSERT(val == 0, - "Expected all the features to be cleared for reg: 0x%lx", reg_info->reg); + } /* * Test enabling a feature that's not supported. From 22513c0d2ad86af126a221ae4cc804b346bce374 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 25 Feb 2025 17:29:15 +0000 Subject: [PATCH 28/76] arm64: sysreg: Add layout for ICH_HCR_EL2 The ICH_HCR_EL2-related macros are missing a number of control bits that we are about to handle. Take this opportunity to fully describe the layout of that register as part of the automatic generation infrastructure. This results in a bit of churn, unfortunately. 
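For orientation, the repaint boils down to renames of this shape (the new names are generated from the sysreg description added below):

	ICH_HCR_EN		->	ICH_HCR_EL2_En
	ICH_HCR_TC		->	ICH_HCR_EL2_TC
	ICH_HCR_TALL0/TALL1	->	ICH_HCR_EL2_TALL0/TALL1
	ICH_HCR_EOIcount_SHIFT	->	ICH_HCR_EL2_EOIcount_SHIFT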
Reviewed-by: Andre Przywara Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20250225172930.1850838-2-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/include/asm/sysreg.h | 12 ------------ arch/arm64/kvm/emulate-nested.c | 16 ++++++++-------- arch/arm64/kvm/hyp/vgic-v3-sr.c | 14 +++++++------- arch/arm64/kvm/vgic/vgic-v3.c | 17 +++++++++-------- arch/arm64/tools/sysreg | 22 ++++++++++++++++++++++ drivers/irqchip/irq-apple-aic.c | 8 ++++---- tools/arch/arm64/include/asm/sysreg.h | 12 ------------ 7 files changed, 50 insertions(+), 51 deletions(-) diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 05ea5223d2d5..76a88042390f 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -562,7 +562,6 @@ #define SYS_ICH_VSEIR_EL2 sys_reg(3, 4, 12, 9, 4) #define SYS_ICC_SRE_EL2 sys_reg(3, 4, 12, 9, 5) -#define SYS_ICH_HCR_EL2 sys_reg(3, 4, 12, 11, 0) #define SYS_ICH_VTR_EL2 sys_reg(3, 4, 12, 11, 1) #define SYS_ICH_MISR_EL2 sys_reg(3, 4, 12, 11, 2) #define SYS_ICH_EISR_EL2 sys_reg(3, 4, 12, 11, 3) @@ -1003,17 +1002,6 @@ #define ICH_LR_PRIORITY_SHIFT 48 #define ICH_LR_PRIORITY_MASK (0xffULL << ICH_LR_PRIORITY_SHIFT) -/* ICH_HCR_EL2 bit definitions */ -#define ICH_HCR_EN (1 << 0) -#define ICH_HCR_UIE (1 << 1) -#define ICH_HCR_NPIE (1 << 3) -#define ICH_HCR_TC (1 << 10) -#define ICH_HCR_TALL0 (1 << 11) -#define ICH_HCR_TALL1 (1 << 12) -#define ICH_HCR_TDIR (1 << 14) -#define ICH_HCR_EOIcount_SHIFT 27 -#define ICH_HCR_EOIcount_MASK (0x1f << ICH_HCR_EOIcount_SHIFT) - /* ICH_VMCR_EL2 bit definitions */ #define ICH_VMCR_ACK_CTL_SHIFT 2 #define ICH_VMCR_ACK_CTL_MASK (1 << ICH_VMCR_ACK_CTL_SHIFT) diff --git a/arch/arm64/kvm/emulate-nested.c b/arch/arm64/kvm/emulate-nested.c index 607d37bab70b..9986bb88c259 100644 --- a/arch/arm64/kvm/emulate-nested.c +++ b/arch/arm64/kvm/emulate-nested.c @@ -412,26 +412,26 @@ static const struct trap_bits coarse_trap_bits[] = { }, [CGT_ICH_HCR_TC] = { .index = ICH_HCR_EL2, - .value = ICH_HCR_TC, - .mask = ICH_HCR_TC, + .value = ICH_HCR_EL2_TC, + .mask = ICH_HCR_EL2_TC, .behaviour = BEHAVE_FORWARD_RW, }, [CGT_ICH_HCR_TALL0] = { .index = ICH_HCR_EL2, - .value = ICH_HCR_TALL0, - .mask = ICH_HCR_TALL0, + .value = ICH_HCR_EL2_TALL0, + .mask = ICH_HCR_EL2_TALL0, .behaviour = BEHAVE_FORWARD_RW, }, [CGT_ICH_HCR_TALL1] = { .index = ICH_HCR_EL2, - .value = ICH_HCR_TALL1, - .mask = ICH_HCR_TALL1, + .value = ICH_HCR_EL2_TALL1, + .mask = ICH_HCR_EL2_TALL1, .behaviour = BEHAVE_FORWARD_RW, }, [CGT_ICH_HCR_TDIR] = { .index = ICH_HCR_EL2, - .value = ICH_HCR_TDIR, - .mask = ICH_HCR_TDIR, + .value = ICH_HCR_EL2_TDIR, + .mask = ICH_HCR_EL2_TDIR, .behaviour = BEHAVE_FORWARD_RW, }, }; diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c index 3f9741e51d41..b47dede973b3 100644 --- a/arch/arm64/kvm/hyp/vgic-v3-sr.c +++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c @@ -218,7 +218,7 @@ void __vgic_v3_save_state(struct vgic_v3_cpu_if *cpu_if) elrsr = read_gicreg(ICH_ELRSR_EL2); - write_gicreg(cpu_if->vgic_hcr & ~ICH_HCR_EN, ICH_HCR_EL2); + write_gicreg(cpu_if->vgic_hcr & ~ICH_HCR_EL2_En, ICH_HCR_EL2); for (i = 0; i < used_lrs; i++) { if (elrsr & (1 << i)) @@ -274,7 +274,7 @@ void __vgic_v3_activate_traps(struct vgic_v3_cpu_if *cpu_if) * system registers to trap to EL1 (duh), force ICC_SRE_EL1.SRE to 1 * so that the trap bits can take effect. Yes, we *loves* the GIC. 
*/ - if (!(cpu_if->vgic_hcr & ICH_HCR_EN)) { + if (!(cpu_if->vgic_hcr & ICH_HCR_EL2_En)) { write_gicreg(ICC_SRE_EL1_SRE, ICC_SRE_EL1); isb(); } else if (!cpu_if->vgic_sre) { @@ -752,7 +752,7 @@ static void __vgic_v3_bump_eoicount(void) u32 hcr; hcr = read_gicreg(ICH_HCR_EL2); - hcr += 1 << ICH_HCR_EOIcount_SHIFT; + hcr += 1 << ICH_HCR_EL2_EOIcount_SHIFT; write_gicreg(hcr, ICH_HCR_EL2); } @@ -1069,7 +1069,7 @@ static bool __vgic_v3_check_trap_forwarding(struct kvm_vcpu *vcpu, case SYS_ICC_EOIR0_EL1: case SYS_ICC_HPPIR0_EL1: case SYS_ICC_IAR0_EL1: - return ich_hcr & ICH_HCR_TALL0; + return ich_hcr & ICH_HCR_EL2_TALL0; case SYS_ICC_IGRPEN1_EL1: if (is_read && @@ -1090,10 +1090,10 @@ static bool __vgic_v3_check_trap_forwarding(struct kvm_vcpu *vcpu, case SYS_ICC_EOIR1_EL1: case SYS_ICC_HPPIR1_EL1: case SYS_ICC_IAR1_EL1: - return ich_hcr & ICH_HCR_TALL1; + return ich_hcr & ICH_HCR_EL2_TALL1; case SYS_ICC_DIR_EL1: - if (ich_hcr & ICH_HCR_TDIR) + if (ich_hcr & ICH_HCR_EL2_TDIR) return true; fallthrough; @@ -1101,7 +1101,7 @@ static bool __vgic_v3_check_trap_forwarding(struct kvm_vcpu *vcpu, case SYS_ICC_RPR_EL1: case SYS_ICC_CTLR_EL1: case SYS_ICC_PMR_EL1: - return ich_hcr & ICH_HCR_TC; + return ich_hcr & ICH_HCR_EL2_TC; default: return false; diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index d7233ab982d0..5e9682a55046 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -24,7 +24,7 @@ void vgic_v3_set_underflow(struct kvm_vcpu *vcpu) { struct vgic_v3_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v3; - cpuif->vgic_hcr |= ICH_HCR_UIE; + cpuif->vgic_hcr |= ICH_HCR_EL2_UIE; } static bool lr_signals_eoi_mi(u64 lr_val) @@ -42,7 +42,7 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu) DEBUG_SPINLOCK_BUG_ON(!irqs_disabled()); - cpuif->vgic_hcr &= ~ICH_HCR_UIE; + cpuif->vgic_hcr &= ~ICH_HCR_EL2_UIE; for (lr = 0; lr < cpuif->used_lrs; lr++) { u64 val = cpuif->vgic_lr[lr]; @@ -292,7 +292,7 @@ void vgic_v3_enable(struct kvm_vcpu *vcpu) ICH_VTR_PRI_BITS_SHIFT) + 1; /* Get the show on the road... 
*/ - vgic_v3->vgic_hcr = ICH_HCR_EN; + vgic_v3->vgic_hcr = ICH_HCR_EL2_En; } void vcpu_set_ich_hcr(struct kvm_vcpu *vcpu) @@ -301,18 +301,19 @@ void vcpu_set_ich_hcr(struct kvm_vcpu *vcpu) /* Hide GICv3 sysreg if necessary */ if (!kvm_has_gicv3(vcpu->kvm)) { - vgic_v3->vgic_hcr |= ICH_HCR_TALL0 | ICH_HCR_TALL1 | ICH_HCR_TC; + vgic_v3->vgic_hcr |= (ICH_HCR_EL2_TALL0 | ICH_HCR_EL2_TALL1 | + ICH_HCR_EL2_TC); return; } if (group0_trap) - vgic_v3->vgic_hcr |= ICH_HCR_TALL0; + vgic_v3->vgic_hcr |= ICH_HCR_EL2_TALL0; if (group1_trap) - vgic_v3->vgic_hcr |= ICH_HCR_TALL1; + vgic_v3->vgic_hcr |= ICH_HCR_EL2_TALL1; if (common_trap) - vgic_v3->vgic_hcr |= ICH_HCR_TC; + vgic_v3->vgic_hcr |= ICH_HCR_EL2_TC; if (dir_trap) - vgic_v3->vgic_hcr |= ICH_HCR_TDIR; + vgic_v3->vgic_hcr |= ICH_HCR_EL2_TDIR; } int vgic_v3_lpi_sync_pending_status(struct kvm *kvm, struct vgic_irq *irq) diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index 762ee084b37c..fa77621aba1a 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -3035,6 +3035,28 @@ Field 31:16 PhyPARTID29 Field 15:0 PhyPARTID28 EndSysreg +Sysreg ICH_HCR_EL2 3 4 12 11 0 +Res0 63:32 +Field 31:27 EOIcount +Res0 26:16 +Field 15 DVIM +Field 14 TDIR +Field 13 TSEI +Field 12 TALL1 +Field 11 TALL0 +Field 10 TC +Res0 9 +Field 8 vSGIEOICount +Field 7 VGrp1DIE +Field 6 VGrp1EIE +Field 5 VGrp0DIE +Field 4 VGrp0EIE +Field 3 NPIE +Field 2 LRENPIE +Field 1 UIE +Field 0 En +EndSysreg + Sysreg CONTEXTIDR_EL2 3 4 13 0 1 Fields CONTEXTIDR_ELx EndSysreg diff --git a/drivers/irqchip/irq-apple-aic.c b/drivers/irqchip/irq-apple-aic.c index 2b1684c60e3c..974dc088c853 100644 --- a/drivers/irqchip/irq-apple-aic.c +++ b/drivers/irqchip/irq-apple-aic.c @@ -409,15 +409,15 @@ static void __exception_irq_entry aic_handle_irq(struct pt_regs *regs) * in use, and be cleared when coming back from the handler. 
*/ if (is_kernel_in_hyp_mode() && - (read_sysreg_s(SYS_ICH_HCR_EL2) & ICH_HCR_EN) && + (read_sysreg_s(SYS_ICH_HCR_EL2) & ICH_HCR_EL2_En) && read_sysreg_s(SYS_ICH_MISR_EL2) != 0) { generic_handle_domain_irq(aic_irqc->hw_domain, AIC_FIQ_HWIRQ(AIC_VGIC_MI)); - if (unlikely((read_sysreg_s(SYS_ICH_HCR_EL2) & ICH_HCR_EN) && + if (unlikely((read_sysreg_s(SYS_ICH_HCR_EL2) & ICH_HCR_EL2_En) && read_sysreg_s(SYS_ICH_MISR_EL2))) { pr_err_ratelimited("vGIC IRQ fired and not handled by KVM, disabling.\n"); - sysreg_clear_set_s(SYS_ICH_HCR_EL2, ICH_HCR_EN, 0); + sysreg_clear_set_s(SYS_ICH_HCR_EL2, ICH_HCR_EL2_En, 0); } } } @@ -841,7 +841,7 @@ static int aic_init_cpu(unsigned int cpu) VM_TMR_FIQ_ENABLE_V | VM_TMR_FIQ_ENABLE_P, 0); /* vGIC maintenance IRQ */ - sysreg_clear_set_s(SYS_ICH_HCR_EL2, ICH_HCR_EN, 0); + sysreg_clear_set_s(SYS_ICH_HCR_EL2, ICH_HCR_EL2_En, 0); } /* PMC FIQ */ diff --git a/tools/arch/arm64/include/asm/sysreg.h b/tools/arch/arm64/include/asm/sysreg.h index 150416682e2c..0ce8fc540fe2 100644 --- a/tools/arch/arm64/include/asm/sysreg.h +++ b/tools/arch/arm64/include/asm/sysreg.h @@ -558,7 +558,6 @@ #define SYS_ICH_VSEIR_EL2 sys_reg(3, 4, 12, 9, 4) #define SYS_ICC_SRE_EL2 sys_reg(3, 4, 12, 9, 5) -#define SYS_ICH_HCR_EL2 sys_reg(3, 4, 12, 11, 0) #define SYS_ICH_VTR_EL2 sys_reg(3, 4, 12, 11, 1) #define SYS_ICH_MISR_EL2 sys_reg(3, 4, 12, 11, 2) #define SYS_ICH_EISR_EL2 sys_reg(3, 4, 12, 11, 3) @@ -999,17 +998,6 @@ #define ICH_LR_PRIORITY_SHIFT 48 #define ICH_LR_PRIORITY_MASK (0xffULL << ICH_LR_PRIORITY_SHIFT) -/* ICH_HCR_EL2 bit definitions */ -#define ICH_HCR_EN (1 << 0) -#define ICH_HCR_UIE (1 << 1) -#define ICH_HCR_NPIE (1 << 3) -#define ICH_HCR_TC (1 << 10) -#define ICH_HCR_TALL0 (1 << 11) -#define ICH_HCR_TALL1 (1 << 12) -#define ICH_HCR_TDIR (1 << 14) -#define ICH_HCR_EOIcount_SHIFT 27 -#define ICH_HCR_EOIcount_MASK (0x1f << ICH_HCR_EOIcount_SHIFT) - /* ICH_VMCR_EL2 bit definitions */ #define ICH_VMCR_ACK_CTL_SHIFT 2 #define ICH_VMCR_ACK_CTL_MASK (1 << ICH_VMCR_ACK_CTL_SHIFT) From 5815fb82dc67c936f2881e601f046faa6ff577be Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 25 Feb 2025 17:29:16 +0000 Subject: [PATCH 29/76] arm64: sysreg: Add layout for ICH_VTR_EL2 The ICH_VTR_EL2-related macros are missing a number of config bits that we are about to handle. Take this opportunity to fully describe the layout of that register as part of the automatic generation infrastructure. This results in a bit of churn to repaint constants that are now generated with a different format. 
Reviewed-by: Andre Przywara Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20250225172930.1850838-3-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/include/asm/sysreg.h | 13 ------------- arch/arm64/kvm/vgic-sys-reg-v3.c | 8 ++++---- arch/arm64/kvm/vgic/vgic-v3.c | 16 +++++++--------- arch/arm64/tools/sysreg | 14 ++++++++++++++ tools/arch/arm64/include/asm/sysreg.h | 13 ------------- 5 files changed, 25 insertions(+), 39 deletions(-) diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 76a88042390f..b59b2c680e97 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -562,7 +562,6 @@ #define SYS_ICH_VSEIR_EL2 sys_reg(3, 4, 12, 9, 4) #define SYS_ICC_SRE_EL2 sys_reg(3, 4, 12, 9, 5) -#define SYS_ICH_VTR_EL2 sys_reg(3, 4, 12, 11, 1) #define SYS_ICH_MISR_EL2 sys_reg(3, 4, 12, 11, 2) #define SYS_ICH_EISR_EL2 sys_reg(3, 4, 12, 11, 3) #define SYS_ICH_ELRSR_EL2 sys_reg(3, 4, 12, 11, 5) @@ -1022,18 +1021,6 @@ #define ICH_VMCR_ENG1_SHIFT 1 #define ICH_VMCR_ENG1_MASK (1 << ICH_VMCR_ENG1_SHIFT) -/* ICH_VTR_EL2 bit definitions */ -#define ICH_VTR_PRI_BITS_SHIFT 29 -#define ICH_VTR_PRI_BITS_MASK (7 << ICH_VTR_PRI_BITS_SHIFT) -#define ICH_VTR_ID_BITS_SHIFT 23 -#define ICH_VTR_ID_BITS_MASK (7 << ICH_VTR_ID_BITS_SHIFT) -#define ICH_VTR_SEIS_SHIFT 22 -#define ICH_VTR_SEIS_MASK (1 << ICH_VTR_SEIS_SHIFT) -#define ICH_VTR_A3V_SHIFT 21 -#define ICH_VTR_A3V_MASK (1 << ICH_VTR_A3V_SHIFT) -#define ICH_VTR_TDS_SHIFT 19 -#define ICH_VTR_TDS_MASK (1 << ICH_VTR_TDS_SHIFT) - /* * Permission Indirection Extension (PIE) permission encodings. * Encodings with the _O suffix, have overlays applied (Permission Overlay Extension). diff --git a/arch/arm64/kvm/vgic-sys-reg-v3.c b/arch/arm64/kvm/vgic-sys-reg-v3.c index 9e7c486b48c2..5eacb4b3250a 100644 --- a/arch/arm64/kvm/vgic-sys-reg-v3.c +++ b/arch/arm64/kvm/vgic-sys-reg-v3.c @@ -35,12 +35,12 @@ static int set_gic_ctlr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, vgic_v3_cpu->num_id_bits = host_id_bits; - host_seis = FIELD_GET(ICH_VTR_SEIS_MASK, kvm_vgic_global_state.ich_vtr_el2); + host_seis = FIELD_GET(ICH_VTR_EL2_SEIS, kvm_vgic_global_state.ich_vtr_el2); seis = FIELD_GET(ICC_CTLR_EL1_SEIS_MASK, val); if (host_seis != seis) return -EINVAL; - host_a3v = FIELD_GET(ICH_VTR_A3V_MASK, kvm_vgic_global_state.ich_vtr_el2); + host_a3v = FIELD_GET(ICH_VTR_EL2_A3V, kvm_vgic_global_state.ich_vtr_el2); a3v = FIELD_GET(ICC_CTLR_EL1_A3V_MASK, val); if (host_a3v != a3v) return -EINVAL; @@ -68,10 +68,10 @@ static int get_gic_ctlr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, val |= FIELD_PREP(ICC_CTLR_EL1_PRI_BITS_MASK, vgic_v3_cpu->num_pri_bits - 1); val |= FIELD_PREP(ICC_CTLR_EL1_ID_BITS_MASK, vgic_v3_cpu->num_id_bits); val |= FIELD_PREP(ICC_CTLR_EL1_SEIS_MASK, - FIELD_GET(ICH_VTR_SEIS_MASK, + FIELD_GET(ICH_VTR_EL2_SEIS, kvm_vgic_global_state.ich_vtr_el2)); val |= FIELD_PREP(ICC_CTLR_EL1_A3V_MASK, - FIELD_GET(ICH_VTR_A3V_MASK, kvm_vgic_global_state.ich_vtr_el2)); + FIELD_GET(ICH_VTR_EL2_A3V, kvm_vgic_global_state.ich_vtr_el2)); /* * The VMCR.CTLR value is in ICC_CTLR_EL1 layout. * Extract it directly using ICC_CTLR_EL1 reg definitions. 
diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index 5e9682a55046..51f0c7451817 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -284,12 +284,10 @@ void vgic_v3_enable(struct kvm_vcpu *vcpu) vgic_v3->vgic_sre = 0; } - vcpu->arch.vgic_cpu.num_id_bits = (kvm_vgic_global_state.ich_vtr_el2 & - ICH_VTR_ID_BITS_MASK) >> - ICH_VTR_ID_BITS_SHIFT; - vcpu->arch.vgic_cpu.num_pri_bits = ((kvm_vgic_global_state.ich_vtr_el2 & - ICH_VTR_PRI_BITS_MASK) >> - ICH_VTR_PRI_BITS_SHIFT) + 1; + vcpu->arch.vgic_cpu.num_id_bits = FIELD_GET(ICH_VTR_EL2_IDbits, + kvm_vgic_global_state.ich_vtr_el2); + vcpu->arch.vgic_cpu.num_pri_bits = FIELD_GET(ICH_VTR_EL2_PRIbits, + kvm_vgic_global_state.ich_vtr_el2) + 1; /* Get the show on the road... */ vgic_v3->vgic_hcr = ICH_HCR_EL2_En; @@ -633,7 +631,7 @@ static const struct midr_range broken_seis[] = { static bool vgic_v3_broken_seis(void) { - return ((kvm_vgic_global_state.ich_vtr_el2 & ICH_VTR_SEIS_MASK) && + return ((kvm_vgic_global_state.ich_vtr_el2 & ICH_VTR_EL2_SEIS) && is_midr_in_range_list(read_cpuid_id(), broken_seis)); } @@ -707,10 +705,10 @@ int vgic_v3_probe(const struct gic_kvm_info *info) if (vgic_v3_broken_seis()) { kvm_info("GICv3 with broken locally generated SEI\n"); - kvm_vgic_global_state.ich_vtr_el2 &= ~ICH_VTR_SEIS_MASK; + kvm_vgic_global_state.ich_vtr_el2 &= ~ICH_VTR_EL2_SEIS; group0_trap = true; group1_trap = true; - if (ich_vtr_el2 & ICH_VTR_TDS_MASK) + if (ich_vtr_el2 & ICH_VTR_EL2_TDS) dir_trap = true; else common_trap = true; diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index fa77621aba1a..3e82a072eb49 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -3057,6 +3057,20 @@ Field 1 UIE Field 0 En EndSysreg +Sysreg ICH_VTR_EL2 3 4 12 11 1 +Res0 63:32 +Field 31:29 PRIbits +Field 28:26 PREbits +Field 25:23 IDbits +Field 22 SEIS +Field 21 A3V +Field 20 nV4 +Field 19 TDS +Field 18 DVIM +Res0 17:5 +Field 4:0 ListRegs +EndSysreg + Sysreg CONTEXTIDR_EL2 3 4 13 0 1 Fields CONTEXTIDR_ELx EndSysreg diff --git a/tools/arch/arm64/include/asm/sysreg.h b/tools/arch/arm64/include/asm/sysreg.h index 0ce8fc540fe2..5d9d7e394b25 100644 --- a/tools/arch/arm64/include/asm/sysreg.h +++ b/tools/arch/arm64/include/asm/sysreg.h @@ -558,7 +558,6 @@ #define SYS_ICH_VSEIR_EL2 sys_reg(3, 4, 12, 9, 4) #define SYS_ICC_SRE_EL2 sys_reg(3, 4, 12, 9, 5) -#define SYS_ICH_VTR_EL2 sys_reg(3, 4, 12, 11, 1) #define SYS_ICH_MISR_EL2 sys_reg(3, 4, 12, 11, 2) #define SYS_ICH_EISR_EL2 sys_reg(3, 4, 12, 11, 3) #define SYS_ICH_ELRSR_EL2 sys_reg(3, 4, 12, 11, 5) @@ -1018,18 +1017,6 @@ #define ICH_VMCR_ENG1_SHIFT 1 #define ICH_VMCR_ENG1_MASK (1 << ICH_VMCR_ENG1_SHIFT) -/* ICH_VTR_EL2 bit definitions */ -#define ICH_VTR_PRI_BITS_SHIFT 29 -#define ICH_VTR_PRI_BITS_MASK (7 << ICH_VTR_PRI_BITS_SHIFT) -#define ICH_VTR_ID_BITS_SHIFT 23 -#define ICH_VTR_ID_BITS_MASK (7 << ICH_VTR_ID_BITS_SHIFT) -#define ICH_VTR_SEIS_SHIFT 22 -#define ICH_VTR_SEIS_MASK (1 << ICH_VTR_SEIS_SHIFT) -#define ICH_VTR_A3V_SHIFT 21 -#define ICH_VTR_A3V_MASK (1 << ICH_VTR_A3V_SHIFT) -#define ICH_VTR_TDS_SHIFT 19 -#define ICH_VTR_TDS_MASK (1 << ICH_VTR_TDS_SHIFT) - /* * Permission Indirection Extension (PIE) permission encodings. * Encodings with the _O suffix, have overlays applied (Permission Overlay Extension). 
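As a quick illustration of what the generated layout buys us, decoding ICH_VTR_EL2 now reduces to FIELD_GET() on the generated masks, exactly as the repainted call sites above show. Below is a minimal sketch, not part of the series: the helper name and printout are invented for demonstration, and it assumes the generated ICH_VTR_EL2_* definitions are in scope.

#include <linux/bitfield.h>
#include <linux/printk.h>

/* Hypothetical helper: decode a raw ICH_VTR_EL2 value with the generated masks. */
static void ich_vtr_decode(u64 vtr)
{
	/* PRIbits and ListRegs are encoded as "value minus one" */
	unsigned int nr_lrs   = FIELD_GET(ICH_VTR_EL2_ListRegs, vtr) + 1;
	unsigned int pri_bits = FIELD_GET(ICH_VTR_EL2_PRIbits, vtr) + 1;
	unsigned int id_bits  = FIELD_GET(ICH_VTR_EL2_IDbits, vtr);

	pr_info("GICH: %u LRs, %u priority bits, %u ID bits, SEIS=%u, TDS=%u\n",
		nr_lrs, pri_bits, id_bits,
		(unsigned int)FIELD_GET(ICH_VTR_EL2_SEIS, vtr),
		(unsigned int)FIELD_GET(ICH_VTR_EL2_TDS, vtr));
}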
From b7a252e881f3322abb9ec899d13dbf7bae7d9bea Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 25 Feb 2025 17:29:17 +0000 Subject: [PATCH 30/76] arm64: sysreg: Add layout for ICH_MISR_EL2 The ICH_MISR_EL2-related macros are missing a number of status bits that we are about to handle. Take this opportunity to fully describe the layout of that register as part of the automatic generation infrastructure. Reviewed-by: Andre Przywara Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20250225172930.1850838-4-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/include/asm/sysreg.h | 5 ----- arch/arm64/tools/sysreg | 12 ++++++++++++ tools/arch/arm64/include/asm/sysreg.h | 5 ----- 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index b59b2c680e97..9511f3faac46 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -562,7 +562,6 @@ #define SYS_ICH_VSEIR_EL2 sys_reg(3, 4, 12, 9, 4) #define SYS_ICC_SRE_EL2 sys_reg(3, 4, 12, 9, 5) -#define SYS_ICH_MISR_EL2 sys_reg(3, 4, 12, 11, 2) #define SYS_ICH_EISR_EL2 sys_reg(3, 4, 12, 11, 3) #define SYS_ICH_ELRSR_EL2 sys_reg(3, 4, 12, 11, 5) #define SYS_ICH_VMCR_EL2 sys_reg(3, 4, 12, 11, 7) @@ -983,10 +982,6 @@ #define SYS_MPIDR_SAFE_VAL (BIT(31)) /* GIC Hypervisor interface registers */ -/* ICH_MISR_EL2 bit definitions */ -#define ICH_MISR_EOI (1 << 0) -#define ICH_MISR_U (1 << 1) - /* ICH_LR*_EL2 bit definitions */ #define ICH_LR_VIRTUAL_ID_MASK ((1ULL << 32) - 1) diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index 3e82a072eb49..2c63662c1a48 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -3071,6 +3071,18 @@ Res0 17:5 Field 4:0 ListRegs EndSysreg +Sysreg ICH_MISR_EL2 3 4 12 11 2 +Res0 63:8 +Field 7 VGrp1D +Field 6 VGrp1E +Field 5 VGrp0D +Field 4 VGrp0E +Field 3 NP +Field 2 LRENP +Field 1 U +Field 0 EOI +EndSysreg + Sysreg CONTEXTIDR_EL2 3 4 13 0 1 Fields CONTEXTIDR_ELx EndSysreg diff --git a/tools/arch/arm64/include/asm/sysreg.h b/tools/arch/arm64/include/asm/sysreg.h index 5d9d7e394b25..b6c5ece4fdee 100644 --- a/tools/arch/arm64/include/asm/sysreg.h +++ b/tools/arch/arm64/include/asm/sysreg.h @@ -558,7 +558,6 @@ #define SYS_ICH_VSEIR_EL2 sys_reg(3, 4, 12, 9, 4) #define SYS_ICC_SRE_EL2 sys_reg(3, 4, 12, 9, 5) -#define SYS_ICH_MISR_EL2 sys_reg(3, 4, 12, 11, 2) #define SYS_ICH_EISR_EL2 sys_reg(3, 4, 12, 11, 3) #define SYS_ICH_ELRSR_EL2 sys_reg(3, 4, 12, 11, 5) #define SYS_ICH_VMCR_EL2 sys_reg(3, 4, 12, 11, 7) @@ -979,10 +978,6 @@ #define SYS_MPIDR_SAFE_VAL (BIT(31)) /* GIC Hypervisor interface registers */ -/* ICH_MISR_EL2 bit definitions */ -#define ICH_MISR_EOI (1 << 0) -#define ICH_MISR_U (1 << 1) - /* ICH_LR*_EL2 bit definitions */ #define ICH_LR_VIRTUAL_ID_MASK ((1ULL << 32) - 1) From 16abeb60be621a7927d68ab77663e93f8c734f6e Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 25 Feb 2025 17:29:18 +0000 Subject: [PATCH 31/76] KVM: arm64: nv: Load timer before the GIC In order for vgic_v3_load_nested to be able to observe which timer interrupts have the HW bit set for the current context, the timers must have been loaded in the new mode and the right timer mapped to their corresponding HW IRQs. At the moment, we load the GIC first, meaning that timer interrupts injected to an L2 guest will never have the HW bit set (we see the old configuration). Swapping the two loads solves this particular problem. 
Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20250225172930.1850838-5-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/arm.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index b8e55a441282..383aa4862234 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -576,8 +576,12 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) nommu: vcpu->cpu = cpu; - kvm_vgic_load(vcpu); + /* + * The timer must be loaded before the vgic to correctly set up physical + * interrupt deactivation in nested state (e.g. timer interrupt). + */ kvm_timer_vcpu_load(vcpu); + kvm_vgic_load(vcpu); kvm_vcpu_load_debug(vcpu); if (has_vhe()) kvm_vcpu_load_vhe(vcpu); From 182f1596941e99f4de4c6fe6063c318eec72f441 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 25 Feb 2025 17:29:19 +0000 Subject: [PATCH 32/76] KVM: arm64: nv: Add ICH_*_EL2 registers to vcpu_sysreg FEAT_NV2 comes with a bunch of register-to-memory redirection involving the ICH_*_EL2 registers (LRs, APRs, VMCR, HCR). Add them to the vcpu_sysreg enumeration. Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20250225172930.1850838-6-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/include/asm/kvm_host.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 3a7ec98ef123..7e0dc6b4cb99 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -557,7 +557,33 @@ enum vcpu_sysreg { VNCR(CNTP_CVAL_EL0), VNCR(CNTP_CTL_EL0), + VNCR(ICH_LR0_EL2), + VNCR(ICH_LR1_EL2), + VNCR(ICH_LR2_EL2), + VNCR(ICH_LR3_EL2), + VNCR(ICH_LR4_EL2), + VNCR(ICH_LR5_EL2), + VNCR(ICH_LR6_EL2), + VNCR(ICH_LR7_EL2), + VNCR(ICH_LR8_EL2), + VNCR(ICH_LR9_EL2), + VNCR(ICH_LR10_EL2), + VNCR(ICH_LR11_EL2), + VNCR(ICH_LR12_EL2), + VNCR(ICH_LR13_EL2), + VNCR(ICH_LR14_EL2), + VNCR(ICH_LR15_EL2), + + VNCR(ICH_AP0R0_EL2), + VNCR(ICH_AP0R1_EL2), + VNCR(ICH_AP0R2_EL2), + VNCR(ICH_AP0R3_EL2), + VNCR(ICH_AP1R0_EL2), + VNCR(ICH_AP1R1_EL2), + VNCR(ICH_AP1R2_EL2), + VNCR(ICH_AP1R3_EL2), VNCR(ICH_HCR_EL2), + VNCR(ICH_VMCR_EL2), NR_SYS_REGS /* Nothing after this line! */ }; From 96c2f03311de1a9363a7b4cee28776ac9cec8109 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 25 Feb 2025 17:29:20 +0000 Subject: [PATCH 33/76] KVM: arm64: nv: Plumb handling of GICv3 EL2 accesses Wire the handling of all GICv3 EL2 registers, and provide emulation for all the non-memory-backed registers (ICC_SRE_EL2, ICH_VTR_EL2, ICH_MISR_EL2, ICH_ELRSR_EL2, and ICH_EISR_EL2). 
Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20250225172930.1850838-7-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/Makefile | 2 +- arch/arm64/kvm/sys_regs.c | 95 +++++++++++++++++++- arch/arm64/kvm/vgic/vgic-v3-nested.c | 125 +++++++++++++++++++++++++++ include/kvm/arm_vgic.h | 4 + 4 files changed, 224 insertions(+), 2 deletions(-) create mode 100644 arch/arm64/kvm/vgic/vgic-v3-nested.c diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile index 3cf7adb2b503..209bc76263f1 100644 --- a/arch/arm64/kvm/Makefile +++ b/arch/arm64/kvm/Makefile @@ -23,7 +23,7 @@ kvm-y += arm.o mmu.o mmio.o psci.o hypercalls.o pvtime.o \ vgic/vgic-v3.o vgic/vgic-v4.o \ vgic/vgic-mmio.o vgic/vgic-mmio-v2.o \ vgic/vgic-mmio-v3.o vgic/vgic-kvm-device.o \ - vgic/vgic-its.o vgic/vgic-debug.o + vgic/vgic-its.o vgic/vgic-debug.o vgic/vgic-v3-nested.o kvm-$(CONFIG_HW_PERF_EVENTS) += pmu-emul.o pmu.o kvm-$(CONFIG_ARM64_PTR_AUTH) += pauth.o diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 82430c1e1dd0..84248fabd418 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -531,7 +532,13 @@ static bool access_gic_sre(struct kvm_vcpu *vcpu, if (p->is_write) return ignore_write(vcpu, p); - p->regval = vcpu->arch.vgic_cpu.vgic_v3.vgic_sre; + if (p->Op1 == 4) { /* ICC_SRE_EL2 */ + p->regval = (ICC_SRE_EL2_ENABLE | ICC_SRE_EL2_SRE | + ICC_SRE_EL1_DIB | ICC_SRE_EL1_DFB); + } else { /* ICC_SRE_EL1 */ + p->regval = vcpu->arch.vgic_cpu.vgic_v3.vgic_sre; + } + return true; } @@ -2426,6 +2433,59 @@ static bool access_zcr_el2(struct kvm_vcpu *vcpu, vq = SYS_FIELD_GET(ZCR_ELx, LEN, p->regval) + 1; vq = min(vq, vcpu_sve_max_vq(vcpu)); vcpu_write_sys_reg(vcpu, vq - 1, ZCR_EL2); + + return true; +} + +static bool access_gic_vtr(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (p->is_write) + return write_to_read_only(vcpu, p, r); + + p->regval = kvm_vgic_global_state.ich_vtr_el2; + p->regval &= ~(ICH_VTR_EL2_DVIM | + ICH_VTR_EL2_A3V | + ICH_VTR_EL2_IDbits); + p->regval |= ICH_VTR_EL2_nV4; + + return true; +} + +static bool access_gic_misr(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (p->is_write) + return write_to_read_only(vcpu, p, r); + + p->regval = vgic_v3_get_misr(vcpu); + + return true; +} + +static bool access_gic_eisr(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (p->is_write) + return write_to_read_only(vcpu, p, r); + + p->regval = vgic_v3_get_eisr(vcpu); + + return true; +} + +static bool access_gic_elrsr(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (p->is_write) + return write_to_read_only(vcpu, p, r); + + p->regval = vgic_v3_get_elrsr(vcpu); + return true; } @@ -3102,7 +3162,40 @@ static const struct sys_reg_desc sys_reg_descs[] = { EL2_REG(RVBAR_EL2, access_rw, reset_val, 0), { SYS_DESC(SYS_RMR_EL2), undef_access }, + EL2_REG_VNCR(ICH_AP0R0_EL2, reset_val, 0), + EL2_REG_VNCR(ICH_AP0R1_EL2, reset_val, 0), + EL2_REG_VNCR(ICH_AP0R2_EL2, reset_val, 0), + EL2_REG_VNCR(ICH_AP0R3_EL2, reset_val, 0), + EL2_REG_VNCR(ICH_AP1R0_EL2, reset_val, 0), + EL2_REG_VNCR(ICH_AP1R1_EL2, reset_val, 0), + EL2_REG_VNCR(ICH_AP1R2_EL2, reset_val, 0), + EL2_REG_VNCR(ICH_AP1R3_EL2, reset_val, 0), + + { SYS_DESC(SYS_ICC_SRE_EL2), access_gic_sre }, + EL2_REG_VNCR(ICH_HCR_EL2, reset_val, 0), + { SYS_DESC(SYS_ICH_VTR_EL2), 
access_gic_vtr }, + { SYS_DESC(SYS_ICH_MISR_EL2), access_gic_misr }, + { SYS_DESC(SYS_ICH_EISR_EL2), access_gic_eisr }, + { SYS_DESC(SYS_ICH_ELRSR_EL2), access_gic_elrsr }, + EL2_REG_VNCR(ICH_VMCR_EL2, reset_val, 0), + + EL2_REG_VNCR(ICH_LR0_EL2, reset_val, 0), + EL2_REG_VNCR(ICH_LR1_EL2, reset_val, 0), + EL2_REG_VNCR(ICH_LR2_EL2, reset_val, 0), + EL2_REG_VNCR(ICH_LR3_EL2, reset_val, 0), + EL2_REG_VNCR(ICH_LR4_EL2, reset_val, 0), + EL2_REG_VNCR(ICH_LR5_EL2, reset_val, 0), + EL2_REG_VNCR(ICH_LR6_EL2, reset_val, 0), + EL2_REG_VNCR(ICH_LR7_EL2, reset_val, 0), + EL2_REG_VNCR(ICH_LR8_EL2, reset_val, 0), + EL2_REG_VNCR(ICH_LR9_EL2, reset_val, 0), + EL2_REG_VNCR(ICH_LR10_EL2, reset_val, 0), + EL2_REG_VNCR(ICH_LR11_EL2, reset_val, 0), + EL2_REG_VNCR(ICH_LR12_EL2, reset_val, 0), + EL2_REG_VNCR(ICH_LR13_EL2, reset_val, 0), + EL2_REG_VNCR(ICH_LR14_EL2, reset_val, 0), + EL2_REG_VNCR(ICH_LR15_EL2, reset_val, 0), EL2_REG(CONTEXTIDR_EL2, access_rw, reset_val, 0), EL2_REG(TPIDR_EL2, access_rw, reset_val, 0), diff --git a/arch/arm64/kvm/vgic/vgic-v3-nested.c b/arch/arm64/kvm/vgic/vgic-v3-nested.c new file mode 100644 index 000000000000..48bfd2f556a3 --- /dev/null +++ b/arch/arm64/kvm/vgic/vgic-v3-nested.c @@ -0,0 +1,125 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include "vgic.h" + +#define ICH_LRN(n) (ICH_LR0_EL2 + (n)) + +struct mi_state { + u16 eisr; + u16 elrsr; + bool pend; +}; + +/* + * Nesting GICv3 support + * + * System register emulation: + * + * We get two classes of registers: + * + * - those backed by memory (LRs, APRs, HCR, VMCR): L1 can freely access + * them, and L0 doesn't see a thing. + * + * - those that always trap (ELRSR, EISR, MISR): these are status registers + * that are built on the fly based on the in-memory state. + * + * Only L1 can access the ICH_*_EL2 registers. A non-NV L2 obviously cannot, + * and a NV L2 would either access the VNCR page provided by L1 (memory + * based registers), or see the access redirected to L1 (registers that + * trap) thanks to NV being set by L1. + */ + +static bool lr_triggers_eoi(u64 lr) +{ + return !(lr & (ICH_LR_STATE | ICH_LR_HW)) && (lr & ICH_LR_EOI); +} + +static void vgic_compute_mi_state(struct kvm_vcpu *vcpu, struct mi_state *mi_state) +{ + u16 eisr = 0, elrsr = 0; + bool pend = false; + + for (int i = 0; i < kvm_vgic_global_state.nr_lr; i++) { + u64 lr = __vcpu_sys_reg(vcpu, ICH_LRN(i)); + + if (lr_triggers_eoi(lr)) + eisr |= BIT(i); + if (!(lr & ICH_LR_STATE)) + elrsr |= BIT(i); + pend |= (lr & ICH_LR_PENDING_BIT); + } + + mi_state->eisr = eisr; + mi_state->elrsr = elrsr; + mi_state->pend = pend; +} + +u16 vgic_v3_get_eisr(struct kvm_vcpu *vcpu) +{ + struct mi_state mi_state; + + vgic_compute_mi_state(vcpu, &mi_state); + return mi_state.eisr; +} + +u16 vgic_v3_get_elrsr(struct kvm_vcpu *vcpu) +{ + struct mi_state mi_state; + + vgic_compute_mi_state(vcpu, &mi_state); + return mi_state.elrsr; +} + +u64 vgic_v3_get_misr(struct kvm_vcpu *vcpu) +{ + struct mi_state mi_state; + u64 reg = 0, hcr, vmcr; + + hcr = __vcpu_sys_reg(vcpu, ICH_HCR_EL2); + vmcr = __vcpu_sys_reg(vcpu, ICH_VMCR_EL2); + + vgic_compute_mi_state(vcpu, &mi_state); + + if (mi_state.eisr) + reg |= ICH_MISR_EL2_EOI; + + if (__vcpu_sys_reg(vcpu, ICH_HCR_EL2) & ICH_HCR_EL2_UIE) { + int used_lrs = kvm_vgic_global_state.nr_lr; + + used_lrs -= hweight16(mi_state.elrsr); + reg |= (used_lrs <= 1) ? 
ICH_MISR_EL2_U : 0; + } + + if ((hcr & ICH_HCR_EL2_LRENPIE) && FIELD_GET(ICH_HCR_EL2_EOIcount_MASK, hcr)) + reg |= ICH_MISR_EL2_LRENP; + + if ((hcr & ICH_HCR_EL2_NPIE) && !mi_state.pend) + reg |= ICH_MISR_EL2_NP; + + if ((hcr & ICH_HCR_EL2_VGrp0EIE) && (vmcr & ICH_VMCR_ENG0_MASK)) + reg |= ICH_MISR_EL2_VGrp0E; + + if ((hcr & ICH_HCR_EL2_VGrp0DIE) && !(vmcr & ICH_VMCR_ENG0_MASK)) + reg |= ICH_MISR_EL2_VGrp0D; + + if ((hcr & ICH_HCR_EL2_VGrp1EIE) && (vmcr & ICH_VMCR_ENG1_MASK)) + reg |= ICH_MISR_EL2_VGrp1E; + + if ((hcr & ICH_HCR_EL2_VGrp1DIE) && !(vmcr & ICH_VMCR_ENG1_MASK)) + reg |= ICH_MISR_EL2_VGrp1D; + + return reg; +} diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index 3a8ccfda34d2..5017fcc71e60 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -389,6 +389,10 @@ int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu); void kvm_vgic_load(struct kvm_vcpu *vcpu); void kvm_vgic_put(struct kvm_vcpu *vcpu); +u16 vgic_v3_get_eisr(struct kvm_vcpu *vcpu); +u16 vgic_v3_get_elrsr(struct kvm_vcpu *vcpu); +u64 vgic_v3_get_misr(struct kvm_vcpu *vcpu); + #define irqchip_in_kernel(k) (!!((k)->arch.vgic.in_kernel)) #define vgic_initialized(k) ((k)->arch.vgic.initialized) #define vgic_ready(k) ((k)->arch.vgic.ready) From 21d29cd814d794f8ed9dc466d7481b8629ca5e73 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 25 Feb 2025 17:29:21 +0000 Subject: [PATCH 34/76] KVM: arm64: nv: Sanitise ICH_HCR_EL2 accesses As ICH_HCR_EL2 is a VNCR accessor when running NV, add some sanitising to what gets written. Crucially, mark TDIR as RES0 if the HW doesn't support it (unlikely, but hey...), as well as anything GICv4 related, since we only expose a GICv3 to the guest. Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20250225172930.1850838-8-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/nested.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index 0c9387d2f507..7c8f39070a50 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -1290,6 +1290,15 @@ int kvm_init_nv_sysregs(struct kvm_vcpu *vcpu) res0 |= GENMASK(11, 8); set_sysreg_masks(kvm, CNTHCTL_EL2, res0, res1); + /* ICH_HCR_EL2 */ + res0 = ICH_HCR_EL2_RES0; + res1 = ICH_HCR_EL2_RES1; + if (!(kvm_vgic_global_state.ich_vtr_el2 & ICH_VTR_EL2_TDS)) + res0 |= ICH_HCR_EL2_TDIR; + /* No GICv4 is presented to the guest */ + res0 |= ICH_HCR_EL2_DVIM | ICH_HCR_EL2_vSGIEOICount; + set_sysreg_masks(kvm, ICH_HCR_EL2, res0, res1); + out: for (enum vcpu_sysreg sr = __SANITISED_REG_START__; sr < NR_SYS_REGS; sr++) (void)__vcpu_sys_reg(vcpu, sr); From 146a050f2d8c4394dfe3e236dc49d155fd5c04d1 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 25 Feb 2025 17:29:22 +0000 Subject: [PATCH 35/76] KVM: arm64: nv: Nested GICv3 emulation When entering a nested VM, we set up the hypervisor control interface based on what the guest hypervisor has set. In particular, we inspect each list register written by the guest hypervisor to see whether its HW bit is set. If so, we translate the hw irq number from the guest's point of view to the real hardware irq number, if such a mapping exists. 
Co-developed-by: Jintack Lim Signed-off-by: Jintack Lim [Christoffer: Redesigned execution flow around vcpu load/put] Co-developed-by: Christoffer Dall Signed-off-by: Christoffer Dall [maz: Rewritten to support GICv3 instead of GICv2, NV2 support] Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20250225172930.1850838-9-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/include/asm/kvm_hyp.h | 2 + arch/arm64/kvm/hyp/vgic-v3-sr.c | 2 +- arch/arm64/kvm/vgic/vgic-v3-nested.c | 218 +++++++++++++++++++++++++++ arch/arm64/kvm/vgic/vgic-v3.c | 11 ++ arch/arm64/kvm/vgic/vgic.c | 6 + arch/arm64/kvm/vgic/vgic.h | 4 + include/kvm/arm_vgic.h | 2 + 7 files changed, 244 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index c838309e4ec4..e6be1f5d0967 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -76,6 +76,8 @@ DECLARE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params); int __vgic_v2_perform_cpuif_access(struct kvm_vcpu *vcpu); +u64 __gic_v3_get_lr(unsigned int lr); + void __vgic_v3_save_state(struct vgic_v3_cpu_if *cpu_if); void __vgic_v3_restore_state(struct vgic_v3_cpu_if *cpu_if); void __vgic_v3_activate_traps(struct vgic_v3_cpu_if *cpu_if); diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c index b47dede973b3..ed363aa3027e 100644 --- a/arch/arm64/kvm/hyp/vgic-v3-sr.c +++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c @@ -18,7 +18,7 @@ #define vtr_to_nr_pre_bits(v) ((((u32)(v) >> 26) & 7) + 1) #define vtr_to_nr_apr_regs(v) (1 << (vtr_to_nr_pre_bits(v) - 5)) -static u64 __gic_v3_get_lr(unsigned int lr) +u64 __gic_v3_get_lr(unsigned int lr) { switch (lr & 0xf) { case 0: diff --git a/arch/arm64/kvm/vgic/vgic-v3-nested.c b/arch/arm64/kvm/vgic/vgic-v3-nested.c index 48bfd2f556a3..eabff56b8b65 100644 --- a/arch/arm64/kvm/vgic/vgic-v3-nested.c +++ b/arch/arm64/kvm/vgic/vgic-v3-nested.c @@ -16,6 +16,8 @@ #include "vgic.h" #define ICH_LRN(n) (ICH_LR0_EL2 + (n)) +#define ICH_AP0RN(n) (ICH_AP0R0_EL2 + (n)) +#define ICH_AP1RN(n) (ICH_AP1R0_EL2 + (n)) struct mi_state { u16 eisr; @@ -23,9 +25,54 @@ struct mi_state { bool pend; }; +/* + * The shadow registers loaded to the hardware when running an L2 guest + * with the virtual IMO/FMO bits set. + */ +struct shadow_if { + struct vgic_v3_cpu_if cpuif; + unsigned long lr_map; +}; + +static DEFINE_PER_CPU(struct shadow_if, shadow_if); + /* * Nesting GICv3 support * + * On a non-nesting VM (only running at EL0/EL1), the host hypervisor + * completely controls the interrupts injected via the list registers. + * Consequently, most of the state that is modified by the guest (by ACK-ing + * and EOI-ing interrupts) is synced by KVM on each entry/exit, so that we + * keep a semi-consistent view of the interrupts. + * + * This still applies for a NV guest, but only while "InHost" (either + * running at EL2, or at EL0 with HCR_EL2.{E2H,TGE}=={1,1}). + * + * When running an L2 guest ("not InHost"), things are radically different, + * as the L1 guest is in charge of provisioning the interrupts via its own + * view of the ICH_LR*_EL2 registers, which conveniently live in the VNCR + * page. This means that the flow described above does work (there is no + * state to rebuild in the L0 hypervisor), and that most things happen on L2 + * load/put: + * + * - on L2 load: move the in-memory L1 vGIC configuration into a shadow, + * per-CPU data structure that is used to populate the actual LRs. 
This is + * an extra copy that we could avoid, but life is short. In the process, + * we remap any interrupt that has the HW bit set to the mapped interrupt + * on the host, should the host consider it a HW one. This allows the HW + * deactivation to take its course, such as for the timer. + * + * - on L2 put: perform the inverse transformation, so that the result of L2 + * running becomes visible to L1 in the VNCR-accessible registers. + * + * - there is nothing to do on L2 entry, as everything will have happened + * on load. However, this is the point where we detect an interrupt + * targeting L1 and prepare the grand switcheroo. + * + * - on L2 exit: emulate the HW bit, and deactivate the corresponding L1 + * interrupt. The L0 active state will be cleared by the HW if the L1 + * interrupt was itself backed by a HW interrupt. + * * System register emulation: * * We get two classes of registers: @@ -42,6 +89,26 @@ struct mi_state { * trap) thanks to NV being set by L1. */ +bool vgic_state_is_nested(struct kvm_vcpu *vcpu) +{ + u64 xmo; + + if (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu)) { + xmo = __vcpu_sys_reg(vcpu, HCR_EL2) & (HCR_IMO | HCR_FMO); + WARN_ONCE(xmo && xmo != (HCR_IMO | HCR_FMO), + "Separate virtual IRQ/FIQ settings not supported\n"); + + return !!xmo; + } + + return false; +} + +static struct shadow_if *get_shadow_if(void) +{ + return this_cpu_ptr(&shadow_if); +} + static bool lr_triggers_eoi(u64 lr) { return !(lr & (ICH_LR_STATE | ICH_LR_HW)) && (lr & ICH_LR_EOI); @@ -123,3 +190,154 @@ u64 vgic_v3_get_misr(struct kvm_vcpu *vcpu) return reg; } + +/* + * For LRs which have the HW bit set, such as timer interrupts, we modify them to + * have the host hardware interrupt number instead of the virtual one programmed + * by the guest hypervisor. + */ +static void vgic_v3_create_shadow_lr(struct kvm_vcpu *vcpu, + struct vgic_v3_cpu_if *s_cpu_if) +{ + unsigned long lr_map = 0; + int index = 0; + + for (int i = 0; i < kvm_vgic_global_state.nr_lr; i++) { + u64 lr = __vcpu_sys_reg(vcpu, ICH_LRN(i)); + struct vgic_irq *irq; + + if (!(lr & ICH_LR_STATE)) + lr = 0; + + if (!(lr & ICH_LR_HW)) + goto next; + + /* We have the HW bit set, check for validity of pINTID */ + irq = vgic_get_vcpu_irq(vcpu, FIELD_GET(ICH_LR_PHYS_ID_MASK, lr)); + if (!irq || !irq->hw || irq->intid > VGIC_MAX_SPI) { + /* There was no real mapping, so nuke the HW bit */ + lr &= ~ICH_LR_HW; + if (irq) + vgic_put_irq(vcpu->kvm, irq); + goto next; + } + + /* It is illegal to have the EOI bit set with HW */ + lr &= ~ICH_LR_EOI; + + /* Translate the virtual mapping to the real one */ + lr &= ~ICH_LR_PHYS_ID_MASK; + lr |= FIELD_PREP(ICH_LR_PHYS_ID_MASK, (u64)irq->hwintid); + + vgic_put_irq(vcpu->kvm, irq); + +next: + s_cpu_if->vgic_lr[index] = lr; + if (lr) { + lr_map |= BIT(i); + index++; + } + } + + container_of(s_cpu_if, struct shadow_if, cpuif)->lr_map = lr_map; + s_cpu_if->used_lrs = index; +} + +void vgic_v3_sync_nested(struct kvm_vcpu *vcpu) +{ + struct shadow_if *shadow_if = get_shadow_if(); + int i, index = 0; + + for_each_set_bit(i, &shadow_if->lr_map, kvm_vgic_global_state.nr_lr) { + u64 lr = __vcpu_sys_reg(vcpu, ICH_LRN(i)); + struct vgic_irq *irq; + + if (!(lr & ICH_LR_HW) || !(lr & ICH_LR_STATE)) + goto next; + + /* + * If we had a HW lr programmed by the guest hypervisor, we + * need to emulate the HW effect between the guest hypervisor + * and the nested guest. 
+ */ + irq = vgic_get_vcpu_irq(vcpu, FIELD_GET(ICH_LR_PHYS_ID_MASK, lr)); + if (WARN_ON(!irq)) /* Shouldn't happen as we check on load */ + goto next; + + lr = __gic_v3_get_lr(index); + if (!(lr & ICH_LR_STATE)) + irq->active = false; + + vgic_put_irq(vcpu->kvm, irq); + next: + index++; + } +} + +static void vgic_v3_create_shadow_state(struct kvm_vcpu *vcpu, + struct vgic_v3_cpu_if *s_cpu_if) +{ + struct vgic_v3_cpu_if *host_if = &vcpu->arch.vgic_cpu.vgic_v3; + int i; + + s_cpu_if->vgic_hcr = __vcpu_sys_reg(vcpu, ICH_HCR_EL2); + s_cpu_if->vgic_vmcr = __vcpu_sys_reg(vcpu, ICH_VMCR_EL2); + s_cpu_if->vgic_sre = host_if->vgic_sre; + + for (i = 0; i < 4; i++) { + s_cpu_if->vgic_ap0r[i] = __vcpu_sys_reg(vcpu, ICH_AP0RN(i)); + s_cpu_if->vgic_ap1r[i] = __vcpu_sys_reg(vcpu, ICH_AP1RN(i)); + } + + vgic_v3_create_shadow_lr(vcpu, s_cpu_if); +} + +void vgic_v3_load_nested(struct kvm_vcpu *vcpu) +{ + struct shadow_if *shadow_if = get_shadow_if(); + struct vgic_v3_cpu_if *cpu_if = &shadow_if->cpuif; + + BUG_ON(!vgic_state_is_nested(vcpu)); + + vgic_v3_create_shadow_state(vcpu, cpu_if); + + __vgic_v3_restore_vmcr_aprs(cpu_if); + __vgic_v3_activate_traps(cpu_if); + + __vgic_v3_restore_state(cpu_if); +} + +void vgic_v3_put_nested(struct kvm_vcpu *vcpu) +{ + struct shadow_if *shadow_if = get_shadow_if(); + struct vgic_v3_cpu_if *s_cpu_if = &shadow_if->cpuif; + int i; + + __vgic_v3_save_vmcr_aprs(s_cpu_if); + __vgic_v3_deactivate_traps(s_cpu_if); + __vgic_v3_save_state(s_cpu_if); + + /* + * Translate the shadow state HW fields back to the virtual ones + * before copying the shadow struct back to the nested one. + */ + __vcpu_sys_reg(vcpu, ICH_HCR_EL2) = s_cpu_if->vgic_hcr; + __vcpu_sys_reg(vcpu, ICH_VMCR_EL2) = s_cpu_if->vgic_vmcr; + + for (i = 0; i < 4; i++) { + __vcpu_sys_reg(vcpu, ICH_AP0RN(i)) = s_cpu_if->vgic_ap0r[i]; + __vcpu_sys_reg(vcpu, ICH_AP1RN(i)) = s_cpu_if->vgic_ap1r[i]; + } + + for_each_set_bit(i, &shadow_if->lr_map, kvm_vgic_global_state.nr_lr) { + u64 val = __vcpu_sys_reg(vcpu, ICH_LRN(i)); + + val &= ~ICH_LR_STATE; + val |= s_cpu_if->vgic_lr[i] & ICH_LR_STATE; + + __vcpu_sys_reg(vcpu, ICH_LRN(i)) = val; + s_cpu_if->vgic_lr[i] = 0; + } + + shadow_if->lr_map = 0; +} diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index 51f0c7451817..73a8a7df4bd2 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -734,6 +734,12 @@ void vgic_v3_load(struct kvm_vcpu *vcpu) { struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; + /* If the vgic is nested, perform the full state loading */ + if (vgic_state_is_nested(vcpu)) { + vgic_v3_load_nested(vcpu); + return; + } + if (likely(!is_protected_kvm_enabled())) kvm_call_hyp(__vgic_v3_restore_vmcr_aprs, cpu_if); @@ -747,6 +753,11 @@ void vgic_v3_put(struct kvm_vcpu *vcpu) { struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; + if (vgic_state_is_nested(vcpu)) { + vgic_v3_put_nested(vcpu); + return; + } + if (likely(!is_protected_kvm_enabled())) kvm_call_hyp(__vgic_v3_save_vmcr_aprs, cpu_if); WARN_ON(vgic_v4_put(vcpu)); diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c index cc8c6b9b5dd8..324c547e1b4d 100644 --- a/arch/arm64/kvm/vgic/vgic.c +++ b/arch/arm64/kvm/vgic/vgic.c @@ -872,6 +872,12 @@ void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu) { int used_lrs; + /* If nesting, emulate the HW effect from L0 to L1 */ + if (vgic_state_is_nested(vcpu)) { + vgic_v3_sync_nested(vcpu); + return; + } + /* An empty ap_list_head implies used_lrs == 0 */ if 
(list_empty(&vcpu->arch.vgic_cpu.ap_list_head)) return; diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h index 122d95b4e284..cf0c084e5d34 100644 --- a/arch/arm64/kvm/vgic/vgic.h +++ b/arch/arm64/kvm/vgic/vgic.h @@ -353,4 +353,8 @@ static inline bool kvm_has_gicv3(struct kvm *kvm) return kvm_has_feat(kvm, ID_AA64PFR0_EL1, GIC, IMP); } +void vgic_v3_sync_nested(struct kvm_vcpu *vcpu); +void vgic_v3_load_nested(struct kvm_vcpu *vcpu); +void vgic_v3_put_nested(struct kvm_vcpu *vcpu); + #endif diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index 5017fcc71e60..1b373cb870fe 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -437,6 +437,8 @@ int vgic_v4_load(struct kvm_vcpu *vcpu); void vgic_v4_commit(struct kvm_vcpu *vcpu); int vgic_v4_put(struct kvm_vcpu *vcpu); +bool vgic_state_is_nested(struct kvm_vcpu *vcpu); + /* CPU HP callbacks */ void kvm_vgic_cpu_up(void); void kvm_vgic_cpu_down(void); From 4b1b97f0d7cfd3e29ae72f380996b8359200fd86 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 25 Feb 2025 17:29:23 +0000 Subject: [PATCH 36/76] KVM: arm64: nv: Handle L2->L1 transition on interrupt injection An interrupt being delivered to L1 while running L2 must result in the correct exception being delivered to L1. This means that if, on entry to L2, we found ourselves with pending interrupts in the L1 distributor, we need to take immediate action. This is done by posting a request which will prevent the entry in L2, and deliver an IRQ exception to L1, forcing the switch. Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20250225172930.1850838-10-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/include/asm/kvm_host.h | 17 +++++++++-------- arch/arm64/kvm/arm.c | 5 +++++ arch/arm64/kvm/nested.c | 4 ++++ arch/arm64/kvm/vgic/vgic.c | 23 +++++++++++++++++++++++ 4 files changed, 41 insertions(+), 8 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 7e0dc6b4cb99..86519a73971e 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -44,14 +44,15 @@ #define KVM_REQ_SLEEP \ KVM_ARCH_REQ_FLAGS(0, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) -#define KVM_REQ_IRQ_PENDING KVM_ARCH_REQ(1) -#define KVM_REQ_VCPU_RESET KVM_ARCH_REQ(2) -#define KVM_REQ_RECORD_STEAL KVM_ARCH_REQ(3) -#define KVM_REQ_RELOAD_GICv4 KVM_ARCH_REQ(4) -#define KVM_REQ_RELOAD_PMU KVM_ARCH_REQ(5) -#define KVM_REQ_SUSPEND KVM_ARCH_REQ(6) -#define KVM_REQ_RESYNC_PMU_EL0 KVM_ARCH_REQ(7) -#define KVM_REQ_NESTED_S2_UNMAP KVM_ARCH_REQ(8) +#define KVM_REQ_IRQ_PENDING KVM_ARCH_REQ(1) +#define KVM_REQ_VCPU_RESET KVM_ARCH_REQ(2) +#define KVM_REQ_RECORD_STEAL KVM_ARCH_REQ(3) +#define KVM_REQ_RELOAD_GICv4 KVM_ARCH_REQ(4) +#define KVM_REQ_RELOAD_PMU KVM_ARCH_REQ(5) +#define KVM_REQ_SUSPEND KVM_ARCH_REQ(6) +#define KVM_REQ_RESYNC_PMU_EL0 KVM_ARCH_REQ(7) +#define KVM_REQ_NESTED_S2_UNMAP KVM_ARCH_REQ(8) +#define KVM_REQ_GUEST_HYP_IRQ_PENDING KVM_ARCH_REQ(9) #define KVM_DIRTY_LOG_MANUAL_CAPS (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | \ KVM_DIRTY_LOG_INITIALLY_SET) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 383aa4862234..0ec7099f4b63 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1148,6 +1148,11 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) * preserved on VMID roll-over if the task was preempted, * making a thread's VMID inactive. So we need to call * kvm_arm_vmid_update() in non-premptible context. 
+ * + * Note that this must happen after the check_vcpu_request() + * call to pick the correct s2_mmu structure, as a pending + * nested exception (IRQ, for example) can trigger a change + * in translation regime. */ if (kvm_arm_vmid_update(&vcpu->arch.hw_mmu->vmid) && has_vhe()) diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index 7c8f39070a50..722e61e410e2 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -1318,4 +1318,8 @@ void check_nested_vcpu_requests(struct kvm_vcpu *vcpu) } write_unlock(&vcpu->kvm->mmu_lock); } + + /* Must be last, as may switch context! */ + if (kvm_check_request(KVM_REQ_GUEST_HYP_IRQ_PENDING, vcpu)) + kvm_inject_nested_irq(vcpu); } diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c index 324c547e1b4d..9734a71b8561 100644 --- a/arch/arm64/kvm/vgic/vgic.c +++ b/arch/arm64/kvm/vgic/vgic.c @@ -906,6 +906,29 @@ static inline void vgic_restore_state(struct kvm_vcpu *vcpu) /* Flush our emulation state into the GIC hardware before entering the guest. */ void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu) { + /* + * If in a nested state, we must return early. Two possibilities: + * + * - If we have any pending IRQ for the guest and the guest + * expects IRQs to be handled in its virtual EL2 mode (the + * virtual IMO bit is set) and it is not already running in + * virtual EL2 mode, then we have to emulate an IRQ + * exception to virtual EL2. + * + * We do that by placing a request to ourselves which will + * abort the entry procedure and inject the exception at the + * beginning of the run loop. + * + * - Otherwise, do exactly *NOTHING*. The guest state is + * already loaded, and we can carry on with running it. + */ + if (vgic_state_is_nested(vcpu)) { + if (kvm_vgic_vcpu_pending_irq(vcpu)) + kvm_make_request(KVM_REQ_GUEST_HYP_IRQ_PENDING, vcpu); + + return; + } + /* * If there are no virtual interrupts active or pending for this * VCPU, then there is no work to do and we can bail out without From 201c8d40dde900719e9dc1548698e18195f46443 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 25 Feb 2025 17:29:24 +0000 Subject: [PATCH 37/76] KVM: arm64: nv: Add Maintenance Interrupt emulation Emulating the vGIC means emulating the dreaded Maintenance Interrupt. This is a two-pronged problem: - while running L2, getting an MI translates into an MI injected in the L1 based on the state of the HW. - while running L1, we must accurately reflect the state of the MI line, based on the in-memory state. The MI INTID is added to the distributor, as expected on any virtualisation-capable implementation, and further patches will allow its configuration. Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20250225172930.1850838-11-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/arm.c | 6 ++++ arch/arm64/kvm/vgic/vgic-init.c | 29 ++++++++++++++++++ arch/arm64/kvm/vgic/vgic-v3-nested.c | 45 ++++++++++++++++++++++++++++ arch/arm64/kvm/vgic/vgic.c | 9 ++++++ arch/arm64/kvm/vgic/vgic.h | 2 ++ include/kvm/arm_vgic.h | 4 +++ 6 files changed, 95 insertions(+) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 0ec7099f4b63..abf51f6cfcaf 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -819,6 +819,12 @@ int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu) if (ret) return ret; + if (vcpu_has_nv(vcpu)) { + ret = kvm_vgic_vcpu_nv_init(vcpu); + if (ret) + return ret; + } + /* * This needs to happen after any restriction has been applied * to the feature set. 
diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c index 775461cf2d2d..1f33e71c2a73 100644 --- a/arch/arm64/kvm/vgic/vgic-init.c +++ b/arch/arm64/kvm/vgic/vgic-init.c @@ -198,6 +198,27 @@ static int kvm_vgic_dist_init(struct kvm *kvm, unsigned int nr_spis) return 0; } +/* Default GICv3 Maintenance Interrupt INTID, as per SBSA */ +#define DEFAULT_MI_INTID 25 + +int kvm_vgic_vcpu_nv_init(struct kvm_vcpu *vcpu) +{ + int ret; + + guard(mutex)(&vcpu->kvm->arch.config_lock); + + /* + * Matching the tradition established with the timers, provide + * a default PPI for the maintenance interrupt. It makes + * things easier to reason about. + */ + if (vcpu->kvm->arch.vgic.mi_intid == 0) + vcpu->kvm->arch.vgic.mi_intid = DEFAULT_MI_INTID; + ret = kvm_vgic_set_owner(vcpu, vcpu->kvm->arch.vgic.mi_intid, vcpu); + + return ret; +} + static int vgic_allocate_private_irqs_locked(struct kvm_vcpu *vcpu, u32 type) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; @@ -588,12 +609,20 @@ void kvm_vgic_cpu_down(void) static irqreturn_t vgic_maintenance_handler(int irq, void *data) { + struct kvm_vcpu *vcpu = *(struct kvm_vcpu **)data; + /* * We cannot rely on the vgic maintenance interrupt to be * delivered synchronously. This means we can only use it to * exit the VM, and we perform the handling of EOIed * interrupts on the exit path (see vgic_fold_lr_state). + * + * Of course, NV throws a wrench in this plan, and needs + * something special. */ + if (vcpu && vgic_state_is_nested(vcpu)) + vgic_v3_handle_nested_maint_irq(vcpu); + return IRQ_HANDLED; } diff --git a/arch/arm64/kvm/vgic/vgic-v3-nested.c b/arch/arm64/kvm/vgic/vgic-v3-nested.c index eabff56b8b65..26585a02caa2 100644 --- a/arch/arm64/kvm/vgic/vgic-v3-nested.c +++ b/arch/arm64/kvm/vgic/vgic-v3-nested.c @@ -73,6 +73,24 @@ static DEFINE_PER_CPU(struct shadow_if, shadow_if); * interrupt. The L0 active state will be cleared by the HW if the L1 * interrupt was itself backed by a HW interrupt. * + * Maintenance Interrupt (MI) management: + * + * Since the L2 guest runs the vgic in its full glory, MIs get delivered and + * used as a handover point between L2 and L1. + * + * - on delivery of an MI to L0 while L2 is running: make the L1 MI pending, + * and let it rip. This will initiate a vcpu_put() on L2, and allow L1 to + * run and process the MI. + * + * - L1 MI is a fully virtual interrupt, not linked to the host's MI. Its + * state must be computed at each entry/exit of the guest, much like we do + * it for the PMU interrupt. + * + * - because most of the ICH_*_EL2 registers live in the VNCR page, the + * quality of emulation is poor: L1 can set up the vgic so that an MI would + * immediately fire, and not observe anything until the next exit. Trying + * to read ICH_MISR_EL2 would do the trick, for example. + * * System register emulation: * * We get two classes of registers: @@ -341,3 +359,30 @@ void vgic_v3_put_nested(struct kvm_vcpu *vcpu) shadow_if->lr_map = 0; } + +/* + * If we exit an L2 VM with a pending maintenance interrupt from the GIC, + * then we need to forward this to L1 so that it can re-sync the appropriate + * LRs and sample level-triggered interrupts again. 
+ */ +void vgic_v3_handle_nested_maint_irq(struct kvm_vcpu *vcpu) +{ + bool state = read_sysreg_s(SYS_ICH_MISR_EL2); + + /* This will force a switch back to L1 if the level is high */ + kvm_vgic_inject_irq(vcpu->kvm, vcpu, + vcpu->kvm->arch.vgic.mi_intid, state, vcpu); + + sysreg_clear_set_s(SYS_ICH_HCR_EL2, ICH_HCR_EL2_En, 0); +} + +void vgic_v3_nested_update_mi(struct kvm_vcpu *vcpu) +{ + bool level; + + level = __vcpu_sys_reg(vcpu, ICH_HCR_EL2) & ICH_HCR_EL2_En; + if (level) + level &= vgic_v3_get_misr(vcpu); + kvm_vgic_inject_irq(vcpu->kvm, vcpu, + vcpu->kvm->arch.vgic.mi_intid, level, vcpu); +} diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c index 9734a71b8561..8f8096d48925 100644 --- a/arch/arm64/kvm/vgic/vgic.c +++ b/arch/arm64/kvm/vgic/vgic.c @@ -878,6 +878,9 @@ void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu) return; } + if (vcpu_has_nv(vcpu)) + vgic_v3_nested_update_mi(vcpu); + /* An empty ap_list_head implies used_lrs == 0 */ if (list_empty(&vcpu->arch.vgic_cpu.ap_list_head)) return; @@ -921,6 +924,9 @@ void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu) * * - Otherwise, do exactly *NOTHING*. The guest state is * already loaded, and we can carry on with running it. + * + * If we have NV, but are not in a nested state, compute the + * maintenance interrupt state, as it may fire. */ if (vgic_state_is_nested(vcpu)) { if (kvm_vgic_vcpu_pending_irq(vcpu)) @@ -929,6 +935,9 @@ void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu) return; } + if (vcpu_has_nv(vcpu)) + vgic_v3_nested_update_mi(vcpu); + /* * If there are no virtual interrupts active or pending for this * VCPU, then there is no work to do and we can bail out without diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h index cf0c084e5d34..0c5a63712702 100644 --- a/arch/arm64/kvm/vgic/vgic.h +++ b/arch/arm64/kvm/vgic/vgic.h @@ -356,5 +356,7 @@ static inline bool kvm_has_gicv3(struct kvm *kvm) void vgic_v3_sync_nested(struct kvm_vcpu *vcpu); void vgic_v3_load_nested(struct kvm_vcpu *vcpu); void vgic_v3_put_nested(struct kvm_vcpu *vcpu); +void vgic_v3_handle_nested_maint_irq(struct kvm_vcpu *vcpu); +void vgic_v3_nested_update_mi(struct kvm_vcpu *vcpu); #endif diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index 1b373cb870fe..714cef854c1c 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -249,6 +249,9 @@ struct vgic_dist { int nr_spis; + /* The GIC maintenance IRQ for nested hypervisors. */ + u32 mi_intid; + /* base addresses in guest physical address space: */ gpa_t vgic_dist_base; /* distributor */ union { @@ -369,6 +372,7 @@ extern struct static_key_false vgic_v3_cpuif_trap; int kvm_set_legacy_vgic_v2_addr(struct kvm *kvm, struct kvm_arm_device_addr *dev_addr); void kvm_vgic_early_init(struct kvm *kvm); int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu); +int kvm_vgic_vcpu_nv_init(struct kvm_vcpu *vcpu); int kvm_vgic_create(struct kvm *kvm, u32 type); void kvm_vgic_destroy(struct kvm *kvm); void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu); From 69c9176c3862345850b7fecbe2546bfd051dc7da Mon Sep 17 00:00:00 2001 From: Jintack Lim Date: Tue, 25 Feb 2025 17:29:25 +0000 Subject: [PATCH 38/76] KVM: arm64: nv: Respect virtual HCR_EL2.TWx setting Forward exceptions due to WFI or WFE instructions to the virtual EL2 if they are not coming from the virtual EL2 and virtual HCR_EL2.TWx is set. 
Signed-off-by: Jintack Lim Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20250225172930.1850838-12-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/include/asm/kvm_emulate.h | 13 +++++++++++++ arch/arm64/kvm/handle_exit.c | 6 +++++- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index 78ec1ef2cfe8..d4e4212ff596 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -275,6 +275,19 @@ static __always_inline u64 kvm_vcpu_get_esr(const struct kvm_vcpu *vcpu) return vcpu->arch.fault.esr_el2; } +static inline bool guest_hyp_wfx_traps_enabled(const struct kvm_vcpu *vcpu) +{ + u64 esr = kvm_vcpu_get_esr(vcpu); + bool is_wfe = !!(esr & ESR_ELx_WFx_ISS_WFE); + u64 hcr_el2 = __vcpu_sys_reg(vcpu, HCR_EL2); + + if (!vcpu_has_nv(vcpu) || vcpu_is_el2(vcpu)) + return false; + + return ((is_wfe && (hcr_el2 & HCR_TWE)) || + (!is_wfe && (hcr_el2 & HCR_TWI))); +} + static __always_inline int kvm_vcpu_get_condition(const struct kvm_vcpu *vcpu) { u64 esr = kvm_vcpu_get_esr(vcpu); diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index 512d152233ff..b73dc26bc44b 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -129,8 +129,12 @@ static int kvm_handle_fpasimd(struct kvm_vcpu *vcpu) static int kvm_handle_wfx(struct kvm_vcpu *vcpu) { u64 esr = kvm_vcpu_get_esr(vcpu); + bool is_wfe = !!(esr & ESR_ELx_WFx_ISS_WFE); - if (esr & ESR_ELx_WFx_ISS_WFE) { + if (guest_hyp_wfx_traps_enabled(vcpu)) + return kvm_inject_nested_sync(vcpu, kvm_vcpu_get_esr(vcpu)); + + if (is_wfe) { trace_kvm_wfx_arm64(*vcpu_pc(vcpu), true); vcpu->stat.wfe_exit_stat++; } else { From 93078ae63f20f09809c51e0505f8e8cc930d60ef Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Tue, 25 Feb 2025 17:29:26 +0000 Subject: [PATCH 39/76] KVM: arm64: nv: Request vPE doorbell upon nested ERET to L2 Running an L2 guest with GICv4 enabled goes absolutely nowhere, and gets into a vicious cycle of nested ERET followed by nested exception entry into the L1. When KVM does a put on a runnable vCPU, it marks the vPE as nonresident but does not request a doorbell IRQ. Behind the scenes in the ITS driver's view of the vCPU, its_vpe::pending_last gets set to true to indicate that context is still runnable. This comes to a head when doing the nested ERET into L2. The vPE doesn't get scheduled on the redistributor as it is exclusively part of the L1's VGIC context. kvm_vgic_vcpu_pending_irq() returns true because the vPE appears runnable, and KVM does a nested exception entry into the L1 before L2 ever gets off the ground. This issue can be papered over by requesting a doorbell IRQ when descheduling a vPE as part of a nested ERET. KVM needs this anyway to kick the vCPU out of the L2 when an IRQ becomes pending for the L1. 
Link: https://lore.kernel.org/r/20240823212703.3576061-4-oliver.upton@linux.dev Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20250225172930.1850838-13-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/include/asm/kvm_host.h | 2 ++ arch/arm64/kvm/emulate-nested.c | 2 ++ arch/arm64/kvm/vgic/vgic-v4.c | 18 +++++++++++++++++- 3 files changed, 21 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 86519a73971e..74be23236aac 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -946,6 +946,8 @@ struct kvm_vcpu_arch { #define PMUSERENR_ON_CPU __vcpu_single_flag(sflags, BIT(5)) /* WFI instruction trapped */ #define IN_WFI __vcpu_single_flag(sflags, BIT(6)) +/* KVM is currently emulating a nested ERET */ +#define IN_NESTED_ERET __vcpu_single_flag(sflags, BIT(7)) /* Pointer to the vcpu's SVE FFR for sve_{save,load}_state() */ diff --git a/arch/arm64/kvm/emulate-nested.c b/arch/arm64/kvm/emulate-nested.c index 9986bb88c259..834c58750069 100644 --- a/arch/arm64/kvm/emulate-nested.c +++ b/arch/arm64/kvm/emulate-nested.c @@ -2503,6 +2503,7 @@ void kvm_emulate_nested_eret(struct kvm_vcpu *vcpu) } preempt_disable(); + vcpu_set_flag(vcpu, IN_NESTED_ERET); kvm_arch_vcpu_put(vcpu); if (!esr_iss_is_eretax(esr)) @@ -2514,6 +2515,7 @@ void kvm_emulate_nested_eret(struct kvm_vcpu *vcpu) *vcpu_cpsr(vcpu) = spsr; kvm_arch_vcpu_load(vcpu, smp_processor_id()); + vcpu_clear_flag(vcpu, IN_NESTED_ERET); preempt_enable(); kvm_pmu_nested_transition(vcpu); diff --git a/arch/arm64/kvm/vgic/vgic-v4.c b/arch/arm64/kvm/vgic/vgic-v4.c index eedecbbbcf31..0d9fb235c018 100644 --- a/arch/arm64/kvm/vgic/vgic-v4.c +++ b/arch/arm64/kvm/vgic/vgic-v4.c @@ -336,6 +336,22 @@ void vgic_v4_teardown(struct kvm *kvm) its_vm->vpes = NULL; } +static inline bool vgic_v4_want_doorbell(struct kvm_vcpu *vcpu) +{ + if (vcpu_get_flag(vcpu, IN_WFI)) + return true; + + if (likely(!vcpu_has_nv(vcpu))) + return false; + + /* + * GICv4 hardware is only ever used for the L1. Mark the vPE (i.e. the + * L1 context) nonresident and request a doorbell to kick us out of the + * L2 when an IRQ becomes pending. + */ + return vcpu_get_flag(vcpu, IN_NESTED_ERET); +} + int vgic_v4_put(struct kvm_vcpu *vcpu) { struct its_vpe *vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe; @@ -343,7 +359,7 @@ int vgic_v4_put(struct kvm_vcpu *vcpu) if (!vgic_supports_direct_msis(vcpu->kvm) || !vpe->resident) return 0; - return its_make_vpe_non_resident(vpe, !!vcpu_get_flag(vcpu, IN_WFI)); + return its_make_vpe_non_resident(vpe, vgic_v4_want_doorbell(vcpu)); } int vgic_v4_load(struct kvm_vcpu *vcpu) From 7682c023212e77babc68f04c44fe895091d9a5a9 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 25 Feb 2025 17:29:27 +0000 Subject: [PATCH 40/76] KVM: arm64: nv: Propagate used_lrs between L1 and L0 contexts We have so far made sure that L1 and L0 vgic contexts were totally independent. There is however one spot of bother with this approach, and that's in the GICv3 emulation code required by our fruity friends. The issue is that the emulation code needs to know how many LRs are in flight. And while it is easy to reach the L0 version through the vcpu pointer, doing so for the L1 is much more complicated, as these structures are private to the nested code. We could simply expose that structure and pick one or the other depending on the context, but this seems extra complexity for not much benefit. 
Instead, just propagate the number of used LRs from the nested code into the L0 context, and be done with it. Should this become a burden, it can be easily rectified. Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20250225172930.1850838-14-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/vgic/vgic-v3-nested.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/arm64/kvm/vgic/vgic-v3-nested.c b/arch/arm64/kvm/vgic/vgic-v3-nested.c index 26585a02caa2..5464ff244740 100644 --- a/arch/arm64/kvm/vgic/vgic-v3-nested.c +++ b/arch/arm64/kvm/vgic/vgic-v3-nested.c @@ -323,6 +323,12 @@ void vgic_v3_load_nested(struct kvm_vcpu *vcpu) __vgic_v3_activate_traps(cpu_if); __vgic_v3_restore_state(cpu_if); + + /* + * Propagate the number of used LRs for the benefit of the HYP + * GICv3 emulation code. Yes, this is a pretty sorry hack. + */ + vcpu->arch.vgic_cpu.vgic_v3.used_lrs = cpu_if->used_lrs; } void vgic_v3_put_nested(struct kvm_vcpu *vcpu) @@ -358,6 +364,7 @@ void vgic_v3_put_nested(struct kvm_vcpu *vcpu) } shadow_if->lr_map = 0; + vcpu->arch.vgic_cpu.vgic_v3.used_lrs = 0; } /* From 89896cc1591188bfaf2f550abd5b8119b9a9787f Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 25 Feb 2025 17:29:28 +0000 Subject: [PATCH 41/76] KVM: arm64: nv: Fold GICv3 host trapping requirements into guest setup Popular HW that is able to use NV also has a broken vgic implementation that requires trapping. On such HW, propagate the host trap bits into the guest's shadow ICH_HCR_EL2 register, making sure we don't allow an L2 guest to bring the system down. This involves a bit of tweaking so that the emulation code correctly poicks up the shadow state as needed, and to only partially sync ICH_HCR_EL2 back with the guest state to capture EOIcount. Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20250225172930.1850838-15-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/vgic/vgic-v3-nested.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic-v3-nested.c b/arch/arm64/kvm/vgic/vgic-v3-nested.c index 5464ff244740..bfa5bde1f106 100644 --- a/arch/arm64/kvm/vgic/vgic-v3-nested.c +++ b/arch/arm64/kvm/vgic/vgic-v3-nested.c @@ -296,9 +296,19 @@ static void vgic_v3_create_shadow_state(struct kvm_vcpu *vcpu, struct vgic_v3_cpu_if *s_cpu_if) { struct vgic_v3_cpu_if *host_if = &vcpu->arch.vgic_cpu.vgic_v3; + u64 val = 0; int i; - s_cpu_if->vgic_hcr = __vcpu_sys_reg(vcpu, ICH_HCR_EL2); + /* + * If we're on a system with a broken vgic that requires + * trapping, propagate the trapping requirements. + * + * Ah, the smell of rotten fruits... + */ + if (static_branch_unlikely(&vgic_v3_cpuif_trap)) + val = host_if->vgic_hcr & (ICH_HCR_EL2_TALL0 | ICH_HCR_EL2_TALL1 | + ICH_HCR_EL2_TC | ICH_HCR_EL2_TDIR); + s_cpu_if->vgic_hcr = __vcpu_sys_reg(vcpu, ICH_HCR_EL2) | val; s_cpu_if->vgic_vmcr = __vcpu_sys_reg(vcpu, ICH_VMCR_EL2); s_cpu_if->vgic_sre = host_if->vgic_sre; @@ -335,6 +345,7 @@ void vgic_v3_put_nested(struct kvm_vcpu *vcpu) { struct shadow_if *shadow_if = get_shadow_if(); struct vgic_v3_cpu_if *s_cpu_if = &shadow_if->cpuif; + u64 val; int i; __vgic_v3_save_vmcr_aprs(s_cpu_if); @@ -345,7 +356,10 @@ void vgic_v3_put_nested(struct kvm_vcpu *vcpu) * Translate the shadow state HW fields back to the virtual ones * before copying the shadow struct back to the nested one. 
*/ - __vcpu_sys_reg(vcpu, ICH_HCR_EL2) = s_cpu_if->vgic_hcr; + val = __vcpu_sys_reg(vcpu, ICH_HCR_EL2); + val &= ~ICH_HCR_EL2_EOIcount_MASK; + val |= (s_cpu_if->vgic_hcr & ICH_HCR_EL2_EOIcount_MASK); + __vcpu_sys_reg(vcpu, ICH_HCR_EL2) = val; __vcpu_sys_reg(vcpu, ICH_VMCR_EL2) = s_cpu_if->vgic_vmcr; for (i = 0; i < 4; i++) { @@ -354,7 +368,7 @@ void vgic_v3_put_nested(struct kvm_vcpu *vcpu) } for_each_set_bit(i, &shadow_if->lr_map, kvm_vgic_global_state.nr_lr) { - u64 val = __vcpu_sys_reg(vcpu, ICH_LRN(i)); + val = __vcpu_sys_reg(vcpu, ICH_LRN(i)); val &= ~ICH_LR_STATE; val |= s_cpu_if->vgic_lr[i] & ICH_LR_STATE; From faf7714a47a25c626ec7fdbd8e85c6bfcd565fdc Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Tue, 25 Feb 2025 17:29:29 +0000 Subject: [PATCH 42/76] KVM: arm64: nv: Allow userland to set VGIC maintenance IRQ The VGIC maintenance IRQ signals various conditions about the LRs, when the GIC's virtualization extension is used. So far we didn't need it, but nested virtualization needs to know about this interrupt, so add a userland interface to setup the IRQ number. The architecture mandates that it must be a PPI, on top of that this code only exports a per-device option, so the PPI is the same on all VCPUs. Signed-off-by: Andre Przywara [added some bits of documentation] Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20250225172930.1850838-16-maz@kernel.org Signed-off-by: Oliver Upton --- .../virt/kvm/devices/arm-vgic-v3.rst | 12 +++++++- arch/arm64/include/uapi/asm/kvm.h | 1 + arch/arm64/kvm/vgic/vgic-kvm-device.c | 29 +++++++++++++++++-- tools/arch/arm/include/uapi/asm/kvm.h | 1 + 4 files changed, 40 insertions(+), 3 deletions(-) diff --git a/Documentation/virt/kvm/devices/arm-vgic-v3.rst b/Documentation/virt/kvm/devices/arm-vgic-v3.rst index 5817edb4e046..e860498b1e35 100644 --- a/Documentation/virt/kvm/devices/arm-vgic-v3.rst +++ b/Documentation/virt/kvm/devices/arm-vgic-v3.rst @@ -291,8 +291,18 @@ Groups: | Aff3 | Aff2 | Aff1 | Aff0 | Errors: - ======= ============================================= -EINVAL vINTID is not multiple of 32 or info field is not VGIC_LEVEL_INFO_LINE_LEVEL ======= ============================================= + + KVM_DEV_ARM_VGIC_GRP_MAINT_IRQ + Attributes: + + The attr field of kvm_device_attr encodes the following values: + + bits: | 31 .... 5 | 4 .... 0 | + values: | RES0 | vINTID | + + The vINTID specifies which interrupt is generated when the vGIC + must generate a maintenance interrupt. This must be a PPI. 
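As an aside, userspace drives this through the vGIC device fd with KVM_SET_DEVICE_ATTR. A minimal sketch, assuming (consistently with the put_user() in the get path further down) that the vINTID is transferred as a u32 through the addr field; INTID 25 is picked purely as an example of a PPI, and as the handling code below enforces, this has to happen before the vGIC is initialized:

	/* Hedged userspace sketch: select INTID 25 (a PPI) as maintenance IRQ */
	uint32_t intid = 25;
	struct kvm_device_attr mi_attr = {
		.group = KVM_DEV_ARM_VGIC_GRP_MAINT_IRQ,
		.attr  = 0,
		.addr  = (uint64_t)(unsigned long)&intid,
	};

	if (ioctl(vgic_fd, KVM_SET_DEVICE_ATTR, &mi_attr))
		perror("KVM_SET_DEVICE_ATTR(MAINT_IRQ)");
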
diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h index 568bf858f319..fc5a641b3ed6 100644 --- a/arch/arm64/include/uapi/asm/kvm.h +++ b/arch/arm64/include/uapi/asm/kvm.h @@ -403,6 +403,7 @@ enum { #define KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS 6 #define KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO 7 #define KVM_DEV_ARM_VGIC_GRP_ITS_REGS 8 +#define KVM_DEV_ARM_VGIC_GRP_MAINT_IRQ 9 #define KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT 10 #define KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_MASK \ (0x3fffffULL << KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT) diff --git a/arch/arm64/kvm/vgic/vgic-kvm-device.c b/arch/arm64/kvm/vgic/vgic-kvm-device.c index 5f4f57aaa23e..359094f68c23 100644 --- a/arch/arm64/kvm/vgic/vgic-kvm-device.c +++ b/arch/arm64/kvm/vgic/vgic-kvm-device.c @@ -303,6 +303,12 @@ static int vgic_get_common_attr(struct kvm_device *dev, VGIC_NR_PRIVATE_IRQS, uaddr); break; } + case KVM_DEV_ARM_VGIC_GRP_MAINT_IRQ: { + u32 __user *uaddr = (u32 __user *)(long)attr->addr; + + r = put_user(dev->kvm->arch.vgic.mi_intid, uaddr); + break; + } } return r; @@ -517,7 +523,7 @@ static int vgic_v3_attr_regs_access(struct kvm_device *dev, struct vgic_reg_attr reg_attr; gpa_t addr; struct kvm_vcpu *vcpu; - bool uaccess; + bool uaccess, post_init = true; u32 val; int ret; @@ -533,6 +539,9 @@ static int vgic_v3_attr_regs_access(struct kvm_device *dev, /* Sysregs uaccess is performed by the sysreg handling code */ uaccess = false; break; + case KVM_DEV_ARM_VGIC_GRP_MAINT_IRQ: + post_init = false; + fallthrough; default: uaccess = true; } @@ -552,7 +561,7 @@ static int vgic_v3_attr_regs_access(struct kvm_device *dev, mutex_lock(&dev->kvm->arch.config_lock); - if (unlikely(!vgic_initialized(dev->kvm))) { + if (post_init != vgic_initialized(dev->kvm)) { ret = -EBUSY; goto out; } @@ -582,6 +591,19 @@ static int vgic_v3_attr_regs_access(struct kvm_device *dev, } break; } + case KVM_DEV_ARM_VGIC_GRP_MAINT_IRQ: + if (!is_write) { + val = dev->kvm->arch.vgic.mi_intid; + ret = 0; + break; + } + + ret = -EINVAL; + if ((val < VGIC_NR_PRIVATE_IRQS) && (val >= VGIC_NR_SGIS)) { + dev->kvm->arch.vgic.mi_intid = val; + ret = 0; + } + break; default: ret = -EINVAL; break; @@ -608,6 +630,7 @@ static int vgic_v3_set_attr(struct kvm_device *dev, case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS: case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: + case KVM_DEV_ARM_VGIC_GRP_MAINT_IRQ: return vgic_v3_attr_regs_access(dev, attr, true); default: return vgic_set_common_attr(dev, attr); @@ -622,6 +645,7 @@ static int vgic_v3_get_attr(struct kvm_device *dev, case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS: case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: + case KVM_DEV_ARM_VGIC_GRP_MAINT_IRQ: return vgic_v3_attr_regs_access(dev, attr, false); default: return vgic_get_common_attr(dev, attr); @@ -645,6 +669,7 @@ static int vgic_v3_has_attr(struct kvm_device *dev, case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: return vgic_v3_has_attr_regs(dev, attr); case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: + case KVM_DEV_ARM_VGIC_GRP_MAINT_IRQ: return 0; case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: { if (((attr->attr & KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_MASK) >> diff --git a/tools/arch/arm/include/uapi/asm/kvm.h b/tools/arch/arm/include/uapi/asm/kvm.h index 03cd7c19a683..d5dd96902817 100644 --- a/tools/arch/arm/include/uapi/asm/kvm.h +++ b/tools/arch/arm/include/uapi/asm/kvm.h @@ -246,6 +246,7 @@ struct kvm_vcpu_events { #define KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS 6 #define KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO 7 #define KVM_DEV_ARM_VGIC_GRP_ITS_REGS 8 
+#define KVM_DEV_ARM_VGIC_GRP_MAINT_IRQ 9 #define KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT 10 #define KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_MASK \ (0x3fffffULL << KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT) From 83c6cb20147b5ee22357f4604632df9f8ef3c17b Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 25 Feb 2025 17:29:30 +0000 Subject: [PATCH 43/76] KVM: arm64: nv: Fail KVM init if asking for NV without GICv3 Although there is nothing in NV that is fundamentally incompatible with the lack of GICv3, there is no HW implementation without one, at least on the virtual side (yes, even fruits have some form of vGICv3). We therefore make the decision to require GICv3, which will only affect models such as QEMU. Booting with a GICv2 or something even more exotic while asking for NV will result in KVM being disabled. Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20250225172930.1850838-17-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/arm.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index abf51f6cfcaf..a67653e26225 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -2325,6 +2325,13 @@ static int __init init_subsystems(void) goto out; } + if (kvm_mode == KVM_MODE_NV && + !(vgic_present && kvm_vgic_global_state.type == VGIC_V3)) { + kvm_err("NV support requires GICv3, giving up\n"); + err = -EINVAL; + goto out; + } + /* * Init HYP architected timer support */ From 9d91227364330fd1735d6fc3f7226f854f9f7b8c Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Wed, 5 Mar 2025 15:08:23 -0800 Subject: [PATCH 44/76] KVM: arm64: Copy guest CTR_EL0 into hyp VM Since commit 2843cae26644 ("KVM: arm64: Treat CTR_EL0 as a VM feature ID register") KVM has allowed userspace to configure the VM-wide view of CTR_EL0, falling back to trap-n-emulate if the value doesn't match hardware. It appears that this has worked by chance in protected-mode for some time, and on systems with FEAT_EVT protected-mode unconditionally sets TID4 (i.e. TID2 traps sans CTR_EL0). Forward the guest CTR_EL0 value through to the hyp VM and align the TID2/TID4 configuration with the non-protected setup. Fixes: 2843cae26644 ("KVM: arm64: Treat CTR_EL0 as a VM feature ID register") Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20250305230825.484091-2-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- arch/arm64/kvm/hyp/nvhe/pkvm.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 3927fe52a3dd..96348abeb5c2 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -46,7 +46,8 @@ static void pkvm_vcpu_reset_hcr(struct kvm_vcpu *vcpu) vcpu->arch.hcr_el2 |= HCR_FWB; if (cpus_have_final_cap(ARM64_HAS_EVT) && - !cpus_have_final_cap(ARM64_MISMATCHED_CACHE_TYPE)) + !cpus_have_final_cap(ARM64_MISMATCHED_CACHE_TYPE) && + kvm_read_vm_id_reg(vcpu->kvm, SYS_CTR_EL0) == read_cpuid(CTR_EL0)) vcpu->arch.hcr_el2 |= HCR_TID4; else vcpu->arch.hcr_el2 |= HCR_TID2; @@ -315,6 +316,9 @@ static void pkvm_init_features_from_host(struct pkvm_hyp_vm *hyp_vm, const struc unsigned long host_arch_flags = READ_ONCE(host_kvm->arch.flags); DECLARE_BITMAP(allowed_features, KVM_VCPU_MAX_FEATURES); + /* CTR_EL0 is always under host control, even for protected VMs. 
*/ + hyp_vm->kvm.arch.ctr_el0 = host_kvm->arch.ctr_el0; + if (test_bit(KVM_ARCH_FLAG_MTE_ENABLED, &host_kvm->arch.flags)) set_bit(KVM_ARCH_FLAG_MTE_ENABLED, &kvm->arch.flags); From 03e1b89d051fadf37c617c2fe84304c1e800da91 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Wed, 5 Mar 2025 15:08:24 -0800 Subject: [PATCH 45/76] KVM: arm64: Copy MIDR_EL1 into hyp VM when it is writable KVM recently added a capability that allows userspace to override the 'implementation ID' registers presented to the VM. MIDR_EL1 is a special example, where the hypervisor can directly set the value when read from EL1 using VPIDR_EL2. Copy the VM-wide value for MIDR_EL1 into the hyp VM for non-protected guests when the capability is enabled so VPIDR_EL2 gets set up correctly. Reported-by: Mark Brown Closes: https://lore.kernel.org/kvmarm/ac594b9c-4bbb-46c8-9391-e7a68ce4de5b@sirena.org.uk/ Fixes: 3adaee783061 ("KVM: arm64: Allow userspace to change the implementation ID registers") Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20250305230825.484091-3-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- arch/arm64/kvm/hyp/nvhe/pkvm.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 96348abeb5c2..64a4a4c95bc3 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -329,6 +329,10 @@ static void pkvm_init_features_from_host(struct pkvm_hyp_vm *hyp_vm, const struc bitmap_copy(kvm->arch.vcpu_features, host_kvm->arch.vcpu_features, KVM_VCPU_MAX_FEATURES); + + if (test_bit(KVM_ARCH_FLAG_WRITABLE_IMP_ID_REGS, &host_arch_flags)) + hyp_vm->kvm.arch.midr_el1 = host_kvm->arch.midr_el1; + return; } From 5980a693701229c02f19fec051ae0845309413c6 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Wed, 5 Mar 2025 15:08:25 -0800 Subject: [PATCH 46/76] KVM: arm64: Fix documentation for KVM_CAP_ARM_WRITABLE_IMP_ID_REGS The capability actually fails with EINVAL if vCPUs have already been created. Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20250305230825.484091-4-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- Documentation/virt/kvm/api.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 56f3dcdc4477..3468a2a7de6b 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -8264,7 +8264,7 @@ take care to differentiate between these cases. :Architectures: arm64 :Target: VM :Parameters: None -:Returns: 0 on success, -EBUSY if vCPUs have been created before enabling this +:Returns: 0 on success, -EINVAL if vCPUs have been created before enabling this capability. This capability changes the behavior of the registers that identify a PE From 44ff44cadbd144ee1159f5687a852c49c4290262 Mon Sep 17 00:00:00 2001 From: Shameer Kolothum Date: Thu, 6 Mar 2025 16:38:28 -0800 Subject: [PATCH 47/76] smccc: kvm_guest: Fix kernel builds for 32 bit arm The paravirtual implementation ID stuff is 64-bit only and broke 32-bit arm builds. Slap an ifdef bandaid on the situation to get things rolling again.
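A minimal sketch of an alternative shape (hypothetical, not what this patch does): the header could provide a no-op stub so that call sites stay unconditional on 32-bit builds:

	/* Hypothetical header-side companion to the ifdef below */
	#ifdef CONFIG_ARM64
	void __init kvm_arm_target_impl_cpu_init(void);
	#else
	static inline void kvm_arm_target_impl_cpu_init(void) { }
	#endif

The patch instead compiles the function body out, which works as long as the only callers live in arm64-specific setup code.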
Signed-off-by: Shameer Kolothum Signed-off-by: Oliver Upton --- drivers/firmware/smccc/kvm_guest.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/firmware/smccc/kvm_guest.c b/drivers/firmware/smccc/kvm_guest.c index 2f03b582c298..5767aed25cdc 100644 --- a/drivers/firmware/smccc/kvm_guest.c +++ b/drivers/firmware/smccc/kvm_guest.c @@ -55,6 +55,7 @@ bool kvm_arm_hyp_service_available(u32 func_id) } EXPORT_SYMBOL_GPL(kvm_arm_hyp_service_available); +#ifdef CONFIG_ARM64 void __init kvm_arm_target_impl_cpu_init(void) { int i; @@ -115,3 +116,4 @@ void __init kvm_arm_target_impl_cpu_init(void) mem_free: memblock_free(target, sizeof(*target) * max_cpus); } +#endif From 75ecffc361bbc85696c084f3d3c73eb207386e3f Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Wed, 5 Mar 2025 12:26:28 -0800 Subject: [PATCH 48/76] drivers/perf: apple_m1: Refactor event select/filter configuration Supporting guest mode events will necessitate programming two event filters. Prepare by splitting up the programming of the event selector + event filter into separate helpers. Opportunistically replace RMW patterns with sysreg_clear_set_s(). Tested-by: Janne Grunau Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20250305202641.428114-2-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- drivers/perf/apple_m1_cpu_pmu.c | 52 ++++++++++++++++++++------------- 1 file changed, 32 insertions(+), 20 deletions(-) diff --git a/drivers/perf/apple_m1_cpu_pmu.c b/drivers/perf/apple_m1_cpu_pmu.c index 06fd317529fc..cea80afd1253 100644 --- a/drivers/perf/apple_m1_cpu_pmu.c +++ b/drivers/perf/apple_m1_cpu_pmu.c @@ -327,11 +327,10 @@ static void m1_pmu_disable_counter_interrupt(unsigned int index) __m1_pmu_enable_counter_interrupt(index, false); } -static void m1_pmu_configure_counter(unsigned int index, u8 event, - bool user, bool kernel) +static void __m1_pmu_configure_event_filter(unsigned int index, bool user, + bool kernel) { - u64 val, user_bit, kernel_bit; - int shift; + u64 clear, set, user_bit, kernel_bit; switch (index) { case 0 ... 7: @@ -346,19 +345,24 @@ static void m1_pmu_configure_counter(unsigned int index, u8 event, BUG(); } - val = read_sysreg_s(SYS_IMP_APL_PMCR1_EL1); - + clear = set = 0; if (user) - val |= user_bit; + set |= user_bit; else - val &= ~user_bit; + clear |= user_bit; if (kernel) - val |= kernel_bit; + set |= kernel_bit; else - val &= ~kernel_bit; + clear |= kernel_bit; - write_sysreg_s(val, SYS_IMP_APL_PMCR1_EL1); + sysreg_clear_set_s(SYS_IMP_APL_PMCR1_EL1, clear, set); +} + +static void __m1_pmu_configure_eventsel(unsigned int index, u8 event) +{ + u64 clear = 0, set = 0; + int shift; /* * Counters 0 and 1 have fixed events. For anything else, @@ -371,21 +375,29 @@ static void m1_pmu_configure_counter(unsigned int index, u8 event, break; case 2 ... 5: shift = (index - 2) * 8; - val = read_sysreg_s(SYS_IMP_APL_PMESR0_EL1); - val &= ~((u64)0xff << shift); - val |= (u64)event << shift; - write_sysreg_s(val, SYS_IMP_APL_PMESR0_EL1); + clear |= (u64)0xff << shift; + set |= (u64)event << shift; + sysreg_clear_set_s(SYS_IMP_APL_PMESR0_EL1, clear, set); break; case 6 ...
9: shift = (index - 6) * 8; - val = read_sysreg_s(SYS_IMP_APL_PMESR1_EL1); - val &= ~((u64)0xff << shift); - val |= (u64)event << shift; - write_sysreg_s(val, SYS_IMP_APL_PMESR1_EL1); + clear |= (u64)0xff << shift; + set |= (u64)event << shift; + sysreg_clear_set_s(SYS_IMP_APL_PMESR1_EL1, clear, set); break; } } +static void m1_pmu_configure_counter(unsigned int index, unsigned long config_base) +{ + bool kernel = config_base & M1_PMU_CFG_COUNT_KERNEL; + bool user = config_base & M1_PMU_CFG_COUNT_USER; + u8 evt = config_base & M1_PMU_CFG_EVENT; + + __m1_pmu_configure_event_filter(index, user, kernel); + __m1_pmu_configure_eventsel(index, evt); +} + /* arm_pmu backend */ static void m1_pmu_enable_event(struct perf_event *event) { @@ -400,7 +412,7 @@ static void m1_pmu_enable_event(struct perf_event *event) m1_pmu_disable_counter(event->hw.idx); isb(); - m1_pmu_configure_counter(event->hw.idx, evt, user, kernel); + m1_pmu_configure_counter(event->hw.idx, event->hw.config_base); m1_pmu_enable_counter(event->hw.idx); m1_pmu_enable_counter_interrupt(event->hw.idx); isb(); From 46573d944f00f440dc794fa87ebbdb0dd9dbf691 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Wed, 5 Mar 2025 12:26:29 -0800 Subject: [PATCH 49/76] drivers/perf: apple_m1: Support host/guest event filtering The PMU appears to have a separate register for filtering 'guest' exception levels (i.e. EL1 and !ELIsInHost(EL0)) which has the same layout as PMCR1_EL1. Conveniently, there exists a VHE register alias (PMCR1_EL12) that can be used to configure it. Support guest events by programming the EL12 register with the intended guest kernel/userspace filters. Limit support for guest events to VHE (i.e. kernel running at EL2), as it avoids involving KVM in context switching PMU registers. VHE is the only supported mode on M* parts anyway, so this isn't an actual feature limitation.
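With these bits wired up, a host perf user can finally request mode exclusion on M* parts. A minimal sketch using the generic perf_event_open() ABI (pid/cpu left to the caller; needs <linux/perf_event.h> and <sys/syscall.h>):

	/* Count CPU cycles only while a vCPU is running (guest-only) */
	struct perf_event_attr pe_attr = {
		.type		= PERF_TYPE_HARDWARE,
		.size		= sizeof(pe_attr),
		.config		= PERF_COUNT_HW_CPU_CYCLES,
		.exclude_host	= 1,
		.exclude_guest	= 0,
	};
	int fd = syscall(__NR_perf_event_open, &pe_attr, pid, cpu, -1, 0);
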
Tested-by: Janne Grunau Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20250305202641.428114-3-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- arch/arm64/include/asm/apple_m1_pmu.h | 1 + drivers/perf/apple_m1_cpu_pmu.c | 20 ++++++++++++++++---- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/arch/arm64/include/asm/apple_m1_pmu.h b/arch/arm64/include/asm/apple_m1_pmu.h index 99483b19b99f..02e05d05851f 100644 --- a/arch/arm64/include/asm/apple_m1_pmu.h +++ b/arch/arm64/include/asm/apple_m1_pmu.h @@ -37,6 +37,7 @@ #define PMCR0_PMI_ENABLE_8_9 GENMASK(45, 44) #define SYS_IMP_APL_PMCR1_EL1 sys_reg(3, 1, 15, 1, 0) +#define SYS_IMP_APL_PMCR1_EL12 sys_reg(3, 1, 15, 7, 2) #define PMCR1_COUNT_A64_EL0_0_7 GENMASK(15, 8) #define PMCR1_COUNT_A64_EL1_0_7 GENMASK(23, 16) #define PMCR1_COUNT_A64_EL0_8_9 GENMASK(41, 40) diff --git a/drivers/perf/apple_m1_cpu_pmu.c b/drivers/perf/apple_m1_cpu_pmu.c index cea80afd1253..d6d4ff6da862 100644 --- a/drivers/perf/apple_m1_cpu_pmu.c +++ b/drivers/perf/apple_m1_cpu_pmu.c @@ -120,6 +120,8 @@ enum m1_pmu_events { */ M1_PMU_CFG_COUNT_USER = BIT(8), M1_PMU_CFG_COUNT_KERNEL = BIT(9), + M1_PMU_CFG_COUNT_HOST = BIT(10), + M1_PMU_CFG_COUNT_GUEST = BIT(11), }; /* @@ -328,7 +330,7 @@ static void m1_pmu_disable_counter_interrupt(unsigned int index) } static void __m1_pmu_configure_event_filter(unsigned int index, bool user, - bool kernel) + bool kernel, bool host) { u64 clear, set, user_bit, kernel_bit; @@ -356,7 +358,10 @@ static void __m1_pmu_configure_event_filter(unsigned int index, bool user, else clear |= kernel_bit; - sysreg_clear_set_s(SYS_IMP_APL_PMCR1_EL1, clear, set); + if (host) + sysreg_clear_set_s(SYS_IMP_APL_PMCR1_EL1, clear, set); + else if (is_kernel_in_hyp_mode()) + sysreg_clear_set_s(SYS_IMP_APL_PMCR1_EL12, clear, set); } static void __m1_pmu_configure_eventsel(unsigned int index, u8 event) @@ -391,10 +396,13 @@ static void __m1_pmu_configure_eventsel(unsigned int index, u8 event) static void m1_pmu_configure_counter(unsigned int index, unsigned long config_base) { bool kernel = config_base & M1_PMU_CFG_COUNT_KERNEL; + bool guest = config_base & M1_PMU_CFG_COUNT_GUEST; + bool host = config_base & M1_PMU_CFG_COUNT_HOST; bool user = config_base & M1_PMU_CFG_COUNT_USER; u8 evt = config_base & M1_PMU_CFG_EVENT; - __m1_pmu_configure_event_filter(index, user, kernel); + __m1_pmu_configure_event_filter(index, user && host, kernel && host, true); + __m1_pmu_configure_event_filter(index, user && guest, kernel && guest, false); __m1_pmu_configure_eventsel(index, evt); } @@ -570,7 +578,7 @@ static int m1_pmu_set_event_filter(struct hw_perf_event *event, { unsigned long config_base = 0; - if (!attr->exclude_guest) { + if (!attr->exclude_guest && !is_kernel_in_hyp_mode()) { pr_debug("ARM performance counters do not support mode exclusion\n"); return -EOPNOTSUPP; } @@ -578,6 +586,10 @@ static int m1_pmu_set_event_filter(struct hw_perf_event *event, config_base |= M1_PMU_CFG_COUNT_KERNEL; if (!attr->exclude_user) config_base |= M1_PMU_CFG_COUNT_USER; + if (!attr->exclude_host) + config_base |= M1_PMU_CFG_COUNT_HOST; + if (!attr->exclude_guest) + config_base |= M1_PMU_CFG_COUNT_GUEST; event->config_base = config_base; From 93b01528586b693a3ab31ed7be92b2347a9416ca Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Wed, 5 Mar 2025 12:26:30 -0800 Subject: [PATCH 50/76] KVM: arm64: Compute PMCEID from arm_pmu's event bitmaps The PMUv3 driver populates a couple of bitmaps with the values of PMCEID{0,1}, from which the guest's PMCEID{0,1} can be 
derived. This is particularly convenient when virtualizing PMUv3 on IMP DEF hardware, as reading the nonexistent PMCEID registers leads to a rather unpleasant UNDEF. Tested-by: Janne Grunau Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20250305202641.428114-4-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- arch/arm64/kvm/pmu-emul.c | 47 ++++++++++++++++++++++++++++++--------- 1 file changed, 36 insertions(+), 11 deletions(-) diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index 6c5950b9ceac..104672a0c5a2 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -842,8 +842,42 @@ static struct arm_pmu *kvm_pmu_probe_armpmu(void) return pmu; } +static u64 __compute_pmceid(struct arm_pmu *pmu, bool pmceid1) +{ + u32 hi[2], lo[2]; + + bitmap_to_arr32(lo, pmu->pmceid_bitmap, ARMV8_PMUV3_MAX_COMMON_EVENTS); + bitmap_to_arr32(hi, pmu->pmceid_ext_bitmap, ARMV8_PMUV3_MAX_COMMON_EVENTS); + + return ((u64)hi[pmceid1] << 32) | lo[pmceid1]; +} + +static u64 compute_pmceid0(struct arm_pmu *pmu) +{ + u64 val = __compute_pmceid(pmu, 0); + + /* always support CHAIN */ + val |= BIT(ARMV8_PMUV3_PERFCTR_CHAIN); + return val; +} + +static u64 compute_pmceid1(struct arm_pmu *pmu) +{ + u64 val = __compute_pmceid(pmu, 1); + + /* + * Don't advertise STALL_SLOT*, as PMMIR_EL0 is handled + * as RAZ + */ + val &= ~(BIT_ULL(ARMV8_PMUV3_PERFCTR_STALL_SLOT - 32) | + BIT_ULL(ARMV8_PMUV3_PERFCTR_STALL_SLOT_FRONTEND - 32) | + BIT_ULL(ARMV8_PMUV3_PERFCTR_STALL_SLOT_BACKEND - 32)); + return val; +} + u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1) { + struct arm_pmu *cpu_pmu = vcpu->kvm->arch.arm_pmu; unsigned long *bmap = vcpu->kvm->arch.pmu_filter; u64 val, mask = 0; int base, i, nr_events; @@ -852,19 +886,10 @@ u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1) return 0; if (!pmceid1) { - val = read_sysreg(pmceid0_el0); - /* always support CHAIN */ - val |= BIT(ARMV8_PMUV3_PERFCTR_CHAIN); + val = compute_pmceid0(cpu_pmu); base = 0; } else { - val = read_sysreg(pmceid1_el0); - /* - * Don't advertise STALL_SLOT*, as PMMIR_EL0 is handled - * as RAZ - */ - val &= ~(BIT_ULL(ARMV8_PMUV3_PERFCTR_STALL_SLOT - 32) | - BIT_ULL(ARMV8_PMUV3_PERFCTR_STALL_SLOT_FRONTEND - 32) | - BIT_ULL(ARMV8_PMUV3_PERFCTR_STALL_SLOT_BACKEND - 32)); + val = compute_pmceid1(cpu_pmu); base = 32; } From ed335722b4571c438c008fc96bf86b9d4705a60f Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Wed, 5 Mar 2025 12:26:31 -0800 Subject: [PATCH 51/76] KVM: arm64: Always support SW_INCR PMU event Support for SW_INCR is unconditional, as KVM traps accesses to PMSWINC_EL0 and emulates the intended event increment. While it is expected that ~all PMUv3 implementations already advertise this event, non-PMUv3 hardware may not. 
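As a reminder of what is being emulated: a guest increments a SW_INCR-programmed counter by writing PMSWINC_EL0, which KVM traps. A minimal guest-side sketch:

	/* Software-increment event counter 'n' from the guest */
	static inline void pmu_sw_incr(unsigned int n)
	{
		asm volatile("msr pmswinc_el0, %0" :: "r" (1UL << n));
	}
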
Tested-by: Janne Grunau Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20250305202641.428114-5-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- arch/arm64/kvm/pmu-emul.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index 104672a0c5a2..62349b670cf9 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -856,6 +856,8 @@ static u64 compute_pmceid0(struct arm_pmu *pmu) { u64 val = __compute_pmceid(pmu, 0); + /* always support SW_INCR */ + val |= BIT(ARMV8_PMUV3_PERFCTR_SW_INCR); /* always support CHAIN */ val |= BIT(ARMV8_PMUV3_PERFCTR_CHAIN); return val; From 6f34024d185e2fa6636c660431c4595a6529328d Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Wed, 5 Mar 2025 12:26:32 -0800 Subject: [PATCH 52/76] KVM: arm64: Use a cpucap to determine if system supports FEAT_PMUv3 KVM is about to learn some new tricks to virtualize PMUv3 on IMPDEF hardware. As part of that, we now need to differentiate host support from guest support for PMUv3. Add a cpucap to determine if an architectural PMUv3 is present to guard host usage of PMUv3 controls. Tested-by: Janne Grunau Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20250305202641.428114-6-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- arch/arm64/include/asm/cpucaps.h | 2 ++ arch/arm64/include/asm/cpufeature.h | 5 +++++ arch/arm64/kernel/cpufeature.c | 30 +++++++++++++++++++++++++ arch/arm64/kvm/hyp/include/hyp/switch.h | 4 ++-- arch/arm64/kvm/pmu.c | 10 ++++----- arch/arm64/tools/cpucaps | 1 + include/kvm/arm_pmu.h | 2 +- 7 files changed, 46 insertions(+), 8 deletions(-) diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h index 0b5ca6e0eb09..9d769291a306 100644 --- a/arch/arm64/include/asm/cpucaps.h +++ b/arch/arm64/include/asm/cpucaps.h @@ -71,6 +71,8 @@ cpucap_is_possible(const unsigned int cap) * KVM MPAM support doesn't rely on the host kernel supporting MPAM. */ return true; + case ARM64_HAS_PMUV3: + return IS_ENABLED(CONFIG_HW_PERF_EVENTS); } return true; diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index e0e4478f5fb5..0eff048848b8 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -866,6 +866,11 @@ static __always_inline bool system_supports_mpam_hcr(void) return alternative_has_cap_unlikely(ARM64_MPAM_HCR); } +static inline bool system_supports_pmuv3(void) +{ + return cpus_have_final_cap(ARM64_HAS_PMUV3); +} + int do_emulate_mrs(struct pt_regs *regs, u32 sys_reg, u32 rt); bool try_emulate_mrs(struct pt_regs *regs, u32 isn); diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index d561cf3b8ac7..adb26697d908 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -1898,6 +1898,28 @@ static bool has_lpa2(const struct arm64_cpu_capabilities *entry, int scope) } #endif +#ifdef CONFIG_HW_PERF_EVENTS +static bool has_pmuv3(const struct arm64_cpu_capabilities *entry, int scope) +{ + u64 dfr0 = read_sanitised_ftr_reg(SYS_ID_AA64DFR0_EL1); + unsigned int pmuver; + + /* + * PMUVer follows the standard ID scheme for an unsigned field with the + * exception of 0xF (IMP_DEF) which is treated specially and implies + * FEAT_PMUv3 is not implemented. + * + * See DDI0487L.a D24.1.3.2 for more details. 
+ */ + pmuver = cpuid_feature_extract_unsigned_field(dfr0, + ID_AA64DFR0_EL1_PMUVer_SHIFT); + if (pmuver == ID_AA64DFR0_EL1_PMUVer_IMP_DEF) + return false; + + return pmuver >= ID_AA64DFR0_EL1_PMUVer_IMP; +} +#endif + #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 #define KPTI_NG_TEMP_VA (-(1UL << PMD_SHIFT)) @@ -2998,6 +3020,14 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .matches = has_cpuid_feature, ARM64_CPUID_FIELDS(ID_AA64PFR1_EL1, GCS, IMP) }, +#endif +#ifdef CONFIG_HW_PERF_EVENTS + { + .desc = "PMUv3", + .capability = ARM64_HAS_PMUV3, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_pmuv3, + }, #endif {}, }; diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 23bbe28eaaf9..b741ea6aefa5 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -244,7 +244,7 @@ static inline void __activate_traps_common(struct kvm_vcpu *vcpu) * counter, which could make a PMXEVCNTR_EL0 access UNDEF at * EL1 instead of being trapped to EL2. */ - if (kvm_arm_support_pmu_v3()) { + if (system_supports_pmuv3()) { struct kvm_cpu_context *hctxt; write_sysreg(0, pmselr_el0); @@ -281,7 +281,7 @@ static inline void __deactivate_traps_common(struct kvm_vcpu *vcpu) write_sysreg(*host_data_ptr(host_debug_state.mdcr_el2), mdcr_el2); write_sysreg(0, hstr_el2); - if (kvm_arm_support_pmu_v3()) { + if (system_supports_pmuv3()) { struct kvm_cpu_context *hctxt; hctxt = host_data_ptr(host_ctxt); diff --git a/arch/arm64/kvm/pmu.c b/arch/arm64/kvm/pmu.c index 0b3adf3e17b4..6b48a3d16d0d 100644 --- a/arch/arm64/kvm/pmu.c +++ b/arch/arm64/kvm/pmu.c @@ -41,7 +41,7 @@ void kvm_set_pmu_events(u64 set, struct perf_event_attr *attr) { struct kvm_pmu_events *pmu = kvm_get_pmu_events(); - if (!kvm_arm_support_pmu_v3() || !kvm_pmu_switch_needed(attr)) + if (!system_supports_pmuv3() || !kvm_pmu_switch_needed(attr)) return; if (!attr->exclude_host) @@ -57,7 +57,7 @@ void kvm_clr_pmu_events(u64 clr) { struct kvm_pmu_events *pmu = kvm_get_pmu_events(); - if (!kvm_arm_support_pmu_v3()) + if (!system_supports_pmuv3()) return; pmu->events_host &= ~clr; @@ -133,7 +133,7 @@ void kvm_vcpu_pmu_restore_guest(struct kvm_vcpu *vcpu) struct kvm_pmu_events *pmu; u64 events_guest, events_host; - if (!kvm_arm_support_pmu_v3() || !has_vhe()) + if (!system_supports_pmuv3() || !has_vhe()) return; preempt_disable(); @@ -154,7 +154,7 @@ void kvm_vcpu_pmu_restore_host(struct kvm_vcpu *vcpu) struct kvm_pmu_events *pmu; u64 events_guest, events_host; - if (!kvm_arm_support_pmu_v3() || !has_vhe()) + if (!system_supports_pmuv3() || !has_vhe()) return; pmu = kvm_get_pmu_events(); @@ -180,7 +180,7 @@ bool kvm_set_pmuserenr(u64 val) struct kvm_cpu_context *hctxt; struct kvm_vcpu *vcpu; - if (!kvm_arm_support_pmu_v3() || !has_vhe()) + if (!system_supports_pmuv3() || !has_vhe()) return false; vcpu = kvm_get_running_vcpu(); diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps index 1e65f2fb45bd..ee4316cb3690 100644 --- a/arch/arm64/tools/cpucaps +++ b/arch/arm64/tools/cpucaps @@ -45,6 +45,7 @@ HAS_LSE_ATOMICS HAS_MOPS HAS_NESTED_VIRT HAS_PAN +HAS_PMUV3 HAS_S1PIE HAS_S1POE HAS_RAS_EXTN diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h index 147bd3ee4f7b..3a8edd78240f 100644 --- a/include/kvm/arm_pmu.h +++ b/include/kvm/arm_pmu.h @@ -86,7 +86,7 @@ void kvm_vcpu_pmu_resync_el0(void); */ #define kvm_pmu_update_vcpu_events(vcpu) \ do { \ - if (!has_vhe() && kvm_arm_support_pmu_v3()) \ + if (!has_vhe() && system_supports_pmuv3()) \ 
vcpu->arch.pmu.events = *kvm_get_pmu_events(); \ } while (0) From a38b67d1518331a4147c52c4fa563846ea0969f9 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Wed, 5 Mar 2025 12:26:33 -0800 Subject: [PATCH 53/76] KVM: arm64: Drop kvm_arm_pmu_available static key With the PMUv3 cpucap, kvm_arm_pmu_available is no longer used in the hot path of guest entry/exit. On top of that, guest support for PMUv3 may not correlate with host support for the feature, e.g. on IMPDEF hardware. Throw out the static key and just inspect the list of PMUs to determine if PMUv3 is supported for KVM guests. Tested-by: Janne Grunau Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20250305202641.428114-7-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- arch/arm64/kernel/image-vars.h | 5 ----- arch/arm64/kvm/arm.c | 4 ++-- arch/arm64/kvm/pmu-emul.c | 11 ++++++----- include/kvm/arm_pmu.h | 10 ++-------- 4 files changed, 10 insertions(+), 20 deletions(-) diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index ef3a69cc398e..e705c64138ce 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -112,11 +112,6 @@ KVM_NVHE_ALIAS(broken_cntvoff_key); KVM_NVHE_ALIAS(__start___kvm_ex_table); KVM_NVHE_ALIAS(__stop___kvm_ex_table); -/* PMU available static key */ -#ifdef CONFIG_HW_PERF_EVENTS -KVM_NVHE_ALIAS(kvm_arm_pmu_available); -#endif - /* Position-independent library routines */ KVM_NVHE_ALIAS_HYP(clear_page, __pi_clear_page); KVM_NVHE_ALIAS_HYP(copy_page, __pi_copy_page); diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index b8e55a441282..dc27d4eb503a 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -366,7 +366,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = get_num_wrps(); break; case KVM_CAP_ARM_PMU_V3: - r = kvm_arm_support_pmu_v3(); + r = kvm_supports_guest_pmuv3(); break; case KVM_CAP_ARM_INJECT_SERROR_ESR: r = cpus_have_final_cap(ARM64_HAS_RAS_EXTN); @@ -1388,7 +1388,7 @@ static unsigned long system_supported_vcpu_features(void) if (!cpus_have_final_cap(ARM64_HAS_32BIT_EL1)) clear_bit(KVM_ARM_VCPU_EL1_32BIT, &features); - if (!kvm_arm_support_pmu_v3()) + if (!kvm_supports_guest_pmuv3()) clear_bit(KVM_ARM_VCPU_PMU_V3, &features); if (!system_supports_sve()) diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index 62349b670cf9..120f48136a0f 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -17,8 +17,6 @@ #define PERF_ATTR_CFG1_COUNTER_64BIT BIT(0) -DEFINE_STATIC_KEY_FALSE(kvm_arm_pmu_available); - static LIST_HEAD(arm_pmus); static DEFINE_MUTEX(arm_pmus_lock); @@ -26,6 +24,12 @@ static void kvm_pmu_create_perf_event(struct kvm_pmc *pmc); static void kvm_pmu_release_perf_event(struct kvm_pmc *pmc); static bool kvm_pmu_counter_is_enabled(struct kvm_pmc *pmc); +bool kvm_supports_guest_pmuv3(void) +{ + guard(mutex)(&arm_pmus_lock); + return !list_empty(&arm_pmus); +} + static struct kvm_vcpu *kvm_pmc_to_vcpu(const struct kvm_pmc *pmc) { return container_of(pmc, struct kvm_vcpu, arch.pmu.pmc[pmc->idx]); @@ -795,9 +799,6 @@ void kvm_host_pmu_init(struct arm_pmu *pmu) entry->arm_pmu = pmu; list_add_tail(&entry->entry, &arm_pmus); - if (list_is_singular(&arm_pmus)) - static_branch_enable(&kvm_arm_pmu_available); - out_unlock: mutex_unlock(&arm_pmus_lock); } diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h index 3a8edd78240f..58fc7f932b3f 100644 --- a/include/kvm/arm_pmu.h +++ b/include/kvm/arm_pmu.h @@ -37,13 +37,7 @@ struct arm_pmu_entry { struct arm_pmu *arm_pmu; 
}; -DECLARE_STATIC_KEY_FALSE(kvm_arm_pmu_available); - -static __always_inline bool kvm_arm_support_pmu_v3(void) -{ - return static_branch_likely(&kvm_arm_pmu_available); -} - +bool kvm_supports_guest_pmuv3(void); #define kvm_arm_pmu_irq_initialized(v) ((v)->arch.pmu.irq_num >= VGIC_NR_SGIS) u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u64 select_idx); void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu, u64 select_idx, u64 val); @@ -102,7 +96,7 @@ void kvm_pmu_nested_transition(struct kvm_vcpu *vcpu); struct kvm_pmu { }; -static inline bool kvm_arm_support_pmu_v3(void) +static inline bool kvm_supports_guest_pmuv3(void) { return false; } From 3d6d9172128ef6219b737d0b05a225f37b0f3ab6 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Wed, 5 Mar 2025 12:26:34 -0800 Subject: [PATCH 54/76] KVM: arm64: Use guard() to cleanup usage of arm_pmus_lock Get rid of some goto label patterns by using guard() to drop the arm_pmus_lock when returning from a function. Tested-by: Janne Grunau Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20250305202641.428114-8-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- arch/arm64/kvm/pmu-emul.c | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index 120f48136a0f..fee3d0003d54 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -790,26 +790,23 @@ void kvm_host_pmu_init(struct arm_pmu *pmu) if (!pmuv3_implemented(kvm_arm_pmu_get_pmuver_limit())) return; - mutex_lock(&arm_pmus_lock); + guard(mutex)(&arm_pmus_lock); entry = kmalloc(sizeof(*entry), GFP_KERNEL); if (!entry) - goto out_unlock; + return; entry->arm_pmu = pmu; list_add_tail(&entry->entry, &arm_pmus); - -out_unlock: - mutex_unlock(&arm_pmus_lock); } static struct arm_pmu *kvm_pmu_probe_armpmu(void) { - struct arm_pmu *tmp, *pmu = NULL; struct arm_pmu_entry *entry; + struct arm_pmu *pmu; int cpu; - mutex_lock(&arm_pmus_lock); + guard(mutex)(&arm_pmus_lock); /* * It is safe to use a stale cpu to iterate the list of PMUs so long as @@ -830,17 +827,13 @@ static struct arm_pmu *kvm_pmu_probe_armpmu(void) */ cpu = raw_smp_processor_id(); list_for_each_entry(entry, &arm_pmus, entry) { - tmp = entry->arm_pmu; + pmu = entry->arm_pmu; - if (cpumask_test_cpu(cpu, &tmp->supported_cpus)) { - pmu = tmp; - break; - } + if (cpumask_test_cpu(cpu, &pmu->supported_cpus)) + return pmu; } - mutex_unlock(&arm_pmus_lock); - - return pmu; + return NULL; } static u64 __compute_pmceid(struct arm_pmu *pmu, bool pmceid1) From 56290316a443709957f958e4e40bc93e0213bcc3 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Wed, 5 Mar 2025 12:26:35 -0800 Subject: [PATCH 55/76] KVM: arm64: Move PMUVer filtering into KVM code The supported guest PMU version on a particular platform is ultimately a KVM decision. Move PMUVer filtering into KVM code. 
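For illustration, the resulting limit for a few host PMUVer values (field encodings per ID_AA64DFR0_EL1; treat this as a worked example, not an exhaustive table):

	/*
	 * PMUVer = 0x1 (FEAT_PMUv3)    -> limit 0x1
	 * PMUVer = 0x9 (FEAT_PMUv3p9)  -> limit 0x6 (capped at PMUv3p5)
	 * PMUVer = 0xF (IMP_DEF)       -> limit 0   (no PMUv3 for the guest)
	 */
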
Tested-by: Janne Grunau Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20250305202641.428114-9-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- arch/arm64/include/asm/cpufeature.h | 23 ----------------------- arch/arm64/kvm/pmu-emul.c | 15 +++++++++------ 2 files changed, 9 insertions(+), 29 deletions(-) diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 0eff048848b8..c4326f1cb917 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -525,29 +525,6 @@ cpuid_feature_extract_unsigned_field(u64 features, int field) return cpuid_feature_extract_unsigned_field_width(features, field, 4); } -/* - * Fields that identify the version of the Performance Monitors Extension do - * not follow the standard ID scheme. See ARM DDI 0487E.a page D13-2825, - * "Alternative ID scheme used for the Performance Monitors Extension version". - */ -static inline u64 __attribute_const__ -cpuid_feature_cap_perfmon_field(u64 features, int field, u64 cap) -{ - u64 val = cpuid_feature_extract_unsigned_field(features, field); - u64 mask = GENMASK_ULL(field + 3, field); - - /* Treat IMPLEMENTATION DEFINED functionality as unimplemented */ - if (val == ID_AA64DFR0_EL1_PMUVer_IMP_DEF) - val = 0; - - if (val > cap) { - features &= ~mask; - features |= (cap << field) & mask; - } - - return features; -} - static inline u64 arm64_ftr_mask(const struct arm64_ftr_bits *ftrp) { return (u64)GENMASK(ftrp->shift + ftrp->width - 1, ftrp->shift); diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index fee3d0003d54..0a2023aabe18 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -1226,13 +1226,16 @@ int kvm_arm_pmu_v3_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) u8 kvm_arm_pmu_get_pmuver_limit(void) { - u64 tmp; + unsigned int pmuver; - tmp = read_sanitised_ftr_reg(SYS_ID_AA64DFR0_EL1); - tmp = cpuid_feature_cap_perfmon_field(tmp, - ID_AA64DFR0_EL1_PMUVer_SHIFT, - ID_AA64DFR0_EL1_PMUVer_V3P5); - return FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer), tmp); + pmuver = SYS_FIELD_GET(ID_AA64DFR0_EL1, PMUVer, + read_sanitised_ftr_reg(SYS_ID_AA64DFR0_EL1)); + + /* Treat IMPLEMENTATION DEFINED functionality as unimplemented */ + if (pmuver == ID_AA64DFR0_EL1_PMUVer_IMP_DEF) + return 0; + + return min(pmuver, ID_AA64DFR0_EL1_PMUVer_V3P5); } /** From 2c433f70dccc1af6922931404bfe23d215bdfff0 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Wed, 5 Mar 2025 12:26:36 -0800 Subject: [PATCH 56/76] KVM: arm64: Compute synthetic sysreg ESR for Apple PMUv3 traps Apple M* CPUs provide an IMPDEF trap for PMUv3 sysregs, where ESR_EL2.EC is a reserved value (0x3F) and a sysreg-like ISS is reported in AFSR1_EL2. Compute a synthetic ESR for these PMUv3 traps, giving the illusion of something architectural to the rest of KVM. 
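Because the synthetic ESR is a regular ESR_ELx_EC_SYS64 one, the rest of the sysreg trap machinery needs no Apple-specific knowledge. A sketch of what the common path can then rely on (helpers from asm/esr.h and asm/kvm_emulate.h):

	u64 esr = kvm_vcpu_get_esr(vcpu);
	u32 sysreg = esr_sys64_to_sysreg(esr);		/* Op0/Op1/CRn/CRm/Op2 */
	int rt = kvm_vcpu_sys_get_rt(vcpu);		/* ISS[9:5] */
	bool is_read = esr & ESR_ELx_SYS64_ISS_DIR_MASK;
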
Tested-by: Janne Grunau Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20250305202641.428114-10-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- arch/arm64/kvm/hyp/vhe/switch.c | 22 ++++++++++++++++++++++ arch/arm64/tools/cpucaps | 1 + 2 files changed, 23 insertions(+) diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index 647737d6e8d0..731a0378ed13 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -527,6 +527,25 @@ static bool kvm_hyp_handle_sysreg_vhe(struct kvm_vcpu *vcpu, u64 *exit_code) return kvm_hyp_handle_sysreg(vcpu, exit_code); } +static bool kvm_hyp_handle_impdef(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + u64 iss; + + if (!cpus_have_final_cap(ARM64_WORKAROUND_PMUV3_IMPDEF_TRAPS)) + return false; + + /* + * Compute a synthetic ESR for a sysreg trap. Conveniently, AFSR1_EL2 + * is populated with a correct ISS for a sysreg trap. These fruity + * parts are 64bit only, so unconditionally set IL. + */ + iss = ESR_ELx_ISS(read_sysreg_s(SYS_AFSR1_EL2)); + vcpu->arch.fault.esr_el2 = FIELD_PREP(ESR_ELx_EC_MASK, ESR_ELx_EC_SYS64) | + FIELD_PREP(ESR_ELx_ISS_MASK, iss) | + ESR_ELx_IL; + return false; +} + static const exit_handler_fn hyp_exit_handlers[] = { [0 ... ESR_ELx_EC_MAX] = NULL, [ESR_ELx_EC_CP15_32] = kvm_hyp_handle_cp15_32, @@ -538,6 +557,9 @@ static const exit_handler_fn hyp_exit_handlers[] = { [ESR_ELx_EC_WATCHPT_LOW] = kvm_hyp_handle_watchpt_low, [ESR_ELx_EC_ERET] = kvm_hyp_handle_eret, [ESR_ELx_EC_MOPS] = kvm_hyp_handle_mops, + + /* Apple shenanigans */ + [0x3F] = kvm_hyp_handle_impdef, }; static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code) diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps index ee4316cb3690..772c1b008e43 100644 --- a/arch/arm64/tools/cpucaps +++ b/arch/arm64/tools/cpucaps @@ -105,6 +105,7 @@ WORKAROUND_CAVIUM_TX2_219_TVM WORKAROUND_CLEAN_CACHE WORKAROUND_DEVICE_LOAD_ACQUIRE WORKAROUND_NVIDIA_CARMEL_CNP +WORKAROUND_PMUV3_IMPDEF_TRAPS WORKAROUND_QCOM_FALKOR_E1003 WORKAROUND_QCOM_ORYON_CNTVOFF WORKAROUND_REPEAT_TLBI From bed9b8ec8c71135c59004d12658d8affe232f27e Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Wed, 5 Mar 2025 12:26:37 -0800 Subject: [PATCH 57/76] KVM: arm64: Advertise PMUv3 if IMPDEF traps are present Advertise a baseline PMUv3 implementation when running on hardware with IMPDEF traps of the PMUv3 sysregs. 
Tested-by: Janne Grunau Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20250305202641.428114-11-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- arch/arm64/kvm/pmu-emul.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index 0a2023aabe18..38d9490c17fd 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -1231,7 +1231,17 @@ u8 kvm_arm_pmu_get_pmuver_limit(void) pmuver = SYS_FIELD_GET(ID_AA64DFR0_EL1, PMUVer, read_sanitised_ftr_reg(SYS_ID_AA64DFR0_EL1)); - /* Treat IMPLEMENTATION DEFINED functionality as unimplemented */ + /* + * Spoof a barebones PMUv3 implementation if the system supports IMPDEF + * traps of the PMUv3 sysregs + */ + if (cpus_have_final_cap(ARM64_WORKAROUND_PMUV3_IMPDEF_TRAPS)) + return ID_AA64DFR0_EL1_PMUVer_IMP; + + /* + * Otherwise, treat IMPLEMENTATION DEFINED functionality as + * unimplemented + */ if (pmuver == ID_AA64DFR0_EL1_PMUVer_IMP_DEF) return 0; From 1e7dcbfa4b7cddfbe1cde6067d135f5452692256 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Wed, 5 Mar 2025 12:26:38 -0800 Subject: [PATCH 58/76] KVM: arm64: Remap PMUv3 events onto hardware Map PMUv3 event IDs onto hardware, if the driver exposes such a helper. This is expected to be quite rare, and only useful for non-PMUv3 hardware. Tested-by: Janne Grunau Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20250305202641.428114-12-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- arch/arm64/kvm/pmu-emul.c | 25 ++++++++++++++++++++++++- include/linux/perf/arm_pmu.h | 4 ++++ 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index 38d9490c17fd..5f4e9be8aa8b 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -677,6 +677,20 @@ static bool kvm_pmc_counts_at_el2(struct kvm_pmc *pmc) return kvm_pmc_read_evtreg(pmc) & ARMV8_PMU_INCLUDE_EL2; } +static int kvm_map_pmu_event(struct kvm *kvm, unsigned int eventsel) +{ + struct arm_pmu *pmu = kvm->arch.arm_pmu; + + /* + * The CPU PMU likely isn't PMUv3; let the driver provide a mapping + * for the guest's PMUv3 event ID. + */ + if (unlikely(pmu->map_pmuv3_event)) + return pmu->map_pmuv3_event(eventsel); + + return eventsel; +} + /** * kvm_pmu_create_perf_event - create a perf event for a counter * @pmc: Counter context @@ -687,7 +701,8 @@ static void kvm_pmu_create_perf_event(struct kvm_pmc *pmc) struct arm_pmu *arm_pmu = vcpu->kvm->arch.arm_pmu; struct perf_event *event; struct perf_event_attr attr; - u64 eventsel, evtreg; + int eventsel; + u64 evtreg; evtreg = kvm_pmc_read_evtreg(pmc); @@ -713,6 +728,14 @@ static void kvm_pmu_create_perf_event(struct kvm_pmc *pmc) !test_bit(eventsel, vcpu->kvm->arch.pmu_filter)) return; + /* + * Don't create an event if we're running on hardware that requires + * PMUv3 event translation and we couldn't find a valid mapping. + */ + eventsel = kvm_map_pmu_event(vcpu->kvm, eventsel); + if (eventsel < 0) + return; + memset(&attr, 0, sizeof(struct perf_event_attr)); attr.type = arm_pmu->pmu.type; attr.size = sizeof(attr); diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h index 4b5b83677e3f..7ce6dea5bfa9 100644 --- a/include/linux/perf/arm_pmu.h +++ b/include/linux/perf/arm_pmu.h @@ -100,6 +100,10 @@ struct arm_pmu { void (*stop)(struct arm_pmu *); void (*reset)(void *); int (*map_event)(struct perf_event *event); + /* + * Called by KVM to map the PMUv3 event space onto non-PMUv3 hardware. 
+ */ + int (*map_pmuv3_event)(unsigned int eventsel); DECLARE_BITMAP(cntr_mask, ARMPMU_MAX_HWEVENTS); bool secure_access; /* 32-bit ARM only */ #define ARMV8_PMUV3_MAX_COMMON_EVENTS 0x40 From 2d00cab849be5e4c1b2ba82d4fdcfa0af48c340f Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Wed, 5 Mar 2025 12:26:39 -0800 Subject: [PATCH 59/76] drivers/perf: apple_m1: Provide helper for mapping PMUv3 events Apple M* parts carry some IMP DEF traps for guest accesses to PMUv3 registers, even though the underlying hardware doesn't implement PMUv3. This means it is possible to virtualize PMUv3 for KVM guests. Add a helper for mapping common PMUv3 event IDs onto hardware event IDs, keeping the implementation-specific crud in the PMU driver rather than KVM proper. Populate the pmceid_bitmap based on the supported events so KVM can provide synthetic PMCEID* values to the guest. Tested-by: Janne Grunau Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20250305202641.428114-13-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- drivers/perf/apple_m1_cpu_pmu.c | 35 +++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/drivers/perf/apple_m1_cpu_pmu.c b/drivers/perf/apple_m1_cpu_pmu.c index d6d4ff6da862..6be703619a97 100644 --- a/drivers/perf/apple_m1_cpu_pmu.c +++ b/drivers/perf/apple_m1_cpu_pmu.c @@ -12,6 +12,7 @@ #include #include +#include #include #include @@ -174,6 +175,17 @@ static const unsigned m1_pmu_perf_map[PERF_COUNT_HW_MAX] = { [PERF_COUNT_HW_BRANCH_MISSES] = M1_PMU_PERFCTR_BRANCH_MISPRED_NONSPEC, }; +#define M1_PMUV3_EVENT_MAP(pmuv3_event, m1_event) \ + [ARMV8_PMUV3_PERFCTR_##pmuv3_event] = M1_PMU_PERFCTR_##m1_event + +static const u16 m1_pmu_pmceid_map[ARMV8_PMUV3_MAX_COMMON_EVENTS] = { + [0 ... ARMV8_PMUV3_MAX_COMMON_EVENTS - 1] = HW_OP_UNSUPPORTED, + M1_PMUV3_EVENT_MAP(INST_RETIRED, INST_ALL), + M1_PMUV3_EVENT_MAP(CPU_CYCLES, CORE_ACTIVE_CYCLE), + M1_PMUV3_EVENT_MAP(BR_RETIRED, INST_BRANCH), + M1_PMUV3_EVENT_MAP(BR_MIS_PRED_RETIRED, BRANCH_MISPRED_NONSPEC), +}; + /* sysfs definitions */ static ssize_t m1_pmu_events_sysfs_show(struct device *dev, struct device_attribute *attr, @@ -558,6 +570,26 @@ static int m2_pmu_map_event(struct perf_event *event) return armpmu_map_event(event, &m1_pmu_perf_map, NULL, M1_PMU_CFG_EVENT); } +static int m1_pmu_map_pmuv3_event(unsigned int eventsel) +{ + u16 m1_event = HW_OP_UNSUPPORTED; + + if (eventsel < ARMV8_PMUV3_MAX_COMMON_EVENTS) + m1_event = m1_pmu_pmceid_map[eventsel]; + + return m1_event == HW_OP_UNSUPPORTED ? 
-EOPNOTSUPP : m1_event; +} + +static void m1_pmu_init_pmceid(struct arm_pmu *pmu) +{ + unsigned int event; + + for (event = 0; event < ARMV8_PMUV3_MAX_COMMON_EVENTS; event++) { + if (m1_pmu_map_pmuv3_event(event) >= 0) + set_bit(event, pmu->pmceid_bitmap); + } +} + static void m1_pmu_reset(void *info) { int i; @@ -618,6 +650,9 @@ static int m1_pmu_init(struct arm_pmu *cpu_pmu, u32 flags) cpu_pmu->reset = m1_pmu_reset; cpu_pmu->set_event_filter = m1_pmu_set_event_filter; + cpu_pmu->map_pmuv3_event = m1_pmu_map_pmuv3_event; + m1_pmu_init_pmceid(cpu_pmu); + bitmap_set(cpu_pmu->cntr_mask, 0, M1_PMU_NR_COUNTERS); cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_EVENTS] = &m1_pmu_events_attr_group; cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_FORMATS] = &m1_pmu_format_attr_group; From 1b92e65f5006da55c8779b86f052855a49ecdd97 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Wed, 5 Mar 2025 12:30:21 -0800 Subject: [PATCH 60/76] KVM: arm64: Provide 1 event counter on IMPDEF hardware PMUv3 requires that all programmable event counters are capable of counting any event. The Apple M* PMU is quite a bit different, and events have affinities for particular PMCs. Expose 1 event counter on IMPDEF hardware, allowing the guest to do something useful with its PMU while also upholding the requirements of the architecture. Tested-by: Janne Grunau Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20250305203021.428366-1-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- arch/arm64/kvm/pmu-emul.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index 5f4e9be8aa8b..51fb47e0bf89 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -1038,6 +1038,13 @@ u8 kvm_arm_pmu_get_max_counters(struct kvm *kvm) { struct arm_pmu *arm_pmu = kvm->arch.arm_pmu; + /* + * PMUv3 requires that all event counters are capable of counting any + * event, though the same may not be true of non-PMUv3 hardware. + */ + if (cpus_have_final_cap(ARM64_WORKAROUND_PMUV3_IMPDEF_TRAPS)) + return 1; + /* * The arm_pmu->cntr_mask considers the fixed counter(s) as well. * Ignore those and return only the general-purpose counters. From e1231aacb065b2594d6f4833896aedc142fbd1d6 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Wed, 5 Mar 2025 12:30:40 -0800 Subject: [PATCH 61/76] arm64: Enable IMP DEF PMUv3 traps on Apple M* Apple M1 and M2 CPUs support IMPDEF traps of the PMUv3 sysregs, allowing a hypervisor to virtualize an architectural PMU for a VM. Flip the appropriate bit in HACR_EL2 on supporting hardware. 
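HACR_EL2 is entirely IMPLEMENTATION DEFINED, so the constant flipped below is Apple-specific magic; for readability one could imagine a named wrapper (hypothetical define, the patch uses BIT(56) directly):

	/* Hypothetical: name the Apple IMPDEF PMUv3 trap-enable bit */
	#define HACR_EL2_TRAP_PMUV3	BIT(56)

	sysreg_clear_set_s(SYS_HACR_EL2, 0, HACR_EL2_TRAP_PMUV3);
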
Tested-by: Janne Grunau Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20250305203040.428448-1-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- arch/arm64/kernel/cpu_errata.c | 44 ++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index 7ce555862895..a1e16b156fab 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -194,6 +194,43 @@ has_neoverse_n1_erratum_1542419(const struct arm64_cpu_capabilities *entry, return is_midr_in_range(midr, &range) && has_dic; } +static const struct midr_range impdef_pmuv3_cpus[] = { + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_ICESTORM), + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_FIRESTORM), + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_ICESTORM_PRO), + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_FIRESTORM_PRO), + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_ICESTORM_MAX), + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_FIRESTORM_MAX), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD_PRO), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE_PRO), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD_MAX), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE_MAX), + {}, +}; + +static bool has_impdef_pmuv3(const struct arm64_cpu_capabilities *entry, int scope) +{ + u64 dfr0 = read_sanitised_ftr_reg(SYS_ID_AA64DFR0_EL1); + unsigned int pmuver; + + if (!is_kernel_in_hyp_mode()) + return false; + + pmuver = cpuid_feature_extract_unsigned_field(dfr0, + ID_AA64DFR0_EL1_PMUVer_SHIFT); + if (pmuver != ID_AA64DFR0_EL1_PMUVer_IMP_DEF) + return false; + + return is_midr_in_range_list(read_cpuid_id(), impdef_pmuv3_cpus); +} + +static void cpu_enable_impdef_pmuv3_traps(const struct arm64_cpu_capabilities *__unused) +{ + sysreg_clear_set_s(SYS_HACR_EL2, 0, BIT(56)); +} + #ifdef CONFIG_ARM64_WORKAROUND_REPEAT_TLBI static const struct arm64_cpu_capabilities arm64_repeat_tlbi_list[] = { #ifdef CONFIG_QCOM_FALKOR_ERRATUM_1009 @@ -794,6 +831,13 @@ const struct arm64_cpu_capabilities arm64_errata[] = { {} })), }, + { + .desc = "Apple IMPDEF PMUv3 Traps", + .capability = ARM64_WORKAROUND_PMUV3_IMPDEF_TRAPS, + .type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM, + .matches = has_impdef_pmuv3, + .cpu_enable = cpu_enable_impdef_pmuv3_traps, + }, { } }; From 3f1e07275341c691e0901ed7030f4980b9b517c2 Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Wed, 12 Mar 2025 13:39:50 -0700 Subject: [PATCH 62/76] KVM: arm64: Allow userspace to write ID_AA64MMFR0_EL1.TGRAN*_2 Allow userspace to write the safe (NI) value for ID_AA64MMFR0_EL1.TGRAN*_2. Disallow changing these fields for NV guests, since KVM provides a sanitized view of them based on the PAGE_SIZE.
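For illustration, userspace can now write the NI value (0b0001) into such a field with KVM_SET_ONE_REG. A minimal sketch for TGRAN4_2, which sits in bits [43:40] of ID_AA64MMFR0_EL1 (op0=3, op1=0, CRn=0, CRm=7, op2=0):

	uint64_t val;
	struct kvm_one_reg reg = {
		.id	= ARM64_SYS_REG(3, 0, 0, 7, 0),	/* ID_AA64MMFR0_EL1 */
		.addr	= (uint64_t)&val,
	};

	ioctl(vcpu_fd, KVM_GET_ONE_REG, &reg);
	val &= ~(0xfULL << 40);
	val |= 1ULL << 40;			/* TGRAN4_2 = NI */
	ioctl(vcpu_fd, KVM_SET_ONE_REG, &reg);
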
Signed-off-by: Sebastian Ott Link: https://lore.kernel.org/kvmarm/20250306184013.30008-1-sebott@redhat.com/ Signed-off-by: Oliver Upton --- arch/arm64/kvm/sys_regs.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 1a18a0324d9f..a3f4599920d0 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1939,6 +1939,21 @@ static int set_id_aa64pfr1_el1(struct kvm_vcpu *vcpu, return set_id_reg(vcpu, rd, user_val); } +static int set_id_aa64mmfr0_el1(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd, u64 user_val) +{ + u64 sanitized_val = kvm_read_sanitised_id_reg(vcpu, rd); + u64 tgran2_mask = ID_AA64MMFR0_EL1_TGRAN4_2_MASK | + ID_AA64MMFR0_EL1_TGRAN16_2_MASK | + ID_AA64MMFR0_EL1_TGRAN64_2_MASK; + + if (vcpu_has_nv(vcpu) && + ((sanitized_val & tgran2_mask) != (user_val & tgran2_mask))) + return -EINVAL; + + return set_id_reg(vcpu, rd, user_val); +} + static int set_id_aa64mmfr2_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, u64 user_val) { @@ -2662,10 +2677,8 @@ static const struct sys_reg_desc sys_reg_descs[] = { ID_UNALLOCATED(6,7), /* CRm=7 */ - ID_WRITABLE(ID_AA64MMFR0_EL1, ~(ID_AA64MMFR0_EL1_RES0 | - ID_AA64MMFR0_EL1_TGRAN4_2 | - ID_AA64MMFR0_EL1_TGRAN64_2 | - ID_AA64MMFR0_EL1_TGRAN16_2 | + ID_FILTERED(ID_AA64MMFR0_EL1, id_aa64mmfr0_el1, + ~(ID_AA64MMFR0_EL1_RES0 | ID_AA64MMFR0_EL1_ASIDBITS)), ID_WRITABLE(ID_AA64MMFR1_EL1, ~(ID_AA64MMFR1_EL1_RES0 | ID_AA64MMFR1_EL1_HCX | From edfd826b8be7f6fa4bc120a9551d75f0cbf05cc6 Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Wed, 12 Mar 2025 13:40:53 -0700 Subject: [PATCH 63/76] KVM: arm64: selftests: Test that TGRAN*_2 fields are writable Userspace can write to these fields for non-NV guests; add a test that does just that. Signed-off-by: Sebastian Ott Link: https://lore.kernel.org/kvmarm/20250306184013.30008-1-sebott@redhat.com/ Signed-off-by: Oliver Upton --- tools/testing/selftests/kvm/arm64/set_id_regs.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/testing/selftests/kvm/arm64/set_id_regs.c b/tools/testing/selftests/kvm/arm64/set_id_regs.c index 217541fe6536..f71b8da4255b 100644 --- a/tools/testing/selftests/kvm/arm64/set_id_regs.c +++ b/tools/testing/selftests/kvm/arm64/set_id_regs.c @@ -146,6 +146,9 @@ static const struct reg_ftr_bits ftr_id_aa64pfr1_el1[] = { static const struct reg_ftr_bits ftr_id_aa64mmfr0_el1[] = { REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR0_EL1, ECV, 0), REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR0_EL1, EXS, 0), + REG_FTR_BITS(FTR_EXACT, ID_AA64MMFR0_EL1, TGRAN4_2, 1), + REG_FTR_BITS(FTR_EXACT, ID_AA64MMFR0_EL1, TGRAN64_2, 1), + REG_FTR_BITS(FTR_EXACT, ID_AA64MMFR0_EL1, TGRAN16_2, 1), S_REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR0_EL1, TGRAN4, 0), S_REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR0_EL1, TGRAN64, 0), REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR0_EL1, TGRAN16, 0), From cf2d228da9a8140bd7227ca9093947d20e8ea39d Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Thu, 13 Mar 2025 11:40:36 +0000 Subject: [PATCH 64/76] KVM: arm64: Add flags to kvm_hyp_memcache Add flags to kvm_hyp_memcache and propagate the latter to the allocation and free callbacks. This will later allow accounting for memory based on the memcache configuration.
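A sketch of how a flag could later steer the callbacks (the flag name here is hypothetical; the actual consumer arrives with the accounting changes):

	#define HYP_MEMCACHE_ACCOUNT_STAGE2	BIT(1)	/* hypothetical */

	static void *hyp_mc_alloc_fn(void *arg)
	{
		struct kvm_hyp_memcache *mc = arg;

		/* e.g. pick an accounting bucket based on mc->flags */
		if (mc->flags & HYP_MEMCACHE_ACCOUNT_STAGE2)
			;	/* account against stage-2 page-table usage */

		return (void *)__get_free_page(GFP_KERNEL_ACCOUNT);
	}
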
Signed-off-by: Vincent Donnefort
Acked-by: Marc Zyngier
Link: https://lore.kernel.org/r/20250313114038.1502357-2-vdonnefort@google.com
Signed-off-by: Oliver Upton
---
 arch/arm64/include/asm/kvm_host.h | 1 +
 arch/arm64/kvm/mmu.c              | 8 ++++----
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 3a7ec98ef123..12691ae23d4c 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -86,6 +86,7 @@ struct kvm_hyp_memcache {
 	phys_addr_t head;
 	unsigned long nr_pages;
 	struct pkvm_mapping *mapping; /* only used from EL1 */
+	unsigned long flags;
 };
 
 static inline void push_hyp_memcache(struct kvm_hyp_memcache *mc,
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 1f55b0c7b11d..6107a3c8ccf6 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1086,12 +1086,12 @@ void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
 	}
 }
 
-static void hyp_mc_free_fn(void *addr, void *unused)
+static void hyp_mc_free_fn(void *addr, void *mc)
 {
 	free_page((unsigned long)addr);
 }
 
-static void *hyp_mc_alloc_fn(void *unused)
+static void *hyp_mc_alloc_fn(void *mc)
 {
 	return (void *)__get_free_page(GFP_KERNEL_ACCOUNT);
 }
@@ -1102,7 +1102,7 @@ void free_hyp_memcache(struct kvm_hyp_memcache *mc)
 		return;
 
 	kfree(mc->mapping);
-	__free_hyp_memcache(mc, hyp_mc_free_fn, kvm_host_va, NULL);
+	__free_hyp_memcache(mc, hyp_mc_free_fn, kvm_host_va, mc);
 }
 
 int topup_hyp_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages)
@@ -1117,7 +1117,7 @@ int topup_hyp_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages)
 	}
 
 	return __topup_hyp_memcache(mc, min_pages, hyp_mc_alloc_fn,
-				    kvm_host_pa, NULL);
+				    kvm_host_pa, mc);
 }
 
 /**

From 8c0d7d14c5cdcf01a47c71527a522b2986798cc3 Mon Sep 17 00:00:00 2001
From: Vincent Donnefort
Date: Thu, 13 Mar 2025 11:40:37 +0000
Subject: [PATCH 65/76] KVM: arm64: Distinct pKVM teardown memcache for
 stage-2

In order to account for memory dedicated to the stage-2 page-tables,
use a separate memcache when tearing down the VM. Also rename
reclaim_guest_pages to reflect the fact that it only reclaims
page-table pages.
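Conceptually, the host teardown path then frees two pools of different
provenance (sketch of the resulting flow; the accounting split itself
is completed by the next patch):

  /* VM metadata donated to EL2 */
  free_hyp_memcache(&host_kvm->arch.pkvm.teardown_mc);
  /* stage-2 page-table pages */
  free_hyp_memcache(&host_kvm->arch.pkvm.stage2_teardown_mc);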
Signed-off-by: Vincent Donnefort Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20250313114038.1502357-3-vdonnefort@google.com Signed-off-by: Oliver Upton --- arch/arm64/include/asm/kvm_host.h | 1 + arch/arm64/kvm/hyp/include/nvhe/mem_protect.h | 2 +- arch/arm64/kvm/hyp/nvhe/mem_protect.c | 2 +- arch/arm64/kvm/hyp/nvhe/pkvm.c | 8 ++++---- arch/arm64/kvm/pkvm.c | 1 + 5 files changed, 8 insertions(+), 6 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 12691ae23d4c..ace3969e8106 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -246,6 +246,7 @@ typedef unsigned int pkvm_handle_t; struct kvm_protected_vm { pkvm_handle_t handle; struct kvm_hyp_memcache teardown_mc; + struct kvm_hyp_memcache stage2_teardown_mc; bool enabled; }; diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h index 978f38c386ee..ea0a704da9b8 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h @@ -56,7 +56,7 @@ void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt); int hyp_pin_shared_mem(void *from, void *to); void hyp_unpin_shared_mem(void *from, void *to); -void reclaim_guest_pages(struct pkvm_hyp_vm *vm, struct kvm_hyp_memcache *mc); +void reclaim_pgtable_pages(struct pkvm_hyp_vm *vm, struct kvm_hyp_memcache *mc); int refill_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages, struct kvm_hyp_memcache *host_mc); diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 19c3c631708c..f34f11c720d7 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -266,7 +266,7 @@ int kvm_guest_prepare_stage2(struct pkvm_hyp_vm *vm, void *pgd) return 0; } -void reclaim_guest_pages(struct pkvm_hyp_vm *vm, struct kvm_hyp_memcache *mc) +void reclaim_pgtable_pages(struct pkvm_hyp_vm *vm, struct kvm_hyp_memcache *mc) { struct hyp_page *page; void *addr; diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 3927fe52a3dd..ac85bc51b8d3 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -678,7 +678,7 @@ teardown_donated_memory(struct kvm_hyp_memcache *mc, void *addr, size_t size) int __pkvm_teardown_vm(pkvm_handle_t handle) { - struct kvm_hyp_memcache *mc; + struct kvm_hyp_memcache *mc, *stage2_mc; struct pkvm_hyp_vm *hyp_vm; struct kvm *host_kvm; unsigned int idx; @@ -706,10 +706,10 @@ int __pkvm_teardown_vm(pkvm_handle_t handle) /* Reclaim guest pages (including page-table pages) */ mc = &host_kvm->arch.pkvm.teardown_mc; - reclaim_guest_pages(hyp_vm, mc); + stage2_mc = &host_kvm->arch.pkvm.stage2_teardown_mc; + reclaim_pgtable_pages(hyp_vm, stage2_mc); unpin_host_vcpus(hyp_vm->vcpus, hyp_vm->nr_vcpus); - /* Push the metadata pages to the teardown memcache */ for (idx = 0; idx < hyp_vm->nr_vcpus; ++idx) { struct pkvm_hyp_vcpu *hyp_vcpu = hyp_vm->vcpus[idx]; struct kvm_hyp_memcache *vcpu_mc = &hyp_vcpu->vcpu.arch.pkvm_memcache; @@ -717,7 +717,7 @@ int __pkvm_teardown_vm(pkvm_handle_t handle) while (vcpu_mc->nr_pages) { void *addr = pop_hyp_memcache(vcpu_mc, hyp_phys_to_virt); - push_hyp_memcache(mc, addr, hyp_virt_to_phys); + push_hyp_memcache(stage2_mc, addr, hyp_virt_to_phys); unmap_donated_memory_noclear(addr, PAGE_SIZE); } diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c index 930b677eb9b0..19921ca407c6 100644 --- a/arch/arm64/kvm/pkvm.c +++ b/arch/arm64/kvm/pkvm.c 
@@ -111,6 +111,7 @@ static void __pkvm_destroy_hyp_vm(struct kvm *host_kvm)
 	host_kvm->arch.pkvm.handle = 0;
 
 	free_hyp_memcache(&host_kvm->arch.pkvm.teardown_mc);
+	free_hyp_memcache(&host_kvm->arch.pkvm.stage2_teardown_mc);
 }
 
 /*

From 79ea66231599c18e3b01852e28ecc2dc84d7326c Mon Sep 17 00:00:00 2001
From: Vincent Donnefort
Date: Thu, 13 Mar 2025 11:40:38 +0000
Subject: [PATCH 66/76] KVM: arm64: Count pKVM stage-2 usage in secondary
 pagetable stats

Count the pages used by pKVM for the guest stage-2 in the memory stats
under secondary pagetables, similarly to what the VHE mode does.

Signed-off-by: Vincent Donnefort
Acked-by: Marc Zyngier
Link: https://lore.kernel.org/r/20250313114038.1502357-4-vdonnefort@google.com
Signed-off-by: Oliver Upton
---
 arch/arm64/include/asm/kvm_host.h |  2 ++
 arch/arm64/kvm/mmu.c              | 14 +++++++++++++-
 arch/arm64/kvm/pkvm.c             |  4 ++++
 3 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index ace3969e8106..51754a354b7a 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -86,6 +86,8 @@ struct kvm_hyp_memcache {
 	phys_addr_t head;
 	unsigned long nr_pages;
 	struct pkvm_mapping *mapping; /* only used from EL1 */
+
+#define HYP_MEMCACHE_ACCOUNT_STAGE2	BIT(1)
 	unsigned long flags;
 };
 
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 6107a3c8ccf6..2feb6c6b63af 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1088,12 +1088,24 @@ void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
 
 static void hyp_mc_free_fn(void *addr, void *mc)
 {
+	struct kvm_hyp_memcache *memcache = mc;
+
+	if (memcache->flags & HYP_MEMCACHE_ACCOUNT_STAGE2)
+		kvm_account_pgtable_pages(addr, -1);
+
 	free_page((unsigned long)addr);
 }
 
 static void *hyp_mc_alloc_fn(void *mc)
 {
-	return (void *)__get_free_page(GFP_KERNEL_ACCOUNT);
+	struct kvm_hyp_memcache *memcache = mc;
+	void *addr;
+
+	addr = (void *)__get_free_page(GFP_KERNEL_ACCOUNT);
+	if (addr && memcache->flags & HYP_MEMCACHE_ACCOUNT_STAGE2)
+		kvm_account_pgtable_pages(addr, 1);
+
+	return addr;
 }
 
 void free_hyp_memcache(struct kvm_hyp_memcache *mc)
diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c
index 19921ca407c6..4409a10d02b6 100644
--- a/arch/arm64/kvm/pkvm.c
+++ b/arch/arm64/kvm/pkvm.c
@@ -165,12 +165,16 @@ static int __pkvm_create_hyp_vm(struct kvm *host_kvm)
 	handle = ret;
 
 	host_kvm->arch.pkvm.handle = handle;
+	host_kvm->arch.pkvm.stage2_teardown_mc.flags |= HYP_MEMCACHE_ACCOUNT_STAGE2;
+	kvm_account_pgtable_pages(pgd, pgd_sz / PAGE_SIZE);
 
 	/* Donate memory for the vcpus at hyp and initialize it. */
 	hyp_vcpu_sz = PAGE_ALIGN(PKVM_HYP_VCPU_SIZE);
 	kvm_for_each_vcpu(idx, host_vcpu, host_kvm) {
 		void *hyp_vcpu;
 
+		host_vcpu->arch.pkvm_memcache.flags |= HYP_MEMCACHE_ACCOUNT_STAGE2;
+
 		/* Indexing of the vcpus to be sequential starting at 0. */
 		if (WARN_ON(host_vcpu->vcpu_idx != idx)) {
 			ret = -EINVAL;

From 44f979bf434e9ba1f9040700138825e1ecb69d25 Mon Sep 17 00:00:00 2001
From: Fuad Tabba
Date: Fri, 14 Mar 2025 11:18:29 +0000
Subject: [PATCH 67/76] KVM: arm64: Factor out setting HCRX_EL2 traps into
 separate function

Factor out the code for setting a vcpu's HCRX_EL2 traps into a separate
inline function. This allows us to share the logic with pKVM when
setting the traps in protected mode.

No functional change intended.
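With the logic centralised, wiring up a further feature-gated HCRX_EL2
bit would be a one-liner following the existing pattern, e.g. (purely
illustrative; NEWFEAT and HCRX_EL2_NEWBIT are made-up names):

  if (kvm_has_feat(kvm, ID_AA64ISAR2_EL1, NEWFEAT, IMP))
          vcpu->arch.hcrx_el2 |= HCRX_EL2_NEWBIT;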
Reviewed-by: Marc Zyngier Signed-off-by: Fuad Tabba Link: https://lore.kernel.org/r/20250314111832.4137161-2-tabba@google.com Signed-off-by: Oliver Upton --- arch/arm64/include/asm/kvm_emulate.h | 24 ++++++++++++++++++++++++ arch/arm64/kvm/sys_regs.c | 20 +------------------- 2 files changed, 25 insertions(+), 19 deletions(-) diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index 78ec1ef2cfe8..31c8497372d0 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -649,4 +649,28 @@ static inline bool guest_hyp_sve_traps_enabled(const struct kvm_vcpu *vcpu) { return __guest_hyp_cptr_xen_trap_enabled(vcpu, ZEN); } + +static inline void vcpu_set_hcrx(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + + if (cpus_have_final_cap(ARM64_HAS_HCX)) { + /* + * In general, all HCRX_EL2 bits are gated by a feature. + * The only reason we can set SMPME without checking any + * feature is that its effects are not directly observable + * from the guest. + */ + vcpu->arch.hcrx_el2 = HCRX_EL2_SMPME; + + if (kvm_has_feat(kvm, ID_AA64ISAR2_EL1, MOPS, IMP)) + vcpu->arch.hcrx_el2 |= (HCRX_EL2_MSCEn | HCRX_EL2_MCE2); + + if (kvm_has_tcr2(kvm)) + vcpu->arch.hcrx_el2 |= HCRX_EL2_TCR2En; + + if (kvm_has_fpmr(kvm)) + vcpu->arch.hcrx_el2 |= HCRX_EL2_EnFPM; + } +} #endif /* __ARM64_KVM_EMULATE_H__ */ diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 82430c1e1dd0..16ce5f584e7c 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -4971,25 +4971,7 @@ void kvm_calculate_traps(struct kvm_vcpu *vcpu) mutex_lock(&kvm->arch.config_lock); vcpu_set_hcr(vcpu); vcpu_set_ich_hcr(vcpu); - - if (cpus_have_final_cap(ARM64_HAS_HCX)) { - /* - * In general, all HCRX_EL2 bits are gated by a feature. - * The only reason we can set SMPME without checking any - * feature is that its effects are not directly observable - * from the guest. - */ - vcpu->arch.hcrx_el2 = HCRX_EL2_SMPME; - - if (kvm_has_feat(kvm, ID_AA64ISAR2_EL1, MOPS, IMP)) - vcpu->arch.hcrx_el2 |= (HCRX_EL2_MSCEn | HCRX_EL2_MCE2); - - if (kvm_has_tcr2(kvm)) - vcpu->arch.hcrx_el2 |= HCRX_EL2_TCR2En; - - if (kvm_has_fpmr(kvm)) - vcpu->arch.hcrx_el2 |= HCRX_EL2_EnFPM; - } + vcpu_set_hcrx(vcpu); if (test_bit(KVM_ARCH_FLAG_FGU_INITIALIZED, &kvm->arch.flags)) goto out; From 066daa8d3bc2694c392e14091978043aed7b1f23 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Fri, 14 Mar 2025 11:18:30 +0000 Subject: [PATCH 68/76] KVM: arm64: Initialize HCRX_EL2 traps in pKVM Initialize and set the traps controlled by the HCRX_EL2 in pKVM. Reviewed-by: Marc Zyngier Signed-off-by: Fuad Tabba Link: https://lore.kernel.org/r/20250314111832.4137161-3-tabba@google.com Signed-off-by: Oliver Upton --- arch/arm64/kvm/hyp/nvhe/pkvm.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index ac85bc51b8d3..c34a6da5fbec 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -166,8 +166,13 @@ static int pkvm_vcpu_init_traps(struct pkvm_hyp_vcpu *hyp_vcpu) pkvm_vcpu_reset_hcr(vcpu); - if ((!pkvm_hyp_vcpu_is_protected(hyp_vcpu))) + if ((!pkvm_hyp_vcpu_is_protected(hyp_vcpu))) { + struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; + + /* Trust the host for non-protected vcpu features. 
*/ + vcpu->arch.hcrx_el2 = host_vcpu->arch.hcrx_el2; return 0; + } ret = pkvm_check_pvm_cpu_features(vcpu); if (ret) @@ -175,6 +180,7 @@ static int pkvm_vcpu_init_traps(struct pkvm_hyp_vcpu *hyp_vcpu) pvm_init_traps_hcr(vcpu); pvm_init_traps_mdcr(vcpu); + vcpu_set_hcrx(vcpu); return 0; } From 8b21fb47c7788779f571499cde4ce573c53ed218 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Fri, 14 Mar 2025 11:18:31 +0000 Subject: [PATCH 69/76] KVM: arm64: Factor out pKVM hyp vcpu creation to separate function Move the code that creates and initializes the hyp view of a vcpu in pKVM to its own function. This is meant to make the transition to initializing every vcpu individually clearer. Acked-by: Will Deacon Reviewed-by: Marc Zyngier Signed-off-by: Fuad Tabba Link: https://lore.kernel.org/r/20250314111832.4137161-4-tabba@google.com Signed-off-by: Oliver Upton --- arch/arm64/kvm/pkvm.c | 52 ++++++++++++++++++++----------------------- 1 file changed, 24 insertions(+), 28 deletions(-) diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c index 4409a10d02b6..f297ccdaef64 100644 --- a/arch/arm64/kvm/pkvm.c +++ b/arch/arm64/kvm/pkvm.c @@ -114,6 +114,26 @@ static void __pkvm_destroy_hyp_vm(struct kvm *host_kvm) free_hyp_memcache(&host_kvm->arch.pkvm.stage2_teardown_mc); } +static int __pkvm_create_hyp_vcpu(struct kvm_vcpu *vcpu) +{ + size_t hyp_vcpu_sz = PAGE_ALIGN(PKVM_HYP_VCPU_SIZE); + pkvm_handle_t handle = vcpu->kvm->arch.pkvm.handle; + void *hyp_vcpu; + int ret; + + vcpu->arch.pkvm_memcache.flags |= HYP_MEMCACHE_ACCOUNT_STAGE2; + + hyp_vcpu = alloc_pages_exact(hyp_vcpu_sz, GFP_KERNEL_ACCOUNT); + if (!hyp_vcpu) + return -ENOMEM; + + ret = kvm_call_hyp_nvhe(__pkvm_init_vcpu, handle, vcpu, hyp_vcpu); + if (ret) + free_pages_exact(hyp_vcpu, hyp_vcpu_sz); + + return ret; +} + /* * Allocates and donates memory for hypervisor VM structs at EL2. * @@ -126,9 +146,8 @@ static void __pkvm_destroy_hyp_vm(struct kvm *host_kvm) */ static int __pkvm_create_hyp_vm(struct kvm *host_kvm) { - size_t pgd_sz, hyp_vm_sz, hyp_vcpu_sz; + size_t pgd_sz, hyp_vm_sz; struct kvm_vcpu *host_vcpu; - pkvm_handle_t handle; void *pgd, *hyp_vm; unsigned long idx; int ret; @@ -162,37 +181,14 @@ static int __pkvm_create_hyp_vm(struct kvm *host_kvm) if (ret < 0) goto free_vm; - handle = ret; - - host_kvm->arch.pkvm.handle = handle; + host_kvm->arch.pkvm.handle = ret; host_kvm->arch.pkvm.stage2_teardown_mc.flags |= HYP_MEMCACHE_ACCOUNT_STAGE2; kvm_account_pgtable_pages(pgd, pgd_sz / PAGE_SIZE); - /* Donate memory for the vcpus at hyp and initialize it. */ - hyp_vcpu_sz = PAGE_ALIGN(PKVM_HYP_VCPU_SIZE); kvm_for_each_vcpu(idx, host_vcpu, host_kvm) { - void *hyp_vcpu; - - host_vcpu->arch.pkvm_memcache.flags |= HYP_MEMCACHE_ACCOUNT_STAGE2; - - /* Indexing of the vcpus to be sequential starting at 0. 
*/
-		if (WARN_ON(host_vcpu->vcpu_idx != idx)) {
-			ret = -EINVAL;
+		ret = __pkvm_create_hyp_vcpu(host_vcpu);
+		if (ret)
 			goto destroy_vm;
-		}
-
-		hyp_vcpu = alloc_pages_exact(hyp_vcpu_sz, GFP_KERNEL_ACCOUNT);
-		if (!hyp_vcpu) {
-			ret = -ENOMEM;
-			goto destroy_vm;
-		}
-
-		ret = kvm_call_hyp_nvhe(__pkvm_init_vcpu, handle, host_vcpu,
-					hyp_vcpu);
-		if (ret) {
-			free_pages_exact(hyp_vcpu, hyp_vcpu_sz);
-			goto destroy_vm;
-		}
 	}
 
 	return 0;

From 1eab115486c5c721e57bc72f8241d20cfed08f16 Mon Sep 17 00:00:00 2001
From: Fuad Tabba
Date: Fri, 14 Mar 2025 11:18:32 +0000
Subject: [PATCH 70/76] KVM: arm64: Create each pKVM hyp vcpu after its
 corresponding host vcpu

Instead of creating and initializing _all_ hyp vcpus in pKVM when the
first host vcpu runs for the first time, initialize _each_ hyp vcpu in
conjunction with its corresponding host vcpu.

Some of the host vcpu state (e.g., system registers and traps values)
is not initialized until the first time the host vcpu is run.
Therefore, a hyp vcpu initialized before its corresponding host vcpu
has run for the first time might not see the complete host state of
that vcpu.

Additionally, this behavior is in line with non-protected modes.

Acked-by: Will Deacon
Reviewed-by: Marc Zyngier
Signed-off-by: Fuad Tabba
Link: https://lore.kernel.org/r/20250314111832.4137161-5-tabba@google.com
Signed-off-by: Oliver Upton
---
 arch/arm64/include/asm/kvm_host.h      |  2 +
 arch/arm64/include/asm/kvm_pkvm.h      |  1 +
 arch/arm64/kvm/arm.c                   |  4 ++
 arch/arm64/kvm/hyp/include/nvhe/pkvm.h |  6 ---
 arch/arm64/kvm/hyp/nvhe/pkvm.c         | 55 +++++++++++++++-----------
 arch/arm64/kvm/pkvm.c                  | 28 +++++++------
 6 files changed, 54 insertions(+), 42 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 51754a354b7a..a7db58d1ef58 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -873,6 +873,8 @@ struct kvm_vcpu_arch {
 #define VCPU_INITIALIZED	__vcpu_single_flag(cflags, BIT(0))
 /* SVE config completed */
 #define VCPU_SVE_FINALIZED	__vcpu_single_flag(cflags, BIT(1))
+/* pKVM VCPU setup completed */
+#define VCPU_PKVM_FINALIZED	__vcpu_single_flag(cflags, BIT(2))
 
 /* Exception pending */
 #define PENDING_EXCEPTION	__vcpu_single_flag(iflags, BIT(0))
diff --git a/arch/arm64/include/asm/kvm_pkvm.h b/arch/arm64/include/asm/kvm_pkvm.h
index eb65f12e81d9..abd693ce5b93 100644
--- a/arch/arm64/include/asm/kvm_pkvm.h
+++ b/arch/arm64/include/asm/kvm_pkvm.h
@@ -19,6 +19,7 @@ int pkvm_init_host_vm(struct kvm *kvm);
 int pkvm_create_hyp_vm(struct kvm *kvm);
 void pkvm_destroy_hyp_vm(struct kvm *kvm);
+int pkvm_create_hyp_vcpu(struct kvm_vcpu *vcpu);
 
 /*
  * This functions as an allow-list of protected VM capabilities.
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index b8e55a441282..cd7a808eeb64 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -833,6 +833,10 @@ int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu)
 		ret = pkvm_create_hyp_vm(kvm);
 		if (ret)
 			return ret;
+
+		ret = pkvm_create_hyp_vcpu(vcpu);
+		if (ret)
+			return ret;
 	}
 
 	mutex_lock(&kvm->arch.config_lock);
diff --git a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h
index e42bf68c8848..ce31d3b73603 100644
--- a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h
+++ b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h
@@ -43,12 +43,6 @@ struct pkvm_hyp_vm {
 	struct hyp_pool pool;
 	hyp_spinlock_t lock;
 
-	/*
-	 * The number of vcpus initialized and ready to run.
-	 * Modifying this is protected by 'vm_table_lock'.
- */ - unsigned int nr_vcpus; - /* Array of the hyp vCPU structures for this VM. */ struct pkvm_hyp_vcpu *vcpus[]; }; diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index c34a6da5fbec..99f22c1f10c9 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -245,10 +245,12 @@ struct pkvm_hyp_vcpu *pkvm_load_hyp_vcpu(pkvm_handle_t handle, hyp_spin_lock(&vm_table_lock); hyp_vm = get_vm_by_handle(handle); - if (!hyp_vm || hyp_vm->nr_vcpus <= vcpu_idx) + if (!hyp_vm || hyp_vm->kvm.created_vcpus <= vcpu_idx) goto unlock; hyp_vcpu = hyp_vm->vcpus[vcpu_idx]; + if (!hyp_vcpu) + goto unlock; /* Ensure vcpu isn't loaded on more than one cpu simultaneously. */ if (unlikely(hyp_vcpu->loaded_hyp_vcpu)) { @@ -367,8 +369,14 @@ static void unpin_host_vcpus(struct pkvm_hyp_vcpu *hyp_vcpus[], { int i; - for (i = 0; i < nr_vcpus; i++) - unpin_host_vcpu(hyp_vcpus[i]->host_vcpu); + for (i = 0; i < nr_vcpus; i++) { + struct pkvm_hyp_vcpu *hyp_vcpu = hyp_vcpus[i]; + + if (!hyp_vcpu) + continue; + + unpin_host_vcpu(hyp_vcpu->host_vcpu); + } } static void init_pkvm_hyp_vm(struct kvm *host_kvm, struct pkvm_hyp_vm *hyp_vm, @@ -392,24 +400,18 @@ static void pkvm_vcpu_init_sve(struct pkvm_hyp_vcpu *hyp_vcpu, struct kvm_vcpu * static int init_pkvm_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu, struct pkvm_hyp_vm *hyp_vm, - struct kvm_vcpu *host_vcpu, - unsigned int vcpu_idx) + struct kvm_vcpu *host_vcpu) { int ret = 0; if (hyp_pin_shared_mem(host_vcpu, host_vcpu + 1)) return -EBUSY; - if (host_vcpu->vcpu_idx != vcpu_idx) { - ret = -EINVAL; - goto done; - } - hyp_vcpu->host_vcpu = host_vcpu; hyp_vcpu->vcpu.kvm = &hyp_vm->kvm; hyp_vcpu->vcpu.vcpu_id = READ_ONCE(host_vcpu->vcpu_id); - hyp_vcpu->vcpu.vcpu_idx = vcpu_idx; + hyp_vcpu->vcpu.vcpu_idx = READ_ONCE(host_vcpu->vcpu_idx); hyp_vcpu->vcpu.arch.hw_mmu = &hyp_vm->kvm.arch.mmu; hyp_vcpu->vcpu.arch.cflags = READ_ONCE(host_vcpu->arch.cflags); @@ -647,27 +649,28 @@ int __pkvm_init_vcpu(pkvm_handle_t handle, struct kvm_vcpu *host_vcpu, goto unlock; } - idx = hyp_vm->nr_vcpus; + ret = init_pkvm_hyp_vcpu(hyp_vcpu, hyp_vm, host_vcpu); + if (ret) + goto unlock; + + idx = hyp_vcpu->vcpu.vcpu_idx; if (idx >= hyp_vm->kvm.created_vcpus) { ret = -EINVAL; goto unlock; } - ret = init_pkvm_hyp_vcpu(hyp_vcpu, hyp_vm, host_vcpu, idx); - if (ret) + if (hyp_vm->vcpus[idx]) { + ret = -EINVAL; goto unlock; + } hyp_vm->vcpus[idx] = hyp_vcpu; - hyp_vm->nr_vcpus++; unlock: hyp_spin_unlock(&vm_table_lock); - if (ret) { + if (ret) unmap_donated_memory(hyp_vcpu, sizeof(*hyp_vcpu)); - return ret; - } - - return 0; + return ret; } static void @@ -714,11 +717,17 @@ int __pkvm_teardown_vm(pkvm_handle_t handle) mc = &host_kvm->arch.pkvm.teardown_mc; stage2_mc = &host_kvm->arch.pkvm.stage2_teardown_mc; reclaim_pgtable_pages(hyp_vm, stage2_mc); - unpin_host_vcpus(hyp_vm->vcpus, hyp_vm->nr_vcpus); + unpin_host_vcpus(hyp_vm->vcpus, hyp_vm->kvm.created_vcpus); - for (idx = 0; idx < hyp_vm->nr_vcpus; ++idx) { + /* Push the metadata pages to the teardown memcache */ + for (idx = 0; idx < hyp_vm->kvm.created_vcpus; ++idx) { struct pkvm_hyp_vcpu *hyp_vcpu = hyp_vm->vcpus[idx]; - struct kvm_hyp_memcache *vcpu_mc = &hyp_vcpu->vcpu.arch.pkvm_memcache; + struct kvm_hyp_memcache *vcpu_mc; + + if (!hyp_vcpu) + continue; + + vcpu_mc = &hyp_vcpu->vcpu.arch.pkvm_memcache; while (vcpu_mc->nr_pages) { void *addr = pop_hyp_memcache(vcpu_mc, hyp_phys_to_virt); diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c index f297ccdaef64..0f89157d31fd 100644 --- 
a/arch/arm64/kvm/pkvm.c
+++ b/arch/arm64/kvm/pkvm.c
@@ -128,7 +128,9 @@ static int __pkvm_create_hyp_vcpu(struct kvm_vcpu *vcpu)
 		return -ENOMEM;
 
 	ret = kvm_call_hyp_nvhe(__pkvm_init_vcpu, handle, vcpu, hyp_vcpu);
-	if (ret)
+	if (!ret)
+		vcpu_set_flag(vcpu, VCPU_PKVM_FINALIZED);
+	else
 		free_pages_exact(hyp_vcpu, hyp_vcpu_sz);
 
 	return ret;
@@ -147,9 +149,7 @@ static int __pkvm_create_hyp_vcpu(struct kvm_vcpu *vcpu)
 static int __pkvm_create_hyp_vm(struct kvm *host_kvm)
 {
 	size_t pgd_sz, hyp_vm_sz;
-	struct kvm_vcpu *host_vcpu;
 	void *pgd, *hyp_vm;
-	unsigned long idx;
 	int ret;
 
@@ -185,17 +185,7 @@ static int __pkvm_create_hyp_vm(struct kvm *host_kvm)
 	host_kvm->arch.pkvm.stage2_teardown_mc.flags |= HYP_MEMCACHE_ACCOUNT_STAGE2;
 	kvm_account_pgtable_pages(pgd, pgd_sz / PAGE_SIZE);
 
-	kvm_for_each_vcpu(idx, host_vcpu, host_kvm) {
-		ret = __pkvm_create_hyp_vcpu(host_vcpu);
-		if (ret)
-			goto destroy_vm;
-	}
-
 	return 0;
-
-destroy_vm:
-	__pkvm_destroy_hyp_vm(host_kvm);
-	return ret;
 free_vm:
 	free_pages_exact(hyp_vm, hyp_vm_sz);
 free_pgd:
@@ -215,6 +205,18 @@ int pkvm_create_hyp_vm(struct kvm *host_kvm)
 	return ret;
 }
 
+int pkvm_create_hyp_vcpu(struct kvm_vcpu *vcpu)
+{
+	int ret = 0;
+
+	mutex_lock(&vcpu->kvm->arch.config_lock);
+	if (!vcpu_get_flag(vcpu, VCPU_PKVM_FINALIZED))
+		ret = __pkvm_create_hyp_vcpu(vcpu);
+	mutex_unlock(&vcpu->kvm->arch.config_lock);
+
+	return ret;
+}
+
 void pkvm_destroy_hyp_vm(struct kvm *host_kvm)
 {
 	mutex_lock(&host_kvm->arch.config_lock);

From f2aeb7bbd5745fbcf7f0769e29a184e24924b9a9 Mon Sep 17 00:00:00 2001
From: Akihiko Odaki
Date: Sat, 15 Mar 2025 18:12:10 +0900
Subject: [PATCH 71/76] KVM: arm64: PMU: Set raw values from user to
 PM{C,I}NTEN{SET,CLR}, PMOVS{SET,CLR}

Commit a45f41d754e0 ("KVM: arm64: Add {get,set}_user for
PM{C,I}NTEN{SET,CLR}, PMOVS{SET,CLR}") changed KVM_SET_ONE_REG to
update the mentioned registers in a way matching the behavior of guest
register writes. This is a breaking UAPI change; although the new
semantics look cleaner, VMMs are not prepared for it.

Firecracker, QEMU, and crosvm perform migration by listing registers
with KVM_GET_REG_LIST, getting their values with KVM_GET_ONE_REG and
setting them with KVM_SET_ONE_REG. This algorithm assumes
KVM_SET_ONE_REG restores the values retrieved with KVM_GET_ONE_REG
without any alteration. However, the bit operations added by the
earlier commit do not preserve the values retrieved with
KVM_GET_ONE_REG and can potentially break migration.

Remove the bit operations that alter the values retrieved with
KVM_GET_ONE_REG.
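The invariant VMMs rely on can be written as a save/restore round-trip
(sketch; 'id' iterates over the output of KVM_GET_REG_LIST, error
handling elided):

  __u64 val;
  struct kvm_one_reg reg = { .id = id, .addr = (__u64)&val };

  ioctl(vcpu_fd, KVM_GET_ONE_REG, &reg);   /* save on the source */
  ioctl(vcpu_fd, KVM_SET_ONE_REG, &reg);   /* restore on the target */
  /* reading the register back must now return the same 'val' */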
Cc: stable@vger.kernel.org
Fixes: a45f41d754e0 ("KVM: arm64: Add {get,set}_user for PM{C,I}NTEN{SET,CLR}, PMOVS{SET,CLR}")
Signed-off-by: Akihiko Odaki
Acked-by: Marc Zyngier
Reviewed-by: Marc Zyngier
Link: https://lore.kernel.org/r/20250315-pmc-v5-1-ecee87dab216@daynix.com
Signed-off-by: Oliver Upton
---
 arch/arm64/kvm/sys_regs.c | 21 ++-------------------
 1 file changed, 2 insertions(+), 19 deletions(-)

diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 82430c1e1dd0..ffee72fd1273 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -1051,26 +1051,9 @@ static bool access_pmu_evtyper(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
 static int set_pmreg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, u64 val)
 {
-	bool set;
-
-	val &= kvm_pmu_accessible_counter_mask(vcpu);
-
-	switch (r->reg) {
-	case PMOVSSET_EL0:
-		/* CRm[1] being set indicates a SET register, and CLR otherwise */
-		set = r->CRm & 2;
-		break;
-	default:
-		/* Op2[0] being set indicates a SET register, and CLR otherwise */
-		set = r->Op2 & 1;
-		break;
-	}
-
-	if (set)
-		__vcpu_sys_reg(vcpu, r->reg) |= val;
-	else
-		__vcpu_sys_reg(vcpu, r->reg) &= ~val;
+	u64 mask = kvm_pmu_accessible_counter_mask(vcpu);
 
+	__vcpu_sys_reg(vcpu, r->reg) = val & mask;
 	return 0;
 }

From be5ccac3f15ea6f0e9086a98dc3ee4f15e873ccb Mon Sep 17 00:00:00 2001
From: Akihiko Odaki
Date: Sat, 15 Mar 2025 18:12:11 +0900
Subject: [PATCH 72/76] KVM: arm64: PMU: Assume PMU presence in pmu-emul.c

Many functions in pmu-emul.c check kvm_vcpu_has_pmu(vcpu). A favorable
interpretation is defensive programming, but it also has downsides:

- It is confusing, as it implies these functions may be called without
  a PMU although most of them are called only when a PMU is present.

- It makes the semantics of these functions fuzzy. For example, calling
  kvm_pmu_disable_counter_mask() without a PMU may result in a no-op as
  there are no enabled counters, but it's unclear what
  kvm_pmu_get_counter_value() returns when there is no PMU.

- It allows calls without checking kvm_vcpu_has_pmu(vcpu), but it is
  often wrong to call these functions without a PMU.

- It is error-prone to duplicate kvm_vcpu_has_pmu(vcpu) checks across
  multiple functions. Many functions are called for system registers,
  and the system register infrastructure already employs less
  error-prone, comprehensive checks.

Check kvm_vcpu_has_pmu(vcpu) in the callers of these functions instead,
and remove the obsolete checks from pmu-emul.c. The only exceptions are
the functions that implement ioctls, as they have definitive semantics
even when the PMU is not present.
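The resulting convention is that the presence check lives at the call
boundary, e.g. (shape of the change, condensed from the hunks below):

  /* caller, e.g. kvm_arch_vcpu_ioctl_run() */
  if (kvm_vcpu_has_pmu(vcpu))
          kvm_pmu_flush_hwstate(vcpu);

  /* the callee in pmu-emul.c may now assume a PMU is present */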
Signed-off-by: Akihiko Odaki Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20250315-pmc-v5-2-ecee87dab216@daynix.com Signed-off-by: Oliver Upton --- arch/arm64/kvm/arm.c | 17 +++++++++++------ arch/arm64/kvm/emulate-nested.c | 6 ++++-- arch/arm64/kvm/pmu-emul.c | 26 +------------------------- arch/arm64/kvm/sys_regs.c | 6 ++++-- 4 files changed, 20 insertions(+), 35 deletions(-) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 0160b4924351..caa1357fa367 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -835,9 +835,11 @@ int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu) if (ret) return ret; - ret = kvm_arm_pmu_v3_enable(vcpu); - if (ret) - return ret; + if (kvm_vcpu_has_pmu(vcpu)) { + ret = kvm_arm_pmu_v3_enable(vcpu); + if (ret) + return ret; + } if (is_protected_kvm_enabled()) { ret = pkvm_create_hyp_vm(kvm); @@ -1148,7 +1150,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) */ preempt_disable(); - kvm_pmu_flush_hwstate(vcpu); + if (kvm_vcpu_has_pmu(vcpu)) + kvm_pmu_flush_hwstate(vcpu); local_irq_disable(); @@ -1167,7 +1170,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) if (ret <= 0 || kvm_vcpu_exit_request(vcpu, &ret)) { vcpu->mode = OUTSIDE_GUEST_MODE; isb(); /* Ensure work in x_flush_hwstate is committed */ - kvm_pmu_sync_hwstate(vcpu); + if (kvm_vcpu_has_pmu(vcpu)) + kvm_pmu_sync_hwstate(vcpu); if (unlikely(!irqchip_in_kernel(vcpu->kvm))) kvm_timer_sync_user(vcpu); kvm_vgic_sync_hwstate(vcpu); @@ -1197,7 +1201,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) * that the vgic can properly sample the updated state of the * interrupt line. */ - kvm_pmu_sync_hwstate(vcpu); + if (kvm_vcpu_has_pmu(vcpu)) + kvm_pmu_sync_hwstate(vcpu); /* * Sync the vgic state before syncing the timer state because diff --git a/arch/arm64/kvm/emulate-nested.c b/arch/arm64/kvm/emulate-nested.c index 607d37bab70b..9293fb078fc6 100644 --- a/arch/arm64/kvm/emulate-nested.c +++ b/arch/arm64/kvm/emulate-nested.c @@ -2516,7 +2516,8 @@ void kvm_emulate_nested_eret(struct kvm_vcpu *vcpu) kvm_arch_vcpu_load(vcpu, smp_processor_id()); preempt_enable(); - kvm_pmu_nested_transition(vcpu); + if (kvm_vcpu_has_pmu(vcpu)) + kvm_pmu_nested_transition(vcpu); } static void kvm_inject_el2_exception(struct kvm_vcpu *vcpu, u64 esr_el2, @@ -2599,7 +2600,8 @@ static int kvm_inject_nested(struct kvm_vcpu *vcpu, u64 esr_el2, kvm_arch_vcpu_load(vcpu, smp_processor_id()); preempt_enable(); - kvm_pmu_nested_transition(vcpu); + if (kvm_vcpu_has_pmu(vcpu)) + kvm_pmu_nested_transition(vcpu); return 1; } diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index 6c5950b9ceac..98fdc65f5b24 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -150,9 +150,6 @@ static u64 kvm_pmu_get_pmc_value(struct kvm_pmc *pmc) */ u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u64 select_idx) { - if (!kvm_vcpu_has_pmu(vcpu)) - return 0; - return kvm_pmu_get_pmc_value(kvm_vcpu_idx_to_pmc(vcpu, select_idx)); } @@ -191,9 +188,6 @@ static void kvm_pmu_set_pmc_value(struct kvm_pmc *pmc, u64 val, bool force) */ void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu, u64 select_idx, u64 val) { - if (!kvm_vcpu_has_pmu(vcpu)) - return; - kvm_pmu_set_pmc_value(kvm_vcpu_idx_to_pmc(vcpu, select_idx), val, false); } @@ -350,7 +344,7 @@ void kvm_pmu_reprogram_counter_mask(struct kvm_vcpu *vcpu, u64 val) { int i; - if (!kvm_vcpu_has_pmu(vcpu) || !val) + if (!val) return; for (i = 0; i < KVM_ARMV8_PMU_MAX_COUNTERS; i++) { @@ -401,9 +395,6 @@ static void 
kvm_pmu_update_state(struct kvm_vcpu *vcpu) struct kvm_pmu *pmu = &vcpu->arch.pmu; bool overflow; - if (!kvm_vcpu_has_pmu(vcpu)) - return; - overflow = kvm_pmu_overflow_status(vcpu); if (pmu->irq_level == overflow) return; @@ -599,9 +590,6 @@ void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val) { int i; - if (!kvm_vcpu_has_pmu(vcpu)) - return; - /* Fixup PMCR_EL0 to reconcile the PMU version and the LP bit */ if (!kvm_has_feat(vcpu->kvm, ID_AA64DFR0_EL1, PMUVer, V3P5)) val &= ~ARMV8_PMU_PMCR_LP; @@ -766,9 +754,6 @@ void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data, struct kvm_pmc *pmc = kvm_vcpu_idx_to_pmc(vcpu, select_idx); u64 reg; - if (!kvm_vcpu_has_pmu(vcpu)) - return; - reg = counter_index_to_evtreg(pmc->idx); __vcpu_sys_reg(vcpu, reg) = data & kvm_pmu_evtyper_mask(vcpu->kvm); @@ -848,9 +833,6 @@ u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1) u64 val, mask = 0; int base, i, nr_events; - if (!kvm_vcpu_has_pmu(vcpu)) - return 0; - if (!pmceid1) { val = read_sysreg(pmceid0_el0); /* always support CHAIN */ @@ -900,9 +882,6 @@ void kvm_vcpu_reload_pmu(struct kvm_vcpu *vcpu) int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu) { - if (!kvm_vcpu_has_pmu(vcpu)) - return 0; - if (!vcpu->arch.pmu.created) return -EINVAL; @@ -1231,9 +1210,6 @@ void kvm_pmu_nested_transition(struct kvm_vcpu *vcpu) unsigned long mask; int i; - if (!kvm_vcpu_has_pmu(vcpu)) - return; - mask = __vcpu_sys_reg(vcpu, PMCNTENSET_EL0); for_each_set_bit(i, &mask, 32) { struct kvm_pmc *pmc = kvm_vcpu_idx_to_pmc(vcpu, i); diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index ffee72fd1273..e8e9c781a929 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1853,12 +1853,14 @@ static int set_id_aa64dfr0_el1(struct kvm_vcpu *vcpu, static u64 read_sanitised_id_dfr0_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { - u8 perfmon = pmuver_to_perfmon(kvm_arm_pmu_get_pmuver_limit()); + u8 perfmon; u64 val = read_sanitised_ftr_reg(SYS_ID_DFR0_EL1); val &= ~ID_DFR0_EL1_PerfMon_MASK; - if (kvm_vcpu_has_pmu(vcpu)) + if (kvm_vcpu_has_pmu(vcpu)) { + perfmon = pmuver_to_perfmon(kvm_arm_pmu_get_pmuver_limit()); val |= SYS_FIELD_PREP(ID_DFR0_EL1, PerfMon, perfmon); + } val = ID_REG_LIMIT_FIELD_ENUM(val, ID_DFR0_EL1, CopDbg, Debugv8p8); From 64074ca8ca92b8e6b6f76f5ea6982cd55bbc27bf Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Sat, 15 Mar 2025 18:12:12 +0900 Subject: [PATCH 73/76] KVM: arm64: PMU: Fix SET_ONE_REG for vPMC regs Reload the perf event when setting the vPMU counter (vPMC) registers (PMCCNTR_EL0 and PMEVCNTR_EL0). This is a change corresponding to commit 9228b26194d1 ("KVM: arm64: PMU: Fix GET_ONE_REG for vPMC regs to return the current value") but for SET_ONE_REG. Values of vPMC registers are saved in sysreg files on certain occasions. These saved values don't represent the current values of the vPMC registers if the perf events for the vPMCs count events after the save. The current values of those registers are the sum of the sysreg file value and the current perf event counter value. But, when userspace writes those registers (using KVM_SET_ONE_REG), KVM only updates the sysreg file value and leaves the current perf event counter value as is. It is also important to keep the correct state even if userspace writes them after first run, specifically when debugging Windows on QEMU with GDB; QEMU tries to write back all visible registers when resuming the VM execution with GDB, corrupting the PMU state. 
Windows always uses the PMU so this can cause adverse effects on that
particular OS.

Fix this by releasing the current perf event and triggering the
recreation of one with KVM_REQ_RELOAD_PMU.

Fixes: 051ff581ce70 ("arm64: KVM: Add access handler for event counter register")
Signed-off-by: Akihiko Odaki
Reviewed-by: Marc Zyngier
Link: https://lore.kernel.org/r/20250315-pmc-v5-3-ecee87dab216@daynix.com
Signed-off-by: Oliver Upton
---
 arch/arm64/kvm/pmu-emul.c | 13 +++++++++++++
 arch/arm64/kvm/sys_regs.c | 20 +++++++++++++++++++-
 include/kvm/arm_pmu.h     |  3 +++
 3 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c
index 98fdc65f5b24..593216bc14f0 100644
--- a/arch/arm64/kvm/pmu-emul.c
+++ b/arch/arm64/kvm/pmu-emul.c
@@ -191,6 +191,19 @@ void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu, u64 select_idx, u64 val)
 	kvm_pmu_set_pmc_value(kvm_vcpu_idx_to_pmc(vcpu, select_idx), val, false);
 }
 
+/**
+ * kvm_pmu_set_counter_value_user - set PMU counter value from user
+ * @vcpu: The vcpu pointer
+ * @select_idx: The counter index
+ * @val: The counter value
+ */
+void kvm_pmu_set_counter_value_user(struct kvm_vcpu *vcpu, u64 select_idx, u64 val)
+{
+	kvm_pmu_release_perf_event(kvm_vcpu_idx_to_pmc(vcpu, select_idx));
+	__vcpu_sys_reg(vcpu, counter_index_to_reg(select_idx)) = val;
+	kvm_make_request(KVM_REQ_RELOAD_PMU, vcpu);
+}
+
 /**
  * kvm_pmu_release_perf_event - remove the perf event
  * @pmc: The PMU counter pointer
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index e8e9c781a929..4d1ef47d0049 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -960,6 +960,22 @@ static int get_pmu_evcntr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
 	return 0;
 }
 
+static int set_pmu_evcntr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
+			  u64 val)
+{
+	u64 idx;
+
+	if (r->CRn == 9 && r->CRm == 13 && r->Op2 == 0)
+		/* PMCCNTR_EL0 */
+		idx = ARMV8_PMU_CYCLE_IDX;
+	else
+		/* PMEVCNTRn_EL0 */
+		idx = ((r->CRm & 3) << 3) | (r->Op2 & 7);
+
+	kvm_pmu_set_counter_value_user(vcpu, idx, val);
+	return 0;
+}
+
 static bool access_pmu_evcntr(struct kvm_vcpu *vcpu,
 			      struct sys_reg_params *p,
 			      const struct sys_reg_desc *r)
@@ -1238,6 +1254,7 @@ static int set_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
 #define PMU_PMEVCNTR_EL0(n)						\
 	{ PMU_SYS_REG(PMEVCNTRn_EL0(n)),				\
 	  .reset = reset_pmevcntr, .get_user = get_pmu_evcntr,		\
+	  .set_user = set_pmu_evcntr,					\
 	  .access = access_pmu_evcntr, .reg = (PMEVCNTR0_EL0 + n), }
 
 /* Macro to expand the PMEVTYPERn_EL0 register */
@@ -2835,7 +2852,8 @@ static const struct sys_reg_desc sys_reg_descs[] = {
 	  .access = access_pmceid, .reset = NULL },
 	{ PMU_SYS_REG(PMCCNTR_EL0),
 	  .access = access_pmu_evcntr, .reset = reset_unknown,
-	  .reg = PMCCNTR_EL0, .get_user = get_pmu_evcntr},
+	  .reg = PMCCNTR_EL0, .get_user = get_pmu_evcntr,
+	  .set_user = set_pmu_evcntr },
 	{ PMU_SYS_REG(PMXEVTYPER_EL0),
 	  .access = access_pmu_evtyper, .reset = NULL },
 	{ PMU_SYS_REG(PMXEVCNTR_EL0),
diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h
index 147bd3ee4f7b..a045284b3fd4 100644
--- a/include/kvm/arm_pmu.h
+++ b/include/kvm/arm_pmu.h
@@ -47,6 +47,7 @@ static __always_inline bool kvm_arm_support_pmu_v3(void)
 #define kvm_arm_pmu_irq_initialized(v)	((v)->arch.pmu.irq_num >= VGIC_NR_SGIS)
 u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u64 select_idx);
 void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu, u64 select_idx, u64 val);
+void kvm_pmu_set_counter_value_user(struct kvm_vcpu *vcpu, u64 select_idx, u64
val);
 u64 kvm_pmu_implemented_counter_mask(struct kvm_vcpu *vcpu);
 u64 kvm_pmu_accessible_counter_mask(struct kvm_vcpu *vcpu);
 u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1);
@@ -115,6 +116,8 @@ static inline u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu,
 }
 static inline void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu,
 					     u64 select_idx, u64 val) {}
+static inline void kvm_pmu_set_counter_value_user(struct kvm_vcpu *vcpu,
+						  u64 select_idx, u64 val) {}
 static inline u64 kvm_pmu_implemented_counter_mask(struct kvm_vcpu *vcpu)
 {
 	return 0;

From 1db4aaa05589454c674b1c117263997dcac515f3 Mon Sep 17 00:00:00 2001
From: Akihiko Odaki
Date: Sat, 15 Mar 2025 18:12:13 +0900
Subject: [PATCH 74/76] KVM: arm64: PMU: Reload when user modifies registers

Commit d0c94c49792c ("KVM: arm64: Restore PMU configuration on first
run") added the code to reload the PMU configuration on first run.

It is also important to keep the correct state even if system registers
are modified after the first run, specifically when debugging Windows
on QEMU with GDB; QEMU tries to write back all visible registers when
resuming the VM execution with GDB, corrupting the PMU state. Windows
always uses the PMU so this can cause adverse effects on that
particular OS.

The usual register writes and reset are already handled independently,
but register writes from userspace are not covered. Trigger the code to
reload the PMU configuration for them instead, so that PMU
configuration changes made by users will also be applied after the
first run.

Signed-off-by: Akihiko Odaki
Reviewed-by: Marc Zyngier
Link: https://lore.kernel.org/r/20250315-pmc-v5-4-ecee87dab216@daynix.com
Signed-off-by: Oliver Upton
---
 arch/arm64/kvm/pmu-emul.c | 3 ---
 arch/arm64/kvm/sys_regs.c | 4 ++++
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c
index 593216bc14f0..8e10124a7420 100644
--- a/arch/arm64/kvm/pmu-emul.c
+++ b/arch/arm64/kvm/pmu-emul.c
@@ -917,9 +917,6 @@ int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu)
 			return -EINVAL;
 	}
 
-	/* One-off reload of the PMU on first run */
-	kvm_make_request(KVM_REQ_RELOAD_PMU, vcpu);
-
 	return 0;
 }
 
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 4d1ef47d0049..727579acc7f6 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -1070,6 +1070,8 @@ static int set_pmreg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, u64 va
 	u64 mask = kvm_pmu_accessible_counter_mask(vcpu);
 
 	__vcpu_sys_reg(vcpu, r->reg) = val & mask;
+	kvm_make_request(KVM_REQ_RELOAD_PMU, vcpu);
+
 	return 0;
 }
 
@@ -1228,6 +1230,8 @@ static int set_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
 		val |= ARMV8_PMU_PMCR_LC;
 
 	__vcpu_sys_reg(vcpu, r->reg) = val;
+	kvm_make_request(KVM_REQ_RELOAD_PMU, vcpu);
+
 	return 0;
 }

From fe53538069bb4f625bc8734103ba044a83138fea Mon Sep 17 00:00:00 2001
From: Akihiko Odaki
Date: Sat, 15 Mar 2025 18:12:14 +0900
Subject: [PATCH 75/76] KVM: arm64: PMU: Reload when resetting

Replace kvm_pmu_vcpu_reset() with the generic PMU reloading mechanism
to ensure consistency with system registers and to reduce code size.
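For reference, the reload request queued here is serviced on the next
vcpu entry, along the lines of the request handling in
arch/arm64/kvm/arm.c (simplified sketch, not a verbatim hunk):

  if (kvm_check_request(KVM_REQ_RELOAD_PMU, vcpu))
          kvm_vcpu_reload_pmu(vcpu);

so reset-time and userspace-triggered reprogramming are folded into a
single pass before the guest runs again.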
Signed-off-by: Akihiko Odaki
Reviewed-by: Marc Zyngier
Link: https://lore.kernel.org/r/20250315-pmc-v5-5-ecee87dab216@daynix.com
Signed-off-by: Oliver Upton
---
 arch/arm64/kvm/pmu-emul.c | 14 --------------
 arch/arm64/kvm/reset.c    |  3 ---
 arch/arm64/kvm/sys_regs.c |  3 +++
 include/kvm/arm_pmu.h     |  2 --
 4 files changed, 3 insertions(+), 19 deletions(-)

diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c
index 8e10124a7420..aae5713d8993 100644
--- a/arch/arm64/kvm/pmu-emul.c
+++ b/arch/arm64/kvm/pmu-emul.c
@@ -254,20 +254,6 @@ void kvm_pmu_vcpu_init(struct kvm_vcpu *vcpu)
 		pmu->pmc[i].idx = i;
 }
 
-/**
- * kvm_pmu_vcpu_reset - reset pmu state for cpu
- * @vcpu: The vcpu pointer
- *
- */
-void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu)
-{
-	unsigned long mask = kvm_pmu_implemented_counter_mask(vcpu);
-	int i;
-
-	for_each_set_bit(i, &mask, 32)
-		kvm_pmu_stop_counter(kvm_vcpu_idx_to_pmc(vcpu, i));
-}
-
 /**
  * kvm_pmu_vcpu_destroy - free perf event of PMU for cpu
  * @vcpu: The vcpu pointer
diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c
index 803e11b0dc8f..f82fcc614e13 100644
--- a/arch/arm64/kvm/reset.c
+++ b/arch/arm64/kvm/reset.c
@@ -196,9 +196,6 @@ void kvm_reset_vcpu(struct kvm_vcpu *vcpu)
 	vcpu->arch.reset_state.reset = false;
 	spin_unlock(&vcpu->arch.mp_state_lock);
 
-	/* Reset PMU outside of the non-preemptible section */
-	kvm_pmu_vcpu_reset(vcpu);
-
 	preempt_disable();
 	loaded = (vcpu->cpu != -1);
 	if (loaded)
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 727579acc7f6..14f66c7a4545 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -4480,6 +4480,9 @@ void kvm_reset_sys_regs(struct kvm_vcpu *vcpu)
 	}
 
 	set_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags);
+
+	if (kvm_vcpu_has_pmu(vcpu))
+		kvm_make_request(KVM_REQ_RELOAD_PMU, vcpu);
 }
 
 /**
diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h
index a045284b3fd4..7eaac08e47f5 100644
--- a/include/kvm/arm_pmu.h
+++ b/include/kvm/arm_pmu.h
@@ -52,7 +52,6 @@ u64 kvm_pmu_implemented_counter_mask(struct kvm_vcpu *vcpu);
 u64 kvm_pmu_accessible_counter_mask(struct kvm_vcpu *vcpu);
 u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1);
 void kvm_pmu_vcpu_init(struct kvm_vcpu *vcpu);
-void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu);
 void kvm_pmu_vcpu_destroy(struct kvm_vcpu *vcpu);
 void kvm_pmu_reprogram_counter_mask(struct kvm_vcpu *vcpu, u64 val);
 void kvm_pmu_flush_hwstate(struct kvm_vcpu *vcpu);
@@ -127,7 +126,6 @@ static inline u64 kvm_pmu_accessible_counter_mask(struct kvm_vcpu *vcpu)
 	return 0;
 }
 static inline void kvm_pmu_vcpu_init(struct kvm_vcpu *vcpu) {}
-static inline void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu) {}
 static inline void kvm_pmu_vcpu_destroy(struct kvm_vcpu *vcpu) {}
 static inline void kvm_pmu_reprogram_counter_mask(struct kvm_vcpu *vcpu, u64 val) {}
 static inline void kvm_pmu_flush_hwstate(struct kvm_vcpu *vcpu) {}

From 250f25367b58d8c65a1b060a2dda037eea09a672 Mon Sep 17 00:00:00 2001
From: Will Deacon
Date: Fri, 14 Mar 2025 13:34:09 +0000
Subject: [PATCH 76/76] KVM: arm64: Tear down vGIC on failed vCPU creation

If kvm_arch_vcpu_create() fails to share the vCPU page with the
hypervisor, we propagate the error back to the ioctl but leave the
vGIC vCPU data initialised. Not only does this leak the corresponding
memory when the vCPU is destroyed, but it can also lead to a
use-after-free if the redistributor device handling tries to walk into
the vCPU.
Add the missing cleanup to kvm_arch_vcpu_create(), ensuring that the vGIC vCPU structures are destroyed on error. Cc: Cc: Marc Zyngier Cc: Oliver Upton Cc: Quentin Perret Signed-off-by: Will Deacon Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20250314133409.9123-1-will@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/arm.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 36d8dd80b431..fe1e1a1adc50 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -466,7 +466,11 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) if (err) return err; - return kvm_share_hyp(vcpu, vcpu + 1); + err = kvm_share_hyp(vcpu, vcpu + 1); + if (err) + kvm_vgic_vcpu_destroy(vcpu); + + return err; } void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
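As a closing note on the shape of this fix: it restores the usual
unwind pattern in kvm_arch_vcpu_create(), where every fallible step
after the vGIC initialisation must undo it on failure. Sketch of the
resulting flow (the preceding vGIC init step is paraphrased from the
surrounding context, not quoted from the diff):

  err = kvm_vgic_vcpu_init(vcpu);
  if (err)
          return err;

  err = kvm_share_hyp(vcpu, vcpu + 1);
  if (err)
          kvm_vgic_vcpu_destroy(vcpu);  /* unwind the vGIC init */

  return err;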