From fb3066904a4e2562cbcf71b26b0f0dc7a262280c Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 14 May 2025 11:34:44 +0100 Subject: [PATCH 01/20] arm64: sysreg: Add layout for VNCR_EL2 Now that we're about to emulate VNCR_EL2, we need its full layout. Add it to the sysreg file. Reviewed-by: Oliver Upton Link: https://lore.kernel.org/r/20250514103501.2225951-2-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/sysreg.h | 1 - arch/arm64/tools/sysreg | 6 ++++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 2639d3633073..b8842e092014 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -521,7 +521,6 @@ #define SYS_VTTBR_EL2 sys_reg(3, 4, 2, 1, 0) #define SYS_VTCR_EL2 sys_reg(3, 4, 2, 1, 2) -#define SYS_VNCR_EL2 sys_reg(3, 4, 2, 2, 0) #define SYS_HAFGRTR_EL2 sys_reg(3, 4, 3, 1, 6) #define SYS_SPSR_EL2 sys_reg(3, 4, 4, 0, 0) #define SYS_ELR_EL2 sys_reg(3, 4, 4, 0, 1) diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index bdf044c5d11b..5a3190600a0b 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -2971,6 +2971,12 @@ Sysreg SMCR_EL2 3 4 1 2 6 Fields SMCR_ELx EndSysreg +Sysreg VNCR_EL2 3 4 2 2 0 +Field 63:57 RESS +Field 56:12 BADDR +Res0 11:0 +EndSysreg + Sysreg GCSCR_EL2 3 4 2 5 0 Fields GCSCR_ELx EndSysreg From 469c4713d48028186e4bbf4b74ebce273af9a394 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 14 May 2025 11:34:45 +0100 Subject: [PATCH 02/20] KVM: arm64: nv: Allocate VNCR page when required If running a NV guest on an ARMv8.4-NV capable system, let's allocate an additional page that will be used by the hypervisor to fulfill system register accesses. Reviewed-by: Ganapatrao Kulkarni Reviewed-by: Oliver Upton Link: https://lore.kernel.org/r/20250514103501.2225951-3-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/nested.c | 10 ++++++++++ arch/arm64/kvm/reset.c | 1 + 2 files changed, 11 insertions(+) diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index 4a3fc11f7ecf..0513f1367219 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -55,6 +55,13 @@ int kvm_vcpu_init_nested(struct kvm_vcpu *vcpu) !cpus_have_final_cap(ARM64_HAS_HCR_NV1)) return -EINVAL; + if (!vcpu->arch.ctxt.vncr_array) + vcpu->arch.ctxt.vncr_array = (u64 *)__get_free_page(GFP_KERNEL_ACCOUNT | + __GFP_ZERO); + + if (!vcpu->arch.ctxt.vncr_array) + return -ENOMEM; + /* * Let's treat memory allocation failures as benign: If we fail to * allocate anything, return an error and keep the allocated array @@ -85,6 +92,9 @@ int kvm_vcpu_init_nested(struct kvm_vcpu *vcpu) for (int i = kvm->arch.nested_mmus_size; i < num_mmus; i++) kvm_free_stage2_pgd(&kvm->arch.nested_mmus[i]); + free_page((unsigned long)vcpu->arch.ctxt.vncr_array); + vcpu->arch.ctxt.vncr_array = NULL; + return ret; } diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index f82fcc614e13..965e1429b9f6 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -158,6 +158,7 @@ void kvm_arm_vcpu_destroy(struct kvm_vcpu *vcpu) if (sve_state) kvm_unshare_hyp(sve_state, sve_state + vcpu_sve_state_size(vcpu)); kfree(sve_state); + free_page((unsigned long)vcpu->arch.ctxt.vncr_array); kfree(vcpu->arch.ccsidr); } From 34fa9dece52757727ed2ffd5cf4713c6cd0b707f Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 14 May 2025 11:34:46 +0100 Subject: [PATCH 03/20] KVM: arm64: nv: Extract translation helper from the AT code The address 
translation infrastructure is currently pretty tied to the AT emulation. However, we also need to features that require the use of VAs, such as VNCR_EL2 (and maybe one of these days SPE), meaning that we need a slightly more generic infrastructure. Start this by introducing a new helper (__kvm_translate_va()) that performs a S1 walk for a given translation regime, EL and PAN settings. Reviewed-by: Oliver Upton Link: https://lore.kernel.org/r/20250514103501.2225951-4-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_nested.h | 54 ++++++++++++++++ arch/arm64/kvm/at.c | 96 +++++++++++------------------ 2 files changed, 91 insertions(+), 59 deletions(-) diff --git a/arch/arm64/include/asm/kvm_nested.h b/arch/arm64/include/asm/kvm_nested.h index 692f403c1896..c8a779b393c2 100644 --- a/arch/arm64/include/asm/kvm_nested.h +++ b/arch/arm64/include/asm/kvm_nested.h @@ -245,4 +245,58 @@ static inline unsigned int ps_to_output_size(unsigned int ps) } } +enum trans_regime { + TR_EL10, + TR_EL20, + TR_EL2, +}; + +struct s1_walk_info { + u64 baddr; + enum trans_regime regime; + unsigned int max_oa_bits; + unsigned int pgshift; + unsigned int txsz; + int sl; + bool as_el0; + bool hpd; + bool e0poe; + bool poe; + bool pan; + bool be; + bool s2; +}; + +struct s1_walk_result { + union { + struct { + u64 desc; + u64 pa; + s8 level; + u8 APTable; + bool UXNTable; + bool PXNTable; + bool uwxn; + bool uov; + bool ur; + bool uw; + bool ux; + bool pwxn; + bool pov; + bool pr; + bool pw; + bool px; + }; + struct { + u8 fst; + bool ptw; + bool s2; + }; + }; + bool failed; +}; + +int __kvm_translate_va(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, + struct s1_walk_result *wr, u64 va); + #endif /* __ARM64_KVM_NESTED_H */ diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c index 7a5267f43b51..71406908d4f4 100644 --- a/arch/arm64/kvm/at.c +++ b/arch/arm64/kvm/at.c @@ -10,56 +10,6 @@ #include #include -enum trans_regime { - TR_EL10, - TR_EL20, - TR_EL2, -}; - -struct s1_walk_info { - u64 baddr; - enum trans_regime regime; - unsigned int max_oa_bits; - unsigned int pgshift; - unsigned int txsz; - int sl; - bool hpd; - bool e0poe; - bool poe; - bool pan; - bool be; - bool s2; -}; - -struct s1_walk_result { - union { - struct { - u64 desc; - u64 pa; - s8 level; - u8 APTable; - bool UXNTable; - bool PXNTable; - bool uwxn; - bool uov; - bool ur; - bool uw; - bool ux; - bool pwxn; - bool pov; - bool pr; - bool pw; - bool px; - }; - struct { - u8 fst; - bool ptw; - bool s2; - }; - }; - bool failed; -}; - static void fail_s1_walk(struct s1_walk_result *wr, u8 fst, bool s1ptw) { wr->fst = fst; @@ -145,20 +95,15 @@ static void compute_s1poe(struct kvm_vcpu *vcpu, struct s1_walk_info *wi) } } -static int setup_s1_walk(struct kvm_vcpu *vcpu, u32 op, struct s1_walk_info *wi, +static int setup_s1_walk(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, struct s1_walk_result *wr, u64 va) { u64 hcr, sctlr, tcr, tg, ps, ia_bits, ttbr; unsigned int stride, x; - bool va55, tbi, lva, as_el0; + bool va55, tbi, lva; hcr = __vcpu_sys_reg(vcpu, HCR_EL2); - wi->regime = compute_translation_regime(vcpu, op); - as_el0 = (op == OP_AT_S1E0R || op == OP_AT_S1E0W); - wi->pan = (op == OP_AT_S1E1RP || op == OP_AT_S1E1WP) && - (*vcpu_cpsr(vcpu) & PSR_PAN_BIT); - va55 = va & BIT(55); if (wi->regime == TR_EL2 && va55) @@ -319,7 +264,7 @@ static int setup_s1_walk(struct kvm_vcpu *vcpu, u32 op, struct s1_walk_info *wi, /* R_BNDVG and following statements */ if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, E0PD, IMP) && - as_el0 && (tcr 
& (va55 ? TCR_E0PD1 : TCR_E0PD0))) + wi->as_el0 && (tcr & (va55 ? TCR_E0PD1 : TCR_E0PD0))) goto transfault_l0; /* AArch64.S1StartLevel() */ @@ -1155,7 +1100,12 @@ static u64 handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) bool perm_fail = false; int ret, idx; - ret = setup_s1_walk(vcpu, op, &wi, &wr, vaddr); + wi.regime = compute_translation_regime(vcpu, op); + wi.as_el0 = (op == OP_AT_S1E0R || op == OP_AT_S1E0W); + wi.pan = (op == OP_AT_S1E1RP || op == OP_AT_S1E1WP) && + (*vcpu_cpsr(vcpu) & PSR_PAN_BIT); + + ret = setup_s1_walk(vcpu, &wi, &wr, vaddr); if (ret) goto compute_par; @@ -1457,3 +1407,31 @@ void __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) par = compute_par_s12(vcpu, par, &out); vcpu_write_sys_reg(vcpu, par, PAR_EL1); } + +/* + * Translate a VA for a given EL in a given translation regime, with + * or without PAN. This requires wi->{regime, as_el0, pan} to be + * set. The rest of the wi and wr should be 0-initialised. + */ +int __kvm_translate_va(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, + struct s1_walk_result *wr, u64 va) +{ + int ret; + + ret = setup_s1_walk(vcpu, wi, wr, va); + if (ret) + return ret; + + if (wr->level == S1_MMU_DISABLED) { + wr->ur = wr->uw = wr->ux = true; + wr->pr = wr->pw = wr->px = true; + } else { + ret = walk_s1(vcpu, wi, wr, va); + if (ret) + return ret; + + compute_s1_permissions(vcpu, wi, wr); + } + + return 0; +} From a0ec2b822caba9ccdefa397918071e591b19e144 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 14 May 2025 11:34:47 +0100 Subject: [PATCH 04/20] KVM: arm64: nv: Snapshot S1 ASID tagging information during walk We currently completely ignore any sort of ASID tagging during a S1 walk, as AT doesn't care about it. However, such information is required if we are going to create anything that looks like a TLB from this walk. Let's capture it both the nG and ASID information while walking the page tables. Reviewed-by: Oliver Upton Link: https://lore.kernel.org/r/20250514103501.2225951-5-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_nested.h | 2 ++ arch/arm64/kvm/at.c | 27 +++++++++++++++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/arch/arm64/include/asm/kvm_nested.h b/arch/arm64/include/asm/kvm_nested.h index c8a779b393c2..4ba3780cb780 100644 --- a/arch/arm64/include/asm/kvm_nested.h +++ b/arch/arm64/include/asm/kvm_nested.h @@ -274,6 +274,8 @@ struct s1_walk_result { u64 pa; s8 level; u8 APTable; + bool nG; + u16 asid; bool UXNTable; bool PXNTable; bool uwxn; diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c index 71406908d4f4..da5359668b9c 100644 --- a/arch/arm64/kvm/at.c +++ b/arch/arm64/kvm/at.c @@ -414,6 +414,33 @@ static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, wr->pa = desc & GENMASK(47, va_bottom); wr->pa |= va & GENMASK_ULL(va_bottom - 1, 0); + wr->nG = (wi->regime != TR_EL2) && (desc & PTE_NG); + if (wr->nG) { + u64 asid_ttbr, tcr; + + switch (wi->regime) { + case TR_EL10: + tcr = vcpu_read_sys_reg(vcpu, TCR_EL1); + asid_ttbr = ((tcr & TCR_A1) ? + vcpu_read_sys_reg(vcpu, TTBR1_EL1) : + vcpu_read_sys_reg(vcpu, TTBR0_EL1)); + break; + case TR_EL20: + tcr = vcpu_read_sys_reg(vcpu, TCR_EL2); + asid_ttbr = ((tcr & TCR_A1) ? 
+ vcpu_read_sys_reg(vcpu, TTBR1_EL2) : + vcpu_read_sys_reg(vcpu, TTBR0_EL2)); + break; + default: + BUG(); + } + + wr->asid = FIELD_GET(TTBR_ASID_MASK, asid_ttbr); + if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR0_EL1, ASIDBITS, 16) || + !(tcr & TCR_ASID16)) + wr->asid &= GENMASK(7, 0); + } + return 0; addrsz: From 85bba00425ae0b4b30938ebfdde6d986e5423aff Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 14 May 2025 11:34:48 +0100 Subject: [PATCH 05/20] KVM: arm64: nv: Move TLBI range decoding to a helper As we are about to expand out TLB invalidation capabilities to support recursive virtualisation, move the decoding of a TLBI by range into a helper that returns the base, the range and the ASID. Reviewed-by: Oliver Upton Link: https://lore.kernel.org/r/20250514103501.2225951-6-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_nested.h | 32 +++++++++++++++++++++++++++++ arch/arm64/kvm/sys_regs.c | 24 ++-------------------- 2 files changed, 34 insertions(+), 22 deletions(-) diff --git a/arch/arm64/include/asm/kvm_nested.h b/arch/arm64/include/asm/kvm_nested.h index 4ba3780cb780..9d56fd946e5e 100644 --- a/arch/arm64/include/asm/kvm_nested.h +++ b/arch/arm64/include/asm/kvm_nested.h @@ -231,6 +231,38 @@ static inline u64 kvm_encode_nested_level(struct kvm_s2_trans *trans) shift; \ }) +static inline u64 decode_range_tlbi(u64 val, u64 *range, u16 *asid) +{ + u64 base, tg, num, scale; + int shift; + + tg = FIELD_GET(GENMASK(47, 46), val); + + switch(tg) { + case 1: + shift = 12; + break; + case 2: + shift = 14; + break; + case 3: + default: /* IMPDEF: handle tg==0 as 64k */ + shift = 16; + break; + } + + base = (val & GENMASK(36, 0)) << shift; + + if (asid) + *asid = FIELD_GET(TLBIR_ASID_MASK, val); + + scale = FIELD_GET(GENMASK(45, 44), val); + num = FIELD_GET(GENMASK(43, 39), val); + *range = __TLBI_RANGE_PAGES(num, scale) << shift; + + return base; +} + static inline unsigned int ps_to_output_size(unsigned int ps) { switch (ps) { diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 005ad28f7306..26e02e172391 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -3546,8 +3546,7 @@ static bool handle_ripas2e1is(struct kvm_vcpu *vcpu, struct sys_reg_params *p, { u32 sys_encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2); u64 vttbr = vcpu_read_sys_reg(vcpu, VTTBR_EL2); - u64 base, range, tg, num, scale; - int shift; + u64 base, range; if (!kvm_supported_tlbi_ipas2_op(vcpu, sys_encoding)) return undef_access(vcpu, p, r); @@ -3557,26 +3556,7 @@ static bool handle_ripas2e1is(struct kvm_vcpu *vcpu, struct sys_reg_params *p, * of the guest's S2 (different base granule size, for example), we * decide to ignore TTL and only use the described range. 
*/ - tg = FIELD_GET(GENMASK(47, 46), p->regval); - scale = FIELD_GET(GENMASK(45, 44), p->regval); - num = FIELD_GET(GENMASK(43, 39), p->regval); - base = p->regval & GENMASK(36, 0); - - switch(tg) { - case 1: - shift = 12; - break; - case 2: - shift = 14; - break; - case 3: - default: /* IMPDEF: handle tg==0 as 64k */ - shift = 16; - break; - } - - base <<= shift; - range = __TLBI_RANGE_PAGES(num, scale) << shift; + base = decode_range_tlbi(p->regval, &range, NULL); kvm_s2_mmu_iterate_by_vmid(vcpu->kvm, get_vmid(vttbr), &(union tlbi_info) { From bd914a981446df475be27ef9c5e86961e6f39c5a Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 14 May 2025 11:34:49 +0100 Subject: [PATCH 06/20] KVM: arm64: nv: Don't adjust PSTATE.M when L2 is nesting We currently check for HCR_EL2.NV being set to decide whether we need to repaint PSTATE.M to say EL2 instead of EL1 on exit. However, this isn't correct when L2 is itself a hypervisor, and that L1 as set its own HCR_EL2.NV. That's because we "flatten" the state and inherit parts of the guest's own setup. In that case, we shouldn't adjust PSTATE.M, as this is really EL1 for both us and the guest. Instead of trying to try and work out how we ended-up with HCR_EL2.NV being set by introspecting both the host and guest states, use a per-CPU flag to remember the context (HYP or not), and use that information to decide whether PSTATE needs tweaking. Reviewed-by: Oliver Upton Link: https://lore.kernel.org/r/20250514103501.2225951-7-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 1 + arch/arm64/kvm/hyp/vhe/switch.c | 21 +++++++++++++++++++-- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index e98cfe7855a6..12adab97e7f2 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -654,6 +654,7 @@ struct kvm_host_data { #define KVM_HOST_DATA_FLAG_HAS_TRBE 1 #define KVM_HOST_DATA_FLAG_TRBE_ENABLED 4 #define KVM_HOST_DATA_FLAG_EL1_TRACING_CONFIGURED 5 +#define KVM_HOST_DATA_FLAG_VCPU_IN_HYP_CONTEXT 6 unsigned long flags; struct kvm_cpu_context host_ctxt; diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index 731a0378ed13..220dee8a45e0 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -53,13 +53,23 @@ static u64 __compute_hcr(struct kvm_vcpu *vcpu) if (!vcpu_has_nv(vcpu)) return hcr; + /* + * We rely on the invariant that a vcpu entered from HYP + * context must also exit in the same context, as only an ERET + * instruction can kick us out of it, and we obviously trap + * that sucker. PSTATE.M will get fixed-up on exit. + */ if (is_hyp_ctxt(vcpu)) { + host_data_set_flag(VCPU_IN_HYP_CONTEXT); + hcr |= HCR_NV | HCR_NV2 | HCR_AT | HCR_TTLB; if (!vcpu_el2_e2h_is_set(vcpu)) hcr |= HCR_NV1; write_sysreg_s(vcpu->arch.ctxt.vncr_array, SYS_VNCR_EL2); + } else { + host_data_clear_flag(VCPU_IN_HYP_CONTEXT); } return hcr | (__vcpu_sys_reg(vcpu, HCR_EL2) & ~NV_HCR_GUEST_EXCLUDE); @@ -568,9 +578,12 @@ static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code) /* * If we were in HYP context on entry, adjust the PSTATE view - * so that the usual helpers work correctly. + * so that the usual helpers work correctly. This enforces our + * invariant that the guest's HYP context status is preserved + * across a run. 
*/ - if (vcpu_has_nv(vcpu) && (read_sysreg(hcr_el2) & HCR_NV)) { + if (vcpu_has_nv(vcpu) && + unlikely(host_data_test_flag(VCPU_IN_HYP_CONTEXT))) { u64 mode = *vcpu_cpsr(vcpu) & (PSR_MODE_MASK | PSR_MODE32_BIT); switch (mode) { @@ -586,6 +599,10 @@ static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code) *vcpu_cpsr(vcpu) |= mode; } + /* Apply extreme paranoia! */ + BUG_ON(vcpu_has_nv(vcpu) && + !!host_data_test_flag(VCPU_IN_HYP_CONTEXT) != is_hyp_ctxt(vcpu)); + return __fixup_guest_exit(vcpu, exit_code, hyp_exit_handlers); } From ea8d3cf46d57bc1e131ca66ebc3e9aabe40234ef Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 14 May 2025 11:34:50 +0100 Subject: [PATCH 07/20] KVM: arm64: nv: Add pseudo-TLB backing VNCR_EL2 FEAT_NV2 introduces an interesting problem for NV, as VNCR_EL2.BADDR is a virtual address in the EL2&0 (or EL2, but we thankfully ignore this) translation regime. As we need to replicate such mapping in the real EL2, it means that we need to remember that there is such a translation, and that any TLBI affecting EL2 can possibly affect this translation. It also means that any invalidation driven by an MMU notifier must be able to shoot down any such mapping. All in all, we need a data structure that represents this mapping, and that is extremely close to a TLB. Given that we can only use one of those per vcpu at any given time, we only allocate one. No effort is made to keep that structure small. If we need to start caching multiple of them, we may want to revisit that design point. But for now, it is kept simple so that we can reason about it. Oh, and add a braindump of how things are supposed to work, because I will definitely page this out at some point. Yes, pun intended. Reviewed-by: Oliver Upton Link: https://lore.kernel.org/r/20250514103501.2225951-8-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 5 ++ arch/arm64/include/asm/kvm_nested.h | 3 ++ arch/arm64/kvm/arm.c | 4 ++ arch/arm64/kvm/nested.c | 72 +++++++++++++++++++++++++++++ arch/arm64/kvm/reset.c | 1 + 5 files changed, 85 insertions(+) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 12adab97e7f2..c762919a2072 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -731,6 +731,8 @@ struct vcpu_reset_state { bool reset; }; +struct vncr_tlb; + struct kvm_vcpu_arch { struct kvm_cpu_context ctxt; @@ -825,6 +827,9 @@ struct kvm_vcpu_arch { /* Per-vcpu CCSIDR override or NULL */ u32 *ccsidr; + + /* Per-vcpu TLB for VNCR_EL2 -- NULL when !NV */ + struct vncr_tlb *vncr_tlb; }; /* diff --git a/arch/arm64/include/asm/kvm_nested.h b/arch/arm64/include/asm/kvm_nested.h index 9d56fd946e5e..98b3d6b58966 100644 --- a/arch/arm64/include/asm/kvm_nested.h +++ b/arch/arm64/include/asm/kvm_nested.h @@ -333,4 +333,7 @@ struct s1_walk_result { int __kvm_translate_va(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, struct s1_walk_result *wr, u64 va); +/* VNCR management */ +int kvm_vcpu_allocate_vncr_tlb(struct kvm_vcpu *vcpu); + #endif /* __ARM64_KVM_NESTED_H */ diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 68fec8c95fee..528743587360 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -843,6 +843,10 @@ int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu) return ret; if (vcpu_has_nv(vcpu)) { + ret = kvm_vcpu_allocate_vncr_tlb(vcpu); + if (ret) + return ret; + ret = kvm_vgic_vcpu_nv_init(vcpu); if (ret) return ret; diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c 
index 0513f1367219..806e9cf6049a 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -16,6 +16,24 @@ #include "sys_regs.h" +struct vncr_tlb { + /* The guest's VNCR_EL2 */ + u64 gva; + struct s1_walk_info wi; + struct s1_walk_result wr; + + u64 hpa; + + /* -1 when not mapped on a CPU */ + int cpu; + + /* + * true if the TLB is valid. Can only be changed with the + * mmu_lock held. + */ + bool valid; +}; + /* * Ratio of live shadow S2 MMU per vcpu. This is a trade-off between * memory usage and potential number of different sets of S2 PTs in @@ -811,6 +829,60 @@ void kvm_arch_flush_shadow_all(struct kvm *kvm) kvm_uninit_stage2_mmu(kvm); } +/* + * Dealing with VNCR_EL2 exposed by the *guest* is a complicated matter: + * + * - We introduce an internal representation of a vcpu-private TLB, + * representing the mapping between the guest VA contained in VNCR_EL2, + * the IPA the guest's EL2 PTs point to, and the actual PA this lives at. + * + * - On translation fault from a nested VNCR access, we create such a TLB. + * If there is no mapping to describe, the guest inherits the fault. + * Crucially, no actual mapping is done at this stage. + * + * - On vcpu_load() in a non-HYP context with HCR_EL2.NV==1, if the above + * TLB exists, we map it in the fixmap for this CPU, and run with it. We + * have to respect the permissions dictated by the guest, but not the + * memory type (FWB is a must). + * + * - Note that we usually don't do a vcpu_load() on the back of a fault + * (unless we are preempted), so the resolution of a translation fault + * must go via a request that will map the VNCR page in the fixmap. + * vcpu_load() might as well use the same mechanism. + * + * - On vcpu_put() in a non-HYP context with HCR_EL2.NV==1, if the TLB was + * mapped, we unmap it. Yes it is that simple. The TLB still exists + * though, and may be reused at a later load. + * + * - On permission fault, we simply forward the fault to the guest's EL2. + * Get out of my way. + * + * - On any TLBI for the EL2&0 translation regime, we must find any TLB that + * intersects with the TLBI request, invalidate it, and unmap the page + * from the fixmap. Because we need to look at all the vcpu-private TLBs, + * this requires some wide-ranging locking to ensure that nothing races + * against it. This may require some refcounting to avoid the search when + * no such TLB is present. + * + * - On MMU notifiers, we must invalidate our TLB in a similar way, but + * looking at the IPA instead. The funny part is that there may not be a + * stage-2 mapping for this page if L1 hasn't accessed it using LD/ST + * instructions. + */ + +int kvm_vcpu_allocate_vncr_tlb(struct kvm_vcpu *vcpu) +{ + if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR4_EL1, NV_frac, NV2_ONLY)) + return 0; + + vcpu->arch.vncr_tlb = kzalloc(sizeof(*vcpu->arch.vncr_tlb), + GFP_KERNEL_ACCOUNT); + if (!vcpu->arch.vncr_tlb) + return -ENOMEM; + + return 0; +} + /* * Our emulated CPU doesn't support all the possible features. 
For the * sake of simplicity (and probably mental sanity), wipe out a number diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index 965e1429b9f6..959532422d3a 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -159,6 +159,7 @@ void kvm_arm_vcpu_destroy(struct kvm_vcpu *vcpu) kvm_unshare_hyp(sve_state, sve_state + vcpu_sve_state_size(vcpu)); kfree(sve_state); free_page((unsigned long)vcpu->arch.ctxt.vncr_array); + kfree(vcpu->arch.vncr_tlb); kfree(vcpu->arch.ccsidr); } From 6fb75733f148ecd6c1898df0098b37f70a80f002 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 14 May 2025 11:34:51 +0100 Subject: [PATCH 08/20] KVM: arm64: nv: Add userspace and guest handling of VNCR_EL2 Plug VNCR_EL2 in the vcpu_sysreg enum, define its RES0/RES1 bits, and make it accessible to userspace when the VM is configured to support FEAT_NV2. Reviewed-by: Oliver Upton Link: https://lore.kernel.org/r/20250514103501.2225951-9-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 2 ++ arch/arm64/kvm/nested.c | 3 +++ arch/arm64/kvm/sys_regs.c | 24 +++++++++++++++--------- 3 files changed, 20 insertions(+), 9 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index c762919a2072..f5ac454dcf66 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -562,6 +562,8 @@ enum vcpu_sysreg { VNCR(HDFGWTR_EL2), VNCR(HAFGRTR_EL2), + VNCR(VNCR_EL2), + VNCR(CNTVOFF_EL2), VNCR(CNTV_CVAL_EL0), VNCR(CNTV_CTL_EL0), diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index 806e9cf6049a..32ea6e362bab 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -1400,6 +1400,9 @@ int kvm_init_nv_sysregs(struct kvm_vcpu *vcpu) res0 |= ICH_HCR_EL2_DVIM | ICH_HCR_EL2_vSGIEOICount; set_sysreg_masks(kvm, ICH_HCR_EL2, res0, res1); + /* VNCR_EL2 */ + set_sysreg_masks(kvm, VNCR_EL2, VNCR_EL2_RES0, VNCR_EL2_RES1); + out: for (enum vcpu_sysreg sr = __SANITISED_REG_START__; sr < NR_SYS_REGS; sr++) (void)__vcpu_sys_reg(vcpu, sr); diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 26e02e172391..204470283ccc 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -2281,15 +2281,6 @@ static bool bad_redir_trap(struct kvm_vcpu *vcpu, "trap of EL2 register redirected to EL1"); } -#define EL2_REG(name, acc, rst, v) { \ - SYS_DESC(SYS_##name), \ - .access = acc, \ - .reset = rst, \ - .reg = name, \ - .visibility = el2_visibility, \ - .val = v, \ -} - #define EL2_REG_FILTERED(name, acc, rst, v, filter) { \ SYS_DESC(SYS_##name), \ .access = acc, \ @@ -2299,6 +2290,9 @@ static bool bad_redir_trap(struct kvm_vcpu *vcpu, .val = v, \ } +#define EL2_REG(name, acc, rst, v) \ + EL2_REG_FILTERED(name, acc, rst, v, el2_visibility) + #define EL2_REG_VNCR(name, rst, v) EL2_REG(name, bad_vncr_trap, rst, v) #define EL2_REG_REDIR(name, rst, v) EL2_REG(name, bad_redir_trap, rst, v) @@ -2446,6 +2440,16 @@ static unsigned int sve_el2_visibility(const struct kvm_vcpu *vcpu, return __el2_visibility(vcpu, rd, sve_visibility); } +static unsigned int vncr_el2_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + if (el2_visibility(vcpu, rd) == 0 && + kvm_has_feat(vcpu->kvm, ID_AA64MMFR4_EL1, NV_frac, NV2_ONLY)) + return 0; + + return REG_HIDDEN; +} + static bool access_zcr_el2(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) @@ -3263,6 +3267,8 @@ static const struct sys_reg_desc sys_reg_descs[] = { tcr2_el2_visibility), 
EL2_REG_VNCR(VTTBR_EL2, reset_val, 0), EL2_REG_VNCR(VTCR_EL2, reset_val, 0), + EL2_REG_FILTERED(VNCR_EL2, bad_vncr_trap, reset_val, 0, + vncr_el2_visibility), { SYS_DESC(SYS_DACR32_EL2), undef_access, reset_unknown, DACR32_EL2 }, EL2_REG_VNCR(HDFGRTR_EL2, reset_val, 0), From 069a05e53549685d2b5e54ceb51db1fd04aa50d7 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 14 May 2025 11:34:52 +0100 Subject: [PATCH 09/20] KVM: arm64: nv: Handle VNCR_EL2-triggered faults As VNCR_EL2.BADDR contains a VA, it is bound to trigger faults. These faults can have multiple source: - We haven't mapped anything on the host: we need to compute the resulting translation, populate a TLB, and eventually map the corresponding page - The permissions are out of whack: we need to tell the guest about this state of affairs Note that the kernel doesn't support S1POE for itself yet, so the particular case of a VNCR page mapped with no permissions or with write-only permissions is not correctly handled yet. Reviewed-by: Oliver Upton Link: https://lore.kernel.org/r/20250514103501.2225951-10-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/esr.h | 2 + arch/arm64/include/asm/kvm_host.h | 1 + arch/arm64/include/asm/kvm_nested.h | 1 + arch/arm64/kvm/handle_exit.c | 1 + arch/arm64/kvm/nested.c | 159 ++++++++++++++++++++++++++++ 5 files changed, 164 insertions(+) diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h index e4f77757937e..fb4e119e1aaf 100644 --- a/arch/arm64/include/asm/esr.h +++ b/arch/arm64/include/asm/esr.h @@ -99,6 +99,8 @@ #define ESR_ELx_AET_CE (UL(6) << ESR_ELx_AET_SHIFT) /* Shared ISS field definitions for Data/Instruction aborts */ +#define ESR_ELx_VNCR_SHIFT (13) +#define ESR_ELx_VNCR (UL(1) << ESR_ELx_VNCR_SHIFT) #define ESR_ELx_SET_SHIFT (11) #define ESR_ELx_SET_MASK (UL(3) << ESR_ELx_SET_SHIFT) #define ESR_ELx_FnV_SHIFT (10) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index f5ac454dcf66..8fb1c8d5fd14 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -53,6 +53,7 @@ #define KVM_REQ_RESYNC_PMU_EL0 KVM_ARCH_REQ(7) #define KVM_REQ_NESTED_S2_UNMAP KVM_ARCH_REQ(8) #define KVM_REQ_GUEST_HYP_IRQ_PENDING KVM_ARCH_REQ(9) +#define KVM_REQ_MAP_L1_VNCR_EL2 KVM_ARCH_REQ(10) #define KVM_DIRTY_LOG_MANUAL_CAPS (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | \ KVM_DIRTY_LOG_INITIALLY_SET) diff --git a/arch/arm64/include/asm/kvm_nested.h b/arch/arm64/include/asm/kvm_nested.h index 98b3d6b58966..be4be8ec49d9 100644 --- a/arch/arm64/include/asm/kvm_nested.h +++ b/arch/arm64/include/asm/kvm_nested.h @@ -335,5 +335,6 @@ int __kvm_translate_va(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, /* VNCR management */ int kvm_vcpu_allocate_vncr_tlb(struct kvm_vcpu *vcpu); +int kvm_handle_vncr_abort(struct kvm_vcpu *vcpu); #endif /* __ARM64_KVM_NESTED_H */ diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index b73dc26bc44b..9700627dd85f 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -317,6 +317,7 @@ static exit_handle_fn arm_exit_handlers[] = { [ESR_ELx_EC_ERET] = kvm_handle_eret, [ESR_ELx_EC_IABT_LOW] = kvm_handle_guest_abort, [ESR_ELx_EC_DABT_LOW] = kvm_handle_guest_abort, + [ESR_ELx_EC_DABT_CUR] = kvm_handle_vncr_abort, [ESR_ELx_EC_SOFTSTP_LOW]= kvm_handle_guest_debug, [ESR_ELx_EC_WATCHPT_LOW]= kvm_handle_guest_debug, [ESR_ELx_EC_BREAKPT_LOW]= kvm_handle_guest_debug, diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index 32ea6e362bab..d53c22f51009 
100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -883,6 +883,165 @@ int kvm_vcpu_allocate_vncr_tlb(struct kvm_vcpu *vcpu) return 0; } +static u64 read_vncr_el2(struct kvm_vcpu *vcpu) +{ + return (u64)sign_extend64(__vcpu_sys_reg(vcpu, VNCR_EL2), 48); +} + +static int kvm_translate_vncr(struct kvm_vcpu *vcpu) +{ + bool write_fault, writable; + unsigned long mmu_seq; + struct vncr_tlb *vt; + struct page *page; + u64 va, pfn, gfn; + int ret; + + vt = vcpu->arch.vncr_tlb; + + vt->wi = (struct s1_walk_info) { + .regime = TR_EL20, + .as_el0 = false, + .pan = false, + }; + vt->wr = (struct s1_walk_result){}; + vt->valid = false; + + guard(srcu)(&vcpu->kvm->srcu); + + va = read_vncr_el2(vcpu); + + ret = __kvm_translate_va(vcpu, &vt->wi, &vt->wr, va); + if (ret) + return ret; + + write_fault = kvm_is_write_fault(vcpu); + + mmu_seq = vcpu->kvm->mmu_invalidate_seq; + smp_rmb(); + + gfn = vt->wr.pa >> PAGE_SHIFT; + pfn = kvm_faultin_pfn(vcpu, gfn, write_fault, &writable, &page); + if (is_error_noslot_pfn(pfn) || (write_fault && !writable)) + return -EFAULT; + + scoped_guard(write_lock, &vcpu->kvm->mmu_lock) { + if (mmu_invalidate_retry(vcpu->kvm, mmu_seq)) + return -EAGAIN; + + vt->gva = va; + vt->hpa = pfn << PAGE_SHIFT; + vt->valid = true; + vt->cpu = -1; + + kvm_make_request(KVM_REQ_MAP_L1_VNCR_EL2, vcpu); + } + + kvm_release_faultin_page(vcpu->kvm, page, false, vt->wr.pw); + if (vt->wr.pw) + mark_page_dirty(vcpu->kvm, gfn); + + return 0; +} + +static void inject_vncr_perm(struct kvm_vcpu *vcpu) +{ + struct vncr_tlb *vt = vcpu->arch.vncr_tlb; + u64 esr = kvm_vcpu_get_esr(vcpu); + + /* Adjust the fault level to reflect that of the guest's */ + esr &= ~ESR_ELx_FSC; + esr |= FIELD_PREP(ESR_ELx_FSC, + ESR_ELx_FSC_PERM_L(vt->wr.level)); + + kvm_inject_nested_sync(vcpu, esr); +} + +static bool kvm_vncr_tlb_lookup(struct kvm_vcpu *vcpu) +{ + struct vncr_tlb *vt = vcpu->arch.vncr_tlb; + + lockdep_assert_held_read(&vcpu->kvm->mmu_lock); + + if (!vt->valid) + return false; + + if (read_vncr_el2(vcpu) != vt->gva) + return false; + + if (vt->wr.nG) { + u64 tcr = vcpu_read_sys_reg(vcpu, TCR_EL2); + u64 ttbr = ((tcr & TCR_A1) ? + vcpu_read_sys_reg(vcpu, TTBR1_EL2) : + vcpu_read_sys_reg(vcpu, TTBR0_EL2)); + u16 asid; + + asid = FIELD_GET(TTBR_ASID_MASK, ttbr); + if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR0_EL1, ASIDBITS, 16) || + !(tcr & TCR_ASID16)) + asid &= GENMASK(7, 0); + + return asid != vt->wr.asid; + } + + return true; +} + +int kvm_handle_vncr_abort(struct kvm_vcpu *vcpu) +{ + struct vncr_tlb *vt = vcpu->arch.vncr_tlb; + u64 esr = kvm_vcpu_get_esr(vcpu); + + BUG_ON(!(esr & ESR_ELx_VNCR_SHIFT)); + + if (esr_fsc_is_permission_fault(esr)) { + inject_vncr_perm(vcpu); + } else if (esr_fsc_is_translation_fault(esr)) { + bool valid; + int ret; + + scoped_guard(read_lock, &vcpu->kvm->mmu_lock) + valid = kvm_vncr_tlb_lookup(vcpu); + + if (!valid) + ret = kvm_translate_vncr(vcpu); + else + ret = -EPERM; + + switch (ret) { + case -EAGAIN: + case -ENOMEM: + /* Let's try again... */ + break; + case -EFAULT: + case -EINVAL: + case -ENOENT: + case -EACCES: + /* + * Translation failed, inject the corresponding + * exception back to EL2. 
+ */ + BUG_ON(!vt->wr.failed); + + esr &= ~ESR_ELx_FSC; + esr |= FIELD_PREP(ESR_ELx_FSC, vt->wr.fst); + + kvm_inject_nested_sync(vcpu, esr); + break; + case -EPERM: + /* Hack to deal with POE until we get kernel support */ + inject_vncr_perm(vcpu); + break; + case 0: + break; + } + } else { + WARN_ONCE(1, "Unhandled VNCR abort, ESR=%llx\n", esr); + } + + return 1; +} + /* * Our emulated CPU doesn't support all the possible features. For the * sake of simplicity (and probably mental sanity), wipe out a number From 2a359e072596fcb2e9e85017a865e3618a2fe5b5 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 14 May 2025 11:34:53 +0100 Subject: [PATCH 10/20] KVM: arm64: nv: Handle mapping of VNCR_EL2 at EL2 Now that we can handle faults triggered through VNCR_EL2, we need to map the corresponding page at EL2. But where, you'll ask? Since each CPU in the system can run a vcpu, we need a per-CPU mapping. For that, we carve a NR_CPUS range in the fixmap, giving us a per-CPU va at which to map the guest's VNCR's page. The mapping occurs both on vcpu load and on the back of a fault, both generating a request that will take care of the mapping. That mapping will also get dropped on vcpu put. Yes, this is a bit heavy handed, but it is simple. Eventually, we may want to have a per-VM, per-CPU mapping, which would avoid all the TLBI overhead. Reviewed-by: Oliver Upton Link: https://lore.kernel.org/r/20250514103501.2225951-11-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/fixmap.h | 6 ++ arch/arm64/include/asm/kvm_host.h | 1 + arch/arm64/include/asm/kvm_nested.h | 7 +++ arch/arm64/kvm/nested.c | 98 ++++++++++++++++++++++++++--- 4 files changed, 103 insertions(+), 9 deletions(-) diff --git a/arch/arm64/include/asm/fixmap.h b/arch/arm64/include/asm/fixmap.h index 87e307804b99..635a43c4ec85 100644 --- a/arch/arm64/include/asm/fixmap.h +++ b/arch/arm64/include/asm/fixmap.h @@ -48,6 +48,12 @@ enum fixed_addresses { FIX_EARLYCON_MEM_BASE, FIX_TEXT_POKE0, +#ifdef CONFIG_KVM + /* One slot per CPU, mapping the guest's VNCR page at EL2. 
*/ + FIX_VNCR_END, + FIX_VNCR = FIX_VNCR_END + NR_CPUS, +#endif + #ifdef CONFIG_ACPI_APEI_GHES /* Used for GHES mapping from assorted contexts */ FIX_APEI_GHES_IRQ, diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 8fb1c8d5fd14..d87fed0b4833 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -658,6 +658,7 @@ struct kvm_host_data { #define KVM_HOST_DATA_FLAG_TRBE_ENABLED 4 #define KVM_HOST_DATA_FLAG_EL1_TRACING_CONFIGURED 5 #define KVM_HOST_DATA_FLAG_VCPU_IN_HYP_CONTEXT 6 +#define KVM_HOST_DATA_FLAG_L1_VNCR_MAPPED 7 unsigned long flags; struct kvm_cpu_context host_ctxt; diff --git a/arch/arm64/include/asm/kvm_nested.h b/arch/arm64/include/asm/kvm_nested.h index be4be8ec49d9..ea50cad1a6a2 100644 --- a/arch/arm64/include/asm/kvm_nested.h +++ b/arch/arm64/include/asm/kvm_nested.h @@ -337,4 +337,11 @@ int __kvm_translate_va(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, int kvm_vcpu_allocate_vncr_tlb(struct kvm_vcpu *vcpu); int kvm_handle_vncr_abort(struct kvm_vcpu *vcpu); +#define vncr_fixmap(c) \ + ({ \ + u32 __c = (c); \ + BUG_ON(__c >= NR_CPUS); \ + (FIX_VNCR - __c); \ + }) + #endif /* __ARM64_KVM_NESTED_H */ diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index d53c22f51009..e81a0fcfb1f3 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -8,6 +8,7 @@ #include #include +#include #include #include #include @@ -704,23 +705,35 @@ void kvm_init_nested_s2_mmu(struct kvm_s2_mmu *mmu) void kvm_vcpu_load_hw_mmu(struct kvm_vcpu *vcpu) { /* - * The vCPU kept its reference on the MMU after the last put, keep - * rolling with it. + * If the vCPU kept its reference on the MMU after the last put, + * keep rolling with it. */ - if (vcpu->arch.hw_mmu) - return; - if (is_hyp_ctxt(vcpu)) { - vcpu->arch.hw_mmu = &vcpu->kvm->arch.mmu; + if (!vcpu->arch.hw_mmu) + vcpu->arch.hw_mmu = &vcpu->kvm->arch.mmu; } else { - write_lock(&vcpu->kvm->mmu_lock); - vcpu->arch.hw_mmu = get_s2_mmu_nested(vcpu); - write_unlock(&vcpu->kvm->mmu_lock); + if (!vcpu->arch.hw_mmu) { + scoped_guard(write_lock, &vcpu->kvm->mmu_lock) + vcpu->arch.hw_mmu = get_s2_mmu_nested(vcpu); + } + + if (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_NV) + kvm_make_request(KVM_REQ_MAP_L1_VNCR_EL2, vcpu); } } void kvm_vcpu_put_hw_mmu(struct kvm_vcpu *vcpu) { + /* Unconditionally drop the VNCR mapping if we have one */ + if (host_data_test_flag(L1_VNCR_MAPPED)) { + BUG_ON(vcpu->arch.vncr_tlb->cpu != smp_processor_id()); + BUG_ON(is_hyp_ctxt(vcpu)); + + clear_fixmap(vncr_fixmap(vcpu->arch.vncr_tlb->cpu)); + vcpu->arch.vncr_tlb->cpu = -1; + host_data_clear_flag(L1_VNCR_MAPPED); + } + /* * Keep a reference on the associated stage-2 MMU if the vCPU is * scheduling out and not in WFI emulation, suggesting it is likely to @@ -1042,6 +1055,70 @@ int kvm_handle_vncr_abort(struct kvm_vcpu *vcpu) return 1; } +static void kvm_map_l1_vncr(struct kvm_vcpu *vcpu) +{ + struct vncr_tlb *vt = vcpu->arch.vncr_tlb; + pgprot_t prot; + + guard(preempt)(); + guard(read_lock)(&vcpu->kvm->mmu_lock); + + /* + * The request to map VNCR may have raced against some other + * event, such as an interrupt, and may not be valid anymore. + */ + if (is_hyp_ctxt(vcpu)) + return; + + /* + * Check that the pseudo-TLB is valid and that VNCR_EL2 still + * contains the expected value. If it doesn't, we simply bail out + * without a mapping -- a transformed MSR/MRS will generate the + * fault and allows us to populate the pseudo-TLB. 
+ */ + if (!vt->valid) + return; + + if (read_vncr_el2(vcpu) != vt->gva) + return; + + if (vt->wr.nG) { + u64 tcr = vcpu_read_sys_reg(vcpu, TCR_EL2); + u64 ttbr = ((tcr & TCR_A1) ? + vcpu_read_sys_reg(vcpu, TTBR1_EL2) : + vcpu_read_sys_reg(vcpu, TTBR0_EL2)); + u16 asid; + + asid = FIELD_GET(TTBR_ASID_MASK, ttbr); + if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR0_EL1, ASIDBITS, 16) || + !(tcr & TCR_ASID16)) + asid &= GENMASK(7, 0); + + if (asid != vt->wr.asid) + return; + } + + vt->cpu = smp_processor_id(); + + if (vt->wr.pw && vt->wr.pr) + prot = PAGE_KERNEL; + else if (vt->wr.pr) + prot = PAGE_KERNEL_RO; + else + prot = PAGE_NONE; + + /* + * We can't map write-only (or no permission at all) in the kernel, + * but the guest can do it if using POE, so we'll have to turn a + * translation fault into a permission fault at runtime. + * FIXME: WO doesn't work at all, need POE support in the kernel. + */ + if (pgprot_val(prot) != pgprot_val(PAGE_NONE)) { + __set_fixmap(vncr_fixmap(vt->cpu), vt->hpa, prot); + host_data_set_flag(L1_VNCR_MAPPED); + } +} + /* * Our emulated CPU doesn't support all the possible features. For the * sake of simplicity (and probably mental sanity), wipe out a number @@ -1582,6 +1659,9 @@ void check_nested_vcpu_requests(struct kvm_vcpu *vcpu) write_unlock(&vcpu->kvm->mmu_lock); } + if (kvm_check_request(KVM_REQ_MAP_L1_VNCR_EL2, vcpu)) + kvm_map_l1_vncr(vcpu); + /* Must be last, as may switch context! */ if (kvm_check_request(KVM_REQ_GUEST_HYP_IRQ_PENDING, vcpu)) kvm_inject_nested_irq(vcpu); From 7270cc9157f474dfc46750a34c9d7defc686b2eb Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 14 May 2025 11:34:54 +0100 Subject: [PATCH 11/20] KVM: arm64: nv: Handle VNCR_EL2 invalidation from MMU notifiers During an invalidation triggered by an MMU notifier, we need to make sure we can drop the *host* mapping that would have been translated by the stage-2 mapping being invalidated. For the moment, the invalidation is pretty brutal, as we nuke the full IPA range, and therefore any VNCR_EL2 mapping. At some point, we'll be more light-weight, and the code is able to deal with something more targetted. Reviewed-by: Oliver Upton Link: https://lore.kernel.org/r/20250514103501.2225951-12-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/nested.c | 75 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index e81a0fcfb1f3..825bc1b0d26f 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -434,6 +434,30 @@ static unsigned int ttl_to_size(u8 ttl) return max_size; } +static u8 pgshift_level_to_ttl(u16 shift, u8 level) +{ + u8 ttl; + + switch(shift) { + case 12: + ttl = TLBI_TTL_TG_4K; + break; + case 14: + ttl = TLBI_TTL_TG_16K; + break; + case 16: + ttl = TLBI_TTL_TG_64K; + break; + default: + BUG(); + } + + ttl <<= 2; + ttl |= level & 3; + + return ttl; +} + /* * Compute the equivalent of the TTL field by parsing the shadow PT. 
The * granule size is extracted from the cached VTCR_EL2.TG0 while the level is @@ -784,6 +808,53 @@ int kvm_inject_s2_fault(struct kvm_vcpu *vcpu, u64 esr_el2) return kvm_inject_nested_sync(vcpu, esr_el2); } +static void invalidate_vncr(struct vncr_tlb *vt) +{ + vt->valid = false; + if (vt->cpu != -1) + clear_fixmap(vncr_fixmap(vt->cpu)); +} + +static void kvm_invalidate_vncr_ipa(struct kvm *kvm, u64 start, u64 end) +{ + struct kvm_vcpu *vcpu; + unsigned long i; + + lockdep_assert_held_write(&kvm->mmu_lock); + + if (!kvm_has_feat(kvm, ID_AA64MMFR4_EL1, NV_frac, NV2_ONLY)) + return; + + kvm_for_each_vcpu(i, vcpu, kvm) { + struct vncr_tlb *vt = vcpu->arch.vncr_tlb; + u64 ipa_start, ipa_end, ipa_size; + + /* + * Careful here: We end-up here from an MMU notifier, + * and this can race against a vcpu not being onlined + * yet, without the pseudo-TLB being allocated. + * + * Skip those, as they obviously don't participate in + * the invalidation at this stage. + */ + if (!vt) + continue; + + if (!vt->valid) + continue; + + ipa_size = ttl_to_size(pgshift_level_to_ttl(vt->wi.pgshift, + vt->wr.level)); + ipa_start = vt->wr.pa & (ipa_size - 1); + ipa_end = ipa_start + ipa_size; + + if (ipa_end <= start || ipa_start >= end) + continue; + + invalidate_vncr(vt); + } +} + void kvm_nested_s2_wp(struct kvm *kvm) { int i; @@ -796,6 +867,8 @@ void kvm_nested_s2_wp(struct kvm *kvm) if (kvm_s2_mmu_valid(mmu)) kvm_stage2_wp_range(mmu, 0, kvm_phys_size(mmu)); } + + kvm_invalidate_vncr_ipa(kvm, 0, BIT(kvm->arch.mmu.pgt->ia_bits)); } void kvm_nested_s2_unmap(struct kvm *kvm, bool may_block) @@ -810,6 +883,8 @@ void kvm_nested_s2_unmap(struct kvm *kvm, bool may_block) if (kvm_s2_mmu_valid(mmu)) kvm_stage2_unmap_range(mmu, 0, kvm_phys_size(mmu), may_block); } + + kvm_invalidate_vncr_ipa(kvm, 0, BIT(kvm->arch.mmu.pgt->ia_bits)); } void kvm_nested_s2_flush(struct kvm *kvm) From 73e1b621b25d7d45cebf9940cad3b377d3b94791 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 14 May 2025 11:34:55 +0100 Subject: [PATCH 12/20] KVM: arm64: nv: Program host's VNCR_EL2 to the fixmap address Since we now have a way to map the guest's VNCR_EL2 on the host, we can point the host's VNCR_EL2 to it and go full circle! Note that we unconditionally assign the fixmap to VNCR_EL2, irrespective of the guest's version being mapped or not. We want to take a fault on first access, so the fixmap either contains something guranteed to be either invalid or a guest mapping. 
Reviewed-by: Oliver Upton Link: https://lore.kernel.org/r/20250514103501.2225951-13-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/vhe/switch.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index 220dee8a45e0..5eaff1ae32b2 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -48,6 +48,7 @@ DEFINE_PER_CPU(unsigned long, kvm_hyp_vector); static u64 __compute_hcr(struct kvm_vcpu *vcpu) { + u64 guest_hcr = __vcpu_sys_reg(vcpu, HCR_EL2); u64 hcr = vcpu->arch.hcr_el2; if (!vcpu_has_nv(vcpu)) @@ -70,9 +71,23 @@ static u64 __compute_hcr(struct kvm_vcpu *vcpu) write_sysreg_s(vcpu->arch.ctxt.vncr_array, SYS_VNCR_EL2); } else { host_data_clear_flag(VCPU_IN_HYP_CONTEXT); + + if (guest_hcr & HCR_NV) { + u64 va = __fix_to_virt(vncr_fixmap(smp_processor_id())); + + /* Inherit the low bits from the actual register */ + va |= __vcpu_sys_reg(vcpu, VNCR_EL2) & GENMASK(PAGE_SHIFT - 1, 0); + write_sysreg_s(va, SYS_VNCR_EL2); + + /* Force NV2 in case the guest is forgetful... */ + guest_hcr |= HCR_NV2; + } } - return hcr | (__vcpu_sys_reg(vcpu, HCR_EL2) & ~NV_HCR_GUEST_EXCLUDE); + BUG_ON(host_data_test_flag(VCPU_IN_HYP_CONTEXT) && + host_data_test_flag(L1_VNCR_MAPPED)); + + return hcr | (guest_hcr & ~NV_HCR_GUEST_EXCLUDE); } static void __activate_cptr_traps(struct kvm_vcpu *vcpu) From 4ffa72ad8f37e73bbb6c0baa88557bcb4fd39929 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 14 May 2025 11:34:56 +0100 Subject: [PATCH 13/20] KVM: arm64: nv: Add S1 TLB invalidation primitive for VNCR_EL2 A TLBI by VA for S1 must take effect on our pseudo-TLB for VNCR and potentially knock the fixmap mapping. Even worse, that TLBI must be able to work cross-vcpu. For that, we track on a per-VM basis if any VNCR is mapped, using an atomic counter. Whenever a TLBI S1E2 occurs and that this counter is non-zero, we take the long road all the way back to the core code. There, we iterate over all vcpus and check whether this particular invalidation has any damaging effect. If it does, we nuke the pseudo TLB and the corresponding fixmap. Yes, this is costly. Reviewed-by: Oliver Upton Link: https://lore.kernel.org/r/20250514103501.2225951-14-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 3 + arch/arm64/include/asm/kvm_nested.h | 1 + arch/arm64/kvm/hyp/vhe/switch.c | 8 ++ arch/arm64/kvm/nested.c | 193 ++++++++++++++++++++++++++++ 4 files changed, 205 insertions(+) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index d87fed0b4833..b2c535036a06 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -390,6 +390,9 @@ struct kvm_arch { /* Masks for VNCR-backed and general EL2 sysregs */ struct kvm_sysreg_masks *sysreg_masks; + /* Count the number of VNCR_EL2 currently mapped */ + atomic_t vncr_map_count; + /* * For an untrusted host VM, 'pkvm.handle' is used to lookup * the associated pKVM instance in the hypervisor. 
diff --git a/arch/arm64/include/asm/kvm_nested.h b/arch/arm64/include/asm/kvm_nested.h index ea50cad1a6a2..0bd07ea068a1 100644 --- a/arch/arm64/include/asm/kvm_nested.h +++ b/arch/arm64/include/asm/kvm_nested.h @@ -336,6 +336,7 @@ int __kvm_translate_va(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, /* VNCR management */ int kvm_vcpu_allocate_vncr_tlb(struct kvm_vcpu *vcpu); int kvm_handle_vncr_abort(struct kvm_vcpu *vcpu); +void kvm_handle_s1e2_tlbi(struct kvm_vcpu *vcpu, u32 inst, u64 val); #define vncr_fixmap(c) \ ({ \ diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index 5eaff1ae32b2..5902e5f15d09 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -484,6 +484,14 @@ static bool kvm_hyp_handle_tlbi_el2(struct kvm_vcpu *vcpu, u64 *exit_code) if (ret) return false; + /* + * If we have to check for any VNCR mapping being invalidated, + * go back to the slow path for further processing. + */ + if (vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu) && + atomic_read(&vcpu->kvm->arch.vncr_map_count)) + return false; + __kvm_skip_instr(vcpu); return true; diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index 825bc1b0d26f..6a9fd4e0e789 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -47,6 +47,7 @@ void kvm_init_nested(struct kvm *kvm) { kvm->arch.nested_mmus = NULL; kvm->arch.nested_mmus_size = 0; + atomic_set(&kvm->arch.vncr_map_count, 0); } static int init_nested_s2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu) @@ -756,6 +757,7 @@ void kvm_vcpu_put_hw_mmu(struct kvm_vcpu *vcpu) clear_fixmap(vncr_fixmap(vcpu->arch.vncr_tlb->cpu)); vcpu->arch.vncr_tlb->cpu = -1; host_data_clear_flag(L1_VNCR_MAPPED); + atomic_dec(&vcpu->kvm->arch.vncr_map_count); } /* @@ -855,6 +857,196 @@ static void kvm_invalidate_vncr_ipa(struct kvm *kvm, u64 start, u64 end) } } +struct s1e2_tlbi_scope { + enum { + TLBI_ALL, + TLBI_VA, + TLBI_VAA, + TLBI_ASID, + } type; + + u16 asid; + u64 va; + u64 size; +}; + +static void invalidate_vncr_va(struct kvm *kvm, + struct s1e2_tlbi_scope *scope) +{ + struct kvm_vcpu *vcpu; + unsigned long i; + + lockdep_assert_held_write(&kvm->mmu_lock); + + kvm_for_each_vcpu(i, vcpu, kvm) { + struct vncr_tlb *vt = vcpu->arch.vncr_tlb; + u64 va_start, va_end, va_size; + + if (!vt->valid) + continue; + + va_size = ttl_to_size(pgshift_level_to_ttl(vt->wi.pgshift, + vt->wr.level)); + va_start = vt->gva & (va_size - 1); + va_end = va_start + va_size; + + switch (scope->type) { + case TLBI_ALL: + break; + + case TLBI_VA: + if (va_end <= scope->va || + va_start >= (scope->va + scope->size)) + continue; + if (vt->wr.nG && vt->wr.asid != scope->asid) + continue; + break; + + case TLBI_VAA: + if (va_end <= scope->va || + va_start >= (scope->va + scope->size)) + continue; + break; + + case TLBI_ASID: + if (!vt->wr.nG || vt->wr.asid != scope->asid) + continue; + break; + } + + invalidate_vncr(vt); + } +} + +static void compute_s1_tlbi_range(struct kvm_vcpu *vcpu, u32 inst, u64 val, + struct s1e2_tlbi_scope *scope) +{ + switch (inst) { + case OP_TLBI_ALLE2: + case OP_TLBI_ALLE2IS: + case OP_TLBI_ALLE2OS: + case OP_TLBI_VMALLE1: + case OP_TLBI_VMALLE1IS: + case OP_TLBI_VMALLE1OS: + case OP_TLBI_ALLE2NXS: + case OP_TLBI_ALLE2ISNXS: + case OP_TLBI_ALLE2OSNXS: + case OP_TLBI_VMALLE1NXS: + case OP_TLBI_VMALLE1ISNXS: + case OP_TLBI_VMALLE1OSNXS: + scope->type = TLBI_ALL; + break; + case OP_TLBI_VAE2: + case OP_TLBI_VAE2IS: + case OP_TLBI_VAE2OS: + case OP_TLBI_VAE1: + case OP_TLBI_VAE1IS: + case OP_TLBI_VAE1OS: 
+ case OP_TLBI_VAE2NXS: + case OP_TLBI_VAE2ISNXS: + case OP_TLBI_VAE2OSNXS: + case OP_TLBI_VAE1NXS: + case OP_TLBI_VAE1ISNXS: + case OP_TLBI_VAE1OSNXS: + case OP_TLBI_VALE2: + case OP_TLBI_VALE2IS: + case OP_TLBI_VALE2OS: + case OP_TLBI_VALE1: + case OP_TLBI_VALE1IS: + case OP_TLBI_VALE1OS: + case OP_TLBI_VALE2NXS: + case OP_TLBI_VALE2ISNXS: + case OP_TLBI_VALE2OSNXS: + case OP_TLBI_VALE1NXS: + case OP_TLBI_VALE1ISNXS: + case OP_TLBI_VALE1OSNXS: + scope->type = TLBI_VA; + scope->size = ttl_to_size(FIELD_GET(TLBI_TTL_MASK, val)); + if (!scope->size) + scope->size = SZ_1G; + scope->va = (val << 12) & ~(scope->size - 1); + scope->asid = FIELD_GET(TLBIR_ASID_MASK, val); + break; + case OP_TLBI_ASIDE1: + case OP_TLBI_ASIDE1IS: + case OP_TLBI_ASIDE1OS: + case OP_TLBI_ASIDE1NXS: + case OP_TLBI_ASIDE1ISNXS: + case OP_TLBI_ASIDE1OSNXS: + scope->type = TLBI_ASID; + scope->asid = FIELD_GET(TLBIR_ASID_MASK, val); + break; + case OP_TLBI_VAAE1: + case OP_TLBI_VAAE1IS: + case OP_TLBI_VAAE1OS: + case OP_TLBI_VAAE1NXS: + case OP_TLBI_VAAE1ISNXS: + case OP_TLBI_VAAE1OSNXS: + case OP_TLBI_VAALE1: + case OP_TLBI_VAALE1IS: + case OP_TLBI_VAALE1OS: + case OP_TLBI_VAALE1NXS: + case OP_TLBI_VAALE1ISNXS: + case OP_TLBI_VAALE1OSNXS: + scope->type = TLBI_VAA; + scope->size = ttl_to_size(FIELD_GET(TLBI_TTL_MASK, val)); + if (!scope->size) + scope->size = SZ_1G; + scope->va = (val << 12) & ~(scope->size - 1); + break; + case OP_TLBI_RVAE2: + case OP_TLBI_RVAE2IS: + case OP_TLBI_RVAE2OS: + case OP_TLBI_RVAE1: + case OP_TLBI_RVAE1IS: + case OP_TLBI_RVAE1OS: + case OP_TLBI_RVAE2NXS: + case OP_TLBI_RVAE2ISNXS: + case OP_TLBI_RVAE2OSNXS: + case OP_TLBI_RVAE1NXS: + case OP_TLBI_RVAE1ISNXS: + case OP_TLBI_RVAE1OSNXS: + case OP_TLBI_RVALE2: + case OP_TLBI_RVALE2IS: + case OP_TLBI_RVALE2OS: + case OP_TLBI_RVALE1: + case OP_TLBI_RVALE1IS: + case OP_TLBI_RVALE1OS: + case OP_TLBI_RVALE2NXS: + case OP_TLBI_RVALE2ISNXS: + case OP_TLBI_RVALE2OSNXS: + case OP_TLBI_RVALE1NXS: + case OP_TLBI_RVALE1ISNXS: + case OP_TLBI_RVALE1OSNXS: + scope->type = TLBI_VA; + scope->va = decode_range_tlbi(val, &scope->size, &scope->asid); + break; + case OP_TLBI_RVAAE1: + case OP_TLBI_RVAAE1IS: + case OP_TLBI_RVAAE1OS: + case OP_TLBI_RVAAE1NXS: + case OP_TLBI_RVAAE1ISNXS: + case OP_TLBI_RVAAE1OSNXS: + case OP_TLBI_RVAALE1: + case OP_TLBI_RVAALE1IS: + case OP_TLBI_RVAALE1OS: + case OP_TLBI_RVAALE1NXS: + case OP_TLBI_RVAALE1ISNXS: + case OP_TLBI_RVAALE1OSNXS: + scope->type = TLBI_VAA; + scope->va = decode_range_tlbi(val, &scope->size, NULL); + break; + } +} + +void kvm_handle_s1e2_tlbi(struct kvm_vcpu *vcpu, u32 inst, u64 val) +{ + struct s1e2_tlbi_scope scope = {}; + + compute_s1_tlbi_range(vcpu, inst, val, &scope); + invalidate_vncr_va(vcpu->kvm, &scope); +} + void kvm_nested_s2_wp(struct kvm *kvm) { int i; @@ -1191,6 +1383,7 @@ static void kvm_map_l1_vncr(struct kvm_vcpu *vcpu) if (pgprot_val(prot) != pgprot_val(PAGE_NONE)) { __set_fixmap(vncr_fixmap(vt->cpu), vt->hpa, prot); host_data_set_flag(L1_VNCR_MAPPED); + atomic_inc(&vcpu->kvm->arch.vncr_map_count); } } From aa98df31f6b42d3379869b82588bebaec4ce474b Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 14 May 2025 11:34:57 +0100 Subject: [PATCH 14/20] KVM: arm64: nv: Plumb TLBI S1E2 into system instruction dispatch Now that we have to handle TLBI S1E2 in the core code, plumb the sysinsn dispatch code into it, so that these instructions don't just UNDEF anymore. 
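As an illustration (this snippet is not part of the patch, and the helper name is made up), the kind of invalidation an NV-capable L1 hypervisor issues, and which now gets handled instead of UNDEFing, is a plain stage-1 EL2 TLBI by VA. Per the architecture, the Xt operand of TLBI VAE2IS carries VA[55:12] in bits [43:0]:

	/*
	 * Hypothetical L1 guest snippet (not from this series): invalidate
	 * the EL2 stage-1 TLB entry covering 'va'.
	 */
	static inline void l1_tlbi_vae2is(unsigned long va)
	{
		unsigned long arg = (va >> 12) & ((1UL << 44) - 1);

		asm volatile("dsb ishst" : : : "memory");	/* order prior PT updates */
		asm volatile("tlbi vae2is, %0" : : "r" (arg) : "memory");
		asm volatile("dsb ish" : : : "memory");		/* complete the invalidation */
		asm volatile("isb" : : : "memory");
	}

On the host side, such a trap is routed through handle_tlbi_el2() and on to kvm_handle_s1e2_tlbi(), so that the VNCR pseudo-TLB is considered in addition to the CPU TLBs.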
Reviewed-by: Oliver Upton Link: https://lore.kernel.org/r/20250514103501.2225951-15-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/sys_regs.c | 87 +++++++++++++++++++++++++++------------ 1 file changed, 61 insertions(+), 26 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 204470283ccc..de954524b787 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -3628,11 +3628,22 @@ static void s2_mmu_tlbi_s1e1(struct kvm_s2_mmu *mmu, WARN_ON(__kvm_tlbi_s1e2(mmu, info->va.addr, info->va.encoding)); } +static bool handle_tlbi_el2(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + u32 sys_encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2); + + if (!kvm_supported_tlbi_s1e2_op(vcpu, sys_encoding)) + return undef_access(vcpu, p, r); + + kvm_handle_s1e2_tlbi(vcpu, sys_encoding, p->regval); + return true; +} + static bool handle_tlbi_el1(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { u32 sys_encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2); - u64 vttbr = vcpu_read_sys_reg(vcpu, VTTBR_EL2); /* * If we're here, this is because we've trapped on a EL1 TLBI @@ -3643,6 +3654,13 @@ static bool handle_tlbi_el1(struct kvm_vcpu *vcpu, struct sys_reg_params *p, * - HCR_EL2.E2H == 0 : a non-VHE guest * - HCR_EL2.{E2H,TGE} == { 1, 0 } : a VHE guest in guest mode * + * Another possibility is that we are invalidating the EL2 context + * using EL1 instructions, but that we landed here because we need + * additional invalidation for structures that are not held in the + * CPU TLBs (such as the VNCR pseudo-TLB and its EL2 mapping). In + * that case, we are guaranteed that HCR_EL2.{E2H,TGE} == { 1, 1 } + * as we don't allow an NV-capable L1 in a nVHE configuration. + * * We don't expect these helpers to ever be called when running * in a vEL1 context. 
*/ @@ -3652,7 +3670,13 @@ static bool handle_tlbi_el1(struct kvm_vcpu *vcpu, struct sys_reg_params *p, if (!kvm_supported_tlbi_s1e1_op(vcpu, sys_encoding)) return undef_access(vcpu, p, r); - kvm_s2_mmu_iterate_by_vmid(vcpu->kvm, get_vmid(vttbr), + if (vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu)) { + kvm_handle_s1e2_tlbi(vcpu, sys_encoding, p->regval); + return true; + } + + kvm_s2_mmu_iterate_by_vmid(vcpu->kvm, + get_vmid(__vcpu_sys_reg(vcpu, VTTBR_EL2)), &(union tlbi_info) { .va = { .addr = p->regval, @@ -3774,16 +3798,21 @@ static struct sys_reg_desc sys_insn_descs[] = { SYS_INSN(TLBI_IPAS2LE1IS, handle_ipas2e1is), SYS_INSN(TLBI_RIPAS2LE1IS, handle_ripas2e1is), - SYS_INSN(TLBI_ALLE2OS, undef_access), - SYS_INSN(TLBI_VAE2OS, undef_access), + SYS_INSN(TLBI_ALLE2OS, handle_tlbi_el2), + SYS_INSN(TLBI_VAE2OS, handle_tlbi_el2), SYS_INSN(TLBI_ALLE1OS, handle_alle1is), - SYS_INSN(TLBI_VALE2OS, undef_access), + SYS_INSN(TLBI_VALE2OS, handle_tlbi_el2), SYS_INSN(TLBI_VMALLS12E1OS, handle_vmalls12e1is), - SYS_INSN(TLBI_RVAE2IS, undef_access), - SYS_INSN(TLBI_RVALE2IS, undef_access), + SYS_INSN(TLBI_RVAE2IS, handle_tlbi_el2), + SYS_INSN(TLBI_RVALE2IS, handle_tlbi_el2), + SYS_INSN(TLBI_ALLE2IS, handle_tlbi_el2), + SYS_INSN(TLBI_VAE2IS, handle_tlbi_el2), SYS_INSN(TLBI_ALLE1IS, handle_alle1is), + + SYS_INSN(TLBI_VALE2IS, handle_tlbi_el2), + SYS_INSN(TLBI_VMALLS12E1IS, handle_vmalls12e1is), SYS_INSN(TLBI_IPAS2E1OS, handle_ipas2e1is), SYS_INSN(TLBI_IPAS2E1, handle_ipas2e1is), @@ -3793,11 +3822,17 @@ static struct sys_reg_desc sys_insn_descs[] = { SYS_INSN(TLBI_IPAS2LE1, handle_ipas2e1is), SYS_INSN(TLBI_RIPAS2LE1, handle_ripas2e1is), SYS_INSN(TLBI_RIPAS2LE1OS, handle_ripas2e1is), - SYS_INSN(TLBI_RVAE2OS, undef_access), - SYS_INSN(TLBI_RVALE2OS, undef_access), - SYS_INSN(TLBI_RVAE2, undef_access), - SYS_INSN(TLBI_RVALE2, undef_access), + SYS_INSN(TLBI_RVAE2OS, handle_tlbi_el2), + SYS_INSN(TLBI_RVALE2OS, handle_tlbi_el2), + SYS_INSN(TLBI_RVAE2, handle_tlbi_el2), + SYS_INSN(TLBI_RVALE2, handle_tlbi_el2), + SYS_INSN(TLBI_ALLE2, handle_tlbi_el2), + SYS_INSN(TLBI_VAE2, handle_tlbi_el2), + SYS_INSN(TLBI_ALLE1, handle_alle1is), + + SYS_INSN(TLBI_VALE2, handle_tlbi_el2), + SYS_INSN(TLBI_VMALLS12E1, handle_vmalls12e1is), SYS_INSN(TLBI_IPAS2E1ISNXS, handle_ipas2e1is), @@ -3805,19 +3840,19 @@ static struct sys_reg_desc sys_insn_descs[] = { SYS_INSN(TLBI_IPAS2LE1ISNXS, handle_ipas2e1is), SYS_INSN(TLBI_RIPAS2LE1ISNXS, handle_ripas2e1is), - SYS_INSN(TLBI_ALLE2OSNXS, undef_access), - SYS_INSN(TLBI_VAE2OSNXS, undef_access), + SYS_INSN(TLBI_ALLE2OSNXS, handle_tlbi_el2), + SYS_INSN(TLBI_VAE2OSNXS, handle_tlbi_el2), SYS_INSN(TLBI_ALLE1OSNXS, handle_alle1is), - SYS_INSN(TLBI_VALE2OSNXS, undef_access), + SYS_INSN(TLBI_VALE2OSNXS, handle_tlbi_el2), SYS_INSN(TLBI_VMALLS12E1OSNXS, handle_vmalls12e1is), - SYS_INSN(TLBI_RVAE2ISNXS, undef_access), - SYS_INSN(TLBI_RVALE2ISNXS, undef_access), - SYS_INSN(TLBI_ALLE2ISNXS, undef_access), - SYS_INSN(TLBI_VAE2ISNXS, undef_access), + SYS_INSN(TLBI_RVAE2ISNXS, handle_tlbi_el2), + SYS_INSN(TLBI_RVALE2ISNXS, handle_tlbi_el2), + SYS_INSN(TLBI_ALLE2ISNXS, handle_tlbi_el2), + SYS_INSN(TLBI_VAE2ISNXS, handle_tlbi_el2), SYS_INSN(TLBI_ALLE1ISNXS, handle_alle1is), - SYS_INSN(TLBI_VALE2ISNXS, undef_access), + SYS_INSN(TLBI_VALE2ISNXS, handle_tlbi_el2), SYS_INSN(TLBI_VMALLS12E1ISNXS, handle_vmalls12e1is), SYS_INSN(TLBI_IPAS2E1OSNXS, handle_ipas2e1is), SYS_INSN(TLBI_IPAS2E1NXS, handle_ipas2e1is), @@ -3827,14 +3862,14 @@ static struct sys_reg_desc sys_insn_descs[] = { SYS_INSN(TLBI_IPAS2LE1NXS, 
 	SYS_INSN(TLBI_RIPAS2LE1NXS, handle_ripas2e1is),
 	SYS_INSN(TLBI_RIPAS2LE1OSNXS, handle_ripas2e1is),
-	SYS_INSN(TLBI_RVAE2OSNXS, undef_access),
-	SYS_INSN(TLBI_RVALE2OSNXS, undef_access),
-	SYS_INSN(TLBI_RVAE2NXS, undef_access),
-	SYS_INSN(TLBI_RVALE2NXS, undef_access),
-	SYS_INSN(TLBI_ALLE2NXS, undef_access),
-	SYS_INSN(TLBI_VAE2NXS, undef_access),
+	SYS_INSN(TLBI_RVAE2OSNXS, handle_tlbi_el2),
+	SYS_INSN(TLBI_RVALE2OSNXS, handle_tlbi_el2),
+	SYS_INSN(TLBI_RVAE2NXS, handle_tlbi_el2),
+	SYS_INSN(TLBI_RVALE2NXS, handle_tlbi_el2),
+	SYS_INSN(TLBI_ALLE2NXS, handle_tlbi_el2),
+	SYS_INSN(TLBI_VAE2NXS, handle_tlbi_el2),
 	SYS_INSN(TLBI_ALLE1NXS, handle_alle1is),
-	SYS_INSN(TLBI_VALE2NXS, undef_access),
+	SYS_INSN(TLBI_VALE2NXS, handle_tlbi_el2),
 	SYS_INSN(TLBI_VMALLS12E1NXS, handle_vmalls12e1is),
 };

From 6ec4c371d4223adae6f9c0f7f1bd014d381b2170 Mon Sep 17 00:00:00 2001
From: Marc Zyngier
Date: Wed, 14 May 2025 11:34:58 +0100
Subject: [PATCH 15/20] KVM: arm64: nv: Remove dead code from ERET handling

Clearly, this code cannot trigger, since we filter this from the caller.
Drop it.

Reviewed-by: Oliver Upton
Link: https://lore.kernel.org/r/20250514103501.2225951-16-maz@kernel.org
Signed-off-by: Marc Zyngier
---
 arch/arm64/kvm/emulate-nested.c | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/arch/arm64/kvm/emulate-nested.c b/arch/arm64/kvm/emulate-nested.c
index 0fcfcc0478f9..5e9fec251684 100644
--- a/arch/arm64/kvm/emulate-nested.c
+++ b/arch/arm64/kvm/emulate-nested.c
@@ -2471,13 +2471,6 @@ void kvm_emulate_nested_eret(struct kvm_vcpu *vcpu)
 {
 	u64 spsr, elr, esr;
 
-	/*
-	 * Forward this trap to the virtual EL2 if the virtual
-	 * HCR_EL2.NV bit is set and this is coming from !EL2.
-	 */
-	if (forward_hcr_traps(vcpu, HCR_NV))
-		return;
-
 	spsr = vcpu_read_sys_reg(vcpu, SPSR_EL2);
 	spsr = kvm_check_illegal_exception_return(vcpu, spsr);

From a7484c80e5ca1ae0c397bb8003bc588f0dcf43f4 Mon Sep 17 00:00:00 2001
From: Marc Zyngier
Date: Wed, 14 May 2025 11:34:59 +0100
Subject: [PATCH 16/20] KVM: arm64: Allow userspace to request KVM_ARM_VCPU_EL2*

Since we're (almost) feature complete, let's allow userspace to request
KVM_ARM_VCPU_EL2* by bumping KVM_VCPU_MAX_FEATURES up. We also now
advertise the features to userspace with new capabilities.

It's going to be great...
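
For illustration only (not part of this patch), a minimal userspace
sketch of how the new capability and feature bit might be consumed is
shown below. The vm_fd/vcpu_fd plumbing, the helper name and the error
handling are assumptions, not code from this series:

#include <linux/kvm.h>
#include <sys/ioctl.h>

/* Hypothetical helper: request a vcpu that boots at virtual EL2. */
static int vcpu_init_with_el2(int vm_fd, int vcpu_fd)
{
	struct kvm_vcpu_init init;

	/* NV support is advertised via the new capability */
	if (ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_ARM_EL2) <= 0)
		return -1;

	/* Start from the host's preferred target... */
	if (ioctl(vm_fd, KVM_ARM_PREFERRED_TARGET, &init))
		return -1;

	/* ...and ask for the EL2 feature (VHE flavour by default) */
	init.features[KVM_ARM_VCPU_HAS_EL2 / 32] |=
		1U << (KVM_ARM_VCPU_HAS_EL2 % 32);

	return ioctl(vcpu_fd, KVM_ARM_VCPU_INIT, &init);
}
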
Reviewed-by: Oliver Upton
Reviewed-by: Joey Gouly
Reviewed-by: Ganapatrao Kulkarni
Link: https://lore.kernel.org/r/20250514103501.2225951-17-maz@kernel.org
Signed-off-by: Marc Zyngier
---
 arch/arm64/include/asm/kvm_host.h | 2 +-
 arch/arm64/kvm/arm.c              | 6 ++++++
 include/uapi/linux/kvm.h          | 2 ++
 3 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index b2c535036a06..79e175a16d35 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -39,7 +39,7 @@
 
 #define KVM_MAX_VCPUS VGIC_V3_MAX_CPUS
 
-#define KVM_VCPU_MAX_FEATURES 7
+#define KVM_VCPU_MAX_FEATURES 9
 #define KVM_VCPU_VALID_FEATURES	(BIT(KVM_VCPU_MAX_FEATURES) - 1)
 
 #define KVM_REQ_SLEEP \
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 528743587360..b021ce1e42c5 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -368,6 +368,12 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_ARM_EL1_32BIT:
 		r = cpus_have_final_cap(ARM64_HAS_32BIT_EL1);
 		break;
+	case KVM_CAP_ARM_EL2:
+		r = cpus_have_final_cap(ARM64_HAS_NESTED_VIRT);
+		break;
+	case KVM_CAP_ARM_EL2_E2H0:
+		r = cpus_have_final_cap(ARM64_HAS_HCR_NV1);
+		break;
 	case KVM_CAP_GUEST_DEBUG_HW_BPS:
 		r = get_num_brps();
 		break;
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index b6ae8ad8934b..c9d4a908976e 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -930,6 +930,8 @@ struct kvm_enable_cap {
 #define KVM_CAP_X86_APIC_BUS_CYCLES_NS 237
 #define KVM_CAP_X86_GUEST_MODE 238
 #define KVM_CAP_ARM_WRITABLE_IMP_ID_REGS 239
+#define KVM_CAP_ARM_EL2 240
+#define KVM_CAP_ARM_EL2_E2H0 241
 
 struct kvm_irq_routing_irqchip {
 	__u32 irqchip;

From 29d1697c8c8f64e4d8bb988f957ca1d4980937dc Mon Sep 17 00:00:00 2001
From: Marc Zyngier
Date: Wed, 14 May 2025 11:35:00 +0100
Subject: [PATCH 17/20] KVM: arm64: Document NV caps and vcpu flags

Describe the two new vcpu flags that control NV, together with the
capabilities that advertise them.

Reviewed-by: Oliver Upton
Reviewed-by: Joey Gouly
Link: https://lore.kernel.org/r/20250514103501.2225951-18-maz@kernel.org
Signed-off-by: Marc Zyngier
---
 Documentation/virt/kvm/api.rst | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 47c7c3f92314..fe3d6b5d2acc 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -3460,7 +3460,8 @@ The initial values are defined as:
 - FPSIMD/NEON registers: set to 0
 - SVE registers: set to 0
 - System registers: Reset to their architecturally defined
-  values as for a warm reset to EL1 (resp. SVC)
+  values as for a warm reset to EL1 (resp. SVC) or EL2 (in the
+  case of EL2 being enabled).
 
 Note that because some registers reflect machine topology, all vcpus
 should be created before this ioctl is invoked.
@@ -3527,6 +3528,17 @@ Possible features:
 	 - the KVM_REG_ARM64_SVE_VLS pseudo-register is immutable, and can
 	   no longer be written using KVM_SET_ONE_REG.
 
+	- KVM_ARM_VCPU_HAS_EL2: Enable Nested Virtualisation support,
+	  booting the guest from EL2 instead of EL1.
+	  Depends on KVM_CAP_ARM_EL2.
+	  The VM is running with HCR_EL2.E2H being RES1 (VHE) unless
+	  KVM_ARM_VCPU_HAS_EL2_E2H0 is also set.
+
+	- KVM_ARM_VCPU_HAS_EL2_E2H0: Restrict Nested Virtualisation
+	  support to HCR_EL2.E2H being RES0 (non-VHE).
+	  Depends on KVM_CAP_ARM_EL2_E2H0.
+	  KVM_ARM_VCPU_HAS_EL2 must also be set.
+
 4.83 KVM_ARM_PREFERRED_TARGET
 -----------------------------

From d43548f422f27219eff5ce1897336af2c4f15091 Mon Sep 17 00:00:00 2001
From: Marc Zyngier
Date: Tue, 20 May 2025 15:41:16 +0100
Subject: [PATCH 18/20] KVM: arm64: nv: Hold mmu_lock when invalidating VNCR SW-TLB before translating

When translating a VNCR translation fault, we start by marking the
current SW-managed TLB as invalid, so that we can populate it in place.
This is, however, done without the mmu_lock held.

A consequence of this is that another CPU dealing with TLBI emulation
can observe a translation still flagged as valid, but with invalid
walk results (such as pgshift being 0). Bad things can result from
this, such as a BUG() in pgshift_level_to_ttl().

Fix it by taking the mmu_lock for write to perform this local
invalidation, and use invalidate_vncr() instead of open-coding the
write to the 'valid' flag.

Fixes: 069a05e535496 ("KVM: arm64: nv: Handle VNCR_EL2-triggered faults")
Reviewed-by: Oliver Upton
Link: https://lore.kernel.org/r/20250520144116.3667978-1-maz@kernel.org
Signed-off-by: Marc Zyngier
---
 arch/arm64/kvm/nested.c | 25 ++++++++++++++++++-------
 1 file changed, 18 insertions(+), 7 deletions(-)

diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c
index 6a9fd4e0e789..56b732003caa 100644
--- a/arch/arm64/kvm/nested.c
+++ b/arch/arm64/kvm/nested.c
@@ -1179,13 +1179,24 @@ static int kvm_translate_vncr(struct kvm_vcpu *vcpu)
 
 	vt = vcpu->arch.vncr_tlb;
 
-	vt->wi = (struct s1_walk_info) {
-		.regime = TR_EL20,
-		.as_el0 = false,
-		.pan = false,
-	};
-	vt->wr = (struct s1_walk_result){};
-	vt->valid = false;
+	/*
+	 * If we're about to walk the EL2 S1 PTs, we must invalidate the
+	 * current TLB, as it could be sampled from another vcpu doing a
+	 * TLBI *IS. A real CPU wouldn't do that, but we only keep a single
+	 * translation, so not much of a choice.
+	 *
+	 * We also prepare the next walk whilst we're at it.
+	 */
+	scoped_guard(write_lock, &vcpu->kvm->mmu_lock) {
+		invalidate_vncr(vt);
+
+		vt->wi = (struct s1_walk_info) {
+			.regime = TR_EL20,
+			.as_el0 = false,
+			.pan = false,
+		};
+		vt->wr = (struct s1_walk_result){};
+	}
 
 	guard(srcu)(&vcpu->kvm->srcu);

From beab7d058309bfe0460a441b1c73639941e33d38 Mon Sep 17 00:00:00 2001
From: Marc Zyngier
Date: Wed, 21 May 2025 10:58:29 +0100
Subject: [PATCH 19/20] KVM: arm64: nv: Handle TLBI S1E2 for VNCR invalidation with mmu_lock held

Calling invalidate_vncr_va() without the mmu_lock held for write is
a bad idea, and lockdep tells you about that.

Fixes: 4ffa72ad8f37e ("KVM: arm64: nv: Add S1 TLB invalidation primitive for VNCR_EL2")
Signed-off-by: Marc Zyngier
---
 arch/arm64/kvm/nested.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c
index 56b732003caa..2381663d2ee9 100644
--- a/arch/arm64/kvm/nested.c
+++ b/arch/arm64/kvm/nested.c
@@ -1044,6 +1044,8 @@ void kvm_handle_s1e2_tlbi(struct kvm_vcpu *vcpu, u32 inst, u64 val)
 	struct s1e2_tlbi_scope scope = {};
 
 	compute_s1_tlbi_range(vcpu, inst, val, &scope);
+
+	guard(write_lock)(&vcpu->kvm->mmu_lock);
 	invalidate_vncr_va(vcpu->kvm, &scope);
 }

From 538fbac74019c13dac341b20fbcc1e96c9a8d01e Mon Sep 17 00:00:00 2001
From: Marc Zyngier
Date: Wed, 21 May 2025 11:04:11 +0100
Subject: [PATCH 20/20] KVM: arm64: nv: Release faulted-in VNCR page from mmu_lock critical section

The conversion to kvm_release_faultin_page() missed the requirement
for this to be called within a critical section with mmu_lock held
for write. Move this call up to satisfy this requirement.
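
As a rough sketch of the intended shape (simplified, and assuming the
enclosing block is the scoped_guard(write_lock, ...) section that
installs the translation; see the hunk below for the actual change):

	/* Illustrative only: the release now happens under mmu_lock */
	scoped_guard(write_lock, &vcpu->kvm->mmu_lock) {
		/* ... install the translation in the SW-managed VNCR TLB ... */
		kvm_make_request(KVM_REQ_MAP_L1_VNCR_EL2, vcpu);
		kvm_release_faultin_page(vcpu->kvm, page, false, vt->wr.pw);
	}
	/* mmu_lock is dropped automatically when leaving the block */
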
Fixes: 069a05e535496 ("KVM: arm64: nv: Handle VNCR_EL2-triggered faults")
Signed-off-by: Marc Zyngier
---
 arch/arm64/kvm/nested.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c
index 2381663d2ee9..e7e71f1615f1 100644
--- a/arch/arm64/kvm/nested.c
+++ b/arch/arm64/kvm/nested.c
@@ -1228,9 +1228,9 @@ static int kvm_translate_vncr(struct kvm_vcpu *vcpu)
 		vt->cpu = -1;
 
 		kvm_make_request(KVM_REQ_MAP_L1_VNCR_EL2, vcpu);
+		kvm_release_faultin_page(vcpu->kvm, page, false, vt->wr.pw);
 	}
 
-	kvm_release_faultin_page(vcpu->kvm, page, false, vt->wr.pw);
 	if (vt->wr.pw)
 		mark_page_dirty(vcpu->kvm, gfn);