Merge tag 'kvmarm-fixes-7.1-2' of git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm into HEAD

KVM/arm64 fixes for 7.1, take #2

- Add the pKVM side of the workaround for the Arm C1-Pro erratum
  4193714, provided that the EL3 firmware does its part of the job.
  KVM will otherwise refuse to initialise.

- Correctly handle 52-bit VAs for guest EL2 stage-1 translations when
  running under NV with E2H==0.

- Correctly deal with permission faults in guest_memfd memslots.

- Fix the steal-time selftest after the infrastructure was reworked.

- Make sure the host cannot pass a nonsensical clock update to the
  EL2 tracing infrastructure.

- Appoint Steffen Eiden as a reviewer, in anticipation of KVM/s390's
  ability to run arm64 guests, which will inevitably lead to arm64
  code being directly used on s390.

- Make sure that EL2 is configured with both exception entry and exit
  being Context Synchronization Events.

- Handle the current vcpu being NULL on EL2 panic.

- Fix the selftest_vcpu memcache being empty at the point of donation or
  sharing.

- Check that the memcache has enough capacity before engaging in the
  share/donate path.

- Fix __deactivate_fgt() to use its macro parameter rather than a
  same-named variable picked up from the expansion context.
Merged by Paolo Bonzini on 2026-05-12 22:19:20 +02:00
12 changed files with 120 additions and 13 deletions

@@ -14052,6 +14052,7 @@ KERNEL VIRTUAL MACHINE FOR ARM64 (KVM/arm64)
 M:	Marc Zyngier <maz@kernel.org>
 M:	Oliver Upton <oupton@kernel.org>
 R:	Joey Gouly <joey.gouly@arm.com>
+R:	Steffen Eiden <seiden@linux.ibm.com>
 R:	Suzuki K Poulose <suzuki.poulose@arm.com>
 R:	Zenghui Yu <yuzenghui@huawei.com>
 L:	linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)

@@ -23,6 +23,7 @@ static inline u64 tcr_el2_ps_to_tcr_el1_ips(u64 tcr_el2)
 static inline u64 translate_tcr_el2_to_tcr_el1(u64 tcr)
 {
 	return TCR_EPD1_MASK |	/* disable TTBR1_EL1 */
+	       ((tcr & TCR_EL2_DS) ? TCR_DS : 0) |
	       ((tcr & TCR_EL2_TBI) ? TCR_TBI0 : 0) |
	       tcr_el2_ps_to_tcr_el1_ips(tcr) |
	       (tcr & TCR_EL2_TG0_MASK) |
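
For readers less familiar with FEAT_LPA2, here is a minimal standalone model
of the translation above. The bit positions follow the Arm ARM (TCR_EL2.DS is
bit 32 in the E2H==0 layout, TCR_EL1.DS is bit 59); the constants are
redefined locally so the sketch compiles outside the kernel, and it is an
illustration rather than the kernel code.

#include <stdint.h>
#include <stdio.h>

#define TCR_EL2_DS	(UINT64_C(1) << 32)	/* FEAT_LPA2, 52-bit addressing */
#define TCR_EL2_TBI	(UINT64_C(1) << 20)
#define TCR_DS		(UINT64_C(1) << 59)	/* TCR_EL1.DS */
#define TCR_TBI0	(UINT64_C(1) << 37)	/* TCR_EL1.TBI0 */
#define TCR_EPD1_MASK	(UINT64_C(1) << 23)	/* disable TTBR1_EL1 walks */

/* Cut-down translate_tcr_el2_to_tcr_el1(): only the bits relevant here. */
static uint64_t translate(uint64_t tcr_el2)
{
	return TCR_EPD1_MASK |
	       ((tcr_el2 & TCR_EL2_DS) ? TCR_DS : 0) |
	       ((tcr_el2 & TCR_EL2_TBI) ? TCR_TBI0 : 0);
}

int main(void)
{
	uint64_t tcr_el1 = translate(TCR_EL2_DS);

	/* Without the DS propagation, bit 59 would stay clear and the
	 * shadow stage-1 would silently fall back to 48-bit VAs. */
	printf("TCR_EL1.DS = %d\n", !!(tcr_el1 & TCR_DS));
	return 0;
}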

@@ -844,7 +844,7 @@
 #define INIT_SCTLR_EL2_MMU_ON						\
	(SCTLR_ELx_M | SCTLR_ELx_C | SCTLR_ELx_SA | SCTLR_ELx_I |	\
	 SCTLR_ELx_IESB | SCTLR_ELx_WXN | ENDIAN_SET_EL2 |		\
-	 SCTLR_ELx_ITFSB | SCTLR_EL2_RES1)
+	 SCTLR_ELx_ITFSB | SCTLR_ELx_EIS | SCTLR_ELx_EOS | SCTLR_EL2_RES1)

 #define INIT_SCTLR_EL2_MMU_OFF \
	(SCTLR_EL2_RES1 | ENDIAN_SET_EL2)

@@ -4,6 +4,7 @@
  * Author: Christoffer Dall <c.dall@virtualopensystems.com>
  */

+#include <linux/arm-smccc.h>
 #include <linux/bug.h>
 #include <linux/cpu_pm.h>
 #include <linux/errno.h>
@@ -2638,6 +2639,22 @@ static int init_pkvm_host_sve_state(void)
 	return 0;
 }

+static int pkvm_check_sme_dvmsync_fw_call(void)
+{
+	struct arm_smccc_res res;
+
+	if (!cpus_have_final_cap(ARM64_WORKAROUND_4193714))
+		return 0;
+
+	arm_smccc_1_1_smc(ARM_SMCCC_CPU_WORKAROUND_4193714, &res);
+	if (res.a0) {
+		kvm_err("pKVM requires firmware support for C1-Pro erratum 4193714\n");
+		return -ENODEV;
+	}
+
+	return 0;
+}
+
 /*
  * Finalizes the initialization of hyp mode, once everything else is initialized
  * and the initialization process cannot fail.
@@ -2838,6 +2855,10 @@ static int __init init_hyp_mode(void)
 	if (err)
 		goto out_err;

+	err = pkvm_check_sme_dvmsync_fw_call();
+	if (err)
+		goto out_err;
+
 	err = kvm_hyp_init_protection(hyp_va_bits);
 	if (err) {
 		kvm_err("Failed to init hyp memory protection\n");

@@ -245,7 +245,7 @@ static inline void __activate_traps_ich_hfgxtr(struct kvm_vcpu *vcpu)
 	__activate_fgt(hctxt, vcpu, ICH_HFGITR_EL2);
 }

-#define __deactivate_fgt(htcxt, vcpu, reg)				\
+#define __deactivate_fgt(hctxt, vcpu, reg)				\
 	do {								\
 		write_sysreg_s(ctxt_sys_reg(hctxt, reg),		\
			       SYS_ ## reg);				\
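
The bug fixed here is a classic macro-hygiene failure: with the parameter
misspelled as htcxt, the hctxt used in the body silently bound to whatever
variable happened to be in scope at the expansion site, and everything still
compiled. A minimal standalone reproduction of the pattern (all names
hypothetical):

#include <stdio.h>

/* The parameter is misspelled as "vall", so the "val" in the body is
 * not the parameter: the macro captures a caller-scope variable. */
#define BROKEN_DOUBLE(vall)	(val * 2)
#define FIXED_DOUBLE(val)	((val) * 2)

int main(void)
{
	int val = 10;	/* happens to exist at the expansion site */

	/* Both compile, but the broken one ignores its argument. */
	printf("%d\n", BROKEN_DOUBLE(5));	/* prints 20, not 10 */
	printf("%d\n", FIXED_DOUBLE(5));	/* prints 10 */
	return 0;
}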

@@ -35,6 +35,9 @@ void trace_clock_update(u32 mult, u32 shift, u64 epoch_ns, u64 epoch_cyc)
 	struct clock_data *clock = &trace_clock_data;
 	u64 bank = clock->cur ^ 1;

+	if (!mult || shift >= 64)
+		return;
+
 	clock->data[bank].mult		= mult;
 	clock->data[bank].shift		= shift;
 	clock->data[bank].epoch_ns	= epoch_ns;
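
The rejected values follow from the usual mult/shift cycle-to-nanosecond
conversion, ns = (cyc * mult) >> shift: a shift of 64 or more makes the
right-shift undefined behaviour in C, and a zero mult pins the clock at zero.
A small standalone model, using an illustrative (not kernel-derived)
mult/shift pair:

#include <stdint.h>
#include <stdio.h>

/* ns = (cyc * mult) >> shift, the usual clocksource-style conversion. */
static uint64_t cyc_to_ns(uint64_t cyc, uint32_t mult, uint32_t shift)
{
	return (cyc * mult) >> shift;
}

int main(void)
{
	/* Illustrative pair for a 50 MHz counter: 20 ns per cycle,
	 * expressed as mult = 20 << 20 and shift = 20. */
	uint32_t mult = 20 << 20, shift = 20;

	/* 50e6 cycles at 50 MHz is exactly one second. */
	printf("%llu ns\n", (unsigned long long)cyc_to_ns(50000000, mult, shift));

	/* shift >= 64 would make ">>" undefined behaviour, and
	 * mult == 0 would freeze the clock; hence the new check. */
	return 0;
}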

@@ -5,6 +5,7 @@
  */

 #include <linux/kvm_host.h>
 #include <asm/kvm_emulate.h>
+#include <asm/kvm_hyp.h>
 #include <asm/kvm_mmu.h>
@@ -14,6 +15,7 @@
 #include <hyp/fault.h>

+#include <nvhe/arm-smccc.h>
 #include <nvhe/gfp.h>
 #include <nvhe/memory.h>
 #include <nvhe/mem_protect.h>
@@ -29,6 +31,19 @@ static struct hyp_pool host_s2_pool;
 static DEFINE_PER_CPU(struct pkvm_hyp_vm *, __current_vm);
 #define current_vm (*this_cpu_ptr(&__current_vm))

+static void pkvm_sme_dvmsync_fw_call(void)
+{
+	if (alternative_has_cap_unlikely(ARM64_WORKAROUND_4193714)) {
+		struct arm_smccc_res res;
+
+		/*
+		 * Ignore the return value: probing for the workaround's
+		 * availability already took place in init_hyp_mode().
+		 */
+		hyp_smccc_1_1_smc(ARM_SMCCC_CPU_WORKAROUND_4193714, &res);
+	}
+}
+
 static void guest_lock_component(struct pkvm_hyp_vm *vm)
 {
 	hyp_spin_lock(&vm->lock);
@@ -574,8 +589,14 @@ static int host_stage2_set_owner_metadata_locked(phys_addr_t addr, u64 size,
 	ret = host_stage2_try(kvm_pgtable_stage2_annotate, &host_mmu.pgt,
			      addr, size, &host_s2_pool,
			      KVM_HOST_INVALID_PTE_TYPE_DONATION, annotation);
-	if (!ret)
+	if (!ret) {
+		/*
+		 * This must happen after the stage-2 maintenance, but
+		 * before the page owner has changed.
+		 */
+		pkvm_sme_dvmsync_fw_call();
 		__host_update_page_state(addr, size, PKVM_NOPAGE);
+	}

 	return ret;
 }
@@ -1369,6 +1390,22 @@ int __pkvm_host_reclaim_page_guest(u64 gfn, struct pkvm_hyp_vm *vm)
 	return ret && ret != -EHWPOISON ? ret : 0;
 }

+/*
+ * The share/donate paths install at most one stage-2 leaf (a PAGE_SIZE
+ * page, or one KVM_PGTABLE_LAST_LEVEL - 1 block for share).
+ * kvm_mmu_cache_min_pages() bounds the worst-case allocation: exact
+ * for the PAGE_SIZE leaf, conservative by one for the block.
+ */
+static int __guest_check_pgtable_memcache(struct pkvm_hyp_vcpu *vcpu)
+{
+	struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu);
+
+	if (vcpu->vcpu.arch.pkvm_memcache.nr_pages < kvm_mmu_cache_min_pages(vm->pgt.mmu))
+		return -ENOMEM;
+
+	return 0;
+}
+
 int __pkvm_host_donate_guest(u64 pfn, u64 gfn, struct pkvm_hyp_vcpu *vcpu)
 {
 	struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu);
@@ -1388,6 +1425,10 @@ int __pkvm_host_donate_guest(u64 pfn, u64 gfn, struct pkvm_hyp_vcpu *vcpu)
 	if (ret)
 		goto unlock;

+	ret = __guest_check_pgtable_memcache(vcpu);
+	if (ret)
+		goto unlock;
+
 	meta = host_stage2_encode_gfn_meta(vm, gfn);
 	WARN_ON(host_stage2_set_owner_metadata_locked(phys, PAGE_SIZE,
						      PKVM_ID_GUEST, meta));
@@ -1453,6 +1494,10 @@ int __pkvm_host_share_guest(u64 pfn, u64 gfn, u64 nr_pages, struct pkvm_hyp_vcpu *vcpu)
 		}
 	}

+	ret = __guest_check_pgtable_memcache(vcpu);
+	if (ret)
+		goto unlock;
+
 	for_each_hyp_page(page, phys, size) {
 		set_host_state(page, PKVM_PAGE_SHARED_OWNED);
 		page->host_share_guest_count++;
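
To make the bound in the comment concrete: installing a single leaf may
require one fresh table at every level below the start level, so the worst
case is (last level - start level) pages. A toy model under that assumption;
the helper mirrors, but is not, the kernel's kvm_mmu_cache_min_pages():

#include <stdio.h>

/* Worst case for mapping one leaf: a fresh table per level below the
 * start level, down to the table that holds the leaf entry. Levels
 * follow the Arm VMSA, 0..3 for a 4KiB granule. */
static int min_cache_pages(int start_level, int last_level)
{
	return last_level - start_level;
}

int main(void)
{
	/* 4-level stage-2 starting at level 0: up to 3 new tables for a
	 * PAGE_SIZE leaf at level 3; the same bound is one page
	 * conservative for a level-2 block mapping. */
	printf("min pages = %d\n", min_cache_pages(0, 3));
	return 0;
}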

@@ -752,16 +752,30 @@ static struct pkvm_hyp_vcpu selftest_vcpu = {
 struct pkvm_hyp_vcpu *init_selftest_vm(void *virt)
 {
 	struct hyp_page *p = hyp_virt_to_page(virt);
+	unsigned long min_pages, seeded = 0;
 	int i;

 	selftest_vm.kvm.arch.mmu.vtcr = host_mmu.arch.mmu.vtcr;
 	WARN_ON(kvm_guest_prepare_stage2(&selftest_vm, virt));

+	/*
+	 * Mirror pkvm_refill_memcache() for the share/donate pre-checks;
+	 * the selftest invokes those functions directly and would
+	 * otherwise see an empty memcache.
+	 */
+	min_pages = kvm_mmu_cache_min_pages(&selftest_vm.kvm.arch.mmu);
+
 	for (i = 0; i < pkvm_selftest_pages(); i++) {
 		if (p[i].refcount)
 			continue;
 		p[i].refcount = 1;
-		hyp_put_page(&selftest_vm.pool, hyp_page_to_virt(&p[i]));
+		if (seeded < min_pages) {
+			push_hyp_memcache(&selftest_vcpu.vcpu.arch.pkvm_memcache,
+					  hyp_page_to_virt(&p[i]), hyp_virt_to_phys);
+			seeded++;
+		} else {
+			hyp_put_page(&selftest_vm.pool, hyp_page_to_virt(&p[i]));
+		}
 	}

 	selftest_vm.kvm.arch.pkvm.handle = __pkvm_reserve_vm();
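
The memcache being seeded above is, at heart, a LIFO of free pages linked
through the pages themselves. A toy userspace model of the push/pop
mechanics; the names echo, but are not, the kernel's hyp memcache helpers:

#include <stdio.h>
#include <stdlib.h>

/* The cache is a singly linked stack threaded through the free pages
 * themselves, so pushing a page costs no extra memory. */
struct memcache {
	void *head;
	unsigned long nr_pages;
};

static void push_page(struct memcache *mc, void *page)
{
	*(void **)page = mc->head;	/* link lives inside the page */
	mc->head = page;
	mc->nr_pages++;
}

static void *pop_page(struct memcache *mc)
{
	void *page = mc->head;

	if (!page)
		return NULL;
	mc->head = *(void **)page;
	mc->nr_pages--;
	return page;
}

int main(void)
{
	struct memcache mc = { 0 };

	push_page(&mc, malloc(4096));
	push_page(&mc, malloc(4096));
	printf("nr_pages = %lu\n", mc.nr_pages);	/* 2 */
	free(pop_page(&mc));
	free(pop_page(&mc));
	return 0;
}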

@@ -663,7 +663,8 @@ static void __noreturn __hyp_call_panic(u64 spsr, u64 elr, u64 par)
 	host_ctxt = host_data_ptr(host_ctxt);
 	vcpu = host_ctxt->__hyp_running_vcpu;
-	__deactivate_traps(vcpu);
+	if (vcpu)
+		__deactivate_traps(vcpu);

 	sysreg_restore_host_state_vhe(host_ctxt);

 	panic("HYP panic:\nPS:%08llx PC:%016llx ESR:%08llx\nFAR:%016llx HPFAR:%016llx PAR:%016llx\nVCPU:%p\n",

@@ -1576,21 +1576,24 @@ struct kvm_s2_fault_desc {
 static int gmem_abort(const struct kvm_s2_fault_desc *s2fd)
 {
 	bool write_fault, exec_fault;
+	bool perm_fault = kvm_vcpu_trap_is_permission_fault(s2fd->vcpu);
 	enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_SHARED;
 	enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
 	struct kvm_pgtable *pgt = s2fd->vcpu->arch.hw_mmu->pgt;
 	unsigned long mmu_seq;
 	struct page *page;
 	struct kvm *kvm = s2fd->vcpu->kvm;
-	void *memcache;
+	void *memcache = NULL;
 	kvm_pfn_t pfn;
 	gfn_t gfn;
 	int ret;

-	memcache = get_mmu_memcache(s2fd->vcpu);
-	ret = topup_mmu_memcache(s2fd->vcpu, memcache);
-	if (ret)
-		return ret;
+	if (!perm_fault) {
+		memcache = get_mmu_memcache(s2fd->vcpu);
+		ret = topup_mmu_memcache(s2fd->vcpu, memcache);
+		if (ret)
+			return ret;
+	}

 	if (s2fd->nested)
 		gfn = kvm_s2_trans_output(s2fd->nested) >> PAGE_SHIFT;
@@ -1631,9 +1634,19 @@ static int gmem_abort(const struct kvm_s2_fault_desc *s2fd)
 		goto out_unlock;
 	}

-	ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, s2fd->fault_ipa, PAGE_SIZE,
-						 __pfn_to_phys(pfn), prot,
-						 memcache, flags);
+	if (perm_fault) {
+		/*
+		 * Drop the SW bits in favour of those stored in the
+		 * PTE, which will be preserved.
+		 */
+		prot &= ~KVM_NV_GUEST_MAP_SZ;
+		ret = KVM_PGT_FN(kvm_pgtable_stage2_relax_perms)(pgt, s2fd->fault_ipa,
+								 prot, flags);
+	} else {
+		ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, s2fd->fault_ipa, PAGE_SIZE,
+							 __pfn_to_phys(pfn), prot,
+							 memcache, flags);
+	}

 out_unlock:
 	kvm_release_faultin_page(kvm, page, !!ret, prot & KVM_PGTABLE_PROT_W);
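
The asymmetry between the two branches is the point of the fix: a permission
fault hits an already-mapped leaf, so widening its permissions is an in-place
PTE update that never allocates a page table, which is why the memcache
top-up is skipped earlier in the function. A toy illustration of that
distinction (all names hypothetical):

#include <stdbool.h>
#include <stdio.h>

/* Toy stage-2 leaf: relaxing permissions only rewrites an existing
 * entry, so no allocator is involved; only installing a brand new
 * mapping may need fresh intermediate tables. */
struct leaf {
	bool valid;
	bool writable;
};

static int relax_perms(struct leaf *l)
{
	if (!l->valid)
		return -1;	/* nothing to relax: must take the map path */
	l->writable = true;	/* in-place update, no allocation */
	return 0;
}

int main(void)
{
	struct leaf l = { .valid = true, .writable = false };

	/* Write permission fault on a present read-only page: */
	printf("relax: %d, writable: %d\n", relax_perms(&l), l.writable);
	return 0;
}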

@@ -105,6 +105,12 @@
			   ARM_SMCCC_SMC_32,				\
			   0, 0x3fff)

+/* C1-Pro erratum 4193714: SME DVMSync early acknowledgement */
+#define ARM_SMCCC_CPU_WORKAROUND_4193714				\
+	ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL,				\
+			   ARM_SMCCC_SMC_32,				\
+			   ARM_SMCCC_OWNER_CPU, 0x10)
+
 #define ARM_SMCCC_VENDOR_HYP_CALL_UID_FUNC_ID				\
	ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL,				\
			   ARM_SMCCC_SMC_32,				\
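
Assuming the standard SMCCC function-ID layout (call type in bit 31, calling
convention in bit 30, owner in bits 29:24, function number in bits 15:0, with
ARM_SMCCC_OWNER_CPU being 1), the new constant encodes to 0x81000010. A
stand-alone check:

#include <stdint.h>
#include <stdio.h>

/* Standard SMCCC function-ID encoding, reproduced locally. */
#define SMCCC_CALL_VAL(type, cc, owner, func)			\
	(((uint32_t)(type) << 31) | ((uint32_t)(cc) << 30) |	\
	 (((uint32_t)(owner) & 0x3f) << 24) | ((func) & 0xffff))

#define FAST_CALL	1	/* completes atomically, returns immediately */
#define SMC_32		0	/* 32-bit calling convention */
#define OWNER_CPU	1	/* CPU service calls, e.g. errata handling */

int main(void)
{
	/* ARM_SMCCC_CPU_WORKAROUND_4193714 should print 0x81000010. */
	printf("0x%08x\n", SMCCC_CALL_VAL(FAST_CALL, SMC_32, OWNER_CPU, 0x10));
	return 0;
}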

@@ -220,6 +220,8 @@ static void check_steal_time_uapi(void)
 	};

 	vcpu_ioctl(vcpu, KVM_HAS_DEVICE_ATTR, &dev);
+	vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, ST_GPA_BASE, 1, 1, 0);
+	virt_map(vm, ST_GPA_BASE, ST_GPA_BASE, 1);
 	st_ipa = (ulong)ST_GPA_BASE | 1;
 	ret = __vcpu_ioctl(vcpu, KVM_SET_DEVICE_ATTR, &dev);