From 0152e049bd764942c41bc79b339ba6281c15dee9 Mon Sep 17 00:00:00 2001
From: Dmytro Maluka <dmaluka@chromium.org>
Date: Fri, 26 Sep 2025 17:57:24 +0200
Subject: [PATCH 01/14] KVM: VMX: Remove stale vmx_set_dr6() declaration

Remove the stale declaration of vmx_set_dr6(), a leftover from commit
80c64c7afea1 ("KVM: x86: Drop kvm_x86_ops.set_dr6() in favor of a new
KVM_RUN flag"), which removed the function itself.

Signed-off-by: Dmytro Maluka <dmaluka@chromium.org>
Link: https://lore.kernel.org/r/20250926155724.1619716-1-dmaluka@chromium.org
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/vmx/x86_ops.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h
index 9697368d65b3..77613a44cebf 100644
--- a/arch/x86/kvm/vmx/x86_ops.h
+++ b/arch/x86/kvm/vmx/x86_ops.h
@@ -73,7 +73,6 @@ void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
 void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
 void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
 void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
-void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val);
 void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val);
 void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu);
 void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg);

From f505c7b16fbeea25f82a7e8d578e128178dd6706 Mon Sep 17 00:00:00 2001
From: Xin Li <xin@zytor.com>
Date: Wed, 24 Sep 2025 07:54:21 -0700
Subject: [PATCH 02/14] KVM: nVMX: Use vcpu instead of vmx->vcpu when vcpu is
 available

Prefer using vcpu directly when available, instead of accessing it
through vmx->vcpu.

Signed-off-by: Xin Li (Intel) <xin@zytor.com>
Link: https://lore.kernel.org/r/20250924145421.2046822-1-xin@zytor.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/vmx/nested.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 76271962cb70..3fca63a261f5 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -761,7 +761,7 @@ static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu,
 			      vmcs12->vmcs_link_pointer, VMCS12_SIZE))
 		return;
 
-	kvm_read_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu),
+	kvm_read_guest_cached(vcpu->kvm, ghc, get_shadow_vmcs12(vcpu),
 			      VMCS12_SIZE);
 }
 
@@ -780,7 +780,7 @@ static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu,
 			       vmcs12->vmcs_link_pointer, VMCS12_SIZE))
 		return;
 
-	kvm_write_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu),
+	kvm_write_guest_cached(vcpu->kvm, ghc, get_shadow_vmcs12(vcpu),
 			       VMCS12_SIZE);
 }
 
@@ -2749,7 +2749,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 		vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
 		vcpu->arch.pat = vmcs12->guest_ia32_pat;
 	} else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
-		vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
+		vmcs_write64(GUEST_IA32_PAT, vcpu->arch.pat);
 	}
 
 	vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset(
@@ -3880,7 +3880,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
 		goto vmentry_failed;
 
 	/* Hide L1D cache contents from the nested guest.  */
-	vmx->vcpu.arch.l1tf_flush_l1d = true;
+	vcpu->arch.l1tf_flush_l1d = true;
 
 	/*
 	 * Must happen outside of nested_vmx_enter_non_root_mode() as it will

From 0bd0a4a1428baaf4447e95f0832492d9e3d64961 Mon Sep 17 00:00:00 2001
From: Thorsten Blum <thorsten.blum@linux.dev>
Date: Tue, 16 Sep 2025 23:31:29 +0200
Subject: [PATCH 03/14] KVM: TDX: Replace kmalloc + copy_from_user with
 memdup_user in tdx_td_init()

Use get_user() to retrieve the number of entries instead of allocating
memory for 'init_vm' with the maximum size, copying 'cmd->data' to it,
only to then read the actual entry count 'cpuid.nent' from the copy.

Use memdup_user() to allocate just enough memory to fit all entries and
to copy 'cmd->data' from userspace.

Use struct_size() instead of manually calculating the number of bytes to
allocate and copy.

No functional changes intended.

Signed-off-by: Thorsten Blum <thorsten.blum@linux.dev>
Link: https://lore.kernel.org/r/20250916213129.2535597-2-thorsten.blum@linux.dev
[sean: s/user_init_vm/user_data]
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/vmx/tdx.c | 30 ++++++++++--------------------
 1 file changed, 10 insertions(+), 20 deletions(-)

diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 0a49c863c811..326db9b9c567 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -2749,9 +2749,11 @@ static int tdx_read_cpuid(struct kvm_vcpu *vcpu, u32 leaf, u32 sub_leaf,
 
 static int tdx_td_init(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
 {
+	struct kvm_tdx_init_vm __user *user_data = u64_to_user_ptr(cmd->data);
 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
 	struct kvm_tdx_init_vm *init_vm;
 	struct td_params *td_params = NULL;
+	u32 nr_user_entries;
 	int ret;
 
 	BUILD_BUG_ON(sizeof(*init_vm) != 256 + sizeof_field(struct kvm_tdx_init_vm, cpuid));
@@ -2763,28 +2765,16 @@ static int tdx_td_init(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
 	if (cmd->flags)
 		return -EINVAL;
 
-	init_vm = kmalloc(sizeof(*init_vm) +
-			  sizeof(init_vm->cpuid.entries[0]) * KVM_MAX_CPUID_ENTRIES,
-			  GFP_KERNEL);
-	if (!init_vm)
-		return -ENOMEM;
+	if (get_user(nr_user_entries, &user_data->cpuid.nent))
+		return -EFAULT;
 
-	if (copy_from_user(init_vm, u64_to_user_ptr(cmd->data), sizeof(*init_vm))) {
-		ret = -EFAULT;
-		goto out;
-	}
+	if (nr_user_entries > KVM_MAX_CPUID_ENTRIES)
+		return -E2BIG;
 
-	if (init_vm->cpuid.nent > KVM_MAX_CPUID_ENTRIES) {
-		ret = -E2BIG;
-		goto out;
-	}
-
-	if (copy_from_user(init_vm->cpuid.entries,
-			   u64_to_user_ptr(cmd->data) + sizeof(*init_vm),
-			   flex_array_size(init_vm, cpuid.entries, init_vm->cpuid.nent))) {
-		ret = -EFAULT;
-		goto out;
-	}
+	init_vm = memdup_user(user_data,
+			      struct_size(user_data, cpuid.entries, nr_user_entries));
+	if (IS_ERR(init_vm))
+		return PTR_ERR(init_vm);
 
 	if (memchr_inv(init_vm->reserved, 0, sizeof(init_vm->reserved))) {
 		ret = -EINVAL;

From f48888bb8ad109c772c9dcdfabbf749ab5ac5502 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Thu, 18 Sep 2025 17:59:47 -0700
Subject: [PATCH 04/14] KVM: VMX: Hoist construct_eptp() "up" in vmx.c

Move construct_eptp() further up in vmx.c so that it's above
vmx_flush_tlb_current(), its "first" user in vmx.c. This will allow a
future patch to opportunistically make construct_eptp() local to vmx.c.

No functional change intended.

Link: https://lore.kernel.org/r/20250919005955.1366256-2-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/vmx/vmx.c | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index f87c216d976d..98506c723291 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -3219,6 +3219,20 @@ static inline int vmx_get_current_vpid(struct kvm_vcpu *vcpu)
 	return to_vmx(vcpu)->vpid;
 }
 
+u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level)
+{
+	u64 eptp = VMX_EPTP_MT_WB;
+
+	eptp |= (root_level == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4;
+
+	if (enable_ept_ad_bits &&
+	    (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu)))
+		eptp |= VMX_EPTP_AD_ENABLE_BIT;
+	eptp |= root_hpa;
+
+	return eptp;
+}
+
 void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
 {
 	struct kvm_mmu *mmu = vcpu->arch.mmu;
@@ -3396,20 +3410,6 @@ static int vmx_get_max_ept_level(void)
 	return 4;
 }
 
-u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level)
-{
-	u64 eptp = VMX_EPTP_MT_WB;
-
-	eptp |= (root_level == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4;
-
-	if (enable_ept_ad_bits &&
-	    (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu)))
-		eptp |= VMX_EPTP_AD_ENABLE_BIT;
-	eptp |= root_hpa;
-
-	return eptp;
-}
-
 void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level)
 {
 	struct kvm *kvm = vcpu->kvm;

From a8749281e4c63e582574ae4409be7641763e58ad Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Thu, 18 Sep 2025 17:59:48 -0700
Subject: [PATCH 05/14] KVM: nVMX: Hardcode dummy EPTP used for early nested
 consistency checks

Hardcode the dummy EPTP used for "early" consistency checks as there's
no need to use 5-level EPT based on the guest.MAXPHYADDR (the EPTP just
needs to be valid, it's never truly consumed). This will allow breaking
construct_eptp()'s dependency on having access to the vCPU, which in
turn will (much further in the future) allow for eliding per-root TLB
flushes when a vCPU is migrated between pCPUs (a flush is needed if and
only if that particular pCPU hasn't already flushed the vCPU's roots).

Link: https://lore.kernel.org/r/20250919005955.1366256-3-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/vmx/nested.c | 8 +++-----
 arch/x86/kvm/vmx/vmx.c    | 2 +-
 arch/x86/kvm/vmx/vmx.h    | 1 -
 3 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 3fca63a261f5..0a4b4e790f9f 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -2297,13 +2297,11 @@ static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx)
 	vmx->nested.vmcs02_initialized = true;
 
 	/*
-	 * We don't care what the EPTP value is we just need to guarantee
-	 * it's valid so we don't get a false positive when doing early
-	 * consistency checks.
+	 * If early consistency checks are enabled, stuff the EPT Pointer with
+	 * a dummy *legal* value to avoid false positives on bad control state.
 	 */
 	if (enable_ept && nested_early_check)
-		vmcs_write64(EPT_POINTER,
-			     construct_eptp(&vmx->vcpu, 0, PT64_ROOT_4LEVEL));
+		vmcs_write64(EPT_POINTER, VMX_EPTP_MT_WB | VMX_EPTP_PWL_4);
 
 	if (vmx->ve_info)
 		vmcs_write64(VE_INFORMATION_ADDRESS, __pa(vmx->ve_info));
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 98506c723291..6a1377015a2a 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -3219,7 +3219,7 @@ static inline int vmx_get_current_vpid(struct kvm_vcpu *vcpu)
 	return to_vmx(vcpu)->vpid;
 }
 
-u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level)
+static u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level)
 {
 	u64 eptp = VMX_EPTP_MT_WB;
 
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index ea93121029f9..6cb04a6afeef 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -369,7 +369,6 @@ void set_cr4_guest_host_mask(struct vcpu_vmx *vmx);
 void ept_save_pdptrs(struct kvm_vcpu *vcpu);
 void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
 void __vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
-u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level);
 bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu);
 void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu);

From a10f5cc3ac9b05c764e87ae13de9a716ff519903 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Thu, 18 Sep 2025 17:59:49 -0700
Subject: [PATCH 06/14] KVM: x86/mmu: Move "dummy root" helpers to spte.h

Move the helpers to get/query a dummy root from mmu_internal.h to spte.h
so that VMX can detect and handle dummy roots when constructing EPTPs.
This will allow using the root's role to build the EPTP instead of
pulling equivalent information out of the vCPU structure.

No functional change intended.

Link: https://lore.kernel.org/r/20250919005955.1366256-4-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/mmu/mmu_internal.h | 10 ----------
 arch/x86/kvm/mmu/spte.h         | 10 ++++++++++
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index ed5c01df21ba..73cdcbccc89e 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -39,16 +39,6 @@
 #define INVALID_PAE_ROOT 0
 #define IS_VALID_PAE_ROOT(x) (!!(x))
 
-static inline hpa_t kvm_mmu_get_dummy_root(void)
-{
-	return my_zero_pfn(0) << PAGE_SHIFT;
-}
-
-static inline bool kvm_mmu_is_dummy_root(hpa_t shadow_page)
-{
-	return is_zero_pfn(shadow_page >> PAGE_SHIFT);
-}
-
 typedef u64 __rcu *tdp_ptep_t;
 
 struct kvm_mmu_page {
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index 3133f066927e..91ce29fd6f1b 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -246,6 +246,16 @@ static inline int spte_index(u64 *sptep)
  */
 extern u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask;
 
+static inline hpa_t kvm_mmu_get_dummy_root(void)
+{
+	return my_zero_pfn(0) << PAGE_SHIFT;
+}
+
+static inline bool kvm_mmu_is_dummy_root(hpa_t shadow_page)
+{
+	return is_zero_pfn(shadow_page >> PAGE_SHIFT);
+}
+
 static inline struct kvm_mmu_page *to_shadow_page(hpa_t shadow_page)
 {
 	struct page *page = pfn_to_page((shadow_page) >> PAGE_SHIFT);

From 2f723a86342355fee85574352a165e8bf6fa5372 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Thu, 18 Sep 2025 17:59:50 -0700
Subject: [PATCH 07/14] KVM: VMX: Use kvm_mmu_page role to construct EPTP, not
 current vCPU state

Use the role for the to-be-loaded/invalidated EPT root to compute the
root's level and A/D enablement instead of pulling the information from
the vCPU (e.g. by passing in the root level and querying vmcs12). Not
making unnecessary assumptions about the root will allow invalidating
arbitrary EPT roots (which sadly requires a full EPTP) at any given
time.

No functional change intended (the end result should be the same).

Link: https://lore.kernel.org/r/20250919005955.1366256-5-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/vmx/vmx.c | 41 ++++++++++++++++++++++++++++++-----------
 1 file changed, 30 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 6a1377015a2a..1021d3b65ea0 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -3219,20 +3219,40 @@ static inline int vmx_get_current_vpid(struct kvm_vcpu *vcpu)
 	return to_vmx(vcpu)->vpid;
 }
 
-static u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level)
+static u64 construct_eptp(hpa_t root_hpa)
 {
-	u64 eptp = VMX_EPTP_MT_WB;
+	u64 eptp = root_hpa | VMX_EPTP_MT_WB;
+	struct kvm_mmu_page *root;
 
-	eptp |= (root_level == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4;
+	if (kvm_mmu_is_dummy_root(root_hpa))
+		return eptp | VMX_EPTP_PWL_4;
 
-	if (enable_ept_ad_bits &&
-	    (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu)))
+	/*
+	 * EPT roots should always have an associated MMU page. Return a "bad"
+	 * EPTP to induce VM-Fail instead of continuing on in an unknown state.
+	 */
+	root = root_to_sp(root_hpa);
+	if (WARN_ON_ONCE(!root))
+		return INVALID_PAGE;
+
+	eptp |= (root->role.level == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4;
+
+	if (enable_ept_ad_bits && !root->role.ad_disabled)
 		eptp |= VMX_EPTP_AD_ENABLE_BIT;
-	eptp |= root_hpa;
 
 	return eptp;
 }
 
+static void vmx_flush_tlb_ept_root(hpa_t root_hpa)
+{
+	u64 eptp = construct_eptp(root_hpa);
+
+	if (VALID_PAGE(eptp))
+		ept_sync_context(eptp);
+	else
+		ept_sync_global();
+}
+
 void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
 {
 	struct kvm_mmu *mmu = vcpu->arch.mmu;
@@ -3243,8 +3263,7 @@ void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
 		return;
 
 	if (enable_ept)
-		ept_sync_context(construct_eptp(vcpu, root_hpa,
-						mmu->root_role.level));
+		vmx_flush_tlb_ept_root(root_hpa);
 	else
 		vpid_sync_context(vmx_get_current_vpid(vcpu));
 }
@@ -3415,11 +3434,11 @@ void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level)
 	struct kvm *kvm = vcpu->kvm;
 	bool update_guest_cr3 = true;
 	unsigned long guest_cr3;
-	u64 eptp;
 
 	if (enable_ept) {
-		eptp = construct_eptp(vcpu, root_hpa, root_level);
-		vmcs_write64(EPT_POINTER, eptp);
+		KVM_MMU_WARN_ON(root_to_sp(root_hpa) &&
+				root_level != root_to_sp(root_hpa)->role.level);
+		vmcs_write64(EPT_POINTER, construct_eptp(root_hpa));
 
 		hv_track_root_tdp(vcpu, root_hpa);

From 15fe455dd1a011bbc8f9e512c6dc324cfca028c4 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Thu, 18 Sep 2025 17:59:51 -0700
Subject: [PATCH 08/14] KVM: nVMX: Add consistency check for
 TPR_THRESHOLD[31:4]!=0 without VID

Add a missing consistency check on the TPR Threshold. Per the SDM:

  If the "use TPR shadow" VM-execution control is 1 and the
  "virtual-interrupt delivery" VM-execution control is 0, bits 31:4 of
  the TPR threshold VM-execution control field must be 0.

Note, nested_vmx_check_tpr_shadow_controls() bails early if "use TPR
shadow" is 0.

Link: https://lore.kernel.org/r/20250919005955.1366256-6-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/vmx/nested.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 0a4b4e790f9f..ffd2628b9c1e 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -555,6 +555,9 @@ static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu,
 	if (CC(!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr)))
 		return -EINVAL;
 
+	if (CC(!nested_cpu_has_vid(vmcs12) && vmcs12->tpr_threshold >> 4))
+		return -EINVAL;
+
 	return 0;
 }

From ae8e6ad84177456d8810a8ff3a5cf3f477fb0721 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Thu, 18 Sep 2025 17:59:52 -0700
Subject: [PATCH 09/14] KVM: nVMX: Add consistency check for TSC_MULTIPLIER=0

Add a missing consistency check on the TSC Multiplier being '0'. Per
the SDM:

  If the "use TSC scaling" VM-execution control is 1, the
  TSC-multiplier must not be zero.

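For context, the multiplier is a fixed-point scaling factor, i.e. the
scaled guest TSC is (host TSC * multiplier) >> fractional bits, so a
multiplier of '0' would scale every TSC read to zero. A standalone
sketch of the math, assuming the 48 fractional bits used on VMX (the
in-kernel helper is kvm_scale_tsc(); this snippet is illustrative only,
not KVM's code):

	#include <stdint.h>

	/*
	 * scaled = (tsc * mult) >> 48; a mult of 1ull << 48 is the
	 * identity.  Uses GCC/Clang-style 128-bit math for the product.
	 */
	static uint64_t scale_tsc(uint64_t tsc, uint64_t mult)
	{
		return (uint64_t)(((unsigned __int128)tsc * mult) >> 48);
	}

scale_tsc(tsc, 0) is '0' for any input, i.e. a zero multiplier is
nonsensical, hence the SDM's VM-Fail.
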
Fixes: d041b5ea9335 ("KVM: nVMX: Enable nested TSC scaling")
Link: https://lore.kernel.org/r/20250919005955.1366256-7-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/vmx/nested.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index ffd2628b9c1e..77b5f75cc2bb 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -2962,6 +2962,10 @@ static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu,
 		}
 	}
 
+	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING) &&
+	    CC(!vmcs12->tsc_multiplier))
+		return -EINVAL;
+
 	return 0;
 }

From f91699d5692ddd0ee92b9487014fc477179ab3a7 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Thu, 18 Sep 2025 17:59:53 -0700
Subject: [PATCH 10/14] KVM: nVMX: Stuff vmcs02.TSC_MULTIPLIER early on for
 nested early checks

If KVM is doing "early" nested VM-Enter consistency checks and TSC
scaling is supported, stuff vmcs02's TSC Multiplier early on to avoid
getting a false positive VM-Fail due to trying to do VM-Enter with
TSC_MULTIPLIER=0. To minimize complexity around L1 vs. L2 TSC, KVM sets
the actual TSC Multiplier rather late during VM-Entry, i.e. the field
may still be '0' at the time of early consistency checks.

If vmcs12 has TSC Scaling enabled, use the multiplier from vmcs12 so
that nested early checks actually check vmcs12 state, otherwise throw in
an arbitrary value of '1' (anything non-zero is legal).

Fixes: d041b5ea9335 ("KVM: nVMX: Enable nested TSC scaling")
Link: https://lore.kernel.org/r/20250919005955.1366256-8-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/vmx/nested.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 77b5f75cc2bb..efca276c67f0 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -2371,6 +2371,13 @@ static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx,
 		else
 			vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
 	}
+
+	if (kvm_caps.has_tsc_control && nested_early_check) {
+		if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING))
+			vmcs_write64(TSC_MULTIPLIER, vmcs12->tsc_multiplier);
+		else
+			vmcs_write64(TSC_MULTIPLIER, 1);
+	}
 }
 
 static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs01,

From a175da6d430ef7f8e24153e44c59ab6903e20f97 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Thu, 18 Sep 2025 17:59:54 -0700
Subject: [PATCH 11/14] KVM: nVMX: Remove support for "early" consistency
 checks via hardware

Remove nested_early_check and all associated code, as it's quite
obviously not being used or tested (it's been broken for 4+ years
without a single bug report). More importantly, KVM's software-based
consistency checks have matured since the option to do hardware-based
checks was added; KVM appears to be missing only _one_ consistency
check, on vTPR.

And even *more* importantly, that consistency check can't be prevented
by an early hardware check due to L1 being able to modify the virtual
APIC at any time, i.e. there's an inherent TOCTOU flaw that could cause
KVM to "miss" a consistency check VM-Fail, regardless of whether the
check is performed by software or by hardware. In other words, KVM
_must_ be able to unwind from a late VM-Fail (which was a big motivation
for doing early checks).

I.e. now that KVM provides (almost) all necessary consistency checks,
what's really needed is a way to detect missing checks in KVM, not a way
to avoid having to unwind from a late VM-Fail. And that can be done much
more simply, e.g. by a simple module param to guard a WARN (which,
sadly, must be off-by-default to avoid splats due to the aforementioned
TOCTOU issue).

For all intents and purposes, this reverts commit 52017608da33 ("KVM:
nVMX: add option to perform early consistency checks via H/W").

Link: https://lore.kernel.org/r/20250919005955.1366256-9-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/vmx/nested.c | 130 ++++----------------------------------
 1 file changed, 12 insertions(+), 118 deletions(-)

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index efca276c67f0..24a2d0fa1660 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -23,9 +23,6 @@
 static bool __read_mostly enable_shadow_vmcs = 1;
 module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);
 
-static bool __read_mostly nested_early_check = 0;
-module_param(nested_early_check, bool, S_IRUGO);
-
 #define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK
 
 /*
@@ -2299,13 +2296,6 @@ static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx)
 		return;
 	vmx->nested.vmcs02_initialized = true;
 
-	/*
-	 * If early consistency checks are enabled, stuff the EPT Pointer with
-	 * a dummy *legal* value to avoid false positives on bad control state.
-	 */
-	if (enable_ept && nested_early_check)
-		vmcs_write64(EPT_POINTER, VMX_EPTP_MT_WB | VMX_EPTP_PWL_4);
-
 	if (vmx->ve_info)
 		vmcs_write64(VE_INFORMATION_ADDRESS, __pa(vmx->ve_info));
 
@@ -2371,13 +2361,6 @@ static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx,
 		else
 			vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
 	}
-
-	if (kvm_caps.has_tsc_control && nested_early_check) {
-		if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING))
-			vmcs_write64(TSC_MULTIPLIER, vmcs12->tsc_multiplier);
-		else
-			vmcs_write64(TSC_MULTIPLIER, 1);
-	}
 }
 
 static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs01,
@@ -3345,84 +3328,6 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
 	return 0;
 }
 
-static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
-{
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	unsigned long cr3, cr4;
-	bool vm_fail;
-
-	if (!nested_early_check)
-		return 0;
-
-	if (vmx->msr_autoload.host.nr)
-		vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
-	if (vmx->msr_autoload.guest.nr)
-		vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
-
-	preempt_disable();
-
-	vmx_prepare_switch_to_guest(vcpu);
-
-	/*
-	 * Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS,
-	 * which is reserved to '1' by hardware.  GUEST_RFLAGS is guaranteed to
-	 * be written (by prepare_vmcs02()) before the "real" VMEnter, i.e.
-	 * there is no need to preserve other bits or save/restore the field.
-	 */
-	vmcs_writel(GUEST_RFLAGS, 0);
-
-	cr3 = __get_current_cr3_fast();
-	if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
-		vmcs_writel(HOST_CR3, cr3);
-		vmx->loaded_vmcs->host_state.cr3 = cr3;
-	}
-
-	cr4 = cr4_read_shadow();
-	if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
-		vmcs_writel(HOST_CR4, cr4);
-		vmx->loaded_vmcs->host_state.cr4 = cr4;
-	}
-
-	vm_fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
-				 __vmx_vcpu_run_flags(vmx));
-
-	if (vmx->msr_autoload.host.nr)
-		vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
-	if (vmx->msr_autoload.guest.nr)
-		vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
-
-	if (vm_fail) {
-		u32 error = vmcs_read32(VM_INSTRUCTION_ERROR);
-
-		preempt_enable();
-
-		trace_kvm_nested_vmenter_failed(
-			"early hardware check VM-instruction error: ", error);
-		WARN_ON_ONCE(error != VMXERR_ENTRY_INVALID_CONTROL_FIELD);
-		return 1;
-	}
-
-	/*
-	 * VMExit clears RFLAGS.IF and DR7, even on a consistency check.
-	 */
-	if (hw_breakpoint_active())
-		set_debugreg(__this_cpu_read(cpu_dr7), 7);
-	local_irq_enable();
-	preempt_enable();
-
-	/*
-	 * A non-failing VMEntry means we somehow entered guest mode with
-	 * an illegal RIP, and that's just the tip of the iceberg.  There
-	 * is no telling what memory has been modified or what state has
-	 * been exposed to unknown code.  Hitting this all but guarantees
-	 * a (very critical) hardware issue.
-	 */
-	WARN_ON(!(vmcs_read32(VM_EXIT_REASON) &
-		  VMX_EXIT_REASONS_FAILED_VMENTRY));
-
-	return 0;
-}
-
 #ifdef CONFIG_KVM_HYPERV
 static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu)
 {
@@ -3679,22 +3584,18 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
 				   &vmx->nested.pre_vmenter_ssp_tbl);
 
 	/*
-	 * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and*
-	 * nested early checks are disabled.  In the event of a "late" VM-Fail,
-	 * i.e. a VM-Fail detected by hardware but not KVM, KVM must unwind its
-	 * software model to the pre-VMEntry host state.  When EPT is disabled,
-	 * GUEST_CR3 holds KVM's shadow CR3, not L1's "real" CR3, which causes
-	 * nested_vmx_restore_host_state() to corrupt vcpu->arch.cr3.  Stuffing
-	 * vmcs01.GUEST_CR3 results in the unwind naturally setting arch.cr3 to
-	 * the correct value.  Smashing vmcs01.GUEST_CR3 is safe because nested
-	 * VM-Exits, and the unwind, reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is
-	 * guaranteed to be overwritten with a shadow CR3 prior to re-entering
-	 * L1.  Don't stuff vmcs01.GUEST_CR3 when using nested early checks as
-	 * KVM modifies vcpu->arch.cr3 if and only if the early hardware checks
-	 * pass, and early VM-Fails do not reset KVM's MMU, i.e. the VM-Fail
-	 * path would need to manually save/restore vmcs01.GUEST_CR3.
+	 * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled.  In the
+	 * event of a "late" VM-Fail, i.e. a VM-Fail detected by hardware but
+	 * not KVM, KVM must unwind its software model to the pre-VM-Entry host
+	 * state.  When EPT is disabled, GUEST_CR3 holds KVM's shadow CR3, not
+	 * L1's "real" CR3, which causes nested_vmx_restore_host_state() to
+	 * corrupt vcpu->arch.cr3.  Stuffing vmcs01.GUEST_CR3 results in the
+	 * unwind naturally setting arch.cr3 to the correct value.  Smashing
+	 * vmcs01.GUEST_CR3 is safe because nested VM-Exits, and the unwind,
+	 * reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is guaranteed to be
+	 * overwritten with a shadow CR3 prior to re-entering L1.
 	 */
-	if (!enable_ept && !nested_early_check)
+	if (!enable_ept)
 		vmcs_writel(GUEST_CR3, vcpu->arch.cr3);
 
 	vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
@@ -3707,11 +3608,6 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
 		return NVMX_VMENTRY_KVM_INTERNAL_ERROR;
 	}
 
-	if (nested_vmx_check_vmentry_hw(vcpu)) {
-		vmx_switch_vmcs(vcpu, &vmx->vmcs01);
-		return NVMX_VMENTRY_VMFAIL;
-	}
-
 	if (nested_vmx_check_guest_state(vcpu, vmcs12,
 					 &entry_failure_code)) {
 		exit_reason.basic = EXIT_REASON_INVALID_STATE;
@@ -5176,12 +5072,10 @@ void __nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
 		/*
 		 * The only expected VM-instruction error is "VM entry with
 		 * invalid control field(s)."  Anything else indicates a
-		 * problem with L0.  And we should never get here with a
-		 * VMFail of any type if early consistency checks are enabled.
+		 * problem with L0.
 		 */
 		WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) !=
 			     VMXERR_ENTRY_INVALID_CONTROL_FIELD);
-		WARN_ON_ONCE(nested_early_check);
 	}
 
 	/*

From 1100e4910ad207bc00aedc8dfdb228dd1b81f310 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Thu, 18 Sep 2025 17:59:55 -0700
Subject: [PATCH 12/14] KVM: nVMX: Add an off-by-default module param to WARN
 on missed consistency checks

Add an off-by-default param, "warn_on_missed_cc", to have KVM WARN on a
missed VMX Consistency Check on nested VM-Enter, specifically so that
KVM developers and maintainers can more easily detect missing checks.

KVM's goal/intent is that KVM detect *all* VM-Fail conditions in
software, as relying on hardware leads to false passes when KVM's nested
support is a subset of hardware support, e.g. see commit 095686e6fcb4
("KVM: nVMX: Check vmcs12->guest_ia32_debugctl on nested VM-Enter").

With one notable exception, KVM now detects all VM-Fail scenarios for
which there is known test coverage, i.e. KVM developers can enable the
param and expect a clean run, and thus can use the param to detect
missed checks, e.g. when enabling new features, when writing new tests,
etc.

The one exception is an unfortunate consistency check on vTPR. Because
the vTPR for L2 comes from the virtual APIC page provided by L1, L2's
vTPR is fully writable at all times, i.e. is inherently subject to
TOCTOU issues with respect to checks in software versus consumption in
hardware. Further complicating matters is KVM's deferred handling of
vmcs12 pages when loading nested state; KVM flat out cannot check vTPR
during KVM_SET_NESTED_STATE without breaking setups that do on-demand
paging, e.g. for live migration and/or live update.

To fudge around the vTPR issue, add a "late" controls check for vTPR and
also treat an invalid virtual APIC as VM-Fail, but gate the check on
warn_on_missed_cc being enabled to avoid unwanted false positives, i.e.
to avoid breaking KVM in production.

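For example, a development or CI run could enable the param at module
load (a sketch assuming the usual kvm_intel module packaging; exact
invocation may vary by setup):

	# Reload kvm_intel with nested VMX and missed-CC warnings enabled.
	modprobe -r kvm_intel
	modprobe kvm_intel nested=1 warn_on_missed_cc=1

Any VM-Fail that reaches hardware without first being flagged by KVM's
software checks will then trip the new WARN, pointing at the missing
consistency check.
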
Cc: Jim Mattson
Link: https://lore.kernel.org/r/20250919005955.1366256-10-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/vmx/nested.c | 43 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 43 insertions(+)

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 24a2d0fa1660..b0cd745518b4 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -23,6 +23,9 @@
 static bool __read_mostly enable_shadow_vmcs = 1;
 module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);
 
+static bool __ro_after_init warn_on_missed_cc;
+module_param(warn_on_missed_cc, bool, 0444);
+
 #define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK
 
 /*
@@ -3073,6 +3076,38 @@ static int nested_vmx_check_controls(struct kvm_vcpu *vcpu,
 	return 0;
 }
 
+static int nested_vmx_check_controls_late(struct kvm_vcpu *vcpu,
+					  struct vmcs12 *vmcs12)
+{
+	void *vapic = to_vmx(vcpu)->nested.virtual_apic_map.hva;
+	u32 vtpr = vapic ? (*(u32 *)(vapic + APIC_TASKPRI)) >> 4 : 0;
+
+	/*
+	 * Don't bother with the consistency checks if KVM isn't configured to
+	 * WARN on missed consistency checks, as KVM needs to rely on hardware
+	 * to fully detect an illegal vTPR vs. TPR Threshold combination due to
+	 * the vTPR being writable by L1 at all times (it's an in-memory value,
+	 * not a VMCS field).  I.e. even if the check passes now, it might fail
+	 * at the actual VM-Enter.
+	 *
+	 * Keying off the module param also allows treating an invalid vAPIC
+	 * mapping as a consistency check failure without increasing the risk
+	 * of breaking a "real" VM.
+	 */
+	if (!warn_on_missed_cc)
+		return 0;
+
+	if ((exec_controls_get(to_vmx(vcpu)) & CPU_BASED_TPR_SHADOW) &&
+	    nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW) &&
+	    !nested_cpu_has_vid(vmcs12) &&
+	    !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
+	    (CC(!vapic) ||
+	     CC((vmcs12->tpr_threshold & GENMASK(3, 0)) > (vtpr & GENMASK(3, 0)))))
+		return -EINVAL;
+
+	return 0;
+}
+
 static int nested_vmx_check_address_space_size(struct kvm_vcpu *vcpu,
 					       struct vmcs12 *vmcs12)
 {
@@ -3608,6 +3643,11 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
 		return NVMX_VMENTRY_KVM_INTERNAL_ERROR;
 	}
 
+	if (nested_vmx_check_controls_late(vcpu, vmcs12)) {
+		vmx_switch_vmcs(vcpu, &vmx->vmcs01);
+		return NVMX_VMENTRY_VMFAIL;
+	}
+
 	if (nested_vmx_check_guest_state(vcpu, vmcs12,
 					 &entry_failure_code)) {
 		exit_reason.basic = EXIT_REASON_INVALID_STATE;
@@ -5076,6 +5116,9 @@ void __nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
 		 */
 		WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) !=
 			     VMXERR_ENTRY_INVALID_CONTROL_FIELD);
+
+		/* VM-Fail at VM-Entry means KVM missed a consistency check. */
+		WARN_ON_ONCE(warn_on_missed_cc);
 	}
 
 	/*

From 32ed0bc2f0f8ce411a822531c71b49fa93608b37 Mon Sep 17 00:00:00 2001
From: Uros Bizjak <ubizjak@gmail.com>
Date: Wed, 20 Aug 2025 11:59:54 +0200
Subject: [PATCH 13/14] KVM: VMX: Ensure guest's SPEC_CTRL[63:32] is loaded on
 VM-Enter

SPEC_CTRL is an MSR, i.e. a 64-bit value, but the assembly code that
loads the guest's value assumes bits 63:32 are always zero. The bug is
_currently_ benign because neither KVM nor the kernel support setting
any of bits 63:32, but it's still a bug that needs to be fixed.

Note, the host's value is restored in C code and is unaffected.

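For reference, WRMSR takes the MSR index in ECX and the 64-bit value
split across EDX:EAX, which is why zeroing %edx silently drops bits
63:32. A minimal sketch of the required split (illustrative only,
mirroring the semantics of the kernel's wrmsr helpers):

	#include <stdint.h>

	/* WRMSR: ECX = MSR index, EDX:EAX = 64-bit value. */
	static inline void wrmsr64(uint32_t msr, uint64_t val)
	{
		uint32_t lo = (uint32_t)val;		/* bits 31:0  -> EAX */
		uint32_t hi = (uint32_t)(val >> 32);	/* bits 63:32 -> EDX */

		asm volatile("wrmsr" :: "c" (msr), "a" (lo), "d" (hi) : "memory");
	}

The old fast path effectively did wrmsr64(MSR_IA32_SPEC_CTRL,
(uint32_t)guest_val) by unconditionally clearing %edx; the fix derives
%edx from the full 64-bit value instead.
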
Fixes: 07853adc29a0 ("KVM: VMX: Prevent RSB underflow before vmenter")
Suggested-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Cc: Sean Christopherson
Cc: Paolo Bonzini
Cc: Josh Poimboeuf
Cc: Thomas Gleixner
Cc: Ingo Molnar
Cc: Borislav Petkov
Cc: Dave Hansen
Cc: "H. Peter Anvin"
Link: https://patch.msgid.link/20250820100007.356761-1-ubizjak@gmail.com
[sean: call out that only the guest's value is affected]
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/vmx/vmenter.S | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S
index bc255d709d8a..574159a84ee9 100644
--- a/arch/x86/kvm/vmx/vmenter.S
+++ b/arch/x86/kvm/vmx/vmenter.S
@@ -118,13 +118,23 @@ SYM_FUNC_START(__vmx_vcpu_run)
 	 * and vmentry.
 	 */
 	mov 2*WORD_SIZE(%_ASM_SP), %_ASM_DI
-	movl VMX_spec_ctrl(%_ASM_DI), %edi
-	movl PER_CPU_VAR(x86_spec_ctrl_current), %esi
-	cmp %edi, %esi
+#ifdef CONFIG_X86_64
+	mov VMX_spec_ctrl(%rdi), %rdx
+	cmp PER_CPU_VAR(x86_spec_ctrl_current), %rdx
 	je .Lspec_ctrl_done
+	movl %edx, %eax
+	shr $32, %rdx
+#else
+	mov VMX_spec_ctrl(%edi), %eax
+	mov PER_CPU_VAR(x86_spec_ctrl_current), %ecx
+	xor %eax, %ecx
+	mov VMX_spec_ctrl + 4(%edi), %edx
+	mov PER_CPU_VAR(x86_spec_ctrl_current + 4), %edi
+	xor %edx, %edi
+	or %edi, %ecx
+	je .Lspec_ctrl_done
+#endif
 	mov $MSR_IA32_SPEC_CTRL, %ecx
-	xor %edx, %edx
-	mov %edi, %eax
 	wrmsr
 
 .Lspec_ctrl_done:

From dfd1572a64c90770a2bddfab9bbb69932217b1da Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Thu, 6 Nov 2025 12:51:14 -0800
Subject: [PATCH 14/14] KVM: VMX: Make loaded_vmcs_clear() static in vmx.c

Make loaded_vmcs_clear() local to vmx.c as there are no longer any
external callers.

No functional change intended.

Link: https://patch.msgid.link/20251106205114.218226-1-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/vmx/vmx.c | 2 +-
 arch/x86/kvm/vmx/vmx.h | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 1021d3b65ea0..d484f60fbeba 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -752,7 +752,7 @@ static void __loaded_vmcs_clear(void *arg)
 	loaded_vmcs->launched = 0;
 }
 
-void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
+static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
 {
 	int cpu = loaded_vmcs->cpu;
 
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 6cb04a6afeef..bc3ed3145d7e 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -680,7 +680,6 @@ struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags);
 void free_vmcs(struct vmcs *vmcs);
 int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs);
 void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs);
-void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs);
 
 static inline struct vmcs *alloc_vmcs(bool shadow)
 {