From 3ef0df3f760f438d275334bbe193f95886890fc4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 30 Sep 2024 22:00:45 -0700 Subject: [PATCH 1/6] KVM: VMX: Don't modify guest XFD_ERR if CR0.TS=1 Don't update the guest's XFD_ERR MSR if CR0.TS is set; per the SDM, XFD_ERR is not modified if CR0.TS=1. Although it's not explicitly stated in the SDM, conceptually it makes sense the CR0.TS check would be done prior to the XFD_ERR check, e.g. CR0.TS=1 blocks all SIMD state, whereas XFD blocks only XTILE state. Device-not-available exceptions that are not due to XFD - those resulting from setting CR0.TS to 1 - do not modify the IA32_XFD_ERR MSR. Opportunistically update the comment to call out that XFD_ERR is updated before the VM-Exit check occurs. Nothing in the SDM explicitly calls out this behavior, but logically it must be the behavior, otherwise reading XFD_ERR in handle_nm_fault_irqoff() would return stale data, i.e. the to-be-delivered XFD_ERR value would need to be saved in EXIT_QUALIFICATION, a la DR6 for #DB and CR2 for #PF, so that software could capture the guest value. Fixes: ec5be88ab29f ("kvm: x86: Intercept #NM for saving IA32_XFD_ERR") Signed-off-by: Xin Li (Intel) Tested-by: Shan Kang Link: https://lore.kernel.org/r/20241001050110.3643764-3-xin@zytor.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmx.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index f72835e85b6d..24fcab183c18 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6995,16 +6995,16 @@ static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu) * MSR value is not clobbered by the host activity before the guest * has chance to consume it. * - * Do not blindly read xfd_err here, since this exception might - * be caused by L1 interception on a platform which doesn't - * support xfd at all. + * Update the guest's XFD_ERR if and only if XFD is enabled, as the #NM + * interception may have been caused by L1 interception. Per the SDM, + * XFD_ERR is not modified if CR0.TS=1. * - * Do it conditionally upon guest_fpu::xfd. xfd_err matters - * only when xfd contains a non-zero value. - * - * Queuing exception is done in vmx_handle_exit. See comment there. + * Note, XFD_ERR is updated _before_ the #NM interception check, i.e. + * unlike CR2 and DR6, the value is not a payload that is attached to + * the #NM exception. */ - if (vcpu->arch.guest_fpu.fpstate->xfd) + if (vcpu->arch.guest_fpu.fpstate->xfd && + !kvm_is_cr0_bit_set(vcpu, X86_CR0_TS)) rdmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err); } From d62c02af7a96c8accf311287df80d6010044a822 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 30 Sep 2024 22:00:57 -0700 Subject: [PATCH 2/6] KVM: VMX: Pass XFD_ERR as pseudo-payload when injecting #NM Pass XFD_ERR via KVM's exception payload mechanism when injecting an #NM after interception so that XFD_ERR can be propagated to FRED's event_data field without needing a dedicated field (which would need to be migrated). For non-FRED vCPUs, this is a glorified NOP as kvm_deliver_exception_payload() will simply do nothing (which is desirable and correct). Signed-off-by: Xin Li (Intel) Tested-by: Shan Kang Link: https://lore.kernel.org/r/20241001050110.3643764-15-xin@zytor.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmx.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 24fcab183c18..ec4527e7fdf9 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -5215,6 +5215,12 @@ bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu) (kvm_get_rflags(vcpu) & X86_EFLAGS_AC); } +static bool is_xfd_nm_fault(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.guest_fpu.fpstate->xfd && + !kvm_is_cr0_bit_set(vcpu, X86_CR0_TS); +} + static int handle_exception_nmi(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -5241,7 +5247,8 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) * point. */ if (is_nm_fault(intr_info)) { - kvm_queue_exception(vcpu, NM_VECTOR); + kvm_queue_exception_p(vcpu, NM_VECTOR, + is_xfd_nm_fault(vcpu) ? vcpu->arch.guest_fpu.xfd_err : 0); return 1; } @@ -6997,14 +7004,13 @@ static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu) * * Update the guest's XFD_ERR if and only if XFD is enabled, as the #NM * interception may have been caused by L1 interception. Per the SDM, - * XFD_ERR is not modified if CR0.TS=1. + * XFD_ERR is not modified for non-XFD #NM, i.e. if CR0.TS=1. * * Note, XFD_ERR is updated _before_ the #NM interception check, i.e. * unlike CR2 and DR6, the value is not a payload that is attached to * the #NM exception. */ - if (vcpu->arch.guest_fpu.fpstate->xfd && - !kvm_is_cr0_bit_set(vcpu, X86_CR0_TS)) + if (is_xfd_nm_fault(vcpu)) rdmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err); } From fa6c8fc2d2673dcaf7333bc35eb759ab7c39b81f Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 26 Feb 2025 16:07:04 -0800 Subject: [PATCH 3/6] KVM: VMX: Remove EPT_VIOLATIONS_ACC_*_BIT defines Those defines are only used in the definition of the various EPT_VIOLATIONS_ACC_* macros which are then used to extract respective bits from vmexit error qualifications. Remove the _BIT defines and redefine the _ACC ones via BIT() macro. No functional changes. Signed-off-by: Nikolay Borisov Link: https://lore.kernel.org/r/20250227000705.3199706-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/vmx.h | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index f7fd4369b821..aabc223c6498 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -580,18 +580,13 @@ enum vm_entry_failure_code { /* * Exit Qualifications for EPT Violations */ -#define EPT_VIOLATION_ACC_READ_BIT 0 -#define EPT_VIOLATION_ACC_WRITE_BIT 1 -#define EPT_VIOLATION_ACC_INSTR_BIT 2 #define EPT_VIOLATION_RWX_SHIFT 3 -#define EPT_VIOLATION_GVA_IS_VALID_BIT 7 -#define EPT_VIOLATION_GVA_TRANSLATED_BIT 8 -#define EPT_VIOLATION_ACC_READ (1 << EPT_VIOLATION_ACC_READ_BIT) -#define EPT_VIOLATION_ACC_WRITE (1 << EPT_VIOLATION_ACC_WRITE_BIT) -#define EPT_VIOLATION_ACC_INSTR (1 << EPT_VIOLATION_ACC_INSTR_BIT) +#define EPT_VIOLATION_ACC_READ BIT(0) +#define EPT_VIOLATION_ACC_WRITE BIT(1) +#define EPT_VIOLATION_ACC_INSTR BIT(2) #define EPT_VIOLATION_RWX_MASK (VMX_EPT_RWX_MASK << EPT_VIOLATION_RWX_SHIFT) -#define EPT_VIOLATION_GVA_IS_VALID (1 << EPT_VIOLATION_GVA_IS_VALID_BIT) -#define EPT_VIOLATION_GVA_TRANSLATED (1 << EPT_VIOLATION_GVA_TRANSLATED_BIT) +#define EPT_VIOLATION_GVA_IS_VALID BIT(7) +#define EPT_VIOLATION_GVA_TRANSLATED BIT(8) /* * Exit Qualifications for NOTIFY VM EXIT From 61146f67e4cb67064ce3003d94ee19302d314fff Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 26 Feb 2025 16:07:05 -0800 Subject: [PATCH 4/6] KVM: nVMX: Decouple EPT RWX bits from EPT Violation protection bits Define independent macros for the RWX protection bits that are enumerated via EXIT_QUALIFICATION for EPT Violations, and tie them to the RWX bits in EPT entries via compile-time asserts. Piggybacking the EPTE defines works for now, but it creates holes in the EPT_VIOLATION_xxx macros and will cause headaches if/when KVM emulates Mode-Based Execution (MBEC), or any other features that introduces additional protection information. Opportunistically rename EPT_VIOLATION_RWX_MASK to EPT_VIOLATION_PROT_MASK so that it doesn't become stale if/when MBEC support is added. No functional change intended. Cc: Jon Kohler Cc: Nikolay Borisov Reviewed-by: Nikolay Borisov Link: https://lore.kernel.org/r/20250227000705.3199706-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/vmx.h | 13 +++++++++++-- arch/x86/kvm/mmu/paging_tmpl.h | 3 +-- arch/x86/kvm/vmx/vmx.c | 2 +- 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index aabc223c6498..8707361b24da 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -580,14 +580,23 @@ enum vm_entry_failure_code { /* * Exit Qualifications for EPT Violations */ -#define EPT_VIOLATION_RWX_SHIFT 3 #define EPT_VIOLATION_ACC_READ BIT(0) #define EPT_VIOLATION_ACC_WRITE BIT(1) #define EPT_VIOLATION_ACC_INSTR BIT(2) -#define EPT_VIOLATION_RWX_MASK (VMX_EPT_RWX_MASK << EPT_VIOLATION_RWX_SHIFT) +#define EPT_VIOLATION_PROT_READ BIT(3) +#define EPT_VIOLATION_PROT_WRITE BIT(4) +#define EPT_VIOLATION_PROT_EXEC BIT(5) +#define EPT_VIOLATION_PROT_MASK (EPT_VIOLATION_PROT_READ | \ + EPT_VIOLATION_PROT_WRITE | \ + EPT_VIOLATION_PROT_EXEC) #define EPT_VIOLATION_GVA_IS_VALID BIT(7) #define EPT_VIOLATION_GVA_TRANSLATED BIT(8) +#define EPT_VIOLATION_RWX_TO_PROT(__epte) (((__epte) & VMX_EPT_RWX_MASK) << 3) + +static_assert(EPT_VIOLATION_RWX_TO_PROT(VMX_EPT_RWX_MASK) == + (EPT_VIOLATION_PROT_READ | EPT_VIOLATION_PROT_WRITE | EPT_VIOLATION_PROT_EXEC)); + /* * Exit Qualifications for NOTIFY VM EXIT */ diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index f4711674c47b..68e323568e95 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -510,8 +510,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, * Note, pte_access holds the raw RWX bits from the EPTE, not * ACC_*_MASK flags! */ - walker->fault.exit_qualification |= (pte_access & VMX_EPT_RWX_MASK) << - EPT_VIOLATION_RWX_SHIFT; + walker->fault.exit_qualification |= EPT_VIOLATION_RWX_TO_PROT(pte_access); } #endif walker->fault.address = addr; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index ec4527e7fdf9..9bbcdbfc7f2c 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -5822,7 +5822,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR) ? PFERR_FETCH_MASK : 0; /* ept page table entry is present? */ - error_code |= (exit_qualification & EPT_VIOLATION_RWX_MASK) + error_code |= (exit_qualification & EPT_VIOLATION_PROT_MASK) ? PFERR_PRESENT_MASK : 0; if (error_code & EPT_VIOLATION_GVA_IS_VALID) From 64c947a1cf351989245ba83eb0587645c8d0c482 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 24 Feb 2025 09:14:09 -0800 Subject: [PATCH 5/6] KVM: VMX: Reject KVM_RUN if userspace forces emulation during nested VM-Enter Extend KVM's restrictions on userspace forcing "emulation required" at odd times to cover stuffing invalid guest state while a nested run is pending. Clobbering guest state while KVM is in the middle of emulating VM-Enter is nonsensical, as it puts the vCPU into an architecturally impossible state, and will trip KVM's sanity check that guards against KVM bugs, e.g. helps detect missed VMX consistency checks. WARNING: CPU: 3 PID: 6336 at arch/x86/kvm/vmx/vmx.c:6480 __vmx_handle_exit arch/x86/kvm/vmx/vmx.c:6480 [inline] WARNING: CPU: 3 PID: 6336 at arch/x86/kvm/vmx/vmx.c:6480 vmx_handle_exit+0x40f/0x1f70 arch/x86/kvm/vmx/vmx.c:6637 Modules linked in: CPU: 3 UID: 0 PID: 6336 Comm: syz.0.73 Not tainted 6.13.0-rc1-syzkaller-00316-gb5f217084ab3 #0 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2~bpo12+1 04/01/2014 RIP: 0010:__vmx_handle_exit arch/x86/kvm/vmx/vmx.c:6480 [inline] RIP: 0010:vmx_handle_exit+0x40f/0x1f70 arch/x86/kvm/vmx/vmx.c:6637 vcpu_enter_guest arch/x86/kvm/x86.c:11081 [inline] vcpu_run+0x3047/0x4f50 arch/x86/kvm/x86.c:11242 kvm_arch_vcpu_ioctl_run+0x44a/0x1740 arch/x86/kvm/x86.c:11560 kvm_vcpu_ioctl+0x6ce/0x1520 virt/kvm/kvm_main.c:4340 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:906 [inline] __se_sys_ioctl fs/ioctl.c:892 [inline] __x64_sys_ioctl+0x190/0x200 fs/ioctl.c:892 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xcd/0x250 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f Reported-by: syzbot+ac0bc3a70282b4d586cc@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/67598fb9.050a0220.17f54a.003b.GAE@google.com Debugged-by: James Houghton Tested-by: James Houghton Link: https://lore.kernel.org/r/20250224171409.2348647-1-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmx.c | 32 ++++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 9bbcdbfc7f2c..67ebc1959724 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -5876,11 +5876,35 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu) return 1; } -static bool vmx_emulation_required_with_pending_exception(struct kvm_vcpu *vcpu) +/* + * Returns true if emulation is required (due to the vCPU having invalid state + * with unsrestricted guest mode disabled) and KVM can't faithfully emulate the + * current vCPU state. + */ +static bool vmx_unhandleable_emulation_required(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - return vmx->emulation_required && !vmx->rmode.vm86_active && + if (!vmx->emulation_required) + return false; + + /* + * It is architecturally impossible for emulation to be required when a + * nested VM-Enter is pending completion, as VM-Enter will VM-Fail if + * guest state is invalid and unrestricted guest is disabled, i.e. KVM + * should synthesize VM-Fail instead emulation L2 code. This path is + * only reachable if userspace modifies L2 guest state after KVM has + * performed the nested VM-Enter consistency checks. + */ + if (vmx->nested.nested_run_pending) + return true; + + /* + * KVM only supports emulating exceptions if the vCPU is in Real Mode. + * If emulation is required, KVM can't perform a successful VM-Enter to + * inject the exception. + */ + return !vmx->rmode.vm86_active && (kvm_is_exception_pending(vcpu) || vcpu->arch.exception.injected); } @@ -5903,7 +5927,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) if (!kvm_emulate_instruction(vcpu, 0)) return 0; - if (vmx_emulation_required_with_pending_exception(vcpu)) { + if (vmx_unhandleable_emulation_required(vcpu)) { kvm_prepare_emulation_failure_exit(vcpu); return 0; } @@ -5927,7 +5951,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu) { - if (vmx_emulation_required_with_pending_exception(vcpu)) { + if (vmx_unhandleable_emulation_required(vcpu)) { kvm_prepare_emulation_failure_exit(vcpu); return 0; } From 0c3566b63de860f6d42e3d9254890c00ac0970d7 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 26 Feb 2025 16:53:53 -0800 Subject: [PATCH 6/6] KVM: VMX: Extract checks on entry/exit control pairs to a helper macro Extract the checking of entry/exit pairs to a helper macro so that the code can be reused to process the upcoming "secondary" exit controls (the primary exit controls field is out of bits). Use a macro instead of a function to support different sized variables (all secondary exit controls will be optional and so the MSR doesn't have the fixed-0/fixed-1 split). Taking the largest size as input is trivial, but handling the modification of KVM's to-be-used controls is much trickier, e.g. would require bitmap games to clear bits from a 32-bit bitmap vs. a 64-bit bitmap. Opportunistically add sanity checks to ensure the size of the controls match (yay, macro!), e.g. to detect bugs where KVM passes in the pairs for primary exit controls, but its variable for the secondary exit controls. To help users triage mismatches, print the control bits that are checked, not just the actual value. For the foreseeable future, that provides enough information for a user to determine which fields mismatched. E.g. until secondary entry controls comes along, all entry bits and thus all error messages are guaranteed to be unique. To avoid returning from a macro, which can get quite dangerous, simply process all pairs even if error_on_inconsistent_vmcs_config is set. The speed at which KVM rejects module load is not at all interesting. Keep the error message a "once" printk, even though it would be nice to print out all mismatching pairs. In practice, the most likely scenario is that a single pair will mismatch on all CPUs. Printing all mismatches generates redundant messages in that situation, and can be extremely noisy on systems with large numbers of CPUs. If a CPU has multiple mismatches, not printing every bad pair is the least of the user's concerns. Cc: Xin Li (Intel) Link: https://lore.kernel.org/r/20250227005353.3216123-1-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmx.c | 48 +++++++++++++++++++++++++++--------------- 1 file changed, 31 insertions(+), 17 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 67ebc1959724..142cdbd4f481 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2582,6 +2582,34 @@ static u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr) return ctl_opt & allowed; } +#define vmx_check_entry_exit_pairs(pairs, entry_controls, exit_controls) \ +({ \ + int i, r = 0; \ + \ + BUILD_BUG_ON(sizeof(pairs[0].entry_control) != sizeof(entry_controls)); \ + BUILD_BUG_ON(sizeof(pairs[0].exit_control) != sizeof(exit_controls)); \ + \ + for (i = 0; i < ARRAY_SIZE(pairs); i++) { \ + typeof(entry_controls) n_ctrl = pairs[i].entry_control; \ + typeof(exit_controls) x_ctrl = pairs[i].exit_control; \ + \ + if (!(entry_controls & n_ctrl) == !(exit_controls & x_ctrl)) \ + continue; \ + \ + pr_warn_once("Inconsistent VM-Entry/VM-Exit pair, " \ + "entry = %llx (%llx), exit = %llx (%llx)\n", \ + (u64)(entry_controls & n_ctrl), (u64)n_ctrl, \ + (u64)(exit_controls & x_ctrl), (u64)x_ctrl); \ + \ + if (error_on_inconsistent_vmcs_config) \ + r = -EIO; \ + \ + entry_controls &= ~n_ctrl; \ + exit_controls &= ~x_ctrl; \ + } \ + r; \ +}) + static int setup_vmcs_config(struct vmcs_config *vmcs_conf, struct vmx_capability *vmx_cap) { @@ -2593,7 +2621,6 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf, u32 _vmentry_control = 0; u64 basic_msr; u64 misc_msr; - int i; /* * LOAD/SAVE_DEBUG_CONTROLS are absent because both are mandatory. @@ -2697,22 +2724,9 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf, &_vmentry_control)) return -EIO; - for (i = 0; i < ARRAY_SIZE(vmcs_entry_exit_pairs); i++) { - u32 n_ctrl = vmcs_entry_exit_pairs[i].entry_control; - u32 x_ctrl = vmcs_entry_exit_pairs[i].exit_control; - - if (!(_vmentry_control & n_ctrl) == !(_vmexit_control & x_ctrl)) - continue; - - pr_warn_once("Inconsistent VM-Entry/VM-Exit pair, entry = %x, exit = %x\n", - _vmentry_control & n_ctrl, _vmexit_control & x_ctrl); - - if (error_on_inconsistent_vmcs_config) - return -EIO; - - _vmentry_control &= ~n_ctrl; - _vmexit_control &= ~x_ctrl; - } + if (vmx_check_entry_exit_pairs(vmcs_entry_exit_pairs, + _vmentry_control, _vmexit_control)) + return -EIO; /* * Some cpus support VM_{ENTRY,EXIT}_IA32_PERF_GLOBAL_CTRL but they