diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index c3b53beb1300..81f7b3b91986 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -472,6 +472,7 @@
 #define X86_FEATURE_GP_ON_USER_CPUID	(20*32+17) /* User CPUID faulting */
 #define X86_FEATURE_PREFETCHI		(20*32+20) /* Prefetch Data/Instruction to Cache Level */
+#define X86_FEATURE_ERAPS		(20*32+24) /* Enhanced Return Address Predictor Security */
 #define X86_FEATURE_SBPB		(20*32+27) /* Selective Branch Prediction Barrier */
 #define X86_FEATURE_IBPB_BRTYPE		(20*32+28) /* MSR_PRED_CMD[IBPB] flushes all branch type predictions */
 #define X86_FEATURE_SRSO_NO		(20*32+29) /* CPU is not affected by SRSO */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 5a3bfa293e8b..0353d8b6988c 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -195,7 +195,15 @@ enum kvm_reg {
 	VCPU_EXREG_PDPTR = NR_VCPU_REGS,
 	VCPU_EXREG_CR0,
+	/*
+	 * Alias AMD's ERAPS (not a real register) to CR3 so that common code
+	 * can trigger emulation of the RAP (Return Address Predictor) with
+	 * minimal bespoke support. Piggyback CR3 as the RAP is cleared on
+	 * writes to CR3, i.e. marking CR3 dirty will naturally mark ERAPS
+	 * dirty as well.
+	 */
 	VCPU_EXREG_CR3,
+	VCPU_EXREG_ERAPS = VCPU_EXREG_CR3,
 	VCPU_EXREG_CR4,
 	VCPU_EXREG_RFLAGS,
 	VCPU_EXREG_SEGMENTS,
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 56aa99503dc4..50ece197c98a 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -131,7 +131,8 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
 	u64 tsc_offset;
 	u32 asid;
 	u8 tlb_ctl;
-	u8 reserved_2[3];
+	u8 erap_ctl;
+	u8 reserved_2[2];
 	u32 int_ctl;
 	u32 int_vector;
 	u32 int_state;
@@ -182,6 +183,9 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
 #define TLB_CONTROL_FLUSH_ASID 3
 #define TLB_CONTROL_FLUSH_ASID_LOCAL 7
 
+#define ERAP_CONTROL_ALLOW_LARGER_RAP	BIT(0)
+#define ERAP_CONTROL_CLEAR_RAP		BIT(1)
+
 #define V_TPR_MASK 0x0f
 
 #define V_IRQ_SHIFT 8
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 88a5426674a1..c590a5bd3196 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -1223,6 +1223,7 @@ void kvm_set_cpu_caps(void)
 		/* PrefetchCtlMsr */
 		/* GpOnUserCpuid */
 		/* EPSF */
+		F(ERAPS),
 		SYNTHESIZED_F(SBPB),
 		SYNTHESIZED_F(IBPB_BRTYPE),
 		SYNTHESIZED_F(SRSO_NO),
@@ -1803,8 +1804,15 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
 		entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
 		break;
 	case 0x80000021:
-		entry->ebx = entry->edx = 0;
+		entry->edx = 0;
 		cpuid_entry_override(entry, CPUID_8000_0021_EAX);
+
+		/* EBX[23:16] enumerates the RAP size; don't leak other bits. */
+		if (kvm_cpu_cap_has(X86_FEATURE_ERAPS))
+			entry->ebx &= GENMASK(23, 16);
+		else
+			entry->ebx = 0;
+
 		cpuid_entry_override(entry, CPUID_8000_0021_ECX);
 		break;
 	/* AMD Extended Performance Monitoring and Debug */
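A minimal userspace sketch of the CPUID.8000_0021H:EBX sanitization above, assuming, as the patch's GENMASK(23, 16) implies, that EBX[23:16] enumerates the RAP size. sanitize_ebx() and the sample values are illustrative, not KVM code:

#include <stdint.h>
#include <stdio.h>

#define RAP_SIZE_MASK	(0xffu << 16)	/* GENMASK(23, 16) */

/* Keep only the RAP size field when ERAPS is advertised to the guest,
 * otherwise hide all of EBX, mirroring the 0x80000021 case above. */
static uint32_t sanitize_ebx(uint32_t hw_ebx, int eraps)
{
	return eraps ? (hw_ebx & RAP_SIZE_MASK) : 0;
}

int main(void)
{
	uint32_t hw_ebx = (64u << 16) | 0x5;	/* RAP size 64 + stray bits */

	printf("ERAPS on:  0x%08x\n", (unsigned)sanitize_ebx(hw_ebx, 1)); /* 0x00400000 */
	printf("ERAPS off: 0x%08x\n", (unsigned)sanitize_ebx(hw_ebx, 0)); /* 0x00000000 */
	return 0;
}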
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index aa1bea134ace..5a1e1164c197 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -436,6 +436,7 @@ void __nested_copy_vmcb_control_to_cache(struct kvm_vcpu *vcpu,
 	to->msrpm_base_pa	= from->msrpm_base_pa;
 	to->tsc_offset		= from->tsc_offset;
 	to->tlb_ctl		= from->tlb_ctl;
+	to->erap_ctl		= from->erap_ctl;
 	to->int_ctl		= from->int_ctl;
 	to->int_vector		= from->int_vector;
 	to->int_state		= from->int_state;
@@ -885,6 +886,19 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
 		}
 	}
 
+	/*
+	 * Take ALLOW_LARGER_RAP from vmcb12: even though it would be safe to
+	 * always let L2 use a larger RAP, as KVM emulates the necessary
+	 * clears, it's possible L1 deliberately wants to restrict L2 to the
+	 * legacy RAP size. Unconditionally clear the RAP on nested VMRUN, as
+	 * KVM is responsible for emulating the host vs. guest tags (L1 is the
+	 * "host", L2 is the "guest").
+	 */
+	if (guest_cpu_cap_has(vcpu, X86_FEATURE_ERAPS))
+		vmcb02->control.erap_ctl = (svm->nested.ctl.erap_ctl &
+					    ERAP_CONTROL_ALLOW_LARGER_RAP) |
+					   ERAP_CONTROL_CLEAR_RAP;
+
 	/*
 	 * Merge guest and host intercepts - must be called with vcpu in
 	 * guest-mode to take effect.
@@ -1180,6 +1194,9 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
 
 	kvm_nested_vmexit_handle_ibrs(vcpu);
 
+	if (guest_cpu_cap_has(vcpu, X86_FEATURE_ERAPS))
+		vmcb01->control.erap_ctl |= ERAP_CONTROL_CLEAR_RAP;
+
 	svm_switch_vmcb(svm, &svm->vmcb01);
 
 	/*
@@ -1686,6 +1703,7 @@ static void nested_copy_vmcb_cache_to_control(struct vmcb_control_area *dst,
 	dst->tsc_offset		= from->tsc_offset;
 	dst->asid		= from->asid;
 	dst->tlb_ctl		= from->tlb_ctl;
+	dst->erap_ctl		= from->erap_ctl;
 	dst->int_ctl		= from->int_ctl;
 	dst->int_vector		= from->int_vector;
 	dst->int_state		= from->int_state;
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index d1ff23e02ecd..34c8a94b1b81 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1141,6 +1141,9 @@ static void init_vmcb(struct kvm_vcpu *vcpu, bool init_event)
 		svm_clr_intercept(svm, INTERCEPT_PAUSE);
 	}
 
+	if (guest_cpu_cap_has(vcpu, X86_FEATURE_ERAPS))
+		svm->vmcb->control.erap_ctl |= ERAP_CONTROL_ALLOW_LARGER_RAP;
+
 	if (kvm_vcpu_apicv_active(vcpu))
 		avic_init_vmcb(svm, vmcb);
 
@@ -3293,6 +3296,7 @@ static void dump_vmcb(struct kvm_vcpu *vcpu)
 	pr_err("%-20s%016llx\n", "tsc_offset:", control->tsc_offset);
 	pr_err("%-20s%d\n", "asid:", control->asid);
 	pr_err("%-20s%d\n", "tlb_ctl:", control->tlb_ctl);
+	pr_err("%-20s%d\n", "erap_ctl:", control->erap_ctl);
 	pr_err("%-20s%08x\n", "int_ctl:", control->int_ctl);
 	pr_err("%-20s%08x\n", "int_vector:", control->int_vector);
 	pr_err("%-20s%08x\n", "int_state:", control->int_state);
@@ -4004,6 +4008,13 @@ static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva)
 	invlpga(gva, svm->vmcb->control.asid);
 }
 
+static void svm_flush_tlb_guest(struct kvm_vcpu *vcpu)
+{
+	kvm_register_mark_dirty(vcpu, VCPU_EXREG_ERAPS);
+
+	svm_flush_tlb_asid(vcpu);
+}
+
 static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
@@ -4262,6 +4273,10 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
 	}
 	svm->vmcb->save.cr2 = vcpu->arch.cr2;
 
+	if (guest_cpu_cap_has(vcpu, X86_FEATURE_ERAPS) &&
+	    kvm_register_is_dirty(vcpu, VCPU_EXREG_ERAPS))
+		svm->vmcb->control.erap_ctl |= ERAP_CONTROL_CLEAR_RAP;
+
 	svm_hv_update_vp_id(svm->vmcb, vcpu);
 
 	/*
@@ -4339,6 +4354,14 @@
 	}
 	svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
+
+	/*
+	 * Unconditionally mask off the CLEAR_RAP bit; the AND is just as
+	 * cheap as a TEST+Jcc to avoid it.
+	 */
+	if (cpu_feature_enabled(X86_FEATURE_ERAPS))
+		svm->vmcb->control.erap_ctl &= ~ERAP_CONTROL_CLEAR_RAP;
+
 	vmcb_mark_all_clean(svm->vmcb);
 
 	/* if exit due to PF check for async PF */
@@ -5094,7 +5117,7 @@ struct kvm_x86_ops svm_x86_ops __initdata = {
 	.flush_tlb_all = svm_flush_tlb_all,
 	.flush_tlb_current = svm_flush_tlb_current,
 	.flush_tlb_gva = svm_flush_tlb_gva,
-	.flush_tlb_guest = svm_flush_tlb_asid,
+	.flush_tlb_guest = svm_flush_tlb_guest,
 
 	.vcpu_pre_run = svm_vcpu_pre_run,
 	.vcpu_run = svm_vcpu_run,
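The erap_ctl handling in svm_vcpu_run() above amounts to a one-shot request: a pending "ERAPS dirty" flag becomes CLEAR_RAP before entry, and the bit is masked off after the hardware consumes it. A compilable toy model of that lifecycle, with invented names (struct vcpu, run_guest, eraps_dirty) standing in for KVM's:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define ERAP_CONTROL_ALLOW_LARGER_RAP	(1u << 0)
#define ERAP_CONTROL_CLEAR_RAP		(1u << 1)

/* Stand-ins for KVM's vCPU state and dirty-register tracking. */
struct vcpu {
	uint8_t erap_ctl;
	bool eraps_dirty;	/* models VCPU_EXREG_ERAPS's dirty bit */
};

static void run_guest(struct vcpu *v)
{
	/* Pre-entry: a pending RAP flush becomes a one-shot CLEAR_RAP. */
	if (v->eraps_dirty) {
		v->erap_ctl |= ERAP_CONTROL_CLEAR_RAP;
		v->eraps_dirty = false;
	}

	printf("VMRUN with erap_ctl=0x%02x\n", v->erap_ctl);

	/* Post-exit: mask off CLEAR_RAP so the next entry doesn't re-clear
	 * the RAP; the AND is cheaper than testing whether it's set. */
	v->erap_ctl &= ~ERAP_CONTROL_CLEAR_RAP;
}

int main(void)
{
	struct vcpu v = { .erap_ctl = ERAP_CONTROL_ALLOW_LARGER_RAP };

	run_guest(&v);		/* 0x01: no flush pending */
	v.eraps_dirty = true;	/* e.g. a guest TLB flush was emulated */
	run_guest(&v);		/* 0x03: larger RAP + one-shot clear */
	run_guest(&v);		/* 0x01 again: CLEAR_RAP was masked off */
	return 0;
}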
+ */ + if (cpu_feature_enabled(X86_FEATURE_ERAPS)) + svm->vmcb->control.erap_ctl &= ~ERAP_CONTROL_CLEAR_RAP; + vmcb_mark_all_clean(svm->vmcb); /* if exit due to PF check for async PF */ @@ -5094,7 +5117,7 @@ struct kvm_x86_ops svm_x86_ops __initdata = { .flush_tlb_all = svm_flush_tlb_all, .flush_tlb_current = svm_flush_tlb_current, .flush_tlb_gva = svm_flush_tlb_gva, - .flush_tlb_guest = svm_flush_tlb_asid, + .flush_tlb_guest = svm_flush_tlb_guest, .vcpu_pre_run = svm_vcpu_pre_run, .vcpu_run = svm_vcpu_run, diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 806e68ba821b..7d28a739865f 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -156,6 +156,7 @@ struct vmcb_ctrl_area_cached { u64 tsc_offset; u32 asid; u8 tlb_ctl; + u8 erap_ctl; u32 int_ctl; u32 int_vector; u32 int_state; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ff8812f3a129..e013392fe20c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -14130,6 +14130,13 @@ int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva) return 1; } + /* + * When ERAPS is supported, invalidating a specific PCID clears + * the RAP (Return Address Predicator). + */ + if (guest_cpu_cap_has(vcpu, X86_FEATURE_ERAPS)) + kvm_register_is_dirty(vcpu, VCPU_EXREG_ERAPS); + kvm_invalidate_pcid(vcpu, operand.pcid); return kvm_skip_emulated_instruction(vcpu); @@ -14143,6 +14150,11 @@ int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva) fallthrough; case INVPCID_TYPE_ALL_INCL_GLOBAL: + /* + * Don't bother marking VCPU_EXREG_ERAPS dirty, SVM will take + * care of doing so when emulating the full guest TLB flush + * (the RAP is cleared on all implicit TLB flushes). + */ kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); return kvm_skip_emulated_instruction(vcpu);