From acf2923271ef3611ceab7004fc8798d3ba31b739 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 2 Aug 2024 13:20:06 -0700 Subject: [PATCH 01/33] KVM: x86/mmu: Clean up function comments for dirty logging APIs Rework the function comment for kvm_arch_mmu_enable_log_dirty_pt_masked() into the body of the function, as it has gotten a bit stale, is harder to read without the code context, and is the last source of warnings for W=1 builds in KVM x86 due to using a kernel-doc comment without documenting all parameters. Opportunistically subsume the functions comments for kvm_mmu_write_protect_pt_masked() and kvm_mmu_clear_dirty_pt_masked(), as there is no value in regurgitating similar information at a higher level, and capturing the differences between write-protection and PML-based dirty logging is best done in a common location. No functional change intended. Cc: David Matlack Reviewed-by: Kai Huang Reviewed-by: Pankaj Gupta Link: https://lore.kernel.org/r/20240802202006.340854-1-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 48 +++++++++++++----------------------------- 1 file changed, 15 insertions(+), 33 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 928cf84778b0..5226bb055d99 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1307,15 +1307,6 @@ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head, return flush; } -/** - * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages - * @kvm: kvm instance - * @slot: slot to protect - * @gfn_offset: start of the BITS_PER_LONG pages we care about - * @mask: indicates which pages we should protect - * - * Used when we do not need to care about huge page mappings. - */ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn_offset, unsigned long mask) @@ -1339,16 +1330,6 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, } } -/** - * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages, or write - * protect the page if the D-bit isn't supported. - * @kvm: kvm instance - * @slot: slot to clear D-bit - * @gfn_offset: start of the BITS_PER_LONG pages we care about - * @mask: indicates which pages we should clear D-bit - * - * Used for PML to re-log the dirty GPAs after userspace querying dirty_bitmap. - */ static void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn_offset, unsigned long mask) @@ -1372,24 +1353,16 @@ static void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, } } -/** - * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected - * PT level pages. - * - * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to - * enable dirty logging for them. - * - * We need to care about huge page mappings: e.g. during dirty logging we may - * have such mappings. - */ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn_offset, unsigned long mask) { /* - * Huge pages are NOT write protected when we start dirty logging in - * initially-all-set mode; must write protect them here so that they - * are split to 4K on the first write. + * If the slot was assumed to be "initially all dirty", write-protect + * huge pages to ensure they are split to 4KiB on the first write (KVM + * dirty logs at 4KiB granularity). If eager page splitting is enabled, + * immediately try to split huge pages, e.g. so that vCPUs don't get + * saddled with the cost of splitting. * * The gfn_offset is guaranteed to be aligned to 64, but the base_gfn * of memslot has no such restriction, so the range can cross two large @@ -1411,7 +1384,16 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, PG_LEVEL_2M); } - /* Now handle 4K PTEs. */ + /* + * (Re)Enable dirty logging for all 4KiB SPTEs that map the GFNs in + * mask. If PML is enabled and the GFN doesn't need to be write- + * protected for other reasons, e.g. shadow paging, clear the Dirty bit. + * Otherwise clear the Writable bit. + * + * Note that kvm_mmu_clear_dirty_pt_masked() is called whenever PML is + * enabled but it chooses between clearing the Dirty bit and Writeable + * bit based on the context. + */ if (kvm_x86_ops.cpu_dirty_log_size) kvm_mmu_clear_dirty_pt_masked(kvm, slot, gfn_offset, mask); else From 174b6e4a25ea80c2432cedd8e2760e152a6d7f82 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 2 Aug 2024 13:38:58 -0700 Subject: [PATCH 02/33] KVM: x86/mmu: Decrease indentation in logic to sync new indirect shadow page Combine the back-to-back if-statements for synchronizing children when linking a new indirect shadow page in order to decrease the indentation, and to make it easier to "see" the logic in its entirety. No functional change intended. Link: https://lore.kernel.org/r/20240802203900.348808-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/paging_tmpl.h | 40 ++++++++++++++++------------------ 1 file changed, 19 insertions(+), 21 deletions(-) diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 69941cebb3a8..0e97e080a997 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -674,27 +674,25 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, sp = kvm_mmu_get_child_sp(vcpu, it.sptep, table_gfn, false, access); - if (sp != ERR_PTR(-EEXIST)) { - /* - * We must synchronize the pagetable before linking it - * because the guest doesn't need to flush tlb when - * the gpte is changed from non-present to present. - * Otherwise, the guest may use the wrong mapping. - * - * For PG_LEVEL_4K, kvm_mmu_get_page() has already - * synchronized it transiently via kvm_sync_page(). - * - * For higher level pagetable, we synchronize it via - * the slower mmu_sync_children(). If it needs to - * break, some progress has been made; return - * RET_PF_RETRY and retry on the next #PF. - * KVM_REQ_MMU_SYNC is not necessary but it - * expedites the process. - */ - if (sp->unsync_children && - mmu_sync_children(vcpu, sp, false)) - return RET_PF_RETRY; - } + /* + * Synchronize the new page before linking it, as the CPU (KVM) + * is architecturally disallowed from inserting non-present + * entries into the TLB, i.e. the guest isn't required to flush + * the TLB when changing the gPTE from non-present to present. + * + * For PG_LEVEL_4K, kvm_mmu_find_shadow_page() has already + * synchronized the page via kvm_sync_page(). + * + * For higher level pages, which cannot be unsync themselves + * but can have unsync children, synchronize via the slower + * mmu_sync_children(). If KVM needs to drop mmu_lock due to + * contention or to reschedule, instruct the caller to retry + * the #PF (mmu_sync_children() ensures forward progress will + * be made). + */ + if (sp != ERR_PTR(-EEXIST) && sp->unsync_children && + mmu_sync_children(vcpu, sp, false)) + return RET_PF_RETRY; /* * Verify that the gpte in the page we've just write From 7d67b03e6fff3934913a38674f269f55964bdd65 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 2 Aug 2024 13:38:59 -0700 Subject: [PATCH 03/33] KVM: x86/mmu: Drop pointless "return" wrapper label in FNAME(fetch) Drop the pointless and poorly named "out_gpte_changed" label, in FNAME(fetch), and instead return RET_PF_RETRY directly. No functional change intended. Link: https://lore.kernel.org/r/20240802203900.348808-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/paging_tmpl.h | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 0e97e080a997..480c54122991 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -646,10 +646,10 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, * really care if it changes underneath us after this point). */ if (FNAME(gpte_changed)(vcpu, gw, top_level)) - goto out_gpte_changed; + return RET_PF_RETRY; if (WARN_ON_ONCE(!VALID_PAGE(vcpu->arch.mmu->root.hpa))) - goto out_gpte_changed; + return RET_PF_RETRY; /* * Load a new root and retry the faulting instruction in the extremely @@ -659,7 +659,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, */ if (unlikely(kvm_mmu_is_dummy_root(vcpu->arch.mmu->root.hpa))) { kvm_make_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu); - goto out_gpte_changed; + return RET_PF_RETRY; } for_each_shadow_entry(vcpu, fault->addr, it) { @@ -699,7 +699,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, * protected is still there. */ if (FNAME(gpte_changed)(vcpu, gw, it.level - 1)) - goto out_gpte_changed; + return RET_PF_RETRY; if (sp != ERR_PTR(-EEXIST)) link_shadow_page(vcpu, it.sptep, sp); @@ -753,9 +753,6 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, FNAME(pte_prefetch)(vcpu, gw, it.sptep); return ret; - -out_gpte_changed: - return RET_PF_RETRY; } /* From 1dc9cc1c4c230fbc6bf6322f150d4b81f712cfb9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 2 Aug 2024 13:39:00 -0700 Subject: [PATCH 04/33] KVM: x86/mmu: Reword a misleading comment about checking gpte_changed() Rewrite the comment in FNAME(fetch) to explain why KVM needs to check that the gPTE is still fresh before continuing the shadow page walk, even if KVM already has a linked shadow page for the gPTE in question. No functional change intended. Link: https://lore.kernel.org/r/20240802203900.348808-4-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/paging_tmpl.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 480c54122991..405bd7ceee2a 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -695,8 +695,14 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, return RET_PF_RETRY; /* - * Verify that the gpte in the page we've just write - * protected is still there. + * Verify that the gpte in the page, which is now either + * write-protected or unsync, wasn't modified between the fault + * and acquiring mmu_lock. This needs to be done even when + * reusing an existing shadow page to ensure the information + * gathered by the walker matches the information stored in the + * shadow page (which could have been modified by a different + * vCPU even if the page was already linked). Holding mmu_lock + * prevents the shadow page from changing after this point. */ if (FNAME(gpte_changed)(vcpu, gw, it.level - 1)) return RET_PF_RETRY; From 4ececec19a0914873634ad69bbaca5557c33e855 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 30 Aug 2024 17:15:17 -0700 Subject: [PATCH 05/33] KVM: x86/mmu: Replace PFERR_NESTED_GUEST_PAGE with a more descriptive helper Drop the globally visible PFERR_NESTED_GUEST_PAGE and replace it with a more appropriately named is_write_to_guest_page_table(). The macro name is misleading, because while all nNPT walks match PAGE|WRITE|PRESENT, the reverse is not true. No functional change intended. Link: https://lore.kernel.org/r/20240831001538.336683-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 4 ---- arch/x86/kvm/mmu/mmu.c | 9 ++++++++- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 4a68cb3eba78..797433994d56 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -280,10 +280,6 @@ enum x86_intercept_stage; #define PFERR_PRIVATE_ACCESS BIT_ULL(49) #define PFERR_SYNTHETIC_MASK (PFERR_IMPLICIT_ACCESS | PFERR_PRIVATE_ACCESS) -#define PFERR_NESTED_GUEST_PAGE (PFERR_GUEST_PAGE_MASK | \ - PFERR_WRITE_MASK | \ - PFERR_PRESENT_MASK) - /* apic attention bits */ #define KVM_APIC_CHECK_VAPIC 0 /* diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 5226bb055d99..1eedd1932e5f 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5945,6 +5945,13 @@ void kvm_mmu_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, write_unlock(&vcpu->kvm->mmu_lock); } +static bool is_write_to_guest_page_table(u64 error_code) +{ + const u64 mask = PFERR_GUEST_PAGE_MASK | PFERR_WRITE_MASK | PFERR_PRESENT_MASK; + + return (error_code & mask) == mask; +} + int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, void *insn, int insn_len) { @@ -6008,7 +6015,7 @@ int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err * and resume the guest. */ if (vcpu->arch.mmu->root_role.direct && - (error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) { + is_write_to_guest_page_table(error_code)) { kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa)); return 1; } From 989a84c93f592e6b288fb3b96d2eeec827d75bef Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 30 Aug 2024 17:15:18 -0700 Subject: [PATCH 06/33] KVM: x86/mmu: Trigger unprotect logic only on write-protection page faults Trigger KVM's various "unprotect gfn" paths if and only if the page fault was a write to a write-protected gfn. To do so, add a new page fault return code, RET_PF_WRITE_PROTECTED, to explicitly and precisely track such page faults. If a page fault requires emulation for any MMIO (or any reason besides write-protection), trying to unprotect the gfn is pointless and risks putting the vCPU into an infinite loop. E.g. KVM will put the vCPU into an infinite loop if the vCPU manages to trigger MMIO on a page table walk. Fixes: 147277540bbc ("kvm: svm: Add support for additional SVM NPF error codes") Reviewed-by: Yuan Yao Link: https://lore.kernel.org/r/20240831001538.336683-4-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 75 +++++++++++++++++++-------------- arch/x86/kvm/mmu/mmu_internal.h | 3 ++ arch/x86/kvm/mmu/mmutrace.h | 1 + arch/x86/kvm/mmu/paging_tmpl.h | 2 +- arch/x86/kvm/mmu/tdp_mmu.c | 6 +-- 5 files changed, 50 insertions(+), 37 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 1eedd1932e5f..d8deed29091b 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2896,10 +2896,8 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, trace_kvm_mmu_set_spte(level, gfn, sptep); } - if (wrprot) { - if (write_fault) - ret = RET_PF_EMULATE; - } + if (wrprot && write_fault) + ret = RET_PF_WRITE_PROTECTED; if (flush) kvm_flush_remote_tlbs_gfn(vcpu->kvm, gfn, level); @@ -4531,7 +4529,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault return RET_PF_RETRY; if (page_fault_handle_page_track(vcpu, fault)) - return RET_PF_EMULATE; + return RET_PF_WRITE_PROTECTED; r = fast_page_fault(vcpu, fault); if (r != RET_PF_INVALID) @@ -4624,7 +4622,7 @@ static int kvm_tdp_mmu_page_fault(struct kvm_vcpu *vcpu, int r; if (page_fault_handle_page_track(vcpu, fault)) - return RET_PF_EMULATE; + return RET_PF_WRITE_PROTECTED; r = fast_page_fault(vcpu, fault); if (r != RET_PF_INVALID) @@ -4703,6 +4701,7 @@ static int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, switch (r) { case RET_PF_FIXED: case RET_PF_SPURIOUS: + case RET_PF_WRITE_PROTECTED: return 0; case RET_PF_EMULATE: @@ -5952,6 +5951,40 @@ static bool is_write_to_guest_page_table(u64 error_code) return (error_code & mask) == mask; } +static int kvm_mmu_write_protect_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, + u64 error_code, int *emulation_type) +{ + bool direct = vcpu->arch.mmu->root_role.direct; + + /* + * Before emulating the instruction, check if the error code + * was due to a RO violation while translating the guest page. + * This can occur when using nested virtualization with nested + * paging in both guests. If true, we simply unprotect the page + * and resume the guest. + */ + if (direct && is_write_to_guest_page_table(error_code)) { + kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa)); + return RET_PF_RETRY; + } + + /* + * The gfn is write-protected, but if emulation fails we can still + * optimistically try to just unprotect the page and let the processor + * re-execute the instruction that caused the page fault. Do not allow + * retrying MMIO emulation, as it's not only pointless but could also + * cause us to enter an infinite loop because the processor will keep + * faulting on the non-existent MMIO address. Retrying an instruction + * from a nested guest is also pointless and dangerous as we are only + * explicitly shadowing L1's page tables, i.e. unprotecting something + * for L1 isn't going to magically fix whatever issue cause L2 to fail. + */ + if (!mmio_info_in_cache(vcpu, cr2_or_gpa, direct) && !is_guest_mode(vcpu)) + *emulation_type |= EMULTYPE_ALLOW_RETRY_PF; + + return RET_PF_EMULATE; +} + int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, void *insn, int insn_len) { @@ -5997,6 +6030,10 @@ int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err if (r < 0) return r; + if (r == RET_PF_WRITE_PROTECTED) + r = kvm_mmu_write_protect_fault(vcpu, cr2_or_gpa, error_code, + &emulation_type); + if (r == RET_PF_FIXED) vcpu->stat.pf_fixed++; else if (r == RET_PF_EMULATE) @@ -6007,32 +6044,6 @@ int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err if (r != RET_PF_EMULATE) return 1; - /* - * Before emulating the instruction, check if the error code - * was due to a RO violation while translating the guest page. - * This can occur when using nested virtualization with nested - * paging in both guests. If true, we simply unprotect the page - * and resume the guest. - */ - if (vcpu->arch.mmu->root_role.direct && - is_write_to_guest_page_table(error_code)) { - kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa)); - return 1; - } - - /* - * vcpu->arch.mmu.page_fault returned RET_PF_EMULATE, but we can still - * optimistically try to just unprotect the page and let the processor - * re-execute the instruction that caused the page fault. Do not allow - * retrying MMIO emulation, as it's not only pointless but could also - * cause us to enter an infinite loop because the processor will keep - * faulting on the non-existent MMIO address. Retrying an instruction - * from a nested guest is also pointless and dangerous as we are only - * explicitly shadowing L1's page tables, i.e. unprotecting something - * for L1 isn't going to magically fix whatever issue cause L2 to fail. - */ - if (!mmio_info_in_cache(vcpu, cr2_or_gpa, direct) && !is_guest_mode(vcpu)) - emulation_type |= EMULTYPE_ALLOW_RETRY_PF; emulate: return x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn, insn_len); diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 1721d97743e9..50d2624111f8 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -258,6 +258,8 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault); * RET_PF_CONTINUE: So far, so good, keep handling the page fault. * RET_PF_RETRY: let CPU fault again on the address. * RET_PF_EMULATE: mmio page fault, emulate the instruction directly. + * RET_PF_WRITE_PROTECTED: the gfn is write-protected, either unprotected the + * gfn and retry, or emulate the instruction directly. * RET_PF_INVALID: the spte is invalid, let the real page fault path update it. * RET_PF_FIXED: The faulting entry has been fixed. * RET_PF_SPURIOUS: The faulting entry was already fixed, e.g. by another vCPU. @@ -274,6 +276,7 @@ enum { RET_PF_CONTINUE = 0, RET_PF_RETRY, RET_PF_EMULATE, + RET_PF_WRITE_PROTECTED, RET_PF_INVALID, RET_PF_FIXED, RET_PF_SPURIOUS, diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h index 195d98bc8de8..f35a830ce469 100644 --- a/arch/x86/kvm/mmu/mmutrace.h +++ b/arch/x86/kvm/mmu/mmutrace.h @@ -57,6 +57,7 @@ TRACE_DEFINE_ENUM(RET_PF_CONTINUE); TRACE_DEFINE_ENUM(RET_PF_RETRY); TRACE_DEFINE_ENUM(RET_PF_EMULATE); +TRACE_DEFINE_ENUM(RET_PF_WRITE_PROTECTED); TRACE_DEFINE_ENUM(RET_PF_INVALID); TRACE_DEFINE_ENUM(RET_PF_FIXED); TRACE_DEFINE_ENUM(RET_PF_SPURIOUS); diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 405bd7ceee2a..ae7d39ff2d07 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -806,7 +806,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault if (page_fault_handle_page_track(vcpu, fault)) { shadow_page_table_clear_flood(vcpu, fault->addr); - return RET_PF_EMULATE; + return RET_PF_WRITE_PROTECTED; } r = mmu_topup_memory_caches(vcpu, true); diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index c7dc49ee7388..8bf44ac9372f 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1046,10 +1046,8 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, * protected, emulation is needed. If the emulation was skipped, * the vCPU would have the same fault again. */ - if (wrprot) { - if (fault->write) - ret = RET_PF_EMULATE; - } + if (wrprot && fault->write) + ret = RET_PF_WRITE_PROTECTED; /* If a MMIO SPTE is installed, the MMIO will need to be emulated. */ if (unlikely(is_mmio_spte(vcpu->kvm, new_spte))) { From 2fb2b7877b3a4cac4de070ef92437b38f13559b0 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 30 Aug 2024 17:15:19 -0700 Subject: [PATCH 07/33] KVM: x86/mmu: Skip emulation on page fault iff 1+ SPs were unprotected When doing "fast unprotection" of nested TDP page tables, skip emulation if and only if at least one gfn was unprotected, i.e. continue with emulation if simply resuming is likely to hit the same fault and risk putting the vCPU into an infinite loop. Note, it's entirely possible to get a false negative, e.g. if a different vCPU faults on the same gfn and unprotects the gfn first, but that's a relatively rare edge case, and emulating is still functionally ok, i.e. saving a few cycles by avoiding emulation isn't worth the risk of putting the vCPU into an infinite loop. Opportunistically rewrite the relevant comment to document in gory detail exactly what scenario the "fast unprotect" logic is handling. Fixes: 147277540bbc ("kvm: svm: Add support for additional SVM NPF error codes") Cc: Yuan Yao Reviewed-by: Yuan Yao Link: https://lore.kernel.org/r/20240831001538.336683-5-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 37 +++++++++++++++++++++++++++++-------- 1 file changed, 29 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index d8deed29091b..edca32a4e874 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5957,16 +5957,37 @@ static int kvm_mmu_write_protect_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, bool direct = vcpu->arch.mmu->root_role.direct; /* - * Before emulating the instruction, check if the error code - * was due to a RO violation while translating the guest page. - * This can occur when using nested virtualization with nested - * paging in both guests. If true, we simply unprotect the page - * and resume the guest. + * Before emulating the instruction, check to see if the access was due + * to a read-only violation while the CPU was walking non-nested NPT + * page tables, i.e. for a direct MMU, for _guest_ page tables in L1. + * If L1 is sharing (a subset of) its page tables with L2, e.g. by + * having nCR3 share lower level page tables with hCR3, then when KVM + * (L0) write-protects the nested NPTs, i.e. npt12 entries, KVM is also + * unknowingly write-protecting L1's guest page tables, which KVM isn't + * shadowing. + * + * Because the CPU (by default) walks NPT page tables using a write + * access (to ensure the CPU can do A/D updates), page walks in L1 can + * trigger write faults for the above case even when L1 isn't modifying + * PTEs. As a result, KVM will unnecessarily emulate (or at least, try + * to emulate) an excessive number of L1 instructions; because L1's MMU + * isn't shadowed by KVM, there is no need to write-protect L1's gPTEs + * and thus no need to emulate in order to guarantee forward progress. + * + * Try to unprotect the gfn, i.e. zap any shadow pages, so that L1 can + * proceed without triggering emulation. If one or more shadow pages + * was zapped, skip emulation and resume L1 to let it natively execute + * the instruction. If no shadow pages were zapped, then the write- + * fault is due to something else entirely, i.e. KVM needs to emulate, + * as resuming the guest will put it into an infinite loop. + * + * Note, this code also applies to Intel CPUs, even though it is *very* + * unlikely that an L1 will share its page tables (IA32/PAE/paging64 + * format) with L2's page tables (EPT format). */ - if (direct && is_write_to_guest_page_table(error_code)) { - kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa)); + if (direct && is_write_to_guest_page_table(error_code) && + kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa))) return RET_PF_RETRY; - } /* * The gfn is write-protected, but if emulation fails we can still From c1edcc41c3603c65f34000ae031a20971f4e56f9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 30 Aug 2024 17:15:20 -0700 Subject: [PATCH 08/33] KVM: x86: Retry to-be-emulated insn in "slow" unprotect path iff sp is zapped Resume the guest and thus skip emulation of a non-PTE-writing instruction if and only if unprotecting the gfn actually zapped at least one shadow page. If the gfn is write-protected for some reason other than shadow paging, attempting to unprotect the gfn will effectively fail, and thus retrying the instruction is all but guaranteed to be pointless. This bug has existed for a long time, but was effectively fudged around by the retry RIP+address anti-loop detection. Reviewed-by: Yuan Yao Link: https://lore.kernel.org/r/20240831001538.336683-6-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 70219e406987..5d4bcb999df2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8965,14 +8965,14 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt, if (ctxt->eip == last_retry_eip && last_retry_addr == cr2_or_gpa) return false; - vcpu->arch.last_retry_eip = ctxt->eip; - vcpu->arch.last_retry_addr = cr2_or_gpa; - if (!vcpu->arch.mmu->root_role.direct) gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL); - kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa)); + if (!kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa))) + return false; + vcpu->arch.last_retry_eip = ctxt->eip; + vcpu->arch.last_retry_addr = cr2_or_gpa; return true; } From 019f3f84a40c88b68ca4d455306b92c20733e784 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 30 Aug 2024 17:15:21 -0700 Subject: [PATCH 09/33] KVM: x86: Get RIP from vCPU state when storing it to last_retry_eip Read RIP from vCPU state instead of pulling it from the emulation context when filling last_retry_eip, which is part of the anti-infinite-loop protection used when unprotecting and retrying instructions that hit a write-protected gfn. This will allow reusing the anti-infinite-loop protection in flows that never make it into the emulator. No functional change intended, as ctxt->eip is set to kvm_rip_read() in init_emulate_ctxt(), and EMULTYPE_PF emulation is mutually exclusive with EMULTYPE_NO_DECODE and EMULTYPE_SKIP, i.e. always goes through x86_decode_emulated_instruction() and hasn't advanced ctxt->eip (yet). Reviewed-by: Yuan Yao Link: https://lore.kernel.org/r/20240831001538.336683-7-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5d4bcb999df2..760d455f77ac 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8971,7 +8971,7 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt, if (!kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa))) return false; - vcpu->arch.last_retry_eip = ctxt->eip; + vcpu->arch.last_retry_eip = kvm_rip_read(vcpu); vcpu->arch.last_retry_addr = cr2_or_gpa; return true; } From 9c19129e535bfff85bdfcb5a804e19e5aae935b2 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 30 Aug 2024 17:15:22 -0700 Subject: [PATCH 10/33] KVM: x86: Store gpa as gpa_t, not unsigned long, when unprotecting for retry Store the gpa used to unprotect the faulting gfn for retry as a gpa_t, not an unsigned long. This fixes a bug where 32-bit KVM would unprotect and retry the wrong gfn if the gpa had bits 63:32!=0. In practice, this bug is functionally benign, as unprotecting the wrong gfn is purely a performance issue (thanks to the anti-infinite-loop logic). And of course, almost no one runs 32-bit KVM these days. Reviewed-by: Yuan Yao Link: https://lore.kernel.org/r/20240831001538.336683-8-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 760d455f77ac..d26e107225f7 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8932,7 +8932,8 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt, gpa_t cr2_or_gpa, int emulation_type) { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); - unsigned long last_retry_eip, last_retry_addr, gpa = cr2_or_gpa; + unsigned long last_retry_eip, last_retry_addr; + gpa_t gpa = cr2_or_gpa; last_retry_eip = vcpu->arch.last_retry_eip; last_retry_addr = vcpu->arch.last_retry_addr; From 01dd4d319207c4cfd51a1c9a1812909e944d8c86 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 30 Aug 2024 17:15:23 -0700 Subject: [PATCH 11/33] KVM: x86/mmu: Apply retry protection to "fast nTDP unprotect" path Move the anti-infinite-loop protection provided by last_retry_{eip,addr} into kvm_mmu_write_protect_fault() so that it guards unprotect+retry that never hits the emulator, as well as reexecute_instruction(), which is the last ditch "might as well try it" logic that kicks in when emulation fails on an instruction that faulted on a write-protected gfn. Add a new helper, kvm_mmu_unprotect_gfn_and_retry(), to set the retry fields and deduplicate other code (with more to come). Link: https://lore.kernel.org/r/20240831001538.336683-9-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/mmu/mmu.c | 39 ++++++++++++++++++++++++++++++++- arch/x86/kvm/x86.c | 27 +---------------------- 3 files changed, 40 insertions(+), 27 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 797433994d56..fd115feb49b3 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -2133,6 +2133,7 @@ int kvm_get_nr_pending_nmis(struct kvm_vcpu *vcpu); void kvm_update_dr7(struct kvm_vcpu *vcpu); int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn); +bool kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa); void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu, ulong roots_to_free); void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index edca32a4e874..e992da21f4e5 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2713,6 +2713,22 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) return r; } +bool kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa) +{ + gpa_t gpa = cr2_or_gpa; + bool r; + + if (!vcpu->arch.mmu->root_role.direct) + gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL); + + r = kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa)); + if (r) { + vcpu->arch.last_retry_eip = kvm_rip_read(vcpu); + vcpu->arch.last_retry_addr = cr2_or_gpa; + } + return r; +} + static int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) { gpa_t gpa; @@ -5956,6 +5972,27 @@ static int kvm_mmu_write_protect_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, { bool direct = vcpu->arch.mmu->root_role.direct; + /* + * Do not try to unprotect and retry if the vCPU re-faulted on the same + * RIP with the same address that was previously unprotected, as doing + * so will likely put the vCPU into an infinite. E.g. if the vCPU uses + * a non-page-table modifying instruction on the PDE that points to the + * instruction, then unprotecting the gfn will unmap the instruction's + * code, i.e. make it impossible for the instruction to ever complete. + */ + if (vcpu->arch.last_retry_eip == kvm_rip_read(vcpu) && + vcpu->arch.last_retry_addr == cr2_or_gpa) + return RET_PF_EMULATE; + + /* + * Reset the unprotect+retry values that guard against infinite loops. + * The values will be refreshed if KVM explicitly unprotects a gfn and + * retries, in all other cases it's safe to retry in the future even if + * the next page fault happens on the same RIP+address. + */ + vcpu->arch.last_retry_eip = 0; + vcpu->arch.last_retry_addr = 0; + /* * Before emulating the instruction, check to see if the access was due * to a read-only violation while the CPU was walking non-nested NPT @@ -5986,7 +6023,7 @@ static int kvm_mmu_write_protect_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, * format) with L2's page tables (EPT format). */ if (direct && is_write_to_guest_page_table(error_code) && - kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa))) + kvm_mmu_unprotect_gfn_and_retry(vcpu, cr2_or_gpa)) return RET_PF_RETRY; /* diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d26e107225f7..ce075e07126b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8932,27 +8932,13 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt, gpa_t cr2_or_gpa, int emulation_type) { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); - unsigned long last_retry_eip, last_retry_addr; - gpa_t gpa = cr2_or_gpa; - - last_retry_eip = vcpu->arch.last_retry_eip; - last_retry_addr = vcpu->arch.last_retry_addr; /* * If the emulation is caused by #PF and it is non-page_table * writing instruction, it means the VM-EXIT is caused by shadow * page protected, we can zap the shadow page and retry this * instruction directly. - * - * Note: if the guest uses a non-page-table modifying instruction - * on the PDE that points to the instruction, then we will unmap - * the instruction and go to an infinite loop. So, we cache the - * last retried eip and the last fault address, if we meet the eip - * and the address again, we can break out of the potential infinite - * loop. */ - vcpu->arch.last_retry_eip = vcpu->arch.last_retry_addr = 0; - if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF)) return false; @@ -8963,18 +8949,7 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt, if (x86_page_table_writing_insn(ctxt)) return false; - if (ctxt->eip == last_retry_eip && last_retry_addr == cr2_or_gpa) - return false; - - if (!vcpu->arch.mmu->root_role.direct) - gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL); - - if (!kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa))) - return false; - - vcpu->arch.last_retry_eip = kvm_rip_read(vcpu); - vcpu->arch.last_retry_addr = cr2_or_gpa; - return true; + return kvm_mmu_unprotect_gfn_and_retry(vcpu, cr2_or_gpa); } static int complete_emulated_mmio(struct kvm_vcpu *vcpu); From dfaae8447c53819749cf3ba10ce24d3c609752e3 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 30 Aug 2024 17:15:24 -0700 Subject: [PATCH 12/33] KVM: x86/mmu: Try "unprotect for retry" iff there are indirect SPs Try to unprotect shadow pages if and only if indirect_shadow_pages is non- zero, i.e. iff there is at least one protected such shadow page. Pre- checking indirect_shadow_pages avoids taking mmu_lock for write when the gfn is write-protected by a third party, i.e. not for KVM shadow paging, and in the *extremely* unlikely case that a different task has already unprotected the last shadow page. Link: https://lore.kernel.org/r/20240831001538.336683-10-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index e992da21f4e5..e24de1f3d1db 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2718,6 +2718,17 @@ bool kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa) gpa_t gpa = cr2_or_gpa; bool r; + /* + * Bail early if there aren't any write-protected shadow pages to avoid + * unnecessarily taking mmu_lock lock, e.g. if the gfn is write-tracked + * by a third party. Reading indirect_shadow_pages without holding + * mmu_lock is safe, as this is purely an optimization, i.e. a false + * positive is benign, and a false negative will simply result in KVM + * skipping the unprotect+retry path, which is also an optimization. + */ + if (!READ_ONCE(vcpu->kvm->arch.indirect_shadow_pages)) + return false; + if (!vcpu->arch.mmu->root_role.direct) gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL); From 41e6e367d576ce1801dc5c2b106e14cde35e3c80 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 30 Aug 2024 17:15:25 -0700 Subject: [PATCH 13/33] KVM: x86: Move EMULTYPE_ALLOW_RETRY_PF to x86_emulate_instruction() Move the sanity checks for EMULTYPE_ALLOW_RETRY_PF to the top of x86_emulate_instruction(). In addition to deduplicating a small amount of code, this makes the connection between EMULTYPE_ALLOW_RETRY_PF and EMULTYPE_PF even more explicit, and will allow dropping retry_instruction() entirely. Reviewed-by: Yuan Yao Link: https://lore.kernel.org/r/20240831001538.336683-11-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ce075e07126b..51d22d9de8d7 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8870,10 +8870,6 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF)) return false; - if (WARN_ON_ONCE(is_guest_mode(vcpu)) || - WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF))) - return false; - if (!vcpu->arch.mmu->root_role.direct) { /* * Write permission should be allowed since only @@ -8942,10 +8938,6 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt, if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF)) return false; - if (WARN_ON_ONCE(is_guest_mode(vcpu)) || - WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF))) - return false; - if (x86_page_table_writing_insn(ctxt)) return false; @@ -9148,6 +9140,11 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; bool writeback = true; + if ((emulation_type & EMULTYPE_ALLOW_RETRY_PF) && + (WARN_ON_ONCE(is_guest_mode(vcpu)) || + WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF)))) + emulation_type &= ~EMULTYPE_ALLOW_RETRY_PF; + r = kvm_check_emulate_insn(vcpu, emulation_type, insn, insn_len); if (r != X86EMUL_CONTINUE) { if (r == X86EMUL_RETRY_INSTR || r == X86EMUL_PROPAGATE_FAULT) From 2df354e37c1398a85bb43cbbf1f913eb3f91d035 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 30 Aug 2024 17:15:26 -0700 Subject: [PATCH 14/33] KVM: x86: Fold retry_instruction() into x86_emulate_instruction() Now that retry_instruction() is reasonably tiny, fold it into its sole caller, x86_emulate_instruction(). In addition to getting rid of the absurdly confusing retry_instruction() name, handling the retry in x86_emulate_instruction() pairs it back up with the code that resets last_retry_{eip,address}. No functional change intended. Reviewed-by: Yuan Yao Link: https://lore.kernel.org/r/20240831001538.336683-12-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 30 +++++++++--------------------- 1 file changed, 9 insertions(+), 21 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 51d22d9de8d7..a7961f8a6429 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8924,26 +8924,6 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, return !(emulation_type & EMULTYPE_WRITE_PF_TO_SP); } -static bool retry_instruction(struct x86_emulate_ctxt *ctxt, - gpa_t cr2_or_gpa, int emulation_type) -{ - struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); - - /* - * If the emulation is caused by #PF and it is non-page_table - * writing instruction, it means the VM-EXIT is caused by shadow - * page protected, we can zap the shadow page and retry this - * instruction directly. - */ - if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF)) - return false; - - if (x86_page_table_writing_insn(ctxt)) - return false; - - return kvm_mmu_unprotect_gfn_and_retry(vcpu, cr2_or_gpa); -} - static int complete_emulated_mmio(struct kvm_vcpu *vcpu); static int complete_emulated_pio(struct kvm_vcpu *vcpu); @@ -9223,7 +9203,15 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, return 1; } - if (retry_instruction(ctxt, cr2_or_gpa, emulation_type)) + /* + * If emulation was caused by a write-protection #PF on a non-page_table + * writing instruction, try to unprotect the gfn, i.e. zap shadow pages, + * and retry the instruction, as the vCPU is likely no longer using the + * gfn as a page table. + */ + if ((emulation_type & EMULTYPE_ALLOW_RETRY_PF) && + !x86_page_table_writing_insn(ctxt) && + kvm_mmu_unprotect_gfn_and_retry(vcpu, cr2_or_gpa)) return 1; /* this is needed for vmware backdoor interface to work since it From b7e948898e772ac900950c0dac4ca90e905cd0c0 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 30 Aug 2024 17:15:27 -0700 Subject: [PATCH 15/33] KVM: x86/mmu: Don't try to unprotect an INVALID_GPA If getting the gpa for a gva fails, e.g. because the gva isn't mapped in the guest page tables, don't try to unprotect the invalid gfn. This is mostly a performance fix (avoids unnecessarily taking mmu_lock), as for_each_gfn_valid_sp_with_gptes() won't explode on garbage input, it's simply pointless. Link: https://lore.kernel.org/r/20240831001538.336683-13-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index e24de1f3d1db..bafec04b07ea 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2729,8 +2729,11 @@ bool kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa) if (!READ_ONCE(vcpu->kvm->arch.indirect_shadow_pages)) return false; - if (!vcpu->arch.mmu->root_role.direct) + if (!vcpu->arch.mmu->root_role.direct) { gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL); + if (gpa == INVALID_GPA) + return false; + } r = kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa)); if (r) { @@ -2749,6 +2752,8 @@ static int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) return 0; gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL); + if (gpa == INVALID_GPA) + return 0; r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); From 29e495bdf847ac6ad0e0d03e5db39a3ed9f12858 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 30 Aug 2024 17:15:28 -0700 Subject: [PATCH 16/33] KVM: x86/mmu: Always walk guest PTEs with WRITE access when unprotecting When getting a gpa from a gva to unprotect the associated gfn when an event is awating reinjection, walk the guest PTEs for WRITE as there's no point in unprotecting the gfn if the guest is unable to write the page, i.e. if write-protection can't trigger emulation. Note, the entire flow should be guarded on the access being a write, and even better should be conditioned on actually triggering a write-protect fault. This will be addressed in a future commit. Reviewed-by: Yuan Yao Link: https://lore.kernel.org/r/20240831001538.336683-14-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index bafec04b07ea..937fa9a82a43 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2751,7 +2751,7 @@ static int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) if (vcpu->arch.mmu->root_role.direct) return 0; - gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL); + gpa = kvm_mmu_gva_to_gpa_write(vcpu, gva, NULL); if (gpa == INVALID_GPA) return 0; From b299c273c06f005976cdc1b9e9299d492527607e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 30 Aug 2024 17:15:29 -0700 Subject: [PATCH 17/33] KVM: x86/mmu: Move event re-injection unprotect+retry into common path Move the event re-injection unprotect+retry logic into kvm_mmu_write_protect_fault(), i.e. unprotect and retry if and only if the #PF actually hit a write-protected gfn. Note, there is a small possibility that the gfn was unprotected by a different tasking between hitting the #PF and acquiring mmu_lock, but in that case, KVM will resume the guest immediately anyways because KVM will treat the fault as spurious. As a bonus, unprotecting _after_ handling the page fault also addresses the case where the installing a SPTE to handle fault encounters a shadowed PTE, i.e. *creates* a read-only SPTE. Opportunstically add a comment explaining what on earth the intent of the code is, as based on the changelog from commit 577bdc496614 ("KVM: Avoid instruction emulation when event delivery is pending"). Link: https://lore.kernel.org/r/20240831001538.336683-15-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 28 ++++++++-------------------- 1 file changed, 8 insertions(+), 20 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 937fa9a82a43..195ba7430720 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2743,23 +2743,6 @@ bool kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa) return r; } -static int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) -{ - gpa_t gpa; - int r; - - if (vcpu->arch.mmu->root_role.direct) - return 0; - - gpa = kvm_mmu_gva_to_gpa_write(vcpu, gva, NULL); - if (gpa == INVALID_GPA) - return 0; - - r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); - - return r; -} - static void kvm_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) { trace_kvm_mmu_unsync_page(sp); @@ -4630,8 +4613,6 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, if (!flags) { trace_kvm_page_fault(vcpu, fault_address, error_code); - if (kvm_event_needs_reinjection(vcpu)) - kvm_mmu_unprotect_page_virt(vcpu, fault_address); r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn, insn_len); } else if (flags & KVM_PV_REASON_PAGE_NOT_PRESENT) { @@ -6037,8 +6018,15 @@ static int kvm_mmu_write_protect_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, * Note, this code also applies to Intel CPUs, even though it is *very* * unlikely that an L1 will share its page tables (IA32/PAE/paging64 * format) with L2's page tables (EPT format). + * + * For indirect MMUs, i.e. if KVM is shadowing the current MMU, try to + * unprotect the gfn and retry if an event is awaiting reinjection. If + * KVM emulates multiple instructions before completing event injection, + * the event could be delayed beyond what is architecturally allowed, + * e.g. KVM could inject an IRQ after the TPR has been raised. */ - if (direct && is_write_to_guest_page_table(error_code) && + if (((direct && is_write_to_guest_page_table(error_code)) || + (!direct && kvm_event_needs_reinjection(vcpu))) && kvm_mmu_unprotect_gfn_and_retry(vcpu, cr2_or_gpa)) return RET_PF_RETRY; From 620525739521376a65a690df899e1596d56791f8 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 30 Aug 2024 17:15:30 -0700 Subject: [PATCH 18/33] KVM: x86: Remove manual pfn lookup when retrying #PF after failed emulation Drop the manual pfn look when retrying an instruction that KVM failed to emulation in response to a #PF due to a write-protected gfn. Now that KVM sets EMULTYPE_ALLOW_RETRY_PF if and only if the page fault hit a write- protected gfn, i.e. if and only if there's a writable memslot, there's no need to redo the lookup to avoid retrying an instruction that failed on emulated MMIO (no slot, or a write to a read-only slot). I.e. KVM will never attempt to retry an instruction that failed on emulated MMIO, whereas that was not the case prior to the introduction of RET_PF_WRITE_PROTECTED. Reviewed-by: Yuan Yao Link: https://lore.kernel.org/r/20240831001538.336683-16-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a7961f8a6429..1e9c5ef4a9f5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8865,7 +8865,6 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int emulation_type) { gpa_t gpa = cr2_or_gpa; - kvm_pfn_t pfn; if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF)) return false; @@ -8885,23 +8884,6 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, return true; } - /* - * Do not retry the unhandleable instruction if it faults on the - * readonly host memory, otherwise it will goto a infinite loop: - * retry instruction -> write #PF -> emulation fail -> retry - * instruction -> ... - */ - pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa)); - - /* - * If the instruction failed on the error pfn, it can not be fixed, - * report the error to userspace. - */ - if (is_error_noslot_pfn(pfn)) - return false; - - kvm_release_pfn_clean(pfn); - /* * If emulation may have been triggered by a write to a shadowed page * table, unprotect the gfn (zap any relevant SPTEs) and re-enter the From 19ab2c8be070160be70a88027b3b93106fef7b89 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 30 Aug 2024 17:15:31 -0700 Subject: [PATCH 19/33] KVM: x86: Check EMULTYPE_WRITE_PF_TO_SP before unprotecting gfn Don't bother unprotecting the target gfn if EMULTYPE_WRITE_PF_TO_SP is set, as KVM will simply report the emulation failure to userspace. This will allow converting reexecute_instruction() to use kvm_mmu_unprotect_gfn_instead_retry() instead of kvm_mmu_unprotect_page(). Link: https://lore.kernel.org/r/20240831001538.336683-17-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1e9c5ef4a9f5..7170eee23597 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8869,6 +8869,19 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF)) return false; + /* + * If the failed instruction faulted on an access to page tables that + * are used to translate any part of the instruction, KVM can't resolve + * the issue by unprotecting the gfn, as zapping the shadow page will + * result in the instruction taking a !PRESENT page fault and thus put + * the vCPU into an infinite loop of page faults. E.g. KVM will create + * a SPTE and write-protect the gfn to resolve the !PRESENT fault, and + * then zap the SPTE to unprotect the gfn, and then do it all over + * again. Report the error to userspace. + */ + if (emulation_type & EMULTYPE_WRITE_PF_TO_SP) + return false; + if (!vcpu->arch.mmu->root_role.direct) { /* * Write permission should be allowed since only @@ -8894,16 +8907,13 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa)); /* - * If the failed instruction faulted on an access to page tables that - * are used to translate any part of the instruction, KVM can't resolve - * the issue by unprotecting the gfn, as zapping the shadow page will - * result in the instruction taking a !PRESENT page fault and thus put - * the vCPU into an infinite loop of page faults. E.g. KVM will create - * a SPTE and write-protect the gfn to resolve the !PRESENT fault, and - * then zap the SPTE to unprotect the gfn, and then do it all over - * again. Report the error to userspace. + * Retry even if _this_ vCPU didn't unprotect the gfn, as it's possible + * all SPTEs were already zapped by a different task. The alternative + * is to report the error to userspace and likely terminate the guest, + * and the last_retry_{eip,addr} checks will prevent retrying the page + * fault indefinitely, i.e. there's nothing to lose by retrying. */ - return !(emulation_type & EMULTYPE_WRITE_PF_TO_SP); + return true; } static int complete_emulated_mmio(struct kvm_vcpu *vcpu); From dabc4ff70c35756bc107bc5d035d0f0746396a9a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 30 Aug 2024 17:15:32 -0700 Subject: [PATCH 20/33] KVM: x86: Apply retry protection to "unprotect on failure" path Use kvm_mmu_unprotect_gfn_and_retry() in reexecute_instruction() to pick up protection against infinite loops, e.g. if KVM somehow manages to encounter an unsupported instruction and unprotecting the gfn doesn't allow the vCPU to make forward progress. Other than that, the retry-on- failure logic is a functionally equivalent, open coded version of kvm_mmu_unprotect_gfn_and_retry(). Note, the emulation failure path still isn't fully protected, as KVM won't update the retry protection fields if no shadow pages are zapped (but this change is still a step forward). That flaw will be addressed in a future patch. Reviewed-by: Yuan Yao Link: https://lore.kernel.org/r/20240831001538.336683-18-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 20 +------------------- 1 file changed, 1 insertion(+), 19 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7170eee23597..ad942892fa2c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8864,8 +8864,6 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type) static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int emulation_type) { - gpa_t gpa = cr2_or_gpa; - if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF)) return false; @@ -8882,29 +8880,13 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, if (emulation_type & EMULTYPE_WRITE_PF_TO_SP) return false; - if (!vcpu->arch.mmu->root_role.direct) { - /* - * Write permission should be allowed since only - * write access need to be emulated. - */ - gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL); - - /* - * If the mapping is invalid in guest, let cpu retry - * it to generate fault. - */ - if (gpa == INVALID_GPA) - return true; - } - /* * If emulation may have been triggered by a write to a shadowed page * table, unprotect the gfn (zap any relevant SPTEs) and re-enter the * guest to let the CPU re-execute the instruction in the hope that the * CPU can cleanly execute the instruction that KVM failed to emulate. */ - if (vcpu->kvm->arch.indirect_shadow_pages) - kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa)); + kvm_mmu_unprotect_gfn_and_retry(vcpu, cr2_or_gpa); /* * Retry even if _this_ vCPU didn't unprotect the gfn, as it's possible From 4df685664bed04794ad72b58d8af1fa4fcc60261 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 30 Aug 2024 17:15:33 -0700 Subject: [PATCH 21/33] KVM: x86: Update retry protection fields when forcing retry on emulation failure When retrying the faulting instruction after emulation failure, refresh the infinite loop protection fields even if no shadow pages were zapped, i.e. avoid hitting an infinite loop even when retrying the instruction as a last-ditch effort to avoid terminating the guest. Link: https://lore.kernel.org/r/20240831001538.336683-19-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 10 +++++++++- arch/x86/kvm/mmu/mmu.c | 12 +++++++----- arch/x86/kvm/x86.c | 2 +- 3 files changed, 17 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index fd115feb49b3..cdee59f3d15b 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -2133,7 +2133,15 @@ int kvm_get_nr_pending_nmis(struct kvm_vcpu *vcpu); void kvm_update_dr7(struct kvm_vcpu *vcpu); int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn); -bool kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa); +bool __kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, + bool always_retry); + +static inline bool kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu, + gpa_t cr2_or_gpa) +{ + return __kvm_mmu_unprotect_gfn_and_retry(vcpu, cr2_or_gpa, false); +} + void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu, ulong roots_to_free); void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 195ba7430720..4b4edaf7dc06 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2713,10 +2713,11 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) return r; } -bool kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa) +bool __kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, + bool always_retry) { gpa_t gpa = cr2_or_gpa; - bool r; + bool r = false; /* * Bail early if there aren't any write-protected shadow pages to avoid @@ -2727,16 +2728,17 @@ bool kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa) * skipping the unprotect+retry path, which is also an optimization. */ if (!READ_ONCE(vcpu->kvm->arch.indirect_shadow_pages)) - return false; + goto out; if (!vcpu->arch.mmu->root_role.direct) { gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL); if (gpa == INVALID_GPA) - return false; + goto out; } r = kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa)); - if (r) { +out: + if (r || always_retry) { vcpu->arch.last_retry_eip = kvm_rip_read(vcpu); vcpu->arch.last_retry_addr = cr2_or_gpa; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ad942892fa2c..843ddb982b35 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8886,7 +8886,7 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, * guest to let the CPU re-execute the instruction in the hope that the * CPU can cleanly execute the instruction that KVM failed to emulate. */ - kvm_mmu_unprotect_gfn_and_retry(vcpu, cr2_or_gpa); + __kvm_mmu_unprotect_gfn_and_retry(vcpu, cr2_or_gpa, true); /* * Retry even if _this_ vCPU didn't unprotect the gfn, as it's possible From 2876624e1adcd9a3a3ffa8c4fe3bf8dbba969d95 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 30 Aug 2024 17:15:34 -0700 Subject: [PATCH 22/33] KVM: x86: Rename reexecute_instruction()=>kvm_unprotect_and_retry_on_failure() Rename reexecute_instruction() to kvm_unprotect_and_retry_on_failure() to make the intent and purpose of the helper much more obvious. No functional change intended. Reviewed-by: Yuan Yao Link: https://lore.kernel.org/r/20240831001538.336683-20-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 843ddb982b35..cd9725df3680 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8861,8 +8861,9 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type) return 1; } -static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, - int emulation_type) +static bool kvm_unprotect_and_retry_on_failure(struct kvm_vcpu *vcpu, + gpa_t cr2_or_gpa, + int emulation_type) { if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF)) return false; @@ -9129,8 +9130,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, kvm_queue_exception(vcpu, UD_VECTOR); return 1; } - if (reexecute_instruction(vcpu, cr2_or_gpa, - emulation_type)) + if (kvm_unprotect_and_retry_on_failure(vcpu, cr2_or_gpa, + emulation_type)) return 1; if (ctxt->have_exception && @@ -9216,7 +9217,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, return 1; if (r == EMULATION_FAILED) { - if (reexecute_instruction(vcpu, cr2_or_gpa, emulation_type)) + if (kvm_unprotect_and_retry_on_failure(vcpu, cr2_or_gpa, + emulation_type)) return 1; return handle_emulation_failure(vcpu, emulation_type); From 6b3dcabc10911711eba15816d808e2a18f130406 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 30 Aug 2024 17:15:35 -0700 Subject: [PATCH 23/33] KVM: x86/mmu: Subsume kvm_mmu_unprotect_page() into the and_retry() version Fold kvm_mmu_unprotect_page() into kvm_mmu_unprotect_gfn_and_retry() now that all other direct usage is gone. No functional change intended. Link: https://lore.kernel.org/r/20240831001538.336683-21-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 1 - arch/x86/kvm/mmu/mmu.c | 33 +++++++++++++-------------------- 2 files changed, 13 insertions(+), 21 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index cdee59f3d15b..8f4164f58b6c 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -2132,7 +2132,6 @@ int kvm_get_nr_pending_nmis(struct kvm_vcpu *vcpu); void kvm_update_dr7(struct kvm_vcpu *vcpu); -int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn); bool __kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, bool always_retry); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 4b4edaf7dc06..29305403f956 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2695,27 +2695,12 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages) write_unlock(&kvm->mmu_lock); } -int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) -{ - struct kvm_mmu_page *sp; - LIST_HEAD(invalid_list); - int r; - - r = 0; - write_lock(&kvm->mmu_lock); - for_each_gfn_valid_sp_with_gptes(kvm, sp, gfn) { - r = 1; - kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); - } - kvm_mmu_commit_zap_page(kvm, &invalid_list); - write_unlock(&kvm->mmu_lock); - - return r; -} - bool __kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, bool always_retry) { + struct kvm *kvm = vcpu->kvm; + LIST_HEAD(invalid_list); + struct kvm_mmu_page *sp; gpa_t gpa = cr2_or_gpa; bool r = false; @@ -2727,7 +2712,7 @@ bool __kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, * positive is benign, and a false negative will simply result in KVM * skipping the unprotect+retry path, which is also an optimization. */ - if (!READ_ONCE(vcpu->kvm->arch.indirect_shadow_pages)) + if (!READ_ONCE(kvm->arch.indirect_shadow_pages)) goto out; if (!vcpu->arch.mmu->root_role.direct) { @@ -2736,7 +2721,15 @@ bool __kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, goto out; } - r = kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa)); + r = false; + write_lock(&kvm->mmu_lock); + for_each_gfn_valid_sp_with_gptes(kvm, sp, gpa_to_gfn(gpa)) { + r = true; + kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); + } + kvm_mmu_commit_zap_page(kvm, &invalid_list); + write_unlock(&kvm->mmu_lock); + out: if (r || always_retry) { vcpu->arch.last_retry_eip = kvm_rip_read(vcpu); From d859b16161c81ee929b7b02a85227b8e3250bc97 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 30 Aug 2024 17:15:36 -0700 Subject: [PATCH 24/33] KVM: x86/mmu: Detect if unprotect will do anything based on invalid_list Explicitly query the list of to-be-zapped shadow pages when checking to see if unprotecting a gfn for retry has succeeded, i.e. if KVM should retry the faulting instruction. Add a comment to explain why the list needs to be checked before zapping, which is the primary motivation for this change. No functional change intended. Reviewed-by: Yuan Yao Link: https://lore.kernel.org/r/20240831001538.336683-22-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 29305403f956..ebbdc979a069 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2721,12 +2721,15 @@ bool __kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, goto out; } - r = false; write_lock(&kvm->mmu_lock); - for_each_gfn_valid_sp_with_gptes(kvm, sp, gpa_to_gfn(gpa)) { - r = true; + for_each_gfn_valid_sp_with_gptes(kvm, sp, gpa_to_gfn(gpa)) kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); - } + + /* + * Snapshot the result before zapping, as zapping will remove all list + * entries, i.e. checking the list later would yield a false negative. + */ + r = !list_empty(&invalid_list); kvm_mmu_commit_zap_page(kvm, &invalid_list); write_unlock(&kvm->mmu_lock); From 98a69b96caca3e07aff57ca91fd7cc3a3853871a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 30 Aug 2024 17:15:37 -0700 Subject: [PATCH 25/33] KVM: x86/mmu: WARN on MMIO cache hit when emulating write-protected gfn WARN if KVM gets an MMIO cache hit on a RET_PF_WRITE_PROTECTED fault, as KVM should return RET_PF_WRITE_PROTECTED if and only if there is a memslot, and creating a memslot is supposed to invalidate the MMIO cache by virtue of changing the memslot generation. Keep the code around mainly to provide a convenient location to document why emulated MMIO should be impossible. Suggested-by: Yuan Yao Link: https://lore.kernel.org/r/20240831001538.336683-23-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index ebbdc979a069..330b87a1c80a 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5988,6 +5988,18 @@ static int kvm_mmu_write_protect_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, vcpu->arch.last_retry_eip = 0; vcpu->arch.last_retry_addr = 0; + /* + * It should be impossible to reach this point with an MMIO cache hit, + * as RET_PF_WRITE_PROTECTED is returned if and only if there's a valid, + * writable memslot, and creating a memslot should invalidate the MMIO + * cache by way of changing the memslot generation. WARN and disallow + * retry if MMIO is detected, as retrying MMIO emulation is pointless + * and could put the vCPU into an infinite loop because the processor + * will keep faulting on the non-existent MMIO address. + */ + if (WARN_ON_ONCE(mmio_info_in_cache(vcpu, cr2_or_gpa, direct))) + return RET_PF_EMULATE; + /* * Before emulating the instruction, check to see if the access was due * to a read-only violation while the CPU was walking non-nested NPT @@ -6029,17 +6041,15 @@ static int kvm_mmu_write_protect_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, return RET_PF_RETRY; /* - * The gfn is write-protected, but if emulation fails we can still - * optimistically try to just unprotect the page and let the processor + * The gfn is write-protected, but if KVM detects its emulating an + * instruction that is unlikely to be used to modify page tables, or if + * emulation fails, KVM can try to unprotect the gfn and let the CPU * re-execute the instruction that caused the page fault. Do not allow - * retrying MMIO emulation, as it's not only pointless but could also - * cause us to enter an infinite loop because the processor will keep - * faulting on the non-existent MMIO address. Retrying an instruction - * from a nested guest is also pointless and dangerous as we are only - * explicitly shadowing L1's page tables, i.e. unprotecting something - * for L1 isn't going to magically fix whatever issue cause L2 to fail. + * retrying an instruction from a nested guest as KVM is only explicitly + * shadowing L1's page tables, i.e. unprotecting something for L1 isn't + * going to magically fix whatever issue caused L2 to fail. */ - if (!mmio_info_in_cache(vcpu, cr2_or_gpa, direct) && !is_guest_mode(vcpu)) + if (!is_guest_mode(vcpu)) *emulation_type |= EMULTYPE_ALLOW_RETRY_PF; return RET_PF_EMULATE; From 0a37fffda14538036f5402a29920285284fe5d5b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 9 Aug 2024 12:43:22 -0700 Subject: [PATCH 26/33] KVM: x86/mmu: Move walk_slot_rmaps() up near for_each_slot_rmap_range() Move walk_slot_rmaps() and friends up near for_each_slot_rmap_range() so that the walkers can be used to handle mmu_notifier invalidations, and so that similar function has some amount of locality in code. No functional change intended. Link: https://lore.kernel.org/r/20240809194335.1726916-11-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 106 ++++++++++++++++++++--------------------- 1 file changed, 53 insertions(+), 53 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 330b87a1c80a..17edf1499be7 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1516,6 +1516,59 @@ static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator) slot_rmap_walk_okay(_iter_); \ slot_rmap_walk_next(_iter_)) +/* The return value indicates if tlb flush on all vcpus is needed. */ +typedef bool (*slot_rmaps_handler) (struct kvm *kvm, + struct kvm_rmap_head *rmap_head, + const struct kvm_memory_slot *slot); + +static __always_inline bool __walk_slot_rmaps(struct kvm *kvm, + const struct kvm_memory_slot *slot, + slot_rmaps_handler fn, + int start_level, int end_level, + gfn_t start_gfn, gfn_t end_gfn, + bool flush_on_yield, bool flush) +{ + struct slot_rmap_walk_iterator iterator; + + lockdep_assert_held_write(&kvm->mmu_lock); + + for_each_slot_rmap_range(slot, start_level, end_level, start_gfn, + end_gfn, &iterator) { + if (iterator.rmap) + flush |= fn(kvm, iterator.rmap, slot); + + if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { + if (flush && flush_on_yield) { + kvm_flush_remote_tlbs_range(kvm, start_gfn, + iterator.gfn - start_gfn + 1); + flush = false; + } + cond_resched_rwlock_write(&kvm->mmu_lock); + } + } + + return flush; +} + +static __always_inline bool walk_slot_rmaps(struct kvm *kvm, + const struct kvm_memory_slot *slot, + slot_rmaps_handler fn, + int start_level, int end_level, + bool flush_on_yield) +{ + return __walk_slot_rmaps(kvm, slot, fn, start_level, end_level, + slot->base_gfn, slot->base_gfn + slot->npages - 1, + flush_on_yield, false); +} + +static __always_inline bool walk_slot_rmaps_4k(struct kvm *kvm, + const struct kvm_memory_slot *slot, + slot_rmaps_handler fn, + bool flush_on_yield) +{ + return walk_slot_rmaps(kvm, slot, fn, PG_LEVEL_4K, PG_LEVEL_4K, flush_on_yield); +} + typedef bool (*rmap_handler_t)(struct kvm *kvm, struct kvm_rmap_head *rmap_head, struct kvm_memory_slot *slot, gfn_t gfn, int level); @@ -6272,59 +6325,6 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level, } EXPORT_SYMBOL_GPL(kvm_configure_mmu); -/* The return value indicates if tlb flush on all vcpus is needed. */ -typedef bool (*slot_rmaps_handler) (struct kvm *kvm, - struct kvm_rmap_head *rmap_head, - const struct kvm_memory_slot *slot); - -static __always_inline bool __walk_slot_rmaps(struct kvm *kvm, - const struct kvm_memory_slot *slot, - slot_rmaps_handler fn, - int start_level, int end_level, - gfn_t start_gfn, gfn_t end_gfn, - bool flush_on_yield, bool flush) -{ - struct slot_rmap_walk_iterator iterator; - - lockdep_assert_held_write(&kvm->mmu_lock); - - for_each_slot_rmap_range(slot, start_level, end_level, start_gfn, - end_gfn, &iterator) { - if (iterator.rmap) - flush |= fn(kvm, iterator.rmap, slot); - - if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { - if (flush && flush_on_yield) { - kvm_flush_remote_tlbs_range(kvm, start_gfn, - iterator.gfn - start_gfn + 1); - flush = false; - } - cond_resched_rwlock_write(&kvm->mmu_lock); - } - } - - return flush; -} - -static __always_inline bool walk_slot_rmaps(struct kvm *kvm, - const struct kvm_memory_slot *slot, - slot_rmaps_handler fn, - int start_level, int end_level, - bool flush_on_yield) -{ - return __walk_slot_rmaps(kvm, slot, fn, start_level, end_level, - slot->base_gfn, slot->base_gfn + slot->npages - 1, - flush_on_yield, false); -} - -static __always_inline bool walk_slot_rmaps_4k(struct kvm *kvm, - const struct kvm_memory_slot *slot, - slot_rmaps_handler fn, - bool flush_on_yield) -{ - return walk_slot_rmaps(kvm, slot, fn, PG_LEVEL_4K, PG_LEVEL_4K, flush_on_yield); -} - static void free_mmu_pages(struct kvm_mmu *mmu) { if (!tdp_enabled && mmu->pae_root) From 5b1fb116e1a636701627a6eb202d17be93e8f7a8 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 9 Aug 2024 12:43:23 -0700 Subject: [PATCH 27/33] KVM: x86/mmu: Plumb a @can_yield parameter into __walk_slot_rmaps() Add a @can_yield param to __walk_slot_rmaps() to control whether or not dropping mmu_lock and conditionally rescheduling is allowed. This will allow using __walk_slot_rmaps() and thus cond_resched() to handle mmu_notifier invalidations, which usually allow blocking/yielding, but not when invoked by the OOM killer. Link: https://lore.kernel.org/r/20240809194335.1726916-12-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 17edf1499be7..e3adc934559d 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1526,7 +1526,8 @@ static __always_inline bool __walk_slot_rmaps(struct kvm *kvm, slot_rmaps_handler fn, int start_level, int end_level, gfn_t start_gfn, gfn_t end_gfn, - bool flush_on_yield, bool flush) + bool can_yield, bool flush_on_yield, + bool flush) { struct slot_rmap_walk_iterator iterator; @@ -1537,6 +1538,9 @@ static __always_inline bool __walk_slot_rmaps(struct kvm *kvm, if (iterator.rmap) flush |= fn(kvm, iterator.rmap, slot); + if (!can_yield) + continue; + if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { if (flush && flush_on_yield) { kvm_flush_remote_tlbs_range(kvm, start_gfn, @@ -1558,7 +1562,7 @@ static __always_inline bool walk_slot_rmaps(struct kvm *kvm, { return __walk_slot_rmaps(kvm, slot, fn, start_level, end_level, slot->base_gfn, slot->base_gfn + slot->npages - 1, - flush_on_yield, false); + true, flush_on_yield, false); } static __always_inline bool walk_slot_rmaps_4k(struct kvm *kvm, @@ -6600,7 +6604,7 @@ static bool kvm_rmap_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_e flush = __walk_slot_rmaps(kvm, memslot, __kvm_zap_rmap, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL, - start, end - 1, true, flush); + start, end - 1, true, true, flush); } } @@ -6888,7 +6892,7 @@ static void kvm_shadow_mmu_try_split_huge_pages(struct kvm *kvm, */ for (level = KVM_MAX_HUGEPAGE_LEVEL; level > target_level; level--) __walk_slot_rmaps(kvm, slot, shadow_mmu_try_split_huge_pages, - level, level, start, end - 1, true, false); + level, level, start, end - 1, true, true, false); } /* Must be called with the mmu_lock held in write-mode. */ From dd9eaad744f4ed30913cca423439a1765a760c71 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 9 Aug 2024 12:43:24 -0700 Subject: [PATCH 28/33] KVM: x86/mmu: Add a helper to walk and zap rmaps for a memslot Add a dedicated helper to walk and zap rmaps for a given memslot so that the code can be shared between KVM-initiated zaps and mmu_notifier invalidations. No functional change intended. Link: https://lore.kernel.org/r/20240809194335.1726916-13-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index e3adc934559d..70b043d7701d 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1573,6 +1573,16 @@ static __always_inline bool walk_slot_rmaps_4k(struct kvm *kvm, return walk_slot_rmaps(kvm, slot, fn, PG_LEVEL_4K, PG_LEVEL_4K, flush_on_yield); } +static bool __kvm_rmap_zap_gfn_range(struct kvm *kvm, + const struct kvm_memory_slot *slot, + gfn_t start, gfn_t end, bool can_yield, + bool flush) +{ + return __walk_slot_rmaps(kvm, slot, __kvm_zap_rmap, + PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL, + start, end - 1, can_yield, true, flush); +} + typedef bool (*rmap_handler_t)(struct kvm *kvm, struct kvm_rmap_head *rmap_head, struct kvm_memory_slot *slot, gfn_t gfn, int level); @@ -6602,9 +6612,8 @@ static bool kvm_rmap_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_e if (WARN_ON_ONCE(start >= end)) continue; - flush = __walk_slot_rmaps(kvm, memslot, __kvm_zap_rmap, - PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL, - start, end - 1, true, true, flush); + flush = __kvm_rmap_zap_gfn_range(kvm, memslot, start, + end, true, flush); } } From 548f87f667a38ffeb2f021d9cfbc1f1b34fb4cb5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 9 Aug 2024 12:43:25 -0700 Subject: [PATCH 29/33] KVM: x86/mmu: Honor NEED_RESCHED when zapping rmaps and blocking is allowed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Convert kvm_unmap_gfn_range(), which is the helper that zaps rmap SPTEs in response to an mmu_notifier invalidation, to use __kvm_rmap_zap_gfn_range() and feed in range->may_block. In other words, honor NEED_RESCHED by way of cond_resched() when zapping rmaps. This fixes a long-standing issue where KVM could process an absurd number of rmap entries without ever yielding, e.g. if an mmu_notifier fired on a PUD (or larger) range. Opportunistically rename __kvm_zap_rmap() to kvm_zap_rmap(), and drop the old kvm_zap_rmap(). Ideally, the shuffling would be done in a different patch, but that just makes the compiler unhappy, e.g. arch/x86/kvm/mmu/mmu.c:1462:13: error: ‘kvm_zap_rmap’ defined but not used Reported-by: Peter Xu Link: https://lore.kernel.org/r/20240809194335.1726916-14-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 70b043d7701d..27a8a4f486c5 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1435,18 +1435,12 @@ static bool kvm_vcpu_write_protect_gfn(struct kvm_vcpu *vcpu, u64 gfn) return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn, PG_LEVEL_4K); } -static bool __kvm_zap_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - const struct kvm_memory_slot *slot) +static bool kvm_zap_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head, + const struct kvm_memory_slot *slot) { return kvm_zap_all_rmap_sptes(kvm, rmap_head); } -static bool kvm_zap_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot, gfn_t gfn, int level) -{ - return __kvm_zap_rmap(kvm, rmap_head, slot); -} - struct slot_rmap_walk_iterator { /* input fields. */ const struct kvm_memory_slot *slot; @@ -1578,7 +1572,7 @@ static bool __kvm_rmap_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end, bool can_yield, bool flush) { - return __walk_slot_rmaps(kvm, slot, __kvm_zap_rmap, + return __walk_slot_rmaps(kvm, slot, kvm_zap_rmap, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL, start, end - 1, can_yield, true, flush); } @@ -1607,7 +1601,9 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) bool flush = false; if (kvm_memslots_have_rmaps(kvm)) - flush = kvm_handle_gfn_range(kvm, range, kvm_zap_rmap); + flush = __kvm_rmap_zap_gfn_range(kvm, range->slot, + range->start, range->end, + range->may_block, flush); if (tdp_mmu_enabled) flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush); From c17f150000f6b06061dc109bc2dd2858898a62b2 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 9 Aug 2024 12:43:26 -0700 Subject: [PATCH 30/33] KVM: x86/mmu: Morph kvm_handle_gfn_range() into an aging specific helper Rework kvm_handle_gfn_range() into an aging-specic helper, kvm_rmap_age_gfn_range(). In addition to purging a bunch of unnecessary boilerplate code, this sets the stage for aging rmap SPTEs outside of mmu_lock. Note, there's a small functional change, as kvm_test_age_gfn() will now return immediately if a young SPTE is found, whereas previously KVM would continue iterating over other levels. Link: https://lore.kernel.org/r/20240809194335.1726916-15-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 68 ++++++++++++++---------------------------- 1 file changed, 22 insertions(+), 46 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 27a8a4f486c5..9b977560677b 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1577,25 +1577,6 @@ static bool __kvm_rmap_zap_gfn_range(struct kvm *kvm, start, end - 1, can_yield, true, flush); } -typedef bool (*rmap_handler_t)(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot, gfn_t gfn, - int level); - -static __always_inline bool kvm_handle_gfn_range(struct kvm *kvm, - struct kvm_gfn_range *range, - rmap_handler_t handler) -{ - struct slot_rmap_walk_iterator iterator; - bool ret = false; - - for_each_slot_rmap_range(range->slot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL, - range->start, range->end - 1, &iterator) - ret |= handler(kvm, iterator.rmap, range->slot, iterator.gfn, - iterator.level); - - return ret; -} - bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) { bool flush = false; @@ -1615,31 +1596,6 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) return flush; } -static bool kvm_age_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot, gfn_t gfn, int level) -{ - u64 *sptep; - struct rmap_iterator iter; - int young = 0; - - for_each_rmap_spte(rmap_head, &iter, sptep) - young |= mmu_spte_age(sptep); - - return young; -} - -static bool kvm_test_age_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot, gfn_t gfn, int level) -{ - u64 *sptep; - struct rmap_iterator iter; - - for_each_rmap_spte(rmap_head, &iter, sptep) - if (is_accessed_spte(*sptep)) - return true; - return false; -} - #define RMAP_RECYCLE_THRESHOLD 1000 static void __rmap_add(struct kvm *kvm, @@ -1674,12 +1630,32 @@ static void rmap_add(struct kvm_vcpu *vcpu, const struct kvm_memory_slot *slot, __rmap_add(vcpu->kvm, cache, slot, spte, gfn, access); } +static bool kvm_rmap_age_gfn_range(struct kvm *kvm, + struct kvm_gfn_range *range, bool test_only) +{ + struct slot_rmap_walk_iterator iterator; + struct rmap_iterator iter; + bool young = false; + u64 *sptep; + + for_each_slot_rmap_range(range->slot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL, + range->start, range->end - 1, &iterator) { + for_each_rmap_spte(iterator.rmap, &iter, sptep) { + if (test_only && is_accessed_spte(*sptep)) + return true; + + young = mmu_spte_age(sptep); + } + } + return young; +} + bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { bool young = false; if (kvm_memslots_have_rmaps(kvm)) - young = kvm_handle_gfn_range(kvm, range, kvm_age_rmap); + young = kvm_rmap_age_gfn_range(kvm, range, false); if (tdp_mmu_enabled) young |= kvm_tdp_mmu_age_gfn_range(kvm, range); @@ -1692,7 +1668,7 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) bool young = false; if (kvm_memslots_have_rmaps(kvm)) - young = kvm_handle_gfn_range(kvm, range, kvm_test_age_rmap); + young = kvm_rmap_age_gfn_range(kvm, range, true); if (tdp_mmu_enabled) young |= kvm_tdp_mmu_test_age_gfn(kvm, range); From 7aac9dc680da9390a22515c3f822c9d1907c4f02 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 9 Aug 2024 12:43:27 -0700 Subject: [PATCH 31/33] KVM: x86/mmu: Fold mmu_spte_age() into kvm_rmap_age_gfn_range() Fold mmu_spte_age() into its sole caller now that aging and testing for young SPTEs is handled in a common location, i.e. doesn't require more helpers. Opportunistically remove the use of mmu_spte_get_lockless(), as mmu_lock is held (for write!), and marking SPTEs for access tracking outside of mmu_lock is unsafe (at least, as written). I.e. using the lockless accessor is quite misleading. No functional change intended. Link: https://lore.kernel.org/r/20240809194335.1726916-16-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 50 +++++++++++++++++++----------------------- 1 file changed, 22 insertions(+), 28 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 9b977560677b..24ca233d78cd 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -614,32 +614,6 @@ static u64 mmu_spte_get_lockless(u64 *sptep) return __get_spte_lockless(sptep); } -/* Returns the Accessed status of the PTE and resets it at the same time. */ -static bool mmu_spte_age(u64 *sptep) -{ - u64 spte = mmu_spte_get_lockless(sptep); - - if (!is_accessed_spte(spte)) - return false; - - if (spte_ad_enabled(spte)) { - clear_bit((ffs(shadow_accessed_mask) - 1), - (unsigned long *)sptep); - } else { - /* - * Capture the dirty status of the page, so that it doesn't get - * lost when the SPTE is marked for access tracking. - */ - if (is_writable_pte(spte)) - kvm_set_pfn_dirty(spte_to_pfn(spte)); - - spte = mark_spte_for_access_track(spte); - mmu_spte_update_no_track(sptep, spte); - } - - return true; -} - static inline bool is_tdp_mmu_active(struct kvm_vcpu *vcpu) { return tdp_mmu_enabled && vcpu->arch.mmu->root_role.direct; @@ -1641,10 +1615,30 @@ static bool kvm_rmap_age_gfn_range(struct kvm *kvm, for_each_slot_rmap_range(range->slot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL, range->start, range->end - 1, &iterator) { for_each_rmap_spte(iterator.rmap, &iter, sptep) { - if (test_only && is_accessed_spte(*sptep)) + u64 spte = *sptep; + + if (!is_accessed_spte(spte)) + continue; + + if (test_only) return true; - young = mmu_spte_age(sptep); + if (spte_ad_enabled(spte)) { + clear_bit((ffs(shadow_accessed_mask) - 1), + (unsigned long *)sptep); + } else { + /* + * Capture the dirty status of the page, so that + * it doesn't get lost when the SPTE is marked + * for access tracking. + */ + if (is_writable_pte(spte)) + kvm_set_pfn_dirty(spte_to_pfn(spte)); + + spte = mark_spte_for_access_track(spte); + mmu_spte_update_no_track(sptep, spte); + } + young = true; } } return young; From 7645829145a91a3e13fdc322492500dae46ca17c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 9 Aug 2024 12:43:28 -0700 Subject: [PATCH 32/33] KVM: x86/mmu: Add KVM_RMAP_MANY to replace open coded '1' and '1ul' literals Replace all of the open coded '1' literals used to mark a PTE list as having many/multiple entries with a proper define. It's hard enough to read the code with one magic bit, and a future patch to support "locking" a single rmap will add another. No functional change intended. Link: https://lore.kernel.org/r/20240809194335.1726916-17-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 24ca233d78cd..53e24892055a 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -912,6 +912,7 @@ static struct kvm_memory_slot *gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu * in this rmap chain. Otherwise, (rmap_head->val & ~1) points to a struct * pte_list_desc containing more mappings. */ +#define KVM_RMAP_MANY BIT(0) /* * Returns the number of pointers in the rmap chain, not counting the new one. @@ -924,16 +925,16 @@ static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte, if (!rmap_head->val) { rmap_head->val = (unsigned long)spte; - } else if (!(rmap_head->val & 1)) { + } else if (!(rmap_head->val & KVM_RMAP_MANY)) { desc = kvm_mmu_memory_cache_alloc(cache); desc->sptes[0] = (u64 *)rmap_head->val; desc->sptes[1] = spte; desc->spte_count = 2; desc->tail_count = 0; - rmap_head->val = (unsigned long)desc | 1; + rmap_head->val = (unsigned long)desc | KVM_RMAP_MANY; ++count; } else { - desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); + desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY); count = desc->tail_count + desc->spte_count; /* @@ -942,10 +943,10 @@ static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte, */ if (desc->spte_count == PTE_LIST_EXT) { desc = kvm_mmu_memory_cache_alloc(cache); - desc->more = (struct pte_list_desc *)(rmap_head->val & ~1ul); + desc->more = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY); desc->spte_count = 0; desc->tail_count = count; - rmap_head->val = (unsigned long)desc | 1; + rmap_head->val = (unsigned long)desc | KVM_RMAP_MANY; } desc->sptes[desc->spte_count++] = spte; } @@ -956,7 +957,7 @@ static void pte_list_desc_remove_entry(struct kvm *kvm, struct kvm_rmap_head *rmap_head, struct pte_list_desc *desc, int i) { - struct pte_list_desc *head_desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); + struct pte_list_desc *head_desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY); int j = head_desc->spte_count - 1; /* @@ -985,7 +986,7 @@ static void pte_list_desc_remove_entry(struct kvm *kvm, if (!head_desc->more) rmap_head->val = 0; else - rmap_head->val = (unsigned long)head_desc->more | 1; + rmap_head->val = (unsigned long)head_desc->more | KVM_RMAP_MANY; mmu_free_pte_list_desc(head_desc); } @@ -998,13 +999,13 @@ static void pte_list_remove(struct kvm *kvm, u64 *spte, if (KVM_BUG_ON_DATA_CORRUPTION(!rmap_head->val, kvm)) return; - if (!(rmap_head->val & 1)) { + if (!(rmap_head->val & KVM_RMAP_MANY)) { if (KVM_BUG_ON_DATA_CORRUPTION((u64 *)rmap_head->val != spte, kvm)) return; rmap_head->val = 0; } else { - desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); + desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY); while (desc) { for (i = 0; i < desc->spte_count; ++i) { if (desc->sptes[i] == spte) { @@ -1037,12 +1038,12 @@ static bool kvm_zap_all_rmap_sptes(struct kvm *kvm, if (!rmap_head->val) return false; - if (!(rmap_head->val & 1)) { + if (!(rmap_head->val & KVM_RMAP_MANY)) { mmu_spte_clear_track_bits(kvm, (u64 *)rmap_head->val); goto out; } - desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); + desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY); for (; desc; desc = next) { for (i = 0; i < desc->spte_count; i++) @@ -1062,10 +1063,10 @@ unsigned int pte_list_count(struct kvm_rmap_head *rmap_head) if (!rmap_head->val) return 0; - else if (!(rmap_head->val & 1)) + else if (!(rmap_head->val & KVM_RMAP_MANY)) return 1; - desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); + desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY); return desc->tail_count + desc->spte_count; } @@ -1127,13 +1128,13 @@ static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head, if (!rmap_head->val) return NULL; - if (!(rmap_head->val & 1)) { + if (!(rmap_head->val & KVM_RMAP_MANY)) { iter->desc = NULL; sptep = (u64 *)rmap_head->val; goto out; } - iter->desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); + iter->desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY); iter->pos = 0; sptep = iter->desc->sptes[iter->pos]; out: From 9a5bff7f5ec2383e3edac5eda561b52e267ccbb5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 9 Aug 2024 12:43:30 -0700 Subject: [PATCH 33/33] KVM: x86/mmu: Use KVM_PAGES_PER_HPAGE() instead of an open coded equivalent Use KVM_PAGES_PER_HPAGE() instead of open coding equivalent logic that is anything but obvious. No functional change intended, and verified by compiling with the below assertions: BUILD_BUG_ON((1UL << KVM_HPAGE_GFN_SHIFT(PG_LEVEL_4K)) != KVM_PAGES_PER_HPAGE(PG_LEVEL_4K)); BUILD_BUG_ON((1UL << KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M)) != KVM_PAGES_PER_HPAGE(PG_LEVEL_2M)); BUILD_BUG_ON((1UL << KVM_HPAGE_GFN_SHIFT(PG_LEVEL_1G)) != KVM_PAGES_PER_HPAGE(PG_LEVEL_1G)); Link: https://lore.kernel.org/r/20240809194335.1726916-19-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 53e24892055a..b751e7e2a05e 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1464,7 +1464,7 @@ static bool slot_rmap_walk_okay(struct slot_rmap_walk_iterator *iterator) static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator) { while (++iterator->rmap <= iterator->end_rmap) { - iterator->gfn += (1UL << KVM_HPAGE_GFN_SHIFT(iterator->level)); + iterator->gfn += KVM_PAGES_PER_HPAGE(iterator->level); if (iterator->rmap->val) return;