Merge tag 'kvm-x86-sev-6.17' of https://github.com/kvm-x86/linux into HEAD

KVM SEV cache maintenance changes for 6.17

 - Drop a superfluous WBINVD (on all CPUs!) when destroying a VM.

 - Use WBNOINVD instead of WBINVD for SEV cache maintenance whenever possible,
   e.g. to minimize collateral damage when reclaiming memory from an SEV guest.

 - When reclaiming memory from an SEV guest, only do cache flushes on CPUs that
   have ever run a vCPU for the guest, i.e. don't flush the caches for CPUs
   that can't possibly have cache lines with dirty, encrypted data (see the
   sketch below).
Merged by Paolo Bonzini on 2025-07-28 11:41:06 -04:00
3 changed files with 84 additions and 35 deletions
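The bullets above all hinge on one small mechanism, visible throughout the hunks below: each SEV VM keeps a cpumask of physical CPUs that have ever executed VMRUN for it, and cache maintenance then targets exactly that set with WBNOINVD. A condensed sketch of the idea (an illustration only, not the verbatim KVM code; struct sev_vm stands in for KVM's struct kvm_sev_info):

#include <linux/cpumask.h>

/* Stand-in for the relevant piece of struct kvm_sev_info. */
struct sev_vm {
	cpumask_var_t have_run_cpus;	/* CPUs that have done VMRUN for this VM */
};

/*
 * Reclaim/teardown path: write back caches only on CPUs that have entered
 * the guest and thus may hold dirty, encrypted cache lines for its ASID.
 */
static void sev_vm_writeback_caches(struct sev_vm *sev)
{
	if (cpumask_empty(sev->have_run_cpus))
		return;

	/*
	 * Write back (without invalidating) all dirty lines on the tracked
	 * CPUs; the helper falls back to WBINVD where WBNOINVD is missing.
	 */
	wbnoinvd_on_cpus_mask(sev->have_run_cpus);
}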

arch/x86/kvm/svm/sev.c

@@ -117,6 +117,7 @@ static int sev_flush_asids(unsigned int min_asid, unsigned int max_asid)
 	 */
 	down_write(&sev_deactivate_lock);
 
+	/* SNP firmware requires use of WBINVD for ASID recycling. */
 	wbinvd_on_all_cpus();
 
 	if (sev_snp_enabled)
@@ -446,7 +447,12 @@ static int __sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp,
 	init_args.probe = false;
 	ret = sev_platform_init(&init_args);
 	if (ret)
-		goto e_free;
+		goto e_free_asid;
+
+	if (!zalloc_cpumask_var(&sev->have_run_cpus, GFP_KERNEL_ACCOUNT)) {
+		ret = -ENOMEM;
+		goto e_free_asid;
+	}
 
 	/* This needs to happen after SEV/SNP firmware initialization. */
 	if (vm_type == KVM_X86_SNP_VM) {
@@ -464,6 +470,8 @@ static int __sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp,
 	return 0;
 
 e_free:
+	free_cpumask_var(sev->have_run_cpus);
+e_free_asid:
 	argp->error = init_args.error;
 	sev_asid_free(sev);
 	sev->asid = 0;
@@ -708,6 +716,33 @@ static void sev_clflush_pages(struct page *pages[], unsigned long npages)
 	}
 }
 
+static void sev_writeback_caches(struct kvm *kvm)
+{
+	/*
+	 * Note, the caller is responsible for ensuring correctness if the mask
+	 * can be modified, e.g. if a CPU could be doing VMRUN.
+	 */
+	if (cpumask_empty(to_kvm_sev_info(kvm)->have_run_cpus))
+		return;
+
+	/*
+	 * Ensure that all dirty guest tagged cache entries are written back
+	 * before releasing the pages back to the system for use. CLFLUSH will
+	 * not do this without SME_COHERENT, and flushing many cache lines
+	 * individually is slower than blasting WBINVD for large VMs, so issue
+	 * WBNOINVD (or WBINVD if the "no invalidate" variant is unsupported)
+	 * on CPUs that have done VMRUN, i.e. may have dirtied data using the
+	 * VM's ASID.
+	 *
+	 * For simplicity, never remove CPUs from the bitmap. Ideally, KVM
+	 * would clear the mask when flushing caches, but doing so requires
+	 * serializing multiple calls and having responding CPUs (to the IPI)
+	 * mark themselves as still running if they are running (or about to
+	 * run) a vCPU for the VM.
+	 */
+	wbnoinvd_on_cpus_mask(to_kvm_sev_info(kvm)->have_run_cpus);
+}
+
 static unsigned long get_num_contig_pages(unsigned long idx,
 				struct page **inpages, unsigned long npages)
 {
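The wbnoinvd_on_cpus_mask() helper used by sev_writeback_caches() comes from the x86 core side of this series. Roughly, and this is a sketch rather than the verbatim arch/x86 code, the "WBINVD if unsupported" fallback mentioned in the comment is an instruction alternative, which is safe because WBINVD is a functional superset of WBNOINVD (write back and invalidate vs. write back only):

/* Patch in WBNOINVD where the CPU supports it, else plain WBINVD. */
static __always_inline void wbnoinvd(void)
{
	alternative("wbinvd", "wbnoinvd", X86_FEATURE_WBNOINVD);
}

static void __wbnoinvd(void *dummy)
{
	wbnoinvd();
}

void wbnoinvd_on_cpus_mask(const struct cpumask *cpus)
{
	/* Synchronous IPI to each CPU in @cpus; returns after all complete. */
	on_each_cpu_mask(cpus, __wbnoinvd, NULL, 1);
}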
@@ -2037,6 +2072,17 @@ int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd)
 	if (ret)
 		goto out_source_vcpu;
 
+	/*
+	 * Allocate a new have_run_cpus for the destination, i.e. don't copy
+	 * the set of CPUs from the source. If a CPU was used to run a vCPU in
+	 * the source VM but is never used for the destination VM, then the CPU
+	 * can only have cached memory that was accessible to the source VM.
+	 */
+	if (!zalloc_cpumask_var(&dst_sev->have_run_cpus, GFP_KERNEL_ACCOUNT)) {
+		ret = -ENOMEM;
+		goto out_source_vcpu;
+	}
+
 	sev_migrate_from(kvm, source_kvm);
 	kvm_vm_dead(source_kvm);
 	cg_cleanup_sev = src_sev;
@@ -2694,12 +2740,7 @@ int sev_mem_enc_unregister_region(struct kvm *kvm,
 		goto failed;
 	}
 
-	/*
-	 * Ensure that all guest tagged cache entries are flushed before
-	 * releasing the pages back to the system for use. CLFLUSH will
-	 * not do this, so issue a WBINVD.
-	 */
-	wbinvd_on_all_cpus();
+	sev_writeback_caches(kvm);
 
 	__unregister_enc_region_locked(kvm, region);
@@ -2741,13 +2782,18 @@ int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd)
 		goto e_unlock;
 	}
 
+	mirror_sev = to_kvm_sev_info(kvm);
+	if (!zalloc_cpumask_var(&mirror_sev->have_run_cpus, GFP_KERNEL_ACCOUNT)) {
+		ret = -ENOMEM;
+		goto e_unlock;
+	}
+
 	/*
 	 * The mirror kvm holds an enc_context_owner ref so its asid can't
 	 * disappear until we're done with it
 	 */
 	source_sev = to_kvm_sev_info(source_kvm);
 	kvm_get_kvm(source_kvm);
-	mirror_sev = to_kvm_sev_info(kvm);
 	list_add_tail(&mirror_sev->mirror_entry, &source_sev->mirror_vms);
@@ -2809,7 +2855,13 @@ void sev_vm_destroy(struct kvm *kvm)
 	WARN_ON(!list_empty(&sev->mirror_vms));
 
-	/* If this is a mirror_kvm release the enc_context_owner and skip sev cleanup */
+	free_cpumask_var(sev->have_run_cpus);
+
+	/*
+	 * If this is a mirror VM, remove it from the owner's list of mirrors
+	 * and skip ASID cleanup (the ASID is tied to the lifetime of the owner).
+	 * Note, mirror VMs don't support registering encrypted regions.
+	 */
 	if (is_mirroring_enc_context(kvm)) {
 		struct kvm *owner_kvm = sev->enc_context_owner;
@@ -2820,12 +2872,6 @@ void sev_vm_destroy(struct kvm *kvm)
 		return;
 	}
 
-	/*
-	 * Ensure that all guest tagged cache entries are flushed before
-	 * releasing the pages back to the system for use. CLFLUSH will
-	 * not do this, so issue a WBINVD.
-	 */
-	wbinvd_on_all_cpus();
-
 	/*
 	 * if userspace was terminated before unregistering the memory regions
@@ -3095,30 +3141,29 @@ static void sev_flush_encrypted_page(struct kvm_vcpu *vcpu, void *va)
 	/*
 	 * VM Page Flush takes a host virtual address and a guest ASID. Fall
-	 * back to WBINVD if this faults so as not to make any problems worse
-	 * by leaving stale encrypted data in the cache.
+	 * back to full writeback of caches if this faults so as not to make
+	 * any problems worse by leaving stale encrypted data in the cache.
 	 */
 	if (WARN_ON_ONCE(wrmsrq_safe(MSR_AMD64_VM_PAGE_FLUSH, addr | asid)))
-		goto do_wbinvd;
+		goto do_sev_writeback_caches;
 
 	return;
 
-do_wbinvd:
-	wbinvd_on_all_cpus();
+do_sev_writeback_caches:
+	sev_writeback_caches(vcpu->kvm);
 }
 
 void sev_guest_memory_reclaimed(struct kvm *kvm)
 {
 	/*
 	 * With SNP+gmem, private/encrypted memory is unreachable via the
-	 * hva-based mmu notifiers, so these events are only actually
-	 * pertaining to shared pages where there is no need to perform
-	 * the WBINVD to flush associated caches.
+	 * hva-based mmu notifiers, i.e. these events are explicitly scoped to
+	 * shared pages, where there's no need to flush caches.
 	 */
 	if (!sev_guest(kvm) || sev_snp_guest(kvm))
 		return;
 
-	wbinvd_on_all_cpus();
+	sev_writeback_caches(kvm);
 }
 
 void sev_free_vcpu(struct kvm_vcpu *vcpu)
@@ -3450,6 +3495,15 @@ int pre_sev_run(struct vcpu_svm *svm, int cpu)
 	if (sev_es_guest(kvm) && !VALID_PAGE(svm->vmcb->control.vmsa_pa))
 		return -EINVAL;
 
+	/*
+	 * To optimize cache flushes when memory is reclaimed from an SEV VM,
+	 * track physical CPUs that enter the guest for SEV VMs and thus can
+	 * have encrypted, dirty data in the cache, and flush caches only for
+	 * CPUs that have entered the guest.
+	 */
+	if (!cpumask_test_cpu(cpu, to_kvm_sev_info(kvm)->have_run_cpus))
+		cpumask_set_cpu(cpu, to_kvm_sev_info(kvm)->have_run_cpus);
+
 	/* Assign the asid allocated with this SEV guest */
 	svm->asid = asid;
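A note on the test-before-set idiom above: cpumask_set_cpu() is an atomic read-modify-write, so setting the bit unconditionally on every VMRUN would bounce the mask's cache line between CPUs. Checking first keeps the steady state to a plain read, and the apparent race is benign because bits are only ever set, never cleared. Restated as a standalone sketch (sev_note_vmrun_cpu() is a hypothetical name, not a KVM function):

/* Hot path: record that @cpu has entered this VM's guest context. */
static void sev_note_vmrun_cpu(struct kvm *kvm, int cpu)
{
	/*
	 * Read-only in the common case; the atomic RMW happens only the
	 * first time a given physical CPU runs a vCPU of this VM.
	 */
	if (!cpumask_test_cpu(cpu, to_kvm_sev_info(kvm)->have_run_cpus))
		cpumask_set_cpu(cpu, to_kvm_sev_info(kvm)->have_run_cpus);
}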
@@ -3882,9 +3936,9 @@ void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu)
 	 * From this point forward, the VMSA will always be a guest-mapped page
 	 * rather than the initial one allocated by KVM in svm->sev_es.vmsa. In
 	 * theory, svm->sev_es.vmsa could be free'd and cleaned up here, but
-	 * that involves cleanups like wbinvd_on_all_cpus() which would ideally
-	 * be handled during teardown rather than guest boot. Deferring that
-	 * also allows the existing logic for SEV-ES VMSAs to be re-used with
+	 * that involves cleanups like flushing caches, which would ideally be
+	 * handled during teardown rather than guest boot. Deferring that also
+	 * allows the existing logic for SEV-ES VMSAs to be re-used with
 	 * minimal SNP-specific changes.
 	 */
 	svm->sev_es.snp_has_guest_vmsa = true;
@@ -4875,7 +4929,7 @@ void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end)
 	/*
 	 * SEV-ES avoids host/guest cache coherency issues through
-	 * WBINVD hooks issued via MMU notifiers during run-time, and
+	 * WBNOINVD hooks issued via MMU notifiers during run-time, and
 	 * KVM's VM destroy path at shutdown. Those MMU notifier events
 	 * don't cover gmem since there is no requirement to map pages
 	 * to a HVA in order to use them for a running guest. While the

arch/x86/kvm/svm/svm.h

@@ -110,6 +110,7 @@ struct kvm_sev_info {
 	void *guest_req_buf; /* Bounce buffer for SNP Guest Request input */
 	void *guest_resp_buf; /* Bounce buffer for SNP Guest Request output */
 	struct mutex guest_req_mutex; /* Must acquire before using bounce buffers */
+	cpumask_var_t have_run_cpus; /* CPUs that have done VMRUN for this VM. */
 };
 
 #define SEV_POLICY_NODBG BIT_ULL(0)
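Because have_run_cpus is a cpumask_var_t, its storage is a separate heap allocation when CONFIG_CPUMASK_OFFSTACK=y, which is why every creation path in the sev.c diff pairs a fallible zalloc_cpumask_var() with free_cpumask_var() on teardown. The lifecycle, sketched with hypothetical wrapper names:

#include <linux/cpumask.h>

static int example_mask_setup(cpumask_var_t *mask)
{
	/*
	 * May fail when the mask is heap-allocated; on success all bits
	 * start clear. GFP_KERNEL_ACCOUNT charges the VM's cgroup.
	 */
	if (!zalloc_cpumask_var(mask, GFP_KERNEL_ACCOUNT))
		return -ENOMEM;

	return 0;
}

static void example_mask_teardown(cpumask_var_t mask)
{
	/* Harmless (a no-op or kfree(NULL)) if allocation never happened. */
	free_cpumask_var(mask);
}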

arch/x86/kvm/x86.c

@@ -4994,11 +4994,6 @@ long kvm_arch_dev_ioctl(struct file *filp,
 	return r;
 }
 
-static void wbinvd_ipi(void *garbage)
-{
-	wbinvd();
-}
-
 static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu)
 {
 	return kvm_arch_has_noncoherent_dma(vcpu->kvm);
@@ -5022,8 +5017,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 		if (kvm_x86_call(has_wbinvd_exit)())
 			cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
 		else if (vcpu->cpu != -1 && vcpu->cpu != cpu)
-			smp_call_function_single(vcpu->cpu,
-						 wbinvd_ipi, NULL, 1);
+			wbinvd_on_cpu(vcpu->cpu);
 	}
 
 	kvm_x86_call(vcpu_load)(vcpu, cpu);
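The x86.c hunk above drops KVM's private wbinvd_ipi() wrapper in favor of the common helper. For reference, wbinvd_on_cpu() amounts to the same synchronous single-CPU IPI; roughly its form in arch/x86/lib/cache-smp.c, sketched:

static void __wbinvd(void *dummy)
{
	wbinvd();
}

void wbinvd_on_cpu(int cpu)
{
	/* wait=1: return only after the target CPU has flushed its caches. */
	smp_call_function_single(cpu, __wbinvd, NULL, 1);
}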