Merge tag 'kvm-x86-fixes-7.1-rc6' of https://github.com/kvm-x86/linux into HEAD

KVM x86 fixes for 7.1-rcN

 - Include the kernel's linux/mman.h in KVM selftests to ensure MADV_COLLAPSE
   is defined, as older libc versions may not provide it.

 - Include execinfo.h if and only if KVM selftests are building against glibc,
   and provide a test_dump_stack() for non-glibc builds.

 - Fudge around an RCU splat in the emergency reboot code that is technically
   a legitimate flaw, but in practice is a non-issue and fixing the flaw, e.g.
   by adding locking, would incur meaningful risk, i.e. do more harm than good.

 - Rate-limit global clock updates once again (but without delayed work), as
   KVM was subtly relying on the old rate-limiting for NTP correction to guard
   against "update storms" when running without a master clock on systems with
   overcommitted CPUs.

 - Fix a brown paper bag goof where KVM checked if ERAPS is "dirty" instead of
   marking it dirty when emulating INVPCID.

 - Flush the TLB when transitioning from xAVIC => x2AVIC to ensure the CPU TLB
   doesn't contain AVIC-tagged entries for the APIC base GPA.
This commit is contained in:
Paolo Bonzini
2026-05-29 19:28:16 +02:00
14 changed files with 79 additions and 20 deletions

View File

@@ -1504,6 +1504,7 @@ struct kvm_arch {
bool use_master_clock;
u64 master_kernel_ns;
u64 master_cycle_now;
struct ratelimit_state kvmclock_update_rs;
#ifdef CONFIG_KVM_HYPERV
struct kvm_hv hyperv;

View File

@@ -206,6 +206,35 @@ static void avic_activate_vmcb(struct vcpu_svm *svm)
svm_clr_intercept(svm, INTERCEPT_CR8_WRITE);
/*
* Flush the TLB when enabling (x2)AVIC and when transitioning between
* xAVIC and x2AVIC, as the CPU may have inserted a TLB entry for the
* "wrong" mapping.
*
* KVM uses a per-VM "scratch" page to back the APIC memslot, because
* KVM also uses per-VM page tables *and* maintains the page table (NPT
* or shadow page) mappings for said memslot even if one or more vCPUs
* have their local APIC hardware-disabled or are in x2APIC mode, i.e.
* even if one or more vCPUs' APIC MMIO BAR is effectively disabled.
*
* If xAVIC is fully enabled, hardware ignores the physical address in
* KVM's page tables, i.e. in the leaf SPTE for the APIC memslot, and
* instead redirects the access to the AVIC backing page, i.e. to the
* vCPU's virtual APIC page. If xAVIC is not enabled (APIC is either
* hardware-disabled or in x2APIC mode), then guest accesses will use
* the page table mapping verbatim, i.e. will access the per-VM scratch
* page, as normal memory.
*
* In both cases, the CPU is allowed to cache TLB entries for the APIC
* base GPA. So, KVM needs to flush the TLB when enabling xAVIC, as
* accesses need to be redirected to the virtual APIC page, but the TLB
* may contain entries pointing at the scratch page. KVM also needs to
* flush the TLB when enabling x2AVIC, as accesses need to go to the
* scratch page, but the TLB may contain entries tagged as xAVIC, i.e.
* entries pointing to the vCPU's virtual APIC page.
*/
kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, &svm->vcpu);
/*
* Note: KVM supports hybrid-AVIC mode, where KVM emulates x2APIC MSR
* accesses, while interrupt injection to a running vCPU can be
@@ -219,12 +248,6 @@ static void avic_activate_vmcb(struct vcpu_svm *svm)
/* Disabling MSR intercept for x2APIC registers */
avic_set_x2apic_msr_interception(svm, false);
} else {
/*
* Flush the TLB, the guest may have inserted a non-APIC
* mapping into the TLB while AVIC was disabled.
*/
kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, &svm->vcpu);
/* Enabling MSR intercept for x2APIC registers */
avic_set_x2apic_msr_interception(svm, true);
}

View File

@@ -5227,8 +5227,13 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
* On a host with synchronized TSC, there is no need to update
* kvmclock on vcpu->cpu migration
*/
if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1)
kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1) {
if (__ratelimit(&vcpu->kvm->arch.kvmclock_update_rs))
kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
else
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
}
if (vcpu->cpu != cpu)
kvm_make_request(KVM_REQ_MIGRATE_TIMER, vcpu);
vcpu->cpu = cpu;
@@ -13366,6 +13371,8 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
raw_spin_lock_init(&kvm->arch.tsc_write_lock);
mutex_init(&kvm->arch.apic_map_lock);
seqcount_raw_spinlock_init(&kvm->arch.pvclock_sc, &kvm->arch.tsc_write_lock);
ratelimit_state_init(&kvm->arch.kvmclock_update_rs, HZ, 10);
ratelimit_set_flags(&kvm->arch.kvmclock_update_rs, RATELIMIT_MSG_ON_RELEASE);
kvm->arch.kvmclock_offset = -get_kvmclock_base_ns();
raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
@@ -14323,7 +14330,7 @@ int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva)
* the RAP (Return Address Predictor).
*/
if (guest_cpu_cap_has(vcpu, X86_FEATURE_ERAPS))
kvm_register_is_dirty(vcpu, VCPU_EXREG_ERAPS);
kvm_register_mark_dirty(vcpu, VCPU_EXREG_ERAPS);
kvm_invalidate_pcid(vcpu, operand.pcid);
return kvm_skip_emulated_instruction(vcpu);

View File

@@ -49,7 +49,20 @@ static void x86_virt_invoke_kvm_emergency_callback(void)
{
cpu_emergency_virt_cb *kvm_callback;
kvm_callback = rcu_dereference(kvm_emergency_callback);
/*
* RCU may not be watching the crashing CPU here, so rcu_dereference()
* triggers a suspicious-RCU-usage splat. In principle, a concurrent
* KVM module unload could race with this read; see commit 2baa33a8ddd6
* ("KVM: x86: Leave user-return notifier registered on reboot/shutdown")
* which notes that nothing prevents module unload during panic/reboot.
*
* However, taking a lock here would be riskier than the current race:
* the system is going down via NMI shootdown, and any lock could be
* held by an already-stopped CPU. Use rcu_dereference_raw() to silence
* the lockdep splat and accept the comically small remaining race;
* panic context inherently cannot guarantee complete correctness.
*/
kvm_callback = rcu_dereference_raw(kvm_emergency_callback);
if (kvm_callback)
kvm_callback();
}

View File

@@ -41,10 +41,10 @@
#include <inttypes.h>
#include <limits.h>
#include <pthread.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include "kvm_syscalls.h"
#include "kvm_util.h"
#include "test_util.h"
#include "memstress.h"

View File

@@ -14,10 +14,10 @@
#include <linux/bitmap.h>
#include <linux/falloc.h>
#include <linux/sizes.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include "kvm_syscalls.h"
#include "kvm_util.h"
#include "numaif.h"
#include "test_util.h"

View File

@@ -2,8 +2,18 @@
#ifndef SELFTEST_KVM_SYSCALLS_H
#define SELFTEST_KVM_SYSCALLS_H
/*
* Include both the kernel and libc versions of mman.h. The kernel provides
* the most up-to-date flags and definitions, while libc provides the syscall
* wrappers tests expect.
*/
#include <linux/mman.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <test_util.h>
#define MAP_ARGS0(m,...)
#define MAP_ARGS1(m,t,a,...) m(t,a)
#define MAP_ARGS2(m,t,a,...) m(t,a), MAP_ARGS1(m,__VA_ARGS__)

View File

@@ -19,9 +19,9 @@
#include <errno.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/mman.h>
#include "kselftest.h"
#include <linux/mman.h>
#include <linux/types.h>
#define msecs_to_usecs(msec) ((msec) * 1000ULL)

View File

@@ -6,11 +6,14 @@
*/
#include "test_util.h"
#include <execinfo.h>
#include <sys/syscall.h>
#include "kselftest.h"
#ifdef __GLIBC__
#include <execinfo.h>
/* Dumps the current stack trace to stderr. */
static void __attribute__((noinline)) test_dump_stack(void);
static void test_dump_stack(void)
@@ -57,6 +60,9 @@ static void test_dump_stack(void)
system(cmd);
#pragma GCC diagnostic pop
}
#else
static void test_dump_stack(void) {}
#endif
static pid_t _gettid(void)
{

View File

@@ -5,13 +5,13 @@
* Copyright (C) 2018, Google LLC.
*/
#include "test_util.h"
#include "kvm_syscalls.h"
#include "kvm_util.h"
#include "processor.h"
#include "ucall_common.h"
#include <assert.h>
#include <sched.h>
#include <sys/mman.h>
#include <sys/resource.h>
#include <sys/types.h>
#include <sys/stat.h>

View File

@@ -15,7 +15,6 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <time.h>
#include <unistd.h>
@@ -23,6 +22,7 @@
#include <linux/sizes.h>
#include <test_util.h>
#include <kvm_syscalls.h>
#include <kvm_util.h>
#include <processor.h>
#include <ucall_common.h>

View File

@@ -4,11 +4,10 @@
*
* Copyright (C) 2024, Red Hat, Inc.
*/
#include <sys/mman.h>
#include <linux/fs.h>
#include "test_util.h"
#include "kvm_syscalls.h"
#include "kvm_util.h"
#include "kselftest.h"
#include "ucall_common.h"

View File

@@ -4,8 +4,8 @@
*
* Copyright IBM Corp. 2021
*/
#include <sys/mman.h>
#include "test_util.h"
#include "kvm_syscalls.h"
#include "kvm_util.h"
#include "kselftest.h"
#include "ucall_common.h"

View File

@@ -8,11 +8,11 @@
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/compiler.h>
#include <test_util.h>
#include <kvm_syscalls.h>
#include <kvm_util.h>
#include <processor.h>