Merge tag 'kvm-x86-mmu-6.15' of https://github.com/kvm-x86/linux into HEAD

KVM x86/mmu changes for 6.15

Add support for "fast" aging of SPTEs in both the TDP MMU and Shadow MMU, where
"fast" means "without holding mmu_lock".  Not taking mmu_lock allows multiple
aging actions to run in parallel, and more importantly avoids stalling vCPUs,
e.g. due to holding mmu_lock for an extended duration while a vCPU is faulting
in memory.

For the TDP MMU, protect aging via RCU; the page tables are RCU-protected and
KVM doesn't need to access any metadata to age SPTEs.
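
To illustrate why no metadata is needed, the aging update boils down to a
single atomic read-modify-write on the SPTE, so a racing setter of the
Accessed bit can never be lost.  A small userspace analogue follows (the bit
position and names are illustrative only, not the real SPTE layout; in the
kernel, RCU keeps the page-table page alive while the pointer is
dereferenced):

  #include <stdatomic.h>
  #include <stdint.h>
  #include <stdio.h>

  #define SPTE_ACCESSED (1ull << 5)   /* illustrative bit, not the real layout */

  /* Age one SPTE: atomically clear Accessed, report whether it was set. */
  static int age_spte(_Atomic uint64_t *sptep)
  {
          uint64_t old = atomic_fetch_and(sptep, ~SPTE_ACCESSED);

          return !!(old & SPTE_ACCESSED);
  }

  int main(void)
  {
          _Atomic uint64_t spte = 0x7ull | SPTE_ACCESSED;

          printf("young = %d\n", age_spte(&spte));   /* 1: bit was set */
          printf("young = %d\n", age_spte(&spte));   /* 0: already clear */
          return 0;
  }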

For the Shadow MMU, use bit 1 of rmap pointers (bit 0 signals that the entry
points to a descriptor of multiple mappings) to implement a per-rmap
single-bit spinlock.  When aging a gfn,
acquire the rmap's spinlock with read-only permissions, which allows hardening
and optimizing the locking and aging, e.g. locking an rmap for write requires
mmu_lock to also be held.  The lock is NOT a true R/W spinlock, i.e. multiple
concurrent readers aren't supported.
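
For reference, a minimal userspace sketch of such a single-bit spinlock,
assuming a pointer-sized atomic word in which bit 0 distinguishes one-vs-many
mappings and bit 1 is the lock bit.  All names here are made up; the real
implementation, with lockdep assertions, the empty-rmap elision, and the
read-only unlock path, is in the __kvm_rmap_lock()/__kvm_rmap_unlock() hunk
below:

  #include <stdatomic.h>

  #define RMAP_MANY   (1ul << 0)  /* value points to a descriptor of many sptes */
  #define RMAP_LOCKED (1ul << 1)  /* the single-bit spinlock */

  static unsigned long rmap_lock(_Atomic unsigned long *rmap)
  {
          unsigned long old = atomic_load(rmap), new;

          if (!old)               /* empty rmaps are never locked */
                  return 0;

          do {
                  while (old & RMAP_LOCKED)       /* spin until unlocked */
                          old = atomic_load(rmap);

                  if (!old)       /* emptied by the previous lock holder */
                          return 0;

                  new = old | RMAP_LOCKED;
                  /* acquire on success orders the walk after taking the lock */
          } while (!atomic_compare_exchange_weak_explicit(rmap, &old, new,
                                                          memory_order_acquire,
                                                          memory_order_relaxed));

          return old;             /* old value, without the LOCKED bit */
  }

  static void rmap_unlock(_Atomic unsigned long *rmap, unsigned long val)
  {
          /* Callers skip this when rmap_lock() returned 0 (nothing locked). */
          /* Release ordering publishes modifications made under the lock.   */
          atomic_store_explicit(rmap, val, memory_order_release);
  }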

To avoid forcing all SPTE updates to use atomic operations (clearing the
Accessed bit out of mmu_lock makes it inherently volatile), rework and rename
spte_has_volatile_bits() to spte_needs_atomic_update() and deliberately exclude
the Accessed bit.  KVM (and mm/) already tolerates false positives/negatives
for Accessed information, and all testing has shown that reducing the latency
of aging is far more beneficial to overall system performance than providing
"perfect" young/old information.

Committed by Paolo Bonzini, 2025-03-19 09:04:33 -04:00.
11 changed files with 373 additions and 169 deletions.


@@ -196,7 +196,7 @@ writable between reading spte and updating spte. Like below case:
The Dirty bit is lost in this case.
In order to avoid this kind of issue, we always treat the spte as "volatile"
if it can be updated out of mmu-lock [see spte_has_volatile_bits()]; it means
if it can be updated out of mmu-lock [see spte_needs_atomic_update()]; it means
the spte is always atomically updated in this case.
3) flush tlbs due to spte updated
@@ -212,7 +212,7 @@ function to update spte (present -> present).
Since the spte is "volatile" if it can be updated out of mmu-lock, we always
atomically update the spte and the race caused by fast page fault can be avoided.
See the comments in spte_has_volatile_bits() and mmu_spte_update().
See the comments in spte_needs_atomic_update() and mmu_spte_update().
Lockless Access Tracking:


@@ -27,6 +27,7 @@
#include <linux/kfifo.h>
#include <linux/sched/vhost_task.h>
#include <linux/call_once.h>
#include <linux/atomic.h>
#include <asm/apic.h>
#include <asm/pvclock-abi.h>
@@ -405,7 +406,7 @@ union kvm_cpu_role {
};
struct kvm_rmap_head {
unsigned long val;
atomic_long_t val;
};
struct kvm_pio_request {
@@ -1479,6 +1480,7 @@ struct kvm_arch {
* tdp_mmu_page set.
*
* For reads, this list is protected by:
* RCU alone or
* the MMU lock in read mode + RCU or
* the MMU lock in write mode
*


@@ -22,6 +22,7 @@ config KVM_X86
select KVM_COMMON
select KVM_GENERIC_MMU_NOTIFIER
select KVM_ELIDE_TLB_FLUSH_IF_YOUNG
select KVM_MMU_LOCKLESS_AGING
select HAVE_KVM_IRQCHIP
select HAVE_KVM_PFNCACHE
select HAVE_KVM_DIRTY_RING_TSO


@@ -501,7 +501,7 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte)
return false;
}
if (!spte_has_volatile_bits(old_spte))
if (!spte_needs_atomic_update(old_spte))
__update_clear_spte_fast(sptep, new_spte);
else
old_spte = __update_clear_spte_slow(sptep, new_spte);
@@ -524,7 +524,7 @@ static u64 mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep)
int level = sptep_to_sp(sptep)->role.level;
if (!is_shadow_present_pte(old_spte) ||
!spte_has_volatile_bits(old_spte))
!spte_needs_atomic_update(old_spte))
__update_clear_spte_fast(sptep, SHADOW_NONPRESENT_VALUE);
else
old_spte = __update_clear_spte_slow(sptep, SHADOW_NONPRESENT_VALUE);
@@ -853,32 +853,173 @@ static struct kvm_memory_slot *gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu
* About rmap_head encoding:
*
* If the bit zero of rmap_head->val is clear, then it points to the only spte
* in this rmap chain. Otherwise, (rmap_head->val & ~1) points to a struct
* in this rmap chain. Otherwise, (rmap_head->val & ~3) points to a struct
* pte_list_desc containing more mappings.
*/
#define KVM_RMAP_MANY BIT(0)
/*
* rmaps and PTE lists are mostly protected by mmu_lock (the shadow MMU always
* operates with mmu_lock held for write), but rmaps can be walked without
* holding mmu_lock so long as the caller can tolerate SPTEs in the rmap chain
* being zapped/dropped _while the rmap is locked_.
*
* Other than the KVM_RMAP_LOCKED flag, modifications to rmap entries must be
* done while holding mmu_lock for write. This allows a task walking rmaps
* without holding mmu_lock to concurrently walk the same entries as a task
* that is holding mmu_lock but _not_ the rmap lock. Neither task will modify
* the rmaps, thus the walks are stable.
*
* As alluded to above, SPTEs in rmaps are _not_ protected by KVM_RMAP_LOCKED,
* only the rmap chains themselves are protected. E.g. holding an rmap's lock
* ensures all "struct pte_list_desc" fields are stable.
*/
#define KVM_RMAP_LOCKED BIT(1)
static unsigned long __kvm_rmap_lock(struct kvm_rmap_head *rmap_head)
{
unsigned long old_val, new_val;
lockdep_assert_preemption_disabled();
/*
* Elide the lock if the rmap is empty, as lockless walkers (read-only
* mode) don't need to (and can't) walk an empty rmap, nor can they add
* entries to the rmap. I.e. the only paths that process empty rmaps
* do so while holding mmu_lock for write, and are mutually exclusive.
*/
old_val = atomic_long_read(&rmap_head->val);
if (!old_val)
return 0;
do {
/*
* If the rmap is locked, wait for it to be unlocked before
* trying to acquire the lock, e.g. to avoid bouncing the cache
* line.
*/
while (old_val & KVM_RMAP_LOCKED) {
cpu_relax();
old_val = atomic_long_read(&rmap_head->val);
}
/*
* Recheck for an empty rmap, it may have been purged by the
* task that held the lock.
*/
if (!old_val)
return 0;
new_val = old_val | KVM_RMAP_LOCKED;
/*
* Use try_cmpxchg_acquire() to prevent reads and writes to the rmap
* from being reordered outside of the critical section created by
* __kvm_rmap_lock().
*
* Pairs with the atomic_long_set_release() in kvm_rmap_unlock().
*
* For the !old_val case, no ordering is needed, as there is no rmap
* to walk.
*/
} while (!atomic_long_try_cmpxchg_acquire(&rmap_head->val, &old_val, new_val));
/*
* Return the old value, i.e. _without_ the LOCKED bit set. It's
* impossible for the return value to be 0 (see above), i.e. the read-
* only unlock flow can't get a false positive and fail to unlock.
*/
return old_val;
}
static unsigned long kvm_rmap_lock(struct kvm *kvm,
struct kvm_rmap_head *rmap_head)
{
lockdep_assert_held_write(&kvm->mmu_lock);
return __kvm_rmap_lock(rmap_head);
}
static void __kvm_rmap_unlock(struct kvm_rmap_head *rmap_head,
unsigned long val)
{
KVM_MMU_WARN_ON(val & KVM_RMAP_LOCKED);
/*
* Ensure that all accesses to the rmap have completed before unlocking
* the rmap.
*
* Pairs with the atomic_long_try_cmpxchg_acquire() in __kvm_rmap_lock().
*/
atomic_long_set_release(&rmap_head->val, val);
}
static void kvm_rmap_unlock(struct kvm *kvm,
struct kvm_rmap_head *rmap_head,
unsigned long new_val)
{
lockdep_assert_held_write(&kvm->mmu_lock);
__kvm_rmap_unlock(rmap_head, new_val);
}
static unsigned long kvm_rmap_get(struct kvm_rmap_head *rmap_head)
{
return atomic_long_read(&rmap_head->val) & ~KVM_RMAP_LOCKED;
}
/*
* If mmu_lock isn't held, rmaps can only be locked in read-only mode. The
* actual locking is the same, but the caller is disallowed from modifying the
* rmap, and so the unlock flow is a nop if the rmap is/was empty.
*/
static unsigned long kvm_rmap_lock_readonly(struct kvm_rmap_head *rmap_head)
{
unsigned long rmap_val;
preempt_disable();
rmap_val = __kvm_rmap_lock(rmap_head);
if (!rmap_val)
preempt_enable();
return rmap_val;
}
static void kvm_rmap_unlock_readonly(struct kvm_rmap_head *rmap_head,
unsigned long old_val)
{
if (!old_val)
return;
KVM_MMU_WARN_ON(old_val != kvm_rmap_get(rmap_head));
__kvm_rmap_unlock(rmap_head, old_val);
preempt_enable();
}
/*
* Returns the number of pointers in the rmap chain, not counting the new one.
*/
static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte,
struct kvm_rmap_head *rmap_head)
static int pte_list_add(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
u64 *spte, struct kvm_rmap_head *rmap_head)
{
unsigned long old_val, new_val;
struct pte_list_desc *desc;
int count = 0;
if (!rmap_head->val) {
rmap_head->val = (unsigned long)spte;
} else if (!(rmap_head->val & KVM_RMAP_MANY)) {
old_val = kvm_rmap_lock(kvm, rmap_head);
if (!old_val) {
new_val = (unsigned long)spte;
} else if (!(old_val & KVM_RMAP_MANY)) {
desc = kvm_mmu_memory_cache_alloc(cache);
desc->sptes[0] = (u64 *)rmap_head->val;
desc->sptes[0] = (u64 *)old_val;
desc->sptes[1] = spte;
desc->spte_count = 2;
desc->tail_count = 0;
rmap_head->val = (unsigned long)desc | KVM_RMAP_MANY;
new_val = (unsigned long)desc | KVM_RMAP_MANY;
++count;
} else {
desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
desc = (struct pte_list_desc *)(old_val & ~KVM_RMAP_MANY);
count = desc->tail_count + desc->spte_count;
/*
@@ -887,21 +1028,25 @@ static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte,
*/
if (desc->spte_count == PTE_LIST_EXT) {
desc = kvm_mmu_memory_cache_alloc(cache);
desc->more = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
desc->more = (struct pte_list_desc *)(old_val & ~KVM_RMAP_MANY);
desc->spte_count = 0;
desc->tail_count = count;
rmap_head->val = (unsigned long)desc | KVM_RMAP_MANY;
new_val = (unsigned long)desc | KVM_RMAP_MANY;
} else {
new_val = old_val;
}
desc->sptes[desc->spte_count++] = spte;
}
kvm_rmap_unlock(kvm, rmap_head, new_val);
return count;
}
static void pte_list_desc_remove_entry(struct kvm *kvm,
struct kvm_rmap_head *rmap_head,
static void pte_list_desc_remove_entry(struct kvm *kvm, unsigned long *rmap_val,
struct pte_list_desc *desc, int i)
{
struct pte_list_desc *head_desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
struct pte_list_desc *head_desc = (struct pte_list_desc *)(*rmap_val & ~KVM_RMAP_MANY);
int j = head_desc->spte_count - 1;
/*
@@ -928,9 +1073,9 @@ static void pte_list_desc_remove_entry(struct kvm *kvm,
* head at the next descriptor, i.e. the new head.
*/
if (!head_desc->more)
rmap_head->val = 0;
*rmap_val = 0;
else
rmap_head->val = (unsigned long)head_desc->more | KVM_RMAP_MANY;
*rmap_val = (unsigned long)head_desc->more | KVM_RMAP_MANY;
mmu_free_pte_list_desc(head_desc);
}
@@ -938,24 +1083,26 @@ static void pte_list_remove(struct kvm *kvm, u64 *spte,
struct kvm_rmap_head *rmap_head)
{
struct pte_list_desc *desc;
unsigned long rmap_val;
int i;
if (KVM_BUG_ON_DATA_CORRUPTION(!rmap_head->val, kvm))
return;
rmap_val = kvm_rmap_lock(kvm, rmap_head);
if (KVM_BUG_ON_DATA_CORRUPTION(!rmap_val, kvm))
goto out;
if (!(rmap_head->val & KVM_RMAP_MANY)) {
if (KVM_BUG_ON_DATA_CORRUPTION((u64 *)rmap_head->val != spte, kvm))
return;
if (!(rmap_val & KVM_RMAP_MANY)) {
if (KVM_BUG_ON_DATA_CORRUPTION((u64 *)rmap_val != spte, kvm))
goto out;
rmap_head->val = 0;
rmap_val = 0;
} else {
desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
desc = (struct pte_list_desc *)(rmap_val & ~KVM_RMAP_MANY);
while (desc) {
for (i = 0; i < desc->spte_count; ++i) {
if (desc->sptes[i] == spte) {
pte_list_desc_remove_entry(kvm, rmap_head,
pte_list_desc_remove_entry(kvm, &rmap_val,
desc, i);
return;
goto out;
}
}
desc = desc->more;
@@ -963,6 +1110,9 @@ static void pte_list_remove(struct kvm *kvm, u64 *spte,
KVM_BUG_ON_DATA_CORRUPTION(true, kvm);
}
out:
kvm_rmap_unlock(kvm, rmap_head, rmap_val);
}
static void kvm_zap_one_rmap_spte(struct kvm *kvm,
@@ -977,17 +1127,19 @@ static bool kvm_zap_all_rmap_sptes(struct kvm *kvm,
struct kvm_rmap_head *rmap_head)
{
struct pte_list_desc *desc, *next;
unsigned long rmap_val;
int i;
if (!rmap_head->val)
rmap_val = kvm_rmap_lock(kvm, rmap_head);
if (!rmap_val)
return false;
if (!(rmap_head->val & KVM_RMAP_MANY)) {
mmu_spte_clear_track_bits(kvm, (u64 *)rmap_head->val);
if (!(rmap_val & KVM_RMAP_MANY)) {
mmu_spte_clear_track_bits(kvm, (u64 *)rmap_val);
goto out;
}
desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
desc = (struct pte_list_desc *)(rmap_val & ~KVM_RMAP_MANY);
for (; desc; desc = next) {
for (i = 0; i < desc->spte_count; i++)
@@ -997,20 +1149,21 @@ static bool kvm_zap_all_rmap_sptes(struct kvm *kvm,
}
out:
/* rmap_head is meaningless now, remember to reset it */
rmap_head->val = 0;
kvm_rmap_unlock(kvm, rmap_head, 0);
return true;
}
unsigned int pte_list_count(struct kvm_rmap_head *rmap_head)
{
unsigned long rmap_val = kvm_rmap_get(rmap_head);
struct pte_list_desc *desc;
if (!rmap_head->val)
if (!rmap_val)
return 0;
else if (!(rmap_head->val & KVM_RMAP_MANY))
else if (!(rmap_val & KVM_RMAP_MANY))
return 1;
desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
desc = (struct pte_list_desc *)(rmap_val & ~KVM_RMAP_MANY);
return desc->tail_count + desc->spte_count;
}
@@ -1053,6 +1206,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
*/
struct rmap_iterator {
/* private fields */
struct rmap_head *head;
struct pte_list_desc *desc; /* holds the sptep if not NULL */
int pos; /* index of the sptep */
};
@@ -1067,23 +1221,19 @@ struct rmap_iterator {
static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head,
struct rmap_iterator *iter)
{
u64 *sptep;
unsigned long rmap_val = kvm_rmap_get(rmap_head);
if (!rmap_head->val)
if (!rmap_val)
return NULL;
if (!(rmap_head->val & KVM_RMAP_MANY)) {
if (!(rmap_val & KVM_RMAP_MANY)) {
iter->desc = NULL;
sptep = (u64 *)rmap_head->val;
goto out;
return (u64 *)rmap_val;
}
iter->desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
iter->desc = (struct pte_list_desc *)(rmap_val & ~KVM_RMAP_MANY);
iter->pos = 0;
sptep = iter->desc->sptes[iter->pos];
out:
BUG_ON(!is_shadow_present_pte(*sptep));
return sptep;
return iter->desc->sptes[iter->pos];
}
/*
@@ -1093,14 +1243,11 @@ static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head,
*/
static u64 *rmap_get_next(struct rmap_iterator *iter)
{
u64 *sptep;
if (iter->desc) {
if (iter->pos < PTE_LIST_EXT - 1) {
++iter->pos;
sptep = iter->desc->sptes[iter->pos];
if (sptep)
goto out;
if (iter->desc->sptes[iter->pos])
return iter->desc->sptes[iter->pos];
}
iter->desc = iter->desc->more;
@@ -1108,20 +1255,24 @@ static u64 *rmap_get_next(struct rmap_iterator *iter)
if (iter->desc) {
iter->pos = 0;
/* desc->sptes[0] cannot be NULL */
sptep = iter->desc->sptes[iter->pos];
goto out;
return iter->desc->sptes[iter->pos];
}
}
return NULL;
out:
BUG_ON(!is_shadow_present_pte(*sptep));
return sptep;
}
#define for_each_rmap_spte(_rmap_head_, _iter_, _spte_) \
for (_spte_ = rmap_get_first(_rmap_head_, _iter_); \
_spte_; _spte_ = rmap_get_next(_iter_))
#define __for_each_rmap_spte(_rmap_head_, _iter_, _sptep_) \
for (_sptep_ = rmap_get_first(_rmap_head_, _iter_); \
_sptep_; _sptep_ = rmap_get_next(_iter_))
#define for_each_rmap_spte(_rmap_head_, _iter_, _sptep_) \
__for_each_rmap_spte(_rmap_head_, _iter_, _sptep_) \
if (!WARN_ON_ONCE(!is_shadow_present_pte(*(_sptep_)))) \
#define for_each_rmap_spte_lockless(_rmap_head_, _iter_, _sptep_, _spte_) \
__for_each_rmap_spte(_rmap_head_, _iter_, _sptep_) \
if (is_shadow_present_pte(_spte_ = mmu_spte_get_lockless(_sptep_)))
static void drop_spte(struct kvm *kvm, u64 *sptep)
{
@@ -1207,12 +1358,13 @@ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
struct rmap_iterator iter;
bool flush = false;
for_each_rmap_spte(rmap_head, &iter, sptep)
for_each_rmap_spte(rmap_head, &iter, sptep) {
if (spte_ad_need_write_protect(*sptep))
flush |= test_and_clear_bit(PT_WRITABLE_SHIFT,
(unsigned long *)sptep);
else
flush |= spte_clear_dirty(sptep);
}
return flush;
}
@@ -1401,7 +1553,7 @@ static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
while (++iterator->rmap <= iterator->end_rmap) {
iterator->gfn += KVM_PAGES_PER_HPAGE(iterator->level);
if (iterator->rmap->val)
if (atomic_long_read(&iterator->rmap->val))
return;
}
@@ -1533,7 +1685,7 @@ static void __rmap_add(struct kvm *kvm,
kvm_update_page_stats(kvm, sp->role.level, 1);
rmap_head = gfn_to_rmap(gfn, sp->role.level, slot);
rmap_count = pte_list_add(cache, spte, rmap_head);
rmap_count = pte_list_add(kvm, cache, spte, rmap_head);
if (rmap_count > kvm->stat.max_mmu_rmap_size)
kvm->stat.max_mmu_rmap_size = rmap_count;
@@ -1552,51 +1704,67 @@ static void rmap_add(struct kvm_vcpu *vcpu, const struct kvm_memory_slot *slot,
}
static bool kvm_rmap_age_gfn_range(struct kvm *kvm,
struct kvm_gfn_range *range, bool test_only)
struct kvm_gfn_range *range,
bool test_only)
{
struct slot_rmap_walk_iterator iterator;
struct kvm_rmap_head *rmap_head;
struct rmap_iterator iter;
unsigned long rmap_val;
bool young = false;
u64 *sptep;
gfn_t gfn;
int level;
u64 spte;
for_each_slot_rmap_range(range->slot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
range->start, range->end - 1, &iterator) {
for_each_rmap_spte(iterator.rmap, &iter, sptep) {
u64 spte = *sptep;
for (level = PG_LEVEL_4K; level <= KVM_MAX_HUGEPAGE_LEVEL; level++) {
for (gfn = range->start; gfn < range->end;
gfn += KVM_PAGES_PER_HPAGE(level)) {
rmap_head = gfn_to_rmap(gfn, level, range->slot);
rmap_val = kvm_rmap_lock_readonly(rmap_head);
if (!is_accessed_spte(spte))
continue;
for_each_rmap_spte_lockless(rmap_head, &iter, sptep, spte) {
if (!is_accessed_spte(spte))
continue;
if (test_only)
return true;
if (test_only) {
kvm_rmap_unlock_readonly(rmap_head, rmap_val);
return true;
}
if (spte_ad_enabled(spte)) {
clear_bit((ffs(shadow_accessed_mask) - 1),
(unsigned long *)sptep);
} else {
/*
* WARN if mmu_spte_update() signals the need
* for a TLB flush, as Access tracking a SPTE
* should never trigger an _immediate_ flush.
*/
spte = mark_spte_for_access_track(spte);
WARN_ON_ONCE(mmu_spte_update(sptep, spte));
if (spte_ad_enabled(spte))
clear_bit((ffs(shadow_accessed_mask) - 1),
(unsigned long *)sptep);
else
/*
* If the following cmpxchg fails, the
* spte is being concurrently modified
* and should most likely stay young.
*/
cmpxchg64(sptep, spte,
mark_spte_for_access_track(spte));
young = true;
}
young = true;
kvm_rmap_unlock_readonly(rmap_head, rmap_val);
}
}
return young;
}
static bool kvm_may_have_shadow_mmu_sptes(struct kvm *kvm)
{
return !tdp_mmu_enabled || READ_ONCE(kvm->arch.indirect_shadow_pages);
}
bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
bool young = false;
if (kvm_memslots_have_rmaps(kvm))
young = kvm_rmap_age_gfn_range(kvm, range, false);
if (tdp_mmu_enabled)
young |= kvm_tdp_mmu_age_gfn_range(kvm, range);
young = kvm_tdp_mmu_age_gfn_range(kvm, range);
if (kvm_may_have_shadow_mmu_sptes(kvm))
young |= kvm_rmap_age_gfn_range(kvm, range, false);
return young;
}
@@ -1605,11 +1773,14 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
bool young = false;
if (kvm_memslots_have_rmaps(kvm))
young = kvm_rmap_age_gfn_range(kvm, range, true);
if (tdp_mmu_enabled)
young |= kvm_tdp_mmu_test_age_gfn(kvm, range);
young = kvm_tdp_mmu_test_age_gfn(kvm, range);
if (young)
return young;
if (kvm_may_have_shadow_mmu_sptes(kvm))
young |= kvm_rmap_age_gfn_range(kvm, range, true);
return young;
}
@@ -1656,13 +1827,14 @@ static unsigned kvm_page_table_hashfn(gfn_t gfn)
return hash_64(gfn, KVM_MMU_HASH_SHIFT);
}
static void mmu_page_add_parent_pte(struct kvm_mmu_memory_cache *cache,
static void mmu_page_add_parent_pte(struct kvm *kvm,
struct kvm_mmu_memory_cache *cache,
struct kvm_mmu_page *sp, u64 *parent_pte)
{
if (!parent_pte)
return;
pte_list_add(cache, parent_pte, &sp->parent_ptes);
pte_list_add(kvm, cache, parent_pte, &sp->parent_ptes);
}
static void mmu_page_remove_parent_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
@@ -2352,7 +2524,7 @@ static void __link_shadow_page(struct kvm *kvm,
mmu_spte_set(sptep, spte);
mmu_page_add_parent_pte(cache, sp, sptep);
mmu_page_add_parent_pte(kvm, cache, sp, sptep);
/*
* The non-direct sub-pagetable must be updated before linking. For
@@ -2416,7 +2588,8 @@ static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
* avoids retaining a large number of stale nested SPs.
*/
if (tdp_enabled && invalid_list &&
child->role.guest_mode && !child->parent_ptes.val)
child->role.guest_mode &&
!atomic_long_read(&child->parent_ptes.val))
return kvm_mmu_prepare_zap_page(kvm, child,
invalid_list);
}


@@ -129,25 +129,32 @@ static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
}
/*
* Returns true if the SPTE has bits that may be set without holding mmu_lock.
* The caller is responsible for checking if the SPTE is shadow-present, and
* for determining whether or not the caller cares about non-leaf SPTEs.
* Returns true if the SPTE needs to be updated atomically due to having bits
* that may be changed without holding mmu_lock, and for which KVM must not
* lose information. E.g. KVM must not drop Dirty bit information. The caller
* is responsible for checking if the SPTE is shadow-present, and for
* determining whether or not the caller cares about non-leaf SPTEs.
*/
bool spte_has_volatile_bits(u64 spte)
bool spte_needs_atomic_update(u64 spte)
{
/* SPTEs can be made Writable bit by KVM's fast page fault handler. */
if (!is_writable_pte(spte) && is_mmu_writable_spte(spte))
return true;
if (is_access_track_spte(spte))
/*
* A/D-disabled SPTEs can be access-tracked by aging, and access-tracked
* SPTEs can be restored by KVM's fast page fault handler.
*/
if (!spte_ad_enabled(spte))
return true;
if (spte_ad_enabled(spte)) {
if (!(spte & shadow_accessed_mask) ||
(is_writable_pte(spte) && !(spte & shadow_dirty_mask)))
return true;
}
return false;
/*
* Dirty and Accessed bits can be set by the CPU. Ignore the Accessed
* bit, as KVM tolerates false negatives/positives, e.g. KVM doesn't
* invalidate TLBs when aging SPTEs, and so it's safe to clobber the
* Accessed bit (and rare in practice).
*/
return is_writable_pte(spte) && !(spte & shadow_dirty_mask);
}
bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,


@@ -519,7 +519,7 @@ static inline u64 get_mmio_spte_generation(u64 spte)
return gen;
}
bool spte_has_volatile_bits(u64 spte);
bool spte_needs_atomic_update(u64 spte);
bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
const struct kvm_memory_slot *slot,


@@ -25,6 +25,13 @@ static inline u64 kvm_tdp_mmu_write_spte_atomic(tdp_ptep_t sptep, u64 new_spte)
return xchg(rcu_dereference(sptep), new_spte);
}
static inline u64 tdp_mmu_clear_spte_bits_atomic(tdp_ptep_t sptep, u64 mask)
{
atomic64_t *sptep_atomic = (atomic64_t *)rcu_dereference(sptep);
return (u64)atomic64_fetch_and(~mask, sptep_atomic);
}
static inline void __kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 new_spte)
{
KVM_MMU_WARN_ON(is_ept_ve_possible(new_spte));
@@ -32,28 +39,21 @@ static inline void __kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 new_spte)
}
/*
* SPTEs must be modified atomically if they are shadow-present, leaf
* SPTEs, and have volatile bits, i.e. has bits that can be set outside
* of mmu_lock. The Writable bit can be set by KVM's fast page fault
* handler, and Accessed and Dirty bits can be set by the CPU.
*
* Note, non-leaf SPTEs do have Accessed bits and those bits are
* technically volatile, but KVM doesn't consume the Accessed bit of
* non-leaf SPTEs, i.e. KVM doesn't care if it clobbers the bit. This
* logic needs to be reassessed if KVM were to use non-leaf Accessed
* bits, e.g. to skip stepping down into child SPTEs when aging SPTEs.
* SPTEs must be modified atomically if they are shadow-present, leaf SPTEs,
* and have volatile bits (bits that can be set outside of mmu_lock) that
* must not be clobbered.
*/
static inline bool kvm_tdp_mmu_spte_need_atomic_write(u64 old_spte, int level)
static inline bool kvm_tdp_mmu_spte_need_atomic_update(u64 old_spte, int level)
{
return is_shadow_present_pte(old_spte) &&
is_last_spte(old_spte, level) &&
spte_has_volatile_bits(old_spte);
spte_needs_atomic_update(old_spte);
}
static inline u64 kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 old_spte,
u64 new_spte, int level)
{
if (kvm_tdp_mmu_spte_need_atomic_write(old_spte, level))
if (kvm_tdp_mmu_spte_need_atomic_update(old_spte, level))
return kvm_tdp_mmu_write_spte_atomic(sptep, new_spte);
__kvm_tdp_mmu_write_spte(sptep, new_spte);
@@ -63,12 +63,8 @@ static inline u64 kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 old_spte,
static inline u64 tdp_mmu_clear_spte_bits(tdp_ptep_t sptep, u64 old_spte,
u64 mask, int level)
{
atomic64_t *sptep_atomic;
if (kvm_tdp_mmu_spte_need_atomic_write(old_spte, level)) {
sptep_atomic = (atomic64_t *)rcu_dereference(sptep);
return (u64)atomic64_fetch_and(~mask, sptep_atomic);
}
if (kvm_tdp_mmu_spte_need_atomic_update(old_spte, level))
return tdp_mmu_clear_spte_bits_atomic(sptep, mask);
__kvm_tdp_mmu_write_spte(sptep, old_spte & ~mask);
return old_spte;


@@ -193,6 +193,19 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
!tdp_mmu_root_match((_root), (_types)))) { \
} else
/*
* Iterate over all TDP MMU roots in an RCU read-side critical section.
* It is safe to iterate over the SPTEs under the root, but their values will
* be unstable, so all writes must be atomic. As this routine is meant to be
* used without holding the mmu_lock at all, any bits that are flipped must
* be reflected in kvm_tdp_mmu_spte_need_atomic_write().
*/
#define for_each_tdp_mmu_root_rcu(_kvm, _root, _as_id, _types) \
list_for_each_entry_rcu(_root, &_kvm->arch.tdp_mmu_roots, link) \
if ((_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) || \
!tdp_mmu_root_match((_root), (_types))) { \
} else
#define for_each_valid_tdp_mmu_root(_kvm, _root, _as_id) \
__for_each_tdp_mmu_root(_kvm, _root, _as_id, KVM_VALID_ROOTS)
@@ -774,9 +787,6 @@ static inline void tdp_mmu_iter_set_spte(struct kvm *kvm, struct tdp_iter *iter,
continue; \
else
#define tdp_mmu_for_each_pte(_iter, _kvm, _root, _start, _end) \
for_each_tdp_pte(_iter, _kvm, _root, _start, _end)
static inline bool __must_check tdp_mmu_iter_need_resched(struct kvm *kvm,
struct tdp_iter *iter)
{
@@ -1235,7 +1245,7 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
rcu_read_lock();
tdp_mmu_for_each_pte(iter, kvm, root, fault->gfn, fault->gfn + 1) {
for_each_tdp_pte(iter, kvm, root, fault->gfn, fault->gfn + 1) {
int r;
if (fault->nx_huge_page_workaround_enabled)
@@ -1332,21 +1342,22 @@ bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
* from the clear_young() or clear_flush_young() notifier, which uses the
* return value to determine if the page has been accessed.
*/
static void kvm_tdp_mmu_age_spte(struct tdp_iter *iter)
static void kvm_tdp_mmu_age_spte(struct kvm *kvm, struct tdp_iter *iter)
{
u64 new_spte;
if (spte_ad_enabled(iter->old_spte)) {
iter->old_spte = tdp_mmu_clear_spte_bits(iter->sptep,
iter->old_spte,
shadow_accessed_mask,
iter->level);
iter->old_spte = tdp_mmu_clear_spte_bits_atomic(iter->sptep,
shadow_accessed_mask);
new_spte = iter->old_spte & ~shadow_accessed_mask;
} else {
new_spte = mark_spte_for_access_track(iter->old_spte);
iter->old_spte = kvm_tdp_mmu_write_spte(iter->sptep,
iter->old_spte, new_spte,
iter->level);
/*
* It is safe for the following cmpxchg to fail. Leave the
* Accessed bit set, as the spte is most likely young anyway.
*/
if (__tdp_mmu_set_spte_atomic(kvm, iter, new_spte))
return;
}
trace_kvm_tdp_mmu_spte_changed(iter->as_id, iter->gfn, iter->level,
@@ -1371,9 +1382,9 @@ static bool __kvm_tdp_mmu_age_gfn_range(struct kvm *kvm,
* valid roots!
*/
WARN_ON(types & ~KVM_VALID_ROOTS);
__for_each_tdp_mmu_root(kvm, root, range->slot->as_id, types) {
guard(rcu)();
guard(rcu)();
for_each_tdp_mmu_root_rcu(kvm, root, range->slot->as_id, types) {
tdp_root_for_each_leaf_pte(iter, kvm, root, range->start, range->end) {
if (!is_accessed_spte(iter.old_spte))
continue;
@@ -1382,7 +1393,7 @@ static bool __kvm_tdp_mmu_age_gfn_range(struct kvm *kvm,
return true;
ret = true;
kvm_tdp_mmu_age_spte(&iter);
kvm_tdp_mmu_age_spte(kvm, &iter);
}
}
@@ -1904,7 +1915,7 @@ int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
*root_level = vcpu->arch.mmu->root_role.level;
tdp_mmu_for_each_pte(iter, vcpu->kvm, root, gfn, gfn + 1) {
for_each_tdp_pte(iter, vcpu->kvm, root, gfn, gfn + 1) {
leaf = iter.level;
sptes[leaf] = iter.old_spte;
}
@@ -1931,7 +1942,7 @@ u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gfn_t gfn,
struct tdp_iter iter;
tdp_ptep_t sptep = NULL;
tdp_mmu_for_each_pte(iter, vcpu->kvm, root, gfn, gfn + 1) {
for_each_tdp_pte(iter, vcpu->kvm, root, gfn, gfn + 1) {
*spte = iter.old_spte;
sptep = iter.sptep;
}


@@ -267,6 +267,7 @@ struct kvm_gfn_range {
union kvm_mmu_notifier_arg arg;
enum kvm_gfn_range_filter attr_filter;
bool may_block;
bool lockless;
};
bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range);
bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);


@@ -104,6 +104,10 @@ config KVM_ELIDE_TLB_FLUSH_IF_YOUNG
depends on KVM_GENERIC_MMU_NOTIFIER
bool
config KVM_MMU_LOCKLESS_AGING
depends on KVM_GENERIC_MMU_NOTIFIER
bool
config KVM_GENERIC_MEMORY_ATTRIBUTES
depends on KVM_GENERIC_MMU_NOTIFIER
bool


@@ -517,6 +517,7 @@ struct kvm_mmu_notifier_range {
on_lock_fn_t on_lock;
bool flush_on_ret;
bool may_block;
bool lockless;
};
/*
@@ -551,8 +552,8 @@ static void kvm_null_fn(void)
node; \
node = interval_tree_iter_next(node, start, last)) \
static __always_inline kvm_mn_ret_t __kvm_handle_hva_range(struct kvm *kvm,
const struct kvm_mmu_notifier_range *range)
static __always_inline kvm_mn_ret_t kvm_handle_hva_range(struct kvm *kvm,
const struct kvm_mmu_notifier_range *range)
{
struct kvm_mmu_notifier_return r = {
.ret = false,
@@ -571,6 +572,10 @@ static __always_inline kvm_mn_ret_t __kvm_handle_hva_range(struct kvm *kvm,
IS_KVM_NULL_FN(range->handler)))
return r;
/* on_lock will never be called for lockless walks */
if (WARN_ON_ONCE(range->lockless && !IS_KVM_NULL_FN(range->on_lock)))
return r;
idx = srcu_read_lock(&kvm->srcu);
for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
@@ -607,15 +612,18 @@ static __always_inline kvm_mn_ret_t __kvm_handle_hva_range(struct kvm *kvm,
gfn_range.start = hva_to_gfn_memslot(hva_start, slot);
gfn_range.end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, slot);
gfn_range.slot = slot;
gfn_range.lockless = range->lockless;
if (!r.found_memslot) {
r.found_memslot = true;
KVM_MMU_LOCK(kvm);
if (!IS_KVM_NULL_FN(range->on_lock))
range->on_lock(kvm);
if (!range->lockless) {
KVM_MMU_LOCK(kvm);
if (!IS_KVM_NULL_FN(range->on_lock))
range->on_lock(kvm);
if (IS_KVM_NULL_FN(range->handler))
goto mmu_unlock;
if (IS_KVM_NULL_FN(range->handler))
goto mmu_unlock;
}
}
r.ret |= range->handler(kvm, &gfn_range);
}
@@ -625,7 +633,7 @@ static __always_inline kvm_mn_ret_t __kvm_handle_hva_range(struct kvm *kvm,
kvm_flush_remote_tlbs(kvm);
mmu_unlock:
if (r.found_memslot)
if (r.found_memslot && !range->lockless)
KVM_MMU_UNLOCK(kvm);
srcu_read_unlock(&kvm->srcu, idx);
@@ -633,7 +641,7 @@ static __always_inline kvm_mn_ret_t __kvm_handle_hva_range(struct kvm *kvm,
return r;
}
static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn,
static __always_inline int kvm_age_hva_range(struct mmu_notifier *mn,
unsigned long start,
unsigned long end,
gfn_handler_t handler,
@@ -647,17 +655,18 @@ static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn,
.on_lock = (void *)kvm_null_fn,
.flush_on_ret = flush_on_ret,
.may_block = false,
.lockless = IS_ENABLED(CONFIG_KVM_MMU_LOCKLESS_AGING),
};
return __kvm_handle_hva_range(kvm, &range).ret;
return kvm_handle_hva_range(kvm, &range).ret;
}
static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn,
unsigned long start,
unsigned long end,
gfn_handler_t handler)
static __always_inline int kvm_age_hva_range_no_flush(struct mmu_notifier *mn,
unsigned long start,
unsigned long end,
gfn_handler_t handler)
{
return kvm_handle_hva_range(mn, start, end, handler, false);
return kvm_age_hva_range(mn, start, end, handler, false);
}
void kvm_mmu_invalidate_begin(struct kvm *kvm)
@@ -752,7 +761,7 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
* that guest memory has been reclaimed. This needs to be done *after*
* dropping mmu_lock, as x86's reclaim path is slooooow.
*/
if (__kvm_handle_hva_range(kvm, &hva_range).found_memslot)
if (kvm_handle_hva_range(kvm, &hva_range).found_memslot)
kvm_arch_guest_memory_reclaimed(kvm);
return 0;
@@ -798,7 +807,7 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
};
bool wake;
__kvm_handle_hva_range(kvm, &hva_range);
kvm_handle_hva_range(kvm, &hva_range);
/* Pairs with the increment in range_start(). */
spin_lock(&kvm->mn_invalidate_lock);
@@ -822,8 +831,8 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
{
trace_kvm_age_hva(start, end);
return kvm_handle_hva_range(mn, start, end, kvm_age_gfn,
!IS_ENABLED(CONFIG_KVM_ELIDE_TLB_FLUSH_IF_YOUNG));
return kvm_age_hva_range(mn, start, end, kvm_age_gfn,
!IS_ENABLED(CONFIG_KVM_ELIDE_TLB_FLUSH_IF_YOUNG));
}
static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
@@ -846,7 +855,7 @@ static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
* cadence. If we find this inaccurate, we might come up with a
* more sophisticated heuristic later.
*/
return kvm_handle_hva_range_no_flush(mn, start, end, kvm_age_gfn);
return kvm_age_hva_range_no_flush(mn, start, end, kvm_age_gfn);
}
static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
@@ -855,8 +864,8 @@ static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
{
trace_kvm_test_age_hva(address);
return kvm_handle_hva_range_no_flush(mn, address, address + 1,
kvm_test_age_gfn);
return kvm_age_hva_range_no_flush(mn, address, address + 1,
kvm_test_age_gfn);
}
static void kvm_mmu_notifier_release(struct mmu_notifier *mn,