Mirror of https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git, synced 2026-05-04 04:28:10 -04:00
Merge tag 'kvm-x86-mmu-6.15' of https://github.com/kvm-x86/linux into HEAD
KVM x86/mmu changes for 6.15

Add support for "fast" aging of SPTEs in both the TDP MMU and Shadow MMU, where "fast" means "without holding mmu_lock". Not taking mmu_lock allows multiple aging actions to run in parallel, and more importantly avoids stalling vCPUs, e.g. due to holding mmu_lock for an extended duration while a vCPU is faulting in memory.

For the TDP MMU, protect aging via RCU; the page tables are RCU-protected and KVM doesn't need to access any metadata to age SPTEs.

For the Shadow MMU, use bit 1 of rmap pointers (bit 0 is used to terminate a list of rmaps) to implement a per-rmap single-bit spinlock. When aging a gfn, acquire the rmap's spinlock with read-only permissions, which allows hardening and optimizing the locking and aging, e.g. locking an rmap for write requires mmu_lock to also be held. The lock is NOT a true R/W spinlock, i.e. multiple concurrent readers aren't supported.

To avoid forcing all SPTE updates to use atomic operations (clearing the Accessed bit out of mmu_lock makes it inherently volatile), rework and rename spte_has_volatile_bits() to spte_needs_atomic_update() and deliberately exclude the Accessed bit. KVM (and mm/) already tolerates false positives/negatives for Accessed information, and all testing has shown that reducing the latency of aging is far more beneficial to overall system performance than providing "perfect" young/old information.
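As a rough, self-contained sketch of the per-rmap bit-spinlock idea described above (a simplified illustration, not the in-tree code; the names rmap_bit_lock()/rmap_bit_unlock() and RMAP_* are hypothetical, and the real implementation, which also elides the lock entirely for empty rmaps, is __kvm_rmap_lock()/__kvm_rmap_unlock() in the mmu.c hunks below):

    #include <linux/atomic.h>
    #include <linux/bits.h>
    #include <linux/processor.h>

    /*
     * Simplified sketch: bit 0 of the rmap word already distinguishes "single
     * spte pointer" from "pointer to a pte_list_desc", so bit 1 is free to
     * serve as a lock bit.  Illustrative names, not the in-tree identifiers.
     */
    #define RMAP_MANY	BIT(0)
    #define RMAP_LOCKED	BIT(1)

    static unsigned long rmap_bit_lock(atomic_long_t *rmap)
    {
    	unsigned long old = atomic_long_read(rmap);

    	do {
    		/* Wait for the current holder to release the lock bit. */
    		while (old & RMAP_LOCKED) {
    			cpu_relax();
    			old = atomic_long_read(rmap);
    		}
    		/* Acquire ordering keeps the rmap walk inside the critical section. */
    	} while (!atomic_long_try_cmpxchg_acquire(rmap, &old, old | RMAP_LOCKED));

    	return old;	/* old value, i.e. without the lock bit set */
    }

    static void rmap_bit_unlock(atomic_long_t *rmap, unsigned long val)
    {
    	/* Release ordering pairs with the acquire in rmap_bit_lock(). */
    	atomic_long_set_release(rmap, val);
    }

A read-only walker (aging) unlocks with the value it observed at lock time, so the rmap is left unmodified; only writers holding mmu_lock for write may publish a new value at unlock, which is what makes the lockless walks stable.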
@@ -196,7 +196,7 @@ writable between reading spte and updating spte. Like below case:
The Dirty bit is lost in this case.

In order to avoid this kind of issue, we always treat the spte as "volatile"
if it can be updated out of mmu-lock [see spte_has_volatile_bits()]; it means
if it can be updated out of mmu-lock [see spte_needs_atomic_update()]; it means
the spte is always atomically updated in this case.

3) flush tlbs due to spte updated

@@ -212,7 +212,7 @@ function to update spte (present -> present).

Since the spte is "volatile" if it can be updated out of mmu-lock, we always
atomically update the spte and the race caused by fast page fault can be avoided.
See the comments in spte_has_volatile_bits() and mmu_spte_update().
See the comments in spte_needs_atomic_update() and mmu_spte_update().

Lockless Access Tracking:
@@ -27,6 +27,7 @@
#include <linux/kfifo.h>
#include <linux/sched/vhost_task.h>
#include <linux/call_once.h>
#include <linux/atomic.h>

#include <asm/apic.h>
#include <asm/pvclock-abi.h>

@@ -405,7 +406,7 @@ union kvm_cpu_role {
};

struct kvm_rmap_head {
unsigned long val;
atomic_long_t val;
};

struct kvm_pio_request {

@@ -1479,6 +1480,7 @@ struct kvm_arch {
* tdp_mmu_page set.
*
* For reads, this list is protected by:
* RCU alone or
* the MMU lock in read mode + RCU or
* the MMU lock in write mode
*
@@ -22,6 +22,7 @@ config KVM_X86
select KVM_COMMON
select KVM_GENERIC_MMU_NOTIFIER
select KVM_ELIDE_TLB_FLUSH_IF_YOUNG
select KVM_MMU_LOCKLESS_AGING
select HAVE_KVM_IRQCHIP
select HAVE_KVM_PFNCACHE
select HAVE_KVM_DIRTY_RING_TSO
@@ -501,7 +501,7 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte)
return false;
}

if (!spte_has_volatile_bits(old_spte))
if (!spte_needs_atomic_update(old_spte))
__update_clear_spte_fast(sptep, new_spte);
else
old_spte = __update_clear_spte_slow(sptep, new_spte);

@@ -524,7 +524,7 @@ static u64 mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep)
int level = sptep_to_sp(sptep)->role.level;

if (!is_shadow_present_pte(old_spte) ||
!spte_has_volatile_bits(old_spte))
!spte_needs_atomic_update(old_spte))
__update_clear_spte_fast(sptep, SHADOW_NONPRESENT_VALUE);
else
old_spte = __update_clear_spte_slow(sptep, SHADOW_NONPRESENT_VALUE);
@@ -853,32 +853,173 @@ static struct kvm_memory_slot *gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu
* About rmap_head encoding:
*
* If the bit zero of rmap_head->val is clear, then it points to the only spte
* in this rmap chain. Otherwise, (rmap_head->val & ~1) points to a struct
* in this rmap chain. Otherwise, (rmap_head->val & ~3) points to a struct
* pte_list_desc containing more mappings.
*/
#define KVM_RMAP_MANY BIT(0)

/*
* rmaps and PTE lists are mostly protected by mmu_lock (the shadow MMU always
* operates with mmu_lock held for write), but rmaps can be walked without
* holding mmu_lock so long as the caller can tolerate SPTEs in the rmap chain
* being zapped/dropped _while the rmap is locked_.
*
* Other than the KVM_RMAP_LOCKED flag, modifications to rmap entries must be
* done while holding mmu_lock for write. This allows a task walking rmaps
* without holding mmu_lock to concurrently walk the same entries as a task
* that is holding mmu_lock but _not_ the rmap lock. Neither task will modify
* the rmaps, thus the walks are stable.
*
* As alluded to above, SPTEs in rmaps are _not_ protected by KVM_RMAP_LOCKED,
* only the rmap chains themselves are protected. E.g. holding an rmap's lock
* ensures all "struct pte_list_desc" fields are stable.
*/
#define KVM_RMAP_LOCKED BIT(1)
static unsigned long __kvm_rmap_lock(struct kvm_rmap_head *rmap_head)
{
unsigned long old_val, new_val;

lockdep_assert_preemption_disabled();

/*
* Elide the lock if the rmap is empty, as lockless walkers (read-only
* mode) don't need to (and can't) walk an empty rmap, nor can they add
* entries to the rmap. I.e. the only paths that process empty rmaps
* do so while holding mmu_lock for write, and are mutually exclusive.
*/
old_val = atomic_long_read(&rmap_head->val);
if (!old_val)
return 0;

do {
/*
* If the rmap is locked, wait for it to be unlocked before
* trying acquire the lock, e.g. to avoid bouncing the cache
* line.
*/
while (old_val & KVM_RMAP_LOCKED) {
cpu_relax();
old_val = atomic_long_read(&rmap_head->val);
}

/*
* Recheck for an empty rmap, it may have been purged by the
* task that held the lock.
*/
if (!old_val)
return 0;

new_val = old_val | KVM_RMAP_LOCKED;
/*
* Use try_cmpxchg_acquire() to prevent reads and writes to the rmap
* from being reordered outside of the critical section created by
* __kvm_rmap_lock().
*
* Pairs with the atomic_long_set_release() in kvm_rmap_unlock().
*
* For the !old_val case, no ordering is needed, as there is no rmap
* to walk.
*/
} while (!atomic_long_try_cmpxchg_acquire(&rmap_head->val, &old_val, new_val));

/*
* Return the old value, i.e. _without_ the LOCKED bit set. It's
* impossible for the return value to be 0 (see above), i.e. the read-
* only unlock flow can't get a false positive and fail to unlock.
*/
return old_val;
}
static unsigned long kvm_rmap_lock(struct kvm *kvm,
struct kvm_rmap_head *rmap_head)
{
lockdep_assert_held_write(&kvm->mmu_lock);

return __kvm_rmap_lock(rmap_head);
}

static void __kvm_rmap_unlock(struct kvm_rmap_head *rmap_head,
unsigned long val)
{
KVM_MMU_WARN_ON(val & KVM_RMAP_LOCKED);
/*
* Ensure that all accesses to the rmap have completed before unlocking
* the rmap.
*
* Pairs with the atomic_long_try_cmpxchg_acquire() in __kvm_rmap_lock().
*/
atomic_long_set_release(&rmap_head->val, val);
}

static void kvm_rmap_unlock(struct kvm *kvm,
struct kvm_rmap_head *rmap_head,
unsigned long new_val)
{
lockdep_assert_held_write(&kvm->mmu_lock);

__kvm_rmap_unlock(rmap_head, new_val);
}

static unsigned long kvm_rmap_get(struct kvm_rmap_head *rmap_head)
{
return atomic_long_read(&rmap_head->val) & ~KVM_RMAP_LOCKED;
}

/*
* If mmu_lock isn't held, rmaps can only be locked in read-only mode. The
* actual locking is the same, but the caller is disallowed from modifying the
* rmap, and so the unlock flow is a nop if the rmap is/was empty.
*/
static unsigned long kvm_rmap_lock_readonly(struct kvm_rmap_head *rmap_head)
{
unsigned long rmap_val;

preempt_disable();
rmap_val = __kvm_rmap_lock(rmap_head);

if (!rmap_val)
preempt_enable();

return rmap_val;
}

static void kvm_rmap_unlock_readonly(struct kvm_rmap_head *rmap_head,
unsigned long old_val)
{
if (!old_val)
return;

KVM_MMU_WARN_ON(old_val != kvm_rmap_get(rmap_head));

__kvm_rmap_unlock(rmap_head, old_val);
preempt_enable();
}
/*
* Returns the number of pointers in the rmap chain, not counting the new one.
*/
static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte,
struct kvm_rmap_head *rmap_head)
static int pte_list_add(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
u64 *spte, struct kvm_rmap_head *rmap_head)
{
unsigned long old_val, new_val;
struct pte_list_desc *desc;
int count = 0;

if (!rmap_head->val) {
rmap_head->val = (unsigned long)spte;
} else if (!(rmap_head->val & KVM_RMAP_MANY)) {
old_val = kvm_rmap_lock(kvm, rmap_head);

if (!old_val) {
new_val = (unsigned long)spte;
} else if (!(old_val & KVM_RMAP_MANY)) {
desc = kvm_mmu_memory_cache_alloc(cache);
desc->sptes[0] = (u64 *)rmap_head->val;
desc->sptes[0] = (u64 *)old_val;
desc->sptes[1] = spte;
desc->spte_count = 2;
desc->tail_count = 0;
rmap_head->val = (unsigned long)desc | KVM_RMAP_MANY;
new_val = (unsigned long)desc | KVM_RMAP_MANY;
++count;
} else {
desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
desc = (struct pte_list_desc *)(old_val & ~KVM_RMAP_MANY);
count = desc->tail_count + desc->spte_count;

/*
@@ -887,21 +1028,25 @@ static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte,
*/
if (desc->spte_count == PTE_LIST_EXT) {
desc = kvm_mmu_memory_cache_alloc(cache);
desc->more = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
desc->more = (struct pte_list_desc *)(old_val & ~KVM_RMAP_MANY);
desc->spte_count = 0;
desc->tail_count = count;
rmap_head->val = (unsigned long)desc | KVM_RMAP_MANY;
new_val = (unsigned long)desc | KVM_RMAP_MANY;
} else {
new_val = old_val;
}
desc->sptes[desc->spte_count++] = spte;
}

kvm_rmap_unlock(kvm, rmap_head, new_val);

return count;
}
static void pte_list_desc_remove_entry(struct kvm *kvm,
struct kvm_rmap_head *rmap_head,
static void pte_list_desc_remove_entry(struct kvm *kvm, unsigned long *rmap_val,
struct pte_list_desc *desc, int i)
{
struct pte_list_desc *head_desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
struct pte_list_desc *head_desc = (struct pte_list_desc *)(*rmap_val & ~KVM_RMAP_MANY);
int j = head_desc->spte_count - 1;

/*
@@ -928,9 +1073,9 @@ static void pte_list_desc_remove_entry(struct kvm *kvm,
* head at the next descriptor, i.e. the new head.
*/
if (!head_desc->more)
rmap_head->val = 0;
*rmap_val = 0;
else
rmap_head->val = (unsigned long)head_desc->more | KVM_RMAP_MANY;
*rmap_val = (unsigned long)head_desc->more | KVM_RMAP_MANY;
mmu_free_pte_list_desc(head_desc);
}
@@ -938,24 +1083,26 @@ static void pte_list_remove(struct kvm *kvm, u64 *spte,
struct kvm_rmap_head *rmap_head)
{
struct pte_list_desc *desc;
unsigned long rmap_val;
int i;

if (KVM_BUG_ON_DATA_CORRUPTION(!rmap_head->val, kvm))
return;
rmap_val = kvm_rmap_lock(kvm, rmap_head);
if (KVM_BUG_ON_DATA_CORRUPTION(!rmap_val, kvm))
goto out;

if (!(rmap_head->val & KVM_RMAP_MANY)) {
if (KVM_BUG_ON_DATA_CORRUPTION((u64 *)rmap_head->val != spte, kvm))
return;
if (!(rmap_val & KVM_RMAP_MANY)) {
if (KVM_BUG_ON_DATA_CORRUPTION((u64 *)rmap_val != spte, kvm))
goto out;

rmap_head->val = 0;
rmap_val = 0;
} else {
desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
desc = (struct pte_list_desc *)(rmap_val & ~KVM_RMAP_MANY);
while (desc) {
for (i = 0; i < desc->spte_count; ++i) {
if (desc->sptes[i] == spte) {
pte_list_desc_remove_entry(kvm, rmap_head,
pte_list_desc_remove_entry(kvm, &rmap_val,
desc, i);
return;
goto out;
}
}
desc = desc->more;
@@ -963,6 +1110,9 @@ static void pte_list_remove(struct kvm *kvm, u64 *spte,

KVM_BUG_ON_DATA_CORRUPTION(true, kvm);
}

out:
kvm_rmap_unlock(kvm, rmap_head, rmap_val);
}
static void kvm_zap_one_rmap_spte(struct kvm *kvm,

@@ -977,17 +1127,19 @@ static bool kvm_zap_all_rmap_sptes(struct kvm *kvm,
struct kvm_rmap_head *rmap_head)
{
struct pte_list_desc *desc, *next;
unsigned long rmap_val;
int i;

if (!rmap_head->val)
rmap_val = kvm_rmap_lock(kvm, rmap_head);
if (!rmap_val)
return false;

if (!(rmap_head->val & KVM_RMAP_MANY)) {
mmu_spte_clear_track_bits(kvm, (u64 *)rmap_head->val);
if (!(rmap_val & KVM_RMAP_MANY)) {
mmu_spte_clear_track_bits(kvm, (u64 *)rmap_val);
goto out;
}

desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
desc = (struct pte_list_desc *)(rmap_val & ~KVM_RMAP_MANY);

for (; desc; desc = next) {
for (i = 0; i < desc->spte_count; i++)

@@ -997,20 +1149,21 @@ static bool kvm_zap_all_rmap_sptes(struct kvm *kvm,
}
out:
/* rmap_head is meaningless now, remember to reset it */
rmap_head->val = 0;
kvm_rmap_unlock(kvm, rmap_head, 0);
return true;
}

unsigned int pte_list_count(struct kvm_rmap_head *rmap_head)
{
unsigned long rmap_val = kvm_rmap_get(rmap_head);
struct pte_list_desc *desc;

if (!rmap_head->val)
if (!rmap_val)
return 0;
else if (!(rmap_head->val & KVM_RMAP_MANY))
else if (!(rmap_val & KVM_RMAP_MANY))
return 1;

desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
desc = (struct pte_list_desc *)(rmap_val & ~KVM_RMAP_MANY);
return desc->tail_count + desc->spte_count;
}
@@ -1053,6 +1206,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
*/
struct rmap_iterator {
/* private fields */
struct rmap_head *head;
struct pte_list_desc *desc; /* holds the sptep if not NULL */
int pos; /* index of the sptep */
};

@@ -1067,23 +1221,19 @@ struct rmap_iterator {
static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head,
struct rmap_iterator *iter)
{
u64 *sptep;
unsigned long rmap_val = kvm_rmap_get(rmap_head);

if (!rmap_head->val)
if (!rmap_val)
return NULL;

if (!(rmap_head->val & KVM_RMAP_MANY)) {
if (!(rmap_val & KVM_RMAP_MANY)) {
iter->desc = NULL;
sptep = (u64 *)rmap_head->val;
goto out;
return (u64 *)rmap_val;
}

iter->desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
iter->desc = (struct pte_list_desc *)(rmap_val & ~KVM_RMAP_MANY);
iter->pos = 0;
sptep = iter->desc->sptes[iter->pos];
out:
BUG_ON(!is_shadow_present_pte(*sptep));
return sptep;
return iter->desc->sptes[iter->pos];
}
/*
@@ -1093,14 +1243,11 @@ static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head,
*/
static u64 *rmap_get_next(struct rmap_iterator *iter)
{
u64 *sptep;

if (iter->desc) {
if (iter->pos < PTE_LIST_EXT - 1) {
++iter->pos;
sptep = iter->desc->sptes[iter->pos];
if (sptep)
goto out;
if (iter->desc->sptes[iter->pos])
return iter->desc->sptes[iter->pos];
}

iter->desc = iter->desc->more;

@@ -1108,20 +1255,24 @@ static u64 *rmap_get_next(struct rmap_iterator *iter)
if (iter->desc) {
iter->pos = 0;
/* desc->sptes[0] cannot be NULL */
sptep = iter->desc->sptes[iter->pos];
goto out;
return iter->desc->sptes[iter->pos];
}
}

return NULL;
out:
BUG_ON(!is_shadow_present_pte(*sptep));
return sptep;
}
#define for_each_rmap_spte(_rmap_head_, _iter_, _spte_) \
for (_spte_ = rmap_get_first(_rmap_head_, _iter_); \
_spte_; _spte_ = rmap_get_next(_iter_))
#define __for_each_rmap_spte(_rmap_head_, _iter_, _sptep_) \
for (_sptep_ = rmap_get_first(_rmap_head_, _iter_); \
_sptep_; _sptep_ = rmap_get_next(_iter_))

#define for_each_rmap_spte(_rmap_head_, _iter_, _sptep_) \
__for_each_rmap_spte(_rmap_head_, _iter_, _sptep_) \
if (!WARN_ON_ONCE(!is_shadow_present_pte(*(_sptep_)))) \

#define for_each_rmap_spte_lockless(_rmap_head_, _iter_, _sptep_, _spte_) \
__for_each_rmap_spte(_rmap_head_, _iter_, _sptep_) \
if (is_shadow_present_pte(_spte_ = mmu_spte_get_lockless(sptep)))

static void drop_spte(struct kvm *kvm, u64 *sptep)
{
@@ -1207,12 +1358,13 @@ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
struct rmap_iterator iter;
bool flush = false;

for_each_rmap_spte(rmap_head, &iter, sptep)
for_each_rmap_spte(rmap_head, &iter, sptep) {
if (spte_ad_need_write_protect(*sptep))
flush |= test_and_clear_bit(PT_WRITABLE_SHIFT,
(unsigned long *)sptep);
else
flush |= spte_clear_dirty(sptep);
}

return flush;
}
@@ -1401,7 +1553,7 @@ static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
while (++iterator->rmap <= iterator->end_rmap) {
iterator->gfn += KVM_PAGES_PER_HPAGE(iterator->level);

if (iterator->rmap->val)
if (atomic_long_read(&iterator->rmap->val))
return;
}
@@ -1533,7 +1685,7 @@ static void __rmap_add(struct kvm *kvm,
kvm_update_page_stats(kvm, sp->role.level, 1);

rmap_head = gfn_to_rmap(gfn, sp->role.level, slot);
rmap_count = pte_list_add(cache, spte, rmap_head);
rmap_count = pte_list_add(kvm, cache, spte, rmap_head);

if (rmap_count > kvm->stat.max_mmu_rmap_size)
kvm->stat.max_mmu_rmap_size = rmap_count;
@@ -1552,51 +1704,67 @@ static void rmap_add(struct kvm_vcpu *vcpu, const struct kvm_memory_slot *slot,
}

static bool kvm_rmap_age_gfn_range(struct kvm *kvm,
struct kvm_gfn_range *range, bool test_only)
struct kvm_gfn_range *range,
bool test_only)
{
struct slot_rmap_walk_iterator iterator;
struct kvm_rmap_head *rmap_head;
struct rmap_iterator iter;
unsigned long rmap_val;
bool young = false;
u64 *sptep;
gfn_t gfn;
int level;
u64 spte;

for_each_slot_rmap_range(range->slot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
range->start, range->end - 1, &iterator) {
for_each_rmap_spte(iterator.rmap, &iter, sptep) {
u64 spte = *sptep;
for (level = PG_LEVEL_4K; level <= KVM_MAX_HUGEPAGE_LEVEL; level++) {
for (gfn = range->start; gfn < range->end;
gfn += KVM_PAGES_PER_HPAGE(level)) {
rmap_head = gfn_to_rmap(gfn, level, range->slot);
rmap_val = kvm_rmap_lock_readonly(rmap_head);

if (!is_accessed_spte(spte))
continue;
for_each_rmap_spte_lockless(rmap_head, &iter, sptep, spte) {
if (!is_accessed_spte(spte))
continue;

if (test_only)
return true;
if (test_only) {
kvm_rmap_unlock_readonly(rmap_head, rmap_val);
return true;
}

if (spte_ad_enabled(spte)) {
clear_bit((ffs(shadow_accessed_mask) - 1),
(unsigned long *)sptep);
} else {
/*
* WARN if mmu_spte_update() signals the need
* for a TLB flush, as Access tracking a SPTE
* should never trigger an _immediate_ flush.
*/
spte = mark_spte_for_access_track(spte);
WARN_ON_ONCE(mmu_spte_update(sptep, spte));
if (spte_ad_enabled(spte))
clear_bit((ffs(shadow_accessed_mask) - 1),
(unsigned long *)sptep);
else
/*
* If the following cmpxchg fails, the
* spte is being concurrently modified
* and should most likely stay young.
*/
cmpxchg64(sptep, spte,
mark_spte_for_access_track(spte));
young = true;
}
young = true;

kvm_rmap_unlock_readonly(rmap_head, rmap_val);
}
}
return young;
}
static bool kvm_may_have_shadow_mmu_sptes(struct kvm *kvm)
{
return !tdp_mmu_enabled || READ_ONCE(kvm->arch.indirect_shadow_pages);
}

bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
bool young = false;

if (kvm_memslots_have_rmaps(kvm))
young = kvm_rmap_age_gfn_range(kvm, range, false);

if (tdp_mmu_enabled)
young |= kvm_tdp_mmu_age_gfn_range(kvm, range);
young = kvm_tdp_mmu_age_gfn_range(kvm, range);

if (kvm_may_have_shadow_mmu_sptes(kvm))
young |= kvm_rmap_age_gfn_range(kvm, range, false);

return young;
}
@@ -1605,11 +1773,14 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
bool young = false;

if (kvm_memslots_have_rmaps(kvm))
young = kvm_rmap_age_gfn_range(kvm, range, true);

if (tdp_mmu_enabled)
young |= kvm_tdp_mmu_test_age_gfn(kvm, range);
young = kvm_tdp_mmu_test_age_gfn(kvm, range);

if (young)
return young;

if (kvm_may_have_shadow_mmu_sptes(kvm))
young |= kvm_rmap_age_gfn_range(kvm, range, true);

return young;
}
@@ -1656,13 +1827,14 @@ static unsigned kvm_page_table_hashfn(gfn_t gfn)
return hash_64(gfn, KVM_MMU_HASH_SHIFT);
}

static void mmu_page_add_parent_pte(struct kvm_mmu_memory_cache *cache,
static void mmu_page_add_parent_pte(struct kvm *kvm,
struct kvm_mmu_memory_cache *cache,
struct kvm_mmu_page *sp, u64 *parent_pte)
{
if (!parent_pte)
return;

pte_list_add(cache, parent_pte, &sp->parent_ptes);
pte_list_add(kvm, cache, parent_pte, &sp->parent_ptes);
}

static void mmu_page_remove_parent_pte(struct kvm *kvm, struct kvm_mmu_page *sp,

@@ -2352,7 +2524,7 @@ static void __link_shadow_page(struct kvm *kvm,
mmu_spte_set(sptep, spte);

mmu_page_add_parent_pte(cache, sp, sptep);
mmu_page_add_parent_pte(kvm, cache, sp, sptep);

/*
* The non-direct sub-pagetable must be updated before linking. For

@@ -2416,7 +2588,8 @@ static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
* avoids retaining a large number of stale nested SPs.
*/
if (tdp_enabled && invalid_list &&
child->role.guest_mode && !child->parent_ptes.val)
child->role.guest_mode &&
!atomic_long_read(&child->parent_ptes.val))
return kvm_mmu_prepare_zap_page(kvm, child,
invalid_list);
}
@@ -129,25 +129,32 @@ static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
}

/*
* Returns true if the SPTE has bits that may be set without holding mmu_lock.
* The caller is responsible for checking if the SPTE is shadow-present, and
* for determining whether or not the caller cares about non-leaf SPTEs.
* Returns true if the SPTE needs to be updated atomically due to having bits
* that may be changed without holding mmu_lock, and for which KVM must not
* lose information. E.g. KVM must not drop Dirty bit information. The caller
* is responsible for checking if the SPTE is shadow-present, and for
* determining whether or not the caller cares about non-leaf SPTEs.
*/
bool spte_has_volatile_bits(u64 spte)
bool spte_needs_atomic_update(u64 spte)
{
/* SPTEs can be made Writable bit by KVM's fast page fault handler. */
if (!is_writable_pte(spte) && is_mmu_writable_spte(spte))
return true;

if (is_access_track_spte(spte))
/*
* A/D-disabled SPTEs can be access-tracked by aging, and access-tracked
* SPTEs can be restored by KVM's fast page fault handler.
*/
if (!spte_ad_enabled(spte))
return true;

if (spte_ad_enabled(spte)) {
if (!(spte & shadow_accessed_mask) ||
(is_writable_pte(spte) && !(spte & shadow_dirty_mask)))
return true;
}

return false;
/*
* Dirty and Accessed bits can be set by the CPU. Ignore the Accessed
* bit, as KVM tolerates false negatives/positives, e.g. KVM doesn't
* invalidate TLBs when aging SPTEs, and so it's safe to clobber the
* Accessed bit (and rare in practice).
*/
return is_writable_pte(spte) && !(spte & shadow_dirty_mask);
}

bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
@@ -519,7 +519,7 @@ static inline u64 get_mmio_spte_generation(u64 spte)
return gen;
}

bool spte_has_volatile_bits(u64 spte);
bool spte_needs_atomic_update(u64 spte);

bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
const struct kvm_memory_slot *slot,
@@ -25,6 +25,13 @@ static inline u64 kvm_tdp_mmu_write_spte_atomic(tdp_ptep_t sptep, u64 new_spte)
return xchg(rcu_dereference(sptep), new_spte);
}

static inline u64 tdp_mmu_clear_spte_bits_atomic(tdp_ptep_t sptep, u64 mask)
{
atomic64_t *sptep_atomic = (atomic64_t *)rcu_dereference(sptep);

return (u64)atomic64_fetch_and(~mask, sptep_atomic);
}

static inline void __kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 new_spte)
{
KVM_MMU_WARN_ON(is_ept_ve_possible(new_spte));

@@ -32,28 +39,21 @@ static inline void __kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 new_spte)
}

/*
* SPTEs must be modified atomically if they are shadow-present, leaf
* SPTEs, and have volatile bits, i.e. has bits that can be set outside
* of mmu_lock. The Writable bit can be set by KVM's fast page fault
* handler, and Accessed and Dirty bits can be set by the CPU.
*
* Note, non-leaf SPTEs do have Accessed bits and those bits are
* technically volatile, but KVM doesn't consume the Accessed bit of
* non-leaf SPTEs, i.e. KVM doesn't care if it clobbers the bit. This
* logic needs to be reassessed if KVM were to use non-leaf Accessed
* bits, e.g. to skip stepping down into child SPTEs when aging SPTEs.
* SPTEs must be modified atomically if they are shadow-present, leaf SPTEs,
* and have volatile bits (bits that can be set outside of mmu_lock) that
* must not be clobbered.
*/
static inline bool kvm_tdp_mmu_spte_need_atomic_write(u64 old_spte, int level)
static inline bool kvm_tdp_mmu_spte_need_atomic_update(u64 old_spte, int level)
{
return is_shadow_present_pte(old_spte) &&
is_last_spte(old_spte, level) &&
spte_has_volatile_bits(old_spte);
spte_needs_atomic_update(old_spte);
}

static inline u64 kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 old_spte,
u64 new_spte, int level)
{
if (kvm_tdp_mmu_spte_need_atomic_write(old_spte, level))
if (kvm_tdp_mmu_spte_need_atomic_update(old_spte, level))
return kvm_tdp_mmu_write_spte_atomic(sptep, new_spte);

__kvm_tdp_mmu_write_spte(sptep, new_spte);

@@ -63,12 +63,8 @@ static inline u64 kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 old_spte,
static inline u64 tdp_mmu_clear_spte_bits(tdp_ptep_t sptep, u64 old_spte,
u64 mask, int level)
{
atomic64_t *sptep_atomic;

if (kvm_tdp_mmu_spte_need_atomic_write(old_spte, level)) {
sptep_atomic = (atomic64_t *)rcu_dereference(sptep);
return (u64)atomic64_fetch_and(~mask, sptep_atomic);
}
if (kvm_tdp_mmu_spte_need_atomic_update(old_spte, level))
return tdp_mmu_clear_spte_bits_atomic(sptep, mask);

__kvm_tdp_mmu_write_spte(sptep, old_spte & ~mask);
return old_spte;
@@ -193,6 +193,19 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
!tdp_mmu_root_match((_root), (_types)))) { \
} else

/*
* Iterate over all TDP MMU roots in an RCU read-side critical section.
* It is safe to iterate over the SPTEs under the root, but their values will
* be unstable, so all writes must be atomic. As this routine is meant to be
* used without holding the mmu_lock at all, any bits that are flipped must
* be reflected in kvm_tdp_mmu_spte_need_atomic_write().
*/
#define for_each_tdp_mmu_root_rcu(_kvm, _root, _as_id, _types) \
list_for_each_entry_rcu(_root, &_kvm->arch.tdp_mmu_roots, link) \
if ((_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) || \
!tdp_mmu_root_match((_root), (_types))) { \
} else

#define for_each_valid_tdp_mmu_root(_kvm, _root, _as_id) \
__for_each_tdp_mmu_root(_kvm, _root, _as_id, KVM_VALID_ROOTS)

@@ -774,9 +787,6 @@ static inline void tdp_mmu_iter_set_spte(struct kvm *kvm, struct tdp_iter *iter,
continue; \
else

#define tdp_mmu_for_each_pte(_iter, _kvm, _root, _start, _end) \
for_each_tdp_pte(_iter, _kvm, _root, _start, _end)

static inline bool __must_check tdp_mmu_iter_need_resched(struct kvm *kvm,
struct tdp_iter *iter)
{
@@ -1235,7 +1245,7 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
rcu_read_lock();

tdp_mmu_for_each_pte(iter, kvm, root, fault->gfn, fault->gfn + 1) {
for_each_tdp_pte(iter, kvm, root, fault->gfn, fault->gfn + 1) {
int r;

if (fault->nx_huge_page_workaround_enabled)
@@ -1332,21 +1342,22 @@ bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
* from the clear_young() or clear_flush_young() notifier, which uses the
* return value to determine if the page has been accessed.
*/
static void kvm_tdp_mmu_age_spte(struct tdp_iter *iter)
static void kvm_tdp_mmu_age_spte(struct kvm *kvm, struct tdp_iter *iter)
{
u64 new_spte;

if (spte_ad_enabled(iter->old_spte)) {
iter->old_spte = tdp_mmu_clear_spte_bits(iter->sptep,
iter->old_spte,
shadow_accessed_mask,
iter->level);
iter->old_spte = tdp_mmu_clear_spte_bits_atomic(iter->sptep,
shadow_accessed_mask);
new_spte = iter->old_spte & ~shadow_accessed_mask;
} else {
new_spte = mark_spte_for_access_track(iter->old_spte);
iter->old_spte = kvm_tdp_mmu_write_spte(iter->sptep,
iter->old_spte, new_spte,
iter->level);
/*
* It is safe for the following cmpxchg to fail. Leave the
* Accessed bit set, as the spte is most likely young anyway.
*/
if (__tdp_mmu_set_spte_atomic(kvm, iter, new_spte))
return;
}

trace_kvm_tdp_mmu_spte_changed(iter->as_id, iter->gfn, iter->level,
@@ -1371,9 +1382,9 @@ static bool __kvm_tdp_mmu_age_gfn_range(struct kvm *kvm,
* valid roots!
*/
WARN_ON(types & ~KVM_VALID_ROOTS);
__for_each_tdp_mmu_root(kvm, root, range->slot->as_id, types) {
guard(rcu)();

guard(rcu)();
for_each_tdp_mmu_root_rcu(kvm, root, range->slot->as_id, types) {
tdp_root_for_each_leaf_pte(iter, kvm, root, range->start, range->end) {
if (!is_accessed_spte(iter.old_spte))
continue;

@@ -1382,7 +1393,7 @@ static bool __kvm_tdp_mmu_age_gfn_range(struct kvm *kvm,
return true;

ret = true;
kvm_tdp_mmu_age_spte(&iter);
kvm_tdp_mmu_age_spte(kvm, &iter);
}
}
@@ -1904,7 +1915,7 @@ int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
*root_level = vcpu->arch.mmu->root_role.level;

tdp_mmu_for_each_pte(iter, vcpu->kvm, root, gfn, gfn + 1) {
for_each_tdp_pte(iter, vcpu->kvm, root, gfn, gfn + 1) {
leaf = iter.level;
sptes[leaf] = iter.old_spte;
}

@@ -1931,7 +1942,7 @@ u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gfn_t gfn,
struct tdp_iter iter;
tdp_ptep_t sptep = NULL;

tdp_mmu_for_each_pte(iter, vcpu->kvm, root, gfn, gfn + 1) {
for_each_tdp_pte(iter, vcpu->kvm, root, gfn, gfn + 1) {
*spte = iter.old_spte;
sptep = iter.sptep;
}
@@ -267,6 +267,7 @@ struct kvm_gfn_range {
union kvm_mmu_notifier_arg arg;
enum kvm_gfn_range_filter attr_filter;
bool may_block;
bool lockless;
};
bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range);
bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
@@ -104,6 +104,10 @@ config KVM_ELIDE_TLB_FLUSH_IF_YOUNG
depends on KVM_GENERIC_MMU_NOTIFIER
bool

config KVM_MMU_LOCKLESS_AGING
depends on KVM_GENERIC_MMU_NOTIFIER
bool

config KVM_GENERIC_MEMORY_ATTRIBUTES
depends on KVM_GENERIC_MMU_NOTIFIER
bool
@@ -517,6 +517,7 @@ struct kvm_mmu_notifier_range {
on_lock_fn_t on_lock;
bool flush_on_ret;
bool may_block;
bool lockless;
};

/*
@@ -551,8 +552,8 @@ static void kvm_null_fn(void)
node; \
node = interval_tree_iter_next(node, start, last)) \

static __always_inline kvm_mn_ret_t __kvm_handle_hva_range(struct kvm *kvm,
const struct kvm_mmu_notifier_range *range)
static __always_inline kvm_mn_ret_t kvm_handle_hva_range(struct kvm *kvm,
const struct kvm_mmu_notifier_range *range)
{
struct kvm_mmu_notifier_return r = {
.ret = false,

@@ -571,6 +572,10 @@ static __always_inline kvm_mn_ret_t __kvm_handle_hva_range(struct kvm *kvm,
IS_KVM_NULL_FN(range->handler)))
return r;

/* on_lock will never be called for lockless walks */
if (WARN_ON_ONCE(range->lockless && !IS_KVM_NULL_FN(range->on_lock)))
return r;

idx = srcu_read_lock(&kvm->srcu);

for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {

@@ -607,15 +612,18 @@ static __always_inline kvm_mn_ret_t __kvm_handle_hva_range(struct kvm *kvm,
gfn_range.start = hva_to_gfn_memslot(hva_start, slot);
gfn_range.end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, slot);
gfn_range.slot = slot;
gfn_range.lockless = range->lockless;

if (!r.found_memslot) {
r.found_memslot = true;
KVM_MMU_LOCK(kvm);
if (!IS_KVM_NULL_FN(range->on_lock))
range->on_lock(kvm);
if (!range->lockless) {
KVM_MMU_LOCK(kvm);
if (!IS_KVM_NULL_FN(range->on_lock))
range->on_lock(kvm);

if (IS_KVM_NULL_FN(range->handler))
goto mmu_unlock;
if (IS_KVM_NULL_FN(range->handler))
goto mmu_unlock;
}
}
r.ret |= range->handler(kvm, &gfn_range);
}

@@ -625,7 +633,7 @@ static __always_inline kvm_mn_ret_t __kvm_handle_hva_range(struct kvm *kvm,
kvm_flush_remote_tlbs(kvm);

mmu_unlock:
if (r.found_memslot)
if (r.found_memslot && !range->lockless)
KVM_MMU_UNLOCK(kvm);

srcu_read_unlock(&kvm->srcu, idx);

@@ -633,7 +641,7 @@ static __always_inline kvm_mn_ret_t __kvm_handle_hva_range(struct kvm *kvm,
return r;
}
static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn,
static __always_inline int kvm_age_hva_range(struct mmu_notifier *mn,
unsigned long start,
unsigned long end,
gfn_handler_t handler,

@@ -647,17 +655,18 @@ static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn,
.on_lock = (void *)kvm_null_fn,
.flush_on_ret = flush_on_ret,
.may_block = false,
.lockless = IS_ENABLED(CONFIG_KVM_MMU_LOCKLESS_AGING),
};

return __kvm_handle_hva_range(kvm, &range).ret;
return kvm_handle_hva_range(kvm, &range).ret;
}

static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn,
unsigned long start,
unsigned long end,
gfn_handler_t handler)
static __always_inline int kvm_age_hva_range_no_flush(struct mmu_notifier *mn,
unsigned long start,
unsigned long end,
gfn_handler_t handler)
{
return kvm_handle_hva_range(mn, start, end, handler, false);
return kvm_age_hva_range(mn, start, end, handler, false);
}
void kvm_mmu_invalidate_begin(struct kvm *kvm)

@@ -752,7 +761,7 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
* that guest memory has been reclaimed. This needs to be done *after*
* dropping mmu_lock, as x86's reclaim path is slooooow.
*/
if (__kvm_handle_hva_range(kvm, &hva_range).found_memslot)
if (kvm_handle_hva_range(kvm, &hva_range).found_memslot)
kvm_arch_guest_memory_reclaimed(kvm);

return 0;

@@ -798,7 +807,7 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
};
bool wake;

__kvm_handle_hva_range(kvm, &hva_range);
kvm_handle_hva_range(kvm, &hva_range);

/* Pairs with the increment in range_start(). */
spin_lock(&kvm->mn_invalidate_lock);
@@ -822,8 +831,8 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
{
trace_kvm_age_hva(start, end);

return kvm_handle_hva_range(mn, start, end, kvm_age_gfn,
!IS_ENABLED(CONFIG_KVM_ELIDE_TLB_FLUSH_IF_YOUNG));
return kvm_age_hva_range(mn, start, end, kvm_age_gfn,
!IS_ENABLED(CONFIG_KVM_ELIDE_TLB_FLUSH_IF_YOUNG));
}

static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,

@@ -846,7 +855,7 @@ static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
* cadence. If we find this inaccurate, we might come up with a
* more sophisticated heuristic later.
*/
return kvm_handle_hva_range_no_flush(mn, start, end, kvm_age_gfn);
return kvm_age_hva_range_no_flush(mn, start, end, kvm_age_gfn);
}

static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,

@@ -855,8 +864,8 @@ static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
{
trace_kvm_test_age_hva(address);

return kvm_handle_hva_range_no_flush(mn, address, address + 1,
kvm_test_age_gfn);
return kvm_age_hva_range_no_flush(mn, address, address + 1,
kvm_test_age_gfn);
}

static void kvm_mmu_notifier_release(struct mmu_notifier *mn,