[RFC PATCH v6 31/35] KVM: arm64: Handle MMU notifiers for the SPE buffer

Alexandru Elisei alexandru.elisei at arm.com
Fri Nov 14 08:07:12 PST 2025


KVM makes changes to the stage 2 for two reasons: to mirror the changes
that happen to the host's stage 1, and to carry out the changes that
userspace makes directly to the memory of the virtual machine.

Explicit changes made to the VM by userspace - like changing a memslot or
clearing the list of dirty pages - are immediately honored for memory
pinned and mapped at stage 2 for the SPE buffer. The only caveat is making
the buffer pages read-only: to avoid a blackout window, KVM skips
write-protecting the affected buffer pages and instead immediately
redirties them.

Changes to the host's stage 1 that affect the stage 2 entries for the
buffer broadly fall into two categories: changes that are attempted, but
never executed because the memory is pinned, and changes that are
committed.

The first type of change shares a common cause: the reference count for a
page or folio is incremented with the page table spinlock held, but the MMU
notifiers must be invoked from a preemptible context. As a result, features
like THP collapse (khugepaged), automatic NUMA balancing, KSM, etc. use
the following pattern for modifying the host's stage 1:

	mmu_notifier_invalidate_range_start(&range)
	pte = pte_offset_map_lock(.., &ptl)
	if (page_maybe_dma_pinned(page))
		goto out_unlock
	/* do stuff */

out_unlock:
	spin_unlock(ptl)
	mmu_notifier_invalidate_range_end(&range)

It is safe for KVM to ignore this type of change, because the host's page
tables won't actually be modified.
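
The buffer pages hit this check because they are pinned with
pin_user_pages(); page_maybe_dma_pinned() detects the elevated FOLL_PIN
reference. As a sketch only - the exact flags below are illustrative, not
taken from this patch:

	/* Long-term pin of one buffer page, later seen by page_maybe_dma_pinned(). */
	ret = pin_user_pages(uaddr, 1, FOLL_WRITE | FOLL_LONGTERM, &page);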

Changes to the host's stage 1 that are committed will be reflected in the
buffer's stage 2 entries. The only exception is clearing the access flag,
which is skipped for the same reason as making the entries read-only.
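
For example, for access flag clearing (the age notifications), a pinned
buffer page is simply reported as young and its stage 2 entry is left
untouched. Roughly, as a sketch only - gfn_is_pinned_for_spe() is an
illustrative stand-in for the xarray lookup done in the patch:

	/* SPE can write a record at any time, so a pinned page is always young. */
	if (gfn_is_pinned_for_spe(kvm, gfn))
		return true;	/* young; don't clear the stage 2 access flag */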

Signed-off-by: Alexandru Elisei <alexandru.elisei at arm.com>
---
 arch/arm64/include/asm/kvm_host.h |   2 +
 arch/arm64/include/asm/kvm_mmu.h  |   7 +-
 arch/arm64/include/asm/kvm_spe.h  |  19 +++
 arch/arm64/kvm/arm.c              |  14 +-
 arch/arm64/kvm/mmu.c              | 125 ++++++++++++----
 arch/arm64/kvm/nested.c           |   9 +-
 arch/arm64/kvm/spe.c              | 232 +++++++++++++++++++++++++++++-
 arch/arm64/kvm/sys_regs.c         |   5 +-
 arch/arm64/kvm/vgic/vgic-its.c    |   4 +-
 include/kvm/arm_vgic.h            |   2 +
 include/linux/kvm_host.h          |   2 +
 11 files changed, 380 insertions(+), 41 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 876957320672..e79ec480d1d1 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -351,6 +351,8 @@ struct kvm_arch {
 #define KVM_ARCH_FLAG_GUEST_HAS_SVE			9
 	/* MIDR_EL1, REVIDR_EL1, and AIDR_EL1 are writable from userspace */
 #define KVM_ARCH_FLAG_WRITABLE_IMP_ID_REGS		10
+	/* Statistical Profiling Extension enabled for the guest */
+#define KVM_ARCH_FLAG_SPE_ENABLED			11
 	unsigned long flags;
 
 	/* VM-wide vCPU feature set */
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 37b84e9d4337..a4a0e00d1bbb 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -100,6 +100,10 @@ alternative_cb_end
 #include <asm/kvm_host.h>
 #include <asm/kvm_nested.h>
 
+#define KVM_MMU_NOTIFY_CMO		KVM_MMU_NOTIFY_ARCH1
+#define KVM_MMU_NOTIFY_SHADOW_S2	KVM_MMU_NOTIFY_ARCH2
+#define KVM_MMU_NOTIFY_SPLIT_HUGE_PAGE	KVM_MMU_NOTIFY_ARCH3
+
 void kvm_update_va_mask(struct alt_instr *alt,
 			__le32 *origptr, __le32 *updptr, int nr_inst);
 void kvm_compute_layout(void);
@@ -168,8 +172,9 @@ int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
 int create_hyp_stack(phys_addr_t phys_addr, unsigned long *haddr);
 void __init free_hyp_pgds(void);
 
+enum kvm_mmu_notifier_event;
 void kvm_stage2_unmap_range(struct kvm_s2_mmu *mmu, phys_addr_t start,
-			    u64 size, bool may_block);
+			    u64 size, bool may_block, enum kvm_mmu_notifier_event event);
 void kvm_stage2_flush_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end);
 void kvm_stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end);
 
diff --git a/arch/arm64/include/asm/kvm_spe.h b/arch/arm64/include/asm/kvm_spe.h
index 6c091fbfc95d..59a0e825a226 100644
--- a/arch/arm64/include/asm/kvm_spe.h
+++ b/arch/arm64/include/asm/kvm_spe.h
@@ -8,6 +8,8 @@
 
 #include <linux/kvm.h>
 
+#include <asm/stage2_pgtable.h>
+
 #ifdef CONFIG_KVM_ARM_SPE
 
 struct kvm_spe {
@@ -15,6 +17,7 @@ struct kvm_spe {
 	struct arm_spe_pmu *arm_spu;
 	u64 max_buffer_size;	/* Maximum per VCPU buffer size */
 	u64 guest_pmscr_el2;
+	bool dirtying_pages;
 };
 
 struct kvm_vcpu_spe {
@@ -35,6 +38,9 @@ static __always_inline bool kvm_supports_spe(void)
 #define vcpu_has_spe(vcpu)					\
 	(vcpu_has_feature(vcpu, KVM_ARM_VCPU_SPE))
 
+#define kvm_has_spe(kvm)					\
+	(test_bit(KVM_ARCH_FLAG_SPE_ENABLED, &(kvm)->arch.flags))
+
 /* Implements the function ProfilingBufferEnabled() from ARM DDI0487K.a */
 static inline bool kvm_spe_profiling_buffer_enabled(u64 pmblimitr_el1, u64 pmbsr_el1)
 {
@@ -47,6 +53,14 @@ void kvm_spe_destroy_vm(struct kvm *kvm);
 int kvm_spe_vcpu_first_run_init(struct kvm_vcpu *vcpu);
 void kvm_spe_vcpu_destroy(struct kvm_vcpu *vcpu);
 
+bool kvm_spe_allow_write_without_running_vcpu(struct kvm *kvm);
+
+enum kvm_mmu_notifier_event;
+phys_addr_t kvm_spe_adjust_range_start(struct kvm *kvm, phys_addr_t start, phys_addr_t end,
+				       enum kvm_mmu_notifier_event event);
+phys_addr_t kvm_spe_adjust_range_end(struct kvm *kvm, phys_addr_t start, phys_addr_t end,
+				     enum kvm_mmu_notifier_event event);
+
 u8 kvm_spe_get_pmsver_limit(void);
 
 int kvm_spe_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr);
@@ -72,6 +86,7 @@ struct kvm_vcpu_spe {
 
 #define kvm_supports_spe()	false
 #define vcpu_has_spe(vcpu)	false
+#define kvm_has_spe(kvm)	false
 
 static inline void kvm_spe_init_vm(struct kvm *kvm)
 {
@@ -86,6 +101,10 @@ static inline int kvm_spe_vcpu_first_run_init(struct kvm_vcpu *vcpu)
 static inline void kvm_spe_vcpu_destroy(struct kvm_vcpu *vcpu)
 {
 }
+static inline bool kvm_spe_allow_write_without_running_vcpu(struct kvm *kvm)
+{
+	return false;
+}
 static inline u8 kvm_spe_get_pmsver_limit(void)
 {
 	return 0;
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 8da772690173..d05dbb6d2d7a 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -777,6 +777,16 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
 		&& !kvm_arm_vcpu_stopped(v) && !v->arch.pause);
 }
 
+/*
+ * kvm_arch_allow_write_without_running_vcpu - allow writing guest memory
+ * without a running VCPU when dirty ring is enabled.
+ */
+bool kvm_arch_allow_write_without_running_vcpu(struct kvm *kvm)
+{
+	return kvm_vgic_allow_write_without_running_vcpu(kvm) ||
+	       kvm_spe_allow_write_without_running_vcpu(kvm);
+}
+
 bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
 {
 	return vcpu_mode_priv(vcpu);
@@ -1275,8 +1285,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
 		if (kvm_vcpu_has_pmu(vcpu))
 			kvm_pmu_sync_hwstate(vcpu);
 
-		kvm_spe_sync_hwstate(vcpu);
-
 		/*
 		 * Sync the vgic state before syncing the timer state because
 		 * the timer code needs to know if the virtual timer
@@ -1326,6 +1334,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
 
 		preempt_enable();
 
+		kvm_spe_sync_hwstate(vcpu);
+
 		/*
 		 * The ARMv8 architecture doesn't give the hypervisor
 		 * a mechanism to prevent a guest from dropping to AArch32 EL0
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 8abba9619c58..de48fb7c0fff 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -37,6 +37,22 @@ static unsigned long __ro_after_init io_map_base;
 
 #define KVM_PGT_FN(fn)		(!is_protected_kvm_enabled() ? fn : p ## fn)
 
+#ifndef CONFIG_KVM_ARM_SPE
+static inline phys_addr_t
+kvm_spe_adjust_range_start(struct kvm *kvm, phys_addr_t start, phys_addr_t end,
+			   enum kvm_mmu_notifier_event event)
+{
+	return start;
+}
+
+static inline phys_addr_t
+kvm_spe_adjust_range_end(struct kvm *kvm, phys_addr_t start, phys_addr_t end,
+			 enum kvm_mmu_notifier_event event)
+{
+	return end;
+}
+#endif
+
 static phys_addr_t __stage2_range_addr_end(phys_addr_t addr, phys_addr_t end,
 					   phys_addr_t size)
 {
@@ -62,10 +78,10 @@ static phys_addr_t stage2_range_addr_end(phys_addr_t addr, phys_addr_t end)
 static int stage2_apply_range(struct kvm_s2_mmu *mmu, phys_addr_t addr,
 			      phys_addr_t end,
 			      int (*fn)(struct kvm_pgtable *, u64, u64),
-			      bool resched)
+			      bool resched, enum kvm_mmu_notifier_event event)
 {
 	struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
-	int ret;
+	int ret = 0;
 	u64 next;
 
 	do {
@@ -73,7 +89,15 @@ static int stage2_apply_range(struct kvm_s2_mmu *mmu, phys_addr_t addr,
 		if (!pgt)
 			return -EINVAL;
 
+		if (kvm_has_spe(kvm)) {
+			addr = kvm_spe_adjust_range_start(kvm, addr, end, event);
+			if (addr == end)
+				break;
+		}
+
 		next = stage2_range_addr_end(addr, end);
+		if (kvm_has_spe(kvm))
+			next = kvm_spe_adjust_range_end(kvm, addr, next, event);
 		ret = fn(pgt, addr, next - addr);
 		if (ret)
 			break;
@@ -85,8 +109,8 @@ static int stage2_apply_range(struct kvm_s2_mmu *mmu, phys_addr_t addr,
 	return ret;
 }
 
-#define stage2_apply_range_resched(mmu, addr, end, fn)			\
-	stage2_apply_range(mmu, addr, end, fn, true)
+#define stage2_apply_range_resched(mmu, addr, end, fn, event)		\
+	stage2_apply_range(mmu, addr, end, fn, true, event)
 
 /*
  * Get the maximum number of page-tables pages needed to split a range
@@ -122,8 +146,9 @@ static int kvm_mmu_split_huge_pages(struct kvm *kvm, phys_addr_t addr,
 {
 	struct kvm_mmu_memory_cache *cache;
 	struct kvm_pgtable *pgt;
-	int ret, cache_capacity;
 	u64 next, chunk_size;
+	int cache_capacity;
+	int ret = 0;
 
 	lockdep_assert_held_write(&kvm->mmu_lock);
 
@@ -152,7 +177,18 @@ static int kvm_mmu_split_huge_pages(struct kvm *kvm, phys_addr_t addr,
 		if (!pgt)
 			return -EINVAL;
 
+		if (kvm_has_spe(kvm)) {
+			addr = kvm_spe_adjust_range_start(kvm, addr, end,
+					KVM_MMU_NOTIFY_SPLIT_HUGE_PAGE);
+			if (addr == end)
+				break;
+		}
+
 		next = __stage2_range_addr_end(addr, end, chunk_size);
+		if (kvm_has_spe(kvm)) {
+			next = kvm_spe_adjust_range_end(kvm, addr, next,
+							KVM_MMU_NOTIFY_SPLIT_HUGE_PAGE);
+		}
 		ret = KVM_PGT_FN(kvm_pgtable_stage2_split)(pgt, addr, next - addr, cache);
 		if (ret)
 			break;
@@ -319,6 +355,7 @@ static void invalidate_icache_guest_page(void *va, size_t size)
  * @start: The intermediate physical base address of the range to unmap
  * @size:  The size of the area to unmap
  * @may_block: Whether or not we are permitted to block
+ * @event: MMU notifier event
  *
  * Clear a range of stage-2 mappings, lowering the various ref-counts.  Must
  * be called while holding mmu_lock (unless for freeing the stage2 pgd before
@@ -326,7 +363,7 @@ static void invalidate_icache_guest_page(void *va, size_t size)
  * with things behind our backs.
  */
 static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size,
-				 bool may_block)
+				 bool may_block, enum kvm_mmu_notifier_event event)
 {
 	struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
 	phys_addr_t end = start + size;
@@ -334,18 +371,19 @@ static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64
 	lockdep_assert_held_write(&kvm->mmu_lock);
 	WARN_ON(size & ~PAGE_MASK);
 	WARN_ON(stage2_apply_range(mmu, start, end, KVM_PGT_FN(kvm_pgtable_stage2_unmap),
-				   may_block));
+				   may_block, event));
 }
 
 void kvm_stage2_unmap_range(struct kvm_s2_mmu *mmu, phys_addr_t start,
-			    u64 size, bool may_block)
+			    u64 size, bool may_block, enum kvm_mmu_notifier_event event)
 {
-	__unmap_stage2_range(mmu, start, size, may_block);
+	__unmap_stage2_range(mmu, start, size, may_block, event);
 }
 
 void kvm_stage2_flush_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end)
 {
-	stage2_apply_range_resched(mmu, addr, end, KVM_PGT_FN(kvm_pgtable_stage2_flush));
+	stage2_apply_range_resched(mmu, addr, end, KVM_PGT_FN(kvm_pgtable_stage2_flush),
+				   KVM_MMU_NOTIFY_CMO);
 }
 
 static void stage2_flush_memslot(struct kvm *kvm,
@@ -1028,7 +1066,8 @@ static void stage2_unmap_memslot(struct kvm *kvm,
 
 		if (!(vma->vm_flags & VM_PFNMAP)) {
 			gpa_t gpa = addr + (vm_start - memslot->userspace_addr);
-			kvm_stage2_unmap_range(&kvm->arch.mmu, gpa, vm_end - vm_start, true);
+			kvm_stage2_unmap_range(&kvm->arch.mmu, gpa, vm_end - vm_start, true,
+					       KVM_MMU_NOTIFY_MEMSLOT);
 		}
 		hva = vm_end;
 	} while (hva < reg_end);
@@ -1187,7 +1226,8 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
  */
 void kvm_stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end)
 {
-	stage2_apply_range_resched(mmu, addr, end, KVM_PGT_FN(kvm_pgtable_stage2_wrprotect));
+	stage2_apply_range_resched(mmu, addr, end, KVM_PGT_FN(kvm_pgtable_stage2_wrprotect),
+				   KVM_MMU_NOTIFY_WP);
 }
 
 /**
@@ -2100,22 +2140,63 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
 
 	__unmap_stage2_range(&kvm->arch.mmu, range->start << PAGE_SHIFT,
 			     (range->end - range->start) << PAGE_SHIFT,
-			     range->may_block);
+			     range->may_block, range->event);
 
 	kvm_nested_s2_unmap(kvm, range->may_block);
 	return false;
 }
 
-bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
+static bool kvm_test_age_range(struct kvm *kvm, struct kvm_gfn_range *range,
+			       bool mkold)
 {
-	u64 size = (range->end - range->start) << PAGE_SHIFT;
+	phys_addr_t range_start = range->start << PAGE_SHIFT;
+	phys_addr_t range_end = range->end << PAGE_SHIFT;
+	enum kvm_mmu_notifier_event event = range->event;
+	phys_addr_t start, end;
+	bool was_young = false;
+
+	if (!kvm_has_spe(kvm)) {
+		return KVM_PGT_FN(kvm_pgtable_stage2_test_clear_young)(kvm->arch.mmu.pgt,
+							   range->start,
+							   range->end - range->start,
+							   mkold);
+	}
 
+	/* Prime the first iteration */
+	start = end = range_start;
+	do {
+		start = kvm_spe_adjust_range_start(kvm, start, range_end, event);
+		/*
+		 * 'start' is initialised to 'end' at the beginning of each
+		 * iteration.  They can only be different because
+		 * kvm_spe_adjust_range_start() detected at least one page in use
+		 * for SPE.
+		 */
+		if (start != end)
+			was_young = true;
+		if (start == range_end)
+			break;
+
+		end = kvm_spe_adjust_range_end(kvm, start, range_end, event);
+		if (end != range_end)
+			was_young = true;
+
+		was_young |= KVM_PGT_FN(kvm_pgtable_stage2_test_clear_young)(kvm->arch.mmu.pgt,
+									      start, end - start,
+									      mkold);
+		start = end;
+	} while (end != range_end);
+
+	return was_young;
+}
+
+bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
+{
 	if (!kvm->arch.mmu.pgt)
 		return false;
 
-	return KVM_PGT_FN(kvm_pgtable_stage2_test_clear_young)(kvm->arch.mmu.pgt,
-						   range->start << PAGE_SHIFT,
-						   size, true);
+	return kvm_test_age_range(kvm, range, true);
+
 	/*
 	 * TODO: Handle nested_mmu structures here using the reverse mapping in
 	 * a later version of patch series.
@@ -2124,14 +2205,10 @@ bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 
 bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-	u64 size = (range->end - range->start) << PAGE_SHIFT;
-
 	if (!kvm->arch.mmu.pgt)
 		return false;
 
-	return KVM_PGT_FN(kvm_pgtable_stage2_test_clear_young)(kvm->arch.mmu.pgt,
-						   range->start << PAGE_SHIFT,
-						   size, false);
+	return kvm_test_age_range(kvm, range, false);
 }
 
 phys_addr_t kvm_mmu_get_httbr(void)
@@ -2386,7 +2463,7 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
 	phys_addr_t size = slot->npages << PAGE_SHIFT;
 
 	write_lock(&kvm->mmu_lock);
-	kvm_stage2_unmap_range(&kvm->arch.mmu, gpa, size, true);
+	kvm_stage2_unmap_range(&kvm->arch.mmu, gpa, size, true, KVM_MMU_NOTIFY_MEMSLOT);
 	kvm_nested_s2_unmap(kvm, true);
 	write_unlock(&kvm->mmu_lock);
 }
diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c
index 92e94bb96bcc..73e09dcef3ca 100644
--- a/arch/arm64/kvm/nested.c
+++ b/arch/arm64/kvm/nested.c
@@ -1076,8 +1076,10 @@ void kvm_nested_s2_unmap(struct kvm *kvm, bool may_block)
 	for (i = 0; i < kvm->arch.nested_mmus_size; i++) {
 		struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i];
 
-		if (kvm_s2_mmu_valid(mmu))
-			kvm_stage2_unmap_range(mmu, 0, kvm_phys_size(mmu), may_block);
+		if (kvm_s2_mmu_valid(mmu)) {
+			kvm_stage2_unmap_range(mmu, 0, kvm_phys_size(mmu), may_block,
+					       KVM_MMU_NOTIFY_SHADOW_S2);
+		}
 	}
 
 	kvm_invalidate_vncr_ipa(kvm, 0, BIT(kvm->arch.mmu.pgt->ia_bits));
@@ -1787,7 +1789,8 @@ void check_nested_vcpu_requests(struct kvm_vcpu *vcpu)
 
 		write_lock(&vcpu->kvm->mmu_lock);
 		if (mmu->pending_unmap) {
-			kvm_stage2_unmap_range(mmu, 0, kvm_phys_size(mmu), true);
+			kvm_stage2_unmap_range(mmu, 0, kvm_phys_size(mmu), true,
+					       KVM_MMU_NOTIFY_SHADOW_S2);
 			mmu->pending_unmap = false;
 		}
 		write_unlock(&vcpu->kvm->mmu_lock);
diff --git a/arch/arm64/kvm/spe.c b/arch/arm64/kvm/spe.c
index 35848e4ff68b..f80ef8cdb1d8 100644
--- a/arch/arm64/kvm/spe.c
+++ b/arch/arm64/kvm/spe.c
@@ -30,11 +30,13 @@ struct pinned_page {
 	DECLARE_BITMAP(vcpus, KVM_MAX_VCPUS);	/* The page is pinned on these VCPUs */
 	struct page *page;
 	gfn_t gfn;
+	bool unmap_after_unpin;			/* Unmap the page after the buffer is unpinned */
 	bool writable;				/* Is the page mapped as writable? */
 };
 
 static u64 max_buffer_size_to_pmbidr_el1(u64 size);
 static void kvm_spe_update_irq_level(struct kvm_vcpu *vcpu, bool level);
+static void kvm_spe_unpin_page(struct kvm *kvm, struct pinned_page *pinned_page);
 static void kvm_spe_unpin_buffer(struct kvm_vcpu *vcpu);
 
 static u64 pmblimitr_el1_res0_mask = GENMASK_ULL(11, 8) | GENMASK_ULL(6, 3);
@@ -146,6 +148,172 @@ void kvm_spe_vcpu_destroy(struct kvm_vcpu *vcpu)
 	kvm_spe_unpin_buffer(vcpu);
 }
 
+bool kvm_spe_allow_write_without_running_vcpu(struct kvm *kvm)
+{
+	return kvm->arch.kvm_spe.dirtying_pages;
+}
+
+static bool kvm_spe_allow_stage2_change(enum kvm_mmu_notifier_event event)
+{
+	switch (event) {
+	/* Host table entry will be reverted because the page is pinned. */
+	case KVM_MMU_NOTIFY_CLEAR:
+	/*
+	 * MMU_NOTIFY_PROTECTION_VMA is generated for the mprotect() call, but
+	 * also for benign reasons, like automatic NUMA balancing. In the latter
+	 * case, the changes to the host's stage 1 will be reverted when it is
+	 * observed that the page is pinned.
+	 *
+	 * In the mprotect() case, it is userspace that is explicitly changing
+	 * the protection for the VMA. Because KVM cannot distinguish between
+	 * mprotect() and the other cases, the buffer pages will be marked for
+	 * unmapping from the host's stage 1 when the guest disables the buffer.
+	 */
+	case KVM_MMU_NOTIFY_PROTECTION_VMA:
+	/* Don't allow buffer pages to be made read-only at stage 2. */
+	case KVM_MMU_NOTIFY_SOFT_DIRTY:
+	/* Host page migration will fail because the page is pinned. */
+	case KVM_MMU_NOTIFY_MIGRATE:
+	/*
+	 * SPE can write to the buffer at any time, so treat the pinned pages
+	 * as young.
+	 */
+	case KVM_MMU_NOTIFY_AGE:
+	/*
+	 * This event is generated when a memslot is marked for dirty page
+	 * logging. The buffer pages will be kept mapped at stage 2 and they
+	 * will be immediately marked as dirty because KVM, without SPE
+	 * reporting a fault, has no means of detecting when a record is written
+	 * to memory.
+	 */
+	case KVM_MMU_NOTIFY_WP:
+	/*
+	 * All buffer pages are mapped with PAGE_SIZE granularity at stage 2,
+	 * so it's safe to skip them.
+	 */
+	case KVM_MMU_NOTIFY_SPLIT_HUGE_PAGE:
+		return false;
+
+	/* Userspace munmap'ed the VMA. */
+	case KVM_MMU_NOTIFY_UNMAP:
+	/*
+	 * pin_user_pages() does not return a PFN without an associated struct
+	 * page, so the event shouldn't apply to a buffer page. Be conservative
+	 * and allow the stage 2 changes.
+	 */
+	case KVM_MMU_NOTIFY_PROTECTION_PAGE:
+	/*
+	 * KVM doesn't propagate this event to the architecture code because the
+	 * MMU notifier is unregistered when the VM is being destroyed and no
+	 * VCPUs should be running. Also, after the notifier is released, the
+	 * stage 2 will be destroyed. It makes little difference if we allow or
+	 * don't allow the buffer to be unmapped here, but put the event in the
+	 * allow group anyway in case anything changes.
+	 *
+	 * The buffer for each VCPU will be unpinned in the next stage of the VM
+	 * cleanup process, when the VCPUs are destroyed.
+	 */
+	case KVM_MMU_NOTIFY_RELEASE:
+	/* Same as KVM_MMU_NOTIFY_PROTECTION_PAGE. */
+	case KVM_MMU_NOTIFY_EXCLUSIVE:
+	/* x86-specific, but be conservative. */
+	case KVM_MMU_NOTIFY_MEMORY_ATTRIBUTES:
+	/* Userspace is changing a memslot while the buffer is enabled. */
+	case KVM_MMU_NOTIFY_MEMSLOT:
+	/* CMOs don't change stage 2 entries. */
+	case KVM_MMU_NOTIFY_CMO:
+	/* SPE is not yet compatible with nested virt, but be conservative. */
+	case KVM_MMU_NOTIFY_SHADOW_S2:
+		break;
+	default:
+		WARN_ON_ONCE(1);
+	}
+
+	return true;
+}
+
+phys_addr_t kvm_spe_adjust_range_start(struct kvm *kvm, phys_addr_t start, phys_addr_t end,
+				       enum kvm_mmu_notifier_event event)
+{
+	struct kvm_spe *kvm_spe = &kvm->arch.kvm_spe;
+	struct xarray *pinned_pages = &kvm_spe->pinned_pages;
+	struct pinned_page *pinned_page;
+	kvm_pfn_t gfn;
+
+	lockdep_assert_held_write(&kvm->mmu_lock);
+
+	if (kvm_spe_allow_stage2_change(event))
+		return start;
+
+	xa_lock(pinned_pages);
+	for (gfn = PHYS_PFN(start); gfn < PHYS_PFN(end); gfn++) {
+		pinned_page = xa_load(pinned_pages, gfn);
+		if (!pinned_page)
+			break;
+
+		pinned_page->unmap_after_unpin = true;
+		if (event == KVM_MMU_NOTIFY_WP && pinned_page->writable) {
+			kvm_spe->dirtying_pages = true;
+			mark_page_dirty(kvm, gfn);
+			kvm_spe->dirtying_pages = false;
+		}
+	}
+	xa_unlock(pinned_pages);
+
+	return PFN_PHYS(gfn);
+}
+
+/*
+ * Ignores pinned_page->unmap_after_unpin because this function is called
+ * only from the MMU notifiers, before changes are allowed to be made to stage 2.
+ */
+static void kvm_spe_unpin_page_range(struct kvm *kvm, phys_addr_t start, phys_addr_t end)
+{
+	struct xarray *pinned_pages = &kvm->arch.kvm_spe.pinned_pages;
+	struct pinned_page *pinned_page;
+	kvm_pfn_t gfn;
+
+	xa_lock(pinned_pages);
+	for (gfn = PHYS_PFN(start); gfn < PHYS_PFN(end); gfn++) {
+		pinned_page = xa_load(pinned_pages, gfn);
+		if (!pinned_page)
+			continue;
+
+		kvm_spe_unpin_page(kvm, pinned_page);
+		kfree(pinned_page);
+	}
+	xa_unlock(pinned_pages);
+}
+
+phys_addr_t kvm_spe_adjust_range_end(struct kvm *kvm, phys_addr_t start, phys_addr_t end,
+				     enum kvm_mmu_notifier_event event)
+{
+	struct xarray *pinned_pages = &kvm->arch.kvm_spe.pinned_pages;
+	kvm_pfn_t gfn;
+
+	lockdep_assert_held_write(&kvm->mmu_lock);
+
+	if (kvm_spe_allow_stage2_change(event)) {
+		if (event != KVM_MMU_NOTIFY_CMO)
+			kvm_spe_unpin_page_range(kvm, start, end);
+		return end;
+	}
+
+	xa_lock(pinned_pages);
+	/*
+	 * We know that @start is not a buffer page. Stop at the first buffer
+	 * page in the range [@start + PAGE_SIZE, @end) - this page will be
+	 * handled in the following call to kvm_spe_adjust_range_start().
+	 */
+	for (gfn = PHYS_PFN(start + PAGE_SIZE); gfn < PHYS_PFN(end); gfn++) {
+		if (xa_load(pinned_pages, gfn))
+			break;
+	}
+	xa_unlock(pinned_pages);
+
+	return PFN_PHYS(gfn);
+}
+
 u8 kvm_spe_get_pmsver_limit(void)
 {
 	unsigned int pmsver;
@@ -231,29 +399,78 @@ static void kvm_spe_inject_data_abort(struct kvm_vcpu *vcpu, u8 fst, bool s2)
 	kvm_spe_update_irq_level(vcpu, true);
 }
 
+static void kvm_spe_unpin_page(struct kvm *kvm, struct pinned_page *pinned_page)
+{
+	struct xarray *pinned_pages = &kvm->arch.kvm_spe.pinned_pages;
+
+	__xa_erase(pinned_pages, pinned_page->gfn);
+	unpin_user_pages_dirty_lock(&pinned_page->page, 1, pinned_page->writable);
+}
+
 static void kvm_spe_unpin_buffer(struct kvm_vcpu *vcpu)
 {
 	struct kvm *kvm = vcpu->kvm;
 	struct xarray *pinned_pages = &kvm->arch.kvm_spe.pinned_pages;
 	struct pinned_page *pinned_page;
-	unsigned long gfn;
+	int unmap_count, unmap_resched;
+	bool write_locked = false;
+	struct kvm_pgtable *pgt;
 	int idx;
 
+	XA_STATE(xas, pinned_pages, 0);
+
+	might_sleep();
+
+	/* Copy what stage2_apply_range() does */
+	unmap_resched = kvm_granule_size(KVM_PGTABLE_MIN_BLOCK_LEVEL) >> PAGE_SHIFT;
+	unmap_count = 0;
+
 	idx = srcu_read_lock(&kvm->srcu);
-	xa_lock(pinned_pages);
+	xas_lock(&xas);
+
+	xas_for_each(&xas, pinned_page, ULONG_MAX) {
+		if (xas_retry(&xas, pinned_page))
+			continue;
 
-	xa_for_each(pinned_pages, gfn, pinned_page) {
 		if (!test_bit(vcpu->vcpu_idx, pinned_page->vcpus))
 			continue;
 
 		clear_bit(vcpu->vcpu_idx, pinned_page->vcpus);
-		if (bitmap_empty(pinned_page->vcpus, KVM_MAX_VCPUS)) {
-			__xa_erase(pinned_pages, pinned_page->gfn);
-			unpin_user_pages_dirty_lock(&pinned_page->page, 1, pinned_page->writable);
+		if (!bitmap_empty(pinned_page->vcpus, KVM_MAX_VCPUS))
+			continue;
+
+		kvm_spe_unpin_page(kvm, pinned_page);
+		if (!pinned_page->unmap_after_unpin)
+			goto free_continue;
+
+		if (!write_locked) {
+			xas_pause(&xas);
+			xas_unlock(&xas);
+			write_lock(&kvm->mmu_lock);
+			xas_lock(&xas);
+			write_locked = true;
+			pgt = vcpu->arch.hw_mmu->pgt;
+		}
+
+		if (!pgt)
+			goto free_continue;
+
+		kvm_pgtable_stage2_unmap(pgt, PFN_PHYS(pinned_page->gfn), PAGE_SIZE);
+		unmap_count++;
+		if (unmap_count == unmap_resched) {
+			xas_pause(&xas);
+			xas_unlock(&xas);
+			cond_resched_rwlock_write(&kvm->mmu_lock);
+			xas_lock(&xas);
+			unmap_count = 0;
 		}
+free_continue:
+		kfree(pinned_page);
 	}
 
-	xa_unlock(pinned_pages);
+	xas_unlock(&xas);
+	if (write_locked)
+		write_unlock(&kvm->mmu_lock);
 	srcu_read_unlock(&kvm->srcu, idx);
 }
 
@@ -1314,6 +1531,7 @@ int kvm_spe_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
 			return -ENXIO;
 
 		vcpu_spe->initialized = true;
+		set_bit(KVM_ARCH_FLAG_SPE_ENABLED, &kvm->arch.flags);
 		return 0;
 	}
 
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index db86d1dcd148..e8fd1688abba 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -3947,7 +3947,8 @@ static void s2_mmu_unmap_range(struct kvm_s2_mmu *mmu,
 	 * the L1 needs to put its stage-2 in a consistent state before doing
 	 * the TLBI.
 	 */
-	kvm_stage2_unmap_range(mmu, info->range.start, info->range.size, true);
+	kvm_stage2_unmap_range(mmu, info->range.start, info->range.size, true,
+			       KVM_MMU_NOTIFY_SHADOW_S2);
 }
 
 static bool handle_vmalls12e1is(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
@@ -4026,7 +4027,7 @@ static void s2_mmu_unmap_ipa(struct kvm_s2_mmu *mmu,
 	 * See comment in s2_mmu_unmap_range() for why this is allowed to
 	 * reschedule.
 	 */
-	kvm_stage2_unmap_range(mmu, base_addr, max_size, true);
+	kvm_stage2_unmap_range(mmu, base_addr, max_size, true, KVM_MMU_NOTIFY_SHADOW_S2);
 }
 
 static bool handle_ipas2e1is(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c
index ce3e3ed3f29f..fb36f1b4fdae 100644
--- a/arch/arm64/kvm/vgic/vgic-its.c
+++ b/arch/arm64/kvm/vgic/vgic-its.c
@@ -2706,7 +2706,7 @@ static int vgic_its_ctrl(struct kvm *kvm, struct vgic_its *its, u64 attr)
 }
 
 /*
- * kvm_arch_allow_write_without_running_vcpu - allow writing guest memory
+ * kvm_vgic_allow_write_without_running_vcpu - allow writing guest memory
  * without the running VCPU when dirty ring is enabled.
  *
  * The running VCPU is required to track dirty guest pages when dirty ring
@@ -2715,7 +2715,7 @@ static int vgic_its_ctrl(struct kvm *kvm, struct vgic_its *its, u64 attr)
  * bitmap is used to track the dirty guest pages due to the missed running
  * VCPU in the period.
  */
-bool kvm_arch_allow_write_without_running_vcpu(struct kvm *kvm)
+bool kvm_vgic_allow_write_without_running_vcpu(struct kvm *kvm)
 {
 	struct vgic_dist *dist = &kvm->arch.vgic;
 
diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index 7a0b972eb1b1..4c0f4f80e8ef 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -446,6 +446,8 @@ int kvm_vgic_v4_set_forwarding(struct kvm *kvm, int irq,
 
 void kvm_vgic_v4_unset_forwarding(struct kvm *kvm, int host_irq);
 
+bool kvm_vgic_allow_write_without_running_vcpu(struct kvm *kvm);
+
 int vgic_v4_load(struct kvm_vcpu *vcpu);
 void vgic_v4_commit(struct kvm_vcpu *vcpu);
 int vgic_v4_put(struct kvm_vcpu *vcpu);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 772e75d13af1..273ee3339468 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -274,6 +274,8 @@ enum kvm_mmu_notifier_event {
 	KVM_MMU_NOTIFY_EXCLUSIVE        = MMU_NOTIFY_EXCLUSIVE,
 	KVM_MMU_NOTIFY_AGE		= 32,
 	KVM_MMU_NOTIFY_MEMORY_ATTRIBUTES,
+	KVM_MMU_NOTIFY_MEMSLOT,
+	KVM_MMU_NOTIFY_WP,
 	KVM_MMU_NOTIFY_ARCH1,
 	KVM_MMU_NOTIFY_ARCH2,
 	KVM_MMU_NOTIFY_ARCH3,
-- 
2.51.2
