[RFC PATCH v6 34/35] KVM: arm64: Add hugetlb support for SPE

Alexandru Elisei alexandru.elisei at arm.com
Fri Nov 14 08:07:15 PST 2025


Hugetlb pages are different from transparent huge pages, as they cannot be
split by GUP with the FOLL_SPLIT_PMD flag.

Mapping hugetlb pages at stage 2 with a page mapping would be a mistake:
CPU stage 2 faults will make KVM map the entire hugetlb page with a block
mapping at stage 2. This process requires a break-before-make sequence, and
the SPE will trigger a stage 2 fault if it attempts to write a record to
memory during the break part of the sequence.

Map hugetlb pages with a block mapping at stage 2, to make sure this won't
happen.
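
For illustration, below is a condensed sketch of the mapping-size decision
this patch introduces. It mirrors the kvm_spe_compute_stage2_map_size()
helper added in the diff; it only restates the idea and is not a substitute
for the real code (helper names other than the sketch function itself are
the existing kernel/KVM ones):

static unsigned long spe_s2_map_size_sketch(struct kvm *kvm, gfn_t gfn, hva_t hva,
					    struct vm_area_struct *vma)
{
	unsigned long map_size = PAGE_SIZE;

	/* hugetlb memory cannot be split by GUP, so pin and map the whole block. */
	if (is_vm_hugetlb_page(vma) && !(vma->vm_flags & VM_PFNMAP)) {
		map_size = huge_page_size(hstate_vma(vma));
		/* Cap at PMD_SIZE, the block size used for SPE buffer mappings. */
		if (map_size > PMD_SIZE)
			map_size = PMD_SIZE;
		/* Fall back to a page if HVA/IPA alignment rules out a block. */
		if (!kvm_stage2_supports_map_size(gfn_to_memslot(kvm, gfn), hva, map_size))
			map_size = PAGE_SIZE;
	}

	return map_size;
}

With the block mapping installed up front, a later CPU stage 2 fault on the
same range finds the mapping already present and no break-before-make is
needed while SPE may be writing records.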

Signed-off-by: Alexandru Elisei <alexandru.elisei at arm.com>
---
 arch/arm64/include/asm/kvm_mmu.h |   3 +
 arch/arm64/include/asm/kvm_spe.h |   6 +
 arch/arm64/kvm/mmu.c             |  23 ++-
 arch/arm64/kvm/spe.c             | 292 ++++++++++++++++++++++++-------
 4 files changed, 250 insertions(+), 74 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index a4a0e00d1bbb..4d57c6d62f4a 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -191,6 +191,9 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu);
 bool kvm_vma_mte_allowed(struct vm_area_struct *vma);
 void kvm_sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn, unsigned long size);
 
+bool kvm_stage2_supports_map_size(struct kvm_memory_slot *memslot,
+				  unsigned long hva, unsigned long map_size);
+
 phys_addr_t kvm_mmu_get_httbr(void);
 phys_addr_t kvm_get_idmap_vector(void);
 int __init kvm_mmu_init(u32 *hyp_va_bits);
diff --git a/arch/arm64/include/asm/kvm_spe.h b/arch/arm64/include/asm/kvm_spe.h
index 7dcf03980019..a22764719ecc 100644
--- a/arch/arm64/include/asm/kvm_spe.h
+++ b/arch/arm64/include/asm/kvm_spe.h
@@ -68,6 +68,8 @@ u8 kvm_spe_get_pmsver_limit(void);
 
 void kvm_spe_handle_req_memlock(struct kvm_vcpu *vcpu);
 
+bool kvm_spe_gfn_is_pinned(struct kvm_vcpu *vcpu, gfn_t gfn);
+
 int kvm_spe_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr);
 int kvm_spe_get_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr);
 int kvm_spe_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr);
@@ -117,6 +119,10 @@ static inline u8 kvm_spe_get_pmsver_limit(void)
 static inline void kvm_spe_handle_req_memlock(struct kvm_vcpu *vcpu)
 {
 }
+static inline bool kvm_spe_gfn_is_pinned(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+	return false;
+}
 static inline int kvm_spe_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
 {
 	return -ENXIO;
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index de48fb7c0fff..cc2993f10269 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1332,9 +1332,8 @@ static void kvm_send_hwpoison_signal(unsigned long address, short lsb)
 	send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current);
 }
 
-static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot,
-					       unsigned long hva,
-					       unsigned long map_size)
+bool kvm_stage2_supports_map_size(struct kvm_memory_slot *memslot,
+				  unsigned long hva, unsigned long map_size)
 {
 	gpa_t gpa_start;
 	hva_t uaddr_start, uaddr_end;
@@ -1417,7 +1416,7 @@ transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot,
 	 * sure that the HVA and IPA are sufficiently aligned and that the
 	 * block map is contained within the memslot.
 	 */
-	if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) {
+	if (kvm_stage2_supports_map_size(memslot, hva, PMD_SIZE)) {
 		int sz = get_user_mapping_size(kvm, hva);
 
 		if (sz < 0)
@@ -1664,6 +1663,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	struct page *page;
 	vm_flags_t vm_flags;
 	enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_MEMABORT_FLAGS;
+	bool is_vma_hugetlbfs;
 
 	if (fault_is_perm)
 		fault_granule = kvm_vcpu_trap_get_perm_fault_granule(vcpu);
@@ -1694,6 +1694,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 		return -EFAULT;
 	}
 
+	is_vma_hugetlbfs = is_vm_hugetlb_page(vma);
+
 	if (force_pte)
 		vma_shift = PAGE_SHIFT;
 	else
@@ -1702,7 +1704,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	switch (vma_shift) {
 #ifndef __PAGETABLE_PMD_FOLDED
 	case PUD_SHIFT:
-		if (fault_supports_stage2_huge_mapping(memslot, hva, PUD_SIZE))
+		if (kvm_stage2_supports_map_size(memslot, hva, PUD_SIZE))
 			break;
 		fallthrough;
 #endif
@@ -1710,7 +1712,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 		vma_shift = PMD_SHIFT;
 		fallthrough;
 	case PMD_SHIFT:
-		if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE))
+		if (kvm_stage2_supports_map_size(memslot, hva, PMD_SIZE))
 			break;
 		fallthrough;
 	case CONT_PTE_SHIFT:
@@ -1853,6 +1855,15 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 		goto out_unlock;
 	}
 
+	if (vcpu_has_spe(vcpu) && logging_active && is_vma_hugetlbfs) {
+		gfn_t pmd_gfn = PHYS_PFN(fault_ipa & PMD_MASK);
+
+		if (kvm_spe_gfn_is_pinned(vcpu, pmd_gfn)) {
+			/* SPE won the race, don't break the block mapping. */
+			goto out_unlock;
+		}
+	}
+
 	/*
 	 * If we are not forced to use page mapping, check if we are
 	 * backed by a THP and thus use block mapping if possible.
diff --git a/arch/arm64/kvm/spe.c b/arch/arm64/kvm/spe.c
index 2e2b97c3b861..81e07bc08ba6 100644
--- a/arch/arm64/kvm/spe.c
+++ b/arch/arm64/kvm/spe.c
@@ -6,6 +6,7 @@
 #include <linux/bitops.h>
 #include <linux/capability.h>
 #include <linux/cpumask.h>
+#include <linux/hugetlb.h>
 #include <linux/kvm_host.h>
 #include <linux/perf/arm_spe_pmu.h>
 #include <linux/swap.h>
@@ -30,6 +31,7 @@ struct pinned_page {
 	DECLARE_BITMAP(vcpus, KVM_MAX_VCPUS);	/* The page is pinned on these VCPUs */
 	struct page *page;
 	gfn_t gfn;
+	unsigned long s2_map_size;
 	bool unmap_after_unpin;			/* Unmap the page after the buffer is unpinned */
 	bool writable;				/* Is the page mapped as writable? */
 };
@@ -196,10 +198,7 @@ static bool kvm_spe_allow_stage2_change(enum kvm_mmu_notifier_event event)
 	 * to memory.
 	 */
 	case KVM_MMU_NOTIFY_WP:
-	/*
-	 * All buffer pages are mapped with PAGE_SIZE granularity at stage 2,
-	 * it's safe to skip them.
-	 */
+	/* Buffer pages will be unmapped after they are unpinned. */
 	case KVM_MMU_NOTIFY_SPLIT_HUGE_PAGE:
 		return false;
 
@@ -246,8 +245,8 @@ phys_addr_t kvm_spe_adjust_range_start(struct kvm *kvm, phys_addr_t start, phys_
 {
 	struct kvm_spe *kvm_spe = &kvm->arch.kvm_spe;
 	struct xarray *pinned_pages = &kvm_spe->pinned_pages;
+	kvm_pfn_t limit_gfn, gfn = PHYS_PFN(start);
 	struct pinned_page *pinned_page;
-	kvm_pfn_t gfn;
 
 	lockdep_assert_held_write(&kvm->mmu_lock);
 
@@ -255,21 +254,55 @@ phys_addr_t kvm_spe_adjust_range_start(struct kvm *kvm, phys_addr_t start, phys_
 		return start;
 
 	xa_lock(pinned_pages);
-	for (gfn = PHYS_PFN(start); gfn < PHYS_PFN(end); gfn++) {
+	while (gfn < PHYS_PFN(end)) {
 		pinned_page = xa_load(pinned_pages, gfn);
 		if (!pinned_page)
 			break;
 
 		pinned_page->unmap_after_unpin = true;
-		if (event == KVM_MMU_NOTIFY_WP && pinned_page->writable) {
+
+		if (event == KVM_MMU_NOTIFY_WP) {
+			if (!pinned_page->writable && pinned_page->s2_map_size == PAGE_SIZE)
+				goto next_gfn;
+
+			/*
+			 * Stage 2 block mappings are special because, while
+			 * dirty page logging is enabled, any fault will break
+			 * the block mapping. Map it with all permissions to
+			 * avoid the fault.
+			 */
+			if (pinned_page->s2_map_size > PAGE_SIZE) {
+				phys_addr_t gpa = PFN_PHYS(pinned_page->gfn);
+				struct kvm_s2_mmu *mmu = &kvm->arch.mmu;
+				enum kvm_pgtable_walk_flags flags;
+				enum kvm_pgtable_prot prot;
+				int ret;
+
+				prot = KVM_PGTABLE_PROT_X |
+				       KVM_PGTABLE_PROT_R |
+				       KVM_PGTABLE_PROT_W;
+				flags = KVM_PGTABLE_WALK_HANDLE_FAULT;
+				ret = kvm_pgtable_stage2_relax_perms(mmu->pgt, gpa, prot, flags);
+				if (WARN_ON_ONCE(ret))
+					goto next_gfn;
+				kvm_call_hyp(__kvm_tlb_flush_vmid_range,
+					     mmu, gpa, PHYS_PFN(pinned_page->s2_map_size));
+			}
+
+			limit_gfn = min(PHYS_PFN(end), gfn + PHYS_PFN(pinned_page->s2_map_size));
 			kvm_spe->dirtying_pages = true;
-			mark_page_dirty(kvm, gfn);
+			for (; gfn < limit_gfn; gfn++)
+				mark_page_dirty(kvm, gfn);
 			kvm_spe->dirtying_pages = false;
+
+			continue;
 		}
+next_gfn:
+		gfn += PHYS_PFN(pinned_page->s2_map_size);
 	}
 	xa_unlock(pinned_pages);
 
-	return PFN_PHYS(gfn);
+	return min_t(phys_addr_t, PFN_PHYS(gfn), end);
 }
 
 /*
@@ -280,16 +313,19 @@ static void kvm_spe_unpin_page_range(struct kvm *kvm, phys_addr_t start, phys_ad
 {
 	struct xarray *pinned_pages = &kvm->arch.kvm_spe.pinned_pages;
 	struct pinned_page *pinned_page;
-	kvm_pfn_t gfn;
+	kvm_pfn_t gfn = PHYS_PFN(start);
 
 	xa_lock(pinned_pages);
-	for (gfn = PHYS_PFN(start); gfn < PHYS_PFN(end); gfn++) {
+	while (gfn < PHYS_PFN(end)) {
 		pinned_page = xa_load(pinned_pages, gfn);
-		if (!pinned_page)
+		if (!pinned_page) {
+			gfn++;
 			continue;
+		}
 
 		kvm_spe_unpin_page(kvm, pinned_page);
 		kfree(pinned_page);
+		gfn += PHYS_PFN(pinned_page->s2_map_size);
 	}
 	xa_unlock(pinned_pages);
 }
@@ -422,7 +458,7 @@ static void kvm_spe_unpin_page(struct kvm *kvm, struct pinned_page *pinned_page)
 {
 	struct xarray *pinned_pages = &kvm->arch.kvm_spe.pinned_pages;
 
-	kvm_spe_remove_locked_mem(kvm, PAGE_SIZE);
+	kvm_spe_remove_locked_mem(kvm, pinned_page->s2_map_size);
 
 	__xa_erase(pinned_pages, pinned_page->gfn);
 	unpin_user_pages_dirty_lock(&pinned_page->page, 1, pinned_page->writable);
@@ -476,7 +512,7 @@ static void kvm_spe_unpin_buffer(struct kvm_vcpu *vcpu)
 		if (!pgt)
 			goto free_continue;
 
-		kvm_pgtable_stage2_unmap(pgt, PFN_PHYS(pinned_page->gfn), PAGE_SIZE);
+		kvm_pgtable_stage2_unmap(pgt, PFN_PHYS(pinned_page->gfn), pinned_page->s2_map_size);
 		unmap_count++;
 		if (unmap_count == unmap_resched) {
 			xas_pause(&xas);
@@ -538,6 +574,30 @@ static void kvm_spe_add_locked_mem(struct kvm_vcpu *vcpu, unsigned long size)
 		vcpu_spe->locked_mem_excess = kvm_spe->locked_mem - kvm_spe->locked_mem_watermark;
 }
 
+static void kvm_spe_unlock_mmu(struct kvm *kvm, bool exclusive_access)
+{
+	if (exclusive_access)
+		write_unlock(&kvm->mmu_lock);
+	else
+		read_unlock(&kvm->mmu_lock);
+}
+
+static bool kvm_spe_lock_mmu(struct kvm *kvm, unsigned long s2_map_size, bool logging_active)
+{
+	bool exclusive_access;
+
+	if (s2_map_size > PAGE_SIZE && logging_active) {
+		/* Prevent concurrent CPU faults from breaking the block mapping. */
+		write_lock(&kvm->mmu_lock);
+		exclusive_access = true;
+	} else {
+		read_lock(&kvm->mmu_lock);
+		exclusive_access = false;
+	}
+
+	return exclusive_access;
+}
+
 #define MAP_GPA_RET_NOTIFIER_RETRY	1
 #define MAP_GPA_RET_PAGE_EXIST		2
 
@@ -549,7 +609,7 @@ static void kvm_spe_add_locked_mem(struct kvm_vcpu *vcpu, unsigned long size)
 /* Calls release_faultin_page(), regardless of the return value */
 static int kvm_spe_map_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, kvm_pfn_t hfn, struct page *page,
 			   bool make_writable, bool mte_allowed, unsigned long mmu_seq,
-			   struct pinned_page *pinned_page)
+			   struct pinned_page *pinned_page, unsigned long s2_map_size)
 {
 	struct kvm *kvm = vcpu->kvm;
 	struct xarray *pinned_pages = &kvm->arch.kvm_spe.pinned_pages;
@@ -558,21 +618,31 @@ static int kvm_spe_map_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, kvm_pfn_t hfn, stru
 	gfn_t gfn = PHYS_PFN(gpa);
 	struct kvm_pgtable *pgt = vcpu->arch.hw_mmu->pgt;
 	enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
-	enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED;
+	enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_HANDLE_FAULT;
 	int action = PGTABLE_ACTION_NONE;
+	struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn);
+	bool logging_active = kvm_slot_dirty_track_enabled(memslot);
+	bool exclusive_access;
 	s8 level = S8_MAX;
 	kvm_pte_t pte = 0;
 	int ret;
 
-	read_lock(&kvm->mmu_lock);
+	if (make_writable)
+		prot |= KVM_PGTABLE_PROT_W;
+
+	/* Avoid all faults. */
+	if (s2_map_size > PAGE_SIZE && logging_active)
+		prot |= KVM_PGTABLE_PROT_X;
+
+	exclusive_access = kvm_spe_lock_mmu(kvm, s2_map_size, logging_active);
+	if (!exclusive_access)
+		flags |= KVM_PGTABLE_WALK_SHARED;
+
 	if (mmu_invalidate_retry(kvm, mmu_seq)) {
 		ret = MAP_GPA_RET_NOTIFIER_RETRY;
 		goto mmu_unlock;
 	}
 
-	if (make_writable)
-		prot |= KVM_PGTABLE_PROT_W;
-
 	ret = kvm_pgtable_get_leaf(pgt, gpa, &pte, &level);
 	if (ret)
 		goto mmu_unlock;
@@ -589,7 +659,7 @@ static int kvm_spe_map_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, kvm_pfn_t hfn, stru
 		}
 
 		existing_prot = kvm_pgtable_stage2_pte_prot(pte);
-		if (kvm_granule_size(level) != PAGE_SIZE) {
+		if (kvm_granule_size(level) != s2_map_size) {
 			/* Break block mapping */
 			action = PGTABLE_MAP_GPA;
 		} else {
@@ -603,14 +673,14 @@ static int kvm_spe_map_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, kvm_pfn_t hfn, stru
 	}
 
 	if (action == PGTABLE_MAP_GPA) {
-		read_unlock(&kvm->mmu_lock);
+		kvm_spe_unlock_mmu(kvm, exclusive_access);
 		ret = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_page_cache,
 				kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu));
 		if (ret) {
 			kvm_release_faultin_page(kvm, page, false, make_writable);
 			goto out;
 		}
-		read_lock(&kvm->mmu_lock);
+		exclusive_access = kvm_spe_lock_mmu(kvm, s2_map_size, logging_active);
 		if (mmu_invalidate_retry(kvm, mmu_seq)) {
 			ret = MAP_GPA_RET_NOTIFIER_RETRY;
 			goto mmu_unlock;
@@ -642,7 +712,7 @@ static int kvm_spe_map_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, kvm_pfn_t hfn, stru
 
 	if (!pp && !kvm_pte_valid(pte) && kvm_has_mte(kvm)) {
 		if (mte_allowed) {
-			kvm_sanitise_mte_tags(kvm, hfn, PAGE_SIZE);
+			kvm_sanitise_mte_tags(kvm, hfn, s2_map_size);
 		} else {
 			ret = -EFAULT;
 			goto mmu_unlock;
@@ -653,7 +723,7 @@ static int kvm_spe_map_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, kvm_pfn_t hfn, stru
 	if (action & PGTABLE_RELAX_PERMS) {
 		ret = kvm_pgtable_stage2_relax_perms(pgt, gpa, prot, flags);
 	} else if (action & PGTABLE_MAP_GPA) {
-		ret = kvm_pgtable_stage2_map(pgt, gpa, PAGE_SIZE, hpa, prot,
+		ret = kvm_pgtable_stage2_map(pgt, gpa, s2_map_size, hpa, prot,
 					     &vcpu->arch.mmu_page_cache, flags);
 	}
 	if (ret)
@@ -662,14 +732,22 @@ static int kvm_spe_map_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, kvm_pfn_t hfn, stru
 	if (action & PGTABLE_MAKE_YOUNG)
 		kvm_pgtable_stage2_mkyoung(pgt, gpa, flags);
 
+	if (exclusive_access &&
+	    (action & (PGTABLE_MAKE_YOUNG | PGTABLE_RELAX_PERMS)) == PGTABLE_RELAX_PERMS) {
+		kvm_call_hyp(__kvm_tlb_flush_vmid_range, &kvm->arch.mmu, gpa,
+			     PHYS_PFN(s2_map_size));
+	}
+
 	if (pp) {
 		pp->writable = make_writable;
-		set_bit(vcpu->vcpu_idx, pp->vcpus);
+		if (!test_bit(vcpu->vcpu_idx, pp->vcpus))
+			set_bit(vcpu->vcpu_idx, pp->vcpus);
 
 		ret = MAP_GPA_RET_PAGE_EXIST;
 	} else {
 		pinned_page->page = page;
 		pinned_page->gfn = gfn;
+		pinned_page->s2_map_size = s2_map_size;
 		pinned_page->writable = make_writable;
 		set_bit(vcpu->vcpu_idx, pinned_page->vcpus);
 
@@ -679,31 +757,65 @@ static int kvm_spe_map_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, kvm_pfn_t hfn, stru
 			goto pages_unlock;
 		}
 
-		kvm_spe_add_locked_mem(vcpu, PAGE_SIZE);
+		kvm_spe_add_locked_mem(vcpu, s2_map_size);
 		ret = 0;
 	}
 
+	if (!ret && make_writable && s2_map_size > PAGE_SIZE &&
+	    kvm_slot_dirty_track_enabled(memslot)) {
+		/*
+		 * Unmap the huge page from stage 2 after unpinning to
+		 * resume normal dirty page logging.
+		 */
+		pinned_page->unmap_after_unpin = true;
+	}
+
 pages_unlock:
 	xa_unlock(pinned_pages);
 mmu_unlock:
 	kvm_release_faultin_page(kvm, page, ret < 0, make_writable);
-	if (!ret && make_writable)
-		kvm_vcpu_mark_page_dirty(vcpu, gfn);
-
-	read_unlock(&kvm->mmu_lock);
+	if (!ret && make_writable) {
+		for (int i = 0; i < PHYS_PFN(s2_map_size); i++, gfn++)
+			mark_page_dirty_in_slot(kvm, memslot, gfn);
+	}
+	kvm_spe_unlock_mmu(kvm, exclusive_access);
 out:
 	return ret;
 }
 
-static int kvm_spe_pin_hva_locked(hva_t hva, bool make_writable, struct page **page)
+static unsigned long kvm_spe_compute_stage2_map_size(struct kvm *kvm, gfn_t gfn, hva_t hva,
+						     struct vm_area_struct *vma)
+{
+	unsigned long map_size = PAGE_SIZE;
+
+	if (is_vm_hugetlb_page(vma) && !(vma->vm_flags & VM_PFNMAP)) {
+		map_size = huge_page_size(hstate_vma(vma));
+		/* Stage 2 supports only PMD_SIZE huge mappings. */
+		if (map_size > PMD_SIZE)
+			map_size = PMD_SIZE;
+		if (!kvm_stage2_supports_map_size(gfn_to_memslot(kvm, gfn), hva, map_size))
+			map_size = PAGE_SIZE;
+	}
+
+	return map_size;
+}
+
+static int kvm_spe_pin_hva_locked(hva_t hva, bool make_writable, struct vm_area_struct *vma,
+				  struct page **page)
 {
 	unsigned int gup_flags;
 	long nr_pages;
 
+	gup_flags = FOLL_LONGTERM | FOLL_HONOR_NUMA_FAULT | FOLL_HWPOISON;
+	if (make_writable)
+		gup_flags |= FOLL_WRITE;
 	/*
-	 * FOLL_SPLIT_PMD is what allows us to ignore the order of the folio and
-	 * how the page is mapped in the host and operate on a single page
-	 * instead of a higher order folio.
+	 * When the VMA is backed by hugetlb, the memory will be mapped with a
+	 * block mapping at stage 2.
+	 *
+	 * In the non-hugetlb case, FOLL_SPLIT_PMD is what allows us to ignore
+	 * the order of the folio and how the page is mapped in the host and
+	 * operate on a single page instead of a higher order folio.
 	 *
 	 * Let's assume that we don't use FOLL_SPLIT_PMD and the pinned page is
 	 * mapped with a block mapping in the host's stage 1.  kvm_spe_map_gpa()
@@ -722,9 +834,8 @@ static int kvm_spe_pin_hva_locked(hva_t hva, bool make_writable, struct page **p
 	 * true: a higher order folio can be split into PTEs regardless of its
 	 * elevated reference count (see split_huge_pmd()).
 	 */
-	gup_flags = FOLL_LONGTERM | FOLL_SPLIT_PMD | FOLL_HONOR_NUMA_FAULT | FOLL_HWPOISON;
-	if (make_writable)
-		gup_flags |= FOLL_WRITE;
+	if (!is_vm_hugetlb_page(vma))
+		gup_flags |= FOLL_SPLIT_PMD;
 
 	nr_pages = pin_user_pages(hva, 1, gup_flags, page);
 
@@ -753,7 +864,8 @@ static int kvm_spe_find_hva(struct kvm *kvm, gfn_t gfn, bool make_writable, hva_
 	return 0;
 }
 
-static bool kvm_spe_test_gpa_pinned(struct kvm_vcpu *vcpu, gpa_t gpa, bool make_writable)
+static bool kvm_spe_test_gpa_pinned(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned long s2_map_size,
+				    bool make_writable)
 {
 	struct kvm *kvm = vcpu->kvm;
 	struct xarray *pinned_pages = &kvm->arch.kvm_spe.pinned_pages;
@@ -766,17 +878,21 @@ static bool kvm_spe_test_gpa_pinned(struct kvm_vcpu *vcpu, gpa_t gpa, bool make_
 		goto out_unlock;
 
 	/*
-	 * Only happens if the buffer overlaps with a translation table, which
-	 * is almost certainly a guest bug and hopefully exceedingly rare. To
-	 * avoid unnecessary complexity, pretend that the gpa is not pinned, and
-	 * kvm_spe_map_gpa() will fix things up. Sure, it means doing a lot of
-	 * unnecessary work, but it's all on the guest for programming the
-	 * buffer with the wrong translations.
+	 * Should never happen. The buffer is mapped at stage 2 with a block
+	 * mapping if it's backed by a hugetlb page in the host, otherwise it's
+	 * mapped with PAGE_SIZE granularity. On the host side, changing a
+	 * mapping from a PAGE_SIZE page to a hugetlb page, or the other way
+	 * around, is performed only after userspace explicitly unmaps the
+	 * memory.  kvm_spe_adjust_range_end() will unpin the affected buffer
+	 * page(s) when memory is unmapped by userspace.
 	 */
+	WARN_ON_ONCE(pp->s2_map_size != s2_map_size);
+
 	if (make_writable && !pp->writable)
 		goto out_unlock;
 
-	set_bit(vcpu->vcpu_idx, pp->vcpus);
+	if (!test_bit(vcpu->vcpu_idx, pp->vcpus))
+		set_bit(vcpu->vcpu_idx, pp->vcpus);
 
 	xa_unlock(pinned_pages);
 	return true;
@@ -793,7 +909,7 @@ static int kvm_spe_pin_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, bool make_writable)
 	struct kvm *kvm = vcpu->kvm;
 	struct xarray *pinned_pages = &kvm->arch.kvm_spe.pinned_pages;
 	struct pinned_page *pinned_page;
-	unsigned long mmu_seq, tries;
+	unsigned long mmu_seq, tries, s2_map_size;
 	struct vm_area_struct *vma;
 	gfn_t gfn = PHYS_PFN(gpa);
 	bool writable = false, mte_allowed = false;
@@ -804,32 +920,50 @@ static int kvm_spe_pin_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, bool make_writable)
 
 	WARN_ON_ONCE(!srcu_read_lock_held(&vcpu->kvm->srcu));
 
-	/*
-	 * For each buffer page, KVM needs to pin up to four pages, one for each
-	 * level of the guest's stage 1 translation tables. The first level
-	 * table is shared between each page of the buffer, and likely some of
-	 * the next levels too, so it's worth checking if a gpa is already
-	 * pinned.
-	 */
-	if (kvm_spe_test_gpa_pinned(vcpu, gpa, make_writable))
-		return 0;
-
 	ret = kvm_spe_find_hva(kvm, gfn, make_writable, &hva);
 	if (ret)
 		return ret;
 
 	scoped_guard(mmap_read_lock, current->mm) {
-		if (kvm_has_mte(kvm)) {
-			vma = vma_lookup(current->mm, hva);
-			if (!vma) {
-				kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
-				return -EFAULT;
-			}
-			mte_allowed = kvm_vma_mte_allowed(vma);
+		vma = vma_lookup(current->mm, hva);
+		if (unlikely(!vma)) {
+			kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
+			return -EFAULT;
+		}
+
+		s2_map_size = kvm_spe_compute_stage2_map_size(kvm, gfn, hva, vma);
+		if (s2_map_size == PMD_SIZE) {
+			/*
+			 * Make the adjustments before searching for the gpa in
+			 * pinned_pages.
+			 */
+			hva &= PMD_MASK;
+			gpa &= PMD_MASK;
+			gfn = PHYS_PFN(gpa);
+			/*
+			 * Dirty page tracking cannot be enabled on read-only
+			 * memslots.
+			 */
+			if (kvm_slot_dirty_track_enabled(gfn_to_memslot(kvm, gfn)))
+				make_writable = true;
 		}
-		ret = kvm_spe_pin_hva_locked(hva, make_writable, &page);
+
+		/*
+		 * For each buffer page, KVM needs to pin up to four pages, one
+		 * for each level of the guest's stage 1 translation tables. The
+		 * first level table is shared between each page of the buffer,
+		 * and likely some of the next levels too, so it's worth
+		 * checking if a gpa is already pinned.
+		 */
+		if (kvm_spe_test_gpa_pinned(vcpu, gpa, s2_map_size, make_writable))
+			return 0;
+
+		ret = kvm_spe_pin_hva_locked(hva, make_writable, vma, &page);
 		if (ret)
 			return ret;
+
+		if (kvm_has_mte(kvm))
+			mte_allowed = kvm_vma_mte_allowed(vma);
 	}
 
 	pinned_page = kzalloc(sizeof(*pinned_page), GFP_KERNEL_ACCOUNT);
@@ -848,7 +982,7 @@ static int kvm_spe_pin_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, bool make_writable)
 
 	get_page(page);
 	ret = kvm_spe_map_gpa(vcpu, gpa, hfn, page, make_writable, mte_allowed, mmu_seq,
-			      pinned_page);
+			      pinned_page, s2_map_size);
 	tries = 1;
 
 	while (ret == MAP_GPA_RET_NOTIFIER_RETRY) {
@@ -867,7 +1001,7 @@ static int kvm_spe_pin_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, bool make_writable)
 
 		hfn = kvm_faultin_pfn(vcpu, gfn, make_writable, &writable, &retry_page);
 		if (hfn == KVM_PFN_ERR_HWPOISON) {
-			send_sig_mceerr(BUS_MCEERR_AR, (void __user *)hva, PAGE_SIZE, current);
+			send_sig_mceerr(BUS_MCEERR_AR, (void __user *)hva, s2_map_size, current);
 			ret = 0;
 			goto out_release;
 		}
@@ -887,7 +1021,7 @@ static int kvm_spe_pin_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, bool make_writable)
 		}
 
 		ret = kvm_spe_map_gpa(vcpu, gpa, hfn, page, make_writable, mte_allowed, mmu_seq,
-				      pinned_page);
+				      pinned_page, s2_map_size);
 		/*
 		 * Choose the number of VCPUs as the limit on retrying because
 		 * the guest can enable SPE on all VCPUs at the same time, and
@@ -1420,6 +1554,28 @@ void kvm_spe_handle_req_memlock(struct kvm_vcpu *vcpu)
 	vcpu_spe->locked_mem_excess = 0;
 }
 
+bool kvm_spe_gfn_is_pinned(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+	struct kvm *kvm = vcpu->kvm;
+	struct xarray *pinned_pages = &kvm->arch.kvm_spe.pinned_pages;
+	bool is_pinned = false;
+	struct pinned_page *pinned_page;
+
+	xa_lock(pinned_pages);
+	if (xa_empty(pinned_pages))
+		goto unlock;
+
+	pinned_page = xa_load(pinned_pages, gfn);
+	if (pinned_page) {
+		is_pinned = true;
+		WARN_ON_ONCE(!pinned_page->writable);
+	}
+
+unlock:
+	xa_unlock(pinned_pages);
+	return is_pinned;
+}
+
 static u64 max_buffer_size_to_pmbidr_el1(u64 size)
 {
 	u64 msb_idx, num_bits;
-- 
2.51.2