[PATCH 3/4] arm64: mm: support large block mapping when rodata=full

Yang Shi yang at os.amperecomputing.com
Thu Jul 24 15:11:05 PDT 2025


When rodata=full is specified, the kernel linear mapping has to be mapped
at PTE level, since large block mappings can't be split later due to the
break-before-make rule on ARM64.

This results in a few problems:
  - performance degradation
  - more TLB pressure
  - memory waste for kernel page tables

With FEAT_BBM level 2 support, splitting a large block mapping into
smaller ones no longer requires making the page table entry invalid
first.  This allows the kernel to split large block mappings on the fly.

Add kernel page table split support and use large block mappings by
default for rodata=full when FEAT_BBM level 2 is supported.  When
changing permissions on the kernel linear mapping, the affected page
table entries are split into smaller mappings.

Machines without FEAT_BBM level 2 fall back to a PTE-mapped kernel
linear mapping when rodata=full.
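
Concretely, the fallback boils down to the following checks (simplified
from the mmu.c hunks below):

  /* permission changes: splitting is a no-op without BBML2 no-abort */
  if (!system_supports_bbml2_noabort())
          return 0;

  /* boot-time linear map: fall back to page granularity */
  if (force_pte_mapping())
          flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;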

With this we saw a significant performance boost in some benchmarks and
much lower memory consumption on my AmpereOne machine (192 cores, 1P)
with 256GB of memory.

* Memory use after boot
Before:
MemTotal:       258988984 kB
MemFree:        254821700 kB

After:
MemTotal:       259505132 kB
MemFree:        255410264 kB

Around 500MB more memory is free to use.  The larger the machine, the
more memory is saved.

* Memcached
We saw performance degradation when running the Memcached benchmark with
rodata=full vs rodata=on.  Our profiling pointed to kernel TLB pressure.
With this patchset, ops/sec increased by around 3.5% and P99 latency was
reduced by around 9.6%.
The gain mainly came from reduced kernel TLB misses.  Kernel TLB MPKI
(misses per kilo-instructions) was reduced by 28.5%.

The benchmark data is now on par with rodata=on too.

* Disk encryption (dm-crypt) benchmark
Ran the fio benchmark with the below command on a 128G ramdisk (ext4)
with disk encryption (dm-crypt).
fio --directory=/data --random_generator=lfsr --norandommap --randrepeat 1 \
    --status-interval=999 --rw=write --bs=4k --loops=1 --ioengine=sync \
    --iodepth=1 --numjobs=1 --fsync_on_close=1 --group_reporting --thread \
    --name=iops-test-job --eta-newline=1 --size 100G

IOPS increased by 90% - 150% (the variance is high, but the worst number
of the good case is around 90% higher than the best number of the bad
case).  Bandwidth increased and the average completion latency (clat)
was reduced proportionally.

* Sequential file read
Read a 100G file sequentially on XFS (xfs_io read with the page cache
populated).  Bandwidth increased by 150%.
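
For reference, a representative xfs_io invocation for this kind of read
test would be something like (not the exact command used; the buffer size
and path are illustrative):

xfs_io -c "pread -b 4m 0 100g" /mnt/xfs/testfile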

Signed-off-by: Yang Shi <yang at os.amperecomputing.com>
---
 arch/arm64/include/asm/cpufeature.h |  34 ++++
 arch/arm64/include/asm/mmu.h        |   1 +
 arch/arm64/include/asm/pgtable.h    |   5 +
 arch/arm64/kernel/cpufeature.c      |  31 +--
 arch/arm64/mm/mmu.c                 | 293 +++++++++++++++++++++++++++-
 arch/arm64/mm/pageattr.c            |   4 +
 6 files changed, 333 insertions(+), 35 deletions(-)

diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h
index bf13d676aae2..d0d394cc837d 100644
--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
@@ -871,6 +871,40 @@ static inline bool system_supports_pmuv3(void)
 	return cpus_have_final_cap(ARM64_HAS_PMUV3);
 }
 
+static inline bool bbml2_noabort_available(void)
+{
+	/*
+	 * We want to allow usage of BBML2 in as wide a range of kernel contexts
+	 * as possible. This list is therefore an allow-list of known-good
+	 * implementations that both support BBML2 and additionally, fulfill the
+	 * extra constraint of never generating TLB conflict aborts when using
+	 * the relaxed BBML2 semantics (such aborts make use of BBML2 in certain
+	 * kernel contexts difficult to prove safe against recursive aborts).
+	 *
+	 * Note that implementations can only be considered "known-good" if their
+	 * implementors attest to the fact that the implementation never raises
+	 * TLB conflict aborts for BBML2 mapping granularity changes.
+	 */
+	static const struct midr_range supports_bbml2_noabort_list[] = {
+		MIDR_REV_RANGE(MIDR_CORTEX_X4, 0, 3, 0xf),
+		MIDR_REV_RANGE(MIDR_NEOVERSE_V3, 0, 2, 0xf),
+		MIDR_ALL_VERSIONS(MIDR_AMPERE1),
+		MIDR_ALL_VERSIONS(MIDR_AMPERE1A),
+		{}
+	};
+
+	/* Does our cpu guarantee to never raise TLB conflict aborts? */
+	if (!is_midr_in_range_list(supports_bbml2_noabort_list))
+		return false;
+
+	/*
+	 * We currently ignore the ID_AA64MMFR2_EL1 register, and only care
+	 * about whether the MIDR check passes.
+	 */
+
+	return true;
+}
+
 static inline bool system_supports_bbml2_noabort(void)
 {
 	return alternative_has_cap_unlikely(ARM64_HAS_BBML2_NOABORT);
diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h
index 6e8aa8e72601..57f4b25e6f33 100644
--- a/arch/arm64/include/asm/mmu.h
+++ b/arch/arm64/include/asm/mmu.h
@@ -71,6 +71,7 @@ extern void create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys,
 			       pgprot_t prot, bool page_mappings_only);
 extern void *fixmap_remap_fdt(phys_addr_t dt_phys, int *size, pgprot_t prot);
 extern void mark_linear_text_alias_ro(void);
+extern int split_kernel_pgtable_mapping(unsigned long start, unsigned long end);
 
 /*
  * This check is triggered during the early boot before the cpufeature
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index ba63c8736666..ad2a6a7e86b0 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -371,6 +371,11 @@ static inline pmd_t pmd_mkcont(pmd_t pmd)
 	return __pmd(pmd_val(pmd) | PMD_SECT_CONT);
 }
 
+static inline pmd_t pmd_mknoncont(pmd_t pmd)
+{
+	return __pmd(pmd_val(pmd) & ~PMD_SECT_CONT);
+}
+
 #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
 static inline int pte_uffd_wp(pte_t pte)
 {
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index d2a8a509a58e..1c96016a7a41 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -2215,36 +2215,7 @@ static bool hvhe_possible(const struct arm64_cpu_capabilities *entry,
 
 static bool has_bbml2_noabort(const struct arm64_cpu_capabilities *caps, int scope)
 {
-	/*
-	 * We want to allow usage of BBML2 in as wide a range of kernel contexts
-	 * as possible. This list is therefore an allow-list of known-good
-	 * implementations that both support BBML2 and additionally, fulfill the
-	 * extra constraint of never generating TLB conflict aborts when using
-	 * the relaxed BBML2 semantics (such aborts make use of BBML2 in certain
-	 * kernel contexts difficult to prove safe against recursive aborts).
-	 *
-	 * Note that implementations can only be considered "known-good" if their
-	 * implementors attest to the fact that the implementation never raises
-	 * TLB conflict aborts for BBML2 mapping granularity changes.
-	 */
-	static const struct midr_range supports_bbml2_noabort_list[] = {
-		MIDR_REV_RANGE(MIDR_CORTEX_X4, 0, 3, 0xf),
-		MIDR_REV_RANGE(MIDR_NEOVERSE_V3, 0, 2, 0xf),
-		MIDR_ALL_VERSIONS(MIDR_AMPERE1),
-		MIDR_ALL_VERSIONS(MIDR_AMPERE1A),
-		{}
-	};
-
-	/* Does our cpu guarantee to never raise TLB conflict aborts? */
-	if (!is_midr_in_range_list(supports_bbml2_noabort_list))
-		return false;
-
-	/*
-	 * We currently ignore the ID_AA64MMFR2_EL1 register, and only care
-	 * about whether the MIDR check passes.
-	 */
-
-	return true;
+	return bbml2_noabort_available();
 }
 
 #ifdef CONFIG_ARM64_PAN
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 3d5fb37424ab..f63b39613571 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -480,6 +480,8 @@ void create_kpti_ng_temp_pgd(pgd_t *pgdir, phys_addr_t phys, unsigned long virt,
 			     int flags);
 #endif
 
+#define INVALID_PHYS_ADDR	-1
+
 static phys_addr_t __pgd_pgtable_alloc(struct mm_struct *mm,
 				       enum pgtable_type pgtable_type)
 {
@@ -487,7 +489,9 @@ static phys_addr_t __pgd_pgtable_alloc(struct mm_struct *mm,
 	struct ptdesc *ptdesc = pagetable_alloc(GFP_PGTABLE_KERNEL & ~__GFP_ZERO, 0);
 	phys_addr_t pa;
 
-	BUG_ON(!ptdesc);
+	if (!ptdesc)
+		return INVALID_PHYS_ADDR;
+
 	pa = page_to_phys(ptdesc_page(ptdesc));
 
 	switch (pgtable_type) {
@@ -509,15 +513,29 @@ static phys_addr_t __pgd_pgtable_alloc(struct mm_struct *mm,
 }
 
 static phys_addr_t __maybe_unused
-pgd_pgtable_alloc_init_mm(enum pgtable_type pgtable_type)
+split_pgtable_alloc(enum pgtable_type pgtable_type)
 {
 	return __pgd_pgtable_alloc(&init_mm, pgtable_type);
 }
 
+static phys_addr_t __maybe_unused
+pgd_pgtable_alloc_init_mm(enum pgtable_type pgtable_type)
+{
+	phys_addr_t pa;
+
+	pa = __pgd_pgtable_alloc(&init_mm, pgtable_type);
+	BUG_ON(pa == INVALID_PHYS_ADDR);
+	return pa;
+}
+
 static phys_addr_t
 pgd_pgtable_alloc_special_mm(enum pgtable_type pgtable_type)
 {
-	return __pgd_pgtable_alloc(NULL, pgtable_type);
+	phys_addr_t pa;
+
+	pa = __pgd_pgtable_alloc(NULL, pgtable_type);
+	BUG_ON(pa == INVALID_PHYS_ADDR);
+	return pa;
 }
 
 /*
@@ -552,6 +570,254 @@ void __init create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys,
 			     pgd_pgtable_alloc_special_mm, flags);
 }
 
+static DEFINE_MUTEX(pgtable_split_lock);
+
+static int split_cont_pte(pmd_t *pmdp, unsigned long addr, unsigned long end)
+{
+	pte_t *ptep;
+	unsigned long next;
+	unsigned int nr;
+	unsigned long span;
+
+	ptep = pte_offset_kernel(pmdp, addr);
+
+	do {
+		pte_t *_ptep;
+
+		nr = 0;
+		next = pte_cont_addr_end(addr, end);
+		if (next < end)
+			nr = max(nr, ((end - next) / CONT_PTE_SIZE));
+		span = nr * CONT_PTE_SIZE;
+
+		_ptep = PTR_ALIGN_DOWN(ptep, sizeof(*ptep) * CONT_PTES);
+		ptep += pte_index(next) - pte_index(addr) + nr * CONT_PTES;
+
+		if (((addr | next) & ~CONT_PTE_MASK) == 0)
+			continue;
+
+		if (!pte_cont(__ptep_get(_ptep)))
+			continue;
+
+		for (int i = 0; i < CONT_PTES; i++, _ptep++)
+			__set_pte(_ptep, pte_mknoncont(__ptep_get(_ptep)));
+	} while (addr = next + span, addr != end);
+
+	return 0;
+}
+
+static int split_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end)
+{
+	unsigned long next;
+	unsigned int nr;
+	unsigned long span;
+	int ret = 0;
+
+	do {
+		pmd_t pmd;
+
+		nr = 1;
+		next = pmd_addr_end(addr, end);
+		if (next < end)
+			nr = max(nr, ((end - next) / PMD_SIZE));
+		span = (nr - 1) * PMD_SIZE;
+
+		if (((addr | next) & ~PMD_MASK) == 0)
+			continue;
+
+		pmd = pmdp_get(pmdp);
+		if (pmd_leaf(pmd)) {
+			phys_addr_t pte_phys;
+			pte_t *ptep;
+			pmdval_t pmdval = PMD_TYPE_TABLE | PMD_TABLE_UXN |
+					  PMD_TABLE_AF;
+			unsigned long pfn = pmd_pfn(pmd);
+			pgprot_t prot = pmd_pgprot(pmd);
+
+			pte_phys = split_pgtable_alloc(TABLE_PTE);
+			if (pte_phys == INVALID_PHYS_ADDR)
+				return -ENOMEM;
+
+			if (pgprot_val(prot) & PMD_SECT_PXN)
+				pmdval |= PMD_TABLE_PXN;
+
+			prot = __pgprot((pgprot_val(prot) & ~PTE_TYPE_MASK) |
+					PTE_TYPE_PAGE);
+			prot = __pgprot(pgprot_val(prot) | PTE_CONT);
+			ptep = (pte_t *)phys_to_virt(pte_phys);
+			for (int i = 0; i < PTRS_PER_PTE; i++, ptep++, pfn++)
+				__set_pte(ptep, pfn_pte(pfn, prot));
+
+			dsb(ishst);
+
+			__pmd_populate(pmdp, pte_phys, pmdval);
+		}
+
+		ret = split_cont_pte(pmdp, addr, next);
+		if (ret)
+			break;
+	} while (pmdp += nr, addr = next + span, addr != end);
+
+	return ret;
+}
+
+static int split_cont_pmd(pud_t *pudp, unsigned long addr, unsigned long end)
+{
+	pmd_t *pmdp;
+	unsigned long next;
+	unsigned int nr;
+	unsigned long span;
+	int ret = 0;
+
+	pmdp = pmd_offset(pudp, addr);
+
+	do {
+		pmd_t *_pmdp;
+
+		nr = 0;
+		next = pmd_cont_addr_end(addr, end);
+		if (next < end)
+			nr = max(nr, ((end - next) / CONT_PMD_SIZE));
+		span = nr * CONT_PMD_SIZE;
+
+		if (((addr | next) & ~CONT_PMD_MASK) == 0) {
+			pmdp += pmd_index(next) - pmd_index(addr) +
+				nr * CONT_PMDS;
+			continue;
+		}
+
+		_pmdp = PTR_ALIGN_DOWN(pmdp, sizeof(*pmdp) * CONT_PMDS);
+		if (!pmd_cont(pmdp_get(_pmdp)))
+			goto split;
+
+		for (int i = 0; i < CONT_PMDS; i++, _pmdp++)
+			set_pmd(_pmdp, pmd_mknoncont(pmdp_get(_pmdp)));
+
+split:
+		ret = split_pmd(pmdp, addr, next);
+		if (ret)
+			break;
+
+		pmdp += pmd_index(next) - pmd_index(addr) + nr * CONT_PMDS;
+	} while (addr = next + span, addr != end);
+
+	return ret;
+}
+
+static int split_pud(p4d_t *p4dp, unsigned long addr, unsigned long end)
+{
+	pud_t *pudp;
+	unsigned long next;
+	unsigned int nr;
+	unsigned long span;
+	int ret = 0;
+
+	pudp = pud_offset(p4dp, addr);
+
+	do {
+		pud_t pud;
+
+		nr = 1;
+		next = pud_addr_end(addr, end);
+		if (next < end)
+			nr = max(nr, ((end - next) / PUD_SIZE));
+		span = (nr - 1) * PUD_SIZE;
+
+		if (((addr | next) & ~PUD_MASK) == 0)
+			continue;
+
+		pud = pudp_get(pudp);
+		if (pud_leaf(pud)) {
+			phys_addr_t pmd_phys;
+			pmd_t *pmdp;
+			pudval_t pudval = PUD_TYPE_TABLE | PUD_TABLE_UXN |
+					  PUD_TABLE_AF;
+			unsigned long pfn = pud_pfn(pud);
+			pgprot_t prot = pud_pgprot(pud);
+			unsigned int step = PMD_SIZE >> PAGE_SHIFT;
+
+			pmd_phys = split_pgtable_alloc(TABLE_PMD);
+			if (pmd_phys == INVALID_PHYS_ADDR)
+				return -ENOMEM;
+
+			if (pgprot_val(prot) & PMD_SECT_PXN)
+				pudval |= PUD_TABLE_PXN;
+
+			prot = __pgprot((pgprot_val(prot) & ~PMD_TYPE_MASK) |
+					PMD_TYPE_SECT);
+			prot = __pgprot(pgprot_val(prot) | PTE_CONT);
+			pmdp = (pmd_t *)phys_to_virt(pmd_phys);
+			for (int i = 0; i < PTRS_PER_PMD; i++, pmdp++) {
+				set_pmd(pmdp, pfn_pmd(pfn, prot));
+				pfn += step;
+			}
+
+			dsb(ishst);
+
+			__pud_populate(pudp, pmd_phys, pudval);
+		}
+
+		ret = split_cont_pmd(pudp, addr, next);
+		if (ret)
+			break;
+	} while (pudp += nr, addr = next + span, addr != end);
+
+	return ret;
+}
+
+static int split_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end)
+{
+	p4d_t *p4dp;
+	unsigned long next;
+	int ret = 0;
+
+	p4dp = p4d_offset(pgdp, addr);
+
+	do {
+		next = p4d_addr_end(addr, end);
+
+		ret = split_pud(p4dp, addr, next);
+		if (ret)
+			break;
+	} while (p4dp++, addr = next, addr != end);
+
+	return ret;
+}
+
+static int split_pgd(pgd_t *pgdp, unsigned long addr, unsigned long end)
+{
+	unsigned long next;
+	int ret = 0;
+
+	do {
+		next = pgd_addr_end(addr, end);
+		ret = split_p4d(pgdp, addr, next);
+		if (ret)
+			break;
+	} while (pgdp++, addr = next, addr != end);
+
+	return ret;
+}
+
+int split_kernel_pgtable_mapping(unsigned long start, unsigned long end)
+{
+	int ret;
+
+	if (!system_supports_bbml2_noabort())
+		return 0;
+
+	if (start != PAGE_ALIGN(start) || end != PAGE_ALIGN(end))
+		return -EINVAL;
+
+	mutex_lock(&pgtable_split_lock);
+	arch_enter_lazy_mmu_mode();
+	ret = split_pgd(pgd_offset_k(start), start, end);
+	arch_leave_lazy_mmu_mode();
+	mutex_unlock(&pgtable_split_lock);
+
+	return ret;
+}
+
 static void update_mapping_prot(phys_addr_t phys, unsigned long virt,
 				phys_addr_t size, pgprot_t prot)
 {
@@ -639,6 +905,20 @@ static inline void arm64_kfence_map_pool(phys_addr_t kfence_pool, pgd_t *pgdp) {
 
 #endif /* CONFIG_KFENCE */
 
+bool linear_map_requires_bbml2;
+
+static inline bool force_pte_mapping(void)
+{
+	/*
+	 * Can't use cpufeature API to determine whether BBM level 2
+	 * is supported or not since cpufeature have not been
+	 * finalized yet.
+	 */
+	return (!bbml2_noabort_available() && (rodata_full ||
+		arm64_kfence_can_set_direct_map() || is_realm_world())) ||
+		debug_pagealloc_enabled();
+}
+
 static void __init map_mem(pgd_t *pgdp)
 {
 	static const u64 direct_map_end = _PAGE_END(VA_BITS_MIN);
@@ -664,7 +944,9 @@ static void __init map_mem(pgd_t *pgdp)
 
 	early_kfence_pool = arm64_kfence_alloc_pool();
 
-	if (can_set_direct_map())
+	linear_map_requires_bbml2 = !force_pte_mapping() && rodata_full;
+
+	if (force_pte_mapping())
 		flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
 
 	/*
@@ -1366,7 +1648,8 @@ int arch_add_memory(int nid, u64 start, u64 size,
 
 	VM_BUG_ON(!mhp_range_allowed(start, size, true));
 
-	if (can_set_direct_map())
+	if (force_pte_mapping() ||
+	    (linear_map_requires_bbml2 && !system_supports_bbml2_noabort()))
 		flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
 
 	__create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start),
diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c
index c6a85000fa0e..6566aa9d8abb 100644
--- a/arch/arm64/mm/pageattr.c
+++ b/arch/arm64/mm/pageattr.c
@@ -140,6 +140,10 @@ static int update_range_prot(unsigned long start, unsigned long size,
 	data.set_mask = set_mask;
 	data.clear_mask = clear_mask;
 
+	ret = split_kernel_pgtable_mapping(start, start + size);
+	if (WARN_ON_ONCE(ret))
+		return ret;
+
 	arch_enter_lazy_mmu_mode();
 
 	/*
-- 
2.50.0
