[PATCH V2 2/2] arm64/mm: Reject memory removal that splits a kernel leaf mapping
Anshuman Khandual
anshuman.khandual at arm.com
Tue Feb 10 01:59:36 PST 2026
On 10/02/26 3:17 PM, Ryan Roberts wrote:
> On 03/02/2026 13:03, Anshuman Khandual wrote:
>> Linear and vmemmap mappings that get torn down during a memory hot remove
>> operation might contain leaf level entries on any page table level. If the
>> requested memory range's linear or vmemmap mappings fall within such leaf
>> entries, new mappings need to be created for the remaining memory mapped on
>> the leaf entry earlier, following standard break before make aka BBM rules.
>> But kernel cannot tolerate BBM amd hemce remapping to fine grained leaves
>
> nits: ^^^ ^^^^^
Yikes ! will fix the typos.
>
>> would not be possible on systems without BBML2_NOABORT.
>>
>> Currently memory hot remove operation does not perform such restructuring,
>> and so removing memory ranges that could split a kernel leaf level mapping
>> need to be rejected.
>>
>> While memory_hotplug.c does appear to permit hot removing arbitrary ranges
>> of memory, the higher layers that drive memory_hotplug (e.g. ACPI, virtio,
>> ...) all appear to treat memory as fixed size devices. So it is impossible
>> to hotunplug a different amount than was previously hotplugged, and hence
>> we should never see a rejection in practice, but adding the check makes us
>> robust against a future change.
>>
>> Cc: Catalin Marinas <catalin.marinas at arm.com>
>> Cc: Will Deacon <will at kernel.org>
>> Cc: linux-arm-kernel at lists.infradead.org
>> Cc: linux-kernel at vger.kernel.org
>> Closes: https://lore.kernel.org/all/aWZYXhrT6D2M-7-N@willie-the-truck/
>> Fixes: bbd6ec605c0f ("arm64/mm: Enable memory hot remove")
>> Cc: stable at vger.kernel.org
>
> Given your statement in the commit log above about how this never happens in
> practice but is instead to make us robust to any future changes, perhaps this is
> not a stable candidate?
>
> For clarity; I do agree that this extra robustness is useful so I think we
> should aim to get it upstream, I just don't think it needs to go to stable.
Will drop the `Cc: stable` reference.
>
>> Suggested-by: Ryan Roberts <ryan.roberts at arm.com>
>> Signed-off-by: Anshuman Khandual <anshuman.khandual at arm.com>
>
> Other than above nit, LGTM:
>
> Reviewed-by: Ryan Roberts <ryan.roberts at arm.com>
Thanks !
Will respin the series on the upcoming -rc1 version.
>
>> ---
>> arch/arm64/mm/mmu.c | 155 ++++++++++++++++++++++++++++++++++++++++++--
>> 1 file changed, 149 insertions(+), 6 deletions(-)
>>
>> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
>> index 8ec8a287aaa1..3fb9bcbd739a 100644
>> --- a/arch/arm64/mm/mmu.c
>> +++ b/arch/arm64/mm/mmu.c
>> @@ -2063,6 +2063,142 @@ void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
>> __remove_pgd_mapping(swapper_pg_dir, __phys_to_virt(start), size);
>> }
>>
>> +
>> +static bool addr_splits_kernel_leaf(unsigned long addr)
>> +{
>> + pgd_t *pgdp, pgd;
>> + p4d_t *p4dp, p4d;
>> + pud_t *pudp, pud;
>> + pmd_t *pmdp, pmd;
>> + pte_t *ptep, pte;
>> +
>> + /*
>> + * PGD level:
>> + *
>> + * If addr is PGD_SIZE aligned - already on a leaf boundary
>> + */
>> + if (ALIGN_DOWN(addr, PGDIR_SIZE) == addr)
>> + return false;
>> +
>> + pgdp = pgd_offset_k(addr);
>> + pgd = pgdp_get(pgdp);
>> + if (!pgd_present(pgd))
>> + return false;
>> +
>> + /*
>> + * P4D level:
>> + *
>> + * If addr is P4D_SIZE aligned - already on a leaf boundary
>> + */
>> + if (ALIGN_DOWN(addr, P4D_SIZE) == addr)
>> + return false;
>> +
>> + p4dp = p4d_offset(pgdp, addr);
>> + p4d = p4dp_get(p4dp);
>> + if (!p4d_present(p4d))
>> + return false;
>> +
>> + /*
>> + * PUD level:
>> + *
>> + * If addr is PUD_SIZE aligned - already on a leaf boundary
>> + */
>> + if (ALIGN_DOWN(addr, PUD_SIZE) == addr)
>> + return false;
>> +
>> + pudp = pud_offset(p4dp, addr);
>> + pud = pudp_get(pudp);
>> + if (!pud_present(pud))
>> + return false;
>> +
>> + if (pud_leaf(pud))
>> + return true;
>> +
>> + /*
>> + * CONT_PMD level:
>> + *
>> + * If addr is CONT_PMD_SIZE aligned - already on a leaf boundary
>> + */
>> + if (ALIGN_DOWN(addr, CONT_PMD_SIZE) == addr)
>> + return false;
>> +
>> + pmdp = pmd_offset(pudp, addr);
>> + pmd = pmdp_get(pmdp);
>> + if (!pmd_present(pmd))
>> + return false;
>> +
>> + if (pmd_cont(pmd))
>> + return true;
>> +
>> + /*
>> + * PMD level:
>> + *
>> + * If addr is PMD_SIZE aligned - already on a leaf boundary
>> + */
>> + if (ALIGN_DOWN(addr, PMD_SIZE) == addr)
>> + return false;
>> +
>> + if (pmd_leaf(pmd))
>> + return true;
>> +
>> + /*
>> + * CONT_PTE level:
>> + *
>> + * If addr is CONT_PTE_SIZE aligned - already on a leaf boundary
>> + */
>> + if (ALIGN_DOWN(addr, CONT_PTE_SIZE) == addr)
>> + return false;
>> +
>> + ptep = pte_offset_kernel(pmdp, addr);
>> + pte = __ptep_get(ptep);
>> + if (!pte_present(pte))
>> + return false;
>> +
>> + if (pte_cont(pte))
>> + return true;
>> +
>> + /*
>> + * PTE level:
>> + *
>> + * If addr is PAGE_SIZE aligned - already on a leaf boundary
>> + */
>> + if (ALIGN_DOWN(addr, PAGE_SIZE) == addr)
>> + return false;
>> + return true;
>> +}
>> +
>> +static bool can_unmap_without_split(unsigned long pfn, unsigned long nr_pages)
>> +{
>> + unsigned long phys_start, phys_end, size, start, end;
>> +
>> + phys_start = PFN_PHYS(pfn);
>> + phys_end = phys_start + nr_pages * PAGE_SIZE;
>> +
>> + /*
>> + * PFN range's linear map edges are leaf entry aligned
>> + */
>> + start = __phys_to_virt(phys_start);
>> + end = __phys_to_virt(phys_end);
>> + if (addr_splits_kernel_leaf(start) || addr_splits_kernel_leaf(end)) {
>> + pr_warn("[%lx %lx] splits a leaf entry in linear map\n",
>> + phys_start, phys_end);
>> + return false;
>> + }
>> +
>> + /*
>> + * PFN range's vmemmap edges are leaf entry aligned
>> + */
>> + size = nr_pages * sizeof(struct page);
>> + start = (unsigned long)pfn_to_page(pfn);
>> + end = start + size;
>> + if (addr_splits_kernel_leaf(start) || addr_splits_kernel_leaf(end)) {
>> + pr_warn("[%lx %lx] splits a leaf entry in vmemmap\n",
>> + phys_start, phys_end);
>> + return false;
>> + }
>> + return true;
>> +}
>> +
>> /*
>> * This memory hotplug notifier helps prevent boot memory from being
>> * inadvertently removed as it blocks pfn range offlining process in
>> @@ -2071,8 +2207,11 @@ void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
>> * In future if and when boot memory could be removed, this notifier
>> * should be dropped and free_hotplug_page_range() should handle any
>> * reserved pages allocated during boot.
>> + *
>> + * This also blocks any memory remove that would have caused a split
>> + * in leaf entry in kernel linear or vmemmap mapping.
>> */
>> -static int prevent_bootmem_remove_notifier(struct notifier_block *nb,
>> +static int prevent_memory_remove_notifier(struct notifier_block *nb,
>> unsigned long action, void *data)
>> {
>> struct mem_section *ms;
>> @@ -2118,11 +2257,15 @@ static int prevent_bootmem_remove_notifier(struct notifier_block *nb,
>> return NOTIFY_DONE;
>> }
>> }
>> +
>> + if (!can_unmap_without_split(pfn, arg->nr_pages))
>> + return NOTIFY_BAD;
>> +
>> return NOTIFY_OK;
>> }
>>
>> -static struct notifier_block prevent_bootmem_remove_nb = {
>> - .notifier_call = prevent_bootmem_remove_notifier,
>> +static struct notifier_block prevent_memory_remove_nb = {
>> + .notifier_call = prevent_memory_remove_notifier,
>> };
>>
>> /*
>> @@ -2172,7 +2315,7 @@ static void validate_bootmem_online(void)
>> }
>> }
>>
>> -static int __init prevent_bootmem_remove_init(void)
>> +static int __init prevent_memory_remove_init(void)
>> {
>> int ret = 0;
>>
>> @@ -2180,13 +2323,13 @@ static int __init prevent_bootmem_remove_init(void)
>> return ret;
>>
>> validate_bootmem_online();
>> - ret = register_memory_notifier(&prevent_bootmem_remove_nb);
>> + ret = register_memory_notifier(&prevent_memory_remove_nb);
>> if (ret)
>> pr_err("%s: Notifier registration failed %d\n", __func__, ret);
>>
>> return ret;
>> }
>> -early_initcall(prevent_bootmem_remove_init);
>> +early_initcall(prevent_memory_remove_init);
>> #endif
>>
>> pte_t modify_prot_start_ptes(struct vm_area_struct *vma, unsigned long addr,
>
More information about the linux-arm-kernel
mailing list