[PATCH v2] arm64: Enable vmalloc-huge with ptdump

Dev Jain dev.jain at arm.com
Tue Jun 10 20:02:51 PDT 2025


On 11/06/25 12:30 am, Ryan Roberts wrote:
> On 10/06/2025 17:00, Dev Jain wrote:
>> arm64 disables vmalloc-huge when kernel page table dumping is enabled,
>> because an intermediate table may be removed, potentially causing the
>> ptdump code to dereference an invalid address. We want to be able to
>> analyze block vs page mappings for kernel mappings with ptdump, so to
>> enable vmalloc-huge with ptdump, synchronize page table removal in
>> pmd_free_pte_page()/pud_free_pmd_page() against the ptdump page table
>> walk. We use mmap_read_lock() rather than the write lock because we do
>> not need to serialize two different vm_structs against each other: two
>> vmalloc objects running this same code path will point to different
>> page tables, hence there is no race between them.
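
For the reviewers' reference, the walker side already serializes under
the init_mm mmap write lock; ptdump_walk_pgd() in mm/ptdump.c looks
roughly like this (trimmed excerpt, quoted from memory, so treat it as
a sketch):

	void ptdump_walk_pgd(struct ptdump_state *st, struct mm_struct *mm,
			     pgd_t *pgd)
	{
		const struct ptdump_range *range = st->range;

		mmap_write_lock(mm);
		while (range->start != range->end) {
			walk_page_range_novma(mm, range->start, range->end,
					      &ptdump_ops, pgd, st);
			range++;
		}
		mmap_write_unlock(mm);
	}

Since mmap_lock is an rwsem, a concurrent mmap_read_lock(&init_mm)
around the entry-clearing in the teardown path is enough to exclude an
in-progress walk.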
>>
>> For pud_free_pmd_page(), we isolate the PMD table under the lock
>> first, so that we avoid taking the lock 512 more times via
>> pmd_free_pte_page(). Note that there is no need to move
>> __flush_tlb_kernel_pgtable() to immediately after pud_clear(); the
>> only reason to do so would be if a dsb(ishst) (present in
>> __flush_tlb_kernel_pgtable()) were required immediately after
>> pud_clear(), but it is not, since the transition is from
>> valid -> invalid, not vice versa.
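
Also for reference, __flush_tlb_kernel_pgtable() on arm64
(arch/arm64/include/asm/tlbflush.h) is, quoted from memory and so to be
treated as a sketch:

	static inline void __flush_tlb_kernel_pgtable(unsigned long kaddr)
	{
		unsigned long addr = __TLBI_VADDR(kaddr, 0);

		dsb(ishst);		/* orders the preceding clear... */
		__tlbi(vaae1is, addr);	/* ...before the TLB invalidation */
		dsb(ish);
		isb();
	}

So the dsb(ishst) still precedes the TLBI; deferring the flush only
means other code can run between the clear and the barrier, which is
harmless for a valid -> invalid transition.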
>>
>> No issues were observed with mm-selftests. No issues were observed
>> while running test_vmalloc.sh in parallel with dumping the kernel page
>> table through sysfs in a loop.
>>
>> v1->v2:
>>   - Take lock only when CONFIG_PTDUMP_DEBUGFS is on
> I thought we agreed that we would use a static key and some rcu synchronize
> magic? What was the reason for taking this approach?

As I understand it, the RCU magic won't work; I explained why in my
reply here:
https://lore.kernel.org/all/6cd41ae9-303d-4eda-8d64-f7dba19eb106@arm.com/
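
To spell out the approach under discussion: as I understood it, the
idea was to make the common case free by gating the lock behind a
static key that ptdump flips around its walk, roughly like the sketch
below (all names are mine, invented for illustration):

	static DEFINE_STATIC_KEY_FALSE(ptdump_lock_key);

	/* ptdump side, around the walk */
	static_branch_enable(&ptdump_lock_key);
	synchronize_rcu();	/* wait for already-running freers */
	/* ... walk under mmap_write_lock(&init_mm) ... */
	static_branch_disable(&ptdump_lock_key);

	/* page-table-freeing side */
	static bool ptdump_synchronize_lock(void)
	{
		if (!static_branch_unlikely(&ptdump_lock_key))
			return false;	/* patched-out branch when inactive */
		mmap_read_lock(&init_mm);
		return true;
	}

	/* Pass the bool back in: the key may flip between lock/unlock */
	static void ptdump_synchronize_unlock(bool locked)
	{
		if (locked)
			mmap_read_unlock(&init_mm);
	}

My reply at the link above goes into why I don't think this closes the
race.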

>
> I'm guessing CONFIG_PTDUMP_DEBUGFS is very much a debug feature that we wouldn't
> expect to enable in production kernels; if that's the case, then perhaps this
> approach is good enough. But given Will suggested a solution that would make it
> zero overhead when ptdump is not active, why not just take that approach?
>
> Thanks,
> Ryan
>
>>   - In case of pud_free_pmd_page(), isolate the PMD table to avoid
>>     taking the lock 512 more times via pmd_free_pte_page()
>>
>> Signed-off-by: Dev Jain <dev.jain at arm.com>
>> ---
>>   arch/arm64/include/asm/vmalloc.h |  6 ++---
>>   arch/arm64/mm/mmu.c              | 43 +++++++++++++++++++++++++++++---
>>   2 files changed, 42 insertions(+), 7 deletions(-)
>>
>> diff --git a/arch/arm64/include/asm/vmalloc.h b/arch/arm64/include/asm/vmalloc.h
>> index 12f534e8f3ed..e835fd437ae0 100644
>> --- a/arch/arm64/include/asm/vmalloc.h
>> +++ b/arch/arm64/include/asm/vmalloc.h
>> @@ -12,15 +12,13 @@ static inline bool arch_vmap_pud_supported(pgprot_t prot)
>>   	/*
>>   	 * SW table walks can't handle removal of intermediate entries.
>>   	 */
>> -	return pud_sect_supported() &&
>> -	       !IS_ENABLED(CONFIG_PTDUMP_DEBUGFS);
>> +	return pud_sect_supported();
>>   }
>>   
>>   #define arch_vmap_pmd_supported arch_vmap_pmd_supported
>>   static inline bool arch_vmap_pmd_supported(pgprot_t prot)
>>   {
>> -	/* See arch_vmap_pud_supported() */
>> -	return !IS_ENABLED(CONFIG_PTDUMP_DEBUGFS);
>> +	return true;
>>   }
>>   
>>   #define arch_vmap_pte_range_map_size arch_vmap_pte_range_map_size
>> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
>> index 8fcf59ba39db..fa98a62e4baf 100644
>> --- a/arch/arm64/mm/mmu.c
>> +++ b/arch/arm64/mm/mmu.c
>> @@ -1267,7 +1267,25 @@ int pmd_clear_huge(pmd_t *pmdp)
>>   	return 1;
>>   }
>>   
>> -int pmd_free_pte_page(pmd_t *pmdp, unsigned long addr)
>> +#ifdef CONFIG_PTDUMP_DEBUGFS
>> +static inline void ptdump_synchronize_lock(void)
>> +{
>> +	/* Synchronize against ptdump_walk_pgd() */
>> +	mmap_read_lock(&init_mm);
>> +}
>> +
>> +static inline void ptdump_synchronize_unlock(void)
>> +{
>> +	mmap_read_unlock(&init_mm);
>> +}
>> +#else	/* CONFIG_PTDUMP_DEBUGFS */
>> +
>> +static inline void ptdump_synchronize_lock(void)	{}
>> +static inline void ptdump_synchronize_unlock(void)	{}
>> +
>> +#endif	/* CONFIG_PTDUMP_DEBUGFS */
>> +
>> +static int __pmd_free_pte_page(pmd_t *pmdp, unsigned long addr, bool lock)
>>   {
>>   	pte_t *table;
>>   	pmd_t pmd;
>> @@ -1280,12 +1298,23 @@ int pmd_free_pte_page(pmd_t *pmdp, unsigned long addr)
>>   	}
>>   
>>   	table = pte_offset_kernel(pmdp, addr);
>> +
>> +	if (lock)
>> +		ptdump_synchronize_lock();
>>   	pmd_clear(pmdp);
>> +	if (lock)
>> +		ptdump_synchronize_unlock();
>> +
>>   	__flush_tlb_kernel_pgtable(addr);
>>   	pte_free_kernel(NULL, table);
>>   	return 1;
>>   }
>>   
>> +int pmd_free_pte_page(pmd_t *pmdp, unsigned long addr)
>> +{
>> +	return __pmd_free_pte_page(pmdp, addr, true);
>> +}
>> +
>>   int pud_free_pmd_page(pud_t *pudp, unsigned long addr)
>>   {
>>   	pmd_t *table;
>> @@ -1301,14 +1330,22 @@ int pud_free_pmd_page(pud_t *pudp, unsigned long addr)
>>   	}
>>   
>>   	table = pmd_offset(pudp, addr);
>> +
>> +	/*
>> +	 * Isolate the PMD table; in case of a race with ptdump, this
>> +	 * lets us avoid taking the lock 512 times in __pmd_free_pte_page().
>> +	 */
>> +	ptdump_synchronize_lock();
>> +	pud_clear(pudp);
>> +	ptdump_synchronize_unlock();
>> +
>>   	pmdp = table;
>>   	next = addr;
>>   	end = addr + PUD_SIZE;
>>   	do {
>> -		pmd_free_pte_page(pmdp, next);
>> +		__pmd_free_pte_page(pmdp, next, false);
>>   	} while (pmdp++, next += PMD_SIZE, next != end);
>>   
>> -	pud_clear(pudp);
>>   	__flush_tlb_kernel_pgtable(addr);
>>   	pmd_free(NULL, table);
>>   	return 1;


