[PATCH 2/3] arm64: mm: HVO: support BBM of vmemmap pgtable safely

Kefeng Wang wangkefeng.wang at huawei.com
Thu Dec 14 22:32:20 PST 2023



On 2023/12/15 12:36, Muchun Song wrote:
> 
> 
> On 2023/12/14 15:39, Nanyong Sun wrote:
>> Implement vmemmap_update_pmd and vmemmap_update_pte on arm64 to do the
>> BBM (break-before-make) logic when changing the page table of a vmemmap
>> address; both run under the init_mm.page_table_lock.
>> If a translation fault on a vmemmap address happens concurrently after
>> the pte/pmd has been cleared, the vmemmap page fault handler acquires the
>> init_mm.page_table_lock to wait for the vmemmap update to complete;
>> by then the virtual address is valid again, so the fault handler can
>> return and the access can continue.
>> In all other cases, do the traditional kernel fault handling.
> 
> Yes. BTW, we already use the same scheme to support arm64
> in our internal production. So the whole approach LGTM.
> 
>> Implement flush_tlb_vmemmap_all and flush_tlb_vmemmap_range on arm64
>> as no-ops, because the TLB is already flushed in every single BBM.
>>
>> Signed-off-by: Nanyong Sun <sunnanyong at huawei.com>
>> ---
>>   arch/arm64/include/asm/esr.h |  4 ++
>>   arch/arm64/include/asm/mmu.h | 20 ++++++++
>>   arch/arm64/mm/fault.c        | 94 ++++++++++++++++++++++++++++++++++++
>>   arch/arm64/mm/mmu.c          | 28 +++++++++++
>>   4 files changed, 146 insertions(+)
>>
>> diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h
>> index ae35939f395b..1c63256efd25 100644
>> --- a/arch/arm64/include/asm/esr.h
>> +++ b/arch/arm64/include/asm/esr.h
>> @@ -116,6 +116,10 @@
>>   #define ESR_ELx_FSC_SERROR    (0x11)
>>   #define ESR_ELx_FSC_ACCESS    (0x08)
>>   #define ESR_ELx_FSC_FAULT    (0x04)
>> +#define ESR_ELx_FSC_FAULT_L0    (0x04)
>> +#define ESR_ELx_FSC_FAULT_L1    (0x05)
>> +#define ESR_ELx_FSC_FAULT_L2    (0x06)
>> +#define ESR_ELx_FSC_FAULT_L3    (0x07)
>>   #define ESR_ELx_FSC_PERM    (0x0C)
>>   #define ESR_ELx_FSC_SEA_TTW0    (0x14)
>>   #define ESR_ELx_FSC_SEA_TTW1    (0x15)
>> diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h
>> index 2fcf51231d6e..fcec5827f54f 100644
>> --- a/arch/arm64/include/asm/mmu.h
>> +++ b/arch/arm64/include/asm/mmu.h
>> @@ -76,5 +76,25 @@ extern bool kaslr_requires_kpti(void);
>>   #define INIT_MM_CONTEXT(name)    \
>>       .pgd = init_pg_dir,
>> +#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
>> +void vmemmap_update_pmd(unsigned long start, pmd_t *pmd, pte_t *pgtable);
>> +#define vmemmap_update_pmd vmemmap_update_pmd
>> +void vmemmap_update_pte(unsigned long addr, pte_t *pte, pte_t entry);
>> +#define vmemmap_update_pte vmemmap_update_pte
>> +
>> +static inline void flush_tlb_vmemmap_all(void)
>> +{
>> +    /* do nothing, already flushed tlb in every single BBM */
>> +}
>> +#define flush_tlb_vmemmap_all flush_tlb_vmemmap_all
>> +
>> +static inline void flush_tlb_vmemmap_range(unsigned long start,
>> +                       unsigned long end)
>> +{
>> +    /* do nothing, already flushed tlb in every single BBM */
>> +}
>> +#define flush_tlb_vmemmap_range flush_tlb_vmemmap_range
>> +#endif
>> +
>>   #endif    /* !__ASSEMBLY__ */
>>   #endif
>> diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
>> index 460d799e1296..7066a273c1e0 100644
>> --- a/arch/arm64/mm/fault.c
>> +++ b/arch/arm64/mm/fault.c
>> @@ -368,6 +368,97 @@ static bool is_el1_mte_sync_tag_check_fault(unsigned long esr)
>>       return false;
>>   }
>> +#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
>> +static inline bool is_vmemmap_address(unsigned long addr)
>> +{
>> +    return (addr >= VMEMMAP_START) && (addr < VMEMMAP_END);
>> +}
>> +
>> +static inline bool vmemmap_fault_may_fixup(unsigned long addr,
>> +                       unsigned long esr)
>> +{
>> +    if (!is_vmemmap_address(addr))
>> +        return false;
>> +

Squash the vmemmap range check into this function.
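I.e. something like this (just an untested sketch of what I mean), open-coding
the range check here and dropping the separate is_vmemmap_address() helper:

static inline bool vmemmap_fault_may_fixup(unsigned long addr,
					   unsigned long esr)
{
	if (addr < VMEMMAP_START || addr >= VMEMMAP_END)
		return false;

	/*
	 * Only try to handle a level 2 or level 3 translation fault,
	 * because the hugetlb vmemmap optimization only clears pmds or ptes.
	 */
	switch (esr & ESR_ELx_FSC) {
	case ESR_ELx_FSC_FAULT_L2:
	case ESR_ELx_FSC_FAULT_L3:
		return true;
	default:
		return false;
	}
}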

>> +    /*
>> +     * Only try to handle a level 2 or level 3 translation fault,
>> +     * because the hugetlb vmemmap optimization only clears pmds or ptes.
>> +     */
>> +    switch (esr & ESR_ELx_FSC) {
>> +    case ESR_ELx_FSC_FAULT_L2:
>> +    case ESR_ELx_FSC_FAULT_L3:
>> +        return true;
>> +    default:
>> +        return false;
>> +    }
>> +}
>> +
>> +/*
>> + * The PMD-mapped vmemmap should have been split into PTE mappings
>> + * by HVO by now; only this case is checked here, any other case
>> + * should fail.
>> + * Also check that the address is healthy enough that it will not cause
>> + * another level 2 or level 3 translation fault after the page fault
>> + * has been handled successfully, so check bits[1:0] of both the PMD
>> + * and the PTE, as the Arm ARM states:
>> + * A Translation fault is generated if bits[1:0] of a translation
>> + * table descriptor identify the descriptor as either a Fault
>> + * encoding or a reserved encoding.
>> + */
>> +static inline bool vmemmap_addr_healthy(unsigned long addr)
>> +{
>> +    pgd_t *pgdp;
>> +    p4d_t *p4dp;
>> +    pud_t *pudp, pud;
>> +    pmd_t *pmdp, pmd;
>> +    pte_t *ptep, pte;
>> +
>> +    pgdp = pgd_offset_k(addr);
>> +    if (pgd_none(READ_ONCE(*pgdp)))
>> +        return false;
>> +
>> +    p4dp = p4d_offset(pgdp, addr);
>> +    if (p4d_none(READ_ONCE(*p4dp)))
>> +        return false;
>> +
>> +    pudp = pud_offset(p4dp, addr);
>> +    pud = READ_ONCE(*pudp);
>> +    if (pud_none(pud))
>> +        return false;
>> +
>> +    pmdp = pmd_offset(pudp, addr);
> 
> We already make sure it is a translation fault of level 2 or 3
> here, so we could use pmd_offset_k() macro to simplify the code
> a little. Right?
> 
>> +    pmd = READ_ONCE(*pmdp);
>> +    if (!pmd_table(pmd))
>> +        return false;
>> +
>> +    ptep = pte_offset_kernel(pmdp, addr);
>> +    pte = READ_ONCE(*ptep);
> 
> Please use ptep_get (which is supposed to do this) to access the
> raw pte, see commit c33c794828f21217.
> 
>> +    return (pte_val(pte) & PTE_TYPE_MASK) == PTE_TYPE_PAGE;
>> +}
>> +
>> +static bool vmemmap_handle_page_fault(unsigned long addr,
>> +                      unsigned long esr)
>> +{
>> +    bool ret = false;
>> +
>> +    if (likely(!vmemmap_fault_may_fixup(addr, esr)))
>> +        return false;
>> +
>> +    spin_lock(&init_mm.page_table_lock);
>> +    if (vmemmap_addr_healthy(addr))
>> +        ret = true;
> 
> It is better to assign the return value to ret directly. Like:
> 
>          ret = vmemmap_addr_healthy(addr);
> 
> Then the initialization of ret can also be dropped.
> 
>> +    spin_unlock(&init_mm.page_table_lock);
>> +
>> +    return ret;
>> +}
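+1. With that change the helper would reduce to something like this
(untested sketch):

static bool vmemmap_handle_page_fault(unsigned long addr,
				      unsigned long esr)
{
	bool ret;

	if (likely(!vmemmap_fault_may_fixup(addr, esr)))
		return false;

	spin_lock(&init_mm.page_table_lock);
	ret = vmemmap_addr_healthy(addr);
	spin_unlock(&init_mm.page_table_lock);

	return ret;
}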
>> +#else
>> +static inline bool vmemmap_handle_page_fault(unsigned long addr,
>> +                         unsigned long esr)
>> +{
>> +    return false;
>> +}
>> +#endif /*CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP */
>            ^
> Miss a blank between "*" and "C" here.
> 
> Thanks.
> 
>> +
>>   static bool is_translation_fault(unsigned long esr)
>>   {
>>       return (esr & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_FAULT;
>> @@ -409,6 +500,9 @@ static void __do_kernel_fault(unsigned long addr, unsigned long esr,
>>               kfence_handle_page_fault(addr, esr & ESR_ELx_WNR, regs))
>>               return;
>> +        if (vmemmap_handle_page_fault(addr, esr))
>> +            return;
>> +

Better to move under is_translation_fault()?

if (is_translation_fault(esr)) {
	if (kfence_handle_page_fault(addr, esr & ESR_ELx_WNR, regs))
		return;

	if (vmemmap_handle_page_fault(addr, esr))
		return;
}



>>           msg = "paging request";
>>       }
>> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
>> index 15f6347d23b6..81a600ccac7c 100644
>> --- a/arch/arm64/mm/mmu.c
>> +++ b/arch/arm64/mm/mmu.c
>> @@ -1146,6 +1146,34 @@ int __meminit vmemmap_check_pmd(pmd_t *pmdp, int node,
>>       return 1;
>>   }
>> +#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
>> +/*
>> + * In the window between the page table entry being cleared and
>> + * refilled with a new value, other threads may concurrently access
>> + * the vmemmap area and take a translation fault.
>> + * Therefore, init_mm.page_table_lock must be held to synchronize with
>> + * the vmemmap page fault handling, which waits for this lock to be
>> + * released to ensure that the page table entry has been refreshed
>> + * with a new, valid value.
>> + */
>> +void vmemmap_update_pmd(unsigned long start, pmd_t *pmd, pte_t *pgtable)
>> +{
>> +    lockdep_assert_held(&init_mm.page_table_lock);
>> +    pmd_clear(pmd);
>> +    flush_tlb_kernel_range(start, start + PMD_SIZE);
>> +    pmd_populate_kernel(&init_mm, pmd, pgtable);
>> +}
>> +
>> +void vmemmap_update_pte(unsigned long addr, pte_t *pte, pte_t entry)
>> +{
>> +    spin_lock(&init_mm.page_table_lock);
>> +    pte_clear(&init_mm, addr, pte);
>> +    flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
>> +    set_pte_at(&init_mm, addr, pte, entry);
>> +    spin_unlock(&init_mm.page_table_lock);
>> +}
>> +#endif
>> +
>>   int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
>>           struct vmem_altmap *altmap)
>>   {
> 


