[PATCH -v2 2/2] arm64, tlbflush: don't TLBI broadcast if page reused in write fault
Barry Song
21cnbao at gmail.com
Tue Oct 21 21:08:47 PDT 2025
> diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
> index aa89c2e67ebc..35bae2e4bcfe 100644
> --- a/arch/arm64/include/asm/pgtable.h
> +++ b/arch/arm64/include/asm/pgtable.h
> @@ -130,12 +130,16 @@ static inline void arch_leave_lazy_mmu_mode(void)
> #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
>
> /*
> - * Outside of a few very special situations (e.g. hibernation), we always
> - * use broadcast TLB invalidation instructions, therefore a spurious page
> - * fault on one CPU which has been handled concurrently by another CPU
> - * does not need to perform additional invalidation.
> + * We use a local TLB invalidation instruction when reusing a page in
> + * the write protection fault handler, to avoid a TLBI broadcast in
> + * the hot path. This can cause spurious page faults if stale
> + * read-only TLB entries exist.
> */
> -#define flush_tlb_fix_spurious_fault(vma, address, ptep) do { } while (0)
> +#define flush_tlb_fix_spurious_fault(vma, address, ptep) \
> + local_flush_tlb_page_nonotify(vma, address)
> +
> +#define flush_tlb_fix_spurious_fault_pmd(vma, address, pmdp) \
> + local_flush_tlb_page_nonotify(vma, address)
>
> /*
> * ZERO_PAGE is a global shared page that is always zero: used
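For context, the generic fault path invokes this hook roughly as
follows (paraphrased from mm/memory.c:handle_pte_fault(); the exact
code may differ by kernel version), so a spurious write fault on a
reused page now pays a local rather than a broadcast invalidation:

	/* paraphrased from handle_pte_fault(); may differ by version */
	if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte,
				  entry, vmf->flags & FAULT_FLAG_WRITE)) {
		update_mmu_cache_range(vmf, vmf->vma, vmf->address,
				       vmf->pte, 1);
	} else {
		/*
		 * The PTE was unchanged: another CPU handled the fault
		 * concurrently, so only a stale local TLB entry needs
		 * flushing.
		 */
		if (vmf->flags & FAULT_FLAG_WRITE)
			flush_tlb_fix_spurious_fault(vmf->vma,
						     vmf->address,
						     vmf->pte);
	}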
> diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h
> index 18a5dc0c9a54..651b31fd18bb 100644
> --- a/arch/arm64/include/asm/tlbflush.h
> +++ b/arch/arm64/include/asm/tlbflush.h
> @@ -249,6 +249,18 @@ static inline unsigned long get_trans_granule(void)
> * cannot be easily determined, the value TLBI_TTL_UNKNOWN will
> * perform a non-hinted invalidation.
> *
> + * local_flush_tlb_page(vma, addr)
> + * Local variant of flush_tlb_page(). Stale TLB entries may
> + * remain in remote CPUs.
> + *
> + * local_flush_tlb_page_nonotify(vma, addr)
> + * Same as local_flush_tlb_page() except that the MMU notifier is
> + * not called.
> + *
> + * local_flush_tlb_contpte_range(vma, start, end)
> + * Invalidate the virtual-address range '[start, end)' mapped with
> + * contpte on the local CPU for the user address space corresponding
> + * to 'vma->mm'. Stale TLB entries may remain in remote CPUs.
> *
> * Finally, take a look at asm/tlb.h to see how tlb_flush() is implemented
> * on top of these routines, since that is our interface to the mmu_gather
> @@ -282,6 +294,33 @@ static inline void flush_tlb_mm(struct mm_struct *mm)
> mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL);
> }
>
> +static inline void __local_flush_tlb_page_nonotify_nosync(
> + struct mm_struct *mm, unsigned long uaddr)
> +{
> + unsigned long addr;
> +
> + dsb(nshst);
We were issuing dsb(ishst) even for the nosync case, likely to ensure
PTE visibility across cores. However, since set_ptes() already includes
a dsb(ishst) via __set_pte_complete(), does this mean we're being
overly cautious in __flush_tlb_page_nosync() in many cases?
static inline void __flush_tlb_page_nosync(struct mm_struct *mm,
					   unsigned long uaddr)
{
	unsigned long addr;

	dsb(ishst);
	addr = __TLBI_VADDR(uaddr, ASID(mm));
	__tlbi(vale1is, addr);
	__tlbi_user(vale1is, addr);
	mmu_notifier_arch_invalidate_secondary_tlbs(mm, uaddr & PAGE_MASK,
						    (uaddr & PAGE_MASK) +
						    PAGE_SIZE);
}
On the other hand, __ptep_set_access_flags() doesn't appear to use
set_ptes(), so there is no such guarantee that the updated PTE is
visible to all cores. If a remote CPU later takes a page fault and
performs its own TLB invalidation, is it guaranteed to observe the
updated PTE?
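For reference, the tail of __ptep_set_access_flags() looks roughly
like this (paraphrased from arch/arm64/mm/fault.c; details may differ
across kernel versions). The PTE is updated with cmpxchg_relaxed(),
which carries no ordering guarantee of its own:

	/* paraphrased sketch; may differ by kernel version */
	pteval = pte_val(pte);
	do {
		old_pteval = pteval;
		pteval ^= PTE_RDONLY;
		pteval |= pte_val(entry);
		pteval ^= PTE_RDONLY;
		/* relaxed cmpxchg: no barrier orders this store */
		pteval = cmpxchg_relaxed(&pte_val(*ptep), old_pteval,
					 pteval);
	} while (pteval != old_pteval);

	/* Invalidate a stale read-only entry */
	if (dirty)
		flush_tlb_page(vma, address);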
> + addr = __TLBI_VADDR(uaddr, ASID(mm));
> + __tlbi(vale1, addr);
> + __tlbi_user(vale1, addr);
> +}
> +
> +static inline void local_flush_tlb_page_nonotify(
> + struct vm_area_struct *vma, unsigned long uaddr)
> +{
> + __local_flush_tlb_page_nonotify_nosync(vma->vm_mm, uaddr);
> + dsb(nsh);
> +}
> +
> +static inline void local_flush_tlb_page(struct vm_area_struct *vma,
> + unsigned long uaddr)
> +{
> + __local_flush_tlb_page_nonotify_nosync(vma->vm_mm, uaddr);
> + mmu_notifier_arch_invalidate_secondary_tlbs(vma->vm_mm, uaddr & PAGE_MASK,
> + (uaddr & PAGE_MASK) + PAGE_SIZE);
> + dsb(nsh);
> +}
> +
> static inline void __flush_tlb_page_nosync(struct mm_struct *mm,
> unsigned long uaddr)
> {
> @@ -472,6 +511,23 @@ static inline void __flush_tlb_range(struct vm_area_struct *vma,
> dsb(ish);
> }
>
We already have functions like __flush_tlb_page_nosync() and
__flush_tlb_range_nosync(). Is there a way to factor out their common
parts, or is it the difference in barriers (nsh vs. ish) that makes
such extraction infeasible?
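As a thought experiment, a minimal sketch (purely hypothetical, not in
this patch; the helper name and the 'local' flag are invented) could
key the barrier and TLBI op on a compile-time flag, so that the
branches fold away after inlining:

/* Hypothetical common helper, not in the patch. dsb() and __tlbi()
 * take bare tokens, so the variants are selected with branches; a
 * compile-time-constant 'local' lets the compiler discard them.
 */
static inline void __flush_tlb_page_common(struct mm_struct *mm,
					   unsigned long uaddr,
					   bool local)
{
	unsigned long addr;

	if (local)
		dsb(nshst);
	else
		dsb(ishst);

	addr = __TLBI_VADDR(uaddr, ASID(mm));
	if (local) {
		__tlbi(vale1, addr);
		__tlbi_user(vale1, addr);
	} else {
		__tlbi(vale1is, addr);
		__tlbi_user(vale1is, addr);
	}
}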
Thanks
Barry