[PATCH RFC/RFT 3/4] riscv: Stop emitting preventive sfence.vma for new userspace mappings

Alexandre Ghiti alexghiti at rivosinc.com
Fri Dec 8 06:39:22 PST 2023


On Thu, Dec 7, 2023 at 5:37 PM Christophe Leroy
<christophe.leroy at csgroup.eu> wrote:
>
> The subject says "riscv:" but the patch changes core code and several
> architectures. Maybe this commit should be split in two: a first patch
> with the API changes that rename flush_tlb_fix_spurious_fault() to
> flush_tlb_fix_spurious_write_fault() and add
> flush_tlb_fix_spurious_read_fault(), including the change in memory.c,
> then a second patch with the changes to riscv.

You're right, I'll do that, thanks.
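
Something like this for the generic half, then (just a sketch mirroring
the include/linux/pgtable.h hunk below, exact naming still open):

    /* include/linux/pgtable.h: generic fallbacks after the split (sketch) */
    #ifndef flush_tlb_fix_spurious_write_fault
    #define flush_tlb_fix_spurious_write_fault(vma, address, ptep) \
            flush_tlb_page(vma, address)
    #endif

    #ifndef flush_tlb_fix_spurious_read_fault
    /* Defaults to a no-op: most architectures need no flush on a
       spurious read fault */
    #define flush_tlb_fix_spurious_read_fault(vma, address, ptep)
    #endif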

>
> On 07/12/2023 16:03, Alexandre Ghiti wrote:
> > The preventive sfence.vma instructions were emitted because new mappings
> > must be made visible to the page table walker, whether or not the uarch
> > caches invalid entries.
> >
> > Actually, there is no need to preventively emit a sfence.vma for new
> > userspace mappings: this should be handled only in the page fault path.
> >
> > This allows us to drastically reduce the number of sfence.vma emitted:
> >
> > * Ubuntu boot to login:
> > Before: ~630k sfence.vma
> > After:  ~200k sfence.vma
> >
> > * ltp - mmapstress01
> > Before: ~45k
> > After:  ~6.3k
> >
> > * lmbench - lat_pagefault
> > Before: ~665k
> > After:   832 (!)
> >
> > * lmbench - lat_mmap
> > Before: ~546k
> > After:   718 (!)
> >
> > The only issue with the removal of the sfence.vma in update_mmu_cache()
> > is that, on uarchs that cache invalid entries, those entries won't be
> > invalidated until the process takes a fault: so that's an additional
> > fault in those cases.
> >
> > Signed-off-by: Alexandre Ghiti <alexghiti at rivosinc.com>
> > ---
> >   arch/arm64/include/asm/pgtable.h              |  2 +-
> >   arch/mips/include/asm/pgtable.h               |  6 +--
> >   arch/powerpc/include/asm/book3s/64/tlbflush.h |  8 ++--
> >   arch/riscv/include/asm/pgtable.h              | 43 +++++++++++--------
> >   include/linux/pgtable.h                       |  8 +++-
> >   mm/memory.c                                   | 12 +++++-
> >   6 files changed, 48 insertions(+), 31 deletions(-)
>
> Did you forget mm/pgtable-generic.c?

Indeed, I "missed" the occurrence of flush_tlb_fix_spurious_fault()
there, thanks.
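
For reference, the occurrence there is in the generic
ptep_set_access_flags(), which from memory looks roughly like this and
will need the same write/read split:

    /* mm/pgtable-generic.c (approximate, current mainline) */
    int ptep_set_access_flags(struct vm_area_struct *vma,
                              unsigned long address, pte_t *ptep,
                              pte_t entry, int dirty)
    {
            int changed = !pte_same(ptep_get(ptep), entry);

            if (changed) {
                    set_pte_at(vma->vm_mm, address, ptep, entry);
                    /* called whenever the PTE changed, regardless of
                       the fault type */
                    flush_tlb_fix_spurious_fault(vma, address, ptep);
            }
            return changed;
    }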

>
> >
> > diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
> > index 7f7d9b1df4e5..728f25f529a5 100644
> > --- a/arch/arm64/include/asm/pgtable.h
> > +++ b/arch/arm64/include/asm/pgtable.h
> > @@ -57,7 +57,7 @@ static inline bool arch_thp_swp_supported(void)
> >    * fault on one CPU which has been handled concurrently by another CPU
> >    * does not need to perform additional invalidation.
> >    */
> > -#define flush_tlb_fix_spurious_fault(vma, address, ptep) do { } while (0)
> > +#define flush_tlb_fix_spurious_write_fault(vma, address, ptep) do { } while (0)
>
> Why do you need to make that change? Nothing about it is explained in
> the commit message.

I renamed this macro because, in the page fault path,
flush_tlb_fix_spurious_fault() is called only when the fault is a
write fault (see
https://elixir.bootlin.com/linux/latest/source/mm/memory.c#L5016).
I'll check whether that also fits the occurrence in mm/pgtable-generic.c.
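
For reference, the current caller in handle_pte_fault() is (simplified):

    /* mm/memory.c, handle_pte_fault(), before this patch */
    if (vmf->flags & FAULT_FLAG_WRITE)
            flush_tlb_fix_spurious_fault(vmf->vma, vmf->address,
                                         vmf->pte);

so the write variant keeps today's behaviour and the new read variant
only covers the else branch this patch adds.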

Thanks again for the review,

Alex

>
> >
> >   /*
> >    * ZERO_PAGE is a global shared page that is always zero: used
> > diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h
> > index 430b208c0130..84439fe6ed29 100644
> > --- a/arch/mips/include/asm/pgtable.h
> > +++ b/arch/mips/include/asm/pgtable.h
> > @@ -478,9 +478,9 @@ static inline pgprot_t pgprot_writecombine(pgprot_t _prot)
> >       return __pgprot(prot);
> >   }
> >
> > -static inline void flush_tlb_fix_spurious_fault(struct vm_area_struct *vma,
> > -                                             unsigned long address,
> > -                                             pte_t *ptep)
> > +static inline void flush_tlb_fix_spurious_write_fault(struct vm_area_struct *vma,
> > +                                                   unsigned long address,
> > +                                                   pte_t *ptep)
> >   {
> >   }
> >
> > diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush.h b/arch/powerpc/include/asm/book3s/64/tlbflush.h
> > index 1950c1b825b4..7166d56f90db 100644
> > --- a/arch/powerpc/include/asm/book3s/64/tlbflush.h
> > +++ b/arch/powerpc/include/asm/book3s/64/tlbflush.h
> > @@ -128,10 +128,10 @@ static inline void flush_tlb_page(struct vm_area_struct *vma,
> >   #define flush_tlb_page(vma, addr)   local_flush_tlb_page(vma, addr)
> >   #endif /* CONFIG_SMP */
> >
> > -#define flush_tlb_fix_spurious_fault flush_tlb_fix_spurious_fault
> > -static inline void flush_tlb_fix_spurious_fault(struct vm_area_struct *vma,
> > -                                             unsigned long address,
> > -                                             pte_t *ptep)
> > +#define flush_tlb_fix_spurious_write_fault flush_tlb_fix_spurious_write_fault
> > +static inline void flush_tlb_fix_spurious_write_fault(struct vm_area_struct *vma,
> > +                                                   unsigned long address,
> > +                                                   pte_t *ptep)
> >   {
> >       /*
> >        * Book3S 64 does not require spurious fault flushes because the PTE
> > diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
> > index b2ba3f79cfe9..89aa5650f104 100644
> > --- a/arch/riscv/include/asm/pgtable.h
> > +++ b/arch/riscv/include/asm/pgtable.h
> > @@ -472,28 +472,20 @@ static inline void update_mmu_cache_range(struct vm_fault *vmf,
> >               struct vm_area_struct *vma, unsigned long address,
> >               pte_t *ptep, unsigned int nr)
> >   {
> > -     /*
> > -      * The kernel assumes that TLBs don't cache invalid entries, but
> > -      * in RISC-V, SFENCE.VMA specifies an ordering constraint, not a
> > -      * cache flush; it is necessary even after writing invalid entries.
> > -      * Relying on flush_tlb_fix_spurious_fault would suffice, but
> > -      * the extra traps reduce performance.  So, eagerly SFENCE.VMA.
> > -      */
> > -     while (nr--)
> > -             local_flush_tlb_page(address + nr * PAGE_SIZE);
> >   }
> >   #define update_mmu_cache(vma, addr, ptep) \
> >       update_mmu_cache_range(NULL, vma, addr, ptep, 1)
> >
> >   #define __HAVE_ARCH_UPDATE_MMU_TLB
> > -#define update_mmu_tlb update_mmu_cache
> > +static inline void update_mmu_tlb(struct vm_area_struct *vma,
> > +                               unsigned long address, pte_t *ptep)
> > +{
> > +     flush_tlb_range(vma, address, address + PAGE_SIZE);
> > +}
> >
> >   static inline void update_mmu_cache_pmd(struct vm_area_struct *vma,
> >               unsigned long address, pmd_t *pmdp)
> >   {
> > -     pte_t *ptep = (pte_t *)pmdp;
> > -
> > -     update_mmu_cache(vma, address, ptep);
> >   }
> >
> >   #define __HAVE_ARCH_PTE_SAME
> > @@ -548,13 +540,26 @@ static inline int ptep_set_access_flags(struct vm_area_struct *vma,
> >                                       unsigned long address, pte_t *ptep,
> >                                       pte_t entry, int dirty)
> >   {
> > -     if (!pte_same(*ptep, entry))
> > +     if (!pte_same(*ptep, entry)) {
> >               __set_pte_at(ptep, entry);
> > -     /*
> > -      * update_mmu_cache will unconditionally execute, handling both
> > -      * the case that the PTE changed and the spurious fault case.
> > -      */
> > -     return true;
> > +             /* Only uarchs without Svadu (hardware A/D updates) are impacted here */
> > +             flush_tlb_page(vma, address);
> > +             return true;
> > +     }
> > +
> > +     return false;
> > +}
> > +
> > +extern u64 nr_sfence_vma_handle_exception;
> > +extern bool tlb_caching_invalid_entries;
> > +
> > +#define flush_tlb_fix_spurious_read_fault flush_tlb_fix_spurious_read_fault
> > +static inline void flush_tlb_fix_spurious_read_fault(struct vm_area_struct *vma,
> > +                                                  unsigned long address,
> > +                                                  pte_t *ptep)
> > +{
> > +     if (tlb_caching_invalid_entries)
> > +             flush_tlb_page(vma, address);
> >   }
> >
> >   #define __HAVE_ARCH_PTEP_GET_AND_CLEAR
> > diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
> > index af7639c3b0a3..7abaf42ef612 100644
> > --- a/include/linux/pgtable.h
> > +++ b/include/linux/pgtable.h
> > @@ -931,8 +931,12 @@ static inline void arch_swap_restore(swp_entry_t entry, struct folio *folio)
> >   # define pte_accessible(mm, pte)    ((void)(pte), 1)
> >   #endif
> >
> > -#ifndef flush_tlb_fix_spurious_fault
> > -#define flush_tlb_fix_spurious_fault(vma, address, ptep) flush_tlb_page(vma, address)
> > +#ifndef flush_tlb_fix_spurious_write_fault
> > +#define flush_tlb_fix_spurious_write_fault(vma, address, ptep) flush_tlb_page(vma, address)
> > +#endif
> > +
> > +#ifndef flush_tlb_fix_spurious_read_fault
> > +#define flush_tlb_fix_spurious_read_fault(vma, address, ptep)
> >   #endif
> >
> >   /*
> > diff --git a/mm/memory.c b/mm/memory.c
> > index 517221f01303..5cb0ccf0c03f 100644
> > --- a/mm/memory.c
> > +++ b/mm/memory.c
> > @@ -5014,8 +5014,16 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
> >                * with threads.
> >                */
> >               if (vmf->flags & FAULT_FLAG_WRITE)
> > -                     flush_tlb_fix_spurious_fault(vmf->vma, vmf->address,
> > -                                                  vmf->pte);
> > +                     flush_tlb_fix_spurious_write_fault(vmf->vma, vmf->address,
> > +                                                        vmf->pte);
> > +             else
> > +                     /*
> > +                      * Thanks to the pte_same(ptep_get(vmf->pte), entry)
> > +                      * check above, which calls update_mmu_tlb() on
> > +                      * mismatch, multiple threads faulting at the same
> > +                      * time won't get here.
> > +                      */
> > +                     flush_tlb_fix_spurious_read_fault(vmf->vma, vmf->address,
> > +                                                       vmf->pte);
> >       }
> >   unlock:
> >       pte_unmap_unlock(vmf->pte, vmf->ptl);


