[PATCH RFC/RFT 3/4] riscv: Stop emitting preventive sfence.vma for new userspace mappings

Christophe Leroy christophe.leroy at csgroup.eu
Thu Dec 7 08:37:04 PST 2023


The subject says "riscv:" but it changes core part and several arch. 
Maybe this commit should be split in two commits, one for API changes 
that changes flush_tlb_fix_spurious_fault() to 
flush_tlb_fix_spurious_write_fault() and adds 
flush_tlb_fix_spurious_read_fault() including the change in memory.c, 
then a second patch with the changes to riscv.

Le 07/12/2023 à 16:03, Alexandre Ghiti a écrit :
> The preventive sfence.vma were emitted because new mappings must be made
> visible to the page table walker, either the uarch caches invalid
> entries or not.
> 
> Actually, there is no need to preventively sfence.vma on new mappings for
> userspace, this should be handled only in the page fault path.
> 
> This allows to drastically reduce the number of sfence.vma emitted:
> 
> * Ubuntu boot to login:
> Before: ~630k sfence.vma
> After:  ~200k sfence.vma
> 
> * ltp - mmapstress01
> Before: ~45k
> After:  ~6.3k
> 
> * lmbench - lat_pagefault
> Before: ~665k
> After:   832 (!)
> 
> * lmbench - lat_mmap
> Before: ~546k
> After:   718 (!)
> 
> The only issue with the removal of sfence.vma in update_mmu_cache() is
> that on uarchs that cache invalid entries, those won't be invalidated
> until the process takes a fault: so that's an additional fault in those
> cases.
> 
> Signed-off-by: Alexandre Ghiti <alexghiti at rivosinc.com>
> ---
>   arch/arm64/include/asm/pgtable.h              |  2 +-
>   arch/mips/include/asm/pgtable.h               |  6 +--
>   arch/powerpc/include/asm/book3s/64/tlbflush.h |  8 ++--
>   arch/riscv/include/asm/pgtable.h              | 43 +++++++++++--------
>   include/linux/pgtable.h                       |  8 +++-
>   mm/memory.c                                   | 12 +++++-
>   6 files changed, 48 insertions(+), 31 deletions(-)

Did you forget mm/pgtable-generic.c ?

> 
> diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
> index 7f7d9b1df4e5..728f25f529a5 100644
> --- a/arch/arm64/include/asm/pgtable.h
> +++ b/arch/arm64/include/asm/pgtable.h
> @@ -57,7 +57,7 @@ static inline bool arch_thp_swp_supported(void)
>    * fault on one CPU which has been handled concurrently by another CPU
>    * does not need to perform additional invalidation.
>    */
> -#define flush_tlb_fix_spurious_fault(vma, address, ptep) do { } while (0)
> +#define flush_tlb_fix_spurious_write_fault(vma, address, ptep) do { } while (0)

Why do you need to do that change ? Nothing is explained about that in 
the commit message.

>   
>   /*
>    * ZERO_PAGE is a global shared page that is always zero: used
> diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h
> index 430b208c0130..84439fe6ed29 100644
> --- a/arch/mips/include/asm/pgtable.h
> +++ b/arch/mips/include/asm/pgtable.h
> @@ -478,9 +478,9 @@ static inline pgprot_t pgprot_writecombine(pgprot_t _prot)
>   	return __pgprot(prot);
>   }
>   
> -static inline void flush_tlb_fix_spurious_fault(struct vm_area_struct *vma,
> -						unsigned long address,
> -						pte_t *ptep)
> +static inline void flush_tlb_fix_spurious_write_fault(struct vm_area_struct *vma,
> +						      unsigned long address,
> +						      pte_t *ptep)
>   {
>   }
>   
> diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush.h b/arch/powerpc/include/asm/book3s/64/tlbflush.h
> index 1950c1b825b4..7166d56f90db 100644
> --- a/arch/powerpc/include/asm/book3s/64/tlbflush.h
> +++ b/arch/powerpc/include/asm/book3s/64/tlbflush.h
> @@ -128,10 +128,10 @@ static inline void flush_tlb_page(struct vm_area_struct *vma,
>   #define flush_tlb_page(vma, addr)	local_flush_tlb_page(vma, addr)
>   #endif /* CONFIG_SMP */
>   
> -#define flush_tlb_fix_spurious_fault flush_tlb_fix_spurious_fault
> -static inline void flush_tlb_fix_spurious_fault(struct vm_area_struct *vma,
> -						unsigned long address,
> -						pte_t *ptep)
> +#define flush_tlb_fix_spurious_write_fault flush_tlb_fix_spurious_write_fault
> +static inline void flush_tlb_fix_spurious_write_fault(struct vm_area_struct *vma,
> +						      unsigned long address,
> +						      pte_t *ptep)
>   {
>   	/*
>   	 * Book3S 64 does not require spurious fault flushes because the PTE
> diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
> index b2ba3f79cfe9..89aa5650f104 100644
> --- a/arch/riscv/include/asm/pgtable.h
> +++ b/arch/riscv/include/asm/pgtable.h
> @@ -472,28 +472,20 @@ static inline void update_mmu_cache_range(struct vm_fault *vmf,
>   		struct vm_area_struct *vma, unsigned long address,
>   		pte_t *ptep, unsigned int nr)
>   {
> -	/*
> -	 * The kernel assumes that TLBs don't cache invalid entries, but
> -	 * in RISC-V, SFENCE.VMA specifies an ordering constraint, not a
> -	 * cache flush; it is necessary even after writing invalid entries.
> -	 * Relying on flush_tlb_fix_spurious_fault would suffice, but
> -	 * the extra traps reduce performance.  So, eagerly SFENCE.VMA.
> -	 */
> -	while (nr--)
> -		local_flush_tlb_page(address + nr * PAGE_SIZE);
>   }
>   #define update_mmu_cache(vma, addr, ptep) \
>   	update_mmu_cache_range(NULL, vma, addr, ptep, 1)
>   
>   #define __HAVE_ARCH_UPDATE_MMU_TLB
> -#define update_mmu_tlb update_mmu_cache
> +static inline void update_mmu_tlb(struct vm_area_struct *vma,
> +				  unsigned long address, pte_t *ptep)
> +{
> +	flush_tlb_range(vma, address, address + PAGE_SIZE);
> +}
>   
>   static inline void update_mmu_cache_pmd(struct vm_area_struct *vma,
>   		unsigned long address, pmd_t *pmdp)
>   {
> -	pte_t *ptep = (pte_t *)pmdp;
> -
> -	update_mmu_cache(vma, address, ptep);
>   }
>   
>   #define __HAVE_ARCH_PTE_SAME
> @@ -548,13 +540,26 @@ static inline int ptep_set_access_flags(struct vm_area_struct *vma,
>   					unsigned long address, pte_t *ptep,
>   					pte_t entry, int dirty)
>   {
> -	if (!pte_same(*ptep, entry))
> +	if (!pte_same(*ptep, entry)) {
>   		__set_pte_at(ptep, entry);
> -	/*
> -	 * update_mmu_cache will unconditionally execute, handling both
> -	 * the case that the PTE changed and the spurious fault case.
> -	 */
> -	return true;
> +		/* Here only not svadu is impacted */
> +		flush_tlb_page(vma, address);
> +		return true;
> +	}
> +
> +	return false;
> +}
> +
> +extern u64 nr_sfence_vma_handle_exception;
> +extern bool tlb_caching_invalid_entries;
> +
> +#define flush_tlb_fix_spurious_read_fault flush_tlb_fix_spurious_read_fault
> +static inline void flush_tlb_fix_spurious_read_fault(struct vm_area_struct *vma,
> +						     unsigned long address,
> +						     pte_t *ptep)
> +{
> +	if (tlb_caching_invalid_entries)
> +		flush_tlb_page(vma, address);
>   }
>   
>   #define __HAVE_ARCH_PTEP_GET_AND_CLEAR
> diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
> index af7639c3b0a3..7abaf42ef612 100644
> --- a/include/linux/pgtable.h
> +++ b/include/linux/pgtable.h
> @@ -931,8 +931,12 @@ static inline void arch_swap_restore(swp_entry_t entry, struct folio *folio)
>   # define pte_accessible(mm, pte)	((void)(pte), 1)
>   #endif
>   
> -#ifndef flush_tlb_fix_spurious_fault
> -#define flush_tlb_fix_spurious_fault(vma, address, ptep) flush_tlb_page(vma, address)
> +#ifndef flush_tlb_fix_spurious_write_fault
> +#define flush_tlb_fix_spurious_write_fault(vma, address, ptep) flush_tlb_page(vma, address)
> +#endif
> +
> +#ifndef flush_tlb_fix_spurious_read_fault
> +#define flush_tlb_fix_spurious_read_fault(vma, address, ptep)
>   #endif
>   
>   /*
> diff --git a/mm/memory.c b/mm/memory.c
> index 517221f01303..5cb0ccf0c03f 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -5014,8 +5014,16 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
>   		 * with threads.
>   		 */
>   		if (vmf->flags & FAULT_FLAG_WRITE)
> -			flush_tlb_fix_spurious_fault(vmf->vma, vmf->address,
> -						     vmf->pte);
> +			flush_tlb_fix_spurious_write_fault(vmf->vma, vmf->address,
> +							   vmf->pte);
> +		else
> +			/*
> +			 * With the pte_same(ptep_get(vmf->pte), entry) check
> +			 * that calls update_mmu_tlb() above, multiple threads
> +			 * faulting at the same time won't get there.
> +			 */
> +			flush_tlb_fix_spurious_read_fault(vmf->vma, vmf->address,
> +							  vmf->pte);
>   	}
>   unlock:
>   	pte_unmap_unlock(vmf->pte, vmf->ptl);


More information about the linux-riscv mailing list