[PATCH RESEND v2 1/9] riscv: Restore the pfn in a NAPOT pte when manipulated by core mm code

Ryan Roberts ryan.roberts at arm.com
Fri May 10 05:20:47 PDT 2024


On 08/05/2024 12:34, Alexandre Ghiti wrote:
> The core mm code expects to be able to extract the pfn from a pte. NAPOT
> mappings work differently since its ptes actually point to the first pfn
> of the mapping, the other bits being used to encode the size of the
> mapping.
> 
> So modify ptep_get() so that it returns a pte value that contains the
> *real* pfn (which is then different from what the HW expects) and right
> before storing the ptes to the page table, reset the pfn LSBs to the
> size of the mapping.

Did you consider leaving the pte as is and instead modifying your pte_pfn()
implementation?

For arm64 at least, it is beneficial to keep the pte marked as contiguous when
passing it up to core-mm because there are other helpers which need to parse the
contiguous bit (e.g. pte_leaf_size()). If we were to clear the cont bit in
ptep_get() that info would be lost and perf_get_pgtable_size() would always
conclude the leaf size is 4K even when it is actually 64K.

> 
> And make sure that all NAPOT mappings are set using set_ptes().
> 
> Signed-off-by: Alexandre Ghiti <alexghiti at rivosinc.com>
> ---
>  arch/riscv/include/asm/pgtable-64.h |  11 +++
>  arch/riscv/include/asm/pgtable.h    | 105 ++++++++++++++++++++++++++--
>  arch/riscv/mm/hugetlbpage.c         |  38 +++++-----
>  3 files changed, 128 insertions(+), 26 deletions(-)
> 
> diff --git a/arch/riscv/include/asm/pgtable-64.h b/arch/riscv/include/asm/pgtable-64.h
> index 221a5c1ee287..9fe076fc503e 100644
> --- a/arch/riscv/include/asm/pgtable-64.h
> +++ b/arch/riscv/include/asm/pgtable-64.h
> @@ -106,6 +106,17 @@ enum napot_cont_order {
>  #define napot_cont_mask(order)	(~(napot_cont_size(order) - 1UL))
>  #define napot_pte_num(order)	BIT(order)
>  
> +static inline bool is_napot_order(unsigned int order)
> +{
> +	unsigned int napot_order;
> +
> +	for_each_napot_order(napot_order)
> +		if (order == napot_order)
> +			return true;
> +
> +	return false;
> +}
> +
>  #ifdef CONFIG_RISCV_ISA_SVNAPOT
>  #define HUGE_MAX_HSTATE		(2 + (NAPOT_ORDER_MAX - NAPOT_CONT_ORDER_BASE))
>  #else
> diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
> index 9f8ea0e33eb1..268c828f5152 100644
> --- a/arch/riscv/include/asm/pgtable.h
> +++ b/arch/riscv/include/asm/pgtable.h
> @@ -297,6 +297,8 @@ static inline unsigned long pte_napot(pte_t pte)
>  	return pte_val(pte) & _PAGE_NAPOT;
>  }
>  
> +#define pte_valid_napot(pte)	(pte_present(pte) && pte_napot(pte))
> +
>  static inline pte_t pte_mknapot(pte_t pte, unsigned int order)
>  {
>  	int pos = order - 1 + _PAGE_PFN_SHIFT;
> @@ -306,6 +308,12 @@ static inline pte_t pte_mknapot(pte_t pte, unsigned int order)
>  	return __pte((pte_val(pte) & napot_mask) | napot_bit | _PAGE_NAPOT);
>  }
>  
> +/* pte at entry must *not* encode the mapping size in the pfn LSBs. */
> +static inline pte_t pte_clear_napot(pte_t pte)
> +{
> +	return __pte(pte_val(pte) & ~_PAGE_NAPOT);
> +}
> +
>  #else
>  
>  static __always_inline bool has_svnapot(void) { return false; }
> @@ -315,17 +323,14 @@ static inline unsigned long pte_napot(pte_t pte)
>  	return 0;
>  }
>  
> +#define pte_valid_napot(pte)	false
> +
>  #endif /* CONFIG_RISCV_ISA_SVNAPOT */
>  
>  /* Yields the page frame number (PFN) of a page table entry */
>  static inline unsigned long pte_pfn(pte_t pte)
>  {
> -	unsigned long res  = __page_val_to_pfn(pte_val(pte));
> -
> -	if (has_svnapot() && pte_napot(pte))
> -		res = res & (res - 1UL);
> -
> -	return res;
> +	return __page_val_to_pfn(pte_val(pte));
>  }
>  
>  #define pte_page(x)     pfn_to_page(pte_pfn(x))
> @@ -525,9 +530,91 @@ static inline void __set_pte_at(struct mm_struct *mm, pte_t *ptep, pte_t pteval)
>  
>  #define PFN_PTE_SHIFT		_PAGE_PFN_SHIFT
>  
> +#ifdef CONFIG_RISCV_ISA_SVNAPOT
> +static inline int arch_contpte_get_num_contig(pte_t *ptep, unsigned long size,
> +					      size_t *pgsize)
> +{
> +	pte_t __pte;
> +
> +	/* We must read the raw value of the pte to get the size of the mapping */
> +	__pte = READ_ONCE(*ptep);
> +
> +	if (pgsize) {
> +		if (size >= PGDIR_SIZE)
> +			*pgsize = PGDIR_SIZE;
> +		else if (size >= P4D_SIZE)
> +			*pgsize = P4D_SIZE;
> +		else if (size >= PUD_SIZE)
> +			*pgsize = PUD_SIZE;
> +		else if (size >= PMD_SIZE)
> +			*pgsize = PMD_SIZE;
> +		else
> +			*pgsize = PAGE_SIZE;
> +	}
> +
> +	/* Make sure __pte is not a swap entry */
> +	if (pte_valid_napot(__pte))
> +		return napot_pte_num(napot_cont_order(__pte));
> +
> +	return 1;
> +}
> +#endif
> +
> +static inline pte_t ptep_get(pte_t *ptep)
> +{
> +	pte_t pte = READ_ONCE(*ptep);
> +
> +#ifdef CONFIG_RISCV_ISA_SVNAPOT
> +	/*
> +	 * The pte we load has the N bit set and the size of the mapping in
> +	 * the pfn LSBs: keep the N bit and replace the mapping size with
> +	 * the *real* pfn since the core mm code expects to find it there.
> +	 * The mapping size will be reset just before being written to the
> +	 * page table in set_ptes().
> +	 */
> +	if (unlikely(pte_valid_napot(pte))) {
> +		unsigned int order = napot_cont_order(pte);
> +		int pos = order - 1 + _PAGE_PFN_SHIFT;
> +		unsigned long napot_mask = ~GENMASK(pos, _PAGE_PFN_SHIFT);
> +		pte_t *orig_ptep = PTR_ALIGN_DOWN(ptep, sizeof(*ptep) * napot_pte_num(order));
> +
> +		pte = __pte((pte_val(pte) & napot_mask) + ((ptep - orig_ptep) << _PAGE_PFN_SHIFT));
> +	}
> +#endif
> +
> +	return pte;
> +}
> +#define ptep_get	ptep_get
> +
>  static inline void set_ptes(struct mm_struct *mm, unsigned long addr,
>  		pte_t *ptep, pte_t pteval, unsigned int nr)
>  {
> +#ifdef CONFIG_RISCV_ISA_SVNAPOT
> +	if (unlikely(pte_valid_napot(pteval))) {
> +		unsigned int order = ilog2(nr);
> +
> +		if (!is_napot_order(order)) {
> +			/*
> +			 * Something's weird, we are given a NAPOT pte but the
> +			 * size of the mapping is not a known NAPOT mapping
> +			 * size, so clear the NAPOT bit and map this without
> +			 * NAPOT support: core mm only manipulates pte with the
> +			 * real pfn so we know the pte is valid without the N
> +			 * bit.
> +			 */
> +			pr_err("Incorrect NAPOT mapping, resetting.\n");
> +			pteval = pte_clear_napot(pteval);
> +		} else {
> +			/*
> +			 * NAPOT ptes that arrive here only have the N bit set
> +			 * and their pfn does not contain the mapping size, so
> +			 * set that here.
> +			 */
> +			pteval = pte_mknapot(pteval, order);
> +		}
> +	}
> +#endif

I think all this complexity comes along due to using this function both as a
public interface that the core-mm uses (which never sets napot) and also using
it as an internal interface that riscv-hugetlb uses (which does set napot)? It
might be more understandable if you layer it into a lower level/internal API and
a higher level/public API (similar to arm64)?

> +
>  	page_table_check_ptes_set(mm, ptep, pteval, nr);
>  
>  	for (;;) {
> @@ -535,6 +622,12 @@ static inline void set_ptes(struct mm_struct *mm, unsigned long addr,
>  		if (--nr == 0)
>  			break;
>  		ptep++;
> +
> +#ifdef CONFIG_RISCV_ISA_SVNAPOT
> +		if (unlikely(pte_valid_napot(pteval)))
> +			continue;
> +#endif
> +
>  		pte_val(pteval) += 1 << _PAGE_PFN_SHIFT;
>  	}
>  }
> diff --git a/arch/riscv/mm/hugetlbpage.c b/arch/riscv/mm/hugetlbpage.c
> index 5ef2a6891158..fe8067ee71b4 100644
> --- a/arch/riscv/mm/hugetlbpage.c
> +++ b/arch/riscv/mm/hugetlbpage.c
> @@ -256,8 +256,7 @@ void set_huge_pte_at(struct mm_struct *mm,
>  
>  	clear_flush(mm, addr, ptep, pgsize, pte_num);
>  
> -	for (i = 0; i < pte_num; i++, ptep++, addr += pgsize)
> -		set_pte_at(mm, addr, ptep, pte);
> +	set_ptes(mm, addr, ptep, pte, pte_num);
>  }
>  
>  int huge_ptep_set_access_flags(struct vm_area_struct *vma,
> @@ -267,16 +266,16 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma,
>  			       int dirty)
>  {
>  	struct mm_struct *mm = vma->vm_mm;
> -	unsigned long order;
> +	size_t pgsize;
>  	pte_t orig_pte;
> -	int i, pte_num;
> +	int pte_num;
>  
>  	if (!pte_napot(pte))
>  		return ptep_set_access_flags(vma, addr, ptep, pte, dirty);
>  
> -	order = napot_cont_order(pte);
> -	pte_num = napot_pte_num(order);
> -	ptep = huge_pte_offset(mm, addr, napot_cont_size(order));
> +	pte_num = arch_contpte_get_num_contig(ptep, 0, &pgsize);
> +	ptep = huge_pte_offset(mm, addr, pte_num * pgsize);
> +
>  	orig_pte = get_clear_contig_flush(mm, addr, ptep, pte_num);
>  
>  	if (pte_dirty(orig_pte))
> @@ -285,8 +284,7 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma,
>  	if (pte_young(orig_pte))
>  		pte = pte_mkyoung(pte);
>  
> -	for (i = 0; i < pte_num; i++, addr += PAGE_SIZE, ptep++)
> -		set_pte_at(mm, addr, ptep, pte);
> +	set_ptes(mm, addr, ptep, pte, pte_num);
>  
>  	return true;
>  }
> @@ -301,7 +299,7 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
>  	if (!pte_napot(orig_pte))
>  		return ptep_get_and_clear(mm, addr, ptep);
>  
> -	pte_num = napot_pte_num(napot_cont_order(orig_pte));
> +	pte_num = arch_contpte_get_num_contig(ptep, 0, NULL);
>  
>  	return get_clear_contig(mm, addr, ptep, pte_num);
>  }
> @@ -311,24 +309,23 @@ void huge_ptep_set_wrprotect(struct mm_struct *mm,
>  			     pte_t *ptep)
>  {
>  	pte_t pte = ptep_get(ptep);
> -	unsigned long order;
> +	size_t pgsize;
>  	pte_t orig_pte;
> -	int i, pte_num;
> +	int pte_num;
>  
>  	if (!pte_napot(pte)) {
>  		ptep_set_wrprotect(mm, addr, ptep);
>  		return;
>  	}
>  
> -	order = napot_cont_order(pte);
> -	pte_num = napot_pte_num(order);
> -	ptep = huge_pte_offset(mm, addr, napot_cont_size(order));
> +	pte_num = arch_contpte_get_num_contig(ptep, 0, &pgsize);
> +	ptep = huge_pte_offset(mm, addr, pte_num * pgsize);
> +
>  	orig_pte = get_clear_contig_flush(mm, addr, ptep, pte_num);
>  
>  	orig_pte = pte_wrprotect(orig_pte);
>  
> -	for (i = 0; i < pte_num; i++, addr += PAGE_SIZE, ptep++)
> -		set_pte_at(mm, addr, ptep, orig_pte);
> +	set_ptes(mm, addr, ptep, orig_pte, pte_num);
>  }
>  
>  pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
> @@ -341,7 +338,7 @@ pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
>  	if (!pte_napot(pte))
>  		return ptep_clear_flush(vma, addr, ptep);
>  
> -	pte_num = napot_pte_num(napot_cont_order(pte));
> +	pte_num = arch_contpte_get_num_contig(ptep, 0, NULL);
>  
>  	return get_clear_contig_flush(vma->vm_mm, addr, ptep, pte_num);
>  }
> @@ -351,6 +348,7 @@ void huge_pte_clear(struct mm_struct *mm,
>  		    pte_t *ptep,
>  		    unsigned long sz)
>  {
> +	size_t pgsize;
>  	pte_t pte = ptep_get(ptep);
>  	int i, pte_num;
>  
> @@ -359,8 +357,8 @@ void huge_pte_clear(struct mm_struct *mm,
>  		return;
>  	}
>  
> -	pte_num = napot_pte_num(napot_cont_order(pte));
> -	for (i = 0; i < pte_num; i++, addr += PAGE_SIZE, ptep++)
> +	pte_num = arch_contpte_get_num_contig(ptep, 0, &pgsize);
> +	for (i = 0; i < pte_num; i++, addr += pgsize, ptep++)
>  		pte_clear(mm, addr, ptep);
>  }
>  




More information about the linux-riscv mailing list