[PATCH RESEND v2 1/9] riscv: Restore the pfn in a NAPOT pte when manipulated by core mm code

Alexandre Ghiti alexghiti at rivosinc.com
Mon May 13 06:06:37 PDT 2024


Hi Ryan,

On Fri, May 10, 2024 at 2:20 PM Ryan Roberts <ryan.roberts at arm.com> wrote:
>
> On 08/05/2024 12:34, Alexandre Ghiti wrote:
> > The core mm code expects to be able to extract the pfn from a pte. NAPOT
> > mappings work differently since its ptes actually point to the first pfn
> > of the mapping, the other bits being used to encode the size of the
> > mapping.
> >
> > So modify ptep_get() so that it returns a pte value that contains the
> > *real* pfn (which is then different from what the HW expects) and right
> > before storing the ptes to the page table, reset the pfn LSBs to the
> > size of the mapping.
>
> Did you consider leaving the pte as is and instead modifying your pte_pfn()
> implementation?
>
> For arm64 at least, it is beneficial to keep the pte marked as contiguous when
> passing it up to core-mm because there are other helpers which need to parse the
> contiguous bit (e.g. pte_leaf_size()). If we were to clear the cont bit in
> ptep_get() that info would be lost and perf_get_pgtable_size() would always
> conclude the leaf size is 4K even when it is actually 64K.

I don't clear the contpte bit here (i.e. the napot bit), I'm just
setting the right pfn so that the core-mm code knows exactly which
page is targeted by each pte of a contpte region (remember that the
riscv napot extension uses the LSBs of the pte pfn to encode the size
of the mapping, so without this fixup all ptes of a contpte region
would return the same pfn).

And from pte_pfn(), we have no way of knowing from the pte value alone
which page is targeted: we need to know the pte's position in the page
table to "guess" the right pfn.

>
> >
> > And make sure that all NAPOT mappings are set using set_ptes().
> >
> > Signed-off-by: Alexandre Ghiti <alexghiti at rivosinc.com>
> > ---
> >  arch/riscv/include/asm/pgtable-64.h |  11 +++
> >  arch/riscv/include/asm/pgtable.h    | 105 ++++++++++++++++++++++++++--
> >  arch/riscv/mm/hugetlbpage.c         |  38 +++++-----
> >  3 files changed, 128 insertions(+), 26 deletions(-)
> >
> > diff --git a/arch/riscv/include/asm/pgtable-64.h b/arch/riscv/include/asm/pgtable-64.h
> > index 221a5c1ee287..9fe076fc503e 100644
> > --- a/arch/riscv/include/asm/pgtable-64.h
> > +++ b/arch/riscv/include/asm/pgtable-64.h
> > @@ -106,6 +106,17 @@ enum napot_cont_order {
> >  #define napot_cont_mask(order)       (~(napot_cont_size(order) - 1UL))
> >  #define napot_pte_num(order) BIT(order)
> >
> > +static inline bool is_napot_order(unsigned int order)
> > +{
> > +     unsigned int napot_order;
> > +
> > +     for_each_napot_order(napot_order)
> > +             if (order == napot_order)
> > +                     return true;
> > +
> > +     return false;
> > +}
> > +
> >  #ifdef CONFIG_RISCV_ISA_SVNAPOT
> >  #define HUGE_MAX_HSTATE              (2 + (NAPOT_ORDER_MAX - NAPOT_CONT_ORDER_BASE))
> >  #else
> > diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
> > index 9f8ea0e33eb1..268c828f5152 100644
> > --- a/arch/riscv/include/asm/pgtable.h
> > +++ b/arch/riscv/include/asm/pgtable.h
> > @@ -297,6 +297,8 @@ static inline unsigned long pte_napot(pte_t pte)
> >       return pte_val(pte) & _PAGE_NAPOT;
> >  }
> >
> > +#define pte_valid_napot(pte) (pte_present(pte) && pte_napot(pte))
> > +
> >  static inline pte_t pte_mknapot(pte_t pte, unsigned int order)
> >  {
> >       int pos = order - 1 + _PAGE_PFN_SHIFT;
> > @@ -306,6 +308,12 @@ static inline pte_t pte_mknapot(pte_t pte, unsigned int order)
> >       return __pte((pte_val(pte) & napot_mask) | napot_bit | _PAGE_NAPOT);
> >  }
> >
> > +/* pte at entry must *not* encode the mapping size in the pfn LSBs. */
> > +static inline pte_t pte_clear_napot(pte_t pte)
> > +{
> > +     return __pte(pte_val(pte) & ~_PAGE_NAPOT);
> > +}
> > +
> >  #else
> >
> >  static __always_inline bool has_svnapot(void) { return false; }
> > @@ -315,17 +323,14 @@ static inline unsigned long pte_napot(pte_t pte)
> >       return 0;
> >  }
> >
> > +#define pte_valid_napot(pte) false
> > +
> >  #endif /* CONFIG_RISCV_ISA_SVNAPOT */
> >
> >  /* Yields the page frame number (PFN) of a page table entry */
> >  static inline unsigned long pte_pfn(pte_t pte)
> >  {
> > -     unsigned long res  = __page_val_to_pfn(pte_val(pte));
> > -
> > -     if (has_svnapot() && pte_napot(pte))
> > -             res = res & (res - 1UL);
> > -
> > -     return res;
> > +     return __page_val_to_pfn(pte_val(pte));
> >  }
> >
> >  #define pte_page(x)     pfn_to_page(pte_pfn(x))
> > @@ -525,9 +530,91 @@ static inline void __set_pte_at(struct mm_struct *mm, pte_t *ptep, pte_t pteval)
> >
> >  #define PFN_PTE_SHIFT                _PAGE_PFN_SHIFT
> >
> > +#ifdef CONFIG_RISCV_ISA_SVNAPOT
> > +static inline int arch_contpte_get_num_contig(pte_t *ptep, unsigned long size,
> > +                                           size_t *pgsize)
> > +{
> > +     pte_t __pte;
> > +
> > +     /* We must read the raw value of the pte to get the size of the mapping */
> > +     __pte = READ_ONCE(*ptep);
> > +
> > +     if (pgsize) {
> > +             if (size >= PGDIR_SIZE)
> > +                     *pgsize = PGDIR_SIZE;
> > +             else if (size >= P4D_SIZE)
> > +                     *pgsize = P4D_SIZE;
> > +             else if (size >= PUD_SIZE)
> > +                     *pgsize = PUD_SIZE;
> > +             else if (size >= PMD_SIZE)
> > +                     *pgsize = PMD_SIZE;
> > +             else
> > +                     *pgsize = PAGE_SIZE;
> > +     }
> > +
> > +     /* Make sure __pte is not a swap entry */
> > +     if (pte_valid_napot(__pte))
> > +             return napot_pte_num(napot_cont_order(__pte));
> > +
> > +     return 1;
> > +}
> > +#endif
> > +
> > +static inline pte_t ptep_get(pte_t *ptep)
> > +{
> > +     pte_t pte = READ_ONCE(*ptep);
> > +
> > +#ifdef CONFIG_RISCV_ISA_SVNAPOT
> > +     /*
> > +      * The pte we load has the N bit set and the size of the mapping in
> > +      * the pfn LSBs: keep the N bit and replace the mapping size with
> > +      * the *real* pfn since the core mm code expects to find it there.
> > +      * The mapping size will be reset just before being written to the
> > +      * page table in set_ptes().
> > +      */
> > +     if (unlikely(pte_valid_napot(pte))) {
> > +             unsigned int order = napot_cont_order(pte);
> > +             int pos = order - 1 + _PAGE_PFN_SHIFT;
> > +             unsigned long napot_mask = ~GENMASK(pos, _PAGE_PFN_SHIFT);
> > +             pte_t *orig_ptep = PTR_ALIGN_DOWN(ptep, sizeof(*ptep) * napot_pte_num(order));
> > +
> > +             pte = __pte((pte_val(pte) & napot_mask) + ((ptep - orig_ptep) << _PAGE_PFN_SHIFT));
> > +     }
> > +#endif
> > +
> > +     return pte;
> > +}
> > +#define ptep_get     ptep_get
> > +
> >  static inline void set_ptes(struct mm_struct *mm, unsigned long addr,
> >               pte_t *ptep, pte_t pteval, unsigned int nr)
> >  {
> > +#ifdef CONFIG_RISCV_ISA_SVNAPOT
> > +     if (unlikely(pte_valid_napot(pteval))) {
> > +             unsigned int order = ilog2(nr);
> > +
> > +             if (!is_napot_order(order)) {
> > +                     /*
> > +                      * Something's weird, we are given a NAPOT pte but the
> > +                      * size of the mapping is not a known NAPOT mapping
> > +                      * size, so clear the NAPOT bit and map this without
> > +                      * NAPOT support: core mm only manipulates pte with the
> > +                      * real pfn so we know the pte is valid without the N
> > +                      * bit.
> > +                      */
> > +                     pr_err("Incorrect NAPOT mapping, resetting.\n");
> > +                     pteval = pte_clear_napot(pteval);
> > +             } else {
> > +                     /*
> > +                      * NAPOT ptes that arrive here only have the N bit set
> > +                      * and their pfn does not contain the mapping size, so
> > +                      * set that here.
> > +                      */
> > +                     pteval = pte_mknapot(pteval, order);
> > +             }
> > +     }
> > +#endif
>
> I think all this complexity comes along due to using this function both as a
> public interface that the core-mm uses (which never sets napot) and also using
> it as an internal interface that riscv-hugetlb uses (which does set napot)? It
> might be more understandable if you layer it into a lower level/internal API and
> a higher level/public API (similar to arm64)?

I think you're right here, I'll try to do that too.

Thanks for your comments,

Alex

>
> > +
> >       page_table_check_ptes_set(mm, ptep, pteval, nr);
> >
> >       for (;;) {
> > @@ -535,6 +622,12 @@ static inline void set_ptes(struct mm_struct *mm, unsigned long addr,
> >               if (--nr == 0)
> >                       break;
> >               ptep++;
> > +
> > +#ifdef CONFIG_RISCV_ISA_SVNAPOT
> > +             if (unlikely(pte_valid_napot(pteval)))
> > +                     continue;
> > +#endif
> > +
> >               pte_val(pteval) += 1 << _PAGE_PFN_SHIFT;
> >       }
> >  }
> > diff --git a/arch/riscv/mm/hugetlbpage.c b/arch/riscv/mm/hugetlbpage.c
> > index 5ef2a6891158..fe8067ee71b4 100644
> > --- a/arch/riscv/mm/hugetlbpage.c
> > +++ b/arch/riscv/mm/hugetlbpage.c
> > @@ -256,8 +256,7 @@ void set_huge_pte_at(struct mm_struct *mm,
> >
> >       clear_flush(mm, addr, ptep, pgsize, pte_num);
> >
> > -     for (i = 0; i < pte_num; i++, ptep++, addr += pgsize)
> > -             set_pte_at(mm, addr, ptep, pte);
> > +     set_ptes(mm, addr, ptep, pte, pte_num);
> >  }
> >
> >  int huge_ptep_set_access_flags(struct vm_area_struct *vma,
> > @@ -267,16 +266,16 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma,
> >                              int dirty)
> >  {
> >       struct mm_struct *mm = vma->vm_mm;
> > -     unsigned long order;
> > +     size_t pgsize;
> >       pte_t orig_pte;
> > -     int i, pte_num;
> > +     int pte_num;
> >
> >       if (!pte_napot(pte))
> >               return ptep_set_access_flags(vma, addr, ptep, pte, dirty);
> >
> > -     order = napot_cont_order(pte);
> > -     pte_num = napot_pte_num(order);
> > -     ptep = huge_pte_offset(mm, addr, napot_cont_size(order));
> > +     pte_num = arch_contpte_get_num_contig(ptep, 0, &pgsize);
> > +     ptep = huge_pte_offset(mm, addr, pte_num * pgsize);
> > +
> >       orig_pte = get_clear_contig_flush(mm, addr, ptep, pte_num);
> >
> >       if (pte_dirty(orig_pte))
> > @@ -285,8 +284,7 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma,
> >       if (pte_young(orig_pte))
> >               pte = pte_mkyoung(pte);
> >
> > -     for (i = 0; i < pte_num; i++, addr += PAGE_SIZE, ptep++)
> > -             set_pte_at(mm, addr, ptep, pte);
> > +     set_ptes(mm, addr, ptep, pte, pte_num);
> >
> >       return true;
> >  }
> > @@ -301,7 +299,7 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
> >       if (!pte_napot(orig_pte))
> >               return ptep_get_and_clear(mm, addr, ptep);
> >
> > -     pte_num = napot_pte_num(napot_cont_order(orig_pte));
> > +     pte_num = arch_contpte_get_num_contig(ptep, 0, NULL);
> >
> >       return get_clear_contig(mm, addr, ptep, pte_num);
> >  }
> > @@ -311,24 +309,23 @@ void huge_ptep_set_wrprotect(struct mm_struct *mm,
> >                            pte_t *ptep)
> >  {
> >       pte_t pte = ptep_get(ptep);
> > -     unsigned long order;
> > +     size_t pgsize;
> >       pte_t orig_pte;
> > -     int i, pte_num;
> > +     int pte_num;
> >
> >       if (!pte_napot(pte)) {
> >               ptep_set_wrprotect(mm, addr, ptep);
> >               return;
> >       }
> >
> > -     order = napot_cont_order(pte);
> > -     pte_num = napot_pte_num(order);
> > -     ptep = huge_pte_offset(mm, addr, napot_cont_size(order));
> > +     pte_num = arch_contpte_get_num_contig(ptep, 0, &pgsize);
> > +     ptep = huge_pte_offset(mm, addr, pte_num * pgsize);
> > +
> >       orig_pte = get_clear_contig_flush(mm, addr, ptep, pte_num);
> >
> >       orig_pte = pte_wrprotect(orig_pte);
> >
> > -     for (i = 0; i < pte_num; i++, addr += PAGE_SIZE, ptep++)
> > -             set_pte_at(mm, addr, ptep, orig_pte);
> > +     set_ptes(mm, addr, ptep, orig_pte, pte_num);
> >  }
> >
> >  pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
> > @@ -341,7 +338,7 @@ pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
> >       if (!pte_napot(pte))
> >               return ptep_clear_flush(vma, addr, ptep);
> >
> > -     pte_num = napot_pte_num(napot_cont_order(pte));
> > +     pte_num = arch_contpte_get_num_contig(ptep, 0, NULL);
> >
> >       return get_clear_contig_flush(vma->vm_mm, addr, ptep, pte_num);
> >  }
> > @@ -351,6 +348,7 @@ void huge_pte_clear(struct mm_struct *mm,
> >                   pte_t *ptep,
> >                   unsigned long sz)
> >  {
> > +     size_t pgsize;
> >       pte_t pte = ptep_get(ptep);
> >       int i, pte_num;
> >
> > @@ -359,8 +357,8 @@ void huge_pte_clear(struct mm_struct *mm,
> >               return;
> >       }
> >
> > -     pte_num = napot_pte_num(napot_cont_order(pte));
> > -     for (i = 0; i < pte_num; i++, addr += PAGE_SIZE, ptep++)
> > +     pte_num = arch_contpte_get_num_contig(ptep, 0, &pgsize);
> > +     for (i = 0; i < pte_num; i++, addr += pgsize, ptep++)
> >               pte_clear(mm, addr, ptep);
> >  }
> >
>



More information about the linux-arm-kernel mailing list