[RFC PATCH 4/6] ARM: mm: HugeTLB support for non-LPAE systems.

Tue Jan 8 12:58:44 EST 2013

On Fri, Jan 04, 2013 at 05:04:43AM +0000, Christoffer Dall wrote:
> On Thu, Oct 18, 2012 at 12:15 PM, Steve Capper <steve.capper at arm.com> wrote:

> > diff --git a/arch/arm/include/asm/hugetlb-2level.h b/arch/arm/include/asm/hugetlb-2level.h
> > new file mode 100644
> > index 0000000..3532b54
> > --- /dev/null
> > +++ b/arch/arm/include/asm/hugetlb-2level.h
> > @@ -0,0 +1,71 @@
> > +/*
> > + * arch/arm/include/asm/hugetlb-2level.h
> > + *
> > + * Copyright (C) 2012 ARM Ltd.
> > + *
> > + * Based on arch/x86/include/asm/hugetlb.h and Bill Carson's patches
> > + *
> > + * This program is free software; you can redistribute it and/or modify
> > + * it under the terms of the GNU General Public License version 2 as
> > + * published by the Free Software Foundation.
> > + *
> > + * This program is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> > + * GNU General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU General Public License
> > + * along with this program; if not, write to the Free Software
> > + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
> > + */
> > +
> > +#ifndef _ASM_ARM_HUGETLB_2LEVEL_H
> > +#define _ASM_ARM_HUGETLB_2LEVEL_H
> > +
> > +
> > +pte_t huge_ptep_get(pte_t *ptep);
> > +
> > +void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
> > +                                  pte_t *ptep, pte_t pte);
> > +
> > +static inline pte_t pte_mkhuge(pte_t pte) { return pte; }
> > +
> > +static inline void huge_ptep_clear_flush(struct vm_area_struct *vma,
> > +                                        unsigned long addr, pte_t *ptep)
> > +{
> > +       flush_tlb_range(vma, addr, addr + HPAGE_SIZE);
>
> don't you need to clear the old TLB entry first here, otherwise
> another CPU could put an entry to the old page in its TLB and access
> it even after the page_cache_release(old_page) in hugetlb_cow() ?
>

Yes I do, thanks.

> > +}
> > +
> > +static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
> > +                                          unsigned long addr, pte_t *ptep)
> > +{
> > +       pmd_t *pmdp = (pmd_t *) ptep;
> > +       set_pmd_at(mm, addr, pmdp, pmd_wrprotect(*pmdp));
> > +}
> > +
> > +
> > +static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
> > +                                           unsigned long addr, pte_t *ptep)
> > +{
> > +       pmd_t *pmdp = (pmd_t *)ptep;
> > +       pte_t pte = huge_ptep_get(ptep);
> > +       pmd_clear(pmdp);
> > +
> > +       return pte;
> > +}
> > +
> > +static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma,
> > +                                            unsigned long addr, pte_t *ptep,
> > +                                            pte_t pte, int dirty)
> > +{
> > +       int changed = !pte_same(huge_ptep_get(ptep), pte);
> > +
> > +       if (changed) {
> > +               set_huge_pte_at(vma->vm_mm, addr, ptep, pte);
> > +               huge_ptep_clear_flush(vma, addr, &pte);
> > +       }
> > +
> > +       return changed;
> > +}
> > +
> > +#endif /* _ASM_ARM_HUGETLB_2LEVEL_H */
> > diff --git a/arch/arm/include/asm/hugetlb.h b/arch/arm/include/asm/hugetlb.h
> > index 7af9cf6..1e92975 100644
> > --- a/arch/arm/include/asm/hugetlb.h
> > +++ b/arch/arm/include/asm/hugetlb.h
> > @@ -24,7 +24,11 @@
> >
> >  #include <asm/page.h>
> >
> > +#ifdef CONFIG_ARM_LPAE
> >  #include <asm/hugetlb-3level.h>
> > +#else
> > +#include <asm/hugetlb-2level.h>
> > +#endif
> >
> >  static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb,
> >                                           unsigned long addr, unsigned long end,
> > diff --git a/arch/arm/include/asm/pgtable-2level.h b/arch/arm/include/asm/pgtable-2level.h
> > index 662a00e..fd1d9be 100644
> > --- a/arch/arm/include/asm/pgtable-2level.h
> > +++ b/arch/arm/include/asm/pgtable-2level.h
> > @@ -163,7 +163,7 @@ static inline pmd_t *pmd_offset(pud_t *pud, unsigned long addr)
> >         return (pmd_t *)pud;
> >  }
> >
> > -#define pmd_bad(pmd)           (pmd_val(pmd) & 2)
> > +#define pmd_bad(pmd)           ((pmd_val(pmd) & PMD_TYPE_MASK) == PMD_TYPE_FAULT)
>
> this changes the semantics of the macro - is that on purpose and safe?
>
> (fault entries didn't used to be bad, now they are...)
>

Yes, thanks, the semantics should be retained (they are for LPAE).

> >
> >  #define copy_pmd(pmdpd,pmdps)          \
> >         do {                            \
> > @@ -184,6 +184,83 @@ static inline pmd_t *pmd_offset(pud_t *pud, unsigned long addr)
> >
> >  #define set_pte_ext(ptep,pte,ext) cpu_set_pte_ext(ptep,pte,ext)
> >
> > +
> > +#ifdef CONFIG_SYS_SUPPORTS_HUGETLBFS
> > +
> > +/*
> > + * now follows some of the definitions to allow huge page support, we can't put
> > + * these in the hugetlb source files as they are also required for transparent
> > + * hugepage support.
> > + */
> > +
> > +#define HPAGE_SHIFT             PMD_SHIFT
> > +#define HPAGE_SIZE              (_AC(1, UL) << HPAGE_SHIFT)
> > +#define HPAGE_MASK              (~(HPAGE_SIZE - 1))
> > +#define HUGETLB_PAGE_ORDER      (HPAGE_SHIFT - PAGE_SHIFT)
> > +
> > +#define HUGE_LINUX_PTE_COUNT       (PAGE_OFFSET >> HPAGE_SHIFT)
> > +#define HUGE_LINUX_PTE_SIZE        (HUGE_LINUX_PTE_COUNT * sizeof(pte_t *))
> > +#define HUGE_LINUX_PTE_INDEX(addr) (addr >> HPAGE_SHIFT)
> > +
> > +/*
> > + *  We re-purpose the following domain bits in the section descriptor
> > + */
> > +#define PMD_DSECT_DIRTY                (_AT(pmdval_t, 1) << 5)
> > +#define PMD_DSECT_AF           (_AT(pmdval_t, 1) << 6)
> > +
> > +#define PMD_BIT_FUNC(fn,op) \
> > +static inline pmd_t pmd_##fn(pmd_t pmd) { pmd_val(pmd) op; return pmd; }
> > +
> > +PMD_BIT_FUNC(wrprotect,        &= ~PMD_SECT_AP_WRITE);
> > +
> > +static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
> > +                               pmd_t *pmdp, pmd_t pmd)
> > +{
> > +       /*
> > +        * we can sometimes be passed a pmd pointing to a level 2 descriptor
> > +        * from collapse_huge_page.
> > +        */
> > +       if ((pmd_val(pmd) & PMD_TYPE_MASK) == PMD_TYPE_TABLE) {
> > +               pmdp[0] = __pmd(pmd_val(pmd));
> > +               pmdp[1] = __pmd(pmd_val(pmd) + 256 * sizeof(pte_t));
>
> eh, if I get this right, this means that in the case where the pmd
> points to level 2 descriptor, all the pages are lined up to be a huge
> page, so just point to the next level 2 pte, which directly follows
> the next level 2 descriptor, because they share the same page. But
> then why do we need to set any values here?
>

This is a little weird.

The transparent huge page code will try sometimes to collapse a group of pages
into a huge page. As part of the collapse process, it will invalidate the pmd
before it copies the physical pages into a contiguous huge page. This ensures
that memory accesses to the area being collapsed fault loop whilst the collapse
takes place. Sometimes the collapse process will be aborted after the pmd has
been invalidated, so the original pmd (which points to a page table) needs to
be put back as part of the rollback.

With 2 levels of paging, the pmds are arranged in pairs so we put back a pair
of pmds.

> > +       } else {
> > +               pmdp[0] = __pmd(pmd_val(pmd));                  /* first 1M section  */
> > +               pmdp[1] = __pmd(pmd_val(pmd) + SECTION_SIZE);   /* second 1M section */
> > +       }
> > +
> > +       flush_pmd_entry(pmdp);
> > +}
> > +
> > +#define HPMD_XLATE(res, cmp, from, to) do { if (cmp & from) res |= to; \
> > +                                           else res &= ~to;            \
> > +                                         } while (0)
> > +
> > +static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
> > +{
> > +       pmdval_t pmdval = pmd_val(pmd);
> > +       pteval_t newprotval = pgprot_val(newprot);
> > +
> > +       HPMD_XLATE(pmdval, newprotval, L_PTE_XN, PMD_SECT_XN);
> > +       HPMD_XLATE(pmdval, newprotval, L_PTE_SHARED, PMD_SECT_S);
> > +       HPMD_XLATE(pmdval, newprotval, L_PTE_YOUNG, PMD_DSECT_AF);
>
> consider something akin to:
>
> #define L_PMD_DSECT_YOUNG (PMD_DSECT_AF)
>
> then you don't have to change several places if you decide to
> rearrange the mappings for whatever reason at it makes it slightly
> easier to read this code.
>

Yeah, something along those lines may look better. I'll have a tinker.

> > +       HPMD_XLATE(pmdval, newprotval, L_PTE_DIRTY, PMD_DSECT_DIRTY);
> > +
> > +       /* preserve bits C & B */
> > +       pmdval |= (newprotval & (3 << 2));
>
> this looks superfluous?
>
> > +
> > +       /* Linux PTE bit 4 corresponds to PMD TEX bit 0 */
> > +       HPMD_XLATE(pmdval, newprotval, 1 << 4, PMD_SECT_TEX(1));
>
> define L_PTE_TEX0 and group with the others above?
>

The mapping is not quite that simple. We have multiple memory types defined
in pgtable-{23}level.h and these have different meanings depending on the
target processor. For v6 and v7 the above works, but ideally, I should be able
to look up the memory type mapping. For instance in arch/arm/mm/mmu.c, we can
see cache policies that contain linux pte information and hardware pmd
information. I'll ponder this some more, if anyone has a neat way of handling
this then please let me know :-).

> > +
> > +       if (newprotval & L_PTE_RDONLY)
> > +               pmdval &= ~PMD_SECT_AP_WRITE;
> > +       else
> > +               pmdval |= PMD_SECT_AP_WRITE;
> > +
> > +       return __pmd(pmdval);
> > +}
> > +
> > +#endif /* CONFIG_SYS_SUPPORTS_HUGETLBFS */
> > +
> >  #endif /* __ASSEMBLY__ */
> >
> >  #endif /* _ASM_PGTABLE_2LEVEL_H */
> > diff --git a/arch/arm/include/asm/tlb.h b/arch/arm/include/asm/tlb.h
> > index 99a1951..685e9e87 100644
> > --- a/arch/arm/include/asm/tlb.h
> > +++ b/arch/arm/include/asm/tlb.h
> > @@ -92,10 +92,16 @@ static inline void tlb_flush(struct mmu_gather *tlb)
> >  static inline void tlb_add_flush(struct mmu_gather *tlb, unsigned long addr)
> >  {
> >         if (!tlb->fullmm) {
> > +               unsigned long size = PAGE_SIZE;
> > +
> >                 if (addr < tlb->range_start)
> >                         tlb->range_start = addr;
> > -               if (addr + PAGE_SIZE > tlb->range_end)
> > -                       tlb->range_end = addr + PAGE_SIZE;
> > +
> > +               if (tlb->vma && is_vm_hugetlb_page(tlb->vma))
> > +                       size = HPAGE_SIZE;
> > +
> > +               if (addr + size > tlb->range_end)
> > +                       tlb->range_end = addr + size;
> >         }
> >  }
> >
> > diff --git a/arch/arm/kernel/head.S b/arch/arm/kernel/head.S
> > index 4eee351..860f08e 100644
> > --- a/arch/arm/kernel/head.S
> > +++ b/arch/arm/kernel/head.S
> > @@ -410,13 +410,21 @@ __enable_mmu:
> >         mov     r5, #0
> >         mcrr    p15, 0, r4, r5, c2              @ load TTBR0
> >  #else
> > +#ifndef        CONFIG_SYS_SUPPORTS_HUGETLBFS
> >         mov     r5, #(domain_val(DOMAIN_USER, DOMAIN_MANAGER) | \
> >                       domain_val(DOMAIN_KERNEL, DOMAIN_MANAGER) | \
> >                       domain_val(DOMAIN_TABLE, DOMAIN_MANAGER) | \
> >                       domain_val(DOMAIN_IO, DOMAIN_CLIENT))
> > +#else
> > +       @ set ourselves as the client in all domains
> > +       @ this allows us to then use the 4 domain bits in the
> > +       @ section descriptors in our transparent huge pages
> > +       ldr     r5, =0x55555555
> > +#endif /* CONFIG_SYS_SUPPORTS_HUGETLBFS */
> > +
> >         mcr     p15, 0, r5, c3, c0, 0           @ load domain access register
> >         mcr     p15, 0, r4, c2, c0, 0           @ load page table pointer
> > -#endif
> > +#endif /* CONFIG_ARM_LPAE */
> >         b       __turn_mmu_on
> >  ENDPROC(__enable_mmu)
> >
> > diff --git a/arch/arm/mm/Makefile b/arch/arm/mm/Makefile
> > index 1560bbc..adf0b19 100644
> > --- a/arch/arm/mm/Makefile
> > +++ b/arch/arm/mm/Makefile
> > @@ -17,7 +17,11 @@ obj-$(CONFIG_MODULES)                += proc-syms.o
> >  obj-$(CONFIG_ALIGNMENT_TRAP)   += alignment.o
> >  obj-$(CONFIG_HIGHMEM)          += highmem.o
> >  obj-$(CONFIG_HUGETLB_PAGE)     += hugetlbpage.o
> > +ifeq ($(CONFIG_ARM_LPAE),y)
> >  obj-$(CONFIG_HUGETLB_PAGE)     += hugetlbpage-3level.o
> > +else
> > +obj-$(CONFIG_HUGETLB_PAGE)     += hugetlbpage-2level.o
> > +endif
> >
> >  obj-$(CONFIG_CPU_ABRT_NOMMU)   += abort-nommu.o
> >  obj-$(CONFIG_CPU_ABRT_EV4)     += abort-ev4.o
> > diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c
> > index 5dbf13f..0884936 100644
> > --- a/arch/arm/mm/fault.c
> > +++ b/arch/arm/mm/fault.c
> > @@ -488,13 +488,13 @@ do_translation_fault(unsigned long addr, unsigned int fsr,
> >  #endif                                 /* CONFIG_MMU */
> >
> >  /*
> > - * Some section permission faults need to be handled gracefully.
> > - * They can happen due to a __{get,put}_user during an oops.
> > + * A fault in a section will likely be due to a huge page, treat it
> > + * as a page fault.
> >   */
> >  static int
> >  do_sect_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
> >  {
> > -       do_bad_area(addr, fsr, regs);
> > +       do_page_fault(addr, fsr, regs);
>
> doesn't the previous patch require this as well?
>
> (so it should strictly speaking be part of that patch)
>

Yes it does. Thanks I'll clean this up by updating the fsr_info tables for long
and short descriptors; and remove the do_sect_fault->do_page_fault daisy chaining.

> >         return 0;
> >  }
> >
> > diff --git a/arch/arm/mm/hugetlbpage-2level.c b/arch/arm/mm/hugetlbpage-2level.c
> > new file mode 100644
> > index 0000000..4b2b38c
> > --- /dev/null
> > +++ b/arch/arm/mm/hugetlbpage-2level.c
> > @@ -0,0 +1,115 @@
> > +/*
> > + * arch/arm/mm/hugetlbpage-2level.c
> > + *
> > + * Copyright (C) 2002, Rohit Seth <rohit.seth at intel.com>
> > + * Copyright (C) 2012 ARM Ltd
> > + * Copyright (C) 2012 Bill Carson.
> > + *
> > + * Based on arch/x86/include/asm/hugetlb.h and Bill Carson's patches
> > + *
> > + * This program is free software; you can redistribute it and/or modify
> > + * it under the terms of the GNU General Public License version 2 as
> > + * published by the Free Software Foundation.
> > + *
> > + * This program is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> > + * GNU General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU General Public License
> > + * along with this program; if not, write to the Free Software
> > + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
> > + */
> > +
> > +#include <linux/init.h>
> > +#include <linux/fs.h>
> > +#include <linux/mm.h>
> > +#include <linux/hugetlb.h>
> > +#include <linux/pagemap.h>
> > +#include <linux/err.h>
> > +#include <linux/sysctl.h>
> > +#include <asm/mman.h>
> > +#include <asm/tlb.h>
> > +#include <asm/tlbflush.h>
> > +#include <asm/pgalloc.h>
> > +
> > +int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
> > +{
> > +       return 0;
> > +}
> > +
> > +pte_t *huge_pte_alloc(struct mm_struct *mm,
> > +                       unsigned long addr, unsigned long sz)
> > +{
> > +       pgd_t *pgd;
> > +       pud_t *pud;
> > +       pmd_t *pmd;
> > +
> > +       pgd = pgd_offset(mm, addr);
> > +       pud = pud_offset(pgd, addr);
> > +       pmd = pmd_offset(pud, addr);
> > +
> > +       return (pte_t *)pmd; /* our huge pte is actually a pmd */
> > +}
> > +
> > +struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
> > +                            pmd_t *pmd, int write)
> > +{
> > +       struct page *page;
> > +       unsigned long pfn;
> > +
> > +       BUG_ON((pmd_val(*pmd) & PMD_TYPE_MASK) != PMD_TYPE_SECT);
>
> I could only see one caller who calls this only when this exact
> condition is fulfilled, so unless we anticipate other callers, this
> BUG_ON could go.
>

Yes thanks, this can be scrubbed.

> > +       pfn = ((pmd_val(*pmd) & HPAGE_MASK) >> PAGE_SHIFT);
> > +       page = pfn_to_page(pfn);
> > +       return page;
> > +}
> > +
> > +pte_t huge_ptep_get(pte_t *ptep)
> > +{
> > +       pmd_t *pmdp = (pmd_t*)ptep;
> > +       pmdval_t pmdval = pmd_val(*pmdp);
> > +       pteval_t retval;
> > +
> > +       if (!pmdval)
> > +               return __pte(0);
> > +
> > +       retval = (pteval_t) (pmdval & HPAGE_MASK);
> > +       HPMD_XLATE(retval, pmdval, PMD_SECT_XN, L_PTE_XN);
> > +       HPMD_XLATE(retval, pmdval, PMD_SECT_S, L_PTE_SHARED);
> > +       HPMD_XLATE(retval, pmdval, PMD_DSECT_AF, L_PTE_YOUNG);
> > +       HPMD_XLATE(retval, pmdval, PMD_DSECT_DIRTY, L_PTE_DIRTY);
> > +
> > +       /* preserve bits C & B */
> > +       retval |= (pmdval & (3 << 2));
> > +
> > +       /* PMD TEX bit 0 corresponds to Linux PTE bit 4 */
> > +       HPMD_XLATE(retval, pmdval, PMD_SECT_TEX(1), 1 << 4);
> > +
>
> again, I would define the 1 << 4 to something and treat like the others...
>
> > +       if (pmdval & PMD_SECT_AP_WRITE)
> > +               retval &= ~L_PTE_RDONLY;
> > +       else
> > +               retval |= L_PTE_RDONLY;
> > +
> > +       if ((pmdval & PMD_TYPE_MASK) == PMD_TYPE_SECT)
> > +               retval |= L_PTE_VALID;
> > +
> > +       /* we assume all hugetlb pages are user */
> > +       retval |= L_PTE_USER;
> > +
> > +       return __pte(retval);
> > +}
> > +
> > +void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
> > +                                  pte_t *ptep, pte_t pte)
> > +{
> > +       pmdval_t pmdval = (pmdval_t) pte_val(pte);
> > +       pmd_t *pmdp = (pmd_t*) ptep;
> > +
> > +       pmdval &= HPAGE_MASK;
> > +       pmdval |= PMD_SECT_AP_READ | PMD_SECT_nG | PMD_TYPE_SECT;
> > +       pmdval = pmd_val(pmd_modify(__pmd(pmdval), __pgprot(pte_val(pte))));
> > +
> > +       __sync_icache_dcache(pte);
> > +
> > +       set_pmd_at(mm, addr, pmdp, __pmd(pmdval));
> > +}
>
> so this whole scheme where the caller expects ptes, but really gets
> pmds feels strange to me, but perhaps it makes more sense on other
> architectures as to not change the caller instead of this magic?
>

It is a little strange, but expected. We are considering one level up from
normal page table entries. The short descriptor case is made stranger by the
linux/hardware pte distinction. I wanted to re-purpose the domain bits and use
translation as this allows for a much simpler transparent huge page
implemention.

I'll see if I can simplify some bits of the short descriptor hugetlb code.

> -Christoffer
>