[RFC PATCH 4/6] ARM: mm: HugeTLB support for non-LPAE systems.
Christoffer Dall
c.dall at virtualopensystems.com
Tue Jan 8 13:13:13 EST 2013
On Tue, Jan 8, 2013 at 12:58 PM, Steve Capper <steve.capper at arm.com> wrote:
> On Fri, Jan 04, 2013 at 05:04:43AM +0000, Christoffer Dall wrote:
>> On Thu, Oct 18, 2012 at 12:15 PM, Steve Capper <steve.capper at arm.com> wrote:
>
>> > diff --git a/arch/arm/include/asm/hugetlb-2level.h b/arch/arm/include/asm/hugetlb-2level.h
>> > new file mode 100644
>> > index 0000000..3532b54
>> > --- /dev/null
>> > +++ b/arch/arm/include/asm/hugetlb-2level.h
>> > @@ -0,0 +1,71 @@
>> > +/*
>> > + * arch/arm/include/asm/hugetlb-2level.h
>> > + *
>> > + * Copyright (C) 2012 ARM Ltd.
>> > + *
>> > + * Based on arch/x86/include/asm/hugetlb.h and Bill Carson's patches
>> > + *
>> > + * This program is free software; you can redistribute it and/or modify
>> > + * it under the terms of the GNU General Public License version 2 as
>> > + * published by the Free Software Foundation.
>> > + *
>> > + * This program is distributed in the hope that it will be useful,
>> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
>> > + * GNU General Public License for more details.
>> > + *
>> > + * You should have received a copy of the GNU General Public License
>> > + * along with this program; if not, write to the Free Software
>> > + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
>> > + */
>> > +
>> > +#ifndef _ASM_ARM_HUGETLB_2LEVEL_H
>> > +#define _ASM_ARM_HUGETLB_2LEVEL_H
>> > +
>> > +
>> > +pte_t huge_ptep_get(pte_t *ptep);
>> > +
>> > +void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
>> > + pte_t *ptep, pte_t pte);
>> > +
>> > +static inline pte_t pte_mkhuge(pte_t pte) { return pte; }
>> > +
>> > +static inline void huge_ptep_clear_flush(struct vm_area_struct *vma,
>> > + unsigned long addr, pte_t *ptep)
>> > +{
>> > + flush_tlb_range(vma, addr, addr + HPAGE_SIZE);
>>
>> don't you need to clear the entry first here? Otherwise another CPU
>> could cache a TLB entry for the old page and access it even after the
>> page_cache_release(old_page) in hugetlb_cow()?
>>
>
> Yes I do, thanks.
>
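Cool. For the archive, something like this is what I had in mind
(untested sketch on my side -- invalidate the entry before flushing, so
another CPU can't re-fetch it in the window):

    static inline void huge_ptep_clear_flush(struct vm_area_struct *vma,
                                             unsigned long addr, pte_t *ptep)
    {
            pmd_t *pmdp = (pmd_t *)ptep;

            /* clear the section entry (and its pair) first... */
            pmd_clear(pmdp);
            /* ...then shoot down any stale TLB entries on all CPUs */
            flush_tlb_range(vma, addr, addr + HPAGE_SIZE);
    }
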
>> > +}
>> > +
>> > +static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
>> > + unsigned long addr, pte_t *ptep)
>> > +{
>> > + pmd_t *pmdp = (pmd_t *) ptep;
>> > + set_pmd_at(mm, addr, pmdp, pmd_wrprotect(*pmdp));
>> > +}
>> > +
>> > +
>> > +static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
>> > + unsigned long addr, pte_t *ptep)
>> > +{
>> > + pmd_t *pmdp = (pmd_t *)ptep;
>> > + pte_t pte = huge_ptep_get(ptep);
>> > + pmd_clear(pmdp);
>> > +
>> > + return pte;
>> > +}
>> > +
>> > +static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma,
>> > + unsigned long addr, pte_t *ptep,
>> > + pte_t pte, int dirty)
>> > +{
>> > + int changed = !pte_same(huge_ptep_get(ptep), pte);
>> > +
>> > + if (changed) {
>> > + set_huge_pte_at(vma->vm_mm, addr, ptep, pte);
>> > + huge_ptep_clear_flush(vma, addr, &pte);
>> > + }
>> > +
>> > + return changed;
>> > +}
>> > +
>> > +#endif /* _ASM_ARM_HUGETLB_2LEVEL_H */
>> > diff --git a/arch/arm/include/asm/hugetlb.h b/arch/arm/include/asm/hugetlb.h
>> > index 7af9cf6..1e92975 100644
>> > --- a/arch/arm/include/asm/hugetlb.h
>> > +++ b/arch/arm/include/asm/hugetlb.h
>> > @@ -24,7 +24,11 @@
>> >
>> > #include <asm/page.h>
>> >
>> > +#ifdef CONFIG_ARM_LPAE
>> > #include <asm/hugetlb-3level.h>
>> > +#else
>> > +#include <asm/hugetlb-2level.h>
>> > +#endif
>> >
>> > static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb,
>> > unsigned long addr, unsigned long end,
>> > diff --git a/arch/arm/include/asm/pgtable-2level.h b/arch/arm/include/asm/pgtable-2level.h
>> > index 662a00e..fd1d9be 100644
>> > --- a/arch/arm/include/asm/pgtable-2level.h
>> > +++ b/arch/arm/include/asm/pgtable-2level.h
>> > @@ -163,7 +163,7 @@ static inline pmd_t *pmd_offset(pud_t *pud, unsigned long addr)
>> > return (pmd_t *)pud;
>> > }
>> >
>> > -#define pmd_bad(pmd) (pmd_val(pmd) & 2)
>> > +#define pmd_bad(pmd) ((pmd_val(pmd) & PMD_TYPE_MASK) == PMD_TYPE_FAULT)
>>
>> this changes the semantics of the macro - is that on purpose and safe?
>>
>> (fault entries didn't use to be bad; now they are...)
>>
>
> Yes, thanks, the semantics should be retained (they are for LPAE).
>
>> >
>> > #define copy_pmd(pmdpd,pmdps) \
>> > do { \
>> > @@ -184,6 +184,83 @@ static inline pmd_t *pmd_offset(pud_t *pud, unsigned long addr)
>> >
>> > #define set_pte_ext(ptep,pte,ext) cpu_set_pte_ext(ptep,pte,ext)
>> >
>> > +
>> > +#ifdef CONFIG_SYS_SUPPORTS_HUGETLBFS
>> > +
>> > +/*
>> > + * now follows some of the definitions to allow huge page support, we can't put
>> > + * these in the hugetlb source files as they are also required for transparent
>> > + * hugepage support.
>> > + */
>> > +
>> > +#define HPAGE_SHIFT PMD_SHIFT
>> > +#define HPAGE_SIZE (_AC(1, UL) << HPAGE_SHIFT)
>> > +#define HPAGE_MASK (~(HPAGE_SIZE - 1))
>> > +#define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT)
>> > +
>> > +#define HUGE_LINUX_PTE_COUNT (PAGE_OFFSET >> HPAGE_SHIFT)
>> > +#define HUGE_LINUX_PTE_SIZE (HUGE_LINUX_PTE_COUNT * sizeof(pte_t *))
>> > +#define HUGE_LINUX_PTE_INDEX(addr) (addr >> HPAGE_SHIFT)
>> > +
>> > +/*
>> > + * We re-purpose the following domain bits in the section descriptor
>> > + */
>> > +#define PMD_DSECT_DIRTY (_AT(pmdval_t, 1) << 5)
>> > +#define PMD_DSECT_AF (_AT(pmdval_t, 1) << 6)
>> > +
>> > +#define PMD_BIT_FUNC(fn,op) \
>> > +static inline pmd_t pmd_##fn(pmd_t pmd) { pmd_val(pmd) op; return pmd; }
>> > +
>> > +PMD_BIT_FUNC(wrprotect, &= ~PMD_SECT_AP_WRITE);
>> > +
>> > +static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
>> > + pmd_t *pmdp, pmd_t pmd)
>> > +{
>> > + /*
>> > + * we can sometimes be passed a pmd pointing to a level 2 descriptor
>> > + * from collapse_huge_page.
>> > + */
>> > + if ((pmd_val(pmd) & PMD_TYPE_MASK) == PMD_TYPE_TABLE) {
>> > + pmdp[0] = __pmd(pmd_val(pmd));
>> > + pmdp[1] = __pmd(pmd_val(pmd) + 256 * sizeof(pte_t));
>>
>> eh, if I get this right, this means that in the case where the pmd
>> points to a level 2 page table, all the pages are lined up to form a
>> huge page, so we just point the second pmd at the second level 2
>> table, which directly follows the first one because they share the
>> same page. But then why do we need to set any values here?
>>
>
> This is a little weird.
>
> The transparent huge page code will try sometimes to collapse a group of pages
> into a huge page. As part of the collapse process, it will invalidate the pmd
> before it copies the physical pages into a contiguous huge page. This ensures
> that memory accesses to the area being collapsed fault and retry whilst the
> collapse takes place. Sometimes the collapse process will be aborted after the pmd has
> been invalidated, so the original pmd (which points to a page table) needs to
> be put back as part of the rollback.
>
> With 2 levels of paging, the pmds are arranged in pairs so we put back a pair
> of pmds.
>
tricky! I got it, thanks.
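For my own notes, my understanding of the pair layout with short
descriptors: the two 256-entry hardware pte tables live back to back in
the same page, so the second one sits 256 * sizeof(pte_t) bytes after
the first:

    pmdp[0] --> h/w pte table 0 (256 entries, maps the first 1MB)
    pmdp[1] --> h/w pte table 1 (256 entries, maps the second 1MB)

so the rollback above simply restores both halves of the pair.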
>> > + } else {
>> > + pmdp[0] = __pmd(pmd_val(pmd)); /* first 1M section */
>> > + pmdp[1] = __pmd(pmd_val(pmd) + SECTION_SIZE); /* second 1M section */
>> > + }
>> > +
>> > + flush_pmd_entry(pmdp);
>> > +}
>> > +
>> > +#define HPMD_XLATE(res, cmp, from, to) do { if (cmp & from) res |= to; \
>> > + else res &= ~to; \
>> > + } while (0)
>> > +
>> > +static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
>> > +{
>> > + pmdval_t pmdval = pmd_val(pmd);
>> > + pteval_t newprotval = pgprot_val(newprot);
>> > +
>> > + HPMD_XLATE(pmdval, newprotval, L_PTE_XN, PMD_SECT_XN);
>> > + HPMD_XLATE(pmdval, newprotval, L_PTE_SHARED, PMD_SECT_S);
>> > + HPMD_XLATE(pmdval, newprotval, L_PTE_YOUNG, PMD_DSECT_AF);
>>
>> consider something akin to:
>>
>> #define L_PMD_DSECT_YOUNG (PMD_DSECT_AF)
>>
>> then you don't have to change several places if you decide to
>> rearrange the mappings for whatever reason, and it makes this code
>> slightly easier to read.
>>
>
> Yeah, something along those lines may look better. I'll have a tinker.
>
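Just to illustrate what I meant (the L_PMD_* names are made up):

    #define L_PMD_DSECT_YOUNG       PMD_DSECT_AF
    #define L_PMD_DSECT_DIRTY       PMD_DSECT_DIRTY

    HPMD_XLATE(pmdval, newprotval, L_PTE_YOUNG, L_PMD_DSECT_YOUNG);
    HPMD_XLATE(pmdval, newprotval, L_PTE_DIRTY, L_PMD_DSECT_DIRTY);

then a future reshuffle of the section bits only touches the defines.
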
>> > + HPMD_XLATE(pmdval, newprotval, L_PTE_DIRTY, PMD_DSECT_DIRTY);
>> > +
>> > + /* preserve bits C & B */
>> > + pmdval |= (newprotval & (3 << 2));
>>
>> this looks superfluous?
>>
>> > +
>> > + /* Linux PTE bit 4 corresponds to PMD TEX bit 0 */
>> > + HPMD_XLATE(pmdval, newprotval, 1 << 4, PMD_SECT_TEX(1));
>>
>> define L_PTE_TEX0 and group with the others above?
>>
>
> The mapping is not quite that simple. We have multiple memory types defined
> in pgtable-{23}level.h and these have different meanings depending on the
> target processor. For v6 and v7 the above works, but ideally, I should be able
> to look up the memory type mapping. For instance in arch/arm/mm/mmu.c, we can
> see cache policies that contain linux pte information and hardware pmd
> information. I'll ponder this some more; if anyone has a neat way of handling
> this then please let me know :-).
>
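Fair enough. For the v6/v7-only case I was imagining something as
simple as (illustrative name, and as you say it ignores the other
memory type mappings):

    /* Linux PTE memory type bit that lands in TEX[0] on v6/v7 */
    #define L_PTE_TEX0              (_AT(pteval_t, 1) << 4)

    HPMD_XLATE(pmdval, newprotval, L_PTE_TEX0, PMD_SECT_TEX(1));

but I take your point about the other cache policies.
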
>> > +
>> > + if (newprotval & L_PTE_RDONLY)
>> > + pmdval &= ~PMD_SECT_AP_WRITE;
>> > + else
>> > + pmdval |= PMD_SECT_AP_WRITE;
>> > +
>> > + return __pmd(pmdval);
>> > +}
>> > +
>> > +#endif /* CONFIG_SYS_SUPPORTS_HUGETLBFS */
>> > +
>> > #endif /* __ASSEMBLY__ */
>> >
>> > #endif /* _ASM_PGTABLE_2LEVEL_H */
>> > diff --git a/arch/arm/include/asm/tlb.h b/arch/arm/include/asm/tlb.h
>> > index 99a1951..685e9e87 100644
>> > --- a/arch/arm/include/asm/tlb.h
>> > +++ b/arch/arm/include/asm/tlb.h
>> > @@ -92,10 +92,16 @@ static inline void tlb_flush(struct mmu_gather *tlb)
>> > static inline void tlb_add_flush(struct mmu_gather *tlb, unsigned long addr)
>> > {
>> > if (!tlb->fullmm) {
>> > + unsigned long size = PAGE_SIZE;
>> > +
>> > if (addr < tlb->range_start)
>> > tlb->range_start = addr;
>> > - if (addr + PAGE_SIZE > tlb->range_end)
>> > - tlb->range_end = addr + PAGE_SIZE;
>> > +
>> > + if (tlb->vma && is_vm_hugetlb_page(tlb->vma))
>> > + size = HPAGE_SIZE;
>> > +
>> > + if (addr + size > tlb->range_end)
>> > + tlb->range_end = addr + size;
>> > }
>> > }
>> >
>> > diff --git a/arch/arm/kernel/head.S b/arch/arm/kernel/head.S
>> > index 4eee351..860f08e 100644
>> > --- a/arch/arm/kernel/head.S
>> > +++ b/arch/arm/kernel/head.S
>> > @@ -410,13 +410,21 @@ __enable_mmu:
>> > mov r5, #0
>> > mcrr p15, 0, r4, r5, c2 @ load TTBR0
>> > #else
>> > +#ifndef CONFIG_SYS_SUPPORTS_HUGETLBFS
>> > mov r5, #(domain_val(DOMAIN_USER, DOMAIN_MANAGER) | \
>> > domain_val(DOMAIN_KERNEL, DOMAIN_MANAGER) | \
>> > domain_val(DOMAIN_TABLE, DOMAIN_MANAGER) | \
>> > domain_val(DOMAIN_IO, DOMAIN_CLIENT))
>> > +#else
>> > + @ set ourselves as the client in all domains
>> > + @ this allows us to then use the 4 domain bits in the
>> > + @ section descriptors in our transparent huge pages
>> > + ldr r5, =0x55555555
>> > +#endif /* CONFIG_SYS_SUPPORTS_HUGETLBFS */
>> > +
>> > mcr p15, 0, r5, c3, c0, 0 @ load domain access register
>> > mcr p15, 0, r4, c2, c0, 0 @ load page table pointer
>> > -#endif
>> > +#endif /* CONFIG_ARM_LPAE */
>> > b __turn_mmu_on
>> > ENDPROC(__enable_mmu)
>> >
>> > diff --git a/arch/arm/mm/Makefile b/arch/arm/mm/Makefile
>> > index 1560bbc..adf0b19 100644
>> > --- a/arch/arm/mm/Makefile
>> > +++ b/arch/arm/mm/Makefile
>> > @@ -17,7 +17,11 @@ obj-$(CONFIG_MODULES) += proc-syms.o
>> > obj-$(CONFIG_ALIGNMENT_TRAP) += alignment.o
>> > obj-$(CONFIG_HIGHMEM) += highmem.o
>> > obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
>> > +ifeq ($(CONFIG_ARM_LPAE),y)
>> > obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage-3level.o
>> > +else
>> > +obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage-2level.o
>> > +endif
>> >
>> > obj-$(CONFIG_CPU_ABRT_NOMMU) += abort-nommu.o
>> > obj-$(CONFIG_CPU_ABRT_EV4) += abort-ev4.o
>> > diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c
>> > index 5dbf13f..0884936 100644
>> > --- a/arch/arm/mm/fault.c
>> > +++ b/arch/arm/mm/fault.c
>> > @@ -488,13 +488,13 @@ do_translation_fault(unsigned long addr, unsigned int fsr,
>> > #endif /* CONFIG_MMU */
>> >
>> > /*
>> > - * Some section permission faults need to be handled gracefully.
>> > - * They can happen due to a __{get,put}_user during an oops.
>> > + * A fault in a section will likely be due to a huge page, treat it
>> > + * as a page fault.
>> > */
>> > static int
>> > do_sect_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
>> > {
>> > - do_bad_area(addr, fsr, regs);
>> > + do_page_fault(addr, fsr, regs);
>>
>> doesn't the previous patch require this as well?
>>
>> (so it should strictly speaking be part of that patch)
>>
>
> Yes it does. Thanks, I'll clean this up by updating the fsr_info tables for long
> and short descriptors, and remove the do_sect_fault->do_page_fault daisy chaining.
>
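Sounds good. I assume that boils down to pointing the short-descriptor
section permission fault entry straight at do_page_fault, something
like (illustrative, in fsr-2level.c):

    -       { do_sect_fault,        SIGSEGV, SEGV_ACCERR,   "section permission fault" },
    +       { do_page_fault,        SIGSEGV, SEGV_ACCERR,   "section permission fault" },
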
>> > return 0;
>> > }
>> >
>> > diff --git a/arch/arm/mm/hugetlbpage-2level.c b/arch/arm/mm/hugetlbpage-2level.c
>> > new file mode 100644
>> > index 0000000..4b2b38c
>> > --- /dev/null
>> > +++ b/arch/arm/mm/hugetlbpage-2level.c
>> > @@ -0,0 +1,115 @@
>> > +/*
>> > + * arch/arm/mm/hugetlbpage-2level.c
>> > + *
>> > + * Copyright (C) 2002, Rohit Seth <rohit.seth at intel.com>
>> > + * Copyright (C) 2012 ARM Ltd
>> > + * Copyright (C) 2012 Bill Carson.
>> > + *
>> > + * Based on arch/x86/include/asm/hugetlb.h and Bill Carson's patches
>> > + *
>> > + * This program is free software; you can redistribute it and/or modify
>> > + * it under the terms of the GNU General Public License version 2 as
>> > + * published by the Free Software Foundation.
>> > + *
>> > + * This program is distributed in the hope that it will be useful,
>> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
>> > + * GNU General Public License for more details.
>> > + *
>> > + * You should have received a copy of the GNU General Public License
>> > + * along with this program; if not, write to the Free Software
>> > + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
>> > + */
>> > +
>> > +#include <linux/init.h>
>> > +#include <linux/fs.h>
>> > +#include <linux/mm.h>
>> > +#include <linux/hugetlb.h>
>> > +#include <linux/pagemap.h>
>> > +#include <linux/err.h>
>> > +#include <linux/sysctl.h>
>> > +#include <asm/mman.h>
>> > +#include <asm/tlb.h>
>> > +#include <asm/tlbflush.h>
>> > +#include <asm/pgalloc.h>
>> > +
>> > +int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
>> > +{
>> > + return 0;
>> > +}
>> > +
>> > +pte_t *huge_pte_alloc(struct mm_struct *mm,
>> > + unsigned long addr, unsigned long sz)
>> > +{
>> > + pgd_t *pgd;
>> > + pud_t *pud;
>> > + pmd_t *pmd;
>> > +
>> > + pgd = pgd_offset(mm, addr);
>> > + pud = pud_offset(pgd, addr);
>> > + pmd = pmd_offset(pud, addr);
>> > +
>> > + return (pte_t *)pmd; /* our huge pte is actually a pmd */
>> > +}
>> > +
>> > +struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
>> > + pmd_t *pmd, int write)
>> > +{
>> > + struct page *page;
>> > + unsigned long pfn;
>> > +
>> > + BUG_ON((pmd_val(*pmd) & PMD_TYPE_MASK) != PMD_TYPE_SECT);
>>
>> I could only see one caller, and it calls this only when this exact
>> condition is fulfilled, so unless we anticipate other callers, this
>> BUG_ON could go.
>>
>
> Yes, thanks, this can be scrubbed.
>
>> > + pfn = ((pmd_val(*pmd) & HPAGE_MASK) >> PAGE_SHIFT);
>> > + page = pfn_to_page(pfn);
>> > + return page;
>> > +}
>> > +
>> > +pte_t huge_ptep_get(pte_t *ptep)
>> > +{
>> > + pmd_t *pmdp = (pmd_t*)ptep;
>> > + pmdval_t pmdval = pmd_val(*pmdp);
>> > + pteval_t retval;
>> > +
>> > + if (!pmdval)
>> > + return __pte(0);
>> > +
>> > + retval = (pteval_t) (pmdval & HPAGE_MASK);
>> > + HPMD_XLATE(retval, pmdval, PMD_SECT_XN, L_PTE_XN);
>> > + HPMD_XLATE(retval, pmdval, PMD_SECT_S, L_PTE_SHARED);
>> > + HPMD_XLATE(retval, pmdval, PMD_DSECT_AF, L_PTE_YOUNG);
>> > + HPMD_XLATE(retval, pmdval, PMD_DSECT_DIRTY, L_PTE_DIRTY);
>> > +
>> > + /* preserve bits C & B */
>> > + retval |= (pmdval & (3 << 2));
>> > +
>> > + /* PMD TEX bit 0 corresponds to Linux PTE bit 4 */
>> > + HPMD_XLATE(retval, pmdval, PMD_SECT_TEX(1), 1 << 4);
>> > +
>>
>> again, I would give the 1 << 4 a name and treat it like the others...
>>
>> > + if (pmdval & PMD_SECT_AP_WRITE)
>> > + retval &= ~L_PTE_RDONLY;
>> > + else
>> > + retval |= L_PTE_RDONLY;
>> > +
>> > + if ((pmdval & PMD_TYPE_MASK) == PMD_TYPE_SECT)
>> > + retval |= L_PTE_VALID;
>> > +
>> > + /* we assume all hugetlb pages are user */
>> > + retval |= L_PTE_USER;
>> > +
>> > + return __pte(retval);
>> > +}
>> > +
>> > +void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
>> > + pte_t *ptep, pte_t pte)
>> > +{
>> > + pmdval_t pmdval = (pmdval_t) pte_val(pte);
>> > + pmd_t *pmdp = (pmd_t*) ptep;
>> > +
>> > + pmdval &= HPAGE_MASK;
>> > + pmdval |= PMD_SECT_AP_READ | PMD_SECT_nG | PMD_TYPE_SECT;
>> > + pmdval = pmd_val(pmd_modify(__pmd(pmdval), __pgprot(pte_val(pte))));
>> > +
>> > + __sync_icache_dcache(pte);
>> > +
>> > + set_pmd_at(mm, addr, pmdp, __pmd(pmdval));
>> > +}
>>
>> so this whole scheme where the caller expects ptes but really gets
>> pmds feels strange to me, but perhaps it was done this way to avoid
>> changing the callers on other architectures, at the cost of this
>> magic?
>>
>
> It is a little strange, but expected. We are operating one level up from
> normal page table entries. The short descriptor case is made stranger by the
> linux/hardware pte distinction. I wanted to re-purpose the domain bits and
> translate between the bit layouts, as this allows for a much simpler
> transparent huge page implementation.
>
> I'll see if I can simplify some bits of the short descriptor hugetlb code.
>
>> -Christoffer
>>
>