[RESEND RFC V2] ARM: mm: make UACCESS_WITH_MEMCPY huge page aware
Nicolas Pitre
nicolas.pitre at linaro.org
Mon Sep 23 12:26:32 EDT 2013
On Mon, 23 Sep 2013, Steve Capper wrote:
> Resending, as I omitted a few important CCs.
>
> ---
>
> The memory pinning code in uaccess_with_memcpy.c does not check
> for HugeTLB or THP pmds, and will enter an infinite loop should
> a __copy_to_user or __clear_user occur against a huge page.
>
> This patch adds huge page detection code to pin_page_for_write.
> As this code can be executed in a fast path, it inspects the actual
> pmds rather than the vma. If a HugeTLB or THP entry is found (they
> have the same pmd representation on ARM), the page table spinlock is
> taken to prevent modification whilst the page is pinned.
>
> On ARM, huge pages are only represented as pmds, thus no huge pud
> checks are performed. (For huge puds, one would lock the page table
> in a manner similar to the pmd case.)
>
> Two helper functions are introduced: pmd_thp_or_huge checks whether
> a pmd maps a HugeTLB or transparent huge page (which have the same
> pmd layout on ARM), and pmd_hugewillfault detects whether a write to
> the page will fault.
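
To restate the fast path in one place (my condensed paraphrase of the
pin_page_for_write() hunk below, not code from the patch itself):

	/*
	 * Sketch of the new detection logic: a huge pmd is pinned only
	 * when a write is guaranteed not to fault, and page_table_lock
	 * is held so the entry cannot change underneath us.
	 */
	if (pmd_thp_or_huge(*pmd)) {
		spin_lock(&current->mm->page_table_lock);
		if (!pmd_thp_or_huge(*pmd) ||		/* raced with a split/unmap */
		    pmd_hugewillfault(*pmd) ||		/* write would fault */
		    pmd_trans_splitting(*pmd)) {	/* THP mid-split */
			spin_unlock(&current->mm->page_table_lock);
			return 0;			/* fall back to the slow path */
		}
		/* pinned: *ptep = NULL tells the caller this is a huge mapping */
	}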
>
> Changes since first RFC:
> * The page mask is widened for hugepages to reduce the number
> of potential locks/unlocks.
> (A knobbled /dev/zero, with its latency reduction chunks
> removed, shows a 2x data rate boost when backed by hugepages:
> dd if=/dev/zero of=/dev/null bs=10M count=1024)
Are you saying that the 2x boost is due to this page mask widening?
A non-negligible drawback of this large mask is that you're holding a
spinlock for a much longer period.
What kind of performance do you get by limiting the lock period to a
small page boundary?
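
For concreteness on the lock-hold question, a worked example (my
numbers, assuming 4 KiB small pages and 2 MiB huge pages on LPAE):

	/* tocopy = distance from 'to' up to the next mask boundary */
	unsigned long to = 0x600010;	/* hypothetical address near the start of a 2M page */

	(~to & ~PAGE_MASK) + 1;		/* = 0xff0    (~4 KiB): lock dropped at 0x601000 */
	(~to & ~HPAGE_MASK) + 1;	/* = 0x1ffff0 (~2 MiB): lock dropped at 0x800000 */

So with HPAGE_MASK a single iteration may memcpy up to 512 small
pages' worth of data while page_table_lock is held.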
>
> Signed-off-by: Steve Capper <steve.capper at linaro.org>
> ---
> arch/arm/include/asm/pgtable-3level.h | 3 ++
> arch/arm/lib/uaccess_with_memcpy.c | 57 ++++++++++++++++++++++++++++++-----
> 2 files changed, 52 insertions(+), 8 deletions(-)
>
> diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h
> index 5689c18..39c54cf 100644
> --- a/arch/arm/include/asm/pgtable-3level.h
> +++ b/arch/arm/include/asm/pgtable-3level.h
> @@ -206,6 +206,9 @@ static inline pmd_t *pmd_offset(pud_t *pud, unsigned long addr)
> #define __HAVE_ARCH_PMD_WRITE
> #define pmd_write(pmd) (!(pmd_val(pmd) & PMD_SECT_RDONLY))
>
> +#define pmd_hugewillfault(pmd) (!pmd_young(pmd) || !pmd_write(pmd))
> +#define pmd_thp_or_huge(pmd) (pmd_huge(pmd) || pmd_trans_huge(pmd))
> +
> #ifdef CONFIG_TRANSPARENT_HUGEPAGE
> #define pmd_trans_huge(pmd) (pmd_val(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT))
> #define pmd_trans_splitting(pmd) (pmd_val(pmd) & PMD_SECT_SPLITTING)
> diff --git a/arch/arm/lib/uaccess_with_memcpy.c b/arch/arm/lib/uaccess_with_memcpy.c
> index 025f742..78756db 100644
> --- a/arch/arm/lib/uaccess_with_memcpy.c
> +++ b/arch/arm/lib/uaccess_with_memcpy.c
> @@ -18,11 +18,13 @@
> #include <linux/hardirq.h> /* for in_atomic() */
> #include <linux/gfp.h>
> #include <linux/highmem.h>
> +#include <linux/hugetlb.h>
> #include <asm/current.h>
> #include <asm/page.h>
>
> static int
> -pin_page_for_write(const void __user *_addr, pte_t **ptep, spinlock_t **ptlp)
> +pin_page_for_write(const void __user *_addr, pte_t **ptep, spinlock_t **ptlp,
> + unsigned long *page_mask)
> {
> unsigned long addr = (unsigned long)_addr;
> pgd_t *pgd;
> @@ -40,7 +42,36 @@ pin_page_for_write(const void __user *_addr, pte_t **ptep, spinlock_t **ptlp)
> return 0;
>
> pmd = pmd_offset(pud, addr);
> - if (unlikely(pmd_none(*pmd) || pmd_bad(*pmd)))
> + if (unlikely(pmd_none(*pmd)))
> + return 0;
> +
> + /*
> + * A pmd can be bad if it refers to a HugeTLB or THP page.
> + *
> + * Both THP and HugeTLB pages have the same pmd layout
> + * and should not be manipulated by the pte functions.
> + *
> + * Lock the page table for the destination and check
> + * to see that it's still huge and whether or not we will
> + * need to fault on write, or if we have a splitting THP.
> + */
> + if (unlikely(pmd_thp_or_huge(*pmd))) {
> + ptl = &current->mm->page_table_lock;
> + spin_lock(ptl);
> + if (unlikely(!pmd_thp_or_huge(*pmd)
> + || pmd_hugewillfault(*pmd)
> + || pmd_trans_splitting(*pmd))) {
> + spin_unlock(ptl);
> + return 0;
> + }
> +
> + *ptep = NULL;
> + *ptlp = ptl;
> + *page_mask = HPAGE_MASK;
> + return 1;
> + }
> +
> + if (unlikely(pmd_bad(*pmd)))
> return 0;
>
> pte = pte_offset_map_lock(current->mm, pmd, addr, &ptl);
> @@ -52,6 +83,7 @@ pin_page_for_write(const void __user *_addr, pte_t **ptep, spinlock_t **ptlp)
>
> *ptep = pte;
> *ptlp = ptl;
> + *page_mask = PAGE_MASK;
>
> return 1;
> }
> @@ -60,6 +92,7 @@ static unsigned long noinline
> __copy_to_user_memcpy(void __user *to, const void *from, unsigned long n)
> {
> int atomic;
> + unsigned long page_mask;
>
> if (unlikely(segment_eq(get_fs(), KERNEL_DS))) {
> memcpy((void *)to, from, n);
> @@ -76,7 +109,7 @@ __copy_to_user_memcpy(void __user *to, const void *from, unsigned long n)
> spinlock_t *ptl;
> int tocopy;
>
> - while (!pin_page_for_write(to, &pte, &ptl)) {
> + while (!pin_page_for_write(to, &pte, &ptl, &page_mask)) {
> if (!atomic)
> up_read(&current->mm->mmap_sem);
> if (__put_user(0, (char __user *)to))
> @@ -85,7 +118,7 @@ __copy_to_user_memcpy(void __user *to, const void *from, unsigned long n)
> down_read(&current->mm->mmap_sem);
> }
>
> - tocopy = (~(unsigned long)to & ~PAGE_MASK) + 1;
> + tocopy = (~(unsigned long)to & ~page_mask) + 1;
> if (tocopy > n)
> tocopy = n;
>
> @@ -94,7 +127,10 @@ __copy_to_user_memcpy(void __user *to, const void *from, unsigned long n)
> from += tocopy;
> n -= tocopy;
>
> - pte_unmap_unlock(pte, ptl);
> + if (pte)
> + pte_unmap_unlock(pte, ptl);
> + else
> + spin_unlock(ptl);
> }
> if (!atomic)
> up_read(&current->mm->mmap_sem);
> @@ -121,6 +157,8 @@ __copy_to_user(void __user *to, const void *from, unsigned long n)
> static unsigned long noinline
> __clear_user_memset(void __user *addr, unsigned long n)
> {
> + unsigned long page_mask;
> +
> if (unlikely(segment_eq(get_fs(), KERNEL_DS))) {
> memset((void *)addr, 0, n);
> return 0;
> @@ -132,14 +170,14 @@ __clear_user_memset(void __user *addr, unsigned long n)
> spinlock_t *ptl;
> int tocopy;
>
> - while (!pin_page_for_write(addr, &pte, &ptl)) {
> + while (!pin_page_for_write(addr, &pte, &ptl, &page_mask)) {
> up_read(&current->mm->mmap_sem);
> if (__put_user(0, (char __user *)addr))
> goto out;
> down_read(&current->mm->mmap_sem);
> }
>
> - tocopy = (~(unsigned long)addr & ~PAGE_MASK) + 1;
> + tocopy = (~(unsigned long)addr & ~page_mask) + 1;
> if (tocopy > n)
> tocopy = n;
>
> @@ -147,7 +185,10 @@ __clear_user_memset(void __user *addr, unsigned long n)
> addr += tocopy;
> n -= tocopy;
>
> - pte_unmap_unlock(pte, ptl);
> + if (pte)
> + pte_unmap_unlock(pte, ptl);
> + else
> + spin_unlock(ptl);
> }
> up_read(&current->mm->mmap_sem);
>
> --
> 1.8.1.4
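
One note for readers skimming the diff (my paraphrase, not a requested
change): the huge-page case is signalled back to the copy loops by
*ptep == NULL with page_table_lock still held, which is why both loops
must pick the matching unlock primitive:

	if (pte)
		pte_unmap_unlock(pte, ptl);	/* small page: unmap the pte, drop its lock */
	else
		spin_unlock(ptl);		/* huge page: drop mm->page_table_lock */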