[PATCH 1/2] mm: Allow architectures to request 'old' entries when prefaulting

Sun Dec 27 15:32:37 EST 2020

On Sun, 27. Dec 11:38, Linus Torvalds wrote:
> On Sat, Dec 26, 2020 at 6:38 PM Hugh Dickins <hughd at google.com> wrote:
> >
> > This patch (like its antecedents) moves the pte_unmap_unlock() from
> > after do_fault_around()'s "check if the page fault is solved" into
> > filemap_map_pages() itself (which apparently does not NULLify vmf->pte
> > after unmapping it, which is poor, but good for revealing this issue).
> > That looks cleaner, but of course there was a very good reason for its
> > original positioning.
> 
> Good catch.
> 
> > Maybe you want to change the ->map_pages prototype, to pass down the
> > requested address too, so that it can report whether the requested
> > address was resolved or not.  Or it could be left to __do_fault(),
> > or even to a repeated fault; but those would be less efficient.
> 
> Let's keep the old really odd "let's unlock in the caller" for now,
> and minimize the changes.
> 
> Adding a big big comment at the end of filemap_map_pages() to note the
> odd delayed page table unlocking.
> 
> Here's an updated patch that combines Kirill's original patch, his
> additional incremental patch, and the fix for the pte lock oddity into
> one thing.
> 
> Does this finally pass your testing?
> 
>                Linus
Hello together,

when i try to build this patch, i got the following error:

 CC      arch/x86/kernel/cpu/mce/threshold.o
mm/memory.c:3716:19: error: static declaration of ‘do_set_pmd’ follows non-static declaration
 static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
                   ^~~~~~~~~~
In file included from mm/memory.c:43:
./include/linux/mm.h:984:12: note: previous declaration of ‘do_set_pmd’ was here
 vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page);
            ^~~~~~~~~~
make[3]: *** [scripts/Makefile.build:279: mm/memory.o] Error 1
make[2]: *** [Makefile:1805: mm] Error 2
make[2]: *** Waiting for unfinished jobs....
  CC      arch/x86/kernel/cpu/mce/therm_throt.o

Best regards
Damian

> From 4d221d934d112aa40c3f4978460be098fc9ce831 Mon Sep 17 00:00:00 2001
> From: "Kirill A. Shutemov" <kirill.shutemov at linux.intel.com>
> Date: Sat, 19 Dec 2020 15:19:23 +0300
> Subject: [PATCH] mm: Cleanup faultaround and finish_fault() codepaths
> 
> alloc_set_pte() has two users with different requirements: in the
> faultaround code, it called from an atomic context and PTE page table
> has to be preallocated. finish_fault() can sleep and allocate page table
> as needed.
> 
> PTL locking rules are also strange, hard to follow and overkill for
> finish_fault().
> 
> Let's untangle the mess. alloc_set_pte() has gone now. All locking is
> explicit.
> 
> The price is some code duplication to handle huge pages in faultaround
> path, but it should be fine, having overall improvement in readability.
> 
> Signed-off-by: Kirill A. Shutemov <kirill.shutemov at linux.intel.com>
> Signed-off-by: Linus Torvalds <torvalds at linux-foundation.org>
> ---
>  include/linux/mm.h      |   8 +-
>  include/linux/pgtable.h |  11 +++
>  mm/filemap.c            | 168 ++++++++++++++++++++++++++++++----------
>  mm/memory.c             | 161 +++++++++++---------------------------
>  4 files changed, 192 insertions(+), 156 deletions(-)
> 
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 5299b90a6c40..c0643a0ad5ff 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -535,8 +535,8 @@ struct vm_fault {
>  					 * is not NULL, otherwise pmd.
>  					 */
>  	pgtable_t prealloc_pte;		/* Pre-allocated pte page table.
> -					 * vm_ops->map_pages() calls
> -					 * alloc_set_pte() from atomic context.
> +					 * vm_ops->map_pages() sets up a page
> +					 * table from from atomic context.
>  					 * do_fault_around() pre-allocates
>  					 * page table to avoid allocation from
>  					 * atomic context.
> @@ -981,7 +981,9 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
>  	return pte;
>  }
>  
> -vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page);
> +vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page);
> +void do_set_pte(struct vm_fault *vmf, struct page *page);
> +
>  vm_fault_t finish_fault(struct vm_fault *vmf);
>  vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf);
>  #endif
> diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
> index 8fcdfa52eb4b..36eb748f3c97 100644
> --- a/include/linux/pgtable.h
> +++ b/include/linux/pgtable.h
> @@ -1314,6 +1314,17 @@ static inline int pmd_trans_unstable(pmd_t *pmd)
>  #endif
>  }
>  
> +/*
> + * the ordering of these checks is important for pmds with _page_devmap set.
> + * if we check pmd_trans_unstable() first we will trip the bad_pmd() check
> + * inside of pmd_none_or_trans_huge_or_clear_bad(). this will end up correctly
> + * returning 1 but not before it spams dmesg with the pmd_clear_bad() output.
> + */
> +static inline int pmd_devmap_trans_unstable(pmd_t *pmd)
> +{
> +	return pmd_devmap(*pmd) || pmd_trans_unstable(pmd);
> +}
> +
>  #ifndef CONFIG_NUMA_BALANCING
>  /*
>   * Technically a PTE can be PROTNONE even when not doing NUMA balancing but
> diff --git a/mm/filemap.c b/mm/filemap.c
> index 5c9d564317a5..dbc2eda92a53 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -42,6 +42,7 @@
>  #include <linux/psi.h>
>  #include <linux/ramfs.h>
>  #include <linux/page_idle.h>
> +#include <asm/pgalloc.h>
>  #include "internal.h"
>  
>  #define CREATE_TRACE_POINTS
> @@ -2911,50 +2912,133 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
>  }
>  EXPORT_SYMBOL(filemap_fault);
>  
> +static bool filemap_map_pmd(struct vm_fault *vmf, struct page *page)
> +{
> +	struct mm_struct *mm = vmf->vma->vm_mm;
> +
> +	/* Huge page is mapped? No need to proceed. */
> +	if (pmd_trans_huge(*vmf->pmd)) {
> +		unlock_page(page);
> +		put_page(page);
> +		return true;
> +	}
> +
> +	if (pmd_none(*vmf->pmd) && PageTransHuge(page)) {
> +	    vm_fault_t ret = do_set_pmd(vmf, page);
> +	    if (!ret) {
> +		    /* The page is mapped successfully, reference consumed. */
> +		    unlock_page(page);
> +		    return true;
> +	    }
> +	}
> +
> +	if (pmd_none(*vmf->pmd)) {
> +		vmf->ptl = pmd_lock(mm, vmf->pmd);
> +		if (likely(pmd_none(*vmf->pmd))) {
> +			mm_inc_nr_ptes(mm);
> +			pmd_populate(mm, vmf->pmd, vmf->prealloc_pte);
> +			vmf->prealloc_pte = NULL;
> +		}
> +		spin_unlock(vmf->ptl);
> +	}
> +
> +	/* See comment in handle_pte_fault() */
> +	if (pmd_devmap_trans_unstable(vmf->pmd)) {
> +		unlock_page(page);
> +		put_page(page);
> +		return true;
> +	}
> +
> +	return false;
> +}
> +
> +static struct page *next_uptodate_page(struct page *page, struct vm_fault *vmf,
> +				     struct xa_state *xas, pgoff_t end_pgoff)
> +{
> +	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
> +	unsigned long max_idx;
> +
> +	do {
> +		if (!page)
> +			return NULL;
> +		if (xas_retry(xas, page))
> +			continue;
> +		if (xa_is_value(page))
> +			continue;
> +		if (PageLocked(page))
> +			continue;
> +		if (!page_cache_get_speculative(page))
> +			continue;
> +		/* Has the page moved or been split? */
> +		if (unlikely(page != xas_reload(xas)))
> +			goto skip;
> +		if (!PageUptodate(page) || PageReadahead(page))
> +			goto skip;
> +		if (PageHWPoison(page))
> +			goto skip;
> +		if (!trylock_page(page))
> +			goto skip;
> +		if (page->mapping != mapping)
> +			goto unlock;
> +		if (!PageUptodate(page))
> +			goto unlock;
> +		max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
> +		if (xas->xa_index >= max_idx)
> +			goto unlock;
> +		return page;
> +unlock:
> +		unlock_page(page);
> +skip:
> +		put_page(page);
> +	} while ((page = xas_next_entry(xas, end_pgoff)) != NULL);
> +
> +	return NULL;
> +}
> +
> +static inline struct page *first_map_page(struct vm_fault *vmf,
> +					  struct xa_state *xas,
> +					  pgoff_t end_pgoff)
> +{
> +	return next_uptodate_page(xas_find(xas, end_pgoff),
> +				  vmf, xas, end_pgoff);
> +}
> +
> +static inline struct page *next_map_page(struct vm_fault *vmf,
> +					  struct xa_state *xas,
> +					  pgoff_t end_pgoff)
> +{
> +	return next_uptodate_page(xas_next_entry(xas, end_pgoff),
> +				  vmf, xas, end_pgoff);
> +}
> +
>  void filemap_map_pages(struct vm_fault *vmf,
>  		pgoff_t start_pgoff, pgoff_t end_pgoff)
>  {
> -	struct file *file = vmf->vma->vm_file;
> +	struct vm_area_struct *vma = vmf->vma;
> +	struct file *file = vma->vm_file;
>  	struct address_space *mapping = file->f_mapping;
>  	pgoff_t last_pgoff = start_pgoff;
> -	unsigned long max_idx;
>  	XA_STATE(xas, &mapping->i_pages, start_pgoff);
>  	struct page *head, *page;
>  	unsigned int mmap_miss = READ_ONCE(file->f_ra.mmap_miss);
>  
>  	rcu_read_lock();
> -	xas_for_each(&xas, head, end_pgoff) {
> -		if (xas_retry(&xas, head))
> -			continue;
> -		if (xa_is_value(head))
> -			goto next;
> +	head = first_map_page(vmf, &xas, end_pgoff);
> +	if (!head) {
> +		rcu_read_unlock();
> +		return;
> +	}
>  
> -		/*
> -		 * Check for a locked page first, as a speculative
> -		 * reference may adversely influence page migration.
> -		 */
> -		if (PageLocked(head))
> -			goto next;
> -		if (!page_cache_get_speculative(head))
> -			goto next;
> +	if (filemap_map_pmd(vmf, head)) {
> +		rcu_read_unlock();
> +		return;
> +	}
>  
> -		/* Has the page moved or been split? */
> -		if (unlikely(head != xas_reload(&xas)))
> -			goto skip;
> +	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
> +				       vmf->address, &vmf->ptl);
> +	do {
>  		page = find_subpage(head, xas.xa_index);
> -
> -		if (!PageUptodate(head) ||
> -				PageReadahead(page) ||
> -				PageHWPoison(page))
> -			goto skip;
> -		if (!trylock_page(head))
> -			goto skip;
> -
> -		if (head->mapping != mapping || !PageUptodate(head))
> -			goto unlock;
> -
> -		max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
> -		if (xas.xa_index >= max_idx)
> +		if (PageHWPoison(page))
>  			goto unlock;
>  
>  		if (mmap_miss > 0)
> @@ -2964,19 +3048,25 @@ void filemap_map_pages(struct vm_fault *vmf,
>  		if (vmf->pte)
>  			vmf->pte += xas.xa_index - last_pgoff;
>  		last_pgoff = xas.xa_index;
> -		if (alloc_set_pte(vmf, page))
> +
> +		if (!pte_none(*vmf->pte))
>  			goto unlock;
> +
> +		do_set_pte(vmf, page);
> +		/* no need to invalidate: a not-present page won't be cached */
> +		update_mmu_cache(vma, vmf->address, vmf->pte);
>  		unlock_page(head);
> -		goto next;
> +		continue;
>  unlock:
>  		unlock_page(head);
> -skip:
>  		put_page(head);
> -next:
> -		/* Huge page is mapped? No need to proceed. */
> -		if (pmd_trans_huge(*vmf->pmd))
> -			break;
> -	}
> +	} while ((head = next_map_page(vmf, &xas, end_pgoff)) != NULL);
> +
> +	/*
> +	 * NOTE! We return with the pte still locked! It is unlocked
> +	 * by do_fault_around() after it has tested whether the target
> +	 * address got filled in.
> +	 */
>  	rcu_read_unlock();
>  	WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss);
>  }
> diff --git a/mm/memory.c b/mm/memory.c
> index 7d608765932b..07a408c7d38b 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -3501,7 +3501,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
>  	if (pte_alloc(vma->vm_mm, vmf->pmd))
>  		return VM_FAULT_OOM;
>  
> -	/* See the comment in pte_alloc_one_map() */
> +	/* See comment in handle_pte_fault() */
>  	if (unlikely(pmd_trans_unstable(vmf->pmd)))
>  		return 0;
>  
> @@ -3641,66 +3641,6 @@ static vm_fault_t __do_fault(struct vm_fault *vmf)
>  	return ret;
>  }
>  
> -/*
> - * The ordering of these checks is important for pmds with _PAGE_DEVMAP set.
> - * If we check pmd_trans_unstable() first we will trip the bad_pmd() check
> - * inside of pmd_none_or_trans_huge_or_clear_bad(). This will end up correctly
> - * returning 1 but not before it spams dmesg with the pmd_clear_bad() output.
> - */
> -static int pmd_devmap_trans_unstable(pmd_t *pmd)
> -{
> -	return pmd_devmap(*pmd) || pmd_trans_unstable(pmd);
> -}
> -
> -static vm_fault_t pte_alloc_one_map(struct vm_fault *vmf)
> -{
> -	struct vm_area_struct *vma = vmf->vma;
> -
> -	if (!pmd_none(*vmf->pmd))
> -		goto map_pte;
> -	if (vmf->prealloc_pte) {
> -		vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
> -		if (unlikely(!pmd_none(*vmf->pmd))) {
> -			spin_unlock(vmf->ptl);
> -			goto map_pte;
> -		}
> -
> -		mm_inc_nr_ptes(vma->vm_mm);
> -		pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
> -		spin_unlock(vmf->ptl);
> -		vmf->prealloc_pte = NULL;
> -	} else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd))) {
> -		return VM_FAULT_OOM;
> -	}
> -map_pte:
> -	/*
> -	 * If a huge pmd materialized under us just retry later.  Use
> -	 * pmd_trans_unstable() via pmd_devmap_trans_unstable() instead of
> -	 * pmd_trans_huge() to ensure the pmd didn't become pmd_trans_huge
> -	 * under us and then back to pmd_none, as a result of MADV_DONTNEED
> -	 * running immediately after a huge pmd fault in a different thread of
> -	 * this mm, in turn leading to a misleading pmd_trans_huge() retval.
> -	 * All we have to ensure is that it is a regular pmd that we can walk
> -	 * with pte_offset_map() and we can do that through an atomic read in
> -	 * C, which is what pmd_trans_unstable() provides.
> -	 */
> -	if (pmd_devmap_trans_unstable(vmf->pmd))
> -		return VM_FAULT_NOPAGE;
> -
> -	/*
> -	 * At this point we know that our vmf->pmd points to a page of ptes
> -	 * and it cannot become pmd_none(), pmd_devmap() or pmd_trans_huge()
> -	 * for the duration of the fault.  If a racing MADV_DONTNEED runs and
> -	 * we zap the ptes pointed to by our vmf->pmd, the vmf->ptl will still
> -	 * be valid and we will re-check to make sure the vmf->pte isn't
> -	 * pte_none() under vmf->ptl protection when we return to
> -	 * alloc_set_pte().
> -	 */
> -	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
> -			&vmf->ptl);
> -	return 0;
> -}
> -
>  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
>  static void deposit_prealloc_pte(struct vm_fault *vmf)
>  {
> @@ -3715,7 +3655,7 @@ static void deposit_prealloc_pte(struct vm_fault *vmf)
>  	vmf->prealloc_pte = NULL;
>  }
>  
> -static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
> +vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
>  {
>  	struct vm_area_struct *vma = vmf->vma;
>  	bool write = vmf->flags & FAULT_FLAG_WRITE;
> @@ -3780,45 +3720,11 @@ static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
>  }
>  #endif
>  
> -/**
> - * alloc_set_pte - setup new PTE entry for given page and add reverse page
> - * mapping. If needed, the function allocates page table or use pre-allocated.
> - *
> - * @vmf: fault environment
> - * @page: page to map
> - *
> - * Caller must take care of unlocking vmf->ptl, if vmf->pte is non-NULL on
> - * return.
> - *
> - * Target users are page handler itself and implementations of
> - * vm_ops->map_pages.
> - *
> - * Return: %0 on success, %VM_FAULT_ code in case of error.
> - */
> -vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page)
> +void do_set_pte(struct vm_fault *vmf, struct page *page)
>  {
>  	struct vm_area_struct *vma = vmf->vma;
>  	bool write = vmf->flags & FAULT_FLAG_WRITE;
>  	pte_t entry;
> -	vm_fault_t ret;
> -
> -	if (pmd_none(*vmf->pmd) && PageTransCompound(page)) {
> -		ret = do_set_pmd(vmf, page);
> -		if (ret != VM_FAULT_FALLBACK)
> -			return ret;
> -	}
> -
> -	if (!vmf->pte) {
> -		ret = pte_alloc_one_map(vmf);
> -		if (ret)
> -			return ret;
> -	}
> -
> -	/* Re-check under ptl */
> -	if (unlikely(!pte_none(*vmf->pte))) {
> -		update_mmu_tlb(vma, vmf->address, vmf->pte);
> -		return VM_FAULT_NOPAGE;
> -	}
>  
>  	flush_icache_page(vma, page);
>  	entry = mk_pte(page, vma->vm_page_prot);
> @@ -3835,14 +3741,8 @@ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page)
>  		page_add_file_rmap(page, false);
>  	}
>  	set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
> -
> -	/* no need to invalidate: a not-present page won't be cached */
> -	update_mmu_cache(vma, vmf->address, vmf->pte);
> -
> -	return 0;
>  }
>  
> -
>  /**
>   * finish_fault - finish page fault once we have prepared the page to fault
>   *
> @@ -3860,12 +3760,12 @@ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page)
>   */
>  vm_fault_t finish_fault(struct vm_fault *vmf)
>  {
> +	struct vm_area_struct *vma = vmf->vma;
>  	struct page *page;
> -	vm_fault_t ret = 0;
> +	vm_fault_t ret;
>  
>  	/* Did we COW the page? */
> -	if ((vmf->flags & FAULT_FLAG_WRITE) &&
> -	    !(vmf->vma->vm_flags & VM_SHARED))
> +	if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED))
>  		page = vmf->cow_page;
>  	else
>  		page = vmf->page;
> @@ -3874,13 +3774,35 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
>  	 * check even for read faults because we might have lost our CoWed
>  	 * page
>  	 */
> -	if (!(vmf->vma->vm_flags & VM_SHARED))
> -		ret = check_stable_address_space(vmf->vma->vm_mm);
> -	if (!ret)
> -		ret = alloc_set_pte(vmf, page);
> -	if (vmf->pte)
> -		pte_unmap_unlock(vmf->pte, vmf->ptl);
> -	return ret;
> +	if (!(vma->vm_flags & VM_SHARED))
> +		ret = check_stable_address_space(vma->vm_mm);
> +	if (ret)
> +		return ret;
> +
> +	if (pmd_none(*vmf->pmd)) {
> +		if (PageTransCompound(page)) {
> +			ret = do_set_pmd(vmf, page);
> +			if (ret != VM_FAULT_FALLBACK)
> +				return ret;
> +		}
> +
> +		if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd)))
> +			return VM_FAULT_OOM;
> +	}
> +
> +	/* See comment in handle_pte_fault() */
> +	if (pmd_devmap_trans_unstable(vmf->pmd))
> +		return 0;
> +
> +	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
> +				      vmf->address, &vmf->ptl);
> +	/* Re-check under ptl */
> +	if (likely(pte_none(*vmf->pte)))
> +		do_set_pte(vmf, page);
> +
> +	update_mmu_tlb(vma, vmf->address, vmf->pte);
> +	pte_unmap_unlock(vmf->pte, vmf->ptl);
> +	return 0;
>  }
>  
>  static unsigned long fault_around_bytes __read_mostly =
> @@ -4351,7 +4273,18 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
>  		 */
>  		vmf->pte = NULL;
>  	} else {
> -		/* See comment in pte_alloc_one_map() */
> +		/*
> +		 * If a huge pmd materialized under us just retry later.  Use
> +		 * pmd_trans_unstable() via pmd_devmap_trans_unstable() instead
> +		 * of pmd_trans_huge() to ensure the pmd didn't become
> +		 * pmd_trans_huge under us and then back to pmd_none, as a
> +		 * result of MADV_DONTNEED running immediately after a huge pmd
> +		 * fault in a different thread of this mm, in turn leading to a
> +		 * misleading pmd_trans_huge() retval. All we have to ensure is
> +		 * that it is a regular pmd that we can walk with
> +		 * pte_offset_map() and we can do that through an atomic read
> +		 * in C, which is what pmd_trans_unstable() provides.
> +		 */
>  		if (pmd_devmap_trans_unstable(vmf->pmd))
>  			return 0;
>  		/*
> -- 
> 2.29.2.157.g1d47791a39
>