[PATCH v3 3/5] mm: Optimize mprotect() by PTE batching

Barry Song 21cnbao at gmail.com
Mon May 19 01:18:03 PDT 2025


On Mon, May 19, 2025 at 7:49 PM Dev Jain <dev.jain at arm.com> wrote:
>
> Use folio_pte_batch() to batch-process a large folio. Reuse the folio from the
> prot_numa case if possible. Since modify_prot_start_ptes() gathers the
> access/dirty bits, it lets us batch around pte_needs_flush() (on parisc, the
> definition includes the access bit).
>
> For every case other than the PageAnonExclusive one, if a condition holds for
> one pte in the batch, it is guaranteed to hold for all other ptes in the batch
> too; for pte_needs_soft_dirty_wp(), this is ensured by not passing
> FPB_IGNORE_SOFT_DIRTY. modify_prot_start_ptes() collects the dirty and access
> bits across the batch, which lets us batch around pte_dirty(): this is correct
> because the dirty bit on the PTE is merely an indication that the folio got
> written to, so even if a PTE is not actually dirty (but another PTE in the
> batch is), the wp-fault optimization can still be made.
>
> The crux is how to batch around the PageAnonExclusive case: there the
> condition must be checked for every single page. Therefore, the large folio
> batch is split into sub-batches of ptes whose pages share the same
> PageAnonExclusive state, and each sub-batch is processed in turn. Note that
> this does not add any extra overhead: if the folio batch has, say, 512
> entries, the sub-batch processing still takes 512 iterations in total, the
> same as before.
>
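
Just to spell out how the sub-batch walk behaves, with a made-up layout: for a
16-pte folio batch where only the first 4 pages are still AnonExclusive, the
MM_CP_TRY_CHANGE_WRITABLE path would end up doing two commits, roughly
(untested sketch, the page layout here is purely hypothetical):

	/* pages 0-3 AnonExclusive, pages 4-15 not */
	len = modify_sub_batch(vma, tlb, addr, pte, oldpte, ptent,
			       /* max_len */ 16, /* sub_batch_idx */ 0);
	/* returns 4: ptes 0-3 committed with pte_mkwrite() */

	len = modify_sub_batch(vma, tlb, addr, pte, oldpte, ptent,
			       /* max_len */ 12, /* sub_batch_idx */ 4);
	/* returns 12: ptes 4-15 committed without pte_mkwrite() */
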
> Signed-off-by: Dev Jain <dev.jain at arm.com>
> ---
>  include/linux/mm.h |   7 ++-
>  mm/mprotect.c      | 126 +++++++++++++++++++++++++++++++++++----------
>  2 files changed, 104 insertions(+), 29 deletions(-)
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 43748c8f3454..7d5b96f005dc 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -2542,8 +2542,11 @@ int get_cmdline(struct task_struct *task, char *buffer, int buflen);
>  #define  MM_CP_UFFD_WP_ALL                 (MM_CP_UFFD_WP | \
>                                             MM_CP_UFFD_WP_RESOLVE)
>
> -bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr,
> -                            pte_t pte);
> +bool can_change_ptes_writable(struct vm_area_struct *vma, unsigned long addr,
> +                            pte_t pte, int max_len, int *len);
> +#define can_change_pte_writable(vma, addr, pte)        \
> +       can_change_ptes_writable(vma, addr, pte, 1, NULL)
> +
>  extern long change_protection(struct mmu_gather *tlb,
>                               struct vm_area_struct *vma, unsigned long start,
>                               unsigned long end, unsigned long cp_flags);
> diff --git a/mm/mprotect.c b/mm/mprotect.c
> index 124612ce3d24..6cd8cdc168fa 100644
> --- a/mm/mprotect.c
> +++ b/mm/mprotect.c
> @@ -40,25 +40,36 @@
>
>  #include "internal.h"
>
> -bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr,
> -                            pte_t pte)
> +bool can_change_ptes_writable(struct vm_area_struct *vma, unsigned long addr,
> +                            pte_t pte, int max_len, int *len)
>  {
>         struct page *page;
> +       bool temp_ret;
> +       bool ret;
> +       int i;
>
> -       if (WARN_ON_ONCE(!(vma->vm_flags & VM_WRITE)))
> -               return false;
> +       if (WARN_ON_ONCE(!(vma->vm_flags & VM_WRITE))) {
> +               ret = false;
> +               goto out;
> +       }
>
>         /* Don't touch entries that are not even readable. */
> -       if (pte_protnone(pte))
> -               return false;
> +       if (pte_protnone(pte)) {
> +               ret = false;
> +               goto out;
> +       }
>
>         /* Do we need write faults for softdirty tracking? */
> -       if (pte_needs_soft_dirty_wp(vma, pte))
> -               return false;
> +       if (pte_needs_soft_dirty_wp(vma, pte)) {
> +               ret = false;
> +               goto out;
> +       }
>
>         /* Do we need write faults for uffd-wp tracking? */
> -       if (userfaultfd_pte_wp(vma, pte))
> -               return false;
> +       if (userfaultfd_pte_wp(vma, pte)) {
> +               ret = false;
> +               goto out;
> +       }
>
>         if (!(vma->vm_flags & VM_SHARED)) {
>                 /*
> @@ -68,7 +79,19 @@ bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr,
>                  * any additional checks while holding the PT lock.
>                  */
>                 page = vm_normal_page(vma, addr, pte);
> -               return page && PageAnon(page) && PageAnonExclusive(page);
> +               ret = (page && PageAnon(page) && PageAnonExclusive(page));
> +               if (!len)
> +                       return ret;
> +
> +               /* Check how many consecutive pages are AnonExclusive or not */
> +               for (i = 1; i < max_len; ++i) {
> +                       ++page;
> +                       temp_ret = (page && PageAnon(page) && PageAnonExclusive(page));
> +                       if (temp_ret != ret)

Do we really need to check PageAnon() for each subpage, which is

folio_test_anon(page_folio(page))

since we have already checked subpage[0]?
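
i.e. something like the below? Completely untested, just to illustrate what I
mean, assuming folio_pte_batch() guarantees that all ptes of the batch map
subpages of the same folio:

	page = vm_normal_page(vma, addr, pte);
	ret = page && PageAnon(page) && PageAnonExclusive(page);
	if (!len)
		return ret;

	/*
	 * The whole batch maps one folio, so it is either entirely anon or
	 * entirely !anon; only PageAnonExclusive() can differ per subpage.
	 */
	if (!page || !PageAnon(page)) {
		*len = max_len;
		return false;
	}

	for (i = 1; i < max_len; ++i) {
		++page;
		if (PageAnonExclusive(page) != ret)
			break;
	}
	*len = i;
	return ret;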

> +                               break;
> +               }
> +               *len = i;
> +               return ret;
>         }
>
>         VM_WARN_ON_ONCE(is_zero_pfn(pte_pfn(pte)) && pte_dirty(pte));
> @@ -80,21 +103,55 @@ bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr,
>          * FS was already notified and we can simply mark the PTE writable
>          * just like the write-fault handler would do.
>          */
> -       return pte_dirty(pte);
> +       ret = pte_dirty(pte);
> +
> +out:
> +       /* The entire batch is guaranteed to have the same return value */
> +       if (len)
> +               *len = max_len;
> +       return ret;
>  }
>
>  static int mprotect_batch(struct folio *folio, unsigned long addr, pte_t *ptep,
> -               pte_t pte, int max_nr_ptes)
> +               pte_t pte, int max_nr_ptes, bool ignore_soft_dirty)
>  {
> -       const fpb_t flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
> +       fpb_t flags = FPB_IGNORE_DIRTY;
>
> -       if (!folio_test_large(folio) || (max_nr_ptes == 1))
> +       if (ignore_soft_dirty)
> +               flags |= FPB_IGNORE_SOFT_DIRTY;
> +
> +       if (!folio || !folio_test_large(folio) || (max_nr_ptes == 1))
>                 return 1;
>
>         return folio_pte_batch(folio, addr, ptep, pte, max_nr_ptes, flags,
>                                NULL, NULL, NULL);
>  }
>
> +/**
> + * modify_sub_batch - Identifies a sub-batch which has the same return value
> + * of can_change_pte_writable(), from within a folio batch. max_len is the
> + * max length of the possible sub-batch. sub_batch_idx is the offset from
> + * the start of the original folio batch.
> + */
> +static int modify_sub_batch(struct vm_area_struct *vma, struct mmu_gather *tlb,
> +               unsigned long addr, pte_t *ptep, pte_t oldpte, pte_t ptent,
> +               int max_len, int sub_batch_idx)
> +{
> +       unsigned long new_addr = addr + sub_batch_idx * PAGE_SIZE;
> +       pte_t new_oldpte = pte_advance_pfn(oldpte, sub_batch_idx);
> +       pte_t new_ptent = pte_advance_pfn(ptent, sub_batch_idx);
> +       pte_t *new_ptep = ptep + sub_batch_idx;
> +       int len = 1;
> +
> +       if (can_change_ptes_writable(vma, new_addr, new_ptent, max_len, &len))
> +               new_ptent = pte_mkwrite(new_ptent, vma);
> +
> +       modify_prot_commit_ptes(vma, new_addr, new_ptep, new_oldpte, new_ptent, len);
> +       if (pte_needs_flush(new_oldpte, new_ptent))
> +               tlb_flush_pte_range(tlb, new_addr, len * PAGE_SIZE);
> +       return len;
> +}
> +
>  static long change_pte_range(struct mmu_gather *tlb,
>                 struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr,
>                 unsigned long end, pgprot_t newprot, unsigned long cp_flags)
> @@ -106,7 +163,7 @@ static long change_pte_range(struct mmu_gather *tlb,
>         bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
>         bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
>         bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
> -       int nr_ptes;
> +       int sub_batch_idx, max_len, len, nr_ptes;
>
>         tlb_change_page_size(tlb, PAGE_SIZE);
>         pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
> @@ -121,10 +178,12 @@ static long change_pte_range(struct mmu_gather *tlb,
>         flush_tlb_batched_pending(vma->vm_mm);
>         arch_enter_lazy_mmu_mode();
>         do {
> +               sub_batch_idx = 0;
>                 nr_ptes = 1;
>                 oldpte = ptep_get(pte);
>                 if (pte_present(oldpte)) {
>                         int max_nr_ptes = (end - addr) >> PAGE_SHIFT;
> +                       struct folio *folio = NULL;
>                         pte_t ptent;
>
>                         /*
> @@ -132,7 +191,6 @@ static long change_pte_range(struct mmu_gather *tlb,
>                          * pages. See similar comment in change_huge_pmd.
>                          */
>                         if (prot_numa) {
> -                               struct folio *folio;
>                                 int nid;
>                                 bool toptier;
>
> @@ -180,7 +238,8 @@ static long change_pte_range(struct mmu_gather *tlb,
>                                     toptier) {
>  skip_batch:
>                                         nr_ptes = mprotect_batch(folio, addr, pte,
> -                                                                oldpte, max_nr_ptes);
> +                                                                oldpte, max_nr_ptes,
> +                                                                true);
>                                         continue;
>                                 }
>                                 if (folio_use_access_time(folio))
> @@ -188,6 +247,11 @@ static long change_pte_range(struct mmu_gather *tlb,
>                                                 jiffies_to_msecs(jiffies));
>                         }
>
> +                       if (!folio)
> +                               folio = vm_normal_folio(vma, addr, oldpte);
> +
> +                       nr_ptes = mprotect_batch(folio, addr, pte, oldpte,
> +                                                max_nr_ptes, false);
>                         oldpte = modify_prot_start_ptes(vma, addr, pte, nr_ptes);
>                         ptent = pte_modify(oldpte, newprot);
>
> @@ -209,15 +273,23 @@ static long change_pte_range(struct mmu_gather *tlb,
>                          * example, if a PTE is already dirty and no other
>                          * COW or special handling is required.
>                          */
> -                       if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) &&
> -                           !pte_write(ptent) &&
> -                           can_change_pte_writable(vma, addr, ptent))
> -                               ptent = pte_mkwrite(ptent, vma);
> -
> -                       modify_prot_commit_ptes(vma, addr, pte, oldpte, ptent, nr_ptes);
> -                       if (pte_needs_flush(oldpte, ptent))
> -                               tlb_flush_pte_range(tlb, addr, PAGE_SIZE);
> -                       pages++;
> +                       if (cp_flags & MM_CP_TRY_CHANGE_WRITABLE) {
> +                               max_len = nr_ptes;
> +                               while (sub_batch_idx < nr_ptes) {
> +
> +                                       /* Get length of sub batch */
> +                                       len = modify_sub_batch(vma, tlb, addr, pte,
> +                                                              oldpte, ptent, max_len,
> +                                                              sub_batch_idx);
> +                                       sub_batch_idx += len;
> +                                       max_len -= len;
> +                               }
> +                       } else {
> +                               modify_prot_commit_ptes(vma, addr, pte, oldpte, ptent, nr_ptes);
> +                               if (pte_needs_flush(oldpte, ptent))
> +                                       tlb_flush_pte_range(tlb, addr, nr_ptes * PAGE_SIZE);
> +                       }
> +                       pages += nr_ptes;
>                 } else if (is_swap_pte(oldpte)) {
>                         swp_entry_t entry = pte_to_swp_entry(oldpte);
>                         pte_t newpte;
> --
> 2.30.2
>

Thanks
barry


