[RFC PATCH RESEND 05/28] mm: add per-VMA lock and helper functions to control it

Suren Baghdasaryan surenb at google.com
Tue Sep 6 10:24:29 PDT 2022


On Tue, Sep 6, 2022 at 6:47 AM Laurent Dufour <ldufour at linux.ibm.com> wrote:
>
> On 01/09/2022 at 19:34, Suren Baghdasaryan wrote:
> > Introduce a per-VMA rw_semaphore to be used during page fault handling
> > instead of mmap_lock. Because there are cases when multiple VMAs need
> > to be exclusively locked during VMA tree modifications, instead of the
> > usual lock/unlock pattern we mark a VMA as locked by taking the per-VMA
> > lock exclusively and setting vma->vm_lock_seq to the current
> > mm->mm_lock_seq. When the mmap_write_lock holder is done with all
> > modifications and drops mmap_lock, it will increment mm->mm_lock_seq,
> > effectively unlocking all VMAs marked as locked.
> >
> > Signed-off-by: Suren Baghdasaryan <surenb at google.com>
> Despite a minor comment below,
>
> Reviewed-by: Laurent Dufour <laurent.dufour at fr.ibm.com>

Thanks for the reviews, Laurent! I'll need some time to double-check
all the VMA locking locations you spotted as potentially unnecessary.
Admittedly, I was a bit paranoid when writing this patchset, trying
not to miss any potential race, so some of those locks might indeed be
unnecessary. I'll reply to each of your comments once I confirm
whether locking is needed in each case.
Thanks,
Suren.
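
To make the scheme described in the commit message easier to follow,
here is a minimal sketch of the write side, assuming a hypothetical
caller that modifies two VMAs under mmap_write_lock (only the caller
itself is made up; the helpers are the ones added by the patch below):

static void modify_two_vmas_sketch(struct mm_struct *mm,
				   struct vm_area_struct *a,
				   struct vm_area_struct *b)
{
	mmap_write_lock(mm);
	vma_mark_locked(a);	/* a->vm_lock_seq = mm->mm_lock_seq */
	vma_mark_locked(b);	/* vma_read_trylock() on a and b now fails */

	/* ... modify the VMA tree ... */

	/*
	 * mmap_write_unlock() calls vma_mark_unlocked_all(), which bumps
	 * mm->mm_lock_seq and thereby "unlocks" all marked VMAs at once,
	 * without touching each of them individually.
	 */
	mmap_write_unlock(mm);
}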

>
> > ---
> >  include/linux/mm.h        | 78 +++++++++++++++++++++++++++++++++++++++
> >  include/linux/mm_types.h  |  7 ++++
> >  include/linux/mmap_lock.h | 13 +++++++
> >  kernel/fork.c             |  4 ++
> >  mm/init-mm.c              |  3 ++
> >  5 files changed, 105 insertions(+)
> >
> > diff --git a/include/linux/mm.h b/include/linux/mm.h
> > index 7d322a979455..476bf936c5f0 100644
> > --- a/include/linux/mm.h
> > +++ b/include/linux/mm.h
> > @@ -611,6 +611,83 @@ struct vm_operations_struct {
> >                                         unsigned long addr);
> >  };
> >
> > +#ifdef CONFIG_PER_VMA_LOCK
> > +static inline void vma_init_lock(struct vm_area_struct *vma)
> > +{
> > +     init_rwsem(&vma->lock);
> > +     vma->vm_lock_seq = -1;
> > +}
> > +
> > +static inline void vma_mark_locked(struct vm_area_struct *vma)
> > +{
> > +     int mm_lock_seq;
> > +
> > +     mmap_assert_write_locked(vma->vm_mm);
> > +
> > +     /*
> > +      * The current task holds mmap_write_lock, so neither vma->vm_lock_seq
> > +      * nor mm->mm_lock_seq can be concurrently modified.
> > +      */
> > +     mm_lock_seq = READ_ONCE(vma->vm_mm->mm_lock_seq);
> > +     if (vma->vm_lock_seq == mm_lock_seq)
> > +             return;
> > +
> > +     down_write(&vma->lock);
> > +     vma->vm_lock_seq = mm_lock_seq;
> > +     up_write(&vma->lock);
> > +}
> > +
> > +static inline bool vma_read_trylock(struct vm_area_struct *vma)
> > +{
> > +     if (unlikely(down_read_trylock(&vma->lock) == 0))
> > +             return false;
> > +
> > +     /*
> > +      * Overflow might produce a false locked result, but it's not critical.
>
> It might be good to clarify here that, in the case of a false locked
> result, the caller is expected to fall back to read-locking the mm
> entirely before making its change to that VMA.

Ack.
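
To illustrate what the clarified comment would promise, here is a
minimal sketch of a reader, assuming a hypothetical fault path (the
do_locked_fault()/do_lockless_fault() helpers are made up for
illustration; the locking calls are real):

static vm_fault_t fault_path_sketch(struct vm_area_struct *vma,
				    unsigned long addr)
{
	vm_fault_t ret;

	if (!vma_read_trylock(vma)) {
		/*
		 * The VMA is write-locked, or merely appears to be after
		 * an mm_lock_seq overflow. Fall back to read-locking the
		 * whole mm before touching the VMA.
		 */
		mmap_read_lock(vma->vm_mm);
		ret = do_locked_fault(vma, addr);
		mmap_read_unlock(vma->vm_mm);
		return ret;
	}

	/* Speculative handling of the fault without mmap_lock. */
	ret = do_lockless_fault(vma, addr);
	vma_read_unlock(vma);
	return ret;
}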

>
> > +      * A false unlocked result is critical, but it is impossible because
> > +      * we modify and check vma->vm_lock_seq under vma->lock protection,
> > +      * and mm->mm_lock_seq modification invalidates all existing locks.
> > +      */
> > +     if (vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq)) {
> > +             up_read(&vma->lock);
> > +             return false;
> > +     }
> > +     return true;
> > +}
> > +
> > +static inline void vma_read_unlock(struct vm_area_struct *vma)
> > +{
> > +     up_read(&vma->lock);
> > +}
> > +
> > +static inline void vma_assert_locked(struct vm_area_struct *vma)
> > +{
> > +     lockdep_assert_held(&vma->lock);
> > +     VM_BUG_ON_VMA(!rwsem_is_locked(&vma->lock), vma);
> > +}
> > +
> > +static inline void vma_assert_write_locked(struct vm_area_struct *vma, int pos)
> > +{
> > +     mmap_assert_write_locked(vma->vm_mm);
> > +     /*
> > +      * The current task holds mmap_write_lock, so neither vma->vm_lock_seq
> > +      * nor mm->mm_lock_seq can be concurrently modified.
> > +      */
> > +     VM_BUG_ON_VMA(vma->vm_lock_seq != READ_ONCE(vma->vm_mm->mm_lock_seq), vma);
> > +}
> > +
> > +#else /* CONFIG_PER_VMA_LOCK */
> > +
> > +static inline void vma_init_lock(struct vm_area_struct *vma) {}
> > +static inline void vma_mark_locked(struct vm_area_struct *vma) {}
> > +static inline bool vma_read_trylock(struct vm_area_struct *vma)
> > +             { return false; }
> > +static inline void vma_read_unlock(struct vm_area_struct *vma) {}
> > +static inline void vma_assert_locked(struct vm_area_struct *vma) {}
> > +static inline void vma_assert_write_locked(struct vm_area_struct *vma, int pos) {}
> > +
> > +#endif /* CONFIG_PER_VMA_LOCK */
> > +
> >  static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
> >  {
> >       static const struct vm_operations_struct dummy_vm_ops = {};
> > @@ -619,6 +696,7 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
> >       vma->vm_mm = mm;
> >       vma->vm_ops = &dummy_vm_ops;
> >       INIT_LIST_HEAD(&vma->anon_vma_chain);
> > +     vma_init_lock(vma);
> >  }
> >
> >  static inline void vma_set_anonymous(struct vm_area_struct *vma)
> > diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> > index bed25ef7c994..6a03f59c1e78 100644
> > --- a/include/linux/mm_types.h
> > +++ b/include/linux/mm_types.h
> > @@ -486,6 +486,10 @@ struct vm_area_struct {
> >       struct mempolicy *vm_policy;    /* NUMA policy for the VMA */
> >  #endif
> >       struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
> > +#ifdef CONFIG_PER_VMA_LOCK
> > +     struct rw_semaphore lock;
> > +     int vm_lock_seq;
> > +#endif
> >  } __randomize_layout;
> >
> >  struct kioctx_table;
> > @@ -567,6 +571,9 @@ struct mm_struct {
> >                                         * init_mm.mmlist, and are protected
> >                                         * by mmlist_lock
> >                                         */
> > +#ifdef CONFIG_PER_VMA_LOCK
> > +             int mm_lock_seq;
> > +#endif
> >
> >
> >               unsigned long hiwater_rss; /* High-watermark of RSS usage */
> > diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h
> > index e49ba91bb1f0..a391ae226564 100644
> > --- a/include/linux/mmap_lock.h
> > +++ b/include/linux/mmap_lock.h
> > @@ -72,6 +72,17 @@ static inline void mmap_assert_write_locked(struct mm_struct *mm)
> >       VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm);
> >  }
> >
> > +#ifdef CONFIG_PER_VMA_LOCK
> > +static inline void vma_mark_unlocked_all(struct mm_struct *mm)
> > +{
> > +     mmap_assert_write_locked(mm);
> > +     /* No races during update due to exclusive mmap_lock being held */
> > +     WRITE_ONCE(mm->mm_lock_seq, mm->mm_lock_seq + 1);
> > +}
> > +#else
> > +static inline void vma_mark_unlocked_all(struct mm_struct *mm) {}
> > +#endif
> > +
> >  static inline void mmap_init_lock(struct mm_struct *mm)
> >  {
> >       init_rwsem(&mm->mmap_lock);
> > @@ -114,12 +125,14 @@ static inline bool mmap_write_trylock(struct mm_struct *mm)
> >  static inline void mmap_write_unlock(struct mm_struct *mm)
> >  {
> >       __mmap_lock_trace_released(mm, true);
> > +     vma_mark_unlocked_all(mm);
> >       up_write(&mm->mmap_lock);
> >  }
> >
> >  static inline void mmap_write_downgrade(struct mm_struct *mm)
> >  {
> >       __mmap_lock_trace_acquire_returned(mm, false, true);
> > +     vma_mark_unlocked_all(mm);
> >       downgrade_write(&mm->mmap_lock);
> >  }
> >
> > diff --git a/kernel/fork.c b/kernel/fork.c
> > index 614872438393..bfab31ecd11e 100644
> > --- a/kernel/fork.c
> > +++ b/kernel/fork.c
> > @@ -475,6 +475,7 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
> >                */
> >               *new = data_race(*orig);
> >               INIT_LIST_HEAD(&new->anon_vma_chain);
> > +             vma_init_lock(new);
> >               new->vm_next = new->vm_prev = NULL;
> >               dup_anon_vma_name(orig, new);
> >       }
> > @@ -1130,6 +1131,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
> >       seqcount_init(&mm->write_protect_seq);
> >       mmap_init_lock(mm);
> >       INIT_LIST_HEAD(&mm->mmlist);
> > +#ifdef CONFIG_PER_VMA_LOCK
> > +     WRITE_ONCE(mm->mm_lock_seq, 0);
> > +#endif
> >       mm_pgtables_bytes_init(mm);
> >       mm->map_count = 0;
> >       mm->locked_vm = 0;
> > diff --git a/mm/init-mm.c b/mm/init-mm.c
> > index fbe7844d0912..8399f90d631c 100644
> > --- a/mm/init-mm.c
> > +++ b/mm/init-mm.c
> > @@ -37,6 +37,9 @@ struct mm_struct init_mm = {
> >       .page_table_lock =  __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
> >       .arg_lock       =  __SPIN_LOCK_UNLOCKED(init_mm.arg_lock),
> >       .mmlist         = LIST_HEAD_INIT(init_mm.mmlist),
> > +#ifdef CONFIG_PER_VMA_LOCK
> > +     .mm_lock_seq    = 0,
> > +#endif
> >       .user_ns        = &init_user_ns,
> >       .cpu_bitmap     = CPU_BITS_NONE,
> >  #ifdef CONFIG_IOMMU_SVA
>


