[PATCH v4 42/66] fs/proc/task_mmu: Stop using linked list and highest_vm_end

Liam Howlett liam.howlett at oracle.com
Thu Jan 27 12:14:34 PST 2022


* Vlastimil Babka <vbabka at suse.cz> [220121 06:52]:
> On 12/1/21 15:30, Liam Howlett wrote:
> > From: "Liam R. Howlett" <Liam.Howlett at Oracle.com>
> > 
> > Remove references to the mm_struct linked list and highest_vm_end in
> > preparation for their removal.
> > 
> > Signed-off-by: Liam R. Howlett <Liam.Howlett at Oracle.com>
> > ---
> >  fs/proc/internal.h |  2 +-
> >  fs/proc/task_mmu.c | 73 ++++++++++++++++++++++++++--------------------
> >  2 files changed, 42 insertions(+), 33 deletions(-)
> > 
> > diff --git a/fs/proc/internal.h b/fs/proc/internal.h
> > index 03415f3fb3a8..45b132c609ff 100644
> > --- a/fs/proc/internal.h
> > +++ b/fs/proc/internal.h
> > @@ -290,7 +290,7 @@ struct proc_maps_private {
> >  	struct task_struct *task;
> >  	struct mm_struct *mm;
> >  #ifdef CONFIG_MMU
> > -	struct vm_area_struct *tail_vma;
> > +	struct vma_iterator iter;
> >  #endif
> >  #ifdef CONFIG_NUMA
> >  	struct mempolicy *task_mempolicy;
> > diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> > index 300911d6575f..7cc97cdb88c2 100644
> > --- a/fs/proc/task_mmu.c
> > +++ b/fs/proc/task_mmu.c
> > @@ -122,12 +122,26 @@ static void release_task_mempolicy(struct proc_maps_private *priv)
> >  }
> >  #endif
> >  
> > +static struct vm_area_struct *proc_get_vma(struct proc_maps_private *priv,
> > +						loff_t *ppos)
> > +{
> > +	struct vm_area_struct *vma = vma_next(&priv->iter);
> 
> This advances the iterator.

Maybe.  vma_next() will call vma_find() which calls mas_find().
mas_find() will return the VMA at that address (or the next VMA) on the
first call.

> 
> > +
> > +	if (vma) {
> > +		*ppos = vma->vm_start;
> 
> This advances *ppos.

If vma_next() returned the next VMA, yes.  If it returned the one at
vmi->mas->index, then no.

> 
> > +	} else {
> > +		*ppos = -2UL;
> > +		vma = get_gate_vma(priv->mm);
> > +	}
> > +
> > +	return vma;
> > +}
> > +
> >  static void *m_start(struct seq_file *m, loff_t *ppos)
> >  {
> >  	struct proc_maps_private *priv = m->private;
> >  	unsigned long last_addr = *ppos;
> >  	struct mm_struct *mm;
> > -	struct vm_area_struct *vma;
> >  
> >  	/* See m_next(). Zero at the start or after lseek. */
> >  	if (last_addr == -1UL)
> > @@ -151,31 +165,21 @@ static void *m_start(struct seq_file *m, loff_t *ppos)
> >  		return ERR_PTR(-EINTR);
> >  	}
> >  
> > +	vma_iter_init(&priv->iter, mm, last_addr);
> >  	hold_task_mempolicy(priv);
> > -	priv->tail_vma = get_gate_vma(mm);
> > -
> > -	vma = find_vma(mm, last_addr);
> > -	if (vma)
> > -		return vma;
> > +	if (last_addr == -2UL)
> > +		return get_gate_vma(mm);
> >  
> > -	return priv->tail_vma;
> > +	return proc_get_vma(priv, ppos);
> 
> So here we advance those as part of m_start(), which I think is wrong in the
> seqfile API. See how seq_read_iter() in fs/seq_file.c handles a full
> buffer: around the comment "// need a bigger buffer" it will do a stop() and
> start() again, and that's supposed to get the same vma.
> seqfile is tricky, part #220121

I've built, booted, and tested this as follows:

root at dev0:~# dd if=/proc/1/maps of=one bs=512; dd if=/proc/1/maps of=two bs=4096; cmp one two
23+1 records in
23+1 records out
12188 bytes (12 kB, 12 KiB) copied, 0.000114377 s, 107 MB/s
0+3 records in
0+3 records out
12188 bytes (12 kB, 12 KiB) copied, 6.9177e-05 s, 176 MB/s
root at dev0:~# dd if=/proc/1/maps of=one bs=3; dd if=/proc/1/maps of=two bs=10; cmp one two
4062+1 records in
4062+1 records out
12188 bytes (12 kB, 12 KiB) copied, 0.0184962 s, 659 kB/s
1218+1 records in
1218+1 records out
12188 bytes (12 kB, 12 KiB) copied, 0.0062038 s, 2.0 MB/s
root at dev0:~#

> 
> >  }
> >  
> >  static void *m_next(struct seq_file *m, void *v, loff_t *ppos)
> >  {
> > -	struct proc_maps_private *priv = m->private;
> > -	struct vm_area_struct *next, *vma = v;
> > -
> > -	if (vma == priv->tail_vma)
> > -		next = NULL;
> > -	else if (vma->vm_next)
> > -		next = vma->vm_next;
> > -	else
> > -		next = priv->tail_vma;
> > -
> > -	*ppos = next ? next->vm_start : -1UL;
> > -
> > -	return next;
> > +	if (*ppos == -2UL) {
> > +		*ppos = -1UL;
> > +		return NULL;
> > +	}
> > +	return proc_get_vma(m->private, ppos);
> >  }
> >  
> >  static void m_stop(struct seq_file *m, void *v)
> > @@ -843,16 +847,16 @@ static int show_smaps_rollup(struct seq_file *m, void *v)
> >  {
> >  	struct proc_maps_private *priv = m->private;
> >  	struct mem_size_stats mss;
> > -	struct mm_struct *mm;
> > +	struct mm_struct *mm = priv->mm;
> >  	struct vm_area_struct *vma;
> > -	unsigned long last_vma_end = 0;
> > +	unsigned long vma_start = 0, last_vma_end = 0;
> >  	int ret = 0;
> > +	MA_STATE(mas, &mm->mm_mt, 0, 0);
> >  
> >  	priv->task = get_proc_task(priv->inode);
> >  	if (!priv->task)
> >  		return -ESRCH;
> >  
> > -	mm = priv->mm;
> >  	if (!mm || !mmget_not_zero(mm)) {
> >  		ret = -ESRCH;
> >  		goto out_put_task;
> > @@ -865,8 +869,13 @@ static int show_smaps_rollup(struct seq_file *m, void *v)
> >  		goto out_put_mm;
> >  
> >  	hold_task_mempolicy(priv);
> > +	vma = mas_find(&mas, 0);
> > +
> > +	if (unlikely(!vma))
> > +		goto empty_set;
> >  
> > -	for (vma = priv->mm->mmap; vma;) {
> > +	vma_start = vma->vm_start;
> > +	do {
> >  		smap_gather_stats(vma, &mss, 0);
> >  		last_vma_end = vma->vm_end;
> >  
> > @@ -875,6 +884,7 @@ static int show_smaps_rollup(struct seq_file *m, void *v)
> >  		 * access it for write request.
> >  		 */
> >  		if (mmap_lock_is_contended(mm)) {
> > +			mas_pause(&mas);
> >  			mmap_read_unlock(mm);
> >  			ret = mmap_read_lock_killable(mm);
> >  			if (ret) {
> > @@ -918,7 +928,7 @@ static int show_smaps_rollup(struct seq_file *m, void *v)
> >  			 *    contains last_vma_end.
> >  			 *    Iterate VMA' from last_vma_end.
> >  			 */
> > -			vma = find_vma(mm, last_vma_end - 1);
> > +			vma = mas_find(&mas, ULONG_MAX);
> >  			/* Case 3 above */
> >  			if (!vma)
> >  				break;
> > @@ -932,11 +942,10 @@ static int show_smaps_rollup(struct seq_file *m, void *v)
> >  				smap_gather_stats(vma, &mss, last_vma_end);
> >  		}
> >  		/* Case 2 above */
> > -		vma = vma->vm_next;
> > -	}
> > +	} while ((vma = mas_find(&mas, ULONG_MAX)) != NULL);
> >  
> > -	show_vma_header_prefix(m, priv->mm->mmap->vm_start,
> > -			       last_vma_end, 0, 0, 0, 0);
> > +empty_set:
> > +	show_vma_header_prefix(m, vma_start, last_vma_end, 0, 0, 0, 0);
> >  	seq_pad(m, ' ');
> >  	seq_puts(m, "[rollup]\n");
> >  
> > @@ -1229,6 +1238,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
> >  		return -ESRCH;
> >  	mm = get_task_mm(task);
> >  	if (mm) {
> > +		MA_STATE(mas, &mm->mm_mt, 0, 0);
> >  		struct mmu_notifier_range range;
> >  		struct clear_refs_private cp = {
> >  			.type = type,
> > @@ -1248,7 +1258,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
> >  		}
> >  
> >  		if (type == CLEAR_REFS_SOFT_DIRTY) {
> > -			for (vma = mm->mmap; vma; vma = vma->vm_next) {
> > +			mas_for_each(&mas, vma, ULONG_MAX) {
> >  				if (!(vma->vm_flags & VM_SOFTDIRTY))
> >  					continue;
> >  				vma->vm_flags &= ~VM_SOFTDIRTY;
> > @@ -1260,8 +1270,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
> >  						0, NULL, mm, 0, -1UL);
> >  			mmu_notifier_invalidate_range_start(&range);
> >  		}
> > -		walk_page_range(mm, 0, mm->highest_vm_end, &clear_refs_walk_ops,
> > -				&cp);
> > +		walk_page_range(mm, 0, -1, &clear_refs_walk_ops, &cp);
> >  		if (type == CLEAR_REFS_SOFT_DIRTY) {
> >  			mmu_notifier_invalidate_range_end(&range);
> >  			flush_tlb_mm(mm);
> 


More information about the maple-tree mailing list