[PATCH 52/53] VFS: lift d_alloc_parallel above inode_lock

NeilBrown neilb at ownmail.net
Thu Mar 12 14:12:39 PDT 2026


From: NeilBrown <neil at brown.name>

d_alloc_parallel() can block waiting on a d_in_lookup() dentry
so it is important to order it consistently with other blocking locks
such as inode_lock().

Currently d_alloc_parallel() is ordered after inode_lock(): it can be
called while the inode is locked, and so the inode cannot be locked
while a d_in_lookup() dentry is held.

This patch reverses that order.  d_alloc_parallel() must now be called
*before* locking the directory, and must not be called afterwards.  This
allows directory locking to be moved closer to the filesystem
operations, and ultimately into those operations.

lookup_one_qstr_excl() is now called without any lock held, exclusive or
otherwise, so the "_excl" is dropped - it is now lookup_one_qstr().

As the lock is now taken *after* the lookup, start_dirop() and
start_renaming() must check, once the lock is held, that a dentry which
is not d_in_lookup() still has the correct parent and is still hashed,
and must repeat the lookup if it does not.

lookup_one_qstr() and lookup_slow() don't need to re-check the parent as
the dentry is always d_in_lookup(), so its parent cannot change.

The locking in lookup_slow() is moved into __lookup_slow() immediately
before/after ->lookup, and lookup_slow() just sets the task state for
waiting.

Parent locking is removed from open_last_lookups() and performed in
lookup_open().  A shared lock is taken if ->lookup() needs to be called.
An exclusive lock is taken separately if ->create() needs to be called -
with checks that the dentry hasn't become positive.

If ->atomic_open is needed we take exclusive or shared parent lock as
appropriate and check for a positive dentry or DEAD parent.

The fsnotify_create() call is kept inside the locked region in
lookup_open().  I don't know if this is important.

Signed-off-by: NeilBrown <neil at brown.name>
---
 fs/namei.c | 239 ++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 154 insertions(+), 85 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index bba419f2fc53..3d213070a515 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1773,8 +1773,19 @@ static struct dentry *lookup_dcache(const struct qstr *name,
 	return dentry;
 }
 
+static inline bool inode_lock_shared_state(struct inode *inode, unsigned int state)
+{
+	if (state == TASK_KILLABLE) {
+		if (down_read_killable(&inode->i_rwsem) != 0) {
+			return false;
+		}
+	} else {
+		inode_lock_shared(inode);
+	}
+	return true;
+}
+
 /*
- * Parent directory has inode locked.
  * If Lookup_EXCL or LOOKUP_RENAME_TARGET is set
  * d_lookup_done() must be called before the dentry is dput()
  * If the dentry is not d_in_lookup():
@@ -1783,8 +1794,9 @@ static struct dentry *lookup_dcache(const struct qstr *name,
  * If it is d_in_lookup() then these conditions can only be checked by the
  * file system when carrying out the intent (create or rename).
  */
-static struct dentry *lookup_one_qstr_excl(const struct qstr *name,
-					   struct dentry *base, unsigned int flags)
+static struct dentry *lookup_one_qstr(const struct qstr *name,
+				      struct dentry *base, unsigned int flags,
+				      unsigned int state)
 {
 	struct dentry *dentry;
 	struct dentry *old;
@@ -1806,7 +1818,16 @@ static struct dentry *lookup_one_qstr_excl(const struct qstr *name,
 		/* Raced with another thread which did the lookup */
 		goto found;
 
-	old = dir->i_op->lookup(dir, dentry, flags);
+	if (!inode_lock_shared_state(dir, state)) {
+		d_lookup_done(dentry);
+		dput(dentry);
+		return ERR_PTR(-EINTR);
+	}
+	if (unlikely(IS_DEADDIR(dir)))
+		old = ERR_PTR(-ENOENT);
+	else
+		old = dir->i_op->lookup(dir, dentry, flags | LOOKUP_SHARED);
+	inode_unlock_shared(dir);
 	if (unlikely(old)) {
 		d_lookup_done(dentry);
 		dput(dentry);
@@ -1897,7 +1918,8 @@ static struct dentry *lookup_fast(struct nameidata *nd)
 /* Fast lookup failed, do it the slow way */
 static struct dentry *__lookup_slow(const struct qstr *name,
 				    struct dentry *dir,
-				    unsigned int flags)
+				    unsigned int flags,
+				    unsigned int state)
 {
 	struct dentry *dentry, *old;
 	struct inode *inode = dir->d_inode;
@@ -1920,8 +1942,17 @@ static struct dentry *__lookup_slow(const struct qstr *name,
 			dput(dentry);
 			dentry = ERR_PTR(error);
 		}
+	} else if (!inode_lock_shared_state(inode, state)) {
+		d_lookup_done(dentry);
+		dput(dentry);
+		return ERR_PTR(-EINTR);
 	} else {
-		old = inode->i_op->lookup(inode, dentry, flags);
+		if (unlikely(IS_DEADDIR(inode)))
+			old = ERR_PTR(-ENOENT);
+		else
+			old = inode->i_op->lookup(inode, dentry,
+						  flags | LOOKUP_SHARED);
+		inode_unlock_shared(inode);
 		d_lookup_done(dentry);
 		if (unlikely(old)) {
 			dput(dentry);
@@ -1935,26 +1966,14 @@ static noinline struct dentry *lookup_slow(const struct qstr *name,
 				  struct dentry *dir,
 				  unsigned int flags)
 {
-	struct inode *inode = dir->d_inode;
-	struct dentry *res;
-	inode_lock_shared(inode);
-	res = __lookup_slow(name, dir, flags | LOOKUP_SHARED);
-	inode_unlock_shared(inode);
-	return res;
+	return __lookup_slow(name, dir, flags | LOOKUP_SHARED, TASK_NORMAL);
 }
 
 static struct dentry *lookup_slow_killable(const struct qstr *name,
 					   struct dentry *dir,
 					   unsigned int flags)
 {
-	struct inode *inode = dir->d_inode;
-	struct dentry *res;
-
-	if (inode_lock_shared_killable(inode))
-		return ERR_PTR(-EINTR);
-	res = __lookup_slow(name, dir, flags | LOOKUP_SHARED);
-	inode_unlock_shared(inode);
-	return res;
+	return __lookup_slow(name, dir, flags | LOOKUP_SHARED, TASK_KILLABLE);
 }
 
 static inline int may_lookup(struct mnt_idmap *idmap,
@@ -2908,18 +2927,26 @@ static struct dentry *__start_dirop(struct dentry *parent, struct qstr *name,
 	struct dentry *dentry;
 	struct inode *dir = d_inode(parent);
 
-	if (state == TASK_KILLABLE) {
-		int ret = down_write_killable_nested(&dir->i_rwsem,
-						     I_MUTEX_PARENT);
-		if (ret)
-			return ERR_PTR(ret);
-	} else {
-		inode_lock_nested(dir, I_MUTEX_PARENT);
-	}
-	dentry = lookup_one_qstr_excl(name, parent, lookup_flags);
-	if (IS_ERR(dentry))
+	while(1) {
+		dentry = lookup_one_qstr(name, parent, lookup_flags, state);
+		if (IS_ERR(dentry))
+			return dentry;
+		if (state == TASK_KILLABLE) {
+			if (down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT) != 0) {
+				d_lookup_done(dentry);
+				dput(dentry);
+				return ERR_PTR(-EINTR);
+			}
+		} else {
+			inode_lock_nested(dir, I_MUTEX_PARENT);
+		}
+		if (d_in_lookup(dentry) ||
+		    (!d_unhashed(dentry) && dentry->d_parent == parent))
+			return dentry;
 		inode_unlock(dir);
-	return dentry;
+		d_lookup_done(dentry);
+		dput(dentry);
+	}
 }
 
 /**
@@ -3830,26 +3857,37 @@ __start_renaming(struct renamedata *rd, int lookup_flags,
 	if (rd->flags & RENAME_NOREPLACE)
 		target_flags |= LOOKUP_EXCL;
 
-	trap = lock_rename(rd->old_parent, rd->new_parent);
-	if (IS_ERR(trap))
-		return PTR_ERR(trap);
-
-	d1 = lookup_one_qstr_excl(old_last, rd->old_parent,
-				  lookup_flags);
+retry:
+	d1 = lookup_one_qstr(old_last, rd->old_parent,
+			     lookup_flags, TASK_NORMAL);
 	err = PTR_ERR(d1);
 	if (IS_ERR(d1))
-		goto out_unlock;
+		goto out_err;
 
-	d2 = lookup_one_qstr_excl(new_last, rd->new_parent,
-				  lookup_flags | target_flags);
+	d2 = lookup_one_qstr(new_last, rd->new_parent,
+			     lookup_flags | target_flags, TASK_NORMAL);
 	err = PTR_ERR(d2);
 	if (IS_ERR(d2))
 		goto out_dput_d1;
 
+	trap = lock_rename(rd->old_parent, rd->new_parent);
+	err = PTR_ERR(trap);
+	if (IS_ERR(trap))
+		goto out_unlock;
+
+	if (unlikely((!d_in_lookup(d1) && d_unhashed(d1)) || d1->d_parent != rd->old_parent ||
+		     (!d_in_lookup(d2) && d_unhashed(d2)) || d2->d_parent != rd->new_parent)) {
+		unlock_rename(rd->old_parent, rd->new_parent);
+		d_lookup_done(d1); dput(d1);
+		d_lookup_done(d2); dput(d2);
+		dput(trap);
+		goto retry;
+	}
+
 	if (d1 == trap) {
 		/* source is an ancestor of target */
 		err = -EINVAL;
-		goto out_dput_d2;
+		goto out_unlock;
 	}
 
 	if (d2 == trap) {
@@ -3858,7 +3896,7 @@ __start_renaming(struct renamedata *rd, int lookup_flags,
 			err = -EINVAL;
 		else
 			err = -ENOTEMPTY;
-		goto out_dput_d2;
+		goto out_unlock;
 	}
 
 	rd->old_dentry = d1;
@@ -3866,14 +3904,14 @@ __start_renaming(struct renamedata *rd, int lookup_flags,
 	dget(rd->old_parent);
 	return 0;
 
-out_dput_d2:
+out_unlock:
+	unlock_rename(rd->old_parent, rd->new_parent);
 	d_lookup_done(d2);
 	dput(d2);
 out_dput_d1:
 	d_lookup_done(d1);
 	dput(d1);
-out_unlock:
-	unlock_rename(rd->old_parent, rd->new_parent);
+out_err:
 	return err;
 }
 
@@ -3927,10 +3965,22 @@ __start_renaming_dentry(struct renamedata *rd, int lookup_flags,
 	if (rd->flags & RENAME_NOREPLACE)
 		target_flags |= LOOKUP_EXCL;
 
-	/* Already have the dentry - need to be sure to lock the correct parent */
+retry:
+	d2 = lookup_one_qstr(new_last, rd->new_parent,
+			     lookup_flags | target_flags, TASK_NORMAL);
+	err = PTR_ERR(d2);
+	if (IS_ERR(d2))
+		goto out_unlock;
+
+	/*
+	 * Already have the old_dentry - need to be sure to lock
+	 * the correct parent
+	 */
 	trap = lock_rename_child(old_dentry, rd->new_parent);
+	err = PTR_ERR(trap);
 	if (IS_ERR(trap))
-		return PTR_ERR(trap);
+		goto out_dput_d2;
+
 	if (d_unhashed(old_dentry) ||
 	    (rd->old_parent && rd->old_parent != old_dentry->d_parent)) {
 		/* dentry was removed, or moved and explicit parent requested */
@@ -3938,16 +3988,19 @@ __start_renaming_dentry(struct renamedata *rd, int lookup_flags,
 		goto out_unlock;
 	}
 
-	d2 = lookup_one_qstr_excl(new_last, rd->new_parent,
-				  lookup_flags | target_flags);
-	err = PTR_ERR(d2);
-	if (IS_ERR(d2))
-		goto out_unlock;
+	if (unlikely((!d_in_lookup(d2) && d_unhashed(d2)) ||
+		     d2->d_parent != rd->new_parent)) {
+		/* d2 was moved/removed before lock - repeat lookup */
+		unlock_rename(old_dentry->d_parent, rd->new_parent);
+		d_lookup_done(d2); dput(d2);
+		dput(trap);
+		goto retry;
+	}
 
 	if (old_dentry == trap) {
 		/* source is an ancestor of target */
 		err = -EINVAL;
-		goto out_dput_d2;
+		goto out_unlock;
 	}
 
 	if (d2 == trap) {
@@ -3956,7 +4009,7 @@ __start_renaming_dentry(struct renamedata *rd, int lookup_flags,
 			err = -EINVAL;
 		else
 			err = -ENOTEMPTY;
-		goto out_dput_d2;
+		goto out_unlock;
 	}
 
 	rd->old_dentry = dget(old_dentry);
@@ -3964,11 +4017,11 @@ __start_renaming_dentry(struct renamedata *rd, int lookup_flags,
 	rd->old_parent = dget(old_dentry->d_parent);
 	return 0;
 
+out_unlock:
+	unlock_rename(old_dentry->d_parent, rd->new_parent);
 out_dput_d2:
 	d_lookup_done(d2);
 	dput(d2);
-out_unlock:
-	unlock_rename(old_dentry->d_parent, rd->new_parent);
 	return err;
 }
 
@@ -4319,8 +4372,19 @@ static struct dentry *atomic_open(const struct path *path, struct dentry *dentry
 
 	file->__f_path.dentry = DENTRY_NOT_SET;
 	file->__f_path.mnt = path->mnt;
-	error = dir->i_op->atomic_open(dir, dentry, file,
-				       open_to_namei_flags(open_flag), mode);
+
+	if (open_flag & O_CREAT)
+		inode_lock(dir);
+	else
+		inode_lock_shared(dir);
+	if (dentry->d_inode)
+		error = finish_no_open(file, NULL);
+	else if (unlikely(IS_DEADDIR(dir)))
+		error = -ENOENT;
+	else
+		error = dir->i_op->atomic_open(dir, dentry, file,
+					       open_to_namei_flags(open_flag),
+					       mode);
 	d_lookup_done(dentry);
 	if (!error) {
 		if (file->f_mode & FMODE_OPENED) {
@@ -4339,6 +4403,13 @@ static struct dentry *atomic_open(const struct path *path, struct dentry *dentry
 				error = -ENOENT;
 		}
 	}
+	if (!error && (file->f_mode & FMODE_CREATED))
+		fsnotify_create(dir, dentry);
+	if (open_flag & O_CREAT)
+		inode_unlock(dir);
+	else
+		inode_unlock_shared(dir);
+
 	if (error) {
 		dput(dentry);
 		dentry = ERR_PTR(error);
@@ -4372,10 +4443,6 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file,
 	struct dentry *dentry;
 	int error, create_error = 0;
 	umode_t mode = op->mode;
-	unsigned int shared_flag = (op->open_flag & O_CREAT) ? 0 : LOOKUP_SHARED;
-
-	if (unlikely(IS_DEADDIR(dir_inode)))
-		return ERR_PTR(-ENOENT);
 
 	file->f_mode &= ~FMODE_CREATED;
 	dentry = d_lookup(dir, &nd->last);
@@ -4420,7 +4487,7 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file,
 	if (open_flag & O_CREAT) {
 		if (open_flag & O_EXCL)
 			open_flag &= ~O_TRUNC;
-		mode = vfs_prepare_mode(idmap, dir->d_inode, mode, mode, mode);
+		mode = vfs_prepare_mode(idmap, dir_inode, mode, mode, mode);
 		if (likely(got_write))
 			create_error = may_o_create(idmap, &nd->path,
 						    dentry, mode);
@@ -4439,8 +4506,15 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file,
 	}
 
 	if (d_in_lookup(dentry)) {
-		struct dentry *res = dir_inode->i_op->lookup(dir_inode, dentry,
-							     nd->flags | shared_flag);
+		struct dentry *res;
+
+		inode_lock_shared(dir_inode);
+		if (IS_DEADDIR(dir_inode))
+			res = ERR_PTR(-ENOENT);
+		else
+			res = dir_inode->i_op->lookup(dir_inode, dentry,
+						      nd->flags | LOOKUP_SHARED);
+		inode_unlock_shared(dir_inode);
 		d_lookup_done(dentry);
 		if (unlikely(res)) {
 			if (IS_ERR(res)) {
@@ -4459,15 +4533,22 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file,
 		if (error)
 			goto out_dput;
 
-		file->f_mode |= FMODE_CREATED;
-		audit_inode_child(dir_inode, dentry, AUDIT_TYPE_CHILD_CREATE);
-		if (!dir_inode->i_op->create) {
-			error = -EACCES;
-			goto out_dput;
-		}
+		inode_lock(dir_inode);
+		if (!dentry->d_inode && !unlikely(IS_DEADDIR(dir_inode))) {
+			file->f_mode |= FMODE_CREATED;
+			audit_inode_child(dir_inode, dentry, AUDIT_TYPE_CHILD_CREATE);
+			if (!dir_inode->i_op->create) {
+				error = -EACCES;
+				goto out_dput;
+			}
 
-		error = dir_inode->i_op->create(idmap, dir_inode, dentry,
-						mode, open_flag & O_EXCL);
+			error = dir_inode->i_op->create(idmap, dir_inode, dentry,
+							mode, open_flag & O_EXCL);
+			if (!error)
+				fsnotify_create(dir_inode, dentry);
+		} else if (!dentry->d_inode)
+			error = -ENOENT;
+		inode_unlock(dir_inode);
 		if (error)
 			goto out_dput;
 	}
@@ -4522,7 +4603,6 @@ static const char *open_last_lookups(struct nameidata *nd,
 		   struct file *file, const struct open_flags *op)
 {
 	struct delegated_inode delegated_inode = { };
-	struct dentry *dir = nd->path.dentry;
 	int open_flag = op->open_flag;
 	bool got_write = false;
 	struct dentry *dentry;
@@ -4562,22 +4642,11 @@ static const char *open_last_lookups(struct nameidata *nd,
 		 * dropping this one anyway.
 		 */
 	}
-	if (open_flag & O_CREAT)
-		inode_lock(dir->d_inode);
-	else
-		inode_lock_shared(dir->d_inode);
 	dentry = lookup_open(nd, file, op, got_write, &delegated_inode);
 	if (!IS_ERR(dentry)) {
-		if (file->f_mode & FMODE_CREATED)
-			fsnotify_create(dir->d_inode, dentry);
 		if (file->f_mode & FMODE_OPENED)
 			fsnotify_open(file);
 	}
-	if (open_flag & O_CREAT)
-		inode_unlock(dir->d_inode);
-	else
-		inode_unlock_shared(dir->d_inode);
-
 	if (got_write)
 		mnt_drop_write(nd->path.mnt);
 
-- 
2.50.0.107.gf914562f5916.dirty




More information about the linux-afs mailing list