[PATCH 07/14] Add auto-generated "." and ".." entries to directories

Valerie Aurora val at versity.com
Thu Feb 27 06:16:16 PST 2025


There are pros and cons to both real and fake "." and ".." dentries,
but we decided to go with fake ones because then we don't have to do a
potentially slow lookup of ".." in large directories. Userspace likes
"." and ".." to come first in readdir(), and we return entries in hash
value order, so reserve 0 and 1 in the hash output space for "." and
"..". Then autogenerate them in readdir() and lookup() using the parent
inode back reference.

Signed-off-by: Valerie Aurora <val at versity.com>
---
 shared/dir.c          | 115 ++++++++++++++++++++++++++++++++++++------
 shared/format-block.h |   5 ++
 2 files changed, 104 insertions(+), 16 deletions(-)

diff --git a/shared/dir.c b/shared/dir.c
index d969e10..0439816 100644
--- a/shared/dir.c
+++ b/shared/dir.c
@@ -42,9 +42,37 @@ static bool names_equal(u8 *a, size_t a_len, u8 *b, size_t b_len)
 	return a_len == b_len && memcmp(a, b, a_len) == 0;
 }
 
-static u64 name_hash(void *name, size_t name_len)
-{
-	return xxh64(name, name_len, NGNFS_DIRENT_HASH_SEED) & NGNFS_DIRENT_HASH_MASK;
+/*
+ * The directory entries for . and .. are generated during lookup and
+ * readdir and are not "real" directory entries stored as dirents. For
+ * readdir to work properly, we need the position of each entry (its
+ * hash value) to be stable. We also want to generate . and .. first
+ * because it's easier than inserting them somewhere in the middle and
+ * because applications like it that way.
+ *
+ * So name_hash() returns the reserved hash values 0 for . and 1 for ..
+ * and raises every other name's masked hash to at least
+ * NGNFS_DIRENT_MIN_HASH. An empty name is hashed without reading s[0].
+ */
+
+static u64 name_hash(void *name, size_t name_len) {
+	char *s = name;
+	u64 hash;
+
+	if (name_len >= 1 && name_len <= 2 && s[0] == '.') {
+		if (name_len == 1)
+			return NGNFS_DIRENT_DOT_HASH;
+
+		if (s[1] == '.')
+			return NGNFS_DIRENT_DOT_DOT_HASH;
+	}
+
+	hash = xxh64(name, name_len, NGNFS_DIRENT_HASH_SEED) & NGNFS_DIRENT_HASH_MASK;
+
+	if (hash < NGNFS_DIRENT_MIN_HASH)
+		hash = NGNFS_DIRENT_MIN_HASH;
+
+	return hash;
 }
 
 /*
@@ -105,6 +133,16 @@ unsigned int ngnfs_type_to_dtype(enum ngnfs_dentry_type type)
 	return DT_UNKNOWN;
 }
 
+static void fill_dent(__le64 ino, __le64 version, enum ngnfs_dentry_type type, char *name,
+		      size_t name_len, struct ngnfs_dirent *dent)
+{
+	dent->ino = ino;
+	dent->type = type;
+	dent->version = version;
+	dent->name_len = name_len;
+	memcpy(dent->name, name, name_len);
+}
+
 static void init_dirent_args(struct dirent_args *da, char *name, size_t name_len, u64 ino,
 			     mode_t mode)
 {
@@ -112,18 +150,13 @@ static void init_dirent_args(struct dirent_args *da, char *name, size_t name_len
 	da->dent_size = offsetof(struct ngnfs_dirent, name) + name_len;
 	da->dtype = IFTODT(mode);
 
-	da->dent.ino = cpu_to_le64(ino);
-	da->dent.version = cpu_to_le64(0); /* XXX :/ */
-	da->dent.type = mode_to_type(mode);
-	da->dent.name_len = name_len;
+	fill_dent(cpu_to_le64(ino), 0 /* XXX :/ */, mode_to_type(mode), name, name_len, &da->dent);
 
 	/* ensure that we're stitching together a contiguous max name buffer */
 	BUILD_BUG_ON(offsetofend(struct dirent_args, dent.name) !=
 		     offsetof(struct dirent_args, __max_name_storage));
 	BUILD_BUG_ON((sizeof_field(struct dirent_args, dent.name) +
 		      sizeof_field(struct dirent_args, __max_name_storage)) != NGNFS_NAME_MAX);
-
-	memcpy(da->dent.name, name, name_len);
 }
 
 static int update_dirent_args_ino(struct dirent_args *da, u64 ino)
@@ -279,16 +312,15 @@ struct readdir_args {
 	int nr;
 };
 
-static int fill_dirent_rd(struct ngnfs_btree_key *key, void *val, size_t val_size, void *args)
+static int fill_readdir(struct ngnfs_btree_key *key, struct ngnfs_dirent *dent,
+			struct readdir_args *ra)
 {
-	struct readdir_args *ra = args;
-	struct ngnfs_dirent *dent = val;
 	size_t aligned;
 	size_t bytes;
 
 	bytes = offsetof(struct ngnfs_readdir_entry, name[dent->name_len + 1]);
 	if (bytes > ra->size)
-		return 0;
+		return -ENOBUFS;
 
 	aligned = ALIGN(bytes, __alignof__(struct ngnfs_readdir_entry));
 
@@ -304,12 +336,62 @@ static int fill_dirent_rd(struct ngnfs_btree_key *key, void *val, size_t val_siz
 	ra->nr++;
 
 	if (ra->nr == INT_MAX || aligned >= ra->size)
-		return 0;
+		return -ENOBUFS;
 
 	ra->ent = (void *)ra->ent + aligned;
 	ra->size -= aligned;
 
-	return NGNFS_BTREE_ITER_CONTINUE;
+	return 0;
+}
+
+static int fill_readdir_rd(struct ngnfs_btree_key *key, void *val, size_t val_size, void *args)
+{
+	int ret;
+	ret = fill_readdir(key, val, args);
+
+	if (ret == 0)
+		return NGNFS_BTREE_ITER_CONTINUE;
+
+	return 0;
+}
+
+static int fill_from_inode(u64 hash, struct ngnfs_inode_txn_ref *inode, char *name, size_t name_len,
+			   struct ngnfs_dirent *dent, struct readdir_args *ra)
+{
+	struct ngnfs_btree_key key = { {cpu_to_le64(hash), 0, 0} };
+
+	fill_dent(inode->ninode->ino, inode->ninode->version,
+		  mode_to_type(le32_to_cpu(inode->ninode->mode)), name, name_len, dent);
+
+	return fill_readdir(&key, dent, ra);
+}
+
+/*
+ * Fill in "." and ".." entries for readdir if this is the first read (key hash 0).
+ * Assumes that the buf in ra is large enough to hold both.
+ */
+static int fill_dots(struct ngnfs_fs_info *nfi, struct ngnfs_transaction *txn,
+		     struct ngnfs_btree_key *key, struct readdir_args *ra,
+		     struct ngnfs_inode_txn_ref *dir)
+{
+	struct ngnfs_inode_txn_ref parent_dir;
+	struct ngnfs_dirent dent;
+	u64 parent_ino;
+	int ret;
+
+	if ((le64_to_cpu(key->k[0]) != 0))
+		return 0;
+
+	parent_ino = le64_to_cpu(dir->ninode->parent_ino);
+
+	ret = fill_from_inode(NGNFS_DIRENT_DOT_HASH, dir, ".", 1, &dent, ra) ?:
+	      ngnfs_inode_get(nfi, txn, NBF_READ, parent_ino, &parent_dir) ?:
+	      fill_from_inode(NGNFS_DIRENT_DOT_DOT_HASH, &parent_dir, "..", 2, &dent, ra);
+
+	if (ret == 0)
+		key->k[0] = cpu_to_le64(NGNFS_DIRENT_MIN_HASH);
+
+	return ret;
 }
 
 /*
@@ -354,8 +436,9 @@ int ngnfs_dir_readdir(struct ngnfs_fs_info *nfi, u64 dir_ino, u64 pos,
 
 		ret = ngnfs_inode_get(nfi, &txn, NBF_READ, dir_ino, &dir)			?:
 		      check_ifmt(dir.ninode, S_IFDIR, -ENOTDIR)					?:
+		      fill_dots(nfi, &txn, &key, &ra, &dir)					?:
 		      ngnfs_btree_read_iter(nfi, &txn, &dir.ninode->dirents, &key,
-					    NULL, NULL, fill_dirent_rd, &ra);
+					    NULL, NULL, fill_readdir_rd, &ra);
 
 	} while (ngnfs_txn_retry(nfi, &txn, &ret));
 
diff --git a/shared/format-block.h b/shared/format-block.h
index b0bc995..49481fe 100644
--- a/shared/format-block.h
+++ b/shared/format-block.h
@@ -153,4 +153,9 @@ struct ngnfs_dirent {
  */
 #define NGNFS_DIRENT_HASH_MASK	(U64_MAX ^ (1ULL << 63) ^ NGNFS_DIRENT_COLL_BIT)
 
+/* reserved hash values for . and ..; name_hash() gives other names >= MIN_HASH */
+#define NGNFS_DIRENT_DOT_HASH		0ULL
+#define NGNFS_DIRENT_DOT_DOT_HASH	1ULL
+#define NGNFS_DIRENT_MIN_HASH		2ULL
+
 #endif
-- 
2.48.1




More information about the ngnfs-devel mailing list