[PATCH 20/20] Add btree verification
Valerie Aurora
val at versity.com
Thu Jun 12 13:11:12 PDT 2025
Add super janky btree verification functions and associated debugfs
commands. They caught a couple of bugs in replace_item().
Co-authored-by: Zach Brown <zab at zabbo.net>
Signed-off-by: Valerie Aurora <val at versity.com>
---
cli/debugfs.c | 73 ++++++++
shared/btree.c | 489 ++++++++++++++++++++++++++++++++++++++++++++++++-
shared/btree.h | 2 +
3 files changed, 555 insertions(+), 9 deletions(-)
diff --git a/cli/debugfs.c b/cli/debugfs.c
index 6fb21a2..6f23211 100644
--- a/cli/debugfs.c
+++ b/cli/debugfs.c
@@ -17,6 +17,7 @@
#include "shared/lk/types.h"
#include "shared/lk/xattr.h"
+#include "shared/btree.h"
#include "shared/data.h"
#include "shared/dir.h"
#include "shared/format-block.h"
@@ -155,6 +156,76 @@ static void cmd_brename(struct debugfs_context *ctx, int argc, char **argv)
}
}
+static void cmd_btree_info(struct debugfs_context *ctx, int argc, char **argv)
+{
+ struct ngnfs_inode_ino_gen ig = ctx->cwd_ig;
+ struct ngnfs_dir_lookup_entry lent;
+ struct ngnfs_inode ninode;
+ char *name;
+ int ret;
+
+ if (argc > 2) {
+ printf("usage: btree_info [filename]\n");
+ return;
+ }
+
+ if (argc == 2) {
+ name = argv[1];
+ ret = ngnfs_dir_lookup(ctx->nfi, &ctx->cwd_ig, name, strlen(name), &lent);
+ if (ret < 0) {
+ print_err("btree_info", ret);
+ return;
+ }
+ ig = lent.ig;
+ }
+
+ ret = ngnfs_inode_read_copy(ctx->nfi, &ig, &ninode, sizeof(ninode));
+
+ if (ret < 0) {
+ print_err("btree_info", ret);
+ } else if (ret < sizeof(ninode)) {
+ log("returned inode buffer size %d too small, wanted at least %zu",
+ ret, sizeof(ninode));
+ } else {
+ if (lent.dtype == DT_DIR) {
+ printf("dirent btree bnr: %llu\n"
+ "dirent btree height: %u\n",
+ le64_to_cpu(ninode.dirents.ref.bnr),
+ ninode.dirents.height);
+ }
+
+ printf("xattr btree bnr: %llu\n"
+ "xattr btree height: %u\n",
+ le64_to_cpu(ninode.xattrs.ref.bnr),
+ ninode.xattrs.height);
+ }
+}
+
+static void cmd_btree_pb(struct debugfs_context *ctx, int argc, char **argv)
+{
+ u64 bnr;
+ int ret;
+
+ if (argc != 2) {
+ printf("usage: btree_pb <bnr>\n");
+ return;
+ }
+
+ ret = strtoull_nerr(&bnr, argv[1], NULL, 0);
+ if (ret < 0) {
+ print_err("parsing block number", ret);
+ return;
+ }
+
+ ret = ngnfs_print_btree_block(ctx->nfi, bnr, "debugfs");
+ if (ret < 0) {
+ print_err("ngnfs_print_btree_block", ret);
+ return;
+ }
+
+ return;
+}
+
static void cmd_cd(struct debugfs_context *ctx, int argc, char **argv)
{
struct ngnfs_dir_lookup_entry lent;
@@ -782,6 +853,8 @@ static struct command {
} commands[] = {
{ "bcreate", cmd_bcreate, },
{ "brename", cmd_brename, },
+ { "btree_info", cmd_btree_info, },
+ { "btree_pb", cmd_btree_pb, },
{ "cd", cmd_cd, },
{ "create", cmd_create, },
{ "getxattr", cmd_getxattr, },
diff --git a/shared/btree.c b/shared/btree.c
index b19b88f..db168e5 100644
--- a/shared/btree.c
+++ b/shared/btree.c
@@ -168,6 +168,486 @@ static u16 find_key_ind(struct ngnfs_btree_block *bt, struct ngnfs_btree_key *ke
return ind;
}
+static int cmp_ihdr_off(const void *A, const void *B, const void *priv)
+{
+ const struct ngnfs_btree_block *bt = priv;
+ const struct ngnfs_btree_item_header *a = &bt->ihdrs[*(u16 *)A];
+ const struct ngnfs_btree_item_header *b = &bt->ihdrs[*(u16 *)B];
+
+ return (int)le16_to_cpu(a->off) - (int)le16_to_cpu(b->off);
+}
+
+#define verify_printf(print, fmt, args...) \
+ if (print) dprintf(STDOUT_FILENO, fmt, ##args)
+
+static void print_key(int print, struct ngnfs_btree_key *key)
+{
+ verify_printf(print, "[%020llu %020llu %020llu]",
+ le64_to_cpu(key->k[0]), le64_to_cpu(key->k[1]), le64_to_cpu(key->k[2]));
+}
+
+/*
+ * Verify a btree block is internally consistent. If it is not, it
+ * returns -EUCLEAN. May also return -ENOMEM.
+ *
+ * Avoid using functions that may have BUG_ON() checking in them so that
+ * we can print out damaged btree blocks without crashing.
+ */
+static int do_verify_btree_block(struct ngnfs_fs_info *nfi, struct ngnfs_transaction *txn,
+ char *str, struct ngnfs_btree_block *bt, int print)
+{
+ struct ngnfs_btree_item *item;
+ struct ngnfs_block_ref *ref;
+ struct ngnfs_btree_key *prev;
+ u16 *inds = NULL;
+ u16 val_size, val_bytes;
+ u16 off, prev_end;
+ u16 item_bytes;
+ u16 ini_items;
+ u16 unz_items;
+ u16 uni_items;
+ u16 big_items;
+ u16 off_items;
+ u16 oor_items;
+ u16 ooo_items;
+ u16 nr_items;
+ u16 ind;
+ u16 i, j;
+ u8 c;
+ int ret;
+
+ ret = 0;
+
+ if (!bt) {
+ verify_printf(print, "verify btree block %s: ERROR: NULL btree block pointer\n",
+ str);
+ return -EUCLEAN;
+ }
+
+ verify_printf(print, "\n%s: verifying btree block\n\n", str);
+ verify_printf(print,
+ "bnr:\t\t%llu\n"
+ "level:\t\t%u\n"
+ "nr_items:\t%u\n"
+ "tail free:\t%u\n"
+ "total free:\t%u\n",
+ le64_to_cpu(bt->bnr),
+ bt->level,
+ le16_to_cpu(bt->nr_items),
+ le16_to_cpu(bt->tail_free),
+ le16_to_cpu(bt->total_free));
+
+ verify_printf(print, "bt->first:\t");
+ print_key(print, &bt->first);
+ verify_printf(print, "\nbt->last:\t");
+ print_key(print, &bt->last);
+ verify_printf(print, "\n\n");
+
+ nr_items = le16_to_cpu(bt->nr_items);
+
+ if (nr_items > NGNFS_BTREE_MAX_ITEMS) {
+ verify_printf(print, "ERROR: nr_items %u > %lu\n",
+ nr_items, NGNFS_BTREE_MAX_ITEMS);
+ ret = -EUCLEAN;
+ if (!print)
+ goto out;
+ }
+
+ ini_items = 0;
+ unz_items = 0;
+ uni_items = 0;
+ big_items = 0;
+ off_items = 0;
+ oor_items = 0;
+ ooo_items = 0;
+ prev = NULL;
+
+ for (ind = 0; ind < NGNFS_BTREE_MAX_ITEMS; ind++) {
+ off = le16_to_cpu(bt->ihdrs[ind].off);
+ val_size = le16_to_cpu(bt->ihdrs[ind].val_size);
+ item_bytes = aligned_item_size(val_size);
+
+ /* check that invalid item headers are zeroed */
+ if (ind >= nr_items) {
+ if (off || val_size) {
+ unz_items++;
+ verify_printf(print, "ERROR: ihdrs[%3u]: free ihdr not zeroed:"
+ " off %u val_size %u\n", ind, off, val_size);
+ ret = -EUCLEAN;
+ if (!print)
+ goto out;
+ }
+ continue;
+ }
+
+ /* check that item is initialized */
+ if (off == 0 && val_size == 0) {
+ verify_printf(print, "ERROR: ihdrs[%3u]: marked as in use but has zero "
+ "off and size (nr_items %u wrong?)\n", ind, nr_items);
+ uni_items++;
+ ret = -EUCLEAN;
+ if (!print)
+ goto out;
+ continue;
+ }
+
+ ini_items++;
+
+ /* check the item offset */
+ if (off < NGNFS_BLOCK_SIZE - NGNFS_BTREE_MAX_FREE ||
+ off + sizeof(struct ngnfs_btree_key) > NGNFS_BLOCK_SIZE) {
+ verify_printf(print, "ERROR: ihdrs[%3u]: invalid offset %u\n", ind, off);
+ off_items++;
+ ret = -EUCLEAN;
+ if (!print)
+ goto out;
+ continue;
+ }
+
+ /* check the item size */
+ if (off + item_bytes > NGNFS_BLOCK_SIZE) {
+ verify_printf(print, "ERROR: ihdrs[%3u]: item val_size too big: %u "
+ "(%u aligned)\n", ind, val_size, item_bytes);
+ big_items++;
+ ret = -EUCLEAN;
+ if (!print)
+ goto out;
+ continue;
+ }
+
+ /* safe to read the item now */
+ item = (void *)bt + off;
+
+ /* check range and order of keys */
+ if (compare_keys(&bt->first, &item->key) > 0 ||
+ compare_keys(&item->key, &bt->last) > 0) {
+ verify_printf(print, "ERROR: ihdrs[%3u]: key out of range\n", ind);
+ verify_printf(print, "bt->first key:\t");
+ print_key(print, &bt->first);
+ verify_printf(print, "\nkey:\t");
+ print_key(print, &item->key);
+ verify_printf(print, "\nbt->last key:\t");
+ print_key(print, &bt->last);
+ verify_printf(print, "\n");
+
+ oor_items++;
+ ret = -EUCLEAN;
+ if (!print)
+ goto out;
+
+ } else if (prev && compare_keys(prev, &item->key) != -1) {
+ verify_printf(print, "ERROR: ihdrs[%3u]: key <= prev key\n", ind);
+ verify_printf(print, "prev key:\t");
+ print_key(print, prev);
+ verify_printf(print, "\nkey:\t");
+ print_key(print, &item->key);
+ verify_printf(print, "\n");
+
+ ooo_items++;
+ ret = -EUCLEAN;
+ if (!print)
+ goto out;
+ }
+
+ prev = &item->key;
+
+ /* if not a leaf, print the child's block number */
+ if (bt->level) {
+ ref = (struct ngnfs_block_ref *) item->val;
+
+ if (val_size != sizeof(struct ngnfs_block_ref)) {
+ verify_printf(print, "ERROR: ihdrs[%3u]: "
+ "level %u wrong item value size for block ref %u\n",
+ ind, bt->level, val_size);
+ ret = -EUCLEAN;
+ if (!print)
+ goto out;
+
+ } else if (ref->bnr == 0) {
+ verify_printf(print, "ERROR: ihdrs[%3u]: "
+ "level %u block ref has invalid bnr %llu\n",
+ ind, bt->level, ref->bnr);
+ ret = -EUCLEAN;
+ if (!print)
+ goto out;
+
+ } else {
+ verify_printf(print, "ihdrs[%3u]: key\t", ind);
+ print_key(print, &item->key);
+ verify_printf(print, " lvl %u ref bnr %llu\n", bt->level,
+ le64_to_cpu(ref->bnr));
+ }
+ } else {
+ verify_printf(print, "ihdrs[%3u]: key\t", ind);
+ print_key(print, &item->key);
+ verify_printf(print, " off %4u size %4u\n", off, val_size);
+ }
+ }
+
+ if (ini_items != nr_items) {
+ verify_printf(print, "ERROR: bt->nr_items = %u but counted %u initialized items\n",
+ nr_items, ini_items);
+ ret = -EUCLEAN;
+ if (!print)
+ goto out;
+ }
+
+ /* check that the item values don't overlap and have zeroes between */
+ inds = kmalloc_array(nr_items, sizeof(inds[0]), GFP_NOFS);
+ if (!inds) {
+ verify_printf(print, "ERROR: couldn't allocate memory\n");
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ for (i = 0; i < nr_items; i++)
+ inds[i] = i;
+
+ sort_r(inds, nr_items, sizeof(inds[0]), cmp_ihdr_off, NULL, bt);
+
+ prev_end = NGNFS_BLOCK_SIZE - NGNFS_BTREE_MAX_FREE;
+
+ for (i = 0; i < nr_items; i++) {
+ ind = inds[i];
+ item = item_from_ind(bt, ind);
+ off = le16_to_cpu(bt->ihdrs[ind].off);
+ val_size = le16_to_cpu(bt->ihdrs[ind].val_size);
+ val_bytes = ALIGN(val_size, NGNFS_BTREE_ITEM_ALIGN);
+ item_bytes = aligned_item_size(val_size);
+
+ if (off < prev_end) {
+ if (off == 0) {
+ verify_printf(print, "ERROR: ihdrs[%3u].off is zero, ignoring\n",
+ ind);
+ ret = -EUCLEAN;
+ if (!print)
+ goto out;
+
+ } else if (off < NGNFS_BLOCK_SIZE - NGNFS_BTREE_MAX_FREE) {
+ verify_printf(print, "ERROR: ihdrs[%3u].off %4u overlaps btree "
+ "block header ending at %lu\n", ind, off,
+ NGNFS_BLOCK_SIZE - NGNFS_BTREE_MAX_FREE);
+ ret = -EUCLEAN;
+ if (!print)
+ goto out;
+
+ } else {
+ verify_printf(print, "ERROR: ihdrs[%3u].off %4u overlaps "
+ "previous item value(s) ending %4u\n",
+ ind, off, prev_end);
+ ret = -EUCLEAN;
+ if (!print)
+ goto out;
+ }
+ } else {
+ prev_end = off + item_bytes;
+ }
+
+ /* check that the item tail is zeroed */
+ for (j = val_size; j < val_bytes; j++) {
+ c = item->val[j];
+ if (c != 0) {
+ verify_printf(print, "ERROR: ihdrs[%3u] tail not zeroed: "
+ "byte %u = %0x\n", ind, j - val_size, c);
+ ret = -EUCLEAN;
+ if (!print)
+ goto out;
+ }
+ }
+
+ /* check that everything between this and previous item is zeroed */
+ for (j = prev_end; j < off; j++) {
+ c = *((char *)((void *)bt + j));
+ if (c == 0)
+ continue;
+
+ verify_printf(print, "ERROR: non-zero byte %0x at offset %u "
+ "between item[%u] offset %u ", c, j, ind, off);
+ if (prev_end == NGNFS_BLOCK_SIZE - NGNFS_BTREE_MAX_FREE) {
+ verify_printf(print, "and beginning of values %lu\n",
+ NGNFS_BLOCK_SIZE - NGNFS_BTREE_MAX_FREE);
+ } else {
+ verify_printf(print, "and end of previous item %u offset %u\n",
+ inds[ind - 1],
+ le16_to_cpu(bt->ihdrs[inds[ind - 1]].off) +
+ le16_to_cpu(bt->ihdrs[inds[ind - 1]].val_size));
+ }
+
+ ret = -EUCLEAN;
+ if (!print)
+ goto out;
+ break;
+ }
+ }
+
+ /* check that end of block is zeroed */
+ for (j = prev_end; j < NGNFS_BLOCK_SIZE; j++) {
+ c = *((char *)((void *)bt + j));
+ if (c == 0)
+ continue;
+
+ verify_printf(print, "ERROR: non-zero byte %0x at offset %u after end of data\n",
+ c, j);
+
+ ret = -EUCLEAN;
+ if (!print)
+ goto out;
+ break;
+ }
+
+out:
+ kfree(inds);
+
+ verify_printf(print, "\n");
+
+ if (ret == 0 || ret == -ENOMEM)
+ return ret;
+
+ verify_printf(print,
+ "\t%4u items in use in block header\n"
+ "\t%4u initialized items\n"
+ "\t%4u unzeroed items\n"
+ "\t%4u uninitialized items\n"
+ "\t%4u items with bad offset\n"
+ "\t%4u too big items\n"
+ "\t%4u out of range items\n"
+ "\t%4u out of order items\n",
+ nr_items,
+ ini_items,
+ unz_items,
+ uni_items,
+ off_items,
+ big_items,
+ oor_items,
+ ooo_items);
+
+ verify_printf(print, "\tERROR: btree block %s bnr %llu failed verification\n\n",
+ str, le64_to_cpu(bt->bnr));
+
+ return ret;
+}
+
+static void print_btree_block(struct ngnfs_fs_info *nfi, struct ngnfs_transaction *txn, char *str,
+ struct ngnfs_btree_block *bt)
+{
+ do_verify_btree_block(nfi, txn, str, bt, 1);
+}
+
+/*
+ * Verify the btree structure, but only print it out if there's an
+ * error. Returns the last error encountered.
+ */
+static int verify_btree_block(struct ngnfs_fs_info *nfi, struct ngnfs_transaction *txn, char *str,
+ struct ngnfs_btree_block *bt)
+{
+ int ret;
+
+ ret = do_verify_btree_block(nfi, txn, str, bt, 0);
+ if (ret)
+ ret = do_verify_btree_block(nfi, txn, str, bt, 1);
+
+ return ret;
+}
+
+/*
+ * Verify a parent/child set of btree blocks and the relationship
+ * between their keys. @key is in the range of the child block. @parent
+ * may be null. If @bt is null, it returns without checking anything.
+ * Returns -EUCLEAN if there is an inconsistency in the btree blocks.
+ * May also return -ENOMEM.
+ */
+__unused
+static int verify_btree_block_parent(struct ngnfs_fs_info *nfi, struct ngnfs_transaction *txn,
+ char *str, struct ngnfs_btree_block *bt,
+ struct ngnfs_btree_block *parent, struct ngnfs_btree_key *key)
+{
+ struct ngnfs_btree_key start;
+ struct ngnfs_btree_key end;
+ struct ngnfs_btree_item *item;
+ int ind;
+ int ret, parent_ret;
+
+ if (!bt)
+ return 0;
+
+ if (parent) {
+ parent_ret = verify_btree_block(nfi, txn, str, parent);
+
+ /* get expected first/last keys of child from parent */
+ ind = find_key_ind(parent, key);
+
+ if (ind >= le16_to_cpu(parent->nr_items)) {
+ verify_printf(1, "ERROR: key >= parent's last key\n");
+ verify_printf(1, "key:\t");
+ print_key(1, key);
+ verify_printf(1, "\nlast:\t");
+ print_key(1, &parent->last);
+ verify_printf(1, "\n");
+ parent_ret = -EUCLEAN;
+ goto out;
+ }
+
+ item = item_from_ind(parent, ind);
+
+ if (ind == 0) {
+ ngnfs_btree_key_set_min(&start);
+ end = item->key;
+
+ } else if (ind == le16_to_cpu(parent->nr_items)) {
+ start = item->key;
+ ngnfs_btree_key_set_max(&end);
+
+ } else {
+ end = item->key;
+ item = item_from_ind(parent, ind - 1);
+ start = item->key;
+ ngnfs_btree_key_inc(&start);
+ }
+
+ if (compare_keys(&bt->first, &start) || compare_keys(&bt->last, &end)) {
+ verify_printf(1, "\tERROR: parent's start/end keys inconsistent with "
+ "child block's first/last keys\n");
+ verify_printf(1, "first:\t");
+ print_key(1, &bt->first);
+ verify_printf(1, "\nstart:\t");
+ print_key(1, &start);
+ verify_printf(1, "\nlast:\t");
+ print_key(1, &bt->last);
+ verify_printf(1, "\nend:\t");
+ print_key(1, &end);
+ verify_printf(1, "\n");
+ parent_ret = -EUCLEAN;
+ goto out;
+ }
+ } else {
+ parent_ret = 0;
+ }
+out:
+ ret = verify_btree_block(nfi, txn, str, bt);
+
+ return parent_ret ?: ret;
+}
+
+int ngnfs_print_btree_block(struct ngnfs_fs_info *nfi, u64 bnr, char *str)
+{
+ struct ngnfs_transaction txn;
+ struct ngnfs_btree_block *bt;
+ int ret;
+
+ ngnfs_txn_init(&txn);
+
+ do {
+ ret = ngnfs_txn_get_block(nfi, &txn, bnr, NBF_READ, NULL, (void **)&bt);
+ if (ret == 0)
+ print_btree_block(nfi, &txn, str, bt);
+
+ } while (ngnfs_txn_retry(nfi, &txn, &ret));
+
+ ngnfs_txn_teardown(nfi, &txn);
+
+ return ret;
+}
+
/*
* True if there's room in the block for an insertion or replacement,
* just not contiguously available at the tail of the block. The caller
@@ -390,15 +870,6 @@ static struct ngnfs_btree_item *replace_item(struct ngnfs_txn_block *tblk,
return item;
}
-static int cmp_ihdr_off(const void *A, const void *B, const void *priv)
-{
- const struct ngnfs_btree_block *bt = priv;
- const struct ngnfs_btree_item_header *a = &bt->ihdrs[*(u16 *)A];
- const struct ngnfs_btree_item_header *b = &bt->ihdrs[*(u16 *)B];
-
- return (int)le16_to_cpu(a->off) - (int)le16_to_cpu(b->off);
-}
-
/*
* Defragment internal free space by moving all the items towards the
* front of the block, gathering all free space to the end. We sort the
diff --git a/shared/btree.h b/shared/btree.h
index 36e6bc6..7d2a325 100644
--- a/shared/btree.h
+++ b/shared/btree.h
@@ -86,4 +86,6 @@ int ngnfs_btree_write_iter(struct ngnfs_fs_info *nfi, struct ngnfs_transaction *
struct ngnfs_btree_key *key, struct ngnfs_btree_key *last,
ngnfs_btree_write_iter_fn_t iter, void *iter_arg);
+int ngnfs_print_btree_block(struct ngnfs_fs_info *nfi, u64 bnr, char *str);
+
#endif
--
2.49.0
More information about the ngnfs-devel
mailing list