[patch 10/15] fs/logfs/memtree.c

Tue Apr 1 14:13:08 EDT 2008

--- /dev/null	2008-04-02 16:29:12.813336657 +0200
+++ linux-2.6.24logfs/fs/logfs/memtree.c	2008-04-01 21:43:14.593326689 +0200
@@ -0,0 +1,405 @@
+/*
+ * fs/logfs/memtree.c	- Simple In-memory B+Tree
+ *
+ * As should be obvious for Linux kernel code, license is GPLv2
+ *
+ * Copyright (c) 2007 Joern Engel <joern at logfs.org>
+ *
+ *
+ * This could possibly get moved to lib/.
+ *
+ * A relatively simple B+Tree implementation.  I have written it as a learning
+ * excercise to understand how B+Trees work.  Turned out to be useful as well.
+ *
+ * B+Trees can be used similar to Linux radix trees (which don't have anything
+ * in common with textbook radix trees, beware).  Prerequisite for them working
+ * well is that access to a random tree node is much faster than a large number
+ * of operations within each node.
+ *
+ * Disks have fulfilled the prerequite for a long time.  More recently DRAM
+ * has gained similar properties, as memory access times, when measured in cpu
+ * cycles, have increased.  Cacheline sizes have increased as well, which also
+ * helps B+Trees.
+ *
+ * Compared to radix trees, B+Trees are more efficient when dealing with a
+ * sparsely populated address space.  Between 25% and 50% of the memory is
+ * occupied with valid pointers.  When densely populated, radix trees contain
+ * ~98% pointers - hard to beat.  Very sparse radix trees contain only ~2%
+ * pointers.
+ *
+ * This particular implementation stores pointers identified by a long value.
+ * Storing NULL pointers is illegal, lookup will return NULL when no entry
+ * was found.
+ *
+ * Two tricks were used that are not commonly found in textbooks.  First, the
+ * lowest values are to the right, not to the left.  All used slots within a
+ * node are on the left, all unused slots contain NUL values.  Most operations
+ * simply loop once over all slots and terminate on the first NUL.
+ *
+ * Second trick is to special-case the key "0" or NUL.  As seen above, this
+ * value indicates an unused slot, so such a value should not be stored in the
+ * tree itself.  Instead it is stored in the null_ptr field in the btree_head.
+ */
+/* FIXME: use mempool for allocations */
+#include "logfs.h"
+
+/*
+ * Prerequisite of B+Trees performing well is that node lookup is much slower
+ * than a large number of operations within a node.  That can be true if the
+ * node size is identical to cacheline size.  All that is highly
+ * machine-dependent, just like the #define below is not.
+ *
+ * Patches to do something smarter are welcome.  Just beware that too small
+ * node with less than 8 slots have a bad fan-out and won't perform well
+ * either.
+ */
+#if BITS_PER_LONG == 32
+#define BTREE_NODES 20	/* 32bit, 240 byte nodes */
+#else
+#define BTREE_NODES 16	/* 64bit, 256 byte nodes */
+#endif
+
+struct btree_node {
+	u64 key;
+	struct btree_node *node;
+};
+
+void btree_init(struct btree_head *head)
+{
+	head->node = NULL;
+	head->height = 0;
+	head->null_ptr = NULL;
+}
+
+#if 0
+static void __dump_tree(struct btree_node *node, int height)
+{
+	int i;
+
+	if (!height)
+		return;
+
+	printk(KERN_DEBUG"%p ", node);
+	for (i = 0; i < BTREE_NODES; i++)
+		printk("(%llx,%p) ", node[i].key, node[i].node);
+	printk("\n");
+
+	for (i = 0; i < BTREE_NODES; i++)
+		if (node[i].key)
+			__dump_tree(node[i].node, height-1);
+}
+
+static void dump_tree(struct btree_head *head)
+{
+	printk(KERN_DEBUG"%p\n", head->null_ptr);
+	__dump_tree(head->node, head->height);
+}
+#endif
+
+static u64 btree_last(struct btree_head *head)
+{
+	int height = head->height;
+	struct btree_node *node = head->node;
+
+	if (height == 0)
+		return 0;
+
+	for ( ; height > 1; height--)
+		node = node[0].node;
+
+	return node[0].key;
+}
+
+void *btree_lookup(struct btree_head *head, u64 key)
+{
+	int i, height = head->height;
+	struct btree_node *node = head->node;
+
+	if (key == 0)
+		return head->null_ptr;
+
+	if (height == 0)
+		return NULL;
+
+	for ( ; height > 1; height--) {
+		for (i = 0; i < BTREE_NODES; i++)
+			if (node[i].key <= key)
+				break;
+		node = node[i].node;
+		if (!node)
+			return NULL;
+	}
+
+	if (!node)
+		return NULL;
+
+	for (i = 0; i < BTREE_NODES; i++)
+		if (node[i].key == key)
+			return node[i].node;
+
+	return NULL;
+}
+
+/*
+ * Returns two values:
+ * pos - the position of the first slot equal or less than key
+ * fill - the number of positions filled with any value
+ */
+static void find_pos(struct btree_node *node, u64 key, int *pos, int *fill)
+{
+	int i;
+
+	for (i = 0; i < BTREE_NODES; i++)
+		if (node[i].key <= key)
+			break;
+	*pos = i;
+	for (i = *pos; i < BTREE_NODES; i++)
+		if (node[i].key == 0)
+			break;
+	*fill = i;
+}
+
+/*
+ * locate the correct leaf node in the btree
+ */
+static struct btree_node *find_level(struct btree_head *head, u64 key,
+		int level)
+{
+	struct btree_node *node = head->node;
+	int i, height;
+
+	for (height = head->height; height > level; height--) {
+		for (i = 0; i < BTREE_NODES; i++)
+			if (node[i].key <= key)
+				break;
+
+		if ((i == BTREE_NODES) || !node[i].key) {
+			/* right-most key is too large, update it */
+			i--;
+			node[i].key = key;
+		}
+		BUG_ON(i < 0);
+		node = node[i].node;
+	}
+	BUG_ON(!node);
+	return node;
+}
+
+static int btree_grow(struct btree_head *head)
+{
+	struct btree_node *node;
+
+	node = kcalloc(BTREE_NODES, sizeof(*node), GFP_KERNEL);
+	BUG_ON(!node);
+	if (!node)
+		return -ENOMEM;
+	if (head->node) {
+		node->key = head->node[BTREE_NODES-1].key;
+		node->node = head->node;
+	}
+	head->node = node;
+	head->height++;
+	return 0;
+}
+
+static int btree_insert_level(struct btree_head *head, u64 key, void *ptr,
+		int level)
+{
+	struct btree_node *node;
+	int i, pos, fill, err;
+
+	BUG_ON(!ptr);
+	if (key == 0) {
+		/* 0 identifies empty slots, so special-case this */
+		BUG_ON(level != 1);
+		head->null_ptr = ptr;
+		return 0;
+	}
+
+	if (head->height < level) {
+		err = btree_grow(head);
+		if (err)
+			return err;
+	}
+
+retry:
+	node = find_level(head, key, level);
+	find_pos(node, key, &pos, &fill);
+	BUG_ON(node[pos].key == key);
+
+	if (fill == BTREE_NODES) {
+		/* need to split node */
+		struct btree_node *new;
+
+		new = kcalloc(BTREE_NODES, sizeof(*node), GFP_KERNEL);
+		BUG_ON(!new);
+		if (!new)
+			return -ENOMEM;
+		err = btree_insert_level(head, node[BTREE_NODES/2 - 1].key, new,
+				level+1);
+		if (err) {
+			kfree(new);
+			return err;
+		}
+		for (i = 0; i < BTREE_NODES / 2; i++) {
+			new[i].key = node[i].key;
+			new[i].node = node[i].node;
+			node[i].key = node[i + BTREE_NODES/2].key;
+			node[i].node = node[i + BTREE_NODES/2].node;
+			node[i + BTREE_NODES/2].key = 0;
+			node[i + BTREE_NODES/2].node = NULL;
+		}
+		goto retry;
+	}
+	BUG_ON(fill >= BTREE_NODES);
+
+	/* shift and insert */
+	for (i = fill; i > pos; i--) {
+		node[i].key = node[i-1].key;
+		node[i].node = node[i-1].node;
+	}
+	node[pos].key = key;
+	node[pos].node = ptr;
+
+	return 0;
+}
+
+int btree_insert(struct btree_head *head, u64 key, void *ptr)
+{
+	BUG_ON(!ptr);
+	return btree_insert_level(head, key, ptr, 1);
+}
+
+static void *btree_remove_level(struct btree_head *head, u64 key, int level)
+{
+	struct btree_node *node;
+	int i, pos, fill;
+	void *ret;
+
+	if (level > head->height) {
+		/* we recursed all the way up */
+		head->height = 0;
+		head->node = NULL;
+		return NULL;
+	}
+
+	node = find_level(head, key, level);
+	find_pos(node, key, &pos, &fill);
+	if ((level == 1) && (node[pos].key != key))
+		return NULL;
+	ret = node[pos].node;
+
+	/* remove and shift */
+	for (i = pos; i < fill-1; i++) {
+		node[i].key = node[i+1].key;
+		node[i].node = node[i+1].node;
+	}
+	node[fill-1].key = 0;
+	node[fill-1].node = NULL;
+
+	if (fill-1 < BTREE_NODES/2) {
+		/*
+		 * At this point there *should* be code to either merge with
+		 * a neighboring node or steal some entries from it to preserve
+		 * the btree invariant of only having nodes with n/2..n
+		 * elements.
+		 *
+		 * As you can see, that code is left as an excercise to the
+		 * reader or anyone noticing severe performance problems in
+		 * very rare cases.
+		 *
+		 * As-is this code "implements" a method called lazy deletion,
+		 * which according to text books is relatively common in
+		 * databases and usually works quite well.
+		 * Not so usually, the btree can degrade into very long lists
+		 * of 1-element nodes and perform accordingly.
+		 */
+	}
+	if (fill-1 == 0) {
+		btree_remove_level(head, key, level+1);
+		kfree(node);
+	}
+
+	return ret;
+}
+
+void *btree_remove(struct btree_head *head, u64 key)
+{
+	void *ret;
+
+	if (key == 0) {
+		/* 0 identifies empty slots, so special-case this */
+		ret = head->null_ptr;
+		head->null_ptr = NULL;
+		return ret;
+	}
+	if (head->height == 0)
+		return NULL;
+
+	return btree_remove_level(head, key, 1);
+}
+
+int btree_merge(struct btree_head *target, struct btree_head *victim)
+{
+	struct btree_node *node;
+	u64 key;
+	int err;
+
+	BUG_ON(target == victim);
+
+	if (!(target->node || target->null_ptr)) {
+		/* target is empty, just copy fields over */
+		target->null_ptr = victim->null_ptr;
+		target->node = victim->node;
+		target->height = victim->height;
+		btree_init(victim);
+		return 0;
+	}
+
+	for (;;) {
+		key = btree_last(victim);
+		node = btree_remove(victim, key);
+		if (!node)
+			break;
+		err = btree_insert(target, key, node);
+		if (err)
+			return err;
+	}
+	return 0;
+}
+
+static void __btree_for_each(struct btree_node *node, long opaque,
+		void (*func)(void *elem, long opaque, u64 key),  int reap,
+		int height)
+{
+	int i;
+
+	for (i = 0; i < BTREE_NODES && node[i].key; i++) {
+		if (height > 1)
+			__btree_for_each(node[i].node, opaque, func, reap,
+					height-1);
+		else
+			func(node[i].node, opaque, node[i].key);
+	}
+	if (reap)
+		kfree(node);
+}
+
+void btree_visitor(struct btree_head *head, long opaque,
+		void (*func)(void *elem, long opaque, u64 key))
+{
+	if (head->node)
+		__btree_for_each(head->node, opaque, func, 0, head->height);
+	if (head->null_ptr)
+		func(head->null_ptr, opaque, 0);
+}
+
+void btree_grim_visitor(struct btree_head *head, long opaque,
+		void (*func)(void *elem, long opaque, u64 key))
+{
+	if (head->node)
+		__btree_for_each(head->node, opaque, func, 1, head->height);
+	if (head->null_ptr)
+		func(head->null_ptr, opaque, 0);
+	btree_init(head);
+}