[PATCH v3 00/30] maple_tree: Replace big node with maple copy

Andrew Morton akpm at linux-foundation.org
Sat Jan 31 12:27:24 PST 2026


On Fri, 30 Jan 2026 15:59:05 -0500 "Liam R. Howlett" <Liam.Howlett at oracle.com> wrote:

> The big node struct was created for simplicity of splitting,
> rebalancing, and spanning store operations by using a copy buffer to
> create the data necessary prior to breaking it up into 256B nodes.
> Certain operations were rather tricky due to the restriction of keeping
> NULL entries together and never at the end of a node (except the
> right-most node).
> 

Updated, thanks.

What are your thoughts on adding this to 6.19?  I'd expect to move it
into mm-stable Feb 17ish.

> Changes since v2:
>  - Whitespace fix in rebalance_data()
>  - Added ma_init_slot() for cleaner casting during RCU_INIT_POINTER().
>    Thanks test robot & SK [1]
>  - Fixed off-by-one in rebalance_data() which caused node overuse,
>    reported by linux-next. Thanks Mark [2]
>  - Added a new patch with reproducer test cases for userspace testing of
>    the rebalance_data() off-by-one

Below is how v3 altered mm.git:

--- a/lib/maple_tree.c~b
+++ a/lib/maple_tree.c
@@ -314,6 +314,13 @@ static inline struct maple_enode *mt_mk_
 			(type << MAPLE_ENODE_TYPE_SHIFT) | MAPLE_ENODE_NULL);
 }
 
+static inline void ma_init_slot(void __rcu **slot, const struct maple_node *mn,
+				const enum maple_type mt)
+{
+	/* WARNING: this is unsafe if the slot is exposed to readers. */
+	RCU_INIT_POINTER(*slot, (void *)mt_mk_node(mn, mt));
+}
+
 static inline void *mte_mk_root(const struct maple_enode *node)
 {
 	return (void *)((unsigned long)node | MAPLE_ROOT_NODE);
@@ -2231,7 +2238,7 @@ static inline void rebalance_data(struct
 {
 	cp_data_calc(cp, wr_mas, wr_mas);
 	sib->end = 0;
-	if (cp->data >= mt_slots[wr_mas->type]) {
+	if (cp->data > mt_slots[wr_mas->type]) {
 		push_data_sib(cp, wr_mas->mas, sib, parent);
 		if (sib->end)
 			goto use_sib;
@@ -2246,7 +2253,8 @@ static inline void rebalance_data(struct
 	return;
 
 use_sib:
-		cp->data += sib->end + 1;
+
+	cp->data += sib->end + 1;
 }
 
 /*
@@ -2553,7 +2561,7 @@ static inline void cp_dst_to_slots(struc
 		 * read-side operations that can view them until they are
 		 * inserted into the tree after an rcu_assign_pointer() call.
 		 */
-		RCU_INIT_POINTER(cp->slot[d], (void *)mt_mk_node(mn, mt));
+		ma_init_slot(&cp->slot[d], mn, mt);
 		cp->pivot[d] = slot_max;
 		if (mt_is_alloc(mas->tree)) {
 			if (ma_is_leaf(mt)) {
@@ -2603,8 +2611,7 @@ static inline bool cp_is_new_root(struct
 		 * read-side operations that can view it until it is insert into
 		 * the tree after an rcu_assign_pointer() call.
 		 */
-		RCU_INIT_POINTER(cp->slot[0],
-				 (void *)mt_mk_node(cp->dst[0].node, mt));
+		RCU_INIT_POINTER(cp->slot[0], mt_mk_node(cp->dst[0].node, mt));
 		cp->height++;
 	}
 	WARN_ON_ONCE(cp->dst[0].node != mte_to_node(
--- a/tools/testing/radix-tree/maple.c~b
+++ a/tools/testing/radix-tree/maple.c
@@ -35899,6 +35899,127 @@ unlock:
 	return ret;
 }
 
+static noinline void __init check_erase_rebalance(struct maple_tree *mt)
+{
+	unsigned long val;
+	void *enode;
+	int ret;
+
+	MA_STATE(mas, mt, 0, 0);
+
+	/*
+	 * During removal of a big node, the rebalance started going too high,
+	 * which resulted in too many nodes trying to be used.
+	 *
+	 * Create a rebalance which results in an exactly full parent (0-9) that
+	 * does not need to be rebalanced.  This requires two full levels,
+	 * followed by an insufficient level which will be rebalanced into two
+	 * nodes, finally leaves that need to be rebalanced into one node.
+	 *
+	 * The buggy tree:
+	 * root    4      Label     R
+	 *        /\               /\
+	 *       9   X            F
+	 *      /\   /\          /
+	 *     9   X            E
+	 *    /\   /\          /\
+	 *   4  8             C  D
+	 *  /\               /\
+	 * 6  9             A  B
+	 * ^ becomes 5 with the write.
+	 *
+	 * Below, the reconstruction leaves the root with 2 entries, the setup
+	 * uses the letter labels above.
+	 */
+
+	ret = build_full_tree(mt, MT_FLAGS_ALLOC_RANGE, 4);
+	MT_BUG_ON(mt, ret);
+
+	/* Cheap expansion to 5 levels */
+	mtree_store(mt, ULONG_MAX, xa_mk_value(0), GFP_KERNEL);
+	/* rcu is used to ensure node use */
+	mt_set_in_rcu(mt);
+	mas_lock(&mas);
+
+	/* Node A had 6 entries */
+	mas_walk(&mas);
+	MAS_BUG_ON(&mas, mas_data_end(&mas) < 6);
+	while (mas_data_end(&mas) > 6) {
+		mas_erase(&mas);
+		mas_next(&mas, ULONG_MAX);
+	}
+
+	/* Move to Node B */
+	enode = (void*) mas.node;
+	while (mas.node == enode)
+		mas_next(&mas, ULONG_MAX);
+
+	/* Node B had 9 entries */
+	MAS_BUG_ON(&mas, mas_data_end(&mas) < 9);
+	while (mas_data_end(&mas) > 9) {
+		mas_erase(&mas);
+		mas_next(&mas, ULONG_MAX);
+	}
+
+	/* Move to Node C */
+	mas_ascend(&mas);
+	val = mas.max;
+	/* Adjust entries to be 4 */
+	while (mas_data_end(&mas) > 4) {
+		mas_set(&mas, val);
+		mas_erase(&mas);
+		mas_prev(&mas, 0);
+		val = mas.index;
+		mas_ascend(&mas);
+	}
+
+	/* Move to Node D */
+	mas_ascend(&mas);
+	mas.offset = 1;
+	mas_descend(&mas);
+	val = mas.max;
+	/* Adjust entries to be 8 */
+	while (mas_data_end(&mas) < 8) {
+		mas_set(&mas, val--);
+		mas_store_gfp(&mas, &mas, GFP_KERNEL);
+		mas_ascend(&mas);
+	}
+
+	/* Move to Node E */
+	mas_ascend(&mas);
+	val = mas.max;
+	MAS_BUG_ON(&mas, mas_data_end(&mas) > 9);
+	/* Adjust Node E to 9 entries */
+	while (mas_data_end(&mas) < 9) {
+		mas_set(&mas, val--);
+		mas_store_gfp(&mas, &mas, GFP_KERNEL);
+		mas_ascend(&mas);
+		mas_ascend(&mas);
+	}
+
+	/* Move to Node F */
+	mas_ascend(&mas);
+	val = mas.max;
+	MAS_BUG_ON(&mas, mas_data_end(&mas) > 9);
+	/* Adjust Node F to 9 entries */
+	while (mas_data_end(&mas) < 9) {
+		mas_set(&mas, val--);
+		mas_store_gfp(&mas, &mas, GFP_KERNEL);
+		mas_ascend(&mas);
+		mas_ascend(&mas);
+		mas_ascend(&mas);
+	}
+
+	/* Test is set up, walk to first entry */
+	mas_set(&mas, 0);
+	mas_next(&mas, ULONG_MAX);
+	/* overwrite the entry to cause a rebalance, which was 1 too few */
+	mas_set_range(&mas, 0, mas.last);
+	mas_preallocate(&mas, NULL, GFP_KERNEL);
+	mas_store_prealloc(&mas, NULL);
+	mas_unlock(&mas);
+}
+
 static noinline void __init check_mtree_dup(struct maple_tree *mt)
 {
 	DEFINE_MTREE(new);
@@ -36260,6 +36381,10 @@ void farmer_tests(void)
 	check_mtree_dup(&tree);
 	mtree_destroy(&tree);
 
+	mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE);
+	check_erase_rebalance(&tree);
+	mtree_destroy(&tree);
+
 	/* RCU testing */
 	mt_init_flags(&tree, 0);
 	check_erase_testset(&tree);
_




More information about the maple-tree mailing list