[PATCH v3 00/30] maple_tree: Replace big node with maple copy
Andrew Morton
akpm at linux-foundation.org
Sat Jan 31 12:27:24 PST 2026
On Fri, 30 Jan 2026 15:59:05 -0500 "Liam R. Howlett" <Liam.Howlett at oracle.com> wrote:
> The big node struct was created for simplicity of splitting,
> rebalancing, and spanning store operations by using a copy buffer to
> create the data necessary prior to breaking it up into 256B nodes.
> Certain operations were rather tricky due to the restriction of keeping
> NULL entries together and never at the end of a node (except the
> right-most node).
>
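As a toy illustration of the invariant described above (adjacent NULL ranges coalesced into a single slot, and a NULL only allowed to end the right-most node), a check over a node's occupied slots might look like the sketch below. The function and parameter names are hypothetical; this is not maple_tree code, just one reading of the rule.

#include <stdbool.h>

/*
 * Toy sketch, not maple_tree code: slots[0..end] models a node's entries
 * and @rightmost says whether the node is the right-most at its level.
 */
static bool node_nulls_ok(void **slots, unsigned char end, bool rightmost)
{
	unsigned int i;

	/* NULL ranges must be kept together (coalesced into one slot). */
	for (i = 1; i <= end; i++)
		if (!slots[i] && !slots[i - 1])
			return false;

	/* A NULL may only end a node when that node is the right-most. */
	if (!slots[end] && !rightmost)
		return false;

	return true;
}
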
Updated, thanks.
What are your thoughts on adding this to 6.19? I'd expect to move it
into mm-stable Feb 17ish.
> Changes since v2:
> - Whitespace fix in rebalance_data()
> - Added ma_init_slot() for cleaner casting during RCU_INIT_POINTER().
> Thanks test robot & SK [1]
> - Fixed off-by-one in rebalance_data() which caused node overuse,
> reported by linux-next. Thanks Mark [2]
> - Added a new patch with a test case to the userspace testing to
> reproduce the rebalance_data() off-by-one
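Judging by the hunk below (">=" became ">"), the rebalance_data() off-by-one was a capacity check that spilled an exactly full node into a sibling. As a standalone illustration of that boundary, with SLOT_CAPACITY and needs_extra_node() made up for the example in place of mt_slots[] and the real code:

#include <assert.h>

#define SLOT_CAPACITY	10	/* hypothetical stand-in for mt_slots[type] */

/* Return nonzero when @data cannot fit in a single node and must spill. */
static int needs_extra_node(unsigned int data)
{
	return data > SLOT_CAPACITY;	/* a ">=" here would spill an exactly-full node */
}

int main(void)
{
	assert(!needs_extra_node(SLOT_CAPACITY));	/* exactly full: fits in one node */
	assert(needs_extra_node(SLOT_CAPACITY + 1));	/* one more: needs a sibling */
	return 0;
}
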
Below is how v3 altered mm.git:
--- a/lib/maple_tree.c~b
+++ a/lib/maple_tree.c
@@ -314,6 +314,13 @@ static inline struct maple_enode *mt_mk_
(type << MAPLE_ENODE_TYPE_SHIFT) | MAPLE_ENODE_NULL);
}
+static inline void ma_init_slot(void __rcu **slot, const struct maple_node *mn,
+ const enum maple_type mt)
+{
+ /* WARNING: this is unsafe if the slot is exposed to readers. */
+ RCU_INIT_POINTER(*slot, (void *)mt_mk_node(mn, mt));
+}
+
static inline void *mte_mk_root(const struct maple_enode *node)
{
return (void *)((unsigned long)node | MAPLE_ROOT_NODE);
@@ -2231,7 +2238,7 @@ static inline void rebalance_data(struct
{
cp_data_calc(cp, wr_mas, wr_mas);
sib->end = 0;
- if (cp->data >= mt_slots[wr_mas->type]) {
+ if (cp->data > mt_slots[wr_mas->type]) {
push_data_sib(cp, wr_mas->mas, sib, parent);
if (sib->end)
goto use_sib;
@@ -2246,7 +2253,8 @@ static inline void rebalance_data(struct
return;
use_sib:
- cp->data += sib->end + 1;
+
+ cp->data += sib->end + 1;
}
/*
@@ -2553,7 +2561,7 @@ static inline void cp_dst_to_slots(struc
* read-side operations that can view them until they are
* inserted into the tree after an rcu_assign_pointer() call.
*/
- RCU_INIT_POINTER(cp->slot[d], (void *)mt_mk_node(mn, mt));
+ ma_init_slot(&cp->slot[d], mn, mt);
cp->pivot[d] = slot_max;
if (mt_is_alloc(mas->tree)) {
if (ma_is_leaf(mt)) {
@@ -2603,8 +2611,7 @@ static inline bool cp_is_new_root(struct
* read-side operations that can view it until it is insert into
* the tree after an rcu_assign_pointer() call.
*/
- RCU_INIT_POINTER(cp->slot[0],
- (void *)mt_mk_node(cp->dst[0].node, mt));
+ RCU_INIT_POINTER(cp->slot[0], mt_mk_node(cp->dst[0].node, mt));
cp->height++;
}
WARN_ON_ONCE(cp->dst[0].node != mte_to_node(
--- a/tools/testing/radix-tree/maple.c~b
+++ a/tools/testing/radix-tree/maple.c
@@ -35899,6 +35899,127 @@ unlock:
return ret;
}
+static noinline void __init check_erase_rebalance(struct maple_tree *mt)
+{
+ unsigned long val;
+ void *enode;
+ int ret;
+
+ MA_STATE(mas, mt, 0, 0);
+
+ /*
+ * During removal of the big node, the rebalance started going too high,
+ * which resulted in trying to use too many nodes.
+ *
+ * Create a rebalance which results in an exactly full parent (0-9) that
+ * does not need to be rebalanced. This requires two full levels,
+ * followed by an insufficient level which will be rebalanced into two
+ * nodes, and finally leaves that need to be rebalanced into one node.
+ *
+ * The bug's tree:
+ * root 4 Label R
+ * /\ /\
+ * 9 X F
+ * /\ /\ /
+ * 9 X E
+ * /\ /\ /\
+ * 4 8 C D
+ * /\ /\
+ * 6 9 A B
+ * ^ becomes 5 with the write.
+ *
+ * Below, the reconstruction leaves the root with 2 entries; the setup
+ * uses the letter labels above.
+ */
+
+ ret = build_full_tree(mt, MT_FLAGS_ALLOC_RANGE, 4);
+ MT_BUG_ON(mt, ret);
+
+ /* Cheap expansion to 5 levels */
+ mtree_store(mt, ULONG_MAX, xa_mk_value(0), GFP_KERNEL);
+ /* RCU mode is enabled so nodes are replaced, not reused in place */
+ mt_set_in_rcu(mt);
+ mas_lock(&mas);
+
+ /* Node A had 6 entries */
+ mas_walk(&mas);
+ MAS_BUG_ON(&mas, mas_data_end(&mas) < 6);
+ while (mas_data_end(&mas) > 6) {
+ mas_erase(&mas);
+ mas_next(&mas, ULONG_MAX);
+ }
+
+ /* Move to Node B */
+ enode = (void *)mas.node;
+ while (mas.node == enode)
+ mas_next(&mas, ULONG_MAX);
+
+ /* Node B had 9 entries */
+ MAS_BUG_ON(&mas, mas_data_end(&mas) < 9);
+ while (mas_data_end(&mas) > 9) {
+ mas_erase(&mas);
+ mas_next(&mas, ULONG_MAX);
+ }
+
+ /* Move to Node C */
+ mas_ascend(&mas);
+ val = mas.max;
+ /* Adjust entries to be 4 */
+ while (mas_data_end(&mas) > 4) {
+ mas_set(&mas, val);
+ mas_erase(&mas);
+ mas_prev(&mas, 0);
+ val = mas.index;
+ mas_ascend(&mas);
+ }
+
+ /* Move to Node D */
+ mas_ascend(&mas);
+ mas.offset = 1;
+ mas_descend(&mas);
+ val = mas.max;
+ /* Adjust entries to be 8 */
+ while (mas_data_end(&mas) < 8) {
+ mas_set(&mas, val--);
+ mas_store_gfp(&mas, &mas, GFP_KERNEL);
+ mas_ascend(&mas);
+ }
+
+ /* Move to Node E */
+ mas_ascend(&mas);
+ val = mas.max;
+ MAS_BUG_ON(&mas, mas_data_end(&mas) > 9);
+ /* Adjust Node E to 9 entries */
+ while (mas_data_end(&mas) < 9) {
+ mas_set(&mas, val--);
+ mas_store_gfp(&mas, &mas, GFP_KERNEL);
+ mas_ascend(&mas);
+ mas_ascend(&mas);
+ }
+
+ /* Move to Node F */
+ mas_ascend(&mas);
+ val = mas.max;
+ MAS_BUG_ON(&mas, mas_data_end(&mas) > 9);
+ /* Adjust Node F to 9 entries */
+ while (mas_data_end(&mas) < 9) {
+ mas_set(&mas, val--);
+ mas_store_gfp(&mas, &mas, GFP_KERNEL);
+ mas_ascend(&mas);
+ mas_ascend(&mas);
+ mas_ascend(&mas);
+ }
+
+ /* Test is set up, walk to first entry */
+ mas_set(&mas, 0);
+ mas_next(&mas, ULONG_MAX);
+ /* Overwrite the entry to cause the rebalance that was 1 node short */
+ mas_set_range(&mas, 0, mas.last);
+ mas_preallocate(&mas, NULL, GFP_KERNEL);
+ mas_store_prealloc(&mas, NULL);
+ mas_unlock(&mas);
+}
+
static noinline void __init check_mtree_dup(struct maple_tree *mt)
{
DEFINE_MTREE(new);
@@ -36260,6 +36381,10 @@ void farmer_tests(void)
check_mtree_dup(&tree);
mtree_destroy(&tree);
+ mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE);
+ check_erase_rebalance(&tree);
+ mtree_destroy(&tree);
+
/* RCU testing */
mt_init_flags(&tree, 0);
check_erase_testset(&tree);
_
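The WARNING added in ma_init_slot() and the comments kept in cp_dst_to_slots() both rest on the usual RCU publication rule: RCU_INIT_POINTER() is only safe while the pointed-to memory is still private to the writer, and the later rcu_assign_pointer() supplies the ordering that makes it visible to readers. A minimal kernel-style sketch of that pattern, with demo_node, demo_root and demo_publish() made up for the example (only the two RCU macros are real APIs):

#include <linux/errno.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct demo_node {
	void __rcu *slot;
};

static struct demo_node __rcu *demo_root;

static int demo_publish(void *entry)
{
	struct demo_node *node = kzalloc(sizeof(*node), GFP_KERNEL);

	if (!node)
		return -ENOMEM;

	/* Still private to this thread: no ordering needed, so no barrier. */
	RCU_INIT_POINTER(node->slot, entry);

	/* Publication point: rcu_assign_pointer() orders the store above first. */
	rcu_assign_pointer(demo_root, node);
	return 0;
}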