[PATCH 1/1] [Bug] list_add corruption during CPU hotplug stress test

Kuyo Chang kuyo.chang at mediatek.com
Tue May 20 23:42:18 PDT 2025


From: kuyo chang <kuyo.chang at mediatek.com>

The commit (stop_machine, sched: Fix migrate_swap() vs. active_balance() deadlock)
introduces wake_q_add rather than wake_up_process.
However, it encounters sporadic failures during CPU hotplug stress test.

The kernel log shows list add fail as below
kmemleak: list_add corruption. prev->next should be next (ffffff82812c7a00), but was 0000000000000000. (prev=ffffff82812c3208).
kmemleak: kernel BUG at lib/list_debug.c:34!
kmemleak: Call trace:
kmemleak:  __list_add_valid_or_report+0x11c/0x144
kmemleak:  cpu_stop_queue_work+0x440/0x474
kmemleak:  stop_one_cpu_nowait_rec+0xe4/0x138
kmemleak:  balance_push+0x1f4/0x3e4
kmemleak:  __schedule+0x1adc/0x23bc
kmemleak:  preempt_schedule_common+0x68/0xd0
kmemleak:  preempt_schedule+0x60/0x80
kmemleak:  _raw_spin_unlock_irqrestore+0x9c/0xa0
kmemleak:  scan_gray_list+0x220/0x3e4
kmemleak:  kmemleak_scan+0x410/0x740
kmemleak:  kmemleak_scan_thread+0xb0/0xdc
kmemleak:  kthread+0x2bc/0x494
kmemleak:  ret_from_fork+0x10/0x20

Because the reproduction rate is very low, I designed a simple debug patch to help find the root cause of these sporadic failures.
The purpose of this debug patch is to record the status of push_work at balance_push, cpu_stopper, and cpu_stop_work.
In the regular case, we queue the push_work to stopper->works and set work->exec_state = WORK_QUEUE.
Then, we call wake_q_add for the stopper and set cpu_stopper.stopper_wakeq = WAKEQ_ADD_OK.
Finally, we wake up the stopper, which picks up the work from cpu_stop_work to execute.
We then set stopper->exec_state = WORK_PREP_EXEC to indicate that the push_work has been consumed.

However, in the failure case, by memory dump
cpu_stopper.stopper_wakeq = WAKEQ_ADD_FAIL
stopper->exec_state = WORK_QUEUE.
cpu_stopper.enabled = TRUE

Here is the failure sequence that leads to the bug.
CPU0
1st balance_push
stop_one_cpu_nowait
cpu_stop_queue_work
__cpu_stop_queue_work
list_add_tail  -> fist add work
wake_q_add   -> returns failure
wake_up_q(&wakeq); -> it doesn't wake up the stopper.
2nd balance_push
stop_one_cpu_nowait
cpu_stop_queue_work
__cpu_stop_queue_work
list_add_tail  -> A double list add is detected, which triggers kernel bugs.

Should we add queue status tracking in __cpu_stop_queue_work,
or is there a better place to do this?

Signed-off-by: kuyo chang <kuyo.chang at mediatek.com>
---
 include/linux/sched/wake_q.h |  1 +
 include/linux/stop_machine.h | 20 ++++++++++++++++++++
 kernel/sched/core.c          | 15 ++++++++++++++-
 kernel/stop_machine.c        | 36 ++++++++++++++++++++++++++++++++++--
 4 files changed, 69 insertions(+), 3 deletions(-)

diff --git a/include/linux/sched/wake_q.h b/include/linux/sched/wake_q.h
index 0f28b4623ad4..083ced4bb5dc 100644
--- a/include/linux/sched/wake_q.h
+++ b/include/linux/sched/wake_q.h
@@ -60,6 +60,7 @@ static inline bool wake_q_empty(struct wake_q_head *head)
 }
 
 extern void wake_q_add(struct wake_q_head *head, struct task_struct *task);
+extern bool wake_q_add_ret(struct wake_q_head *head, struct task_struct *task);
 extern void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task);
 extern void wake_up_q(struct wake_q_head *head);
 
diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h
index 3132262a404d..6daec44dcb99 100644
--- a/include/linux/stop_machine.h
+++ b/include/linux/stop_machine.h
@@ -21,18 +21,38 @@ typedef int (*cpu_stop_fn_t)(void *arg);
 
 #ifdef CONFIG_SMP
 
+enum work_state {
+	WORK_INIT = 0,
+	WORK_READY,
+	WORK_QUEUE,
+	WORK_PREP_EXEC,
+};
+
+enum stopper_wakeq_state {
+	WAKEQ_ADD_FAIL = 0,
+	WAKEQ_ADD_OK,
+};
+
+enum work_rec {
+	WORK_NO_REC = 0,
+	WORK_REC,
+};
 struct cpu_stop_work {
 	struct list_head	list;		/* cpu_stopper->works */
 	cpu_stop_fn_t		fn;
 	unsigned long		caller;
 	void			*arg;
 	struct cpu_stop_done	*done;
+	enum work_rec		rec;
+	enum work_state exec_state;
 };
 
 int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg);
 int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *arg);
 bool stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
 			 struct cpu_stop_work *work_buf);
+bool stop_one_cpu_nowait_rec(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
+			struct cpu_stop_work *work_buf);
 void stop_machine_park(int cpu);
 void stop_machine_unpark(int cpu);
 void stop_machine_yield(const struct cpumask *cpumask);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c81cf642dba0..a617f51d3e5a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1037,6 +1037,19 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task)
 		get_task_struct(task);
 }
 
+/**
+ * wake_q_add_ret() - queue a wakeup for 'later' waking. record status.
+ */
+bool wake_q_add_ret(struct wake_q_head *head, struct task_struct *task)
+{
+	bool ret = false;
+
+	ret = __wake_q_add(head, task);
+	if (ret)
+		get_task_struct(task);
+	return ret;
+}
+
 /**
  * wake_q_add_safe() - safely queue a wakeup for 'later' waking.
  * @head: the wake_q_head to add @task to
@@ -8100,7 +8113,7 @@ static void balance_push(struct rq *rq)
 	 */
 	preempt_disable();
 	raw_spin_rq_unlock(rq);
-	stop_one_cpu_nowait(rq->cpu, __balance_push_cpu_stop, push_task,
+	stop_one_cpu_nowait_rec(rq->cpu, __balance_push_cpu_stop, push_task,
 			    this_cpu_ptr(&push_work));
 	preempt_enable();
 	/*
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 5d2d0562115b..3a2c48ed2182 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -42,6 +42,8 @@ struct cpu_stopper {
 	struct list_head	works;		/* list of pending works */
 
 	struct cpu_stop_work	stop_work;	/* for stop_cpus */
+	enum stopper_wakeq_state stopper_wakeq;		/* for stopper wakeup */
+	enum work_state exec_state;			/* for stopper exec state */
 	unsigned long		caller;
 	cpu_stop_fn_t		fn;
 };
@@ -85,8 +87,18 @@ static void __cpu_stop_queue_work(struct cpu_stopper *stopper,
 					struct cpu_stop_work *work,
 					struct wake_q_head *wakeq)
 {
+	bool wake_flag;
+
 	list_add_tail(&work->list, &stopper->works);
-	wake_q_add(wakeq, stopper->thread);
+	wake_flag = wake_q_add_ret(wakeq, stopper->thread);
+	if (work->rec == WORK_REC) {
+		work->exec_state = WORK_QUEUE;
+		stopper->exec_state = WORK_QUEUE;
+		if (wake_flag)
+			stopper->stopper_wakeq = WAKEQ_ADD_OK;
+		else
+			stopper->stopper_wakeq = WAKEQ_ADD_FAIL;
+	}
 }
 
 /* queue @work to @stopper.  if offline, @work is completed immediately */
@@ -141,6 +153,8 @@ int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
 	struct cpu_stop_done done;
 	struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done, .caller = _RET_IP_ };
 
+	work.rec = WORK_NO_REC;
+	work.exec_state = WORK_INIT;
 	cpu_stop_init_done(&done, 1);
 	if (!cpu_stop_queue_work(cpu, &work))
 		return -ENOENT;
@@ -350,6 +364,8 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
 		.arg = &msdata,
 		.done = &done,
 		.caller = _RET_IP_,
+		.rec = WORK_NO_REC,
+		.exec_state = WORK_INIT,
 	};
 
 	cpu_stop_init_done(&done, 2);
@@ -386,6 +402,17 @@ bool stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
 			struct cpu_stop_work *work_buf)
 {
 	*work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, .caller = _RET_IP_, };
+	work_buf->rec = WORK_NO_REC;
+	work_buf->exec_state = WORK_INIT;
+	return cpu_stop_queue_work(cpu, work_buf);
+}
+
+bool stop_one_cpu_nowait_rec(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
+			struct cpu_stop_work *work_buf)
+{
+	*work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, .caller = _RET_IP_, };
+	work_buf->rec = WORK_REC;
+	work_buf->exec_state = WORK_READY;
 	return cpu_stop_queue_work(cpu, work_buf);
 }
 
@@ -411,6 +438,8 @@ static bool queue_stop_cpus_work(const struct cpumask *cpumask,
 		work->arg = arg;
 		work->done = done;
 		work->caller = _RET_IP_;
+		work->rec = WORK_NO_REC;
+		work->exec_state = WORK_INIT;
 		if (cpu_stop_queue_work(cpu, work))
 			queued = true;
 	}
@@ -496,6 +525,8 @@ static void cpu_stopper_thread(unsigned int cpu)
 		work = list_first_entry(&stopper->works,
 					struct cpu_stop_work, list);
 		list_del_init(&work->list);
+		if (work->rec == WORK_REC)
+			stopper->exec_state = WORK_PREP_EXEC;
 	}
 	raw_spin_unlock_irq(&stopper->lock);
 
@@ -572,7 +603,8 @@ static int __init cpu_stop_init(void)
 
 	for_each_possible_cpu(cpu) {
 		struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
-
+		stopper->exec_state = WORK_INIT;
+		stopper->stopper_wakeq = WAKEQ_ADD_OK;
 		raw_spin_lock_init(&stopper->lock);
 		INIT_LIST_HEAD(&stopper->works);
 	}
-- 
2.45.2




More information about the Linux-mediatek mailing list