[PATCH 1/1] stop_machine: Fix migrate_swap() vs. balance_push() racing

Kuyo Chang kuyo.chang at mediatek.com
Thu May 29 01:43:35 PDT 2025


From: kuyo chang <kuyo.chang at mediatek.com>

Hi guys,

We encountered sporadic failures during CPU hotplug stress testing.

[Syndrome]
The kernel log shows list add fail as below.

kmemleak: list_add corruption.
prev->next should be next (ffffff82812c7a00),
but was 0000000000000000. (prev=ffffff82812c3208).

kmemleak: kernel BUG at lib/list_debug.c:34!
kmemleak: Call trace:
kmemleak:  __list_add_valid_or_report+0x11c/0x144
kmemleak:  cpu_stop_queue_work+0x440/0x474
kmemleak:  stop_one_cpu_nowait+0xe4/0x138
kmemleak:  balance_push+0x1f4/0x3e4
kmemleak:  __schedule+0x1adc/0x23bc
kmemleak:  preempt_schedule_common+0x68/0xd0
kmemleak:  preempt_schedule+0x60/0x80
kmemleak:  _raw_spin_unlock_irqrestore+0x9c/0xa0
kmemleak:  scan_gray_list+0x220/0x3e4
kmemleak:  kmemleak_scan+0x410/0x740
kmemleak:  kmemleak_scan_thread+0xb0/0xdc
kmemleak:  kthread+0x2bc/0x494
kmemleak:  ret_from_fork+0x10/0x20

[Analysis]
In the failure case, the memory dump shows
cpu_stopper.enabled = TRUE
but the wakeq is empty (migrate/1 is on another wakeq)
static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
{
...
..
	enabled = stopper->enabled;
	if (enabled)
		__cpu_stop_queue_work(stopper, work, &wakeq);
...
...
	wake_up_q(&wakeq);  -> wakeq is empty !!
	preempt_enable();
	return enabled;
}

Through analysis of the CPU0 call trace and memory dump:

   CPU0: migration/0, pid: 43, priority: 99
   Native callstack:
	vmlinux  __kern_my_cpu_offset()           <arch/arm64/include/asm/percpu.h:40>
	vmlinux  ct_state_inc(incby=8)            <include/linux/context_tracking.h:137>
	vmlinux  rcu_momentary_eqs() + 72         <kernel/rcu/tree.c:375>
	vmlinux  multi_cpu_stop() + 316           <kernel/stop_machine.c:278>
	vmlinux  cpu_stopper_thread() + 676       <kernel/stop_machine.c:563>
	vmlinux  smpboot_thread_fn(data=0) + 1188 <kernel/smpboot.c:164>
	vmlinux  kthread() + 696                  <kernel/kthread.c:389>
	vmlinux  0xFFFFFFC08005941C()             <arch/arm64/kernel/entry.S:845>

  (struct migration_swap_arg *)0xFFFFFFC08FF87A40 (
    src_task = 0xFFFFFF80FF519740 ,
    dst_task = 0xFFFFFF802A579740 ,
    src_cpu = 0x0,
    dst_cpu = 0x1)

  (struct multi_stop_data)*    0xFFFFFFC08FF87930 = (
    fn = 0xFFFFFFC0802657F4 = migrate_swap_stop,
    data = 0xFFFFFFC08FF87A40
    num_threads = 0x2,
    active_cpus = cpu_bit_bitmap[1] -> (
      bits = (0x2)),
    state = MULTI_STOP_PREPARE = 0x1,
    thread_ack = (
      counter = 0x1))

By cpu mask memory dump:
  ((const struct cpumask *)&__cpu_online_mask) (
    bits = (0xFF))
  ((const struct cpumask *)&__cpu_dying_mask)  (
    bits = (0x2))
  ((const struct cpumask *)&__cpu_active_mask)(
    bits = (0xFD))
  ((const struct cpumask *)&__cpu_possible_mask) (
    bits = (0xFF))
-> This implies cpu1 is dying and non-active.

So, the potential race scenario is:

	CPU0							CPU1
	// doing migrate_swap(cpu0/cpu1)
	stop_two_cpus()
							  ...
							 // doing _cpu_down()
							      sched_cpu_deactivate()
								set_cpu_active(cpu, false);
								balance_push_set(cpu, true);
	cpu_stop_queue_two_works
	    __cpu_stop_queue_work(stopper1,...);
	    __cpu_stop_queue_work(stopper2,..);
	stop_cpus_in_progress -> true
		preempt_enable();
								...
							1st balance_push
							stop_one_cpu_nowait
							cpu_stop_queue_work
							__cpu_stop_queue_work
							list_add_tail  -> 1st add push_work
							wake_up_q(&wakeq);  -> wakeq is empty here; the stopper thread was already queued on the wakeq built by migrate_swap's stop_two_cpus()
	preempt_disable
	wake_up_q(&wakeq);
	        wake_up_process // wakeup migrate/0
		    try_to_wake_up
		        ttwu_queue
		            ttwu_queue_cond ->meet below case
		                if (cpu == smp_processor_id())
			         return false;
			ttwu_do_activate
			//migrate/0 wakeup done
		wake_up_process // wakeup migrate/1
	           try_to_wake_up
		    ttwu_queue
			ttwu_queue_cond
		        ttwu_queue_wakelist
			__ttwu_queue_wakelist
			__smp_call_single_queue
	preempt_enable();

							2nd balance_push
							stop_one_cpu_nowait
							cpu_stop_queue_work
							__cpu_stop_queue_work
							list_add_tail  -> 2nd add push_work, so the double list add is detected
							...
							...
							cpu1 get ipi, do sched_ttwu_pending, wakeup migrate/1

[Solution]
Maybe add queue-state tracking to cpu_stop_work to avoid this corner case?
Or avoid using ttwu_queue_wakelist when waking up the migration threads?

Signed-off-by: kuyo chang <kuyo.chang at mediatek.com>
---
 include/linux/stop_machine.h |  7 +++++++
 kernel/stop_machine.c        | 12 +++++++++++-
 2 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h
index 3132262a404d..a0748416dd69 100644
--- a/include/linux/stop_machine.h
+++ b/include/linux/stop_machine.h
@@ -21,11 +21,18 @@ typedef int (*cpu_stop_fn_t)(void *arg);
 
 #ifdef CONFIG_SMP
 
+enum work_state {
+	WORK_INIT = 0,
+	WORK_QUEUE,
+	WORK_EXEC,
+};
+
 struct cpu_stop_work {
 	struct list_head	list;		/* cpu_stopper->works */
 	cpu_stop_fn_t		fn;
 	unsigned long		caller;
 	void			*arg;
+	enum work_state		state;
 	struct cpu_stop_done	*done;
 };
 
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 5d2d0562115b..9ab6b07708e3 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -87,6 +87,7 @@ static void __cpu_stop_queue_work(struct cpu_stopper *stopper,
 {
 	list_add_tail(&work->list, &stopper->works);
 	wake_q_add(wakeq, stopper->thread);
+	work->state = WORK_QUEUE;
 }
 
 /* queue @work to @stopper.  if offline, @work is completed immediately */
@@ -385,7 +386,15 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
 bool stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
 			struct cpu_stop_work *work_buf)
 {
-	*work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, .caller = _RET_IP_, };
+	if (unlikely(work_buf->state == WORK_QUEUE))
+		return true;
+
+	*work_buf = (struct cpu_stop_work){
+	    .fn = fn,
+	    .arg = arg,
+	    .caller = _RET_IP_,
+	    .state = WORK_INIT
+	};
 	return cpu_stop_queue_work(cpu, work_buf);
 }
 
@@ -496,6 +505,7 @@ static void cpu_stopper_thread(unsigned int cpu)
 		work = list_first_entry(&stopper->works,
 					struct cpu_stop_work, list);
 		list_del_init(&work->list);
+		work->state = WORK_EXEC;
 	}
 	raw_spin_unlock_irq(&stopper->lock);
 
-- 
2.45.2




More information about the linux-arm-kernel mailing list