[PATCH 1/1] stop_machine: Fix migrate_swap() vs. balance_push() racing
Kuyo Chang
kuyo.chang at mediatek.com
Thu May 29 01:43:35 PDT 2025
From: kuyo chang <kuyo.chang at mediatek.com>
Hi guys,
We encountered sporadic failures during CPU hotplug stress testing.
[Syndrome]
The kernel log shows list add fail as below.
kmemleak: list_add corruption.
prev->next should be next (ffffff82812c7a00),
but was 0000000000000000. (prev=ffffff82812c3208).
kmemleak: kernel BUG at lib/list_debug.c:34!
kmemleak: Call trace:
kmemleak: __list_add_valid_or_report+0x11c/0x144
kmemleak: cpu_stop_queue_work+0x440/0x474
kmemleak: stop_one_cpu_nowait+0xe4/0x138
kmemleak: balance_push+0x1f4/0x3e4
kmemleak: __schedule+0x1adc/0x23bc
kmemleak: preempt_schedule_common+0x68/0xd0
kmemleak: preempt_schedule+0x60/0x80
kmemleak: _raw_spin_unlock_irqrestore+0x9c/0xa0
kmemleak: scan_gray_list+0x220/0x3e4
kmemleak: kmemleak_scan+0x410/0x740
kmemleak: kmemleak_scan_thread+0xb0/0xdc
kmemleak: kthread+0x2bc/0x494
kmemleak: ret_from_fork+0x10/0x20
[Analysis]
In the failure case, a memory dump shows
cpu_stopper.enabled = TRUE
but the wakeq is empty (migrate/1 is on another wakeq).
static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
{
...
..
enabled = stopper->enabled;
if (enabled)
__cpu_stop_queue_work(stopper, work, &wakeq);
...
...
wake_up_q(&wakeq); -> wakeq is empty !!
preempt_enable();
return enabled;
}
Through analysis of the CPU0 call trace and memory dump:
CPU0: migration/0, pid: 43, priority: 99
Native callstack:
vmlinux __kern_my_cpu_offset() <arch/arm64/include/asm/percpu.h:40>
vmlinux ct_state_inc(incby=8) <include/linux/context_tracking.h:137>
vmlinux rcu_momentary_eqs() + 72 <kernel/rcu/tree.c:375>
vmlinux multi_cpu_stop() + 316 <kernel/stop_machine.c:278>
vmlinux cpu_stopper_thread() + 676 <kernel/stop_machine.c:563>
vmlinux smpboot_thread_fn(data=0) + 1188 <kernel/smpboot.c:164>
vmlinux kthread() + 696 <kernel/kthread.c:389>
vmlinux 0xFFFFFFC08005941C() <arch/arm64/kernel/entry.S:845>
(struct migration_swap_arg *)0xFFFFFFC08FF87A40 (
src_task = 0xFFFFFF80FF519740 ,
dst_task = 0xFFFFFF802A579740 ,
src_cpu = 0x0,
dst_cpu = 0x1)
(struct multi_stop_data)* 0xFFFFFFC08FF87930 = (
fn = 0xFFFFFFC0802657F4 = migrate_swap_stop,
data = 0xFFFFFFC08FF87A40
num_threads = 0x2,
active_cpus = cpu_bit_bitmap[1] -> (
bits = (0x2)),
state = MULTI_STOP_PREPARE = 0x1,
thread_ack = (
counter = 0x1))
By cpu mask memory dump:
((const struct cpumask *)&__cpu_online_mask) (
bits = (0xFF))
((const struct cpumask *)&__cpu_dying_mask) (
bits = (0x2))
((const struct cpumask *)&__cpu_active_mask)(
bits = (0xFD))
((const struct cpumask *)&__cpu_possible_mask) (
bits = (0xFF))
-> This implies that CPU1 is dying and non-active.
So, the potential race scenario is:
CPU0 CPU1
// doing migrate_swap(cpu0/cpu1)
stop_two_cpus()
...
// doing _cpu_down()
sched_cpu_deactivate()
set_cpu_active(cpu, false);
balance_push_set(cpu, true);
cpu_stop_queue_two_works
__cpu_stop_queue_work(stopper1,...);
__cpu_stop_queue_work(stopper2,..);
stop_cpus_in_progress -> true
preempt_enable();
...
1st balance_push
stop_one_cpu_nowait
cpu_stop_queue_work
__cpu_stop_queue_work
list_add_tail -> 1st add push_work
wake_up_q(&wakeq); -> wakeq is empty; this implies the stopper thread was already placed on the wakeq by migrate_swap().
preempt_disable
wake_up_q(&wakeq);
wake_up_process // wakeup migrate/0
try_to_wake_up
ttwu_queue
ttwu_queue_cond ->meet below case
if (cpu == smp_processor_id())
return false;
ttwu_do_activate
//migrate/0 wakeup done
wake_up_process // wakeup migrate/1
try_to_wake_up
ttwu_queue
ttwu_queue_cond
ttwu_queue_wakelist
__ttwu_queue_wakelist
__smp_call_single_queue
preempt_enable();
2nd balance_push
stop_one_cpu_nowait
cpu_stop_queue_work
__cpu_stop_queue_work
list_add_tail -> 2nd add push_work, so the double list add is detected
...
...
cpu1 get ipi, do sched_ttwu_pending, wakeup migrate/1
[Solution]
Maybe add queue-status tracking in __cpu_stop_queue_work() to avoid this corner case?
Or avoid using ttwu_queue_wakelist() when waking up the migration threads?
Signed-off-by: kuyo chang <kuyo.chang at mediatek.com>
---
include/linux/stop_machine.h | 7 +++++++
kernel/stop_machine.c | 12 +++++++++++-
2 files changed, 18 insertions(+), 1 deletion(-)
diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h
index 3132262a404d..a0748416dd69 100644
--- a/include/linux/stop_machine.h
+++ b/include/linux/stop_machine.h
@@ -21,11 +21,18 @@ typedef int (*cpu_stop_fn_t)(void *arg);
#ifdef CONFIG_SMP
+enum work_state {
+ WORK_INIT = 0,
+ WORK_QUEUE,
+ WORK_EXEC,
+};
+
struct cpu_stop_work {
struct list_head list; /* cpu_stopper->works */
cpu_stop_fn_t fn;
unsigned long caller;
void *arg;
+ enum work_state state;
struct cpu_stop_done *done;
};
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 5d2d0562115b..9ab6b07708e3 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -87,6 +87,7 @@ static void __cpu_stop_queue_work(struct cpu_stopper *stopper,
{
list_add_tail(&work->list, &stopper->works);
wake_q_add(wakeq, stopper->thread);
+ work->state = WORK_QUEUE;
}
/* queue @work to @stopper. if offline, @work is completed immediately */
@@ -385,7 +386,15 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
bool stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
struct cpu_stop_work *work_buf)
{
- *work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, .caller = _RET_IP_, };
+ if (unlikely(work_buf->state == WORK_QUEUE))
+ return true;
+
+ *work_buf = (struct cpu_stop_work){
+ .fn = fn,
+ .arg = arg,
+ .caller = _RET_IP_,
+ .state = WORK_INIT
+ };
return cpu_stop_queue_work(cpu, work_buf);
}
@@ -496,6 +505,7 @@ static void cpu_stopper_thread(unsigned int cpu)
work = list_first_entry(&stopper->works,
struct cpu_stop_work, list);
list_del_init(&work->list);
+ work->state = WORK_EXEC;
}
raw_spin_unlock_irq(&stopper->lock);
--
2.45.2
More information about the linux-arm-kernel
mailing list