[PATCH 1/1] [Bug] list_add corruption during CPU hotplug stress test

Kuyo Chang kuyo.chang at mediatek.com
Thu May 22 01:10:26 PDT 2025


On Wed, 2025-05-21 at 14:42 +0800, Kuyo Chang wrote:
> From: kuyo chang <kuyo.chang at mediatek.com>
> 
> The commit (stop_machine, sched: Fix migrate_swap() vs.
> active_balance() deadlock)
> introduces wake_q_add rather than wake_up_process.
> However, it encounters sporadic failures during CPU hotplug stress
> test.
> 
> The kernel log shows list add fail as below
> kmemleak: list_add corruption. prev->next should be next
> (ffffff82812c7a00), but was 0000000000000000.
> (prev=ffffff82812c3208).
> kmemleak: kernel BUG at lib/list_debug.c:34!
> kmemleak: Call trace:
> kmemleak:  __list_add_valid_or_report+0x11c/0x144
> kmemleak:  cpu_stop_queue_work+0x440/0x474
> kmemleak:  stop_one_cpu_nowait_rec+0xe4/0x138
> kmemleak:  balance_push+0x1f4/0x3e4
> kmemleak:  __schedule+0x1adc/0x23bc
> kmemleak:  preempt_schedule_common+0x68/0xd0
> kmemleak:  preempt_schedule+0x60/0x80
> kmemleak:  _raw_spin_unlock_irqrestore+0x9c/0xa0
> kmemleak:  scan_gray_list+0x220/0x3e4
> kmemleak:  kmemleak_scan+0x410/0x740
> kmemleak:  kmemleak_scan_thread+0xb0/0xdc
> kmemleak:  kthread+0x2bc/0x494
> kmemleak:  ret_from_fork+0x10/0x20
> 
> Because the reproduction rate is very low, I designed a simple debug
> patch to help find the root cause of these sporadic failures.
> The purpose of this debug patch is to record the status of push_work
> at balance_push, cpu_stopper, and cpu_stop_work.
> In the regular case, we queue the push_work to stopper->works and set
> work->exec_state = WORK_QUEUE.
> Then, we call wake_q_add for the stopper and set
> cpu_stopper.stopper_wakeq = WAKEQ_ADD_OK.
> Finally, we wake up the stopper, which picks up the work from
> cpu_stop_work to execute.
> We then set stopper->exec_state = WORK_PREP_EXEC to indicate that the
> push_work has been consumed.
> 
> However, in the failure case, by memory dump
> cpu_stopper.stopper_wakeq = WAKEQ_ADD_FAIL
> stopper->exec_state = WORK_QUEUE.
> cpu_stopper.enabled = TRUE
> 
> Here is the failure sequence that leads to the bug.
> CPU0
> 1st balance_push
> stop_one_cpu_nowait
> cpu_stop_queue_work
> __cpu_stop_queue_work
> list_add_tail  -> fist add work
> wake_q_add   -> returns failure
> wake_up_q(&wakeq); -> it doesn't wake up the stopper.
> 2nd balance_push
> stop_one_cpu_nowait
> cpu_stop_queue_work
> __cpu_stop_queue_work
> list_add_tail  -> A double list add is detected, which triggers
> kernel bugs.
> 
> Should we add queue status tracking in __cpu_stop_queue_work,
> or is there a better place to do this?
> 
> Signed-off-by: kuyo chang <kuyo.chang at mediatek.com>
> ---
>  include/linux/sched/wake_q.h |  1 +
>  include/linux/stop_machine.h | 20 ++++++++++++++++++++
>  kernel/sched/core.c          | 15 ++++++++++++++-
>  kernel/stop_machine.c        | 36
> ++++++++++++++++++++++++++++++++++--
>  4 files changed, 69 insertions(+), 3 deletions(-)
> 
> diff --git a/include/linux/sched/wake_q.h
> b/include/linux/sched/wake_q.h
> index 0f28b4623ad4..083ced4bb5dc 100644
> --- a/include/linux/sched/wake_q.h
> +++ b/include/linux/sched/wake_q.h
> @@ -60,6 +60,7 @@ static inline bool wake_q_empty(struct wake_q_head
> *head)
>  }
>  
>  extern void wake_q_add(struct wake_q_head *head, struct task_struct
> *task);
> +extern bool wake_q_add_ret(struct wake_q_head *head, struct
> task_struct *task);
>  extern void wake_q_add_safe(struct wake_q_head *head, struct
> task_struct *task);
>  extern void wake_up_q(struct wake_q_head *head);
>  
> diff --git a/include/linux/stop_machine.h
> b/include/linux/stop_machine.h
> index 3132262a404d..6daec44dcb99 100644
> --- a/include/linux/stop_machine.h
> +++ b/include/linux/stop_machine.h
> @@ -21,18 +21,38 @@ typedef int (*cpu_stop_fn_t)(void *arg);
>  
>  #ifdef CONFIG_SMP
>  
> +enum work_state {
> +	WORK_INIT = 0,
> +	WORK_READY,
> +	WORK_QUEUE,
> +	WORK_PREP_EXEC,
> +};
> +
> +enum stopper_wakeq_state {
> +	WAKEQ_ADD_FAIL = 0,
> +	WAKEQ_ADD_OK,
> +};
> +
> +enum work_rec {
> +	WORK_NO_REC = 0,
> +	WORK_REC,
> +};
>  struct cpu_stop_work {
>  	struct list_head	list;		/* cpu_stopper-
> >works */
>  	cpu_stop_fn_t		fn;
>  	unsigned long		caller;
>  	void			*arg;
>  	struct cpu_stop_done	*done;
> +	enum work_rec		rec;
> +	enum work_state exec_state;
>  };
>  
>  int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg);
>  int stop_two_cpus(unsigned int cpu1, unsigned int cpu2,
> cpu_stop_fn_t fn, void *arg);
>  bool stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void
> *arg,
>  			 struct cpu_stop_work *work_buf);
> +bool stop_one_cpu_nowait_rec(unsigned int cpu, cpu_stop_fn_t fn,
> void *arg,
> +			struct cpu_stop_work *work_buf);
>  void stop_machine_park(int cpu);
>  void stop_machine_unpark(int cpu);
>  void stop_machine_yield(const struct cpumask *cpumask);
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index c81cf642dba0..a617f51d3e5a 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -1037,6 +1037,19 @@ void wake_q_add(struct wake_q_head *head,
> struct task_struct *task)
>  		get_task_struct(task);
>  }
>  
> +/**
> + * wake_q_add_ret() - queue a wakeup for 'later' waking. record
> status.
> + */
> +bool wake_q_add_ret(struct wake_q_head *head, struct task_struct
> *task)
> +{
> +	bool ret = false;
> +
> +	ret = __wake_q_add(head, task);
> +	if (ret)
> +		get_task_struct(task);
> +	return ret;
> +}
> +
>  /**
>   * wake_q_add_safe() - safely queue a wakeup for 'later' waking.
>   * @head: the wake_q_head to add @task to
> @@ -8100,7 +8113,7 @@ static void balance_push(struct rq *rq)
>  	 */
>  	preempt_disable();
>  	raw_spin_rq_unlock(rq);
> -	stop_one_cpu_nowait(rq->cpu, __balance_push_cpu_stop,
> push_task,
> +	stop_one_cpu_nowait_rec(rq->cpu, __balance_push_cpu_stop,
> push_task,
>  			    this_cpu_ptr(&push_work));
>  	preempt_enable();
>  	/*
> diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
> index 5d2d0562115b..3a2c48ed2182 100644
> --- a/kernel/stop_machine.c
> +++ b/kernel/stop_machine.c
> @@ -42,6 +42,8 @@ struct cpu_stopper {
>  	struct list_head	works;		/* list of pending
> works */
>  
>  	struct cpu_stop_work	stop_work;	/* for stop_cpus */
> +	enum stopper_wakeq_state stopper_wakeq;		/*
> for stopper wakeup */
> +	enum work_state exec_state;			/* for
> stopper exec state */
>  	unsigned long		caller;
>  	cpu_stop_fn_t		fn;
>  };
> @@ -85,8 +87,18 @@ static void __cpu_stop_queue_work(struct
> cpu_stopper *stopper,
>  					struct cpu_stop_work *work,
>  					struct wake_q_head *wakeq)
>  {
> +	bool wake_flag;
> +
>  	list_add_tail(&work->list, &stopper->works);
> -	wake_q_add(wakeq, stopper->thread);
> +	wake_flag = wake_q_add_ret(wakeq, stopper->thread);
> +	if (work->rec == WORK_REC) {
> +		work->exec_state = WORK_QUEUE;
> +		stopper->exec_state = WORK_QUEUE;
> +		if (wake_flag)
> +			stopper->stopper_wakeq = WAKEQ_ADD_OK;
> +		else
> +			stopper->stopper_wakeq = WAKEQ_ADD_FAIL;
> +	}
>  }
>  
>  /* queue @work to @stopper.  if offline, @work is completed
> immediately */
> @@ -141,6 +153,8 @@ int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t
> fn, void *arg)
>  	struct cpu_stop_done done;
>  	struct cpu_stop_work work = { .fn = fn, .arg = arg, .done =
> &done, .caller = _RET_IP_ };
>  
> +	work.rec = WORK_NO_REC;
> +	work.exec_state = WORK_INIT;
>  	cpu_stop_init_done(&done, 1);
>  	if (!cpu_stop_queue_work(cpu, &work))
>  		return -ENOENT;
> @@ -350,6 +364,8 @@ int stop_two_cpus(unsigned int cpu1, unsigned int
> cpu2, cpu_stop_fn_t fn, void *
>  		.arg = &msdata,
>  		.done = &done,
>  		.caller = _RET_IP_,
> +		.rec = WORK_NO_REC,
> +		.exec_state = WORK_INIT,
>  	};
>  
>  	cpu_stop_init_done(&done, 2);
> @@ -386,6 +402,17 @@ bool stop_one_cpu_nowait(unsigned int cpu,
> cpu_stop_fn_t fn, void *arg,
>  			struct cpu_stop_work *work_buf)
>  {
>  	*work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg,
> .caller = _RET_IP_, };
> +	work_buf->rec = WORK_NO_REC;
> +	work_buf->exec_state = WORK_INIT;
> +	return cpu_stop_queue_work(cpu, work_buf);
> +}
> +
> +bool stop_one_cpu_nowait_rec(unsigned int cpu, cpu_stop_fn_t fn,
> void *arg,
> +			struct cpu_stop_work *work_buf)
> +{
> +	*work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg,
> .caller = _RET_IP_, };
> +	work_buf->rec = WORK_REC;
> +	work_buf->exec_state = WORK_READY;
>  	return cpu_stop_queue_work(cpu, work_buf);
>  }
>  
> @@ -411,6 +438,8 @@ static bool queue_stop_cpus_work(const struct
> cpumask *cpumask,
>  		work->arg = arg;
>  		work->done = done;
>  		work->caller = _RET_IP_;
> +		work->rec = WORK_NO_REC;
> +		work->exec_state = WORK_INIT;
>  		if (cpu_stop_queue_work(cpu, work))
>  			queued = true;
>  	}
> @@ -496,6 +525,8 @@ static void cpu_stopper_thread(unsigned int cpu)
>  		work = list_first_entry(&stopper->works,
>  					struct cpu_stop_work, list);
>  		list_del_init(&work->list);
> +		if (work->rec == WORK_REC)
> +			stopper->exec_state = WORK_PREP_EXEC;
>  	}
>  	raw_spin_unlock_irq(&stopper->lock);
>  
> @@ -572,7 +603,8 @@ static int __init cpu_stop_init(void)
>  
>  	for_each_possible_cpu(cpu) {
>  		struct cpu_stopper *stopper = &per_cpu(cpu_stopper,
> cpu);
> -
> +		stopper->exec_state = WORK_INIT;
> +		stopper->stopper_wakeq = WAKEQ_ADD_OK;
>  		raw_spin_lock_init(&stopper->lock);
>  		INIT_LIST_HEAD(&stopper->works);
>  	}


The test kernel version is 6.12.22

And add the timeslot log for this issue.
[   99407090000] kmemleak: list_add corruption. prev->next should be
next (ffffff82812c7a00), but was 0000000000000000.
(prev=ffffff82812c3208).
[   99408874000] kmemleak: ------------[ cut here ]------------
[   99409727000] kmemleak: kernel BUG at lib/list_debug.c:34!
[   99410555000] kmemleak: Internal error: Oops - BUG: 00000000f2000800
[#1] PREEMPT SMP

I dumped the stop task's sched_info as below.
on_cpu = 0x0,
on_rq = 0x1,
sched_info = (
        pcount = 0x0B7D,
        run_delay = 0x08248680,
        last_arrival = 0x0000001724054D00,
        last_queued = 0x0000001725208400),
}

The last_queued time (0x0000001725208400 = 99,407,135,744 ns) indicates
when the stopper was enqueued (wake_up_q->wake_up_process->ttwu), 
which occurred just after the list_add corruption (99,407,090,000 ns).
Therefore, let me correct my previous description: the wake_q_add
stopper is fine, but the subsequent latency wakeup stopper causes this
issue.






More information about the Linux-mediatek mailing list