[PATCH v2 6/8] riscv: stacktrace: switch to frame-pointer based unwinder

Shuai Xue xueshuai at linux.alibaba.com
Tue Jun 2 18:35:55 PDT 2026



On 5/28/26 4:23 PM, Wang Han wrote:
> Replace the open-coded frame-pointer walker in arch_stack_walk() with a
> robust kunwind state machine, modelled on arch/arm64/kernel/stacktrace.c
> and retargeted to the RISC-V {fp, ra} frame record convention. The new
> walker tracks stack bounds, consumes frame records monotonically,
> understands the metadata pt_regs records added in the previous frame
> record metadata patch, and recovers return addresses replaced by
> function graph tracing and kretprobes.
> 
> This commit introduces arch_stack_walk_reliable() but does not yet
> select HAVE_RELIABLE_STACKTRACE; that is done in a follow-up Kconfig
> patch so this commit can be reviewed and bisected as a pure unwinder
> replacement. Until that Kconfig change lands, livepatch is not yet
> enabled and arch_stack_walk_reliable() has no in-tree caller.
> 
> Three related callers are updated to keep the same frame-record
> assumptions everywhere:
> 
>    * Function graph tracing: the old RISC-V unwinder matched function
>      graph return-stack entries by the saved return-address slot. That
>      was consistent with the static mcount path, but not with the dynamic
>      ftrace path where the parent slot is ftrace_regs::ra. Use the
>      architectural frame pointer as the function graph return-address
>      cookie, matching the kunwind walker.
> 
>    * Perf callchains: route kernel callchain collection through
>      arch_stack_walk() so perf sees the same frame-pointer unwind
>      behaviour as dump_stack() and the upcoming livepatch path.
> 
>    * dump_backtrace() / __get_wchan() / show_stack(): these now go
>      through arch_stack_walk(); the explicit "Call Trace:" header is
>      moved into dump_backtrace() to preserve the original output.
> 
> The non-frame-pointer fallback walker is kept untouched for
> !CONFIG_FRAME_POINTER builds.
> 
> Signed-off-by: Wang Han <wanghan at linux.alibaba.com>
> ---
>   arch/riscv/kernel/ftrace.c         |   6 +-
>   arch/riscv/kernel/perf_callchain.c |   2 +-
>   arch/riscv/kernel/stacktrace.c     | 560 ++++++++++++++++++++++++-----
>   3 files changed, 472 insertions(+), 96 deletions(-)
> 
> diff --git a/arch/riscv/kernel/ftrace.c b/arch/riscv/kernel/ftrace.c
> index b430edfb83f4..5d55199a9230 100644
> --- a/arch/riscv/kernel/ftrace.c
> +++ b/arch/riscv/kernel/ftrace.c
> @@ -242,7 +242,8 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr,
>   	 */
>   	old = *parent;
>   
> -	if (!function_graph_enter(old, self_addr, frame_pointer, parent))
> +	if (!function_graph_enter(old, self_addr, frame_pointer,
> +				  (void *)frame_pointer))
>   		*parent = return_hooker;
>   }
>   
> @@ -264,7 +265,8 @@ void ftrace_graph_func(unsigned long ip, unsigned long parent_ip,
>   	 */
>   	old = *parent;
>   
> -	if (!function_graph_enter_regs(old, ip, frame_pointer, parent, fregs))
> +	if (!function_graph_enter_regs(old, ip, frame_pointer,
> +				       (void *)frame_pointer, fregs))
>   		*parent = return_hooker;
>   }
>   #endif /* CONFIG_DYNAMIC_FTRACE */
> diff --git a/arch/riscv/kernel/perf_callchain.c b/arch/riscv/kernel/perf_callchain.c
> index b465bc9eb870..436af96ea59c 100644
> --- a/arch/riscv/kernel/perf_callchain.c
> +++ b/arch/riscv/kernel/perf_callchain.c
> @@ -44,5 +44,5 @@ void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry,
>   		return;
>   	}
>   
> -	walk_stackframe(NULL, regs, fill_callchain, entry);
> +	arch_stack_walk(fill_callchain, entry, NULL, regs);
>   }
> diff --git a/arch/riscv/kernel/stacktrace.c b/arch/riscv/kernel/stacktrace.c
> index 2692d3a06afa..0d76320b3a29 100644
> --- a/arch/riscv/kernel/stacktrace.c
> +++ b/arch/riscv/kernel/stacktrace.c
> @@ -11,98 +11,16 @@
>   #include <linux/sched/task_stack.h>
>   #include <linux/stacktrace.h>
>   #include <linux/ftrace.h>
> +#include <linux/kprobes.h>
> +#include <linux/llist.h>
>   
>   #include <asm/stacktrace.h>
>   
> -#ifdef CONFIG_FRAME_POINTER
> -
>   /*
> - * This disables KASAN checking when reading a value from another task's stack,
> - * since the other task could be running on another CPU and could have poisoned
> - * the stack in the meantime.
> + * Non-frame-pointer fallback unwinder.
> + * Only compiled when CONFIG_FRAME_POINTER is not enabled.
>    */
> -#define READ_ONCE_TASK_STACK(task, x)			\
> -({							\
> -	unsigned long val;				\
> -	unsigned long addr = x;				\
> -	if ((task) == current)				\
> -		val = READ_ONCE(addr);			\
> -	else						\
> -		val = READ_ONCE_NOCHECK(addr);		\
> -	val;						\
> -})
> -
> -extern asmlinkage void handle_exception(void);
> -extern unsigned long ret_from_exception_end;
> -
> -static inline int fp_is_valid(unsigned long fp, unsigned long sp)
> -{
> -	unsigned long low, high;
> -
> -	low = sp + sizeof(struct stackframe);
> -	high = ALIGN(sp, THREAD_SIZE);
> -
> -	return !(fp < low || fp > high || fp & 0x07);
> -}
> -
> -void notrace walk_stackframe(struct task_struct *task, struct pt_regs *regs,
> -			     bool (*fn)(void *, unsigned long), void *arg)
> -{
> -	unsigned long fp, sp, pc;
> -	int graph_idx = 0;
> -	int level = 0;
> -
> -	if (regs) {
> -		fp = frame_pointer(regs);
> -		sp = user_stack_pointer(regs);
> -		pc = instruction_pointer(regs);
> -	} else if (task == NULL || task == current) {
> -		fp = (unsigned long)__builtin_frame_address(0);
> -		sp = current_stack_pointer;
> -		pc = (unsigned long)walk_stackframe;
> -		level = -1;
> -	} else {
> -		/* task blocked in __switch_to */
> -		fp = task->thread.s[0];
> -		sp = task->thread.sp;
> -		pc = task->thread.ra;
> -	}
> -
> -	for (;;) {
> -		struct stackframe *frame;
> -
> -		if (unlikely(!__kernel_text_address(pc) || (level++ >= 0 && !fn(arg, pc))))
> -			break;
> -
> -		if (unlikely(!fp_is_valid(fp, sp)))
> -			break;
> -
> -		/* Unwind stack frame */
> -		frame = (struct stackframe *)fp - 1;
> -		sp = fp;
> -		if (regs && (regs->epc == pc) && fp_is_valid(frame->ra, sp)) {
> -			/* We hit function where ra is not saved on the stack */
> -			fp = frame->ra;
> -			pc = regs->ra;
> -		} else {
> -			fp = READ_ONCE_TASK_STACK(task, frame->fp);
> -			pc = READ_ONCE_TASK_STACK(task, frame->ra);
> -			pc = ftrace_graph_ret_addr(task, &graph_idx, pc,
> -						   &frame->ra);
> -			if (pc >= (unsigned long)handle_exception &&
> -			    pc < (unsigned long)&ret_from_exception_end) {
> -				if (unlikely(!fn(arg, pc)))
> -					break;
> -
> -				pc = ((struct pt_regs *)sp)->epc;
> -				fp = ((struct pt_regs *)sp)->s0;
> -			}
> -		}
> -
> -	}
> -}
> -
> -#else /* !CONFIG_FRAME_POINTER */
> +#ifndef CONFIG_FRAME_POINTER
>   
>   void notrace walk_stackframe(struct task_struct *task,
>   	struct pt_regs *regs, bool (*fn)(void *, unsigned long), void *arg)
> @@ -133,7 +51,12 @@ void notrace walk_stackframe(struct task_struct *task,
>   	}
>   }
>   
> -#endif /* CONFIG_FRAME_POINTER */
> +#endif /* !CONFIG_FRAME_POINTER */
> +
> +/*
> + * Common trace helpers.
> + * These are used by both the FP (kunwind) and non-FP (walk_stackframe) paths.
> + */
>   
>   static bool print_trace_address(void *arg, unsigned long pc)
>   {
> @@ -146,12 +69,12 @@ static bool print_trace_address(void *arg, unsigned long pc)
>   noinline void dump_backtrace(struct pt_regs *regs, struct task_struct *task,
>   		    const char *loglvl)
>   {
> -	walk_stackframe(task, regs, print_trace_address, (void *)loglvl);
> +	printk("%sCall Trace:\n", loglvl);
> +	arch_stack_walk(print_trace_address, (void *)loglvl, task, regs);
>   }
>   
>   void show_stack(struct task_struct *task, unsigned long *sp, const char *loglvl)
>   {
> -	pr_cont("%sCall Trace:\n", loglvl);
>   	dump_backtrace(NULL, task, loglvl);
>   }
>   
> @@ -171,17 +94,468 @@ unsigned long __get_wchan(struct task_struct *task)
>   
>   	if (!try_get_task_stack(task))
>   		return 0;
> -	walk_stackframe(task, NULL, save_wchan, &pc);
> +	arch_stack_walk(save_wchan, &pc, task, NULL);
>   	put_task_stack(task);
>   	return pc;
>   }
>   
> -noinline noinstr void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie,
> -		     struct task_struct *task, struct pt_regs *regs)
> +/*
> + * Frame-pointer-based kernel unwind infrastructure.
> + * Only compiled when CONFIG_FRAME_POINTER is enabled.
> + *
> + * See: arch/arm64/kernel/stacktrace.c for the reference implementation.
> + */
> +#ifdef CONFIG_FRAME_POINTER
> +
> +/*
> + * Per-cpu stacks are only accessible when unwinding the current task in a
> + * non-preemptible context.
> + */
> +#define STACKINFO_CPU(task, name)				\
> +	({							\
> +		(((task) == current) && !preemptible())		\
> +			? stackinfo_get_##name()		\
> +			: stackinfo_get_unknown();		\
> +	})
> +
> +enum kunwind_source {
> +	KUNWIND_SOURCE_UNKNOWN,
> +	KUNWIND_SOURCE_FRAME,
> +	KUNWIND_SOURCE_CALLER,
> +	KUNWIND_SOURCE_TASK,
> +	KUNWIND_SOURCE_REGS_PC,
> +};
> +
> +union unwind_flags {
> +	unsigned long	all;
> +	struct {
> +		unsigned long	fgraph : 1,
> +				kretprobe : 1;
> +	};
> +};
> +
> +/*
> + * Kernel unwind state
> + *
> + * @common:    Common unwind state.
> + * @task:      The task being unwound.
> + * @graph_idx: Used by ftrace_graph_ret_addr() for optimized stack unwinding.
> + * @kr_cur:    When KRETPROBES is selected, holds the kretprobe instance
> + *             associated with the most recently encountered replacement ra
> + *             value.
> + */
> +struct kunwind_state {
> +	struct unwind_state common;
> +	struct task_struct *task;
> +	int graph_idx;
> +#ifdef CONFIG_KRETPROBES
> +	struct llist_node *kr_cur;
> +#endif
> +	enum kunwind_source source;
> +	union unwind_flags flags;
> +	struct pt_regs *regs;
> +};
> +
> +static __always_inline void
> +kunwind_init(struct kunwind_state *state,
> +	     struct task_struct *task)
> +{
> +	unwind_init_common(&state->common);
> +	state->task = task;
> +	state->source = KUNWIND_SOURCE_UNKNOWN;
> +	state->flags.all = 0;
> +	state->regs = NULL;
> +}
> +
> +/*
> + * Start an unwind from a pt_regs.
> + *
> + * The unwind will begin at the PC within the regs.
> + *
> + * The regs must be on a stack currently owned by the calling task.
> + */
> +static __always_inline void
> +kunwind_init_from_regs(struct kunwind_state *state,
> +		       struct pt_regs *regs)
> +{
> +	kunwind_init(state, current);
> +
> +	state->regs = regs;
> +	state->common.fp = frame_pointer(regs);
> +	state->common.pc = instruction_pointer(regs);
> +	state->source = KUNWIND_SOURCE_REGS_PC;
> +}
> +
> +/*
> + * Start an unwind from a caller.
> + *
> + * The unwind will begin at the caller of whichever function this is inlined
> + * into.
> + *
> + * The function which invokes this must be noinline.
> + */
> +static __always_inline void
> +kunwind_init_from_caller(struct kunwind_state *state)
> +{
> +	unsigned long fp = (unsigned long)__builtin_frame_address(0);
> +	struct frame_record *record = (struct frame_record *)fp - 1;
> +
> +	kunwind_init(state, current);
> +
> +	state->common.fp = READ_ONCE(record->fp);
> +	state->common.pc = READ_ONCE(record->ra);
> +	state->source = KUNWIND_SOURCE_CALLER;
> +}
> +
> +/*
> + * Start an unwind from a blocked task.
> + *
> + * The unwind will begin at the blocked task's saved PC (i.e. the caller of
> + * __switch_to).
> + *
> + * The caller should ensure the task is blocked in __switch_to for the
> + * duration of the unwind, or the unwind will be bogus. It is never valid to
> + * call this for the current task.
> + */
> +static __always_inline void
> +kunwind_init_from_task(struct kunwind_state *state,
> +		       struct task_struct *task)
> +{
> +	kunwind_init(state, task);
> +
> +	state->common.fp = task->thread.s[0];
> +	state->common.pc = task->thread.ra;
> +	state->source = KUNWIND_SOURCE_TASK;
> +}
> +
> +static __always_inline int
> +kunwind_recover_return_address(struct kunwind_state *state)
> +{
> +#ifdef CONFIG_FUNCTION_GRAPH_TRACER
> +	if (state->task->ret_stack &&
> +	    state->common.pc == (unsigned long)return_to_handler) {
> +		unsigned long orig_pc;
> +
> +		orig_pc = ftrace_graph_ret_addr(state->task, &state->graph_idx,
> +						state->common.pc,
> +						(void *)state->common.fp);
> +		if (state->common.pc == orig_pc) {
> +			WARN_ON_ONCE(state->task == current);
> +			return -EINVAL;
> +		}
> +		state->common.pc = orig_pc;
> +		state->flags.fgraph = 1;
> +	}
> +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
> +
> +#ifdef CONFIG_KRETPROBES
> +	if (is_kretprobe_trampoline(state->common.pc)) {
> +		unsigned long orig_pc;
> +
> +		orig_pc = kretprobe_find_ret_addr(state->task,
> +						  (void *)state->common.fp,
> +						  &state->kr_cur);
> +		if (!orig_pc)
> +			return -EINVAL;
> +		state->common.pc = orig_pc;
> +		state->flags.kretprobe = 1;
> +	}
> +#endif /* CONFIG_KRETPROBES */
> +
> +	return 0;
> +}
> +
> +/*
> + * When we reach an exception boundary marked by a metadata frame record,
> + * extract pt_regs from the stack and continue unwinding from the saved
> + * context (epc and s0/fp).
> + *
> + * On RISC-V, fp points above the metadata record, so the record's
> + * frame_record portion is at fp - sizeof(struct frame_record).
> + */
> +static __always_inline int
> +kunwind_next_regs_pc(struct kunwind_state *state)
> +{
> +	struct stack_info *info;
> +	unsigned long fp = state->common.fp;
> +	struct pt_regs *regs;
> +
> +	regs = container_of((unsigned long *)(fp - sizeof(struct frame_record)),
> +			    struct pt_regs, stackframe.record.fp);
> +
> +	info = unwind_find_stack(&state->common, (unsigned long)regs,
> +				 sizeof(*regs));
> +	if (!info)
> +		return -EINVAL;
> +
> +	unwind_consume_stack(&state->common, info, (unsigned long)regs,
> +			     sizeof(*regs));
> +
> +	state->regs = regs;
> +	state->common.pc = regs->epc;
> +	state->common.fp = frame_pointer(regs);
> +	state->regs = NULL;

state->regs is a dead field, and kunwind_next_regs_pc() clears
         it in a way that contradicts both arm64 and your own
         init_from_regs.

struct kunwind_state has a `struct pt_regs *regs`, but I can't find any
reader of it anywhere in the file — arch_kunwind_consume_entry() and
arch_reliable_kunwind_consume_entry() only ever read common.pc and
source. It is written in three places:

     kunwind_init():           state->regs = NULL;
     kunwind_init_from_regs():  state->regs = regs;     /* not cleared */
     kunwind_next_regs_pc():    state->regs = regs;
                                state->common.pc = regs->epc;
                                state->common.fp = frame_pointer(regs);
                                state->regs = NULL;      /* cleared! */

Two things:

   (a) The field has no consumer, so it's currently dead.

   (b) In kunwind_next_regs_pc() you set state->regs = regs and then
       immediately reset it to NULL two lines later. The arm64 reference
       does *not* clear it there, and your own kunwind_init_from_regs()
       leaves it set. So the three REGS_PC producers disagree on whether
       ->regs is valid.

It's harmless today only because nothing reads ->regs. But the moment
someone adds a consumer (e.g. to expose the pt_regs at an exception
boundary for a reliable dump), the stray `state->regs = NULL;` in
kunwind_next_regs_pc() becomes a silent bug.

Please either:
   - drop the field and all three writes, if it's genuinely unused, or
   - keep it and remove the `state->regs = NULL;` in
     kunwind_next_regs_pc() so it matches arm64 and init_from_regs.

> +	state->source = KUNWIND_SOURCE_REGS_PC;
> +	return 0;
> +}
> +
> +/*
> + * Handle a metadata frame record embedded in pt_regs.
> + *
> + * On RISC-V, fp points above the record (fp = metadata + 16), so the
> + * frame_record_meta starts at fp - sizeof(struct frame_record).
> + *
> + * FRAME_META_TYPE_FINAL: This is the outermost exception entry
> + *   (user -> kernel). Unwinding terminates successfully.
> + * FRAME_META_TYPE_PT_REGS: This is a nested exception entry
> + *   (kernel -> kernel). Continue unwinding from the saved context.
> + */
> +static __always_inline int
> +kunwind_next_frame_record_meta(struct kunwind_state *state)
> +{
> +	struct task_struct *tsk = state->task;
> +	unsigned long fp = state->common.fp;
> +	unsigned long meta_base = fp - sizeof(struct frame_record);
> +	struct frame_record_meta *meta;
> +	struct stack_info *info;
> +
> +	info = unwind_find_stack(&state->common, meta_base, sizeof(*meta));
> +	if (!info)
> +		return -EINVAL;
> +
> +	meta = (struct frame_record_meta *)meta_base;
> +	switch (READ_ONCE(meta->type)) {
> +	case FRAME_META_TYPE_FINAL:
> +		if (meta == &task_pt_regs(tsk)->stackframe)
> +			return -ENOENT;
> +		WARN_ON_ONCE(tsk == current);
> +		return -EINVAL;
> +	case FRAME_META_TYPE_PT_REGS:
> +		return kunwind_next_regs_pc(state);
> +	default:
> +		WARN_ON_ONCE(tsk == current);
> +		return -EINVAL;
> +	}
> +}
> +
> +/*
> + * Unwind from one frame record to the next.
> + *
> + * On RISC-V, the frame record sits at fp - sizeof(struct frame_record),
> + * immediately below the address pointed to by fp/s0. This applies to both
> + * normal frame records and metadata frame records (embedded in pt_regs).
> + *
> + * A metadata record is identified by both fp and ra being zero in the
> + * frame_record portion, with a type value following at fp + 16.
> + */
> +static __always_inline int
> +kunwind_next_frame_record(struct kunwind_state *state)
> +{
> +	unsigned long fp = state->common.fp;
> +	struct frame_record *record;
> +	struct stack_info *info;
> +	unsigned long new_fp, new_pc;
> +	unsigned long record_base;
> +
> +	if (fp & 0x7)
> +		return -EINVAL;
> +
> +	record_base = fp - sizeof(*record);
> +
> +	info = unwind_find_stack(&state->common, record_base, sizeof(*record));
> +	if (!info)
> +		return -EINVAL;
> +
> +	record = (struct frame_record *)record_base;
> +	new_fp = READ_ONCE(record->fp);
> +	new_pc = READ_ONCE(record->ra);
> +
> +	if (!new_fp && !new_pc)
> +		return kunwind_next_frame_record_meta(state);
> +
> +	unwind_consume_stack(&state->common, info, record_base,
> +			     sizeof(*record));
> +
> +	state->common.fp = new_fp;
> +	state->common.pc = new_pc;
> +	state->source = KUNWIND_SOURCE_FRAME;
> +
> +	return 0;
> +}
> +
> +/*
> + * Unwind from one frame record (A) to the next frame record (B).
> + *
> + * We terminate early if the location of B indicates a malformed chain of frame
> + * records (e.g. a cycle), determined based on the location and fp value of A
> + * and the location (but not the fp value) of B.
> + */
> +static __always_inline int
> +kunwind_next(struct kunwind_state *state)
> +{
> +	int err;
> +
> +	state->flags.all = 0;
> +
> +	switch (state->source) {
> +	case KUNWIND_SOURCE_FRAME:
> +	case KUNWIND_SOURCE_CALLER:
> +	case KUNWIND_SOURCE_TASK:
> +	case KUNWIND_SOURCE_REGS_PC:
> +		err = kunwind_next_frame_record(state);
> +		break;
> +	default:
> +		err = -EINVAL;
> +	}
> +
> +	if (err)
> +		return err;
> +
> +	return kunwind_recover_return_address(state);
> +}
> +
> +typedef bool (*kunwind_consume_fn)(const struct kunwind_state *state, void *cookie);
> +
> +static __always_inline int
> +do_kunwind(struct kunwind_state *state, kunwind_consume_fn consume_state,
> +	   void *cookie)
> +{
> +	int ret;
> +
> +	ret = kunwind_recover_return_address(state);
> +	if (ret)
> +		return ret;
> +
> +	while (1) {
> +		if (!consume_state(state, cookie))
> +			return -EINVAL;
> +		ret = kunwind_next(state);
> +		if (ret == -ENOENT)
> +			return 0;
> +		if (ret < 0)
> +			return ret;
> +	}
> +}
> +
> +static __always_inline int
> +kunwind_stack_walk(kunwind_consume_fn consume_state,
> +		   void *cookie, struct task_struct *task,
> +		   struct pt_regs *regs)
> +{
> +	struct task_struct *tsk = task ?: current;
> +	struct stack_info stacks[] = {
> +		stackinfo_get_task(tsk),
> +		STACKINFO_CPU(tsk, irq),
> +#ifdef CONFIG_VMAP_STACK
> +		STACKINFO_CPU(tsk, overflow),
> +#endif
> +	};
> +	struct kunwind_state state = {
> +		.common = {
> +			.stacks = stacks,
> +			.nr_stacks = ARRAY_SIZE(stacks),
> +		},
> +	};
> +
> +	if (regs) {
> +		if (tsk != current)
> +			return -EINVAL;
> +		kunwind_init_from_regs(&state, regs);
> +	} else if (tsk == current) {
> +		kunwind_init_from_caller(&state);
> +	} else {
> +		kunwind_init_from_task(&state, tsk);
> +	}
> +
> +	return do_kunwind(&state, consume_state, cookie);
> +}
> +
> +struct kunwind_consume_entry_data {
> +	stack_trace_consume_fn consume_entry;
> +	void *cookie;
> +};
> +
> +static __always_inline bool
> +arch_kunwind_consume_entry(const struct kunwind_state *state, void *cookie)
> +{
> +	struct kunwind_consume_entry_data *data = cookie;
> +
> +	return data->consume_entry(data->cookie, state->common.pc);
> +}
> +
> +static __always_inline bool
> +arch_reliable_kunwind_consume_entry(const struct kunwind_state *state, void *cookie)
> +{
> +	/*
> +	 * At an exception boundary we can reliably consume the saved PC. We do
> +	 * not know whether the LR was live when the exception was taken, and

Nit: s/LR/ra/ here. RISC-V has no link register; the equivalent is the
return-address register ra (x1). You already localized this correctly in
the kr_cur docstring ("replacement ra value"), so this comment is just an
oversight.

Thanks.
Shuai




More information about the linux-riscv mailing list