[PATCH] RISC-V: store precomputed percpu_offset in the task struct

Radim Krčmář rkrcmar at ventanamicro.com
Thu Jul 10 09:40:02 PDT 2025


2025-07-10T19:47:27+08:00, yunhui cui <cuiyunhui at bytedance.com>:
> On Thu, Jul 10, 2025 at 2:35 PM Radim Krčmář <rkrcmar at ventanamicro.com> wrote:
>> It would be to store the percpu offset in CSR_SCRATCH permanently, do
>> the early exception register shuffling with a percpu area storage, and
>> load the thread pointer from there as well.
>> That method would also eliminate writing CSR_SCRATCH on every exception
>> entry+exit, so maybe it makes sense to try it even if CSRs are slow...
>
> Based on the patch, optimizations for percpu offset have been added,
> with the following data:
> 6.989 7.046 6.976 6.986 7.001 7.017 7.007 7.064 7.008 7.039
> Geometric mean: 7.013248303
> Compared to reusing the scratch register, the performance has improved
> by approximately 0.7%.

Nice, thanks.  The CSR_SCRATCH accesses seem much slower than GPRs, and
possibly even slower than L1 hit -- we might gain more by storing the
precomputed offset in the task struct.

Can you check this patch as well?

(It should be compared against a variant of CSR_SCRATCH that uses the
 TASK_TI_PERCPU_OFFSET optimizations, but we can try to interpolate. :])

---8<---
RISC-V: store precomputed percpu_offset in the task struct

Exploring the memoization trade-off... hoping that __set_task_cpu covers
everything. :)

I didn't put any thought into where the percpu_offset should live, and
the naive approach is to put it next to cpu.
This needs more work to not break build on other arches, because I
directly added RISC-V specific code to __set_task_cpu, to save time
figuring out where else it could be.
---
 arch/riscv/include/asm/asm.h         | 6 +-----
 arch/riscv/include/asm/percpu.h      | 8 ++++++++
 arch/riscv/include/asm/thread_info.h | 3 ++-
 arch/riscv/kernel/asm-offsets.c      | 1 +
 arch/riscv/kernel/smpboot.c          | 6 ++++++
 kernel/sched/sched.h                 | 1 +
 6 files changed, 19 insertions(+), 6 deletions(-)
 create mode 100644 arch/riscv/include/asm/percpu.h

diff --git a/arch/riscv/include/asm/asm.h b/arch/riscv/include/asm/asm.h
index a8a2af6dfe9d..2a6b831d9cdf 100644
--- a/arch/riscv/include/asm/asm.h
+++ b/arch/riscv/include/asm/asm.h
@@ -91,11 +91,7 @@
 #endif
 
 .macro asm_per_cpu dst sym tmp
-	REG_L \tmp, TASK_TI_CPU_NUM(tp)
-	slli  \tmp, \tmp, PER_CPU_OFFSET_SHIFT
-	la    \dst, __per_cpu_offset
-	add   \dst, \dst, \tmp
-	REG_L \tmp, 0(\dst)
+	REG_L \tmp, TASK_TI_PERCPU_OFFSET(tp)
 	la    \dst, \sym
 	add   \dst, \dst, \tmp
 .endm
diff --git a/arch/riscv/include/asm/percpu.h b/arch/riscv/include/asm/percpu.h
new file mode 100644
index 000000000000..c37a0fce6ebc
--- /dev/null
+++ b/arch/riscv/include/asm/percpu.h
@@ -0,0 +1,8 @@
+#ifndef __ASM_PERCPU_H
+#define __ASM_PERCPU_H
+
+#define __my_cpu_offset (current_thread_info()->percpu_offset)
+
+#include <asm-generic/percpu.h>
+
+#endif
diff --git a/arch/riscv/include/asm/thread_info.h b/arch/riscv/include/asm/thread_info.h
index f5916a70879a..da776b7a1d02 100644
--- a/arch/riscv/include/asm/thread_info.h
+++ b/arch/riscv/include/asm/thread_info.h
@@ -60,8 +60,9 @@ struct thread_info {
 	 */
 	long			kernel_sp;	/* Kernel stack pointer */
 	long			user_sp;	/* User stack pointer */
-	int			cpu;
+	int			cpu;		// TODO: could be packed better
 	unsigned long		syscall_work;	/* SYSCALL_WORK_ flags */
+	unsigned long		percpu_offset;	// XXX: randomly placed here
 #ifdef CONFIG_SHADOW_CALL_STACK
 	void			*scs_base;
 	void			*scs_sp;
diff --git a/arch/riscv/kernel/asm-offsets.c b/arch/riscv/kernel/asm-offsets.c
index 6e8c0d6feae9..9c7bb4d7e3b3 100644
--- a/arch/riscv/kernel/asm-offsets.c
+++ b/arch/riscv/kernel/asm-offsets.c
@@ -50,6 +50,7 @@ void asm_offsets(void)
 #endif
 
 	OFFSET(TASK_TI_CPU_NUM, task_struct, thread_info.cpu);
+	OFFSET(TASK_TI_PERCPU_OFFSET, task_struct, thread_info.percpu_offset);
 	OFFSET(TASK_THREAD_F0,  task_struct, thread.fstate.f[0]);
 	OFFSET(TASK_THREAD_F1,  task_struct, thread.fstate.f[1]);
 	OFFSET(TASK_THREAD_F2,  task_struct, thread.fstate.f[2]);
diff --git a/arch/riscv/kernel/smpboot.c b/arch/riscv/kernel/smpboot.c
index 601a321e0f17..3c09c8f3e30c 100644
--- a/arch/riscv/kernel/smpboot.c
+++ b/arch/riscv/kernel/smpboot.c
@@ -41,6 +41,11 @@
 
 static DECLARE_COMPLETION(cpu_running);
 
+void __init smp_prepare_boot_cpu(void)
+{
+	current_thread_info()->percpu_offset = per_cpu_offset(smp_processor_id());
+}
+
 void __init smp_prepare_cpus(unsigned int max_cpus)
 {
 	int cpuid;
@@ -183,6 +188,7 @@ int __cpu_up(unsigned int cpu, struct task_struct *tidle)
 {
 	int ret = 0;
 	tidle->thread_info.cpu = cpu;
+	tidle->thread_info.percpu_offset = per_cpu_offset(cpu);
 
 	ret = start_secondary_cpu(cpu, tidle);
 	if (!ret) {
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 475bb5998295..2180a85b1403 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2199,6 +2199,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
 	 */
 	smp_wmb();
 	WRITE_ONCE(task_thread_info(p)->cpu, cpu);
+	WRITE_ONCE(task_thread_info(p)->percpu_offset, per_cpu_offset(cpu));
 	p->wake_cpu = cpu;
 #endif
 }



More information about the linux-riscv mailing list