[PATCH v3 9/9] riscv: kexec: Route normal kexec through the trampoline page table

fangyu.yu at linux.alibaba.com fangyu.yu at linux.alibaba.com
Thu Jun 4 06:24:18 PDT 2026


From: Fangyu Yu <fangyu.yu at linux.alibaba.com>

riscv_kexec_relocate (copied into control_code_buffer) uses an stvec
trick to drop the MMU and land on the PA of the next loop label.
Under VS-mode, KVM cannot emulate this single-step transition and the
VCPU dies with "kvm run failed Operation not supported".

Route normal kexec through riscv_kexec_relocate_entry, the trampoline
wrapper added in the previous patch. It drops SATP with PC already on
a PA, then hands off to control_code_buffer where the relocate body
runs with SATP=0.

Drop the stvec trick from the relocate body and pass first_ind_entry
as a physical address, since the body now starts with SATP=0. The
".align 2" plus filler "nop" ensured that the PA of the loop top was
4-byte aligned. That was required because the legacy stvec trick wrote
that PA into stvec, whose low two bits hold MODE rather than BASE and
are therefore discarded by the hardware when forming the target
address. The alignment is no longer load-bearing, so both are removed
as well.

Signed-off-by: Fangyu Yu <fangyu.yu at linux.alibaba.com>
---
 arch/riscv/kernel/kexec_relocate.S | 26 ++++++--------------------
 arch/riscv/kernel/machine_kexec.c  | 27 +++++++++++++++++++--------
 2 files changed, 25 insertions(+), 28 deletions(-)

diff --git a/arch/riscv/kernel/kexec_relocate.S b/arch/riscv/kernel/kexec_relocate.S
index 6c624560c9ac..7ffb83ea45fc 100644
--- a/arch/riscv/kernel/kexec_relocate.S
+++ b/arch/riscv/kernel/kexec_relocate.S
@@ -34,27 +34,13 @@ SYM_CODE_START(riscv_kexec_relocate)
 	csrw	CSR_SIP, zero
 
 	/*
-	 * When we switch SATP.MODE to "Bare" we'll only
-	 * play with physical addresses. However the first time
-	 * we try to jump somewhere, the offset on the jump
-	 * will be relative to pc which will still be on VA. To
-	 * deal with this we set stvec to the physical address at
-	 * the start of the loop below so that we jump there in
-	 * any case.
+	 * The trampoline wrapper (riscv_kexec_relocate_entry) has already
+	 * dropped the MMU and handed control to us at this PA copy of the
+	 * relocate code. From here on the entire loop runs with SATP=0 and
+	 * every address (s0, s5, source/dest pointers) is a physical one.
 	 */
-	la	s6, 1f
-	sub	s6, s6, s4
-	csrw	CSR_STVEC, s6
-
-	/*
-	 * With C-extension, here we get 42 Bytes and the next
-	 * .align directive would pad zeros here up to 44 Bytes.
-	 * So manually put a nop here to avoid zeros padding.
-	*/
-	nop
 
 	/* Process entries in a loop */
-.align 2
 1:
 	REG_L	t0, 0(s0)		/* t0 = *image->entry */
 	addi	s0, s0, RISCV_SZPTR	/* image->entry++ */
@@ -70,8 +56,8 @@ SYM_CODE_START(riscv_kexec_relocate)
 	andi	t1, t0, 0x2
 	beqz	t1, 2f
 	andi	s0, t0, ~0x2
-	csrw	CSR_SATP, zero
-	jr	s6
+	/* MMU is already off; the entry wrapper handled the transition. */
+	j	1b
 
 2:
 	/* IND_DONE entry ? -> jump to done label */
diff --git a/arch/riscv/kernel/machine_kexec.c b/arch/riscv/kernel/machine_kexec.c
index 71688c63af65..82fcb84a03ec 100644
--- a/arch/riscv/kernel/machine_kexec.c
+++ b/arch/riscv/kernel/machine_kexec.c
@@ -164,9 +164,6 @@ machine_kexec_prepare(struct kimage *image)
 		memcpy(control_code_buffer, riscv_kexec_relocate,
 			riscv_kexec_relocate_size);
 
-		/* Mark the control page executable */
-		set_memory_x((unsigned long) control_code_buffer, 1);
-
 		WRITE_ONCE(riscv_kexec_relocate_entry_pa,
 			   __pa_symbol(&riscv_kexec_relocate_entry));
 	} else {
@@ -262,11 +259,15 @@ machine_kexec(struct kimage *image)
 {
 	struct kimage_arch *internal = &image->arch;
 	unsigned long jump_addr = (unsigned long) image->start;
-	unsigned long first_ind_entry = (unsigned long) &image->head;
+	/*
+	 * The relocate body runs entirely with the MMU off (the wrapper
+	 * drops SATP before jumping into control_code_buffer), so the very
+	 * first entry must be a physical address.
+	 */
+	unsigned long first_ind_entry = __pa(&image->head);
 	unsigned long this_cpu_id = __smp_processor_id();
 	unsigned long this_hart_id = cpuid_to_hartid_map(this_cpu_id);
 	unsigned long fdt_addr = internal->fdt_addr;
-	void *control_code_buffer = page_address(image->control_code_page);
 	riscv_kexec_method kexec_method = NULL;
 
 #ifdef CONFIG_SMP
@@ -274,10 +275,20 @@ machine_kexec(struct kimage *image)
 		"Some CPUs may be stale, kdump will be unreliable.\n");
 #endif
 
-	if (image->type != KEXEC_TYPE_CRASH)
-		kexec_method = control_code_buffer;
-	else
+	if (image->type != KEXEC_TYPE_CRASH) {
+		kexec_method = (riscv_kexec_method) &riscv_kexec_relocate_entry;
+		/*
+		 * Publish the per-image control_code_buffer PA at dispatch
+		 * time rather than in machine_kexec_prepare(). machine_kexec()
+		 * only runs once the image has been fully loaded and committed
+		 * as kexec_image, so the global cannot be left pointing at a
+		 * page freed by a failed load.
+		 */
+		WRITE_ONCE(riscv_kexec_cc_buffer_pa,
+			   __pa(page_address(image->control_code_page)));
+	} else {
 		kexec_method = (riscv_kexec_method) &riscv_kexec_norelocate;
+	}
 
 	pr_notice("Will call new kernel at %08lx from hart id %lx\n",
 		  jump_addr, this_hart_id);
-- 
2.50.1




More information about the kexec mailing list