[PATCH v2 3/3] arm64: escalate smp_send_stop() to an SDEI NMI as a last resort

Kiryl Shutsemau kirill at shutemov.name
Tue Jun 9 06:58:35 PDT 2026


From: "Kiryl Shutsemau (Meta)" <kas at kernel.org>

A CPU wedged with interrupts masked ignores the stop IPI, and without
pseudo-NMI there is no NMI IPI to escalate to: a reboot proceeds with
the CPU still running, and a kdump misses its registers.

Add a third rung to smp_send_stop()'s escalation: signal SDEI event 0
at whatever is still online after the IPI rung (and the pseudo-NMI IPI
rung, if enabled). The handler routes like the IPI handlers do --
crash_stop distinguishes a kdump crash stop (crash_save_cpu() on the
wedged context) from a plain stop -- and the CPU acks by marking
itself offline, which the caller already polls for.

arm64_nmi_cpu_stop() lives in smp.c rather than the SDEI provider
because it needs the crash_stop discriminator and shares its shape with
ipi_cpu_crash_stop(); it is non-static (declared in asm/nmi.h) only so
the provider's event-0 handler, which owns the trigger, can route into
it.

Two differences from an IPI-stopped CPU: the SDEI event is never
completed, since completing it would resume the wedged context, so EL3
retains the event's dispatch slot until reset; and the CPU parks
instead of trying PSCI CPU_OFF, which must not be called from inside
an unfinished SDEI event.

Signed-off-by: Kiryl Shutsemau (Meta) <kas at kernel.org>
---
 arch/arm64/include/asm/nmi.h    | 14 +++++++
 arch/arm64/kernel/smp.c         | 53 ++++++++++++++++++++++++
 drivers/firmware/Kconfig        |  2 +
 drivers/firmware/arm_sdei_nmi.c | 71 +++++++++++++++++++++++++++++++++
 4 files changed, 140 insertions(+)

diff --git a/arch/arm64/include/asm/nmi.h b/arch/arm64/include/asm/nmi.h
index 9366be419d18..2a9e6065f7af 100644
--- a/arch/arm64/include/asm/nmi.h
+++ b/arch/arm64/include/asm/nmi.h
@@ -4,21 +4,35 @@
 
 #include <linux/cpumask.h>
 
+struct pt_regs;
+
 /*
  * Cross-CPU NMI provider hooks, consulted by the arm64 arch code before
  * its regular-IRQ / pseudo-NMI IPI paths. The SDEI provider in
  * drivers/firmware/arm_sdei_nmi.c implements them when active; a future
  * FEAT_NMI provider could slot in here too. The stubs let callers stay
  * unconditional when ARM_SDEI_NMI is off.
+ *
+ * arm64_nmi_cpu_stop() is the reverse direction: the arch entry point
+ * (arch/arm64/kernel/smp.c) that the provider's NMI handler routes a
+ * stop request into.
  */
 #ifdef CONFIG_ARM_SDEI_NMI
 bool sdei_nmi_trigger_cpumask_backtrace(const cpumask_t *mask, int exclude_cpu);
+bool sdei_nmi_stop_cpus(const cpumask_t *mask);
+
+void __noreturn arm64_nmi_cpu_stop(struct pt_regs *regs);
 #else
 static inline bool sdei_nmi_trigger_cpumask_backtrace(const cpumask_t *mask,
 						      int exclude_cpu)
 {
 	return false;
 }
+
+static inline bool sdei_nmi_stop_cpus(const cpumask_t *mask)
+{
+	return false;
+}
 #endif
 
 #endif /* __ASM_NMI_H */
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index a670434a8cae..1af7fdae48db 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -33,6 +33,7 @@
 #include <linux/kernel_stat.h>
 #include <linux/kexec.h>
 #include <linux/kgdb.h>
+#include <linux/kprobes.h>
 #include <linux/kvm_host.h>
 #include <linux/nmi.h>
 
@@ -910,6 +911,35 @@ static void __noreturn ipi_cpu_crash_stop(unsigned int cpu, struct pt_regs *regs
 #endif
 }
 
+#ifdef CONFIG_ARM_SDEI_NMI
+/*
+ * Stop entry for the SDEI cross-CPU NMI service: its event-0 handler
+ * lands here when this CPU was asked to stop. The bookkeeping mirrors
+ * the IPI_CPU_STOP{,_NMI} handling; the park happens inside the SDEI
+ * event, which is never completed -- completing it would have firmware
+ * resume the interrupted (typically wedged) context. No PSCI CPU_OFF
+ * either: powering off a PE that EL3 still considers mid-event invites
+ * firmware trouble.
+ */
+void __noreturn arm64_nmi_cpu_stop(struct pt_regs *regs)
+{
+	unsigned int cpu = smp_processor_id();
+
+	local_daif_mask();
+
+	if (IS_ENABLED(CONFIG_KEXEC_CORE) && crash_stop)
+		crash_save_cpu(regs, cpu);
+
+	/* Going offline is the ack that smp_send_stop() polls for. */
+	set_cpu_online(cpu, false);
+
+	sdei_mask_local_cpu();
+
+	cpu_park_loop();
+}
+NOKPROBE_SYMBOL(arm64_nmi_cpu_stop);
+#endif
+
 static void arm64_send_ipi(const cpumask_t *mask, unsigned int nr)
 {
 	unsigned int cpu;
@@ -1263,6 +1293,29 @@ void smp_send_stop(void)
 			udelay(1);
 	}
 
+	/*
+	 * If CPUs are *still* online, try the SDEI cross-CPU NMI. Firmware
+	 * delivers it regardless of the target's DAIF state, so it reaches
+	 * a CPU spinning with interrupts masked, which neither rung above
+	 * could (without pseudo-NMI there is no NMI rung at all). Allow
+	 * 100ms: a firmware round-trip per CPU, with headroom.
+	 */
+	if (num_other_online_cpus()) {
+		/* re-snapshot after the rungs above took CPUs offline */
+		smp_rmb();
+		cpumask_copy(&mask, cpu_online_mask);
+		cpumask_clear_cpu(smp_processor_id(), &mask);
+
+		if (sdei_nmi_stop_cpus(&mask)) {
+			pr_info("SMP: retry stop with SDEI NMI for CPUs %*pbl\n",
+				cpumask_pr_args(&mask));
+
+			timeout = USEC_PER_MSEC * 100;
+			while (num_other_online_cpus() && timeout--)
+				udelay(1);
+		}
+	}
+
 	if (num_other_online_cpus()) {
 		smp_rmb();
 		cpumask_copy(&mask, cpu_online_mask);
diff --git a/drivers/firmware/Kconfig b/drivers/firmware/Kconfig
index 6501087ff90d..ab0ee36d46e7 100644
--- a/drivers/firmware/Kconfig
+++ b/drivers/firmware/Kconfig
@@ -46,6 +46,8 @@ config ARM_SDEI_NMI
 	    - arch_trigger_cpumask_backtrace()  (sysrq-l, RCU stalls,
 	      hardlockup_all_cpu_backtrace, soft-lockup secondary dumps,
 	      hung-task auxiliary dumps)
+	    - smp_send_stop() escalation         (reboot/halt and the
+	      panic / kdump crash stop)
 
 	  The driver registers a handler for the SDEI software-signalled
 	  event (event 0) and reaches a target CPU by signalling it with
diff --git a/drivers/firmware/arm_sdei_nmi.c b/drivers/firmware/arm_sdei_nmi.c
index a82776e7b55a..b34ea42cfe5c 100644
--- a/drivers/firmware/arm_sdei_nmi.c
+++ b/drivers/firmware/arm_sdei_nmi.c
@@ -29,6 +29,11 @@
  *     hardlockup_all_cpu_backtrace, soft-lockup/hung-task secondary
  *     dumps all reach interrupt-masked CPUs.
  *
+ *   - sdei_nmi_stop_cpus() — the last rung of smp_send_stop()'s
+ *     escalation (reboot/halt and the panic/kdump crash stop alike),
+ *     reaching CPUs that ignored the stop IPIs; on the kdump path the
+ *     wedged context is captured into the vmcore before the CPU parks.
+ *
  * Delivery uses the standard SDEI software-signalled event (event 0) and
  * SDEI_EVENT_SIGNAL. We register a handler for event 0, enable it, and
  * poke a target CPU with sdei_event_signal(0, mpidr): firmware makes
@@ -59,8 +64,45 @@ static bool sdei_nmi_available;
 
 #define SDEI_NMI_EVENT			0
 
+/*
+ * Stop-request dispatch lives on the same SDEI event 0 as everything
+ * else. The requesting CPU sets each target's bit in sdei_nmi_stop_mask
+ * before signalling event 0; the target's handler test-and-clears its
+ * bit and hands the CPU to arm64_nmi_cpu_stop(), which saves crash
+ * state when the stop is a kdump crash-stop, marks the CPU offline
+ * (which is what the requester polls for) and parks it.
+ *
+ * This mirrors the cpumask the framework's nmi_cpu_backtrace() consults
+ * just below; using a shared mask rather than a separate SDEI event
+ * avoids registering additional events with firmware.
+ */
+static cpumask_t sdei_nmi_stop_mask;
+
 static int sdei_nmi_handler(u32 event, struct pt_regs *regs, void *arg)
 {
+	int cpu = smp_processor_id();
+
+	if (cpumask_test_and_clear_cpu(cpu, &sdei_nmi_stop_mask)) {
+		/*
+		 * Never returns, and deliberately never completes the SDEI
+		 * event: SDEI_EVENT_COMPLETE has firmware restore the
+		 * interrupted context, which would land the CPU back in
+		 * the wedged loop (or in do_idle, which BUGs at
+		 * cpuhp_report_idle_dead once it sees itself offline).
+		 * Returning a modified pt_regs doesn't help --
+		 * arch/arm64/kernel/sdei.c::do_sdei_event only honours a PC
+		 * override via its IRQ-state heuristic and otherwise hands
+		 * EL3 its own saved-context slot back.
+		 *
+		 * Trade-off: EL3 retains ~one saved-context slot per parked
+		 * CPU until the next hardware reset (~hundreds of bytes per
+		 * CPU). Recoverability is unchanged versus an IPI-stopped
+		 * CPU: neither comes back without a reset.
+		 */
+		arm64_nmi_cpu_stop(regs);
+		/* unreachable */
+	}
+
 	/*
 	 * nmi_cpu_backtrace() no-ops unless this CPU's bit is set in the
 	 * global backtrace mask (driven by nmi_trigger_cpumask_backtrace()),
@@ -115,6 +157,35 @@ bool sdei_nmi_trigger_cpumask_backtrace(const cpumask_t *mask, int exclude_cpu)
 	return true;
 }
 
+/*
+ * Last rung of the stop escalation in smp_send_stop() (see
+ * arch/arm64/kernel/smp.c). The caller runs the regular stop IPI (and
+ * the pseudo-NMI stop IPI, where available) first; @mask holds whatever
+ * stayed online through those -- typically CPUs wedged with interrupts
+ * masked, unreachable by an IPI. Set each target's stop-request flag and
+ * signal event 0 at it; a target acks by marking itself offline, which
+ * the caller polls for.
+ *
+ * Returns false when SDEI isn't active, so the caller can skip the wait.
+ */
+bool sdei_nmi_stop_cpus(const cpumask_t *mask)
+{
+	unsigned int cpu;
+
+	if (!sdei_nmi_available)
+		return false;
+
+	cpumask_or(&sdei_nmi_stop_mask, &sdei_nmi_stop_mask, mask);
+
+	/* Publish the bits before the targets' handlers test them. */
+	smp_wmb();
+
+	for_each_cpu(cpu, mask)
+		sdei_nmi_fire(cpu);
+
+	return true;
+}
+
 /*
  * device_initcall (after arch_initcall(sdei_init), so the SDEI subsystem
  * is up): probe the firmware, register the event, and turn on the
-- 
2.54.0




More information about the kexec mailing list