[PATCH v2 3/3] arm64: escalate smp_send_stop() to an SDEI NMI as a last resort
Kiryl Shutsemau
kirill at shutemov.name
Tue Jun 9 06:58:35 PDT 2026
From: "Kiryl Shutsemau (Meta)" <kas at kernel.org>
A CPU wedged with interrupts masked ignores the stop IPI, and without
pseudo-NMI there is no NMI IPI to escalate to: a reboot proceeds with
the CPU still running, and a kdump misses its registers.
Add a third rung to smp_send_stop()'s escalation: signal SDEI event 0
at whatever is still online after the IPI (and pseudo-NMI IPI, if
enabled) rungs. The handler routes like the IPI handlers do --
crash_stop distinguishes a kdump crash stop (crash_save_cpu() on the
wedged context) from a plain stop -- and the CPU acks by marking
itself offline, which the caller already polls.
arm64_nmi_cpu_stop() lives in smp.c rather than the SDEI provider
because it needs the crash_stop discriminator and shares its shape with
ipi_cpu_crash_stop(); it is non-static (declared in asm/nmi.h, with no
EXPORT_SYMBOL) only so the provider's event-0 handler, which owns the
trigger, can route into it.
Two differences from an IPI-stopped CPU: the SDEI event is never
completed, since completing it would resume the wedged context, so EL3
retains the event's dispatch slot until reset; and the CPU parks
instead of trying PSCI CPU_OFF, which must not be called from inside
an unfinished SDEI event.
Signed-off-by: Kiryl Shutsemau (Meta) <kas at kernel.org>
---
arch/arm64/include/asm/nmi.h | 14 +++++++
arch/arm64/kernel/smp.c | 53 ++++++++++++++++++++++++
drivers/firmware/Kconfig | 2 +
drivers/firmware/arm_sdei_nmi.c | 71 +++++++++++++++++++++++++++++++++
4 files changed, 140 insertions(+)
diff --git a/arch/arm64/include/asm/nmi.h b/arch/arm64/include/asm/nmi.h
index 9366be419d18..2a9e6065f7af 100644
--- a/arch/arm64/include/asm/nmi.h
+++ b/arch/arm64/include/asm/nmi.h
@@ -4,21 +4,35 @@
#include <linux/cpumask.h>
+struct pt_regs;
+
/*
* Cross-CPU NMI provider hooks, consulted by the arm64 arch code before
* its regular-IRQ / pseudo-NMI IPI paths. The SDEI provider in
* drivers/firmware/arm_sdei_nmi.c implements them when active; a future
* FEAT_NMI provider could slot in here too. The stubs let callers stay
* unconditional when ARM_SDEI_NMI is off.
+ *
+ * arm64_nmi_cpu_stop() is the reverse direction: the arch entry point
+ * (arch/arm64/kernel/smp.c) that the provider's NMI handler routes a
+ * stop request into.
*/
#ifdef CONFIG_ARM_SDEI_NMI
bool sdei_nmi_trigger_cpumask_backtrace(const cpumask_t *mask, int exclude_cpu);
+bool sdei_nmi_stop_cpus(const cpumask_t *mask);
+
+void __noreturn arm64_nmi_cpu_stop(struct pt_regs *regs);
#else
static inline bool sdei_nmi_trigger_cpumask_backtrace(const cpumask_t *mask,
int exclude_cpu)
{
return false;
}
+
+static inline bool sdei_nmi_stop_cpus(const cpumask_t *mask)
+{
+ return false; /* no provider built in: nothing was signalled */
+}
#endif
#endif /* __ASM_NMI_H */
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index a670434a8cae..1af7fdae48db 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -33,6 +33,7 @@
#include <linux/kernel_stat.h>
#include <linux/kexec.h>
#include <linux/kgdb.h>
+#include <linux/kprobes.h>
#include <linux/kvm_host.h>
#include <linux/nmi.h>
@@ -910,6 +911,35 @@ static void __noreturn ipi_cpu_crash_stop(unsigned int cpu, struct pt_regs *regs
#endif
}
+#ifdef CONFIG_ARM_SDEI_NMI
+/*
+ * Stop entry for the SDEI cross-CPU NMI service, reached from its event-0
+ * handler when this CPU was asked to stop; @regs is the interrupted
+ * context. Mirrors the IPI_CPU_STOP{,_NMI} bookkeeping, but parks inside
+ * the SDEI event, which is never completed -- completing it would have
+ * firmware resume the interrupted (typically wedged) context. No PSCI
+ * CPU_OFF either: powering off a PE that EL3 still considers mid-event
+ * invites firmware trouble.
+ */
+void __noreturn arm64_nmi_cpu_stop(struct pt_regs *regs)
+{
+ unsigned int cpu = smp_processor_id();
+
+ local_daif_mask();
+
+ if (IS_ENABLED(CONFIG_KEXEC_CORE) && crash_stop)
+ crash_save_cpu(regs, cpu);
+
+ /* Going offline is the ack that smp_send_stop() polls for. */
+ set_cpu_online(cpu, false);
+
+ sdei_mask_local_cpu();
+
+ cpu_park_loop();
+}
+NOKPROBE_SYMBOL(arm64_nmi_cpu_stop);
+#endif
+
static void arm64_send_ipi(const cpumask_t *mask, unsigned int nr)
{
unsigned int cpu;
@@ -1263,6 +1293,29 @@ void smp_send_stop(void)
udelay(1);
}
+ /*
+ * If CPUs are *still* online, try the SDEI cross-CPU NMI. Firmware
+ * delivers it regardless of the target's DAIF state, so it reaches
+ * a CPU spinning with interrupts masked, which neither rung above
+ * could (without pseudo-NMI there is no NMI rung at all). Allow
+ * 100ms: a firmware round-trip per CPU, with headroom.
+ */
+ if (num_other_online_cpus()) {
+ /* re-snapshot after the rungs above took CPUs offline */
+ smp_rmb();
+ cpumask_copy(&mask, cpu_online_mask);
+ cpumask_clear_cpu(smp_processor_id(), &mask);
+
+ if (sdei_nmi_stop_cpus(&mask)) {
+ pr_info("SMP: retry stop with SDEI NMI for CPUs %*pbl\n",
+ cpumask_pr_args(&mask));
+
+ timeout = USEC_PER_MSEC * 100;
+ while (num_other_online_cpus() && timeout--)
+ udelay(1);
+ }
+ }
+
if (num_other_online_cpus()) {
smp_rmb();
cpumask_copy(&mask, cpu_online_mask);
diff --git a/drivers/firmware/Kconfig b/drivers/firmware/Kconfig
index 6501087ff90d..ab0ee36d46e7 100644
--- a/drivers/firmware/Kconfig
+++ b/drivers/firmware/Kconfig
@@ -46,6 +46,8 @@ config ARM_SDEI_NMI
- arch_trigger_cpumask_backtrace() (sysrq-l, RCU stalls,
hardlockup_all_cpu_backtrace, soft-lockup secondary dumps,
hung-task auxiliary dumps)
+ - smp_send_stop() escalation (reboot/halt and the
+ panic / kdump crash stop)
The driver registers a handler for the SDEI software-signalled
event (event 0) and reaches a target CPU by signalling it with
diff --git a/drivers/firmware/arm_sdei_nmi.c b/drivers/firmware/arm_sdei_nmi.c
index a82776e7b55a..b34ea42cfe5c 100644
--- a/drivers/firmware/arm_sdei_nmi.c
+++ b/drivers/firmware/arm_sdei_nmi.c
@@ -29,6 +29,11 @@
* hardlockup_all_cpu_backtrace, soft-lockup/hung-task secondary
* dumps all reach interrupt-masked CPUs.
*
+ * - sdei_nmi_stop_cpus() — the last rung of smp_send_stop()'s
+ * escalation (reboot/halt and the panic/kdump crash stop alike),
+ * reaching CPUs that ignored the stop IPIs; on the kdump path the
+ * wedged context is captured into the vmcore before the CPU parks.
+ *
* Delivery uses the standard SDEI software-signalled event (event 0) and
* SDEI_EVENT_SIGNAL. We register a handler for event 0, enable it, and
* poke a target CPU with sdei_event_signal(0, mpidr): firmware makes
@@ -59,8 +64,45 @@ static bool sdei_nmi_available;
#define SDEI_NMI_EVENT 0
+/*
+ * Stop-request dispatch lives on the same SDEI event 0 as everything
+ * else. The requesting CPU sets each target's bit in sdei_nmi_stop_mask
+ * before signalling event 0; the target's handler test-and-clears its
+ * bit and hands the CPU to arm64_nmi_cpu_stop(), which saves crash
+ * state when the stop is a kdump crash-stop, marks the CPU offline
+ * (which is what the requester polls for) and parks it.
+ *
+ * This mirrors the cpumask that the framework's nmi_cpu_backtrace()
+ * consults in the handler below; using a shared mask rather than a
+ * separate SDEI event avoids registering further events with firmware.
+ */
+static cpumask_t sdei_nmi_stop_mask;
+
static int sdei_nmi_handler(u32 event, struct pt_regs *regs, void *arg)
{
+ int cpu = smp_processor_id();
+
+ if (cpumask_test_and_clear_cpu(cpu, &sdei_nmi_stop_mask)) {
+ /*
+ * Never returns, and deliberately never completes the SDEI
+ * event: SDEI_EVENT_COMPLETE has firmware restore the
+ * interrupted context, which would land the CPU back in
+ * the wedged loop (or in do_idle, which BUGs at
+ * cpuhp_report_idle_dead once it sees itself offline).
+ * Returning a modified pt_regs doesn't help --
+ * arch/arm64/kernel/sdei.c::do_sdei_event only honours a PC
+ * override via its IRQ-state heuristic and otherwise hands
+ * EL3 its own saved-context slot back.
+ *
+ * Trade-off: EL3 retains ~one saved-context slot per parked
+ * CPU until the next hardware reset (~hundreds of bytes per
+ * CPU). Recoverability is unchanged versus an IPI-stopped
+ * CPU: neither comes back without a reset.
+ */
+ arm64_nmi_cpu_stop(regs);
+ /* unreachable */
+ }
+
/*
* nmi_cpu_backtrace() no-ops unless this CPU's bit is set in the
* global backtrace mask (driven by nmi_trigger_cpumask_backtrace()),
@@ -115,6 +157,35 @@ bool sdei_nmi_trigger_cpumask_backtrace(const cpumask_t *mask, int exclude_cpu)
return true;
}
+/*
+ * Last rung of the stop escalation in arm64's smp_send_stop(). The caller
+ * tries the regular stop IPI (and the pseudo-NMI stop IPI, where
+ * available) first; @mask holds whatever stayed online through those --
+ * typically CPUs wedged with interrupts masked. Sets each target's bit in
+ * sdei_nmi_stop_mask, then signals event 0 at it; a target acks by
+ * marking itself offline, which the caller polls for.
+ *
+ * Returns false when SDEI isn't active so the caller can skip the wait;
+ * true means signalling was attempted, not that any target has stopped.
+ */
+bool sdei_nmi_stop_cpus(const cpumask_t *mask)
+{
+ unsigned int cpu;
+
+ if (!sdei_nmi_available)
+ return false;
+
+ cpumask_or(&sdei_nmi_stop_mask, &sdei_nmi_stop_mask, mask);
+
+ /* Publish the stop bits before the targets' handlers can read them. */
+ smp_wmb();
+
+ for_each_cpu(cpu, mask)
+ sdei_nmi_fire(cpu);
+
+ return true;
+}
+
/*
* device_initcall (after arch_initcall(sdei_init), so the SDEI subsystem
* is up): probe the firmware, register the event, and turn on the
--
2.54.0
More information about the kexec
mailing list