[PATCHv3 14/14] x86/acpi: Add support for CPU offlining for ACPI MADT wakeup method

kirill.shutemov at linux.intel.com kirill.shutemov at linux.intel.com
Fri Dec 1 07:57:39 PST 2023


On Thu, Nov 23, 2023 at 09:38:13AM +0000, Huang, Kai wrote:
> 
> > diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
> > index 171d86fe71ef..602b5d3982ff 100644
> > --- a/arch/x86/kernel/acpi/boot.c
> > +++ b/arch/x86/kernel/acpi/boot.c
> > @@ -22,6 +22,7 @@
> >  #include <linux/efi-bgrt.h>
> >  #include <linux/serial_core.h>
> >  #include <linux/pgtable.h>
> > +#include <linux/sched/hotplug.h>
> >  
> >  #include <asm/e820/api.h>
> >  #include <asm/irqdomain.h>
> > @@ -33,6 +34,7 @@
> >  #include <asm/smp.h>
> >  #include <asm/i8259.h>
> >  #include <asm/setup.h>
> > +#include <asm/init.h>
> 
> The above two are leftovers I believe?
> 
> [...]
> 

Right.


> > +
> > +static atomic_t waiting_for_crash_ipi;
> > +
> > +void asm_acpi_mp_play_dead(u64 reset_vector, u64 pgd_pa);
> > +
> > +static void acpi_mp_play_dead(void)
> > +{
> > +	play_dead_common();
> > +	asm_acpi_mp_play_dead(acpi_mp_reset_vector_paddr,
> > +			      acpi_mp_pgd);
> > +}
> > +
> > +static void acpi_mp_cpu_die(unsigned int cpu)
> > +{
> > +	u32 apicid = per_cpu(x86_cpu_to_apicid, cpu);
> > +	unsigned long timeout;
> > +
> > +	/*
> > +	 * Use TEST mailbox command to prove that BIOS got control over
> > +	 * the CPU before declaring it dead.
> > +	 *
> > +	 * BIOS has to clear 'command' field of the mailbox.
> > +	 */
> > +	acpi_mp_wake_mailbox->apic_id = apicid;
> > +	smp_store_release(&acpi_mp_wake_mailbox->command,
> > +			  ACPI_MP_WAKE_COMMAND_TEST);
> > +
> > +	/* Don't wait longer than a second. */
> > +	timeout = USEC_PER_SEC;
> > +	while (READ_ONCE(acpi_mp_wake_mailbox->command) && timeout--)
> > +		udelay(1);
> > +}
> > +
> > +static void acpi_mp_stop_other_cpus(int wait)
> > +{
> > +	smp_shutdown_nonboot_cpus(smp_processor_id());
> > +}
> > +
> > +static int crash_nmi_callback(unsigned int val, struct pt_regs *regs)
> > +{
> > +	local_irq_disable();
> > +
> > +	crash_save_cpu(regs, raw_smp_processor_id());
> > +
> > +	cpu_emergency_stop_pt();
> > +
> > +	disable_local_APIC();
> > +
> > +	/*
> > +	 * Prepare the CPU for reboot _after_ invoking the callback so that the
> > +	 * callback can safely use virtualization instructions, e.g. VMCLEAR.
> > +	 */
> > +	cpu_emergency_disable_virtualization();
> > +
> > +	atomic_dec(&waiting_for_crash_ipi);
> > +
> > +	asm_acpi_mp_play_dead(acpi_mp_reset_vector_paddr,
> > +			      acpi_mp_pgd);
> > +
> > +	return NMI_HANDLED;
> > +}
> > +
> > +static void acpi_mp_crash_stop_other_cpus(void)
> > +{
> > +	unsigned long timeout;
> > +
> > +	/* The kernel is broken so disable interrupts */
> > +	local_irq_disable();
> > +
> > +
> > +	atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1);
> > +
> > +	/* Would it be better to replace the trap vector here? */
> > +	if (register_nmi_handler(NMI_LOCAL, crash_nmi_callback,
> > +				 NMI_FLAG_FIRST, "crash"))
> > +		return;		/* Return what? */
> > +
> > +	apic_send_IPI_allbutself(NMI_VECTOR);
> > +
> > +	/* Don't wait longer than a second. */
> > +	timeout = USEC_PER_SEC;
> > +	while (atomic_read(&waiting_for_crash_ipi) && timeout--)
> > +		udelay(1);
> > +}
> > +
> > 
> 
> [...]
> 
> > +	smp_ops.stop_other_cpus = acpi_mp_stop_other_cpus;
> > +	smp_ops.crash_stop_other_cpus = acpi_mp_crash_stop_other_cpus;
> > +
> > 
> 
> The above acpi_mp_crash_stop_other_cpus() and crash_nmi_callback() etc are kinda
> duplicated code with the normal crash kexec() in reboot.c.
> 
> I am not expert here but spent some time looking into the code, and it appears
> the main reason preventing us from reusing that code should be TDX guest doesn't
> play nicely with mwait/halt stuff when putting the cpu to offline.  
> 
> I am thinking if we skip/replace them with the asm_acpi_mp_play_dead(), we
> should be able to just reuse the existing smp_ops.stop_other_cpus() and
> smp_ops.crash_stop_other_cpus()?

Okay, fair enough. See fixup below.

> > +
> >  int __init acpi_parse_mp_wake(union acpi_subtable_headers *header,
> >  			      const unsigned long end)
> >  {
> >  	struct acpi_madt_multiproc_wakeup *mp_wake;
> >  
> >  	mp_wake = (struct acpi_madt_multiproc_wakeup *)header;
> > -	if (BAD_MADT_ENTRY(mp_wake, end))
> > +	if (!mp_wake)
> > +		return -EINVAL;
> 
> I think you can keep the BAD_MADT_ENTRY() check as a standard check, and ...

No. BAD_MADT_ENTRY() will fail if the struct version is V0 because the
size will be smaller than sizeof(struct acpi_madt_multiproc_wakeup).


diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 4fab2ed454f3..3c8efba86d5c 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -38,6 +38,7 @@ struct smp_ops {
 	int (*cpu_disable)(void);
 	void (*cpu_die)(unsigned int cpu);
 	void (*play_dead)(void);
+	void (*crash_play_dead)(void);
 
 	void (*send_call_func_ipi)(const struct cpumask *mask);
 	void (*send_call_func_single_ipi)(int cpu);
diff --git a/arch/x86/kernel/acpi/madt_wakeup.c b/arch/x86/kernel/acpi/madt_wakeup.c
index 782fe8fd533c..a801f773f9f1 100644
--- a/arch/x86/kernel/acpi/madt_wakeup.c
+++ b/arch/x86/kernel/acpi/madt_wakeup.c
@@ -25,6 +25,12 @@ static u64 acpi_mp_reset_vector_paddr __ro_after_init;
 
 void asm_acpi_mp_play_dead(u64 reset_vector, u64 pgd_pa);
 
+static void crash_acpi_mp_play_dead(void)
+{
+	asm_acpi_mp_play_dead(acpi_mp_reset_vector_paddr,
+			      acpi_mp_pgd);
+}
+
 static void acpi_mp_play_dead(void)
 {
 	play_dead_common();
@@ -58,57 +64,6 @@ static void acpi_mp_stop_other_cpus(int wait)
 	smp_shutdown_nonboot_cpus(smp_processor_id());
 }
 
-#ifdef CONFIG_KEXEC_CORE
-static atomic_t waiting_for_crash_ipi;
-
-static int crash_nmi_callback(unsigned int val, struct pt_regs *regs)
-{
-	local_irq_disable();
-
-	crash_save_cpu(regs, raw_smp_processor_id());
-
-	cpu_emergency_stop_pt();
-
-	disable_local_APIC();
-
-	/*
-	 * Prepare the CPU for reboot _after_ invoking the callback so that the
-	 * callback can safely use virtualization instructions, e.g. VMCLEAR.
-	 */
-	cpu_emergency_disable_virtualization();
-
-	atomic_dec(&waiting_for_crash_ipi);
-
-	asm_acpi_mp_play_dead(acpi_mp_reset_vector_paddr,
-			      acpi_mp_pgd);
-
-	return NMI_HANDLED;
-}
-
-static void acpi_mp_crash_stop_other_cpus(void)
-{
-	unsigned long timeout;
-
-	/* The kernel is broken so disable interrupts */
-	local_irq_disable();
-
-
-	atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1);
-
-	/* Would it be better to replace the trap vector here? */
-	if (register_nmi_handler(NMI_LOCAL, crash_nmi_callback,
-				 NMI_FLAG_FIRST, "crash"))
-		return;		/* Return what? */
-
-	apic_send_IPI_allbutself(NMI_VECTOR);
-
-	/* Don't wait longer than a second. */
-	timeout = USEC_PER_SEC;
-	while (atomic_read(&waiting_for_crash_ipi) && timeout--)
-		udelay(1);
-}
-#endif
-
 /* The argument is required to match type of x86_mapping_info::alloc_pgt_page */
 static void __init *alloc_pgt_page(void *dummy)
 {
@@ -277,11 +232,9 @@ static int __init acpi_mp_setup_reset(u64 reset_vector)
 	}
 
 	smp_ops.play_dead = acpi_mp_play_dead;
+	smp_ops.crash_play_dead = crash_acpi_mp_play_dead;
 	smp_ops.cpu_die = acpi_mp_cpu_die;
 	smp_ops.stop_other_cpus = acpi_mp_stop_other_cpus;
-#ifdef CONFIG_KEXEC_CORE
-	smp_ops.crash_stop_other_cpus = acpi_mp_crash_stop_other_cpus;
-#endif
 
 	acpi_mp_reset_vector_paddr = reset_vector;
 	acpi_mp_pgd = __pa(pgd);
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index c81afffaa954..99e6ab552da0 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -878,10 +878,14 @@ static int crash_nmi_callback(unsigned int val, struct pt_regs *regs)
 	cpu_emergency_disable_virtualization();
 
 	atomic_dec(&waiting_for_crash_ipi);
-	/* Assume hlt works */
-	halt();
-	for (;;)
-		cpu_relax();
+
+	if (smp_ops.crash_play_dead) {
+		smp_ops.crash_play_dead();
+	} else {
+		halt();
+		for (;;)
+			cpu_relax();
+	}
 
 	return NMI_HANDLED;
 }
-- 
  Kiryl Shutsemau / Kirill A. Shutemov



More information about the kexec mailing list