[PATCHv3 14/14] x86/acpi: Add support for CPU offlining for ACPI MADT wakeup method
Huang, Kai
kai.huang at intel.com
Thu Nov 23 01:38:13 PST 2023
> diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
> index 171d86fe71ef..602b5d3982ff 100644
> --- a/arch/x86/kernel/acpi/boot.c
> +++ b/arch/x86/kernel/acpi/boot.c
> @@ -22,6 +22,7 @@
> #include <linux/efi-bgrt.h>
> #include <linux/serial_core.h>
> #include <linux/pgtable.h>
> +#include <linux/sched/hotplug.h>
>
> #include <asm/e820/api.h>
> #include <asm/irqdomain.h>
> @@ -33,6 +34,7 @@
> #include <asm/smp.h>
> #include <asm/i8259.h>
> #include <asm/setup.h>
> +#include <asm/init.h>
I believe the above two includes are leftovers?
[...]
> +
> +static atomic_t waiting_for_crash_ipi;
> +
> +void asm_acpi_mp_play_dead(u64 reset_vector, u64 pgd_pa);
> +
> +static void acpi_mp_play_dead(void)
> +{
> + play_dead_common();
> + asm_acpi_mp_play_dead(acpi_mp_reset_vector_paddr,
> + acpi_mp_pgd);
> +}
> +
> +static void acpi_mp_cpu_die(unsigned int cpu)
> +{
> + u32 apicid = per_cpu(x86_cpu_to_apicid, cpu);
> + unsigned long timeout;
> +
> + /*
> + * Use TEST mailbox command to prove that BIOS got control over
> + * the CPU before declaring it dead.
> + *
> + * BIOS has to clear 'command' field of the mailbox.
> + */
> + acpi_mp_wake_mailbox->apic_id = apicid;
> + smp_store_release(&acpi_mp_wake_mailbox->command,
> + ACPI_MP_WAKE_COMMAND_TEST);
> +
> + /* Don't wait longer than a second. */
> + timeout = USEC_PER_SEC;
> + while (READ_ONCE(acpi_mp_wake_mailbox->command) && timeout--)
> + udelay(1);
> +}
> +
> +static void acpi_mp_stop_other_cpus(int wait)
> +{
> + smp_shutdown_nonboot_cpus(smp_processor_id());
> +}
> +
> +static int crash_nmi_callback(unsigned int val, struct pt_regs *regs)
> +{
> + local_irq_disable();
> +
> + crash_save_cpu(regs, raw_smp_processor_id());
> +
> + cpu_emergency_stop_pt();
> +
> + disable_local_APIC();
> +
> + /*
> + * Prepare the CPU for reboot _after_ invoking the callback so that the
> + * callback can safely use virtualization instructions, e.g. VMCLEAR.
> + */
> + cpu_emergency_disable_virtualization();
> +
> + atomic_dec(&waiting_for_crash_ipi);
> +
> + asm_acpi_mp_play_dead(acpi_mp_reset_vector_paddr,
> + acpi_mp_pgd);
> +
> + return NMI_HANDLED;
> +}
> +
> +static void acpi_mp_crash_stop_other_cpus(void)
> +{
> + unsigned long timeout;
> +
> + /* The kernel is broken so disable interrupts */
> + local_irq_disable();
> +
> +
> + atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1);
> +
> + /* Would it be better to replace the trap vector here? */
> + if (register_nmi_handler(NMI_LOCAL, crash_nmi_callback,
> + NMI_FLAG_FIRST, "crash"))
> + return; /* Return what? */
> +
> + apic_send_IPI_allbutself(NMI_VECTOR);
> +
> + /* Don't wait longer than a second. */
> + timeout = USEC_PER_SEC;
> + while (atomic_read(&waiting_for_crash_ipi) && timeout--)
> + udelay(1);
> +}
> +
>
[...]
> + smp_ops.stop_other_cpus = acpi_mp_stop_other_cpus;
> + smp_ops.crash_stop_other_cpus = acpi_mp_crash_stop_other_cpus;
> +
>
The above acpi_mp_crash_stop_other_cpus() and crash_nmi_callback() etc. are largely
duplicated code with the normal crash kexec path in reboot.c.
I am no expert here, but I spent some time looking into the code, and it appears
the main reason preventing us from reusing that code is that a TDX guest doesn't
play nicely with the mwait/halt mechanisms used when taking a CPU offline.
I am thinking that if we skip/replace those with asm_acpi_mp_play_dead(), we
should be able to just reuse the existing smp_ops.stop_other_cpus() and
smp_ops.crash_stop_other_cpus()?
Idea only:
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index b6f4e8399fca..9aee6f29a21c 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -841,7 +841,10 @@ void __noreturn stop_this_cpu(void *dummy)
* (stack usage and variables) after possibly issuing the
* native_wbinvd() above.
*/
- native_halt();
+ if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST))
+ asm_acpi_mp_play_dead();
+ else
+ native_halt();
}
}
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 830425e6d38e..8358b292bd42 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -868,8 +868,13 @@ static int crash_nmi_callback(unsigned int val, struct
pt_regs *regs)
cpu_emergency_disable_virtualization();
atomic_dec(&waiting_for_crash_ipi);
- /* Assume hlt works */
- halt();
+
+ if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST))
+ asm_acpi_mp_play_dead();
+ else
+ /* Assume hlt works */
+ halt();
+
for (;;)
cpu_relax();
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 96a771f9f930..f86cb10602aa 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -159,7 +159,7 @@ static void native_stop_other_cpus(int wait)
return;
/* For kexec, ensure that offline CPUs are out of MWAIT and in HLT */
- if (kexec_in_progress)
+ if (kexec_in_progress && !cpu_feature_enabled(X86_FEATURE_TDX_GUEST))
smp_kick_mwait_play_dead();
/*
[...]
> +
> int __init acpi_parse_mp_wake(union acpi_subtable_headers *header,
> const unsigned long end)
> {
> struct acpi_madt_multiproc_wakeup *mp_wake;
>
> mp_wake = (struct acpi_madt_multiproc_wakeup *)header;
> - if (BAD_MADT_ENTRY(mp_wake, end))
> + if (!mp_wake)
> + return -EINVAL;
I think you can keep the BAD_MADT_ENTRY() check as a standard check, and ...
> +
> + if (end - (unsigned long)mp_wake < ACPI_MADT_MP_WAKEUP_SIZE_V0)
> + return -EINVAL;
> + if (mp_wake->header.length < ACPI_MADT_MP_WAKEUP_SIZE_V0)
> return -EINVAL;
... these can be additional sanity checks.
More information about the kexec
mailing list