[PATCHv6 16/16] x86/acpi: Add support for CPU offlining for ACPI MADT wakeup method

Kuppuswamy Sathyanarayanan sathyanarayanan.kuppuswamy at linux.intel.com
Fri Jan 26 12:03:14 PST 2024


On 1/24/24 4:55 AM, Kirill A. Shutemov wrote:
> MADT Multiprocessor Wakeup structure version 1 brings support of CPU
> offlining: BIOS provides a reset vector where the CPU has to jump to
> for offlining itself. The new TEST mailbox command can be used to test
> whether the CPU offlined itself which means the BIOS has control over
> the CPU and can online it again via the ACPI MADT wakeup method.
>
> Add CPU offling support for the ACPI MADT wakeup method by implementing
> custom cpu_die(), play_dead() and stop_this_cpu() SMP operations.
>
> CPU offlining makes is possible to hand over secondary CPUs over kexec,
> not limiting the second kernel to a single CPU.
>
> The change conforms to the approved ACPI spec change proposal. See the
> Link.
>
> Signed-off-by: Kirill A. Shutemov <kirill.shutemov at linux.intel.com>
> Link: https://lore.kernel.org/all/13356251.uLZWGnKmhe@kreacher
> ---

Reviewed-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy at linux.intel.com>

>  arch/x86/include/asm/acpi.h          |   2 +
>  arch/x86/kernel/acpi/Makefile        |   2 +-
>  arch/x86/kernel/acpi/madt_playdead.S |  28 ++++
>  arch/x86/kernel/acpi/madt_wakeup.c   | 184 ++++++++++++++++++++++++++-
>  include/acpi/actbl2.h                |  15 ++-
>  5 files changed, 227 insertions(+), 4 deletions(-)
>  create mode 100644 arch/x86/kernel/acpi/madt_playdead.S
>
> diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
> index 2625b915ae7f..021cafa214c2 100644
> --- a/arch/x86/include/asm/acpi.h
> +++ b/arch/x86/include/asm/acpi.h
> @@ -81,6 +81,8 @@ union acpi_subtable_headers;
>  int __init acpi_parse_mp_wake(union acpi_subtable_headers *header,
>  			      const unsigned long end);
>  
> +void asm_acpi_mp_play_dead(u64 reset_vector, u64 pgd_pa);
> +
>  /*
>   * Check if the CPU can handle C2 and deeper
>   */
> diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile
> index 8c7329c88a75..37b1f28846de 100644
> --- a/arch/x86/kernel/acpi/Makefile
> +++ b/arch/x86/kernel/acpi/Makefile
> @@ -4,7 +4,7 @@ obj-$(CONFIG_ACPI)			+= boot.o
>  obj-$(CONFIG_ACPI_SLEEP)		+= sleep.o wakeup_$(BITS).o
>  obj-$(CONFIG_ACPI_APEI)			+= apei.o
>  obj-$(CONFIG_ACPI_CPPC_LIB)		+= cppc.o
> -obj-$(CONFIG_X86_ACPI_MADT_WAKEUP)	+= madt_wakeup.o
> +obj-$(CONFIG_X86_ACPI_MADT_WAKEUP)	+= madt_wakeup.o madt_playdead.o
>  
>  ifneq ($(CONFIG_ACPI_PROCESSOR),)
>  obj-y					+= cstate.o
> diff --git a/arch/x86/kernel/acpi/madt_playdead.S b/arch/x86/kernel/acpi/madt_playdead.S
> new file mode 100644
> index 000000000000..4e498d28cdc8
> --- /dev/null
> +++ b/arch/x86/kernel/acpi/madt_playdead.S
> @@ -0,0 +1,28 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#include <linux/linkage.h>
> +#include <asm/nospec-branch.h>
> +#include <asm/page_types.h>
> +#include <asm/processor-flags.h>
> +
> +	.text
> +	.align PAGE_SIZE
> +
> +/*
> + * asm_acpi_mp_play_dead() - Hand over control of the CPU to the BIOS
> + *
> + * rdi: Address of the ACPI MADT MPWK ResetVector
> + * rsi: PGD of the identity mapping
> + */
> +SYM_FUNC_START(asm_acpi_mp_play_dead)
> +	/* Turn off global entries. Following CR3 write will flush them. */
> +	movq	%cr4, %rdx
> +	andq	$~(X86_CR4_PGE), %rdx
> +	movq	%rdx, %cr4
> +
> +	/* Switch to identity mapping */
> +	movq	%rsi, %cr3
> +
> +	/* Jump to reset vector */
> +	ANNOTATE_RETPOLINE_SAFE
> +	jmp	*%rdi
> +SYM_FUNC_END(asm_acpi_mp_play_dead)
> diff --git a/arch/x86/kernel/acpi/madt_wakeup.c b/arch/x86/kernel/acpi/madt_wakeup.c
> index 30820f9de5af..9e984e2191ba 100644
> --- a/arch/x86/kernel/acpi/madt_wakeup.c
> +++ b/arch/x86/kernel/acpi/madt_wakeup.c
> @@ -1,10 +1,19 @@
>  // SPDX-License-Identifier: GPL-2.0-or-later
>  #include <linux/acpi.h>
>  #include <linux/cpu.h>
> +#include <linux/delay.h>
>  #include <linux/io.h>
> +#include <linux/kexec.h>
> +#include <linux/memblock.h>
> +#include <linux/pgtable.h>
> +#include <linux/sched/hotplug.h>
>  #include <asm/apic.h>
>  #include <asm/barrier.h>
> +#include <asm/init.h>
> +#include <asm/intel_pt.h>
> +#include <asm/nmi.h>
>  #include <asm/processor.h>
> +#include <asm/reboot.h>
>  
>  /* Physical address of the Multiprocessor Wakeup Structure mailbox */
>  static u64 acpi_mp_wake_mailbox_paddr __ro_after_init;
> @@ -12,6 +21,154 @@ static u64 acpi_mp_wake_mailbox_paddr __ro_after_init;
>  /* Virtual address of the Multiprocessor Wakeup Structure mailbox */
>  static struct acpi_madt_multiproc_wakeup_mailbox *acpi_mp_wake_mailbox __ro_after_init;
>  
> +static u64 acpi_mp_pgd __ro_after_init;
> +static u64 acpi_mp_reset_vector_paddr __ro_after_init;
> +
> +static void acpi_mp_stop_this_cpu(void)
> +{
> +	asm_acpi_mp_play_dead(acpi_mp_reset_vector_paddr, acpi_mp_pgd);
> +}
> +
> +static void acpi_mp_play_dead(void)
> +{
> +	play_dead_common();
> +	asm_acpi_mp_play_dead(acpi_mp_reset_vector_paddr, acpi_mp_pgd);
> +}
> +
> +static void acpi_mp_cpu_die(unsigned int cpu)
> +{
> +	u32 apicid = per_cpu(x86_cpu_to_apicid, cpu);
> +	unsigned long timeout;
> +
> +	/*
> +	 * Use TEST mailbox command to prove that BIOS got control over
> +	 * the CPU before declaring it dead.
> +	 *
> +	 * BIOS has to clear 'command' field of the mailbox.
> +	 */
> +	acpi_mp_wake_mailbox->apic_id = apicid;
> +	smp_store_release(&acpi_mp_wake_mailbox->command,
> +			  ACPI_MP_WAKE_COMMAND_TEST);
> +
> +	/* Don't wait longer than a second. */
> +	timeout = USEC_PER_SEC;
> +	while (READ_ONCE(acpi_mp_wake_mailbox->command) && --timeout)
> +		udelay(1);
> +
> +	if (!timeout)
Nit: IMO, since you are dumping failure error message (not timeout
message), you can use non zero acpi_mp_wake_mailbox->command
check. But it is up to you.
> +		pr_err("Failed to hand over CPU %d to BIOS\n", cpu);
> +}
> +
> +/* The argument is required to match type of x86_mapping_info::alloc_pgt_page */
> +static void __init *alloc_pgt_page(void *dummy)
> +{
> +	return memblock_alloc(PAGE_SIZE, PAGE_SIZE);
> +}
> +
> +static void __init free_pgt_page(void *pgt, void *dummy)
> +{
> +	return memblock_free(pgt, PAGE_SIZE);
> +}
> +
> +/*
> + * Make sure asm_acpi_mp_play_dead() is present in the identity mapping at
> + * the same place as in the kernel page tables. asm_acpi_mp_play_dead() switches
> + * to the identity mapping and the function has be present at the same spot in
> + * the virtual address space before and after switching page tables.
> + */
> +static int __init init_transition_pgtable(pgd_t *pgd)
> +{
> +	pgprot_t prot = PAGE_KERNEL_EXEC_NOENC;
> +	unsigned long vaddr, paddr;
> +	p4d_t *p4d;
> +	pud_t *pud;
> +	pmd_t *pmd;
> +	pte_t *pte;
> +
> +	vaddr = (unsigned long)asm_acpi_mp_play_dead;
> +	pgd += pgd_index(vaddr);
> +	if (!pgd_present(*pgd)) {
> +		p4d = (p4d_t *)alloc_pgt_page(NULL);
> +		if (!p4d)
> +			return -ENOMEM;
> +		set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE));
> +	}
> +	p4d = p4d_offset(pgd, vaddr);
> +	if (!p4d_present(*p4d)) {
> +		pud = (pud_t *)alloc_pgt_page(NULL);
> +		if (!pud)
> +			return -ENOMEM;
> +		set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE));
> +	}
> +	pud = pud_offset(p4d, vaddr);
> +	if (!pud_present(*pud)) {
> +		pmd = (pmd_t *)alloc_pgt_page(NULL);
> +		if (!pmd)
> +			return -ENOMEM;
> +		set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
> +	}
> +	pmd = pmd_offset(pud, vaddr);
> +	if (!pmd_present(*pmd)) {
> +		pte = (pte_t *)alloc_pgt_page(NULL);
> +		if (!pte)
> +			return -ENOMEM;
> +		set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
> +	}
> +	pte = pte_offset_kernel(pmd, vaddr);
> +
> +	paddr = __pa(vaddr);
> +	set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, prot));
> +
> +	return 0;
> +}
> +
> +static int __init acpi_mp_setup_reset(u64 reset_vector)
> +{
> +	pgd_t *pgd;
> +	struct x86_mapping_info info = {
> +		.alloc_pgt_page = alloc_pgt_page,
> +		.free_pgt_page	= free_pgt_page,
> +		.page_flag      = __PAGE_KERNEL_LARGE_EXEC,
> +		.kernpg_flag    = _KERNPG_TABLE_NOENC,
> +	};
> +
> +	pgd = alloc_pgt_page(NULL);
> +	if (!pgd)
> +		return -ENOMEM;
> +
> +	for (int i = 0; i < nr_pfn_mapped; i++) {
> +		unsigned long mstart, mend;
> +
> +		mstart = pfn_mapped[i].start << PAGE_SHIFT;
> +		mend   = pfn_mapped[i].end << PAGE_SHIFT;
> +		if (kernel_ident_mapping_init(&info, pgd, mstart, mend)) {
> +			kernel_ident_mapping_free(&info, pgd);
> +			return -ENOMEM;
> +		}
> +	}
> +
> +	if (kernel_ident_mapping_init(&info, pgd,
> +				      PAGE_ALIGN_DOWN(reset_vector),
> +				      PAGE_ALIGN(reset_vector + 1))) {
> +		kernel_ident_mapping_free(&info, pgd);
> +		return -ENOMEM;
> +	}
> +
> +	if (init_transition_pgtable(pgd)) {
> +		kernel_ident_mapping_free(&info, pgd);
> +		return -ENOMEM;
> +	}
> +
> +	smp_ops.play_dead = acpi_mp_play_dead;
> +	smp_ops.stop_this_cpu = acpi_mp_stop_this_cpu;
> +	smp_ops.cpu_die = acpi_mp_cpu_die;
> +
> +	acpi_mp_reset_vector_paddr = reset_vector;
> +	acpi_mp_pgd = __pa(pgd);
> +
> +	return 0;
> +}
> +
>  static int acpi_wakeup_cpu(u32 apicid, unsigned long start_ip)
>  {
>  	if (!acpi_mp_wake_mailbox_paddr) {
> @@ -97,14 +254,37 @@ int __init acpi_parse_mp_wake(union acpi_subtable_headers *header,
>  	struct acpi_madt_multiproc_wakeup *mp_wake;
>  
>  	mp_wake = (struct acpi_madt_multiproc_wakeup *)header;
> -	if (BAD_MADT_ENTRY(mp_wake, end))
> +
> +	/*
> +	 * Cannot use the standard BAD_MADT_ENTRY() to sanity check the @mp_wake
> +	 * entry.  'sizeof (struct acpi_madt_multiproc_wakeup)' can be larger
> +	 * than the actual size of the MP wakeup entry in ACPI table because the
> +	 * 'reset_vector' is only available in the V1 MP wakeup structure.
> +	 */
> +	if (!mp_wake)
> +		return -EINVAL;
> +	if (end - (unsigned long)mp_wake < ACPI_MADT_MP_WAKEUP_SIZE_V0)
> +		return -EINVAL;
> +	if (mp_wake->header.length < ACPI_MADT_MP_WAKEUP_SIZE_V0)
>  		return -EINVAL;
>  
>  	acpi_table_print_madt_entry(&header->common);
>  
>  	acpi_mp_wake_mailbox_paddr = mp_wake->mailbox_address;
>  
> -	acpi_mp_disable_offlining(mp_wake);
> +	if (mp_wake->version >= ACPI_MADT_MP_WAKEUP_VERSION_V1 &&
> +	    mp_wake->header.length >= ACPI_MADT_MP_WAKEUP_SIZE_V1) {
> +		if (acpi_mp_setup_reset(mp_wake->reset_vector)) {
> +			pr_warn("Failed to setup MADT reset vector\n");
> +			acpi_mp_disable_offlining(mp_wake);
> +		}
> +	} else {
> +		/*
> +		 * CPU offlining requires version 1 of the ACPI MADT wakeup
> +		 * structure.
> +		 */
> +		acpi_mp_disable_offlining(mp_wake);
> +	}
>  
>  	apic_update_callback(wakeup_secondary_cpu_64, acpi_wakeup_cpu);
>  
> diff --git a/include/acpi/actbl2.h b/include/acpi/actbl2.h
> index e1a395af7591..2aedda70ef88 100644
> --- a/include/acpi/actbl2.h
> +++ b/include/acpi/actbl2.h
> @@ -1120,8 +1120,20 @@ struct acpi_madt_multiproc_wakeup {
>  	u16 version;
>  	u32 reserved;		/* reserved - must be zero */
>  	u64 mailbox_address;
> +	u64 reset_vector;
>  };
>  
> +/* Values for Version field above */
> +
> +enum acpi_madt_multiproc_wakeup_version {
> +	ACPI_MADT_MP_WAKEUP_VERSION_NONE = 0,
> +	ACPI_MADT_MP_WAKEUP_VERSION_V1 = 1,
> +	ACPI_MADT_MP_WAKEUP_VERSION_RESERVED = 2, /* 2 and greater are reserved */
> +};
> +
> +#define ACPI_MADT_MP_WAKEUP_SIZE_V0	16
> +#define ACPI_MADT_MP_WAKEUP_SIZE_V1	24
> +
>  #define ACPI_MULTIPROC_WAKEUP_MB_OS_SIZE        2032
>  #define ACPI_MULTIPROC_WAKEUP_MB_FIRMWARE_SIZE  2048
>  
> @@ -1134,7 +1146,8 @@ struct acpi_madt_multiproc_wakeup_mailbox {
>  	u8 reserved_firmware[ACPI_MULTIPROC_WAKEUP_MB_FIRMWARE_SIZE];	/* reserved for firmware use */
>  };
>  
> -#define ACPI_MP_WAKE_COMMAND_WAKEUP    1
> +#define ACPI_MP_WAKE_COMMAND_WAKEUP	1
> +#define ACPI_MP_WAKE_COMMAND_TEST	2
>  
>  /* 17: CPU Core Interrupt Controller (ACPI 6.5) */
>  

-- 
Sathyanarayanan Kuppuswamy
Linux Kernel Developer




More information about the kexec mailing list