[PATCH v17 05/10] x86: kdump: move reserve_crashkernel[_low]() into crash_core.c
Baoquan He
bhe at redhat.com
Tue Dec 14 02:45:58 PST 2021
On 12/10/21 at 02:55pm, Zhen Lei wrote:
> From: Chen Zhou <chenzhou10 at huawei.com>
>
> Make the functions reserve_crashkernel[_low]() as generic. Since
> reserve_crashkernel[_low]() implementations are quite similar on other
> architectures as well, we can have more users of this later.
>
> So have CONFIG_ARCH_WANT_RESERVE_CRASH_KERNEL in arch/Kconfig and
> select this by X86.
>
> Signed-off-by: Chen Zhou <chenzhou10 at huawei.com>
> Signed-off-by: Zhen Lei <thunder.leizhen at huawei.com>
> Tested-by: John Donnelly <John.p.donnelly at oracle.com>
> Tested-by: Dave Kleikamp <dave.kleikamp at oracle.com>
> ---
> arch/Kconfig | 3 +
> arch/x86/Kconfig | 2 +
> arch/x86/include/asm/elf.h | 3 +
> arch/x86/include/asm/kexec.h | 28 ++++++-
> arch/x86/kernel/setup.c | 143 +-------------------------------
> include/linux/crash_core.h | 3 +
> include/linux/kexec.h | 2 -
> kernel/crash_core.c | 156 +++++++++++++++++++++++++++++++++++
> kernel/kexec_core.c | 17 ----
> 9 files changed, 194 insertions(+), 163 deletions(-)
>
> diff --git a/arch/Kconfig b/arch/Kconfig
> index d3c4ab249e9c275..7bdb32c41985dc5 100644
> --- a/arch/Kconfig
> +++ b/arch/Kconfig
> @@ -24,6 +24,9 @@ config KEXEC_ELF
> config HAVE_IMA_KEXEC
> bool
>
> +config ARCH_WANT_RESERVE_CRASH_KERNEL
> + bool
> +
> config SET_FS
> bool
>
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index 5c2ccb85f2efb86..bd78ed8193079b9 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -12,6 +12,7 @@ config X86_32
> depends on !64BIT
> # Options that are inherently 32-bit kernel only:
> select ARCH_WANT_IPC_PARSE_VERSION
> + select ARCH_WANT_RESERVE_CRASH_KERNEL if KEXEC_CORE
> select CLKSRC_I8253
> select CLONE_BACKWARDS
> select GENERIC_VDSO_32
> @@ -28,6 +29,7 @@ config X86_64
> select ARCH_HAS_GIGANTIC_PAGE
> select ARCH_SUPPORTS_INT128 if CC_HAS_INT128
> select ARCH_USE_CMPXCHG_LOCKREF
> + select ARCH_WANT_RESERVE_CRASH_KERNEL if KEXEC_CORE
> select HAVE_ARCH_SOFT_DIRTY
> select MODULES_USE_ELF_RELA
> select NEED_DMA_MAP_STATE
> diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
> index 29fea180a6658e8..7a6c36cff8331f5 100644
> --- a/arch/x86/include/asm/elf.h
> +++ b/arch/x86/include/asm/elf.h
> @@ -94,6 +94,9 @@ extern unsigned int vdso32_enabled;
>
> #define elf_check_arch(x) elf_check_arch_ia32(x)
>
> +/* We can also handle crash dumps from 64 bit kernel. */
> +# define vmcore_elf_check_arch_cross(x) ((x)->e_machine == EM_X86_64)
> +
> /* SVR4/i386 ABI (pages 3-31, 3-32) says that when the program starts %edx
> contains a pointer to a function which might be registered using `atexit'.
> This provides a mean for the dynamic linker to call DT_FINI functions for
> diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
> index 3a22e65262aa70b..3ff38a1353a2b86 100644
> --- a/arch/x86/include/asm/kexec.h
> +++ b/arch/x86/include/asm/kexec.h
> @@ -21,6 +21,27 @@
> /* 16M alignment for crash kernel regions */
> #define CRASH_ALIGN SZ_16M
>
> +/*
> + * Keep the crash kernel below this limit.
> + *
> + * Earlier 32-bits kernels would limit the kernel to the low 512 MB range
> + * due to mapping restrictions.
> + *
> + * 64-bit kdump kernels need to be restricted to be under 64 TB, which is
> + * the upper limit of system RAM in 4-level paging mode. Since the kdump
> + * jump could be from 5-level paging to 4-level paging, the jump will fail if
> + * the kernel is put above 64 TB, and during the 1st kernel bootup there's
> + * no good way to detect the paging mode of the target kernel which will be
> + * loaded for dumping.
> + */
> +#ifdef CONFIG_X86_32
> +# define CRASH_ADDR_LOW_MAX SZ_512M
> +# define CRASH_ADDR_HIGH_MAX SZ_512M
> +#else
> +# define CRASH_ADDR_LOW_MAX SZ_4G
> +# define CRASH_ADDR_HIGH_MAX SZ_64T
> +#endif
> +
> #ifndef __ASSEMBLY__
>
> #include <linux/string.h>
> @@ -51,9 +72,6 @@ struct kimage;
>
> /* The native architecture */
> # define KEXEC_ARCH KEXEC_ARCH_386
> -
> -/* We can also handle crash dumps from 64 bit kernel. */
> -# define vmcore_elf_check_arch_cross(x) ((x)->e_machine == EM_X86_64)
> #else
> /* Maximum physical address we can use pages from */
> # define KEXEC_SOURCE_MEMORY_LIMIT (MAXMEM-1)
> @@ -195,6 +213,10 @@ typedef void crash_vmclear_fn(void);
> extern crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss;
> extern void kdump_nmi_shootdown_cpus(void);
>
> +#ifdef CONFIG_KEXEC_CORE
> +extern void __init reserve_crashkernel(void);
> +#endif
> +
> #endif /* __ASSEMBLY__ */
>
> #endif /* _ASM_X86_KEXEC_H */
> diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
> index 7ae00716a208f82..5519baa7f4b964e 100644
> --- a/arch/x86/kernel/setup.c
> +++ b/arch/x86/kernel/setup.c
> @@ -39,6 +39,7 @@
> #include <asm/io_apic.h>
> #include <asm/kasan.h>
> #include <asm/kaslr.h>
> +#include <asm/kexec.h>
> #include <asm/mce.h>
> #include <asm/mtrr.h>
> #include <asm/realmode.h>
> @@ -386,147 +387,7 @@ static void __init memblock_x86_reserve_range_setup_data(void)
> }
> }
>
> -/*
> - * --------- Crashkernel reservation ------------------------------
> - */
> -
> -#ifdef CONFIG_KEXEC_CORE
> -
> -/*
> - * Keep the crash kernel below this limit.
> - *
> - * Earlier 32-bits kernels would limit the kernel to the low 512 MB range
> - * due to mapping restrictions.
> - *
> - * 64-bit kdump kernels need to be restricted to be under 64 TB, which is
> - * the upper limit of system RAM in 4-level paging mode. Since the kdump
> - * jump could be from 5-level paging to 4-level paging, the jump will fail if
> - * the kernel is put above 64 TB, and during the 1st kernel bootup there's
> - * no good way to detect the paging mode of the target kernel which will be
> - * loaded for dumping.
> - */
> -#ifdef CONFIG_X86_32
> -# define CRASH_ADDR_LOW_MAX SZ_512M
> -# define CRASH_ADDR_HIGH_MAX SZ_512M
> -#else
> -# define CRASH_ADDR_LOW_MAX SZ_4G
> -# define CRASH_ADDR_HIGH_MAX SZ_64T
> -#endif
> -
> -static int __init reserve_crashkernel_low(void)
> -{
> -#ifdef CONFIG_X86_64
> - unsigned long long base, low_base = 0, low_size = 0;
> - unsigned long low_mem_limit;
> - int ret;
> -
> - low_mem_limit = min(memblock_phys_mem_size(), CRASH_ADDR_LOW_MAX);
> -
> - /* crashkernel=Y,low */
> - ret = parse_crashkernel_low(boot_command_line, low_mem_limit, &low_size, &base);
> - if (ret) {
> - /*
> - * two parts from kernel/dma/swiotlb.c:
> - * -swiotlb size: user-specified with swiotlb= or default.
> - *
> - * -swiotlb overflow buffer: now hardcoded to 32k. We round it
> - * to 8M for other buffers that may need to stay low too. Also
> - * make sure we allocate enough extra low memory so that we
> - * don't run out of DMA buffers for 32-bit devices.
> - */
> - low_size = max(swiotlb_size_or_default() + (8UL << 20), 256UL << 20);
> - } else {
> - /* passed with crashkernel=0,low ? */
> - if (!low_size)
> - return 0;
> - }
> -
> - low_base = memblock_phys_alloc_range(low_size, CRASH_ALIGN, CRASH_ALIGN,
> - CRASH_ADDR_LOW_MAX);
> - if (!low_base) {
> - pr_err("Cannot reserve %ldMB crashkernel low memory, please try smaller size.\n",
> - (unsigned long)(low_size >> 20));
> - return -ENOMEM;
> - }
> -
> - pr_info("Reserving %ldMB of low memory at %ldMB for crashkernel (low RAM limit: %ldMB)\n",
> - (unsigned long)(low_size >> 20),
> - (unsigned long)(low_base >> 20),
> - (unsigned long)(low_mem_limit >> 20));
> -
> - crashk_low_res.start = low_base;
> - crashk_low_res.end = low_base + low_size - 1;
> -#endif
> - return 0;
> -}
> -
> -static void __init reserve_crashkernel(void)
> -{
> - unsigned long long crash_size, crash_base, total_mem;
> - bool high = false;
> - int ret;
> -
> - total_mem = memblock_phys_mem_size();
> -
> - /* crashkernel=XM */
> - ret = parse_crashkernel(boot_command_line, total_mem, &crash_size, &crash_base);
> - if (ret != 0 || crash_size <= 0) {
> - /* crashkernel=X,high */
> - ret = parse_crashkernel_high(boot_command_line, total_mem,
> - &crash_size, &crash_base);
> - if (ret != 0 || crash_size <= 0)
> - return;
> - high = true;
> - }
> -
> - /* 0 means: find the address automatically */
> - if (!crash_base) {
> - /*
> - * Set CRASH_ADDR_LOW_MAX upper bound for crash memory,
> - * crashkernel=x,high reserves memory over CRASH_ADDR_LOW_MAX,
> - * also allocates 256M extra low memory for DMA buffers
> - * and swiotlb.
> - * But the extra memory is not required for all machines.
> - * So try low memory first and fall back to high memory
> - * unless "crashkernel=size[KMG],high" is specified.
> - */
> - if (!high)
> - crash_base = memblock_phys_alloc_range(crash_size,
> - CRASH_ALIGN, CRASH_ALIGN,
> - CRASH_ADDR_LOW_MAX);
> - if (!crash_base)
> - crash_base = memblock_phys_alloc_range(crash_size,
> - CRASH_ALIGN, CRASH_ALIGN,
> - CRASH_ADDR_HIGH_MAX);
> - if (!crash_base) {
> - pr_info("crashkernel reservation failed - No suitable area found.\n");
> - return;
> - }
> - } else {
> - unsigned long long start;
> -
> - start = memblock_phys_alloc_range(crash_size, SZ_1M, crash_base,
> - crash_base + crash_size);
> - if (start != crash_base) {
> - pr_info("crashkernel reservation failed - memory is in use.\n");
> - return;
> - }
> - }
> -
> - if (crash_base >= CRASH_ADDR_LOW_MAX && reserve_crashkernel_low()) {
> - memblock_phys_free(crash_base, crash_size);
> - return;
> - }
> -
> - pr_info("Reserving %ldMB of memory at %ldMB for crashkernel (System RAM: %ldMB)\n",
> - (unsigned long)(crash_size >> 20),
> - (unsigned long)(crash_base >> 20),
> - (unsigned long)(total_mem >> 20));
> -
> - crashk_res.start = crash_base;
> - crashk_res.end = crash_base + crash_size - 1;
> -}
> -#else
> +#ifndef CONFIG_KEXEC_CORE
> static void __init reserve_crashkernel(void)
> {
> }
> diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h
> index de62a722431e7db..f6b99da4ed08ecf 100644
> --- a/include/linux/crash_core.h
> +++ b/include/linux/crash_core.h
> @@ -73,6 +73,9 @@ extern unsigned char *vmcoreinfo_data;
> extern size_t vmcoreinfo_size;
> extern u32 *vmcoreinfo_note;
>
> +extern struct resource crashk_res;
> +extern struct resource crashk_low_res;
> +
> Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type,
> void *data, size_t data_len);
> void final_note(Elf_Word *buf);
> diff --git a/include/linux/kexec.h b/include/linux/kexec.h
> index 0c994ae37729e1e..cd744d962f6f417 100644
> --- a/include/linux/kexec.h
> +++ b/include/linux/kexec.h
> @@ -352,8 +352,6 @@ extern int kexec_load_disabled;
>
> /* Location of a reserved region to hold the crash kernel.
> */
> -extern struct resource crashk_res;
> -extern struct resource crashk_low_res;
> extern note_buf_t __percpu *crash_notes;
>
> /* flag to track if kexec reboot is in progress */
> diff --git a/kernel/crash_core.c b/kernel/crash_core.c
> index eb53f5ec62c900f..b23cfc0ca8905fd 100644
> --- a/kernel/crash_core.c
> +++ b/kernel/crash_core.c
> @@ -8,6 +8,12 @@
> #include <linux/crash_core.h>
> #include <linux/utsname.h>
> #include <linux/vmalloc.h>
> +#include <linux/memblock.h>
> +#include <linux/swiotlb.h>
> +
> +#ifdef CONFIG_KEXEC_CORE
> +#include <asm/kexec.h>
> +#endif
>
> #include <asm/page.h>
> #include <asm/sections.h>
> @@ -22,6 +28,22 @@ u32 *vmcoreinfo_note;
> /* trusted vmcoreinfo, e.g. we can make a copy in the crash memory */
> static unsigned char *vmcoreinfo_data_safecopy;
>
> +/* Location of the reserved area for the crash kernel */
> +struct resource crashk_res = {
> + .name = "Crash kernel",
> + .start = 0,
> + .end = 0,
> + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
> + .desc = IORES_DESC_CRASH_KERNEL
> +};
> +struct resource crashk_low_res = {
> + .name = "Crash kernel",
> + .start = 0,
> + .end = 0,
> + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
> + .desc = IORES_DESC_CRASH_KERNEL
> +};
> +
> /*
> * parsing the "crashkernel" commandline
> *
> @@ -295,6 +317,140 @@ int __init parse_crashkernel_low(char *cmdline,
> "crashkernel=", suffix_tbl[SUFFIX_LOW]);
> }
>
> +/*
> + * --------- Crashkernel reservation ------------------------------
> + */
> +
> +#ifdef CONFIG_ARCH_WANT_RESERVE_CRASH_KERNEL
> +static int __init reserve_crashkernel_low(void)
> +{
> +#ifdef CONFIG_64BIT
> + unsigned long long base, low_base = 0, low_size = 0;
> + unsigned long low_mem_limit;
> + int ret;
> +
> + low_mem_limit = min(memblock_phys_mem_size(), CRASH_ADDR_LOW_MAX);
> +
> + /* crashkernel=Y,low */
> + ret = parse_crashkernel_low(boot_command_line, low_mem_limit, &low_size, &base);
> + if (ret) {
> + /*
> + * two parts from kernel/dma/swiotlb.c:
> + * -swiotlb size: user-specified with swiotlb= or default.
> + *
> + * -swiotlb overflow buffer: now hardcoded to 32k. We round it
> + * to 8M for other buffers that may need to stay low too. Also
> + * make sure we allocate enough extra low memory so that we
> + * don't run out of DMA buffers for 32-bit devices.
> + */
> + low_size = max(swiotlb_size_or_default() + (8UL << 20), 256UL << 20);
> + } else {
> + /* passed with crashkernel=0,low ? */
> + if (!low_size)
> + return 0;
> + }
> +
> + low_base = memblock_phys_alloc_range(low_size, CRASH_ALIGN, CRASH_ALIGN,
> + CRASH_ADDR_LOW_MAX);
> + if (!low_base) {
> + pr_err("Cannot reserve %ldMB crashkernel low memory, please try smaller size.\n",
> + (unsigned long)(low_size >> 20));
> + return -ENOMEM;
> + }
> +
> + pr_info("Reserving %ldMB of low memory at %ldMB for crashkernel (low RAM limit: %ldMB)\n",
> + (unsigned long)(low_size >> 20),
> + (unsigned long)(low_base >> 20),
> + (unsigned long)(low_mem_limit >> 20));
> +
> + crashk_low_res.start = low_base;
> + crashk_low_res.end = low_base + low_size - 1;
> +#endif
> + return 0;
> +}
> +
> +/*
> + * reserve_crashkernel() - reserves memory for crash kernel
> + *
> + * This function reserves memory area given in "crashkernel=" kernel command
> + * line parameter. The memory reserved is used by dump capture kernel when
> + * primary kernel is crashing.
> + */
> +void __init reserve_crashkernel(void)
> +{
> + unsigned long long crash_size, crash_base, total_mem;
> + bool high = false;
> + int ret;
> +
> + total_mem = memblock_phys_mem_size();
> +
> + /* crashkernel=XM */
> + ret = parse_crashkernel(boot_command_line, total_mem, &crash_size, &crash_base);
> + if (ret != 0 || crash_size <= 0) {
> + /* crashkernel=X,high */
> + ret = parse_crashkernel_high(boot_command_line, total_mem,
> + &crash_size, &crash_base);
> + if (ret != 0 || crash_size <= 0)
> + return;
> + high = true;
> + }
> +
> + /* 0 means: find the address automatically */
> + if (!crash_base) {
> + /*
> + * Set CRASH_ADDR_LOW_MAX upper bound for crash memory,
> + * crashkernel=x,high reserves memory over CRASH_ADDR_LOW_MAX,
> + * also allocates 256M extra low memory for DMA buffers
> + * and swiotlb.
> + * But the extra memory is not required for all machines.
> + * So try low memory first and fall back to high memory
> + * unless "crashkernel=size[KMG],high" is specified.
> + */
> + if (!high)
> + crash_base = memblock_phys_alloc_range(crash_size,
> + CRASH_ALIGN, CRASH_ALIGN,
> + CRASH_ADDR_LOW_MAX);
> + if (!crash_base)
> + crash_base = memblock_phys_alloc_range(crash_size,
> + CRASH_ALIGN, CRASH_ALIGN,
> + CRASH_ADDR_HIGH_MAX);
> + if (!crash_base) {
> + pr_info("crashkernel reservation failed - No suitable area found.\n");
> + return;
> + }
> + } else {
> + /* User specifies base address explicitly. */
If you plan to repost, please take above sentence off either. Then we
can say this patch is only doing code moving.
> + unsigned long long start;
> +
OK, I can see that this patch is only moving code, and introducing
CONFIG_ARCH_WANT_RESERVE_CRASH_KERNEL to wrap them appropriately, no
extra functionality change added or removed, except of this place.
An alignment checking is added for the user specified base address.
I love this checking. While I have to say it will be more perfect if
it's put in another small patch, that will be look much better from
patch splitting and reviewing point of view.
This whole patch looks great to me, thanks for the effort.
> + if (!IS_ALIGNED(crash_base, CRASH_ALIGN)) {
> + pr_warn("cannot reserve crashkernel: base address is not %ldMB aligned\n",
> + (unsigned long)CRASH_ALIGN >> 20);
> + return;
> + }
> +
> + start = memblock_phys_alloc_range(crash_size, SZ_1M, crash_base,
> + crash_base + crash_size);
> + if (start != crash_base) {
> + pr_info("crashkernel reservation failed - memory is in use.\n");
> + return;
> + }
> + }
> +
> + if (crash_base >= CRASH_ADDR_LOW_MAX && reserve_crashkernel_low()) {
> + memblock_phys_free(crash_base, crash_size);
> + return;
> + }
> +
> + pr_info("Reserving %ldMB of memory at %ldMB for crashkernel (System RAM: %ldMB)\n",
> + (unsigned long)(crash_size >> 20),
> + (unsigned long)(crash_base >> 20),
> + (unsigned long)(total_mem >> 20));
> +
> + crashk_res.start = crash_base;
> + crashk_res.end = crash_base + crash_size - 1;
> +}
> +#endif /* CONFIG_ARCH_WANT_RESERVE_CRASH_KERNEL */
> +
> Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type,
> void *data, size_t data_len)
> {
> diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
> index 5a5d192a89ac307..1e0d4909bbb6b77 100644
> --- a/kernel/kexec_core.c
> +++ b/kernel/kexec_core.c
> @@ -54,23 +54,6 @@ note_buf_t __percpu *crash_notes;
> /* Flag to indicate we are going to kexec a new kernel */
> bool kexec_in_progress = false;
>
> -
> -/* Location of the reserved area for the crash kernel */
> -struct resource crashk_res = {
> - .name = "Crash kernel",
> - .start = 0,
> - .end = 0,
> - .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
> - .desc = IORES_DESC_CRASH_KERNEL
> -};
> -struct resource crashk_low_res = {
> - .name = "Crash kernel",
> - .start = 0,
> - .end = 0,
> - .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
> - .desc = IORES_DESC_CRASH_KERNEL
> -};
> -
> int kexec_should_crash(struct task_struct *p)
> {
> /*
> --
> 2.25.1
>
More information about the kexec
mailing list