[PATCH v2 2/3] powerpc/kexec_load: add hotplug support

Hari Bathini hbathini at linux.ibm.com
Wed Jun 26 03:07:58 PDT 2024



On 14/06/24 12:37 am, Sourabh Jain wrote:
> Kernel commits b741092d5976 ("powerpc/crash: add crash CPU hotplug
> support") and 849599b702ef ("powerpc/crash: add crash memory hotplug
> support") added crash CPU/Memory hotplug support on PowerPC. This patch
> extends that support to the kexec_load syscall.
> 
> During CPU/Memory hotplug events on PowerPC, two kexec segments,
> elfcorehdr and FDT, get updated by the kernel. To ensure the kernel
> can safely update these two kexec segments for the kdump image loaded
> using the kexec_load system call, the following changes are made:
> 
> 1. Extra size is allocated for both elfcorehdr and FDT to accommodate
>     additional resources in the future. For the elfcorehdr, the size hint
>     is taken from the /sys/kernel/crash_elfcorehdr_size sysfs file, while
>     for FDT, extra size is allocated to hold possible CPU nodes.
> 
> 2. Both elfcorehdr and FDT are excluded from the SHA calculation.
> 
> Cc: Aditya Gupta <adityag at linux.ibm.com>
> Cc: Baoquan He <bhe at redhat.com>
> Cc: Coiby Xu <coxu at redhat.com>
> Cc: Hari Bathini <hbathini at linux.ibm.com>
> Cc: Mahesh Salgaonkar <mahesh at linux.ibm.com>
> Cc: Simon Horman <horms at kernel.org>

LGTM.

Acked-by: Hari Bathini <hbathini at linux.ibm.com>

> Signed-off-by: Sourabh Jain <sourabhjain at linux.ibm.com>
> ---
> 
> Changelog:
> 
> Since v1:
>    - Find CPUs in the system using the /sys/devices/system/cpu/present sysfs
>      file instead of traversing all nodes under /proc/device-tree/cpus.
> 
>    - Added a new function to find present CPUs in the system.
> 
>    - Removed unnecessary NULL check on seg_ptr from arch_do_exclude_segment().
> 
> ---
>   kexec/arch/ppc64/crashdump-ppc64.c  |  16 +-
>   kexec/arch/ppc64/fdt.c              | 236 +++++++++++++++++++++++++++-
>   kexec/arch/ppc64/include/arch/fdt.h |   2 +-
>   kexec/arch/ppc64/kexec-elf-ppc64.c  |   2 +-
>   kexec/arch/ppc64/kexec-ppc64.c      |   9 +-
>   5 files changed, 258 insertions(+), 7 deletions(-)
> 
> diff --git a/kexec/arch/ppc64/crashdump-ppc64.c b/kexec/arch/ppc64/crashdump-ppc64.c
> index 6d47898..98d439a 100644
> --- a/kexec/arch/ppc64/crashdump-ppc64.c
> +++ b/kexec/arch/ppc64/crashdump-ppc64.c
> @@ -476,7 +476,7 @@ int load_crashdump_segments(struct kexec_info *info, char* mod_cmdline,
>   				uint64_t max_addr, unsigned long min_base)
>   {
>   	void *tmp;
> -	unsigned long sz;
> +	unsigned long sz, memsz;
>   	uint64_t elfcorehdr;
>   	int nr_ranges, align = 1024, i;
>   	unsigned long long end;
> @@ -531,8 +531,18 @@ int load_crashdump_segments(struct kexec_info *info, char* mod_cmdline,
>   		}
>   	}
>   
> -	elfcorehdr = add_buffer(info, tmp, sz, sz, align, min_base,
> -				max_addr, 1);
> +	memsz = sz;
> +	/* To support --hotplug, replace the calculated memsz with the value
> +	 * from /sys/kernel/crash_elfcorehdr_size and align it correctly.
> +	 */
> +	if (do_hotplug) {
> +		if (elfcorehdrsz > sz)
> +			memsz = _ALIGN(elfcorehdrsz, align);
> +	}
> +
> +	/* Record the location of the elfcorehdr for hotplug handling */
> +	info->elfcorehdr = elfcorehdr = add_buffer(info, tmp, sz, memsz, align,
> +						   min_base, max_addr, 1);
>   	reserve(elfcorehdr, sz);
>   	/* modify and store the cmdline in a global array. This is later
>   	 * read by flatten_device_tree and modified if required
> diff --git a/kexec/arch/ppc64/fdt.c b/kexec/arch/ppc64/fdt.c
> index 8bc6d2d..879240f 100644
> --- a/kexec/arch/ppc64/fdt.c
> +++ b/kexec/arch/ppc64/fdt.c
> @@ -17,6 +17,13 @@
>   #include <libfdt.h>
>   #include <stdio.h>
>   #include <stdlib.h>
> +#include <limits.h>
> +#include <stdbool.h>
> +#include <dirent.h>
> +#include <sys/stat.h>
> +
> +#include "../../kexec.h"
> +#include "../../kexec-syscall.h"
>   
>   /*
>    * Let the kernel know it booted from kexec, as some things (e.g.
> @@ -46,17 +53,244 @@ static int fixup_kexec_prop(void *fdt)
>   	return 0;
>   }
>   
> +static inline bool is_dot_dir(char * d_path)
> +{
> +	return d_path[0] == '.';
> +}
> +
> +/*
> + * get_cpu_node_size - Returns size of files including file name size under
> + *                     the given @cpu_node_path.
> + */
> +static int get_cpu_node_size(char *cpu_node_path)
> +{
> +	DIR *d;
> +	struct dirent *de;
> +	struct stat statbuf;
> +	int cpu_node_size = 0;
> +	char cpu_prop_path[2 * PATH_MAX];
> +
> +	d = opendir(cpu_node_path);
> +	if (!d)
> +		return 0;
> +
> +	while ((de = readdir(d)) != NULL) {
> +		if (de->d_type != DT_REG)
> +			continue;
> +
> +		memset(cpu_prop_path, '\0', PATH_MAX);
> +		snprintf(cpu_prop_path, 2 * PATH_MAX, "%s/%s", cpu_node_path,
> +			 de->d_name);
> +
> +		if (stat(cpu_prop_path, &statbuf))
> +			continue;
> +
> +		cpu_node_size += statbuf.st_size;
> +		cpu_node_size += strlen(de->d_name);
> +	}
> +
> +	return cpu_node_size;
> +}
> +
> +/*
> + * is_cpu_node - Checks if the node specified by the given @path
> + *               represents a CPU node.
> + *
> + * Returns true if the @path has a "device_type" file containing "cpu";
> + * otherwise, returns false.
> + */
> +static bool is_cpu_node(char *path)
> +{
> +	FILE *file;
> +	bool ret = false;
> +	char device_type[4];
> +
> +	file = fopen(path, "r");
> +	if (!file)
> +		return false;
> +
> +	memset(device_type, '\0', 4);
> +	if (fread(device_type, 1, 3, file) < 3)
> +		goto out;
> +
> +	if (strcmp(device_type, "cpu"))
> +		goto out;
> +
> +	ret = true;
> +out:
> +	fclose(file);
> +	return ret;
> +}
> +
> +static int get_threads_per_cpu(char *path)
> +{
> +	struct stat statbuf;
> +	if (stat(path, &statbuf))
> +		return 0;
> +
> +	return statbuf.st_size / 4;
> +}
> +
> +/**
> + * get_present_cpus - finds the present CPUs in the system
> + *
> + * This function opens the file `/sys/devices/system/cpu/present` to read
> + * the range of present CPUs. It parses the range and calculates the
> + * total number of present CPUs in the system.
> + *
> + * Returns total number of present CPUs on success, -1 on failure.
> + */
> +static int get_present_cpus()
> +{
> +	char *range;
> +	char buf[1024];
> +	int start, end;
> +	int cpu_count = 0;
> +	FILE *file = fopen("/sys/devices/system/cpu/present", "r");
> +
> +	if (!file)
> +		return -1;
> +
> +	if (!fgets(buf, sizeof(buf), file))
> +		return -1;
> +
> +	fclose(file);
> +
> +	range = strtok(buf, ",");
> +	while (range != NULL) {
> +		if (sscanf(range, "%d-%d", &start, &end) == 2) {
> +			for (int i = start; i <= end; i++)
> +				cpu_count++;
> +		} else if (sscanf(range, "%d", &start) == 1) {
> +			cpu_count++;
> +		} else {
> +			return -1;
> +		}
> +		range = strtok(NULL, ",");
> +	}
> +
> +	return cpu_count;
> +}
> +
> +/*
> + * get_cpu_info - Finds the following CPU attributes:
> + *
> + * threads_per_cpu: Number of threads per CPU, based on the device tree entry
> + *                  /proc/device-tree/cpus/<cpu_node>/ibm,ppc-interrupt-server#s.
> + * cpu_node_size: Size of files including file name size under a CPU node.
> + *
> + * Returns 0 on success, else -1.
> + */
> +static int get_cpu_info(int *_present_cpus, int *_threads_per_cpu, int *_cpu_node_size)
> +{
> +	DIR *d;
> +	struct dirent *de;
> +	char path[PATH_MAX];
> +	int present_cpus = 0, threads_per_cpu = 0, cpu_node_size = 0;
> +	char *cpus_node_path = "/proc/device-tree/cpus";
> +
> +	present_cpus = get_present_cpus();
> +	if (present_cpus < 0)
> +		return -1;
> +
> +	d = opendir(cpus_node_path);
> +	if (!d)
> +		return -1;
> +
> +	while ((de = readdir(d)) != NULL) {
> +		if ((de->d_type != DT_DIR) || is_dot_dir(de->d_name))
> +			continue;
> +
> +		memset(path, '\0', PATH_MAX);
> +		snprintf(path, PATH_MAX, "%s/%s/%s", cpus_node_path,
> +			 de->d_name, "device_type");
> +
> +		/* Skip nodes with device_type != "cpu" */
> +		if (!is_cpu_node(path))
> +			continue;
> +
> +		/*
> +		 * Found the first node under /proc/device-tree/cpus with
> +		 * device_type == "cpu"
> +		 */
> +		memset(path, '\0', PATH_MAX);
> +		snprintf(path, PATH_MAX, "%s/%s", cpus_node_path, de->d_name);
> +		cpu_node_size = get_cpu_node_size(path);
> +
> +		memset(path, '\0', PATH_MAX);
> +		snprintf(path, PATH_MAX, "%s/%s/%s", cpus_node_path,
> +		de->d_name, "ibm,ppc-interrupt-server#s");
> +		threads_per_cpu = get_threads_per_cpu(path);
> +		break;
> +	}
> +
> +	closedir(d);
> +
> +	if (!(threads_per_cpu && cpu_node_size))
> +		return -1;
> +
> +	*_present_cpus = present_cpus;
> +	*_cpu_node_size = cpu_node_size;
> +	*_threads_per_cpu = threads_per_cpu;
> +
> +	dbgprintf("present_cpus: %d, threads_per_cpu: %d, cpu_node_size: %d\n",
> +		  present_cpus, threads_per_cpu, cpu_node_size);
> +
> +	return 0;
> +}
> +
> +/*
> + * kdump_fdt_extra_size - Calculates the extra size needed for the Flattened
> + *                        Device Tree (FDT) based on the possible and present
> + *                        CPUs in the system.
> + */
> +static unsigned int kdump_fdt_extra_size(void)
> +{
> +	int cpus_in_system;
> +	unsigned int extra_size = 0;
> +	int present_cpus = 0, threads_per_cpu = 0, cpu_node_size = 0;
> +	int possible_cpus;
> +
> +	/* ALL possible CPUs are present in FDT so no extra size required */
> +	if (sysconf(_SC_NPROCESSORS_ONLN) == sysconf(_SC_NPROCESSORS_CONF))
> +		return 0;
> +
> +	if (get_cpu_info(&present_cpus, &threads_per_cpu, &cpu_node_size)) {
> +		die("Failed to get cpu info\n");
> +	}
> +
> +	cpus_in_system = present_cpus / threads_per_cpu;
> +	possible_cpus = sysconf(_SC_NPROCESSORS_CONF) / threads_per_cpu;
> +	dbgprintf("cpus_in_system: %d, possible_cpus: %d\n", cpus_in_system,
> +		  possible_cpus);
> +
> +	if (cpus_in_system > possible_cpus)
> +		die("Possible CPU nodes can't be less than active CPU nodes\n");
> +
> +	extra_size = (possible_cpus - cpus_in_system) * cpu_node_size;
> +	dbgprintf("kdump fdt extra size: %u\n", extra_size);
> +
> +	return extra_size;
> +}
>   
>   /*
>    * For now, assume that the added content fits in the file.
>    * This should be the case when flattening from /proc/device-tree,
>    * and when passing in a dtb, dtc can be told to add padding.
>    */
> -int fixup_dt(char **fdt, off_t *size)
> +int fixup_dt(char **fdt, off_t *size, unsigned long kexec_flags)
>   {
>   	int ret;
>   
>   	*size += 4096;
> +
> +	/* To support --hotplug option for the kexec_load syscall, consider
> +	 * adding extra buffer to FDT so that the kernel can add CPU nodes
> +	 * of hot-added CPUs.
> +	 */
> +	if (do_hotplug && (kexec_flags & KEXEC_ON_CRASH))
> +		*size += kdump_fdt_extra_size();
> +
>   	*fdt = realloc(*fdt, *size);
>   	if (!*fdt) {
>   		fprintf(stderr, "%s: out of memory\n", __func__);
> diff --git a/kexec/arch/ppc64/include/arch/fdt.h b/kexec/arch/ppc64/include/arch/fdt.h
> index b19f185..5f340b0 100644
> --- a/kexec/arch/ppc64/include/arch/fdt.h
> +++ b/kexec/arch/ppc64/include/arch/fdt.h
> @@ -3,6 +3,6 @@
>   
>   #include <sys/types.h>
>   
> -int fixup_dt(char **fdt, off_t *size);
> +int fixup_dt(char **fdt, off_t *size, unsigned long kexec_flags);
>   
>   #endif
> diff --git a/kexec/arch/ppc64/kexec-elf-ppc64.c b/kexec/arch/ppc64/kexec-elf-ppc64.c
> index bdcfd20..858c994 100644
> --- a/kexec/arch/ppc64/kexec-elf-ppc64.c
> +++ b/kexec/arch/ppc64/kexec-elf-ppc64.c
> @@ -345,7 +345,7 @@ int elf_ppc64_load(int argc, char **argv, const char *buf, off_t len,
>   		create_flatten_tree(&seg_buf, &seg_size, cmdline);
>   	}
>   
> -	result = fixup_dt(&seg_buf, &seg_size);
> +	result = fixup_dt(&seg_buf, &seg_size, info->kexec_flags);
>   	if (result < 0)
>   		return result;
>   
> diff --git a/kexec/arch/ppc64/kexec-ppc64.c b/kexec/arch/ppc64/kexec-ppc64.c
> index fb27b6b..13c3ce3 100644
> --- a/kexec/arch/ppc64/kexec-ppc64.c
> +++ b/kexec/arch/ppc64/kexec-ppc64.c
> @@ -24,6 +24,7 @@
>   #include <errno.h>
>   #include <stdint.h>
>   #include <string.h>
> +#include <libfdt.h>
>   #include <sys/stat.h>
>   #include <sys/types.h>
>   #include <dirent.h>
> @@ -968,7 +969,13 @@ void arch_update_purgatory(struct kexec_info *UNUSED(info))
>   {
>   }
>   
> -int arch_do_exclude_segment(struct kexec_segment *UNUSED(seg_ptr), struct kexec_info *UNUSED(info))
> +int arch_do_exclude_segment(struct kexec_segment *seg_ptr, struct kexec_info *info)
>   {
> +	if (info->elfcorehdr == (unsigned long) seg_ptr->mem)
> +		return 1;
> +
> +	if (seg_ptr->buf && fdt_magic(seg_ptr->buf) == FDT_MAGIC)
> +		return 1;
> +
>   	return 0;
>   }



More information about the kexec mailing list