[PATCH 2/3] powerpc/kexec_load: add hotplug support
Sourabh Jain
sourabhjain at linux.ibm.com
Mon Jun 10 05:19:11 PDT 2024
Hello Hari,
On 10/06/24 15:08, Hari Bathini wrote:
>
>
> On 22/05/24 6:43 pm, Sourabh Jain wrote:
>> Kernel commits b741092d5976 ("powerpc/crash: add crash CPU hotplug
>> support") and 849599b702ef ("powerpc/crash: add crash memory hotplug
>> support") added crash CPU/Memory hotplug support on PowerPC. This patch
>> extends that support for the kexec_load syscall.
>>
>> During CPU/Memory hotplug events on PowerPC, two kexec segments,
>> elfcorehdr, and FDT, get updated by the kernel. To ensure the kernel
>> can safely update these two kexec segments for the kdump image loaded
>> using the kexec_load system call, the following changes are made:
>>
>> 1. Extra size is allocated for both elfcorehdr and FDT to accommodate
>> additional resources in the future. For the elfcorehdr, the size
>> hint
>> is taken from /sys/kernel/crash_elfcorehdr_size sysfs, while for
>> FDT,
>> extra size is allocated to hold possible CPU nodes.
>>
>> 2. Both elfcorehdr and FDT are skipped from SHA calculation.
>>
>> Cc: Aditya Gupta <adityag at linux.ibm.com>
>> Cc: Baoquan He <bhe at redhat.com>
>> Cc: Coiby Xu <coxu at redhat.com>
>> Cc: Hari Bathini <hbathini at linux.ibm.com>
>> Cc: Mahesh Salgaonkar <mahesh at linux.ibm.com>
>> Signed-off-by: Sourabh Jain <sourabhjain at linux.ibm.com>
>> ---
>> kexec/arch/ppc64/crashdump-ppc64.c | 16 ++-
>> kexec/arch/ppc64/fdt.c | 200 +++++++++++++++++++++++++++-
>> kexec/arch/ppc64/include/arch/fdt.h | 2 +-
>> kexec/arch/ppc64/kexec-elf-ppc64.c | 2 +-
>> kexec/arch/ppc64/kexec-ppc64.c | 12 +-
>> 5 files changed, 225 insertions(+), 7 deletions(-)
>>
>> diff --git a/kexec/arch/ppc64/crashdump-ppc64.c
>> b/kexec/arch/ppc64/crashdump-ppc64.c
>> index 6d47898..c14b593 100644
>> --- a/kexec/arch/ppc64/crashdump-ppc64.c
>> +++ b/kexec/arch/ppc64/crashdump-ppc64.c
>> @@ -476,7 +476,7 @@ int load_crashdump_segments(struct kexec_info
>> *info, char* mod_cmdline,
>> uint64_t max_addr, unsigned long min_base)
>> {
>> void *tmp;
>> - unsigned long sz;
>> + unsigned long sz, memsz;
>> uint64_t elfcorehdr;
>> int nr_ranges, align = 1024, i;
>> unsigned long long end;
>> @@ -531,8 +531,18 @@ int load_crashdump_segments(struct kexec_info
>> *info, char* mod_cmdline,
>> }
>> }
>> - elfcorehdr = add_buffer(info, tmp, sz, sz, align, min_base,
>> - max_addr, 1);
>> + memsz = sz;
>> + /* To support --hotplug, replace the calculated minimum size
>> with the
>> + * value from /sys/kernel/crash_elfcorehdr_size and align it
>> correctly.
>> + */
>> + if (do_hotplug) {
>> + if (elfcorehdrsz > sz)
>> + memsz = _ALIGN(elfcorehdrsz, align);
>> + }
>> +
>> + /* Record the location of the elfcorehdr for hotplug handling */
>> + info->elfcorehdr = elfcorehdr = add_buffer(info, tmp, sz, memsz,
>> align,
>> + min_base, max_addr, 1);
>> reserve(elfcorehdr, sz);
>> /* modify and store the cmdline in a global array. This is later
>> * read by flatten_device_tree and modified if required
>> diff --git a/kexec/arch/ppc64/fdt.c b/kexec/arch/ppc64/fdt.c
>> index 8bc6d2d..10abc29 100644
>> --- a/kexec/arch/ppc64/fdt.c
>> +++ b/kexec/arch/ppc64/fdt.c
>> @@ -17,6 +17,13 @@
>> #include <libfdt.h>
>> #include <stdio.h>
>> #include <stdlib.h>
>> +#include <limits.h>
>> +#include <stdbool.h>
>> +#include <dirent.h>
>> +#include <sys/stat.h>
>> +
>> +#include "../../kexec.h"
>> +#include "../../kexec-syscall.h"
>> /*
>> * Let the kernel know it booted from kexec, as some things (e.g.
>> @@ -46,17 +53,208 @@ static int fixup_kexec_prop(void *fdt)
>> return 0;
>> }
>> +static inline bool is_dot_dir(char * d_path)
>> +{
>> + return d_path[0] == '.';
>> +}
>> +
>> +/*
>> + * Returns size of files including file name size under the given
>> + * @cpu_node_path.
>> + */
>> +static unsigned int get_cpu_node_size(char *cpu_node_path)
>> +{
>> + DIR *d;
>> + struct dirent *de;
>> + struct stat statbuf;
>> + unsigned int cpu_node_size = 0;
>> + char cpu_prop_path[2 * PATH_MAX];
>> +
>> + d = opendir(cpu_node_path);
>> + if (!d)
>> + return 0;
>> +
>> + while ((de = readdir(d)) != NULL) {
>> + if (de->d_type != DT_REG)
>> + continue;
>> +
>> + memset(cpu_prop_path, '\0', PATH_MAX);
>> + snprintf(cpu_prop_path, 2 * PATH_MAX, "%s/%s",
>> cpu_node_path, de->d_name);
>> +
>> + if (stat(cpu_prop_path, &statbuf))
>> + continue;
>> +
>> + cpu_node_size += statbuf.st_size;
>> + cpu_node_size += strlen(de->d_name);
>> + }
>> +
>> + return cpu_node_size;
>> +}
>> +
>> +/*
>> + * Checks if the node specified by the given @path represents a CPU
>> node.
>> + *
>> + * Returns true if the @path has a "device_type" file containing "cpu";
>> + * otherwise, returns false.
>> + */
>> +static bool is_cpu_node(char *path)
>> +{
>> + FILE *file;
>> + bool ret = false;
>> + char device_type[4];
>> +
>> + file = fopen(path, "r");
>> + if (!file)
>> + return false;
>> +
>> + memset(device_type, '\0', 4);
>> + if (fread(device_type, 1, 3, file) < 3)
>> + goto out;
>> +
>> + if (strcmp(device_type, "cpu"))
>> + goto out;
>> +
>> + ret = true;
>> +
>> +out:
>> + fclose(file);
>> + return ret;
>> +}
>> +
>> +static unsigned int get_threads_per_cpu(char *path)
>> +{
>> + struct stat statbuf;
>> + if (stat(path, &statbuf))
>> + return 0;
>> +
>> + return statbuf.st_size / 4;
>> +}
>> +
>> +/*
>> + * Finds the following CPU attributes:
>> + *
>> + * cpus_in_system: Currently available CPU nodes present under
>> + * /proc/device-tree/cpus.
>> + * threads_per_cpu: Number of threads per CPU, based on the device
>> tree entry
>> + * /proc/device-tree/cpus/<cpu_node>/ibm,ppc-interrupt-server#s.
>> + * cpu_node_size: Size of files including file name size under a CPU
>> node.
>> + *
>> + * Returns 0 on success, else -1.
>> + */
>> +static unsigned int get_cpu_info(int *_cpus_in_system, int
>> *_threads_per_cpu,
>> + int *_cpu_node_size)
>> +{
>> + DIR *d;
>> + struct dirent *de;
>> + int first_cpu = 1;
>> + char path[PATH_MAX];
>> + char *cpus_node_path = "/proc/device-tree/cpus";
>> + int cpus_in_system = 0, threads_per_cpu = 0, cpu_node_size = 0;
>> +
>> + d = opendir(cpus_node_path);
>> + if (!d)
>> + return -1;
>> +
>> + while ((de = readdir(d)) != NULL) {
>> + if ((de->d_type != DT_DIR) || is_dot_dir(de->d_name))
>> + continue;
>> +
>> + memset(path, '\0', PATH_MAX);
>> + snprintf(path, PATH_MAX, "%s/%s/%s", cpus_node_path,
>> + de->d_name, "device_type");
>> +
>> + /* Skip nodes with device_type != "cpu" */
>> + if (!is_cpu_node(path))
>> + continue;
>> +
>> + /*
>> + * Found the first node under /proc/device-tree/cpus with
>> + * device_type == "cpu"
>> + */
>> + if (first_cpu) {
>> + memset(path, '\0', PATH_MAX);
>> + snprintf(path, PATH_MAX, "%s/%s", cpus_node_path,
>> de->d_name);
>> + cpu_node_size = get_cpu_node_size(path);
>> +
>> + memset(path, '\0', PATH_MAX);
>> + snprintf(path, PATH_MAX, "%s/%s/%s", cpus_node_path,
>> + de->d_name, "ibm,ppc-interrupt-server#s");
>> + threads_per_cpu = get_threads_per_cpu(path);
>> +
>> + first_cpu = 0;
>> + }
>> +
>> + cpus_in_system++;
>> + }
>> +
>> + closedir(d);
>> +
>> + dbgprintf("cpus_in_system: %d, threads_per_cpus: %d,
>> cpu_node_size: %d\n",
>> + cpus_in_system, threads_per_cpu, cpu_node_size);
>> +
>> + if (!(cpus_in_system && threads_per_cpu && cpu_node_size))
>> + return -1;
>> +
>> + *_cpus_in_system = cpus_in_system;
>> + *_threads_per_cpu = threads_per_cpu;
>> + *_cpu_node_size = cpu_node_size;
>> +
>> + return 0;
>> +}
>> +
>> +/*
>> + * Calculates the extra size needed for the flattened device tree
>> (FDT) based
>> + * on the difference between the possible number of CPU nodes and
>> the number
>> + * of CPU nodes present under /proc/device-tree/cpus.
>> + */
>> +static unsigned int kdump_fdt_extra_size(void)
>> +{
>> + unsigned int extra_size = 0;
>> + int cpus_in_system = 0, threads_per_cpu = 0, cpu_node_size = 0;
>> + int possible_cpus;
>> +
>> + /* ALL possible CPUs are present in FDT so no extra size
>> required */
>> + if (sysconf(_SC_NPROCESSORS_ONLN) == sysconf(_SC_NPROCESSORS_CONF))
>> + return 0;
>> +
>> + if (get_cpu_info(&cpus_in_system, &threads_per_cpu,
>> &cpu_node_size)) {
>
> Is (sysconf(_SC_NPROCESSORS_ONLN) / threads_per_cpu) not the same as
> 'cpus_in_system'? If they are same, should be able to get cpu_node_size
> & threads_per_cpu by examining a single core instead of going through
> all cores on the system??
Agree we can avoid traversing all cpu nodes. I will include the above
suggestion
in next version.
>
>> + die("Failed to get cpu info\n");
>> + }
>> +
>> + /*
>> + * Maximum number of CPU nodes with device_type = "cpu" possible
>> under
>> + * /proc/device-tree/cpus/
>> + */
>> + possible_cpus = sysconf(_SC_NPROCESSORS_CONF) / threads_per_cpu;
>> +
>> + if (cpus_in_system > possible_cpus)
>> + die("Possible CPU nodes can't be less than active CPU
>> nodes\n");
>> +
>> +
>> + extra_size = (possible_cpus - cpus_in_system) * cpu_node_size;
>> + dbgprintf("kdump fdt extra size: %u\n", extra_size);
>> +
>> + return extra_size;
>> +}
>> /*
>> * For now, assume that the added content fits in the file.
>> * This should be the case when flattening from /proc/device-tree,
>> * and when passing in a dtb, dtc can be told to add padding.
>> */
>> -int fixup_dt(char **fdt, off_t *size)
>> +int fixup_dt(char **fdt, off_t *size, unsigned long kexec_flags)
>> {
>> int ret;
>> *size += 4096;
>> +
>> + /* To support --hotplug option for the kexec_load syscall, consider
>> + * adding extra buffer to FDT so that the kernel can add CPU nodes
>> + * of hot-added CPUs.
>> + */
>> + if (do_hotplug && (kexec_flags & KEXEC_ON_CRASH))
>> + *size += kdump_fdt_extra_size();
>> +
>> *fdt = realloc(*fdt, *size);
>> if (!*fdt) {
>> fprintf(stderr, "%s: out of memory\n", __func__);
>> diff --git a/kexec/arch/ppc64/include/arch/fdt.h
>> b/kexec/arch/ppc64/include/arch/fdt.h
>> index b19f185..5f340b0 100644
>> --- a/kexec/arch/ppc64/include/arch/fdt.h
>> +++ b/kexec/arch/ppc64/include/arch/fdt.h
>> @@ -3,6 +3,6 @@
>> #include <sys/types.h>
>> -int fixup_dt(char **fdt, off_t *size);
>> +int fixup_dt(char **fdt, off_t *size, unsigned long kexec_flags);
>> #endif
>> diff --git a/kexec/arch/ppc64/kexec-elf-ppc64.c
>> b/kexec/arch/ppc64/kexec-elf-ppc64.c
>> index bdcfd20..858c994 100644
>> --- a/kexec/arch/ppc64/kexec-elf-ppc64.c
>> +++ b/kexec/arch/ppc64/kexec-elf-ppc64.c
>> @@ -345,7 +345,7 @@ int elf_ppc64_load(int argc, char **argv, const
>> char *buf, off_t len,
>> create_flatten_tree(&seg_buf, &seg_size, cmdline);
>> }
>> - result = fixup_dt(&seg_buf, &seg_size);
>> + result = fixup_dt(&seg_buf, &seg_size, info->kexec_flags);
>> if (result < 0)
>> return result;
>> diff --git a/kexec/arch/ppc64/kexec-ppc64.c
>> b/kexec/arch/ppc64/kexec-ppc64.c
>> index fb27b6b..f27d76b 100644
>> --- a/kexec/arch/ppc64/kexec-ppc64.c
>> +++ b/kexec/arch/ppc64/kexec-ppc64.c
>> @@ -24,6 +24,7 @@
>> #include <errno.h>
>> #include <stdint.h>
>> #include <string.h>
>> +#include <libfdt.h>
>> #include <sys/stat.h>
>> #include <sys/types.h>
>> #include <dirent.h>
>> @@ -968,7 +969,16 @@ void arch_update_purgatory(struct kexec_info
>> *UNUSED(info))
>> {
>> }
>> -int arch_do_exclude_segment(struct kexec_segment *UNUSED(seg_ptr),
>> struct kexec_info *UNUSED(info))
>> +int arch_do_exclude_segment(struct kexec_segment *seg_ptr, struct
>> kexec_info *info)
>> {
>
>> + if (!seg_ptr)
>
> When can this be true? If this is a possiblilty, why do we not have the
> same thing for i386?
I agree that seg_ptr can't be NULL, which makes the above check unnecessary.
I will remove it in the next version.
Thanks,
Sourabh Jain
More information about the kexec
mailing list