[PATCHv3 5/9] kexec: Introduce kexec_pe_image to parse and load PE file
Pingfan Liu
piliu at redhat.com
Wed May 28 21:17:40 PDT 2025
As UEFI becomes popular, a few architectures support booting a PE format
kernel image directly. But the internals of the PE format vary, which means
each format needs its own parser.
This patch (with the rest in this series) introduces a common skeleton
for all parsers, and leaves the format parsing to a
bpf-prog, so the kernel code can stay relatively stable.
A new kexec_file_ops is implemented, named pe_image_ops.
There are some placeholder functions in this patch. (They will take
effect after the introduction of the kexec bpf light skeleton and bpf
helpers.) Overall, the parsing process is a pipeline: the current
bpf-prog parser is attached to bpf_handle_pefile() and detached at the
end of the current stage by disarm_bpf_prog(). The result parsed by the
current bpf-prog is buffered in the kernel, picked up by
prepare_nested_pe(), and delivered to the next stage. For each stage, the
bpf bytecode is extracted from the '.bpf' section in the PE file.
Signed-off-by: Pingfan Liu <piliu at redhat.com>
Cc: Baoquan He <bhe at redhat.com>
Cc: Dave Young <dyoung at redhat.com>
Cc: Andrew Morton <akpm at linux-foundation.org>
Cc: Philipp Rudo <prudo at redhat.com>
To: kexec at lists.infradead.org
---
include/linux/kexec.h | 1 +
kernel/Kconfig.kexec | 8 +
kernel/Makefile | 1 +
kernel/kexec_pe_image.c | 356 ++++++++++++++++++++++++++++++++++++++++
4 files changed, 366 insertions(+)
create mode 100644 kernel/kexec_pe_image.c
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 26398b269ac29..bca8136dcf1fd 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -392,6 +392,7 @@ static inline int machine_kexec_post_load(struct kimage *image) { return 0; }
extern struct kimage *kexec_image;
extern struct kimage *kexec_crash_image;
+extern const struct kexec_file_ops pe_image_ops;
bool kexec_load_permitted(int kexec_image_type);
diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec
index 4d111f8719516..686eb7cb96142 100644
--- a/kernel/Kconfig.kexec
+++ b/kernel/Kconfig.kexec
@@ -47,6 +47,14 @@ config KEXEC_FILE
for kernel and initramfs as opposed to list of segments as
accepted by kexec system call.
+config KEXEC_PE_IMAGE
+ bool "Enable parsing UEFI PE file through kexec file based system call"
+ depends on KEXEC_FILE
+ depends on DEBUG_INFO_BTF && BPF_SYSCALL
+ help
+ This option makes the kexec_file_load() syscall cooperate with a
+ bpf-prog to parse PE format files.
+
config KEXEC_SIG
bool "Verify kernel signature during kexec_file_load() syscall"
depends on ARCH_SUPPORTS_KEXEC_SIG
diff --git a/kernel/Makefile b/kernel/Makefile
index 434929de17ef2..ab82d73d8ce81 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -79,6 +79,7 @@ obj-$(CONFIG_KEXEC_CORE) += kexec_core.o
obj-$(CONFIG_CRASH_DUMP) += crash_core.o
obj-$(CONFIG_KEXEC) += kexec.o
obj-$(CONFIG_KEXEC_FILE) += kexec_file.o
+obj-$(CONFIG_KEXEC_PE_IMAGE) += kexec_pe_image.o
obj-$(CONFIG_KEXEC_ELF) += kexec_elf.o
obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
obj-$(CONFIG_COMPAT) += compat.o
diff --git a/kernel/kexec_pe_image.c b/kernel/kexec_pe_image.c
new file mode 100644
index 0000000000000..3097efccb8502
--- /dev/null
+++ b/kernel/kexec_pe_image.c
@@ -0,0 +1,356 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Kexec PE image loader
+
+ * Copyright (C) 2025 Red Hat, Inc
+ */
+
+#define pr_fmt(fmt) "kexec_file(Image): " fmt
+
+#include <linux/err.h>
+#include <linux/errno.h>
+#include <linux/list.h>
+#include <linux/kernel.h>
+#include <linux/vmalloc.h>
+#include <linux/kexec.h>
+#include <linux/pe.h>
+#include <linux/string.h>
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <asm/byteorder.h>
+#include <asm/image.h>
+#include <asm/memory.h>
+
+
+static LIST_HEAD(phase_head);	/* every parsed_phase, appended in creation order */
+
+struct parsed_phase {
+ struct list_head head;	/* link in phase_head */
+ struct list_head res_head;	/* list of struct kexec_res, linked through ->node */
+};
+
+static struct parsed_phase *cur_phase;	/* phase of the current stage; assigned in pe_image_load() */
+
+static char *kexec_res_names[3] = {"kernel", "initrd", "cmdline"};	/* names a kexec_res may carry */
+
+struct kexec_res {
+ struct list_head node;	/* link in parsed_phase.res_head */
+ char *name;	/* resource kind; compared against kexec_res_names */
+ /*
+  * The free of buffer is deferred to kimage_file_post_load_cleanup.
+  * When set, pe_image_load() clears r->buf and r->buf_sz before
+  * dropping its reference on r.
+  */
+ bool deferred_free;
+ struct mem_range_result *r;	/* holds the data; ->buf and ->data_sz are read by the loader */
+};
+
+/*
+ * Allocate an empty parsing phase and append it to the tail of phase_head.
+ *
+ * Return: the new phase, or NULL if the allocation fails, in which case
+ * phase_head is left untouched.
+ */
+static struct parsed_phase *alloc_new_phase(void)
+{
+	struct parsed_phase *phase = kzalloc(sizeof(*phase), GFP_KERNEL);
+
+	/* kzalloc() may fail; never touch the list heads of a NULL phase */
+	if (!phase)
+		return NULL;
+
+	INIT_LIST_HEAD(&phase->head);
+	INIT_LIST_HEAD(&phase->res_head);
+	list_add_tail(&phase->head, &phase_head);
+
+	return phase;
+}
+
+/*
+ * Check whether @kernel_buf starts with an MZ header that points at a PE
+ * header carrying an optional header.
+ *
+ * @kernel_buf: start of the candidate image, may be NULL
+ * @kernel_len: number of valid bytes at @kernel_buf
+ *
+ * All offsets are taken from the file itself, so each header is bounds
+ * checked against @kernel_len before it is read.
+ *
+ * Return: true if the buffer looks like a PE image, false otherwise.
+ */
+static bool is_valid_pe(const char *kernel_buf, unsigned long kernel_len)
+{
+	const struct mz_hdr *mz;
+	const struct pe_hdr *pe;
+
+	if (!kernel_buf || kernel_len < sizeof(*mz))
+		return false;
+	mz = (const struct mz_hdr *)kernel_buf;
+	if (mz->magic != MZ_MAGIC)
+		return false;
+
+	/* subtraction form: cannot overflow whatever value peaddr has */
+	if (kernel_len < sizeof(*pe) || mz->peaddr > kernel_len - sizeof(*pe))
+		return false;
+	pe = (const struct pe_hdr *)(kernel_buf + mz->peaddr);
+	if (pe->magic != PE_MAGIC)
+		return false;
+	if (pe->opt_hdr_size == 0) {
+		pr_err("optional header is missing\n");
+		return false;
+	}
+	/* the optional header must also lie inside the buffer */
+	if (pe->opt_hdr_size > kernel_len - mz->peaddr - sizeof(*pe))
+		return false;
+
+	return true;
+}
+
+/*
+ * Tell whether the buffer is in a container format this loader can unfold.
+ * The decision is delegated entirely to is_valid_pe().
+ */
+static bool is_valid_format(const char *kernel_buf, unsigned long kernel_len)
+{
+	bool supported = is_valid_pe(kernel_buf, kernel_len);
+
+	return supported;
+}
+
+/*
+ * Probe callback of the PE image loader (wired up as .probe in this
+ * file's kexec_file_ops instance).
+ *
+ * Accepts the image when is_valid_pe() recognises an MZ header followed
+ * by a PE header.
+ * NOTE(review): the previous comment mentioned the UEFI Terse Executable
+ * (TE) format; only the MZ/PE signatures are checked here.
+ *
+ * Return: 0 if the image is accepted, -ENOEXEC otherwise.  A proper errno
+ * is returned instead of a bare -1 so a caller that propagates the probe
+ * result does not report -EPERM.
+ */
+static int pe_image_probe(const char *kernel_buf, unsigned long kernel_len)
+{
+	return is_valid_pe(kernel_buf, kernel_len) ? 0 : -ENOEXEC;
+}
+
+/*
+ * Look up a section by name in the PE image at @file_buf.
+ *
+ * @file_buf:   start of a PE image
+ * @sect_name:  NUL-terminated section name to search for, e.g. ".bpf"
+ * @sect_start: set to the section's data inside @file_buf, NULL if absent
+ * @sect_sz:    set to the section's raw data size, 0 if absent
+ *
+ * Return: 0 if the section is found, -1 otherwise.
+ *
+ * NOTE(review): this function is not given the file size, so the section
+ * table and data_addr are not bounds checked here; callers are expected
+ * to pass a buffer that already passed is_valid_pe().
+ */
+static int get_pe_section(char *file_buf, const char *sect_name,
+			  char **sect_start, unsigned long *sect_sz)
+{
+	struct pe_hdr *pe_hdr;
+	struct pe32plus_opt_hdr *opt_hdr;
+	struct section_header *sect_hdr;
+	int section_nr, i;
+	struct mz_hdr *mz = (struct mz_hdr *)file_buf;
+
+	*sect_start = NULL;
+	*sect_sz = 0;
+	pe_hdr = (struct pe_hdr *)(file_buf + mz->peaddr);
+	section_nr = pe_hdr->sections;
+	opt_hdr = (struct pe32plus_opt_hdr *)(file_buf + mz->peaddr + sizeof(struct pe_hdr));
+	/* the section table immediately follows the optional header */
+	sect_hdr = (struct section_header *)((char *)opt_hdr + pe_hdr->opt_hdr_size);
+
+	for (i = 0; i < section_nr; i++, sect_hdr++) {
+		/*
+		 * The name field is a fixed-size array that is not
+		 * NUL-terminated when the name fills it completely, so the
+		 * comparison must be bounded by the field size.
+		 */
+		if (strncmp(sect_hdr->name, sect_name, sizeof(sect_hdr->name)))
+			continue;
+
+		*sect_start = file_buf + sect_hdr->data_addr;
+		*sect_sz = sect_hdr->raw_data_size;
+		return 0;
+	}
+
+	return -1;
+}
+
+/*
+ * Tell whether the PE image at @file_buf contains a '.bpf' section.
+ *
+ * @file_buf: start of a PE image
+ * @pe_sz:    size of the image (currently unused)
+ *
+ * Return: true if get_pe_section() finds '.bpf', false otherwise.
+ */
+static bool pe_has_bpf_section(char *file_buf, unsigned long pe_sz)
+{
+	char *sect_start = NULL;
+	unsigned long sect_sz = 0;
+
+	/* only presence matters here; the location and size are discarded */
+	return get_pe_section(file_buf, ".bpf", &sect_start, &sect_sz) >= 0;
+}
+
+/*
+ * Load an ELF.
+ *
+ * arm_bpf_prog() and disarm_bpf_prog() are placeholders in this file:
+ * arm_bpf_prog() does nothing and returns 0, disarm_bpf_prog() has an
+ * empty body.  pe_image_load() passes the contents of the '.bpf' section
+ * to arm_bpf_prog(), treats a non-zero return as a load failure, and calls
+ * disarm_bpf_prog() at the end of every stage.
+ */
+static int arm_bpf_prog(char *bpf_elf, unsigned long sz)
+{
+ return 0;
+}
+
+static void disarm_bpf_prog(void)
+{
+}
+
+struct kexec_context {	/* per-stage input handed to bpf_handle_pefile() and bpf_post_handle_pefile() */
+ bool kdump;	/* true when the image type is KEXEC_TYPE_CRASH */
+ char *image;	/* kernel image buffer of the current stage */
+ int image_sz;	/* size of image; NOTE(review): filled from an unsigned long */
+ char *initrd;	/* initrd buffer of the current stage */
+ int initrd_sz;	/* size of initrd; NOTE(review): filled from an unsigned long */
+ char *cmdline;	/* command line of the current stage */
+ int cmdline_sz;	/* filled with strlen() of cmdline by pe_image_load() */
+};
+
+void bpf_handle_pefile(struct kexec_context *context);
+void bpf_post_handle_pefile(struct kexec_context *context);
+
+
+/*
+ * Intentionally empty functions that pe_image_load() calls once per
+ * parsing stage: bpf_handle_pefile() before prepare_nested_pe() and
+ * bpf_post_handle_pefile() after it.  According to the comments at the
+ * call sites they exist as bpf-prog fentry attachment points, so the
+ * calls themselves must survive compilation.
+ *
+ * optimize("O0") prevents inline, compiler constant propagation
+ */
+__attribute__((used, optimize("O0"))) void bpf_handle_pefile(struct kexec_context *context)
+{
+}
+
+__attribute__((used, optimize("O0"))) void bpf_post_handle_pefile(struct kexec_context *context)
+{
+}
+
+/*
+ * PE file may be nested and should be unfolded one by one.
+ * Query 'kernel', 'initrd', 'cmdline' in cur_phase, as they are inputs for
+ * the next phase.
+ *
+ * @kernel, @kernel_len: always reset; set to the 'kernel' resource if the
+ *                       current phase has one, otherwise NULL / 0
+ * @initrd, @initrd_len: overwritten only if the phase has an 'initrd'
+ * @cmdline:             overwritten only if the phase has a 'cmdline'
+ *
+ * Names are compared with strcmp(), the same way pe_image_load() matches
+ * them, rather than by pointer identity with kexec_res_names[].
+ *
+ * Return: 0 if a 'kernel' resource was found, -1 otherwise (including the
+ * case where there is no current phase).
+ */
+static int prepare_nested_pe(char **kernel, unsigned long *kernel_len, char **initrd,
+			     unsigned long *initrd_len, char **cmdline)
+{
+	struct kexec_res *res;
+	int ret = -1;
+
+	*kernel = NULL;
+	*kernel_len = 0;
+
+	if (!cur_phase)
+		return ret;
+
+	list_for_each_entry(res, &cur_phase->res_head, node) {
+		if (!strcmp(res->name, kexec_res_names[0])) {
+			*kernel = res->r->buf;
+			*kernel_len = res->r->data_sz;
+			ret = 0;
+		} else if (!strcmp(res->name, kexec_res_names[1])) {
+			*initrd = res->r->buf;
+			*initrd_len = res->r->data_sz;
+		} else if (!strcmp(res->name, kexec_res_names[2])) {
+			*cmdline = res->r->buf;
+		}
+	}
+
+	return ret;
+}
+
+/*
+ * Load callback of the PE image loader.
+ *
+ * While the current kernel buffer is a PE image with a '.bpf' section, one
+ * parsing stage is run: the section is handed to arm_bpf_prog(), a new
+ * phase is allocated, bpf_handle_pefile() is called with the current
+ * buffers, the phase's results become the input of the next stage
+ * (prepare_nested_pe()), and disarm_bpf_prog() ends the stage.
+ *
+ * Afterwards the phases are scanned from newest to oldest for 'kernel',
+ * 'initrd' and 'cmdline' results.  Those buffers replace the ones held by
+ * @image, and the architecture's default probe/load path is run on the
+ * extracted kernel.
+ *
+ * @image:       image being loaded; its kernel/initrd/cmdline buffers may
+ *               be freed and replaced
+ * @kernel:      original kernel buffer and @kernel_len its size
+ * @initrd:      original initrd buffer (may be NULL) and @initrd_len
+ * @cmdline:     original command line (may be NULL) and @cmdline_len
+ *
+ * Return: the loader data produced by kexec_image_load_default(), or an
+ * ERR_PTR() on failure.  All phases and their resources are released
+ * before returning, in both the success and the failure case.
+ */
+static void *pe_image_load(struct kimage *image,
+			    char *kernel, unsigned long kernel_len,
+			    char *initrd, unsigned long initrd_len,
+			    char *cmdline, unsigned long cmdline_len)
+{
+	struct kexec_res *kernel_res = NULL, *initrd_res = NULL, *cmdline_res = NULL;
+	char *parsed_kernel = NULL;
+	unsigned long parsed_len = 0;
+	char *linux_start, *initrd_start, *cmdline_start, *bpf_start;
+	unsigned long linux_sz, initrd_sz, bpf_sz;
+	struct parsed_phase *phase, *phase_tmp;
+	struct kexec_res *res, *res_tmp;
+	void *ldata;
+	int ret;
+
+	linux_start = kernel;
+	linux_sz = kernel_len;
+	initrd_start = initrd;
+	initrd_sz = initrd_len;
+	cmdline_start = cmdline;
+
+	while (is_valid_format(linux_start, linux_sz) &&
+	       pe_has_bpf_section(linux_start, linux_sz)) {
+		struct kexec_context context;
+
+		get_pe_section(linux_start, ".bpf", &bpf_start, &bpf_sz);
+		if (bpf_sz) {
+			/* load and attach bpf-prog */
+			ret = arm_bpf_prog(bpf_start, bpf_sz);
+			if (ret) {
+				pr_err("Fail to load .bpf section\n");
+				ldata = ERR_PTR(ret);
+				goto out;
+			}
+		}
+		cur_phase = alloc_new_phase();
+		if (!cur_phase) {
+			/* end the stage that was just armed before bailing out */
+			disarm_bpf_prog();
+			ldata = ERR_PTR(-ENOMEM);
+			goto out;
+		}
+		context.kdump = image->type == KEXEC_TYPE_CRASH;
+		context.image = linux_start;
+		context.image_sz = linux_sz;
+		context.initrd = initrd_start;
+		context.initrd_sz = initrd_sz;
+		context.cmdline = cmdline_start;
+		/* the command line is optional; strlen(NULL) must be avoided */
+		context.cmdline_sz = cmdline_start ? strlen(cmdline_start) : 0;
+		/* bpf-prog fentry, which handle above buffers. */
+		bpf_handle_pefile(&context);
+
+		/*
+		 * If this stage produced no kernel, linux_start becomes NULL
+		 * and the loop condition ends the pipeline.
+		 */
+		prepare_nested_pe(&linux_start, &linux_sz, &initrd_start,
+				  &initrd_sz, &cmdline_start);
+		/* bpf-prog fentry */
+		bpf_post_handle_pefile(&context);
+		/*
+		 * detach the current bpf-prog from their attachment points.
+		 * It also a point to free any registered interim resource.
+		 * Any resource except attached to phase is interim.
+		 */
+		disarm_bpf_prog();
+	}
+
+	/* the rear of parsed phase contains the result */
+	list_for_each_entry_reverse(phase, &phase_head, head) {
+		if (initrd != NULL && cmdline != NULL && parsed_kernel != NULL)
+			break;
+		list_for_each_entry(res, &phase->res_head, node) {
+			if (!strcmp(res->name, "kernel") && !parsed_kernel) {
+				parsed_kernel = res->r->buf;
+				parsed_len = res->r->data_sz;
+				kernel_res = res;
+			} else if (!strcmp(res->name, "initrd") && !initrd) {
+				initrd = res->r->buf;
+				initrd_len = res->r->data_sz;
+				initrd_res = res;
+			} else if (!strcmp(res->name, "cmdline") && !cmdline) {
+				cmdline = res->r->buf;
+				cmdline_len = res->r->data_sz;
+				cmdline_res = res;
+			}
+		}
+	}
+
+	/*
+	 * NOTE(review): initrd and cmdline are treated as mandatory here, so
+	 * a load without either of them fails with -EINVAL.
+	 */
+	if (initrd == NULL || cmdline == NULL || parsed_kernel == NULL) {
+		pr_err("Can not extract data for %s%s%s\n",
+		       parsed_kernel ? "" : "kernel ",
+		       initrd ? "" : "initrd ",
+		       cmdline ? "" : "cmdline ");
+		ldata = ERR_PTR(-EINVAL);
+		goto out;
+	}
+	/*
+	 * image's kernel_buf, initrd_buf, cmdline_buf are set. Now they should
+	 * be updated to the new content.
+	 *
+	 * A resource is marked deferred_free only here, once @image really
+	 * references its buffer.  Marking it earlier would make the cleanup
+	 * below drop the buffer pointer on the error path above, where
+	 * nothing else owns it.
+	 */
+	if (image->kernel_buf != parsed_kernel) {
+		vfree(image->kernel_buf);
+		image->kernel_buf = parsed_kernel;
+		image->kernel_buf_len = parsed_len;
+	}
+	if (kernel_res)
+		kernel_res->deferred_free = true;
+	if (image->initrd_buf != initrd) {
+		vfree(image->initrd_buf);
+		image->initrd_buf = initrd;
+		image->initrd_buf_len = initrd_len;
+	}
+	if (initrd_res)
+		initrd_res->deferred_free = true;
+	if (image->cmdline_buf != cmdline) {
+		kfree(image->cmdline_buf);
+		image->cmdline_buf = cmdline;
+		image->cmdline_buf_len = cmdline_len;
+	}
+	if (cmdline_res)
+		cmdline_res->deferred_free = true;
+
+	ret = arch_kexec_kernel_image_probe(image, image->kernel_buf,
+					    image->kernel_buf_len);
+	if (ret) {
+		pr_err("Fail to find suitable image loader\n");
+		ldata = ERR_PTR(ret);
+		goto out;
+	}
+	ldata = kexec_image_load_default(image);
+	if (IS_ERR(ldata)) {
+		pr_err("architecture code fails to load image\n");
+		goto out;
+	}
+	image->image_loader_data = ldata;
+
+out:
+	/* reached on success as well: the phases are interim in any case */
+	list_for_each_entry_safe(phase, phase_tmp, &phase_head, head) {
+		list_for_each_entry_safe(res, res_tmp, &phase->res_head, node) {
+			list_del(&res->node);
+			/* defer to kimage_file_post_load_cleanup() */
+			if (res->deferred_free) {
+				res->r->buf = NULL;
+				res->r->buf_sz = 0;
+			}
+			mem_range_result_put(res->r);
+			kfree(res);
+		}
+		list_del(&phase->head);
+		kfree(phase);
+	}
+	/* every phase has been freed; do not leave a dangling pointer behind */
+	cur_phase = NULL;
+
+	return ldata;
+}
+
+/*
+ * kexec_file_ops instance of the PE image loader.
+ *
+ * The symbol is named pe_image_ops to match the extern declaration that
+ * this patch adds to include/linux/kexec.h; with a different name, users
+ * of that declaration would fail to link.
+ */
+const struct kexec_file_ops pe_image_ops = {
+	.probe = pe_image_probe,
+	.load = pe_image_load,
+#ifdef CONFIG_KEXEC_IMAGE_VERIFY_SIG
+	.verify_sig = kexec_kernel_verify_pe_sig,
+#endif
+};
--
2.49.0
More information about the kexec
mailing list