[RFCv2 4/7] bpf/kexec: Introduce three bpf kfuncs for kexec
Pingfan Liu
piliu at redhat.com
Mon Apr 28 21:12:11 PDT 2025
This patch introduces three kfuncs dedicated to kexec_file_load.
In the kexec case, kexec_trylock() ensures there is no concurrency,
which simplifies the design of the kexec bpf kfuncs. (Later, a
dedicated BPF_PROG_TYPE_KEXEC may be introduced to limit their use
and improve safety.)
bpf_kexec_decompress(): creates a bridge to the in-kernel decompressors,
so bpf-progs do not need to reimplement lib/decompress_*.
bpf_kexec_result_release(): releases the resource once the bpf-prog is
done with it.
bpf_kexec_carrier(): the common data flow in the bpf scheme is from the
kernel to the bpf-prog. In the case of kexec_file_load, the kexec
component needs to buffer the result parsed by the bpf-prog (the
opposite of the usual direction) for the next parsing stage.
bpf_kexec_carrier() makes this reverse data flow possible: a bpf-prog
can publish the address of the parsed payload to the kernel, which
copies it for later use. A sketch of how a bpf-prog would chain these
kfuncs is shown below.
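The following is a minimal, hypothetical sketch (not part of this patch)
of how a bpf-prog might chain the three kfuncs. The attach point
kexec_pe_parse, its argument list, and the expected output size are
placeholders; only the kfunc signatures and the resource name "kernel"
come from this series.

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

/* kfunc prototypes, resolved against the kernel BTF */
extern struct mem_range_result *bpf_kexec_decompress(char *image_gz_payload,
		int image_gz_sz, unsigned int expected_decompressed_sz) __ksym;
extern int bpf_kexec_carrier(const char *name, struct mem_range_result *r) __ksym;
extern int bpf_kexec_result_release(struct mem_range_result *result) __ksym;

SEC("fentry/kexec_pe_parse")	/* hypothetical hook into the PE parser */
int BPF_PROG(parse_zboot, char *payload, int payload_sz)
{
	struct mem_range_result *r;

	/* Let the in-kernel decompressor inflate the payload; the expected
	 * output size here is only a guess for the sketch.
	 */
	r = bpf_kexec_decompress(payload, payload_sz, 64 * 1024 * 1024);
	if (!r)
		return 0;

	/* Publish the decompressed kernel image back to kexec */
	bpf_kexec_carrier("kernel", r);

	/* Drop the acquired reference; the buffer is freed on the last put */
	bpf_kexec_result_release(r);
	return 0;
}

char LICENSE[] SEC("license") = "GPL";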
Signed-off-by: Pingfan Liu <piliu at redhat.com>
Cc: Alexei Starovoitov <ast at kernel.org>
Cc: Daniel Borkmann <daniel at iogearbox.net>
Cc: Andrii Nakryiko <andrii at kernel.org>
Cc: Martin KaFai Lau <martin.lau at linux.dev>
Cc: Eduard Zingerman <eddyz87 at gmail.com>
Cc: Song Liu <song at kernel.org>
Cc: Yonghong Song <yonghong.song at linux.dev>
Cc: John Fastabend <john.fastabend at gmail.com>
Cc: KP Singh <kpsingh at kernel.org>
Cc: Stanislav Fomichev <sdf at fomichev.me>
Cc: Hao Luo <haoluo at google.com>
Cc: Jiri Olsa <jolsa at kernel.org>
Cc: Baoquan He <bhe at redhat.com>
Cc: Dave Young <dyoung at redhat.com>
Cc: Eric Biederman <ebiederm at xmission.com>
To: bpf at vger.kernel.org
To: kexec at lists.infradead.org
---
kernel/kexec_pe_image.c | 194 ++++++++++++++++++++++++++++++++++++++++
1 file changed, 194 insertions(+)
diff --git a/kernel/kexec_pe_image.c b/kernel/kexec_pe_image.c
index accf6b0f02e39..610bb134f5e34 100644
--- a/kernel/kexec_pe_image.c
+++ b/kernel/kexec_pe_image.c
@@ -15,6 +15,9 @@
#include <linux/kexec.h>
#include <linux/pe.h>
#include <linux/string.h>
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <linux/decompress/generic.h>
#include <asm/byteorder.h>
#include <asm/cpufeature.h>
#include <asm/image.h>
@@ -52,6 +55,186 @@ static struct parsed_phase *alloc_new_phase(void)
return phase;
}
+struct mem_range_result {
+	refcount_t usage;
+	/*
+	 * Pointer to a kernel buffer, which is written by the kfunc and
+	 * read by the bpf-prog. The kfunc guarantees its validity.
+	 */
+	char *buf;
+	uint32_t size;	/* size of the decompressed data */
+	int status;	/* status code (0 for success) */
+};
+
+#define MAX_KEXEC_RES_SIZE (1 << 29)
+
+BTF_KFUNCS_START(bpf_kexec_ids)
+BTF_ID_FLAGS(func, bpf_kexec_carrier, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_kexec_decompress, KF_TRUSTED_ARGS | KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_kexec_result_release, KF_RELEASE)
+BTF_KFUNCS_END(bpf_kexec_ids)
+
+static const struct btf_kfunc_id_set kexec_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &bpf_kexec_ids,
+};
+
+/*
+ * Copy the partially decompressed content in [buf, buf + len) to dst.
+ * If the destination size exceeds the capacity, return 0 to tell the
+ * decompress method that something went wrong.
+ *
+ * TODO: implement the copy-out.
+ */
+static long flush_buffer(void *buf, unsigned long len)
+{
+	/* return len to indicate that everything went smoothly */
+	return 0;
+}
+
+__bpf_kfunc_start_defs();
+
+/*
+ * @name should be one of: kernel, initrd, cmdline
+ */
+__bpf_kfunc int bpf_kexec_carrier(const char *name, struct mem_range_result *r)
+{
+ struct kexec_res *res;
+ int ret = 0;
+
+	if (!r || !name) {
+		pr_err("%s: received an invalid argument\n", __func__);
+		return -EINVAL;
+	}
+
+ if (r->size == 0 || r->size > MAX_KEXEC_RES_SIZE) {
+ pr_err("Invalid resource size: 0x%x\n", r->size);
+ return -EINVAL;
+ }
+
+ res = kzalloc(sizeof(struct kexec_res), GFP_KERNEL);
+ if (!res)
+ return -ENOMEM;
+
+ for (int i = 0; i < ARRAY_SIZE(kexec_res_names); i++) {
+ if (!strcmp(kexec_res_names[i], name))
+ res->name = kexec_res_names[i];
+ }
+
+ if (res->name == NULL) {
+ pr_err("Invalid resource name: %s, should be 'kernel', 'initrd', 'cmdline'\n", name);
+ kfree(res);
+ return -EINVAL;
+ }
+
+ res->buf = vmalloc(r->size);
+ if (!res->buf) {
+ kfree(res);
+ return -ENOMEM;
+ }
+ ret = copy_from_kernel_nofault(res->buf, r->buf, r->size);
+ if (unlikely(ret < 0)) {
+		vfree(res->buf);
+ kfree(res);
+ return -EINVAL;
+ }
+ res->size = r->size;
+
+ INIT_LIST_HEAD(&res->node);
+ list_add_tail(&res->node, &cur_phase->res_head);
+ return 0;
+}
+
+__bpf_kfunc struct mem_range_result *bpf_kexec_decompress(char *image_gz_payload, int image_gz_sz,
+ unsigned int expected_decompressed_sz)
+{
+ decompress_fn decompressor;
+	/* TODO: use flush to cap the memory size used by decompression */
+	long (*flush)(void *, unsigned long) = NULL;
+ struct mem_range_result *range;
+ const char *name;
+ void *output_buf;
+ char *input_buf;
+ int ret;
+
+ range = kmalloc(sizeof(struct mem_range_result), GFP_KERNEL);
+ if (!range) {
+ pr_err("fail to allocate mem_range_result\n");
+ return NULL;
+ }
+ refcount_set(&range->usage, 1);
+
+ input_buf = vmalloc(image_gz_sz);
+ if (!input_buf) {
+ pr_err("fail to allocate input buffer\n");
+ kfree(range);
+ return NULL;
+ }
+
+ ret = copy_from_kernel_nofault(input_buf, image_gz_payload, image_gz_sz);
+ if (ret < 0) {
+ pr_err("Error when copying from 0x%px, size:0x%x\n",
+ image_gz_payload, image_gz_sz);
+ kfree(range);
+ vfree(input_buf);
+ return NULL;
+ }
+
+ output_buf = vmalloc(expected_decompressed_sz);
+ if (!output_buf) {
+ pr_err("fail to allocate output buffer\n");
+ kfree(range);
+ vfree(input_buf);
+ return NULL;
+ }
+
+ decompressor = decompress_method(input_buf, image_gz_sz, &name);
+ if (!decompressor) {
+ pr_err("Can not find decompress method\n");
+ kfree(range);
+ vfree(input_buf);
+ vfree(output_buf);
+ return NULL;
+ }
+	/* TODO: use flush to cap the memory used by decompression */
+	ret = decompressor(input_buf, image_gz_sz, NULL, flush,
+			   output_buf, NULL, NULL);
+	vfree(input_buf);
+
+ /* Update the range map */
+ if (ret == 0) {
+ range->buf = output_buf;
+ range->size = expected_decompressed_sz;
+ range->status = 0;
+ } else {
+ pr_err("Decompress error\n");
+ vfree(output_buf);
+ kfree(range);
+ return NULL;
+ }
+	pr_info("%s: return range 0x%px\n", __func__, range);
+ return range;
+}
+
+__bpf_kfunc int bpf_kexec_result_release(struct mem_range_result *result)
+{
+ if (!result) {
+		pr_err("%s: received an invalid result\n", __func__);
+ return -EINVAL;
+ }
+
+ if (refcount_dec_and_test(&result->usage)) {
+ vfree(result->buf);
+ kfree(result);
+ }
+
+ return 0;
+}
+
+__bpf_kfunc_end_defs();
+
static bool is_valid_pe(const char *kernel_buf, unsigned long kernel_len)
{
struct mz_hdr *mz;
@@ -336,3 +519,14 @@ const struct kexec_file_ops kexec_pe_image_ops = {
.verify_sig = kexec_kernel_verify_pe_sig,
#endif
};
+
+static int __init bpf_kfunc_init(void)
+{
+ int ret;
+
+ ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &kexec_kfunc_set);
+	if (ret)
+		pr_err("Failed to register btf for kexec_kfunc_set\n");
+ return ret;
+}
+late_initcall(bpf_kfunc_init);
--
2.49.0