[PATCHv3 3/9] bpf: Introduce bpf_copy_to_kernel() to buffer the content from bpf-prog

Pingfan Liu piliu at redhat.com
Wed May 28 21:17:38 PDT 2025


In the secure kexec_file_load case, the buffer that holds the kernel
image is not visible to userspace.

The common data flow in the bpf scheme is from the kernel to a bpf-prog.
In the case of kexec_file_load, the kexec component needs to buffer the
result parsed by a bpf-prog (the opposite of the usual direction) for the
next parsing stage. bpf_copy_to_kernel() makes this opposite data flow
possible. A bpf-prog can publish the address of the parsed payload to the
kernel, and the kernel can copy it for future use.

Signed-off-by: Pingfan Liu <piliu at redhat.com>
Cc: Alexei Starovoitov <ast at kernel.org>
Cc: Daniel Borkmann <daniel at iogearbox.net>
Cc: John Fastabend <john.fastabend at gmail.com>
Cc: Andrii Nakryiko <andrii at kernel.org>
Cc: Martin KaFai Lau <martin.lau at linux.dev>
Cc: Eduard Zingerman <eddyz87 at gmail.com>
Cc: Song Liu <song at kernel.org>
Cc: Yonghong Song <yonghong.song at linux.dev>
Cc: KP Singh <kpsingh at kernel.org>
Cc: Stanislav Fomichev <sdf at fomichev.me>
Cc: Hao Luo <haoluo at google.com>
Cc: Jiri Olsa <jolsa at kernel.org>
To: bpf at vger.kernel.org
---
 include/linux/bpf.h          |  23 +++++
 kernel/bpf/Makefile          |   2 +-
 kernel/bpf/helpers.c         |   2 +
 kernel/bpf/helpers_carrier.c | 194 +++++++++++++++++++++++++++++++++++
 4 files changed, 220 insertions(+), 1 deletion(-)
 create mode 100644 kernel/bpf/helpers_carrier.c

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 3f0cc89c0622c..104974a6d18cb 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -3568,4 +3568,27 @@ static inline bool bpf_is_subprog(const struct bpf_prog *prog)
 	return prog->aux->func_idx != 0;
 }
 
+/*
+ * Reference-counted kernel copy of a buffer handed over by a bpf-prog via
+ * bpf_copy_to_kernel(). References are dropped with mem_range_result_put();
+ * once the last one is gone the buffer and this descriptor are freed from an
+ * SRCU callback.
+ */
+struct mem_range_result {
+	/* reference count, initialised by kref_init() in bpf_copy_to_kernel() */
+	struct kref ref;
+	/* callback head passed to call_srcu() to defer the free */
+	struct rcu_head rcu;
+	/* kernel buffer holding the copied data */
+	char *buf;
+	/* size of @buf in bytes */
+	uint32_t buf_sz;
+	/* bytes of data in @buf; bpf_copy_to_kernel() sets it equal to buf_sz */
+	uint32_t data_sz;
+	/* true: @buf was kmalloc-ed (kfree it); false: vmalloc-ed (vfree it) */
+	bool kmalloc;
+	/*
+	 * Set to 0 by bpf_copy_to_kernel().
+	 * NOTE(review): the meaning of other values is not visible in this patch.
+	 */
+	int status;
+	/* memory cgroup referenced by this result; put when the result is freed */
+	struct mem_cgroup *memcg;
+};
+/*
+ * Drop one reference on @result. Returns 0, or -EINVAL when @result is NULL.
+ */
+int mem_range_result_put(struct mem_range_result *result);
+
+/*
+ * Callback run by bpf_copy_to_kernel() with the listener name and the freshly
+ * copied range. bpf_copy_to_kernel() drops its own reference on @r right after
+ * the callback returns, and forwards the callback's return value to its caller.
+ * NOTE(review): a handler that keeps @r presumably has to take its own
+ * reference on r->ref first - confirm against the handlers added later.
+ */
+typedef int (*resource_handler)(const char *name, struct mem_range_result *r);
+
+/* Registration request passed to register_carrier_listener(). */
+struct carrier_listener {
+	/* lookup key; duplicated with kstrdup() on registration */
+	char *name;
+	/* true: copy into a kmalloc-ed buffer; false: use __vmalloc() */
+	bool kmalloc;
+	/* callback that receives the copied data */
+	resource_handler handler;
+};
+
+/*
+ * Register @listener under listener->name. Fails with -EINVAL when the name
+ * is NULL, -ENOMEM on allocation failure, and -EBUSY when the name is already
+ * registered.
+ */
+int register_carrier_listener(struct carrier_listener *listener);
+/*
+ * Remove the listener registered under @str. Returns 0, or -EINVAL when no
+ * such listener exists.
+ */
+int unregister_carrier_listener(char *str);
 #endif /* _LINUX_BPF_H */
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 70502f038b921..d1f1f50e23cc8 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -6,7 +6,7 @@ cflags-nogcse-$(CONFIG_X86)$(CONFIG_CC_IS_GCC) := -fno-gcse
 endif
 CFLAGS_core.o += -Wno-override-init $(cflags-nogcse-yy)
 
-obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o token.o
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o helpers_carrier.o tnum.o log.o token.o
 obj-$(CONFIG_BPF_SYSCALL) += bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o
 obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o
 obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index e3a2662f4e336..1f4284e58400b 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -3200,6 +3200,8 @@ BTF_KFUNCS_START(generic_btf_ids)
 #ifdef CONFIG_CRASH_DUMP
 BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE)
 #endif
+BTF_ID_FLAGS(func, bpf_mem_range_result_put, KF_RELEASE)
+BTF_ID_FLAGS(func, bpf_copy_to_kernel, KF_TRUSTED_ARGS | KF_SLEEPABLE)
 BTF_ID_FLAGS(func, bpf_obj_new_impl, KF_ACQUIRE | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_percpu_obj_new_impl, KF_ACQUIRE | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_obj_drop_impl, KF_RELEASE)
diff --git a/kernel/bpf/helpers_carrier.c b/kernel/bpf/helpers_carrier.c
new file mode 100644
index 0000000000000..c4e45fdf0ebb8
--- /dev/null
+++ b/kernel/bpf/helpers_carrier.c
@@ -0,0 +1,194 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/bpf.h>
+#include <linux/bpf-cgroup.h>
+#include <linux/cgroup.h>
+#include <linux/rcupdate.h>
+#include <linux/hashtable.h>
+#include <linux/jhash.h>
+#include <linux/mutex.h>
+#include <linux/kref.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+
+
+/* One registered listener, stored in the str_listeners hash table. */
+struct str_listener {
+	/* link in the str_listeners bucket chain */
+	struct hlist_node node;
+	/* private kstrdup() copy of the listener name; also the lookup key */
+	char *str;
+	/* callback invoked by bpf_copy_to_kernel() */
+	resource_handler handler;
+	/* true: buffers for this listener are kmalloc-ed; false: vmalloc-ed */
+	bool kmalloc;
+};
+
+/*
+ * SRCU domain: bpf_copy_to_kernel() looks listeners up inside its read side,
+ * and mem_range_result objects are freed through call_srcu() on it.
+ */
+DEFINE_STATIC_SRCU(srcu);
+/* Held while adding to or removing from str_listeners. */
+static DEFINE_MUTEX(str_listeners_mutex);
+/* Listeners hashed by jhash() of their name; 8 bits of bucket index. */
+static DEFINE_HASHTABLE(str_listeners, 8);
+
+/*
+ * Look up the listener registered under @str.
+ *
+ * Callers are either inside an SRCU read-side critical section on 'srcu'
+ * (bpf_copy_to_kernel()) or hold str_listeners_mutex (register/unregister).
+ * The bucket is therefore walked with the RCU-aware iterator, and the two
+ * legal contexts are spelled out for lockdep.
+ *
+ * Returns the matching entry, or NULL when @str is NULL or not registered.
+ */
+static struct str_listener *find_listener(const char *str)
+{
+	struct str_listener *item;
+	unsigned int hash;
+
+	/* strlen()/strcmp() below must never see a NULL key */
+	if (!str)
+		return NULL;
+
+	hash = jhash(str, strlen(str), 0);
+	hash_for_each_possible_rcu(str_listeners, item, node, hash,
+				   srcu_read_lock_held(&srcu) ||
+				   lockdep_is_held(&str_listeners_mutex)) {
+		/* several names can share a bucket, so compare the full key */
+		if (strcmp(item->str, str) == 0)
+			return item;
+	}
+	return NULL;
+}
+
+/*
+ * SRCU callback that finally releases a mem_range_result: the data buffer
+ * (with the allocator matching how it was obtained), the descriptor itself,
+ * and the memcg reference the descriptor was holding. The frees run with the
+ * result's memcg installed as the active memcg.
+ */
+static void __mem_range_result_free(struct rcu_head *rcu)
+{
+	struct mem_range_result *res;
+	struct mem_cgroup *owner, *saved;
+
+	res = container_of(rcu, struct mem_range_result, rcu);
+	owner = res->memcg;
+	saved = set_active_memcg(owner);
+
+	if (likely(res->buf != NULL)) {
+		if (!res->kmalloc)
+			vfree(res->buf);
+		else
+			kfree(res->buf);
+	}
+	kfree(res);
+
+	set_active_memcg(saved);
+	mem_cgroup_put(owner);
+}
+
+/*
+ * kref release function: the last reference is gone, so queue the object for
+ * freeing after an SRCU grace period on 'srcu'.
+ */
+static void __mem_range_result_put(struct kref *kref)
+{
+	struct rcu_head *head;
+
+	head = &container_of(kref, struct mem_range_result, ref)->rcu;
+	call_srcu(&srcu, head, __mem_range_result_free);
+}
+
+/*
+ * Drop one reference on @result; the object is released once the count hits
+ * zero. Returns 0 on success, or -EINVAL (after logging) for a NULL @result.
+ */
+int mem_range_result_put(struct mem_range_result *result)
+{
+	if (result) {
+		kref_put(&result->ref, __mem_range_result_put);
+		return 0;
+	}
+
+	pr_err("%s, receive invalid range\n", __func__);
+	return -EINVAL;
+}
+
+/*
+ * kfunc wrapper that lets a bpf-prog drop a reference on @result; it simply
+ * forwards to mem_range_result_put() and returns its result.
+ */
+__bpf_kfunc int bpf_mem_range_result_put(struct mem_range_result *result)
+{
+	int rc = mem_range_result_put(result);
+
+	return rc;
+}
+
+/**
+ * bpf_copy_to_kernel - copy a buffer into the kernel and hand it to a listener
+ * @name: name of a listener added with register_carrier_listener()
+ * @buf:  source of the data
+ * @size: number of bytes to copy, must be positive
+ *
+ * Copies @size bytes from @buf into a freshly allocated kernel buffer
+ * (kmalloc-ed or vmalloc-ed, as the listener requested), wraps it in a
+ * reference-counted mem_range_result and passes it to the listener's handler.
+ * The reference taken here is dropped once the handler returns.
+ *
+ * The SRCU read lock is held from the lookup until after the handler has run.
+ * Together with the synchronize_srcu() in unregister_carrier_listener() this
+ * keeps the listener entry and its handler valid for the whole call. As a
+ * consequence a handler must not call unregister_carrier_listener() itself.
+ *
+ * Return: the handler's return value; -EINVAL for bad arguments, an unknown
+ * @name or an unreadable @buf; -ENOMEM on allocation failure.
+ */
+__bpf_kfunc int bpf_copy_to_kernel(const char *name, char *buf, int size)
+{
+	struct mem_cgroup *memcg, *old_memcg;
+	struct mem_range_result *range;
+	struct str_listener *item;
+	resource_handler handler;
+	bool use_kmalloc;
+	char *kbuf;
+	int idx, ret;
+
+	/* a negative size would turn into a huge size_t in the allocators */
+	if (!name || !buf || size <= 0)
+		return -EINVAL;
+
+	idx = srcu_read_lock(&srcu);
+	item = find_listener(name);
+	if (!item) {
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+	use_kmalloc = item->kmalloc;
+	handler = item->handler;
+
+	memcg = get_mem_cgroup_from_current();
+	old_memcg = set_active_memcg(memcg);
+
+	range = kzalloc(sizeof(*range), GFP_KERNEL);
+	if (!range) {
+		pr_err("fail to allocate mem_range_result\n");
+		ret = -ENOMEM;
+		goto out_memcg;
+	}
+
+	if (use_kmalloc)
+		kbuf = kmalloc(size, GFP_KERNEL | __GFP_ACCOUNT);
+	else
+		kbuf = __vmalloc(size, GFP_KERNEL | __GFP_ACCOUNT);
+	if (!kbuf) {
+		ret = -ENOMEM;
+		goto out_free_range;
+	}
+
+	if (copy_from_kernel_nofault(kbuf, buf, size) < 0) {
+		ret = -EINVAL;
+		goto out_free_buf;
+	}
+
+	kref_init(&range->ref);
+	range->kmalloc = use_kmalloc;
+	range->buf = kbuf;
+	range->buf_sz = size;
+	range->data_sz = size;
+	range->status = 0;
+	range->memcg = memcg;
+	/* extra memcg reference owned by @range, put in __mem_range_result_free() */
+	mem_cgroup_tryget(memcg);
+
+	ret = handler(name, range);
+	/* from here on @range and @kbuf are released through the kref */
+	mem_range_result_put(range);
+	goto out_memcg;
+
+out_free_buf:
+	if (use_kmalloc)
+		kfree(kbuf);
+	else
+		vfree(kbuf);
+out_free_range:
+	kfree(range);
+out_memcg:
+	set_active_memcg(old_memcg);
+	mem_cgroup_put(memcg);
+out_unlock:
+	srcu_read_unlock(&srcu, idx);
+	return ret;
+}
+
+/**
+ * register_carrier_listener - make a handler reachable from bpf_copy_to_kernel()
+ * @listener: name, buffer type and handler to register
+ *
+ * The name is duplicated, so @listener does not have to outlive the call.
+ *
+ * Return: 0 on success; -EINVAL when @listener, its name or its handler is
+ * NULL; -ENOMEM on allocation failure; -EBUSY when the name is already taken.
+ */
+int register_carrier_listener(struct carrier_listener *listener)
+{
+	struct str_listener *item;
+	unsigned int hash;
+	int ret = 0;
+
+	/* a NULL handler would be called blindly by bpf_copy_to_kernel() */
+	if (!listener || !listener->name || !listener->handler)
+		return -EINVAL;
+	item = kmalloc(sizeof(*item), GFP_KERNEL);
+	if (!item)
+		return -ENOMEM;
+	item->str = kstrdup(listener->name, GFP_KERNEL);
+	if (!item->str) {
+		kfree(item);
+		return -ENOMEM;
+	}
+	item->handler = listener->handler;
+	item->kmalloc = listener->kmalloc;
+	hash = jhash(item->str, strlen(item->str), 0);
+
+	mutex_lock(&str_listeners_mutex);
+	if (find_listener(item->str))
+		ret = -EBUSY;
+	else
+		/* publish only the fully initialised entry to SRCU readers */
+		hash_add_rcu(str_listeners, &item->node, hash);
+	mutex_unlock(&str_listeners_mutex);
+
+	if (ret) {
+		kfree(item->str);
+		kfree(item);
+	}
+	return ret;
+}
+EXPORT_SYMBOL(register_carrier_listener);
+
+/**
+ * unregister_carrier_listener - remove a listener and free its entry
+ * @str: name the listener was registered under
+ *
+ * Waits for all bpf_copy_to_kernel() calls that may still be using the entry
+ * before freeing it, so it may sleep and must not be called from a handler.
+ *
+ * Return: 0 on success, -EINVAL when @str is NULL or not registered.
+ */
+int unregister_carrier_listener(char *str)
+{
+	struct str_listener *item;
+
+	if (!str)
+		return -EINVAL;
+
+	mutex_lock(&str_listeners_mutex);
+	item = find_listener(str);
+	if (item)
+		hash_del_rcu(&item->node);
+	mutex_unlock(&str_listeners_mutex);
+
+	if (!item)
+		return -EINVAL;
+
+	/* readers that found the entry before the removal must finish first */
+	synchronize_srcu(&srcu);
+	kfree(item->str);
+	kfree(item);
+	return 0;
+}
+EXPORT_SYMBOL(unregister_carrier_listener);
+
-- 
2.49.0




More information about the kexec mailing list