[PATCH 5/6] nvme-bpf: eBPF struct_ops path selectors

hare at kernel.org hare at kernel.org
Tue Jul 29 00:06:52 PDT 2025


From: Hannes Reinecke <hare at kernel.org>

Add support for eBPF struct_ops based path selectors. Due to eBPF limitations
we cannot pass in a 'struct nvme_ns' as argument nor return a 'struct nvme_ns'
from a function, so the eBPF path selectors use a 'struct nvme_bpf_iter'
to iterate over all paths in a struct nvme_ns_head.
That satisfies the constraints of the eBPF verifier and allows us to provide
two helper functions 'nvme_bpf_first_path()' and 'nvme_bpf_next_path()'
to iterate over all paths in a namespace.

Signed-off-by: Hannes Reinecke <hare at kernel.org>
---
 drivers/nvme/host/Kconfig     |   9 +
 drivers/nvme/host/Makefile    |   1 +
 drivers/nvme/host/bpf.h       |  33 ++++
 drivers/nvme/host/bpf_ops.c   | 347 ++++++++++++++++++++++++++++++++++
 drivers/nvme/host/core.c      |   3 +
 drivers/nvme/host/multipath.c |   9 +-
 drivers/nvme/host/nvme.h      |   6 +-
 include/linux/nvme-bpf.h      |  54 ++++++
 8 files changed, 460 insertions(+), 2 deletions(-)
 create mode 100644 drivers/nvme/host/bpf.h
 create mode 100644 drivers/nvme/host/bpf_ops.c
 create mode 100644 include/linux/nvme-bpf.h

diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig
index 31974c7dd20c..7cc1f3898712 100644
--- a/drivers/nvme/host/Kconfig
+++ b/drivers/nvme/host/Kconfig
@@ -122,6 +122,15 @@ config NVME_HOST_AUTH
 
 	  If unsure, say N.
 
+config NVME_BPF
+	bool "NVMe multipath BPF path selector"
+	depends on NVME_MULTIPATH
+	depends on BPF_SYSCALL
+	help
+	  Provide support for eBPF multipath path selectors.
+
+	  If unsure, say N.
+
 config NVME_APPLE
 	tristate "Apple ANS2 NVM Express host driver"
 	depends on OF && BLOCK
diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile
index 6414ec968f99..f81d6349faf6 100644
--- a/drivers/nvme/host/Makefile
+++ b/drivers/nvme/host/Makefile
@@ -18,6 +18,7 @@ nvme-core-$(CONFIG_BLK_DEV_ZONED)	+= zns.o
 nvme-core-$(CONFIG_FAULT_INJECTION_DEBUG_FS)	+= fault_inject.o
 nvme-core-$(CONFIG_NVME_HWMON)		+= hwmon.o
 nvme-core-$(CONFIG_NVME_HOST_AUTH)	+= auth.o
+nvme-core-$(CONFIG_NVME_BPF)		+= bpf_ops.o
 
 nvme-y					+= pci.o
 
diff --git a/drivers/nvme/host/bpf.h b/drivers/nvme/host/bpf.h
new file mode 100644
index 000000000000..f819332d5293
--- /dev/null
+++ b/drivers/nvme/host/bpf.h
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef NVME_INT_BPF_HEADER
+#define NVME_INT_BPF_HEADER
+
+#ifdef CONFIG_NVME_BPF
+#include <linux/filter.h>
+#include <linux/nvme-bpf.h>
+
+static inline bool nvme_bpf_enabled(struct nvme_ns_head *head)
+{
+	return !!(srcu_dereference(head->bpf_ops, &head->srcu));
+}
+
+void nvme_bpf_detach(struct nvme_ns_head *head);
+struct nvme_ns *nvme_bpf_select_path(struct nvme_ns_head *head, sector_t sector);
+
+int __init nvme_bpf_struct_ops_init(void);
+
+#else
+
+static inline bool nvme_bpf_enabled(struct nvme_ns_head *head)
+{
+	return false;
+}
+
+static inline void nvme_bpf_detach(struct nvme_ns_head *head) {}
+static inline struct nvme_ns *nvme_bpf_select_path(struct nvme_ns_head *head, sector_t sector)
+{
+	return NULL;
+}
+
+#endif
+#endif
diff --git a/drivers/nvme/host/bpf_ops.c b/drivers/nvme/host/bpf_ops.c
new file mode 100644
index 000000000000..5413541f6f22
--- /dev/null
+++ b/drivers/nvme/host/bpf_ops.c
@@ -0,0 +1,347 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2025 Hannes Reinecke, SUSE */
+
+#include <linux/bpf_verifier.h>
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include "nvme.h"
+#include "bpf.h"
+
+static struct btf *nvme_bpf_ops_btf;
+static char nvme_bpf_ops_name[] = "nvme_bpf_ops";
+
+static int nvme_bpf_ops_init(struct btf *btf)
+{
+	nvme_bpf_ops_btf = btf;
+	return 0;
+}
+
+static bool nvme_bpf_ops_is_valid_access(int off, int size,
+					  enum bpf_access_type type,
+					  const struct bpf_prog *prog,
+					  struct bpf_insn_access_aux *info)
+{
+	return bpf_tracing_btf_ctx_access(off, size, type, prog, info);
+}
+
+BTF_ID_LIST(nvme_bpf_ops_args_ids)
+BTF_ID(struct, nvme_ns_head)
+BTF_ID(struct, nvme_ns)
+BTF_ID(struct, nvme_bpf_iter)
+
+static int nvme_bpf_ops_btf_struct_access(struct bpf_verifier_log *log,
+					  const struct bpf_reg_state *reg,
+					  int off, int size)
+{
+	const struct btf_type *nhit, *nit, *niter, *t;
+
+	nhit = btf_type_by_id(reg->btf, nvme_bpf_ops_args_ids[0]);
+	nit = btf_type_by_id(reg->btf, nvme_bpf_ops_args_ids[1]);
+	niter = btf_type_by_id(reg->btf, nvme_bpf_ops_args_ids[2]);
+
+	t = btf_type_by_id(reg->btf, reg->btf_id);
+	if (t != nhit && t != niter) {
+		bpf_log(log, "write access to struct %d is not supported\n", reg->btf_id);
+		return -EACCES;
+	}
+	if (t == niter) {
+		/* Allow writes to the 'head' element */
+		if (off >= offsetof(struct nvme_bpf_iter, head) &&
+		    off + size < offsetofend(struct nvme_bpf_iter, head))
+			return NOT_INIT;
+	} else {
+		/* Allow writes to the 'bpf_ops' element */
+		if (off >= offsetof(struct nvme_ns_head, bpf_ops) &&
+		    off + size < offsetofend(struct nvme_ns_head, bpf_ops)) {
+			return NOT_INIT;
+		}
+	}
+	bpf_log(log, "write access for struct %s at off %d with size %d\n",
+		nvme_bpf_ops_name, off, size);
+	return -EACCES;
+}
+
+static const struct bpf_verifier_ops nvme_bpf_verifier_ops = {
+	.get_func_proto = bpf_base_func_proto,
+	.is_valid_access = nvme_bpf_ops_is_valid_access,
+	.btf_struct_access = nvme_bpf_ops_btf_struct_access,
+};
+
+static int nvme_bpf_ops_check_member(const struct btf_type *t,
+				     const struct btf_member *member,
+				     const struct bpf_prog *prog)
+{
+	u32 moff = __btf_member_bit_offset(t, member) / 8;
+
+	switch (moff) {
+	case offsetof(struct nvme_bpf_ops, select_path):
+		break;
+	default:
+		if (prog->sleepable)
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int nvme_bpf_ops_init_member(const struct btf_type *t,
+				    const struct btf_member *member,
+				    void *kdata, const void *udata)
+{
+	const struct nvme_bpf_ops *uops;
+	struct nvme_bpf_ops *kops;
+	u32 moff;
+
+	uops = (const struct nvme_bpf_ops *)udata;
+	kops = (struct nvme_bpf_ops *)kdata;
+
+	moff = __btf_member_bit_offset(t, member) / 8;
+
+	switch (moff) {
+	case offsetof(struct nvme_bpf_ops, subsysnqn):
+		memcpy(kops->subsysnqn, uops->subsysnqn,
+		       sizeof(kops->subsysnqn));
+		return 1;
+	case offsetof(struct nvme_bpf_ops, nsid):
+		kops->nsid = uops->nsid;
+		return 1;
+	case offsetof(struct nvme_bpf_ops, uuid):
+		if (uuid_is_null(&uops->uuid))
+			uuid_gen(&kops->uuid);
+		else
+			uuid_copy(&kops->uuid, &uops->uuid);
+		return 1;
+	}
+	return 0;
+}
+
+static int nvme_bpf_reg(void *kdata, struct bpf_link *link)
+{
+	struct nvme_bpf_ops *ops = kdata;
+	struct nvme_ns_head *head;
+	struct nvme_subsystem *subsys = NULL;
+
+	pr_debug("%s: register %s nsid %d\n",
+		 __func__, ops->subsysnqn, ops->nsid);
+
+	subsys = nvme_find_get_subsystem(ops->subsysnqn);
+	if (!subsys)
+		return -EINVAL;
+
+	mutex_lock(&subsys->lock);
+	list_for_each_entry(head, &subsys->nsheads, entry) {
+		if (head->ns_id != ops->nsid)
+			continue;
+		if (head->bpf_ops) {
+			pr_debug("%s: instance %d already attached\n",
+				 __func__, head->instance);
+			continue;
+		}
+		if (nvme_tryget_ns_head(head)) {
+			mutex_lock(&head->lock);
+			ops->head = head;
+			head->bpf_ops = ops;
+			mutex_unlock(&head->lock);
+			pr_debug("%s: attached to %d\n",
+				 __func__, head->instance);
+			synchronize_srcu(&head->srcu);
+			break;
+		}
+	}
+	mutex_unlock(&subsys->lock);
+	nvme_put_subsystem(subsys);
+
+	return 0;
+}
+
+static void nvme_bpf_unreg(void *kdata, struct bpf_link *link)
+{
+	struct nvme_bpf_ops *ops = kdata;
+	struct nvme_ns_head *head;
+
+	if (ops->head) {
+		head = ops->head;
+		pr_debug("%s: unregistered from %d\n",
+			 __func__, head->instance);
+		mutex_lock(&head->lock);
+		head->bpf_ops = NULL;
+		ops->head = NULL;
+		mutex_unlock(&head->lock);
+		nvme_put_ns_head(head);
+		synchronize_srcu(&head->srcu);
+	}
+}
+
+void nvme_bpf_detach(struct nvme_ns_head *head)
+{
+	struct nvme_bpf_ops *ops =
+		srcu_dereference(head->bpf_ops, &head->srcu);
+
+	if (ops) {
+		mutex_lock(&head->lock);
+		rcu_assign_pointer(head->bpf_ops, NULL);
+		list_del_init(&head->bpf_list);
+		mutex_unlock(&head->lock);
+		nvme_put_ns_head(head);
+	}
+}
+
+static int __nvme_bpf_select_path(struct nvme_bpf_iter *iter,
+				  sector_t sector)
+{
+	return -ENXIO;
+}
+
+static struct nvme_bpf_ops __bpf_nvme_bpf_ops = {
+	.uuid = {},
+	.subsysnqn = "",
+	.nsid = UINT_MAX,
+	.select_path = __nvme_bpf_select_path,
+	.head = NULL,
+};
+
+struct nvme_ns *nvme_bpf_select_path(struct nvme_ns_head *head,
+				     sector_t sector)
+{
+	struct nvme_ns *ns = NULL;
+	struct nvme_bpf_ops *ops =
+		srcu_dereference(head->bpf_ops, &head->srcu);
+	struct nvme_bpf_iter iter = {
+		.head = head,
+	};
+	s32 cntlid;
+
+	if (ops) {
+		cntlid = ops->select_path(&iter, sector);
+		if (cntlid < 0)
+			return ERR_PTR(cntlid);
+		if (iter.curr) {
+			ns = iter.curr;
+			if (ns->ctrl->cntlid == cntlid)
+				return ns;
+		}
+	}
+	return ERR_PTR(-ENXIO);
+}
+
+__bpf_kfunc_start_defs();
+
+/**
+ * nvme_bpf_first_path - select the first path from a nvme bpf path iterator
+ * @iter: nvme_bpf path iterator
+ *
+ * Initializes @iter with the first nvme namespace path (if present) and
+ * returns the controller id of the first nvme namespace path or
+ * -ENXIO if no namespace path is present.
+ */
+__bpf_kfunc int nvme_bpf_first_path(struct nvme_bpf_iter *iter)
+{
+	struct nvme_ns *ns;
+
+	if (!iter || !iter->head)
+		return -EINVAL;
+	if (!nvme_bpf_enabled(iter->head))
+		return -EPERM;
+
+	ns = list_first_or_null_rcu(&iter->head->list, struct nvme_ns, siblings);
+	iter->curr = ns;
+	iter->prev = NULL;
+	return ns ? ns->ctrl->cntlid : -ENXIO;
+}
+EXPORT_SYMBOL_GPL(nvme_bpf_first_path);
+
+/**
+ * nvme_bpf_next_path - select the next path from a nvme bpf path iterator
+ * @iter: nvme_bpf path iterator
+ *
+ * Moves @iter to the next namespace path in @curr, storing the previous namespace
+ * path in @prev. Returns the controller id of the current namespace path, -ENXIO
+ * if no current path is set, or -EAGAIN if no next namespace is found.
+ */
+__bpf_kfunc int nvme_bpf_next_path(struct nvme_bpf_iter *iter)
+{
+	struct nvme_ns *ns, *old;
+
+	if (!iter || !iter->head)
+		return -EINVAL;
+	if (!nvme_bpf_enabled(iter->head))
+		return -EPERM;
+	if (!iter->curr)
+		return -ENXIO;
+	old = iter->curr;
+	ns = list_next_or_null_rcu(&iter->head->list, &old->siblings, struct nvme_ns,
+				   siblings);
+	iter->prev = old;
+	iter->curr = ns;
+	return ns ? ns->ctrl->cntlid : -EAGAIN;
+}
+EXPORT_SYMBOL_GPL(nvme_bpf_next_path);
+
+/**
+ * nvme_bpf_count_paths - count the number of paths in a nvme bpf path iterator
+ * @iter: nvme_bpf namespace path iterator
+ *
+ * Returns number of paths in @iter
+ */
+__bpf_kfunc u32 nvme_bpf_count_paths(struct nvme_bpf_iter *iter)
+{
+	struct nvme_ns *ns;
+	u32 num = 0;
+
+	if (!iter || !iter->head)
+		return 0;
+	if (!nvme_bpf_enabled(iter->head))
+		return num;
+
+	ns = list_first_or_null_rcu(&iter->head->list, struct nvme_ns, siblings);
+	while (ns) {
+		num++;
+		ns = list_next_or_null_rcu(&iter->head->list, &ns->siblings, struct nvme_ns,
+					   siblings);
+	}
+	return num;
+}
+EXPORT_SYMBOL_GPL(nvme_bpf_count_paths);
+
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(nvme_bpf_kfunc_set_ids)
+BTF_ID_FLAGS(func, nvme_bpf_first_path, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, nvme_bpf_next_path, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, nvme_bpf_count_paths, KF_TRUSTED_ARGS)
+BTF_KFUNCS_END(nvme_bpf_kfunc_set_ids)
+
+static const struct btf_kfunc_id_set nvme_bpf_kfunc_set = {
+	.owner = THIS_MODULE,
+	.set = &nvme_bpf_kfunc_set_ids,
+};
+
+static struct bpf_struct_ops bpf_nvme_bpf_ops = {
+	.verifier_ops = &nvme_bpf_verifier_ops,
+	.init = nvme_bpf_ops_init,
+	.check_member = nvme_bpf_ops_check_member,
+	.init_member = nvme_bpf_ops_init_member,
+	.reg = nvme_bpf_reg,
+	.unreg = nvme_bpf_unreg,
+	.name = nvme_bpf_ops_name,
+	.cfi_stubs = &__bpf_nvme_bpf_ops,
+	.owner = THIS_MODULE,
+};
+
+int __init nvme_bpf_struct_ops_init(void)
+{
+	int ret;
+
+	ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
+					&nvme_bpf_kfunc_set);
+	if (ret) {
+		pr_err("Failed to register nvme_bpf_kfunc_set, error %d\n", ret);
+		return ret;
+	}
+	ret = register_bpf_struct_ops(&bpf_nvme_bpf_ops, nvme_bpf_ops);
+	if (ret)
+		pr_err("Failed to register nvme_bpf_ops, error %d\n", ret);
+	else
+		pr_info("nvme_bpf_ops registered\n");
+	return ret;
+}
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index a2f3da453af4..e4f69b2f946b 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -27,6 +27,7 @@
 #include "nvme.h"
 #include "fabrics.h"
 #include <linux/nvme-auth.h>
+#include "bpf.h"
 
 #define CREATE_TRACE_POINTS
 #include "trace.h"
@@ -5381,6 +5382,8 @@ static int __init nvme_core_init(void)
 	result = nvme_init_auth();
 	if (result)
 		goto destroy_ns_chr;
+	if (IS_ENABLED(CONFIG_NVME_BPF))
+		nvme_bpf_struct_ops_init();
 	return 0;
 
 destroy_ns_chr:
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index dee40bd73449..e2c6b13591c4 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -8,6 +8,7 @@
 #include <linux/vmalloc.h>
 #include <trace/events/block.h>
 #include "nvme.h"
+#include "bpf.h"
 
 bool multipath = true;
 static bool multipath_always_on;
@@ -462,12 +463,15 @@ static struct nvme_ns *nvme_numa_path(struct nvme_ns_head *head)
 	else if (unlikely(!nvme_path_is_optimized(ns)))
 		ns = __nvme_find_path(head, node);
 	if (ns)
-		rcu_assign_pointer(head->current_path[node], found);
+		rcu_assign_pointer(head->current_path[node], ns);
 	return ns;
 }
 
 inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head, sector_t sector)
 {
+	if (nvme_bpf_enabled(head))
+		return nvme_bpf_select_path(head, sector);
+
 	switch (READ_ONCE(head->subsys->iopolicy)) {
 	case NVME_IOPOLICY_QD:
 		return nvme_queue_depth_path(head);
@@ -693,6 +697,7 @@ static void nvme_remove_head(struct nvme_ns_head *head)
 		kblockd_schedule_work(&head->requeue_work);
 
 		nvme_cdev_del(&head->cdev, &head->cdev_device);
+		nvme_bpf_detach(head);
 		synchronize_srcu(&head->srcu);
 		del_gendisk(head->disk);
 	}
@@ -727,6 +732,8 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
 	INIT_WORK(&head->requeue_work, nvme_requeue_work);
 	INIT_WORK(&head->partition_scan_work, nvme_partition_scan_work);
 	INIT_DELAYED_WORK(&head->remove_work, nvme_remove_head_work);
+	if (IS_ENABLED(CONFIG_NVME_BPF))
+		INIT_LIST_HEAD(&head->bpf_list);
 	head->delayed_removal_secs = 0;
 
 	/*
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 73b966a6653a..3498620d650b 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -17,7 +17,7 @@
 #include <linux/wait.h>
 #include <linux/t10-pi.h>
 #include <linux/ratelimit_types.h>
-
+#include <linux/bpf.h>
 #include <trace/events/block.h>
 
 extern const struct pr_ops nvme_pr_ops;
@@ -499,6 +499,10 @@ struct nvme_ns_head {
 
 	u16			nr_plids;
 	u16			*plids;
+#ifdef CONFIG_NVME_BPF
+	struct list_head	bpf_list;
+	struct nvme_bpf_ops __rcu *bpf_ops;
+#endif
 #ifdef CONFIG_NVME_MULTIPATH
 	struct bio_list		requeue_list;
 	spinlock_t		requeue_lock;
diff --git a/include/linux/nvme-bpf.h b/include/linux/nvme-bpf.h
new file mode 100644
index 000000000000..687b96e101ef
--- /dev/null
+++ b/include/linux/nvme-bpf.h
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2025 Hannes Reinecke, SUSE Software Solutions
+ */
+
+#ifndef _NVME_BPF_H
+#define _NVME_BPF_H
+
+struct nvme_ns_head;
+struct nvme_ns;
+
+/**
+ * struct nvme_bpf_iter - Iterator for select_path BPF function
+ * @head: namespace head to iterate over
+ * @curr: current namespace path
+ * @prev: previous namespace path
+ */
+struct nvme_bpf_iter {
+	struct nvme_ns_head *head;
+	struct nvme_ns *curr;
+	struct nvme_ns *prev;
+};
+
+/**
+ * struct nvme_bpf_ops - A BPF struct_ops of callbacks allowing to implement
+ * 			an nvme bpf path selector
+ * @uuid: ops uuid
+ * @subsysnqn: NQN of the subsystem to attach to
+ * @nsid: namespace ID within the subsystem to attach to
+ * @select_path: callback for selecting the path for a given sector
+ */
+struct nvme_bpf_ops {
+	/* UUID to distinguish different instances */
+	uuid_t			uuid;
+
+	/* Subsystem NQN */
+	char			subsysnqn[256];
+
+	/* Namespace ID number or -1 if valid for all namespaces */
+	int		nsid;
+
+	/* Return the controller ID of the selected path or -1 if not found */
+	int		(*select_path)(struct nvme_bpf_iter *, sector_t);
+
+	/* private: don't show in doc, must be the last field */
+	struct nvme_ns_head *head;
+};
+
+int nvme_bpf_first_path(struct nvme_bpf_iter *iter);
+int nvme_bpf_next_path(struct nvme_bpf_iter *iter);
+u32 nvme_bpf_count_paths(struct nvme_bpf_iter *iter);
+
+#endif
+
-- 
2.43.0




More information about the Linux-nvme mailing list