[PATCHv5 RFC 1/3] nvme-multipah: Add visibility for round-robin io-policy
Nilay Shroff
nilay at linux.ibm.com
Wed Oct 30 03:41:41 PDT 2024
This patch helps add nvme native multipath visibility for round-robin
io-policy. It creates a "multipath" sysfs directory under head gendisk
device node directory and then from "multipath" directory it adds a link
to each namespace path device the head node refers.
For instance, if we have a shared namespace accessible from two different
controllers/paths then we create a soft link to each path device from head
disk node as shown below:
$ ls -l /sys/block/nvme1n1/multipath/
nvme1c1n1 -> ../../../../../pci052e:78/052e:78:00.0/nvme/nvme1/nvme1c1n1
nvme1c3n1 -> ../../../../../pci058e:78/058e:78:00.0/nvme/nvme3/nvme1c3n1
In the above example, nvme1n1 is head gendisk node created for a shared
namespace and the namespace is accessible from nvme1c1n1 and nvme1c3n1
paths.
For round-robin I/O policy, we could easily infer from the above output
that I/O workload targeted to nvme1n1 would toggle across paths nvme1c1n1
and nvme1c3n1.
Signed-off-by: Nilay Shroff <nilay at linux.ibm.com>
---
drivers/nvme/host/core.c | 3 ++
drivers/nvme/host/multipath.c | 81 +++++++++++++++++++++++++++++++++++
drivers/nvme/host/nvme.h | 18 ++++++--
drivers/nvme/host/sysfs.c | 14 ++++++
4 files changed, 112 insertions(+), 4 deletions(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 84cb859a911d..c923bd2c5468 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -3940,6 +3940,9 @@ static void nvme_ns_remove(struct nvme_ns *ns)
if (!nvme_ns_head_multipath(ns->head))
nvme_cdev_del(&ns->cdev, &ns->cdev_device);
+
+ nvme_mpath_remove_sysfs_link(ns);
+
del_gendisk(ns->disk);
mutex_lock(&ns->ctrl->namespaces_lock);
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 6a15873055b9..43c87a946cc7 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -682,6 +682,8 @@ static void nvme_mpath_set_live(struct nvme_ns *ns)
kblockd_schedule_work(&head->partition_scan_work);
}
+ nvme_mpath_add_sysfs_link(ns->head);
+
mutex_lock(&head->lock);
if (nvme_path_is_optimized(ns)) {
int node, srcu_idx;
@@ -764,6 +766,25 @@ static void nvme_update_ns_ana_state(struct nvme_ana_group_desc *desc,
if (nvme_state_is_live(ns->ana_state) &&
nvme_ctrl_state(ns->ctrl) == NVME_CTRL_LIVE)
nvme_mpath_set_live(ns);
+ else {
+ /*
+ * Add sysfs link from multipath head gendisk node to path
+ * device gendisk node.
+ * If path's ana state is live (i.e. state is either optimized
+ * or non-optimized) while we alloc the ns then sysfs link would
+ * be created from nvme_mpath_set_live(). In that case we would
+ * not fallthrough this code path. However for the path's ana
+ * state other than live, we call nvme_mpath_set_live() only
+ * after ana state transitioned to the live state. But we still
+ * want to create the sysfs link from head node to a path device
+ * irrespctive of the path's ana state.
+ * If we reach through here then it means that path's ana state
+ * is not live but still create the sysfs link to this path from
+ * head node if head node of the path has already come alive.
+ */
+ if (test_bit(NVME_NSHEAD_DISK_LIVE, &ns->head->flags))
+ nvme_mpath_add_sysfs_link(ns->head);
+ }
}
static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
@@ -962,6 +983,66 @@ static int nvme_lookup_ana_group_desc(struct nvme_ctrl *ctrl,
return -ENXIO; /* just break out of the loop */
}
+void nvme_mpath_add_sysfs_link(struct nvme_ns_head *head)
+{
+ struct device *target;
+ int rc, srcu_idx;
+ struct nvme_ns *ns;
+ struct kobject *kobj = &disk_to_dev(head->disk)->kobj;
+
+ /*
+ * loop through each ns chained through the head->list and create the
+ * sysfs link from head node to the ns path node
+ */
+ srcu_idx = srcu_read_lock(&head->srcu);
+
+ list_for_each_entry_rcu(ns, &head->list, siblings) {
+ /*
+ * Avoid creating link if it already exists for the given path.
+ * When path ana state transitions from optimized to non-
+ * optimized or vice-versa, the nvme_mpath_set_live() is
+ * invoked which in truns call this function. Now if the sysfs
+ * link already exists for the given path and we attempt to re-
+ * create the link then sysfs code would warn about it loudly.
+ * So we evaluate NVME_NS_SYSFS_ATTR_LINK flag here to ensure
+ * that we're not creating duplicate link.
+ */
+ if (test_bit(NVME_NS_SYSFS_ATTR_LINK, &ns->flags))
+ continue;
+
+ target = disk_to_dev(ns->disk);
+ /*
+ * Create sysfs link from head gendisk kobject @kobj to the
+ * ns path gendisk kobject @target->kobj.
+ */
+ rc = sysfs_add_link_to_group(kobj, nvme_ns_mpath_attr_group.name,
+ &target->kobj, dev_name(target));
+ if (unlikely(rc)) {
+ dev_err(disk_to_dev(ns->head->disk),
+ "failed to create link to %s\n",
+ dev_name(target));
+ } else
+ set_bit(NVME_NS_SYSFS_ATTR_LINK, &ns->flags);
+ }
+
+ srcu_read_unlock(&head->srcu, srcu_idx);
+}
+
+void nvme_mpath_remove_sysfs_link(struct nvme_ns *ns)
+{
+ struct device *target;
+ struct kobject *kobj;
+
+ if (!test_bit(NVME_NS_SYSFS_ATTR_LINK, &ns->flags))
+ return;
+
+ target = disk_to_dev(ns->disk);
+ kobj = &disk_to_dev(ns->head->disk)->kobj;
+ sysfs_remove_link_from_group(kobj, nvme_ns_mpath_attr_group.name,
+ dev_name(target));
+ clear_bit(NVME_NS_SYSFS_ATTR_LINK, &ns->flags);
+}
+
void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid)
{
if (nvme_ctrl_use_ana(ns->ctrl)) {
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 093cb423f536..ff234bce58ef 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -528,10 +528,11 @@ struct nvme_ns {
struct nvme_ns_head *head;
unsigned long flags;
-#define NVME_NS_REMOVING 0
-#define NVME_NS_ANA_PENDING 2
-#define NVME_NS_FORCE_RO 3
-#define NVME_NS_READY 4
+#define NVME_NS_REMOVING 0
+#define NVME_NS_ANA_PENDING 2
+#define NVME_NS_FORCE_RO 3
+#define NVME_NS_READY 4
+#define NVME_NS_SYSFS_ATTR_LINK 5
struct cdev cdev;
struct device cdev_device;
@@ -927,6 +928,7 @@ int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo);
int nvme_dev_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags);
extern const struct attribute_group *nvme_ns_attr_groups[];
+extern const struct attribute_group nvme_ns_mpath_attr_group;
extern const struct pr_ops nvme_pr_ops;
extern const struct block_device_operations nvme_ns_head_ops;
extern const struct attribute_group nvme_dev_attrs_group;
@@ -949,6 +951,8 @@ void nvme_mpath_default_iopolicy(struct nvme_subsystem *subsys);
void nvme_failover_req(struct request *req);
void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl);
int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head);
+void nvme_mpath_add_sysfs_link(struct nvme_ns_head *ns);
+void nvme_mpath_remove_sysfs_link(struct nvme_ns *ns);
void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid);
void nvme_mpath_remove_disk(struct nvme_ns_head *head);
int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id);
@@ -1003,6 +1007,12 @@ static inline void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid)
static inline void nvme_mpath_remove_disk(struct nvme_ns_head *head)
{
}
+static inline void nvme_mpath_add_sysfs_link(struct nvme_ns *ns)
+{
+}
+static inline void nvme_mpath_remove_sysfs_link(struct nvme_ns *ns)
+{
+}
static inline bool nvme_mpath_clear_current_path(struct nvme_ns *ns)
{
return false;
diff --git a/drivers/nvme/host/sysfs.c b/drivers/nvme/host/sysfs.c
index b68a9e5f1ea3..5a23e23b0d01 100644
--- a/drivers/nvme/host/sysfs.c
+++ b/drivers/nvme/host/sysfs.c
@@ -299,8 +299,22 @@ static const struct attribute_group nvme_ns_attr_group = {
.is_visible = nvme_ns_attrs_are_visible,
};
+#ifdef CONFIG_NVME_MULTIPATH
+static struct attribute *nvme_ns_mpath_attrs[] = {
+ NULL,
+};
+
+const struct attribute_group nvme_ns_mpath_attr_group = {
+ .name = "multipath",
+ .attrs = nvme_ns_mpath_attrs,
+};
+#endif
+
const struct attribute_group *nvme_ns_attr_groups[] = {
&nvme_ns_attr_group,
+#ifdef CONFIG_NVME_MULTIPATH
+ &nvme_ns_mpath_attr_group,
+#endif
NULL,
};
--
2.45.2
More information about the Linux-nvme
mailing list