[PATCHv4 RFC 1/1] nvme-multipath: Add sysfs attributes for showing multipath info
Sagi Grimberg
sagi at grimberg.me
Sun Oct 20 16:17:59 PDT 2024
On 11/09/2024 9:26, Nilay Shroff wrote:
> NVMe native multipath supports different I/O policies for selecting the
> I/O path; however, we don't have any visibility into which path the
> multipath code selects for forwarding I/O.
> This patch adds that visibility through new sysfs attribute files named
> "numa_nodes" and "queue_depth" under each namespace block device path
> /sys/block/nvmeXcYnZ/. We also create a "multipath" sysfs directory
> under the head disk node and, from this directory, add a link to each
> namespace path device this head disk node points to.
>
> For instance, /sys/block/nvmeXnY/multipath/ would contain a soft link to
> each path the head disk node <nvmeXnY> points to:
>
> $ ls -1 /sys/block/nvme1n1/multipath/
> nvme1c1n1 -> ../../../../../pci052e:78/052e:78:00.0/nvme/nvme1/nvme1c1n1
> nvme1c3n1 -> ../../../../../pci058e:78/058e:78:00.0/nvme/nvme3/nvme1c3n1
>
> For the round-robin I/O policy, we can easily infer from the above
> output that an I/O workload targeted at nvme1n1 would toggle across the
> paths nvme1c1n1 and nvme1c3n1.
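>
> The active I/O policy itself can be checked through the subsystem's
> "iopolicy" sysfs attribute (the subsystem instance number below is
> illustrative):
>
> $ cat /sys/class/nvme-subsystem/nvme-subsys1/iopolicy   # instance is illustrative
> round-robin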
>
> For the numa I/O policy, the "numa_nodes" attribute file shows the NUMA
> nodes for which the respective block device path is the preferred path.
> The numa_nodes value is a comma-delimited list of nodes or an A-B range
> of nodes.
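>
> For example, if a path is the preferred path for NUMA nodes 0, 1 and 3,
> reading its attribute would show (path name and node numbers below are
> illustrative):
>
> $ cat /sys/block/nvme1c1n1/numa_nodes   # path name is illustrative
> 0-1,3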
>
> For the queue-depth I/O policy, the "queue_depth" attribute file shows
> the number of active/in-flight I/O requests currently queued for each
> path.
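>
> For example (path name and count below are illustrative):
>
> $ cat /sys/block/nvme1c1n1/queue_depth   # value is a snapshot, illustrative
> 518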
Would it be possible to separate this into 3 patches: one for the sysfs
links, one for numa, and one for queue_depth?
Also, do these sysfs files always exist? What does numa_nodes output when
the policy is round-robin? And what would the queue_depth file produce?
>
> Signed-off-by: Nilay Shroff <nilay at linux.ibm.com>
> ---
> drivers/nvme/host/core.c | 3 ++
> drivers/nvme/host/multipath.c | 71 +++++++++++++++++++++++++++++++++++
> drivers/nvme/host/nvme.h | 20 ++++++++--
> drivers/nvme/host/sysfs.c | 20 ++++++++++
> 4 files changed, 110 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index 983909a600ad..6be29fd64236 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -3951,6 +3951,9 @@ static void nvme_ns_remove(struct nvme_ns *ns)
>
> if (!nvme_ns_head_multipath(ns->head))
> nvme_cdev_del(&ns->cdev, &ns->cdev_device);
> +
> + nvme_mpath_remove_sysfs_link(ns);
> +
> del_gendisk(ns->disk);
>
> mutex_lock(&ns->ctrl->namespaces_lock);
> diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
> index 518e22dd4f9b..7d9c36a7a261 100644
> --- a/drivers/nvme/host/multipath.c
> +++ b/drivers/nvme/host/multipath.c
> @@ -654,6 +654,8 @@ static void nvme_mpath_set_live(struct nvme_ns *ns)
> nvme_add_ns_head_cdev(head);
> }
>
> + nvme_mpath_add_sysfs_link(ns);
> +
> mutex_lock(&head->lock);
> if (nvme_path_is_optimized(ns)) {
> int node, srcu_idx;
> @@ -922,6 +924,39 @@ static ssize_t ana_state_show(struct device *dev, struct device_attribute *attr,
> }
> DEVICE_ATTR_RO(ana_state);
>
> +static ssize_t queue_depth_show(struct device *dev, struct device_attribute *attr,
> + char *buf)
> +{
> + struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
> +
> + return sysfs_emit(buf, "%d\n", atomic_read(&ns->ctrl->nr_active));
> +}
> +DEVICE_ATTR_RO(queue_depth);
> +
> +static ssize_t numa_nodes_show(struct device *dev, struct device_attribute *attr,
> + char *buf)
> +{
> + int node, srcu_idx;
> + nodemask_t numa_nodes;
> + struct nvme_ns *current_ns;
> + struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
> + struct nvme_ns_head *head = ns->head;
> +
> + nodes_clear(numa_nodes);
> +
> + srcu_idx = srcu_read_lock(&head->srcu);
> + for_each_node(node) {
> + current_ns = srcu_dereference(head->current_path[node],
> + &head->srcu);
> + if (ns == current_ns)
> + node_set(node, numa_nodes);
> + }
> + srcu_read_unlock(&head->srcu, srcu_idx);
> +
> + return sysfs_emit(buf, "%*pbl\n", nodemask_pr_args(&numa_nodes));
> +}
> +DEVICE_ATTR_RO(numa_nodes);
> +
> static int nvme_lookup_ana_group_desc(struct nvme_ctrl *ctrl,
> struct nvme_ana_group_desc *desc, void *data)
> {
> @@ -934,6 +969,42 @@ static int nvme_lookup_ana_group_desc(struct nvme_ctrl *ctrl,
> return -ENXIO; /* just break out of the loop */
> }
>
> +void nvme_mpath_add_sysfs_link(struct nvme_ns *ns)
> +{
> + struct device *target;
> + struct kobject *kobj;
> + int rc;
> +
> + if (test_bit(NVME_NS_SYSFS_ATTR_LINK, &ns->flags))
> + return;
> +
> + target = disk_to_dev(ns->disk);
> + kobj = &disk_to_dev(ns->head->disk)->kobj;
> + rc = sysfs_add_link_to_group(kobj, nvme_ns_mpath_attr_group.name,
> + &target->kobj, dev_name(target));
> + if (unlikely(rc)) {
> + dev_err(disk_to_dev(ns->head->disk),
> + "failed to create link to %s\n",
> + dev_name(target));
> + } else
> + set_bit(NVME_NS_SYSFS_ATTR_LINK, &ns->flags);
This complication is not clear at all; I'm not sure why it is needed, and
it is not documented.