[PATCHv4 RFC 1/1] nvme-multipath: Add sysfs attributes for showing multipath info
Sagi Grimberg
sagi at grimberg.me
Wed Oct 23 02:58:34 PDT 2024
>> Also, do the sysfs files always exist? What does numa_nodes output in case
>> the policy is round-robin? Or what would the queue_depth file produce?
>>
> Assuming the kernel is compiled with CONFIG_NVME_MULTIPATH, the sysfs files
> "numa_nodes" and "queue_depth" would always be created once the gendisk node
> (nvmeXCYnZ) for the per-path namespace device is added.
>
> If the io-policy is round-robin, the numa_nodes file would still show the numa
> nodes that would prefer the respective per-path namespace device if the policy
> were set to numa. For instance, reading the file
> "/sys/block/nvme1n1/multipath/nvme1c1n1/numa_nodes" shows the numa nodes that
> prefer the path nvme1c1n1 when an I/O workload is targeted at nvme1n1.
>
> The queue_depth file would contain the value 0 if the io-policy is anything
> other than queue-depth. The reason is that the queue_depth file emits the
> number of active requests currently queued up in the nvme/fabrics queue (i.e.
> ctrl->nr_active), and if the io-policy is not queue-depth then ->nr_active
> would typically remain zero.
>
> BTW, we don't expect the numa_nodes file to be read when the nvme subsystem
> io-policy is NOT numa. Similarly, we don't expect the user to review the
> queue_depth file when the io-policy is NOT set to queue-depth. Isn't that so?
I'm asking what it shows in case it _is_ read. The queue_depth and numa_nodes
output seems to be a "lie" in case the policy is round-robin.
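
For reference, a minimal user-space sketch (illustration only, not part of the
patch) of reading the two attributes through the per-path link under the head
disk; the device names nvme1n1/nvme1c1n1 are the example names used above:

#include <stdio.h>

/* Read one sysfs attribute and print its path together with its value. */
static void show_attr(const char *path)
{
	char buf[64];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("%s: %s", path, buf);	/* sysfs output is newline terminated */
	fclose(f);
}

int main(void)
{
	/* Node list taken from head->current_path[]; meaningful under the numa policy. */
	show_attr("/sys/block/nvme1n1/multipath/nvme1c1n1/numa_nodes");
	/* Emits ctrl->nr_active; typically stays 0 unless the io-policy is queue-depth. */
	show_attr("/sys/block/nvme1n1/multipath/nvme1c1n1/queue_depth");
	return 0;
}
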
>
>>
>>> Signed-off-by: Nilay Shroff <nilay at linux.ibm.com>
>>> ---
>>> drivers/nvme/host/core.c | 3 ++
>>> drivers/nvme/host/multipath.c | 71 +++++++++++++++++++++++++++++++++++
>>> drivers/nvme/host/nvme.h | 20 ++++++++--
>>> drivers/nvme/host/sysfs.c | 20 ++++++++++
>>> 4 files changed, 110 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
>>> index 983909a600ad..6be29fd64236 100644
>>> --- a/drivers/nvme/host/core.c
>>> +++ b/drivers/nvme/host/core.c
>>> @@ -3951,6 +3951,9 @@ static void nvme_ns_remove(struct nvme_ns *ns)
>>> if (!nvme_ns_head_multipath(ns->head))
>>> nvme_cdev_del(&ns->cdev, &ns->cdev_device);
>>> +
>>> + nvme_mpath_remove_sysfs_link(ns);
>>> +
>>> del_gendisk(ns->disk);
>>> mutex_lock(&ns->ctrl->namespaces_lock);
>>> diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
>>> index 518e22dd4f9b..7d9c36a7a261 100644
>>> --- a/drivers/nvme/host/multipath.c
>>> +++ b/drivers/nvme/host/multipath.c
>>> @@ -654,6 +654,8 @@ static void nvme_mpath_set_live(struct nvme_ns *ns)
>>> nvme_add_ns_head_cdev(head);
>>> }
>>> + nvme_mpath_add_sysfs_link(ns);
>>> +
>>> mutex_lock(&head->lock);
>>> if (nvme_path_is_optimized(ns)) {
>>> int node, srcu_idx;
>>> @@ -922,6 +924,39 @@ static ssize_t ana_state_show(struct device *dev, struct device_attribute *attr,
>>> }
>>> DEVICE_ATTR_RO(ana_state);
>>> +static ssize_t queue_depth_show(struct device *dev, struct device_attribute *attr,
>>> + char *buf)
>>> +{
>>> + struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
>>> +
>>> + return sysfs_emit(buf, "%d\n", atomic_read(&ns->ctrl->nr_active));
>>> +}
>>> +DEVICE_ATTR_RO(queue_depth);
>>> +
>>> +static ssize_t numa_nodes_show(struct device *dev, struct device_attribute *attr,
>>> + char *buf)
>>> +{
>>> + int node, srcu_idx;
>>> + nodemask_t numa_nodes;
>>> + struct nvme_ns *current_ns;
>>> + struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
>>> + struct nvme_ns_head *head = ns->head;
>>> +
>>> + nodes_clear(numa_nodes);
>>> +
>>> + srcu_idx = srcu_read_lock(&head->srcu);
>>> + for_each_node(node) {
>>> + current_ns = srcu_dereference(head->current_path[node],
>>> + &head->srcu);
>>> + if (ns == current_ns)
>>> + node_set(node, numa_nodes);
>>> + }
>>> + srcu_read_unlock(&head->srcu, srcu_idx);
>>> +
>>> + return sysfs_emit(buf, "%*pbl\n", nodemask_pr_args(&numa_nodes));
>>> +}
>>> +DEVICE_ATTR_RO(numa_nodes);
>>> +
>>> static int nvme_lookup_ana_group_desc(struct nvme_ctrl *ctrl,
>>> struct nvme_ana_group_desc *desc, void *data)
>>> {
>>> @@ -934,6 +969,42 @@ static int nvme_lookup_ana_group_desc(struct nvme_ctrl *ctrl,
>>> return -ENXIO; /* just break out of the loop */
>>> }
>>> +void nvme_mpath_add_sysfs_link(struct nvme_ns *ns)
>>> +{
>>> + struct device *target;
>>> + struct kobject *kobj;
>>> + int rc;
>>> +
>>> + if (test_bit(NVME_NS_SYSFS_ATTR_LINK, &ns->flags))
>>> + return;
>>> +
>>> + target = disk_to_dev(ns->disk);
>>> + kobj = &disk_to_dev(ns->head->disk)->kobj;
>>> + rc = sysfs_add_link_to_group(kobj, nvme_ns_mpath_attr_group.name,
>>> + &target->kobj, dev_name(target));
>>> + if (unlikely(rc)) {
>>> + dev_err(disk_to_dev(ns->head->disk),
>>> + "failed to create link to %s\n",
>>> + dev_name(target));
>>> + } else
>>> + set_bit(NVME_NS_SYSFS_ATTR_LINK, &ns->flags);
>> This complication is not clear at all, not sure why this is needed, and it is
>> not documented.
> The nvme_mpath_add_sysfs_link function is invoked from nvme_mpath_set_live() when a
> new namespace is allocated and its ANA state is set to OPTIMIZED or NON-OPTIMIZED.
> The function simply creates the link to the target nvme path-device (nvmeXCYnZ)
> from the head disk-node (nvmeXnY) device. If this looks complex, I will add
> commentary in the code in the next patch.
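
A possible wording for that commentary, as a sketch only, based on what the
quoted code and the explanation above actually do:

/*
 * nvme_mpath_add_sysfs_link - create the per-path link under the head disk
 * @ns: per-path namespace that has just been set live
 *
 * Called from nvme_mpath_set_live() once the path's ANA state is OPTIMIZED
 * or NON-OPTIMIZED.  Creates a symlink named after the per-path device
 * (nvmeXCYnZ) inside the head disk's "multipath" attribute group, i.e.
 * /sys/block/nvmeXnY/multipath/nvmeXCYnZ, pointing at the per-path block
 * device.  The NVME_NS_SYSFS_ATTR_LINK flag makes repeated calls a no-op,
 * since a path may be set live more than once during its lifetime.
 */
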
It is still not clear why we need a flag for this. The usage suggests that we
may call add/remove sysfs link multiple times... What about inaccessible paths,
or paths that transition to/from inaccessible? Will they add/remove their
corresponding sysfs links?