[RFC PATCH 1/2] nvme-multipath: introduce delayed removal of the multipath head node
John Meneghini
jmeneghi at redhat.com
Tue Mar 25 08:21:12 PDT 2025
Thanks for the patch, Nilay!
I've rebased this patch onto my changes at:
https://lore.kernel.org/linux-nvme/20250322232848.225140-1-jmeneghi@redhat.com/T/
This patch applies and compiles cleanly with no problems.
I will test this out after LSF/MM with my fabrics testbed and get back to you.
linux(nilay_mp) > git log --oneline -5
0834c24969a1 (HEAD -> nilay_mp) nvme-multipath: introduce delayed removal of the multipath head node
f9ca6ae944e4 (johnm/config_ana5, config_ana5) nvme: update the multipath warning in nvme_init_ns_head
79feb9e51d89 nvme-multipath: add the NVME_MULTIPATH_PARAM config option
7e2928cacd71 nvme-multipath: change the NVME_MULTIPATH config option
64ea88e3afa8 (tag: nvme-6.15-2025-03-20, nvme/nvme-6.15, nvme-6.15) nvmet: replace max(a, min(b, c)) by clamp(val, lo, hi)
/John
On 3/21/25 2:37 AM, Nilay Shroff wrote:
> Currently, the multipath head node of a PCIe NVMe disk is removed
> as soon as the last path to the disk is removed. However, this can
> cause issues in scenarios such as:
>
> - Disk hot-removal followed by re-addition.
> - Transient PCIe link failures that trigger re-enumeration,
> temporarily removing and then restoring the disk.
>
> In these cases, removing the head node prematurely may cause the
> head disk node name to change upon re-addition, requiring
> applications to reopen their handles if they were performing I/O
> during the failure.
>
> To address this, introduce a delayed removal mechanism for the head
> disk node. During a transient failure, instead of removing the head
> disk node immediately, the system waits for a configurable timeout,
> allowing the disk to recover. Additionally, note that the head disk
> node is now always created for all types of NVMe disks (single-ported
> or multi-ported), unless the multipath module parameter is explicitly
> set to false or CONFIG_NVME_MULTIPATH is disabled.
>
> During a transient disk failure, any I/O sent by the application is
> queued instead of being failed immediately. If the disk comes back
> online within the timeout, the queued I/Os are resubmitted to the
> disk, ensuring seamless operation. If the disk fails to recover, the
> queued I/Os are failed to completion and the application receives
> the error.
>
> This way, if the disk comes back online within the configured
> period, the head node remains unchanged, ensuring uninterrupted
> workloads without requiring applications to reopen device handles.
>
> A new sysfs attribute named "delayed_shutdown_sec" is added for
> users who wish to configure the delay before the head disk node is
> removed. The default value of this attribute is 0 seconds, ensuring
> no behavior change unless explicitly configured.
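
A quick note for anyone who wants to try this out: since the attribute
hangs off the head disk node, it can be exercised directly from user
space. Below is a minimal sketch I'd use while testing (the nvme1n1
head node name and the 30 second delay are just examples, and I'm
assuming the attribute shows up under /sys/block/<head-node>/ alongside
the other ns-head attributes):

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            const char *path = "/sys/block/nvme1n1/delayed_shutdown_sec";
            char buf[16];
            ssize_t n;
            int fd;

            /* Arm a 30 second removal delay on the head node. */
            fd = open(path, O_WRONLY);
            if (fd < 0 || write(fd, "30", 2) != 2) {
                    perror(path);
                    return 1;
            }
            close(fd);

            /* Read it back to confirm the store took effect. */
            fd = open(path, O_RDONLY);
            if (fd < 0 || (n = read(fd, buf, sizeof(buf) - 1)) < 0) {
                    perror(path);
                    return 1;
            }
            buf[n] = '\0';
            close(fd);
            printf("delayed_shutdown_sec = %s", buf);
            return 0;
    }
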
>
> Link: https://lore.kernel.org/linux-nvme/Y9oGTKCFlOscbPc2@infradead.org/
> Link: https://lore.kernel.org/linux-nvme/Y+1aKcQgbskA2tra@kbusch-mbp.dhcp.thefacebook.com/
> Suggested-by: Keith Busch <kbusch at kernel.org>
> Suggested-by: Christoph Hellwig <hch at infradead.org>
> [nilay: reworked based on the original idea/POC from Christoph and Keith]
> Signed-off-by: Nilay Shroff <nilay at linux.ibm.com>
> ---
> drivers/nvme/host/core.c | 18 +++---
> drivers/nvme/host/multipath.c | 118 +++++++++++++++++++++++++++++-----
> drivers/nvme/host/nvme.h | 4 ++
> drivers/nvme/host/sysfs.c | 13 ++++
> 4 files changed, 127 insertions(+), 26 deletions(-)
>
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index 870314c52107..e798809a8325 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -3562,7 +3562,7 @@ static struct nvme_ns_head *nvme_find_ns_head(struct nvme_ctrl *ctrl,
> */
> if (h->ns_id != nsid || !nvme_is_unique_nsid(ctrl, h))
> continue;
> - if (!list_empty(&h->list) && nvme_tryget_ns_head(h))
> + if (nvme_tryget_ns_head(h))
> return h;
> }
>
> @@ -3690,6 +3690,10 @@ static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl,
> ratelimit_state_init(&head->rs_nuse, 5 * HZ, 1);
> ratelimit_set_flags(&head->rs_nuse, RATELIMIT_MSG_ON_RELEASE);
> kref_init(&head->ref);
> +#ifdef CONFIG_NVME_MULTIPATH
> + if (ctrl->ops->flags & NVME_F_FABRICS)
> + set_bit(NVME_NSHEAD_FABRICS, &head->flags);
> +#endif
>
> if (head->ids.csi) {
> ret = nvme_get_effects_log(ctrl, head->ids.csi, &head->effects);
> @@ -3806,7 +3810,8 @@ static int nvme_init_ns_head(struct nvme_ns *ns, struct nvme_ns_info *info)
> }
> } else {
> ret = -EINVAL;
> - if (!info->is_shared || !head->shared) {
> + if ((!info->is_shared || !head->shared) &&
> + !list_empty(&head->list)) {
> dev_err(ctrl->device,
> "Duplicate unshared namespace %d\n",
> info->nsid);
> @@ -3988,8 +3993,6 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info)
>
> static void nvme_ns_remove(struct nvme_ns *ns)
> {
> - bool last_path = false;
> -
> if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags))
> return;
>
> @@ -4009,10 +4012,6 @@ static void nvme_ns_remove(struct nvme_ns *ns)
>
> mutex_lock(&ns->ctrl->subsys->lock);
> list_del_rcu(&ns->siblings);
> - if (list_empty(&ns->head->list)) {
> - list_del_init(&ns->head->entry);
> - last_path = true;
> - }
> mutex_unlock(&ns->ctrl->subsys->lock);
>
> /* guarantee not available in head->list */
> @@ -4030,8 +4029,7 @@ static void nvme_ns_remove(struct nvme_ns *ns)
> mutex_unlock(&ns->ctrl->namespaces_lock);
> synchronize_srcu(&ns->ctrl->srcu);
>
> - if (last_path)
> - nvme_mpath_shutdown_disk(ns->head);
> + nvme_mpath_shutdown_disk(ns->head);
> nvme_put_ns(ns);
> }
>
> diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
> index 6b12ca80aa27..0f54889bd483 100644
> --- a/drivers/nvme/host/multipath.c
> +++ b/drivers/nvme/host/multipath.c
> @@ -421,7 +421,6 @@ inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
> return nvme_numa_path(head);
> }
> }
> -
> static bool nvme_available_path(struct nvme_ns_head *head)
> {
> struct nvme_ns *ns;
> @@ -429,6 +428,16 @@ static bool nvme_available_path(struct nvme_ns_head *head)
> if (!test_bit(NVME_NSHEAD_DISK_LIVE, &head->flags))
> return NULL;
>
> + /*
> + * For non-fabric controllers we support delayed removal of the head
> + * disk node. If we reach this point, the head disk is still alive, so
> + * even if no path is currently available (e.g. due to a transient
> + * link failure), we queue the I/O instead of failing it, and resubmit
> + * it once a path becomes available again.
> + */
> + if (!(test_bit(NVME_NSHEAD_FABRICS, &head->flags)))
> + return true;
> +
> list_for_each_entry_srcu(ns, &head->list, siblings,
> srcu_read_lock_held(&head->srcu)) {
> if (test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ns->ctrl->flags))
> @@ -444,7 +453,6 @@ static bool nvme_available_path(struct nvme_ns_head *head)
> }
> return false;
> }
> -
> static void nvme_ns_head_submit_bio(struct bio *bio)
> {
> struct nvme_ns_head *head = bio->bi_bdev->bd_disk->private_data;
> @@ -617,6 +625,40 @@ static void nvme_requeue_work(struct work_struct *work)
> }
> }
>
> +static void nvme_remove_head(struct nvme_ns_head *head)
> +{
> + if (test_and_clear_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
> + /*
> + * requeue I/O after NVME_NSHEAD_DISK_LIVE has been cleared
> + * to allow multipath to fail all I/O.
> + */
> + kblockd_schedule_work(&head->requeue_work);
> +
> + nvme_cdev_del(&head->cdev, &head->cdev_device);
> + synchronize_srcu(&head->srcu);
> + del_gendisk(head->disk);
> + nvme_put_ns_head(head);
> + }
> +}
> +
> +static void nvme_remove_head_work(struct work_struct *work)
> +{
> + struct nvme_ns_head *head = container_of(to_delayed_work(work),
> + struct nvme_ns_head, remove_work);
> + bool shutdown = false;
> +
> + mutex_lock(&head->subsys->lock);
> + if (list_empty(&head->list)) {
> + list_del_init(&head->entry);
> + shutdown = true;
> + }
> + mutex_unlock(&head->subsys->lock);
> + if (shutdown)
> + nvme_remove_head(head);
> +
> + module_put(THIS_MODULE);
> +}
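
The try_module_get()/module_put() pairing here and in
nvme_mpath_shutdown_disk() below is the part I most want to exercise.
For reference, the lifetime pattern in isolation looks roughly like the
following (a self-contained sketch with made-up demo_* names, not code
from this patch):

    #include <linux/jiffies.h>
    #include <linux/module.h>
    #include <linux/workqueue.h>

    static struct delayed_work demo_work;

    static void demo_work_fn(struct work_struct *work)
    {
            /* ... deferred teardown would go here ... */

            /* Drop the reference taken when the work was queued. */
            module_put(THIS_MODULE);
    }

    static void demo_schedule(unsigned int delay_sec)
    {
            /*
             * Pin the module so it cannot be unloaded while the work
             * is pending; demo_work_fn() drops the reference.
             */
            if (!try_module_get(THIS_MODULE))
                    return;
            queue_delayed_work(system_wq, &demo_work, delay_sec * HZ);
    }

    static int __init demo_init(void)
    {
            INIT_DELAYED_WORK(&demo_work, demo_work_fn);
            demo_schedule(30);
            return 0;
    }
    module_init(demo_init);

    static void __exit demo_exit(void)
    {
            /*
             * By the time unload is possible, the pending work has run
             * and dropped its reference; this cancel is just belt and
             * braces.
             */
            cancel_delayed_work_sync(&demo_work);
    }
    module_exit(demo_exit);

    MODULE_LICENSE("GPL");
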
> +
> int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
> {
> struct queue_limits lim;
> @@ -626,14 +668,15 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
> spin_lock_init(&head->requeue_lock);
> INIT_WORK(&head->requeue_work, nvme_requeue_work);
> INIT_WORK(&head->partition_scan_work, nvme_partition_scan_work);
> + INIT_DELAYED_WORK(&head->remove_work, nvme_remove_head_work);
> + head->delayed_shutdown_sec = 0;
>
> /*
> - * Add a multipath node if the subsystems supports multiple controllers.
> - * We also do this for private namespaces as the namespace sharing flag
> - * could change after a rescan.
> + * A head disk node is always created for all types of NVMe disks
> + * (single-ported and multi-ported), unless the multipath module
> + * parameter is explicitly set to false.
> */
> - if (!(ctrl->subsys->cmic & NVME_CTRL_CMIC_MULTI_CTRL) ||
> - !nvme_is_unique_nsid(ctrl, head) || !multipath)
> + if (!multipath)
> return 0;
>
> blk_set_stacking_limits(&lim);
> @@ -659,6 +702,7 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
> set_bit(GD_SUPPRESS_PART_SCAN, &head->disk->state);
> sprintf(head->disk->disk_name, "nvme%dn%d",
> ctrl->subsys->instance, head->instance);
> + nvme_tryget_ns_head(head);
> return 0;
> }
>
> @@ -1015,6 +1059,40 @@ static ssize_t numa_nodes_show(struct device *dev, struct device_attribute *attr
> }
> DEVICE_ATTR_RO(numa_nodes);
>
> +static ssize_t delayed_shutdown_sec_show(struct device *dev,
> + struct device_attribute *attr, char *buf)
> +{
> + struct gendisk *disk = dev_to_disk(dev);
> + struct nvme_ns_head *head = disk->private_data;
> + int ret;
> +
> + mutex_lock(&head->subsys->lock);
> + ret = sysfs_emit(buf, "%u\n", head->delayed_shutdown_sec);
> + mutex_unlock(&head->subsys->lock);
> + return ret;
> +}
> +
> +static ssize_t delayed_shutdown_sec_store(struct device *dev,
> + struct device_attribute *attr, const char *buf, size_t count)
> +{
> + struct gendisk *disk = dev_to_disk(dev);
> + struct nvme_ns_head *head = disk->private_data;
> + unsigned int sec;
> + int ret;
> +
> + ret = kstrtouint(buf, 0, &sec);
> + if (ret < 0)
> + return ret;
> +
> + mutex_lock(&head->subsys->lock);
> + head->delayed_shutdown_sec = sec;
> + mutex_unlock(&head->subsys->lock);
> +
> + return count;
> +}
> +
> +DEVICE_ATTR_RW(delayed_shutdown_sec);
> +
> static int nvme_lookup_ana_group_desc(struct nvme_ctrl *ctrl,
> struct nvme_ana_group_desc *desc, void *data)
> {
> @@ -1138,18 +1216,26 @@ void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid)
>
> void nvme_mpath_shutdown_disk(struct nvme_ns_head *head)
> {
> - if (!head->disk)
> - return;
> - if (test_and_clear_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
> - nvme_cdev_del(&head->cdev, &head->cdev_device);
> + mutex_lock(&head->subsys->lock);
> +
> + if (!list_empty(&head->list) || !head->disk)
> + goto out;
> +
> + if (head->delayed_shutdown_sec) {
> /*
> - * requeue I/O after NVME_NSHEAD_DISK_LIVE has been cleared
> - * to allow multipath to fail all I/O.
> + * Ensure that the module cannot be unloaded while the head
> + * remove work is pending.
> */
> - synchronize_srcu(&head->srcu);
> - kblockd_schedule_work(&head->requeue_work);
> - del_gendisk(head->disk);
> + if (!try_module_get(THIS_MODULE))
> + goto out;
> + queue_delayed_work(nvme_wq, &head->remove_work,
> + head->delayed_shutdown_sec * HZ);
> + } else {
> + list_del_init(&head->entry);
> + nvme_remove_head(head);
> }
> +out:
> + mutex_unlock(&head->subsys->lock);
> }
>
> void nvme_mpath_remove_disk(struct nvme_ns_head *head)
> diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
> index 51e078642127..4375357b8cd7 100644
> --- a/drivers/nvme/host/nvme.h
> +++ b/drivers/nvme/host/nvme.h
> @@ -503,7 +503,10 @@ struct nvme_ns_head {
> struct work_struct partition_scan_work;
> struct mutex lock;
> unsigned long flags;
> + struct delayed_work remove_work;
> + unsigned int delayed_shutdown_sec;
> #define NVME_NSHEAD_DISK_LIVE 0
> +#define NVME_NSHEAD_FABRICS 1
> struct nvme_ns __rcu *current_path[];
> #endif
> };
> @@ -986,6 +989,7 @@ extern struct device_attribute dev_attr_ana_grpid;
> extern struct device_attribute dev_attr_ana_state;
> extern struct device_attribute dev_attr_queue_depth;
> extern struct device_attribute dev_attr_numa_nodes;
> +extern struct device_attribute dev_attr_delayed_shutdown_sec;
> extern struct device_attribute subsys_attr_iopolicy;
>
> static inline bool nvme_disk_is_ns_head(struct gendisk *disk)
> diff --git a/drivers/nvme/host/sysfs.c b/drivers/nvme/host/sysfs.c
> index 6d31226f7a4f..170897349093 100644
> --- a/drivers/nvme/host/sysfs.c
> +++ b/drivers/nvme/host/sysfs.c
> @@ -260,6 +260,7 @@ static struct attribute *nvme_ns_attrs[] = {
> &dev_attr_ana_state.attr,
> &dev_attr_queue_depth.attr,
> &dev_attr_numa_nodes.attr,
> + &dev_attr_delayed_shutdown_sec.attr,
> #endif
> &dev_attr_io_passthru_err_log_enabled.attr,
> NULL,
> @@ -296,6 +297,18 @@ static umode_t nvme_ns_attrs_are_visible(struct kobject *kobj,
> if (nvme_disk_is_ns_head(dev_to_disk(dev)))
> return 0;
> }
> + if (a == &dev_attr_delayed_shutdown_sec.attr) {
> + struct nvme_ns_head *head = dev_to_ns_head(dev);
> + struct gendisk *disk = dev_to_disk(dev);
> +
> + /*
> + * This attribute is only valid for the head node and for
> + * non-fabric controllers.
> + */
> + if (!nvme_disk_is_ns_head(disk) ||
> + test_bit(NVME_NSHEAD_FABRICS, &head->flags))
> + return 0;
> + }
> #endif
> return a->mode;
> }
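
One detail I like here is hiding the attribute with the ->is_visible()
callback for fabrics controllers instead of rejecting writes in the
store handler. For anyone unfamiliar with that sysfs pattern, its
generic shape is something like this (a hypothetical sketch; the demo_*
names are mine, and the real patch just extends the existing
nvme_ns_attrs_are_visible() logic):

    #include <linux/device.h>
    #include <linux/sysfs.h>

    /* Hypothetical stand-in for the NVME_NSHEAD_FABRICS test. */
    static bool demo_feature_supported;

    static ssize_t demo_show(struct device *dev,
                             struct device_attribute *attr, char *buf)
    {
            return sysfs_emit(buf, "%d\n", demo_feature_supported);
    }
    static DEVICE_ATTR_RO(demo);

    static struct attribute *demo_attrs[] = {
            &dev_attr_demo.attr,
            NULL,
    };

    /*
     * Called per attribute when the group is registered: returning 0
     * hides the file entirely, while returning a->mode exposes it with
     * its default permissions.
     */
    static umode_t demo_attrs_visible(struct kobject *kobj,
                                      struct attribute *a, int n)
    {
            if (a == &dev_attr_demo.attr && !demo_feature_supported)
                    return 0;
            return a->mode;
    }

    static const struct attribute_group demo_group = {
            .attrs = demo_attrs,
            .is_visible = demo_attrs_visible,
    };

The group would then be registered with device_add_group() (or wired
into the disk's attribute groups, as the nvme driver does).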