[RFC PATCHv3 1/3] nvme-multipath: introduce delayed removal of the multipath head node
Hannes Reinecke
hare at suse.de
Sun May 4 23:39:11 PDT 2025
On 5/4/25 19:50, Nilay Shroff wrote:
> Currently, the multipath head node of a PCIe NVMe disk is removed as
> soon as all paths to the disk are removed. However, this can cause
> issues in the following scenarios:
>
> - Disk hot-removal followed by re-addition.
> - Transient PCIe link failures that trigger re-enumeration,
>   temporarily removing and then restoring the disk.
>
> In these cases, removing the head node prematurely may lead to a head
> disk node name change upon re-addition, requiring applications to
> reopen their handles if they were performing I/O during the failure.
>
> To address this, introduce a delayed removal mechanism for the head
> disk node. During a transient failure, instead of removing the head
> disk node immediately, the system waits for a configurable timeout,
> allowing the disk to recover.
>
> During a transient disk failure, any I/O sent by the application is
> queued instead of being failed immediately. If the disk comes back
> online within the timeout, the queued I/Os are resubmitted to the
> disk, ensuring seamless operation. If the disk does not recover from
> the failure, the queued I/Os are completed with an error and the
> application receives it.
>
> This way, if the disk comes back online within the configured period,
> the head node remains unchanged, ensuring uninterrupted workloads
> without requiring applications to reopen device handles.
>
> A new sysfs attribute named "delayed_removal_secs" is added under the
> head disk blkdev for users who wish to configure the delay before the
> head disk node is removed. The default value of this attribute is zero
> seconds, ensuring no behavior change unless explicitly configured.
>
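For illustration, a minimal userspace sketch of how this attribute could be
set; the device name nvme0n1 and the exact sysfs location are assumptions
for the example, not something this patch documents:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* Assumed path; the actual location depends on where the block
	 * device attribute group is exposed on a given kernel. */
	const char *attr = "/sys/block/nvme0n1/delayed_removal_secs";
	const char *secs = "60";	/* keep the head node alive for 60s */
	int fd = open(attr, O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, secs, strlen(secs)) != (ssize_t)strlen(secs)) {
		perror("write");
		close(fd);
		return 1;
	}
	close(fd);
	return 0;
}

Writing 0 back restores the current (immediate removal) behavior.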
> Link: https://lore.kernel.org/linux-nvme/Y9oGTKCFlOscbPc2@infradead.org/
> Link: https://lore.kernel.org/linux-nvme/Y+1aKcQgbskA2tra@kbusch-mbp.dhcp.thefacebook.com/
> Suggested-by: Keith Busch <kbusch at kernel.org>
> Suggested-by: Christoph Hellwig <hch at infradead.org>
> [nilay: reworked based on the original idea/POC from Christoph and Keith]
> Signed-off-by: Nilay Shroff <nilay at linux.ibm.com>
> ---
> drivers/nvme/host/core.c | 10 ++-
> drivers/nvme/host/multipath.c | 137 +++++++++++++++++++++++++++++++---
> drivers/nvme/host/nvme.h | 25 ++++++-
> drivers/nvme/host/sysfs.c | 13 ++++
> 4 files changed, 171 insertions(+), 14 deletions(-)
>
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index eb6ea8acb3cc..0f6b01f10c70 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -3560,7 +3560,7 @@ static struct nvme_ns_head *nvme_find_ns_head(struct nvme_ctrl *ctrl,
> */
> if (h->ns_id != nsid || !nvme_is_unique_nsid(ctrl, h))
> continue;
> - if (!list_empty(&h->list) && nvme_tryget_ns_head(h))
> + if (nvme_tryget_ns_head(h))
> return h;
> }
>
> @@ -3688,6 +3688,8 @@ static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl,
> ratelimit_state_init(&head->rs_nuse, 5 * HZ, 1);
> ratelimit_set_flags(&head->rs_nuse, RATELIMIT_MSG_ON_RELEASE);
> kref_init(&head->ref);
> + if (ctrl->ops->flags & NVME_F_FABRICS)
> + nvme_mpath_set_fabrics(head);
>
Please make this a separate patch.
> if (head->ids.csi) {
> ret = nvme_get_effects_log(ctrl, head->ids.csi, &head->effects);
> @@ -3804,7 +3806,8 @@ static int nvme_init_ns_head(struct nvme_ns *ns, struct nvme_ns_info *info)
> }
> } else {
> ret = -EINVAL;
> - if (!info->is_shared || !head->shared) {
> + if ((!info->is_shared || !head->shared) &&
> + !list_empty(&head->list)) {
> dev_err(ctrl->device,
> "Duplicate unshared namespace %d\n",
> info->nsid);
> @@ -4008,7 +4011,8 @@ static void nvme_ns_remove(struct nvme_ns *ns)
> mutex_lock(&ns->ctrl->subsys->lock);
> list_del_rcu(&ns->siblings);
> if (list_empty(&ns->head->list)) {
> - list_del_init(&ns->head->entry);
> + if (!nvme_mpath_queue_if_no_path(ns->head))
> + list_del_init(&ns->head->entry);
> last_path = true;
> }
> mutex_unlock(&ns->ctrl->subsys->lock);
> diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
> index 250f3da67cc9..f3f981c9f3ec 100644
> --- a/drivers/nvme/host/multipath.c
> +++ b/drivers/nvme/host/multipath.c
> @@ -442,7 +442,24 @@ static bool nvme_available_path(struct nvme_ns_head *head)
> break;
> }
> }
> - return false;
> +
> + /*
> + * If "head->delayed_removal_secs" is configured (i.e., non-zero), do
> + * not immediately fail I/O. Instead, requeue the I/O for the configured
> + * duration, anticipating that if there's a transient link failure then
> + * it may recover within this time window. This parameter is exported to
> + * userspace via sysfs, and its default value is zero. It is internally
> + * mapped to NVME_NSHEAD_QUEUE_IF_NO_PATH. When delayed_removal_secs is
> + * non-zero, this flag is set to true. When zero, the flag is cleared.
> + *
> + * Note: This option is not exposed to users for NVMeoF connections.
> + * In fabric-based setups, fabric link failures are managed through
> + * other parameters such as "reconnect_delay" and "max_reconnects",
> + * which are handled at the respective fabric driver layer. Therefore,
> + * "head->delayed_removal_secs" is intended exclusively for non-fabric
> + * (e.g., PCIe) multipath configurations.
> + */
> + return nvme_mpath_queue_if_no_path(head);
> }
>
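To make the user-visible effect of queue-if-no-path concrete, here is a
hedged userspace sketch (the device name and block size are assumptions,
not part of this patch): a read issued while all paths are down is queued
by the head node rather than failed, and completes either normally once
the disk returns or with an error after the configured window expires.

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	void *buf = NULL;
	ssize_t ret;
	int fd = open("/dev/nvme0n1", O_RDONLY | O_DIRECT);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (posix_memalign(&buf, 4096, 4096)) {
		close(fd);
		return 1;
	}
	/*
	 * If all paths are down and delayed_removal_secs is non-zero,
	 * this read may block until either the disk comes back or the
	 * configured window expires; the file handle stays valid.
	 */
	ret = pread(fd, buf, 4096, 0);
	if (ret < 0)
		fprintf(stderr, "read failed: %s\n", strerror(errno));
	else
		printf("read %zd bytes, handle still valid\n", ret);
	free(buf);
	close(fd);
	return ret < 0 ? 1 : 0;
}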
> static void nvme_ns_head_submit_bio(struct bio *bio)
> @@ -617,6 +634,40 @@ static void nvme_requeue_work(struct work_struct *work)
> }
> }
>
> +static void nvme_remove_head(struct nvme_ns_head *head)
> +{
> + if (test_and_clear_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
> + /*
> + * requeue I/O after NVME_NSHEAD_DISK_LIVE has been cleared
> + * to allow multipath to fail all I/O.
> + */
> + kblockd_schedule_work(&head->requeue_work);
> +
> + nvme_cdev_del(&head->cdev, &head->cdev_device);
> + synchronize_srcu(&head->srcu);
> + del_gendisk(head->disk);
> + nvme_put_ns_head(head);
> + }
> +}
> +
> +static void nvme_remove_head_work(struct work_struct *work)
> +{
> + struct nvme_ns_head *head = container_of(to_delayed_work(work),
> + struct nvme_ns_head, remove_work);
> + bool shutdown = false;
> +
Don't you need to check here if the 'QUEUE_IF_NO_PATH' bit is still
cleared?
> + mutex_lock(&head->subsys->lock);
> + if (list_empty(&head->list)) {
> + list_del_init(&head->entry);
> + shutdown = true;
> + }
> + mutex_unlock(&head->subsys->lock);
> + if (shutdown)
> + nvme_remove_head(head);
> +
> + module_put(THIS_MODULE);
> +}
> +
> int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
> {
> struct queue_limits lim;
> @@ -626,6 +677,8 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
> spin_lock_init(&head->requeue_lock);
> INIT_WORK(&head->requeue_work, nvme_requeue_work);
> INIT_WORK(&head->partition_scan_work, nvme_partition_scan_work);
> + INIT_DELAYED_WORK(&head->remove_work, nvme_remove_head_work);
> + head->delayed_removal_secs = 0;
>
> /*
> * Add a multipath node if the subsystems supports multiple controllers.
> @@ -659,6 +712,7 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
> set_bit(GD_SUPPRESS_PART_SCAN, &head->disk->state);
> sprintf(head->disk->disk_name, "nvme%dn%d",
> ctrl->subsys->instance, head->instance);
> + nvme_tryget_ns_head(head);
What is that doing here?
Why do you take an additional reference?
> return 0;
> }
>
> @@ -1015,6 +1069,49 @@ static ssize_t numa_nodes_show(struct device *dev, struct device_attribute *attr
> }
> DEVICE_ATTR_RO(numa_nodes);
>
> +static ssize_t delayed_removal_secs_show(struct device *dev,
> + struct device_attribute *attr, char *buf)
> +{
> + struct gendisk *disk = dev_to_disk(dev);
> + struct nvme_ns_head *head = disk->private_data;
> + int ret;
> +
> + mutex_lock(&head->subsys->lock);
> + ret = sysfs_emit(buf, "%u\n", head->delayed_removal_secs);
> + mutex_unlock(&head->subsys->lock);
> + return ret;
> +}
> +
> +static ssize_t delayed_removal_secs_store(struct device *dev,
> + struct device_attribute *attr, const char *buf, size_t count)
> +{
> + struct gendisk *disk = dev_to_disk(dev);
> + struct nvme_ns_head *head = disk->private_data;
> + unsigned int sec;
> + int ret;
> +
> + ret = kstrtouint(buf, 0, &sec);
> + if (ret < 0)
> + return ret;
> +
> + mutex_lock(&head->subsys->lock);
> + head->delayed_removal_secs = sec;
> + if (sec)
> + set_bit(NVME_NSHEAD_QUEUE_IF_NO_PATH, &head->flags);
> + else
> + clear_bit(NVME_NSHEAD_QUEUE_IF_NO_PATH, &head->flags);
> + mutex_unlock(&head->subsys->lock);
> + /*
> + * Ensure that update to NVME_NSHEAD_QUEUE_IF_NO_PATH is seen
> + * by its reader.
> + */
> + synchronize_srcu(&head->srcu);
> +
> + return count;
> +}
> +
> +DEVICE_ATTR_RW(delayed_removal_secs);
> +
> static int nvme_lookup_ana_group_desc(struct nvme_ctrl *ctrl,
> struct nvme_ana_group_desc *desc, void *data)
> {
> @@ -1138,18 +1235,38 @@ void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid)
>
> void nvme_mpath_shutdown_disk(struct nvme_ns_head *head)
> {
> - if (!head->disk)
> - return;
> - if (test_and_clear_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
> - nvme_cdev_del(&head->cdev, &head->cdev_device);
> + bool shutdown = false;
> +
> + mutex_lock(&head->subsys->lock);
> + /*
> + * We are called when all paths have been removed, and at that point
> + * head->list is expected to be empty. However, nvme_remove_ns() and
> + * nvme_init_ns_head() can run concurrently, so if
> + * head->delayed_removal_secs is configured, it is possible that by the
> + * time we reach this point, head->list may no longer be empty.
> + * Therefore, we recheck head->list here. If it is no longer empty then
> + * we skip enqueuing the delayed head removal work.
> + */
> + if (!list_empty(&head->list))
> + goto out;
> +
> + if (head->delayed_removal_secs) {
> /*
> - * requeue I/O after NVME_NSHEAD_DISK_LIVE has been cleared
> - * to allow multipath to fail all I/O.
> + * Ensure that no one could remove this module while the head
> + * remove work is pending.
> */
> - synchronize_srcu(&head->srcu);
> - kblockd_schedule_work(&head->requeue_work);
> - del_gendisk(head->disk);
> + if (!try_module_get(THIS_MODULE))
> + goto out;
> + queue_delayed_work(nvme_wq, &head->remove_work,
> + head->delayed_removal_secs * HZ);
> + } else {
> + list_del_init(&head->entry);
> + shutdown = true;
> }
> +out:
> + mutex_unlock(&head->subsys->lock);
> + if (shutdown)
> + nvme_remove_head(head);
> }
>
> void nvme_mpath_remove_disk(struct nvme_ns_head *head)
> diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
> index 51e078642127..b2d02425dc3c 100644
> --- a/drivers/nvme/host/nvme.h
> +++ b/drivers/nvme/host/nvme.h
> @@ -503,7 +503,11 @@ struct nvme_ns_head {
> struct work_struct partition_scan_work;
> struct mutex lock;
> unsigned long flags;
> -#define NVME_NSHEAD_DISK_LIVE 0
> + struct delayed_work remove_work;
> + unsigned int delayed_removal_secs;
> +#define NVME_NSHEAD_DISK_LIVE 0
> +#define NVME_NSHEAD_FABRICS 1
> +#define NVME_NSHEAD_QUEUE_IF_NO_PATH 2
> struct nvme_ns __rcu *current_path[];
> #endif
> };
> @@ -986,12 +990,23 @@ extern struct device_attribute dev_attr_ana_grpid;
> extern struct device_attribute dev_attr_ana_state;
> extern struct device_attribute dev_attr_queue_depth;
> extern struct device_attribute dev_attr_numa_nodes;
> +extern struct device_attribute dev_attr_delayed_removal_secs;
> extern struct device_attribute subsys_attr_iopolicy;
>
> static inline bool nvme_disk_is_ns_head(struct gendisk *disk)
> {
> return disk->fops == &nvme_ns_head_ops;
> }
> +static inline void nvme_mpath_set_fabrics(struct nvme_ns_head *head)
> +{
> + set_bit(NVME_NSHEAD_FABRICS, &head->flags);
> +}
> +static inline bool nvme_mpath_queue_if_no_path(struct nvme_ns_head *head)
> +{
> + if (test_bit(NVME_NSHEAD_QUEUE_IF_NO_PATH, &head->flags))
> + return true;
> + return false;
> +}
> #else
> #define multipath false
> static inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl)
> @@ -1079,6 +1094,14 @@ static inline bool nvme_disk_is_ns_head(struct gendisk *disk)
> {
> return false;
> }
> +static inline void nvme_mpath_set_fabrics(struct nvme_ns_head *head)
> +{
> +
> +}
> +static inline bool nvme_mpath_queue_if_no_path(struct nvme_ns_head *head)
> +{
> + return false;
> +}
> #endif /* CONFIG_NVME_MULTIPATH */
>
> int nvme_ns_get_unique_id(struct nvme_ns *ns, u8 id[16],
> diff --git a/drivers/nvme/host/sysfs.c b/drivers/nvme/host/sysfs.c
> index 6d31226f7a4f..51633670d177 100644
> --- a/drivers/nvme/host/sysfs.c
> +++ b/drivers/nvme/host/sysfs.c
> @@ -260,6 +260,7 @@ static struct attribute *nvme_ns_attrs[] = {
> &dev_attr_ana_state.attr,
> &dev_attr_queue_depth.attr,
> &dev_attr_numa_nodes.attr,
> + &dev_attr_delayed_removal_secs.attr,
> #endif
> &dev_attr_io_passthru_err_log_enabled.attr,
> NULL,
> @@ -296,6 +297,18 @@ static umode_t nvme_ns_attrs_are_visible(struct kobject *kobj,
> if (nvme_disk_is_ns_head(dev_to_disk(dev)))
> return 0;
> }
> + if (a == &dev_attr_delayed_removal_secs.attr) {
> + struct nvme_ns_head *head = dev_to_ns_head(dev);
> + struct gendisk *disk = dev_to_disk(dev);
> +
> + /*
> + * This attribute is only valid for the head node in a non-fabric
> + * setup.
> + */
> + if (!nvme_disk_is_ns_head(disk) ||
> + test_bit(NVME_NSHEAD_FABRICS, &head->flags))
> + return 0;
> + }
> #endif
> return a->mode;
> }
Cheers,
Hannes
--
Dr. Hannes Reinecke Kernel Storage Architect
hare at suse.de +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich