[RFC PATCHv3 1/3] nvme-multipath: introduce delayed removal of the multipath head node
Nilay Shroff
nilay at linux.ibm.com
Tue May 6 01:28:00 PDT 2025
On 5/5/25 12:09 PM, Hannes Reinecke wrote:
> On 5/4/25 19:50, Nilay Shroff wrote:
>
>> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
>> index eb6ea8acb3cc..0f6b01f10c70 100644
>> --- a/drivers/nvme/host/core.c
>> +++ b/drivers/nvme/host/core.c
>> @@ -3560,7 +3560,7 @@ static struct nvme_ns_head *nvme_find_ns_head(struct nvme_ctrl *ctrl,
>> */
>> if (h->ns_id != nsid || !nvme_is_unique_nsid(ctrl, h))
>> continue;
>> - if (!list_empty(&h->list) && nvme_tryget_ns_head(h))
>> + if (nvme_tryget_ns_head(h))
>> return h;
>> }
>> @@ -3688,6 +3688,8 @@ static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl,
>> ratelimit_state_init(&head->rs_nuse, 5 * HZ, 1);
>> ratelimit_set_flags(&head->rs_nuse, RATELIMIT_MSG_ON_RELEASE);
>> kref_init(&head->ref);
>> + if (ctrl->ops->flags & NVME_F_FABRICS)
>> + nvme_mpath_set_fabrics(head);
>>
>
> Please make this a separate patch.
Yes, sure, will do that when I spin the next patchset.
>
>> if (head->ids.csi) {
>> ret = nvme_get_effects_log(ctrl, head->ids.csi, &head->effects);
>> @@ -3804,7 +3806,8 @@ static int nvme_init_ns_head(struct nvme_ns *ns, struct nvme_ns_info *info)
>> }
>> } else {
>> ret = -EINVAL;
>> - if (!info->is_shared || !head->shared) {
>> + if ((!info->is_shared || !head->shared) &&
>> + !list_empty(&head->list)) {
>> dev_err(ctrl->device,
>> "Duplicate unshared namespace %d\n",
>> info->nsid);
>> @@ -4008,7 +4011,8 @@ static void nvme_ns_remove(struct nvme_ns *ns)
>> mutex_lock(&ns->ctrl->subsys->lock);
>> list_del_rcu(&ns->siblings);
>> if (list_empty(&ns->head->list)) {
>> - list_del_init(&ns->head->entry);
>> + if (!nvme_mpath_queue_if_no_path(ns->head))
>> + list_del_init(&ns->head->entry);
>> last_path = true;
>> }
>> mutex_unlock(&ns->ctrl->subsys->lock);
>> diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
>> index 250f3da67cc9..f3f981c9f3ec 100644
>> --- a/drivers/nvme/host/multipath.c
>> +++ b/drivers/nvme/host/multipath.c
>> @@ -442,7 +442,24 @@ static bool nvme_available_path(struct nvme_ns_head *head)
>> break;
>> }
>> }
>> - return false;
>> +
>> + /*
>> + * If "head->delayed_removal_secs" is configured (i.e., non-zero), do
>> + * not immediately fail I/O. Instead, requeue the I/O for the configured
>> + * duration, anticipating that if there's a transient link failure then
>> + * it may recover within this time window. This parameter is exported to
>> + * userspace via sysfs, and its default value is zero. It is internally
>> + * mapped to NVME_NSHEAD_QUEUE_IF_NO_PATH. When delayed_removal_secs is
>> + * non-zero, this flag is set to true. When zero, the flag is cleared.
>> + *
>> + * Note: This option is not exposed to users for NVMeoF connections.
>> + * In fabric-based setups, fabric link failures are managed through
>> + * other parameters such as "reconnect_delay" and "max_reconnects",
>> + * which are handled at the respective fabric driver layer. Therefore,
>> + * "head->delayed_removal_secs" is intended exclusively for non-fabric
>> + * (e.g., PCIe) multipath configurations.
>> + */
>> + return nvme_mpath_queue_if_no_path(head);
>> }
>> static void nvme_ns_head_submit_bio(struct bio *bio)
>> @@ -617,6 +634,40 @@ static void nvme_requeue_work(struct work_struct *work)
>> }
>> }
>> +static void nvme_remove_head(struct nvme_ns_head *head)
>> +{
>> + if (test_and_clear_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
>> + /*
>> + * requeue I/O after NVME_NSHEAD_DISK_LIVE has been cleared
>> + * to allow multipath to fail all I/O.
>> + */
>> + kblockd_schedule_work(&head->requeue_work);
>> +
>> + nvme_cdev_del(&head->cdev, &head->cdev_device);
>> + synchronize_srcu(&head->srcu);
>> + del_gendisk(head->disk);
>> + nvme_put_ns_head(head);
>> + }
>> +}
>> +
>> +static void nvme_remove_head_work(struct work_struct *work)
>> +{
>> + struct nvme_ns_head *head = container_of(to_delayed_work(work),
>> + struct nvme_ns_head, remove_work);
>> + bool shutdown = false;
>> +
>
> Don't you need to check here if the 'QUEUE_IF_NO_PATH' bit is still
> cleared?
No, we don't need to do that, because the QUEUE_IF_NO_PATH flag is
tied to the head->delayed_removal_secs setting. Specifically, the
QUEUE_IF_NO_PATH flag is set when head->delayed_removal_secs is
configured to a non-zero value, and it is cleared only when
head->delayed_removal_secs is reset to zero.

For example, if the user sets delayed_removal_secs to a non-zero
value for a head node, the QUEUE_IF_NO_PATH flag is set. Even if
all paths to the shared namespace are lost, the flag remains set
until delayed_removal_secs is reset to zero.

Hence, instead of relying solely on this flag, we check here whether
head->list is empty after the delayed removal timeout has elapsed.
If head->list is not empty, a path to the namespace has been
recovered (e.g., the link failure was resolved in time), so we must
not remove the head node.

In short, in this function checking whether head->list is empty is
sufficient to determine whether the head node should be removed or
retained.
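
To make that coupling concrete, below is a rough sketch of the sysfs
store side; the handler name, the way the head is looked up from the
device, and the locking are illustrative assumptions, not taken
verbatim from this patch:

static ssize_t delayed_removal_secs_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	/* Assumption: the head disk's private_data points at the head. */
	struct nvme_ns_head *head = dev_to_disk(dev)->private_data;
	unsigned int secs;
	int ret;

	ret = kstrtouint(buf, 0, &secs);
	if (ret < 0)
		return ret;

	mutex_lock(&head->subsys->lock);
	head->delayed_removal_secs = secs;
	/* Non-zero timeout means "queue if no path"; zero disables it. */
	if (secs)
		set_bit(NVME_NSHEAD_QUEUE_IF_NO_PATH, &head->flags);
	else
		clear_bit(NVME_NSHEAD_QUEUE_IF_NO_PATH, &head->flags);
	mutex_unlock(&head->subsys->lock);

	return count;
}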
>
>> + mutex_lock(&head->subsys->lock);
>> + if (list_empty(&head->list)) {
>> + list_del_init(&head->entry);
>> + shutdown = true;
>> + }
>> + mutex_unlock(&head->subsys->lock);
>> + if (shutdown)
>> + nvme_remove_head(head);
>> +
>> + module_put(THIS_MODULE);
>> +}
>> +
>> int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
>> {
>> struct queue_limits lim;
>> @@ -626,6 +677,8 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
>> spin_lock_init(&head->requeue_lock);
>> INIT_WORK(&head->requeue_work, nvme_requeue_work);
>> INIT_WORK(&head->partition_scan_work, nvme_partition_scan_work);
>> + INIT_DELAYED_WORK(&head->remove_work, nvme_remove_head_work);
>> + head->delayed_removal_secs = 0;
>> /*
>> * Add a multipath node if the subsystems supports multiple controllers.
>> @@ -659,6 +712,7 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
>> set_bit(GD_SUPPRESS_PART_SCAN, &head->disk->state);
>> sprintf(head->disk->disk_name, "nvme%dn%d",
>> ctrl->subsys->instance, head->instance);
>> + nvme_tryget_ns_head(head);
>
> What is that doing here?
> Why do you take an additional reference?
We take an additional reference on the head node here to ensure that
it isn't freed immediately when all paths to the shared namespace
are removed. The final reference to the head node is only released
when:
- all paths to the namespace are lost, and
- no new path to the namespace has appeared even after the configured
  delayed_removal_secs (if set) has elapsed.
This ensures the head node remains valid during the grace period,
allowing recovery if a path becomes available again.
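
For illustration, the last-path teardown could then look roughly like
the sketch below; the function name and exact call site are
assumptions (only nvme_remove_head(), remove_work and the matching
module_put() appear in the hunks quoted above):

/*
 * Hypothetical sketch: invoked from namespace removal once head->list
 * has become empty (the last path is gone).
 */
static void nvme_mpath_schedule_head_removal(struct nvme_ns_head *head)
{
	if (!nvme_mpath_queue_if_no_path(head)) {
		/* No grace period configured: tear the head down now. */
		nvme_remove_head(head);
		return;
	}

	/*
	 * Pin the module so nvme_remove_head_work() can still run after
	 * the last path goes away, then arm the delayed removal. The
	 * extra head reference taken at alloc time keeps the head valid
	 * until the work either removes it or a path reappears.
	 */
	if (!try_module_get(THIS_MODULE))
		return;
	queue_delayed_work(nvme_wq, &head->remove_work,
			   head->delayed_removal_secs * HZ);
}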
Thanks,
--Nilay