nvme/pcie hot plug results in /dev name change
Keith Busch
kbusch at kernel.org
Wed Feb 15 14:18:17 PST 2023
On Tue, Jan 31, 2023 at 10:27:24PM -0800, Christoph Hellwig wrote:
> On Wed, Feb 01, 2023 at 10:33:15AM +0800, Ming Lei wrote:
> > > The native nvme multipath looks like it could be leveraged to improving that
> > > user experience if we wanted to make that layer an option for non-multipath
> > > devices.
> >
> > Can you share the basic idea? Will nvme mulitpath hold the io error and
> > not propagate to upper layer until new device is probed? What if the
> > new device is probed late, and IO has been timed out and err is returned
> > to upper layer?
>
> Attached is what I did, but I'm really not sure it is the right final
> approach.
Just adding my proof-of-concept attempt to this below. It uses a hard-coded 10
second delay before proceeding with deletion. There may be some unlikely race
conditions with this, but the concept works. I tested mounting a single path
nvme namespace, and quick remove-add sequences while running file fio on the
filesystem. Everything proceeded seemlessly with no need to remount.
---
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 8698410aeb843..faccbd8bbb310 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -3974,7 +3974,7 @@ static struct nvme_ns_head *nvme_find_ns_head(struct nvme_ctrl *ctrl,
*/
if (h->ns_id != nsid || !nvme_is_unique_nsid(ctrl, h))
continue;
- if (!list_empty(&h->list) && nvme_tryget_ns_head(h))
+ if (nvme_tryget_ns_head(h))
return h;
}
@@ -4185,7 +4185,8 @@ static int nvme_init_ns_head(struct nvme_ns *ns, struct nvme_ns_info *info)
}
} else {
ret = -EINVAL;
- if (!info->is_shared || !head->shared) {
+ if ((!info->is_shared || !head->shared) &&
+ !list_empty(&head->list)) {
dev_err(ctrl->device,
"Duplicate unshared namespace %d\n",
info->nsid);
@@ -4350,8 +4351,6 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info)
static void nvme_ns_remove(struct nvme_ns *ns)
{
- bool last_path = false;
-
if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags))
return;
@@ -4371,10 +4370,6 @@ static void nvme_ns_remove(struct nvme_ns *ns)
mutex_lock(&ns->ctrl->subsys->lock);
list_del_rcu(&ns->siblings);
- if (list_empty(&ns->head->list)) {
- list_del_init(&ns->head->entry);
- last_path = true;
- }
mutex_unlock(&ns->ctrl->subsys->lock);
/* guarantee not available in head->list */
@@ -4388,8 +4383,7 @@ static void nvme_ns_remove(struct nvme_ns *ns)
list_del_init(&ns->list);
up_write(&ns->ctrl->namespaces_rwsem);
- if (last_path)
- nvme_mpath_shutdown_disk(ns->head);
+ nvme_mpath_shutdown_disk(ns->head);
nvme_put_ns(ns);
}
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index fc39d01e7b63b..c74d0326b285c 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -387,7 +387,8 @@ static void nvme_ns_head_submit_bio(struct bio *bio)
trace_block_bio_remap(bio, disk_devt(ns->head->disk),
bio->bi_iter.bi_sector);
submit_bio_noacct(bio);
- } else if (nvme_available_path(head)) {
+ } else if (nvme_available_path(head) ||
+ delayed_work_pending(&head->remove_work)) {
dev_warn_ratelimited(dev, "no usable path - requeuing I/O\n");
spin_lock_irq(&head->requeue_lock);
@@ -505,6 +506,23 @@ static void nvme_requeue_work(struct work_struct *work)
}
}
+static void nvme_remove_head_work(struct work_struct *work)
+{
+ struct nvme_ns_head *head = container_of(to_delayed_work(work),
+ struct nvme_ns_head, remove_work);
+
+ if (!list_empty(&head->list))
+ return;
+
+ list_del_init(&head->entry);
+ kblockd_schedule_work(&head->requeue_work);
+ if (test_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
+ nvme_cdev_del(&head->cdev, &head->cdev_device);
+ del_gendisk(head->disk);
+ }
+ nvme_put_ns_head(head);
+}
+
int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
{
bool vwc = false;
@@ -513,14 +531,14 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
bio_list_init(&head->requeue_list);
spin_lock_init(&head->requeue_lock);
INIT_WORK(&head->requeue_work, nvme_requeue_work);
+ INIT_DELAYED_WORK(&head->remove_work, nvme_remove_head_work);
/*
* Add a multipath node if the subsystems supports multiple controllers.
* We also do this for private namespaces as the namespace sharing flag
* could change after a rescan.
*/
- if (!(ctrl->subsys->cmic & NVME_CTRL_CMIC_MULTI_CTRL) ||
- !nvme_is_unique_nsid(ctrl, head) || !multipath)
+ if (!nvme_is_unique_nsid(ctrl, head) || !multipath)
return 0;
head->disk = blk_alloc_disk(ctrl->numa_node);
@@ -553,6 +571,7 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
vwc = true;
blk_queue_write_cache(head->disk->queue, vwc, vwc);
+ nvme_tryget_ns_head(head);
return 0;
}
@@ -871,13 +890,9 @@ void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid)
void nvme_mpath_shutdown_disk(struct nvme_ns_head *head)
{
- if (!head->disk)
+ if (!list_empty(&head->list) || !head->disk)
return;
- kblockd_schedule_work(&head->requeue_work);
- if (test_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
- nvme_cdev_del(&head->cdev, &head->cdev_device);
- del_gendisk(head->disk);
- }
+ queue_delayed_work(nvme_wq, &head->remove_work, 10 * HZ);
}
void nvme_mpath_remove_disk(struct nvme_ns_head *head)
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index bf46f122e9e1e..f6d7d826cd48a 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -446,6 +446,9 @@ struct nvme_ns_head {
struct work_struct requeue_work;
struct mutex lock;
unsigned long flags;
+
+ struct delayed_work remove_work;
+
#define NVME_NSHEAD_DISK_LIVE 0
struct nvme_ns __rcu *current_path[];
#endif
--
More information about the Linux-nvme
mailing list