nvme/pcie hot plug results in /dev name change

Keith Busch kbusch at kernel.org
Wed Feb 15 14:18:17 PST 2023


On Tue, Jan 31, 2023 at 10:27:24PM -0800, Christoph Hellwig wrote:
> On Wed, Feb 01, 2023 at 10:33:15AM +0800, Ming Lei wrote:
> > > The native nvme multipath looks like it could be leveraged to improving that
> > > user experience if we wanted to make that layer an option for non-multipath
> > > devices.
> > 
> > Can you share the basic idea? Will nvme mulitpath hold the io error and
> > not propagate to upper layer until new device is probed? What if the
> > new device is probed late, and IO has been timed out and err is returned
> > to upper layer?
> 
> Attached is what I did, but I'm really not sure it is the right final
> approach.

Just adding my proof-of-concept attempt to this below. It uses a hard-coded 10
second delay before proceeding with deletion. There may be some unlikely race
conditions with this, but the concept works. I tested mounting a single path
nvme namespace, and quick remove-add sequences while running file fio on the
filesystem. Everything proceeded seemlessly with no need to remount.

---
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 8698410aeb843..faccbd8bbb310 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -3974,7 +3974,7 @@ static struct nvme_ns_head *nvme_find_ns_head(struct nvme_ctrl *ctrl,
 		 */
 		if (h->ns_id != nsid || !nvme_is_unique_nsid(ctrl, h))
 			continue;
-		if (!list_empty(&h->list) && nvme_tryget_ns_head(h))
+		if (nvme_tryget_ns_head(h))
 			return h;
 	}
 
@@ -4185,7 +4185,8 @@ static int nvme_init_ns_head(struct nvme_ns *ns, struct nvme_ns_info *info)
 		}
 	} else {
 		ret = -EINVAL;
-		if (!info->is_shared || !head->shared) {
+		if ((!info->is_shared || !head->shared) &&
+		    !list_empty(&head->list)) {
 			dev_err(ctrl->device,
 				"Duplicate unshared namespace %d\n",
 				info->nsid);
@@ -4350,8 +4351,6 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info)
 
 static void nvme_ns_remove(struct nvme_ns *ns)
 {
-	bool last_path = false;
-
 	if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags))
 		return;
 
@@ -4371,10 +4370,6 @@ static void nvme_ns_remove(struct nvme_ns *ns)
 
 	mutex_lock(&ns->ctrl->subsys->lock);
 	list_del_rcu(&ns->siblings);
-	if (list_empty(&ns->head->list)) {
-		list_del_init(&ns->head->entry);
-		last_path = true;
-	}
 	mutex_unlock(&ns->ctrl->subsys->lock);
 
 	/* guarantee not available in head->list */
@@ -4388,8 +4383,7 @@ static void nvme_ns_remove(struct nvme_ns *ns)
 	list_del_init(&ns->list);
 	up_write(&ns->ctrl->namespaces_rwsem);
 
-	if (last_path)
-		nvme_mpath_shutdown_disk(ns->head);
+	nvme_mpath_shutdown_disk(ns->head);
 	nvme_put_ns(ns);
 }
 
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index fc39d01e7b63b..c74d0326b285c 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -387,7 +387,8 @@ static void nvme_ns_head_submit_bio(struct bio *bio)
 		trace_block_bio_remap(bio, disk_devt(ns->head->disk),
 				      bio->bi_iter.bi_sector);
 		submit_bio_noacct(bio);
-	} else if (nvme_available_path(head)) {
+	} else if (nvme_available_path(head) ||
+		   delayed_work_pending(&head->remove_work)) {
 		dev_warn_ratelimited(dev, "no usable path - requeuing I/O\n");
 
 		spin_lock_irq(&head->requeue_lock);
@@ -505,6 +506,23 @@ static void nvme_requeue_work(struct work_struct *work)
 	}
 }
 
+static void nvme_remove_head_work(struct work_struct *work)
+{
+	struct nvme_ns_head *head = container_of(to_delayed_work(work),
+					struct nvme_ns_head, remove_work);
+
+	if (!list_empty(&head->list))
+		return;
+
+	list_del_init(&head->entry);
+	kblockd_schedule_work(&head->requeue_work);
+	if (test_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
+		nvme_cdev_del(&head->cdev, &head->cdev_device);
+		del_gendisk(head->disk);
+	}
+	nvme_put_ns_head(head);
+}
+
 int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
 {
 	bool vwc = false;
@@ -513,14 +531,14 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
 	bio_list_init(&head->requeue_list);
 	spin_lock_init(&head->requeue_lock);
 	INIT_WORK(&head->requeue_work, nvme_requeue_work);
+	INIT_DELAYED_WORK(&head->remove_work, nvme_remove_head_work);
 
 	/*
 	 * Add a multipath node if the subsystems supports multiple controllers.
 	 * We also do this for private namespaces as the namespace sharing flag
 	 * could change after a rescan.
 	 */
-	if (!(ctrl->subsys->cmic & NVME_CTRL_CMIC_MULTI_CTRL) ||
-	    !nvme_is_unique_nsid(ctrl, head) || !multipath)
+	if (!nvme_is_unique_nsid(ctrl, head) || !multipath)
 		return 0;
 
 	head->disk = blk_alloc_disk(ctrl->numa_node);
@@ -553,6 +571,7 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
 	if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
 		vwc = true;
 	blk_queue_write_cache(head->disk->queue, vwc, vwc);
+	nvme_tryget_ns_head(head);
 	return 0;
 }
 
@@ -871,13 +890,9 @@ void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid)
 
 void nvme_mpath_shutdown_disk(struct nvme_ns_head *head)
 {
-	if (!head->disk)
+	if (!list_empty(&head->list) || !head->disk)
 		return;
-	kblockd_schedule_work(&head->requeue_work);
-	if (test_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
-		nvme_cdev_del(&head->cdev, &head->cdev_device);
-		del_gendisk(head->disk);
-	}
+	queue_delayed_work(nvme_wq, &head->remove_work, 10 * HZ);
 }
 
 void nvme_mpath_remove_disk(struct nvme_ns_head *head)
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index bf46f122e9e1e..f6d7d826cd48a 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -446,6 +446,9 @@ struct nvme_ns_head {
 	struct work_struct	requeue_work;
 	struct mutex		lock;
 	unsigned long		flags;
+
+	struct delayed_work	remove_work;
+
 #define NVME_NSHEAD_DISK_LIVE	0
 	struct nvme_ns __rcu	*current_path[];
 #endif
--



More information about the Linux-nvme mailing list