[PATCH RFC 5/5] block, nvme: add failed_bio callback for multipath bio failover

Keith Busch kbusch at meta.com
Tue May 19 10:23:26 PDT 2026


From: Keith Busch <kbusch at kernel.org>

The nvme driver has long used a capacity of zero to indicate that a path
isn't reachable. This creates a race condition with IO dispatch when
paths are being detached on a live system: when the block layer rejects
a bio early due to a capacity check failure, drivers with multipath
support that submit the original bio (rather than a clone) have no
interception point to redirect the bio to another path.

We don't want to have to clone the bio just for this condition, so add a
failed_bio callback to block_device_operations, called from
bio_io_error. If the callback returns true, the driver has taken
ownership of the bio and the error completion is skipped.

Implement the callback for NVMe multipath. nvme_failed_bio redirects
failing bios back to the multipath head device's requeue list for
path re-selection, but only when all three conditions are met:

  - The bio came through the multipath head (REQ_NVME_MPATH)
  - The error is a path-related error (blk_path_error)
  - The path is no longer ready (!NVME_NS_READY)

Signed-off-by: Keith Busch <kbusch at kernel.org>
---
 drivers/nvme/host/core.c      |  1 +
 drivers/nvme/host/multipath.c | 26 ++++++++++++++++++++++++++
 drivers/nvme/host/nvme.h      |  2 ++
 include/linux/bio.h           |  6 ------
 include/linux/blkdev.h        | 16 ++++++++++++++++
 5 files changed, 45 insertions(+), 6 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index c3032d6ad6b1e..ac33b4dd19127 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -2650,6 +2650,7 @@ const struct block_device_operations nvme_bdev_ops = {
 	.get_unique_id	= nvme_get_unique_id,
 	.report_zones	= nvme_report_zones,
 	.pr_ops		= &nvme_pr_ops,
+	.failed_bio	= nvme_failed_bio,
 };
 
 static int nvme_wait_ready(struct nvme_ctrl *ctrl, u32 mask, u32 val,
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 263161cb8ac06..250d0719d32cf 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -134,6 +134,53 @@ void nvme_mpath_start_freeze(struct nvme_subsystem *subsys)
 			blk_freeze_queue_start(h->disk->queue);
 }
 
+/*
+ * ->failed_bio handler for per-path namespace disks.
+ *
+ * Redirect a bio that was failed before dispatch back to the multipath head's
+ * requeue list so that a different path can be selected.  A bio is only taken
+ * if it came through the multipath head (REQ_NVME_MPATH), failed with a path
+ * error, and its namespace is no longer NVME_NS_READY.
+ *
+ * Returns true if the bio was requeued and must not be completed by the
+ * caller, false if the normal error completion should proceed.
+ */
+bool nvme_failed_bio(struct bio *bio)
+{
+	unsigned long flags;
+	struct nvme_ns *ns;
+
+	if (!(bio->bi_opf & REQ_NVME_MPATH))
+		return false;
+	if (!blk_path_error(bio->bi_status))
+		return false;
+
+	ns = bio->bi_bdev->bd_disk->queue->queuedata;
+	if (test_bit(NVME_NS_READY, &ns->flags))
+		return false;
+	nvme_mpath_clear_current_path(ns);
+
+	bio->bi_status = BLK_STS_OK;
+	bio_set_dev(bio, ns->head->disk->part0);
+	if (bio->bi_opf & REQ_POLLED) {
+		bio->bi_opf &= ~REQ_POLLED;
+		bio->bi_cookie = BLK_QC_T_NONE;
+	}
+	/*
+	 * Mirror nvme_failover_req(): the bio is resubmitted from the requeue
+	 * work rather than by the issuer, so a temporarily frozen queue must
+	 * not fail it with a spurious EAGAIN.
+	 */
+	bio->bi_opf &= ~REQ_NOWAIT;
+
+	spin_lock_irqsave(&ns->head->requeue_lock, flags);
+	bio_list_add(&ns->head->requeue_list, bio);
+	spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
+
+	kblockd_schedule_work(&ns->head->requeue_work);
+	return true;
+}
+
 void nvme_failover_req(struct request *req)
 {
 	struct nvme_ns *ns = req->q->queuedata;
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index ccd5e05dac98f..37d4f037b9a8a 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -1028,6 +1028,7 @@ void nvme_mpath_unfreeze(struct nvme_subsystem *subsys);
 void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys);
 void nvme_mpath_start_freeze(struct nvme_subsystem *subsys);
 void nvme_mpath_default_iopolicy(struct nvme_subsystem *subsys);
+bool nvme_failed_bio(struct bio *bio);
 void nvme_failover_req(struct request *req);
 void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl);
 int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head);
@@ -1079,6 +1080,7 @@ static inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl)
 {
 	return false;
 }
+#define nvme_failed_bio	NULL
 static inline void nvme_failover_req(struct request *req)
 {
 }
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 1a83a6753d70d..4f01033c32ced 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -371,12 +371,6 @@ void submit_bio(struct bio *bio);
 
 extern void bio_endio(struct bio *);
 
-static inline void bio_io_error(struct bio *bio)
-{
-	bio->bi_status = BLK_STS_IOERR;
-	bio_endio(bio);
-}
-
 static inline void bio_wouldblock_error(struct bio *bio)
 {
 	bio_set_flag(bio, BIO_QUIET);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 17270a28c66d5..75cbf496e0efa 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1686,8 +1686,24 @@ struct block_device_operations {
 	 * driver.
 	 */
 	int (*alternative_gpt_sector)(struct gendisk *disk, sector_t *sector);
+	bool (*failed_bio)(struct bio *bio);
 };
 
+static inline void bio_io_error(struct bio *bio)
+{
+	bio->bi_status = BLK_STS_IOERR;
+	/* Let the disk's driver claim the failed bio before it is completed. */
+	if (bio->bi_bdev) {
+		const struct block_device_operations *ops =
+			bio->bi_bdev->bd_disk->fops;
+
+		if (ops->failed_bio && ops->failed_bio(bio))
+			return;
+	}
+	/* Not claimed: complete the bio with the error status set above. */
+	bio_endio(bio);
+}
+
 #ifdef CONFIG_COMPAT
 extern int blkdev_compat_ptr_ioctl(struct block_device *, blk_mode_t,
 				      unsigned int, unsigned long);
-- 
2.53.0-Meta




More information about the Linux-nvme mailing list