[PATCH 2/2] nvme: add emulation for zone-append

Kanchan Joshi joshi.k at samsung.com
Tue Aug 18 01:29:36 EDT 2020


If the drive does not support zone-append natively, emulate it using a
regular write.
An emulated zone-append command write-locks the zone, preventing
concurrent appends/writes to the same zone.

To determine the start LBA for such writes, an array of 32-bit
zone-relative write-pointer (WP) positions is attached to the namespace.
The cached WP position is updated on successful completion as follows:
- APPEND/WRITE/WRITE_ZEROES/WRITE_SAME advance it by the number of
(512-byte) sectors written
- ZONE_RESET sets it to 0 for the target zone. ZONE_RESET_ALL does the
same for all zones.
- ZONE_FINISH sets it to the zone size.

On failed completion of the above requests, the cached WP position of the
target zone is marked invalid. On a subsequent zone-append to that zone, the
WP position is refreshed by querying the device (i.e. zone-report).

If an emulated append cannot proceed immediately because of the zone
write-lock or an invalid WP position, the block layer is asked to retry it.

Signed-off-by: Kanchan Joshi <joshi.k at samsung.com>
Signed-off-by: Nitesh Shetty <nj.shetty at samsung.com>
Signed-off-by: SelvaKumar S <selvakuma.s1 at samsung.com>
Signed-off-by: Javier Gonzalez <javier.gonz at samsung.com>
---
 drivers/nvme/host/core.c |  41 +++++-
 drivers/nvme/host/nvme.h |  60 ++++++++
 drivers/nvme/host/zns.c  | 306 ++++++++++++++++++++++++++++++++++++++-
 3 files changed, 398 insertions(+), 9 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 88cff309d8e4..78faddf444c3 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -287,10 +287,17 @@ void nvme_complete_rq(struct request *req)
 			nvme_retry_req(req);
 			return;
 		}
-	} else if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
-		   req_op(req) == REQ_OP_ZONE_APPEND) {
-		req->__sector = nvme_lba_to_sect(req->q->queuedata,
-			le64_to_cpu(nvme_req(req)->result.u64));
+	} else if (IS_ENABLED(CONFIG_BLK_DEV_ZONED)) {
+		bool need_wp_offset_update = false;
+		struct nvme_ns *ns = req->q->queuedata;
+		/* append-emulation requires wp update for some cmds*/
+		if (ns && nvme_is_append_emulated(ns))
+			need_wp_offset_update = nvme_need_zone_wp_update(req);
+		if (need_wp_offset_update)
+			nvme_zone_wp_update(ns, req, status);
+		else if (req_op(req) == REQ_OP_ZONE_APPEND)
+			req->__sector = nvme_lba_to_sect(ns,
+					le64_to_cpu(nvme_req(req)->result.u64));
 	}
 
 	nvme_trace_bio_complete(req, status);
@@ -456,6 +463,8 @@ static void nvme_free_ns(struct kref *kref)
 {
 	struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref);
 
+	if (nvme_is_append_emulated(ns))
+		nvme_teardown_append_emulate(ns);
 	if (ns->ndev)
 		nvme_nvm_unregister(ns);
 
@@ -809,7 +818,15 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
 		ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_write);
 		break;
 	case REQ_OP_ZONE_APPEND:
-		ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_zone_append);
+		if (!nvme_is_append_emulated(ns))
+			ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_zone_append);
+		else {
+			/* prepare append like write, and adjust lba afterwards */
+			ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_write);
+			if (ret)
+				break;
+			ret = nvme_append_to_write(ns, req, cmd);
+		}
 		break;
 	default:
 		WARN_ON_ONCE(1);
@@ -2150,7 +2167,7 @@ static int nvme_revalidate_disk(struct gendisk *disk)
 		struct nvme_ns *ns = disk->private_data;
 		struct nvme_ctrl *ctrl = ns->ctrl;
 
-		ret = blk_revalidate_disk_zones(disk, NULL);
+		ret = nvme_revalidate_disk_zones(disk);
 		if (!ret)
 			blk_queue_max_zone_append_sectors(disk->queue,
 							  ctrl->max_zone_append);
@@ -3900,6 +3917,18 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 	if (__nvme_revalidate_disk(disk, id))
 		goto out_put_disk;
 
+	/* setup append-emulation if required */
+	if (nvme_is_append_emulated(ns)) {
+		ret = nvme_setup_append_emulate(ns);
+		if (ret) {
+			dev_warn(ns->ctrl->device,
+				"append-emulation failed, zoned namespace:%d\n",
+				ns->head->ns_id);
+			nvme_clear_append_emulated(ns);
+			goto out_put_disk;
+		}
+	}
+
 	if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) {
 		ret = nvme_nvm_register(ns, disk_name, node);
 		if (ret) {
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index ebb8c3ed3885..c84d418fb001 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -421,6 +421,19 @@ enum nvme_ns_features {
 	NVME_NS_METADATA_SUPPORTED = 1 << 1, /* support getting generated md */
 };
 
+#ifdef CONFIG_BLK_DEV_ZONED
+struct nvme_za_emul {	/* per-namespace state for emulated zone-append */
+	unsigned int nr_zones;	/* bound used when walking zones_wp_offset */
+	spinlock_t zones_wp_offset_lock;	/* guards zones_wp_offset entries */
+	u32 *zones_wp_offset;	/* cached zone-relative wp, one per zone */
+	u32 *rev_wp_offset;	/* array built during revalidation, then swapped in */
+	struct work_struct zone_wp_offset_work;	/* re-reads wp from the device */
+	char *zone_wp_update_buf;	/* buffer for a single-zone report */
+	struct mutex rev_mutex;	/* serializes revalidate calls */
+	struct nvme_ns *ns;	/* owning namespace, used by the worker */
+};
+#endif
+
 struct nvme_ns {
 	struct list_head list;
 
@@ -443,6 +456,10 @@ struct nvme_ns {
 	u8 pi_type;
 #ifdef CONFIG_BLK_DEV_ZONED
 	u64 zsze;
+	/* set if append needs to be emulated */
+	u8 append_emulate;
+	/* contains all other append-emulation fields */
+	struct nvme_za_emul *za_emul;
 #endif
 	unsigned long features;
 	unsigned long flags;
@@ -759,9 +776,52 @@ int nvme_report_zones(struct gendisk *disk, sector_t sector,
 blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct request *req,
 				       struct nvme_command *cmnd,
 				       enum nvme_zone_mgmt_action action);
+
+int nvme_revalidate_disk_zones(struct gendisk *disk);
+/* append-emulation only helpers */
+int nvme_setup_append_emulate(struct nvme_ns *ns);
+void nvme_teardown_append_emulate(struct nvme_ns *ns);
+blk_status_t nvme_append_to_write(struct nvme_ns *ns, struct request *req,
+				  struct nvme_command *cmd);
+bool nvme_need_zone_wp_update(struct request *rq);
+void nvme_zone_wp_update(struct nvme_ns *ns, struct request *rq,
+			 blk_status_t status);
+void nvme_set_append_emulated(struct nvme_ns *ns);
+void nvme_clear_append_emulated(struct nvme_ns *ns);
+int nvme_is_append_emulated(struct nvme_ns *ns);
 #else
 #define nvme_report_zones NULL
 
+static inline void nvme_set_append_emulated(struct nvme_ns *ns) {}
+
+static inline void nvme_clear_append_emulated(struct nvme_ns *ns) {}
+
+static inline int nvme_is_append_emulated(struct nvme_ns *ns)
+{
+	return 0;	/* stub: append is never reported as emulated here */
+}
+
+static inline int nvme_setup_append_emulate(struct nvme_ns *ns)
+{
+	return 0;	/* stub: nothing to allocate, report success */
+}
+
+static inline void nvme_teardown_append_emulate(struct nvme_ns *ns) {}
+
+static inline blk_status_t nvme_append_to_write(struct nvme_ns *ns, struct request *req,
+						struct nvme_command *cmd)
+{
+	return BLK_STS_NOTSUPP;	/* stub: append cannot be turned into a write */
+}
+
+static inline bool nvme_need_zone_wp_update(struct request *rq)
+{
+	return false;	/* stub: there is no cached wp to maintain */
+}
+
+static inline void nvme_zone_wp_update(struct nvme_ns *ns, struct request *rq,
+			 blk_status_t status) {}
+
 static inline blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns,
 		struct request *req, struct nvme_command *cmnd,
 		enum nvme_zone_mgmt_action action)
diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c
index cabd870fb64e..0b1e9f62045a 100644
--- a/drivers/nvme/host/zns.c
+++ b/drivers/nvme/host/zns.c
@@ -7,6 +7,10 @@
 #include <linux/vmalloc.h>
 #include "nvme.h"
 
+/* used for append-emulation */
+#define ZNS_INVALID_WP_OFST  (~0u)
+#define ZNS_UPDATING_WP_OFST (ZNS_INVALID_WP_OFST - 1)
+
 static int nvme_set_max_append(struct nvme_ctrl *ctrl)
 {
 	struct nvme_command c = { };
@@ -44,13 +48,14 @@ int nvme_update_zone_info(struct gendisk *disk, struct nvme_ns *ns,
 	struct nvme_id_ns_zns *id;
 	int status;
 
-	/* Driver requires zone append support */
+	/* Driver does append-emulation if drive does not support zone-append */
 	if (!(le32_to_cpu(log->iocs[nvme_cmd_zone_append]) &
 			NVME_CMD_EFFECTS_CSUPP)) {
 		dev_warn(ns->ctrl->device,
-			"append not supported for zoned namespace:%d\n",
+			"append is emulated for zoned namespace:%d\n",
 			ns->head->ns_id);
-		return -EINVAL;
+		/* activate append-emulation */
+		nvme_set_append_emulated(ns);
 	}
 
 	/* Lazily query controller append limit for the first zoned namespace */
@@ -255,3 +260,298 @@ blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct request *req,
 
 	return BLK_STS_OK;
 }
+
+static void nvme_revalidate_zones_cb(struct gendisk *disk)
+{
+	/* same disk, hence same ns, as in nvme_revalidate_disk_zones() */
+	struct nvme_ns *ns = disk->private_data;
+	struct nvme_za_emul *za_emul = ns->za_emul;
+
+	/* publish the new array and its size; the caller frees the old one */
+	swap(za_emul->zones_wp_offset, za_emul->rev_wp_offset);
+	za_emul->nr_zones = get_capacity(disk) >> ilog2(ns->zsze);
+}
+
+int nvme_is_append_emulated(struct nvme_ns *ns)
+{
+	return ns->append_emulate == 1;	/* 1 once emulation is switched on */
+}
+
+void nvme_set_append_emulated(struct nvme_ns *ns)
+{
+	ns->append_emulate = 1;	/* zone-append will be issued as a write */
+}
+
+void nvme_clear_append_emulated(struct nvme_ns *ns)
+{
+	ns->append_emulate = 0;	/* back to native zone-append handling */
+}
+
+int nvme_revalidate_disk_zones(struct gendisk *disk)
+{
+	struct nvme_ns *ns = disk->private_data;
+	struct nvme_za_emul *za_emul;
+	unsigned int nr_zones;
+	int ret = 0;
+
+	if (!nvme_is_append_emulated(ns))
+		return blk_revalidate_disk_zones(disk, NULL);
+
+	/* serialize multiple revalidate calls */
+	za_emul = ns->za_emul;
+	mutex_lock(&za_emul->rev_mutex);
+	nr_zones = get_capacity(disk) >> ilog2(ns->zsze);
+
+	/* avoid rescan zones if possible */
+	if (nr_zones == za_emul->nr_zones && disk->queue->nr_zones == nr_zones)
+		goto unlock;
+
+	za_emul->rev_wp_offset = kvmalloc_array(nr_zones, sizeof(u32), GFP_NOIO);
+	if (!za_emul->rev_wp_offset) {
+		ret = -ENOMEM;
+		goto unlock;
+	}
+	/*
+	 * No zone report fills this array, and 0 would mean "zone is empty".
+	 * Start invalid (all 0xff bytes) so the first append fetches the wp.
+	 */
+	memset(za_emul->rev_wp_offset, ZNS_INVALID_WP_OFST,
+	       nr_zones * sizeof(u32));
+	ret = blk_revalidate_disk_zones(disk, nvme_revalidate_zones_cb);
+	/* on success the callback swapped the arrays: this frees the old one */
+	kvfree(za_emul->rev_wp_offset);
+	za_emul->rev_wp_offset = NULL;
+unlock:
+	mutex_unlock(&za_emul->rev_mutex);
+	return ret;
+}
+
+/* Map a reported zone to the zone-relative write pointer to cache. */
+static unsigned int nvme_get_zone_wp_offset(struct blk_zone *zone)
+{
+	/* a full zone counts as written up to its length */
+	if (zone->cond == BLK_ZONE_COND_FULL)
+		return zone->len;
+
+	/* open and closed zones report a usable write pointer */
+	if (zone->cond == BLK_ZONE_COND_IMP_OPEN ||
+	    zone->cond == BLK_ZONE_COND_EXP_OPEN ||
+	    zone->cond == BLK_ZONE_COND_CLOSED)
+		return zone->wp - zone->start;
+
+	/*
+	 * Empty zones and every other condition yield 0: offline and
+	 * read-only zones do not have a valid write pointer, so they are
+	 * handled as an empty zone.
+	 */
+	return 0;
+}
+
+static int nvme_update_wp_offset_cb(struct blk_zone *zone, unsigned int idx,
+				    void *data)
+{
+	struct nvme_za_emul *emul = data;	/* context handed to the parser */
+
+	lockdep_assert_held(&emul->zones_wp_offset_lock);
+	emul->zones_wp_offset[idx] = nvme_get_zone_wp_offset(zone);
+	return 0;	/* storing the offset cannot fail */
+}
+
+static void nvme_update_wp_offset_workfn(struct work_struct *work)
+{
+	struct nvme_za_emul *za_emul =
+		container_of(work, struct nvme_za_emul, zone_wp_offset_work);
+	struct nvme_zone_report *report = (void *)za_emul->zone_wp_update_buf;
+	/* the preallocated buffer holds a report with one zone descriptor */
+	const int buflen = sizeof(*report) + sizeof(struct nvme_zone_descriptor);
+	struct nvme_ns *ns = za_emul->ns;
+	unsigned long flags;
+	unsigned int zno;
+	int ret;
+
+	spin_lock_irqsave(&za_emul->zones_wp_offset_lock, flags);
+	for (zno = 0; zno < za_emul->nr_zones; zno++) {
+		if (za_emul->zones_wp_offset[zno] != ZNS_UPDATING_WP_OFST)
+			continue;
+
+		/* query the device for this one zone with the lock dropped */
+		spin_unlock_irqrestore(&za_emul->zones_wp_offset_lock, flags);
+		memset(report, 0, buflen);
+		ret = __nvme_ns_report_zones(ns, zno * ns->zsze, report, buflen);
+		spin_lock_irqsave(&za_emul->zones_wp_offset_lock, flags);
+
+		if (ret > 0)
+			nvme_zone_parse_entry(ns, &report->entries[0], zno,
+					      nvme_update_wp_offset_cb, za_emul);
+		/*
+		 * Still UPDATING: the refresh failed, and appends to this zone
+		 * would be requeued forever. Go INVALID so they can retry it.
+		 */
+		if (za_emul->zones_wp_offset[zno] == ZNS_UPDATING_WP_OFST)
+			za_emul->zones_wp_offset[zno] = ZNS_INVALID_WP_OFST;
+	}
+	spin_unlock_irqrestore(&za_emul->zones_wp_offset_lock, flags);
+	/* remove the reference obtained earlier */
+	nvme_put_ns(ns);
+}
+
+/*
+ * Emulate zone-append: write at the zone's cached wp. The zone write-lock
+ * is kept on success only; BLK_STS_RESOURCE asks the block layer to retry.
+ */
+blk_status_t nvme_append_to_write(struct nvme_ns *ns, struct request *req,
+				  struct nvme_command *cmd)
+{
+	struct nvme_za_emul *za_emul = ns->za_emul;
+	unsigned int wp_offset, zno = blk_rq_zone_no(req);
+	blk_status_t ret = BLK_STS_OK;
+	unsigned long flags;
+
+	if (!blk_req_zone_write_trylock(req))
+		return BLK_STS_RESOURCE;
+
+	spin_lock_irqsave(&za_emul->zones_wp_offset_lock, flags);
+	wp_offset = za_emul->zones_wp_offset[zno];
+	switch (wp_offset) {
+	case ZNS_INVALID_WP_OFST:
+		/*
+		 * Refresh the wp-offset from the device in a deferred worker
+		 * and postpone this request until then. The worker drops one
+		 * ns reference per run, so give ours back if already queued.
+		 */
+		kref_get(&ns->kref);
+		za_emul->zones_wp_offset[zno] = ZNS_UPDATING_WP_OFST;
+		if (!queue_work(nvme_wq, &za_emul->zone_wp_offset_work))
+			nvme_put_ns(ns);
+		fallthrough;
+	case ZNS_UPDATING_WP_OFST:
+		ret = BLK_STS_RESOURCE;
+		break;
+	default:
+		if (wp_offset + (blk_rq_bytes(req) >> SECTOR_SHIFT) > ns->zsze)
+			ret = BLK_STS_IOERR;
+	}
+	spin_unlock_irqrestore(&za_emul->zones_wp_offset_lock, flags);
+	if (ret)
+		blk_req_zone_write_unlock(req);
+	else
+		cmd->rw.slba = cpu_to_le64(nvme_sect_to_lba(ns,
+						blk_rq_pos(req) + wp_offset));
+	return ret;
+}
+
+bool nvme_need_zone_wp_update(struct request *rq)
+{
+	switch (req_op(rq)) {
+	case REQ_OP_WRITE:
+	case REQ_OP_WRITE_SAME:
+	case REQ_OP_WRITE_ZEROES:
+		return blk_rq_zone_is_seq(rq);	/* sequential zones only */
+	case REQ_OP_ZONE_APPEND:
+	case REQ_OP_ZONE_RESET:
+	case REQ_OP_ZONE_RESET_ALL:
+	case REQ_OP_ZONE_FINISH:
+		return true;	/* these always touch the cached wp */
+	default:
+		return false;
+	}
+}
+
+/*
+ * nvme_zone_wp_update - reflect a completed command in the cached zone wp
+ * @ns: namespace whose za_emul state is updated
+ * @rq: completed request; an append also gets its __sector advanced by the
+ *      cached offset and its zone write-lock released
+ * @status: completion status, non-zero when the command failed
+ *
+ * Called from nvme_complete_rq() for requests that
+ * nvme_need_zone_wp_update() selects.
+ *
+ * Takes the zones_wp_offset_lock spinlock while updating the cache.
+ */
+void nvme_zone_wp_update(struct nvme_ns *ns, struct request *rq,
+			 blk_status_t status)
+{
+	struct nvme_za_emul *za_emul = ns->za_emul;
+	const unsigned int nr_sects = blk_rq_bytes(rq) >> SECTOR_SHIFT;
+	const unsigned int zno = blk_rq_zone_no(rq);
+	const enum req_opf op = req_op(rq);
+	unsigned long flags;
+	u32 *tbl;
+
+	spin_lock_irqsave(&za_emul->zones_wp_offset_lock, flags);
+	tbl = za_emul->zones_wp_offset;
+
+	if (status) {
+		/*
+		 * Failed command: stop trusting the cached position so that
+		 * it is re-read from the device on subsequent access. For a
+		 * single zone, a refresh already in progress is left alone.
+		 */
+		if (op == REQ_OP_ZONE_RESET_ALL)
+			memset(tbl, ZNS_INVALID_WP_OFST,
+			       za_emul->nr_zones * sizeof(unsigned int));
+		else if (tbl[zno] != ZNS_UPDATING_WP_OFST)
+			tbl[zno] = ZNS_INVALID_WP_OFST;
+	} else if (op == REQ_OP_ZONE_RESET_ALL) {
+		memset(tbl, 0, za_emul->nr_zones * sizeof(unsigned int));
+	} else if (op == REQ_OP_ZONE_RESET) {
+		tbl[zno] = 0;
+	} else if (op == REQ_OP_ZONE_FINISH) {
+		tbl[zno] = ns->zsze;
+	} else if (op == REQ_OP_ZONE_APPEND || op == REQ_OP_WRITE ||
+		   op == REQ_OP_WRITE_ZEROES || op == REQ_OP_WRITE_SAME) {
+		/* an emulated append reports where its data was placed */
+		if (op == REQ_OP_ZONE_APPEND)
+			rq->__sector += tbl[zno];
+		/* every write should update the wp_offset */
+		if (tbl[zno] < ns->zsze)
+			tbl[zno] += nr_sects;
+	}
+
+	spin_unlock_irqrestore(&za_emul->zones_wp_offset_lock, flags);
+	/* release zone write-lock for append */
+	if (op == REQ_OP_ZONE_APPEND)
+		blk_req_zone_write_unlock(rq);
+}
+
+int nvme_setup_append_emulate(struct nvme_ns *ns)
+{
+	/* room for a zone report carrying exactly one zone descriptor */
+	const size_t bufsize = sizeof(struct nvme_zone_report) +
+			       sizeof(struct nvme_zone_descriptor);
+	struct nvme_za_emul *emul;
+
+	WARN_ON(ns->za_emul);
+	emul = kmalloc(sizeof(*emul), GFP_KERNEL);
+	if (!emul)
+		return -ENOMEM;
+
+	emul->zone_wp_update_buf = kzalloc(bufsize, GFP_KERNEL);
+	if (!emul->zone_wp_update_buf) {
+		kfree(emul);
+		return -ENOMEM;
+	}
+
+	/* the wp-offset arrays are only allocated during revalidation */
+	emul->zones_wp_offset = NULL;
+	emul->rev_wp_offset = NULL;
+	emul->nr_zones = get_capacity(ns->disk) >> ilog2(ns->zsze);
+	spin_lock_init(&emul->zones_wp_offset_lock);
+	mutex_init(&emul->rev_mutex);
+	INIT_WORK(&emul->zone_wp_offset_work, nvme_update_wp_offset_workfn);
+
+	emul->ns = ns;
+	ns->za_emul = emul;
+	return 0;
+}
+
+void nvme_teardown_append_emulate(struct nvme_ns *ns)
+{
+	if (WARN_ON(!ns->za_emul))	/* warn, but do not oops on NULL */
+		return;
+	kvfree(ns->za_emul->zones_wp_offset);
+	kfree(ns->za_emul->zone_wp_update_buf);
+	kfree(ns->za_emul);
+	ns->za_emul = NULL;	/* no dangling pointer after the free */
+}
-- 
2.17.1




More information about the Linux-nvme mailing list