[PATCH V11 2/4] nvmet: add ZBD over ZNS backend support

Chaitanya Kulkarni chaitanya.kulkarni at wdc.com
Thu Mar 11 04:39:06 GMT 2021


NVMe TP 4053 – Zoned Namespaces (ZNS) allows host software to
communicate with a non-volatile memory subsystem using zones for 
NVMe protocol-based controllers. NVMeOF already support the ZNS NVMe
Protocol compliant devices on the target in the passthru mode. There
are Generic zoned block devices like  Shingled Magnetic Recording (SMR)
HDDs that are not based on the NVMe protocol.

This patch adds ZNS backend to support the ZBDs for NVMeOF target.

This support includes implementing the new command set NVME_CSI_ZNS,
adding different command handlers for ZNS command set such as
NVMe Identify Controller, NVMe Identify Namespace, NVMe Zone Append,
NVMe Zone Management Send and NVMe Zone Management Receive.

With the new command set identifier, we also update the target command
effects logs to reflect the ZNS compliant commands.

Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni at wdc.com>
---
 drivers/nvme/target/Makefile      |   1 +
 drivers/nvme/target/admin-cmd.c   |  27 +++
 drivers/nvme/target/io-cmd-bdev.c |  34 +++-
 drivers/nvme/target/nvmet.h       |  38 ++++
 drivers/nvme/target/zns.c         | 327 ++++++++++++++++++++++++++++++
 5 files changed, 419 insertions(+), 8 deletions(-)
 create mode 100644 drivers/nvme/target/zns.c

diff --git a/drivers/nvme/target/Makefile b/drivers/nvme/target/Makefile
index ebf91fc4c72e..9837e580fa7e 100644
--- a/drivers/nvme/target/Makefile
+++ b/drivers/nvme/target/Makefile
@@ -12,6 +12,7 @@ obj-$(CONFIG_NVME_TARGET_TCP)		+= nvmet-tcp.o
 nvmet-y		+= core.o configfs.o admin-cmd.o fabrics-cmd.o \
 			discovery.o io-cmd-file.o io-cmd-bdev.o
 nvmet-$(CONFIG_NVME_TARGET_PASSTHRU)	+= passthru.o
+nvmet-$(CONFIG_BLK_DEV_ZONED)		+= zns.o
 nvme-loop-y	+= loop.o
 nvmet-rdma-y	+= rdma.o
 nvmet-fc-y	+= fc.o
diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index 176c8593d341..bf4876df624a 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -179,6 +179,13 @@ static void nvmet_set_csi_nvm_effects(struct nvme_effects_log *log)
 	log->iocs[nvme_cmd_write_zeroes]	= cpu_to_le32(1 << 0);
 }
 
+static void nvmet_set_csi_zns_effects(struct nvme_effects_log *log)
+{
+	log->iocs[nvme_cmd_zone_append]		= cpu_to_le32(1 << 0);
+	log->iocs[nvme_cmd_zone_mgmt_send]	= cpu_to_le32(1 << 0);
+	log->iocs[nvme_cmd_zone_mgmt_recv]	= cpu_to_le32(1 << 0);
+}
+
 static void nvmet_execute_get_log_cmd_effects_ns(struct nvmet_req *req)
 {
 	struct nvme_effects_log *log;
@@ -194,6 +201,15 @@ static void nvmet_execute_get_log_cmd_effects_ns(struct nvmet_req *req)
 	case NVME_CSI_NVM:
 		nvmet_set_csi_nvm_effects(log);
 		break;
+	case NVME_CSI_ZNS:
+		if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED)) {
+			status = NVME_SC_INVALID_IO_CMD_SET;
+			goto free;
+		}
+
+		nvmet_set_csi_nvm_effects(log);
+		nvmet_set_csi_zns_effects(log);
+		break;
 	default:
 		status = NVME_SC_INVALID_LOG_PAGE;
 		goto free;
@@ -630,6 +646,13 @@ static u16 nvmet_execute_identify_desclist_csi(struct nvmet_req *req, off_t *o)
 {
 	switch (req->ns->csi) {
 	case NVME_CSI_NVM:
+		return nvmet_copy_ns_identifier(req, NVME_NIDT_CSI,
+						NVME_NIDT_CSI_LEN,
+						&req->ns->csi, o);
+	case NVME_CSI_ZNS:
+		if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED))
+			return NVME_SC_INVALID_IO_CMD_SET;
+
 		return nvmet_copy_ns_identifier(req, NVME_NIDT_CSI,
 						NVME_NIDT_CSI_LEN,
 						&req->ns->csi, o);
@@ -682,8 +705,12 @@ static void nvmet_execute_identify(struct nvmet_req *req)
 	switch (req->cmd->identify.cns) {
 	case NVME_ID_CNS_NS:
 		return nvmet_execute_identify_ns(req);
+	case NVME_ID_CNS_CS_NS:
+		return nvmet_execute_identify_cns_cs_ns(req);
 	case NVME_ID_CNS_CTRL:
 		return nvmet_execute_identify_ctrl(req);
+	case NVME_ID_CNS_CS_CTRL:
+		return nvmet_execute_identify_cns_cs_ctrl(req);
 	case NVME_ID_CNS_NS_ACTIVE_LIST:
 		return nvmet_execute_identify_nslist(req);
 	case NVME_ID_CNS_NS_DESC_LIST:
diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c
index 9a8b3726a37c..ada0215f5e56 100644
--- a/drivers/nvme/target/io-cmd-bdev.c
+++ b/drivers/nvme/target/io-cmd-bdev.c
@@ -63,6 +63,14 @@ static void nvmet_bdev_ns_enable_integrity(struct nvmet_ns *ns)
 	}
 }
 
+void nvmet_bdev_ns_disable(struct nvmet_ns *ns)
+{
+	if (ns->bdev) {
+		blkdev_put(ns->bdev, FMODE_WRITE | FMODE_READ);
+		ns->bdev = NULL;
+	}
+}
+
 int nvmet_bdev_ns_enable(struct nvmet_ns *ns)
 {
 	int ret;
@@ -86,15 +94,16 @@ int nvmet_bdev_ns_enable(struct nvmet_ns *ns)
 	if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY_T10))
 		nvmet_bdev_ns_enable_integrity(ns);
 
-	return 0;
-}
-
-void nvmet_bdev_ns_disable(struct nvmet_ns *ns)
-{
-	if (ns->bdev) {
-		blkdev_put(ns->bdev, FMODE_WRITE | FMODE_READ);
-		ns->bdev = NULL;
+	/* bdev_is_zoned() is stubbed out of CONFIG_BLK_DEV_ZONED */
+	if (bdev_is_zoned(ns->bdev)) {
+		if (!nvmet_bdev_zns_enable(ns)) {
+			nvmet_bdev_ns_disable(ns);
+			return -EINVAL;
+		}
+		ns->csi = NVME_CSI_ZNS;
 	}
+
+	return 0;
 }
 
 void nvmet_bdev_ns_revalidate(struct nvmet_ns *ns)
@@ -448,6 +457,15 @@ u16 nvmet_bdev_parse_io_cmd(struct nvmet_req *req)
 	case nvme_cmd_write_zeroes:
 		req->execute = nvmet_bdev_execute_write_zeroes;
 		return 0;
+	case nvme_cmd_zone_append:
+		req->execute = nvmet_bdev_execute_zone_append;
+		return 0;
+	case nvme_cmd_zone_mgmt_recv:
+		req->execute = nvmet_bdev_execute_zone_mgmt_recv;
+		return 0;
+	case nvme_cmd_zone_mgmt_send:
+		req->execute = nvmet_bdev_execute_zone_mgmt_send;
+		return 0;
 	default:
 		return nvmet_report_invalid_opcode(req);
 	}
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index ee5999920155..f3fccc49de03 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -247,6 +247,10 @@ struct nvmet_subsys {
 	unsigned int		admin_timeout;
 	unsigned int		io_timeout;
 #endif /* CONFIG_NVME_TARGET_PASSTHRU */
+
+#ifdef CONFIG_BLK_DEV_ZONED
+	u8			zasl;
+#endif /* CONFIG_BLK_DEV_ZONED */
 };
 
 static inline struct nvmet_subsys *to_subsys(struct config_item *item)
@@ -584,6 +588,40 @@ static inline struct nvme_ctrl *nvmet_passthru_ctrl(struct nvmet_subsys *subsys)
 }
 #endif /* CONFIG_NVME_TARGET_PASSTHRU */
 
+#ifdef CONFIG_BLK_DEV_ZONED
+bool nvmet_bdev_zns_enable(struct nvmet_ns *ns);
+void nvmet_execute_identify_cns_cs_ctrl(struct nvmet_req *req);
+void nvmet_execute_identify_cns_cs_ns(struct nvmet_req *req);
+void nvmet_bdev_execute_zone_mgmt_recv(struct nvmet_req *req);
+void nvmet_bdev_execute_zone_mgmt_send(struct nvmet_req *req);
+void nvmet_bdev_execute_zone_append(struct nvmet_req *req);
+#else  /* CONFIG_BLK_DEV_ZONED */
+static inline bool nvmet_bdev_zns_enable(struct nvmet_ns *ns)
+{
+	return false;
+}
+static inline void
+nvmet_execute_identify_cns_cs_ctrl(struct nvmet_req *req)
+{
+}
+static inline void
+nvmet_execute_identify_cns_cs_ns(struct nvmet_req *req)
+{
+}
+static inline void
+nvmet_bdev_execute_zone_mgmt_recv(struct nvmet_req *req)
+{
+}
+static inline void
+nvmet_bdev_execute_zone_mgmt_send(struct nvmet_req *req)
+{
+}
+static inline void
+nvmet_bdev_execute_zone_append(struct nvmet_req *req)
+{
+}
+#endif /* CONFIG_BLK_DEV_ZONED */
+
 static inline struct nvme_ctrl *
 nvmet_req_passthru_ctrl(struct nvmet_req *req)
 {
diff --git a/drivers/nvme/target/zns.c b/drivers/nvme/target/zns.c
new file mode 100644
index 000000000000..8121b29df766
--- /dev/null
+++ b/drivers/nvme/target/zns.c
@@ -0,0 +1,327 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * NVMe ZNS-ZBD command implementation.
+ * Copyright (c) 2020-2021 HGST, a Western Digital Company.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/nvme.h>
+#include <linux/blkdev.h>
+#include "nvmet.h"
+
+/*
+ * We set the Memory Page Size Minimum (MPSMIN) for target controller to 0
+ * which gets added by 12 in the nvme_enable_ctrl() which results in 2^12 = 4k
+ * as page_shift value. When calculating the ZASL use shift by 12.
+ */
+#define NVMET_MPSMIN_SHIFT	12
+
+static u16 nvmet_bdev_zns_checks(struct nvmet_req *req)
+{
+	if (!bdev_is_zoned(req->ns->bdev))
+		return NVME_SC_INVALID_NS | NVME_SC_DNR;
+
+	if (req->cmd->zmr.zra != NVME_ZRA_ZONE_REPORT) {
+		req->error_loc = offsetof(struct nvme_zone_mgmt_recv_cmd, zra);
+		return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
+	}
+
+	if (req->cmd->zmr.zrasf != NVME_ZRASF_ZONE_REPORT_ALL) {
+		req->error_loc =
+			offsetof(struct nvme_zone_mgmt_recv_cmd, zrasf);
+		return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
+	}
+
+	if (req->cmd->zmr.pr != NVME_REPORT_ZONE_PARTIAL) {
+		req->error_loc = offsetof(struct nvme_zone_mgmt_recv_cmd, pr);
+		return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
+	}
+
+	return NVME_SC_SUCCESS;
+}
+
+static inline u8 nvmet_zasl(unsigned int zone_append_sects)
+{
+	/*
+	 * Zone Append Size Limit is the value experessed in the units
+	 * of minimum memory page size (i.e. 12) and is reported power of 2.
+	 */
+	return ilog2((zone_append_sects << 9) >> NVMET_MPSMIN_SHIFT);
+}
+
+static inline bool nvmet_zns_update_zasl(struct nvmet_ns *ns)
+{
+	struct request_queue *q = ns->bdev->bd_disk->queue;
+	u8 zasl = nvmet_zasl(queue_max_zone_append_sectors(q));
+
+	if (ns->subsys->zasl)
+		return ns->subsys->zasl < zasl;
+
+	ns->subsys->zasl = zasl;
+	return true;
+}
+
+static int nvmet_bdev_validate_zns_zones_cb(struct blk_zone *z,
+					    unsigned int i, void *data)
+{
+	if (z->type == BLK_ZONE_TYPE_CONVENTIONAL)
+		return -EOPNOTSUPP;
+	return 0;
+}
+
+static bool nvmet_bdev_has_conv_zones(struct block_device *bdev)
+{
+	int ret;
+
+	if (bdev->bd_disk->queue->conv_zones_bitmap)
+		return true;
+
+	ret = blkdev_report_zones(bdev, 0, blkdev_nr_zones(bdev->bd_disk),
+				  nvmet_bdev_validate_zns_zones_cb, NULL);
+
+	return ret <= 0 ? true : false;
+}
+
+bool nvmet_bdev_zns_enable(struct nvmet_ns *ns)
+{
+	if (nvmet_bdev_has_conv_zones(ns->bdev))
+		return false;
+
+	ns->blksize_shift = blksize_bits(bdev_physical_block_size(ns->bdev));
+
+	if (!nvmet_zns_update_zasl(ns))
+		return false;
+
+	return !(get_capacity(ns->bdev->bd_disk) &
+			(bdev_zone_sectors(ns->bdev) - 1));
+}
+
+void nvmet_execute_identify_cns_cs_ctrl(struct nvmet_req *req)
+{
+	u8 zasl = req->sq->ctrl->subsys->zasl;
+	struct nvmet_ctrl *ctrl = req->sq->ctrl;
+	struct nvme_id_ctrl_zns *id;
+	u16 status;
+
+	if (req->cmd->identify.csi != NVME_CSI_ZNS) {
+		req->error_loc = offsetof(struct nvme_common_command, opcode);
+		status = NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
+		goto out;
+	}
+
+	id = kzalloc(sizeof(*id), GFP_KERNEL);
+	if (!id) {
+		status = NVME_SC_INTERNAL;
+		goto out;
+	}
+
+	if (ctrl->ops->get_mdts)
+		id->zasl = min_t(u8, ctrl->ops->get_mdts(ctrl), zasl);
+	else
+		id->zasl = zasl;
+
+	status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id));
+
+	kfree(id);
+out:
+	nvmet_req_complete(req, status);
+}
+
+void nvmet_execute_identify_cns_cs_ns(struct nvmet_req *req)
+{
+	struct nvme_id_ns_zns *id_zns;
+	u64 zsze;
+	u16 status;
+
+	if (req->cmd->identify.csi != NVME_CSI_ZNS) {
+		req->error_loc = offsetof(struct nvme_common_command, opcode);
+		status = NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
+		goto out;
+	}
+
+	if (le32_to_cpu(req->cmd->identify.nsid) == NVME_NSID_ALL) {
+		req->error_loc = offsetof(struct nvme_identify, nsid);
+		status = NVME_SC_INVALID_NS | NVME_SC_DNR;
+		goto out;
+	}
+
+	id_zns = kzalloc(sizeof(*id_zns), GFP_KERNEL);
+	if (!id_zns) {
+		status = NVME_SC_INTERNAL;
+		goto out;
+	}
+
+	status = nvmet_req_find_ns(req);
+	if (status) {
+		status = NVME_SC_INTERNAL;
+		goto done;
+	}
+
+	if (!bdev_is_zoned(req->ns->bdev)) {
+		req->error_loc = offsetof(struct nvme_identify, nsid);
+		status = NVME_SC_INVALID_NS | NVME_SC_DNR;
+		goto done;
+	}
+
+	nvmet_ns_revalidate(req->ns);
+	zsze = (bdev_zone_sectors(req->ns->bdev) << 9) >>
+					req->ns->blksize_shift;
+	id_zns->lbafe[0].zsze = cpu_to_le64(zsze);
+	id_zns->mor = cpu_to_le32(bdev_max_open_zones(req->ns->bdev));
+	id_zns->mar = cpu_to_le32(bdev_max_active_zones(req->ns->bdev));
+
+done:
+	status = nvmet_copy_to_sgl(req, 0, id_zns, sizeof(*id_zns));
+	kfree(id_zns);
+out:
+	nvmet_req_complete(req, status);
+}
+
+struct nvmet_report_zone_data {
+	struct nvmet_ns *ns;
+	struct nvme_zone_report *rz;
+};
+
+static int nvmet_bdev_report_zone_cb(struct blk_zone *z, unsigned i, void *d)
+{
+	struct nvmet_report_zone_data *report_zone_data = d;
+	struct nvme_zone_descriptor *entries = report_zone_data->rz->entries;
+	struct nvmet_ns *ns = report_zone_data->ns;
+
+	entries[i].zcap = nvmet_sect_to_lba(ns, z->capacity);
+	entries[i].zslba = nvmet_sect_to_lba(ns, z->start);
+	entries[i].wp = nvmet_sect_to_lba(ns, z->wp);
+	entries[i].za = z->reset ? 1 << 2 : 0;
+	entries[i].zt = z->type;
+	entries[i].zs = z->cond << 4;
+
+	return 0;
+}
+
+void nvmet_bdev_execute_zone_mgmt_recv(struct nvmet_req *req)
+{
+	sector_t sect = nvmet_lba_to_sect(req->ns, req->cmd->zmr.slba);
+	u32 bufsize = (le32_to_cpu(req->cmd->zmr.numd) + 1) << 2;
+	struct nvmet_report_zone_data data = { .ns = req->ns };
+	unsigned int nr_zones;
+	int reported_zones;
+	u16 status;
+
+	status = nvmet_bdev_zns_checks(req);
+	if (status)
+		goto out;
+
+	data.rz = __vmalloc(bufsize, GFP_KERNEL | __GFP_NORETRY | __GFP_ZERO);
+	if (!data.rz) {
+		status = NVME_SC_INTERNAL;
+		goto out;
+	}
+
+	nr_zones = (bufsize - sizeof(struct nvme_zone_report)) /
+			sizeof(struct nvme_zone_descriptor);
+	reported_zones = blkdev_report_zones(req->ns->bdev, sect, nr_zones,
+					     nvmet_bdev_report_zone_cb, &data);
+	if (reported_zones < 0) {
+		status = NVME_SC_INTERNAL;
+		goto out_free_report_zones;
+	}
+
+	data.rz->nr_zones = cpu_to_le64(reported_zones);
+
+	status = nvmet_copy_to_sgl(req, 0, data.rz, bufsize);
+
+out_free_report_zones:
+	kvfree(data.rz);
+out:
+	nvmet_req_complete(req, status);
+}
+
+void nvmet_bdev_execute_zone_mgmt_send(struct nvmet_req *req)
+{
+	sector_t sect = nvmet_lba_to_sect(req->ns, req->cmd->zms.slba);
+	sector_t nr_sect = bdev_zone_sectors(req->ns->bdev);
+	u16 status = NVME_SC_SUCCESS;
+	u8 zsa = req->cmd->zms.zsa;
+	enum req_opf op;
+	int ret;
+	const unsigned int zsa_to_op[] = {
+		[NVME_ZONE_OPEN]	= REQ_OP_ZONE_OPEN,
+		[NVME_ZONE_CLOSE]	= REQ_OP_ZONE_CLOSE,
+		[NVME_ZONE_FINISH]	= REQ_OP_ZONE_FINISH,
+		[NVME_ZONE_RESET]	= REQ_OP_ZONE_RESET,
+	};
+
+	if (zsa > ARRAY_SIZE(zsa_to_op) || !zsa_to_op[zsa]) {
+		status = NVME_SC_INVALID_FIELD;
+		goto out;
+	}
+
+	op = zsa_to_op[zsa];
+
+	if (req->cmd->zms.select_all)
+		nr_sect = get_capacity(req->ns->bdev->bd_disk);
+
+	ret = blkdev_zone_mgmt(req->ns->bdev, op, sect, nr_sect, GFP_KERNEL);
+	if (ret)
+		status = NVME_SC_INTERNAL;
+out:
+	nvmet_req_complete(req, status);
+}
+
+void nvmet_bdev_execute_zone_append(struct nvmet_req *req)
+{
+	sector_t sect = nvmet_lba_to_sect(req->ns, req->cmd->rw.slba);
+	u16 status = NVME_SC_SUCCESS;
+	unsigned int total_len = 0;
+	struct scatterlist *sg;
+	int ret = 0, sg_cnt;
+	struct bio *bio;
+
+	if (!nvmet_check_transfer_len(req, nvmet_rw_data_len(req)))
+		return;
+
+	if (!req->sg_cnt) {
+		nvmet_req_complete(req, 0);
+		return;
+	}
+
+	if (req->transfer_len <= NVMET_MAX_INLINE_DATA_LEN) {
+		bio = &req->b.inline_bio;
+		bio_init(bio, req->inline_bvec, ARRAY_SIZE(req->inline_bvec));
+	} else {
+		bio = bio_alloc(GFP_KERNEL, req->sg_cnt);
+	}
+
+	bio_set_dev(bio, req->ns->bdev);
+	bio->bi_iter.bi_sector = sect;
+	bio->bi_opf = REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE;
+	if (req->cmd->rw.control & cpu_to_le16(NVME_RW_FUA))
+		bio->bi_opf |= REQ_FUA;
+
+	for_each_sg(req->sg, sg, req->sg_cnt, sg_cnt) {
+		struct page *p = sg_page(sg);
+		unsigned int l = sg->length;
+		unsigned int o = sg->offset;
+
+		ret = bio_add_zone_append_page(bio, p, l, o);
+		if (ret != sg->length) {
+			status = NVME_SC_INTERNAL;
+			goto out_bio_put;
+		}
+
+		total_len += sg->length;
+	}
+
+	if (total_len != nvmet_rw_data_len(req)) {
+		status = NVME_SC_INTERNAL | NVME_SC_DNR;
+		goto out_bio_put;
+	}
+
+	ret = submit_bio_wait(bio);
+	req->cqe->result.u64 = nvmet_sect_to_lba(req->ns,
+						 bio->bi_iter.bi_sector);
+
+out_bio_put:
+	if (bio != &req->b.inline_bio)
+		bio_put(bio);
+	nvmet_req_complete(req, ret < 0 ? NVME_SC_INTERNAL : status);
+}
-- 
2.22.1




More information about the Linux-nvme mailing list