[PATCHv3 5/5] nvme: support for zoned namespaces

Javier González javier at javigon.com
Wed Jun 24 05:11:23 EDT 2020


On 22.06.2020 09:25, Keith Busch wrote:
>From: Keith Busch <keith.busch at wdc.com>
>
>Add support for NVM Express Zoned Namespaces (ZNS) Command Set defined
>in NVM Express TP4053. Zoned namespaces are discovered based on their
>Command Set Identifier reported in the namespaces Namespace
>Identification Descriptor list. A successfully discovered Zoned
>Namespace will be registered with the block layer as a host managed
>zoned block device with Zone Append command support. A namespace that
>does not support append is not supported by the driver.
>
>Reviewed-by: Martin K. Petersen <martin.petersen at oracle.com>
>Signed-off-by: Hans Holmberg <hans.holmberg at wdc.com>
>Signed-off-by: Dmitry Fomichev <dmitry.fomichev at wdc.com>
>Signed-off-by: Ajay Joshi <ajay.joshi at wdc.com>
>Signed-off-by: Aravind Ramesh <aravind.ramesh at wdc.com>
>Signed-off-by: Niklas Cassel <niklas.cassel at wdc.com>
>Signed-off-by: Matias Bjørling <matias.bjorling at wdc.com>
>Signed-off-by: Damien Le Moal <damien.lemoal at wdc.com>
>Signed-off-by: Keith Busch <keith.busch at wdc.com>
>---
> block/Kconfig              |   5 +-
> drivers/nvme/host/Makefile |   1 +
> drivers/nvme/host/core.c   |  91 ++++++++++++--
> drivers/nvme/host/nvme.h   |  39 ++++++
> drivers/nvme/host/zns.c    | 245 +++++++++++++++++++++++++++++++++++++
> include/linux/nvme.h       | 114 ++++++++++++++++-
> 6 files changed, 480 insertions(+), 15 deletions(-)
> create mode 100644 drivers/nvme/host/zns.c
>
>diff --git a/block/Kconfig b/block/Kconfig
>index 9357d7302398..bbad5e8bbffe 100644
>--- a/block/Kconfig
>+++ b/block/Kconfig
>@@ -86,9 +86,10 @@ config BLK_DEV_ZONED
> 	select MQ_IOSCHED_DEADLINE
> 	help
> 	Block layer zoned block device support. This option enables
>-	support for ZAC/ZBC host-managed and host-aware zoned block devices.
>+	support for ZAC/ZBC/ZNS host-managed and host-aware zoned block
>+	devices.
>
>-	Say yes here if you have a ZAC or ZBC storage device.
>+	Say yes here if you have a ZAC, ZBC, or ZNS storage device.
>
> config BLK_DEV_THROTTLING
> 	bool "Block layer bio throttling support"
>diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile
>index fc7b26be692d..d7f6a87687b8 100644
>--- a/drivers/nvme/host/Makefile
>+++ b/drivers/nvme/host/Makefile
>@@ -13,6 +13,7 @@ nvme-core-y				:= core.o
> nvme-core-$(CONFIG_TRACING)		+= trace.o
> nvme-core-$(CONFIG_NVME_MULTIPATH)	+= multipath.o
> nvme-core-$(CONFIG_NVM)			+= lightnvm.o
>+nvme-core-$(CONFIG_BLK_DEV_ZONED)	+= zns.o
> nvme-core-$(CONFIG_FAULT_INJECTION_DEBUG_FS)	+= fault_inject.o
> nvme-core-$(CONFIG_NVME_HWMON)		+= hwmon.o
>
>diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
>index 64b7b9fc2817..885db561de17 100644
>--- a/drivers/nvme/host/core.c
>+++ b/drivers/nvme/host/core.c
>@@ -89,7 +89,7 @@ static dev_t nvme_chr_devt;
> static struct class *nvme_class;
> static struct class *nvme_subsys_class;
>
>-static int nvme_revalidate_disk(struct gendisk *disk);
>+static int _nvme_revalidate_disk(struct gendisk *disk);
> static void nvme_put_subsystem(struct nvme_subsystem *subsys);
> static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
> 					   unsigned nsid);
>@@ -287,6 +287,10 @@ void nvme_complete_rq(struct request *req)
> 			nvme_retry_req(req);
> 			return;
> 		}
>+	} else if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
>+		   req_op(req) == REQ_OP_ZONE_APPEND) {
>+		req->__sector = nvme_lba_to_sect(req->q->queuedata,
>+			le64_to_cpu(nvme_req(req)->result.u64));
> 	}
>
> 	nvme_trace_bio_complete(req, status);
>@@ -673,7 +677,8 @@ static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns,
> }
>
> static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
>-		struct request *req, struct nvme_command *cmnd)
>+		struct request *req, struct nvme_command *cmnd,
>+		enum nvme_opcode op)
> {
> 	struct nvme_ctrl *ctrl = ns->ctrl;
> 	u16 control = 0;
>@@ -687,7 +692,7 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
> 	if (req->cmd_flags & REQ_RAHEAD)
> 		dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
>
>-	cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read);
>+	cmnd->rw.opcode = op;
> 	cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id);
> 	cmnd->rw.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
> 	cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
>@@ -716,6 +721,8 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
> 		case NVME_NS_DPS_PI_TYPE2:
> 			control |= NVME_RW_PRINFO_PRCHK_GUARD |
> 					NVME_RW_PRINFO_PRCHK_REF;
>+			if (op == nvme_cmd_zone_append)
>+				control |= NVME_RW_APPEND_PIREMAP;
> 			cmnd->rw.reftag = cpu_to_le32(t10_pi_ref_tag(req));
> 			break;
> 		}
>@@ -756,6 +763,19 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
> 	case REQ_OP_FLUSH:
> 		nvme_setup_flush(ns, cmd);
> 		break;
>+	case REQ_OP_ZONE_RESET_ALL:
>+	case REQ_OP_ZONE_RESET:
>+		ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_RESET);
>+		break;
>+	case REQ_OP_ZONE_OPEN:
>+		ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_OPEN);
>+		break;
>+	case REQ_OP_ZONE_CLOSE:
>+		ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_CLOSE);
>+		break;
>+	case REQ_OP_ZONE_FINISH:
>+		ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_FINISH);
>+		break;
> 	case REQ_OP_WRITE_ZEROES:
> 		ret = nvme_setup_write_zeroes(ns, req, cmd);
> 		break;
>@@ -763,8 +783,13 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
> 		ret = nvme_setup_discard(ns, req, cmd);
> 		break;
> 	case REQ_OP_READ:
>+		ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_read);
>+		break;
> 	case REQ_OP_WRITE:
>-		ret = nvme_setup_rw(ns, req, cmd);
>+		ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_write);
>+		break;
>+	case REQ_OP_ZONE_APPEND:
>+		ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_zone_append);
> 		break;
> 	default:
> 		WARN_ON_ONCE(1);
>@@ -1391,14 +1416,23 @@ static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
> 	return effects;
> }
>
>-static void nvme_update_formats(struct nvme_ctrl *ctrl)
>+static void nvme_update_formats(struct nvme_ctrl *ctrl, u32 *effects)
> {
> 	struct nvme_ns *ns;
>
> 	down_read(&ctrl->namespaces_rwsem);
> 	list_for_each_entry(ns, &ctrl->namespaces, list)
>-		if (ns->disk && nvme_revalidate_disk(ns->disk))
>+		if (ns->disk && _nvme_revalidate_disk(ns->disk))
> 			nvme_set_queue_dying(ns);
>+		else if (blk_queue_is_zoned(ns->disk->queue)) {
>+			/*
>+			 * IO commands are required to fully revalidate a zoned
>+			 * device. Force the command effects to trigger rescan
>+			 * work so report zones can run in a context with
>+			 * unfrozen IO queues.
>+			 */
>+			*effects |= NVME_CMD_EFFECTS_NCC;
>+		}
> 	up_read(&ctrl->namespaces_rwsem);
> }
>
>@@ -1410,7 +1444,7 @@ static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects)
> 	 * this command.
> 	 */
> 	if (effects & NVME_CMD_EFFECTS_LBCC)
>-		nvme_update_formats(ctrl);
>+		nvme_update_formats(ctrl, &effects);
> 	if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) {
> 		nvme_unfreeze(ctrl);
> 		nvme_mpath_unfreeze(ctrl->subsys);
>@@ -1525,7 +1559,7 @@ static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
>  * Issue ioctl requests on the first available path.  Note that unlike normal
>  * block layer requests we will not retry failed request on another controller.
>  */
>-static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
>+struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
> 		struct nvme_ns_head **head, int *srcu_idx)
> {
> #ifdef CONFIG_NVME_MULTIPATH
>@@ -1545,7 +1579,7 @@ static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
> 	return disk->private_data;
> }
>
>-static void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx)
>+void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx)
> {
> 	if (head)
> 		srcu_read_unlock(&head->srcu, idx);
>@@ -1938,21 +1972,28 @@ static void nvme_update_disk_info(struct gendisk *disk,
>
> static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
> {
>+	unsigned lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK;
> 	struct nvme_ns *ns = disk->private_data;
> 	struct nvme_ctrl *ctrl = ns->ctrl;
>+	int ret;
> 	u32 iob;
>
> 	/*
> 	 * If identify namespace failed, use default 512 byte block size so
> 	 * block layer can use before failing read/write for 0 capacity.
> 	 */
>-	ns->lba_shift = id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ds;
>+	ns->lba_shift = id->lbaf[lbaf].ds;
> 	if (ns->lba_shift == 0)
> 		ns->lba_shift = 9;
>
> 	switch (ns->head->ids.csi) {
> 	case NVME_CSI_NVM:
> 		break;
>+	case NVME_CSI_ZNS:
>+		ret = nvme_update_zone_info(disk, ns, lbaf);
>+		if (ret)
>+			return ret;
>+		break;
> 	default:
> 		dev_warn(ctrl->device, "unknown csi:%d ns:%d\n",
> 			ns->head->ids.csi, ns->head->ns_id);
>@@ -1966,7 +2007,7 @@ static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
> 		iob = nvme_lba_to_sect(ns, le16_to_cpu(id->noiob));
>
> 	ns->features = 0;
>-	ns->ms = le16_to_cpu(id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ms);
>+	ns->ms = le16_to_cpu(id->lbaf[lbaf].ms);
> 	/* the PI implementation requires metadata equal t10 pi tuple size */
> 	if (ns->ms == sizeof(struct t10_pi_tuple))
> 		ns->pi_type = id->dps & NVME_NS_DPS_PI_MASK;
>@@ -2009,7 +2050,7 @@ static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
> 	return 0;
> }
>
>-static int nvme_revalidate_disk(struct gendisk *disk)
>+static int _nvme_revalidate_disk(struct gendisk *disk)
> {
> 	struct nvme_ns *ns = disk->private_data;
> 	struct nvme_ctrl *ctrl = ns->ctrl;
>@@ -2057,6 +2098,28 @@ static int nvme_revalidate_disk(struct gendisk *disk)
> 	return ret;
> }
>
>+static int nvme_revalidate_disk(struct gendisk *disk)
>+{
>+	int ret;
>+
>+	ret = _nvme_revalidate_disk(disk);
>+	if (ret)
>+		return ret;
>+
>+#ifdef CONFIG_BLK_DEV_ZONED
>+	if (blk_queue_is_zoned(disk->queue)) {
>+		struct nvme_ns *ns = disk->private_data;
>+		struct nvme_ctrl *ctrl = ns->ctrl;
>+
>+		ret = blk_revalidate_disk_zones(disk, NULL);
>+		if (!ret)
>+			blk_queue_max_zone_append_sectors(disk->queue,
>+							  ctrl->max_zone_append);
>+	}
>+#endif
>+	return ret;
>+}
>+
> static char nvme_pr_type(enum pr_type type)
> {
> 	switch (type) {
>@@ -2187,6 +2250,7 @@ static const struct block_device_operations nvme_fops = {
> 	.release	= nvme_release,
> 	.getgeo		= nvme_getgeo,
> 	.revalidate_disk= nvme_revalidate_disk,
>+	.report_zones	= nvme_report_zones,
> 	.pr_ops		= &nvme_pr_ops,
> };
>
>@@ -2212,6 +2276,7 @@ const struct block_device_operations nvme_ns_head_ops = {
> 	.ioctl		= nvme_ioctl,
> 	.compat_ioctl	= nvme_compat_ioctl,
> 	.getgeo		= nvme_getgeo,
>+	.report_zones	= nvme_report_zones,
> 	.pr_ops		= &nvme_pr_ops,
> };
> #endif /* CONFIG_NVME_MULTIPATH */
>@@ -4439,6 +4504,8 @@ static inline void _nvme_check_size(void)
> 	BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
> 	BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != NVME_IDENTIFY_DATA_SIZE);
> 	BUILD_BUG_ON(sizeof(struct nvme_id_ns) != NVME_IDENTIFY_DATA_SIZE);
>+	BUILD_BUG_ON(sizeof(struct nvme_id_ns_zns) != NVME_IDENTIFY_DATA_SIZE);
>+	BUILD_BUG_ON(sizeof(struct nvme_id_ctrl_zns) != NVME_IDENTIFY_DATA_SIZE);
> 	BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
> 	BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
> 	BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64);
>diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
>index 4a1982133e9a..ecf443efdf91 100644
>--- a/drivers/nvme/host/nvme.h
>+++ b/drivers/nvme/host/nvme.h
>@@ -238,6 +238,9 @@ struct nvme_ctrl {
> 	u32 max_hw_sectors;
> 	u32 max_segments;
> 	u32 max_integrity_segments;
>+#ifdef CONFIG_BLK_DEV_ZONED
>+	u32 max_zone_append;
>+#endif
> 	u16 crdt[3];
> 	u16 oncs;
> 	u16 oacs;
>@@ -402,6 +405,9 @@ struct nvme_ns {
> 	u16 sgs;
> 	u32 sws;
> 	u8 pi_type;
>+#ifdef CONFIG_BLK_DEV_ZONED
>+	u64 zsze;
>+#endif
> 	unsigned long features;
> 	unsigned long flags;
> #define NVME_NS_REMOVING	0
>@@ -567,6 +573,9 @@ int nvme_delete_ctrl(struct nvme_ctrl *ctrl);
>
> int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi,
> 		void *log, size_t size, u64 offset);
>+struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
>+		struct nvme_ns_head **head, int *srcu_idx);
>+void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx);
>
> extern const struct attribute_group *nvme_ns_id_attr_groups[];
> extern const struct block_device_operations nvme_ns_head_ops;
>@@ -688,6 +697,36 @@ static inline void nvme_mpath_start_freeze(struct nvme_subsystem *subsys)
> }
> #endif /* CONFIG_NVME_MULTIPATH */
>
>+#ifdef CONFIG_BLK_DEV_ZONED
>+int nvme_update_zone_info(struct gendisk *disk, struct nvme_ns *ns,
>+			  unsigned lbaf);
>+
>+int nvme_report_zones(struct gendisk *disk, sector_t sector,
>+		      unsigned int nr_zones, report_zones_cb cb, void *data);
>+
>+blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct request *req,
>+				       struct nvme_command *cmnd,
>+				       enum nvme_zone_mgmt_action action);
>+#else
>+#define nvme_report_zones NULL
>+
>+static inline blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns,
>+		struct request *req, struct nvme_command *cmnd,
>+		enum nvme_zone_mgmt_action action)
>+{
>+	return BLK_STS_NOTSUPP;
>+}
>+
>+static inline int nvme_update_zone_info(struct gendisk *disk,
>+					struct nvme_ns *ns,
>+					unsigned lbaf)
>+{
>+	dev_warn(ns->ctrl->device,
>+		 "Please enable CONFIG_BLK_DEV_ZONED to support ZNS devices\n");
>+	return -EPROTONOSUPPORT;
>+}
>+#endif
>+
> #ifdef CONFIG_NVM
> int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node);
> void nvme_nvm_unregister(struct nvme_ns *ns);
>diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c
>new file mode 100644
>index 000000000000..ee6f49a8aee4
>--- /dev/null
>+++ b/drivers/nvme/host/zns.c
>@@ -0,0 +1,245 @@
>+// SPDX-License-Identifier: GPL-2.0
>+/*
>+ * Copyright (C) 2020 Western Digital Corporation or its affiliates.
>+ */
>+
>+#include <linux/blkdev.h>
>+#include <linux/vmalloc.h>
>+#include "nvme.h"
>+
>+static int nvme_set_max_append(struct nvme_ctrl *ctrl)
>+{
>+	struct nvme_command c = { };
>+	struct nvme_id_ctrl_zns *id;
>+	int status;
>+
>+	id = kzalloc(sizeof(*id), GFP_KERNEL);
>+	if (!id)
>+		return -ENOMEM;
>+
>+	c.identify.opcode = nvme_admin_identify;
>+	c.identify.cns = NVME_ID_CNS_CS_CTRL;
>+	c.identify.csi = NVME_CSI_ZNS;
>+
>+	status = nvme_submit_sync_cmd(ctrl->admin_q, &c, id, sizeof(*id));
>+	if (status) {
>+		kfree(id);
>+		return status;
>+	}
>+
>+	ctrl->max_zone_append = 1 << (id->zamds + 3);
>+	kfree(id);
>+	return 0;
>+}
>+
>+int nvme_update_zone_info(struct gendisk *disk, struct nvme_ns *ns,
>+			  unsigned lbaf)
>+{
>+	struct nvme_effects_log *log = ns->head->effects;
>+	struct request_queue *q = disk->queue;
>+	struct nvme_command c = { };
>+	struct nvme_id_ns_zns *id;
>+	int status;
>+
>+	/* Driver requires zone append support */
>+	if (!(log->iocs[nvme_cmd_zone_append] & NVME_CMD_EFFECTS_CSUPP)) {
>+		dev_warn(ns->ctrl->device,
>+			"append not supported for zoned namespace:%d\n",
>+			ns->head->ns_id);
>+		return -ENODEV;
>+	}
>+
>+	/* Lazily query controller append limit for the first zoned namespace */
>+	if (!ns->ctrl->max_zone_append) {
>+		status = nvme_set_max_append(ns->ctrl);
>+		if (status)
>+			return status;
>+	}
>+
>+	id = kzalloc(sizeof(*id), GFP_KERNEL);
>+	if (!id)
>+		return -ENOMEM;
>+
>+	c.identify.opcode = nvme_admin_identify;
>+	c.identify.nsid = cpu_to_le32(ns->head->ns_id);
>+	c.identify.cns = NVME_ID_CNS_CS_NS;
>+	c.identify.csi = NVME_CSI_ZNS;
>+
>+	status = nvme_submit_sync_cmd(ns->ctrl->admin_q, &c, id, sizeof(*id));
>+	if (status)
>+		goto free_data;
>+
>+	/*
>+	 * We currently do not handle devices requiring any of the zoned
>+	 * operation characteristics.
>+	 */
>+	if (id->zoc) {
>+		dev_warn(ns->ctrl->device,
>+			"zone operations:%x not supported for namespace:%d\n",
>+			le16_to_cpu(id->zoc), ns->head->ns_id);
>+		status = -EINVAL;
>+		goto free_data;
>+	}
>+
>+	ns->zsze = nvme_lba_to_sect(ns, le64_to_cpu(id->lbafe[lbaf].zsze));
>+	if (!ns->zsze) {
>+		status = -EINVAL;
>+		goto free_data;
>+	}
>+
>+	q->limits.zoned = BLK_ZONED_HM;
>+	blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q);
>+free_data:
>+	kfree(id);
>+	return status;
>+}
>+
>+static void *nvme_zns_alloc_report_buffer(struct nvme_ns *ns,
>+					  unsigned int nr_zones, size_t *buflen)
>+{
>+	struct request_queue *q = ns->disk->queue;
>+	size_t bufsize;
>+	void *buf;
>+
>+	const size_t min_bufsize = sizeof(struct nvme_zone_report) +
>+				   sizeof(struct nvme_zone_descriptor);
>+
>+	nr_zones = min_t(unsigned int, nr_zones,
>+			 get_capacity(ns->disk) >> ilog2(ns->zsze));
>+
>+	bufsize = sizeof(struct nvme_zone_report) +
>+		nr_zones * sizeof(struct nvme_zone_descriptor);
>+	bufsize = min_t(size_t, bufsize,
>+			queue_max_hw_sectors(q) << SECTOR_SHIFT);
>+	bufsize = min_t(size_t, bufsize, queue_max_segments(q) << PAGE_SHIFT);
>+
>+	while (bufsize >= min_bufsize) {
>+		buf = __vmalloc(bufsize,
>+				GFP_KERNEL | __GFP_ZERO | __GFP_NORETRY);
>+		if (buf) {
>+			*buflen = bufsize;
>+			return buf;
>+		}
>+		bufsize >>= 1;
>+	}
>+	return NULL;
>+}
>+
>+static int __nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
>+				  struct nvme_zone_report *report,
>+				  size_t buflen)
>+{
>+	struct nvme_command c = { };
>+	int ret;
>+
>+	c.zmr.opcode = nvme_cmd_zone_mgmt_recv;
>+	c.zmr.nsid = cpu_to_le32(ns->head->ns_id);
>+	c.zmr.slba = cpu_to_le64(nvme_sect_to_lba(ns, sector));
>+	c.zmr.numd = cpu_to_le32(nvme_bytes_to_numd(buflen));
>+	c.zmr.zra = NVME_ZRA_ZONE_REPORT;
>+	c.zmr.zrasf = NVME_ZRASF_ZONE_REPORT_ALL;
>+	c.zmr.pr = NVME_REPORT_ZONE_PARTIAL;
>+
>+	ret = nvme_submit_sync_cmd(ns->queue, &c, report, buflen);
>+	if (ret)
>+		return ret;
>+
>+	return le64_to_cpu(report->nr_zones);
>+}
>+
>+static int nvme_zone_parse_entry(struct nvme_ns *ns,
>+				 struct nvme_zone_descriptor *entry,
>+				 unsigned int idx, report_zones_cb cb,
>+				 void *data)
>+{
>+	struct blk_zone zone = { };
>+
>+	if ((entry->zt & 0xf) != NVME_ZONE_TYPE_SEQWRITE_REQ) {
>+		dev_err(ns->ctrl->device, "invalid zone type %#x\n",
>+				entry->zt);
>+		return -EINVAL;
>+	}
>+
>+	zone.type = BLK_ZONE_TYPE_SEQWRITE_REQ;
>+	zone.cond = entry->zs >> 4;
>+	zone.len = ns->zsze;
>+	zone.capacity = nvme_lba_to_sect(ns, le64_to_cpu(entry->zcap));
>+	zone.start = nvme_lba_to_sect(ns, le64_to_cpu(entry->zslba));
>+	zone.wp = nvme_lba_to_sect(ns, le64_to_cpu(entry->wp));
>+
>+	return cb(&zone, idx, data);
>+}
>+
>+static int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
>+			unsigned int nr_zones, report_zones_cb cb, void *data)
>+{
>+	struct nvme_zone_report *report;
>+	int ret, zone_idx = 0;
>+	unsigned int nz, i;
>+	size_t buflen;
>+
>+	report = nvme_zns_alloc_report_buffer(ns, nr_zones, &buflen);
>+	if (!report)
>+		return -ENOMEM;
>+
>+	sector &= ~(ns->zsze - 1);
>+	while (zone_idx < nr_zones && sector < get_capacity(ns->disk)) {
>+		memset(report, 0, buflen);
>+		ret = __nvme_ns_report_zones(ns, sector, report, buflen);
>+		if (ret < 0)
>+			goto out_free;
>+
>+		nz = min_t(unsigned int, ret, nr_zones);
>+		if (!nz)
>+			break;
>+
>+		for (i = 0; i < nz && zone_idx < nr_zones; i++) {
>+			ret = nvme_zone_parse_entry(ns, &report->entries[i],
>+						    zone_idx, cb, data);
>+			if (ret)
>+				goto out_free;
>+			zone_idx++;
>+		}
>+
>+		sector += ns->zsze * nz;
>+	}
>+
>+	ret = zone_idx;
>+out_free:
>+	kvfree(report);
>+	return ret;
>+}
>+
>+int nvme_report_zones(struct gendisk *disk, sector_t sector,
>+		      unsigned int nr_zones, report_zones_cb cb, void *data)
>+{
>+	struct nvme_ns_head *head = NULL;
>+	struct nvme_ns *ns;
>+	int srcu_idx, ret;
>+
>+	ns = nvme_get_ns_from_disk(disk, &head, &srcu_idx);
>+	if (unlikely(!ns))
>+		return -EWOULDBLOCK;
>+
>+	if (ns->head->ids.csi == NVME_CSI_ZNS)
>+		ret = nvme_ns_report_zones(ns, sector, nr_zones, cb, data);
>+	else
>+		ret = -EINVAL;
>+	nvme_put_ns_from_disk(head, srcu_idx);
>+
>+	return ret;
>+}
>+
>+blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct request *req,
>+		struct nvme_command *c, enum nvme_zone_mgmt_action action)
>+{
>+	c->zms.opcode = nvme_cmd_zone_mgmt_send;
>+	c->zms.nsid = cpu_to_le32(ns->head->ns_id);
>+	c->zms.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
>+	c->zms.zsa = action;
>+
>+	if (req_op(req) == REQ_OP_ZONE_RESET_ALL)
>+		c->zms.select_all = 1;
>+
>+	return BLK_STS_OK;
>+}
>diff --git a/include/linux/nvme.h b/include/linux/nvme.h
>index 95cd03e240a1..d862e5d70818 100644
>--- a/include/linux/nvme.h
>+++ b/include/linux/nvme.h
>@@ -1,6 +1,7 @@
> /* SPDX-License-Identifier: GPL-2.0 */
> /*
> * Definitions for the NVM Express interface
>  * Copyright (c) 2011-2014, Intel Corporation.
>  */
>
>@@ -374,6 +375,30 @@ struct nvme_id_ns {
> 	__u8			vs[3712];
> };
>
>+struct nvme_zns_lbafe {
>+	__le64			zsze;
>+	__u8			zdes;
>+	__u8			rsvd9[7];
>+};
>+
>+struct nvme_id_ns_zns {
>+	__le16			zoc;
>+	__le16			ozcs;
>+	__le32			mar;
>+	__le32			mor;
>+	__le32			rrl;
>+	__le32			frl;
>+	__u8			rsvd20[2796];
>+	struct nvme_zns_lbafe	lbafe[16];
>+	__u8			rsvd3072[768];
>+	__u8			vs[256];
>+};
>+
>+struct nvme_id_ctrl_zns {
>+	__u8	zamds;
>+	__u8	rsvd1[4095];
>+};
>+
> enum {
> 	NVME_ID_CNS_NS			= 0x00,
> 	NVME_ID_CNS_CTRL		= 0x01,
>@@ -392,6 +417,7 @@ enum {
>
> enum {
> 	NVME_CSI_NVM			= 0,
>+	NVME_CSI_ZNS			= 2,
> };
>
> enum {
>@@ -532,6 +558,27 @@ struct nvme_ana_rsp_hdr {
> 	__le16	rsvd10[3];
> };
>
>+struct nvme_zone_descriptor {
>+	__u8		zt;
>+	__u8		zs;
>+	__u8		za;
>+	__u8		rsvd3[5];
>+	__le64		zcap;
>+	__le64		zslba;
>+	__le64		wp;
>+	__u8		rsvd32[32];
>+};
>+
>+enum {
>+	NVME_ZONE_TYPE_SEQWRITE_REQ	= 0x2,
>+};
>+
>+struct nvme_zone_report {
>+	__le64		nr_zones;
>+	__u8		resv8[56];
>+	struct nvme_zone_descriptor entries[];
>+};
>+
> enum {
> 	NVME_SMART_CRIT_SPARE		= 1 << 0,
> 	NVME_SMART_CRIT_TEMPERATURE	= 1 << 1,
>@@ -626,6 +673,9 @@ enum nvme_opcode {
> 	nvme_cmd_resv_report	= 0x0e,
> 	nvme_cmd_resv_acquire	= 0x11,
> 	nvme_cmd_resv_release	= 0x15,
>+	nvme_cmd_zone_mgmt_send	= 0x79,
>+	nvme_cmd_zone_mgmt_recv	= 0x7a,
>+	nvme_cmd_zone_append	= 0x7d,
> };
>
> #define nvme_opcode_name(opcode)	{ opcode, #opcode }
>@@ -764,6 +814,7 @@ struct nvme_rw_command {
> enum {
> 	NVME_RW_LR			= 1 << 15,
> 	NVME_RW_FUA			= 1 << 14,
>+	NVME_RW_APPEND_PIREMAP		= 1 << 9,
> 	NVME_RW_DSM_FREQ_UNSPEC		= 0,
> 	NVME_RW_DSM_FREQ_TYPICAL	= 1,
> 	NVME_RW_DSM_FREQ_RARE		= 2,
>@@ -829,6 +880,53 @@ struct nvme_write_zeroes_cmd {
> 	__le16			appmask;
> };
>
>+enum nvme_zone_mgmt_action {
>+	NVME_ZONE_CLOSE		= 0x1,
>+	NVME_ZONE_FINISH	= 0x2,
>+	NVME_ZONE_OPEN		= 0x3,
>+	NVME_ZONE_RESET		= 0x4,
>+	NVME_ZONE_OFFLINE	= 0x5,
>+	NVME_ZONE_SET_DESC_EXT	= 0x10,
>+};
>+
>+struct nvme_zone_mgmt_send_cmd {
>+	__u8			opcode;
>+	__u8			flags;
>+	__u16			command_id;
>+	__le32			nsid;
>+	__le32			cdw2[2];
>+	__le64			metadata;
>+	union nvme_data_ptr	dptr;
>+	__le64			slba;
>+	__le32			cdw12;
>+	__u8			zsa;
>+	__u8			select_all;
>+	__u8			rsvd13[2];
>+	__le32			cdw14[2];
>+};
>+
>+struct nvme_zone_mgmt_recv_cmd {
>+	__u8			opcode;
>+	__u8			flags;
>+	__u16			command_id;
>+	__le32			nsid;
>+	__le64			rsvd2[2];
>+	union nvme_data_ptr	dptr;
>+	__le64			slba;
>+	__le32			numd;
>+	__u8			zra;
>+	__u8			zrasf;
>+	__u8			pr;
>+	__u8			rsvd13;
>+	__le32			cdw14[2];
>+};
>+
>+enum {
>+	NVME_ZRA_ZONE_REPORT		= 0,
>+	NVME_ZRASF_ZONE_REPORT_ALL	= 0,
>+	NVME_REPORT_ZONE_PARTIAL	= 1,
>+};
>+
> /* Features */
>
> enum {
>@@ -1300,6 +1398,8 @@ struct nvme_command {
> 		struct nvme_format_cmd format;
> 		struct nvme_dsm_cmd dsm;
> 		struct nvme_write_zeroes_cmd write_zeroes;
>+		struct nvme_zone_mgmt_send_cmd zms;
>+		struct nvme_zone_mgmt_recv_cmd zmr;
> 		struct nvme_abort_cmd abort;
> 		struct nvme_get_log_page_command get_log_page;
> 		struct nvmf_common_command fabrics;
>@@ -1433,6 +1533,18 @@ enum {
> 	NVME_SC_DISCOVERY_RESTART	= 0x190,
> 	NVME_SC_AUTH_REQUIRED		= 0x191,
>
>+	/*
>+	 * I/O Command Set Specific - Zoned commands:
>+	 */
>+	NVME_SC_ZONE_BOUNDARY_ERROR	= 0x1b8,
>+	NVME_SC_ZONE_FULL		= 0x1b9,
>+	NVME_SC_ZONE_READ_ONLY		= 0x1ba,
>+	NVME_SC_ZONE_OFFLINE		= 0x1bb,
>+	NVME_SC_ZONE_INVALID_WRITE	= 0x1bc,
>+	NVME_SC_ZONE_TOO_MANY_ACTIVE	= 0x1bd,
>+	NVME_SC_ZONE_TOO_MANY_OPEN	= 0x1be,
>+	NVME_SC_ZONE_INVALID_TRANSITION	= 0x1bf,
>+
> 	/*
> 	 * Media and Data Integrity Errors:
> 	 */
>-- 
>2.24.1
>
>
>_______________________________________________
>Linux-nvme mailing list
>Linux-nvme at lists.infradead.org
>http://lists.infradead.org/mailman/listinfo/linux-nvme

Looks good!

Reviewed-by: Javier González <javier.gonz at samsung.com>



More information about the Linux-nvme mailing list