[PATCHv4 5/5] nvme: support for zoned namespaces

Himanshu Madhani himanshu.madhani at oracle.com
Tue Jun 30 12:36:47 EDT 2020

> On Jun 29, 2020, at 2:06 PM, Keith Busch <kbusch at kernel.org> wrote:
> 
> From: Keith Busch <keith.busch at wdc.com>
> 
> Add support for the NVM Express Zoned Namespaces (ZNS) Command Set
> defined in NVM Express TP4053. Zoned namespaces are discovered based on
> the Command Set Identifier reported in each namespace's Namespace
> Identification Descriptor list. A successfully discovered zoned
> namespace is registered with the block layer as a host-managed zoned
> block device with Zone Append command support. A zoned namespace that
> does not support Zone Append is not supported by the driver.
> 
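For readers new to the multi-command-set plumbing: the CSI arrives as a
one-byte Namespace Identification Descriptor (NIDT 4) in the Identify
CNS 03h payload. The descriptor parsing was added earlier in this
series; a rough sketch of that case, for illustration only:

	case NVME_NIDT_CSI:
		/* one byte: 0 = NVM command set, 2 = ZNS */
		memcpy(&ids->csi, data + sizeof(*cur), NVME_NIDT_CSI_LEN);
		break;

Once ids->csi is populated, the csi switch in __nvme_revalidate_disk()
below routes NVME_CSI_ZNS namespaces to nvme_update_zone_info().
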
> Reviewed-by: Martin K. Petersen <martin.petersen at oracle.com>
> Reviewed-by: Johannes Thumshirn <johannes.thumshirn at wdc.com>
> Reviewed-by: Hannes Reinecke <hare at suse.de>
> Reviewed-by: Sagi Grimberg <sagi at grimberg.me>
> Reviewed-by: Javier González <javier.gonz at samsung.com>
> Signed-off-by: Hans Holmberg <hans.holmberg at wdc.com>
> Signed-off-by: Dmitry Fomichev <dmitry.fomichev at wdc.com>
> Signed-off-by: Ajay Joshi <ajay.joshi at wdc.com>
> Signed-off-by: Aravind Ramesh <aravind.ramesh at wdc.com>
> Signed-off-by: Niklas Cassel <niklas.cassel at wdc.com>
> Signed-off-by: Matias Bjørling <matias.bjorling at wdc.com>
> Signed-off-by: Damien Le Moal <damien.lemoal at wdc.com>
> Signed-off-by: Keith Busch <keith.busch at wdc.com>
> ---
> block/Kconfig              |   5 +-
> drivers/nvme/host/Makefile |   1 +
> drivers/nvme/host/core.c   |  97 ++++++++++++--
> drivers/nvme/host/nvme.h   |  39 ++++++
> drivers/nvme/host/zns.c    | 253 +++++++++++++++++++++++++++++++++++++
> include/linux/nvme.h       | 111 ++++++++++++++++
> 6 files changed, 491 insertions(+), 15 deletions(-)
> create mode 100644 drivers/nvme/host/zns.c
> 
> diff --git a/block/Kconfig b/block/Kconfig
> index 9357d7302398..bbad5e8bbffe 100644
> --- a/block/Kconfig
> +++ b/block/Kconfig
> @@ -86,9 +86,10 @@ config BLK_DEV_ZONED
> 	select MQ_IOSCHED_DEADLINE
> 	help
> 	Block layer zoned block device support. This option enables
> -	support for ZAC/ZBC host-managed and host-aware zoned block devices.
> +	support for ZAC/ZBC/ZNS host-managed and host-aware zoned block
> +	devices.
> 
> -	Say yes here if you have a ZAC or ZBC storage device.
> +	Say yes here if you have a ZAC, ZBC, or ZNS storage device.
> 
> config BLK_DEV_THROTTLING
> 	bool "Block layer bio throttling support"
> diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile
> index fc7b26be692d..d7f6a87687b8 100644
> --- a/drivers/nvme/host/Makefile
> +++ b/drivers/nvme/host/Makefile
> @@ -13,6 +13,7 @@ nvme-core-y				:= core.o
> nvme-core-$(CONFIG_TRACING)		+= trace.o
> nvme-core-$(CONFIG_NVME_MULTIPATH)	+= multipath.o
> nvme-core-$(CONFIG_NVM)			+= lightnvm.o
> +nvme-core-$(CONFIG_BLK_DEV_ZONED)	+= zns.o
> nvme-core-$(CONFIG_FAULT_INJECTION_DEBUG_FS)	+= fault_inject.o
> nvme-core-$(CONFIG_NVME_HWMON)		+= hwmon.o
> 
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index a4a61d7e793d..02b4273a0b10 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -89,7 +89,7 @@ static dev_t nvme_chr_devt;
> static struct class *nvme_class;
> static struct class *nvme_subsys_class;
> 
> -static int nvme_revalidate_disk(struct gendisk *disk);
> +static int _nvme_revalidate_disk(struct gendisk *disk);
> static void nvme_put_subsystem(struct nvme_subsystem *subsys);
> static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
> 					   unsigned nsid);
> @@ -287,6 +287,10 @@ void nvme_complete_rq(struct request *req)
> 			nvme_retry_req(req);
> 			return;
> 		}
> +	} else if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
> +		   req_op(req) == REQ_OP_ZONE_APPEND) {
> +		req->__sector = nvme_lba_to_sect(req->q->queuedata,
> +			le64_to_cpu(nvme_req(req)->result.u64));
> 	}
> 
> 	nvme_trace_bio_complete(req, status);
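
Worth spelling out: Zone Append writes at the zone's current write
pointer and the device returns the LBA it actually used in dwords 0/1
of the completion, so the hunk above rewrites req->__sector before the
block layer sees it. The conversion leans on the existing nvme.h
helper, which is roughly:

	static inline sector_t nvme_lba_to_sect(struct nvme_ns *ns, u64 lba)
	{
		/* device blocks (2^lba_shift bytes) -> 512B kernel sectors */
		return lba << (ns->lba_shift - SECTOR_SHIFT);
	}
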
> @@ -673,7 +677,8 @@ static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns,
> }
> 
> static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
> -		struct request *req, struct nvme_command *cmnd)
> +		struct request *req, struct nvme_command *cmnd,
> +		enum nvme_opcode op)
> {
> 	struct nvme_ctrl *ctrl = ns->ctrl;
> 	u16 control = 0;
> @@ -687,7 +692,7 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
> 	if (req->cmd_flags & REQ_RAHEAD)
> 		dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
> 
> -	cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read);
> +	cmnd->rw.opcode = op;
> 	cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id);
> 	cmnd->rw.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
> 	cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
> @@ -716,6 +721,8 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
> 		case NVME_NS_DPS_PI_TYPE2:
> 			control |= NVME_RW_PRINFO_PRCHK_GUARD |
> 					NVME_RW_PRINFO_PRCHK_REF;
> +			if (op == nvme_cmd_zone_append)
> +				control |= NVME_RW_APPEND_PIREMAP;
> 			cmnd->rw.reftag = cpu_to_le32(t10_pi_ref_tag(req));
> 			break;
> 		}
> @@ -756,6 +763,19 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
> 	case REQ_OP_FLUSH:
> 		nvme_setup_flush(ns, cmd);
> 		break;
> +	case REQ_OP_ZONE_RESET_ALL:
> +	case REQ_OP_ZONE_RESET:
> +		ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_RESET);
> +		break;
> +	case REQ_OP_ZONE_OPEN:
> +		ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_OPEN);
> +		break;
> +	case REQ_OP_ZONE_CLOSE:
> +		ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_CLOSE);
> +		break;
> +	case REQ_OP_ZONE_FINISH:
> +		ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_FINISH);
> +		break;
> 	case REQ_OP_WRITE_ZEROES:
> 		ret = nvme_setup_write_zeroes(ns, req, cmd);
> 		break;
> @@ -763,8 +783,13 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
> 		ret = nvme_setup_discard(ns, req, cmd);
> 		break;
> 	case REQ_OP_READ:
> +		ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_read);
> +		break;
> 	case REQ_OP_WRITE:
> -		ret = nvme_setup_rw(ns, req, cmd);
> +		ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_write);
> +		break;
> +	case REQ_OP_ZONE_APPEND:
> +		ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_zone_append);
> 		break;
> 	default:
> 		WARN_ON_ONCE(1);
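
These REQ_OP_ZONE_* cases are what the block layer's zone management
path emits, so in-kernel users get Zone Management Send without any
NVMe-specific code. A hedged usage sketch (error handling elided;
zone_start and zone_sectors are hypothetical values the caller already
knows):

	/* ends up as a Zone Management Send with ZSA = Reset */
	ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
			       zone_start, zone_sectors, GFP_KERNEL);
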
> @@ -1398,14 +1423,23 @@ static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
> 	return effects;
> }
> 
> -static void nvme_update_formats(struct nvme_ctrl *ctrl)
> +static void nvme_update_formats(struct nvme_ctrl *ctrl, u32 *effects)
> {
> 	struct nvme_ns *ns;
> 
> 	down_read(&ctrl->namespaces_rwsem);
> 	list_for_each_entry(ns, &ctrl->namespaces, list)
> -		if (ns->disk && nvme_revalidate_disk(ns->disk))
> +		if (ns->disk && _nvme_revalidate_disk(ns->disk))
> 			nvme_set_queue_dying(ns);
> +		else if (blk_queue_is_zoned(ns->disk->queue)) {
> +			/*
> +			 * IO commands are required to fully revalidate a zoned
> +			 * device. Force the command effects to trigger rescan
> +			 * work so report zones can run in a context with
> +			 * unfrozen IO queues.
> +			 */
> +			*effects |= NVME_CMD_EFFECTS_NCC;
> +		}
> 	up_read(&ctrl->namespaces_rwsem);
> }
> 
> @@ -1417,7 +1451,7 @@ static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects)
> 	 * this command.
> 	 */
> 	if (effects & NVME_CMD_EFFECTS_LBCC)
> -		nvme_update_formats(ctrl);
> +		nvme_update_formats(ctrl, &effects);
> 	if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) {
> 		nvme_unfreeze(ctrl);
> 		nvme_mpath_unfreeze(ctrl->subsys);
> @@ -1532,7 +1566,7 @@ static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
>  * Issue ioctl requests on the first available path.  Note that unlike normal
>  * block layer requests we will not retry failed request on another controller.
>  */
> -static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
> +struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
> 		struct nvme_ns_head **head, int *srcu_idx)
> {
> #ifdef CONFIG_NVME_MULTIPATH
> @@ -1552,7 +1586,7 @@ static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
> 	return disk->private_data;
> }
> 
> -static void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx)
> +void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx)
> {
> 	if (head)
> 		srcu_read_unlock(&head->srcu, idx);
> @@ -1945,23 +1979,34 @@ static void nvme_update_disk_info(struct gendisk *disk,
> 
> static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
> {
> +	unsigned lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK;
> 	struct nvme_ns *ns = disk->private_data;
> 	struct nvme_ctrl *ctrl = ns->ctrl;
> +	int ret;
> 	u32 iob;
> 
> 	/*
> 	 * If identify namespace failed, use default 512 byte block size so
> 	 * block layer can use before failing read/write for 0 capacity.
> 	 */
> -	ns->lba_shift = id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ds;
> +	ns->lba_shift = id->lbaf[lbaf].ds;
> 	if (ns->lba_shift == 0)
> 		ns->lba_shift = 9;
> 
> 	switch (ns->head->ids.csi) {
> 	case NVME_CSI_NVM:
> 		break;
> +	case NVME_CSI_ZNS:
> +		ret = nvme_update_zone_info(disk, ns, lbaf);
> +		if (ret) {
> +			dev_warn(ctrl->device,
> +				"failed to add zoned namespace:%u ret:%d\n",
> +				ns->head->ns_id, ret);
> +			return ret;
> +		}
> +		break;
> 	default:
> -		dev_warn(ctrl->device, "unknown csi:%d ns:%d\n",
> +		dev_warn(ctrl->device, "unknown csi:%u ns:%u\n",
> 			ns->head->ids.csi, ns->head->ns_id);
> 		return -ENODEV;
> 	}
> @@ -1973,7 +2018,7 @@ static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
> 		iob = nvme_lba_to_sect(ns, le16_to_cpu(id->noiob));
> 
> 	ns->features = 0;
> -	ns->ms = le16_to_cpu(id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ms);
> +	ns->ms = le16_to_cpu(id->lbaf[lbaf].ms);
> 	/* the PI implementation requires metadata equal t10 pi tuple size */
> 	if (ns->ms == sizeof(struct t10_pi_tuple))
> 		ns->pi_type = id->dps & NVME_NS_DPS_PI_MASK;
> @@ -2015,7 +2060,7 @@ static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
> 	return 0;
> }
> 
> -static int nvme_revalidate_disk(struct gendisk *disk)
> +static int _nvme_revalidate_disk(struct gendisk *disk)
> {
> 	struct nvme_ns *ns = disk->private_data;
> 	struct nvme_ctrl *ctrl = ns->ctrl;
> @@ -2063,6 +2108,28 @@ static int nvme_revalidate_disk(struct gendisk *disk)
> 	return ret;
> }
> 
> +static int nvme_revalidate_disk(struct gendisk *disk)
> +{
> +	int ret;
> +
> +	ret = _nvme_revalidate_disk(disk);
> +	if (ret)
> +		return ret;
> +
> +#ifdef CONFIG_BLK_DEV_ZONED
> +	if (blk_queue_is_zoned(disk->queue)) {
> +		struct nvme_ns *ns = disk->private_data;
> +		struct nvme_ctrl *ctrl = ns->ctrl;
> +
> +		ret = blk_revalidate_disk_zones(disk, NULL);
> +		if (!ret)
> +			blk_queue_max_zone_append_sectors(disk->queue,
> +							  ctrl->max_zone_append);
> +	}
> +#endif
> +	return ret;
> +}
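
Tying this back to the NVME_CMD_EFFECTS_NCC forcing in
nvme_update_formats(): blk_revalidate_disk_zones() issues Zone
Management Receive through the I/O queues, so it cannot run from the
passthrough path while those queues are frozen. Forcing the rescan
effect defers this function to the scan work, where the queues are
live.
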
> +
> static char nvme_pr_type(enum pr_type type)
> {
> 	switch (type) {
> @@ -2193,6 +2260,7 @@ static const struct block_device_operations nvme_fops = {
> 	.release	= nvme_release,
> 	.getgeo		= nvme_getgeo,
> 	.revalidate_disk= nvme_revalidate_disk,
> +	.report_zones	= nvme_report_zones,
> 	.pr_ops		= &nvme_pr_ops,
> };
> 
> @@ -2218,6 +2286,7 @@ const struct block_device_operations nvme_ns_head_ops = {
> 	.ioctl		= nvme_ioctl,
> 	.compat_ioctl	= nvme_compat_ioctl,
> 	.getgeo		= nvme_getgeo,
> +	.report_zones	= nvme_report_zones,
> 	.pr_ops		= &nvme_pr_ops,
> };
> #endif /* CONFIG_NVME_MULTIPATH */
> @@ -4445,6 +4514,8 @@ static inline void _nvme_check_size(void)
> 	BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
> 	BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != NVME_IDENTIFY_DATA_SIZE);
> 	BUILD_BUG_ON(sizeof(struct nvme_id_ns) != NVME_IDENTIFY_DATA_SIZE);
> +	BUILD_BUG_ON(sizeof(struct nvme_id_ns_zns) != NVME_IDENTIFY_DATA_SIZE);
> +	BUILD_BUG_ON(sizeof(struct nvme_id_ctrl_zns) != NVME_IDENTIFY_DATA_SIZE);
> 	BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
> 	BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
> 	BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64);
> diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
> index dd24a94c42ff..ed10bfff91e1 100644
> --- a/drivers/nvme/host/nvme.h
> +++ b/drivers/nvme/host/nvme.h
> @@ -238,6 +238,9 @@ struct nvme_ctrl {
> 	u32 max_hw_sectors;
> 	u32 max_segments;
> 	u32 max_integrity_segments;
> +#ifdef CONFIG_BLK_DEV_ZONED
> +	u32 max_zone_append;
> +#endif
> 	u16 crdt[3];
> 	u16 oncs;
> 	u16 oacs;
> @@ -404,6 +407,9 @@ struct nvme_ns {
> 	u16 sgs;
> 	u32 sws;
> 	u8 pi_type;
> +#ifdef CONFIG_BLK_DEV_ZONED
> +	u64 zsze;
> +#endif
> 	unsigned long features;
> 	unsigned long flags;
> #define NVME_NS_REMOVING	0
> @@ -571,6 +577,9 @@ int nvme_delete_ctrl(struct nvme_ctrl *ctrl);
> 
> int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi,
> 		void *log, size_t size, u64 offset);
> +struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
> +		struct nvme_ns_head **head, int *srcu_idx);
> +void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx);
> 
> extern const struct attribute_group *nvme_ns_id_attr_groups[];
> extern const struct block_device_operations nvme_ns_head_ops;
> @@ -692,6 +701,36 @@ static inline void nvme_mpath_start_freeze(struct nvme_subsystem *subsys)
> }
> #endif /* CONFIG_NVME_MULTIPATH */
> 
> +#ifdef CONFIG_BLK_DEV_ZONED
> +int nvme_update_zone_info(struct gendisk *disk, struct nvme_ns *ns,
> +			  unsigned lbaf);
> +
> +int nvme_report_zones(struct gendisk *disk, sector_t sector,
> +		      unsigned int nr_zones, report_zones_cb cb, void *data);
> +
> +blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct request *req,
> +				       struct nvme_command *cmnd,
> +				       enum nvme_zone_mgmt_action action);
> +#else
> +#define nvme_report_zones NULL
> +
> +static inline blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns,
> +		struct request *req, struct nvme_command *cmnd,
> +		enum nvme_zone_mgmt_action action)
> +{
> +	return BLK_STS_NOTSUPP;
> +}
> +
> +static inline int nvme_update_zone_info(struct gendisk *disk,
> +					struct nvme_ns *ns,
> +					unsigned lbaf)
> +{
> +	dev_warn(ns->ctrl->device,
> +		 "Please enable CONFIG_BLK_DEV_ZONED to support ZNS devices\n");
> +	return -EPROTONOSUPPORT;
> +}
> +#endif
> +
> #ifdef CONFIG_NVM
> int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node);
> void nvme_nvm_unregister(struct nvme_ns *ns);
> diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c
> new file mode 100644
> index 000000000000..c033c60ce751
> --- /dev/null
> +++ b/drivers/nvme/host/zns.c
> @@ -0,0 +1,253 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (C) 2020 Western Digital Corporation or its affiliates.
> + */
> +
> +#include <linux/blkdev.h>
> +#include <linux/vmalloc.h>
> +#include "nvme.h"
> +
> +static int nvme_set_max_append(struct nvme_ctrl *ctrl)
> +{
> +	struct nvme_command c = { };
> +	struct nvme_id_ctrl_zns *id;
> +	int status;
> +
> +	id = kzalloc(sizeof(*id), GFP_KERNEL);
> +	if (!id)
> +		return -ENOMEM;
> +
> +	c.identify.opcode = nvme_admin_identify;
> +	c.identify.cns = NVME_ID_CNS_CS_CTRL;
> +	c.identify.csi = NVME_CSI_ZNS;
> +
> +	status = nvme_submit_sync_cmd(ctrl->admin_q, &c, id, sizeof(*id));
> +	if (status) {
> +		kfree(id);
> +		return status;
> +	}
> +
> +	if (id->zasl)
> +		ctrl->max_zone_append = 1 << (id->zasl + 3);
> +	else
> +		ctrl->max_zone_append = ctrl->max_hw_sectors;
> +	kfree(id);
> +	return 0;
> +}
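
A note on the ZASL math: ZASL is a power of two in units of the minimum
memory page size, and the "+ 3" assumes the common 4 KiB MPSMIN
(4096 / 512 = 2^3 sectors per page). For example, zasl = 5 gives
1 << (5 + 3) = 256 sectors, i.e. 2^5 * 4 KiB = 128 KiB per Zone Append.
A zasl of 0 means no separate limit, hence the max_hw_sectors fallback.
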
> +
> +int nvme_update_zone_info(struct gendisk *disk, struct nvme_ns *ns,
> +			  unsigned lbaf)
> +{
> +	struct nvme_effects_log *log = ns->head->effects;
> +	struct request_queue *q = disk->queue;
> +	struct nvme_command c = { };
> +	struct nvme_id_ns_zns *id;
> +	int status;
> +
> +	/* Driver requires zone append support */
> +	if (!(log->iocs[nvme_cmd_zone_append] & NVME_CMD_EFFECTS_CSUPP)) {
> +		dev_warn(ns->ctrl->device,
> +			"append not supported for zoned namespace:%u\n",
> +			ns->head->ns_id);
> +		return -EINVAL;
> +	}
> +
> +	/* Lazily query controller append limit for the first zoned namespace */
> +	if (!ns->ctrl->max_zone_append) {
> +		status = nvme_set_max_append(ns->ctrl);
> +		if (status)
> +			return status;
> +	}
> +
> +	id = kzalloc(sizeof(*id), GFP_KERNEL);
> +	if (!id)
> +		return -ENOMEM;
> +
> +	c.identify.opcode = nvme_admin_identify;
> +	c.identify.nsid = cpu_to_le32(ns->head->ns_id);
> +	c.identify.cns = NVME_ID_CNS_CS_NS;
> +	c.identify.csi = NVME_CSI_ZNS;
> +
> +	status = nvme_submit_sync_cmd(ns->ctrl->admin_q, &c, id, sizeof(*id));
> +	if (status)
> +		goto free_data;
> +
> +	/*
> +	 * We currently do not handle devices requiring any of the zoned
> +	 * operation characteristics.
> +	 */
> +	if (id->zoc) {
> +		dev_warn(ns->ctrl->device,
> +			"zone operations:%x not supported for namespace:%u\n",
> +			le16_to_cpu(id->zoc), ns->head->ns_id);
> +		status = -EINVAL;
> +		goto free_data;
> +	}
> +
> +	ns->zsze = nvme_lba_to_sect(ns, le64_to_cpu(id->lbafe[lbaf].zsze));
> +	if (!is_power_of_2(ns->zsze)) {
> +		dev_warn(ns->ctrl->device,
> +			"invalid zone size:%llu for namespace:%u\n",
> +			ns->zsze, ns->head->ns_id);
> +		status = -EINVAL;
> +		goto free_data;
> +	}
> +
> +	q->limits.zoned = BLK_ZONED_HM;
> +	blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q);
> +free_data:
> +	kfree(id);
> +	return status;
> +}
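
The is_power_of_2() check matters because the kernel's zoned block core
assumes power-of-two zone sizes; zone lookups are shifts and masks (see
the "sector &= ~(ns->zsze - 1)" below). For example, a 256 MiB zone is
524288 512-byte sectors and passes; a namespace reporting an irregular
zone size is rejected here rather than mis-handled later.
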
> +
> +static void *nvme_zns_alloc_report_buffer(struct nvme_ns *ns,
> +					  unsigned int nr_zones, size_t *buflen)
> +{
> +	struct request_queue *q = ns->disk->queue;
> +	size_t bufsize;
> +	void *buf;
> +
> +	const size_t min_bufsize = sizeof(struct nvme_zone_report) +
> +				   sizeof(struct nvme_zone_descriptor);
> +
> +	nr_zones = min_t(unsigned int, nr_zones,
> +			 get_capacity(ns->disk) >> ilog2(ns->zsze));
> +
> +	bufsize = sizeof(struct nvme_zone_report) +
> +		nr_zones * sizeof(struct nvme_zone_descriptor);
> +	bufsize = min_t(size_t, bufsize,
> +			queue_max_hw_sectors(q) << SECTOR_SHIFT);
> +	bufsize = min_t(size_t, bufsize, queue_max_segments(q) << PAGE_SHIFT);
> +
> +	while (bufsize >= min_bufsize) {
> +		buf = __vmalloc(bufsize, GFP_KERNEL | __GFP_NORETRY);
> +		if (buf) {
> +			*buflen = bufsize;
> +			return buf;
> +		}
> +		bufsize >>= 1;
> +	}
> +	return NULL;
> +}
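
The allocation strategy mirrors what sd_zbc does for SCSI: size the
buffer for the requested zone count, clamp it to what one command can
transfer (max_hw_sectors, max_segments), then halve on vmalloc failure
until at least one descriptor fits. The report loop below retries per
buffer, so a small buffer only costs extra round trips, not
correctness.
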
> +
> +static int __nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
> +				  struct nvme_zone_report *report,
> +				  size_t buflen)
> +{
> +	struct nvme_command c = { };
> +	int ret;
> +
> +	c.zmr.opcode = nvme_cmd_zone_mgmt_recv;
> +	c.zmr.nsid = cpu_to_le32(ns->head->ns_id);
> +	c.zmr.slba = cpu_to_le64(nvme_sect_to_lba(ns, sector));
> +	c.zmr.numd = cpu_to_le32(nvme_bytes_to_numd(buflen));
> +	c.zmr.zra = NVME_ZRA_ZONE_REPORT;
> +	c.zmr.zrasf = NVME_ZRASF_ZONE_REPORT_ALL;
> +	c.zmr.pr = NVME_REPORT_ZONE_PARTIAL;
> +
> +	ret = nvme_submit_sync_cmd(ns->queue, &c, report, buflen);
> +	if (ret)
> +		return ret;
> +
> +	return le64_to_cpu(report->nr_zones);
> +}
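
NUMD is the spec's zero-based dword count for the returned data. The
helper used above was introduced earlier in this series and is roughly:

	static inline u32 nvme_bytes_to_numd(size_t len)
	{
		return (len >> 2) - 1;	/* e.g. 4096 bytes -> numd 1023 */
	}
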
> +
> +static int nvme_zone_parse_entry(struct nvme_ns *ns,
> +				 struct nvme_zone_descriptor *entry,
> +				 unsigned int idx, report_zones_cb cb,
> +				 void *data)
> +{
> +	struct blk_zone zone = { };
> +
> +	if ((entry->zt & 0xf) != NVME_ZONE_TYPE_SEQWRITE_REQ) {
> +		dev_err(ns->ctrl->device, "invalid zone type %#x\n",
> +				entry->zt);
> +		return -EINVAL;
> +	}
> +
> +	zone.type = BLK_ZONE_TYPE_SEQWRITE_REQ;
> +	zone.cond = entry->zs >> 4;
> +	zone.len = ns->zsze;
> +	zone.capacity = nvme_lba_to_sect(ns, le64_to_cpu(entry->zcap));
> +	zone.start = nvme_lba_to_sect(ns, le64_to_cpu(entry->zslba));
> +	zone.wp = nvme_lba_to_sect(ns, le64_to_cpu(entry->wp));
> +
> +	return cb(&zone, idx, data);
> +}
> +
> +static int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
> +			unsigned int nr_zones, report_zones_cb cb, void *data)
> +{
> +	struct nvme_zone_report *report;
> +	int ret, zone_idx = 0;
> +	unsigned int nz, i;
> +	size_t buflen;
> +
> +	report = nvme_zns_alloc_report_buffer(ns, nr_zones, &buflen);
> +	if (!report)
> +		return -ENOMEM;
> +
> +	sector &= ~(ns->zsze - 1);
> +	while (zone_idx < nr_zones && sector < get_capacity(ns->disk)) {
> +		memset(report, 0, buflen);
> +		ret = __nvme_ns_report_zones(ns, sector, report, buflen);
> +		if (ret < 0)
> +			goto out_free;
> +
> +		nz = min_t(unsigned int, ret, nr_zones);
> +		if (!nz)
> +			break;
> +
> +		for (i = 0; i < nz && zone_idx < nr_zones; i++) {
> +			ret = nvme_zone_parse_entry(ns, &report->entries[i],
> +						    zone_idx, cb, data);
> +			if (ret)
> +				goto out_free;
> +			zone_idx++;
> +		}
> +
> +		sector += ns->zsze * nz;
> +	}
> +
> +	if (zone_idx > 0)
> +		ret = zone_idx;
> +	else
> +		ret = -EINVAL;
> +out_free:
> +	kvfree(report);
> +	return ret;
> +}
> +
> +int nvme_report_zones(struct gendisk *disk, sector_t sector,
> +		      unsigned int nr_zones, report_zones_cb cb, void *data)
> +{
> +	struct nvme_ns_head *head = NULL;
> +	struct nvme_ns *ns;
> +	int srcu_idx, ret;
> +
> +	ns = nvme_get_ns_from_disk(disk, &head, &srcu_idx);
> +	if (unlikely(!ns))
> +		return -EWOULDBLOCK;
> +
> +	if (ns->head->ids.csi == NVME_CSI_ZNS)
> +		ret = nvme_ns_report_zones(ns, sector, nr_zones, cb, data);
> +	else
> +		ret = -EINVAL;
> +	nvme_put_ns_from_disk(head, srcu_idx);
> +
> +	return ret;
> +}
> +
> +blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct request *req,
> +		struct nvme_command *c, enum nvme_zone_mgmt_action action)
> +{
> +	c->zms.opcode = nvme_cmd_zone_mgmt_send;
> +	c->zms.nsid = cpu_to_le32(ns->head->ns_id);
> +	c->zms.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
> +	c->zms.zsa = action;
> +
> +	if (req_op(req) == REQ_OP_ZONE_RESET_ALL)
> +		c->zms.select_all = 1;
> +
> +	return BLK_STS_OK;
> +}
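
With this applied, a ZNS namespace shows up as an ordinary host-managed
zoned block device, so standard tooling works unchanged. For example,
assuming util-linux's blkzone and a hypothetical /dev/nvme0n2:

	# blkzone report --count 2 /dev/nvme0n2
	# blkzone reset --offset 0 /dev/nvme0n2

The first goes through nvme_report_zones() above, the second through
the REQ_OP_ZONE_RESET case in nvme_setup_cmd().
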
> diff --git a/include/linux/nvme.h b/include/linux/nvme.h
> index 95cd03e240a1..1643005d21e3 100644
> --- a/include/linux/nvme.h
> +++ b/include/linux/nvme.h
> @@ -374,6 +374,30 @@ struct nvme_id_ns {
> 	__u8			vs[3712];
> };
> 
> +struct nvme_zns_lbafe {
> +	__le64			zsze;
> +	__u8			zdes;
> +	__u8			rsvd9[7];
> +};
> +
> +struct nvme_id_ns_zns {
> +	__le16			zoc;
> +	__le16			ozcs;
> +	__le32			mar;
> +	__le32			mor;
> +	__le32			rrl;
> +	__le32			frl;
> +	__u8			rsvd20[2796];
> +	struct nvme_zns_lbafe	lbafe[16];
> +	__u8			rsvd3072[768];
> +	__u8			vs[256];
> +};
> +
> +struct nvme_id_ctrl_zns {
> +	__u8	zasl;
> +	__u8	rsvd1[4095];
> +};
> +
> enum {
> 	NVME_ID_CNS_NS			= 0x00,
> 	NVME_ID_CNS_CTRL		= 0x01,
> @@ -392,6 +416,7 @@ enum {
> 
> enum {
> 	NVME_CSI_NVM			= 0,
> +	NVME_CSI_ZNS			= 2,
> };
> 
> enum {
> @@ -532,6 +557,27 @@ struct nvme_ana_rsp_hdr {
> 	__le16	rsvd10[3];
> };
> 
> +struct nvme_zone_descriptor {
> +	__u8		zt;
> +	__u8		zs;
> +	__u8		za;
> +	__u8		rsvd3[5];
> +	__le64		zcap;
> +	__le64		zslba;
> +	__le64		wp;
> +	__u8		rsvd32[32];
> +};
> +
> +enum {
> +	NVME_ZONE_TYPE_SEQWRITE_REQ	= 0x2,
> +};
> +
> +struct nvme_zone_report {
> +	__le64		nr_zones;
> +	__u8		resv8[56];
> +	struct nvme_zone_descriptor entries[];
> +};
> +
> enum {
> 	NVME_SMART_CRIT_SPARE		= 1 << 0,
> 	NVME_SMART_CRIT_TEMPERATURE	= 1 << 1,
> @@ -626,6 +672,9 @@ enum nvme_opcode {
> 	nvme_cmd_resv_report	= 0x0e,
> 	nvme_cmd_resv_acquire	= 0x11,
> 	nvme_cmd_resv_release	= 0x15,
> +	nvme_cmd_zone_mgmt_send	= 0x79,
> +	nvme_cmd_zone_mgmt_recv	= 0x7a,
> +	nvme_cmd_zone_append	= 0x7d,
> };
> 
> #define nvme_opcode_name(opcode)	{ opcode, #opcode }
> @@ -764,6 +813,7 @@ struct nvme_rw_command {
> enum {
> 	NVME_RW_LR			= 1 << 15,
> 	NVME_RW_FUA			= 1 << 14,
> +	NVME_RW_APPEND_PIREMAP		= 1 << 9,
> 	NVME_RW_DSM_FREQ_UNSPEC		= 0,
> 	NVME_RW_DSM_FREQ_TYPICAL	= 1,
> 	NVME_RW_DSM_FREQ_RARE		= 2,
> @@ -829,6 +879,53 @@ struct nvme_write_zeroes_cmd {
> 	__le16			appmask;
> };
> 
> +enum nvme_zone_mgmt_action {
> +	NVME_ZONE_CLOSE		= 0x1,
> +	NVME_ZONE_FINISH	= 0x2,
> +	NVME_ZONE_OPEN		= 0x3,
> +	NVME_ZONE_RESET		= 0x4,
> +	NVME_ZONE_OFFLINE	= 0x5,
> +	NVME_ZONE_SET_DESC_EXT	= 0x10,
> +};
> +
> +struct nvme_zone_mgmt_send_cmd {
> +	__u8			opcode;
> +	__u8			flags;
> +	__u16			command_id;
> +	__le32			nsid;
> +	__le32			cdw2[2];
> +	__le64			metadata;
> +	union nvme_data_ptr	dptr;
> +	__le64			slba;
> +	__le32			cdw12;
> +	__u8			zsa;
> +	__u8			select_all;
> +	__u8			rsvd13[2];
> +	__le32			cdw14[2];
> +};
> +
> +struct nvme_zone_mgmt_recv_cmd {
> +	__u8			opcode;
> +	__u8			flags;
> +	__u16			command_id;
> +	__le32			nsid;
> +	__le64			rsvd2[2];
> +	union nvme_data_ptr	dptr;
> +	__le64			slba;
> +	__le32			numd;
> +	__u8			zra;
> +	__u8			zrasf;
> +	__u8			pr;
> +	__u8			rsvd13;
> +	__le32			cdw14[2];
> +};
> +
> +enum {
> +	NVME_ZRA_ZONE_REPORT		= 0,
> +	NVME_ZRASF_ZONE_REPORT_ALL	= 0,
> +	NVME_REPORT_ZONE_PARTIAL	= 1,
> +};
> +
> /* Features */
> 
> enum {
> @@ -1300,6 +1397,8 @@ struct nvme_command {
> 		struct nvme_format_cmd format;
> 		struct nvme_dsm_cmd dsm;
> 		struct nvme_write_zeroes_cmd write_zeroes;
> +		struct nvme_zone_mgmt_send_cmd zms;
> +		struct nvme_zone_mgmt_recv_cmd zmr;
> 		struct nvme_abort_cmd abort;
> 		struct nvme_get_log_page_command get_log_page;
> 		struct nvmf_common_command fabrics;
> @@ -1433,6 +1532,18 @@ enum {
> 	NVME_SC_DISCOVERY_RESTART	= 0x190,
> 	NVME_SC_AUTH_REQUIRED		= 0x191,
> 
> +	/*
> +	 * I/O Command Set Specific - Zoned commands:
> +	 */
> +	NVME_SC_ZONE_BOUNDARY_ERROR	= 0x1b8,
> +	NVME_SC_ZONE_FULL		= 0x1b9,
> +	NVME_SC_ZONE_READ_ONLY		= 0x1ba,
> +	NVME_SC_ZONE_OFFLINE		= 0x1bb,
> +	NVME_SC_ZONE_INVALID_WRITE	= 0x1bc,
> +	NVME_SC_ZONE_TOO_MANY_ACTIVE	= 0x1bd,
> +	NVME_SC_ZONE_TOO_MANY_OPEN	= 0x1be,
> +	NVME_SC_ZONE_INVALID_TRANSITION	= 0x1bf,
> +
> 	/*
> 	 * Media and Data Integrity Errors:
> 	 */
> -- 
> 2.24.1
> 

Reviewed-by: Himanshu Madhani <himanshu.madhani at oracle.com>

--
Himanshu Madhani	Oracle Linux Engineering