[PATCH 2/9] nvmet: add ZNS support for bdev-ns

Damien Le Moal Damien.LeMoal at wdc.com
Thu Nov 26 04:06:23 EST 2020


On 2020/11/26 11:42, Chaitanya Kulkarni wrote:
> Add zns-bdev-config, id-ctrl, id-ns, zns-cmd-effects, zone-mgmt-send,
> zone-mgmt-recv and zone-append handlers for NVMeOF target to enable ZNS
> support for bdev.
> 
> Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni at wdc.com>
> ---
>  drivers/nvme/target/Makefile      |   2 +
>  drivers/nvme/target/admin-cmd.c   |   4 +-
>  drivers/nvme/target/io-cmd-file.c |   2 +-
>  drivers/nvme/target/nvmet.h       |  18 ++
>  drivers/nvme/target/zns.c         | 390 ++++++++++++++++++++++++++++++
>  5 files changed, 413 insertions(+), 3 deletions(-)
>  create mode 100644 drivers/nvme/target/zns.c
> 
> diff --git a/drivers/nvme/target/Makefile b/drivers/nvme/target/Makefile
> index ebf91fc4c72e..bc147ff2df5d 100644
> --- a/drivers/nvme/target/Makefile
> +++ b/drivers/nvme/target/Makefile
> @@ -12,6 +12,8 @@ obj-$(CONFIG_NVME_TARGET_TCP)		+= nvmet-tcp.o
>  nvmet-y		+= core.o configfs.o admin-cmd.o fabrics-cmd.o \
>  			discovery.o io-cmd-file.o io-cmd-bdev.o
>  nvmet-$(CONFIG_NVME_TARGET_PASSTHRU)	+= passthru.o
> +nvmet-$(CONFIG_BLK_DEV_ZONED)		+= zns.o
> +
>  nvme-loop-y	+= loop.o
>  nvmet-rdma-y	+= rdma.o
>  nvmet-fc-y	+= fc.o
> diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
> index dca34489a1dc..509fd8dcca0c 100644
> --- a/drivers/nvme/target/admin-cmd.c
> +++ b/drivers/nvme/target/admin-cmd.c
> @@ -579,8 +579,8 @@ static void nvmet_execute_identify_nslist(struct nvmet_req *req)
>  	nvmet_req_complete(req, status);
>  }
>  
> -static u16 nvmet_copy_ns_identifier(struct nvmet_req *req, u8 type, u8 len,
> -				    void *id, off_t *off)
> +u16 nvmet_copy_ns_identifier(struct nvmet_req *req, u8 type, u8 len,
> +			     void *id, off_t *off)
>  {
>  	struct nvme_ns_id_desc desc = {
>  		.nidt = type,
> diff --git a/drivers/nvme/target/io-cmd-file.c b/drivers/nvme/target/io-cmd-file.c
> index 0abbefd9925e..2bd10960fa50 100644
> --- a/drivers/nvme/target/io-cmd-file.c
> +++ b/drivers/nvme/target/io-cmd-file.c
> @@ -89,7 +89,7 @@ int nvmet_file_ns_enable(struct nvmet_ns *ns)
>  	return ret;
>  }
>  
> -static void nvmet_file_init_bvec(struct bio_vec *bv, struct scatterlist *sg)
> +void nvmet_file_init_bvec(struct bio_vec *bv, struct scatterlist *sg)
>  {
>  	bv->bv_page = sg_page(sg);
>  	bv->bv_offset = sg->offset;
> diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
> index 592763732065..0542ba672a31 100644
> --- a/drivers/nvme/target/nvmet.h
> +++ b/drivers/nvme/target/nvmet.h
> @@ -81,6 +81,9 @@ struct nvmet_ns {
>  	struct pci_dev		*p2p_dev;
>  	int			pi_type;
>  	int			metadata_size;
> +#ifdef CONFIG_BLK_DEV_ZONED
> +	struct nvme_id_ns_zns	id_zns;
> +#endif
>  };
>  
>  static inline struct nvmet_ns *to_nvmet_ns(struct config_item *item)
> @@ -251,6 +254,10 @@ struct nvmet_subsys {
>  	unsigned int		admin_timeout;
>  	unsigned int		io_timeout;
>  #endif /* CONFIG_NVME_TARGET_PASSTHRU */
> +
> +#ifdef CONFIG_BLK_DEV_ZONED
> +	struct nvme_id_ctrl_zns	id_ctrl_zns;
> +#endif
>  };
>  
>  static inline struct nvmet_subsys *to_subsys(struct config_item *item)
> @@ -603,4 +610,15 @@ static inline bool nvmet_ns_has_pi(struct nvmet_ns *ns)
>  	return ns->pi_type && ns->metadata_size == sizeof(struct t10_pi_tuple);
>  }
>  
> +void nvmet_execute_identify_cns_cs_ctrl(struct nvmet_req *req);
> +void nvmet_execute_identify_cns_cs_ns(struct nvmet_req *req);
> +u16 nvmet_process_zns_cis(struct nvmet_req *req, off_t *off);
> +bool nvmet_bdev_zns_config(struct nvmet_ns *ns);
> +void nvmet_bdev_execute_zone_mgmt_recv(struct nvmet_req *req);
> +void nvmet_bdev_execute_zone_mgmt_send(struct nvmet_req *req);
> +void nvmet_bdev_execute_zone_append(struct nvmet_req *req);
> +void nvmet_zns_add_cmd_effects(struct nvme_effects_log *log);
> +u16 nvmet_copy_ns_identifier(struct nvmet_req *req, u8 type, u8 len,
> +			     void *id, off_t *off);
> +void nvmet_file_init_bvec(struct bio_vec *bv, struct scatterlist *sg);
>  #endif /* _NVMET_H */
> diff --git a/drivers/nvme/target/zns.c b/drivers/nvme/target/zns.c
> new file mode 100644
> index 000000000000..8ea6641a55e3
> --- /dev/null
> +++ b/drivers/nvme/target/zns.c
> @@ -0,0 +1,390 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * NVMe ZNS-ZBD command implementation.
> + * Copyright (c) 2020-2021 HGST, a Western Digital Company.
> + */
> +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
> +#include <linux/uio.h>
> +#include <linux/nvme.h>
> +#include <linux/blkdev.h>
> +#include <linux/module.h>
> +#include "nvmet.h"
> +
> +#ifdef CONFIG_BLK_DEV_ZONED
> +
> +static u16 nvmet_bdev_zns_checks(struct nvmet_req *req)
> +{
> +	u16 status = 0;
> +
> +	if (!bdev_is_zoned(req->ns->bdev)) {
> +		status = NVME_SC_INVALID_NS | NVME_SC_DNR;
> +		goto out;
> +	}
> +
> +	if (req->cmd->zmr.zra != NVME_ZRA_ZONE_REPORT) {
> +		status = NVME_SC_INVALID_FIELD;
> +		goto out;
> +	}
> +
> +	if (req->cmd->zmr.zrasf != NVME_ZRASF_ZONE_REPORT_ALL) {
> +		status = NVME_SC_INVALID_FIELD;
> +		goto out;
> +	}
> +
> +	if (req->cmd->zmr.pr != NVME_REPORT_ZONE_PARTIAL)
> +		status = NVME_SC_INVALID_FIELD;
> +out:
> +	return status;
> +}
> +
> +static struct block_device *nvmet_bdev(struct nvmet_req *req)
> +{
> +	return req->ns->bdev;
> +}
> +
> +static u64 nvmet_zones_to_descsize(unsigned int nr_zones)
> +{
> +	return sizeof(struct nvme_zone_report) +
> +		(sizeof(struct nvme_zone_descriptor) * nr_zones);
> +}
> +
> +static inline u64 nvmet_sect_to_lba(struct nvmet_ns *ns, sector_t sect)
> +{
> +	return sect >> (ns->blksize_shift - SECTOR_SHIFT);
> +}
> +
> +static inline sector_t nvmet_lba_to_sect(struct nvmet_ns *ns, __le64 lba)
> +{
> +	return le64_to_cpu(lba) << (ns->blksize_shift - SECTOR_SHIFT);
> +}
> +
> +/*
> + *  ZNS related command implementation and helpers.
> + */
> +
> +u16 nvmet_process_zns_cis(struct nvmet_req *req, off_t *off)
> +{
> +	u16 nvme_cis_zns = NVME_CSI_ZNS;
> +
> +	if (bdev_is_zoned(nvmet_bdev(req))) {
> +		return nvmet_copy_ns_identifier(req, NVME_NIDT_CSI,
> +						 NVME_NIDT_CSI_LEN,
> +						 &nvme_cis_zns, off);
> +	}
> +
> +	return NVME_SC_SUCCESS;
> +}
> +
> +void nvmet_zns_add_cmd_effects(struct nvme_effects_log *log)
> +{
> +	log->iocs[nvme_cmd_zone_append]		= cpu_to_le32(1 << 0);
> +	log->iocs[nvme_cmd_zone_mgmt_send]	= cpu_to_le32(1 << 0);
> +	log->iocs[nvme_cmd_zone_mgmt_recv]	= cpu_to_le32(1 << 0);
> +}
> +
> +bool nvmet_bdev_zns_config(struct nvmet_ns *ns)
> +{
> +	if (ns->bdev->bd_disk->queue->conv_zones_bitmap) {
> +		pr_err("block device with conventional zones not supported.");
> +		return false;
> +	}
> +	/*
> +	 * SMR drives will results in error if writes are not aligned to the
> +	 * physical block size just override.
> +	 */
> +	ns->blksize_shift = blksize_bits(bdev_physical_block_size(ns->bdev));
> +	return true;
> +}
> +
> +static int nvmet_bdev_report_zone_cb(struct blk_zone *zone, unsigned int idx,
> +				     void *data)
> +{
> +	struct blk_zone *zones = data;
> +
> +	memcpy(&zones[idx], zone, sizeof(struct blk_zone));
> +
> +	return 0;
> +}
> +
> +static void nvmet_get_zone_desc(struct nvmet_ns *ns, struct blk_zone *z,
> +				struct nvme_zone_descriptor *rz)
> +{
> +	rz->zcap = cpu_to_le64(nvmet_sect_to_lba(ns, z->capacity));
> +	rz->zslba = cpu_to_le64(nvmet_sect_to_lba(ns, z->start));
> +	rz->wp = cpu_to_le64(nvmet_sect_to_lba(ns, z->wp));
> +	rz->za = z->reset ? 1 << 2 : 0;
> +	rz->zt = z->type;
> +	rz->zs = z->cond << 4;
> +}
> +
> +/*
> + * ZNS related Admin and I/O command handlers.
> + */
> +void nvmet_execute_identify_cns_cs_ctrl(struct nvmet_req *req)
> +{
> +	struct nvme_id_ctrl_zns *id;
> +	u16 status = 0;
> +
> +	id = kzalloc(sizeof(*id), GFP_KERNEL);
> +	if (!id) {
> +		status = NVME_SC_INTERNAL;
> +		goto out;
> +	}
> +
> +	/*
> +	 * Even though this function sets Zone Append Size Limit to 0,
> +	 * the 0 value here indicates that the maximum data transfer size for
> +	 * the Zone Append command is indicated by the ctrl
> +	 * Maximum Data Transfer Size (MDTS).
> +	 */
> +	id->zasl = 0;
> +
> +	status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id));
> +
> +	kfree(id);
> +out:
> +	nvmet_req_complete(req, status);
> +}
> +
> +void nvmet_execute_identify_cns_cs_ns(struct nvmet_req *req)
> +{
> +	struct nvme_id_ns_zns *id_zns;
> +	u16 status = 0;
> +	u64 zsze;
> +
> +	if (le32_to_cpu(req->cmd->identify.nsid) == NVME_NSID_ALL) {
> +		req->error_loc = offsetof(struct nvme_identify, nsid);
> +		status = NVME_SC_INVALID_NS | NVME_SC_DNR;
> +		goto out;
> +	}
> +
> +	id_zns = kzalloc(sizeof(*id_zns), GFP_KERNEL);
> +	if (!id_zns) {
> +		status = NVME_SC_INTERNAL;
> +		goto out;
> +	}
> +
> +	req->ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->identify.nsid);
> +	if (!req->ns) {
> +		status = NVME_SC_INTERNAL;
> +		goto done;
> +	}
> +
> +	if (!bdev_is_zoned(nvmet_bdev(req))) {
> +		req->error_loc = offsetof(struct nvme_identify, nsid);
> +		status = NVME_SC_INVALID_NS | NVME_SC_DNR;
> +		goto done;
> +	}
> +
> +	nvmet_ns_revalidate(req->ns);
> +	zsze = (bdev_zone_sectors(nvmet_bdev(req)) << 9) >>
> +					req->ns->blksize_shift;
> +	id_zns->lbafe[0].zsze = cpu_to_le64(zsze);
> +	id_zns->mor = cpu_to_le32(bdev_max_open_zones(nvmet_bdev(req)));
> +	id_zns->mar = cpu_to_le32(bdev_max_active_zones(nvmet_bdev(req)));
> +
> +done:
> +	status = nvmet_copy_to_sgl(req, 0, id_zns, sizeof(*id_zns));
> +	kfree(id_zns);
> +out:
> +	nvmet_req_complete(req, status);
> +}
> +
> +void nvmet_bdev_execute_zone_mgmt_recv(struct nvmet_req *req)
> +{
> +	struct request_queue *q = nvmet_bdev(req)->bd_disk->queue;
> +	struct nvme_zone_mgmt_recv_cmd *zmr = &req->cmd->zmr;
> +	unsigned int nz = blk_queue_nr_zones(q);
> +	u64 bufsize = (zmr->numd << 2) + 1;
> +	struct nvme_zone_report *rz;
> +	struct blk_zone *zones;
> +	int reported_zones;
> +	sector_t sect;
> +	u64 desc_size;
> +	u16 status;
> +	int i;
> +
> +	desc_size = nvmet_zones_to_descsize(blk_queue_nr_zones(q));
> +	status = nvmet_bdev_zns_checks(req);
> +	if (status)
> +		goto out;
> +
> +	zones = kvcalloc(blkdev_nr_zones(nvmet_bdev(req)->bd_disk),
> +			      sizeof(struct blk_zone), GFP_KERNEL);
> +	if (!zones) {
> +		status = NVME_SC_INTERNAL;
> +		goto out;
> +	}
> +
> +	rz = __vmalloc(bufsize, GFP_KERNEL | __GFP_NORETRY);
> +	if (!rz) {
> +		status = NVME_SC_INTERNAL;
> +		goto out_free_zones;
> +	}
> +
> +	sect = nvmet_lba_to_sect(req->ns, le64_to_cpu(req->cmd->zmr.slba));
> +
> +	for (nz = blk_queue_nr_zones(q); desc_size >= bufsize; nz--)
> +		desc_size = nvmet_zones_to_descsize(nz);

desc_size is not actually used for anything other than this loop's condition.
So what is its purpose? If it is only there to determine nz, the number of zones
that can be reported, surely you can calculate nz directly instead of looping.

> +
> +	reported_zones = blkdev_report_zones(nvmet_bdev(req), sect, nz,
> +					     nvmet_bdev_report_zone_cb,
> +					     zones);
> +	if (reported_zones < 0) {
> +		status = NVME_SC_INTERNAL;
> +		goto out_free_report_zones;
> +	}
> +
> +	rz->nr_zones = cpu_to_le64(reported_zones);
> +	for (i = 0; i < reported_zones; i++)
> +		nvmet_get_zone_desc(req->ns, &zones[i], &rz->entries[i]);
> +
> +	status = nvmet_copy_to_sgl(req, 0, rz, bufsize);
> +
> +out_free_report_zones:
> +	kvfree(rz);
> +out_free_zones:
> +	kvfree(zones);
> +out:
> +	nvmet_req_complete(req, status);
> +}
> +
> +void nvmet_bdev_execute_zone_mgmt_send(struct nvmet_req *req)
> +{
> +	sector_t nr_sect = bdev_zone_sectors(nvmet_bdev(req));
> +	struct nvme_zone_mgmt_send_cmd *c = &req->cmd->zms;
> +	u16 status = NVME_SC_SUCCESS;
> +	enum req_opf op;
> +	sector_t sect;
> +	int ret;
> +
> +	sect = nvmet_lba_to_sect(req->ns, le64_to_cpu(req->cmd->zms.slba));
> +
> +	switch (c->zsa) {
> +	case NVME_ZONE_OPEN:
> +		op = REQ_OP_ZONE_OPEN;
> +		break;
> +	case NVME_ZONE_CLOSE:
> +		op = REQ_OP_ZONE_CLOSE;
> +		break;
> +	case NVME_ZONE_FINISH:
> +		op = REQ_OP_ZONE_FINISH;
> +		break;
> +	case NVME_ZONE_RESET:
> +		if (c->select_all)
> +			nr_sect = get_capacity(nvmet_bdev(req)->bd_disk);
> +		op = REQ_OP_ZONE_RESET;
> +		break;
> +	default:
> +		status = NVME_SC_INVALID_FIELD;
> +		break;
> +	}
> +
> +	ret = blkdev_zone_mgmt(nvmet_bdev(req), op, sect, nr_sect, GFP_KERNEL);
> +	if (ret)
> +		status = NVME_SC_INTERNAL;
> +
> +	nvmet_req_complete(req, status);
> +}
> +
> +void nvmet_bdev_execute_zone_append(struct nvmet_req *req)
> +{
> +	unsigned long bv_cnt = min(req->sg_cnt, BIO_MAX_PAGES);
> +	int op = REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE;
> +	u64 slba = le64_to_cpu(req->cmd->rw.slba);
> +	sector_t sect = nvmet_lba_to_sect(req->ns, slba);
> +	u16 status = NVME_SC_SUCCESS;
> +	int sg_cnt = req->sg_cnt;
> +	struct scatterlist *sg;
> +	size_t mapped_data_len;
> +	struct iov_iter from;
> +	struct bio_vec *bvec;
> +	size_t mapped_cnt;
> +	size_t io_len = 0;
> +	struct bio *bio;
> +	int ret;
> +
> +	if (!nvmet_check_transfer_len(req, nvmet_rw_data_len(req)))
> +		return;
> +
> +	if (!req->sg_cnt) {
> +		nvmet_req_complete(req, 0);
> +		return;
> +	}
> +
> +	bvec = kmalloc_array(bv_cnt, sizeof(*bvec), GFP_KERNEL);
> +	if (!bvec) {
> +		status = NVME_SC_INTERNAL;
> +		goto out;
> +	}
> +
> +	while (sg_cnt) {
> +		mapped_data_len = 0;
> +		for_each_sg(req->sg, sg, req->sg_cnt, mapped_cnt) {
> +			nvmet_file_init_bvec(bvec, sg);
> +			mapped_data_len += bvec[mapped_cnt].bv_len;
> +			sg_cnt--;
> +			if (mapped_cnt == bv_cnt)
> +				break;
> +		}
> +		iov_iter_bvec(&from, WRITE, bvec, mapped_cnt, mapped_data_len);
> +
> +		bio = bio_alloc(GFP_KERNEL, bv_cnt);
> +		bio_set_dev(bio, nvmet_bdev(req));
> +		bio->bi_iter.bi_sector = sect;
> +		bio->bi_opf = op;
> +
> +		ret =  __bio_iov_append_get_pages(bio, &from);
> +		if (unlikely(ret)) {
> +			status = NVME_SC_INTERNAL;
> +			bio_io_error(bio);
> +			kfree(bvec);
> +			goto out;
> +		}
> +
> +		ret = submit_bio_wait(bio);
> +		bio_put(bio);
> +		if (ret < 0) {
> +			status = NVME_SC_INTERNAL;
> +			break;
> +		}
> +
> +		io_len += mapped_data_len;
> +	}
> +
> +	sect += (io_len >> 9);
> +	req->cqe->result.u64 = le64_to_cpu(nvmet_sect_to_lba(req->ns, sect));
> +	kfree(bvec);
> +
> +out:
> +	nvmet_req_complete(req, status);
> +}
> +
> +#else  /* CONFIG_BLK_DEV_ZONED */
> +static void nvmet_execute_identify_cns_cs_ctrl(struct nvmet_req *req)
> +{
> +}
> +static void nvmet_execute_identify_cns_cs_ns(struct nvmet_req *req)
> +{
> +}
> +u16 nvmet_process_zns_cis(struct nvmet_req *req, off_t *off)
> +{
> +	return 0;
> +}
> +static bool nvmet_bdev_zns_config(struct nvmet_ns *ns)
> +{
> +	return false;
> +}
> +void nvmet_bdev_execute_zone_mgmt_recv(struct nvmet_req *req)
> +{
> +}
> +void nvmet_bdev_execute_zone_mgmt_send(struct nvmet_req *req)
> +{
> +}
> +void nvmet_bdev_execute_zone_append(struct nvmet_req *req)
> +{
> +}
> +void nvmet_zns_add_cmd_effects(struct nvme_effects_log *log)
> +{
> +}
> +#endif /* CONFIG_BLK_DEV_ZONED */
> 


-- 
Damien Le Moal
Western Digital Research



More information about the Linux-nvme mailing list