[PATCH 2/9] nvmet: add ZNS support for bdev-ns

Damien Le Moal Damien.LeMoal at wdc.com
Sun Nov 29 19:16:07 EST 2020


On 2020/11/28 9:09, Chaitanya Kulkarni wrote:
> On 11/26/20 00:36, Damien Le Moal wrote:
>> On 2020/11/26 11:42, Chaitanya Kulkarni wrote:
>>> Add zns-bdev-config, id-ctrl, id-ns, zns-cmd-effects, zone-mgmt-send,
>>> zone-mgmt-recv and zone-append handlers for NVMeOF target to enable ZNS
>>> support for bdev.
>>>
>>> Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni at wdc.com>
>>> ---
>>>  drivers/nvme/target/Makefile      |   2 +
>>>  drivers/nvme/target/admin-cmd.c   |   4 +-
>>>  drivers/nvme/target/io-cmd-file.c |   2 +-
>>>  drivers/nvme/target/nvmet.h       |  18 ++
>>>  drivers/nvme/target/zns.c         | 390 ++++++++++++++++++++++++++++++
>>>  5 files changed, 413 insertions(+), 3 deletions(-)
>>>  create mode 100644 drivers/nvme/target/zns.c
>>>
>>> diff --git a/drivers/nvme/target/Makefile b/drivers/nvme/target/Makefile
>>> index ebf91fc4c72e..bc147ff2df5d 100644
>>> --- a/drivers/nvme/target/Makefile
>>> +++ b/drivers/nvme/target/Makefile
>>> @@ -12,6 +12,8 @@ obj-$(CONFIG_NVME_TARGET_TCP)		+= nvmet-tcp.o
>>>  nvmet-y		+= core.o configfs.o admin-cmd.o fabrics-cmd.o \
>>>  			discovery.o io-cmd-file.o io-cmd-bdev.o
>>>  nvmet-$(CONFIG_NVME_TARGET_PASSTHRU)	+= passthru.o
>>> +nvmet-$(CONFIG_BLK_DEV_ZONED)		+= zns.o
>>> +
>>>  nvme-loop-y	+= loop.o
>>>  nvmet-rdma-y	+= rdma.o
>>>  nvmet-fc-y	+= fc.o
>>> diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
>>> index dca34489a1dc..509fd8dcca0c 100644
>>> --- a/drivers/nvme/target/admin-cmd.c
>>> +++ b/drivers/nvme/target/admin-cmd.c
>>> @@ -579,8 +579,8 @@ static void nvmet_execute_identify_nslist(struct nvmet_req *req)
>>>  	nvmet_req_complete(req, status);
>>>  }
>>>  
>>> -static u16 nvmet_copy_ns_identifier(struct nvmet_req *req, u8 type, u8 len,
>>> -				    void *id, off_t *off)
>>> +u16 nvmet_copy_ns_identifier(struct nvmet_req *req, u8 type, u8 len,
>>> +			     void *id, off_t *off)
>>>  {
>>>  	struct nvme_ns_id_desc desc = {
>>>  		.nidt = type,
>>> diff --git a/drivers/nvme/target/io-cmd-file.c b/drivers/nvme/target/io-cmd-file.c
>>> index 0abbefd9925e..2bd10960fa50 100644
>>> --- a/drivers/nvme/target/io-cmd-file.c
>>> +++ b/drivers/nvme/target/io-cmd-file.c
>>> @@ -89,7 +89,7 @@ int nvmet_file_ns_enable(struct nvmet_ns *ns)
>>>  	return ret;
>>>  }
>>>  
>>> -static void nvmet_file_init_bvec(struct bio_vec *bv, struct scatterlist *sg)
>>> +void nvmet_file_init_bvec(struct bio_vec *bv, struct scatterlist *sg)
>>>  {
>>>  	bv->bv_page = sg_page(sg);
>>>  	bv->bv_offset = sg->offset;
>>> diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
>>> index 592763732065..0542ba672a31 100644
>>> --- a/drivers/nvme/target/nvmet.h
>>> +++ b/drivers/nvme/target/nvmet.h
>>> @@ -81,6 +81,9 @@ struct nvmet_ns {
>>>  	struct pci_dev		*p2p_dev;
>>>  	int			pi_type;
>>>  	int			metadata_size;
>>> +#ifdef CONFIG_BLK_DEV_ZONED
>>> +	struct nvme_id_ns_zns	id_zns;
>>> +#endif
>>>  };
>>>  
>>>  static inline struct nvmet_ns *to_nvmet_ns(struct config_item *item)
>>> @@ -251,6 +254,10 @@ struct nvmet_subsys {
>>>  	unsigned int		admin_timeout;
>>>  	unsigned int		io_timeout;
>>>  #endif /* CONFIG_NVME_TARGET_PASSTHRU */
>>> +
>>> +#ifdef CONFIG_BLK_DEV_ZONED
>>> +	struct nvme_id_ctrl_zns	id_ctrl_zns;
>>> +#endif
>>>  };
>>>  
>>>  static inline struct nvmet_subsys *to_subsys(struct config_item *item)
>>> @@ -603,4 +610,15 @@ static inline bool nvmet_ns_has_pi(struct nvmet_ns *ns)
>>>  	return ns->pi_type && ns->metadata_size == sizeof(struct t10_pi_tuple);
>>>  }
>>>  
>>> +void nvmet_execute_identify_cns_cs_ctrl(struct nvmet_req *req);
>>> +void nvmet_execute_identify_cns_cs_ns(struct nvmet_req *req);
>>> +u16 nvmet_process_zns_cis(struct nvmet_req *req, off_t *off);
>>> +bool nvmet_bdev_zns_config(struct nvmet_ns *ns);
>>> +void nvmet_bdev_execute_zone_mgmt_recv(struct nvmet_req *req);
>>> +void nvmet_bdev_execute_zone_mgmt_send(struct nvmet_req *req);
>>> +void nvmet_bdev_execute_zone_append(struct nvmet_req *req);
>>> +void nvmet_zns_add_cmd_effects(struct nvme_effects_log *log);
>>> +u16 nvmet_copy_ns_identifier(struct nvmet_req *req, u8 type, u8 len,
>>> +			     void *id, off_t *off);
>>> +void nvmet_file_init_bvec(struct bio_vec *bv, struct scatterlist *sg);
>>>  #endif /* _NVMET_H */
>>> diff --git a/drivers/nvme/target/zns.c b/drivers/nvme/target/zns.c
>>> new file mode 100644
>>> index 000000000000..8ea6641a55e3
>>> --- /dev/null
>>> +++ b/drivers/nvme/target/zns.c
>>> @@ -0,0 +1,390 @@
>>> +// SPDX-License-Identifier: GPL-2.0
>>> +/*
>>> + * NVMe ZNS-ZBD command implementation.
>>> + * Copyright (c) 2020-2021 HGST, a Western Digital Company.
>>> + */
>>> +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
>>> +#include <linux/uio.h>
>>> +#include <linux/nvme.h>
>>> +#include <linux/blkdev.h>
>>> +#include <linux/module.h>
>>> +#include "nvmet.h"
>>> +
>>> +#ifdef CONFIG_BLK_DEV_ZONED
>> This file is compiled only if CONFIG_BLK_DEV_ZONED is defined, so what is the
>> point of this ? The stubs for the !CONFIG_BLK_DEV_ZONED case should go into the
>> header file, no ?
> 
> Actually the conditional compilation of zns.c with CONFIG_BLK_DEV_ZONED
> 
> needs to be removed in the Makefile. I'm against putting these empty
> stubs in the
> 
> makefile when CONFIG_BLK_DEV_ZONED is not true, as there are several files
> 
> transport, discovery, file/passthru backend etc in the nvme/target/*.c
> which will add
> 
> empty stubs which has nothing to do with zoned bdev backend.

Each file will not need to add empty stubs if these stubs are in a common
header. And empty stubs are compiled away so I do not see the problem. Having
such empty stubs in a header file under an #ifdef is I think a fairly standard
coding style in the kernel. The block layer zone code and scsi SMR code is coded
like that.

> 
> i.e. for Makefile it should be :-
> 
> diff --git a/drivers/nvme/target/Makefile b/drivers/nvme/target/Makefile
> index ebf91fc4c72e..c67276a25363 100644
> --- a/drivers/nvme/target/Makefile
> +++ b/drivers/nvme/target/Makefile
> @@ -10,7 +10,7 @@ obj-$(CONFIG_NVME_TARGET_FCLOOP)      += nvme-fcloop.o
>  obj-$(CONFIG_NVME_TARGET_TCP)          += nvmet-tcp.o
>  
>  nvmet-y                += core.o configfs.o admin-cmd.o fabrics-cmd.o \
> -                       discovery.o io-cmd-file.o io-cmd-bdev.o
> +                       zns,o discovery.o io-cmd-file.o io-cmd-bdev.o
>  nvmet-$(CONFIG_NVME_TARGET_PASSTHRU)   += passthru.o
>  nvme-loop-y    += loop.o
>  nvmet-rdma-y   += rdma.o

The NS scan and test for support of ZNS could go in discovery.c, and compiled
unconditionally exactly like the host nvme driver does. Everything else (ZNS
commands execution) can go into zns.c and that file compiled conditionally, with
empty stubs in zns.h or any other appropriate header file. I think that make
things clean and easy to understand, and avoid the big #ifdef in the C code.
Just my opinion here. You are the maintainer, so your call...

>>> +
>>> +static u16 nvmet_bdev_zns_checks(struct nvmet_req *req)
>>> +{
>>> +	u16 status = 0;
>>> +
>>> +	if (!bdev_is_zoned(req->ns->bdev)) {
>>> +		status = NVME_SC_INVALID_NS | NVME_SC_DNR;
>>> +		goto out;
>> Why not return status directly here ? Same for the other cases below.
> I prefer centralize returns with goto, which follows the similar code.

Sure, but for such a simple function, this looks rather strange and in my
opinion uselessly complicates the code.

>>> +	}
>>> +
>>> +	if (req->cmd->zmr.zra != NVME_ZRA_ZONE_REPORT) {
>>> +		status = NVME_SC_INVALID_FIELD;
>>> +		goto out;
>>> +	}
>>> +
>>> +	if (req->cmd->zmr.zrasf != NVME_ZRASF_ZONE_REPORT_ALL) {
>>> +		status = NVME_SC_INVALID_FIELD;
>>> +		goto out;
>>> +	}
>>> +
>>> +	if (req->cmd->zmr.pr != NVME_REPORT_ZONE_PARTIAL)
>>> +		status = NVME_SC_INVALID_FIELD;
>>> +out:
>>> +	return status;
>>> +}

[...]
>>> +void nvmet_execute_identify_cns_cs_ns(struct nvmet_req *req)
>>> +{
>>> +	struct nvme_id_ns_zns *id_zns;
>>> +	u16 status = 0;
>>> +	u64 zsze;
>>> +
>>> +	if (le32_to_cpu(req->cmd->identify.nsid) == NVME_NSID_ALL) {
>>> +		req->error_loc = offsetof(struct nvme_identify, nsid);
>>> +		status = NVME_SC_INVALID_NS | NVME_SC_DNR;
>>> +		goto out;
>>> +	}
>>> +
>>> +	id_zns = kzalloc(sizeof(*id_zns), GFP_KERNEL);
>>> +	if (!id_zns) {
>>> +		status = NVME_SC_INTERNAL;
>>> +		goto out;
>>> +	}
>>> +
>>> +	req->ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->identify.nsid);
>>> +	if (!req->ns) {
>>> +		status = NVME_SC_INTERNAL;
>>> +		goto done;
>> That will result in nvmet_copy_to_sgl() being executed. Is that OK ?
>> Shouldn't you do only the kfree(id_zns) and complete with an error here ?
> Call to nvmet_copy_to_sgl() zeroout the values if any when we return the
> 
> buffer in case of error. I don't see any problem with zeroing out buffer in
> 
> case of error. Can you please explain why we shouldn't do that ?

I cannot explain anything. I was merely pointing out what the code was doing and
if that was intentional. If there are no problems, then fine.

> 
>>> +	}
>>> +
>>> +	if (!bdev_is_zoned(nvmet_bdev(req))) {
>>> +		req->error_loc = offsetof(struct nvme_identify, nsid);
>>> +		status = NVME_SC_INVALID_NS | NVME_SC_DNR;
>>> +		goto done;
>> Same comment.
> See above reply.
>>> +	}
>>> +
>>> +	nvmet_ns_revalidate(req->ns);
>>> +	zsze = (bdev_zone_sectors(nvmet_bdev(req)) << 9) >>
>>> +					req->ns->blksize_shift;
>>> +	id_zns->lbafe[0].zsze = cpu_to_le64(zsze);
>>> +	id_zns->mor = cpu_to_le32(bdev_max_open_zones(nvmet_bdev(req)));
>>> +	id_zns->mar = cpu_to_le32(bdev_max_active_zones(nvmet_bdev(req)));
>>> +
>>> +done:
>>> +	status = nvmet_copy_to_sgl(req, 0, id_zns, sizeof(*id_zns));
>>> +	kfree(id_zns);
>>> +out:
>>> +	nvmet_req_complete(req, status);
>>> +}
>>> +
>>> +void nvmet_bdev_execute_zone_mgmt_recv(struct nvmet_req *req)
>>> +{
>>> +	struct request_queue *q = nvmet_bdev(req)->bd_disk->queue;
>>> +	struct nvme_zone_mgmt_recv_cmd *zmr = &req->cmd->zmr;
>>> +	unsigned int nz = blk_queue_nr_zones(q);
>>> +	u64 bufsize = (zmr->numd << 2) + 1;
>>> +	struct nvme_zone_report *rz;
>>> +	struct blk_zone *zones;
>>> +	int reported_zones;
>>> +	sector_t sect;
>>> +	u64 desc_size;
>>> +	u16 status;
>>> +	int i;
>>> +
>>> +	desc_size = nvmet_zones_to_descsize(blk_queue_nr_zones(q));
>>> +	status = nvmet_bdev_zns_checks(req);
>>> +	if (status)
>>> +		goto out;
>>> +
>>> +	zones = kvcalloc(blkdev_nr_zones(nvmet_bdev(req)->bd_disk),
>>> +			      sizeof(struct blk_zone), GFP_KERNEL);
>> This is not super nice: a large disk will have an enormous number of zones
>> (75000+ for largest SMR HDD today). But you actually do not need more zones
>> descs than what fits in req buffer.
> Call to nvmet_copy_to_sgl() nicely fail and return error.

That is not my point. The point is that this code will do an allocation for
75,000 x 64B = 4.8MB even if a single zone report is being requested. That is
not acceptable. This needs optimization: allocate only as many zone descriptors
as is requested.

>>> +	if (!zones) {
>>> +		status = NVME_SC_INTERNAL;
>>> +		goto out;
>>> +	}
>>> +
>>> +	rz = __vmalloc(bufsize, GFP_KERNEL | __GFP_NORETRY);
>>> +	if (!rz) {
>>> +		status = NVME_SC_INTERNAL;
>>> +		goto out_free_zones;
>>> +	}
>>> +
>>> +	sect = nvmet_lba_to_sect(req->ns, le64_to_cpu(req->cmd->zmr.slba));
>>> +
>>> +	for (nz = blk_queue_nr_zones(q); desc_size >= bufsize; nz--)
>>> +		desc_size = nvmet_zones_to_descsize(nz);
>>> +
>>> +	reported_zones = blkdev_report_zones(nvmet_bdev(req), sect, nz,
>>> +					     nvmet_bdev_report_zone_cb,
>>> +					     zones);
>>> +	if (reported_zones < 0) {
>>> +		status = NVME_SC_INTERNAL;
>>> +		goto out_free_report_zones;
>>> +	}
>>> +
>>> +	rz->nr_zones = cpu_to_le64(reported_zones);
>>> +	for (i = 0; i < reported_zones; i++)
>>> +		nvmet_get_zone_desc(req->ns, &zones[i], &rz->entries[i]);
>> This can be done directly in the report zones cb. That will avoid looping twice
>> over the reported zones.
> Okay, I'll try and remove this loop.
>>> +
>>> +	status = nvmet_copy_to_sgl(req, 0, rz, bufsize);
>>> +
>>> +out_free_report_zones:
>>> +	kvfree(rz);
>>> +out_free_zones:
>>> +	kvfree(zones);
>>> +out:
>>> +	nvmet_req_complete(req, status);
>>> +}
>>> +
>>> +void nvmet_bdev_execute_zone_mgmt_send(struct nvmet_req *req)
>>> +{
>>> +	sector_t nr_sect = bdev_zone_sectors(nvmet_bdev(req));
>>> +	struct nvme_zone_mgmt_send_cmd *c = &req->cmd->zms;
>>> +	u16 status = NVME_SC_SUCCESS;
>>> +	enum req_opf op;
>>> +	sector_t sect;
>>> +	int ret;
>>> +
>>> +	sect = nvmet_lba_to_sect(req->ns, le64_to_cpu(req->cmd->zms.slba));
>>> +
>>> +	switch (c->zsa) {
>>> +	case NVME_ZONE_OPEN:
>>> +		op = REQ_OP_ZONE_OPEN;
>>> +		break;
>>> +	case NVME_ZONE_CLOSE:
>>> +		op = REQ_OP_ZONE_CLOSE;
>>> +		break;
>>> +	case NVME_ZONE_FINISH:
>>> +		op = REQ_OP_ZONE_FINISH;
>>> +		break;
>>> +	case NVME_ZONE_RESET:
>>> +		if (c->select_all)
>>> +			nr_sect = get_capacity(nvmet_bdev(req)->bd_disk);
>>> +		op = REQ_OP_ZONE_RESET;
>>> +		break;
>>> +	default:
>>> +		status = NVME_SC_INVALID_FIELD;
>>> +		break;
>> You needa goto here or blkdev_zone_mgmt() will be called.
>>
> True.
>>> +	}
>>> +
>>> +	ret = blkdev_zone_mgmt(nvmet_bdev(req), op, sect, nr_sect, GFP_KERNEL);
>>> +	if (ret)
>>> +		status = NVME_SC_INTERNAL;
>>> +
>>> +	nvmet_req_complete(req, status);
>>> +}
>>> +
>>> +void nvmet_bdev_execute_zone_append(struct nvmet_req *req)
>>> +{
>>> +	unsigned long bv_cnt = min(req->sg_cnt, BIO_MAX_PAGES);
>>> +	int op = REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE;
>>> +	u64 slba = le64_to_cpu(req->cmd->rw.slba);
>>> +	sector_t sect = nvmet_lba_to_sect(req->ns, slba);
>>> +	u16 status = NVME_SC_SUCCESS;
>>> +	int sg_cnt = req->sg_cnt;
>>> +	struct scatterlist *sg;
>>> +	size_t mapped_data_len;
>>> +	struct iov_iter from;
>>> +	struct bio_vec *bvec;
>>> +	size_t mapped_cnt;
>>> +	size_t io_len = 0;
>>> +	struct bio *bio;
>>> +	int ret;
>>> +
>>> +	if (!nvmet_check_transfer_len(req, nvmet_rw_data_len(req)))
>>> +		return;
>> No request completion ?
> See nvmet_check_transfer_len().
>>> +
>>> +	if (!req->sg_cnt) {
>>> +		nvmet_req_complete(req, 0);
>>> +		return;
>>> +	}
>>> +
>>> +	bvec = kmalloc_array(bv_cnt, sizeof(*bvec), GFP_KERNEL);
>>> +	if (!bvec) {
>>> +		status = NVME_SC_INTERNAL;
>>> +		goto out;
>>> +	}
>>> +
>>> +	while (sg_cnt) {
>>> +		mapped_data_len = 0;
>>> +		for_each_sg(req->sg, sg, req->sg_cnt, mapped_cnt) {
>>> +			nvmet_file_init_bvec(bvec, sg);
>>> +			mapped_data_len += bvec[mapped_cnt].bv_len;
>>> +			sg_cnt--;
>>> +			if (mapped_cnt == bv_cnt)
>>> +				break;
>>> +		}
>>> +		iov_iter_bvec(&from, WRITE, bvec, mapped_cnt, mapped_data_len);
>>> +
>>> +		bio = bio_alloc(GFP_KERNEL, bv_cnt);
>>> +		bio_set_dev(bio, nvmet_bdev(req));
>>> +		bio->bi_iter.bi_sector = sect;
>>> +		bio->bi_opf = op;
>>> +
>>> +		ret =  __bio_iov_append_get_pages(bio, &from);
>>> +		if (unlikely(ret)) {
>>> +			status = NVME_SC_INTERNAL;
>>> +			bio_io_error(bio);
>>> +			kfree(bvec);
>>> +			goto out;
>>> +		}
>>> +
>>> +		ret = submit_bio_wait(bio);
>>> +		bio_put(bio);
>>> +		if (ret < 0) {
>>> +			status = NVME_SC_INTERNAL;
>>> +			break;
>>> +		}
>>> +
>>> +		io_len += mapped_data_len;
>>> +	}
>> This loop is equivalent to splitting a zone append. That must not be done as
>> that can lead to totally unpredictable ordering of the chunks. What if another
>> thread is doing zone append to the same zone at the same time ?
>>
> We can add something like per-zone bit locking here to prevent that,
> multiple
> 
> threads. With zasl value derived from max_zone_append_sector (as
> mentioned in
> 
> my reply ideally we shouldn't get the data len more than what we can
> handle if I'm
> 
> not missing something.

No way: a locking mechanism will negate the benefits of zone append vs regular
writes. So NACK on that. As you say, since you advertized the max zone append
sectors, you should not be getting a request larger than that limit. If you do,
fail the request immediately instead of trying to split the zone append command.

> 
>>> +
>>> +	sect += (io_len >> 9);
>>> +	req->cqe->result.u64 = le64_to_cpu(nvmet_sect_to_lba(req->ns, sect));
>>> +	kfree(bvec);
>>> +
>>> +out:
>>> +	nvmet_req_complete(req, status);
>>> +}
>>> +
>>> +#else  /* CONFIG_BLK_DEV_ZONED */
>>> +static void nvmet_execute_identify_cns_cs_ctrl(struct nvmet_req *req)
>>> +{
>>> +}
>>> +static void nvmet_execute_identify_cns_cs_ns(struct nvmet_req *req)
>>> +{
>>> +}
>>> +u16 nvmet_process_zns_cis(struct nvmet_req *req, off_t *off)
>>> +{
>>> +	return 0;
>>> +}
>>> +static bool nvmet_bdev_zns_config(struct nvmet_ns *ns)
>>> +{
>>> +	return false;
>>> +}
>>> +void nvmet_bdev_execute_zone_mgmt_recv(struct nvmet_req *req)
>>> +{
>>> +}
>>> +void nvmet_bdev_execute_zone_mgmt_send(struct nvmet_req *req)
>>> +{
>>> +}
>>> +void nvmet_bdev_execute_zone_append(struct nvmet_req *req)
>>> +{
>>> +}
>>> +void nvmet_zns_add_cmd_effects(struct nvme_effects_log *log)
>>> +{
>>> +}
>> These should go in the header file. And put the brackets on the same line.
>>
> As I explained earlier this bloats the header file with empty stubs and
> 
> adds functions in the traget transport code from nvmet.h which has
> 
> nothing to do with the backend. Regarding the {} style I don't see braces
> 
> on the same line for the empty stubs so keeping it consistent what is in
> 
> the repo.

As I said above, I am not a fan of this style... At the very least, please
remove the static stubs as that likely will generate a cdefined but not used
compiler warning.

> 
>>> +#endif /* CONFIG_BLK_DEV_ZONED */
>>>
> 


-- 
Damien Le Moal
Western Digital Research



More information about the Linux-nvme mailing list