[PATCH V5 4/6] nvmet: add ZBD over ZNS backend support
Chaitanya Kulkarni
Chaitanya.Kulkarni at wdc.com
Sat Dec 12 01:54:21 EST 2020
On 12/10/20 17:18, Damien Le Moal wrote:
> On 2020/12/10 15:27, Chaitanya Kulkarni wrote:
>> NVMe TP 4053 – Zoned Namespaces (ZNS) allows host software to
>> communicate with a non-volatile memory subsystem using zones for
>> NVMe protocol based controllers. NVMeOF already support the ZNS NVMe
>> Protocol compliant devices on the target in the passthru mode. There
>> are Generic Zoned Block Devices like Shingled Magnetic Recording (SMR)
> Please remove "Generic" from this sentence. It is confusing. Also, I do not
> think you need to capitalize ZOned Block Devices. That is something Linux
> defines. It is not defined by any standard.
Since the target supports ZNS in passthru mode, the term "generic" is needed
to differentiate the backend type.
Regarding zbd, I'm fine with that.
>> HDD which are not based on the NVMe protocol.
>>
>> This patch adds ZNS backend to support the ZBDs for NVMeOF target.
>>
>> This support inculdes implementing the new command set NVME_CSI_ZNS,
> s/inculdes/includes
Okay.
>> adding different command handlers for ZNS command set such as
>> NVMe Identify Controller, NVMe Identify Namespace, NVMe Zone Append,
>> NVMe Zone Management Send and NVMe Zone Management Receive.
>>
>> With new command set identifier we also update the target command effects
>> logs to reflect the ZNS compliant commands.
>>
>> Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni at wdc.com>
>> ---
>> drivers/nvme/target/Makefile | 1 +
>> drivers/nvme/target/admin-cmd.c | 26 +++
>> drivers/nvme/target/core.c | 1 +
>> drivers/nvme/target/io-cmd-bdev.c | 33 ++-
>> drivers/nvme/target/nvmet.h | 38 ++++
>> drivers/nvme/target/zns.c | 327 ++++++++++++++++++++++++++++++
>> 6 files changed, 418 insertions(+), 8 deletions(-)
>> create mode 100644 drivers/nvme/target/zns.c
>>
>> diff --git a/drivers/nvme/target/Makefile b/drivers/nvme/target/Makefile
>> index ebf91fc4c72e..9837e580fa7e 100644
>> --- a/drivers/nvme/target/Makefile
>> +++ b/drivers/nvme/target/Makefile
>> @@ -12,6 +12,7 @@ obj-$(CONFIG_NVME_TARGET_TCP) += nvmet-tcp.o
>> nvmet-y += core.o configfs.o admin-cmd.o fabrics-cmd.o \
>> discovery.o io-cmd-file.o io-cmd-bdev.o
>> nvmet-$(CONFIG_NVME_TARGET_PASSTHRU) += passthru.o
>> +nvmet-$(CONFIG_BLK_DEV_ZONED) += zns.o
>> nvme-loop-y += loop.o
>> nvmet-rdma-y += rdma.o
>> nvmet-fc-y += fc.o
>> diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
>> index f4c0f3aca485..6f5279b15aa6 100644
>> --- a/drivers/nvme/target/admin-cmd.c
>> +++ b/drivers/nvme/target/admin-cmd.c
>> @@ -192,6 +192,15 @@ static void nvmet_execute_get_log_cmd_effects_ns(struct nvmet_req *req)
>> log->iocs[nvme_cmd_dsm] = cpu_to_le32(1 << 0);
>> log->iocs[nvme_cmd_write_zeroes] = cpu_to_le32(1 << 0);
>> break;
>> + case NVME_CSI_ZNS:
>> + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED)) {
>> + u32 *iocs = log->iocs;
>> +
>> + iocs[nvme_cmd_zone_append] = cpu_to_le32(1 << 0);
>> + iocs[nvme_cmd_zone_mgmt_send] = cpu_to_le32(1 << 0);
>> + iocs[nvme_cmd_zone_mgmt_recv] = cpu_to_le32(1 << 0);
>> + }
>> + break;
>> default:
>> status = NVME_SC_INVALID_LOG_PAGE;
>> break;
>> @@ -614,6 +623,7 @@ static u16 nvmet_copy_ns_identifier(struct nvmet_req *req, u8 type, u8 len,
>>
>> static void nvmet_execute_identify_desclist(struct nvmet_req *req)
>> {
>> + u16 nvme_cis_zns = NVME_CSI_ZNS;
>> u16 status = 0;
>> off_t off = 0;
>>
>> @@ -638,6 +648,14 @@ static void nvmet_execute_identify_desclist(struct nvmet_req *req)
>> if (status)
>> goto out;
>> }
>> + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED)) {
>> + if (req->ns->csi == NVME_CSI_ZNS)
>> + status = nvmet_copy_ns_identifier(req, NVME_NIDT_CSI,
>> + NVME_NIDT_CSI_LEN,
>> + &nvme_cis_zns, &off);
>> + if (status)
>> + goto out;
>> + }
>>
>> if (sg_zero_buffer(req->sg, req->sg_cnt, NVME_IDENTIFY_DATA_SIZE - off,
>> off) != NVME_IDENTIFY_DATA_SIZE - off)
>> @@ -655,8 +673,16 @@ static void nvmet_execute_identify(struct nvmet_req *req)
>> switch (req->cmd->identify.cns) {
>> case NVME_ID_CNS_NS:
>> return nvmet_execute_identify_ns(req);
>> + case NVME_ID_CNS_CS_NS:
>> + if (req->cmd->identify.csi == NVME_CSI_ZNS)
>> + return nvmet_execute_identify_cns_cs_ns(req);
>> + break;
>> case NVME_ID_CNS_CTRL:
>> return nvmet_execute_identify_ctrl(req);
>> + case NVME_ID_CNS_CS_CTRL:
>> + if (req->cmd->identify.csi == NVME_CSI_ZNS)
>> + return nvmet_execute_identify_cns_cs_ctrl(req);
>> + break;
>> case NVME_ID_CNS_NS_ACTIVE_LIST:
>> return nvmet_execute_identify_nslist(req);
>> case NVME_ID_CNS_NS_DESC_LIST:
>> diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
>> index 672e4009f8d6..17a99c7134dc 100644
>> --- a/drivers/nvme/target/core.c
>> +++ b/drivers/nvme/target/core.c
>> @@ -1107,6 +1107,7 @@ static inline u8 nvmet_cc_iocqes(u32 cc)
>> static inline bool nvmet_cc_css_check(u8 cc_css)
>> {
>> switch (cc_css <<= NVME_CC_CSS_SHIFT) {
>> + case NVME_CC_CSS_CSI:
>> case NVME_CC_CSS_NVM:
>> return true;
>> default:
>> diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c
>> index 23095bdfce06..6178ef643962 100644
>> --- a/drivers/nvme/target/io-cmd-bdev.c
>> +++ b/drivers/nvme/target/io-cmd-bdev.c
>> @@ -63,6 +63,14 @@ static void nvmet_bdev_ns_enable_integrity(struct nvmet_ns *ns)
>> }
>> }
>>
>> +void nvmet_bdev_ns_disable(struct nvmet_ns *ns)
>> +{
>> + if (ns->bdev) {
>> + blkdev_put(ns->bdev, FMODE_WRITE | FMODE_READ);
>> + ns->bdev = NULL;
>> + }
>> +}
>> +
>> int nvmet_bdev_ns_enable(struct nvmet_ns *ns)
>> {
>> int ret;
>> @@ -86,15 +94,15 @@ int nvmet_bdev_ns_enable(struct nvmet_ns *ns)
>> if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY_T10))
>> nvmet_bdev_ns_enable_integrity(ns);
>>
>> - return 0;
>> -}
>> -
>> -void nvmet_bdev_ns_disable(struct nvmet_ns *ns)
>> -{
>> - if (ns->bdev) {
>> - blkdev_put(ns->bdev, FMODE_WRITE | FMODE_READ);
>> - ns->bdev = NULL;
>> + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && bdev_is_zoned(ns->bdev)) {
>> + if (!nvmet_bdev_zns_enable(ns)) {
>> + nvmet_bdev_ns_disable(ns);
>> + return -EINVAL;
>> + }
>> + ns->csi = NVME_CSI_ZNS;
>> }
>> +
>> + return 0;
>> }
>>
>> void nvmet_bdev_ns_revalidate(struct nvmet_ns *ns)
>> @@ -448,6 +456,15 @@ u16 nvmet_bdev_parse_io_cmd(struct nvmet_req *req)
>> case nvme_cmd_write_zeroes:
>> req->execute = nvmet_bdev_execute_write_zeroes;
>> return 0;
>> + case nvme_cmd_zone_append:
>> + req->execute = nvmet_bdev_execute_zone_append;
>> + return 0;
>> + case nvme_cmd_zone_mgmt_recv:
>> + req->execute = nvmet_bdev_execute_zone_mgmt_recv;
>> + return 0;
>> + case nvme_cmd_zone_mgmt_send:
>> + req->execute = nvmet_bdev_execute_zone_mgmt_send;
>> + return 0;
>> default:
>> pr_err("unhandled cmd %d on qid %d\n", cmd->common.opcode,
>> req->sq->qid);
>> diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
>> index 0360594abd93..dae6ecba6780 100644
>> --- a/drivers/nvme/target/nvmet.h
>> +++ b/drivers/nvme/target/nvmet.h
>> @@ -252,6 +252,10 @@ struct nvmet_subsys {
>> unsigned int admin_timeout;
>> unsigned int io_timeout;
>> #endif /* CONFIG_NVME_TARGET_PASSTHRU */
>> +
>> +#ifdef CONFIG_BLK_DEV_ZONED
>> + u8 zasl;
>> +#endif /* CONFIG_BLK_DEV_ZONED */
>> };
>>
>> static inline struct nvmet_subsys *to_subsys(struct config_item *item)
>> @@ -614,4 +618,38 @@ static inline sector_t nvmet_lba_to_sect(struct nvmet_ns *ns, __le64 lba)
>> return le64_to_cpu(lba) << (ns->blksize_shift - SECTOR_SHIFT);
>> }
>>
>> +#ifdef CONFIG_BLK_DEV_ZONED
>> +bool nvmet_bdev_zns_enable(struct nvmet_ns *ns);
>> +void nvmet_execute_identify_cns_cs_ctrl(struct nvmet_req *req);
>> +void nvmet_execute_identify_cns_cs_ns(struct nvmet_req *req);
>> +void nvmet_bdev_execute_zone_mgmt_recv(struct nvmet_req *req);
>> +void nvmet_bdev_execute_zone_mgmt_send(struct nvmet_req *req);
>> +void nvmet_bdev_execute_zone_append(struct nvmet_req *req);
>> +#else /* CONFIG_BLK_DEV_ZONED */
>> +static inline bool nvmet_bdev_zns_enable(struct nvmet_ns *ns)
>> +{
>> + return false;
>> +}
>> +static inline void
>> +nvmet_execute_identify_cns_cs_ctrl(struct nvmet_req *req)
>> +{
>> +}
>> +static inline void
>> +nvmet_execute_identify_cns_cs_ns(struct nvmet_req *req)
>> +{
>> +}
>> +static inline void
>> +nvmet_bdev_execute_zone_mgmt_recv(struct nvmet_req *req)
>> +{
>> +}
>> +static inline void
>> +nvmet_bdev_execute_zone_mgmt_send(struct nvmet_req *req)
>> +{
>> +}
>> +static inline void
>> +nvmet_bdev_execute_zone_append(struct nvmet_req *req)
>> +{
>> +}
>> +#endif /* CONFIG_BLK_DEV_ZONED */
>> +
>> #endif /* _NVMET_H */
>> diff --git a/drivers/nvme/target/zns.c b/drivers/nvme/target/zns.c
>> new file mode 100644
>> index 000000000000..ae51bae996f9
>> --- /dev/null
>> +++ b/drivers/nvme/target/zns.c
>> @@ -0,0 +1,327 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +/*
>> + * NVMe ZNS-ZBD command implementation.
>> + * Copyright (c) 2020-2021 HGST, a Western Digital Company.
>> + */
>> +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
>> +#include <linux/uio.h>
>> +#include <linux/nvme.h>
>> +#include <linux/xarray.h>
>> +#include <linux/blkdev.h>
>> +#include <linux/module.h>
>> +#include "nvmet.h"
>> +
>> +/*
>> + * We set the Memory Page Size Minimum (MPSMIN) for target controller to 0
>> + * which gets added by 12 in the nvme_enable_ctrl() which results in 2^12 = 4k
>> + * as page_shift value. When calculating the ZASL use shift by 12.
>> + */
>> +#define NVMET_MPSMIN_SHIFT 12
>> +
>> +static u16 nvmet_bdev_zns_checks(struct nvmet_req *req)
>> +{
>> + u16 status = 0;
>> +
>> + if (!bdev_is_zoned(req->ns->bdev)) {
>> + status = NVME_SC_INVALID_NS | NVME_SC_DNR;
>> + goto out;
>> + }
>> +
>> + if (req->cmd->zmr.zra != NVME_ZRA_ZONE_REPORT) {
>> + status = NVME_SC_INVALID_FIELD;
>> + goto out;
>> + }
>> +
>> + if (req->cmd->zmr.zrasf != NVME_ZRASF_ZONE_REPORT_ALL) {
>> + status = NVME_SC_INVALID_FIELD;
>> + goto out;
>> + }
>> +
>> + if (req->cmd->zmr.pr != NVME_REPORT_ZONE_PARTIAL)
>> + status = NVME_SC_INVALID_FIELD;
>> +
>> +out:
>> + return status;
>> +}
> I really fail to see how the gotos here help with the code "being better".
> Are you absolutely sure you want to keep this super convoluted style for a
> function that is that simple ?
>
> +
> +/*
> + * ZNS related command implementation and helpers.
> + */
> +
> +static inline u8 nvmet_zasl(unsigned int zone_append_sects)
> +{
> + /*
> + * Zone Append Size Limit is the value experessed in the units
> + * of minimum memory page size (i.e. 12) and is reported power of 2.
> + */
> + return ilog2((zone_append_sects << 9) >> NVMET_MPSMIN_SHIFT);
> +}
> +
> +static inline bool nvmet_zns_update_zasl(struct nvmet_ns *ns)
> +{
> + struct request_queue *q = ns->bdev->bd_disk->queue;
> + u8 zasl = nvmet_zasl(queue_max_zone_append_sectors(q));
> +
> + if (ns->subsys->zasl)
> + return ns->subsys->zasl < zasl ? false : true;
> +
> + ns->subsys->zasl = zasl;
> + return true;
> +}
> +
> +bool nvmet_bdev_zns_enable(struct nvmet_ns *ns)
> +{
> + if (ns->bdev->bd_disk->queue->conv_zones_bitmap) {
> Hmm... BIO based DM devices do not have this bitmap set since they do not have a
> scheduler. So if one setup a dm-linear device on top of an SMR disk and export
> the DM device through fabric, then this check will fail to verify if
> conventional zones are present. There may be no other option than to do a full
> report zones here if queue->seq_zones_wlock is NULL (meaning the queue is for a
> stacked device).
If I'm not wrong, each LLD calls report zones at disk revalidation,
so we should be able to reuse that instead of repeating it for each zbd ns,
especially for a static property:-
1. drivers/block/null_blk_zoned.c:-
null_register_zoned_dev int
ret = blk_revalidate_disk_zones(nullb->disk, NULL);
2. drivers/nvme/host/zns.c:-
nvme_revalidate_zones
ret = blk_revalidate_disk_zones(ns->disk, NULL);
3. drivers/scsi/sd_zbc.c:-
sd_zbc_revalidate_zones
ret = blk_revalidate_disk_zones(disk, sd_zbc_revalidate_zones_cb);
Calling report zones again duplicates that work, consuming CPU cycles for
each zbd ns.
Unless something is wrong, we can get away with the following prep patch with
one call in zns.c :-
>From abceef7bfdf9b278c492c755bf5f242159ef51e5 Mon Sep 17 00:00:00 2001
From: Chaitanya Kulkarni <chaitanya.kulkarni at wdc.com>
Date: Fri, 11 Dec 2020 21:21:44 -0800
Subject: [PATCH V6 2/7] block: add nr_conv_zones and nr_seq_zones helpers
Add two request_queue members that are needed to implement the NVMeOF ZBD
backend, which exports a number of conventional zones and a number of
sequential zones, so we don't have to repeat the work that
blk_revalidate_disk_zones() already does.
Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni at wdc.com>
---
block/blk-sysfs.c | 14 ++++++++++++++
block/blk-zoned.c | 9 +++++++++
include/linux/blkdev.h | 13 +++++++++++++
3 files changed, 36 insertions(+)
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index b513f1683af0..f10cf45ae177 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -307,6 +307,16 @@ static ssize_t queue_nr_zones_show(struct
request_queue *q, char *page)
return queue_var_show(blk_queue_nr_zones(q), page);
}
+static ssize_t queue_nr_conv_zones_show(struct request_queue *q, char
*page)
+{
+ return queue_var_show(blk_queue_nr_conv_zones(q), page);
+}
+
+static ssize_t queue_nr_seq_zones_show(struct request_queue *q, char *page)
+{
+ return queue_var_show(blk_queue_nr_seq_zones(q), page);
+}
+
static ssize_t queue_max_open_zones_show(struct request_queue *q, char
*page)
{
return queue_var_show(queue_max_open_zones(q), page);
@@ -588,6 +598,8 @@ QUEUE_RO_ENTRY(queue_zone_append_max,
"zone_append_max_bytes");
QUEUE_RO_ENTRY(queue_zoned, "zoned");
QUEUE_RO_ENTRY(queue_nr_zones, "nr_zones");
+QUEUE_RO_ENTRY(queue_nr_conv_zones, "nr_conv_zones");
+QUEUE_RO_ENTRY(queue_nr_seq_zones, "nr_seq_zones");
QUEUE_RO_ENTRY(queue_max_open_zones, "max_open_zones");
QUEUE_RO_ENTRY(queue_max_active_zones, "max_active_zones");
@@ -642,6 +654,8 @@ static struct attribute *queue_attrs[] = {
&queue_nonrot_entry.attr,
&queue_zoned_entry.attr,
&queue_nr_zones_entry.attr,
+ &queue_nr_conv_zones_entry.attr,
+ &queue_nr_seq_zones_entry.attr,
&queue_max_open_zones_entry.attr,
&queue_max_active_zones_entry.attr,
&queue_nomerges_entry.attr,
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 6817a673e5ce..ea38c7928e41 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -390,6 +390,8 @@ struct blk_revalidate_zone_args {
unsigned long *conv_zones_bitmap;
unsigned long *seq_zones_wlock;
unsigned int nr_zones;
+ unsigned int nr_conv_zones;
+ unsigned int nr_seq_zones;
sector_t zone_sectors;
sector_t sector;
};
@@ -449,6 +451,7 @@ static int blk_revalidate_zone_cb(struct blk_zone
*zone, unsigned int idx,
return -ENOMEM;
}
set_bit(idx, args->conv_zones_bitmap);
+ args->nr_conv_zones++;
break;
case BLK_ZONE_TYPE_SEQWRITE_REQ:
case BLK_ZONE_TYPE_SEQWRITE_PREF:
@@ -458,6 +461,7 @@ static int blk_revalidate_zone_cb(struct blk_zone
*zone, unsigned int idx,
if (!args->seq_zones_wlock)
return -ENOMEM;
}
+ args->nr_seq_zones++;
break;
default:
pr_warn("%s: Invalid zone type 0x%x at sectors %llu\n",
@@ -489,6 +493,9 @@ int blk_revalidate_disk_zones(struct gendisk *disk,
struct request_queue *q = disk->queue;
struct blk_revalidate_zone_args args = {
.disk = disk,
+ /* just for readability */
+ .nr_conv_zones = 0,
+ .nr_seq_zones = 0,
};
unsigned int noio_flag;
int ret;
@@ -519,6 +526,8 @@ int blk_revalidate_disk_zones(struct gendisk *disk,
if (ret >= 0) {
blk_queue_chunk_sectors(q, args.zone_sectors);
q->nr_zones = args.nr_zones;
+ q->nr_conv_zones = args.nr_conv_zones;
+ q->nr_seq_zones = args.nr_seq_zones;
swap(q->seq_zones_wlock, args.seq_zones_wlock);
swap(q->conv_zones_bitmap, args.conv_zones_bitmap);
if (update_driver_data)
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 2bdaa7cacfa3..697ded01e049 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -526,6 +526,9 @@ struct request_queue {
unsigned long *seq_zones_wlock;
unsigned int max_open_zones;
unsigned int max_active_zones;
+ unsigned int nr_conv_zones;
+ unsigned int nr_seq_zones;
+
#endif /* CONFIG_BLK_DEV_ZONED */
/*
@@ -726,6 +729,16 @@ static inline unsigned int
blk_queue_nr_zones(struct request_queue *q)
return blk_queue_is_zoned(q) ? q->nr_zones : 0;
}
+static inline unsigned int blk_queue_nr_conv_zones(struct request_queue *q)
+{
+ return blk_queue_is_zoned(q) ? q->nr_conv_zones : 0;
+}
+
+static inline unsigned int blk_queue_nr_seq_zones(struct request_queue *q)
+{
+ return blk_queue_is_zoned(q) ? q->nr_seq_zones : 0;
+}
+
static inline unsigned int blk_queue_zone_no(struct request_queue *q,
sector_t sector)
{
--
2.22.1
>> + pr_err("block devices with conventional zones are not supported.");
>> + return false;
>> + }
>> +
>> + /*
>> + * For ZBC and ZAC devices, writes into sequential zones must be aligned
>> + * to the device physical block size. So use this value as the logical
>> + * block size to avoid errors.
>> + */
>> + ns->blksize_shift = blksize_bits(bdev_physical_block_size(ns->bdev));
>> +
>> + if (!nvmet_zns_update_zasl(ns))
>> + return false;
>> +
>> + return !(get_capacity(ns->bdev->bd_disk) &
>> + (bdev_zone_sectors(ns->bdev) - 1));
>> +}
>> +
>> +/*
>> + * ZNS related Admin and I/O command handlers.
>> + */
>> +void nvmet_execute_identify_cns_cs_ctrl(struct nvmet_req *req)
>> +{
>> + u8 zasl = req->sq->ctrl->subsys->zasl;
>> + struct nvmet_ctrl *ctrl = req->sq->ctrl;
>> + struct nvme_id_ctrl_zns *id;
>> + u16 status;
>> +
>> + id = kzalloc(sizeof(*id), GFP_KERNEL);
>> + if (!id) {
>> + status = NVME_SC_INTERNAL;
>> + goto out;
>> + }
>> +
>> + if (ctrl->ops->get_mdts)
>> + id->zasl = min_t(u8, ctrl->ops->get_mdts(ctrl), zasl);
>> + else
>> + id->zasl = zasl;
>> +
>> + status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id));
>> +
>> + kfree(id);
>> +out:
>> + nvmet_req_complete(req, status);
>> +}
>> +
>> +void nvmet_execute_identify_cns_cs_ns(struct nvmet_req *req)
>> +{
>> + struct nvme_id_ns_zns *id_zns;
>> + u16 status = 0;
>> + u64 zsze;
>> +
>> + if (le32_to_cpu(req->cmd->identify.nsid) == NVME_NSID_ALL) {
>> + req->error_loc = offsetof(struct nvme_identify, nsid);
>> + status = NVME_SC_INVALID_NS | NVME_SC_DNR;
>> + goto out;
>> + }
>> +
>> + id_zns = kzalloc(sizeof(*id_zns), GFP_KERNEL);
>> + if (!id_zns) {
>> + status = NVME_SC_INTERNAL;
>> + goto out;
>> + }
>> +
>> + req->ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->identify.nsid);
>> + if (!req->ns) {
>> + status = NVME_SC_INTERNAL;
>> + goto done;
>> + }
>> +
>> + if (!bdev_is_zoned(req->ns->bdev)) {
>> + req->error_loc = offsetof(struct nvme_identify, nsid);
>> + status = NVME_SC_INVALID_NS | NVME_SC_DNR;
>> + goto done;
>> + }
>> +
>> + nvmet_ns_revalidate(req->ns);
>> + zsze = (bdev_zone_sectors(req->ns->bdev) << 9) >>
>> + req->ns->blksize_shift;
>> + id_zns->lbafe[0].zsze = cpu_to_le64(zsze);
>> + id_zns->mor = cpu_to_le32(bdev_max_open_zones(req->ns->bdev));
>> + id_zns->mar = cpu_to_le32(bdev_max_active_zones(req->ns->bdev));
>> +
>> +done:
>> + status = nvmet_copy_to_sgl(req, 0, id_zns, sizeof(*id_zns));
>> + kfree(id_zns);
>> +out:
>> + nvmet_req_complete(req, status);
>> +}
>> +
>> +struct nvmet_report_zone_data {
>> + struct nvmet_ns *ns;
>> + struct nvme_zone_report *rz;
>> +};
>> +
>> +static int nvmet_bdev_report_zone_cb(struct blk_zone *z, unsigned int idx,
>> + void *data)
>> +{
>> + struct nvmet_report_zone_data *report_zone_data = data;
>> + struct nvme_zone_descriptor *entries = report_zone_data->rz->entries;
>> + struct nvmet_ns *ns = report_zone_data->ns;
>> +
>> + entries[idx].zcap = cpu_to_le64(nvmet_sect_to_lba(ns, z->capacity));
>> + entries[idx].zslba = cpu_to_le64(nvmet_sect_to_lba(ns, z->start));
>> + entries[idx].wp = cpu_to_le64(nvmet_sect_to_lba(ns, z->wp));
>> + entries[idx].za = z->reset ? 1 << 2 : 0;
>> + entries[idx].zt = z->type;
>> + entries[idx].zs = z->cond << 4;
>> +
>> + return 0;
>> +}
>> +
>> +void nvmet_bdev_execute_zone_mgmt_recv(struct nvmet_req *req)
>> +{
>> + sector_t sect = nvmet_lba_to_sect(req->ns, req->cmd->zmr.slba);
>> + u64 bufsize = (le32_to_cpu(req->cmd->zmr.numd) + 1) << 2;
>> + struct nvmet_report_zone_data data = { .ns = req->ns };
>> + unsigned int nr_zones;
>> + int reported_zones;
>> + u16 status;
>> +
>> + nr_zones = (bufsize - sizeof(struct nvme_zone_report)) /
>> + sizeof(struct nvme_zone_descriptor);
> This is a 64 bits division. This will not compile on 32-bits arch, no ? I.e.,
> you must use do_div64() here I think. Or just a bit shift since struct
> nvme_zone_descriptor is exactly 64B. Or have bufsize be a 32 bits. There is no
> need for that variable to be 64 bits.
I'll make it u32.
>> +
>> + status = nvmet_bdev_zns_checks(req);
>> + if (status)
>> + goto out;
>> +
>> + data.rz = __vmalloc(bufsize, GFP_KERNEL | __GFP_NORETRY);
>> + if (!data.rz) {
>> + status = NVME_SC_INTERNAL;
>> + goto out;
>> + }
>> +
>> + reported_zones = blkdev_report_zones(req->ns->bdev, sect, nr_zones,
>> + nvmet_bdev_report_zone_cb,
>> + &data);
>> + if (reported_zones < 0) {
>> + status = NVME_SC_INTERNAL;
>> + goto out_free_report_zones;
>> + }
>> +
>> + data.rz->nr_zones = cpu_to_le64(reported_zones);
>> +
>> + status = nvmet_copy_to_sgl(req, 0, data.rz, bufsize);
> reported_zoned may be far less than the request nr_zones. So you may want to
> recalculate bufsize before doing this to not copy garbage data into the reply
> buffer. Another thing that needs verification is: don't you need to zero-oput
> the data.rz buffer with GFP_ZERO, for security ?
With __GFP_ZERO used (for data.rz) we need to use the entire buffer size
so that it will zero out the command buffer that we get from the host. If we
only use the updated buffer size as a function of reported_zones, then the
rest of the buffer will not get zeroed.
>
>> +
>> +out_free_report_zones:
>> + kvfree(data.rz);
>> +out:
>> + nvmet_req_complete(req, status);
>> +}
>> +
>> +void nvmet_bdev_execute_zone_mgmt_send(struct nvmet_req *req)
>> +{
>> + sector_t sect = nvmet_lba_to_sect(req->ns, req->cmd->zms.slba);
>> + sector_t nr_sect = bdev_zone_sectors(req->ns->bdev);
>> + enum req_opf op = REQ_OP_LAST;
>> + u16 status = NVME_SC_SUCCESS;
>> + int ret;
>> +
>> + if (req->cmd->zms.select_all)
>> + nr_sect = get_capacity(req->ns->bdev->bd_disk);
>> +
>> + switch (req->cmd->zms.zsa) {
>> + case NVME_ZONE_OPEN:
>> + op = REQ_OP_ZONE_OPEN;
>> + break;
>> + case NVME_ZONE_CLOSE:
>> + op = REQ_OP_ZONE_CLOSE;
>> + break;
>> + case NVME_ZONE_FINISH:
>> + op = REQ_OP_ZONE_FINISH;
>> + break;
>> + case NVME_ZONE_RESET:
>> + op = REQ_OP_ZONE_RESET;
>> + break;
>> + default:
>> + status = NVME_SC_INVALID_FIELD;
>> + goto out;
>> + }
>> +
>> + ret = blkdev_zone_mgmt(req->ns->bdev, op, sect, nr_sect, GFP_KERNEL);
>> + if (ret)
>> + status = NVME_SC_INTERNAL;
>> +out:
>> + nvmet_req_complete(req, status);
>> +}
>> +
>> +void nvmet_bdev_execute_zone_append(struct nvmet_req *req)
>> +{
>> + sector_t sect = nvmet_lba_to_sect(req->ns, req->cmd->rw.slba);
>> + struct request_queue *q = req->ns->bdev->bd_disk->queue;
>> + unsigned int max_sects = queue_max_zone_append_sectors(q);
>> + u16 status = NVME_SC_SUCCESS;
>> + unsigned int total_len = 0;
>> + struct scatterlist *sg;
>> + int ret = 0, sg_cnt;
>> + struct bio *bio;
>> +
>> + if (!nvmet_check_transfer_len(req, nvmet_rw_data_len(req)))
>> + return;
>> +
>> + if (!req->sg_cnt) {
>> + nvmet_req_complete(req, 0);
>> + return;
>> + }
>> +
>> + if (req->transfer_len <= NVMET_MAX_INLINE_DATA_LEN) {
>> + bio = &req->b.inline_bio;
>> + bio_init(bio, req->inline_bvec, ARRAY_SIZE(req->inline_bvec));
>> + } else {
>> + bio = bio_alloc(GFP_KERNEL, req->sg_cnt);
>> + }
>> +
>> + bio_set_dev(bio, req->ns->bdev);
>> + bio->bi_iter.bi_sector = sect;
>> + bio->bi_opf = REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE;
>> + if (req->cmd->rw.control & cpu_to_le16(NVME_RW_FUA))
>> + bio->bi_opf |= REQ_FUA;
>> +
>> + for_each_sg(req->sg, sg, req->sg_cnt, sg_cnt) {
>> + struct page *p = sg_page(sg);
>> + unsigned int l = sg->length;
>> + unsigned int o = sg->offset;
>> + bool same_page = false;
>> +
>> + ret = bio_add_hw_page(q, bio, p, l, o, max_sects, &same_page);
>> + if (ret != sg->length) {
>> + status = NVME_SC_INTERNAL;
>> + goto out_bio_put;
>> + }
>> + if (same_page)
>> + put_page(p);
>> +
>> + total_len += sg->length;
>> + }
>> +
>> + if (total_len != nvmet_rw_data_len(req)) {
>> + status = NVME_SC_INTERNAL | NVME_SC_DNR;
>> + goto out_bio_put;
>> + }
>> +
>> + ret = submit_bio_wait(bio);
>> + status = ret < 0 ? NVME_SC_INTERNAL : status;
>> +
>> + sect += (total_len >> 9);
> This is incorrect: sect is the start sector of the append operation target zone.
> Adding to it the request size does not give the actual sector written on the
> backend. You need to get that from the completed BIO. And do not add the request
> size. The result is the first written sector, not the sector following the last
> one written.
Yes, indeed it should be bio->bi_iter.bi_sector; correct me if I'm wrong.
>> + req->cqe->result.u64 = cpu_to_le64(nvmet_sect_to_lba(req->ns, sect));
>> +
>> +out_bio_put:
>> + if (bio != &req->b.inline_bio)
>> + bio_put(bio);
>> + nvmet_req_complete(req, status);
>> +}
>>
>
More information about the Linux-nvme
mailing list