[PATCHv4 5/5] nvme: support for zoned namespaces
Himanshu Madhani
himanshu.madhani at oracle.com
Tue Jun 30 12:36:47 EDT 2020
> On Jun 29, 2020, at 2:06 PM, Keith Busch <kbusch at kernel.org> wrote:
>
> From: Keith Busch <keith.busch at wdc.com>
>
> Add support for NVM Express Zoned Namespaces (ZNS) Command Set defined
> in NVM Express TP4053. Zoned namespaces are discovered based on their
> Command Set Identifier reported in the namespace's Namespace
> Identification Descriptor list. A successfully discovered Zoned
> Namespace will be registered with the block layer as a host-managed
> zoned block device with Zone Append command support. A namespace that
> does not support append is not supported by the driver.
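A practical upshot of registering through the standard zoned block
interface: existing zoned tooling (util-linux's blkzone for report and
reset, fio's zonemode=zbd) should drive a ZNS namespace unmodified,
since everything below funnels through the same REQ_OP_ZONE_* and
->report_zones paths as ZAC/ZBC devices.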
>
> Reviewed-by: Martin K. Petersen <martin.petersen at oracle.com>
> Reviewed-by: Johannes Thumshirn <johannes.thumshirn at wdc.com>
> Reviewed-by: Hannes Reinecke <hare at suse.de>
> Reviewed-by: Sagi Grimberg <sagi at grimberg.me>
> Reviewed-by: Javier González <javier.gonz at samsung.com>
> Signed-off-by: Hans Holmberg <hans.holmberg at wdc.com>
> Signed-off-by: Dmitry Fomichev <dmitry.fomichev at wdc.com>
> Signed-off-by: Ajay Joshi <ajay.joshi at wdc.com>
> Signed-off-by: Aravind Ramesh <aravind.ramesh at wdc.com>
> Signed-off-by: Niklas Cassel <niklas.cassel at wdc.com>
> Signed-off-by: Matias Bjørling <matias.bjorling at wdc.com>
> Signed-off-by: Damien Le Moal <damien.lemoal at wdc.com>
> Signed-off-by: Keith Busch <keith.busch at wdc.com>
> ---
> block/Kconfig | 5 +-
> drivers/nvme/host/Makefile | 1 +
> drivers/nvme/host/core.c | 97 ++++++++++++--
> drivers/nvme/host/nvme.h | 39 ++++++
> drivers/nvme/host/zns.c | 253 +++++++++++++++++++++++++++++++++++++
> include/linux/nvme.h | 111 ++++++++++++++++
> 6 files changed, 491 insertions(+), 15 deletions(-)
> create mode 100644 drivers/nvme/host/zns.c
>
> diff --git a/block/Kconfig b/block/Kconfig
> index 9357d7302398..bbad5e8bbffe 100644
> --- a/block/Kconfig
> +++ b/block/Kconfig
> @@ -86,9 +86,10 @@ config BLK_DEV_ZONED
> select MQ_IOSCHED_DEADLINE
> help
> Block layer zoned block device support. This option enables
> - support for ZAC/ZBC host-managed and host-aware zoned block devices.
> + support for ZAC/ZBC/ZNS host-managed and host-aware zoned block
> + devices.
>
> - Say yes here if you have a ZAC or ZBC storage device.
> + Say yes here if you have a ZAC, ZBC, or ZNS storage device.
>
> config BLK_DEV_THROTTLING
> bool "Block layer bio throttling support"
> diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile
> index fc7b26be692d..d7f6a87687b8 100644
> --- a/drivers/nvme/host/Makefile
> +++ b/drivers/nvme/host/Makefile
> @@ -13,6 +13,7 @@ nvme-core-y := core.o
> nvme-core-$(CONFIG_TRACING) += trace.o
> nvme-core-$(CONFIG_NVME_MULTIPATH) += multipath.o
> nvme-core-$(CONFIG_NVM) += lightnvm.o
> +nvme-core-$(CONFIG_BLK_DEV_ZONED) += zns.o
> nvme-core-$(CONFIG_FAULT_INJECTION_DEBUG_FS) += fault_inject.o
> nvme-core-$(CONFIG_NVME_HWMON) += hwmon.o
>
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index a4a61d7e793d..02b4273a0b10 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -89,7 +89,7 @@ static dev_t nvme_chr_devt;
> static struct class *nvme_class;
> static struct class *nvme_subsys_class;
>
> -static int nvme_revalidate_disk(struct gendisk *disk);
> +static int _nvme_revalidate_disk(struct gendisk *disk);
> static void nvme_put_subsystem(struct nvme_subsystem *subsys);
> static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
> unsigned nsid);
> @@ -287,6 +287,10 @@ void nvme_complete_rq(struct request *req)
> nvme_retry_req(req);
> return;
> }
> + } else if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
> + req_op(req) == REQ_OP_ZONE_APPEND) {
> + req->__sector = nvme_lba_to_sect(req->q->queuedata,
> + le64_to_cpu(nvme_req(req)->result.u64));
> }
>
> nvme_trace_bio_complete(req, status);
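This hunk is subtle enough to deserve a note: Zone Append completions
return the LBA the device actually wrote in dwords 0/1 of the CQE
(result.u64 here), and copying it into req->__sector is what lets the
block layer hand the final location back to the submitter. A minimal
sketch of the consumer side, assuming a kernel caller that issued
REQ_OP_ZONE_APPEND (names are illustrative, not from this series):

    #include <linux/bio.h>

    static void example_append_end_io(struct bio *bio)
    {
            /*
             * For zone append, bi_sector has been rewritten to the
             * sector the device chose, not the zone-start sector the
             * caller submitted against.
             */
            sector_t written = bio->bi_iter.bi_sector;

            pr_debug("append landed at sector %llu\n",
                     (unsigned long long)written);
            bio_put(bio);
    }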
> @@ -673,7 +677,8 @@ static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns,
> }
>
> static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
> - struct request *req, struct nvme_command *cmnd)
> + struct request *req, struct nvme_command *cmnd,
> + enum nvme_opcode op)
> {
> struct nvme_ctrl *ctrl = ns->ctrl;
> u16 control = 0;
> @@ -687,7 +692,7 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
> if (req->cmd_flags & REQ_RAHEAD)
> dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
>
> - cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read);
> + cmnd->rw.opcode = op;
> cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id);
> cmnd->rw.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
> cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
> @@ -716,6 +721,8 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
> case NVME_NS_DPS_PI_TYPE2:
> control |= NVME_RW_PRINFO_PRCHK_GUARD |
> NVME_RW_PRINFO_PRCHK_REF;
> + if (op == nvme_cmd_zone_append)
> + control |= NVME_RW_APPEND_PIREMAP;
> cmnd->rw.reftag = cpu_to_le32(t10_pi_ref_tag(req));
> break;
> }
> @@ -756,6 +763,19 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
> case REQ_OP_FLUSH:
> nvme_setup_flush(ns, cmd);
> break;
> + case REQ_OP_ZONE_RESET_ALL:
> + case REQ_OP_ZONE_RESET:
> + ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_RESET);
> + break;
> + case REQ_OP_ZONE_OPEN:
> + ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_OPEN);
> + break;
> + case REQ_OP_ZONE_CLOSE:
> + ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_CLOSE);
> + break;
> + case REQ_OP_ZONE_FINISH:
> + ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_FINISH);
> + break;
> case REQ_OP_WRITE_ZEROES:
> ret = nvme_setup_write_zeroes(ns, req, cmd);
> break;
> @@ -763,8 +783,13 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
> ret = nvme_setup_discard(ns, req, cmd);
> break;
> case REQ_OP_READ:
> + ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_read);
> + break;
> case REQ_OP_WRITE:
> - ret = nvme_setup_rw(ns, req, cmd);
> + ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_write);
> + break;
> + case REQ_OP_ZONE_APPEND:
> + ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_zone_append);
> break;
> default:
> WARN_ON_ONCE(1);
> @@ -1398,14 +1423,23 @@ static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
> return effects;
> }
>
> -static void nvme_update_formats(struct nvme_ctrl *ctrl)
> +static void nvme_update_formats(struct nvme_ctrl *ctrl, u32 *effects)
> {
> struct nvme_ns *ns;
>
> down_read(&ctrl->namespaces_rwsem);
> list_for_each_entry(ns, &ctrl->namespaces, list)
> - if (ns->disk && nvme_revalidate_disk(ns->disk))
> + if (ns->disk && _nvme_revalidate_disk(ns->disk))
> nvme_set_queue_dying(ns);
> + else if (blk_queue_is_zoned(ns->disk->queue)) {
> + /*
> + * IO commands are required to fully revalidate a zoned
> + * device. Force the command effects to trigger rescan
> + * work so report zones can run in a context with
> + * unfrozen IO queues.
> + */
> + *effects |= NVME_CMD_EFFECTS_NCC;
> + }
> up_read(&ctrl->namespaces_rwsem);
> }
>
> @@ -1417,7 +1451,7 @@ static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects)
> * this command.
> */
> if (effects & NVME_CMD_EFFECTS_LBCC)
> - nvme_update_formats(ctrl);
> + nvme_update_formats(ctrl, &effects);
> if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) {
> nvme_unfreeze(ctrl);
> nvme_mpath_unfreeze(ctrl->subsys);
> @@ -1532,7 +1566,7 @@ static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
> * Issue ioctl requests on the first available path. Note that unlike normal
> * block layer requests we will not retry failed request on another controller.
> */
> -static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
> +struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
> struct nvme_ns_head **head, int *srcu_idx)
> {
> #ifdef CONFIG_NVME_MULTIPATH
> @@ -1552,7 +1586,7 @@ static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
> return disk->private_data;
> }
>
> -static void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx)
> +void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx)
> {
> if (head)
> srcu_read_unlock(&head->srcu, idx);
> @@ -1945,23 +1979,34 @@ static void nvme_update_disk_info(struct gendisk *disk,
>
> static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
> {
> + unsigned lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK;
> struct nvme_ns *ns = disk->private_data;
> struct nvme_ctrl *ctrl = ns->ctrl;
> + int ret;
> u32 iob;
>
> /*
> * If identify namespace failed, use default 512 byte block size so
> * block layer can use before failing read/write for 0 capacity.
> */
> - ns->lba_shift = id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ds;
> + ns->lba_shift = id->lbaf[lbaf].ds;
> if (ns->lba_shift == 0)
> ns->lba_shift = 9;
>
> switch (ns->head->ids.csi) {
> case NVME_CSI_NVM:
> break;
> + case NVME_CSI_ZNS:
> + ret = nvme_update_zone_info(disk, ns, lbaf);
> + if (ret) {
> + dev_warn(ctrl->device,
> + "failed to add zoned namespace:%u ret:%d\n",
> + ns->head->ns_id, ret);
> + return ret;
> + }
> + break;
> default:
> - dev_warn(ctrl->device, "unknown csi:%d ns:%d\n",
> + dev_warn(ctrl->device, "unknown csi:%u ns:%u\n",
> ns->head->ids.csi, ns->head->ns_id);
> return -ENODEV;
> }
> @@ -1973,7 +2018,7 @@ static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
> iob = nvme_lba_to_sect(ns, le16_to_cpu(id->noiob));
>
> ns->features = 0;
> - ns->ms = le16_to_cpu(id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ms);
> + ns->ms = le16_to_cpu(id->lbaf[lbaf].ms);
> /* the PI implementation requires metadata equal t10 pi tuple size */
> if (ns->ms == sizeof(struct t10_pi_tuple))
> ns->pi_type = id->dps & NVME_NS_DPS_PI_MASK;
> @@ -2015,7 +2060,7 @@ static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
> return 0;
> }
>
> -static int nvme_revalidate_disk(struct gendisk *disk)
> +static int _nvme_revalidate_disk(struct gendisk *disk)
> {
> struct nvme_ns *ns = disk->private_data;
> struct nvme_ctrl *ctrl = ns->ctrl;
> @@ -2063,6 +2108,28 @@ static int nvme_revalidate_disk(struct gendisk *disk)
> return ret;
> }
>
> +static int nvme_revalidate_disk(struct gendisk *disk)
> +{
> + int ret;
> +
> + ret = _nvme_revalidate_disk(disk);
> + if (ret)
> + return ret;
> +
> +#ifdef CONFIG_BLK_DEV_ZONED
> + if (blk_queue_is_zoned(disk->queue)) {
> + struct nvme_ns *ns = disk->private_data;
> + struct nvme_ctrl *ctrl = ns->ctrl;
> +
> + ret = blk_revalidate_disk_zones(disk, NULL);
> + if (!ret)
> + blk_queue_max_zone_append_sectors(disk->queue,
> + ctrl->max_zone_append);
> + }
> +#endif
> + return ret;
> +}
> +
> static char nvme_pr_type(enum pr_type type)
> {
> switch (type) {
> @@ -2193,6 +2260,7 @@ static const struct block_device_operations nvme_fops = {
> .release = nvme_release,
> .getgeo = nvme_getgeo,
> .revalidate_disk= nvme_revalidate_disk,
> + .report_zones = nvme_report_zones,
> .pr_ops = &nvme_pr_ops,
> };
>
> @@ -2218,6 +2286,7 @@ const struct block_device_operations nvme_ns_head_ops = {
> .ioctl = nvme_ioctl,
> .compat_ioctl = nvme_compat_ioctl,
> .getgeo = nvme_getgeo,
> + .report_zones = nvme_report_zones,
> .pr_ops = &nvme_pr_ops,
> };
> #endif /* CONFIG_NVME_MULTIPATH */
> @@ -4445,6 +4514,8 @@ static inline void _nvme_check_size(void)
> BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
> BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != NVME_IDENTIFY_DATA_SIZE);
> BUILD_BUG_ON(sizeof(struct nvme_id_ns) != NVME_IDENTIFY_DATA_SIZE);
> + BUILD_BUG_ON(sizeof(struct nvme_id_ns_zns) != NVME_IDENTIFY_DATA_SIZE);
> + BUILD_BUG_ON(sizeof(struct nvme_id_ctrl_zns) != NVME_IDENTIFY_DATA_SIZE);
> BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
> BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
> BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64);
> diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
> index dd24a94c42ff..ed10bfff91e1 100644
> --- a/drivers/nvme/host/nvme.h
> +++ b/drivers/nvme/host/nvme.h
> @@ -238,6 +238,9 @@ struct nvme_ctrl {
> u32 max_hw_sectors;
> u32 max_segments;
> u32 max_integrity_segments;
> +#ifdef CONFIG_BLK_DEV_ZONED
> + u32 max_zone_append;
> +#endif
> u16 crdt[3];
> u16 oncs;
> u16 oacs;
> @@ -404,6 +407,9 @@ struct nvme_ns {
> u16 sgs;
> u32 sws;
> u8 pi_type;
> +#ifdef CONFIG_BLK_DEV_ZONED
> + u64 zsze;
> +#endif
> unsigned long features;
> unsigned long flags;
> #define NVME_NS_REMOVING 0
> @@ -571,6 +577,9 @@ int nvme_delete_ctrl(struct nvme_ctrl *ctrl);
>
> int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi,
> void *log, size_t size, u64 offset);
> +struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
> + struct nvme_ns_head **head, int *srcu_idx);
> +void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx);
>
> extern const struct attribute_group *nvme_ns_id_attr_groups[];
> extern const struct block_device_operations nvme_ns_head_ops;
> @@ -692,6 +701,36 @@ static inline void nvme_mpath_start_freeze(struct nvme_subsystem *subsys)
> }
> #endif /* CONFIG_NVME_MULTIPATH */
>
> +#ifdef CONFIG_BLK_DEV_ZONED
> +int nvme_update_zone_info(struct gendisk *disk, struct nvme_ns *ns,
> + unsigned lbaf);
> +
> +int nvme_report_zones(struct gendisk *disk, sector_t sector,
> + unsigned int nr_zones, report_zones_cb cb, void *data);
> +
> +blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct request *req,
> + struct nvme_command *cmnd,
> + enum nvme_zone_mgmt_action action);
> +#else
> +#define nvme_report_zones NULL
> +
> +static inline blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns,
> + struct request *req, struct nvme_command *cmnd,
> + enum nvme_zone_mgmt_action action)
> +{
> + return BLK_STS_NOTSUPP;
> +}
> +
> +static inline int nvme_update_zone_info(struct gendisk *disk,
> + struct nvme_ns *ns,
> + unsigned lbaf)
> +{
> + dev_warn(ns->ctrl->device,
> + "Please enable CONFIG_BLK_DEV_ZONED to support ZNS devices\n");
> + return -EPROTONOSUPPORT;
> +}
> +#endif
> +
> #ifdef CONFIG_NVM
> int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node);
> void nvme_nvm_unregister(struct nvme_ns *ns);
> diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c
> new file mode 100644
> index 000000000000..c033c60ce751
> --- /dev/null
> +++ b/drivers/nvme/host/zns.c
> @@ -0,0 +1,253 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (C) 2020 Western Digital Corporation or its affiliates.
> + */
> +
> +#include <linux/blkdev.h>
> +#include <linux/vmalloc.h>
> +#include "nvme.h"
> +
> +static int nvme_set_max_append(struct nvme_ctrl *ctrl)
> +{
> + struct nvme_command c = { };
> + struct nvme_id_ctrl_zns *id;
> + int status;
> +
> + id = kzalloc(sizeof(*id), GFP_KERNEL);
> + if (!id)
> + return -ENOMEM;
> +
> + c.identify.opcode = nvme_admin_identify;
> + c.identify.cns = NVME_ID_CNS_CS_CTRL;
> + c.identify.csi = NVME_CSI_ZNS;
> +
> + status = nvme_submit_sync_cmd(ctrl->admin_q, &c, id, sizeof(*id));
> + if (status) {
> + kfree(id);
> + return status;
> + }
> +
> + if (id->zasl)
> + ctrl->max_zone_append = 1 << (id->zasl + 3);
> + else
> + ctrl->max_zone_append = ctrl->max_hw_sectors;
> + kfree(id);
> + return 0;
> +}
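The zasl conversion is terse, so spelling it out: ZASL is a power of
two in units of the minimum memory page size (the same convention as
MDTS), and the +3 converts 4 KiB pages to 512-byte sectors, assuming a
4 KiB minimum page:

    /*
     * e.g. zasl == 2:
     *   2^2 pages * 4096 B/page = 16 KiB max append payload
     *   16 KiB / 512 B/sector   = 32 sectors == 1 << (2 + 3)
     */
    ctrl->max_zone_append = 1 << (id->zasl + 3);

A zasl of 0 means the controller advertises no append-specific limit,
so the MDTS-derived max_hw_sectors cap is used instead.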
> +
> +int nvme_update_zone_info(struct gendisk *disk, struct nvme_ns *ns,
> + unsigned lbaf)
> +{
> + struct nvme_effects_log *log = ns->head->effects;
> + struct request_queue *q = disk->queue;
> + struct nvme_command c = { };
> + struct nvme_id_ns_zns *id;
> + int status;
> +
> + /* Driver requires zone append support */
> + if (!(log->iocs[nvme_cmd_zone_append] & NVME_CMD_EFFECTS_CSUPP)) {
> + dev_warn(ns->ctrl->device,
> + "append not supported for zoned namespace:%d\n",
> + ns->head->ns_id);
> + return -EINVAL;
> + }
> +
> + /* Lazily query controller append limit for the first zoned namespace */
> + if (!ns->ctrl->max_zone_append) {
> + status = nvme_set_max_append(ns->ctrl);
> + if (status)
> + return status;
> + }
> +
> + id = kzalloc(sizeof(*id), GFP_KERNEL);
> + if (!id)
> + return -ENOMEM;
> +
> + c.identify.opcode = nvme_admin_identify;
> + c.identify.nsid = cpu_to_le32(ns->head->ns_id);
> + c.identify.cns = NVME_ID_CNS_CS_NS;
> + c.identify.csi = NVME_CSI_ZNS;
> +
> + status = nvme_submit_sync_cmd(ns->ctrl->admin_q, &c, id, sizeof(*id));
> + if (status)
> + goto free_data;
> +
> + /*
> + * We currently do not handle devices requiring any of the zoned
> + * operation characteristics.
> + */
> + if (id->zoc) {
> + dev_warn(ns->ctrl->device,
> + "zone operations:%x not supported for namespace:%u\n",
> + le16_to_cpu(id->zoc), ns->head->ns_id);
> + status = -EINVAL;
> + goto free_data;
> + }
> +
> + ns->zsze = nvme_lba_to_sect(ns, le64_to_cpu(id->lbafe[lbaf].zsze));
> + if (!is_power_of_2(ns->zsze)) {
> + dev_warn(ns->ctrl->device,
> + "invalid zone size:%llu for namespace:%u\n",
> + ns->zsze, ns->head->ns_id);
> + status = -EINVAL;
> + goto free_data;
> + }
> +
> + q->limits.zoned = BLK_ZONED_HM;
> + blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q);
> +free_data:
> + kfree(id);
> + return status;
> +}
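For a sense of scale: zsze is reported in LBAs and nvme_lba_to_sect()
shifts it into 512-byte sectors via the namespace lba_shift. A worked
example, assuming 4 KiB LBAs (lba_shift == 12):

    /*
     * zsze == 65536 LBAs:
     *   65536 << (12 - 9) = 524288 sectors, i.e. 256 MiB zones
     */
    ns->zsze = nvme_lba_to_sect(ns, le64_to_cpu(id->lbafe[lbaf].zsze));

The is_power_of_2() requirement pays off below, where the report path
rounds sectors down to zone boundaries with plain mask arithmetic.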
> +
> +static void *nvme_zns_alloc_report_buffer(struct nvme_ns *ns,
> + unsigned int nr_zones, size_t *buflen)
> +{
> + struct request_queue *q = ns->disk->queue;
> + size_t bufsize;
> + void *buf;
> +
> + const size_t min_bufsize = sizeof(struct nvme_zone_report) +
> + sizeof(struct nvme_zone_descriptor);
> +
> + nr_zones = min_t(unsigned int, nr_zones,
> + get_capacity(ns->disk) >> ilog2(ns->zsze));
> +
> + bufsize = sizeof(struct nvme_zone_report) +
> + nr_zones * sizeof(struct nvme_zone_descriptor);
> + bufsize = min_t(size_t, bufsize,
> + queue_max_hw_sectors(q) << SECTOR_SHIFT);
> + bufsize = min_t(size_t, bufsize, queue_max_segments(q) << PAGE_SHIFT);
> +
> + while (bufsize >= min_bufsize) {
> + buf = __vmalloc(bufsize, GFP_KERNEL | __GFP_NORETRY);
> + if (buf) {
> + *buflen = bufsize;
> + return buf;
> + }
> + bufsize >>= 1;
> + }
> + return NULL;
> +}
> +
> +static int __nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
> + struct nvme_zone_report *report,
> + size_t buflen)
> +{
> + struct nvme_command c = { };
> + int ret;
> +
> + c.zmr.opcode = nvme_cmd_zone_mgmt_recv;
> + c.zmr.nsid = cpu_to_le32(ns->head->ns_id);
> + c.zmr.slba = cpu_to_le64(nvme_sect_to_lba(ns, sector));
> + c.zmr.numd = cpu_to_le32(nvme_bytes_to_numd(buflen));
> + c.zmr.zra = NVME_ZRA_ZONE_REPORT;
> + c.zmr.zrasf = NVME_ZRASF_ZONE_REPORT_ALL;
> + c.zmr.pr = NVME_REPORT_ZONE_PARTIAL;
> +
> + ret = nvme_submit_sync_cmd(ns->queue, &c, report, buflen);
> + if (ret)
> + return ret;
> +
> + return le64_to_cpu(report->nr_zones);
> +}
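Two spec details are doing quiet work here: NUMD is a 0's-based dword
count, and the partial-report bit makes the returned nr_zones count the
descriptors that actually fit in the buffer rather than all zones
matching the filter, which is what lets the caller consume the return
value directly. For reference, the byte-to-NUMD helper is, if I am
reading nvme.h right, simply:

    static inline u32 nvme_bytes_to_numd(size_t len)
    {
            return (len >> 2) - 1;      /* dwords, 0's based */
    }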
> +
> +static int nvme_zone_parse_entry(struct nvme_ns *ns,
> + struct nvme_zone_descriptor *entry,
> + unsigned int idx, report_zones_cb cb,
> + void *data)
> +{
> + struct blk_zone zone = { };
> +
> + if ((entry->zt & 0xf) != NVME_ZONE_TYPE_SEQWRITE_REQ) {
> + dev_err(ns->ctrl->device, "invalid zone type %#x\n",
> + entry->zt);
> + return -EINVAL;
> + }
> +
> + zone.type = BLK_ZONE_TYPE_SEQWRITE_REQ;
> + zone.cond = entry->zs >> 4;
> + zone.len = ns->zsze;
> + zone.capacity = nvme_lba_to_sect(ns, le64_to_cpu(entry->zcap));
> + zone.start = nvme_lba_to_sect(ns, le64_to_cpu(entry->zslba));
> + zone.wp = nvme_lba_to_sect(ns, le64_to_cpu(entry->wp));
> +
> + return cb(&zone, idx, data);
> +}
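The bare zs >> 4 works without a translation table because the ZNS
zone-state encodings in the upper nibble line up with the block
layer's BLK_ZONE_COND_* values:

    /*
     * NVMe ZS (bits 7:4)          BLK_ZONE_COND_*
     *   0x1 Empty             ==    EMPTY    (0x1)
     *   0x2 Implicitly Opened ==    IMP_OPEN (0x2)
     *   0x3 Explicitly Opened ==    EXP_OPEN (0x3)
     *   0x4 Closed            ==    CLOSED   (0x4)
     *   0xe Full              ==    FULL     (0xe)
     */
    zone.cond = entry->zs >> 4;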
> +
> +static int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
> + unsigned int nr_zones, report_zones_cb cb, void *data)
> +{
> + struct nvme_zone_report *report;
> + int ret, zone_idx = 0;
> + unsigned int nz, i;
> + size_t buflen;
> +
> + report = nvme_zns_alloc_report_buffer(ns, nr_zones, &buflen);
> + if (!report)
> + return -ENOMEM;
> +
> + sector &= ~(ns->zsze - 1);
> + while (zone_idx < nr_zones && sector < get_capacity(ns->disk)) {
> + memset(report, 0, buflen);
> + ret = __nvme_ns_report_zones(ns, sector, report, buflen);
> + if (ret < 0)
> + goto out_free;
> +
> + nz = min_t(unsigned int, ret, nr_zones);
> + if (!nz)
> + break;
> +
> + for (i = 0; i < nz && zone_idx < nr_zones; i++) {
> + ret = nvme_zone_parse_entry(ns, &report->entries[i],
> + zone_idx, cb, data);
> + if (ret)
> + goto out_free;
> + zone_idx++;
> + }
> +
> + sector += ns->zsze * nz;
> + }
> +
> + if (zone_idx > 0)
> + ret = zone_idx;
> + else
> + ret = -EINVAL;
> +out_free:
> + kvfree(report);
> + return ret;
> +}
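One more place the power-of-two zone size earns its keep: the
round-down to a zone boundary is a single mask. With 524288-sector
(256 MiB) zones, a report starting at sector 600000 begins at 524288:

    /* valid only because zsze is power-of-2, checked at revalidate */
    sector &= ~(ns->zsze - 1);      /* 600000 & ~524287 == 524288 */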
> +
> +int nvme_report_zones(struct gendisk *disk, sector_t sector,
> + unsigned int nr_zones, report_zones_cb cb, void *data)
> +{
> + struct nvme_ns_head *head = NULL;
> + struct nvme_ns *ns;
> + int srcu_idx, ret;
> +
> + ns = nvme_get_ns_from_disk(disk, &head, &srcu_idx);
> + if (unlikely(!ns))
> + return -EWOULDBLOCK;
> +
> + if (ns->head->ids.csi == NVME_CSI_ZNS)
> + ret = nvme_ns_report_zones(ns, sector, nr_zones, cb, data);
> + else
> + ret = -EINVAL;
> + nvme_put_ns_from_disk(head, srcu_idx);
> +
> + return ret;
> +}
> +
> +blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct request *req,
> + struct nvme_command *c, enum nvme_zone_mgmt_action action)
> +{
> + c->zms.opcode = nvme_cmd_zone_mgmt_send;
> + c->zms.nsid = cpu_to_le32(ns->head->ns_id);
> + c->zms.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
> + c->zms.zsa = action;
> +
> + if (req_op(req) == REQ_OP_ZONE_RESET_ALL)
> + c->zms.select_all = 1;
> +
> + return BLK_STS_OK;
> +}
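On the wire this is all cdw13: the ZSA qualifier occupies bits 7:0 and
the Select All flag is bit 8; because the struct carves cdw13 into
adjacent little-endian bytes, the two assignments land where TP 4053
expects them (my reading of the layout, at least):

    c->zms.zsa = action;        /* cdw13 bits 7:0 */
    c->zms.select_all = 1;      /* cdw13 bit 8    */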
> diff --git a/include/linux/nvme.h b/include/linux/nvme.h
> index 95cd03e240a1..1643005d21e3 100644
> --- a/include/linux/nvme.h
> +++ b/include/linux/nvme.h
> @@ -374,6 +374,30 @@ struct nvme_id_ns {
> __u8 vs[3712];
> };
>
> +struct nvme_zns_lbafe {
> + __le64 zsze;
> + __u8 zdes;
> + __u8 rsvd9[7];
> +};
> +
> +struct nvme_id_ns_zns {
> + __le16 zoc;
> + __le16 ozcs;
> + __le32 mar;
> + __le32 mor;
> + __le32 rrl;
> + __le32 frl;
> + __u8 rsvd20[2796];
> + struct nvme_zns_lbafe lbafe[16];
> + __u8 rsvd3072[768];
> + __u8 vs[256];
> +};
> +
> +struct nvme_id_ctrl_zns {
> + __u8 zasl;
> + __u8 rsvd1[4095];
> +};
> +
> enum {
> NVME_ID_CNS_NS = 0x00,
> NVME_ID_CNS_CTRL = 0x01,
> @@ -392,6 +416,7 @@ enum {
>
> enum {
> NVME_CSI_NVM = 0,
> + NVME_CSI_ZNS = 2,
> };
>
> enum {
> @@ -532,6 +557,27 @@ struct nvme_ana_rsp_hdr {
> __le16 rsvd10[3];
> };
>
> +struct nvme_zone_descriptor {
> + __u8 zt;
> + __u8 zs;
> + __u8 za;
> + __u8 rsvd3[5];
> + __le64 zcap;
> + __le64 zslba;
> + __le64 wp;
> + __u8 rsvd32[32];
> +};
> +
> +enum {
> + NVME_ZONE_TYPE_SEQWRITE_REQ = 0x2,
> +};
> +
> +struct nvme_zone_report {
> + __le64 nr_zones;
> + __u8 resv8[56];
> + struct nvme_zone_descriptor entries[];
> +};
> +
> enum {
> NVME_SMART_CRIT_SPARE = 1 << 0,
> NVME_SMART_CRIT_TEMPERATURE = 1 << 1,
> @@ -626,6 +672,9 @@ enum nvme_opcode {
> nvme_cmd_resv_report = 0x0e,
> nvme_cmd_resv_acquire = 0x11,
> nvme_cmd_resv_release = 0x15,
> + nvme_cmd_zone_mgmt_send = 0x79,
> + nvme_cmd_zone_mgmt_recv = 0x7a,
> + nvme_cmd_zone_append = 0x7d,
> };
>
> #define nvme_opcode_name(opcode) { opcode, #opcode }
> @@ -764,6 +813,7 @@ struct nvme_rw_command {
> enum {
> NVME_RW_LR = 1 << 15,
> NVME_RW_FUA = 1 << 14,
> + NVME_RW_APPEND_PIREMAP = 1 << 9,
> NVME_RW_DSM_FREQ_UNSPEC = 0,
> NVME_RW_DSM_FREQ_TYPICAL = 1,
> NVME_RW_DSM_FREQ_RARE = 2,
> @@ -829,6 +879,53 @@ struct nvme_write_zeroes_cmd {
> __le16 appmask;
> };
>
> +enum nvme_zone_mgmt_action {
> + NVME_ZONE_CLOSE = 0x1,
> + NVME_ZONE_FINISH = 0x2,
> + NVME_ZONE_OPEN = 0x3,
> + NVME_ZONE_RESET = 0x4,
> + NVME_ZONE_OFFLINE = 0x5,
> + NVME_ZONE_SET_DESC_EXT = 0x10,
> +};
> +
> +struct nvme_zone_mgmt_send_cmd {
> + __u8 opcode;
> + __u8 flags;
> + __u16 command_id;
> + __le32 nsid;
> + __le32 cdw2[2];
> + __le64 metadata;
> + union nvme_data_ptr dptr;
> + __le64 slba;
> + __le32 cdw12;
> + __u8 zsa;
> + __u8 select_all;
> + __u8 rsvd13[2];
> + __le32 cdw14[2];
> +};
> +
> +struct nvme_zone_mgmt_recv_cmd {
> + __u8 opcode;
> + __u8 flags;
> + __u16 command_id;
> + __le32 nsid;
> + __le64 rsvd2[2];
> + union nvme_data_ptr dptr;
> + __le64 slba;
> + __le32 numd;
> + __u8 zra;
> + __u8 zrasf;
> + __u8 pr;
> + __u8 rsvd13;
> + __le32 cdw14[2];
> +};
> +
> +enum {
> + NVME_ZRA_ZONE_REPORT = 0,
> + NVME_ZRASF_ZONE_REPORT_ALL = 0,
> + NVME_REPORT_ZONE_PARTIAL = 1,
> +};
> +
> /* Features */
>
> enum {
> @@ -1300,6 +1397,8 @@ struct nvme_command {
> struct nvme_format_cmd format;
> struct nvme_dsm_cmd dsm;
> struct nvme_write_zeroes_cmd write_zeroes;
> + struct nvme_zone_mgmt_send_cmd zms;
> + struct nvme_zone_mgmt_recv_cmd zmr;
> struct nvme_abort_cmd abort;
> struct nvme_get_log_page_command get_log_page;
> struct nvmf_common_command fabrics;
> @@ -1433,6 +1532,18 @@ enum {
> NVME_SC_DISCOVERY_RESTART = 0x190,
> NVME_SC_AUTH_REQUIRED = 0x191,
>
> + /*
> + * I/O Command Set Specific - Zoned commands:
> + */
> + NVME_SC_ZONE_BOUNDARY_ERROR = 0x1b8,
> + NVME_SC_ZONE_FULL = 0x1b9,
> + NVME_SC_ZONE_READ_ONLY = 0x1ba,
> + NVME_SC_ZONE_OFFLINE = 0x1bb,
> + NVME_SC_ZONE_INVALID_WRITE = 0x1bc,
> + NVME_SC_ZONE_TOO_MANY_ACTIVE = 0x1bd,
> + NVME_SC_ZONE_TOO_MANY_OPEN = 0x1be,
> + NVME_SC_ZONE_INVALID_TRANSITION = 0x1bf,
> +
> /*
> * Media and Data Integrity Errors:
> */
> --
> 2.24.1
>
Reviewed-by: Himanshu Madhani <himanshu.madhani at oracle.com>
--
Himanshu Madhani Oracle Linux Engineering