[PATCH V2 2/9] nvmet: add ZNS support for bdev-ns

Tue Dec 1 17:37:27 EST 2020

On 11/30/20 21:44, Damien Le Moal wrote:
> On 2020/12/01 12:38, Chaitanya Kulkarni wrote:
> [...]
>>>> +	 * namespaces.
>>>> +	 */
>>>> +	min_zasl = ns->zasl = nvmet_zasl(queue_max_zone_append_sectors(q));
>>>> +
>>>> +	xa_for_each(&(ns->subsys->namespaces), idx, ins) {
>>>> +		struct request_queue *iq = ins->bdev->bd_disk->queue;
>>>> +		unsigned int imax_za_sects = queue_max_zone_append_sectors(iq);
>>>> +		u8 izasl = nvmet_zasl(imax_za_sects);
>>>> +
>>>> +		if (!bdev_is_zoned(ins->bdev))
>>>> +			continue;
>>>> +
>>>> +		min_zasl = min_zasl > izasl ? izasl : min_zasl;
>>>> +	}
>>>> +
>>>> +	ns->subsys->id_ctrl_zns.zasl = min_t(u8, min_zasl, bio_max_zasl);
>>> I do not understand what bio_max_zasl is used for here, as it does not represent
>>> anything related to the zoned namespaces backends.
>> If we don't consider the bio max zasl then we will get the request more than
>> one bio can handle for zone append and will lead to bio split whichneeds to
>> be avoided.
> Why ? zasl comes from the target backend max zone append sectors, which we
> already know is OK and does not lead to BIO splitting for zone append. So why do
> you need an extra verification against that bio_max_zasl ?
>
>>>> +}> +
>>>> +bool nvmet_bdev_zns_enable(struct nvmet_ns *ns)
>>>> +{
>>>> +	if (ns->bdev->bd_disk->queue->conv_zones_bitmap) {
>>>> +		pr_err("block devices with conventional zones are not supported.");
>>>> +		return false;
>>>> +	}
>>> It may be better to move this test in nvmet_bdev_validate_zns_zones(), so that
>>> all backend zone configuration checks are together.
>> Okay.
>>>> +
>>>> +	if (!nvmet_bdev_validate_zns_zones(ns))
>>>> +		return false;
>>>> +
>>>> +	/*
>>>> +	 * For ZBC and ZAC devices, writes into sequential zones must be aligned
>>>> +	 * to the device physical block size. So use this value as the logical
>>>> +	 * block size to avoid errors.
>>>> +	 */
>>>> +	ns->blksize_shift = blksize_bits(bdev_physical_block_size(ns->bdev));
>>>> +
>>>> +	nvmet_zns_update_zasl(ns);
>>>> +
>>>> +	return true;
>>>> +}
>>>> +
>>>> +/*
>>>> + * ZNS related Admin and I/O command handlers.
>>>> + */
>>>> +void nvmet_execute_identify_cns_cs_ctrl(struct nvmet_req *req)
>>>> +{
>>>> +	struct nvmet_ctrl *ctrl = req->sq->ctrl;
>>>> +	struct nvme_id_ctrl_zns *id;
>>>> +	u16 status = 0;
>>>> +	u8 mdts;
>>>> +
>>>> +	id = kzalloc(sizeof(*id), GFP_KERNEL);
>>>> +	if (!id) {
>>>> +		status = NVME_SC_INTERNAL;
>>>> +		goto out;
>>>> +	}
>>>> +
>>>> +	/*
>>>> +	 * Even though this function sets Zone Append Size Limit to 0,
>>>> +	 * the 0 value here indicates that the maximum data transfer size for
>>>> +	 * the Zone Append command is indicated by the ctrl
>>>> +	 * Maximum Data Transfer Size (MDTS).
>>>> +	 */
>>> I do not understand this comment. It does not really exactly match what the code
>>> below is doing.
>> Let me fix the code and the comment in next version.
>>>> +
>>>> +	mdts = ctrl->ops->get_mdts ? ctrl->ops->get_mdts(ctrl) : 0;
>>>> +
>>>> +	id->zasl = min_t(u8, mdts, req->sq->ctrl->subsys->id_ctrl_zns.zasl);
>>>> +
>>>> +	status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id));
>>>> +
>>>> +	kfree(id);
>>>> +out:
>>>> +	nvmet_req_complete(req, status);
>>>> +}
>>>> +
>>>> +void nvmet_execute_identify_cns_cs_ns(struct nvmet_req *req)
>>>> +{
>>>> +	struct nvme_id_ns_zns *id_zns;
>>>> +	u16 status = 0;
>>>> +	u64 zsze;
>>>> +
>>>> +	if (le32_to_cpu(req->cmd->identify.nsid) == NVME_NSID_ALL) {
>>>> +		req->error_loc = offsetof(struct nvme_identify, nsid);
>>>> +		status = NVME_SC_INVALID_NS | NVME_SC_DNR;
>>>> +		goto out;
>>>> +	}
>>>> +
>>>> +	id_zns = kzalloc(sizeof(*id_zns), GFP_KERNEL);
>>>> +	if (!id_zns) {
>>>> +		status = NVME_SC_INTERNAL;
>>>> +		goto out;
>>>> +	}
>>>> +
>>>> +	req->ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->identify.nsid);
>>>> +	if (!req->ns) {
>>>> +		status = NVME_SC_INTERNAL;
>>>> +		goto done;
>>>> +	}
>>>> +
>>>> +	if (!bdev_is_zoned(nvmet_bdev(req))) {
>>>> +		req->error_loc = offsetof(struct nvme_identify, nsid);
>>>> +		status = NVME_SC_INVALID_NS | NVME_SC_DNR;
>>>> +		goto done;
>>>> +	}
>>>> +
>>>> +	nvmet_ns_revalidate(req->ns);
>>>> +	zsze = (bdev_zone_sectors(nvmet_bdev(req)) << 9) >>
>>>> +					req->ns->blksize_shift;
>>>> +	id_zns->lbafe[0].zsze = cpu_to_le64(zsze);
>>>> +	id_zns->mor = cpu_to_le32(bdev_max_open_zones(nvmet_bdev(req)));
>>>> +	id_zns->mar = cpu_to_le32(bdev_max_active_zones(nvmet_bdev(req)));
>>>> +
>>>> +done:
>>>> +	status = nvmet_copy_to_sgl(req, 0, id_zns, sizeof(*id_zns));
>>>> +	kfree(id_zns);
>>>> +out:
>>>> +	nvmet_req_complete(req, status);
>>>> +}
>>>> +
>>>> +struct nvmet_report_zone_data {
>>>> +	struct nvmet_ns *ns;
>>>> +	struct nvme_zone_report *rz;
>>>> +};
>>>> +
>>>> +static int nvmet_bdev_report_zone_cb(struct blk_zone *z, unsigned int idx,
>>>> +				     void *data)
>>>> +{
>>>> +	struct nvmet_report_zone_data *report_zone_data = data;
>>>> +	struct nvme_zone_descriptor *entries = report_zone_data->rz->entries;
>>>> +	struct nvmet_ns *ns = report_zone_data->ns;
>>>> +
>>>> +	entries[idx].zcap = cpu_to_le64(nvmet_sect_to_lba(ns, z->capacity));
>>>> +	entries[idx].zslba = cpu_to_le64(nvmet_sect_to_lba(ns, z->start));
>>>> +	entries[idx].wp = cpu_to_le64(nvmet_sect_to_lba(ns, z->wp));
>>>> +	entries[idx].za = z->reset ? 1 << 2 : 0;
>>>> +	entries[idx].zt = z->type;
>>>> +	entries[idx].zs = z->cond << 4;
>>>> +
>>>> +	return 0;
>>>> +}
>>>> +
>>>> +void nvmet_bdev_execute_zone_mgmt_recv(struct nvmet_req *req)
>>>> +{
>>>> +	u32 bufsize = (le32_to_cpu(req->cmd->zmr.numd) + 1) << 2;
>>> size_t ?
>> Yes.
>>>> +	struct nvmet_report_zone_data data = { .ns = req->ns };
>>>> +	struct nvme_zone_mgmt_recv_cmd *zmr = &req->cmd->zmr;
>>>> +	sector_t sect = nvmet_lba_to_sect(req->ns, le64_to_cpu(zmr->slba));
>>>> +	unsigned int nr_zones = bufsize / nvmet_zones_to_desc_size(1);
>>> I think this is wrong since nvmet_zones_to_desc_size includes sizeof(struct
>>> nvme_zone_report). I think what you want here is:
>>>
>>> nr_zones = (bufsize - sizeof(struct nvme_zone_report)) /
>>> 	sizeof(struct nvme_zone_descriptor);
>>>
>>> And then you can get rid of nvmet_zones_to_desc_size();
>> Yes, it doesn't need nvmet_zones_to_desc_size() anymore from v1.
>>>> +	int reported_zones;Maybe there is better way.
>>>> +	u16 status;
>>>> +
>>>> +	status = nvmet_bdev_zns_checks(req);
>>>> +	if (status)
>>>> +		goto out;
>>>> +
>>>> +	data.rz = __vmalloc(bufsize, GFP_KERNEL | __GFP_NORETRY);
>>>> +	if (!data.rz) {
>>>> +		status = NVME_SC_INTERNAL;
>>>> +		goto out;
>>>> +	}
>>>> +
>>>> +	reported_zones = blkdev_report_zones(nvmet_bdev(req), sect, nr_zones,
>>>> +					     nvmet_bdev_report_zone_cb,
>>>> +					     &data);
>>>> +	if (reported_zones < 0) {
>>>> +		status = NVME_SC_INTERNAL;
>>>> +		goto out_free_report_zones;
>>>> +	}
>>>> +
>>>> +	data.rz->nr_zones = cpu_to_le64(reported_zones);
>>>> +
>>>> +	status = nvmet_copy_to_sgl(req, 0, data.rz, bufsize);
>>>> +
>>>> +out_free_report_zones:
>>>> +	kvfree(data.rz);
>>>> +out:
>>>> +	nvmet_req_complete(req, status);
>>>> +}
>>>> +
>>>> +void nvmet_bdev_execute_zone_mgmt_send(struct nvmet_req *req)
>>>> +{
>>>> +	sector_t nr_sect = bdev_zone_sectors(nvmet_bdev(req));
>>>> +	struct nvme_zone_mgmt_send_cmd *c = &req->cmd->zms;
>>>> +	enum req_opf op = REQ_OP_LAST;
>>>> +	u16 status = NVME_SC_SUCCESS;
>>>> +	sector_t sect;
>>>> +	int ret;
>>>> +
>>>> +	sect = nvmet_lba_to_sect(req->ns, le64_to_cpu(req->cmd->zms.slba));
>>>> +
>>>> +	switch (c->zsa) {
>>>> +	case NVME_ZONE_OPEN:
>>>> +		op = REQ_OP_ZONE_OPEN;
>>>> +		break;
>>>> +	case NVME_ZONE_CLOSE:
>>>> +		op = REQ_OP_ZONE_CLOSE;
>>>> +		break;
>>>> +	case NVME_ZONE_FINISH:
>>>> +		op = REQ_OP_ZONE_FINISH;
>>>> +		break;
>>>> +	case NVME_ZONE_RESET:
>>>> +		if (c->select_all)
>>>> +			nr_sect = get_capacity(nvmet_bdev(req)->bd_disk);
>>>> +		op = REQ_OP_ZONE_RESET;
>>>> +		break;
>>>> +	default:
>>>> +		status = NVME_SC_INVALID_FIELD;
>>>> +		goto out;
>>>> +	}
>>>> +
>>>> +	ret = blkdev_zone_mgmt(nvmet_bdev(req), op, sect, nr_sect, GFP_KERNEL);
>>>> +	if (ret)
>>>> +		status = NVME_SC_INTERNAL;
>>>> +
>>>> +out:
>>>> +	nvmet_req_complete(req, status);
>>>> +}
>>>> +
>>>> +void nvmet_bdev_execute_zone_append(struct nvmet_req *req)
>>>> +{
>>>> +	unsigned long bv_cnt = req->sg_cnt;
>>>> +	int op = REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE;
>>>> +	u64 slba = le64_to_cpu(req->cmd->rw.slba);
>>>> +	sector_t sect = nvmet_lba_to_sect(req->ns, slba);
>>>> +	u16 status = NVME_SC_SUCCESS;
>>>> +	size_t mapped_data_len = 0;
>>>> +	int sg_cnt = req->sg_cnt;
>>>> +	struct scatterlist *sg;
>>>> +	struct iov_iter from;
>>>> +	struct bio_vec *bvec;
>>>> +	size_t mapped_cnt;
>>>> +	struct bio *bio;
>>>> +	int ret;
>>>> +
>>>> +	if (!nvmet_check_transfer_len(req, nvmet_rw_data_len(req)))
>>>> +		return;
>>>> +
>>>> +	/*
>>>> +	 * When setting the ctrl->zasl we consider the BIO_MAX_PAGES so that we
>>>> +	 * don't have to split the bio, i.e. we shouldn't get
>>>> +	 * sg_cnt > BIO_MAX_PAGES since zasl on the host will limit the I/Os
>>>> +	 * with the size that considers the BIO_MAX_PAGES.
>>>> +	 */
>>> What ? Unclear... You advertize max zone append as ctrl->zasl = min(all ns max
>>> zone append sectors). What does BIO_MAX_PAGES has to do with anything ?
>> That is not true, ctrl->zasl is advertised
>> min(min all ns max zone append sectors), bio_max_zasl) to avoid the
>> splitting
>> of the bio on the target side:-
> We already know that zasl for each NS, given by the max zone append sector of
> the backends, are already OK, regardless of BIO_MAX_PAGES (see below).
>
>> See nvmet_zns_update_zasl()
>>
>> +	ns->subsys->id_ctrl_zns.zasl = min_t(u8, min_zasl, bio_max_zasl);
>>
>> Without considering the bio_max_pages we may have to set the ctrl->zasl
>> value
>> thatis > bio_max_pages_zasl, then we'll get a request that is greater
>> than what
>> one bio can handle, that will lead to splitting the bios, which we want to
>> avoid as per the comment in the V1.
>>
>> Can you please explain what is wrong with this approach which regulates the
>> zasl with all the possible namespaces zasl and bio_zasl?
> For starter, with multi-page bvecs now, I am not sure BIO_MAX_PAGES makes any
> sense anymore. Next, on the target side, the max zone append sectors limit for
> each NS guarantee that a single BIO can be used without splitting needed. Taking
> the min  of all of them will NOT remove that guarantee. Hence I think that
> BIO_MAX_PAGES has nothing to do in the middle of all this.
>
Okay, let me remove the bio size check and send v4.
>> May be there is better way?
>>>> +	if (!req->sg_cnt)
>>>> +		goto out;
>>>> +
>>>> +	if (WARN_ON(req->sg_cnt > BIO_MAX_PAGES)) {
>>>> +		status = NVME_SC_INTERNAL | NVME_SC_DNR;
>>>> +		goto out;
>>>> +	}
>>>> +
>>>> +	bvec = kmalloc_array(bv_cnt, sizeof(*bvec), GFP_KERNEL);
>>>> +	if (!bvec) {
>>>> +		status = NVME_SC_INTERNAL | NVME_SC_DNR;
>>>> +		goto out;
>>>> +	}
>>>> +
>>>> +	for_each_sg(req->sg, sg, req->sg_cnt, mapped_cnt) {
>>>> +		nvmet_file_init_bvec(bvec, sg);
>>>> +		mapped_data_len += bvec[mapped_cnt].bv_len;
>>>> +		sg_cnt--;
>>>> +		if (mapped_cnt == bv_cnt)
>>>> +			brhigh frequency I/O operationeak;
>>>> +	}
>>>> +
>>>> +	if (WARN_ON(sg_cnt)) {
>>>> +		status = NVME_SC_INTERNAL | NVME_SC_DNR;
>>>> +		goto out;
>>>> +	}
>>>> +
>>>> +	iov_iter_bvec(&from, WRITE, bvec, mapped_cnt, mapped_data_len);
>>>> +
>>>> +	bio = bio_alloc(GFP_KERNEL, bv_cnt);
>>>> +	bio_set_dev(bio, nvmet_bdev(req));
>>>> +	bio->bi_iter.bi_sector = sect;
>>>> +	bio->bi_opf = op;
>>> The op variable is used only here. It is not necessary.
>> Yes.
>>>> +
>>>> +	ret =  __bio_iov_append_get_pages(bio, &from);
>>> I still do not see why bio_iov_iter_get_pages() does not work here. Would you
>>> care to explain please ?
>>>
>> The same reason pointed out by the Johannes. Instead of calling wrapper,
>> call the underlaying core API, since we don't care about the rest of the
>> generic code. This also avoids an extra function callfor zone-append
>> fast path.
> I am still not convinced that __bio_iov_append_get_pages() will do the right
> thing as it lacks the loop that bio_iov_iter_get_pages() has.
>
>>>> +	if (unlikely(ret)) {
>>>> +		status = NVME_SC_INTERNAL | NVME_SC_DNR;
>>>> +		bio_io_error(bio);
>>>> +		goto bvec_free;
>>>> +	}
>>>> +
>>>> +	ret = submit_bio_wait(bio);
>>>> +	status = ret < 0 ? NVME_SC_INTERNAL : status;
>>>> +	bio_put(bio);
>>>> +
>>>> +	sect += (mapped_data_len >> 9);
>>>> +	req->cqe->result.u64 = le64_to_cpu(nvmet_sect_to_lba(req->ns, sect));
>>>> +
>>>> +bvec_free:
>>>> +	kfree(bvec);
>>>> +
>>>> +out:
>>>> +	nvmet_req_complete(req, status);
>>>> +}
>>>> +
>>>> +#else  /* CONFIG_BLK_DEV_ZONED */
>>>> +void nvmet_execute_identify_cns_cs_ctrl(struct nvmet_req *req)
>>>> +{
>>>> +}
>>>> +void nvmet_execute_identify_cns_cs_ns(struct nvmet_req *req)
>>>> +{
>>>> +}
>>>> +u16 nvmet_process_zns_cis(struct nvmet_req *req, off_t *off)
>>>> +{
>>>> +	return 0;
>>>> +}
>>>> +bool nvmet_bdev_zns_config(struct nvmet_ns *ns)
>>>> +{
>>>> +	return false;
>>>> +}
>>>> +void nvmet_bdev_execute_zone_mgmt_recv(struct nvmet_req *req)
>>>> +{
>>>> +}
>>>> +void nvmet_bdev_execute_zone_mgmt_send(struct nvmet_req *req)
>>>> +{
>>>> +}
>>>> +void nvmet_bdev_execute_zone_append(struct nvmet_req *req)
>>>> +{
>>>> +}
>>>> +void nvmet_zns_add_cmd_effects(struct nvme_effects_log *log)
>>>> +{
>>>> +}
>>>> +#endif /* CONFIG_BLK_DEV_ZONED */
>>>>
>>
>