[PATCH V9 4/9] nvmet: add ZBD over ZNS backend support
Damien Le Moal
Damien.LeMoal at wdc.com
Mon Jan 18 23:28:00 EST 2021
On Mon, 2021-01-18 at 19:25 +0100, Christoph Hellwig wrote:
> On Tue, Jan 12, 2021 at 07:52:27AM +0000, Damien Le Moal wrote:
> > >
> > > I do not understand the logic here, given that NVMe does not have
> > > conventional zones.
> >
> > 512e SAS & SATA SMR drives (512B logical, 4K physical) are a big thing, and
> > for these, all writes in sequential zones must be 4K aligned. So I suggested
> > that Chaitanya simply use the physical block size as the LBA size for the
> > target to avoid weird IO errors that would not make sense in the ZNS/NVMe
> > world (e.g. 512B aligned write requests failing).
>
> But in NVMe the physical block size exposes the atomic write unit, which
> could be way too large. If we want to do this cleanly, we need to expose
> a minimum sequential zone write alignment value in the block layer.
What about adding something like the patch below to Chaitanya's series?

This adds the queue limit zone_write_granularity, which is set to the physical
block size for SCSI, and for NVMe/ZNS too since there the physical block size
is limited to the atomic write unit. Lightly tested with both 512e and 4kn SMR
drives.
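
To illustrate how the new limit is meant to be consumed (this is only a sketch,
not part of the patch below; the helper name and its use are hypothetical), an
in-kernel user such as a target or stacking driver could reject misaligned
writes to sequential zones along these lines:

#include <linux/blkdev.h>

/*
 * Hypothetical helper, not part of the patch: check that a write to a
 * sequential zone of a zoned device starts and ends on a
 * zone_write_granularity boundary. Block sizes are powers of two, so a
 * simple mask test is enough.
 */
static inline bool zone_write_is_aligned(struct request_queue *q,
                                         sector_t sector, unsigned int nr_bytes)
{
        unsigned int gran = q->limits.zone_write_granularity;

        return !(((sector << SECTOR_SHIFT) | nr_bytes) & (gran - 1));
}

With the patch applied, gran would be 4096 on a 512e host-managed SMR disk and
512 on a 512n disk, so a 512B aligned but 4K misaligned write would be caught
before ever reaching the device.
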
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 43990b1d148b..d6c2677a38df 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -60,6 +60,7 @@ void blk_set_default_limits(struct queue_limits *lim)
        lim->io_opt = 0;
        lim->misaligned = 0;
        lim->zoned = BLK_ZONED_NONE;
+       lim->zone_write_granularity = 0;
 }
 EXPORT_SYMBOL(blk_set_default_limits);
@@ -341,6 +342,14 @@ void blk_queue_logical_block_size(struct request_queue *q, unsigned int size)
        limits->max_hw_sectors =
                round_down(limits->max_hw_sectors, size >> SECTOR_SHIFT);
        limits->max_sectors =
                round_down(limits->max_sectors, size >> SECTOR_SHIFT);
+
+       if (blk_queue_is_zoned(q)) {
+               if (limits->zone_write_granularity < limits->logical_block_size)
+                       limits->zone_write_granularity =
+                               limits->logical_block_size;
+               if (q->limits.zone_write_granularity < q->limits.io_min)
+                       q->limits.zone_write_granularity = q->limits.io_min;
+       }
 }
 EXPORT_SYMBOL(blk_queue_logical_block_size);
@@ -361,11 +370,39 @@ void blk_queue_physical_block_size(struct request_queue *q, unsigned int size)
        if (q->limits.physical_block_size < q->limits.logical_block_size)
                q->limits.physical_block_size = q->limits.logical_block_size;

-       if (q->limits.io_min < q->limits.physical_block_size)
+       if (q->limits.io_min < q->limits.physical_block_size) {
                q->limits.io_min = q->limits.physical_block_size;
+               if (blk_queue_is_zoned(q)
+                   && q->limits.zone_write_granularity < q->limits.io_min)
+                       q->limits.zone_write_granularity = q->limits.io_min;
+       }
 }
 EXPORT_SYMBOL(blk_queue_physical_block_size);

+/**
+ * blk_queue_zone_write_granularity - set zone write granularity for the queue
+ * @q: the request queue for the zoned device
+ * @size: the zone write granularity size, in bytes
+ *
+ * Description:
+ * This should be set to the lowest possible size allowing to write in
+ * sequential zones of a zoned block device.
+ */
+void blk_queue_zone_write_granularity(struct request_queue *q, unsigned int size)
+{
+       if (WARN_ON(!blk_queue_is_zoned(q)))
+               return;
+
+       q->limits.zone_write_granularity = size;
+
+       if (q->limits.zone_write_granularity < q->limits.logical_block_size)
+               q->limits.zone_write_granularity = q->limits.logical_block_size;
+
+       if (q->limits.zone_write_granularity < q->limits.io_min)
+               q->limits.zone_write_granularity = q->limits.io_min;
+}
+EXPORT_SYMBOL(blk_queue_zone_write_granularity);
+
 /**
  * blk_queue_alignment_offset - set physical block alignment offset
  * @q: the request queue for the device
@@ -631,6 +668,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
                        t->discard_granularity;
        }

+       t->zone_write_granularity = max(t->zone_write_granularity,
+                                       b->zone_write_granularity);
        t->zoned = max(t->zoned, b->zoned);
        return ret;
 }
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index b513f1683af0..7ea3dd4d876b 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -219,6 +219,11 @@ static ssize_t queue_write_zeroes_max_show(struct request_queue *q, char *page)
                (unsigned long long)q->limits.max_write_zeroes_sectors << 9);
 }

+static ssize_t queue_zone_write_granularity_show(struct request_queue *q, char *page)
+{
+       return queue_var_show(q->limits.zone_write_granularity, page);
+}
+
 static ssize_t queue_zone_append_max_show(struct request_queue *q, char *page)
 {
        unsigned long long max_sectors = q->limits.max_zone_append_sectors;
@@ -585,6 +590,7 @@ QUEUE_RO_ENTRY(queue_discard_zeroes_data, "discard_zeroes_data");
 QUEUE_RO_ENTRY(queue_write_same_max, "write_same_max_bytes");
 QUEUE_RO_ENTRY(queue_write_zeroes_max, "write_zeroes_max_bytes");
 QUEUE_RO_ENTRY(queue_zone_append_max, "zone_append_max_bytes");
+QUEUE_RO_ENTRY(queue_zone_write_granularity, "zone_write_granularity");

 QUEUE_RO_ENTRY(queue_zoned, "zoned");
 QUEUE_RO_ENTRY(queue_nr_zones, "nr_zones");
@@ -639,6 +645,7 @@ static struct attribute *queue_attrs[] = {
        &queue_write_same_max_entry.attr,
        &queue_write_zeroes_max_entry.attr,
        &queue_zone_append_max_entry.attr,
+       &queue_zone_write_granularity_entry.attr,
        &queue_nonrot_entry.attr,
        &queue_zoned_entry.attr,
        &queue_nr_zones_entry.attr,
diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c
index 1dfe9a3500e3..def76ac88248 100644
--- a/drivers/nvme/host/zns.c
+++ b/drivers/nvme/host/zns.c
@@ -113,6 +113,13 @@ int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf)
        blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q);
        blk_queue_max_open_zones(q, le32_to_cpu(id->mor) + 1);
        blk_queue_max_active_zones(q, le32_to_cpu(id->mar) + 1);
+
+       /*
+        * The physical block size is limited to the Atomic Write Unit Power
+        * Fail parameter. Use this value as the zone write granularity as it
+        * may be different from the logical block size.
+        */
+       blk_queue_zone_write_granularity(q, q->limits.physical_block_size);
 free_data:
        kfree(id);
        return status;
diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index cf07b7f93579..41d602f7e62e 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -789,6 +789,16 @@ int sd_zbc_read_zones(struct scsi_disk *sdkp, unsigned char *buf)
        blk_queue_max_active_zones(q, 0);
        nr_zones = round_up(sdkp->capacity, zone_blocks) >> ilog2(zone_blocks);

+       /*
+        * Per ZBC and ZAC specifications, writes in sequential write required
+        * zones of host-managed devices must be aligned to the device physical
+        * block size.
+        */
+       if (blk_queue_zoned_model(q) == BLK_ZONED_HM)
+               blk_queue_zone_write_granularity(q, sdkp->physical_block_size);
+       else
+               blk_queue_zone_write_granularity(q, sdkp->device->sector_size);
+
        /* READ16/WRITE16 is mandatory for ZBC disks */
        sdkp->device->use_16_for_rw = 1;
        sdkp->device->use_10_for_rw = 0;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index f94ee3089e01..4b4df2644882 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -337,6 +337,7 @@ struct queue_limits {
        unsigned int            max_zone_append_sectors;
        unsigned int            discard_granularity;
        unsigned int            discard_alignment;
+       unsigned int            zone_write_granularity;

        unsigned short          max_segments;
        unsigned short          max_integrity_segments;
@@ -1161,6 +1162,7 @@ extern void blk_queue_logical_block_size(struct request_queue *, unsigned int);
 extern void blk_queue_max_zone_append_sectors(struct request_queue *q,
                unsigned int max_zone_append_sectors);
 extern void blk_queue_physical_block_size(struct request_queue *, unsigned int);
+void blk_queue_zone_write_granularity(struct request_queue *, unsigned int);
 extern void blk_queue_alignment_offset(struct request_queue *q,
                unsigned int alignment);
 void blk_queue_update_readahead(struct request_queue *q);
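
With such a limit in place, the target side would not need to hard-code the
physical block size. As a rough sketch only (the function name below is made
up for illustration and is not code from Chaitanya's series), the ZNS backend
could derive the LBA size it exposes from the backend device limits:

#include <linux/blkdev.h>

/*
 * Illustrative only: export an LBA size that is at least the zone write
 * granularity of the backing zoned block device, so that any host write
 * is always correctly aligned for the backend.
 */
static unsigned int nvmet_zbd_lba_size(struct block_device *bdev)
{
        struct request_queue *q = bdev_get_queue(bdev);
        unsigned int lba_size = queue_logical_block_size(q);

        if (lba_size < q->limits.zone_write_granularity)
                lba_size = q->limits.zone_write_granularity;

        return lba_size;
}
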
--
Damien Le Moal
Western Digital