[PATCH 6/6] null_blk: Add support for power_of_2 emulation to the null blk device
Damien Le Moal
damien.lemoal at opensource.wdc.com
Tue Mar 8 20:09:21 PST 2022
On 3/9/22 01:53, Pankaj Raghav wrote:
> power_of_2(PO2) emulation support is added to the null blk device to
> measure performance.
>
> Callbacks are added in the hotpaths that require PO2 emulation specific
> behaviour to reduce the impact on exisiting path.
>
> The power_of_2 emulation support is wired up for both the request and
> the bio queue mode and it is automatically enabled when the given zone
> size is non-power_of_2.
This does not make any sense. Why would you want to add power of 2 zone
size emulation to nullblk ? Just set the zone size to be a power of 2...
If this is for test purpose, then use QEMU. These changes make no sense
to me here.
A change that would make sense in the context of this series is to allow
for setting a zoned null_blk device zone size to a non power of 2 size.
But this series does not actually deal with that. So do not touch this
driver please.
>
> Signed-off-by: Pankaj Raghav <p.raghav at samsung.com>
> ---
> drivers/block/null_blk/main.c | 8 +-
> drivers/block/null_blk/null_blk.h | 12 ++
> drivers/block/null_blk/zoned.c | 203 ++++++++++++++++++++++++++----
> 3 files changed, 196 insertions(+), 27 deletions(-)
>
> diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c
> index 625a06bfa5ad..c926b59f2b17 100644
> --- a/drivers/block/null_blk/main.c
> +++ b/drivers/block/null_blk/main.c
> @@ -210,7 +210,7 @@ MODULE_PARM_DESC(zoned, "Make device as a host-managed zoned block device. Defau
>
> static unsigned long g_zone_size = 256;
> module_param_named(zone_size, g_zone_size, ulong, S_IRUGO);
> -MODULE_PARM_DESC(zone_size, "Zone size in MB when block device is zoned. Must be power-of-two: Default: 256");
> +MODULE_PARM_DESC(zone_size, "Zone size in MB when block device is zoned. Default: 256");
>
> static unsigned long g_zone_capacity;
> module_param_named(zone_capacity, g_zone_capacity, ulong, 0444);
> @@ -1772,11 +1772,13 @@ static const struct block_device_operations null_bio_ops = {
> .owner = THIS_MODULE,
> .submit_bio = null_submit_bio,
> .report_zones = null_report_zones,
> + .npo2_zone_setup = null_po2_zone_emu_setup,
> };
>
> static const struct block_device_operations null_rq_ops = {
> .owner = THIS_MODULE,
> .report_zones = null_report_zones,
> + .npo2_zone_setup = null_po2_zone_emu_setup,
> };
>
> static int setup_commands(struct nullb_queue *nq)
> @@ -1929,8 +1931,8 @@ static int null_validate_conf(struct nullb_device *dev)
> dev->mbps = 0;
>
> if (dev->zoned &&
> - (!dev->zone_size || !is_power_of_2(dev->zone_size))) {
> - pr_err("zone_size must be power-of-two\n");
> + (!dev->zone_size)) {
> + pr_err("zone_size must be zero\n");
> return -EINVAL;
> }
>
> diff --git a/drivers/block/null_blk/null_blk.h b/drivers/block/null_blk/null_blk.h
> index 78eb56b0ca55..34c1b7b2546b 100644
> --- a/drivers/block/null_blk/null_blk.h
> +++ b/drivers/block/null_blk/null_blk.h
> @@ -74,6 +74,16 @@ struct nullb_device {
> unsigned int imp_close_zone_no;
> struct nullb_zone *zones;
> sector_t zone_size_sects;
> + sector_t zone_size_po2_sects;
> + sector_t zone_size_diff_sects;
> + /* The callbacks below are used as hook to perform po2 emulation for a
> + * zoned device.
> + */
> + unsigned int (*zone_no)(struct nullb_device *dev,
> + sector_t sect);
> + sector_t (*zone_update_sector)(struct nullb_device *dev, sector_t sect);
> + sector_t (*zone_update_sector_append)(struct nullb_device *dev,
> + sector_t sect);
> bool need_zone_res_mgmt;
> spinlock_t zone_res_lock;
>
> @@ -137,6 +147,7 @@ int null_register_zoned_dev(struct nullb *nullb);
> void null_free_zoned_dev(struct nullb_device *dev);
> int null_report_zones(struct gendisk *disk, sector_t sector,
> unsigned int nr_zones, report_zones_cb cb, void *data);
> +void null_po2_zone_emu_setup(struct gendisk *disk);
> blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd,
> enum req_opf op, sector_t sector,
> sector_t nr_sectors);
> @@ -166,5 +177,6 @@ static inline size_t null_zone_valid_read_len(struct nullb *nullb,
> return len;
> }
> #define null_report_zones NULL
> +#define null_po2_zone_emu_setup NULL
> #endif /* CONFIG_BLK_DEV_ZONED */
> #endif /* __NULL_BLK_H */
> diff --git a/drivers/block/null_blk/zoned.c b/drivers/block/null_blk/zoned.c
> index dae54dd1aeac..3bb63c170149 100644
> --- a/drivers/block/null_blk/zoned.c
> +++ b/drivers/block/null_blk/zoned.c
> @@ -16,6 +16,44 @@ static inline unsigned int null_zone_no(struct nullb_device *dev, sector_t sect)
> return sect >> ilog2(dev->zone_size_sects);
> }
>
> +static inline unsigned int null_npo2_zone_no(struct nullb_device *dev,
> + sector_t sect)
> +{
> + return sect / dev->zone_size_sects;
> +}
> +
> +static inline bool null_is_po2_zone_emu(struct nullb_device *dev)
> +{
> + return !!dev->zone_size_diff_sects;
> +}
> +
> +static inline sector_t null_zone_update_sector_noop(struct nullb_device *dev,
> + sector_t sect)
> +{
> + return sect;
> +}
> +
> +static inline sector_t null_zone_update_sector_po2_emu(struct nullb_device *dev,
> + sector_t sect)
> +{
> + sector_t zsze_po2 = dev->zone_size_po2_sects;
> + sector_t zsze_diff = dev->zone_size_diff_sects;
> + u32 zone_idx = sect >> ilog2(zsze_po2);
> +
> + sect = sect - (zone_idx * zsze_diff);
> + return sect;
> +}
> +
> +static inline sector_t
> +null_zone_update_sector_append_po2_emu(struct nullb_device *dev, sector_t sect)
> +{
> + /* Need to readjust the sector if po2 emulation is used. */
> + u32 zone_no = dev->zone_no(dev, sect);
> +
> + sect += dev->zone_size_diff_sects * zone_no;
> + return sect;
> +}
> +
> static inline void null_lock_zone_res(struct nullb_device *dev)
> {
> if (dev->need_zone_res_mgmt)
> @@ -62,15 +100,14 @@ int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q)
> sector_t sector = 0;
> unsigned int i;
>
> - if (!is_power_of_2(dev->zone_size)) {
> - pr_err("zone_size must be power-of-two\n");
> - return -EINVAL;
> - }
> if (dev->zone_size > dev->size) {
> pr_err("Zone size larger than device capacity\n");
> return -EINVAL;
> }
>
> + if (!is_power_of_2(dev->zone_size))
> + pr_info("zone_size is not power-of-two. power-of-two emulation is enabled");
> +
> if (!dev->zone_capacity)
> dev->zone_capacity = dev->zone_size;
>
> @@ -83,8 +120,14 @@ int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q)
> zone_capacity_sects = mb_to_sects(dev->zone_capacity);
> dev_capacity_sects = mb_to_sects(dev->size);
> dev->zone_size_sects = mb_to_sects(dev->zone_size);
> - dev->nr_zones = round_up(dev_capacity_sects, dev->zone_size_sects)
> - >> ilog2(dev->zone_size_sects);
> +
> + dev->nr_zones = roundup(dev_capacity_sects, dev->zone_size_sects) /
> + dev->zone_size_sects;
> +
> + dev->zone_no = null_zone_no;
> + dev->zone_update_sector = null_zone_update_sector_noop;
> + dev->zone_update_sector_append = null_zone_update_sector_noop;
> +
>
> dev->zones = kvmalloc_array(dev->nr_zones, sizeof(struct nullb_zone),
> GFP_KERNEL | __GFP_ZERO);
> @@ -166,7 +209,13 @@ int null_register_zoned_dev(struct nullb *nullb)
> if (ret)
> return ret;
> } else {
> - blk_queue_chunk_sectors(q, dev->zone_size_sects);
> + nullb->disk->fops->npo2_zone_setup(nullb->disk);
> +
> + if (null_is_po2_zone_emu(dev))
> + blk_queue_chunk_sectors(q, dev->zone_size_po2_sects);
> + else
> + blk_queue_chunk_sectors(q, dev->zone_size_sects);
> +
> q->nr_zones = blkdev_nr_zones(nullb->disk);
> }
>
> @@ -183,17 +232,49 @@ void null_free_zoned_dev(struct nullb_device *dev)
> dev->zones = NULL;
> }
>
> +static void null_update_zone_info(struct nullb *nullb, struct blk_zone *blkz,
> + struct nullb_zone *zone)
> +{
> + unsigned int zone_idx;
> + u64 zone_gap;
> + struct nullb_device *dev = nullb->dev;
> +
> + if (null_is_po2_zone_emu(dev)) {
> + zone_idx = zone->start / zone->len;
> + zone_gap = zone_idx * dev->zone_size_diff_sects;
> +
> + blkz->start = zone->start + zone_gap;
> + blkz->len = dev->zone_size_po2_sects;
> + blkz->wp = zone->wp + zone_gap;
> + } else {
> + blkz->start = zone->start;
> + blkz->len = zone->len;
> + blkz->wp = zone->wp;
> + }
> +
> + blkz->type = zone->type;
> + blkz->cond = zone->cond;
> + blkz->capacity = zone->capacity;
> +}
> +
> int null_report_zones(struct gendisk *disk, sector_t sector,
> unsigned int nr_zones, report_zones_cb cb, void *data)
> {
> struct nullb *nullb = disk->private_data;
> struct nullb_device *dev = nullb->dev;
> unsigned int first_zone, i;
> + sector_t zone_size;
> struct nullb_zone *zone;
> struct blk_zone blkz;
> int error;
>
> - first_zone = null_zone_no(dev, sector);
> + if (null_is_po2_zone_emu(dev))
> + zone_size = dev->zone_size_po2_sects;
> + else
> + zone_size = dev->zone_size_sects;
> +
> + first_zone = sector / zone_size;
> +
> if (first_zone >= dev->nr_zones)
> return 0;
>
> @@ -210,12 +291,7 @@ int null_report_zones(struct gendisk *disk, sector_t sector,
> * array.
> */
> null_lock_zone(dev, zone);
> - blkz.start = zone->start;
> - blkz.len = zone->len;
> - blkz.wp = zone->wp;
> - blkz.type = zone->type;
> - blkz.cond = zone->cond;
> - blkz.capacity = zone->capacity;
> + null_update_zone_info(nullb, &blkz, zone);
> null_unlock_zone(dev, zone);
>
> error = cb(&blkz, i, data);
> @@ -226,6 +302,35 @@ int null_report_zones(struct gendisk *disk, sector_t sector,
> return nr_zones;
> }
>
> +void null_po2_zone_emu_setup(struct gendisk *disk)
> +{
> + struct nullb *nullb = disk->private_data;
> + struct nullb_device *dev = nullb->dev;
> + sector_t capacity;
> +
> + if (is_power_of_2(dev->zone_size_sects))
> + return;
> +
> + if (!get_capacity(disk))
> + return;
> +
> + blk_mq_freeze_queue(disk->queue);
> +
> + blk_queue_po2_zone_emu(disk->queue, 1);
> + dev->zone_size_po2_sects =
> + 1 << get_count_order_long(dev->zone_size_sects);
> + dev->zone_size_diff_sects =
> + dev->zone_size_po2_sects - dev->zone_size_sects;
> + dev->zone_no = null_npo2_zone_no;
> + dev->zone_update_sector = null_zone_update_sector_po2_emu;
> + dev->zone_update_sector_append = null_zone_update_sector_append_po2_emu;
> +
> + capacity = dev->nr_zones * dev->zone_size_po2_sects;
> + set_capacity(disk, capacity);
> +
> + blk_mq_unfreeze_queue(disk->queue);
> +}
> +
> /*
> * This is called in the case of memory backing from null_process_cmd()
> * with the target zone already locked.
> @@ -234,7 +339,7 @@ size_t null_zone_valid_read_len(struct nullb *nullb,
> sector_t sector, unsigned int len)
> {
> struct nullb_device *dev = nullb->dev;
> - struct nullb_zone *zone = &dev->zones[null_zone_no(dev, sector)];
> + struct nullb_zone *zone = &dev->zones[dev->zone_no(dev, sector)];
> unsigned int nr_sectors = len >> SECTOR_SHIFT;
>
> /* Read must be below the write pointer position */
> @@ -363,11 +468,24 @@ static blk_status_t null_check_zone_resources(struct nullb_device *dev,
> }
> }
>
> +static void null_update_sector_append(struct nullb_cmd *cmd, sector_t sector)
> +{
> + struct nullb_device *dev = cmd->nq->dev;
> +
> + sector = dev->zone_update_sector_append(dev, sector);
> +
> + if (cmd->bio) {
> + cmd->bio->bi_iter.bi_sector = sector;
> + } else {
> + cmd->rq->__sector = sector;
> + }
> +}
> +
> static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector,
> unsigned int nr_sectors, bool append)
> {
> struct nullb_device *dev = cmd->nq->dev;
> - unsigned int zno = null_zone_no(dev, sector);
> + unsigned int zno = dev->zone_no(dev, sector);
> struct nullb_zone *zone = &dev->zones[zno];
> blk_status_t ret;
>
> @@ -395,10 +513,7 @@ static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector,
> */
> if (append) {
> sector = zone->wp;
> - if (cmd->bio)
> - cmd->bio->bi_iter.bi_sector = sector;
> - else
> - cmd->rq->__sector = sector;
> + null_update_sector_append(cmd, sector);
> } else if (sector != zone->wp) {
> ret = BLK_STS_IOERR;
> goto unlock;
> @@ -619,7 +734,7 @@ static blk_status_t null_zone_mgmt(struct nullb_cmd *cmd, enum req_opf op,
> return BLK_STS_OK;
> }
>
> - zone_no = null_zone_no(dev, sector);
> + zone_no = dev->zone_no(dev, sector);
> zone = &dev->zones[zone_no];
>
> null_lock_zone(dev, zone);
> @@ -650,13 +765,54 @@ static blk_status_t null_zone_mgmt(struct nullb_cmd *cmd, enum req_opf op,
> return ret;
> }
>
> +static blk_status_t null_handle_po2_zone_emu_violation(struct nullb_cmd *cmd,
> + enum req_opf op)
> +{
> + if (op == REQ_OP_READ) {
> + if (cmd->bio)
> + zero_fill_bio(cmd->bio);
> + else
> + zero_fill_bio(cmd->rq->bio);
> +
> + return BLK_STS_OK;
> + } else {
> + return BLK_STS_IOERR;
> + }
> +}
> +
> +static bool null_verify_sector_violation(struct nullb_device *dev,
> + sector_t sector)
> +{
> + if (unlikely(null_is_po2_zone_emu(dev))) {
> + /* The zone idx should be calculated based on the emulated
> + * layout
> + */
> + u32 zone_idx = sector >> ilog2(dev->zone_size_po2_sects);
> + sector_t zsze = dev->zone_size_sects;
> + sector_t sect = null_zone_update_sector_po2_emu(dev, sector);
> +
> + if (sect - (zone_idx * zsze) > zsze)
> + return true;
> + }
> + return false;
> +}
> +
> blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd, enum req_opf op,
> sector_t sector, sector_t nr_sectors)
> {
> - struct nullb_device *dev;
> + struct nullb_device *dev = cmd->nq->dev;
> struct nullb_zone *zone;
> blk_status_t sts;
>
> + /* Handle when the sector falls in the emulated area */
> + if (unlikely(null_verify_sector_violation(dev, sector)))
> + return null_handle_po2_zone_emu_violation(cmd, op);
> +
> + /* The sector value is updated if po2 emulation is enabled, else it
> + * will have no effect on the value
> + */
> + sector = dev->zone_update_sector(dev, sector);
> +
> switch (op) {
> case REQ_OP_WRITE:
> return null_zone_write(cmd, sector, nr_sectors, false);
> @@ -669,8 +825,7 @@ blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd, enum req_opf op,
> case REQ_OP_ZONE_FINISH:
> return null_zone_mgmt(cmd, op, sector);
> default:
> - dev = cmd->nq->dev;
> - zone = &dev->zones[null_zone_no(dev, sector)];
> + zone = &dev->zones[dev->zone_no(dev, sector)];
>
> null_lock_zone(dev, zone);
> sts = null_process_cmd(cmd, op, sector, nr_sectors);
--
Damien Le Moal
Western Digital Research
More information about the Linux-nvme
mailing list