[PATCH 06/47] blk-mq: add a flags parameter to blk_mq_alloc_request
Jeff Moyer
jmoyer at redhat.com
Tue Nov 24 07:19:54 PST 2015
Christoph Hellwig <hch at lst.de> writes:
> We already have the reserved flag, and a nowait flag awkwardly encoded as
> a gfp_t. Add a real flags argument to make the scheme more extensible and
> allow for a nicer calling convention.
>
> Signed-off-by: Christoph Hellwig <hch at lst.de>
> ---
> block/blk-core.c | 11 +-
> block/blk-mq-tag.c | 11 +-
> block/blk-mq.c | 20 +-
> block/blk-mq.h | 11 +-
> block/blk.h | 2 +-
> drivers/block/mtip32xx/mtip32xx.c | 2 +-
> drivers/nvme/host/core.c | 1172 +++++++++++++++++++++++++++++++++++++
Christoph, I think you included a bit too much in this patch! ;-)
-Jeff
> drivers/nvme/host/pci.c | 11 +-
> include/linux/blk-mq.h | 8 +-
> 9 files changed, 1210 insertions(+), 38 deletions(-)
> create mode 100644 drivers/nvme/host/core.c
>
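For anyone skimming past the diffstat: the interface change itself is tiny. A
minimal before/after sketch of a call site, using only the signatures and
constants this patch introduces (the surrounding context is invented):

    /* old convention: blocking behaviour hidden in a gfp_t, plus a bool */
    rq = blk_mq_alloc_request(q, READ, GFP_KERNEL, false);  /* may sleep */
    rq = blk_mq_alloc_request(q, READ, GFP_ATOMIC, true);   /* atomic, reserved tag */

    /* new convention: an explicit, extensible flags word */
    rq = blk_mq_alloc_request(q, READ, 0);                  /* may sleep */
    rq = blk_mq_alloc_request(q, READ,
                              BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_RESERVED);
    if (IS_ERR(rq))
        return PTR_ERR(rq);
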
> diff --git a/block/blk-core.c b/block/blk-core.c
> index af9c315..d2100aa 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -630,7 +630,7 @@ struct request_queue *blk_alloc_queue(gfp_t gfp_mask)
> }
> EXPORT_SYMBOL(blk_alloc_queue);
>
> -int blk_queue_enter(struct request_queue *q, gfp_t gfp)
> +int blk_queue_enter(struct request_queue *q, bool nowait)
> {
> while (true) {
> int ret;
> @@ -638,7 +638,7 @@ int blk_queue_enter(struct request_queue *q, gfp_t gfp)
> if (percpu_ref_tryget_live(&q->q_usage_counter))
> return 0;
>
> - if (!gfpflags_allow_blocking(gfp))
> + if (nowait)
> return -EBUSY;
>
> ret = wait_event_interruptible(q->mq_freeze_wq,
> @@ -1284,7 +1284,9 @@ static struct request *blk_old_get_request(struct request_queue *q, int rw,
> struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
> {
> if (q->mq_ops)
> - return blk_mq_alloc_request(q, rw, gfp_mask, false);
> + return blk_mq_alloc_request(q, rw,
> + (gfp_mask & __GFP_DIRECT_RECLAIM) ?
> + 0 : BLK_MQ_REQ_NOWAIT);
> else
> return blk_old_get_request(q, rw, gfp_mask);
> }
> @@ -2052,8 +2054,7 @@ blk_qc_t generic_make_request(struct bio *bio)
> do {
> struct request_queue *q = bdev_get_queue(bio->bi_bdev);
>
> - if (likely(blk_queue_enter(q, __GFP_DIRECT_RECLAIM) == 0)) {
> -
> + if (likely(blk_queue_enter(q, false) == 0)) {
> ret = q->make_request_fn(q, bio);
>
> blk_queue_exit(q);
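Note the polarity flip in blk_queue_enter(): it now takes "nowait" directly
instead of deriving it from gfp flags. A legacy gfp-based caller maps onto the
new scheme roughly like this (hypothetical helper, not part of the patch):

    static inline unsigned int blk_mq_gfp_to_flags(gfp_t gfp)
    {
        /* gfpflags_allow_blocking() tests __GFP_DIRECT_RECLAIM */
        return gfpflags_allow_blocking(gfp) ? 0 : BLK_MQ_REQ_NOWAIT;
    }

which is the same translation blk_get_request() now performs inline above.
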
> diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
> index a07ca34..abdbb47 100644
> --- a/block/blk-mq-tag.c
> +++ b/block/blk-mq-tag.c
> @@ -268,7 +268,7 @@ static int bt_get(struct blk_mq_alloc_data *data,
> if (tag != -1)
> return tag;
>
> - if (!gfpflags_allow_blocking(data->gfp))
> + if (data->flags & BLK_MQ_REQ_NOWAIT)
> return -1;
>
> bs = bt_wait_ptr(bt, hctx);
> @@ -303,7 +303,7 @@ static int bt_get(struct blk_mq_alloc_data *data,
> data->ctx = blk_mq_get_ctx(data->q);
> data->hctx = data->q->mq_ops->map_queue(data->q,
> data->ctx->cpu);
> - if (data->reserved) {
> + if (data->flags & BLK_MQ_REQ_RESERVED) {
> bt = &data->hctx->tags->breserved_tags;
> } else {
> last_tag = &data->ctx->last_tag;
> @@ -349,10 +349,9 @@ static unsigned int __blk_mq_get_reserved_tag(struct blk_mq_alloc_data *data)
>
> unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
> {
> - if (!data->reserved)
> - return __blk_mq_get_tag(data);
> -
> - return __blk_mq_get_reserved_tag(data);
> + if (data->flags & BLK_MQ_REQ_RESERVED)
> + return __blk_mq_get_reserved_tag(data);
> + return __blk_mq_get_tag(data);
> }
>
> static struct bt_wait_state *bt_wake_ptr(struct blk_mq_bitmap_tags *bt)
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index c932605..6da03f1 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -230,8 +230,8 @@ __blk_mq_alloc_request(struct blk_mq_alloc_data *data, int rw)
> return NULL;
> }
>
> -struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp,
> - bool reserved)
> +struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
> + unsigned int flags)
> {
> struct blk_mq_ctx *ctx;
> struct blk_mq_hw_ctx *hctx;
> @@ -239,24 +239,22 @@ struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp,
> struct blk_mq_alloc_data alloc_data;
> int ret;
>
> - ret = blk_queue_enter(q, gfp);
> + ret = blk_queue_enter(q, flags & BLK_MQ_REQ_NOWAIT);
> if (ret)
> return ERR_PTR(ret);
>
> ctx = blk_mq_get_ctx(q);
> hctx = q->mq_ops->map_queue(q, ctx->cpu);
> - blk_mq_set_alloc_data(&alloc_data, q, gfp & ~__GFP_DIRECT_RECLAIM,
> - reserved, ctx, hctx);
> + blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx);
>
> rq = __blk_mq_alloc_request(&alloc_data, rw);
> - if (!rq && (gfp & __GFP_DIRECT_RECLAIM)) {
> + if (!rq && !(flags & BLK_MQ_REQ_NOWAIT)) {
> __blk_mq_run_hw_queue(hctx);
> blk_mq_put_ctx(ctx);
>
> ctx = blk_mq_get_ctx(q);
> hctx = q->mq_ops->map_queue(q, ctx->cpu);
> - blk_mq_set_alloc_data(&alloc_data, q, gfp, reserved, ctx,
> - hctx);
> + blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx);
> rq = __blk_mq_alloc_request(&alloc_data, rw);
> ctx = alloc_data.ctx;
> }
> @@ -1181,8 +1179,7 @@ static struct request *blk_mq_map_request(struct request_queue *q,
> rw |= REQ_SYNC;
>
> trace_block_getrq(q, bio, rw);
> - blk_mq_set_alloc_data(&alloc_data, q, GFP_ATOMIC, false, ctx,
> - hctx);
> + blk_mq_set_alloc_data(&alloc_data, q, BLK_MQ_REQ_NOWAIT, ctx, hctx);
> rq = __blk_mq_alloc_request(&alloc_data, rw);
> if (unlikely(!rq)) {
> __blk_mq_run_hw_queue(hctx);
> @@ -1191,8 +1188,7 @@ static struct request *blk_mq_map_request(struct request_queue *q,
>
> ctx = blk_mq_get_ctx(q);
> hctx = q->mq_ops->map_queue(q, ctx->cpu);
> - blk_mq_set_alloc_data(&alloc_data, q,
> - __GFP_RECLAIM|__GFP_HIGH, false, ctx, hctx);
> + blk_mq_set_alloc_data(&alloc_data, q, 0, ctx, hctx);
> rq = __blk_mq_alloc_request(&alloc_data, rw);
> ctx = alloc_data.ctx;
> hctx = alloc_data.hctx;
> diff --git a/block/blk-mq.h b/block/blk-mq.h
> index 713820b..eaede8e 100644
> --- a/block/blk-mq.h
> +++ b/block/blk-mq.h
> @@ -96,8 +96,7 @@ static inline void blk_mq_put_ctx(struct blk_mq_ctx *ctx)
> struct blk_mq_alloc_data {
> /* input parameter */
> struct request_queue *q;
> - gfp_t gfp;
> - bool reserved;
> + unsigned int flags;
>
> /* input & output parameter */
> struct blk_mq_ctx *ctx;
> @@ -105,13 +104,11 @@ struct blk_mq_alloc_data {
> };
>
> static inline void blk_mq_set_alloc_data(struct blk_mq_alloc_data *data,
> - struct request_queue *q, gfp_t gfp, bool reserved,
> - struct blk_mq_ctx *ctx,
> - struct blk_mq_hw_ctx *hctx)
> + struct request_queue *q, unsigned int flags,
> + struct blk_mq_ctx *ctx, struct blk_mq_hw_ctx *hctx)
> {
> data->q = q;
> - data->gfp = gfp;
> - data->reserved = reserved;
> + data->flags = flags;
> data->ctx = ctx;
> data->hctx = hctx;
> }
> diff --git a/block/blk.h b/block/blk.h
> index 1d95107..38bf997 100644
> --- a/block/blk.h
> +++ b/block/blk.h
> @@ -72,7 +72,7 @@ void blk_dequeue_request(struct request *rq);
> void __blk_queue_free_tags(struct request_queue *q);
> bool __blk_end_bidi_request(struct request *rq, int error,
> unsigned int nr_bytes, unsigned int bidi_bytes);
> -int blk_queue_enter(struct request_queue *q, gfp_t gfp);
> +int blk_queue_enter(struct request_queue *q, bool nowait);
> void blk_queue_exit(struct request_queue *q);
> void blk_freeze_queue(struct request_queue *q);
>
> diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
> index a28a562..cf3b51a 100644
> --- a/drivers/block/mtip32xx/mtip32xx.c
> +++ b/drivers/block/mtip32xx/mtip32xx.c
> @@ -173,7 +173,7 @@ static struct mtip_cmd *mtip_get_int_command(struct driver_data *dd)
> {
> struct request *rq;
>
> - rq = blk_mq_alloc_request(dd->queue, 0, __GFP_RECLAIM, true);
> + rq = blk_mq_alloc_request(dd->queue, 0, BLK_MQ_REQ_RESERVED);
> return blk_mq_rq_to_pdu(rq);
> }
>
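Side note on the mtip32xx hunk: blk_mq_alloc_request() returns an ERR_PTR()
on failure, and mtip_get_int_command() feeds the result straight into
blk_mq_rq_to_pdu(). A more defensive version could look like this (a sketch,
not part of this patch):

    static struct mtip_cmd *mtip_get_int_command(struct driver_data *dd)
    {
        struct request *rq;

        /* reserved tag, blocking allocation (no BLK_MQ_REQ_NOWAIT) */
        rq = blk_mq_alloc_request(dd->queue, 0, BLK_MQ_REQ_RESERVED);
        if (IS_ERR(rq))
            return NULL;
        return blk_mq_rq_to_pdu(rq);
    }
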
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> new file mode 100644
> index 0000000..53cf507
> --- /dev/null
> +++ b/drivers/nvme/host/core.c
> @@ -0,0 +1,1172 @@
> +/*
> + * NVM Express device driver
> + * Copyright (c) 2011-2014, Intel Corporation.
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms and conditions of the GNU General Public License,
> + * version 2, as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope it will be useful, but WITHOUT
> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
> + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
> + * more details.
> + */
> +
> +#include <linux/blkdev.h>
> +#include <linux/blk-mq.h>
> +#include <linux/errno.h>
> +#include <linux/hdreg.h>
> +#include <linux/kernel.h>
> +#include <linux/module.h>
> +#include <linux/list_sort.h>
> +#include <linux/slab.h>
> +#include <linux/types.h>
> +#include <linux/pr.h>
> +#include <linux/ptrace.h>
> +#include <linux/nvme_ioctl.h>
> +#include <linux/t10-pi.h>
> +#include <scsi/sg.h>
> +#include <asm/unaligned.h>
> +
> +#include "nvme.h"
> +
> +#define NVME_MINORS (1U << MINORBITS)
> +
> +static int nvme_major;
> +module_param(nvme_major, int, 0);
> +
> +static int nvme_char_major;
> +module_param(nvme_char_major, int, 0);
> +
> +static LIST_HEAD(nvme_ctrl_list);
> +DEFINE_SPINLOCK(dev_list_lock);
> +
> +static struct class *nvme_class;
> +
> +static void nvme_free_ns(struct kref *kref)
> +{
> + struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref);
> +
> + if (ns->type == NVME_NS_LIGHTNVM)
> + nvme_nvm_unregister(ns->queue, ns->disk->disk_name);
> +
> + spin_lock(&dev_list_lock);
> + ns->disk->private_data = NULL;
> + spin_unlock(&dev_list_lock);
> +
> + nvme_put_ctrl(ns->ctrl);
> + put_disk(ns->disk);
> + kfree(ns);
> +}
> +
> +static void nvme_put_ns(struct nvme_ns *ns)
> +{
> + kref_put(&ns->kref, nvme_free_ns);
> +}
> +
> +static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk)
> +{
> + struct nvme_ns *ns;
> +
> + spin_lock(&dev_list_lock);
> + ns = disk->private_data;
> + if (ns && !kref_get_unless_zero(&ns->kref))
> + ns = NULL;
> + spin_unlock(&dev_list_lock);
> +
> + return ns;
> +}
> +
> +static struct request *nvme_alloc_request(struct request_queue *q,
> + struct nvme_command *cmd)
> +{
> + bool write = cmd->common.opcode & 1;
> + struct request *req;
> +
> + req = blk_mq_alloc_request(q, write, 0);
> + if (IS_ERR(req))
> + return req;
> +
> + req->cmd_type = REQ_TYPE_DRV_PRIV;
> + req->cmd_flags |= REQ_FAILFAST_DRIVER;
> + req->__data_len = 0;
> + req->__sector = (sector_t) -1;
> + req->bio = req->biotail = NULL;
> +
> + req->cmd = (unsigned char *)cmd;
> + req->cmd_len = sizeof(struct nvme_command);
> + req->special = (void *)0;
> +
> + return req;
> +}
> +
> +/*
> + * Returns 0 on success. If the result is negative, it's a Linux error code;
> + * if the result is positive, it's an NVM Express status code
> + */
> +int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
> + void *buffer, unsigned bufflen, u32 *result, unsigned timeout)
> +{
> + struct request *req;
> + int ret;
> +
> + req = nvme_alloc_request(q, cmd);
> + if (IS_ERR(req))
> + return PTR_ERR(req);
> +
> + req->timeout = timeout ? timeout : ADMIN_TIMEOUT;
> +
> + if (buffer && bufflen) {
> + ret = blk_rq_map_kern(q, req, buffer, bufflen, GFP_KERNEL);
> + if (ret)
> + goto out;
> + }
> +
> + blk_execute_rq(req->q, NULL, req, 0);
> + if (result)
> + *result = (u32)(uintptr_t)req->special;
> + ret = req->errors;
> + out:
> + blk_mq_free_request(req);
> + return ret;
> +}
> +
> +int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
> + void *buffer, unsigned bufflen)
> +{
> + return __nvme_submit_sync_cmd(q, cmd, buffer, bufflen, NULL, 0);
> +}
> +
> +int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd,
> + void __user *ubuffer, unsigned bufflen,
> + void __user *meta_buffer, unsigned meta_len, u32 meta_seed,
> + u32 *result, unsigned timeout)
> +{
> + bool write = cmd->common.opcode & 1;
> + struct nvme_ns *ns = q->queuedata;
> + struct gendisk *disk = ns ? ns->disk : NULL;
> + struct request *req;
> + struct bio *bio = NULL;
> + void *meta = NULL;
> + int ret;
> +
> + req = nvme_alloc_request(q, cmd);
> + if (IS_ERR(req))
> + return PTR_ERR(req);
> +
> + req->timeout = timeout ? timeout : ADMIN_TIMEOUT;
> +
> + if (ubuffer && bufflen) {
> + ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen,
> + GFP_KERNEL);
> + if (ret)
> + goto out;
> + bio = req->bio;
> +
> + if (!disk)
> + goto submit;
> + bio->bi_bdev = bdget_disk(disk, 0);
> + if (!bio->bi_bdev) {
> + ret = -ENODEV;
> + goto out_unmap;
> + }
> +
> + if (meta_buffer) {
> + struct bio_integrity_payload *bip;
> +
> + meta = kmalloc(meta_len, GFP_KERNEL);
> + if (!meta) {
> + ret = -ENOMEM;
> + goto out_unmap;
> + }
> +
> + if (write) {
> + if (copy_from_user(meta, meta_buffer,
> + meta_len)) {
> + ret = -EFAULT;
> + goto out_free_meta;
> + }
> + }
> +
> + bip = bio_integrity_alloc(bio, GFP_KERNEL, 1);
> + if (!bip) {
> + ret = -ENOMEM;
> + goto out_free_meta;
> + }
> +
> + bip->bip_iter.bi_size = meta_len;
> + bip->bip_iter.bi_sector = meta_seed;
> +
> + ret = bio_integrity_add_page(bio, virt_to_page(meta),
> + meta_len, offset_in_page(meta));
> + if (ret != meta_len) {
> + ret = -ENOMEM;
> + goto out_free_meta;
> + }
> + }
> + }
> + submit:
> + blk_execute_rq(req->q, disk, req, 0);
> + ret = req->errors;
> + if (result)
> + *result = (u32)(uintptr_t)req->special;
> + if (meta && !ret && !write) {
> + if (copy_to_user(meta_buffer, meta, meta_len))
> + ret = -EFAULT;
> + }
> + out_free_meta:
> + kfree(meta);
> + out_unmap:
> + if (bio) {
> + if (disk && bio->bi_bdev)
> + bdput(bio->bi_bdev);
> + blk_rq_unmap_user(bio);
> + }
> + out:
> + blk_mq_free_request(req);
> + return ret;
> +}
> +
> +int nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd,
> + void __user *ubuffer, unsigned bufflen, u32 *result,
> + unsigned timeout)
> +{
> + return __nvme_submit_user_cmd(q, cmd, ubuffer, bufflen, NULL, 0, 0,
> + result, timeout);
> +}
> +
> +int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
> +{
> + struct nvme_command c = { };
> + int error;
> +
> + /* gcc-4.4.4 (at least) has issues with initializers and anon unions */
> + c.identify.opcode = nvme_admin_identify;
> + c.identify.cns = cpu_to_le32(1);
> +
> + *id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL);
> + if (!*id)
> + return -ENOMEM;
> +
> + error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
> + sizeof(struct nvme_id_ctrl));
> + if (error)
> + kfree(*id);
> + return error;
> +}
> +
> +static int nvme_identify_ns_list(struct nvme_ctrl *dev, unsigned nsid, __le32 *ns_list)
> +{
> + struct nvme_command c = { };
> +
> + c.identify.opcode = nvme_admin_identify;
> + c.identify.cns = cpu_to_le32(2);
> + c.identify.nsid = cpu_to_le32(nsid);
> + return nvme_submit_sync_cmd(dev->admin_q, &c, ns_list, 0x1000);
> +}
> +
> +int nvme_identify_ns(struct nvme_ctrl *dev, unsigned nsid,
> + struct nvme_id_ns **id)
> +{
> + struct nvme_command c = { };
> + int error;
> +
> + /* gcc-4.4.4 (at least) has issues with initializers and anon unions */
> + c.identify.opcode = nvme_admin_identify;
> + c.identify.nsid = cpu_to_le32(nsid);
> +
> + *id = kmalloc(sizeof(struct nvme_id_ns), GFP_KERNEL);
> + if (!*id)
> + return -ENOMEM;
> +
> + error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
> + sizeof(struct nvme_id_ns));
> + if (error)
> + kfree(*id);
> + return error;
> +}
> +
> +int nvme_get_features(struct nvme_ctrl *dev, unsigned fid, unsigned nsid,
> + dma_addr_t dma_addr, u32 *result)
> +{
> + struct nvme_command c;
> +
> + memset(&c, 0, sizeof(c));
> + c.features.opcode = nvme_admin_get_features;
> + c.features.nsid = cpu_to_le32(nsid);
> + c.features.prp1 = cpu_to_le64(dma_addr);
> + c.features.fid = cpu_to_le32(fid);
> +
> + return __nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0, result, 0);
> +}
> +
> +int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11,
> + dma_addr_t dma_addr, u32 *result)
> +{
> + struct nvme_command c;
> +
> + memset(&c, 0, sizeof(c));
> + c.features.opcode = nvme_admin_set_features;
> + c.features.prp1 = cpu_to_le64(dma_addr);
> + c.features.fid = cpu_to_le32(fid);
> + c.features.dword11 = cpu_to_le32(dword11);
> +
> + return __nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0, result, 0);
> +}
> +
> +int nvme_get_log_page(struct nvme_ctrl *dev, struct nvme_smart_log **log)
> +{
> + struct nvme_command c = { };
> + int error;
> +
> + c.common.opcode = nvme_admin_get_log_page;
> + c.common.nsid = cpu_to_le32(0xFFFFFFFF);
> + c.common.cdw10[0] = cpu_to_le32(
> + (((sizeof(struct nvme_smart_log) / 4) - 1) << 16) |
> + NVME_LOG_SMART);
> +
> + *log = kmalloc(sizeof(struct nvme_smart_log), GFP_KERNEL);
> + if (!*log)
> + return -ENOMEM;
> +
> + error = nvme_submit_sync_cmd(dev->admin_q, &c, *log,
> + sizeof(struct nvme_smart_log));
> + if (error)
> + kfree(*log);
> + return error;
> +}
> +
> +static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
> +{
> + struct nvme_user_io io;
> + struct nvme_command c;
> + unsigned length, meta_len;
> + void __user *metadata;
> +
> + if (copy_from_user(&io, uio, sizeof(io)))
> + return -EFAULT;
> +
> + switch (io.opcode) {
> + case nvme_cmd_write:
> + case nvme_cmd_read:
> + case nvme_cmd_compare:
> + break;
> + default:
> + return -EINVAL;
> + }
> +
> + length = (io.nblocks + 1) << ns->lba_shift;
> + meta_len = (io.nblocks + 1) * ns->ms;
> + metadata = (void __user *)(uintptr_t)io.metadata;
> +
> + if (ns->ext) {
> + length += meta_len;
> + meta_len = 0;
> + } else if (meta_len) {
> + if ((io.metadata & 3) || !io.metadata)
> + return -EINVAL;
> + }
> +
> + memset(&c, 0, sizeof(c));
> + c.rw.opcode = io.opcode;
> + c.rw.flags = io.flags;
> + c.rw.nsid = cpu_to_le32(ns->ns_id);
> + c.rw.slba = cpu_to_le64(io.slba);
> + c.rw.length = cpu_to_le16(io.nblocks);
> + c.rw.control = cpu_to_le16(io.control);
> + c.rw.dsmgmt = cpu_to_le32(io.dsmgmt);
> + c.rw.reftag = cpu_to_le32(io.reftag);
> + c.rw.apptag = cpu_to_le16(io.apptag);
> + c.rw.appmask = cpu_to_le16(io.appmask);
> +
> + return __nvme_submit_user_cmd(ns->queue, &c,
> + (void __user *)(uintptr_t)io.addr, length,
> + metadata, meta_len, io.slba, NULL, 0);
> +}
> +
> +static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
> + struct nvme_passthru_cmd __user *ucmd)
> +{
> + struct nvme_passthru_cmd cmd;
> + struct nvme_command c;
> + unsigned timeout = 0;
> + int status;
> +
> + if (!capable(CAP_SYS_ADMIN))
> + return -EACCES;
> + if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
> + return -EFAULT;
> +
> + memset(&c, 0, sizeof(c));
> + c.common.opcode = cmd.opcode;
> + c.common.flags = cmd.flags;
> + c.common.nsid = cpu_to_le32(cmd.nsid);
> + c.common.cdw2[0] = cpu_to_le32(cmd.cdw2);
> + c.common.cdw2[1] = cpu_to_le32(cmd.cdw3);
> + c.common.cdw10[0] = cpu_to_le32(cmd.cdw10);
> + c.common.cdw10[1] = cpu_to_le32(cmd.cdw11);
> + c.common.cdw10[2] = cpu_to_le32(cmd.cdw12);
> + c.common.cdw10[3] = cpu_to_le32(cmd.cdw13);
> + c.common.cdw10[4] = cpu_to_le32(cmd.cdw14);
> + c.common.cdw10[5] = cpu_to_le32(cmd.cdw15);
> +
> + if (cmd.timeout_ms)
> + timeout = msecs_to_jiffies(cmd.timeout_ms);
> +
> + status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
> + (void __user *)cmd.addr, cmd.data_len,
> + &cmd.result, timeout);
> + if (status >= 0) {
> + if (put_user(cmd.result, &ucmd->result))
> + return -EFAULT;
> + }
> +
> + return status;
> +}
> +
> +static int nvme_ioctl(struct block_device *bdev, fmode_t mode,
> + unsigned int cmd, unsigned long arg)
> +{
> + struct nvme_ns *ns = bdev->bd_disk->private_data;
> +
> + switch (cmd) {
> + case NVME_IOCTL_ID:
> + force_successful_syscall_return();
> + return ns->ns_id;
> + case NVME_IOCTL_ADMIN_CMD:
> + return nvme_user_cmd(ns->ctrl, NULL, (void __user *)arg);
> + case NVME_IOCTL_IO_CMD:
> + return nvme_user_cmd(ns->ctrl, ns, (void __user *)arg);
> + case NVME_IOCTL_SUBMIT_IO:
> + return nvme_submit_io(ns, (void __user *)arg);
> + case SG_GET_VERSION_NUM:
> + return nvme_sg_get_version_num((void __user *)arg);
> + case SG_IO:
> + return nvme_sg_io(ns, (void __user *)arg);
> + default:
> + return -ENOTTY;
> + }
> +}
> +
> +#ifdef CONFIG_COMPAT
> +static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode,
> + unsigned int cmd, unsigned long arg)
> +{
> + switch (cmd) {
> + case SG_IO:
> + return -ENOIOCTLCMD;
> + }
> + return nvme_ioctl(bdev, mode, cmd, arg);
> +}
> +#else
> +#define nvme_compat_ioctl NULL
> +#endif
> +
> +static int nvme_open(struct block_device *bdev, fmode_t mode)
> +{
> + return nvme_get_ns_from_disk(bdev->bd_disk) ? 0 : -ENXIO;
> +}
> +
> +static void nvme_release(struct gendisk *disk, fmode_t mode)
> +{
> + nvme_put_ns(disk->private_data);
> +}
> +
> +static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo)
> +{
> + /* some standard values */
> + geo->heads = 1 << 6;
> + geo->sectors = 1 << 5;
> + geo->cylinders = get_capacity(bdev->bd_disk) >> 11;
> + return 0;
> +}
> +
> +#ifdef CONFIG_BLK_DEV_INTEGRITY
> +static void nvme_init_integrity(struct nvme_ns *ns)
> +{
> + struct blk_integrity integrity;
> +
> + switch (ns->pi_type) {
> + case NVME_NS_DPS_PI_TYPE3:
> + integrity.profile = &t10_pi_type3_crc;
> + break;
> + case NVME_NS_DPS_PI_TYPE1:
> + case NVME_NS_DPS_PI_TYPE2:
> + integrity.profile = &t10_pi_type1_crc;
> + break;
> + default:
> + integrity.profile = NULL;
> + break;
> + }
> + integrity.tuple_size = ns->ms;
> + blk_integrity_register(ns->disk, &integrity);
> + blk_queue_max_integrity_segments(ns->queue, 1);
> +}
> +#else
> +static void nvme_init_integrity(struct nvme_ns *ns)
> +{
> +}
> +#endif /* CONFIG_BLK_DEV_INTEGRITY */
> +
> +static void nvme_config_discard(struct nvme_ns *ns)
> +{
> + u32 logical_block_size = queue_logical_block_size(ns->queue);
> + ns->queue->limits.discard_zeroes_data = 0;
> + ns->queue->limits.discard_alignment = logical_block_size;
> + ns->queue->limits.discard_granularity = logical_block_size;
> + blk_queue_max_discard_sectors(ns->queue, 0xffffffff);
> + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue);
> +}
> +
> +static int nvme_revalidate_disk(struct gendisk *disk)
> +{
> + struct nvme_ns *ns = disk->private_data;
> + struct nvme_id_ns *id;
> + u8 lbaf, pi_type;
> + u16 old_ms;
> + unsigned short bs;
> +
> + if (nvme_identify_ns(ns->ctrl, ns->ns_id, &id)) {
> + dev_warn(ns->ctrl->dev, "%s: Identify failure nvme%dn%d\n",
> + __func__, ns->ctrl->instance, ns->ns_id);
> + return -ENODEV;
> + }
> + if (id->ncap == 0) {
> + kfree(id);
> + return -ENODEV;
> + }
> +
> + if (nvme_nvm_ns_supported(ns, id) && ns->type != NVME_NS_LIGHTNVM) {
> + if (nvme_nvm_register(ns->queue, disk->disk_name)) {
> + dev_warn(ns->ctrl->dev,
> + "%s: LightNVM init failure\n", __func__);
> + kfree(id);
> + return -ENODEV;
> + }
> + ns->type = NVME_NS_LIGHTNVM;
> + }
> +
> + old_ms = ns->ms;
> + lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK;
> + ns->lba_shift = id->lbaf[lbaf].ds;
> + ns->ms = le16_to_cpu(id->lbaf[lbaf].ms);
> + ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT);
> +
> + /*
> + * If Identify Namespace failed, use a default 512 byte block size so
> + * the block layer can use the disk before failing reads/writes for 0 capacity.
> + */
> + if (ns->lba_shift == 0)
> + ns->lba_shift = 9;
> + bs = 1 << ns->lba_shift;
> + /* XXX: PI implementation requires metadata equal t10 pi tuple size */
> + pi_type = ns->ms == sizeof(struct t10_pi_tuple) ?
> + id->dps & NVME_NS_DPS_PI_MASK : 0;
> +
> + blk_mq_freeze_queue(disk->queue);
> + if (blk_get_integrity(disk) && (ns->pi_type != pi_type ||
> + ns->ms != old_ms ||
> + bs != queue_logical_block_size(disk->queue) ||
> + (ns->ms && ns->ext)))
> + blk_integrity_unregister(disk);
> +
> + ns->pi_type = pi_type;
> + blk_queue_logical_block_size(ns->queue, bs);
> +
> + if (ns->ms && !blk_get_integrity(disk) && !ns->ext)
> + nvme_init_integrity(ns);
> + if (ns->ms && !(ns->ms == 8 && ns->pi_type) && !blk_get_integrity(disk))
> + set_capacity(disk, 0);
> + else
> + set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9));
> +
> + if (ns->ctrl->oncs & NVME_CTRL_ONCS_DSM)
> + nvme_config_discard(ns);
> + blk_mq_unfreeze_queue(disk->queue);
> +
> + kfree(id);
> + return 0;
> +}
> +
> +static char nvme_pr_type(enum pr_type type)
> +{
> + switch (type) {
> + case PR_WRITE_EXCLUSIVE:
> + return 1;
> + case PR_EXCLUSIVE_ACCESS:
> + return 2;
> + case PR_WRITE_EXCLUSIVE_REG_ONLY:
> + return 3;
> + case PR_EXCLUSIVE_ACCESS_REG_ONLY:
> + return 4;
> + case PR_WRITE_EXCLUSIVE_ALL_REGS:
> + return 5;
> + case PR_EXCLUSIVE_ACCESS_ALL_REGS:
> + return 6;
> + default:
> + return 0;
> + }
> +}
> +
> +static int nvme_pr_command(struct block_device *bdev, u32 cdw10,
> + u64 key, u64 sa_key, u8 op)
> +{
> + struct nvme_ns *ns = bdev->bd_disk->private_data;
> + struct nvme_command c;
> + u8 data[16] = { 0, };
> +
> + put_unaligned_le64(key, &data[0]);
> + put_unaligned_le64(sa_key, &data[8]);
> +
> + memset(&c, 0, sizeof(c));
> + c.common.opcode = op;
> + c.common.nsid = cpu_to_le32(ns->ns_id);
> + c.common.cdw10[0] = cpu_to_le32(cdw10);
> +
> + return nvme_submit_sync_cmd(ns->queue, &c, data, 16);
> +}
> +
> +static int nvme_pr_register(struct block_device *bdev, u64 old,
> + u64 new, unsigned flags)
> +{
> + u32 cdw10;
> +
> + if (flags & ~PR_FL_IGNORE_KEY)
> + return -EOPNOTSUPP;
> +
> + cdw10 = old ? 2 : 0;
> + cdw10 |= (flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0;
> + cdw10 |= (1 << 30) | (1 << 31); /* PTPL=1 */
> + return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_register);
> +}
> +
> +static int nvme_pr_reserve(struct block_device *bdev, u64 key,
> + enum pr_type type, unsigned flags)
> +{
> + u32 cdw10;
> +
> + if (flags & ~PR_FL_IGNORE_KEY)
> + return -EOPNOTSUPP;
> +
> + cdw10 = nvme_pr_type(type) << 8;
> + cdw10 |= ((flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0);
> + return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_acquire);
> +}
> +
> +static int nvme_pr_preempt(struct block_device *bdev, u64 old, u64 new,
> + enum pr_type type, bool abort)
> +{
> + u32 cdw10 = (nvme_pr_type(type) << 8) | (abort ? 2 : 1);
> +
> + return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_acquire);
> +}
> +
> +static int nvme_pr_clear(struct block_device *bdev, u64 key)
> +{
> + u32 cdw10 = 1 | (key ? 1 << 3 : 0);
> +
> + return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_register);
> +}
> +
> +static int nvme_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
> +{
> + u32 cdw10 = (nvme_pr_type(type) << 8) | (key ? 1 << 3 : 0);
> +
> + return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release);
> +}
> +
> +static const struct pr_ops nvme_pr_ops = {
> + .pr_register = nvme_pr_register,
> + .pr_reserve = nvme_pr_reserve,
> + .pr_release = nvme_pr_release,
> + .pr_preempt = nvme_pr_preempt,
> + .pr_clear = nvme_pr_clear,
> +};
> +
> +static const struct block_device_operations nvme_fops = {
> + .owner = THIS_MODULE,
> + .ioctl = nvme_ioctl,
> + .compat_ioctl = nvme_compat_ioctl,
> + .open = nvme_open,
> + .release = nvme_release,
> + .getgeo = nvme_getgeo,
> + .revalidate_disk= nvme_revalidate_disk,
> + .pr_ops = &nvme_pr_ops,
> +};
> +
> +/*
> + * Initialize the cached copies of the Identify data and various controller
> + * registers in our nvme_ctrl structure. This should be called as soon as
> + * the admin queue is fully up and running.
> + */
> +int nvme_init_identify(struct nvme_ctrl *ctrl)
> +{
> + struct nvme_id_ctrl *id;
> + u64 cap;
> + int ret, page_shift;
> +
> + ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs);
> + if (ret) {
> + dev_err(ctrl->dev, "Reading VS failed (%d)\n", ret);
> + return ret;
> + }
> +
> + ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &cap);
> + if (ret) {
> + dev_err(ctrl->dev, "Reading CAP failed (%d)\n", ret);
> + return ret;
> + }
> + page_shift = NVME_CAP_MPSMIN(cap) + 12;
> + ctrl->page_size = 1 << page_shift;
> +
> + if (ctrl->vs >= NVME_VS(1, 1))
> + ctrl->subsystem = NVME_CAP_NSSRC(cap);
> +
> + ret = nvme_identify_ctrl(ctrl, &id);
> + if (ret) {
> + dev_err(ctrl->dev, "Identify Controller failed (%d)\n", ret);
> + return -EIO;
> + }
> +
> + ctrl->oncs = le16_to_cpup(&id->oncs);
> + atomic_set(&ctrl->abort_limit, id->acl + 1);
> + ctrl->vwc = id->vwc;
> + memcpy(ctrl->serial, id->sn, sizeof(id->sn));
> + memcpy(ctrl->model, id->mn, sizeof(id->mn));
> + memcpy(ctrl->firmware_rev, id->fr, sizeof(id->fr));
> + if (id->mdts)
> + ctrl->max_hw_sectors = 1 << (id->mdts + page_shift - 9);
> + else
> + ctrl->max_hw_sectors = UINT_MAX;
> +
> + if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) && id->vs[3]) {
> + unsigned int max_hw_sectors;
> +
> + ctrl->stripe_size = 1 << (id->vs[3] + page_shift);
> + max_hw_sectors = ctrl->stripe_size >> (page_shift - 9);
> + if (ctrl->max_hw_sectors) {
> + ctrl->max_hw_sectors = min(max_hw_sectors,
> + ctrl->max_hw_sectors);
> + } else {
> + ctrl->max_hw_sectors = max_hw_sectors;
> + }
> + }
> +
> + kfree(id);
> + return 0;
> +}
> +
> +static int nvme_dev_open(struct inode *inode, struct file *file)
> +{
> + struct nvme_ctrl *ctrl;
> + int instance = iminor(inode);
> + int ret = -ENODEV;
> +
> + spin_lock(&dev_list_lock);
> + list_for_each_entry(ctrl, &nvme_ctrl_list, node) {
> + if (ctrl->instance != instance)
> + continue;
> +
> + if (!ctrl->admin_q) {
> + ret = -EWOULDBLOCK;
> + break;
> + }
> + if (!kref_get_unless_zero(&ctrl->kref))
> + break;
> + file->private_data = ctrl;
> + ret = 0;
> + break;
> + }
> + spin_unlock(&dev_list_lock);
> +
> + return ret;
> +}
> +
> +static int nvme_dev_release(struct inode *inode, struct file *file)
> +{
> + nvme_put_ctrl(file->private_data);
> + return 0;
> +}
> +
> +static long nvme_dev_ioctl(struct file *file, unsigned int cmd,
> + unsigned long arg)
> +{
> + struct nvme_ctrl *ctrl = file->private_data;
> + void __user *argp = (void __user *)arg;
> + struct nvme_ns *ns;
> +
> + switch (cmd) {
> + case NVME_IOCTL_ADMIN_CMD:
> + return nvme_user_cmd(ctrl, NULL, argp);
> + case NVME_IOCTL_IO_CMD:
> + if (list_empty(&ctrl->namespaces))
> + return -ENOTTY;
> + ns = list_first_entry(&ctrl->namespaces, struct nvme_ns, list);
> + return nvme_user_cmd(ctrl, ns, argp);
> + case NVME_IOCTL_RESET:
> + dev_warn(ctrl->dev, "resetting controller\n");
> + return ctrl->ops->reset_ctrl(ctrl);
> + case NVME_IOCTL_SUBSYS_RESET:
> + return nvme_reset_subsystem(ctrl);
> + default:
> + return -ENOTTY;
> + }
> +}
> +
> +static const struct file_operations nvme_dev_fops = {
> + .owner = THIS_MODULE,
> + .open = nvme_dev_open,
> + .release = nvme_dev_release,
> + .unlocked_ioctl = nvme_dev_ioctl,
> + .compat_ioctl = nvme_dev_ioctl,
> +};
> +
> +static ssize_t nvme_sysfs_reset(struct device *dev,
> + struct device_attribute *attr, const char *buf,
> + size_t count)
> +{
> + struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
> + int ret;
> +
> + ret = ctrl->ops->reset_ctrl(ctrl);
> + if (ret < 0)
> + return ret;
> + return count;
> +}
> +static DEVICE_ATTR(reset_controller, S_IWUSR, NULL, nvme_sysfs_reset);
> +
> +static int ns_cmp(void *priv, struct list_head *a, struct list_head *b)
> +{
> + struct nvme_ns *nsa = container_of(a, struct nvme_ns, list);
> + struct nvme_ns *nsb = container_of(b, struct nvme_ns, list);
> +
> + return nsa->ns_id - nsb->ns_id;
> +}
> +
> +static struct nvme_ns *nvme_find_ns(struct nvme_ctrl *ctrl, unsigned nsid)
> +{
> + struct nvme_ns *ns;
> +
> + list_for_each_entry(ns, &ctrl->namespaces, list) {
> + if (ns->ns_id == nsid)
> + return ns;
> + if (ns->ns_id > nsid)
> + break;
> + }
> + return NULL;
> +}
> +
> +static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
> +{
> + struct nvme_ns *ns;
> + struct gendisk *disk;
> + int node = dev_to_node(ctrl->dev);
> +
> + ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
> + if (!ns)
> + return;
> +
> + ns->queue = blk_mq_init_queue(ctrl->tagset);
> + if (IS_ERR(ns->queue))
> + goto out_free_ns;
> + queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue);
> + queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue);
> + ns->queue->queuedata = ns;
> + ns->ctrl = ctrl;
> +
> + disk = alloc_disk_node(0, node);
> + if (!disk)
> + goto out_free_queue;
> +
> + kref_init(&ns->kref);
> + ns->ns_id = nsid;
> + ns->disk = disk;
> + ns->lba_shift = 9; /* default to 512 byte sectors until the disk is validated */
> +
> + blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
> + if (ctrl->max_hw_sectors) {
> + blk_queue_max_hw_sectors(ns->queue, ctrl->max_hw_sectors);
> + blk_queue_max_segments(ns->queue,
> + (ctrl->max_hw_sectors / (ctrl->page_size >> 9)) + 1);
> + }
> + if (ctrl->stripe_size)
> + blk_queue_chunk_sectors(ns->queue, ctrl->stripe_size >> 9);
> + if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
> + blk_queue_flush(ns->queue, REQ_FLUSH | REQ_FUA);
> + blk_queue_virt_boundary(ns->queue, ctrl->page_size - 1);
> +
> + disk->major = nvme_major;
> + disk->first_minor = 0;
> + disk->fops = &nvme_fops;
> + disk->private_data = ns;
> + disk->queue = ns->queue;
> + disk->driverfs_dev = ctrl->device;
> + disk->flags = GENHD_FL_EXT_DEVT;
> + sprintf(disk->disk_name, "nvme%dn%d", ctrl->instance, nsid);
> +
> + if (nvme_revalidate_disk(ns->disk))
> + goto out_free_disk;
> +
> + list_add_tail(&ns->list, &ctrl->namespaces);
> + kref_get(&ctrl->kref);
> + if (ns->type != NVME_NS_LIGHTNVM)
> + add_disk(ns->disk);
> +
> + return;
> + out_free_disk:
> + kfree(disk);
> + out_free_queue:
> + blk_cleanup_queue(ns->queue);
> + out_free_ns:
> + kfree(ns);
> +}
> +
> +static void nvme_ns_remove(struct nvme_ns *ns)
> +{
> + bool kill = nvme_io_incapable(ns->ctrl) &&
> + !blk_queue_dying(ns->queue);
> +
> + if (kill)
> + blk_set_queue_dying(ns->queue);
> + if (ns->disk->flags & GENHD_FL_UP) {
> + if (blk_get_integrity(ns->disk))
> + blk_integrity_unregister(ns->disk);
> + del_gendisk(ns->disk);
> + }
> + if (kill || !blk_queue_dying(ns->queue)) {
> + blk_mq_abort_requeue_list(ns->queue);
> + blk_cleanup_queue(ns->queue);
> + }
> + list_del_init(&ns->list);
> + nvme_put_ns(ns);
> +}
> +
> +static void nvme_validate_ns(struct nvme_ctrl *ctrl, unsigned nsid)
> +{
> + struct nvme_ns *ns;
> +
> + ns = nvme_find_ns(ctrl, nsid);
> + if (ns) {
> + if (revalidate_disk(ns->disk))
> + nvme_ns_remove(ns);
> + } else
> + nvme_alloc_ns(ctrl, nsid);
> +}
> +
> +static int nvme_scan_ns_list(struct nvme_ctrl *ctrl, unsigned nn)
> +{
> + struct nvme_ns *ns;
> + __le32 *ns_list;
> + unsigned i, j, nsid, prev = 0, num_lists = DIV_ROUND_UP(nn, 1024);
> + int ret = 0;
> +
> + ns_list = kzalloc(0x1000, GFP_KERNEL);
> + if (!ns_list)
> + return -ENOMEM;
> +
> + for (i = 0; i < num_lists; i++) {
> + ret = nvme_identify_ns_list(ctrl, prev, ns_list);
> + if (ret)
> + goto out;
> +
> + for (j = 0; j < min(nn, 1024U); j++) {
> + nsid = le32_to_cpu(ns_list[j]);
> + if (!nsid)
> + goto out;
> +
> + nvme_validate_ns(ctrl, nsid);
> +
> + while (++prev < nsid) {
> + ns = nvme_find_ns(ctrl, prev);
> + if (ns)
> + nvme_ns_remove(ns);
> + }
> + }
> + nn -= j;
> + }
> + out:
> + kfree(ns_list);
> + return ret;
> +}
> +
> +static void __nvme_scan_namespaces(struct nvme_ctrl *ctrl, unsigned nn)
> +{
> + struct nvme_ns *ns, *next;
> + unsigned i;
> +
> + for (i = 1; i <= nn; i++)
> + nvme_validate_ns(ctrl, i);
> +
> + list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) {
> + if (ns->ns_id > nn)
> + nvme_ns_remove(ns);
> + }
> +}
> +
> +void nvme_scan_namespaces(struct nvme_ctrl *ctrl)
> +{
> + struct nvme_id_ctrl *id;
> + unsigned nn;
> +
> + if (nvme_identify_ctrl(ctrl, &id))
> + return;
> +
> + nn = le32_to_cpu(id->nn);
> + if (ctrl->vs >= NVME_VS(1, 1)) {
> + if (!nvme_scan_ns_list(ctrl, nn))
> + goto done;
> + }
> + __nvme_scan_namespaces(ctrl, le32_to_cpup(&id->nn));
> + done:
> + list_sort(NULL, &ctrl->namespaces, ns_cmp);
> + kfree(id);
> +}
> +
> +void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
> +{
> + struct nvme_ns *ns, *next;
> +
> + list_for_each_entry_safe(ns, next, &ctrl->namespaces, list)
> + nvme_ns_remove(ns);
> +}
> +
> +static DEFINE_IDA(nvme_instance_ida);
> +
> +static int nvme_set_instance(struct nvme_ctrl *ctrl)
> +{
> + int instance, error;
> +
> + do {
> + if (!ida_pre_get(&nvme_instance_ida, GFP_KERNEL))
> + return -ENODEV;
> +
> + spin_lock(&dev_list_lock);
> + error = ida_get_new(&nvme_instance_ida, &instance);
> + spin_unlock(&dev_list_lock);
> + } while (error == -EAGAIN);
> +
> + if (error)
> + return -ENODEV;
> +
> + ctrl->instance = instance;
> + return 0;
> +}
> +
> +static void nvme_release_instance(struct nvme_ctrl *ctrl)
> +{
> + spin_lock(&dev_list_lock);
> + ida_remove(&nvme_instance_ida, ctrl->instance);
> + spin_unlock(&dev_list_lock);
> +}
> +
> +void nvme_uninit_ctrl(struct nvme_ctrl *ctrl)
> +{
> + device_remove_file(ctrl->device, &dev_attr_reset_controller);
> + device_destroy(nvme_class, MKDEV(nvme_char_major, ctrl->instance));
> +
> + spin_lock(&dev_list_lock);
> + list_del(&ctrl->node);
> + spin_unlock(&dev_list_lock);
> +}
> +
> +static void nvme_free_ctrl(struct kref *kref)
> +{
> + struct nvme_ctrl *ctrl = container_of(kref, struct nvme_ctrl, kref);
> +
> + put_device(ctrl->device);
> + nvme_release_instance(ctrl);
> +
> + ctrl->ops->free_ctrl(ctrl);
> +}
> +
> +void nvme_put_ctrl(struct nvme_ctrl *ctrl)
> +{
> + kref_put(&ctrl->kref, nvme_free_ctrl);
> +}
> +
> +/*
> + * Initialize an NVMe controller structure. This needs to be called during
> + * the earliest initialization so that we have the initialized structure around
> + * during probing.
> + */
> +int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
> + const struct nvme_ctrl_ops *ops, u16 vendor,
> + unsigned long quirks)
> +{
> + int ret;
> +
> + INIT_LIST_HEAD(&ctrl->namespaces);
> + kref_init(&ctrl->kref);
> + ctrl->dev = dev;
> + ctrl->ops = ops;
> + ctrl->vendor = vendor;
> + ctrl->quirks = quirks;
> +
> + ret = nvme_set_instance(ctrl);
> + if (ret)
> + goto out;
> +
> + ctrl->device = device_create(nvme_class, ctrl->dev,
> + MKDEV(nvme_char_major, ctrl->instance),
> + dev, "nvme%d", ctrl->instance);
> + if (IS_ERR(ctrl->device)) {
> + ret = PTR_ERR(ctrl->device);
> + goto out_release_instance;
> + }
> + get_device(ctrl->device);
> + dev_set_drvdata(ctrl->device, ctrl);
> +
> + ret = device_create_file(ctrl->device, &dev_attr_reset_controller);
> + if (ret)
> + goto out_put_device;
> +
> + spin_lock(&dev_list_lock);
> + list_add_tail(&ctrl->node, &nvme_ctrl_list);
> + spin_unlock(&dev_list_lock);
> +
> + return 0;
> +
> +out_put_device:
> + put_device(ctrl->device);
> + device_destroy(nvme_class, MKDEV(nvme_char_major, ctrl->instance));
> +out_release_instance:
> + nvme_release_instance(ctrl);
> +out:
> + return ret;
> +}
> +
> +int __init nvme_core_init(void)
> +{
> + int result;
> +
> + result = register_blkdev(nvme_major, "nvme");
> + if (result < 0)
> + return result;
> + else if (result > 0)
> + nvme_major = result;
> +
> + result = __register_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme",
> + &nvme_dev_fops);
> + if (result < 0)
> + goto unregister_blkdev;
> + else if (result > 0)
> + nvme_char_major = result;
> +
> + nvme_class = class_create(THIS_MODULE, "nvme");
> + if (IS_ERR(nvme_class)) {
> + result = PTR_ERR(nvme_class);
> + goto unregister_chrdev;
> + }
> +
> + return 0;
> +
> + unregister_chrdev:
> + __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme");
> + unregister_blkdev:
> + unregister_blkdev(nvme_major, "nvme");
> + return result;
> +}
> +
> +void nvme_core_exit(void)
> +{
> + unregister_blkdev(nvme_major, "nvme");
> + class_destroy(nvme_class);
> + __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme");
> +}
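One convention worth calling out in this new file: the comment above
__nvme_submit_sync_cmd() defines the return scheme most of it relies on —
negative means a Linux errno, positive means an NVM Express status code, zero
means success. A caller sketch (my_get_feature is an invented wrapper, not in
the patch):

    static int my_get_feature(struct nvme_ctrl *ctrl, unsigned fid, u32 *result)
    {
        int ret = nvme_get_features(ctrl, fid, 0, 0, result);

        if (ret < 0)
            return ret;   /* allocation or transport failure */
        if (ret > 0)
            return -EIO;  /* command completed with an NVMe status code */
        return 0;
    }
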
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index 9444884..5c5f455 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -1040,7 +1040,7 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
> struct request *req;
> int ret;
>
> - req = blk_mq_alloc_request(q, write, GFP_KERNEL, false);
> + req = blk_mq_alloc_request(q, write, 0);
> if (IS_ERR(req))
> return PTR_ERR(req);
>
> @@ -1093,7 +1093,8 @@ static int nvme_submit_async_admin_req(struct nvme_dev *dev)
> struct nvme_cmd_info *cmd_info;
> struct request *req;
>
> - req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_ATOMIC, true);
> + req = blk_mq_alloc_request(dev->admin_q, WRITE,
> + BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_RESERVED);
> if (IS_ERR(req))
> return PTR_ERR(req);
>
> @@ -1118,7 +1119,7 @@ static int nvme_submit_admin_async_cmd(struct nvme_dev *dev,
> struct request *req;
> struct nvme_cmd_info *cmd_rq;
>
> - req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_KERNEL, false);
> + req = blk_mq_alloc_request(dev->admin_q, WRITE, 0);
> if (IS_ERR(req))
> return PTR_ERR(req);
>
> @@ -1319,8 +1320,8 @@ static void nvme_abort_req(struct request *req)
> if (!dev->abort_limit)
> return;
>
> - abort_req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_ATOMIC,
> - false);
> + abort_req = blk_mq_alloc_request(dev->admin_q, WRITE,
> + BLK_MQ_REQ_NOWAIT);
> if (IS_ERR(abort_req))
> return;
>
> diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
> index daf17d7..7fc9296 100644
> --- a/include/linux/blk-mq.h
> +++ b/include/linux/blk-mq.h
> @@ -188,8 +188,14 @@ void blk_mq_insert_request(struct request *, bool, bool, bool);
> void blk_mq_free_request(struct request *rq);
> void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *, struct request *rq);
> bool blk_mq_can_queue(struct blk_mq_hw_ctx *);
> +
> +enum {
> + BLK_MQ_REQ_NOWAIT = (1 << 0), /* return when out of requests */
> + BLK_MQ_REQ_RESERVED = (1 << 1), /* allocate from reserved pool */
> +};
> +
> struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
> - gfp_t gfp, bool reserved);
> + unsigned int flags);
> struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag);
> struct cpumask *blk_mq_tags_cpumask(struct blk_mq_tags *tags);
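The enum above is the extension point the changelog promises: new behaviours
become new bits instead of new parameters. A sketch of how a later flag would
slot in (BLK_MQ_REQ_INTERNAL is an invented example, not part of this patch):

    enum {
        BLK_MQ_REQ_NOWAIT   = (1 << 0), /* return when out of requests */
        BLK_MQ_REQ_RESERVED = (1 << 1), /* allocate from reserved pool */
        BLK_MQ_REQ_INTERNAL = (1 << 2), /* hypothetical future flag */
    };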