[RFC PATCH 1/2] virtio_nvme(kernel): virtual NVMe driver using virtio

Ming Lin mlin at kernel.org
Wed Sep 9 22:48:31 PDT 2015


Signed-off-by: Ming Lin <ming.l at ssi.samsung.com>
---
 drivers/block/Kconfig            |   7 +
 drivers/block/Makefile           |   1 +
 drivers/block/nvme-core.c        |   1 +
 drivers/block/virtio_nvme.c      | 853 +++++++++++++++++++++++++++++++++++++++
 include/linux/virtio_nvme.h      |  53 +++
 include/uapi/linux/virtio_ids.h  |   1 +
 include/uapi/linux/virtio_nvme.h |  30 ++
 7 files changed, 946 insertions(+)
 create mode 100644 drivers/block/virtio_nvme.c
 create mode 100644 include/linux/virtio_nvme.h
 create mode 100644 include/uapi/linux/virtio_nvme.h

diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 1b8094d..7149885 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -519,6 +519,13 @@ config VIRTIO_BLK
 	  This is the virtual block driver for virtio.  It can be used with
           lguest or QEMU based VMMs (like KVM or Xen).  Say Y or M.
 
+config VIRTIO_NVME
+	tristate "Virtio NVMe driver"
+	depends on VIRTIO
+	---help---
+	  This is the virtual NVMe driver for virtio.  It can be used with
+          lguest or QEMU based VMMs (like KVM or Xen).  Say Y or M.
+
 config BLK_DEV_HD
 	bool "Very old hard disk (MFM/RLL/IDE) driver"
 	depends on HAVE_IDE
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index 02b688d..3b73f59 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -30,6 +30,7 @@ obj-$(CONFIG_BLK_DEV_UMEM)	+= umem.o
 obj-$(CONFIG_BLK_DEV_NBD)	+= nbd.o
 obj-$(CONFIG_BLK_DEV_CRYPTOLOOP) += cryptoloop.o
 obj-$(CONFIG_VIRTIO_BLK)	+= virtio_blk.o
+obj-$(CONFIG_VIRTIO_NVME)	+= virtio_nvme.o
 
 obj-$(CONFIG_BLK_DEV_SX8)	+= sx8.o
 obj-$(CONFIG_BLK_DEV_HD)	+= hd.o
diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index 7920c27..7895606 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -1059,6 +1059,7 @@ int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
 {
 	return __nvme_submit_sync_cmd(q, cmd, buffer, NULL, bufflen, NULL, 0);
 }
+EXPORT_SYMBOL_GPL(nvme_submit_sync_cmd);
 
 static int nvme_submit_async_admin_req(struct nvme_dev *dev)
 {
diff --git a/drivers/block/virtio_nvme.c b/drivers/block/virtio_nvme.c
new file mode 100644
index 0000000..57f81fc
--- /dev/null
+++ b/drivers/block/virtio_nvme.c
@@ -0,0 +1,853 @@
+/* Modified from virtio_blk.c and nvme-core.c */
+
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+#include <linux/hdreg.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/delay.h>
+#include <linux/virtio.h>
+#include <linux/virtio_nvme.h>
+#include <linux/scatterlist.h>
+#include <linux/string_helpers.h>
+#include <linux/idr.h>
+#include <linux/blk-mq.h>
+#include <linux/numa.h>
+#include <linux/virtio_nvme.h>
+#include <linux/nvme.h>
+#include <linux/blk-mq.h>
+
+#define ADMIN_TIMEOUT           (2 * HZ)
+#define NVME_AQ_DEPTH		256
+
+static int virtnvme_major;
+module_param(virtnvme_major, int, 0);
+
+static unsigned int virtnvme_queue_depth;
+module_param_named(queue_depth, virtnvme_queue_depth, uint, 0444);
+
+static DEFINE_SPINLOCK(dev_list_lock);
+static LIST_HEAD(dev_list);
+
+static void virtnvme_free_namespaces(struct virtio_nvme_dev *dev);
+
+static const struct virtio_device_id id_table[] = {
+	{ VIRTIO_ID_NVME, VIRTIO_DEV_ANY_ID },
+	{ 0 },
+};
+
+struct virtnvme_req
+{
+	struct request *req;
+	struct nvme_command cmd;
+	struct virtio_nvme_resp resp;
+	struct scatterlist sg[];
+};
+
+static int virtnvme_identify_ctrl(struct virtio_nvme_dev *dev, struct nvme_id_ctrl **id)
+{
+	struct nvme_command c = { };
+	int error;
+
+	/* gcc-4.4.4 (at least) has issues with initializers and anon unions */
+	c.identify.opcode = nvme_admin_identify;
+	c.identify.cns = cpu_to_le32(1);
+
+	*id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL);
+	if (!*id)
+		return -ENOMEM;
+
+	error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
+			sizeof(struct nvme_id_ctrl));
+	if (error)
+		kfree(*id);
+	return error;
+}
+
+static int virtnvme_identify_ns(struct virtio_nvme_dev *dev, unsigned nsid,
+		struct nvme_id_ns **id)
+{
+	struct nvme_command c = { };
+	int error;
+
+	/* gcc-4.4.4 (at least) has issues with initializers and anon unions */
+	c.identify.opcode = nvme_admin_identify;
+	c.identify.nsid = cpu_to_le32(nsid);
+
+	*id = kmalloc(sizeof(struct nvme_id_ns), GFP_KERNEL);
+	if (!*id)
+		return -ENOMEM;
+
+	error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
+			sizeof(struct nvme_id_ns));
+	if (error)
+		kfree(*id);
+	return error;
+}
+
+static int virtnvme_wait_ready(struct virtio_nvme_dev *dev, u64 cap)
+{
+	struct virtio_device *vdev = dev->vdev;
+	unsigned long timeout;
+	u32 csts;
+
+	timeout = ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies;
+
+	while (1) {
+		virtio_cread(vdev, struct virtio_nvme_config, csts, &csts);
+		if ((csts & NVME_CSTS_RDY) == NVME_CSTS_RDY)
+			break;
+
+		msleep(100);
+		if (fatal_signal_pending(current))
+			return -EINTR;
+		if (time_after(jiffies, timeout)) {
+			printk(KERN_ERR "Device not ready; aborting initialisation\n");
+			return -ENODEV;
+		}
+	}
+
+	return 0;
+}
+
+static void virtnvme_admin_done(struct virtqueue *vq)
+{
+	struct virtio_nvme_dev *dev = vq->vdev->priv;
+	struct virtnvme_req *vnr;
+	int qid = vq->index;
+	unsigned long flags;
+	unsigned int len;
+
+	spin_lock_irqsave(&dev->vqs[qid].lock, flags);
+	do {
+		virtqueue_disable_cb(vq);
+		while ((vnr = virtqueue_get_buf(dev->vqs[qid].vq, &len)) != NULL)
+			blk_mq_complete_request(vnr->req);
+		if (unlikely(virtqueue_is_broken(vq)))
+			break;
+	} while (!virtqueue_enable_cb(vq));
+
+	spin_unlock_irqrestore(&dev->vqs[qid].lock, flags);
+}
+
+static void virtnvme_io_done(struct virtqueue *vq)
+{
+	struct virtio_nvme_dev *dev = vq->vdev->priv;
+	int qid = vq->index;
+	struct virtnvme_req *vnr;
+	unsigned long flags;
+	unsigned int len;
+	bool bio_done = false;
+
+	spin_lock_irqsave(&dev->vqs[qid].lock, flags);
+	do {
+		virtqueue_disable_cb(vq);
+		while ((vnr = virtqueue_get_buf(dev->vqs[qid].vq, &len)) != NULL) {
+			blk_mq_complete_request(vnr->req);
+			bio_done = true;
+		}
+
+		if (unlikely(virtqueue_is_broken(vq)))
+			break;
+	} while (!virtqueue_enable_cb(vq));
+
+	spin_unlock_irqrestore(&dev->vqs[qid].lock, flags);
+
+	if (bio_done)
+		wake_up(&dev->queue_wait);
+}
+
+static int virtnvme_init_vq(struct virtio_nvme_dev *dev)
+{
+	int err = 0;
+	int i;
+	vq_callback_t **callbacks;
+	const char **names;
+	struct virtqueue **vqs;
+	unsigned num_vqs;
+	struct virtio_device *vdev = dev->vdev;
+
+	err = virtio_cread_feature(vdev, VIRTIO_NVME_F_MQ,
+				   struct virtio_nvme_config, num_queues,
+				   &num_vqs);
+	if (err)
+		num_vqs = 1;
+
+	num_vqs++;
+
+	dev->vqs = kmalloc(sizeof(*dev->vqs) * num_vqs, GFP_KERNEL);
+	if (!dev->vqs) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	names = kmalloc(sizeof(*names) * num_vqs, GFP_KERNEL);
+	if (!names)
+		goto err_names;
+
+	callbacks = kmalloc(sizeof(*callbacks) * num_vqs, GFP_KERNEL);
+	if (!callbacks)
+		goto err_callbacks;
+
+	vqs = kmalloc(sizeof(*vqs) * num_vqs, GFP_KERNEL);
+	if (!vqs)
+		goto err_vqs;
+
+	callbacks[0] = virtnvme_admin_done;
+	names[0] = "admin";
+	dev->vqs[0].dev = dev;
+
+	for (i = 1; i < num_vqs; i++) {
+		callbacks[i] = virtnvme_io_done;
+		snprintf(dev->vqs[i].name, VQ_NAME_LEN, "req.%d", i);
+		names[i] = dev->vqs[i].name;
+		dev->vqs[i].dev = dev;
+	}
+
+	/* Discover virtqueues and write information to configuration.  */
+	err = vdev->config->find_vqs(vdev, num_vqs, vqs, callbacks, names);
+	if (err)
+		goto err_find_vqs;
+
+	for (i = 0; i < num_vqs; i++) {
+		spin_lock_init(&dev->vqs[i].lock);
+		dev->vqs[i].vq = vqs[i];
+	}
+	dev->num_vqs = num_vqs;
+
+err_find_vqs:
+	kfree(vqs);
+err_vqs:
+	kfree(callbacks);
+err_callbacks:
+	kfree(names);
+err_names:
+	if (err)
+		kfree(dev->vqs);
+out:
+	return err;
+}
+
+static inline struct virtnvme_req *virtnvme_alloc_req(struct virtio_nvme_dev *dev,
+		gfp_t gfp_mask)
+{
+	struct virtnvme_req *vnr;
+
+	vnr = kmalloc(sizeof(*vnr) + dev->sg_elems*sizeof(struct scatterlist),
+			gfp_mask);
+	if (!vnr)
+		return NULL;
+
+	sg_init_table(vnr->sg, dev->sg_elems);
+
+	return vnr;
+}
+
+static inline u64 virtnvme_block_nr(struct virtio_nvme_ns *ns, sector_t sector)
+{
+	return sector >> (ns->lba_shift - 9);
+}
+
+static int virtnvme_add_req(struct virtio_nvme_ns *ns, struct virtqueue *vq,
+			     struct virtnvme_req *vnr,
+			     struct scatterlist *data_sg,
+			     bool have_data)
+{
+	struct scatterlist cmd, resp, *sgs[5];
+	unsigned int num_out = 0, num_in = 0;
+
+	sg_init_one(&cmd, vnr->req->cmd, sizeof(struct nvme_command));
+	sgs[num_out++] = &cmd;
+
+	if (have_data) {
+		if (rq_data_dir(vnr->req))
+			sgs[num_out++] = data_sg;
+		else
+			sgs[num_out + num_in++] = data_sg;
+	}
+
+	sg_init_one(&resp, &vnr->resp, sizeof(struct virtio_nvme_resp));
+	sgs[num_out + num_in++] = &resp;
+
+	return virtqueue_add_sgs(vq, sgs, num_out, num_in, vnr, GFP_ATOMIC);
+}
+
+static int virtnvme_setup_io(struct virtnvme_req *vnr, struct virtio_nvme_ns *ns)
+{
+	struct nvme_command *cmnd;
+	struct request *req = vnr->req;
+	u16 control = 0;
+	u32 dsmgmt = 0;
+
+#if 0 /* TODO */
+	if (req->cmd_flags & REQ_FUA)
+		control |= NVME_RW_FUA;
+	if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD))
+		control |= NVME_RW_LR;
+
+	if (req->cmd_flags & REQ_RAHEAD)
+		dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
+#endif
+
+	cmnd = &vnr->cmd;
+	req->cmd = (unsigned char *)cmnd;
+	req->cmd_len = sizeof(struct nvme_command);
+	memset(cmnd, 0, sizeof(*cmnd));
+
+	cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read);
+	cmnd->rw.command_id = req->tag;
+	cmnd->rw.nsid = cpu_to_le32(ns->ns_id);
+	cmnd->rw.slba = cpu_to_le64(virtnvme_block_nr(ns, blk_rq_pos(req)));
+	cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
+	cmnd->rw.control = cpu_to_le16(control);
+	cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt);
+
+	return 0;
+}
+
+static int virtnvme_queue_rq(struct blk_mq_hw_ctx *hctx,
+		const struct blk_mq_queue_data *bd)
+{
+	struct virtio_nvme_ns *ns = hctx->queue->queuedata;
+	struct virtio_nvme_queue *nvmeq = hctx->driver_data;
+	struct request *req = bd->rq;
+	struct virtnvme_req *vnr = blk_mq_rq_to_pdu(req);
+	unsigned long flags;
+	unsigned int num;
+	int err;
+	bool notify = false;
+
+	vnr->req = req;
+
+	if (req->cmd_type == REQ_TYPE_DRV_PRIV)
+		; /* TODO: nvme_submit_priv(nvmeq, req, iod) */
+	else if (req->cmd_flags & REQ_DISCARD)
+		; /* TODO: nvme_submit_discard(nvmeq, ns, req, iod) */
+	else if (req->cmd_flags & REQ_FLUSH)
+		; /* TODO: nvme_submit_flush(nvmeq, ns, req->tag) */
+	else
+		virtnvme_setup_io(vnr, ns);
+
+	blk_mq_start_request(req);
+
+	num = blk_rq_map_sg(hctx->queue, vnr->req, vnr->sg);
+
+	spin_lock_irqsave(&nvmeq->lock, flags);
+	err = virtnvme_add_req(ns, nvmeq->vq, vnr, vnr->sg, num);
+	if (err) {
+		virtqueue_kick(nvmeq->vq);
+		blk_mq_stop_hw_queue(hctx);
+		spin_unlock_irqrestore(&nvmeq->lock, flags);
+		if (err == -ENOMEM || err == -ENOSPC)
+			return BLK_MQ_RQ_QUEUE_BUSY;
+		return BLK_MQ_RQ_QUEUE_ERROR;
+	}
+
+	if (bd->last && virtqueue_kick_prepare(nvmeq->vq))
+		notify = true;
+	spin_unlock_irqrestore(&nvmeq->lock, flags);
+
+	if (notify)
+		virtqueue_notify(nvmeq->vq);
+	return BLK_MQ_RQ_QUEUE_OK;
+}
+
+static inline void virtnvme_request_done(struct request *req)
+{
+	struct virtnvme_req *vnr = blk_mq_rq_to_pdu(req);
+	int error = vnr->resp.status;
+
+#if 0 /* TODO */
+	if (req->cmd_type == REQ_TYPE_BLOCK_PC) {
+		req->resid_len = virtio32_to_cpu(dev->vdev, vbr->in_hdr.residual);
+		req->sense_len = virtio32_to_cpu(dev->vdev, vbr->in_hdr.sense_len);
+		req->errors = virtio32_to_cpu(dev->vdev, vbr->in_hdr.errors);
+	} else if (req->cmd_type == REQ_TYPE_DRV_PRIV) {
+		req->errors = (error != 0);
+	}
+#endif
+
+	blk_mq_end_request(req, error);
+}
+
+static int virtnvme_init_request(void *data, struct request *rq,
+		unsigned int hctx_idx, unsigned int request_idx,
+		unsigned int numa_node)
+{
+	struct virtio_nvme_dev *dev = data;
+	struct virtnvme_req *vnr = blk_mq_rq_to_pdu(rq);
+
+	sg_init_table(vnr->sg, dev->sg_elems);
+	return 0;
+}
+
+static int virtnvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
+				unsigned int hctx_idx)
+{
+	struct virtio_nvme_dev *dev = data;
+	struct virtio_nvme_queue *nvmeq = &dev->vqs[0];
+
+	hctx->driver_data = nvmeq;
+	return 0;
+}
+
+static int virtnvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
+				unsigned int hctx_idx)
+{
+	struct virtio_nvme_dev *dev = data;
+	struct virtio_nvme_queue *nvmeq = &dev->vqs[hctx_idx+1];
+
+	hctx->driver_data = nvmeq;
+	return 0;
+}
+
+static struct blk_mq_ops virtio_nvme_mq_admin_ops = {
+	.queue_rq	= virtnvme_queue_rq,
+	.map_queue	= blk_mq_map_queue,
+	.init_hctx	= virtnvme_admin_init_hctx,
+	.complete	= virtnvme_request_done,
+	.init_request	= virtnvme_init_request,
+};
+
+static struct blk_mq_ops virtio_nvme_mq_ops = {
+	.queue_rq	= virtnvme_queue_rq,
+	.map_queue	= blk_mq_map_queue,
+	.init_hctx	= virtnvme_init_hctx,
+	.complete	= virtnvme_request_done,
+	.init_request	= virtnvme_init_request,
+};
+
+static int virtnvme_open(struct block_device *bdev, fmode_t mode)
+{
+	struct virtio_nvme_ns *ns = bdev->bd_disk->private_data;
+	struct virtio_nvme_dev *dev = ns->dev;
+
+	kref_get(&dev->kref);
+	return 0;
+}
+
+static DEFINE_IDA(nvme_instance_ida);
+
+static int nvme_set_instance(struct virtio_nvme_dev *dev)
+{
+	int instance, error;
+
+	do {
+		if (!ida_pre_get(&nvme_instance_ida, GFP_KERNEL))
+			return -ENODEV;
+
+		spin_lock(&dev_list_lock);
+		error = ida_get_new(&nvme_instance_ida, &instance);
+		spin_unlock(&dev_list_lock);
+	} while (error == -EAGAIN);
+
+	if (error)
+		return -ENODEV;
+
+	dev->instance = instance;
+	return 0;
+}
+
+static void virtnvme_release_instance(struct virtio_nvme_dev *dev)
+{
+	spin_lock(&dev_list_lock);
+	ida_remove(&nvme_instance_ida, dev->instance);
+	spin_unlock(&dev_list_lock);
+}
+
+static void virtnvme_free_dev(struct kref *kref)
+{
+	struct virtio_nvme_dev *dev = container_of(kref,
+					struct virtio_nvme_dev, kref);
+
+	virtnvme_free_namespaces(dev);
+	virtnvme_release_instance(dev);
+	if (dev->tagset.tags)
+		blk_mq_free_tag_set(&dev->tagset);
+	if (dev->admin_q)
+		blk_put_queue(dev->admin_q);
+	kfree(dev);
+}
+
+static void virtnvme_release(struct gendisk *disk, fmode_t mode)
+{
+	struct virtio_nvme_ns *ns = disk->private_data;
+	struct virtio_nvme_dev *dev = ns->dev;
+
+	kref_put(&dev->kref, virtnvme_free_dev);
+}
+
+static const struct block_device_operations virtnvme_fops = {
+	.owner		= THIS_MODULE,
+	.open		= virtnvme_open,
+	.release	= virtnvme_release,
+};
+
+static struct virtio_nvme_ns *virtnvme_alloc_ns(struct virtio_nvme_dev *dev, unsigned nsid,
+			struct nvme_id_ns *id)
+{
+	struct virtio_nvme_ns *ns;
+	struct gendisk *disk;
+	int lbaf;
+
+	ns = kzalloc(sizeof(*ns), GFP_KERNEL);
+	if (!ns)
+		return NULL;
+	ns->queue = blk_mq_init_queue(&dev->tagset);
+	if (IS_ERR(ns->queue))
+		goto out_free_ns;
+	ns->queue->queue_flags = QUEUE_FLAG_DEFAULT;
+	queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue);
+	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue);
+	queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, ns->queue);
+	ns->dev = dev;
+	ns->queue->queuedata = ns;
+
+	disk = alloc_disk(0);
+	if (!disk)
+		goto out_free_queue;
+	ns->ns_id = nsid;
+	ns->disk = disk;
+	lbaf = id->flbas & 0xf;
+	ns->lba_shift = id->lbaf[lbaf].ds;
+	ns->ms = le16_to_cpu(id->lbaf[lbaf].ms);
+	blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
+	if (dev->max_hw_sectors)
+		blk_queue_max_hw_sectors(ns->queue, dev->max_hw_sectors);
+	disk->major = virtnvme_major;
+	disk->first_minor = 0;
+	disk->fops = &virtnvme_fops;
+	disk->private_data = ns;
+	disk->queue = ns->queue;
+	disk->flags = GENHD_FL_EXT_DEVT;
+	sprintf(disk->disk_name, "vnvme%dn%d", dev->instance, nsid);
+	set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9));
+
+	return ns;
+
+out_free_queue:
+	blk_cleanup_queue(ns->queue);
+out_free_ns:
+	kfree(ns);
+	return NULL;
+}
+
+static unsigned int virtnvme_cmd_size(struct virtio_nvme_dev *dev)
+{
+	unsigned int ret;
+
+	ret = sizeof(struct virtnvme_req) +
+		sizeof(struct scatterlist) * dev->sg_elems;
+
+	return ret;
+}
+
+static int virtnvme_dev_add(struct virtio_nvme_dev *dev)
+{
+	int res;
+	unsigned nn, i;
+	struct virtio_nvme_ns *ns;
+	struct nvme_id_ctrl *ctrl;
+	struct nvme_id_ns *id_ns;
+	int err;
+
+	res = virtnvme_identify_ctrl(dev, &ctrl);
+	if (res) {
+		printk(KERN_ERR "Identify Controller failed (%d)\n", res);
+		res = -EIO;
+		goto out;
+	}
+
+	nn = le32_to_cpup(&ctrl->nn);
+
+	memset(&dev->tagset, 0, sizeof(dev->tagset));
+	dev->tagset.ops = &virtio_nvme_mq_ops;
+	/* Default queue sizing is to fill the ring. */
+	if (!virtnvme_queue_depth)
+		virtnvme_queue_depth = dev->vqs[1].vq->num_free;
+	dev->tagset.queue_depth = virtnvme_queue_depth;
+	dev->tagset.numa_node = NUMA_NO_NODE;
+	dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
+	dev->tagset.cmd_size = virtnvme_cmd_size(dev);
+	dev->tagset.driver_data = dev;
+	dev->tagset.nr_hw_queues = dev->num_vqs - 1;
+
+	err = blk_mq_alloc_tag_set(&dev->tagset);
+	if (err)
+		goto out;
+
+	for (i = 1; i <= nn; i++) {
+		res = virtnvme_identify_ns(dev, i, &id_ns);
+		if (res)
+			continue;
+
+		if (id_ns->ncap == 0)
+			continue;
+
+		ns = virtnvme_alloc_ns(dev, i, id_ns);
+		if (ns)
+			list_add_tail(&ns->list, &dev->namespaces);
+	}
+	list_for_each_entry(ns, &dev->namespaces, list)
+		add_disk(ns->disk);
+
+out:
+	return res;
+}
+
+static void virtnvme_dev_remove_admin(struct virtio_nvme_dev *dev)
+{
+	if (dev->admin_q && !blk_queue_dying(dev->admin_q)) {
+		blk_cleanup_queue(dev->admin_q);
+		blk_mq_free_tag_set(&dev->admin_tagset);
+	}
+}
+
+static int virtnvme_alloc_admin_tags(struct virtio_nvme_dev *dev)
+{
+	if (!dev->admin_q) {
+		dev->admin_tagset.ops = &virtio_nvme_mq_admin_ops;
+		dev->admin_tagset.nr_hw_queues = 1;
+		dev->admin_tagset.queue_depth = NVME_AQ_DEPTH;
+		dev->admin_tagset.reserved_tags = 1;
+		dev->admin_tagset.timeout = ADMIN_TIMEOUT;
+		dev->admin_tagset.numa_node = NUMA_NO_NODE;
+		dev->admin_tagset.cmd_size = virtnvme_cmd_size(dev);
+		dev->admin_tagset.driver_data = dev;
+
+		if (blk_mq_alloc_tag_set(&dev->admin_tagset))
+			return -ENOMEM;
+
+		dev->admin_q = blk_mq_init_queue(&dev->admin_tagset);
+		if (IS_ERR(dev->admin_q)) {
+			blk_mq_free_tag_set(&dev->admin_tagset);
+			return -ENOMEM;
+		}
+		if (!blk_get_queue(dev->admin_q)) {
+			virtnvme_dev_remove_admin(dev);
+			dev->admin_q = NULL;
+			return -ENODEV;
+		}
+	} else
+		blk_mq_unfreeze_queue(dev->admin_q);
+
+	return 0;
+}
+
+static int virtnvme_probe(struct virtio_device *vdev)
+{
+	struct virtio_nvme_dev *dev;
+	u64 cap;
+	u32 ctrl_config;
+	u32 sg_elems;
+	int err;
+
+	if (!vdev->config->get) {
+		printk(KERN_ERR "%s failure: config access disabled\n", __func__);
+		return -EINVAL;
+	}
+
+	vdev->priv = dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+	if (!dev)
+		return -ENOMEM;
+	INIT_LIST_HEAD(&dev->namespaces);
+	kref_init(&dev->kref);
+
+	init_waitqueue_head(&dev->queue_wait);
+	dev->vdev = vdev;
+
+	err = nvme_set_instance(dev);
+	if (err)
+		goto out_free_dev;
+
+	/* We need to know how many segments before we allocate. */
+	err = virtio_cread_feature(vdev, VIRTIO_NVME_F_SEG_MAX,
+				   struct virtio_nvme_config, seg_max,
+				   &sg_elems);
+	/* We need at least one SG element, whatever they say. */
+	if (err || !sg_elems)
+		sg_elems = 1;
+
+	/* We need two extra sg elements at head for command and response */
+	sg_elems += 2;
+	dev->sg_elems = sg_elems;
+
+	/*
+	 * 1. The host determines the controller capabilities
+	 */
+	virtio_cread(vdev, struct virtio_nvme_config, cap, &cap);
+
+	/*
+	 * 2. The host configures controller settings. Specific settings include:
+	 *	a. The arbitration mechanism should be selected in CC.AMS.
+	 *	b. The memory page size should be initialized in CC.MPS.
+	 *	c. The I/O Command Set that is to be used should be selected in CC.CSS.
+	 * 3. The controller should be enabled by setting CC.EN to 1
+	 */
+	ctrl_config = NVME_CC_ENABLE | NVME_CC_CSS_NVM;
+	ctrl_config |= (PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT;
+	ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE;
+	ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;
+	virtio_cwrite(vdev, struct virtio_nvme_config, ctrl_config, &ctrl_config);
+
+	/*
+	 * 4. The host should wait for the controller to indicate it is ready to
+	 *    process commands. The controller is ready to process commands when
+	 *    CSTS.RDY is set to 1.
+	 */
+	err = virtnvme_wait_ready(dev, cap);
+	if (err)
+		goto release;
+
+	/* Qemu starts controller and creates VQs */
+	err = virtnvme_init_vq(dev);
+	if (err)
+		goto release;
+
+	err = virtnvme_alloc_admin_tags(dev);
+	if (err)
+		goto release;
+
+	spin_lock(&dev_list_lock);
+	list_add(&dev->node, &dev_list);
+	spin_unlock(&dev_list_lock);
+
+	/*
+	 * 6. The host should determine the configuration of the controller by
+	 *    issuing the Identify command, specifying the Controller data
+	 *    structure. The host should then determine the configuration of
+	 *    each namespace by issuing the Identify command for each namespace,
+	 *    specifying the Namespace data structure
+	 */
+	err = virtnvme_dev_add(dev);
+	if (err)
+		goto out_free_vq;
+
+	return 0;
+
+out_free_vq:
+	vdev->config->del_vqs(vdev);
+
+release:
+	virtnvme_release_instance(dev);
+
+out_free_dev:
+	kfree(dev);
+	return err;
+}
+
+static void virtnvme_ns_remove(struct virtio_nvme_ns *ns)
+{
+	bool kill = !blk_queue_dying(ns->queue);
+
+	if (kill)
+		blk_set_queue_dying(ns->queue);
+	if (ns->disk->flags & GENHD_FL_UP) {
+		if (blk_get_integrity(ns->disk))
+			blk_integrity_unregister(ns->disk);
+		del_gendisk(ns->disk);
+	}
+	if (kill || !blk_queue_dying(ns->queue)) {
+		blk_mq_abort_requeue_list(ns->queue);
+		blk_cleanup_queue(ns->queue);
+	}
+}
+
+static void virtnvme_dev_remove(struct virtio_nvme_dev *dev)
+{
+	struct virtio_nvme_ns *ns;
+
+	list_for_each_entry(ns, &dev->namespaces, list)
+		virtnvme_ns_remove(ns);
+}
+
+static void virtnvme_free_namespace(struct virtio_nvme_ns *ns)
+{
+	list_del(&ns->list);
+
+	spin_lock(&dev_list_lock);
+	ns->disk->private_data = NULL;
+	spin_unlock(&dev_list_lock);
+
+	put_disk(ns->disk);
+	kfree(ns);
+}
+
+static void virtnvme_free_namespaces(struct virtio_nvme_dev *dev)
+{
+	struct virtio_nvme_ns *ns, *next;
+
+	list_for_each_entry_safe(ns, next, &dev->namespaces, list)
+		virtnvme_free_namespace(ns);
+}
+
+static void virtnvme_remove(struct virtio_device *vdev)
+{
+	struct virtio_nvme_dev *dev = vdev->priv;
+
+	spin_lock(&dev_list_lock);
+	list_del_init(&dev->node);
+	spin_unlock(&dev_list_lock);
+
+	/* Stop all the virtqueues. */
+	vdev->config->reset(vdev);
+
+	vdev->config->del_vqs(vdev);
+
+	virtnvme_dev_remove(dev);
+	virtnvme_dev_remove_admin(dev);
+
+	blk_mq_free_tag_set(&dev->tagset);
+	kfree(dev->vqs);
+
+	kref_put(&dev->kref, virtnvme_free_dev);
+}
+
+static unsigned int features[] = {
+	VIRTIO_NVME_F_SEG_MAX, VIRTIO_NVME_F_MQ,
+};
+
+static struct virtio_driver virtio_nvme_driver = {
+	.feature_table			= features,
+	.feature_table_size		= ARRAY_SIZE(features),
+	.driver.name			= KBUILD_MODNAME,
+	.driver.owner			= THIS_MODULE,
+	.id_table			= id_table,
+	.probe				= virtnvme_probe,
+	.remove				= virtnvme_remove,
+};
+
+static int __init virtnvme_init(void)
+{
+	int error;
+
+	virtnvme_major = register_blkdev(0, "virtnvme");
+	if (virtnvme_major < 0) {
+		error = virtnvme_major;
+		goto out;
+	}
+
+	error = register_virtio_driver(&virtio_nvme_driver);
+	if (error)
+		goto out_unregister_blkdev;
+	return 0;
+
+out_unregister_blkdev:
+	unregister_blkdev(virtnvme_major, "virtnvme");
+out:
+	return error;
+}
+
+static void __exit virtnvme_exit(void)
+{
+	unregister_virtio_driver(&virtio_nvme_driver);
+	unregister_blkdev(virtnvme_major, "virtnvme");
+}
+module_init(virtnvme_init);
+module_exit(virtnvme_exit);
+
+MODULE_DEVICE_TABLE(virtio, id_table);
+MODULE_DESCRIPTION("Virtio NVMe driver");
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Ming Lin <ming.l at ssi.samsung.com>");
diff --git a/include/linux/virtio_nvme.h b/include/linux/virtio_nvme.h
new file mode 100644
index 0000000..c8db9a2
--- /dev/null
+++ b/include/linux/virtio_nvme.h
@@ -0,0 +1,53 @@
+#ifndef _LINUX_VIRTIO_NVME_H
+#define _LINUX_VIRTIO_NVME_H
+
+#include <uapi/linux/virtio_nvme.h>
+#include <linux/blk-mq.h>
+
+#define VQ_NAME_LEN 16
+
+struct virtio_nvme_dev;
+struct virtio_nvme_queue {
+	struct virtio_nvme_dev *dev;
+	struct virtqueue *vq;
+	spinlock_t lock;
+	char name[VQ_NAME_LEN];
+} ____cacheline_aligned_in_smp;
+
+struct virtio_nvme_dev {
+	struct virtio_device *vdev;
+	wait_queue_head_t queue_wait;
+	struct request_queue *admin_q;
+	struct blk_mq_tag_set admin_tagset;
+	struct blk_mq_tag_set tagset;
+
+	/* num of vqs */
+	int num_vqs;
+	struct virtio_nvme_queue *vqs;
+	struct list_head node;
+	int instance;
+	u32 ctrl_config;
+	struct list_head namespaces;
+	struct kref kref;
+	char name[12];
+	char serial[20];
+	char model[40];
+	char firmware_rev[8];
+	u32 max_hw_sectors;
+
+	unsigned int sg_elems;
+};
+
+struct virtio_nvme_ns {
+	struct list_head list;
+
+	struct virtio_nvme_dev *dev;
+	struct request_queue *queue;
+	struct gendisk *disk;
+
+	unsigned ns_id;
+	int lba_shift;
+	int ms;
+};
+
+#endif
diff --git a/include/uapi/linux/virtio_ids.h b/include/uapi/linux/virtio_ids.h
index 77925f5..d59d323 100644
--- a/include/uapi/linux/virtio_ids.h
+++ b/include/uapi/linux/virtio_ids.h
@@ -41,5 +41,6 @@
 #define VIRTIO_ID_CAIF	       12 /* Virtio caif */
 #define VIRTIO_ID_GPU          16 /* virtio GPU */
 #define VIRTIO_ID_INPUT        18 /* virtio input */
+#define VIRTIO_ID_NVME         19 /* TBD: virtio NVMe, need Red Hat's help to get this id */
 
 #endif /* _LINUX_VIRTIO_IDS_H */
diff --git a/include/uapi/linux/virtio_nvme.h b/include/uapi/linux/virtio_nvme.h
new file mode 100644
index 0000000..33f6077
--- /dev/null
+++ b/include/uapi/linux/virtio_nvme.h
@@ -0,0 +1,30 @@
+#ifndef _UAPI_LINUX_VIRTIO_NVME_H
+#define _UAPI_LINUX_VIRTIO_NVME_H
+
+#include <linux/types.h>
+#include <linux/virtio_ids.h>
+#include <linux/virtio_config.h>
+#include <linux/virtio_types.h>
+
+/* Feature bits */
+#define VIRTIO_NVME_F_SEG_MAX	1	/* Indicates maximum # of segments */
+#define VIRTIO_NVME_F_MQ	2	/* support more than one vq */
+
+struct virtio_nvme_config {
+	__u64	cap;
+	__u32	ctrl_config;
+	__u32	csts;
+
+	/* The maximum number of segments (if VIRTIO_NVME_F_SEG_MAX) */
+	__u32	seg_max;
+	/* number of vqs, only available when VIRTIO_NVME_F_MQ is set */
+	__u32 num_queues;
+} __attribute__((packed));
+
+struct virtio_nvme_resp {
+	__u32	result;
+	__u16	cid;
+	__u16	status;
+};
+
+#endif
-- 
1.9.1