[PATCH] NVMe: Add rw_page support

Keith Busch keith.busch at intel.com
Thu Nov 13 16:05:38 PST 2014


This adds the rw_page entry point to the nvme driver so a page can be
read or written without going through the block layer and without any
additional allocations.
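
For context, callers reach this through bdev_read_page()/bdev_write_page()
and fall back to the normal bio path when those fail. Roughly, the read
side looks like the sketch below (simplified, not the actual mm/page_io.c
code; submit_read_bio() is a made-up stand-in for the usual bio submission):

	/*
	 * Sketch of a caller such as the swap-in path: try the page-based
	 * entry point first, fall back to an ordinary bio on any failure.
	 * submit_read_bio() is hypothetical, standing in for the bio path.
	 */
	static int read_page_from_bdev(struct block_device *bdev,
				       sector_t sector, struct page *page)
	{
		int ret = bdev_read_page(bdev, sector, page);

		if (!ret)
			return 0;	/* the driver's ->rw_page took it */

		/* No ->rw_page, queue flag cleared, or submission failed. */
		return submit_read_bio(bdev, sector, page);
	}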

Just because we implement this doesn't mean we always want to use it. I
only see a performance win on some types of work, like swap, where I
measure about a 15% reduction in system time (compared to 20% prior to
blk-mq, when we didn't have to allocate a request to get a command id).
Even then, system time accounts for very little of the real time, and
it's only an overall win if the device has very low latency. The driver
doesn't know that, nor whether the expected workload will even benefit
from page IO, so I added a queue flag (exposed as the "rw_page" sysfs
attribute) that a user can toggle on or off; see the example below.
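
With this patch the flag shows up as /sys/block/<disk>/queue/rw_page, so it
can simply be echoed from a shell. A trivial userspace sketch of the same
thing ("nvme0n1" is only an example device name):

	/* Userspace sketch: write "1" to enable ->rw_page use on a queue,
	 * "0" to disable it again.  "nvme0n1" is only an example name. */
	#include <fcntl.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("/sys/block/nvme0n1/queue/rw_page", O_WRONLY);

		if (fd < 0)
			return 1;
		if (write(fd, "1", 1) != 1) {
			close(fd);
			return 1;
		}
		return close(fd) ? 1 : 0;
	}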

The other benefit, besides reduced system time, is that we can swap
pages in and out without allocating anything, since everything in this
path is preallocated.
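
The swap-out half looks similar from the caller's side; another rough
sketch (again simplified, with a hypothetical submit_write_bio() standing
in for the bio path), showing that a successful ->rw_page submission
avoids allocating a bio or any per-I/O context:

	/*
	 * Sketch of a swap-out style caller: bdev_write_page() marks the
	 * page under writeback before calling ->rw_page; the driver's
	 * completion handler later ends writeback via page_endio().
	 * submit_write_bio() is hypothetical, standing in for the bio path.
	 */
	static int write_page_to_bdev(struct block_device *bdev,
				      sector_t sector, struct page *page,
				      struct writeback_control *wbc)
	{
		int ret = bdev_write_page(bdev, sector, page, wbc);

		if (!ret)
			return 0;	/* submitted without any allocations */

		/* Fall back to the ordinary bio-based path. */
		return submit_write_bio(bdev, sector, page, wbc);
	}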

Signed-off-by: Keith Busch <keith.busch at intel.com>
---
 block/blk-sysfs.c         |    8 +++++
 drivers/block/nvme-core.c |   78 +++++++++++++++++++++++++++++++++++++++++++++
 fs/block_dev.c            |    4 +--
 include/linux/blkdev.h    |    2 ++
 4 files changed, 90 insertions(+), 2 deletions(-)

diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 1fac434..b29de5f 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -221,6 +221,7 @@ queue_store_##name(struct request_queue *q, const char *page, size_t count) \
 QUEUE_SYSFS_BIT_FNS(nonrot, NONROT, 1);
 QUEUE_SYSFS_BIT_FNS(random, ADD_RANDOM, 0);
 QUEUE_SYSFS_BIT_FNS(iostats, IO_STAT, 0);
+QUEUE_SYSFS_BIT_FNS(rw_page, RW_PG, 0);
 #undef QUEUE_SYSFS_BIT_FNS
 
 static ssize_t queue_nomerges_show(struct request_queue *q, char *page)
@@ -404,6 +405,12 @@ static struct queue_sysfs_entry queue_random_entry = {
 	.store = queue_store_random,
 };
 
+static struct queue_sysfs_entry queue_rw_page_entry = {
+	.attr = {.name = "rw_page", .mode = S_IRUGO | S_IWUSR },
+	.show = queue_show_rw_page,
+	.store = queue_store_rw_page,
+};
+
 static struct attribute *default_attrs[] = {
 	&queue_requests_entry.attr,
 	&queue_ra_entry.attr,
@@ -427,6 +434,7 @@ static struct attribute *default_attrs[] = {
 	&queue_rq_affinity_entry.attr,
 	&queue_iostats_entry.attr,
 	&queue_random_entry.attr,
+	&queue_rw_page_entry.attr,
 	NULL,
 };
 
diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index 8393f91..96a1d61 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -143,6 +143,7 @@ struct nvme_cmd_info {
 	nvme_completion_fn fn;
 	void *ctx;
 	int aborted;
+	dma_addr_t dma;
 	struct nvme_queue *nvmeq;
 };
 
@@ -1807,8 +1808,85 @@ static int nvme_revalidate_disk(struct gendisk *disk)
 	return 0;
 }
 
+static void pgrd_completion(struct nvme_queue *nvmeq, void *ctx,
+						struct nvme_completion *cqe)
+{
+	struct request *req = ctx;
+	struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req);
+	struct page *page = req->special;
+	u16 status = le16_to_cpup(&cqe->status) >> 1;
+
+	dma_unmap_page(nvmeq->q_dmadev, cmd_rq->dma, PAGE_CACHE_SIZE, DMA_FROM_DEVICE);
+	page_endio(page, READ, status != NVME_SC_SUCCESS);
+	blk_put_request(req);
+}
+
+static void pgwr_completion(struct nvme_queue *nvmeq, void *ctx,
+						struct nvme_completion *cqe)
+{
+	struct request *req = ctx;
+	struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req);
+	struct page *page = req->special;
+	u16 status = le16_to_cpup(&cqe->status) >> 1;
+
+	dma_unmap_page(nvmeq->q_dmadev, cmd_rq->dma, PAGE_CACHE_SIZE, DMA_TO_DEVICE);
+	page_endio(page, WRITE, status != NVME_SC_SUCCESS);
+	blk_put_request(req);
+}
+
+static const enum dma_data_direction nvme_to_direction[] = {
+	DMA_NONE, DMA_TO_DEVICE, DMA_FROM_DEVICE, DMA_BIDIRECTIONAL
+};
+
+static int nvme_rw_page(struct block_device *bdev, sector_t sector,
+					struct page *page, int rw)
+{
+	dma_addr_t dma;
+	struct request *req;
+	struct nvme_command *cmd;
+	struct nvme_queue *nvmeq;
+	struct nvme_cmd_info *cmd_rq;
+	struct nvme_ns *ns = bdev->bd_disk->private_data;
+	nvme_completion_fn fn = (rw & WRITE) ? pgwr_completion : pgrd_completion;
+	u8 op = (rw & WRITE) ? nvme_cmd_write : nvme_cmd_read;
+	enum dma_data_direction dma_dir = nvme_to_direction[op & 3];
+
+	req = blk_mq_alloc_request(ns->queue, rw, GFP_ATOMIC, false);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+	req->special = page;
+	cmd_rq = blk_mq_rq_to_pdu(req);
+	nvmeq = cmd_rq->nvmeq;
+
+	nvme_set_info(cmd_rq, req, fn);
+
+	dma = dma_map_page(nvmeq->q_dmadev, page, 0, PAGE_CACHE_SIZE, dma_dir);
+	cmd_rq->dma = dma;
+
+	spin_lock_irq(&nvmeq->q_lock);
+	cmd = &nvmeq->sq_cmds[nvmeq->sq_tail];
+	memset(cmd, 0, sizeof(*cmd));
+
+	cmd->rw.opcode = op;
+	cmd->rw.command_id = req->tag;
+	cmd->rw.nsid = cpu_to_le32(ns->ns_id);
+	cmd->rw.slba = cpu_to_le64(nvme_block_nr(ns, sector));
+	cmd->rw.length = cpu_to_le16((PAGE_CACHE_SIZE >> ns->lba_shift) - 1);
+	cmd->rw.prp1 = cpu_to_le64(dma);
+
+	if (++nvmeq->sq_tail == nvmeq->q_depth)
+		nvmeq->sq_tail = 0;
+	writel(nvmeq->sq_tail, nvmeq->q_db);
+
+	nvme_process_cq(nvmeq);
+	spin_unlock_irq(&nvmeq->q_lock);
+
+	return 0;
+}
+
 static const struct block_device_operations nvme_fops = {
 	.owner		= THIS_MODULE,
+	.rw_page	= nvme_rw_page,
 	.ioctl		= nvme_ioctl,
 	.compat_ioctl	= nvme_compat_ioctl,
 	.open		= nvme_open,
diff --git a/fs/block_dev.c b/fs/block_dev.c
index cc9d411..f17f95d 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -380,7 +380,7 @@ int bdev_read_page(struct block_device *bdev, sector_t sector,
 			struct page *page)
 {
 	const struct block_device_operations *ops = bdev->bd_disk->fops;
-	if (!ops->rw_page)
+	if (!ops->rw_page || !blk_queue_rw_page(bdev->bd_queue))
 		return -EOPNOTSUPP;
 	return ops->rw_page(bdev, sector + get_start_sect(bdev), page, READ);
 }
@@ -411,7 +411,7 @@ int bdev_write_page(struct block_device *bdev, sector_t sector,
 	int result;
 	int rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE;
 	const struct block_device_operations *ops = bdev->bd_disk->fops;
-	if (!ops->rw_page)
+	if (!ops->rw_page || !blk_queue_rw_page(bdev->bd_queue))
 		return -EOPNOTSUPP;
 	set_page_writeback(page);
 	result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, rw);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 77db6dc..17a6058 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -509,6 +509,7 @@ struct request_queue {
 #define QUEUE_FLAG_INIT_DONE   20	/* queue is initialized */
 #define QUEUE_FLAG_NO_SG_MERGE 21	/* don't attempt to merge SG segments*/
 #define QUEUE_FLAG_SG_GAPS     22	/* queue doesn't support SG gaps */
+#define QUEUE_FLAG_RW_PG       23	/* use .rw_page if implemented */
 
 #define QUEUE_FLAG_DEFAULT	((1 << QUEUE_FLAG_IO_STAT) |		\
 				 (1 << QUEUE_FLAG_STACKABLE)	|	\
@@ -596,6 +597,7 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q)
 #define blk_queue_discard(q)	test_bit(QUEUE_FLAG_DISCARD, &(q)->queue_flags)
 #define blk_queue_secdiscard(q)	(blk_queue_discard(q) && \
 	test_bit(QUEUE_FLAG_SECDISCARD, &(q)->queue_flags))
+#define blk_queue_rw_page(q)	test_bit(QUEUE_FLAG_RW_PG, &(q)->queue_flags)
 
 #define blk_noretry_request(rq) \
 	((rq)->cmd_flags & (REQ_FAILFAST_DEV|REQ_FAILFAST_TRANSPORT| \
-- 
1.7.10.4



