[PATCH v10] NVMe: Convert to blk-mq

Mon Jul 14 05:41:31 PDT 2014

> +static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
> +			  unsigned int hctx_idx)
> +	struct nvme_queue *nvmeq = dev->queues[
> +					(hctx_idx % dev->queue_count) + 1];
> +
> +	/* nvmeq queues are shared between namespaces. We assume here that
> +	 * blk-mq map the tags so they match up with the nvme queue tags */
> +	if (!nvmeq->hctx)
> +		nvmeq->hctx = hctx;
> +	else
> +		WARN_ON(nvmeq->hctx->tags != hctx->tags);

This wrong to me, as you're overwriting the value of nvmeq->hctx for each
new requeust_queue.  But nothing but ->tagsis ever used from nvmeq->hctx,
so you shold rather set up nvmeq->tags in nvme_dev_add.

> +static int nvme_init_request(void *data, struct request *req,
> +				unsigned int hctx_idx, unsigned int rq_idx,
> +				unsigned int numa_node)
> +{
> +	struct nvme_dev *dev = data;
> +	struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
> +	struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1];
> +
> +	WARN_ON(!nvmeq);
> +	cmd->nvmeq = nvmeq;

Shouldn't this fail instead of the warn_on?

> +static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req)
>  {
> +	struct nvme_ns *ns = hctx->queue->queuedata;
> +	struct nvme_queue *nvmeq = hctx->driver_data;
> +	struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
>  	struct nvme_iod *iod;
> +	enum dma_data_direction dma_dir;
> +	int psegs = req->nr_phys_segments;
> +	int result = BLK_MQ_RQ_QUEUE_BUSY;
> +	/*
> +	 * Requeued IO has already been prepped
> +	 */
> +	iod = req->special;
> +	if (iod)
> +		goto submit_iod;
>  
> +	iod = nvme_alloc_iod(psegs, blk_rq_bytes(req), GFP_ATOMIC);
>  	if (!iod)
> +		return result;

So there's still a memory allocation for each request here.  Any reason
this can't be preallocated at least for reasonable sized I/O?

No need for GFP_ATOMIC here either, and you probably need a mempool to
guarantee forward progress.

> +	if (req->cmd_flags & REQ_DISCARD) {
>  		void *range;
>  		/*
>  		 * We reuse the small pool to allocate the 16-byte range here
> @@ -752,33 +602,53 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns,
>  		range = dma_pool_alloc(nvmeq->dev->prp_small_pool,
>  						GFP_ATOMIC,
>  						&iod->first_dma);
> +		if (!range)
> +			goto finish_cmd;
>  		iod_list(iod)[0] = (__le64 *)range;
>  		iod->npages = 0;
>  	} else if (psegs) {
> +		dma_dir = rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE;
> +
> +		sg_init_table(iod->sg, psegs);
> +		iod->nents = blk_rq_map_sg(req->q, req, iod->sg);
> +		if (!iod->nents) {
> +			result = BLK_MQ_RQ_QUEUE_ERROR;
> +			goto finish_cmd;
>  		}
> +
> +		if (!dma_map_sg(nvmeq->q_dmadev, iod->sg, iod->nents, dma_dir))
> +			goto finish_cmd;
> +
> +		if (blk_rq_bytes(req) != nvme_setup_prps(nvmeq->dev, iod,
> +						blk_rq_bytes(req), GFP_ATOMIC))
> +			goto finish_cmd;
> +	}

Would be nice to factor these two into helpers, that could also avoid
the submid_iod goto..

> +
> +	if (req->cmd_flags & REQ_DISCARD) {
> +		nvme_submit_discard(nvmeq, ns, req, iod);
> +		goto queued;
> +	}
> +
> +	if (req->cmd_flags & REQ_FLUSH) {
> +		nvme_submit_flush(nvmeq, ns, req->tag);
> +		goto queued;
>  	}
> -	return 0;
>  
> +	nvme_submit_iod(nvmeq, iod, ns);
> + queued:

A simple

	if (req->cmd_flags & REQ_DISCARD)
		nvme_submit_discard(nvmeq, ns, req, iod);
	else if if (req->cmd_flags & REQ_FLUSH)
		nvme_submit_flush(nvmeq, ns, req->tag);
	else
		nvme_submit_iod(nvmeq, iod, ns);

seems preferable here.

> +static void nvme_cancel_queue_ios(void *data, unsigned long *tag_map)
>  {
> +	struct nvme_queue *nvmeq = data;
> +	struct blk_mq_hw_ctx *hctx = nvmeq->hctx;
> +	unsigned int tag = 0;
>  
> +	tag = 0;
> +	do {
> +		struct request *req;
>  		void *ctx;
>  		nvme_completion_fn fn;
> +		struct nvme_cmd_info *cmd;
>  		static struct nvme_completion cqe = {
>  			.status = cpu_to_le16(NVME_SC_ABORT_REQ << 1),
>  		};
> +		int qdepth = nvmeq == nvmeq->dev->queues[0] ?
> +					nvmeq->dev->admin_tagset.queue_depth :
> +					nvmeq->dev->tagset.queue_depth;
>  
> +		/* zero'd bits are free tags */
> +		tag = find_next_zero_bit(tag_map, qdepth, tag);
> +		if (tag >= qdepth)
> +			break;
> +
> +		req = blk_mq_tag_to_rq(hctx->tags, tag++);
> +		cmd = blk_mq_rq_to_pdu(req);

Seems like blk-mq would make your life easier by exporting an iterator
that goes over each in-use request instead of the current
blk_mq_tag_busy_iter prototype.  blk_mq_timeout_check would also be able
to make use of that, so maybe that would be a good preparatory patch?

> +static enum blk_eh_timer_return nvme_timeout(struct request *req)
>  {
> +	struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
> +	struct nvme_queue *nvmeq = cmd->nvmeq;
>  
> +	dev_warn(nvmeq->q_dmadev, "Timeout I/O %d QID %d\n", req->tag,
> +							nvmeq->qid);
> +	if (nvmeq->dev->initialized)
> +		nvme_abort_req(req);
>  
> +	return BLK_EH_RESET_TIMER;
> +}

Aborting a request but then resetting the timer looks wrong to me.

If that's indeed the intended behavior please add a comment explaining
it.

> +
> +static int nvme_alloc_admin_tags(struct nvme_dev *dev)
> +{
> +	if (!dev->admin_q) {

When would it be non-NULL?  Seems like the resume case might be the
case, but I suspect the code could be restructured to avoid even calling
nvme_alloc_admin_tags for that case.

> +static void nvme_free_admin_tags(struct nvme_dev *dev)
> +{
> +	if (dev->admin_q)
> +		blk_mq_free_tag_set(&dev->admin_tagset);
> +}

When would this be called with the admin queue not initialized?

> +static void nvme_dev_remove_admin(struct nvme_dev *dev)
> +{
> +	if (dev->admin_q && !blk_queue_dying(dev->admin_q))
> +		blk_cleanup_queue(dev->admin_q);
> +}

Same here.