nvme: batch completions and do them outside of the queue lock
Christoph Hellwig
hch at infradead.org
Thu May 17 00:16:03 PDT 2018
> static inline void nvme_handle_cqe(struct nvme_queue *nvmeq,
> - struct nvme_completion *cqe)
> + volatile struct nvme_completion *cqe)
Skip the bogus reindentation here :)
> {
> struct request *req;
>
> @@ -950,21 +949,17 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq,
> if (unlikely(nvmeq->qid == 0 &&
> cqe->command_id >= NVME_AQ_BLK_MQ_DEPTH)) {
> nvme_complete_async_event(&nvmeq->dev->ctrl,
> - cqe->status, &cqe->result);
> + cqe->status, (union nvme_result *) &cqe->result);
Please find a way to avoid that cast. Either always make it volatile
or pass by value if modern compilers have stopped generating shit code
for passing unions by value.
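E.g. the by-value variant could look something like this (just a sketch,
assuming all callers of nvme_complete_async_event get adjusted to match):

/* sketch: take the result by value so a volatile-qualified CQE field
 * can be passed straight through without a cast */
void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
                union nvme_result res);

        /* call site in nvme_handle_cqe, no cast needed: */
        nvme_complete_async_event(&nvmeq->dev->ctrl, cqe->status,
                        cqe->result);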
> -static inline bool nvme_read_cqe(struct nvme_queue *nvmeq,
> - struct nvme_completion *cqe)
> +static inline bool nvme_read_cqe(struct nvme_queue *nvmeq)
> {
> if (nvme_cqe_valid(nvmeq, nvmeq->cq_head, nvmeq->cq_phase)) {
> - *cqe = nvmeq->cqes[nvmeq->cq_head];
> -
Without actually reading the CQE this function is now grossly misnamed.
What about nvme_consume_cqe or something like that instead?
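E.g. (sketch only, same logic as the old nvme_read_cqe just without the
copy):

static inline bool nvme_consume_cqe(struct nvme_queue *nvmeq)
{
        if (!nvme_cqe_valid(nvmeq, nvmeq->cq_head, nvmeq->cq_phase))
                return false;

        if (++nvmeq->cq_head == nvmeq->q_depth) {
                nvmeq->cq_head = 0;
                nvmeq->cq_phase = !nvmeq->cq_phase;
        }
        return true;
}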
> +static inline void nvme_process_cq(struct nvme_queue *nvmeq, u16 *start,
> + u16 *end)
> {
> + *start = nvmeq->cq_head;
> + while (nvme_read_cqe(nvmeq))
> + ;
> + *end = nvmeq->cq_head;
> +
> + if (*start != *end)
> + nvme_ring_cq_doorbell(nvmeq);
> +}
Or in fact just kill off nvme_read_cqe, as this appears to be its only
caller with your patch (just reading the patch so I might be wrong).
Then nvme_process_cq could become:
static inline void nvme_process_cq(struct nvme_queue *nvmeq, u16 *start,
                u16 *end)
{
        *start = nvmeq->cq_head;
        while (nvme_cqe_valid(nvmeq, nvmeq->cq_head, nvmeq->cq_phase)) {
                if (++nvmeq->cq_head == nvmeq->q_depth) {
                        nvmeq->cq_head = 0;
                        nvmeq->cq_phase = !nvmeq->cq_phase;
                }
        }
        *end = nvmeq->cq_head;

        if (*start != *end)
                nvme_ring_cq_doorbell(nvmeq);
}
which starts to make a lot more sense.
> +static bool nvme_complete_cqes(struct nvme_queue *nvmeq, u16 start, u16 end,
> + unsigned int tag)
> +{
> + bool found = false;
> +
> + if (start == end)
> + return false;
> +
> + while (start != end) {
> + volatile struct nvme_completion *cqe = &nvmeq->cqes[start];
> +
> + if (!found && tag == cqe->command_id)
> + found = true;
> + nvme_handle_cqe(nvmeq, cqe);
> + if (++start == nvmeq->q_depth)
> + start = 0;
> }
>
> + return found;
I don't think we need the if (start == end) check, the while loop already
handles that. I also wonder if we should move the tag matching into
nvme_handle_cqe. It already looks at cqe->command_id, so that would keep
the access together, and would remove the need to even look at the CQE at
all in this function.  And nvme_complete_cqes would then be so trivial
that we could just have a variant per caller and still keep the poll
optimization.
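Moving the tag check in could look something like this (just a sketch;
the unsigned int tag argument and bool return are my invention, and it
assumes the by-value nvme_complete_async_event discussed above,
otherwise the cast stays):

static inline bool nvme_handle_cqe(struct nvme_queue *nvmeq,
                volatile struct nvme_completion *cqe, unsigned int tag)
{
        struct request *req;
        bool match = (tag == cqe->command_id);

        /* invalid command_id check unchanged, then the AEN special case: */
        if (unlikely(nvmeq->qid == 0 &&
                        cqe->command_id >= NVME_AQ_BLK_MQ_DEPTH)) {
                nvme_complete_async_event(&nvmeq->dev->ctrl,
                                cqe->status, cqe->result);
                return match;
        }

        req = blk_mq_tag_to_rq(*nvmeq->tags, cqe->command_id);
        nvme_end_request(req, cqe->status, cqe->result);
        return match;
}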
Irq/delete queue path, including handling the irqreturn_t value:
static irqreturn_t nvme_complete_cqes(struct nvme_queue *nvmeq,
                u16 start, u16 end)
{
        irqreturn_t ret = IRQ_NONE;

        while (start != end) {
                nvme_handle_cqe(nvmeq, &nvmeq->cqes[start], -1U);
                if (++start == nvmeq->q_depth)
                        start = 0;
                ret = IRQ_HANDLED;
        }
        return ret;
}
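The interrupt handler itself would then batch under the lock and
complete outside of it, something like (untested sketch):

static irqreturn_t nvme_irq(int irq, void *data)
{
        struct nvme_queue *nvmeq = data;
        u16 start, end;

        spin_lock(&nvmeq->q_lock);
        nvme_process_cq(nvmeq, &start, &end);
        spin_unlock(&nvmeq->q_lock);

        return nvme_complete_cqes(nvmeq, start, end);
}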
poll path:
static int __nvme_poll(struct nvme_queue *nvmeq, unsigned int tag)
{
        u16 start, end;

        if (!nvme_cqe_valid(nvmeq, nvmeq->cq_head, nvmeq->cq_phase))
                return 0;

        spin_lock_irq(&nvmeq->q_lock);
        nvme_process_cq(nvmeq, &start, &end);
        spin_unlock_irq(&nvmeq->q_lock);

        while (start != end) {
                if (nvme_handle_cqe(nvmeq, &nvmeq->cqes[start], tag))
                        return 1;
                if (++start == nvmeq->q_depth)
                        start = 0;
        }
        return 0;
}
That would also keep the early exit behavior for poll once we've found
the tag.
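The blk_mq ->poll callback should be able to stay as it is today and
just call into the helper:

static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag)
{
        struct nvme_queue *nvmeq = hctx->driver_data;

        return __nvme_poll(nvmeq, tag);
}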
> @@ -2006,8 +2016,9 @@ static void nvme_del_cq_end(struct request *req, blk_status_t error)
> */
> spin_lock_irqsave_nested(&nvmeq->q_lock, flags,
> SINGLE_DEPTH_NESTING);
> - nvme_process_cq(nvmeq);
> + nvme_process_cq(nvmeq, &start, &end);
> spin_unlock_irqrestore(&nvmeq->q_lock, flags);
> + nvme_complete_cqes(nvmeq, start, end, -1U);
If we could somehow move this into a workqueue we could even move the
locking into nvme_process_cq, but that's something left for another time.
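E.g. something like this, entirely untested and with a made-up
cq_reap_work member in struct nvme_queue:

static void nvme_reap_dead_cq(struct work_struct *work)
{
        struct nvme_queue *nvmeq =
                container_of(work, struct nvme_queue, cq_reap_work);
        u16 start, end;

        /* no SINGLE_DEPTH_NESTING needed here; nvme_process_cq could
         * then take q_lock itself */
        spin_lock_irq(&nvmeq->q_lock);
        nvme_process_cq(nvmeq, &start, &end);
        spin_unlock_irq(&nvmeq->q_lock);

        nvme_complete_cqes(nvmeq, start, end);
}

        /* in nvme_del_cq_end(), instead of reaping inline: */
        schedule_work(&nvmeq->cq_reap_work);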