nvme: batch completions and do them outside of the queue lock
Christoph Hellwig
hch at infradead.org
Thu May 17 00:16:03 PDT 2018
> static inline void nvme_handle_cqe(struct nvme_queue *nvmeq,
> - struct nvme_completion *cqe)
> + volatile struct nvme_completion *cqe)
Skip the bogus reindentation here :)
> {
> struct request *req;
>
> @@ -950,21 +949,17 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq,
> if (unlikely(nvmeq->qid == 0 &&
> cqe->command_id >= NVME_AQ_BLK_MQ_DEPTH)) {
> nvme_complete_async_event(&nvmeq->dev->ctrl,
> - cqe->status, &cqe->result);
> + cqe->status, (union nvme_result *) &cqe->result);
Please find a way to avoid that cast. Either always make it volatile
or pass by value if modern compilers have stopped generating shit code
for passing unions by value.
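E.g. the by-value variant could look something like this (just a sketch,
assuming all callers of nvme_complete_async_event get adjusted to match):

/* sketch: take the result by value so a volatile-qualified CQE field
 * can be passed straight through without a cast */
void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
                union nvme_result res);

        /* call site in nvme_handle_cqe, no cast needed: */
        nvme_complete_async_event(&nvmeq->dev->ctrl, cqe->status,
                        cqe->result);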
> -static inline bool nvme_read_cqe(struct nvme_queue *nvmeq,
> - struct nvme_completion *cqe)
> +static inline bool nvme_read_cqe(struct nvme_queue *nvmeq)
> {
> if (nvme_cqe_valid(nvmeq, nvmeq->cq_head, nvmeq->cq_phase)) {
> - *cqe = nvmeq->cqes[nvmeq->cq_head];
> -
Without actually reading the CQE this function is now grossly misnamed.
What about nvme_consume_cqe or something like that instead?
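E.g. (sketch only, same logic as the old nvme_read_cqe just without the
copy):

static inline bool nvme_consume_cqe(struct nvme_queue *nvmeq)
{
        if (!nvme_cqe_valid(nvmeq, nvmeq->cq_head, nvmeq->cq_phase))
                return false;

        if (++nvmeq->cq_head == nvmeq->q_depth) {
                nvmeq->cq_head = 0;
                nvmeq->cq_phase = !nvmeq->cq_phase;
        }
        return true;
}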
> +static inline void nvme_process_cq(struct nvme_queue *nvmeq, u16 *start,
> + u16 *end)
> {
> + *start = nvmeq->cq_head;
> + while (nvme_read_cqe(nvmeq))
> + ;
> + *end = nvmeq->cq_head;
> +
> + if (*start != *end)
> + nvme_ring_cq_doorbell(nvmeq);
> +}
Or in fact just kill off nvme_read_cqe, as this appears to be its only
caller with your patch (just reading the patch so I might be wrong).
Then nvme_process_cq could become:
static inline void nvme_process_cq(struct nvme_queue *nvmeq, u16 *start,
                u16 *end)
{
        *start = nvmeq->cq_head;
        while (nvme_cqe_valid(nvmeq, nvmeq->cq_head, nvmeq->cq_phase)) {
                if (++nvmeq->cq_head == nvmeq->q_depth) {
                        nvmeq->cq_head = 0;
                        nvmeq->cq_phase = !nvmeq->cq_phase;
                }
        }
        *end = nvmeq->cq_head;

        if (*start != *end)
                nvme_ring_cq_doorbell(nvmeq);
}
which starts to make a lot more sense.
> +static bool nvme_complete_cqes(struct nvme_queue *nvmeq, u16 start, u16 end,
> + unsigned int tag)
> +{
> + bool found = false;
> +
> + if (start == end)
> + return false;
> +
> + while (start != end) {
> + volatile struct nvme_completion *cqe = &nvmeq->cqes[start];
> +
> + if (!found && tag == cqe->command_id)
> + found = true;
> + nvme_handle_cqe(nvmeq, cqe);
> + if (++start == nvmeq->q_depth)
> + start = 0;
> }
>
> + return found;
I don't think we need the if (start == end) check, the while loop already
handles that. I also wonder if we should move the tag matching into
nvme_handle_cqe. It already looks at cqe->command_id, so that would keep
the access together, and would remove the need to even look at the CQE at
all in this function.  And nvme_complete_cqes would then be so trivial
that we could just have a variant per caller and still keep the poll
optimization.
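Moving the tag check in could look something like this (just a sketch;
the unsigned int tag argument and bool return are my invention, and it
assumes the by-value nvme_complete_async_event discussed above,
otherwise the cast stays):

static inline bool nvme_handle_cqe(struct nvme_queue *nvmeq,
                volatile struct nvme_completion *cqe, unsigned int tag)
{
        struct request *req;
        bool match = (tag == cqe->command_id);

        /* invalid command_id check unchanged, then the AEN special case: */
        if (unlikely(nvmeq->qid == 0 &&
                        cqe->command_id >= NVME_AQ_BLK_MQ_DEPTH)) {
                nvme_complete_async_event(&nvmeq->dev->ctrl,
                                cqe->status, cqe->result);
                return match;
        }

        req = blk_mq_tag_to_rq(*nvmeq->tags, cqe->command_id);
        nvme_end_request(req, cqe->status, cqe->result);
        return match;
}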
Irq/delete queue path, including handling the irqreturn_t value:
static irqreturn_t nvme_complete_cqes(struct nvme_queue *nvmeq,
                u16 start, u16 end)
{
        irqreturn_t ret = IRQ_NONE;

        while (start != end) {
                nvme_handle_cqe(nvmeq, &nvmeq->cqes[start], -1U);
                if (++start == nvmeq->q_depth)
                        start = 0;
                ret = IRQ_HANDLED;
        }
        return ret;
}
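The interrupt handler itself would then batch under the lock and
complete outside of it, something like (untested sketch):

static irqreturn_t nvme_irq(int irq, void *data)
{
        struct nvme_queue *nvmeq = data;
        u16 start, end;

        spin_lock(&nvmeq->q_lock);
        nvme_process_cq(nvmeq, &start, &end);
        spin_unlock(&nvmeq->q_lock);

        return nvme_complete_cqes(nvmeq, start, end);
}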
poll path:
static int __nvme_poll(struct nvme_queue *nvmeq, unsigned int tag)
{
        u16 start, end;

        if (!nvme_cqe_valid(nvmeq, nvmeq->cq_head, nvmeq->cq_phase))
                return 0;

        spin_lock_irq(&nvmeq->q_lock);
        nvme_process_cq(nvmeq, &start, &end);
        spin_unlock_irq(&nvmeq->q_lock);

        while (start != end) {
                if (nvme_handle_cqe(nvmeq, &nvmeq->cqes[start], tag))
                        return 1;
                if (++start == nvmeq->q_depth)
                        start = 0;
        }
        return 0;
}
That would also keep the early exit behavior for poll once we've found
the tag.
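The blk_mq ->poll callback should be able to stay as it is today and
just call into the helper:

static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag)
{
        struct nvme_queue *nvmeq = hctx->driver_data;

        return __nvme_poll(nvmeq, tag);
}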
> @@ -2006,8 +2016,9 @@ static void nvme_del_cq_end(struct request *req, blk_status_t error)
> */
> spin_lock_irqsave_nested(&nvmeq->q_lock, flags,
> SINGLE_DEPTH_NESTING);
> - nvme_process_cq(nvmeq);
> + nvme_process_cq(nvmeq, &start, &end);
> spin_unlock_irqrestore(&nvmeq->q_lock, flags);
> + nvme_complete_cqes(nvmeq, start, end, -1U);
If we could somehow move this into a workqueue we could even move the
locking into nvme_process_cq, but that's something left for another time.
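E.g. something like this, entirely untested and with a made-up
cq_reap_work member in struct nvme_queue:

static void nvme_reap_dead_cq(struct work_struct *work)
{
        struct nvme_queue *nvmeq =
                container_of(work, struct nvme_queue, cq_reap_work);
        u16 start, end;

        /* no SINGLE_DEPTH_NESTING needed here; nvme_process_cq could
         * then take q_lock itself */
        spin_lock_irq(&nvmeq->q_lock);
        nvme_process_cq(nvmeq, &start, &end);
        spin_unlock_irq(&nvmeq->q_lock);

        nvme_complete_cqes(nvmeq, start, end);
}

        /* in nvme_del_cq_end(), instead of reaping inline: */
        schedule_work(&nvmeq->cq_reap_work);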