[PATCH 4/4] nvme: add support for mq_ops->queue_rqs()
Max Gurtovoy
mgurtovoy at nvidia.com
Thu Dec 16 05:02:24 PST 2021
On 12/15/2021 6:24 PM, Jens Axboe wrote:
> This enables the block layer to send us a full plug list of requests
> that need submitting. The block layer guarantees that they all belong
> to the same queue, but we do have to check the hardware queue mapping
> for each request.
>
> If errors are encountered, leave them in the passed in list. Then the
> block layer will handle them individually.
>
> This is good for about a 4% improvement in peak performance, taking us
> from 9.6M to 10M IOPS/core.
>
> Reviewed-by: Hannes Reinecke <hare at suse.de>
> Signed-off-by: Jens Axboe <axboe at kernel.dk>
> ---
> drivers/nvme/host/pci.c | 61 +++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 61 insertions(+)
>
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index 6be6b1ab4285..197aa45ef7ef 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -981,6 +981,66 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
>  	return BLK_STS_OK;
>  }
> 
> +static void nvme_submit_cmds(struct nvme_queue *nvmeq, struct request **rqlist)
> +{
> +	spin_lock(&nvmeq->sq_lock);
> +	while (!rq_list_empty(*rqlist)) {
> +		struct request *req = rq_list_pop(rqlist);
> +		struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
> +
> +		memcpy(nvmeq->sq_cmds + (nvmeq->sq_tail << nvmeq->sqes),
> +				absolute_pointer(&iod->cmd), sizeof(iod->cmd));
> +		if (++nvmeq->sq_tail == nvmeq->q_depth)
> +			nvmeq->sq_tail = 0;
> +	}
> +	nvme_write_sq_db(nvmeq, true);
> +	spin_unlock(&nvmeq->sq_lock);
> +}
> +
> +static bool nvme_prep_rq_batch(struct nvme_queue *nvmeq, struct request *req)
> +{
> +	/*
> +	 * We should not need to do this, but we're still using this to
> +	 * ensure we can drain requests on a dying queue.
> +	 */
> +	if (unlikely(!test_bit(NVMEQ_ENABLED, &nvmeq->flags)))
> +		return false;
> +	if (unlikely(!nvme_check_ready(&nvmeq->dev->ctrl, req, true)))
> +		return false;
> +
> +	req->mq_hctx->tags->rqs[req->tag] = req;
> +	return nvme_prep_rq(nvmeq->dev, req) == BLK_STS_OK;
> +}
> +
> +static void nvme_queue_rqs(struct request **rqlist)
> +{
> +	struct request *req = rq_list_peek(rqlist), *prev = NULL;
> +	struct request *requeue_list = NULL;
> +
> +	do {
> +		struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
> +
> +		if (!nvme_prep_rq_batch(nvmeq, req)) {
> +			/* detach 'req' and add to remainder list */
> +			if (prev)
> +				prev->rq_next = req->rq_next;
> +			rq_list_add(&requeue_list, req);
> +		} else {
> +			prev = req;
> +		}
> +
> +		req = rq_list_next(req);
> +		if (!req || (prev && req->mq_hctx != prev->mq_hctx)) {
> +			/* detach rest of list, and submit */
> +			prev->rq_next = NULL;
If req == NULL and prev == NULL we'll get a NULL deref here.
I think this can happen in the first iteration: if the very first request
fails nvme_prep_rq_batch() and turns out to be the last one in the list,
prev is still NULL when we reach this line.
Correct me if I'm wrong..
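Maybe something along these lines would avoid it (completely untested
sketch, only meant to cover the prev == NULL case - terminate and submit
the sublist only when at least one request was actually prepped):

		req = rq_list_next(req);
		if (!req || (prev && req->mq_hctx != prev->mq_hctx)) {
			if (prev) {
				/* detach rest of list, and submit */
				prev->rq_next = NULL;
				nvme_submit_cmds(nvmeq, rqlist);
			}
			/* nothing prepped: just advance past this sublist */
			*rqlist = req;
		}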
> +			nvme_submit_cmds(nvmeq, rqlist);
> +			*rqlist = req;
> +		}
> +	} while (req);
> +
> +	*rqlist = requeue_list;
> +}
> +
>  static __always_inline void nvme_pci_unmap_rq(struct request *req)
>  {
>  	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
> @@ -1678,6 +1738,7 @@ static const struct blk_mq_ops nvme_mq_admin_ops = {
> 
>  static const struct blk_mq_ops nvme_mq_ops = {
>  	.queue_rq	= nvme_queue_rq,
> +	.queue_rqs	= nvme_queue_rqs,
>  	.complete	= nvme_pci_complete_rq,
>  	.commit_rqs	= nvme_commit_rqs,
>  	.init_hctx	= nvme_init_hctx,