[PATCH v4, under testing] nvme-rdma: support devices with queue size < 32
Leon Romanovsky
leonro at mellanox.com
Wed May 3 04:17:59 PDT 2017
On Wed, May 03, 2017 at 12:05:15PM +0200, Marta Rybczynska wrote:
> In the case of a small NVMe-oF queue size (<32) we may enter
> a deadlock: IB send completions are only signalled once every 32
> sends, so with fewer than 32 queue entries no completion is ever
> signalled and the send queue fills up.
>
> The error is seen as (using mlx5):
> [ 2048.693355] mlx5_0:mlx5_ib_post_send:3765:(pid 7273):
> [ 2048.693360] nvme nvme1: nvme_rdma_post_send failed with error code -12
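
To make the failure mode concrete: with the old fixed threshold of 32
and a send queue shorter than that, no send ever gets signalled, so the
driver never learns that earlier WQEs have completed and the next post
finds the queue full. A rough stand-alone model of the old scheme (the
depth of 16 and all names here are illustrative, not taken from the
driver):

/* Model: unsignalled WQEs are only reclaimed once a later
 * signalled send on the same QP completes. */
#include <stdio.h>

int main(void)
{
	unsigned char sig_count = 0;	/* the old u8 counter */
	int queue_size = 16;		/* NVMe-oF queue depth < 32 */
	int in_flight = 0;		/* posted, unreclaimed WQEs */

	for (int i = 1; i <= 64; i++) {
		if (in_flight == queue_size) {
			printf("send %d: queue full -> -ENOMEM\n", i);
			return 1;	/* the deadlock reported above */
		}
		in_flight++;
		if ((++sig_count % 32) == 0)
			in_flight = 0;	/* signalled send drains the queue */
	}
	return 0;
}

With queue_size = 16 the counter never reaches 32 before the queue
fills, so send 17 fails, matching the error code -12 (-ENOMEM) above.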
>
> This patch makes the signalling depend on the queue depth:
> a completion is now signalled once every queue depth/2 sends.
> The hardcoded magic value of 32 is removed completely. The
> signalling code is also reworked to use atomic operations.
>
> Signed-off-by: Marta Rybczynska <marta.rybczynska at kalray.eu>
> Signed-off-by: Samuel Jones <sjones at kalray.eu>
> [v1]
^^^^ This part of the commit message is not needed.
Thanks
>
> ---
>
> Changes in v4:
> * use atomic operations as suggested by Sagi
>
> Changes in v3:
> * avoid division in the fast path
> * reverse sig_count logic to simplify the code: it now counts down
> from the queue depth/2 to 0
> * change sig_count to int to avoid overflows for big queues
>
> Changes in v2:
> * signal by queue size/2, remove hardcoded 32
> * support queue depth of 1
> ---
> drivers/nvme/host/rdma.c | 40 +++++++++++++++++++++++++++++++++++-----
> 1 file changed, 35 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
> index 16f84eb..234b010 100644
> --- a/drivers/nvme/host/rdma.c
> +++ b/drivers/nvme/host/rdma.c
> @@ -88,7 +88,7 @@ enum nvme_rdma_queue_flags {
>
>  struct nvme_rdma_queue {
>  	struct nvme_rdma_qe	*rsp_ring;
> -	u8			sig_count;
> +	atomic_t		sig_count;
>  	int			queue_size;
>  	size_t			cmnd_capsule_len;
>  	struct nvme_rdma_ctrl	*ctrl;
> @@ -257,6 +257,15 @@ static int nvme_rdma_wait_for_cm(struct nvme_rdma_queue *queue)
>  	return queue->cm_error;
>  }
>
> +static inline int nvme_rdma_init_sig_count(int queue_size)
> +{
> +	/* We signal completion every queue depth/2 and also
> +	 * handle the possible case of a device with queue_depth=1,
> +	 * where we must signal every message.
> +	 */
> +	return max(queue_size / 2, 1);
> +}
> +
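
For reference, the interval this yields (a quick user-space check;
the helper is renamed and the kernel's max() macro replaced by a
ternary, since neither exists outside the kernel):

#include <stdio.h>

/* same formula as nvme_rdma_init_sig_count() above */
static int init_sig_count(int queue_size)
{
	return queue_size / 2 > 1 ? queue_size / 2 : 1;
}

int main(void)
{
	int sizes[] = { 1, 2, 16, 32, 128 };

	for (unsigned int i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
		printf("queue_size=%3d -> signal every %d sends\n",
		       sizes[i], init_sig_count(sizes[i]));
	return 0;	/* intervals: 1, 1, 8, 16, 64 */
}

So a depth-1 queue signals every send, and larger queues signal at
half depth, which keeps at least half the send queue reclaimable.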
>  static int nvme_rdma_create_qp(struct nvme_rdma_queue *queue, const int factor)
>  {
>  	struct nvme_rdma_device *dev = queue->device;
> @@ -561,6 +570,8 @@ static int nvme_rdma_init_queue(struct nvme_rdma_ctrl *ctrl,
>
>  	queue->queue_size = queue_size;
> 
> +	atomic_set(&queue->sig_count, nvme_rdma_init_sig_count(queue_size));
> +
>  	queue->cm_id = rdma_create_id(&init_net, nvme_rdma_cm_handler, queue,
>  			RDMA_PS_TCP, IB_QPT_RC);
>  	if (IS_ERR(queue->cm_id)) {
> @@ -1029,6 +1040,28 @@ static void nvme_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc)
>  		nvme_rdma_wr_error(cq, wc, "SEND");
>  }
>
> +static inline bool nvme_rdma_queue_sig_limit(struct nvme_rdma_queue *queue)
> +{
> +	int v, old;
> +
> +	v = atomic_read(&queue->sig_count);
> +	while (1) {
> +		if (v > 1) {
> +			old = atomic_cmpxchg(&queue->sig_count, v, v - 1);
> +			if (old == v)
> +				return false;
> +		} else {
> +			int new_count;
> +
> +			new_count = nvme_rdma_init_sig_count(queue->queue_size);
> +			old = atomic_cmpxchg(&queue->sig_count, v, new_count);
> +			if (old == v)
> +				return true;
> +		}
> +		v = old;
> +	}
> +}
> +
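
This is the usual cmpxchg retry pattern: decrement while the count
is above 1, and when it reaches 1 atomically reset it to the interval
and report that this send must be signalled. A stand-alone model with
C11 atomics that behaves the same way, as far as I can tell (the names
and the fixed limit of 8 are mine, not the driver's):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int sig_count;
static const int limit = 8;	/* stands in for nvme_rdma_init_sig_count() */

static bool sig_limit(void)
{
	int v = atomic_load(&sig_count);

	for (;;) {
		int next = v > 1 ? v - 1 : limit;

		/* on failure v is refreshed with the current value and
		 * we retry, like the 'old == v' check in the patch */
		if (atomic_compare_exchange_weak(&sig_count, &v, next))
			return v <= 1;	/* true once every 'limit' calls */
	}
}

int main(void)
{
	atomic_store(&sig_count, limit);
	for (int i = 1; i <= 24; i++)
		if (sig_limit())
			printf("send %d: IB_SEND_SIGNALED\n", i);
	return 0;	/* fires for sends 8, 16 and 24 */
}

Under concurrency each reset is claimed by exactly one caller, so the
signalling rate stays one in 'limit' no matter how many contexts post
sends.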
>  static int nvme_rdma_post_send(struct nvme_rdma_queue *queue,
>  		struct nvme_rdma_qe *qe, struct ib_sge *sge, u32 num_sge,
>  		struct ib_send_wr *first, bool flush)
> @@ -1056,9 +1089,6 @@ static int nvme_rdma_post_send(struct nvme_rdma_queue *queue,
>  	 * Would have been way to obvious to handle this in hardware or
>  	 * at least the RDMA stack..
>  	 *
> -	 * This messy and racy code sniplet is copy and pasted from the iSER
> -	 * initiator, and the magic '32' comes from there as well.
> -	 *
>  	 * Always signal the flushes. The magic request used for the flush
>  	 * sequencer is not allocated in our driver's tagset and it's
>  	 * triggered to be freed by blk_cleanup_queue(). So we need to
> @@ -1066,7 +1096,7 @@ static int nvme_rdma_post_send(struct nvme_rdma_queue *queue,
>  	 * embedded in request's payload, is not freed when __ib_process_cq()
>  	 * calls wr_cqe->done().
>  	 */
> -	if ((++queue->sig_count % 32) == 0 || flush)
> +	if (nvme_rdma_queue_sig_limit(queue) || flush)
>  		wr.send_flags |= IB_SEND_SIGNALED;
> 
>  	if (first)
> --
> 1.8.3.1
>
> _______________________________________________
> Linux-nvme mailing list
> Linux-nvme at lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-nvme