[PATCH v3] nvme-rdma: support devices with queue size < 32

Marta Rybczynska mrybczyn at kalray.eu
Thu Apr 20 02:43:34 PDT 2017


In the case of a small NVMe-oF queue size (<32) we may enter
a deadlock: IB send completions are signalled only on every 32nd
send, so a queue shallower than 32 entries never reclaims its
send work requests and the send queue fills up.
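
For reference, this is the pre-patch check (removed by the last hunk
below), under which every send except each 32nd one is posted
unsignalled:

	if ((++queue->sig_count % 32) == 0 || flush)
		wr.send_flags |= IB_SEND_SIGNALED;

Once the send queue holds only unsignalled entries, further posts are
rejected until a completion frees a slot, which never happens.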

The error is seen as follows (using mlx5):
[ 2048.693355] mlx5_0:mlx5_ib_post_send:3765:(pid 7273):
[ 2048.693360] nvme nvme1: nvme_rdma_post_send failed with error code -12

This patch changes the way the signalling is done so that it now
depends on the queue depth. The magic constant '32' has been
removed completely.
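
For example, with a queue depth of 16 the signalling counter starts
at max(16 / 2, 1) = 8, so every 8th send is signalled; with a queue
depth of 1 it starts at max(0, 1) = 1, so every send is signalled.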

Signed-off-by: Marta Rybczynska <marta.rybczynska at kalray.eu>
Signed-off-by: Samuel Jones <sjones at kalray.eu>
---
Changes in v3:
* avoid division in the fast path (see the sketch after the changelog)
* reverse sig_count logic to simplify the code: it now counts down
  from the queue depth/2 to 0
* change sig_count to int to avoid overflows for big queues

Changes in v2:
* signal by queue size/2, remove hardcoded 32
* support queue depth of 1
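
As a sketch of the "avoid division in the fast path" item above (an
approximation of the v2 logic, not the exact v2 code), a modulo-based
check pays for a division on every posted send:

	/* v2-style sketch: recomputes the limit (a division) per send */
	if ((++queue->sig_count % max(queue->queue_size / 2, 1)) == 0 || flush)
		wr.send_flags |= IB_SEND_SIGNALED;

The v3 code below instead decrements and compares on each send, and
recomputes the limit (one division) only when the counter hits zero.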

 drivers/nvme/host/rdma.c | 28 +++++++++++++++++++++++-----
 1 file changed, 23 insertions(+), 5 deletions(-)

diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 3d25add..ee6f747 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -88,7 +88,7 @@ enum nvme_rdma_queue_flags {
 
 struct nvme_rdma_queue {
 	struct nvme_rdma_qe	*rsp_ring;
-	u8			sig_count;
+	int			sig_count;
 	int			queue_size;
 	size_t			cmnd_capsule_len;
 	struct nvme_rdma_ctrl	*ctrl;
@@ -251,6 +251,15 @@ static int nvme_rdma_wait_for_cm(struct nvme_rdma_queue *queue)
 	return queue->cm_error;
 }
 
+static inline int nvme_rdma_init_sig_count(int queue_size)
+{
+	/* We signal a completion once every queue_depth/2 sends and
+	 * also handle the case of a device with queue_depth=1, where
+	 * we need to signal every message.
+	 */
+	return max(queue_size / 2, 1);
+}
+
 static int nvme_rdma_create_qp(struct nvme_rdma_queue *queue, const int factor)
 {
 	struct nvme_rdma_device *dev = queue->device;
@@ -556,6 +565,8 @@ static int nvme_rdma_init_queue(struct nvme_rdma_ctrl *ctrl,
 
 	queue->queue_size = queue_size;
 
+	queue->sig_count = nvme_rdma_init_sig_count(queue_size);
+
 	queue->cm_id = rdma_create_id(&init_net, nvme_rdma_cm_handler, queue,
 			RDMA_PS_TCP, IB_QPT_RC);
 	if (IS_ERR(queue->cm_id)) {
@@ -1011,6 +1022,16 @@ static void nvme_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc)
 		nvme_rdma_wr_error(cq, wc, "SEND");
 }
 
+static inline int nvme_rdma_queue_sig_limit(struct nvme_rdma_queue *queue)
+{
+	queue->sig_count--;
+	if (queue->sig_count == 0) {
+		queue->sig_count = nvme_rdma_init_sig_count(queue->queue_size);
+		return 1;
+	}
+	return 0;
+}
+
 static int nvme_rdma_post_send(struct nvme_rdma_queue *queue,
 		struct nvme_rdma_qe *qe, struct ib_sge *sge, u32 num_sge,
 		struct ib_send_wr *first, bool flush)
@@ -1038,9 +1059,6 @@ static int nvme_rdma_post_send(struct nvme_rdma_queue *queue,
 	 * Would have been way to obvious to handle this in hardware or
 	 * at least the RDMA stack..
 	 *
-	 * This messy and racy code sniplet is copy and pasted from the iSER
-	 * initiator, and the magic '32' comes from there as well.
-	 *
 	 * Always signal the flushes. The magic request used for the flush
 	 * sequencer is not allocated in our driver's tagset and it's
 	 * triggered to be freed by blk_cleanup_queue(). So we need to
@@ -1048,7 +1066,7 @@ static int nvme_rdma_post_send(struct nvme_rdma_queue *queue,
 	 * embeded in request's payload, is not freed when __ib_process_cq()
 	 * calls wr_cqe->done().
 	 */
-	if ((++queue->sig_count % 32) == 0 || flush)
+	if (nvme_rdma_queue_sig_limit(queue) || flush)
 		wr.send_flags |= IB_SEND_SIGNALED;
 
 	if (first)
-- 
1.8.3.1


