[PATCH v3] nvme-rdma: support devices with queue size < 32
Marta Rybczynska
mrybczyn at kalray.eu
Thu Apr 20 02:43:34 PDT 2017
In the case of a small NVMe-oF queue size (<32) we may enter
a deadlock: IB send completions are only signalled once every 32
requests, so with fewer than 32 queue slots the send queue fills
up before any completion is ever signalled.
The error is seen as (using mlx5):
[ 2048.693355] mlx5_0:mlx5_ib_post_send:3765:(pid 7273):
[ 2048.693360] nvme nvme1: nvme_rdma_post_send failed with error code -12
This patch changes the signalling so that the interval depends
on the queue depth instead of a fixed constant. The magic define
has been removed completely.
Signed-off-by: Marta Rybczynska <marta.rybczynska at kalray.eu>
Signed-off-by: Samuel Jones <sjones at kalray.eu>
---
Changes in v3:
* avoid division in the fast path
* reverse sig_count logic to simplify the code: it now counts down
from the queue depth/2 to 0
* change sig_count to int to avoid overflows for big queues
Changes in v2:
* signal by queue size/2, remove hardcoded 32
* support queue depth of 1
drivers/nvme/host/rdma.c | 28 +++++++++++++++++++++++-----
1 file changed, 23 insertions(+), 5 deletions(-)
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 3d25add..ee6f747 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -88,7 +88,7 @@ enum nvme_rdma_queue_flags {
struct nvme_rdma_queue {
struct nvme_rdma_qe *rsp_ring;
- u8 sig_count;
+ int sig_count;
int queue_size;
size_t cmnd_capsule_len;
struct nvme_rdma_ctrl *ctrl;
@@ -251,6 +251,15 @@ static int nvme_rdma_wait_for_cm(struct nvme_rdma_queue *queue)
return queue->cm_error;
}
+static inline int nvme_rdma_init_sig_count(int queue_size)
+{
+ /* We signal completion every queue depth/2 and also
+ * handle the case of possible device with queue_depth=1,
+ * where we would need to signal every message.
+ */
+ return max(queue_size / 2, 1);
+}
+
static int nvme_rdma_create_qp(struct nvme_rdma_queue *queue, const int factor)
{
struct nvme_rdma_device *dev = queue->device;
@@ -556,6 +565,8 @@ static int nvme_rdma_init_queue(struct nvme_rdma_ctrl *ctrl,
queue->queue_size = queue_size;
+ queue->sig_count = nvme_rdma_init_sig_count(queue_size);
+
queue->cm_id = rdma_create_id(&init_net, nvme_rdma_cm_handler, queue,
RDMA_PS_TCP, IB_QPT_RC);
if (IS_ERR(queue->cm_id)) {
@@ -1011,6 +1022,16 @@ static void nvme_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc)
nvme_rdma_wr_error(cq, wc, "SEND");
}
+static inline int nvme_rdma_queue_sig_limit(struct nvme_rdma_queue *queue)
+{
+ queue->sig_count--;
+ if (queue->sig_count == 0) {
+ queue->sig_count = nvme_rdma_init_sig_count(queue->queue_size);
+ return 1;
+ }
+ return 0;
+}
+
static int nvme_rdma_post_send(struct nvme_rdma_queue *queue,
struct nvme_rdma_qe *qe, struct ib_sge *sge, u32 num_sge,
struct ib_send_wr *first, bool flush)
@@ -1038,9 +1059,6 @@ static int nvme_rdma_post_send(struct nvme_rdma_queue *queue,
* Would have been way to obvious to handle this in hardware or
* at least the RDMA stack..
*
- * This messy and racy code sniplet is copy and pasted from the iSER
- * initiator, and the magic '32' comes from there as well.
- *
* Always signal the flushes. The magic request used for the flush
* sequencer is not allocated in our driver's tagset and it's
* triggered to be freed by blk_cleanup_queue(). So we need to
@@ -1048,7 +1066,7 @@ static int nvme_rdma_post_send(struct nvme_rdma_queue *queue,
* embeded in request's payload, is not freed when __ib_process_cq()
* calls wr_cqe->done().
*/
- if ((++queue->sig_count % 32) == 0 || flush)
+ if (nvme_rdma_queue_sig_limit(queue) || flush)
wr.send_flags |= IB_SEND_SIGNALED;
if (first)
--
1.8.3.1
More information about the Linux-nvme
mailing list