[PATCH v5 net-next 34/36] net/mlx5e: NVMEoTCP DDGST TX handle OOO packets

Boris Pismenny borisp at nvidia.com
Thu Jul 22 04:03:23 PDT 2021


From: Yoray Zack <yorayz at nvidia.com>

When the driver detects an out-of-order (OOO) NVMEoTCP Tx packet, it starts the OOO flow:

1. Get pdu_info from nvme-tcp.
2. Send an indication to the NIC (set PSV); the NIC will rebuild its parse machine.
3. Send the data the NIC needs for computing the DDGST using DUMP wqes.

Signed-off-by: Yoray Zack <yorayz at nvidia.com>
---
 .../net/ethernet/mellanox/mlx5/core/en/txrx.h |   2 +-
 .../mellanox/mlx5/core/en_accel/nvmeotcp.c    | 281 +++++++++++++++++-
 2 files changed, 280 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h b/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h
index c7f979dfdd69..1f4beaac488a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h
@@ -142,7 +142,7 @@ struct mlx5e_tx_wqe_info {
 	u8 num_wqebbs;
 	u8 num_dma;
 	u8 num_fifo_pkts;
-#ifdef CONFIG_MLX5_EN_TLS
+#if defined CONFIG_MLX5_EN_TLS || defined CONFIG_MLX5_EN_NVMEOTCP
 	struct page *resync_dump_frag_page;
 	enum mlx5e_dump_wqe_type type;
 #endif
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp.c
index d9f6125f5dbc..f8cba90679ea 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp.c
@@ -3,6 +3,7 @@
 
 #include <linux/netdevice.h>
 #include <linux/idr.h>
+#include <linux/blk-mq.h>
 #include <linux/nvme-tcp.h>
 #include "en_accel/nvmeotcp.h"
 #include "en_accel/nvmeotcp_utils.h"
@@ -267,6 +268,18 @@ fill_nvmeotcp_progress_params(struct mlx5e_nvmeotcp_queue *queue,
 		MLX5_SET(nvmeotcp_progress_params, ctx, offloading_state, 0);
 }
 
+struct mlx5e_dump_wqe {	/* layout of the DUMP WQE built by nvmeotcp_post_resync_dump() */
+	struct mlx5_wqe_ctrl_seg ctrl;	/* opcode/index, SQ number and DS count, TIS number, fence mode */
+	struct mlx5_wqe_data_seg data;	/* DMA address, lkey and byte count of the dumped fragment */
+};
+
+#define MLX5E_NVME_DUMP_WQEBBS\
+	(DIV_ROUND_UP(sizeof(struct mlx5e_dump_wqe), MLX5_SEND_WQE_BB)) /* DUMP WQE size in MLX5_SEND_WQE_BB units, rounded up */
+
+#define MLX5E_NVME_FETCH_DUMP_WQE(sq, pi) \
+	((struct mlx5e_dump_wqe *)\
+	 mlx5e_fetch_wqe(&(sq)->wq, pi, sizeof(struct mlx5e_dump_wqe))) /* mlx5e_fetch_wqe() result for index pi of sq->wq, typed as a DUMP WQE */
+
 static void nvme_tx_fill_wi(struct mlx5e_txqsq *sq,
 			    u16 pi, u8 num_wqebbs, u32 num_bytes,
 			    struct page *page, enum mlx5e_dump_wqe_type type)
@@ -276,9 +289,65 @@ static void nvme_tx_fill_wi(struct mlx5e_txqsq *sq,
 	*wi = (struct mlx5e_tx_wqe_info) {
 		.num_wqebbs = num_wqebbs,
 		.num_bytes  = num_bytes,
+		.resync_dump_frag_page = page,
+		.type = type,
 	};
 }
 
+static void mlx5e_nvmeotcp_tx_post_fence_nop(struct mlx5e_txqsq *sq)
+{	/* Record a wqe-info entry for the current slot of @sq, then post a fenced NOP there. */
+	struct mlx5_wq_cyc *wq = &sq->wq;
+	u16 pi = mlx5_wq_cyc_ctr2ix(wq, sq->pc);	/* slot index derived from the current producer counter */
+
+	nvme_tx_fill_wi(sq, pi, 1, 0, NULL, MLX5E_DUMP_WQE_NVMEOTCP);	/* 1 WQEBB, 0 bytes, no page */
+
+	mlx5e_post_nop_fence(wq, sq->sqn, &sq->pc);	/* is handed &sq->pc (presumably advances it), hence pi is read above */
+}
+
+static int
+nvmeotcp_post_resync_dump(struct mlx5e_txqsq *sq, skb_frag_t *frag,
+			  u32 tisn, bool first, enum mlx5e_dump_wqe_type type)
+{	/* Build and post one DUMP WQE covering @frag on @sq; returns 0, or -ENOMEM if the DMA mapping fails. */
+	struct mlx5_wqe_ctrl_seg *cseg;
+	struct mlx5_wqe_data_seg *dseg;
+	struct mlx5e_dump_wqe *wqe;
+	dma_addr_t dma_addr;
+	u16 ds_cnt;
+	int fsz;
+	u16 pi;
+
+	BUILD_BUG_ON(MLX5E_NVME_DUMP_WQEBBS != 1);	/* the code below advances sq->pc by exactly one WQEBB */
+	pi = mlx5_wq_cyc_ctr2ix(&sq->wq, sq->pc);
+	wqe = MLX5E_NVME_FETCH_DUMP_WQE(sq, pi);
+
+	ds_cnt = sizeof(*wqe) / MLX5_SEND_WQE_DS;	/* WQE size in MLX5_SEND_WQE_DS units */
+
+	cseg = &wqe->ctrl;
+	dseg = &wqe->data;
+
+	cseg->opmod_idx_opcode = cpu_to_be32((sq->pc << 8)  | MLX5_OPCODE_DUMP);
+	cseg->qpn_ds           = cpu_to_be32((sq->sqn << 8) | ds_cnt);
+	cseg->tis_tir_num      = cpu_to_be32(tisn << 8);
+	cseg->fm_ce_se         = first ? MLX5_FENCE_MODE_INITIATOR_SMALL : 0;	/* small fence only when @first is set */
+
+	fsz = skb_frag_size(frag);
+	dma_addr = skb_frag_dma_map(sq->pdev, frag, 0, fsz,
+				    DMA_TO_DEVICE);
+	if (unlikely(dma_mapping_error(sq->pdev, dma_addr)))
+		return -ENOMEM;	/* sq->pc has not been advanced, so the partly filled WQE is not posted */
+
+	dseg->addr       = cpu_to_be64(dma_addr);
+	dseg->lkey       = sq->mkey_be;
+	dseg->byte_count = cpu_to_be32(fsz);
+	mlx5e_dma_push(sq, dma_addr, fsz, MLX5E_DMA_MAP_PAGE);	/* queue the mapping on @sq; presumably unmapped at completion - TODO confirm */
+
+	nvme_tx_fill_wi(sq, pi, MLX5E_NVME_DUMP_WQEBBS,
+			fsz, skb_frag_page(frag), type);	/* remember page and type for this slot */
+	sq->pc +=  MLX5E_NVME_DUMP_WQEBBS;
+	mlx5e_notify_hw(&sq->wq, sq->pc, sq->uar_map, &wqe->ctrl);	/* HW is notified once per DUMP WQE */
+	return 0;
+}
+
 void
 build_nvmeotcp_progress_params(struct mlx5e_nvmeotcp_queue *queue,
 			       struct mlx5e_set_nvmeotcp_progress_params_wqe *wqe,
@@ -295,6 +364,7 @@ build_nvmeotcp_progress_params(struct mlx5e_nvmeotcp_queue *queue,
 					     MLX5_OPCODE_SET_PSV | (opc_mod << 24));
 	cseg->qpn_ds = cpu_to_be32((sqn << MLX5_WQE_CTRL_QPN_SHIFT) |
 				   PROGRESS_PARAMS_DS_CNT);
+	cseg->fm_ce_se         = resync ? MLX5_FENCE_MODE_INITIATOR_SMALL : 0;
 	fill_nvmeotcp_progress_params(queue, &wqe->params, seq, !is_rx);
 }
 
@@ -1160,6 +1230,202 @@ void mlx5e_nvmeotcp_tx_post_param_wqes(struct mlx5e_txqsq *sq, struct sock *sk,
 	mlx5e_nvmeotcp_tx_post_progress_params(ctx, sq, tcp_sk(sk)->copied_seq, false);
 }
 
+enum mlx5e_nvmeotcp_resync_retval {	/* outcome of mlx5e_nvmeotcp_handle_ooo_skb() */
+	MLX5E_NVMEOTCP_RESYNC_DONE,	/* the resync WQEs were posted */
+	MLX5E_NVMEOTCP_RESYNC_FAIL,	/* posting the DUMP WQEs failed */
+	MLX5E_NVMEOTCP_RESYNC_SKIP,	/* no pdu_info was found for the sequence number */
+};
+
+static
+int mlx5e_nvmeotcp_resync_frag(struct mlx5e_nvmeotcp_queue *queue,
+			       struct mlx5e_txqsq *sq, struct sk_buff *skb,
+			       int i, skb_frag_t *frag, u32  seq)
+{	/* Dump @frag in chunks of at most sq->hw_mtu bytes, one DUMP WQE each; 0 on success, -1 on failure. @skb and @seq are unused. */
+	unsigned int orig_fsz, frag_offset = 0, n = 0;	/* n counts the chunks attempted so far */
+	enum mlx5e_dump_wqe_type type = MLX5E_DUMP_WQE_NVMEOTCP;
+
+	orig_fsz = skb_frag_size(frag);
+
+	do {
+		bool fence = !(i || frag_offset);	/* true only for the first chunk of a frag passed with i == 0 */
+		unsigned int fsz;
+
+		n++;
+		fsz = min_t(unsigned int, sq->hw_mtu, orig_fsz - frag_offset);
+		skb_frag_size_set(frag, fsz);	/* @frag is edited in place: size here, offset below */
+		if (nvmeotcp_post_resync_dump(sq, frag, queue->tisn, fence, type)) {
+			page_ref_add(compound_head(skb_frag_page(frag)), n - 1);	/* one ref per chunk already posted */
+			return -1;
+		}
+
+		skb_frag_off_add(frag, fsz);
+		frag_offset += fsz;
+	} while (frag_offset < orig_fsz);
+
+	page_ref_add(compound_head(skb_frag_page(frag)), n);	/* one ref per posted chunk; presumably dropped at WQE completion - TODO confirm */
+
+	return 0;
+}
+
+static int mlx5e_nvmeotcp_resync_hdr(struct mlx5e_nvmeotcp_queue *queue,
+				     struct mlx5e_txqsq *sq, u32 seq,
+				     struct sk_buff *skb, int remaining,
+				     struct ulp_ddp_pdu_info *pdu_info)
+{	/* Dump the first min(@remaining, pdu_info->hdr_len) bytes of the PDU header; returns what resync_frag returns. */
+	skb_frag_t pdu_frag;	/* temporary frag describing the header buffer */
+	int size = min_t(int, remaining, pdu_info->hdr_len);
+
+	__skb_frag_set_page(&pdu_frag, virt_to_page(pdu_info->hdr));	/* assumes hdr is a directly mapped kernel address - TODO confirm */
+	skb_frag_off_set(&pdu_frag, offset_in_page(pdu_info->hdr));
+	skb_frag_size_set(&pdu_frag, size);
+
+	return mlx5e_nvmeotcp_resync_frag(queue, sq, skb, 0, &pdu_frag, seq);	/* i == 0: the first chunk is fenced */
+}
+
+static void mlx5e_nvmeotcp_init_iter(struct iov_iter *iter, struct bio *bio)
+{	/* Set up @iter as a bvec iterator over @bio's segments, sized by bio->bi_iter.bi_size. */
+	unsigned int bio_size;
+	struct bio_vec *vec;
+	int nsegs;
+
+	vec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);	/* bvec selected by bio->bi_iter */
+	nsegs = bio_segments(bio);
+	bio_size = bio->bi_iter.bi_size;
+	iov_iter_bvec(iter, 1, vec, nsegs, bio_size);	/* NOTE(review): direction is the literal 1, presumably WRITE - confirm */
+	iter->iov_offset = 0;	/* NOTE(review): bio->bi_iter.bi_bvec_done is not carried over - confirm this is intended */
+}
+
+static int mlx5e_nvmeotcp_resync_data(struct mlx5e_nvmeotcp_queue *queue,
+				      struct mlx5e_txqsq *sq, u32 seq,
+				      struct sk_buff *skb, int remaining,
+				      struct ulp_ddp_pdu_info *pdu_info)
+{	/* Dump up to @remaining bytes of PDU payload, walking the bios of pdu_info->req; 0 on success, -1 on failure. */
+	struct request *req = pdu_info->req;
+	struct bio *bio = req->bio;
+	struct iov_iter iter;
+	int data_remaining;	/* payload bytes still to dump */
+	size_t data_sent = 0;	/* payload bytes dumped in completed iterations */
+
+	mlx5e_nvmeotcp_init_iter(&iter, bio);
+
+	data_remaining = min_t(int, remaining, pdu_info->data_len);
+
+	while (data_remaining > 0) {
+		skb_frag_t frag;
+		size_t size = min_t(size_t,
+				    iter.bvec->bv_len - iter.iov_offset
+				    , data_remaining);	/* rest of the current bvec, capped by data_remaining */
+
+		__skb_frag_set_page(&frag, iter.bvec->bv_page);
+		skb_frag_off_set(&frag, iter.bvec->bv_offset + iter.iov_offset);
+		skb_frag_size_set(&frag, size);
+		data_remaining -= size;
+
+		if (mlx5e_nvmeotcp_resync_frag(queue, sq, skb, 1, &frag, seq))	/* i == 1: data chunks are never fenced */
+			goto err_out;
+
+		if (!data_remaining)
+			break;
+
+		data_sent += size;
+		iov_iter_advance(&iter, size);
+		if (!iov_iter_count(&iter) && data_sent < pdu_info->data_len) {
+			bio = bio->bi_next;	/* current bio consumed; NOTE(review): bi_next is not NULL-checked */
+			mlx5e_nvmeotcp_init_iter(&iter, bio);
+		}
+	}
+
+	return 0;
+err_out:
+	return -1;
+}
+
+static int mlx5e_nvmeotcp_resync_crc(struct mlx5e_nvmeotcp_queue *queue,
+				     struct mlx5e_txqsq *sq, u32 seq,
+				     struct sk_buff *skb, int remaining,
+				     struct ulp_ddp_pdu_info *pdu_info)
+{	/* Dump @remaining placeholder (all-zero) digest bytes to the NIC; returns what resync_frag returns. */
+	skb_frag_t crc_frag;
+
+	/* The mapping is only queued and outlives this call, so the zeros must not live on our stack. */
+	__skb_frag_set_page(&crc_frag, ZERO_PAGE(0));
+	skb_frag_off_set(&crc_frag, 0);
+	skb_frag_size_set(&crc_frag, remaining);
+	return mlx5e_nvmeotcp_resync_frag(queue, sq, skb, 1, &crc_frag, seq);	/* i == 1: not fenced */
+}
+
+/* For a PDU (capsule) that spans   [start_seq ------ seq ---- end_seq],
+ * only the part before seq is sent to HW: [start_seq, seq). */
+static
+bool mlx5e_nvmeotcp_resync_cap(struct mlx5e_nvmeotcp_queue *queue,
+			       struct mlx5e_txqsq *sq, struct sk_buff *skb,
+			       struct ulp_ddp_pdu_info *pdu_info,
+			       u32  seq)
+{	/* Returns true when every needed part (header, data, digest) was posted, false on the first failure. */
+	int remaining = seq - pdu_info->start_seq;	/* PDU bytes that precede @seq */
+	int ret;
+
+	ret = mlx5e_nvmeotcp_resync_hdr(queue, sq, seq, skb, remaining,
+					pdu_info);
+	if (unlikely(ret))
+		goto err_out;
+
+	remaining -= pdu_info->hdr_len;
+	if (remaining <= 0)	/* no bytes beyond the header precede @seq */
+		goto out;
+
+	ret = mlx5e_nvmeotcp_resync_data(queue, sq, seq, skb, remaining,
+					 pdu_info);
+	if (unlikely(ret))
+		goto err_out;
+
+	remaining -= pdu_info->data_len;
+	if (remaining <= 0)	/* no bytes beyond the data precede @seq */
+		goto out;
+
+	ret = mlx5e_nvmeotcp_resync_crc(queue, sq, seq, skb, remaining,
+					pdu_info);	/* whatever is left follows the data (digest bytes) */
+	if (unlikely(ret))
+		goto err_out;
+out:
+	return true;
+err_out:
+	return false;
+}
+
+static enum mlx5e_nvmeotcp_resync_retval
+mlx5e_nvmeotcp_handle_ooo_skb(struct mlx5e_nvmeotcp_queue *queue,
+			      struct mlx5e_txqsq *sq, struct sk_buff *skb,
+			      u32  seq, int datalen)
+{	/* Resync the NIC for an skb whose TCP @seq is not the expected one; @datalen is unused. */
+	struct ulp_ddp_pdu_info *pdu_info = NULL;
+
+	/* look up the pdu_info that includes this TCP sequence number */
+	pdu_info = ulp_ddp_get_pdu_info(skb->sk, seq);
+
+	if (!pdu_info)
+		return MLX5E_NVMEOTCP_RESYNC_SKIP;
+
+	/* Update the NIC about the resync so that it rebuilds its parse
+	 * machine: send the PSV with a small fence.
+	 */
+	mlx5e_nvmeotcp_tx_post_progress_params(queue, sq, pdu_info->start_seq, true);
+
+	if (seq == pdu_info->start_seq || seq == pdu_info->end_seq) {	/* @seq is on a PDU boundary: only a fenced NOP is posted */
+		mlx5e_nvmeotcp_tx_post_fence_nop(sq);
+		return MLX5E_NVMEOTCP_RESYNC_DONE;
+	}
+
+	/* Post DUMP WQEs:
+	 * hand the NIC the bytes of this PDU that precede @seq, read from
+	 * the buffers saved in pdu_info.
+	 */
+	if (unlikely(!mlx5e_nvmeotcp_resync_cap(queue, sq, skb, pdu_info, seq)))
+		return MLX5E_NVMEOTCP_RESYNC_FAIL;
+
+	return MLX5E_NVMEOTCP_RESYNC_DONE;
+}
+
 static inline bool mlx5e_is_sk_tx_device_offloaded(struct sock *sk)
 {
 	/* Return True after smp_store_release assing in
@@ -1199,8 +1465,19 @@ bool mlx5e_nvmeotcp_handle_tx_skb(struct net_device *netdev,
 		mlx5e_nvmeotcp_tx_post_param_wqes(sq, skb->sk, ctx);
 
 	seq = ntohl(tcp_hdr(skb)->seq);
-	if (unlikely(ctx->ulp_ddp_ctx.expected_seq != seq))
-		goto err_out;
+	if (unlikely(ctx->ulp_ddp_ctx.expected_seq != seq)) {
+		enum mlx5e_nvmeotcp_resync_retval ret =
+			mlx5e_nvmeotcp_handle_ooo_skb(ctx, sq, skb,
+						      seq, datalen);
+		switch (ret) {
+		case MLX5E_NVMEOTCP_RESYNC_DONE:
+			break;
+		case MLX5E_NVMEOTCP_RESYNC_SKIP:
+			goto out;
+		case MLX5E_NVMEOTCP_RESYNC_FAIL:
+			goto err_out;
+		}
+	}
 
 	*nvmeotcp_tisn = ctx->tisn;
 	ctx->ulp_ddp_ctx.expected_seq = seq + datalen;
-- 
2.24.1




More information about the Linux-nvme mailing list