[PATCH v5 net-next 34/36] net/mlx5e: NVMEoTCP DDGST TX handle OOO packets
Boris Pismenny
borisp at nvidia.com
Thu Jul 22 04:03:23 PDT 2021
From: Yoray Zack <yorayz at nvidia.com>
When the driver detects an out-of-order (OOO) NVMEoTCP Tx packet, it starts the OOO flow:
1. Get the pdu_info from nvme-tcp.
2. Send an indication to the NIC (set PSV) - the NIC will rebuild its parse machine.
3. Send the data the NIC needs for computing the DDGST using DUMP WQEs.
Signed-off-by: Yoray Zack <yorayz at nvidia.com>
---
.../net/ethernet/mellanox/mlx5/core/en/txrx.h | 2 +-
.../mellanox/mlx5/core/en_accel/nvmeotcp.c | 281 +++++++++++++++++-
2 files changed, 280 insertions(+), 3 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h b/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h
index c7f979dfdd69..1f4beaac488a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h
@@ -142,7 +142,7 @@ struct mlx5e_tx_wqe_info {
u8 num_wqebbs;
u8 num_dma;
u8 num_fifo_pkts;
-#ifdef CONFIG_MLX5_EN_TLS
+#if defined CONFIG_MLX5_EN_TLS || defined CONFIG_MLX5_EN_NVMEOTCP
struct page *resync_dump_frag_page;
enum mlx5e_dump_wqe_type type;
#endif
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp.c
index d9f6125f5dbc..f8cba90679ea 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp.c
@@ -3,6 +3,7 @@
#include <linux/netdevice.h>
#include <linux/idr.h>
+#include <linux/blk-mq.h>
#include <linux/nvme-tcp.h>
#include "en_accel/nvmeotcp.h"
#include "en_accel/nvmeotcp_utils.h"
@@ -267,6 +268,18 @@ fill_nvmeotcp_progress_params(struct mlx5e_nvmeotcp_queue *queue,
MLX5_SET(nvmeotcp_progress_params, ctx, offloading_state, 0);
}
+/* Layout of a DUMP WQE: one control segment followed by one data segment. */
+struct mlx5e_dump_wqe {
+ struct mlx5_wqe_ctrl_seg ctrl;
+ struct mlx5_wqe_data_seg data;
+};
+
+/* Size of one DUMP WQE, rounded up to whole MLX5_SEND_WQE_BB units. */
+#define MLX5E_NVME_DUMP_WQEBBS\
+ (DIV_ROUND_UP(sizeof(struct mlx5e_dump_wqe), MLX5_SEND_WQE_BB))
+
+/*
+ * Call mlx5e_fetch_wqe() on the work queue of @sq for index @pi and return
+ * the result typed as a struct mlx5e_dump_wqe pointer.
+ */
+#define MLX5E_NVME_FETCH_DUMP_WQE(sq, pi) \
+ ((struct mlx5e_dump_wqe *)\
+ mlx5e_fetch_wqe(&(sq)->wq, pi, sizeof(struct mlx5e_dump_wqe)))
+
static void nvme_tx_fill_wi(struct mlx5e_txqsq *sq,
u16 pi, u8 num_wqebbs, u32 num_bytes,
struct page *page, enum mlx5e_dump_wqe_type type)
@@ -276,9 +289,65 @@ static void nvme_tx_fill_wi(struct mlx5e_txqsq *sq,
*wi = (struct mlx5e_tx_wqe_info) {
.num_wqebbs = num_wqebbs,
.num_bytes = num_bytes,
+ .resync_dump_frag_page = page,
+ .type = type,
};
}
+/*
+ * Post a fence NOP on @sq.
+ *
+ * The WQE-info entry for the slot at the current producer counter is filled
+ * first (one WQEBB, zero bytes, no page, NVMEoTCP dump type), then the NOP
+ * itself is posted through mlx5e_post_nop_fence().
+ */
+static void mlx5e_nvmeotcp_tx_post_fence_nop(struct mlx5e_txqsq *sq)
+{
+	u16 slot = mlx5_wq_cyc_ctr2ix(&sq->wq, sq->pc);
+
+	nvme_tx_fill_wi(sq, slot, 1, 0, NULL, MLX5E_DUMP_WQE_NVMEOTCP);
+	mlx5e_post_nop_fence(&sq->wq, sq->sqn, &sq->pc);
+}
+
+/*
+ * Build and post a single DUMP WQE on @sq that points at the memory of @frag.
+ *
+ * @sq:    send queue the WQE is posted on
+ * @frag:  fragment (page, offset, size) to DMA-map and reference from the WQE
+ * @tisn:  TIS number written into the control segment
+ * @first: when true, the control segment requests
+ *         MLX5_FENCE_MODE_INITIATOR_SMALL; otherwise no fence bits are set
+ * @type:  dump type recorded in the WQE-info entry for this slot
+ *
+ * Returns 0 on success, or -ENOMEM if the DMA mapping fails. On failure the
+ * producer counter is not advanced and nothing is pushed to the DMA fifo.
+ */
+static int
+nvmeotcp_post_resync_dump(struct mlx5e_txqsq *sq, skb_frag_t *frag,
+ u32 tisn, bool first, enum mlx5e_dump_wqe_type type)
+{
+ struct mlx5_wqe_ctrl_seg *cseg;
+ struct mlx5_wqe_data_seg *dseg;
+ struct mlx5e_dump_wqe *wqe;
+ dma_addr_t dma_addr;
+ u16 ds_cnt;
+ int fsz;
+ u16 pi;
+
+ /* The producer counter below is advanced assuming one WQEBB per DUMP. */
+ BUILD_BUG_ON(MLX5E_NVME_DUMP_WQEBBS != 1);
+ pi = mlx5_wq_cyc_ctr2ix(&sq->wq, sq->pc);
+ wqe = MLX5E_NVME_FETCH_DUMP_WQE(sq, pi);
+
+ /* WQE size expressed in MLX5_SEND_WQE_DS units. */
+ ds_cnt = sizeof(*wqe) / MLX5_SEND_WQE_DS;
+
+ cseg = &wqe->ctrl;
+ dseg = &wqe->data;
+
+ cseg->opmod_idx_opcode = cpu_to_be32((sq->pc << 8) | MLX5_OPCODE_DUMP);
+ cseg->qpn_ds = cpu_to_be32((sq->sqn << 8) | ds_cnt);
+ cseg->tis_tir_num = cpu_to_be32(tisn << 8);
+ cseg->fm_ce_se = first ? MLX5_FENCE_MODE_INITIATOR_SMALL : 0;
+
+ /* Map the whole fragment for device reads. */
+ fsz = skb_frag_size(frag);
+ dma_addr = skb_frag_dma_map(sq->pdev, frag, 0, fsz,
+ DMA_TO_DEVICE);
+ if (unlikely(dma_mapping_error(sq->pdev, dma_addr)))
+ return -ENOMEM;
+
+ dseg->addr = cpu_to_be64(dma_addr);
+ dseg->lkey = sq->mkey_be;
+ dseg->byte_count = cpu_to_be32(fsz);
+ /* Record the mapping in the SQ's DMA fifo (presumably unmapped on completion). */
+ mlx5e_dma_push(sq, dma_addr, fsz, MLX5E_DMA_MAP_PAGE);
+
+ nvme_tx_fill_wi(sq, pi, MLX5E_NVME_DUMP_WQEBBS,
+ fsz, skb_frag_page(frag), type);
+ sq->pc += MLX5E_NVME_DUMP_WQEBBS;
+ /* The hardware is notified after every single DUMP WQE. */
+ mlx5e_notify_hw(&sq->wq, sq->pc, sq->uar_map, &wqe->ctrl);
+ return 0;
+}
+
void
build_nvmeotcp_progress_params(struct mlx5e_nvmeotcp_queue *queue,
struct mlx5e_set_nvmeotcp_progress_params_wqe *wqe,
@@ -295,6 +364,7 @@ build_nvmeotcp_progress_params(struct mlx5e_nvmeotcp_queue *queue,
MLX5_OPCODE_SET_PSV | (opc_mod << 24));
cseg->qpn_ds = cpu_to_be32((sqn << MLX5_WQE_CTRL_QPN_SHIFT) |
PROGRESS_PARAMS_DS_CNT);
+ cseg->fm_ce_se = resync ? MLX5_FENCE_MODE_INITIATOR_SMALL : 0;
fill_nvmeotcp_progress_params(queue, &wqe->params, seq, !is_rx);
}
@@ -1160,6 +1230,202 @@ void mlx5e_nvmeotcp_tx_post_param_wqes(struct mlx5e_txqsq *sq, struct sock *sk,
mlx5e_nvmeotcp_tx_post_progress_params(ctx, sq, tcp_sk(sk)->copied_seq, false);
}
+/* Outcome of handling an out-of-order Tx skb. */
+enum mlx5e_nvmeotcp_resync_retval {
+ /* resync WQEs were posted; the caller continues with the offload */
+ MLX5E_NVMEOTCP_RESYNC_DONE,
+ /* posting the resync data failed; the caller takes its error path */
+ MLX5E_NVMEOTCP_RESYNC_FAIL,
+ /* no pdu_info was found for the sequence number; nothing was posted */
+ MLX5E_NVMEOTCP_RESYNC_SKIP,
+};
+
+/*
+ * Post the memory described by @frag to the NIC as one or more DUMP WQEs,
+ * each carrying at most sq->hw_mtu bytes.
+ *
+ * @i:    index of the fragment within the resync; only the very first chunk
+ *        of fragment 0 is posted with the fence flag set
+ * @frag: modified in place - its size and offset are rewritten per chunk
+ * @skb, @seq: not used by this function
+ *
+ * For every DUMP WQE successfully posted, one extra reference is taken on
+ * the compound head of the fragment's page (presumably dropped when the WQE
+ * completes - TODO confirm against the completion handler).
+ *
+ * Returns 0 on success, -1 if posting a chunk failed.
+ */
+static
+int mlx5e_nvmeotcp_resync_frag(struct mlx5e_nvmeotcp_queue *queue,
+ struct mlx5e_txqsq *sq, struct sk_buff *skb,
+ int i, skb_frag_t *frag, u32 seq)
+{
+ unsigned int orig_fsz, frag_offset = 0, n = 0;
+ enum mlx5e_dump_wqe_type type = MLX5E_DUMP_WQE_NVMEOTCP;
+
+ orig_fsz = skb_frag_size(frag);
+
+ /* do/while: a zero-sized fragment still posts one (empty) DUMP WQE */
+ do {
+ /* fence only when this is fragment 0 and its first chunk */
+ bool fence = !(i || frag_offset);
+ unsigned int fsz;
+
+ n++;
+ fsz = min_t(unsigned int, sq->hw_mtu, orig_fsz - frag_offset);
+ skb_frag_size_set(frag, fsz);
+ if (nvmeotcp_post_resync_dump(sq, frag, queue->tisn, fence, type)) {
+ /* account only for the n - 1 chunks that were posted */
+ page_ref_add(compound_head(skb_frag_page(frag)), n - 1);
+ return -1;
+ }
+
+ /* advance the fragment window to the next chunk */
+ skb_frag_off_add(frag, fsz);
+ frag_offset += fsz;
+ } while (frag_offset < orig_fsz);
+
+ page_ref_add(compound_head(skb_frag_page(frag)), n);
+
+ return 0;
+}
+
+/*
+ * Post the PDU header (or the first @remaining bytes of it, whichever is
+ * smaller) to the NIC as fragment index 0 of the resync.
+ *
+ * Returns the result of mlx5e_nvmeotcp_resync_frag().
+ */
+static int mlx5e_nvmeotcp_resync_hdr(struct mlx5e_nvmeotcp_queue *queue,
+				     struct mlx5e_txqsq *sq, u32 seq,
+				     struct sk_buff *skb, int remaining,
+				     struct ulp_ddp_pdu_info *pdu_info)
+{
+	int hdr_bytes = min_t(int, remaining, pdu_info->hdr_len);
+	skb_frag_t hdr_frag;
+
+	/* Describe the header buffer as a page/offset/size fragment. */
+	skb_frag_size_set(&hdr_frag, hdr_bytes);
+	skb_frag_off_set(&hdr_frag, offset_in_page(pdu_info->hdr));
+	__skb_frag_set_page(&hdr_frag, virt_to_page(pdu_info->hdr));
+
+	return mlx5e_nvmeotcp_resync_frag(queue, sq, skb, 0, &hdr_frag, seq);
+}
+
+/*
+ * Initialise @iter as a bvec iterator over the data still described by
+ * @bio's iterator (bio->bi_iter).
+ *
+ * The iterator starts at the bvec selected by bio->bi_iter and covers
+ * bi_iter.bi_size bytes. The data is a source for transmission, hence the
+ * WRITE direction. The starting offset inside the first bvec is taken from
+ * bi_iter.bi_bvec_done so that a bio beginning part-way into a bvec is read
+ * from the right byte rather than from the start of that bvec.
+ */
+static void mlx5e_nvmeotcp_init_iter(struct iov_iter *iter, struct bio *bio)
+{
+	struct bio_vec *vec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
+	unsigned int bio_size = bio->bi_iter.bi_size;
+	int nsegs = bio_segments(bio);
+
+	iov_iter_bvec(iter, WRITE, vec, nsegs, bio_size);
+	iter->iov_offset = bio->bi_iter.bi_bvec_done;
+}
+
+/*
+ * Post the PDU payload that precedes the out-of-order sequence number to the
+ * NIC, walking the bio chain of the request attached to @pdu_info.
+ *
+ * @remaining: number of bytes between the end of the PDU header and the
+ *             resync point; at most pdu_info->data_len bytes are posted
+ *
+ * Each bvec (or the part of it that is still needed) is posted through
+ * mlx5e_nvmeotcp_resync_frag() as a non-first fragment (index 1).
+ *
+ * Returns 0 on success (including when there is nothing to post), -1 if a
+ * fragment could not be posted or the bio chain ends before the expected
+ * amount of data was covered.
+ */
+static int mlx5e_nvmeotcp_resync_data(struct mlx5e_nvmeotcp_queue *queue,
+				      struct mlx5e_txqsq *sq, u32 seq,
+				      struct sk_buff *skb, int remaining,
+				      struct ulp_ddp_pdu_info *pdu_info)
+{
+	struct request *req = pdu_info->req;
+	struct iov_iter iter;
+	size_t data_sent = 0;
+	int data_remaining;
+	struct bio *bio;
+
+	data_remaining = min_t(int, remaining, pdu_info->data_len);
+	/* Nothing to send: do not touch the request or its bios at all. */
+	if (data_remaining <= 0)
+		return 0;
+
+	bio = req ? req->bio : NULL;
+	if (unlikely(!bio))
+		return -1;
+
+	mlx5e_nvmeotcp_init_iter(&iter, bio);
+
+	while (data_remaining > 0) {
+		skb_frag_t frag;
+		/* rest of the current bvec, capped by what is still needed */
+		size_t size = min_t(size_t,
+				    iter.bvec->bv_len - iter.iov_offset,
+				    data_remaining);
+
+		__skb_frag_set_page(&frag, iter.bvec->bv_page);
+		skb_frag_off_set(&frag, iter.bvec->bv_offset + iter.iov_offset);
+		skb_frag_size_set(&frag, size);
+		data_remaining -= size;
+
+		if (mlx5e_nvmeotcp_resync_frag(queue, sq, skb, 1, &frag, seq))
+			return -1;
+
+		if (!data_remaining)
+			break;
+
+		data_sent += size;
+		iov_iter_advance(&iter, size);
+		/* Current bio exhausted but more data is expected: next bio. */
+		if (!iov_iter_count(&iter) && data_sent < pdu_info->data_len) {
+			bio = bio->bi_next;
+			/* A short bio chain must not turn into a NULL deref. */
+			if (unlikely(!bio))
+				return -1;
+			mlx5e_nvmeotcp_init_iter(&iter, bio);
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Post @remaining bytes of an all-zero placeholder data digest to the NIC.
+ *
+ * The placeholder is taken from the kernel zero page. A local variable must
+ * not be used here: stack memory is not valid for virt_to_page()/DMA
+ * mapping, and it would be out of scope before the hardware reads it.
+ *
+ * Returns the result of mlx5e_nvmeotcp_resync_frag().
+ */
+static int mlx5e_nvmeotcp_resync_crc(struct mlx5e_nvmeotcp_queue *queue,
+				     struct mlx5e_txqsq *sq, u32 seq,
+				     struct sk_buff *skb, int remaining,
+				     struct ulp_ddp_pdu_info *pdu_info)
+{
+	skb_frag_t crc_frag;
+
+	__skb_frag_set_page(&crc_frag, ZERO_PAGE(0));
+	skb_frag_off_set(&crc_frag, 0);
+	skb_frag_size_set(&crc_frag, remaining);
+	return mlx5e_nvmeotcp_resync_frag(queue, sq, skb, 1, &crc_frag, seq);
+}
+
+/*
+ * Replay to the NIC the part of a capsule that precedes @seq.
+ *
+ *   pdu_info maps:   [--------seq----]
+ *   sent to the HW:  [-------|
+ *
+ * The bytes between pdu_info->start_seq and @seq are posted in order:
+ * header, then data, then data digest, stopping as soon as the byte budget
+ * is used up.
+ *
+ * Returns true on success, false if any of the stages failed.
+ */
+static
+bool mlx5e_nvmeotcp_resync_cap(struct mlx5e_nvmeotcp_queue *queue,
+			       struct mlx5e_txqsq *sq, struct sk_buff *skb,
+			       struct ulp_ddp_pdu_info *pdu_info,
+			       u32 seq)
+{
+	int left = seq - pdu_info->start_seq;
+
+	if (unlikely(mlx5e_nvmeotcp_resync_hdr(queue, sq, seq, skb, left,
+					       pdu_info)))
+		return false;
+
+	left -= pdu_info->hdr_len;
+	if (left <= 0)
+		return true;
+
+	if (unlikely(mlx5e_nvmeotcp_resync_data(queue, sq, seq, skb, left,
+						pdu_info)))
+		return false;
+
+	left -= pdu_info->data_len;
+	if (left <= 0)
+		return true;
+
+	if (unlikely(mlx5e_nvmeotcp_resync_crc(queue, sq, seq, skb, left,
+					       pdu_info)))
+		return false;
+
+	return true;
+}
+
+/*
+ * Handle a Tx skb whose TCP sequence number @seq is out of order.
+ *
+ * Returns MLX5E_NVMEOTCP_RESYNC_SKIP when no PDU information is available
+ * for @seq, MLX5E_NVMEOTCP_RESYNC_FAIL when replaying the capsule prefix to
+ * the NIC failed, and MLX5E_NVMEOTCP_RESYNC_DONE otherwise.
+ * @datalen is not used by this function.
+ */
+static enum mlx5e_nvmeotcp_resync_retval
+mlx5e_nvmeotcp_handle_ooo_skb(struct mlx5e_nvmeotcp_queue *queue,
+			      struct mlx5e_txqsq *sq, struct sk_buff *skb,
+			      u32 seq, int datalen)
+{
+	struct ulp_ddp_pdu_info *pdu;
+	bool on_pdu_boundary;
+
+	/* Ask for the pdu_info that includes this TCP sequence number. */
+	pdu = ulp_ddp_get_pdu_info(skb->sk, seq);
+	if (!pdu)
+		return MLX5E_NVMEOTCP_RESYNC_SKIP;
+
+	/*
+	 * Update the NIC about the resync so that it rebuilds its parse
+	 * machine: send a PSV with a small fence.
+	 */
+	mlx5e_nvmeotcp_tx_post_progress_params(queue, sq, pdu->start_seq, true);
+
+	/* On a PDU boundary there is no data to replay; a fence NOP suffices. */
+	on_pdu_boundary = seq == pdu->start_seq || seq == pdu->end_seq;
+	if (on_pdu_boundary) {
+		mlx5e_nvmeotcp_tx_post_fence_nop(sq);
+		return MLX5E_NVMEOTCP_RESYNC_DONE;
+	}
+
+	/*
+	 * Post DUMP WQEs: transfer the data the NIC needs, taken from the
+	 * buffers referenced by pdu_info.
+	 */
+	if (likely(mlx5e_nvmeotcp_resync_cap(queue, sq, skb, pdu, seq)))
+		return MLX5E_NVMEOTCP_RESYNC_DONE;
+
+	return MLX5E_NVMEOTCP_RESYNC_FAIL;
+}
+
static inline bool mlx5e_is_sk_tx_device_offloaded(struct sock *sk)
{
/* Return True after smp_store_release assing in
@@ -1199,8 +1465,19 @@ bool mlx5e_nvmeotcp_handle_tx_skb(struct net_device *netdev,
mlx5e_nvmeotcp_tx_post_param_wqes(sq, skb->sk, ctx);
seq = ntohl(tcp_hdr(skb)->seq);
- if (unlikely(ctx->ulp_ddp_ctx.expected_seq != seq))
- goto err_out;
+ if (unlikely(ctx->ulp_ddp_ctx.expected_seq != seq)) {
+ enum mlx5e_nvmeotcp_resync_retval ret =
+ mlx5e_nvmeotcp_handle_ooo_skb(ctx, sq, skb,
+ seq, datalen);
+ switch (ret) {
+ case MLX5E_NVMEOTCP_RESYNC_DONE:
+ break;
+ case MLX5E_NVMEOTCP_RESYNC_SKIP:
+ goto out;
+ case MLX5E_NVMEOTCP_RESYNC_FAIL:
+ goto err_out;
+ }
+ }
*nvmeotcp_tisn = ctx->tisn;
ctx->ulp_ddp_ctx.expected_seq = seq + datalen;
--
2.24.1
More information about the Linux-nvme
mailing list