[PATCH blktests v5 5/5] nvme/055: add test for nvme-tcp zero-copy offload

Aurelien Aptel aaptel at nvidia.com
Fri Dec 6 05:51:20 PST 2024

This commit adds a new test for the kernel ULP DDP (Direct Data
Placement) feature with NVMe-TCP.

Configuration of DDP is per NIC and is done through a script in the
kernel source. For this reason we add 2 new config vars:
- KERNELSRC: path to the running kernel sources
- NVME_IFACE: name of the network interface to configure the offload on

Signed-off-by: Aurelien Aptel <aaptel at nvidia.com>
Signed-off-by: Shai Malin <smalin at nvidia.com>
Reviewed-by: Daniel Wagner <dwagner at suse.de>
 Documentation/running-tests.md |   9 ++
 README.md                      |   1 +
 common/rc                      |   8 +
 tests/nvme/055                 | 268 +++++++++++++++++++++++++++++++++
 tests/nvme/055.out             |  44 ++++++
 tests/nvme/rc                  |   8 +
 6 files changed, 338 insertions(+)
 create mode 100755 tests/nvme/055
 create mode 100644 tests/nvme/055.out

diff --git a/Documentation/running-tests.md b/Documentation/running-tests.md
index fe4f729..a42fc91 100644
--- a/Documentation/running-tests.md
+++ b/Documentation/running-tests.md
@@ -124,6 +124,15 @@ The NVMe tests can be additionally parameterized via environment variables.
   be skipped and this script gets called. This makes it possible to run
   the fabric nvme tests against a real target.
+#### NVMe-TCP zero-copy offload
+The NVMe-TCP ZC offload tests use a couple more variables.
+- KERNELSRC: Path to the sources of the running kernel.
+  Needed to run the in-tree script that configures the offload.
+- NVME_IFACE: Name of the interface the offload should be enabled on.
+  This should be the same interface the NVMe connection is made with.
 ### Running nvme-rdma and SRP tests
 These tests will use the siw (soft-iWARP) driver by default. The rdma_rxe
diff --git a/README.md b/README.md
index 55227d9..5073510 100644
--- a/README.md
+++ b/README.md
@@ -30,6 +30,7 @@ Some tests require the following:
 - nbd-client and nbd-server (Debian) or nbd (Fedora, openSUSE, Arch Linux)
 - dmsetup (Debian) or device-mapper (Fedora, openSUSE, Arch Linux)
 - rublk (`cargo install --version=^0.1 rublk`) for ublk test
+- python3, ethtool, iproute2 for nvme-tcp zero-copy offload test
 Build blktests with `make`. Optionally, install it to a known location with
 `make install` (`/usr/local/blktests` by default, but this can be changed by
diff --git a/common/rc b/common/rc
index b2e68b2..0c8b51f 100644
--- a/common/rc
+++ b/common/rc
@@ -148,6 +148,14 @@ _have_loop() {
 	_have_driver loop && _have_program losetup
+_have_kernel_source() {
+	if [ -z "${KERNELSRC}" ]; then
+		return 1
+	fi
+	return 0
 _have_blktrace() {
 	# CONFIG_BLK_DEV_IO_TRACE might still be disabled, but this is easier
 	# to check. We can fix it if someone complains.
diff --git a/tests/nvme/055 b/tests/nvme/055
new file mode 100755
index 0000000..4f2f87e
--- /dev/null
+++ b/tests/nvme/055
@@ -0,0 +1,268 @@
+# SPDX-License-Identifier: GPL-3.0+
+# Copyright (C) 2024 Aurelien Aptel <aaptel at nvidia.com>
+# zero-copy offload
+. tests/nvme/rc
+DESCRIPTION="enable zero copy offload and run rw traffic"
+# these vars get updated after each call to connect_run_disconnect()
+requires() {
+	_nvme_requires
+	_require_remote_nvme_target
+	_require_nvme_trtype tcp
+	_have_kernel_option ULP_DDP
+	# require nvme-tcp as a module to be able to change the ddp_offload param
+	_have_module nvme_tcp && _have_module_param nvme_tcp ddp_offload
+	_have_fio
+	_have_program ip
+	_have_program ethtool
+	_have_kernel_source && _have_program python3 && have_netlink_cli
+	have_iface
+# Check that the YNL netlink CLI from the kernel sources can be used for ULP
+# DDP: the script must exist and run with -h, and the ulp_ddp.yaml netlink
+# spec must be present.
+# Returns 0 on success. Each failed check appends its own explanation to
+# SKIP_REASONS and returns 1.
+have_netlink_cli() {
+	local cli spec
+
+	cli="${KERNELSRC}/tools/net/ynl/cli.py"
+	spec="${KERNELSRC}/Documentation/netlink/specs/ulp_ddp.yaml"
+
+	if ! [ -f "$cli" ]; then
+		SKIP_REASONS+=("Kernel sources do not have tools/net/ynl/cli.py")
+		return 1
+	fi
+
+	# the script is executed directly, so this also covers a missing
+	# execute bit or a broken interpreter
+	if ! "$cli" -h &> /dev/null; then
+		SKIP_REASONS+=("Cannot run the kernel tools/net/ynl/cli.py")
+		return 1
+	fi
+
+	if ! [ -f "$spec" ]; then
+		SKIP_REASONS+=("Kernel sources do not have the ULP DDP netlink specs")
+		return 1
+	fi
+
+	return 0
+}
+# Check that NVME_IFACE is set and names an existing network interface.
+# Returns 0 on success. Otherwise appends an explanation to SKIP_REASONS and
+# returns 1.
+have_iface() {
+	if [ -z "${NVME_IFACE}" ]; then
+		SKIP_REASONS+=("NVME_IFACE is not set")
+		return 1
+	fi
+	if ! [ -e "/sys/class/net/${NVME_IFACE}" ]; then
+		SKIP_REASONS+=("network interface ${NVME_IFACE} does not exist")
+		return 1
+	fi
+	return 0
+}
+set_conditions() {
+	_set_nvme_trtype "$@"
+netlink_cli() {
+	"${KERNELSRC}/tools/net/ynl/cli.py" \
+		--spec "${KERNELSRC}/Documentation/netlink/specs/ulp_ddp.yaml" \
+		"$@"
+eth_stat() {
+	ethtool -S "${NVME_IFACE}" | awk "/ $1:/ { print \$2 }"
+ddp_stat() {
+	netlink_cli --do stats-get --json "{\"ifindex\": $iface_idx}" \
+		| awk -F: "/'$1'/{print \$2;}" | tr -d '{},'
+ddp_caps() {
+	local out
+	out="$(netlink_cli --do caps-get --json "{\"ifindex\": $iface_idx}")"
+	echo "$out" | tr '{},' '\n' | tr -d ' '| awk -F: "/$1/ { print \$2 }"
+configure_ddp() {
+	local mod_param
+	local cap
+	mod_param=$1
+	cap=$2
+	echo "=== configured with ddp_offload=$mod_param and caps=$cap ==="
+	# set ddp_offload module param
+	modprobe -q -r nvme-tcp
+	modprobe -q nvme-tcp ddp_offload="$mod_param"
+	# set capabilities
+	netlink_cli --do caps-set --json "{\"ifindex\": $iface_idx, \"wanted\": $cap, \"wanted_mask\": 3}" >> "$FULL" 2>&1
+connect_run_disconnect() {
+	local io_size nvme_dev
+	# offload stat counters
+	# sockets
+	local beg_sk_add beg_sk_add_fail beg_sk_del
+	local end_sk_add end_sk_add_fail end_sk_del
+	# loss
+	local beg_drop beg_resync
+	local end_drop end_resync
+	# bw stats
+	local beg_off_bytes beg_eth_bytes beg_off_packets beg_eth_packets
+	local end_off_bytes end_eth_bytes end_off_packets end_eth_packets
+	# pdu offload setup/teardown
+	local end_setup beg_setup_fail end_setup_fail end_teardown
+	local nb_drop drop_ratio
+	local nb_resync resync_ratio
+	io_size=$1
+	beg_sk_add=$(ddp_stat rx-nvme-tcp-sk-add)
+	beg_sk_add_fail=$(ddp_stat rx-nvme-tcp-sk-add-fail)
+	beg_sk_del=$(ddp_stat rx-nvme-tcp-sk-del)
+	beg_setup_fail=$(ddp_stat rx-nvme-tcp-setup-fail)
+	beg_drop=$(ddp_stat rx-nvme-tcp-drop)
+	beg_resync=$(ddp_stat rx-nvme-tcp-resync)
+	beg_off_packets=$(ddp_stat rx-nvme-tcp-packets)
+	beg_off_bytes=$(ddp_stat rx-nvme-tcp-bytes)
+	beg_eth_packets=$(eth_stat rx_packets)
+	beg_eth_bytes=$(eth_stat rx_bytes)
+	_nvme_connect_subsys --hdr-digest --data-digest --nr-io-queues 8
+	nvme_dev="/dev/$(_find_nvme_ns "${def_subsys_uuid}")"
+	local common_args=(
+		--blocksize_range="$io_size"
+		--rw=randrw
+		--numjobs=8
+		--iodepth=128
+		--name=randrw
+		--ioengine=libaio
+		--time_based
+		--runtime="$TIMEOUT"
+		--direct=1
+		--invalidate=1
+		--randrepeat=1
+		--norandommap
+		--filename="$nvme_dev"
+	)
+	echo "IO size: $io_size"
+	_run_fio "${common_args[@]}"
+	_nvme_disconnect_subsys >> "$FULL" 2>&1
+	end_sk_add=$(ddp_stat rx-nvme-tcp-sk-add)
+	end_sk_add_fail=$(ddp_stat rx-nvme-tcp-sk-add-fail)
+	end_sk_del=$(ddp_stat rx-nvme-tcp-sk-del)
+	end_setup=$(ddp_stat rx-nvme-tcp-setup)
+	end_setup_fail=$(ddp_stat rx-nvme-tcp-setup-fail)
+	end_teardown=$(ddp_stat rx-nvme-tcp-teardown)
+	end_drop=$(ddp_stat rx-nvme-tcp-drop)
+	end_resync=$(ddp_stat rx-nvme-tcp-resync)
+	end_off_packets=$(ddp_stat rx-nvme-tcp-packets)
+	end_eth_packets=$(eth_stat rx_packets)
+	end_off_bytes=$(ddp_stat rx-nvme-tcp-bytes)
+	end_eth_bytes=$(eth_stat rx_bytes)
+	echo "Offloaded sockets: $((end_sk_add - beg_sk_add))"
+	echo "Failed sockets:    $((end_sk_add_fail - beg_sk_add_fail))"
+	echo "Unoffloaded sockets:   $((end_sk_del - beg_sk_del))"
+	echo "Offload packet leaked: $((end_setup - end_teardown))"
+	echo "Failed packet setup:   $((end_setup_fail - beg_setup_fail))"
+	# global var results
+	nb_drop=$(( end_drop - beg_drop ))
+	nb_resync=$(( end_resync - beg_resync ))
+	nb_packets=$(( end_eth_packets - beg_eth_packets ))
+	nb_offload_packets=$(( end_off_packets - beg_off_packets ))
+	nb_bytes=$(( end_eth_bytes - beg_eth_bytes ))
+	nb_offload_bytes=$(( end_off_bytes - beg_off_bytes ))
+	offload_packets_ratio=0
+	offload_bytes_ratio=0
+	# sanity check and avoid div by zero in ratio calculation
+	if [[ nb_bytes -eq 0 || nb_packets -eq 0 ]]; then
+		echo "No traffic: $nb_bytes bytes, $nb_packets packets"
+		return
+	fi
+	offload_packets_ratio=$(( nb_offload_packets*100/nb_packets ))
+	offload_bytes_ratio=$(( nb_offload_bytes*100/nb_bytes ))
+	drop_ratio=$(( nb_drop*100/nb_packets ))
+	resync_ratio=$(( nb_resync*100/nb_packets ))
+	[[ drop_ratio -gt 5 ]] && echo "High drop ratio: $drop_ratio %"
+	[[ resync_ratio -gt 5 ]] && echo "High resync ratio: $resync_ratio %"
+test() {
+	local starting_ddp
+	local starting_cap
+	: "${TIMEOUT:=30}"
+	echo "Running ${TEST_NAME}"
+	# get iface index
+	iface_idx=$(ip address | awk -F: "/${NVME_IFACE}/ { print \$1; exit; }")
+	# check hw supports ddp
+	if [[ $(( $(ddp_caps hw) & 3)) -ne 3 ]]; then
+		SKIP_REASONS+=("${NVME_IFACE} does not support nvme-tcp ddp offload")
+		return
+	fi
+	_setup_nvmet
+	_nvmet_target_setup
+	starting_ddp="$(cat "/sys/module/nvme_tcp/parameters/ddp_offload")"
+	starting_cap="$(ddp_caps active)"
+	# if any of the offload knobs are disabled, no offload should occur
+	# and offloaded packets & bytes should be zero
+	configure_ddp N 0
+	connect_run_disconnect 32k-1M
+	echo "Offloaded packets: $nb_offload_packets"
+	echo "Offloaded bytes: $nb_offload_bytes"
+	configure_ddp N 3
+	connect_run_disconnect 32k-1M
+	echo "Offloaded packets: $nb_offload_packets"
+	echo "Offloaded bytes: $nb_offload_bytes"
+	configure_ddp Y 0
+	connect_run_disconnect 32k-1M
+	echo "Offloaded packets: $nb_offload_packets"
+	echo "Offloaded bytes: $nb_offload_bytes"
+	# if everything is enabled, the offload should happen for large IOs only
+	configure_ddp Y 3
+	connect_run_disconnect 32k-1M
+	[[ nb_offload_packets -lt 100 ]] && echo "Low offloaded packets: $nb_offload_packets"
+	[[ nb_offload_bytes -lt 32768 ]] && echo "Low offloaded bytes: $nb_offload_bytes"
+	[[ offload_bytes_ratio -lt 90 ]] && echo "Low offloaded bytes ratio: $offload_bytes_ratio %"
+	[[ offload_packets_ratio -lt 95 ]] && echo "Low offloaded packets ratio: $offload_packets_ratio %"
+	# small IO should be under the offload threshold, ratio should be zero
+	connect_run_disconnect 4k-16k
+	echo "Offload bytes ratio: $offload_bytes_ratio %"
+	echo "Offload packets ratio: $offload_packets_ratio %"
+	_nvmet_target_cleanup
+	# restore starting config
+	configure_ddp "$starting_ddp" "$starting_cap" > /dev/null
+	echo "Test complete"
diff --git a/tests/nvme/055.out b/tests/nvme/055.out
new file mode 100644
index 0000000..9284a6f
--- /dev/null
+++ b/tests/nvme/055.out
@@ -0,0 +1,44 @@
+Running nvme/055
+=== configured with ddp_offload=N and caps=0 ===
+IO size: 32k-1M
+Offloaded sockets: 0
+Failed sockets:    0
+Unoffloaded sockets:   0
+Offload packet leaked: 0
+Failed packet setup:   0
+Offloaded packets: 0
+Offloaded bytes: 0
+=== configured with ddp_offload=N and caps=3 ===
+IO size: 32k-1M
+Offloaded sockets: 0
+Failed sockets:    0
+Unoffloaded sockets:   0
+Offload packet leaked: 0
+Failed packet setup:   0
+Offloaded packets: 0
+Offloaded bytes: 0
+=== configured with ddp_offload=Y and caps=0 ===
+IO size: 32k-1M
+Offloaded sockets: 0
+Failed sockets:    0
+Unoffloaded sockets:   0
+Offload packet leaked: 0
+Failed packet setup:   0
+Offloaded packets: 0
+Offloaded bytes: 0
+=== configured with ddp_offload=Y and caps=3 ===
+IO size: 32k-1M
+Offloaded sockets: 8
+Failed sockets:    0
+Unoffloaded sockets:   8
+Offload packet leaked: 0
+Failed packet setup:   0
+IO size: 4k-16k
+Offloaded sockets: 8
+Failed sockets:    0
+Unoffloaded sockets:   8
+Offload packet leaked: 0
+Failed packet setup:   0
+Offload bytes ratio: 0 %
+Offload packets ratio: 0 %
+Test complete
diff --git a/tests/nvme/rc b/tests/nvme/rc
index d1a4c01..4a43e43 100644
--- a/tests/nvme/rc
+++ b/tests/nvme/rc
@@ -199,6 +199,14 @@ _require_kernel_nvme_target() {
 	return 0
+_require_remote_nvme_target() {
+	if [ -z "${nvme_target_control}" ]; then
+		SKIP_REASONS+=("Remote target required but NVME_TARGET_CONTROL is not set")
+		return 1
+	fi
+	return 0
 _test_dev_nvme_ctrl() {
 	echo "/dev/char/$(cat "${TEST_DEV_SYSFS}/device/dev")"

More information about the Linux-nvme mailing list