[PATCH blktests v3] nvme/068: add a test for multipath delayed removal

John Garry john.g.garry at oracle.com
Thu Apr 30 01:46:35 PDT 2026


For NVMe multipath, the delayed removal feature allows the multipath
gendisk to remain present when all available paths are gone. The purpose of
this feature is to ensure that we keep the gendisk for intermittent path
failures.

The delayed removal works on a timer - when all paths are gone, a timer is
kicked off; once the timer expires and no paths have returned, the gendisk
is removed.

When all paths are gone and the gendisk is still present, all reads and
writes to the disk are queued. If a path returns before the timer
expires, the timer is canceled and the queued IO is submitted;
otherwise the queued IO fails when the timer expires.

This testcase covers two scenarios in separate parts:
a. test that IOs submitted after all paths are removed (and do not return)
   fail
b. test that IOs submitted between all paths removed and a path
   returning succeed

During the period of the timer being active, it must be ensured that the
nvme-core module is not removed. Otherwise the driver may not be present
to handle the timeout expiry. The kernel ensures this by taking a
reference to the module. Ideally, we would try to remove the module during
this test to prove that this is not possible (and the kernel behaves as
expected), but that module will probably not be removable anyway due to
many references. To test this feature, check that the refcount of the
nvme-core module is incremented when the delayed timer is active.

Reviewed-by: Chaitanya Kulkarni <kch at nvidia.com>
Reviewed-by: Nilay Shroff <nilay at linux.ibm.com>
Signed-off-by: John Garry <john.g.garry at oracle.com>
---

@Chaitanya, @Nilay, please check the changes made since your RB tags were granted.

Differences to v2:
- Stop using sleeps so often (Daniel)
- Add RB tag from Nilay (Thanks!)

Differences to v1:
- Add RB tag from Chaitanya (Thanks!)
- Address shellcheck issues (Shinichiro)
- Drop Quick=1 (Shinichiro)
- Reference kernel commit (Nilay)
- Tidy setting refcnt (Nilay)

Note that a fix for a refcnt issue has been merged into the nvme-7.1 queue.

diff --git a/common/nvme b/common/nvme
index 565de59..3dc9e2c 100644
--- a/common/nvme
+++ b/common/nvme
@@ -268,6 +268,38 @@ _nvme_disconnect_ctrl() {
 	nvme disconnect --device "${ctrl}"
 }
 
+_nvme_disconnect_ctrl_sync() {
+	local ctrl="$1"
+	local subsysnqn="$2"
+
+	_nvme_disconnect_ctrl "${ctrl}"
+
+	for ((i = 0; i < 10; i++)); do
+		_nvme_ctrl_ready "${ctrl}" "${subsysnqn}"
+		if [[ $? -eq 1 ]]; then
+			break
+		fi
+		sleep .1
+	done
+}
+
+_nvme_wait_subsys_removed() {
+	local subsysnqn="$def_subsysnqn"
+
+	for subsyspath in /sys/class/nvme-subsystem/*; do
+		_subsysnqn=$(cat "${subsyspath}/subsysnqn" 2> /dev/null)
+		if [ "$subsysnqn" == "$_subsysnqn" ]; then
+			for ((i = 0; i < 10; i++)); do
+				if [ ! -d "$subsyspath" ]; then
+					break
+				fi
+				sleep .1
+			done
+			break
+		fi
+	done
+}
+
 _nvme_connect_subsys() {
 	local subsysnqn="$def_subsysnqn"
 	local hostnqn="$def_hostnqn"
diff --git a/common/rc b/common/rc
index d2c1d74..f73c22f 100644
--- a/common/rc
+++ b/common/rc
@@ -117,6 +117,16 @@ _module_not_in_use() {
 	fi
 }
 
+_module_use_count() {
+	local refcnt
+	if [ -f "/sys/module/$1/refcnt" ]; then
+		refcnt="$(cat /sys/module/"$1"/refcnt)"
+		echo "$refcnt"
+		return
+	fi
+	echo ""
+}
+
 _have_module_param() {
 	 _have_driver "$1" || return
 
diff --git a/tests/nvme/068 b/tests/nvme/068
new file mode 100755
index 0000000..41e799b
--- /dev/null
+++ b/tests/nvme/068
@@ -0,0 +1,113 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-3.0+
+# Copyright (C) 2026 John Garry
+#
+# Test NVMe multipath delayed removal works as expected. This feature was
+# introduced in commit 62188639ec16 ("nvme-multipath: introduce delayed removal
+# of the multipath head node")
+
+. tests/nvme/rc
+. common/xfs
+
+DESCRIPTION="NVMe multipath delayed removal test"
+
+requires() {
+	_nvme_requires
+	_have_loop
+	_have_module_param_value nvme_core multipath Y
+	_require_nvme_trtype_is_fabrics
+}
+
+set_conditions() {
+	_set_nvme_trtype "$@"
+}
+
+_delayed_nvme_reconnect_ctrl() {
+	sleep 1
+	_nvme_connect_subsys
+}
+
+test() {
+	echo "Running ${TEST_NAME}"
+
+	_setup_nvmet
+
+	local nvmedev
+	local ns
+	local bytes_written
+	local refcnt_orig
+	local refcnt
+	_nvmet_target_setup
+
+	_nvme_connect_subsys
+
+	# Part a: Prove that writes fail when no path returns. Any reads or
+	#	  writes are queued during the delayed removal period. If no
+	#	  paths return before the timer expires, then those IOs should
+	#	  fail.
+	#	  During the delayed removal period, ensure that the module
+	#	  refcnt is incremented, to prove that we cannot remove the
+	#	  driver during this period.
+	nvmedev=$(_find_nvme_dev "${def_subsysnqn}")
+	ns=$(_find_nvme_ns "${def_subsys_uuid}")
+	refcnt_orig=$(_module_use_count nvme_core)
+	echo 5 > "/sys/block/${ns}/delayed_removal_secs"
+	_nvme_disconnect_ctrl_sync "${nvmedev}" "${def_subsysnqn}"
+	ns=$(_find_nvme_ns "${def_subsys_uuid}")
+	if [[ "${ns}" = "" ]]; then
+		echo "could not find ns after disconnect"
+	fi
+	refcnt=$(_module_use_count nvme_core)
+	if [ "$refcnt" != "" ] && [ "$refcnt" -le "$refcnt_orig" ]; then
+		echo "module refcount did not increase"
+	fi
+	bytes_written=$(run_xfs_io_pwritev2 /dev/"$ns" 4096)
+	if [ "$bytes_written" == 4096 ]; then
+		echo "wrote successfully after disconnect"
+	fi
+	_nvme_wait_subsys_removed
+	ns=$(_find_nvme_ns "${def_subsys_uuid}")
+	if [[ ! "${ns}" = "" ]]; then
+		echo "found ns after delayed removal"
+	fi
+	refcnt=$(_module_use_count nvme_core)
+	if [ "$refcnt" != "" ] && [ "$refcnt" -ne "$refcnt_orig" ]; then
+		echo "module refcount not as original"
+	fi
+
+	# Part b: Ensure writes for an intermittent disconnect are successful.
+	#	  During an intermittent disconnect, any reads or writes
+	#	  queued should succeed after a path returns.
+	#	  Also ensure module refcount behaviour is as expected, as
+	#	  above.
+	_nvme_connect_subsys
+	nvmedev=$(_find_nvme_dev "${def_subsysnqn}")
+	ns=$(_find_nvme_ns "${def_subsys_uuid}")
+	refcnt_orig=$(_module_use_count nvme_core)
+	echo 5 > "/sys/block/${ns}/delayed_removal_secs"
+	_nvme_disconnect_ctrl_sync "${nvmedev}" "${def_subsysnqn}"
+	ns=$(_find_nvme_ns "${def_subsys_uuid}")
+	if [[ "${ns}" = "" ]]; then
+		echo "could not find ns after disconnect"
+	fi
+	_delayed_nvme_reconnect_ctrl "${nvmedev}" &
+	bytes_written=$(run_xfs_io_pwritev2 /dev/"$ns" 4096)
+	if [ "$bytes_written" != 4096 ]; then
+		echo "could not write successfully with reconnect"
+	fi
+	if ! _nvmf_wait_for_state "${def_subsysnqn}" "live" ; then
+		echo "subsystem did not return"
+	fi
+	refcnt=$(_module_use_count nvme_core)
+	if [ "$refcnt" != "" ] && [ "$refcnt" -ne "$refcnt_orig" ]; then
+		echo "module refcount not as original"
+	fi
+
+	# Final tidy-up
+	echo 0 > "/sys/block/${ns}/delayed_removal_secs"
+	nvmedev=$(_find_nvme_dev "${def_subsysnqn}")
+	_nvme_disconnect_ctrl "${nvmedev}"
+	_nvmet_target_cleanup
+
+	echo "Test complete"
+}
diff --git a/tests/nvme/068.out b/tests/nvme/068.out
new file mode 100644
index 0000000..b913d19
--- /dev/null
+++ b/tests/nvme/068.out
@@ -0,0 +1,3 @@
+Running nvme/068
+pwrite: Input/output error
+Test complete
-- 
2.43.5




More information about the Linux-nvme mailing list