[PATCH] tests/nvme: Add admin-passthru+reset race test

Jonathan Derrick jonathan.derrick at linux.dev
Mon Nov 14 12:34:12 PST 2022


Add a test that runs many format commands and controller resets in parallel.
The intent is to expose timing holes in the controller state machine
that lead to hung-task timeouts and to the controller becoming
unavailable.

Reported in https://bugzilla.kernel.org/show_bug.cgi?id=216354

Signed-off-by: Jonathan Derrick <jonathan.derrick at linux.dev>
---
 tests/nvme/046     | 85 ++++++++++++++++++++++++++++++++++++++++++++++
 tests/nvme/046.out |  2 ++
 2 files changed, 87 insertions(+)
 create mode 100755 tests/nvme/046
 create mode 100644 tests/nvme/046.out

diff --git a/tests/nvme/046 b/tests/nvme/046
new file mode 100755
index 0000000..4b47783
--- /dev/null
+++ b/tests/nvme/046
@@ -0,0 +1,85 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-3.0+
+# Copyright (C) 2022 Jonathan Derrick <jonathan.derrick at linux.dev>
+#
+# Test nvme reset controller during admin passthru
+#
+# Regression for issue reported by
+# https://bugzilla.kernel.org/show_bug.cgi?id=216354
+
+. tests/nvme/rc  # relative path: presumably expects to be run from the blktests top-level directory
+
+#restrict test to nvme-pci only
+nvme_trtype=pci  # set after rc is sourced, so it replaces whatever value rc may have assigned
+
+DESCRIPTION="test nvme reset controller during admin passthru"
+QUICK=1  # NOTE(review): test_device sleeps 5s and may poll up to timeout/2 more seconds - confirm it still counts as quick
+CAN_BE_ZONED=1  # NOTE(review): test_device runs "nvme format -l 0" on TEST_DEV - confirm that is valid for zoned devices
+
+requires() {  # prerequisite hook: does nothing except call the shared nvme check below
+	_nvme_requires  # not defined in this file; presumably provided by the sourced tests/nvme/rc
+}
+
+device_requires() {  # per-device prerequisite hook: only calls the helper below
+	_require_test_dev_is_nvme  # defined outside this file; by its name it checks TEST_DEV is NVMe - TODO confirm
+}
+
+test_device() {
+	echo "Running ${TEST_NAME}"
+
+	local sysfs
+	local attr
+	local m
+
+	sysfs="$TEST_DEV_SYSFS/device"
+	timeout=$(($(cat /proc/sys/kernel/hung_task_timeout_secs) / 2))
+
+	sleep 5
+
+	if [[ ! -d "$sysfs" ]]; then
+		echo "$sysfs doesn't exist"
+	fi
+
+	# do reset controller/format loops
+	# don't check status now because a timing race is desired
+	i=0
+	start=0
+	timing_out=false
+	while [[ $i -le 1000 ]]; do
+		start=$SECONDS
+		if [[ -f "$sysfs/reset_controller" ]]; then
+			echo 1 > "$sysfs/reset_controller" 2>/dev/null &
+			i=$((i+1))
+		fi
+		nvme format -l 0 -f $TEST_DEV 2>/dev/null &
+
+		#Assume the controller is hung and unrecoverable
+		if [[ $(($SECONDS - $start)) -gt $timeout ]]; then
+			echo "nvme controller timing out"
+			timing_out=true
+			break
+		fi
+	done
+
+	{ kill $!; wait; } &> /dev/null
+
+	# at this point it may have waited hung_task_timeout / 2 already, so
+	# only wait 25% longer for a total of about 75% of allowed timeout
+	m=0
+	while [[ $m -le $((timeout / 2)) ]]; do
+		if [[ $timing_out == true ]]; then
+			break
+		fi
+		if grep -q live "$sysfs/state"; then
+			break
+		fi
+		sleep 1
+		m=$((m+1))
+	done
+	if ! grep -q live "$sysfs/state"; then
+		echo "nvme still not live after $(($SECONDS - $start)) seconds!"
+	fi
+	udevadm settle
+
+	echo "Test complete"
+}
diff --git a/tests/nvme/046.out b/tests/nvme/046.out
new file mode 100644
index 0000000..2b5fa6a
--- /dev/null
+++ b/tests/nvme/046.out
@@ -0,0 +1,2 @@
+Running nvme/046
+Test complete
-- 
2.31.1




More information about the Linux-nvme mailing list