blktests failures with v6.19 kernel

Chaitanya Kulkarni chaitanyak at nvidia.com
Sat Feb 14 13:19:47 PST 2026


On 2/13/26 01:56, Daniel Wagner wrote:
> nvmet_fc_target_assoc_free runs in the nvmet_wq context and calls
>
>    nvmet_fc_delete_target_queue
>      nvmet_cq_put
>        nvmet_cq_destroy
>          nvmet_ctrl_put
>           nvmet_ctrl_free
>             flush_work(&ctrl->async_event_work);
>             cancel_work_sync(&ctrl->fatal_err_work);
>   
> The async_event_work could be running on nvmet_wq. So this deadlock is
> real. No idea how to fix it yet.
>

Could the following patch be a potential fix for the above issue as well?
Totally untested ...

 From ad58e979ab9a2d4a7cc6234d28f2d90c174e4df9 Mon Sep 17 00:00:00 2001
From: Chaitanya Kulkarni <kch at nvidia.com>
Date: Thu, 5 Feb 2026 17:05:27 -0800
Subject: [INTERNAL PATCH] nvmet: move async event work off nvmet-wq

On the target side, nvmet_ctrl_free() flushes ctrl->async_event_work.
If nvmet_ctrl_free() runs on nvmet-wq, the flush re-enters workqueue
completion for the same worker:

A. Async event work queued on nvmet-wq (prior to disconnect):
   nvmet_execute_async_event()
      queue_work(nvmet_wq, &ctrl->async_event_work)

   nvmet_add_async_event()
      queue_work(nvmet_wq, &ctrl->async_event_work)

B. Full pre-work chain (RDMA CM path):
   nvmet_rdma_cm_handler()
      nvmet_rdma_queue_disconnect()
        __nvmet_rdma_queue_disconnect()
          queue_work(nvmet_wq, &queue->release_work)
            process_one_work()
              lock((wq_completion)nvmet-wq)  <--------- 1st
              nvmet_rdma_release_queue_work()

C. Recursive path (same worker):
   nvmet_rdma_release_queue_work()
      nvmet_rdma_free_queue()
        nvmet_sq_destroy()
          nvmet_ctrl_put()
            nvmet_ctrl_free()
              flush_work(&ctrl->async_event_work)
                __flush_work()
                  touch_wq_lockdep_map()
                  lock((wq_completion)nvmet-wq)  <--------- 2nd

Lockdep splat:

   ============================================
   WARNING: possible recursive locking detected
   6.19.0-rc3nvme+ #14 Tainted: G                 N
   --------------------------------------------
   kworker/u192:42/44933 is trying to acquire lock:
   ffff888118a00948 ((wq_completion)nvmet-wq){+.+.}-{0:0}, at: touch_wq_lockdep_map+0x26/0x90

   but task is already holding lock:
   ffff888118a00948 ((wq_completion)nvmet-wq){+.+.}-{0:0}, at: process_one_work+0x53e/0x660

   3 locks held by kworker/u192:42/44933:
    #0: ffff888118a00948 ((wq_completion)nvmet-wq){+.+.}-{0:0}, at: process_one_work+0x53e/0x660
    #1: ffffc9000e6cbe28 ((work_completion)(&queue->release_work)){+.+.}-{0:0}, at: process_one_work+0x1c5/0x660
    #2: ffffffff82d4db60 (rcu_read_lock){....}-{1:3}, at: __flush_work+0x62/0x530

   Workqueue: nvmet-wq nvmet_rdma_release_queue_work [nvmet_rdma]
   Call Trace:
    __flush_work+0x268/0x530
    nvmet_ctrl_free+0x140/0x310 [nvmet]
    nvmet_cq_put+0x74/0x90 [nvmet]
    nvmet_rdma_free_queue+0x23/0xe0 [nvmet_rdma]
    nvmet_rdma_release_queue_work+0x19/0x50 [nvmet_rdma]
    process_one_work+0x206/0x660
    worker_thread+0x184/0x320
    kthread+0x10c/0x240
    ret_from_fork+0x319/0x390

Move async event work to a dedicated nvmet-aen-wq to avoid reentrant
flush on nvmet-wq.

Signed-off-by: Chaitanya Kulkarni <kch at nvidia.com>
---
  drivers/nvme/target/admin-cmd.c |  2 +-
  drivers/nvme/target/core.c      | 13 +++++++++++--
  drivers/nvme/target/nvmet.h     |  1 +
  3 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index 3da31bb1183e..100d1466ff84 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -1586,7 +1586,7 @@ void nvmet_execute_async_event(struct nvmet_req *req)
  	ctrl->async_event_cmds[ctrl->nr_async_event_cmds++] = req;
  	mutex_unlock(&ctrl->lock);
  
-	queue_work(nvmet_wq, &ctrl->async_event_work);
+	queue_work(nvmet_aen_wq, &ctrl->async_event_work);
  }
  
  void nvmet_execute_keep_alive(struct nvmet_req *req)
diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index cc88e5a28c8a..b0883c7fdb8f 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -26,6 +26,7 @@ static DEFINE_IDA(cntlid_ida);
  
  struct workqueue_struct *nvmet_wq;
  EXPORT_SYMBOL_GPL(nvmet_wq);
+struct workqueue_struct *nvmet_aen_wq;
  
  /*
   * This read/write semaphore is used to synchronize access to configuration
@@ -205,7 +206,7 @@ void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type,
  	list_add_tail(&aen->entry, &ctrl->async_events);
  	mutex_unlock(&ctrl->lock);
  
-	queue_work(nvmet_wq, &ctrl->async_event_work);
+	queue_work(nvmet_aen_wq, &ctrl->async_event_work);
  }
  
  static void nvmet_add_to_changed_ns_log(struct nvmet_ctrl *ctrl, __le32 nsid)
@@ -1958,9 +1959,14 @@ static int __init nvmet_init(void)
  	if (!nvmet_wq)
  		goto out_free_buffered_work_queue;
  
+	nvmet_aen_wq = alloc_workqueue("nvmet-aen-wq",
+			WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
+	if (!nvmet_aen_wq)
+		goto out_free_nvmet_work_queue;
+
  	error = nvmet_init_debugfs();
  	if (error)
-		goto out_free_nvmet_work_queue;
+		goto out_free_nvmet_aen_work_queue;
  
  	error = nvmet_init_discovery();
  	if (error)
@@ -1976,6 +1982,8 @@ static int __init nvmet_init(void)
  	nvmet_exit_discovery();
  out_exit_debugfs:
  	nvmet_exit_debugfs();
+out_free_nvmet_aen_work_queue:
+	destroy_workqueue(nvmet_aen_wq);
  out_free_nvmet_work_queue:
  	destroy_workqueue(nvmet_wq);
  out_free_buffered_work_queue:
@@ -1993,6 +2001,7 @@ static void __exit nvmet_exit(void)
  	nvmet_exit_discovery();
  	nvmet_exit_debugfs();
  	ida_destroy(&cntlid_ida);
+	destroy_workqueue(nvmet_aen_wq);
  	destroy_workqueue(nvmet_wq);
  	destroy_workqueue(buffered_io_wq);
  	destroy_workqueue(zbd_wq);
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index b664b584fdc8..319d6a5e9cf0 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -501,6 +501,7 @@ extern struct kmem_cache *nvmet_bvec_cache;
  extern struct workqueue_struct *buffered_io_wq;
  extern struct workqueue_struct *zbd_wq;
  extern struct workqueue_struct *nvmet_wq;
+extern struct workqueue_struct *nvmet_aen_wq;
  
  static inline void nvmet_set_result(struct nvmet_req *req, u32 result)
  {
-- 
2.39.5


-ck






More information about the Linux-nvme mailing list