kdump details (Re: nvme-multipath: round-robin infinite looping)
Chris Leech
cleech at redhat.com
Mon Mar 21 15:43:02 PDT 2022
Backtrace of a soft-lockup while executing in nvme_ns_head_make_request:
crash> bt
PID: 6 TASK: ffff90550742c000 CPU: 0 COMMAND: "kworker/0:0H"
#0 [ffffb40a40003d48] machine_kexec at ffffffff93864e9e
#1 [ffffb40a40003da0] __crash_kexec at ffffffff939a4c4d
#2 [ffffb40a40003e68] panic at ffffffff938ed587
#3 [ffffb40a40003ee8] watchdog_timer_fn.cold.10 at ffffffff939db197
#4 [ffffb40a40003f18] __hrtimer_run_queues at ffffffff93983c20
#5 [ffffb40a40003f78] hrtimer_interrupt at ffffffff93984480
#6 [ffffb40a40003fd8] smp_apic_timer_interrupt at ffffffff942026ba
#7 [ffffb40a40003ff0] apic_timer_interrupt at ffffffff94201c4f
--- <IRQ stack> ---
#8 [ffffb40a403d3d18] apic_timer_interrupt at ffffffff94201c4f
[exception RIP: nvme_ns_head_make_request+592]
RIP: ffffffffc0280810 RSP: ffffb40a403d3dc0 RFLAGS: 00000206
RAX: 0000000000000003 RBX: ffff9059517dd800 RCX: 0000000000000001
RDX: 000043822f41b300 RSI: 0000000000000000 RDI: ffff9058da8d0010
RBP: ffff9058da8d0010 R8: 0000000000000001 R9: 0000000000000000
R10: 0000000000000003 R11: 0000000000000600 R12: 0000000000000001
R13: ffff90724096e000 R14: ffff9058da8dca50 R15: ffff9058da8d0000
ORIG_RAX: ffffffffffffff13 CS: 0010 SS: 0018
#9 [ffffb40a403d3e18] generic_make_request at ffffffff93c6e8bb
#10 [ffffb40a403d3e80] nvme_requeue_work at ffffffffc02805aa [nvme_core]
#11 [ffffb40a403d3e98] process_one_work at ffffffff9390b237
#12 [ffffb40a403d3ed8] worker_thread at ffffffff9390b8f0
#13 [ffffb40a403d3f10] kthread at ffffffff9391269a
#14 [ffffb40a403d3f50] ret_from_fork at ffffffff94200255
The RIP here is in the nvme_round_robin_path loop. I've also seen a report,
without softlockup_panic enabled, where multiple CPUs locked up with a
variety of reported addresses, all within this loop.
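For reference, here's a paraphrase of that loop (nvme_round_robin_path() in
drivers/nvme/host/multipath.c of this era, inlined into
nvme_ns_head_make_request; not verbatim, but the helper names match
upstream):

static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head,
		int node, struct nvme_ns *old)
{
	struct nvme_ns *ns, *found = NULL;

	for (ns = nvme_next_ns(head, old);
	     ns && ns != old;
	     ns = nvme_next_ns(head, ns)) {
		if (nvme_path_is_disabled(ns))
			continue;	/* skip paths that can't take I/O */
		if (ns->ana_state == NVME_ANA_OPTIMIZED) {
			found = ns;	/* usable optimized path: done */
			break;
		}
		if (ns->ana_state == NVME_ANA_NONOPTIMIZED)
			found = ns;	/* remember a non-optimized fallback */
	}
	/* ... fall back to old / update head->current_path[node] ... */
	return found;
}

The only exits from the loop are ns == NULL, ns == old, or finding a usable
optimized path; keep that in mind for what follows.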
Disassembling and looking for the loop test expression, we can find the
instructions for ns && ns != old:
236 for (ns = nvme_next_ns(head, old);
237 ns && ns != old;
0x000000000000982f <+575>: test %rbx,%rbx
0x0000000000009832 <+578>: je 0x98a8 <nvme_ns_head_make_request+696>
0x0000000000009834 <+580>: cmp %rbx,%r13
0x0000000000009837 <+583>: je 0x98a8 <nvme_ns_head_make_request+696>
At this point ns is in rbx and old is in r13, which doesn't appear to
change while in this loop.
Similarly, head is in r15, used here in list_first_or_null_rcu:
./include/linux/compiler.h:
276 __READ_ONCE_SIZE;
0x0000000000009961 <+881>: mov (%r15),%rax
drivers/nvme/host/multipath.c:
222 return list_first_or_null_rcu(&head->list, struct nvme_ns, siblings);
0x0000000000009964 <+884>: mov %rax,0x18(%rsp)
0x0000000000009969 <+889>: cmp %rax,%r15
0x000000000000996c <+892>: je 0x99d4 <nvme_ns_head_make_request+996>
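For context, nvme_next_ns() (paraphrased from the same file; the quoted
line 222 is its wrap-around case) treats the siblings list as a ring:

static struct nvme_ns *nvme_next_ns(struct nvme_ns_head *head,
		struct nvme_ns *ns)
{
	ns = list_next_or_null_rcu(&head->list, &ns->siblings,
			struct nvme_ns, siblings);
	if (ns)
		return ns;	/* next sibling on the list */
	/* wrapped past the end: start over from the head (line 222 above) */
	return list_first_or_null_rcu(&head->list, struct nvme_ns, siblings);
}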
So, from the saved register values in the backtrace:
old = ffff90724096e000
head = ffff9058da8d0000
Checking that old->head == head:
crash> struct nvme_ns.head ffff90724096e000
head = 0xffff9058da8d0000
Dumping the nvme_ns structs on head->list:
crash> list nvme_ns.siblings -H 0xffff9058da8d0000
ffff907084504000
ffff9059517dd800
Only 2, and neither of them matches "old".
What do the list pointers in old look like?
crash> struct nvme_ns.siblings ffff90724096e000
siblings = {
next = 0xffff9059517dd830,
prev = 0xdead000000000200
}
old->siblings.next points into a valid nvme_ns still on the list, but prev
has been poisoned.
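That poison value, 0xdead000000000200, is LIST_POISON2 (with the default
POISON_POINTER_DELTA on x86_64). It's what list_del_rcu() in
include/linux/rculist.h leaves behind, which is consistent with
nvme_ns_remove() having unlinked this namespace:

static inline void list_del_rcu(struct list_head *entry)
{
	__list_del_entry(entry);
	/* ->next is deliberately left intact so concurrent RCU readers
	 * already on this entry can keep walking; only ->prev is poisoned */
	entry->prev = LIST_POISON2;
}

So a reader that entered the list through "old" can still follow ->next
back into the live list, but "old" itself is no longer reachable from it.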
This loop can still exit if any ns on the list is both not disabled and an
ANA-optimized path.
crash> list nvme_ns.siblings -H 0xffff9058da8d0000 -s nvme_ns.ctrl
ffff907084504000
ctrl = 0xffff90555c13c338
ffff9059517dd800
ctrl = 0xffff90555c138338
crash> struct nvme_ctrl.state 0xffff90555c13c338
state = NVME_CTRL_CONNECTING
crash> struct nvme_ctrl.state 0xffff90555c138338
state = NVME_CTRL_CONNECTING
No good; both controllers are in NVME_CTRL_CONNECTING, so both paths are
considered disabled.
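The disabled check is approximately this (paraphrased; the exact form
varies by kernel version):

static bool nvme_path_is_disabled(struct nvme_ns *ns)
{
	/* a path is usable only when its controller is live and no ANA
	 * state update is pending */
	return ns->ctrl->state != NVME_CTRL_LIVE ||
		test_bit(NVME_NS_ANA_PENDING, &ns->flags);
}

With both controllers in NVME_CTRL_CONNECTING, every iteration takes the
continue.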
Dumping head->current_path[], we find "old":
crash> struct nvme_ns_head ffff9058da8d0000 -o
struct nvme_ns_head {
[ffff9058da8d0000] struct list_head list;
[ffff9058da8d0010] struct srcu_struct srcu;
[ffff9058da8dc510] struct nvme_subsystem *subsys;
[ffff9058da8dc518] unsigned int ns_id;
[ffff9058da8dc51c] struct nvme_ns_ids ids;
[ffff9058da8dc548] struct list_head entry;
[ffff9058da8dc558] struct kref ref;
[ffff9058da8dc55c] bool shared;
[ffff9058da8dc560] int instance;
[ffff9058da8dc568] struct nvme_effects_log *effects;
[ffff9058da8dc570] struct cdev cdev;
[ffff9058da8dc5f8] struct device cdev_device;
[ffff9058da8dc9c8] struct gendisk *disk;
[ffff9058da8dc9d0] struct bio_list requeue_list;
[ffff9058da8dc9e0] spinlock_t requeue_lock;
[ffff9058da8dc9e8] struct work_struct requeue_work;
[ffff9058da8dca28] struct mutex lock;
[ffff9058da8dca48] unsigned long flags;
[ffff9058da8dca50] struct nvme_ns *current_path[];
}
SIZE: 51792
crash> p *(struct nvme_ns **) 0xffff9058da8dca50 @ 4
$18 =
{0xffff90724096e000, 0x0, 0x0, 0x0}
So, we end up in an infinite loop because:
- The nvme_ns_head siblings list contains two nvme_ns entries, both with
  ctrl->state set to NVME_CTRL_CONNECTING. Both are considered disabled, so
  every iteration executes the continue statement.
- "old" has been removed from the siblings list, so the walk can never
  reach it and the ns != old test never terminates the loop.
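The shape of the bug fits in a few lines of userspace C (hypothetical
stand-in types, not kernel code): walk a two-node ring starting from a node
that was unlinked but whose ->next still points into the ring, stopping
only on ns == old or on a usable path.

#include <stdio.h>

struct ns { struct ns *next; int disabled; };

int main(void)
{
	struct ns a, b, old;

	a = (struct ns){ .next = &b, .disabled = 1 };
	b = (struct ns){ .next = &a, .disabled = 1 };
	/* unlinked, but ->next still points into the ring, like "old" */
	old = (struct ns){ .next = &b, .disabled = 1 };

	int steps = 0;
	struct ns *ns = old.next;
	while (ns && ns != &old && steps < 1000000) {
		if (!ns->disabled)
			break;	/* a usable path would end the walk */
		ns = ns->next;	/* cycles a <-> b, never reaches &old */
		steps++;
	}
	printf("still looping after %d steps\n", steps);
	return 0;
}

Without the step cap this never terminates, which is exactly the
softlockup seen above.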