[bug report] blktests nvme/022 lead kernel WARNING and NULL pointer

Hannes Reinecke hare at suse.de
Thu May 20 07:19:01 BST 2021


On 5/19/21 2:36 AM, Yi Zhang wrote:
> Hi Hannes
> 
> Gentle ping here.
> 
> On Wed, May 12, 2021 at 8:32 AM Yi Zhang <yi.zhang at redhat.com> wrote:
>>
>> On Sun, May 9, 2021 at 4:44 PM Hannes Reinecke <hare at suse.de> wrote:
>>>
>>> On 5/7/21 9:50 PM, Sagi Grimberg wrote:
>>>>
>>>>>> Hi Sagi
>>>>>>
>>>>>> Bisect shows the below commit was the first bad commit.
>>>>>>
>>>>>> commit a70b81bd4d9d2d6c05cfe6ef2a10bccc2e04357a (refs/bisect/bad)
>>>>>> Author: Hannes Reinecke <hare at suse.de>
>>>>>> Date:   Fri Apr 16 13:46:20 2021 +0200
>>>>>>
>>>>>>       nvme: sanitize KATO setting
>>>>>>
>>>>>
>>>>> Hi Sagi
>>>>> This issue has been consistently reproduced with blktests on recent
>>>>> linux-block/for-next, do you have a chance to check it?
>>>>
>>>> I had not, Hannes can you have a look?
>>>>
>>> Can you check if this patch helps?
>>>
>>> diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c
>>> index 74b3b150e1a5..a48d8219cf90 100644
>>> --- a/drivers/nvme/target/loop.c
>>> +++ b/drivers/nvme/target/loop.c
>>> @@ -458,6 +458,9 @@ static void nvme_loop_reset_ctrl_work(struct
>>> work_struct *work)
>>>                   container_of(work, struct nvme_loop_ctrl, ctrl.reset_work);
>>>           int ret;
>>>
>>> +       /* reset_work might run concurrently to delete_work */
>>> +       if (ctrl->state == NVME_CTRL_DELETING)
>>> +               return;
>>>           nvme_stop_ctrl(&ctrl->ctrl);
>>>           nvme_loop_shutdown_ctrl(ctrl);
>>>
>>
>> Hi Hannes
>> The issue can still be reproduced with this patch; here is the full log:
>>
>> <12>[  213.737602] run blktests nvme/022 at 2021-05-11 20:29:23
>> <6>[  213.857826] nvmet: adding nsid 1 to subsystem blktests-subsystem-1
>> <6>[  213.879592] nvmet: creating controller 1 for subsystem
>> blktests-subsystem-1 for NQN
>> nqn.2014-08.org.nvmexpress:uuid:7bc1767264664586a5ebf3ec1cdca2b9.
>> <6>[  213.893281] nvme nvme0: creating 128 I/O queues.
>> <6>[  213.910709] nvme nvme0: new ctrl: "blktests-subsystem-1"
>> <4>[  214.943818] nvme nvme0: resetting controller
>> <3>[  224.004540] nvmet: ctrl 1 keep-alive timer (5 seconds) expired!
>> <3>[  224.010452] nvmet: ctrl 1 fatal error occurred!
>> <6>[  224.014997] nvme nvme0: Removing ctrl: NQN "blktests-subsystem-1"
>> <4>[  227.734741] ------------[ cut here ]------------
>> <4>[  227.739348] WARNING: CPU: 110 PID: 1648 at
>> drivers/nvme/target/loop.c:469 nvme_loop_reset_ctrl_work+0x5c/0x110
>> [nvme_loop]
>> <4>[  227.750386] Modules linked in: nvme_loop nvme_fabrics nvmet
>> nvme_core rpcrdma rdma_ucm ib_srpt ib_isert iscsi_target_mod
>> target_core_mod ib_iser libiscsi ib_umad scsi_transport_iscsi ib_ipoib
>> rdma_cm iw_cm ib_cm rfkill mlx5_ib ib_uverbs ib_core sunrpc
>> coresight_etm4x i2c_smbus coresight_tmc coresight_replicator
>> coresight_tpiu mlx5_core joydev acpi_ipmi psample ipmi_ssif mlxfw
>> ipmi_devintf ipmi_msghandler coresight_funnel coresight thunderx2_pmu
>> vfat fat fuse zram ip_tables xfs ast i2c_algo_bit drm_vram_helper
>> drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops cec
>> drm_ttm_helper ttm drm crct10dif_ce ghash_ce gpio_xlp i2c_xlp9xx uas
>> usb_storage aes_neon_bs
>> <4>[  227.809227] CPU: 110 PID: 1648 Comm: kworker/u513:2 Not tainted
>> 5.13.0-rc1.fix+ #1
>> <4>[  227.816784] Hardware name: HPE Apollo 70
>> /C01_APACHE_MB         , BIOS L50_5.13_1.16 07/29/2020
>> <4>[  227.826511] Workqueue: nvme-reset-wq nvme_loop_reset_ctrl_work [nvme_loop]
>> <4>[  227.833376] pstate: 40400009 (nZcv daif +PAN -UAO -TCO BTYPE=--)
>> <4>[  227.839370] pc : nvme_loop_reset_ctrl_work+0x5c/0x110 [nvme_loop]
>> <4>[  227.845451] lr : nvme_loop_reset_ctrl_work+0x54/0x110 [nvme_loop]
>> <4>[  227.851532] sp : ffff800020c43d80
>> <4>[  227.854833] x29: ffff800020c43d80 x28: 0000000000000000 x27:
>> ffff00080d1b73c0
>> <4>[  227.861959] x26: ffff00080d1fb76c x25: 0000000000000000 x24:
>> ffff0008126be8c0
>> <4>[  227.869082] x23: 0000000000000000 x22: ffff000826eafb00 x21:
>> ffff0008126be4b0
>> <4>[  227.876205] x20: ffff0008126be000 x19: ffff0008126be8b8 x18:
>> 0000000000000020
>> <4>[  227.883328] x17: 00000000f911e638 x16: 0000000032ecb495 x15:
>> 000000000000000f
>> <4>[  227.890452] x14: 0000000000000003 x13: 0000000000000000 x12:
>> ffff008b822d32d0
>> <4>[  227.897575] x11: ffff008b822d3230 x10: 0000000000000001 x9 :
>> ffff800009058e74
>> <4>[  227.904699] x8 : ffff008b7cfeaab8 x7 : fffffffffffc0000 x6 :
>> ffff000000000000
>> <4>[  227.911822] x5 : 000000008040002a x4 : 0000000000000000 x3 :
>> ffff0008126be4bc
>> <4>[  227.918946] x2 : 0000000000000001 x1 : ffff0008126be4bc x0 :
>> 0000000000000000
>> <4>[  227.926069] Call trace:
>> <4>[  227.928503]  nvme_loop_reset_ctrl_work+0x5c/0x110 [nvme_loop]
>> <4>[  227.934237]  process_one_work+0x1f0/0x4ac
>> <4>[  227.938239]  worker_thread+0x22c/0x40c
>> <4>[  227.941976]  kthread+0x11c/0x120
>> <4>[  227.945193]  ret_from_fork+0x10/0x18
>> <4>[  227.948759] ---[ end trace 57e2d1ace7d39024 ]---
>> <1>[  228.355440] Unable to handle kernel NULL pointer dereference at
>> virtual address 0000000000000008
>> <1>[  228.364215] Mem abort info:
>> <1>[  228.366996]   ESR = 0x96000004
>> <1>[  228.370037]   EC = 0x25: DABT (current EL), IL = 32 bits
>> <1>[  228.375336]   SET = 0, FnV = 0
>> <1>[  228.378377]   EA = 0, S1PTW = 0
>> <1>[  228.381505] Data abort info:
>> <1>[  228.384372]   ISV = 0, ISS = 0x00000004
>> <1>[  228.388194]   CM = 0, WnR = 0
>> <1>[  228.391149] user pgtable: 4k pages, 48-bit VAs, pgdp=0000000897413000
>> <1>[  228.397576] [0000000000000008] pgd=0000000000000000, p4d=0000000000000000
>> <0>[  228.404354] Internal error: Oops: 96000004 [#1] SMP
>> <4>[  228.409220] Modules linked in: nvme_loop nvme_fabrics nvmet
>> nvme_core rpcrdma rdma_ucm ib_srpt ib_isert iscsi_target_mod
>> target_core_mod ib_iser libiscsi ib_umad scsi_transport_iscsi ib_ipoib
>> rdma_cm iw_cm ib_cm rfkill mlx5_ib ib_uverbs ib_core sunrpc
>> coresight_etm4x i2c_smbus coresight_tmc coresight_replicator
>> coresight_tpiu mlx5_core joydev acpi_ipmi psample ipmi_ssif mlxfw
>> ipmi_devintf ipmi_msghandler coresight_funnel coresight thunderx2_pmu
>> vfat fat fuse zram ip_tables xfs ast i2c_algo_bit drm_vram_helper
>> drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops cec
>> drm_ttm_helper ttm drm crct10dif_ce ghash_ce gpio_xlp i2c_xlp9xx uas
>> usb_storage aes_neon_bs
>> <4>[  228.468051] CPU: 5 PID: 2015 Comm: kworker/u513:7 Tainted: G
>>     W         5.13.0-rc1.fix+ #1
>> <4>[  228.476822] Hardware name: HPE Apollo 70
>> /C01_APACHE_MB         , BIOS L50_5.13_1.16 07/29/2020
>> <4>[  228.486548] Workqueue: nvme-delete-wq nvme_delete_ctrl_work [nvme_core]
>> <4>[  228.493166] pstate: 204000c9 (nzCv daIF +PAN -UAO -TCO BTYPE=--)
>> <4>[  228.499160] pc : percpu_ref_kill_and_confirm+0x94/0xb0
>> <4>[  228.504288] lr : percpu_ref_kill_and_confirm+0x20/0xb0
>> <4>[  228.509414] sp : ffff800020f63c90
>> <4>[  228.512715] x29: ffff800020f63c90 x28: 0000000000000000 x27:
>> ffff0008214969c0
>> <4>[  228.519840] x26: ffff000809dc8b6c x25: 0000000000000000 x24:
>> ffff00084f1d00d0
>> <4>[  228.526963] x23: ffff00084f1d00b0 x22: ffff0008126be130 x21:
>> ffff80000939a590
>> <4>[  228.534087] x20: 0000000000000000 x19: ffff00084f1d0090 x18:
>> 0000000000000020
>> <4>[  228.541210] x17: 0000000000000000 x16: 0000000000000000 x15:
>> 0000000000000013
>> <4>[  228.548334] x14: 0000000000000003 x13: 0000000000000000 x12:
>> 0000000000000040
>> <4>[  228.555457] x11: ffff000800401918 x10: ffff00080040191a x9 :
>> ffff8000090589ac
>> <4>[  228.562580] x8 : ffff008b7cfeaab8 x7 : 0000000000000018 x6 :
>> ffff000000000000
>> <4>[  228.569703] x5 : 0000000000000000 x4 : ffff008b85321400 x3 :
>> 0000000000000001
>> <4>[  228.576827] x2 : 0000000000000000 x1 : ffff800011035d20 x0 :
>> ffff80001147f000
>> <4>[  228.583952] Call trace:
>> <4>[  228.586385]  percpu_ref_kill_and_confirm+0x94/0xb0
>> <4>[  228.591163]  nvmet_sq_destroy+0xf0/0x194 [nvmet]
>> <4>[  228.595777]  nvme_loop_destroy_io_queues+0x6c/0xa0 [nvme_loop]
>> <4>[  228.601598]  nvme_loop_shutdown_ctrl+0x64/0xc0 [nvme_loop]
>> <4>[  228.607071]  nvme_loop_delete_ctrl_host+0x20/0x30 [nvme_loop]
>> <4>[  228.612805]  nvme_do_delete_ctrl+0x5c/0x74 [nvme_core]
>> <4>[  228.617939]  nvme_delete_ctrl_work+0x20/0x44 [nvme_core]
>> <4>[  228.623246]  process_one_work+0x1f0/0x4ac
>> <4>[  228.627244]  worker_thread+0x184/0x40c
>> <4>[  228.630981]  kthread+0x11c/0x120
>> <4>[  228.634197]  ret_from_fork+0x10/0x18
>> <0>[  228.637763] Code: b0004841 3905dc03 91348021 f0006a80 (f9400442)
>> <4>[  228.643843] ---[ end trace 57e2d1ace7d39025 ]---
> 
> 
> What about this?

diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c
index 74b3b150e1a5..9838a7d27bc1 100644
--- a/drivers/nvme/target/loop.c
+++ b/drivers/nvme/target/loop.c
@@ -263,7 +263,8 @@ static const struct blk_mq_ops 
nvme_loop_admin_mq_ops = {

  static void nvme_loop_destroy_admin_queue(struct nvme_loop_ctrl *ctrl)
  {
-       clear_bit(NVME_LOOP_Q_LIVE, &ctrl->queues[0].flags);
+       if (!test_and_clear_bit(NVME_LOOP_Q_LIVE, &ctrl->queues[0].flags))
+               return;
         nvmet_sq_destroy(&ctrl->queues[0].nvme_sq);
         blk_cleanup_queue(ctrl->ctrl.admin_q);
         blk_cleanup_queue(ctrl->ctrl.fabrics_q);
@@ -299,6 +300,7 @@ static void nvme_loop_destroy_io_queues(struct 
nvme_loop_ctrl *ctrl)
                 clear_bit(NVME_LOOP_Q_LIVE, &ctrl->queues[i].flags);
                 nvmet_sq_destroy(&ctrl->queues[i].nvme_sq);
         }
+       ctrl->ctrl.queue_count = 1;
  }

  static int nvme_loop_init_io_queues(struct nvme_loop_ctrl *ctrl)
@@ -405,6 +407,7 @@ static int nvme_loop_configure_admin_queue(struct 
nvme_loop_ctrl *ctrl)
         return 0;

  out_cleanup_queue:
+       clear_bit(NVME_LOOP_Q_LIVE, &ctrl->queues[0].flags);
         blk_cleanup_queue(ctrl->ctrl.admin_q);
  out_cleanup_fabrics_q:
         blk_cleanup_queue(ctrl->ctrl.fabrics_q);
@@ -462,8 +465,10 @@ static void nvme_loop_reset_ctrl_work(struct 
work_struct *work)
         nvme_loop_shutdown_ctrl(ctrl);

         if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
-               /* state change failure should never happen */
-               WARN_ON_ONCE(1);
+               if (ctrl->ctrl.state != NVME_CTRL_DELETING &&
+                   ctrl->ctrl.state != NVME_CTRL_DELETING_NOIO)
+                       /* state change failure for non-deleted ctrl? */
+                       WARN_ON_ONCE(1);
                 return;
         }


(I'll be sending out a formal patchset once it's confirmed).

Cheers,

Hannes
-- 
Dr. Hannes Reinecke                Kernel Storage Architect
hare at suse.de                              +49 911 74053 688
SUSE Software Solutions GmbH, Maxfeldstr. 5, 90409 Nürnberg
HRB 36809 (AG Nürnberg), Geschäftsführer: Felix Imendörffer



More information about the Linux-nvme mailing list