nvme-tcp: fix a possible UAF when failing to send request
zhang.guanghui at cestc.cn
Sun Feb 9 23:41:38 PST 2025
Hello
When using the nvme-tcp driver in a storage cluster, the driver may trigger a NULL pointer dereference that crashes the host; we have hit this several times.
By analyzing the vmcore, we found that the direct cause is that request->mq_hctx was used after free.
CPU1                                   CPU2

nvme_tcp_poll                          nvme_tcp_try_send          -- failed to send request, error -13
nvme_tcp_try_recv                      nvme_tcp_fail_request
nvme_tcp_recv_skb                      nvme_tcp_end_request
nvme_tcp_recv_pdu                      nvme_complete_rq
nvme_tcp_handle_comp                   nvme_retry_req             -- request->mq_hctx has been freed, is NULL
nvme_tcp_process_nvme_cqe
nvme_complete_rq
nvme_end_req
blk_mq_end_request
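
Both paths end up completing the same request, and nothing serializes them. Roughly, simplified and paraphrased from the 5.15 driver (details may differ by version):

/* CPU2: the send failure path completes the request with a path error */
static void nvme_tcp_fail_request(struct nvme_tcp_request *req)
{
	nvme_tcp_end_request(blk_mq_rq_from_pdu(req), NVME_SC_HOST_PATH_ERROR);
}

/* CPU1: the recv path looks up the same command id from the CQE and
 * completes the request a second time
 */
static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue,
				     struct nvme_completion *cqe)
{
	struct request *rq;

	rq = nvme_find_rq(nvme_tcp_tagset(queue), cqe->command_id);
	...
	if (!nvme_try_complete_req(rq, cqe->status, cqe->result))
		nvme_complete_rq(rq);
	...
}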
When nvme_tcp_try_send() fails to send a request with -13 (-EACCES, which may be caused by SELinux or other reasons), nvme_tcp_fail_request() runs and completes the request.
But nvme_tcp_recv_pdu() may already have received the response PDU for the same command, so nvme_tcp_process_nvme_cqe() completes the request a second time, and request->mq_hctx is then used after free.
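
For reference, this is why mq_hctx ends up NULL: when the first completion path frees the request, __blk_mq_free_request() clears the field while putting the tag back (paraphrased from block/blk-mq.c in 5.15):

static void __blk_mq_free_request(struct request *rq)
{
	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
	...
	rq->mq_hctx = NULL;
	if (rq->tag != BLK_MQ_NO_TAG)
		blk_mq_put_tag(hctx->tags, ctx, rq->tag);
	...
}

So when the requeue work later runs blk_mq_request_bypass_insert() and takes rq->mq_hctx->lock, it dereferences NULL.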
The following patch attempts to solve it. Can you give some suggestions? Thanks!
diff --git a/linux/drivers/nvme/host/core.c b/linux/drivers/nvme/host/core.c
index a65b1dce8..417466674 100644
--- a/linux/drivers/nvme/host/core.c
+++ b/linux/drivers/nvme/host/core.c
@@ -288,6 +288,9 @@ static void nvme_retry_req(struct request *req)
 	unsigned long delay = 0;
 	u16 crd;
 
+	if (!req->mq_hctx && req->state == MQ_RQ_IDLE)
+		return;
+
 	/* The mask and shift result must be <= 3 */
 	crd = (nvme_req(req)->status & NVME_SC_CRD) >> 11;
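
One doubt about this check: the completion path can update the request state concurrently, so the read should probably use READ_ONCE(), something like this untested variant:

	if (!req->mq_hctx && READ_ONCE(req->state) == MQ_RQ_IDLE)
		return;

And it only avoids the crash in nvme_retry_req() instead of preventing the double completion itself, which is why I am asking for advice.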
The details are as follows:
[35665.692310] nvme nvme2: failed to send request -13
[35665.692683] nvme nvme2: queue 1 failed to send request 00000000b42f4e2b state 2 pdu 00000000d7fb8da3 type 4 rq_state 1 nrq_status 0
[35665.693323] nvme nvme2: failed to send rq 00000000f86a68c3 state 2 nrq_status 370
[35665.702265] nvme nvme2: unsupported pdu type (3)
[35665.702272] BUG: kernel NULL pointer dereference, address: 0000000000000000
[35665.702542] nvme nvme2: queue 1 receive failed: -22
[35665.703209] #PF: supervisor write access in kernel mode
[35665.703213] #PF: error_code(0x0002) - not-present page
[35665.703214] PGD 8000003801cce067 P4D 8000003801cce067 PUD 37e6f79067 PMD 0
[35665.703220] Oops: 0002 [#1] SMP PTI
[35665.703658] nvme nvme2: starting error recovery
[35665.704442] CPU: 20 PID: 815 Comm: kworker/20:1H Kdump: loaded Not tainted 5.15.131-17.cl9.x86_64 #1
[35665.705168] nvme nvme2: queue 1 receive again after receive failed
[35665.705809] Hardware name: Inspur aaabbb/YZMB-00882-104, BIOS 4.1.26 09/22/2022
[35665.705812] Workqueue: kblockd blk_mq_requeue_work
[35665.709172] RIP: 0010:_raw_spin_lock+0xc/0x30
[35665.709606] Code: 05 c3 cc cc cc cc 89 c6 e8 31 05 68 ff 66 90 c3 cc cc cc cc 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 31 c0 ba 01 00 00 00 <f0> 0f b1 17 75 05 c3 cc cc cc cc 89 c6 e8 02 05 68 ff 66 90 c3 cc
[35665.710470] RSP: 0000:ffffa67bcd797e08 EFLAGS: 00010246
[35665.710925] RAX: 0000000000000000 RBX: ffff92f6bbcc9840 RCX: ffff92f6bbcc9888
[35665.711393] RDX: 0000000000000001 RSI: 0000000000000000 RDI: 0000000000000000
[35665.711849] RBP: 0000000000000000 R08: ffffa67bcd797e48 R09: ffff932346d576f4
[35665.712275] R10: 0000000000000008 R11: 0000000000000008 R12: 0000000000000000
[35665.712725] R13: ffff92f6bbcc9888 R14: 0000000000000008 R15: 0000000000000000
[35665.713158] FS: 0000000000000000(0000) GS:ffff93527d400000(0000) knlGS:0000000000000000
[35665.713603] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[35665.714063] CR2: 0000000000000000 CR3: 000000371aa02006 CR4: 00000000007706e0
[35665.714534] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[35665.714961] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[35665.715359] PKRU: 55555554
[35665.715788] Call Trace:
[35665.716201] <TASK>
[35665.716613] ? show_trace_log_lvl+0x1c1/0x2d9
[35665.717049] ? show_trace_log_lvl+0x1c1/0x2d9
[35665.717457] ? blk_mq_request_bypass_insert+0x2c/0xb0
[35665.717950] ? __die_body.cold+0x8/0xd
[35665.718361] ? page_fault_oops+0xac/0x140
[35665.718749] ? blk_mq_start_request+0x30/0xf0
[35665.719144] ? nvme_tcp_queue_rq+0xc7/0x170 [nvme_tcp]
[35665.719547] ? exc_page_fault+0x62/0x130
[35665.719938] ? asm_exc_page_fault+0x22/0x30
[35665.720333] ? _raw_spin_lock+0xc/0x30
[35665.720723] blk_mq_request_bypass_insert+0x2c/0xb0
[35665.721101] blk_mq_requeue_work+0xa5/0x180
[35665.721451] process_one_work+0x1e8/0x390
[35665.721809] worker_thread+0x53/0x3d0
[35665.722159] ? process_one_work+0x390/0x390
[35665.722501] kthread+0x124/0x150
[35665.722849] ? set_kthread_struct+0x50/0x50
[35665.723182] ret_from_fork+0x1f/0x30
[35665.723508] </TASK>
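
The oops matches the race above: blk_mq_requeue_work() calls blk_mq_request_bypass_insert(), which takes rq->mq_hctx->lock, but mq_hctx is already NULL, hence the write fault at address 0000000000000000.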
crash> struct nvme_tcp_request ffff92f6bbcc9950
struct nvme_tcp_request {
  req = {
    cmd = 0xffff92f5b83f6748,
    result = {
      u16 = 0,
      u32 = 0,
      u64 = 0
    },
    genctr = 169 '\251',
    retries = 1 '\001',
    flags = 0 '\000',
    status = 6,
    ctrl = 0xffff92f5e5df7348
  },
  pdu = 0xffff92f5b83f6740,
  queue = 0xffff92f407cc9128,
  data_len = 4096,
  pdu_len = 4096,
  pdu_sent = 0,
  h2cdata_left = 0,
  h2cdata_offset = 0,
  ttag = 62,
  status = 12,
  entry = {
    next = 0xdead000000000100,
    prev = 0xdead000000000122
  },
  lentry = {
    next = 0x0
  },
  ddgst = 0,
  curr_bio = 0xffff9324d639e240,
  iter = {
    iter_type = 2 '\002',
    nofault = false,
    data_source = true,
    iov_offset = 0,
    count = 4096,
    {
      iov = 0xffff92f6bbcc98a8,
      kvec = 0xffff92f6bbcc98a8,
      bvec = 0xffff92f6bbcc98a8,
      xarray = 0xffff92f6bbcc98a8,
      pipe = 0xffff92f6bbcc98a8
    },
    {
      nr_segs = 1,
      {
        head = 1,
        start_head = 0
      },
      xarray_start = 1
    }
  },
  offset = 0,
  data_sent = 0,
  state = NVME_TCP_SEND_DATA
}
crash> nvme_tcp_hdr.type 0xffff92f5b83f6740
type = 4 '\004',
crash>
crash> struct request ffff92f6bbcc9840
struct request {
  q = 0xffff92f59d55c240,
  mq_ctx = 0xffffc67bb9a1f040,
  mq_hctx = 0x0,
  cmd_flags = 33556483,
  rq_flags = 139456,
  tag = 87,
  internal_tag = -1,
  __data_len = 0,
  __sector = 66846720,
  bio = 0x0,
  biotail = 0x0,
  queuelist = {
    next = 0xffff92f6bbcc9888,
    prev = 0xffff92f6bbcc9888
  },
  {
    hash = {
      next = 0x0,
      pprev = 0x0
    },
    ipi_list = {
      next = 0x0
    }
  },
  {
    rb_node = {
      __rb_parent_color = 18446685131795018112,
      rb_right = 0x1000,
      rb_left = 0x0
    },
    special_vec = {
      bv_page = 0xffffca64841f3180,
      bv_len = 4096,
      bv_offset = 0
    },
    completion_data = 0xffffca64841f3180,
    error_count = -2078330496
  },
  {
    elv = {
      icq = 0x0,
      priv = {0xffff92f6bbcc98c8, 0xffff92f6bbcc98c8}
    },
    flush = {
      seq = 0,
      list = {
        next = 0xffff92f6bbcc98c8,
        prev = 0xffff92f6bbcc98c8
      },
      saved_end_io = 0x0
    }
  },
  rq_disk = 0xffff92f6bdbff600,
  part = 0xffff92f59f557800,
  start_time_ns = 35665692557229,
  io_start_time_ns = 35665692566268,
  wbt_flags = 0,
  stats_sectors = 0,
  nr_phys_segments = 1,
  nr_integrity_segments = 0,
  write_hint = 0,
  ioprio = 0,
  state = MQ_RQ_IDLE,
  ref = {
    refs = {
      counter = 0
    }
  },
  timeout = 180000,
  deadline = 4330512774,
  {
    csd = {
      node = {
        llist = {
          next = 0x0
        },
        {
          u_flags = 0,
          a_flags = {
            counter = 0
          }
        },
        src = 0,
        dst = 0
      },
      func = 0xffffffff8eed62d0 <__blk_mq_complete_request_remote>,
      info = 0xffff92f6bbcc9840
    },
    fifo_time = 0
  },
  end_io = 0x0,
  end_io_data = 0x0
}
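
The dumped request confirms that it had already been completed and freed when the requeue work picked it up: mq_hctx = 0x0, state = MQ_RQ_IDLE and ref.refs.counter = 0.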
Best regards
zhang.guanghui at cestc.cn