[PATCH v1] blk-mq: add one blk_mq_req_flags_t type to support mq ctx fallback
zhuxiaohui
zhuxiaohui400 at gmail.com
Sun Oct 20 07:40:41 PDT 2024
From: Zhu Xiaohui <zhuxiaohui.400 at bytedance.com>
It is observed that nvme connect to an NVMe over Fabrics target will
always fail when 'nohz_full' is set.
Commit a46c27026da1 ("blk-mq: don't schedule block kworker on
isolated CPUs") clears hctx->cpumask for all isolated CPUs, so
when nvme connects to a remote target, it may fail on this stack:
blk_mq_alloc_request_hctx+1
__nvme_submit_sync_cmd+106
nvmf_connect_io_queue+181
nvme_tcp_start_queue+293
nvme_tcp_setup_ctrl+948
nvme_tcp_create_ctrl+735
nvmf_dev_write+532
vfs_write+237
ksys_write+107
do_syscall_64+128
entry_SYSCALL_64_after_hwframe+118
because the given blk_mq_hw_ctx->cpumask has been cleared, leaving no
available blk_mq_ctx on the hw queue.
This patch introduces a new blk_mq_req_flags_t flag 'BLK_MQ_REQ_ARB_MQ'
as well as an nvme_submit_flags_t flag 'NVME_SUBMIT_ARB_MQ', which are
used to indicate that the block layer can fall back to a blk_mq_ctx
whose cpu is not isolated.
Signed-off-by: Zhu Xiaohui <zhuxiaohui.400 at bytedance.com>
---
block/blk-mq.c | 12 ++++++++++--
drivers/nvme/host/core.c | 5 ++++-
drivers/nvme/host/fabrics.c | 3 ++-
drivers/nvme/host/nvme.h | 2 ++
include/linux/blk-mq.h | 4 +++-
5 files changed, 21 insertions(+), 5 deletions(-)
diff --git a/block/blk-mq.c b/block/blk-mq.c
index cf626e061dd7..e4e791fd6d80 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -654,8 +654,16 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
if (!blk_mq_hw_queue_mapped(data.hctx))
goto out_queue_exit;
cpu = cpumask_first_and(data.hctx->cpumask, cpu_online_mask);
- if (cpu >= nr_cpu_ids)
- goto out_queue_exit;
+ if (cpu >= nr_cpu_ids) {
+ if (!(flags & BLK_MQ_REQ_ARB_MQ))
+ goto out_queue_exit;
+ /* fallback to the first cpu not isolated */
+ for_each_online_cpu(cpu) {
+ if (!cpu_is_isolated(cpu))
+ break;
+ }
+ }
+
data.ctx = __blk_mq_get_ctx(q, cpu);
if (q->elevator)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 84cb859a911d..dbb9cb59e54c 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1130,9 +1130,12 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
blk_flags |= BLK_MQ_REQ_RESERVED;
if (qid == NVME_QID_ANY)
req = blk_mq_alloc_request(q, nvme_req_op(cmd), blk_flags);
- else
+ else {
+ if (flags & NVME_SUBMIT_ARB_MQ)
+ blk_flags |= BLK_MQ_REQ_ARB_MQ;
req = blk_mq_alloc_request_hctx(q, nvme_req_op(cmd), blk_flags,
qid - 1);
+ }
if (IS_ERR(req))
return PTR_ERR(req);
diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
index 432efcbf9e2f..ef34958e33c0 100644
--- a/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@ -539,7 +539,8 @@ int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid)
data, sizeof(*data), qid,
NVME_SUBMIT_AT_HEAD |
NVME_SUBMIT_RESERVED |
- NVME_SUBMIT_NOWAIT);
+ NVME_SUBMIT_NOWAIT |
+ NVME_SUBMIT_ARB_MQ);
if (ret) {
nvmf_log_connect_error(ctrl, ret, le32_to_cpu(res.u32),
&cmd, data);
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 093cb423f536..a61b35b1cd90 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -880,6 +880,8 @@ enum {
NVME_SUBMIT_RESERVED = (__force nvme_submit_flags_t)(1 << 2),
/* Retry command when NVME_STATUS_DNR is not set in the result */
NVME_SUBMIT_RETRY = (__force nvme_submit_flags_t)(1 << 3),
+ /* Submit command with arbitrary mq ctx */
+ NVME_SUBMIT_ARB_MQ = (__force nvme_submit_flags_t)(1 << 4),
};
int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 4fecf46ef681..d14be341ea4b 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -746,6 +746,8 @@ enum {
BLK_MQ_REQ_RESERVED = (__force blk_mq_req_flags_t)(1 << 1),
/* set RQF_PM */
BLK_MQ_REQ_PM = (__force blk_mq_req_flags_t)(1 << 2),
+ /* use arbitrary mq ctx */
+ BLK_MQ_REQ_ARB_MQ = (__force blk_mq_req_flags_t)(1 << 3),
};
struct request *blk_mq_alloc_request(struct request_queue *q, blk_opf_t opf,
@@ -824,7 +826,7 @@ static inline int blk_mq_request_completed(struct request *rq)
}
/*
- *
+ *
* Set the state to complete when completing a request from inside ->queue_rq.
* This is used by drivers that want to ensure special complete actions that
* need access to the request are called on failure, e.g. by nvme for
--
2.39.5
More information about the Linux-nvme
mailing list