[PATCH] nvmet: serialize delete_ctrl and install_queue flow to avoid race
Sagi Grimberg
sagi at grimberg.me
Wed Mar 23 08:04:21 PDT 2022
> From: Amit Engel <amit.engel at dell.com>
>
> In case of delete_ctrl, we need to ensure that no install_queue occurs;
> the delete_ctrl flow needs to wait until all running install_queue calls are done.
>
> Without this fix, we might end up with a ctrl that will never be deleted.
Is this a leak or a hang?
>
> Signed-off-by: Amit Engel <amit.engel at dell.com>
> ---
> drivers/nvme/target/core.c | 10 ++++++++++
> drivers/nvme/target/fabrics-cmd.c | 8 ++++++++
> drivers/nvme/target/nvmet.h | 1 +
> 3 files changed, 19 insertions(+)
>
> diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
> index 92af37d33ec3..f00f2636f4db 100644
> --- a/drivers/nvme/target/core.c
> +++ b/drivers/nvme/target/core.c
> @@ -303,6 +303,11 @@ void nvmet_port_del_ctrls(struct nvmet_port *port, struct nvmet_subsys *subsys)
> struct nvmet_ctrl *ctrl;
>
> mutex_lock(&subsys->lock);
> + list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
> + if (ctrl->port == port)
> + ctrl->deleting = true;
Perhaps we need ctrls to have a proper state machine? Seems rather
random...
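Something along these lines maybe (hypothetical sketch, names made up,
similar in spirit to the host side enum nvme_ctrl_state):

/*
 * Sketch only: an explicit ctrl state instead of an ad-hoc boolean,
 * so the deleting/teardown transitions are visible in one place.
 */
enum nvmet_ctrl_state {
	NVMET_CTRL_NEW,
	NVMET_CTRL_LIVE,
	NVMET_CTRL_DELETING,
	NVMET_CTRL_DEAD,
};

/* and in struct nvmet_ctrl: enum nvmet_ctrl_state state; */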
> + }
> + synchronize_rcu();
> list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
> if (ctrl->port == port)
> ctrl->ops->delete_ctrl(ctrl);
Probably you want to introduce nvmet_delete_ctrl that sets the state
and calls ->delete_ctrl()
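i.e. roughly (untested sketch, reusing the deleting flag from this
patch):

static void nvmet_delete_ctrl(struct nvmet_ctrl *ctrl)
{
	/* mark the ctrl first so nvmet_install_queue() starts failing */
	ctrl->deleting = true;
	/* wait for in-flight install_queue RCU read sections to drain */
	synchronize_rcu();
	ctrl->ops->delete_ctrl(ctrl);
}

and then the delete paths just call nvmet_delete_ctrl(ctrl) instead of
open-coding the flag + synchronize_rcu() in each of them.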
> @@ -1329,6 +1334,8 @@ static void nvmet_fatal_error_handler(struct work_struct *work)
> container_of(work, struct nvmet_ctrl, fatal_err_work);
>
> pr_err("ctrl %d fatal error occurred!\n", ctrl->cntlid);
> + ctrl->deleting = true;
> + synchronize_rcu();
> ctrl->ops->delete_ctrl(ctrl);
> }
>
> @@ -1593,6 +1600,9 @@ void nvmet_subsys_del_ctrls(struct nvmet_subsys *subsys)
> struct nvmet_ctrl *ctrl;
>
> mutex_lock(&subsys->lock);
> + list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
> + ctrl->deleting = true;
> + synchronize_rcu();
> list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
> ctrl->ops->delete_ctrl(ctrl);
> mutex_unlock(&subsys->lock);
> diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c
> index 70fb587e9413..8758f4ae624f 100644
> --- a/drivers/nvme/target/fabrics-cmd.c
> +++ b/drivers/nvme/target/fabrics-cmd.c
> @@ -134,8 +134,15 @@ static u16 nvmet_install_queue(struct nvmet_ctrl *ctrl, struct nvmet_req *req)
> return NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR;
> }
>
> + rcu_read_lock();
> + if (ctrl->deleting) {
> + rcu_read_unlock();
> + return NVME_SC_CONNECT_CTRL_BUSY | NVME_SC_DNR;
> + }
I think that it should probably fail way sooner than that...
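For example reject it already when the ctrl is looked up for the
connect, before any queue setup runs (sketch only, the exact hook point
is an assumption on my side):

/*
 * Sketch: called from the connect path right after the ctrl is found,
 * before nvmet_install_queue() does any work.
 */
static u16 nvmet_ctrl_connect_allowed(struct nvmet_ctrl *ctrl)
{
	u16 status = NVME_SC_SUCCESS;

	rcu_read_lock();
	if (ctrl->deleting)
		status = NVME_SC_CONNECT_CTRL_BUSY | NVME_SC_DNR;
	rcu_read_unlock();

	return status;
}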
> +
> old = cmpxchg(&req->sq->ctrl, NULL, ctrl);
> if (old) {
> + rcu_read_unlock();
> pr_warn("queue already connected!\n");
> req->error_loc = offsetof(struct nvmf_connect_command, opcode);
> return NVME_SC_CONNECT_CTRL_BUSY | NVME_SC_DNR;
> @@ -144,6 +151,7 @@ static u16 nvmet_install_queue(struct nvmet_ctrl *ctrl, struct nvmet_req *req)
> /* note: convert queue size from 0's-based value to 1's-based value */
> nvmet_cq_setup(ctrl, req->cq, qid, sqsize + 1);
> nvmet_sq_setup(ctrl, req->sq, qid, sqsize + 1);
> + rcu_read_unlock();
>
> if (c->cattr & NVME_CONNECT_DISABLE_SQFLOW) {
> req->sq->sqhd_disabled = true;
> diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
> index d910c6aad4b6..ef56b4811918 100644
> --- a/drivers/nvme/target/nvmet.h
> +++ b/drivers/nvme/target/nvmet.h
> @@ -209,6 +209,7 @@ struct nvmet_ctrl {
> u64 err_counter;
> struct nvme_error_slot slots[NVMET_ERROR_LOG_SLOTS];
> bool pi_support;
> + bool deleting;
> };
>
> struct nvmet_subsys {