[PATCH v3] nvme-tcp: fix a possible UAF when failing to allocate an io queue

zhangyanjun at cestc.cn
Wed Mar 29 19:14:19 PDT 2023


From: Yanjun Zhang <zhangyanjun at cestc.cn>


> When we allocate a nvme-tcp queue, we set the data_ready callback before
> we actually need to use it. This creates the potential that if a stray
> controller sends us data on the socket before we connect, we can trigger
> the io_work and start consuming the socket.
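> 
> For context, the driver's data_ready hook does roughly the following
> (simplified sketch, not the verbatim driver code):
> 
> static void nvme_tcp_data_ready(struct sock *sk)
> {
> 	struct nvme_tcp_queue *queue;
> 
> 	read_lock_bh(&sk->sk_callback_lock);
> 	queue = sk->sk_user_data;
> 	if (likely(queue && queue->rd_enabled))
> 		queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
> 	read_unlock_bh(&sk->sk_callback_lock);
> }
> 
> Once this hook is installed on the socket, any inbound data can queue
> io_work, even while the queue is still initializing or being torn down
> after an allocation failure.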
> 
> In the reported case: we failed to allocate one of the io queues, and
> as we start releasing the queues that we already allocated, we get
> a UAF [1] from the io_work, which is running before it really should.
> 
> Fix this by setting the socket ops callbacks only before we start the
> queue, so that we can't accidentally schedule the io_work during the
> initialization phase, before the queue has started. While we are at it,
> rename nvme_tcp_restore_sock_calls to pair with nvme_tcp_setup_sock_ops.
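> 
> The resulting ordering in nvme_tcp_start_queue() is sketched below (see
> the diff for the actual change):
> 
> 	queue->rd_enabled = true;
> 	nvme_tcp_init_recv_ctx(queue);
> 	nvme_tcp_setup_sock_ops(queue);	/* callbacks are live from here */
> 
> 	if (idx)
> 		ret = nvmf_connect_io_queue(nctrl, idx);
> 	else
> 		ret = nvmf_connect_admin_queue(nctrl);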
> 
> [1]:
> [16802.107284] nvme nvme4: starting error recovery
> [16802.109166] nvme nvme4: Reconnecting in 10 seconds...
> [16812.173535] nvme nvme4: failed to connect socket: -111
> [16812.173745] nvme nvme4: Failed reconnect attempt 1
> [16812.173747] nvme nvme4: Reconnecting in 10 seconds...
> [16822.413555] nvme nvme4: failed to connect socket: -111
> [16822.413762] nvme nvme4: Failed reconnect attempt 2
> [16822.413765] nvme nvme4: Reconnecting in 10 seconds...
> [16832.661274] nvme nvme4: creating 32 I/O queues.
> [16833.919887] BUG: kernel NULL pointer dereference, address: 0000000000000088
> [16833.920068] nvme nvme4: Failed reconnect attempt 3
> [16833.920094] #PF: supervisor write access in kernel mode
> [16833.920261] nvme nvme4: Reconnecting in 10 seconds...
> [16833.920368] #PF: error_code(0x0002) - not-present page
> [16833.921086] Workqueue: nvme_tcp_wq nvme_tcp_io_work [nvme_tcp]
> [16833.921191] RIP: 0010:_raw_spin_lock_bh+0x17/0x30
> ...
> [16833.923138] Call Trace:
> [16833.923271] <TASK>
> [16833.923402] lock_sock_nested+0x1e/0x50
> [16833.923545] nvme_tcp_try_recv+0x40/0xa0 [nvme_tcp]
> [16833.923685] nvme_tcp_io_work+0x68/0xa0 [nvme_tcp]
> [16833.923824] process_one_work+0x1e8/0x390
> [16833.923969] worker_thread+0x53/0x3d0
> [16833.924104] ? process_one_work+0x390/0x390
> [16833.924240] kthread+0x124/0x150
> [16833.924376] ? set_kthread_struct+0x50/0x50
> [16833.924518] ret_from_fork+0x1f/0x30
> [16833.924655] </TASK>
> 
> Reported-by: Yanjun Zhang <zhangyanjun at cestc.cn>
> Signed-off-by: Sagi Grimberg <sagi at grimberg.me>
> ---
> Changes from v2:
> - Move sock ops assignment to its own function (Hannes)
> 
> Changes from v1:
> - Fix silly compilation error
> 
> Yanjun, I'll be waiting for your Tested-by tag. We need it
> in order to apply this fix.
>
> drivers/nvme/host/tcp.c | 46 +++++++++++++++++++++++------------------
> 1 file changed, 26 insertions(+), 20 deletions(-)
> 
> diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
> index 42c0598c31f2..49c9e7bc9116 100644
> --- a/drivers/nvme/host/tcp.c
> +++ b/drivers/nvme/host/tcp.c
> @@ -1620,22 +1620,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid)
> 	if (ret)
> 		goto err_init_connect;
> 
> -	queue->rd_enabled = true;
> 	set_bit(NVME_TCP_Q_ALLOCATED, &queue->flags);
> -	nvme_tcp_init_recv_ctx(queue);
> -
> -	write_lock_bh(&queue->sock->sk->sk_callback_lock);
> -	queue->sock->sk->sk_user_data = queue;
> -	queue->state_change = queue->sock->sk->sk_state_change;
> -	queue->data_ready = queue->sock->sk->sk_data_ready;
> -	queue->write_space = queue->sock->sk->sk_write_space;
> -	queue->sock->sk->sk_data_ready = nvme_tcp_data_ready;
> -	queue->sock->sk->sk_state_change = nvme_tcp_state_change;
> -	queue->sock->sk->sk_write_space = nvme_tcp_write_space;
> -#ifdef CONFIG_NET_RX_BUSY_POLL
> -	queue->sock->sk->sk_ll_usec = 1;
> -#endif
> -	write_unlock_bh(&queue->sock->sk->sk_callback_lock);
> 
> 	return 0;
> 
> @@ -1655,7 +1640,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid)
> 	return ret;
> }
> 
> -static void nvme_tcp_restore_sock_calls(struct nvme_tcp_queue *queue)
> +static void nvme_tcp_restore_sock_ops(struct nvme_tcp_queue *queue)
> {
> 	struct socket *sock = queue->sock;
> 
> @@ -1670,7 +1655,7 @@ static void nvme_tcp_restore_sock_calls(struct nvme_tcp_queue *queue)
> static void __nvme_tcp_stop_queue(struct nvme_tcp_queue *queue)
> {
> 	kernel_sock_shutdown(queue->sock, SHUT_RDWR);
> -	nvme_tcp_restore_sock_calls(queue);
> +	nvme_tcp_restore_sock_ops(queue);
> 	cancel_work_sync(&queue->io_work);
> }
> 
> @@ -1688,21 +1673,42 @@ static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid)
> 	mutex_unlock(&queue->queue_lock);
> }
> 
> +static void nvme_tcp_setup_sock_ops(struct nvme_tcp_queue *queue)
> +{
> +	write_lock_bh(&queue->sock->sk->sk_callback_lock);
> +	queue->sock->sk->sk_user_data = queue;
> +	queue->state_change = queue->sock->sk->sk_state_change;
> +	queue->data_ready = queue->sock->sk->sk_data_ready;
> +	queue->write_space = queue->sock->sk->sk_write_space;
> +	queue->sock->sk->sk_data_ready = nvme_tcp_data_ready;
> +	queue->sock->sk->sk_state_change = nvme_tcp_state_change;
> +	queue->sock->sk->sk_write_space = nvme_tcp_write_space;
> +#ifdef CONFIG_NET_RX_BUSY_POLL
> +	queue->sock->sk->sk_ll_usec = 1;
> +#endif
> +	write_unlock_bh(&queue->sock->sk->sk_callback_lock);
> +}
> +
> static int nvme_tcp_start_queue(struct nvme_ctrl *nctrl, int idx)
> {
> 	struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
> +	struct nvme_tcp_queue *queue = &ctrl->queues[idx];
> 	int ret;
> 
> +	queue->rd_enabled = true;
> +	nvme_tcp_init_recv_ctx(queue);
> +	nvme_tcp_setup_sock_ops(queue);
> +
> 	if (idx)
> 		ret = nvmf_connect_io_queue(nctrl, idx);
> 	else
> 		ret = nvmf_connect_admin_queue(nctrl);
> 
> 	if (!ret) {
> -		set_bit(NVME_TCP_Q_LIVE, &ctrl->queues[idx].flags);
> +		set_bit(NVME_TCP_Q_LIVE, &queue->flags);
> 	} else {
> -		if (test_bit(NVME_TCP_Q_ALLOCATED, &queue->flags))
> -			__nvme_tcp_stop_queue(&ctrl->queues[idx]);
> +		if (test_bit(NVME_TCP_Q_ALLOCATED, &queue->flags))
> +			__nvme_tcp_stop_queue(queue);
> 		dev_err(nctrl->device,
> 			"failed to connect queue: %d ret=%d\n", idx, ret);
> 	}
> --
> 2.34.1

With this patch our cluster nodes have not hit the same problem again. I think it is solved.
Tested-by: Yanjun Zhang <zhangyanjun at cestc.cn>