[PATCH 6/8] nvme-tcp: reduce callback lock contention

Sagi Grimberg sagi at grimberg.me
Wed Jul 17 14:19:29 PDT 2024



On 16/07/2024 10:36, Hannes Reinecke wrote:
> From: Hannes Reinecke <hare at suse.de>
>
> We have heavily queued tx and rx flows, so callbacks might happen
> at the same time. As the callbacks influence the state machine we
> really should remove contention here to not impact I/O performance.
>
> Signed-off-by: Hannes Reinecke <hare at kernel.org>
> ---
>   drivers/nvme/host/tcp.c | 14 ++++++++------
>   1 file changed, 8 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
> index a758fbb3f9bb..9634c16d7bc0 100644
> --- a/drivers/nvme/host/tcp.c
> +++ b/drivers/nvme/host/tcp.c
> @@ -1153,28 +1153,28 @@ static void nvme_tcp_data_ready(struct sock *sk)
>   
>   	trace_sk_data_ready(sk);
>   
> -	read_lock_bh(&sk->sk_callback_lock);
> -	queue = sk->sk_user_data;
> +	rcu_read_lock();
> +	queue = rcu_dereference_sk_user_data(sk);
>   	if (likely(queue && queue->rd_enabled) &&
>   	    !test_bit(NVME_TCP_Q_POLLING, &queue->flags)) {
>   		queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
>   		queue->data_ready_cnt++;
>   	}
> -	read_unlock_bh(&sk->sk_callback_lock);
> +	rcu_read_unlock();

Umm, this looks dangerous...

Please give a concrete (numeric) justification for this change, and
preferably add a big fat comment explaining why it is safe to do (for
both .data_ready and .write_space).

Is there any precedent of another TCP ULP that does this? I'd like to
have the netdev folks review this change. CC'ing netdev.

>   }
>   
>   static void nvme_tcp_write_space(struct sock *sk)
>   {
>   	struct nvme_tcp_queue *queue;
>   
> -	read_lock_bh(&sk->sk_callback_lock);
> -	queue = sk->sk_user_data;
> +	rcu_read_lock();
> +	queue = rcu_dereference_sk_user_data(sk);
>   	if (likely(queue && sk_stream_is_writeable(sk))) {
>   		clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
>   		queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
>   		queue->write_space_cnt++;
>   	}
> -	read_unlock_bh(&sk->sk_callback_lock);
> +	rcu_read_unlock();
>   }
>   
>   static void nvme_tcp_state_change(struct sock *sk)
> @@ -2076,6 +2076,7 @@ static void nvme_tcp_restore_sock_ops(struct nvme_tcp_queue *queue)
>   	sock->sk->sk_state_change = queue->state_change;
>   	sock->sk->sk_write_space  = queue->write_space;
>   	write_unlock_bh(&sock->sk->sk_callback_lock);
> +	synchronize_rcu();
>   }
>   
>   static void __nvme_tcp_stop_queue(struct nvme_tcp_queue *queue)
> @@ -2115,6 +2116,7 @@ static void nvme_tcp_setup_sock_ops(struct nvme_tcp_queue *queue)
>   	queue->sock->sk->sk_ll_usec = 1;
>   #endif
>   	write_unlock_bh(&queue->sock->sk->sk_callback_lock);
> +	synchronize_rcu();
>   }
>   
>   static int nvme_tcp_start_queue(struct nvme_ctrl *nctrl, int idx)




More information about the Linux-nvme mailing list