[PATCH 15/18] nvmet-tcp: enable TLS handshake upcall

Sagi Grimberg sagi at grimberg.me
Wed Mar 22 05:51:35 PDT 2023



On 3/22/23 14:34, Hannes Reinecke wrote:
> On 3/22/23 13:13, Sagi Grimberg wrote:
>>
>>
>> On 3/21/23 14:43, Hannes Reinecke wrote:
>>> Add functions to start the TLS handshake upcall.
>>>
>>> Signed-off-by: Hannes Reinecke <hare at suse.de>
>>> ---
>>>   drivers/nvme/target/tcp.c | 188 ++++++++++++++++++++++++++++++++++++--
>>>   1 file changed, 181 insertions(+), 7 deletions(-)
>>>
>>> diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
>>> index 5c43767c5ecd..6e88e98a2c59 100644
>>> --- a/drivers/nvme/target/tcp.c
>>> +++ b/drivers/nvme/target/tcp.c
>>> @@ -9,8 +9,10 @@
>>>   #include <linux/slab.h>
>>>   #include <linux/err.h>
>>>   #include <linux/nvme-tcp.h>
>>> +#include <linux/nvme-keyring.h>
>>>   #include <net/sock.h>
>>>   #include <net/tcp.h>
>>> +#include <net/handshake.h>
>>>   #include <linux/inet.h>
>>>   #include <linux/llist.h>
>>>   #include <crypto/hash.h>
>>> @@ -40,6 +42,14 @@ module_param(idle_poll_period_usecs, int, 0644);
>>>   MODULE_PARM_DESC(idle_poll_period_usecs,
>>>           "nvmet tcp io_work poll till idle time period in usecs");
>>> +/*
>>> + * TLS handshake timeout
>>> + */
>>> +static int tls_handshake_timeout = 30;
>>
>> 30 ?
>>
> Yeah; will be changing it to 10.
> 
>>> +module_param(tls_handshake_timeout, int, 0644);
>>> +MODULE_PARM_DESC(tls_handshake_timeout,
>>> +         "nvme TLS handshake timeout in seconds (default 30)");
>>> +
>>>   #define NVMET_TCP_RECV_BUDGET        8
>>>   #define NVMET_TCP_SEND_BUDGET        8
>>>   #define NVMET_TCP_IO_WORK_BUDGET    64
>>> @@ -131,6 +141,9 @@ struct nvmet_tcp_queue {
>>>       struct ahash_request    *snd_hash;
>>>       struct ahash_request    *rcv_hash;
>>> +    struct key        *tls_psk;
>>> +    struct delayed_work    tls_handshake_work;
>>> +
>>>       unsigned long           poll_end;
>>>       spinlock_t        state_lock;
>>> @@ -168,6 +181,7 @@ static struct workqueue_struct *nvmet_tcp_wq;
>>>   static const struct nvmet_fabrics_ops nvmet_tcp_ops;
>>>   static void nvmet_tcp_free_cmd(struct nvmet_tcp_cmd *c);
>>>   static void nvmet_tcp_free_cmd_buffers(struct nvmet_tcp_cmd *cmd);
>>> +static void nvmet_tcp_tls_handshake_timeout_work(struct work_struct *work);
>>>   static inline u16 nvmet_tcp_cmd_tag(struct nvmet_tcp_queue *queue,
>>>           struct nvmet_tcp_cmd *cmd)
>>> @@ -1400,6 +1414,8 @@ static void nvmet_tcp_restore_socket_callbacks(struct nvmet_tcp_queue *queue)
>>>   {
>>>       struct socket *sock = queue->sock;
>>> +    if (!sock->sk)
>>> +        return;
>>
>> Umm, when will the sock not have an sk?
>>
> When someone called 'sock_release()'.
> But that's basically a leftover from development.
> 
>>>       write_lock_bh(&sock->sk->sk_callback_lock);
>>>       sock->sk->sk_data_ready =  queue->data_ready;
>>>       sock->sk->sk_state_change = queue->state_change;
>>> @@ -1448,7 +1464,8 @@ static void nvmet_tcp_release_queue_work(struct work_struct *w)
>>>       list_del_init(&queue->queue_list);
>>>       mutex_unlock(&nvmet_tcp_queue_mutex);
>>> -    nvmet_tcp_restore_socket_callbacks(queue);
>>> +    if (queue->state != NVMET_TCP_Q_TLS_HANDSHAKE)
>>> +        nvmet_tcp_restore_socket_callbacks(queue);
>>
>> This is because you only save the callbacks after the handshake
>> phase is done? Maybe it would be simpler to clear the ops because
>> the socket is going away anyways...
>>
> Or just leave it in place, as they'll be cleared up on sock_release().

This plays a role today, because after we clear the sock callbacks and
flush io_work, we know we are not going to be triggered from the
network, which is needed to continue teardown safely. So if you leave
them in place, you need a different fence here.
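
Roughly, the fence today is this ordering (a sketch of what
nvmet_tcp_release_queue_work() does, using the same helpers as in
this patch):

    /* detach our sk callbacks: the network can no longer schedule us */
    nvmet_tcp_restore_socket_callbacks(queue);
    /* drain any io_work instance that was already scheduled */
    cancel_work_sync(&queue->io_work);
    /* from here on, teardown cannot race with data_ready/write_space */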

> 
>>>       cancel_work_sync(&queue->io_work);
>>>       /* stop accepting incoming data */
>>>       queue->rcv_state = NVMET_TCP_RECV_ERR;
>>> @@ -1469,6 +1486,8 @@ static void nvmet_tcp_release_queue_work(struct work_struct *w)
>>>       nvmet_tcp_free_cmds(queue);
>>>       if (queue->hdr_digest || queue->data_digest)
>>>           nvmet_tcp_free_crypto(queue);
>>> +    if (queue->tls_psk)
>>> +        key_put(queue->tls_psk);
>>>       ida_free(&nvmet_tcp_queue_ida, queue->idx);
>>>       page = virt_to_head_page(queue->pf_cache.va);
>>>       __page_frag_cache_drain(page, queue->pf_cache.pagecnt_bias);
>>> @@ -1481,11 +1500,15 @@ static void nvmet_tcp_data_ready(struct sock *sk)
>>>       trace_sk_data_ready(sk);
>>> -    read_lock_bh(&sk->sk_callback_lock);
>>> -    queue = sk->sk_user_data;
>>> -    if (likely(queue))
>>> -        queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &queue->io_work);
>>> -    read_unlock_bh(&sk->sk_callback_lock);
>>> +    rcu_read_lock_bh();
>>> +    queue = rcu_dereference_sk_user_data(sk);
>>> +    if (queue->data_ready)
>>> +        queue->data_ready(sk);
>>> +    if (likely(queue) &&
>>> +        queue->state != NVMET_TCP_Q_TLS_HANDSHAKE)
>>> +        queue_work_on(queue_cpu(queue), nvmet_tcp_wq,
>>> +                  &queue->io_work);
>>> +    rcu_read_unlock_bh();
>>
>> Same comment as the host side. separate rcu stuff from data_ready call.
>>
> Ok.
> 
>>>   }
>>>   static void nvmet_tcp_write_space(struct sock *sk)
>>> @@ -1585,13 +1608,139 @@ static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue)
>>>           sock->sk->sk_write_space = nvmet_tcp_write_space;
>>>           if (idle_poll_period_usecs)
>>>               nvmet_tcp_arm_queue_deadline(queue);
>>> -        queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &queue->io_work);
>>> +        queue_work_on(queue_cpu(queue), nvmet_tcp_wq,
>>> +                  &queue->io_work);
>>
>> Why the change?
>>
> Left-over from development.
> 
>>>       }
>>>       write_unlock_bh(&sock->sk->sk_callback_lock);
>>>       return ret;
>>>   }
>>> +static void nvmet_tcp_tls_data_ready(struct sock *sk)
>>> +{
>>> +    struct socket_wq *wq;
>>> +
>>> +    rcu_read_lock();
>>> +    /* kTLS will change the callback */
>>> +    if (sk->sk_data_ready == nvmet_tcp_tls_data_ready) {
>>> +        wq = rcu_dereference(sk->sk_wq);
>>> +        if (skwq_has_sleeper(wq))
>>> +            wake_up_interruptible_all(&wq->wait);
>>> +    }
>>> +    rcu_read_unlock();
>>> +}
>>
>> Can you explain why this is needed? It looks out-of-place.
>> Who is this waking up? isn't tls already calling the socket
>> default data_ready that does something similar for userspace?
>>
> Black magic.

:)

> The 'data_ready' call might happen at any time between the 'accept'
> call and userspace servicing the upcall.
> In particular we have this flow of control:
> 
> 1. Kernel: accept()
> 2. Kernel: handshake request
> 3. Userspace: read data from socket
> 4. Userspace: tls handshake
> 5. Kernel: handshake complete
> 
> If the 'data_ready' event occurs between 1. and 3., userspace wouldn't 
> know that anything has happened, and will sit there waiting for 
> data which is already present.

Umm, doesn't userspace read from the socket once we trigger the upcall?
It should. But I still don't understand the difference between us waking
up userspace and the default sock data_ready doing the same.
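
For reference, the stock sock_def_readable() in net/core/sock.c already
does essentially the same wakeup (trimmed sketch, from memory):

    static void sock_def_readable(struct sock *sk)
    {
        struct socket_wq *wq;

        rcu_read_lock();
        wq = rcu_dereference(sk->sk_wq);
        if (skwq_has_sleeper(wq))
            wake_up_interruptible_sync_poll(&wq->wait,
                    EPOLLIN | EPOLLPRI | EPOLLRDNORM | EPOLLRDBAND);
        sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
        rcu_read_unlock();
    }

So the custom callback only buys us anything if something else has
already replaced the default before the daemon attaches.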

>>> +
>>> +static void nvmet_tcp_tls_handshake_restart(struct nvmet_tcp_queue *queue)
>>> +{
>>> +    spin_lock(&queue->state_lock);
>>> +    if (queue->state != NVMET_TCP_Q_TLS_HANDSHAKE) {
>>> +        pr_warn("queue %d: TLS handshake already completed\n",
>>> +            queue->idx);
>>> +        spin_unlock(&queue->state_lock);
>>> +        return;
>>> +    }
>>> +    queue->state = NVMET_TCP_Q_CONNECTING;
>>> +    spin_unlock(&queue->state_lock);
>>> +
>>> +    pr_debug("queue %d: restarting queue after TLS handshake\n",
>>> +         queue->idx);
>>> +    /*
>>> +     * Set callbacks after handshake; TLS implementation
>>> +     * might have changed the socket callbacks.
>>> +     */
>>> +    nvmet_tcp_set_queue_sock(queue);
>>
>> My understanding is that this is the desired end-state, i.e.
>> tls connection is ready and now we are expecting nvme traffic?
>>
> Yes.
> 
>> I think that the function name should be changed, it sounds like
>> it is restarting the handshake, and it does not appear to do that.
>>
> Sure, np.
> 
> nvmet_tcp_set_queue_callbacks()?

I meant the name of nvmet_tcp_tls_handshake_restart()

> 
>>> +}
>>> +
>>> +static void nvmet_tcp_save_tls_callbacks(struct nvmet_tcp_queue *queue)
>>> +{
>>> +    struct sock *sk = queue->sock->sk;
>>> +
>>> +    write_lock_bh(&sk->sk_callback_lock);
>>> +    rcu_assign_sk_user_data(sk, queue);
>>> +    queue->data_ready = sk->sk_data_ready;
>>> +    sk->sk_data_ready = nvmet_tcp_tls_data_ready;
>>> +    write_unlock_bh(&sk->sk_callback_lock);
>>> +}
>>> +
>>> +static void nvmet_tcp_restore_tls_callbacks(struct nvmet_tcp_queue *queue)
>>> +{
>>> +    struct sock *sk = queue->sock->sk;
>>> +
>>> +    if (WARN_ON(!sk))
>>> +        return;
>>> +    write_lock_bh(&sk->sk_callback_lock);
>>> +    /* Only reset the callback if it really is ours */
>>> +    if (sk->sk_data_ready == nvmet_tcp_tls_data_ready)
>>
>> I still don't understand why our data_ready for tls is needed.
>> Who are we waking up here?
>>
> See above for an explanation.
> 
>>> +        sk->sk_data_ready = queue->data_ready;
>>> +    rcu_assign_sk_user_data(sk, NULL);
>>> +    queue->data_ready = NULL;
>>> +    write_unlock_bh(&sk->sk_callback_lock);
>>> +}
>>> +
>>> +static void nvmet_tcp_tls_handshake_done(void *data, int status,
>>> +                     key_serial_t peerid)
>>> +{
>>> +    struct nvmet_tcp_queue *queue = data;
>>> +
>>> +    pr_debug("queue %d: TLS handshake done, key %x, status %d\n",
>>> +         queue->idx, peerid, status);
>>> +    if (!status) {
>>> +        spin_lock(&queue->state_lock);
>>> +        queue->tls_psk = key_lookup(peerid);
>>> +        if (IS_ERR(queue->tls_psk)) {
>>> +            pr_warn("queue %d: TLS key %x not found\n",
>>> +                queue->idx, peerid);
>>> +            queue->tls_psk = NULL;
>>> +        }
>>> +        spin_unlock(&queue->state_lock);
>>> +    }
>>> +    cancel_delayed_work_sync(&queue->tls_handshake_work);
>>> +    nvmet_tcp_restore_tls_callbacks(queue);
>>> +    if (status)
>>> +        nvmet_tcp_schedule_release_queue(queue);
>>> +    else
>>> +        nvmet_tcp_tls_handshake_restart(queue);
>>> +}
>>> +
>>> +static void nvmet_tcp_tls_handshake_timeout_work(struct work_struct *w)
>>> +{
>>> +    struct nvmet_tcp_queue *queue = container_of(to_delayed_work(w),
>>> +            struct nvmet_tcp_queue, tls_handshake_work);
>>> +
>>> +    pr_debug("queue %d: TLS handshake timeout\n", queue->idx);
>>> +    nvmet_tcp_restore_tls_callbacks(queue);
>>> +    nvmet_tcp_schedule_release_queue(queue);
>>> +}
>>> +
>>> +static int nvmet_tcp_tls_handshake(struct nvmet_tcp_queue *queue)
>>> +{
>>> +    int ret = -EOPNOTSUPP;
>>> +    struct tls_handshake_args args;
>>> +
>>> +    if (queue->state != NVMET_TCP_Q_TLS_HANDSHAKE) {
>>> +        pr_warn("cannot start TLS in state %d\n", queue->state);
>>> +        return -EINVAL;
>>> +    }
>>> +
>>> +    pr_debug("queue %d: TLS ServerHello\n", queue->idx);
>>> +    args.ta_sock = queue->sock;
>>> +    args.ta_done = nvmet_tcp_tls_handshake_done;
>>> +    args.ta_data = queue;
>>> +    args.ta_keyring = nvme_keyring_id();
>>> +    args.ta_timeout_ms = tls_handshake_timeout * 2 * 1024;
>>
>>   why the 2x timeout?
>>
> Because I'm chicken. Will be changing it.

:)

> 
>>> +
>>> +    ret = tls_server_hello_psk(&args, GFP_KERNEL);
>>> +    if (ret) {
>>> +        pr_err("failed to start TLS, err=%d\n", ret);
>>> +    } else {
>>> +        pr_debug("queue %d wakeup userspace\n", queue->idx);
>>> +        nvmet_tcp_tls_data_ready(queue->sock->sk);
>>> +        queue_delayed_work(nvmet_wq, &queue->tls_handshake_work,
>>> +                   tls_handshake_timeout * HZ);
>>> +    }
>>> +    return ret;
>>> +}
>>> +
>>>   static void nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port,
>>>           struct socket *newsock)
>>>   {
>>> @@ -1604,6 +1753,8 @@ static void nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port,
>>>       INIT_WORK(&queue->release_work, nvmet_tcp_release_queue_work);
>>>       INIT_WORK(&queue->io_work, nvmet_tcp_io_work);
>>> +    INIT_DELAYED_WORK(&queue->tls_handshake_work,
>>> +              nvmet_tcp_tls_handshake_timeout_work);
>>>       queue->sock = newsock;
>>>       queue->port = port;
>>>       queue->nr_cmds = 0;
>>> @@ -1646,6 +1797,29 @@ static void nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port,
>>>       list_add_tail(&queue->queue_list, &nvmet_tcp_queue_list);
>>>       mutex_unlock(&nvmet_tcp_queue_mutex);
>>> +    if (queue->state == NVMET_TCP_Q_TLS_HANDSHAKE) {
>>> +        nvmet_tcp_save_tls_callbacks(queue);
>>> +        if (!nvmet_tcp_tls_handshake(queue))
>>> +            return;
>>> +        nvmet_tcp_restore_tls_callbacks(queue);
>>> +
>>> +        /*
>>> +         * If sectype is set to 'tls1.3' TLS is required
>>> +         * so terminate the connection if the TLS handshake
>>> +         * failed.
>>> +         */
>>> +        if (queue->port->nport->disc_addr.tsas.tcp.sectype ==
>>> +            NVMF_TCP_SECTYPE_TLS13) {
>>> +            pr_debug("queue %d sectype tls1.3, terminate connection\n",
>>> +                 queue->idx);
>>> +            goto out_destroy_sq;
>>> +        }
>>> +        pr_debug("queue %d fallback to icreq\n", queue->idx);
>>> +        spin_lock(&queue->state_lock);
>>> +        queue->state = NVMET_TCP_Q_CONNECTING;
>>> +        spin_unlock(&queue->state_lock);
>>> +    }
>>> +
>>>       ret = nvmet_tcp_set_queue_sock(queue);
>>>       if (ret)
>>>           goto out_destroy_sq;
>>
>> I'm still trying to learn the state machine here, can you share a few 
>> words on it? Also please include it in the next round in the change log.
> 
> As outlined in the response to the nvme-tcp upcall, on the server side 
> we _have_ to allow for non-TLS connections (e.g. for discovery).

But in essence what you are doing is allowing normal connections
on a secured port...

Btw, why not enforce a PSK for the discovery controller (on this port)
as well, for secured ports? No one said that we must accept a
non-secured discovery host connecting on a secured port.

> And we have to start the daemon _before_ the first packet arrives,

Not sure why that is.

> but only the first packet will tell us what we should have done.
> So really we have to start the upcall and see what happens.
> The 'real' handling / differentiation between these two modes is done 
> with the 'peek pdu' patch later on.

I am hoping we can kill it.
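
(For context, the 'peek pdu' idea mentioned above is, roughly, to peek
at the first byte without consuming it: a TLS record carrying a
ClientHello starts with the handshake content type 0x16, while an
NVMe/TCP ICReq PDU starts with 0x00. A hypothetical sketch, not the
actual patch:)

    /* Peek the first byte to tell TLS from a plain icreq; MSG_PEEK
     * leaves the data in the receive queue for the real consumer.
     */
    static int nvmet_tcp_peek_first_byte(struct socket *sock, u8 *type)
    {
        struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_PEEK };
        struct kvec iov = { .iov_base = type, .iov_len = 1 };

        return kernel_recvmsg(sock, &msg, &iov, 1, iov.iov_len,
                              MSG_DONTWAIT | MSG_PEEK);
    }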


