[PATCH 1/1] Add 'Transport Interface' (triface) option. This can be used to specify the IP interface to use for the connection. The driver uses that to set SO_BINDTODEVICE on the socket before connecting.

Belanger, Martin Martin.Belanger at dell.com
Mon May 3 17:59:10 BST 2021


Hi Hannes,

I just noticed there were in-line comments. To answer your questions:

Q1) Why not simply 'host_iface' ? 'triface' is a bit awkward.
A1) I used TRIFACE to keep consistency with all other transport options: traddr, trsvcid, host_traddr. I will rename to host_iface at your suggestion.

Q2) Is this valid for all transports? I guess it would only work for 'tcp', and maybe 'rdma' if one would be running ROCE. Shouldn't we error out on other transports like 'fc' or 'loop'?
A2) This is only for TCP, and we do check that this option is only allowed for TCP by specifying it in the "allowed_opts" as follows (see file tcp.c):

static struct nvmf_transport_ops nvme_tcp_transport = {
        .name           = "tcp",
        .module         = THIS_MODULE,
        .required_opts  = NVMF_OPT_TRADDR,
        .allowed_opts   = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY |
                          NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO |
                          NVMF_OPT_HDR_DIGEST | NVMF_OPT_DATA_DIGEST |
                          NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES |
                          NVMF_OPT_TOS | NVMF_OPT_HOST_TRIFACE,
        .create_ctrl    = nvme_tcp_create_ctrl,
};

Q3) Normally the options are just parts of the 'address' string; why didn't
you use that approach here?
A3) I don't understand what you mean. Could you clarify?

Regards,
Martin


________________________________________
From: Hannes Reinecke <hare at suse.de>
Sent: Saturday, May 1, 2021 07:34
To: Martin Belanger; linux-nvme at lists.infradead.org
Cc: kbusch at kernel.org; axboe at fb.com; hch at lst.de; sagi at grimberg.me; Belanger, Martin
Subject: Re: [PATCH 1/1] Add 'Transport Interface' (triface) option. This can be used to specify the IP interface to use for the connection. The driver uses that to set SO_BINDTODEVICE on the socket before connecting.


[EXTERNAL EMAIL]

On 4/15/21 9:28 PM, Martin Belanger wrote:
> From: Martin Belanger <martin.belanger at dell.com>
>
Please fix up the subject and description.

> ---
>   drivers/nvme/host/core.c    |  5 +++++
>   drivers/nvme/host/fabrics.c | 14 +++++++++++++
>   drivers/nvme/host/fabrics.h |  6 +++++-
>   drivers/nvme/host/tcp.c     | 41 ++++++++++++++++++++++++++++++++++---
>   4 files changed, 62 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index 288ac47ff5b4..91ae11a1ae26 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -3961,6 +3961,11 @@ static int nvme_class_uevent(struct device *dev, struct kobj_uevent_env *env)
>
>               ret = add_uevent_var(env, "NVME_HOST_TRADDR=%s",
>                               opts->host_traddr ?: "none");
> +             if (ret)
> +                     return ret;
> +
> +             ret = add_uevent_var(env, "NVME_HOST_TRIFACE=%s",
> +                             opts->host_triface ?: "none");
>       }
>       return ret;
>   }

Why not simply 'host_iface' ? 'triface' is a bit awkward.

> diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
> index 604ab0e5a2ad..f5d0d760b53b 100644
> --- a/drivers/nvme/host/fabrics.c
> +++ b/drivers/nvme/host/fabrics.c
> @@ -112,6 +112,9 @@ int nvmf_get_address(struct nvme_ctrl *ctrl, char *buf, int size)
>       if (ctrl->opts->mask & NVMF_OPT_HOST_TRADDR)
>               len += scnprintf(buf + len, size - len, "%shost_traddr=%s",
>                               (len) ? "," : "", ctrl->opts->host_traddr);
> +     if (ctrl->opts->mask & NVMF_OPT_HOST_TRIFACE)
> +             len += scnprintf(buf + len, size - len, "%shost_triface=%s",
> +                             (len) ? "," : "", ctrl->opts->host_triface);
>       len += scnprintf(buf + len, size - len, "\n");
>
>       return len;
> @@ -604,6 +607,7 @@ static const match_table_t opt_tokens = {
>       { NVMF_OPT_KATO,                "keep_alive_tmo=%d"     },
>       { NVMF_OPT_HOSTNQN,             "hostnqn=%s"            },
>       { NVMF_OPT_HOST_TRADDR,         "host_traddr=%s"        },
> +     { NVMF_OPT_HOST_TRIFACE,        "host_triface=%s"       },
>       { NVMF_OPT_HOST_ID,             "hostid=%s"             },
>       { NVMF_OPT_DUP_CONNECT,         "duplicate_connect"     },
>       { NVMF_OPT_DISABLE_SQFLOW,      "disable_sqflow"        },
> @@ -813,6 +817,15 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
>                       kfree(opts->host_traddr);
>                       opts->host_traddr = p;
>                       break;
> +             case NVMF_OPT_HOST_TRIFACE:
> +                     p = match_strdup(args);
> +                     if (!p) {
> +                             ret = -ENOMEM;
> +                             goto out;
> +                     }
> +                     kfree(opts->host_triface);
> +                     opts->host_triface = p;
> +                     break;
>               case NVMF_OPT_HOST_ID:
>                       p = match_strdup(args);
>                       if (!p) {
> @@ -997,6 +1010,7 @@ void nvmf_free_options(struct nvmf_ctrl_options *opts)
>       kfree(opts->trsvcid);
>       kfree(opts->subsysnqn);
>       kfree(opts->host_traddr);
> +     kfree(opts->host_triface);
>       kfree(opts);
>   }
>   EXPORT_SYMBOL_GPL(nvmf_free_options);
> diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h
> index 733010d2eafd..17c64ff4db8c 100644
> --- a/drivers/nvme/host/fabrics.h
> +++ b/drivers/nvme/host/fabrics.h
> @@ -59,6 +59,7 @@ enum {
>       NVMF_OPT_NR_POLL_QUEUES = 1 << 18,
>       NVMF_OPT_TOS            = 1 << 19,
>       NVMF_OPT_FAIL_FAST_TMO  = 1 << 20,
> +     NVMF_OPT_HOST_TRIFACE   = 1 << 21,
>   };
>
>   /**
> @@ -76,7 +77,9 @@ enum {
>    * @trsvcid:        The transport-specific TRSVCID field for a port on the
>    *              subsystem which is adding a controller.
>    * @host_traddr: A transport-specific field identifying the NVME host port
> - *              to use for the connection to the controller.
> + *           to use for the connection to the controller.
> + * @host_triface: A transport-specific field identifying the NVME host
> + *           interface to use for the connection to the controller.
>    * @queue_size: Number of IO queue elements.
>    * @nr_io_queues: Number of controller IO queues that will be established.
>    * @reconnect_delay: Time between two consecutive reconnect attempts.
> @@ -101,6 +104,7 @@ struct nvmf_ctrl_options {
>       char                    *traddr;
>       char                    *trsvcid;
>       char                    *host_traddr;
> +     char                    *host_triface;
>       size_t                  queue_size;
>       unsigned int            nr_io_queues;
>       unsigned int            reconnect_delay;
> diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
> index 8e55d8bc0c50..28eb7f88b487 100644
> --- a/drivers/nvme/host/tcp.c
> +++ b/drivers/nvme/host/tcp.c
> @@ -1447,6 +1447,20 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
>               }
>       }
>
> +     if (nctrl->opts->mask & NVMF_OPT_HOST_TRIFACE) {
> +             char *iface = nctrl->opts->host_triface;
> +             sockptr_t optval = KERNEL_SOCKPTR(iface);
> +
> +             ret = sock_setsockopt(queue->sock, SOL_SOCKET, SO_BINDTODEVICE,
> +                                   optval, strlen(iface));
> +             if (ret) {
> +                     dev_err(nctrl->device,
> +                       "failed to bind to interface %s queue %d err %d\n",
> +                       iface, qid, ret);
> +                     goto err_sock;
> +             }
> +     }
> +
>       queue->hdr_digest = nctrl->opts->hdr_digest;
>       queue->data_digest = nctrl->opts->data_digest;
>       if (queue->hdr_digest || queue->data_digest) {

Is this valid for all transports? I guess it would only work for 'tcp',
and maybe 'rdma' if one would be running ROCE.
Shouldn't we error out on other transports like 'fc' or 'loop'?

> @@ -2457,6 +2471,10 @@ nvme_tcp_existing_controller(struct nvmf_ctrl_options *opts)
>   static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev,
>               struct nvmf_ctrl_options *opts)
>   {
> +     const char *iface_key = "";
> +     const char *iface_val = "";
> +     const char *srce_key  = "";
> +     const char *srce_val  = "";
>       struct nvme_tcp_ctrl *ctrl;
>       int ret;
>
> @@ -2502,6 +2520,22 @@ static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev,
>                              opts->host_traddr);
>                       goto out_free_ctrl;
>               }
> +             srce_key = ", src-addr ";
> +             srce_val = opts->host_traddr;
> +     }
> +
> +     if (opts->mask & NVMF_OPT_HOST_TRIFACE) {
> +             struct net_device *ndev;
> +
> +             ndev = dev_get_by_name(&init_net, opts->host_triface);
> +             if (!ndev) {
> +                     pr_err("invalid interface passed: %s\n",
> +                            opts->host_triface);
> +                     ret = -ENODEV;
> +                     goto out_free_ctrl;
> +             }
> +             iface_key = ", iface ";
> +             iface_val = opts->host_triface;
>       }
>
>       if (!opts->duplicate_connect && nvme_tcp_existing_controller(opts)) {

Normally the options are just parts of the 'address' string; why didn't
you use that approach here?

> @@ -2530,8 +2564,9 @@ static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev,
>       if (ret)
>               goto out_uninit_ctrl;
>
> -     dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISp\n",
> -             ctrl->ctrl.opts->subsysnqn, &ctrl->addr);
> +     dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISp%s%s%s%s\n",
> +              ctrl->ctrl.opts->subsysnqn, &ctrl->addr,
> +              srce_key, srce_val, iface_key, iface_val);
>
>       mutex_lock(&nvme_tcp_ctrl_mutex);
>       list_add_tail(&ctrl->list, &nvme_tcp_ctrl_list);
> @@ -2560,7 +2595,7 @@ static struct nvmf_transport_ops nvme_tcp_transport = {
>                         NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO |
>                         NVMF_OPT_HDR_DIGEST | NVMF_OPT_DATA_DIGEST |
>                         NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES |
> -                       NVMF_OPT_TOS,
> +                       NVMF_OPT_TOS | NVMF_OPT_HOST_TRIFACE,
>       .create_ctrl    = nvme_tcp_create_ctrl,
>   };
>
>
Cheers,

Hannes
--
Dr. Hannes Reinecke                Kernel Storage Architect
hare at suse.de                              +49 911 74053 688
SUSE Software Solutions GmbH, Maxfeldstr. 5, 90409 Nürnberg
HRB 36809 (AG Nürnberg), Geschäftsführer: Felix Imendörffer



More information about the Linux-nvme mailing list