[PATCH rfc 1/3] nvmet-rdma: automatic listening port re-activation
Max Gurtovoy
maxg at mellanox.com
Mon Mar 26 00:01:09 PDT 2018
On 3/22/2018 9:03 PM, Sagi Grimberg wrote:
> In case the device goes away (or resets) we get a device
> removal event (or .remove ib_client callback). So what
> we want is to destroy the listening cm_id and re-activate
> (or enable) when the same device comes back. Hence we introduce
> nvmet_rdma_port which stores the ib_device node guid, and when
> a new device comes in to the system (ib_client .add callback) we
> search for an existing listener port on this device and reconfigure
> the listener cm_id.
>
> Signed-off-by: Sagi Grimberg <sagi at grimberg.me>
> ---
> drivers/nvme/target/rdma.c | 223 +++++++++++++++++++++++++++------------------
> 1 file changed, 136 insertions(+), 87 deletions(-)
>
> diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c
> index d7831372e1f9..1b7f72925e9f 100644
> --- a/drivers/nvme/target/rdma.c
> +++ b/drivers/nvme/target/rdma.c
> @@ -119,6 +119,15 @@ struct nvmet_rdma_device {
> struct list_head entry;
> };
>
> +struct nvmet_rdma_port {
> + struct nvmet_port *nport;
> + struct sockaddr_storage addr;
> + struct rdma_cm_id *cm_id;
> + __be64 node_guid;
> + struct list_head entry;
> + struct delayed_work enable_work;
> +};
> +
> static bool nvmet_rdma_use_srq;
> module_param_named(use_srq, nvmet_rdma_use_srq, bool, 0444);
> MODULE_PARM_DESC(use_srq, "Use shared receive queue.");
> @@ -130,6 +139,9 @@ static DEFINE_MUTEX(nvmet_rdma_queue_mutex);
> static LIST_HEAD(device_list);
> static DEFINE_MUTEX(device_list_mutex);
>
> +static LIST_HEAD(port_list);
> +static DEFINE_MUTEX(port_list_mutex);
> +
> static bool nvmet_rdma_execute_command(struct nvmet_rdma_rsp *rsp);
> static void nvmet_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc);
> static void nvmet_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc);
> @@ -1130,6 +1142,7 @@ static int nvmet_rdma_cm_accept(struct rdma_cm_id *cm_id,
> static int nvmet_rdma_queue_connect(struct rdma_cm_id *cm_id,
> struct rdma_cm_event *event)
> {
> + struct nvmet_rdma_port *port = cm_id->context;
> struct nvmet_rdma_device *ndev;
> struct nvmet_rdma_queue *queue;
> int ret = -EINVAL;
> @@ -1145,7 +1158,7 @@ static int nvmet_rdma_queue_connect(struct rdma_cm_id *cm_id,
> ret = -ENOMEM;
> goto put_device;
> }
> - queue->port = cm_id->context;
> + queue->port = port->nport;
>
> if (queue->host_qid == 0) {
> /* Let inflight controller teardown complete */
> @@ -1252,53 +1265,6 @@ static void nvmet_rdma_queue_connect_fail(struct rdma_cm_id *cm_id,
> schedule_work(&queue->release_work);
> }
>
> -/**
> - * nvme_rdma_device_removal() - Handle RDMA device removal
> - * @cm_id: rdma_cm id, used for nvmet port
> - * @queue: nvmet rdma queue (cm id qp_context)
> - *
> - * DEVICE_REMOVAL event notifies us that the RDMA device is about
> - * to unplug. Note that this event can be generated on a normal
> - * queue cm_id and/or a device bound listener cm_id (where in this
> - * case queue will be null).
> - *
> - * We registered an ib_client to handle device removal for queues,
> - * so we only need to handle the listening port cm_ids. In this case
> - * we nullify the priv to prevent double cm_id destruction and destroying
> - * the cm_id implicitely by returning a non-zero rc to the callout.
> - */
> -static int nvmet_rdma_device_removal(struct rdma_cm_id *cm_id,
> - struct nvmet_rdma_queue *queue)
> -{
> - struct nvmet_port *port;
> -
> - if (queue) {
> - /*
> - * This is a queue cm_id. we have registered
> - * an ib_client to handle queues removal
> - * so don't interfear and just return.
> - */
> - return 0;
> - }
> -
> - port = cm_id->context;
> -
> - /*
> - * This is a listener cm_id. Make sure that
> - * future remove_port won't invoke a double
> - * cm_id destroy. use atomic xchg to make sure
> - * we don't compete with remove_port.
> - */
> - if (xchg(&port->priv, NULL) != cm_id)
> - return 0;
> -
> - /*
> - * We need to return 1 so that the core will destroy
> - * it's own ID. What a great API design..
> - */
> - return 1;
> -}
> -
> static int nvmet_rdma_cm_handler(struct rdma_cm_id *cm_id,
> struct rdma_cm_event *event)
> {
> @@ -1331,8 +1297,7 @@ static int nvmet_rdma_cm_handler(struct rdma_cm_id *cm_id,
> nvmet_rdma_queue_disconnect(queue);
> break;
> case RDMA_CM_EVENT_DEVICE_REMOVAL:
> - ret = nvmet_rdma_device_removal(cm_id, queue);
> - break;
> + break; /* handled by nvmet_rdma_remove_one */
> case RDMA_CM_EVENT_REJECTED:
> pr_debug("Connection rejected: %s\n",
> rdma_reject_msg(cm_id, event->status));
> @@ -1368,34 +1333,12 @@ static void nvmet_rdma_delete_ctrl(struct nvmet_ctrl *ctrl)
> mutex_unlock(&nvmet_rdma_queue_mutex);
> }
>
> -static int nvmet_rdma_add_port(struct nvmet_port *port)
> +static int nvmet_rdma_enable_port(struct nvmet_rdma_port *port)
> {
> + struct sockaddr *addr = (struct sockaddr *)&port->addr;
> struct rdma_cm_id *cm_id;
> - struct sockaddr_storage addr = { };
> - __kernel_sa_family_t af;
> int ret;
>
> - switch (port->disc_addr.adrfam) {
> - case NVMF_ADDR_FAMILY_IP4:
> - af = AF_INET;
> - break;
> - case NVMF_ADDR_FAMILY_IP6:
> - af = AF_INET6;
> - break;
> - default:
> - pr_err("address family %d not supported\n",
> - port->disc_addr.adrfam);
> - return -EINVAL;
> - }
> -
> - ret = inet_pton_with_scope(&init_net, af, port->disc_addr.traddr,
> - port->disc_addr.trsvcid, &addr);
> - if (ret) {
> - pr_err("malformed ip/port passed: %s:%s\n",
> - port->disc_addr.traddr, port->disc_addr.trsvcid);
> - return ret;
> - }
> -
> cm_id = rdma_create_id(&init_net, nvmet_rdma_cm_handler, port,
> RDMA_PS_TCP, IB_QPT_RC);
> if (IS_ERR(cm_id)) {
> @@ -1413,23 +1356,22 @@ static int nvmet_rdma_add_port(struct nvmet_port *port)
> goto out_destroy_id;
> }
>
> - ret = rdma_bind_addr(cm_id, (struct sockaddr *)&addr);
> + ret = rdma_bind_addr(cm_id, addr);
> if (ret) {
> - pr_err("binding CM ID to %pISpcs failed (%d)\n",
> - (struct sockaddr *)&addr, ret);
> + pr_err("binding CM ID to %pISpcs failed (%d)\n", addr, ret);
> goto out_destroy_id;
> }
>
> ret = rdma_listen(cm_id, 128);
> if (ret) {
> - pr_err("listening to %pISpcs failed (%d)\n",
> - (struct sockaddr *)&addr, ret);
> + pr_err("listening to %pISpcs failed (%d)\n", addr, ret);
> goto out_destroy_id;
> }
>
> - pr_info("enabling port %d (%pISpcs)\n",
> - le16_to_cpu(port->disc_addr.portid), (struct sockaddr *)&addr);
> - port->priv = cm_id;
> + port->cm_id = cm_id;
> + if (cm_id->device)
Can device be NULL here?
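If it can (if I'm not mistaken it stays NULL for a wildcard listen, since the
listener cm_id isn't bound to a single device in that case), maybe make that
case explicit -- just a sketch, the debug message is mine:

	port->cm_id = cm_id;
	if (cm_id->device)
		port->node_guid = cm_id->device->node_guid;
	else
		pr_debug("listener on %pISpcs not bound to a device\n", addr);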
> + port->node_guid = cm_id->device->node_guid;
> +
> return 0;
>
> out_destroy_id:
> @@ -1437,18 +1379,100 @@ static int nvmet_rdma_add_port(struct nvmet_port *port)
> return ret;
> }
>
> -static void nvmet_rdma_remove_port(struct nvmet_port *port)
> +static void nvmet_rdma_enable_port_work(struct work_struct *w)
> +{
> + struct nvmet_rdma_port *port = container_of(to_delayed_work(w),
> + struct nvmet_rdma_port, enable_work);
> + int ret;
> +
> + ret = nvmet_rdma_enable_port(port);
> + if (ret)
> + schedule_delayed_work(&port->enable_work, HZ);
I think it's better to schedule it after 5 seconds, since we use the
system_wq and this is a recovery process.
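Something like this, perhaps (just a sketch; the define name is mine):

	#define NVMET_RDMA_ENABLE_RETRY	(5 * HZ)	/* hypothetical name */

	static void nvmet_rdma_enable_port_work(struct work_struct *w)
	{
		struct nvmet_rdma_port *port = container_of(to_delayed_work(w),
				struct nvmet_rdma_port, enable_work);

		if (nvmet_rdma_enable_port(port))
			schedule_delayed_work(&port->enable_work,
					      NVMET_RDMA_ENABLE_RETRY);
	}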
> +}
> +
> +static int nvmet_rdma_add_port(struct nvmet_port *nport)
> +{
> + struct nvmet_rdma_port *port;
> + __kernel_sa_family_t af;
> + int ret;
> +
> + port = kzalloc(sizeof(*port), GFP_KERNEL);
> + if (!port)
> + return -ENOMEM;
> +
> + switch (nport->disc_addr.adrfam) {
> + case NVMF_ADDR_FAMILY_IP4:
> + af = AF_INET;
> + break;
> + case NVMF_ADDR_FAMILY_IP6:
> + af = AF_INET6;
> + break;
> + default:
> + pr_err("address family %d not supported\n",
> + nport->disc_addr.adrfam);
> + ret = -EINVAL;
> + goto out_free_port;
> + }
> +
> + ret = inet_pton_with_scope(&init_net, af, nport->disc_addr.traddr,
> + nport->disc_addr.trsvcid, &port->addr);
> + if (ret) {
> + pr_err("malformed ip/port passed: %s:%s\n",
> + nport->disc_addr.traddr, nport->disc_addr.trsvcid);
> + goto out_free_port;
> + }
> +
> + ret = nvmet_rdma_enable_port(port);
> + if(ret)
> + goto out_free_port;
> +
> + pr_info("enabling port %d (%pISpcs)\n",
> + le16_to_cpu(nport->disc_addr.portid),
> + (struct sockaddr *)&port->addr);
> +
> + nport->priv = port;
> + port->nport = nport;
> + INIT_DELAYED_WORK(&port->enable_work, nvmet_rdma_enable_port_work);
> +
> + mutex_lock(&port_list_mutex);
> + list_add_tail(&port->entry, &port_list);
> + mutex_unlock(&port_list_mutex);
> +
> + return 0;
> +
> +out_free_port:
> + kfree(port);
> + return ret;
> +}
> +
> +static void nvmet_rdma_disable_port(struct nvmet_rdma_port *port)
> {
> - struct rdma_cm_id *cm_id = xchg(&port->priv, NULL);
> + struct rdma_cm_id *cm_id = port->cm_id;
>
> + port->cm_id = NULL;
> if (cm_id)
> rdma_destroy_id(cm_id);
> }
>
> +static void nvmet_rdma_remove_port(struct nvmet_port *nport)
> +{
> + struct nvmet_rdma_port *port = nport->priv;
> +
> + mutex_lock(&port_list_mutex);
> + list_del(&port->entry);
> + mutex_unlock(&port_list_mutex);
> +
> + cancel_delayed_work_sync(&port->enable_work);
> +
> + nvmet_rdma_disable_port(port);
> + kfree(port);
> +}
> +
> static void nvmet_rdma_disc_port_addr(struct nvmet_req *req,
> - struct nvmet_port *port, char *traddr)
> + struct nvmet_port *nport, char *traddr)
> {
> - struct rdma_cm_id *cm_id = port->priv;
> + struct nvmet_rdma_port *port = nport->priv;
> + struct rdma_cm_id *cm_id = port->cm_id;
>
> if (inet_addr_is_any((struct sockaddr *)&cm_id->route.addr.src_addr)) {
> struct nvmet_rdma_rsp *rsp =
> @@ -1458,7 +1482,7 @@ static void nvmet_rdma_disc_port_addr(struct nvmet_req *req,
>
> sprintf(traddr, "%pISc", addr);
> } else {
> - memcpy(traddr, port->disc_addr.traddr, NVMF_TRADDR_SIZE);
> + memcpy(traddr, nport->disc_addr.traddr, NVMF_TRADDR_SIZE);
> }
> }
>
> @@ -1475,9 +1499,24 @@ static struct nvmet_fabrics_ops nvmet_rdma_ops = {
> .disc_traddr = nvmet_rdma_disc_port_addr,
> };
>
> +static void nvmet_rdma_add_one(struct ib_device *ib_device)
> +{
> + struct nvmet_rdma_port *port, *n;
> +
> + mutex_lock(&port_list_mutex);
> + list_for_each_entry_safe(port, n, &port_list, entry) {
> + if (port->node_guid != ib_device->node_guid)
> + continue;
> +
> + schedule_delayed_work(&port->enable_work, HZ);
> + }
> + mutex_unlock(&port_list_mutex);
> +}
> +
> static void nvmet_rdma_remove_one(struct ib_device *ib_device, void *client_data)
> {
> struct nvmet_rdma_queue *queue, *tmp;
> + struct nvmet_rdma_port *port, *n;
>
> /* Device is being removed, delete all queues using this device */
> mutex_lock(&nvmet_rdma_queue_mutex);
> @@ -1492,11 +1531,21 @@ static void nvmet_rdma_remove_one(struct ib_device *ib_device, void *client_data
> }
> mutex_unlock(&nvmet_rdma_queue_mutex);
>
> + mutex_lock(&port_list_mutex);
> + list_for_each_entry_safe(port, n, &port_list, entry) {
> + if (port->node_guid != ib_device->node_guid)
> + continue;
> +
> + nvmet_rdma_disable_port(port);
> + }
> + mutex_unlock(&port_list_mutex);
> +
I'm not sure this is rebased on top of it, but please check my patch
"nvmet-rdma: Don't flush system_wq by default during remove_one".
We need to add a condition here:

	if (found)
		flush_scheduled_work();

and remove it from above.
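Something like this (just a sketch of what I mean, assuming 'found' is also
set in the queue loop as in that patch):

	bool found = false;
	...
	mutex_lock(&port_list_mutex);
	list_for_each_entry_safe(port, n, &port_list, entry) {
		if (port->node_guid != ib_device->node_guid)
			continue;

		nvmet_rdma_disable_port(port);
		found = true;
	}
	mutex_unlock(&port_list_mutex);

	if (found)
		flush_scheduled_work();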
> flush_scheduled_work();
> }
>
> static struct ib_client nvmet_rdma_ib_client = {
> .name = "nvmet_rdma",
> + .add = nvmet_rdma_add_one,
> .remove = nvmet_rdma_remove_one
> };
>
>
We've started testing this and it looks good with the above changes. We'll
increase the scale today.
The last issue we saw is some tabs/spaces mixture.
Cheers,
Max.