[PATCH V2 1/2] nvme-core: use xarray for ctrl ns tracking

Chaitanya Kulkarni chaitanya.kulkarni at wdc.com
Tue Jun 30 22:25:16 EDT 2020


This patch replaces the ctrl->namespaces tracking from linked list to
xarray and improves the performance.

Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni at wdc.com>
---
 drivers/nvme/host/core.c      | 235 ++++++++++++++++++++--------------
 drivers/nvme/host/multipath.c |  15 +--
 drivers/nvme/host/nvme.h      |   5 +-
 3 files changed, 145 insertions(+), 110 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index e62fdc208b27..10e1fda8a21d 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -437,10 +437,8 @@ static void nvme_put_ns_head(struct nvme_ns_head *head)
 	kref_put(&head->ref, nvme_free_ns_head);
 }
 
-static void nvme_free_ns(struct kref *kref)
+static void __nvme_free_ns(struct nvme_ns *ns)
 {
-	struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref);
-
 	if (ns->ndev)
 		nvme_nvm_unregister(ns);
 
@@ -450,6 +448,13 @@ static void nvme_free_ns(struct kref *kref)
 	kfree(ns);
 }
 
+static void nvme_free_ns(struct kref *kref)
+{
+	struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref);
+
+	__nvme_free_ns(ns);
+}
+
 static void nvme_put_ns(struct nvme_ns *ns)
 {
 	kref_put(&ns->kref, nvme_free_ns);
@@ -1381,12 +1386,11 @@ static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 static void nvme_update_formats(struct nvme_ctrl *ctrl)
 {
 	struct nvme_ns *ns;
+	unsigned long idx;
 
-	down_read(&ctrl->namespaces_rwsem);
-	list_for_each_entry(ns, &ctrl->namespaces, list)
+	xa_for_each(&ctrl->namespaces, idx, ns)
 		if (ns->disk && nvme_revalidate_disk(ns->disk))
 			nvme_set_queue_dying(ns);
-	up_read(&ctrl->namespaces_rwsem);
 }
 
 static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects)
@@ -3063,34 +3067,36 @@ static int nvme_dev_open(struct inode *inode, struct file *file)
 
 static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp)
 {
+	struct nvme_id_ctrl *id;
 	struct nvme_ns *ns;
-	int ret;
+	int ret = 0;
 
-	down_read(&ctrl->namespaces_rwsem);
-	if (list_empty(&ctrl->namespaces)) {
+	if (xa_empty(&ctrl->namespaces)) {
 		ret = -ENOTTY;
-		goto out_unlock;
+		goto out;
 	}
 
-	ns = list_first_entry(&ctrl->namespaces, struct nvme_ns, list);
-	if (ns != list_last_entry(&ctrl->namespaces, struct nvme_ns, list)) {
-		dev_warn(ctrl->device,
-			"NVME_IOCTL_IO_CMD not supported when multiple namespaces present!\n");
+	/* Let the scan work finish updating ctrl->namespaces */
+	flush_work(&ctrl->scan_work);
+	if (nvme_identify_ctrl(ctrl, &id)) {
+		dev_err(ctrl->device, "nvme_identify_ctrl() failed\n");
 		ret = -EINVAL;
-		goto out_unlock;
+		goto out;
+	}
+	if (le32_to_cpu(id->nn) > 1) {
+		dev_warn(ctrl->device,
+		"NVME_IOCTL_IO_CMD not supported when multiple namespaces present!\n");
+		goto out;
 	}
 
 	dev_warn(ctrl->device,
 		"using deprecated NVME_IOCTL_IO_CMD ioctl on the char device!\n");
 	kref_get(&ns->kref);
-	up_read(&ctrl->namespaces_rwsem);
 
 	ret = nvme_user_cmd(ctrl, ns, argp);
 	nvme_put_ns(ns);
-	return ret;
-
-out_unlock:
-	up_read(&ctrl->namespaces_rwsem);
+out:
+	kfree(id);
 	return ret;
 }
 
@@ -3590,31 +3596,21 @@ static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid,
 	return ret;
 }
 
-static int ns_cmp(void *priv, struct list_head *a, struct list_head *b)
-{
-	struct nvme_ns *nsa = container_of(a, struct nvme_ns, list);
-	struct nvme_ns *nsb = container_of(b, struct nvme_ns, list);
-
-	return nsa->head->ns_id - nsb->head->ns_id;
-}
-
 static struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 {
-	struct nvme_ns *ns, *ret = NULL;
+	struct nvme_ns *ns;
+	XA_STATE(xas, &ctrl->namespaces, nsid);
 
-	down_read(&ctrl->namespaces_rwsem);
-	list_for_each_entry(ns, &ctrl->namespaces, list) {
-		if (ns->head->ns_id == nsid) {
-			if (!kref_get_unless_zero(&ns->kref))
-				continue;
-			ret = ns;
-			break;
-		}
-		if (ns->head->ns_id > nsid)
-			break;
-	}
-	up_read(&ctrl->namespaces_rwsem);
-	return ret;
+	rcu_read_lock();
+	do {
+		ns = xas_load(&xas);
+		if (xa_is_zero(ns))
+			ns = NULL;
+	} while (xas_retry(&xas, ns));
+	ns = ns && kref_get_unless_zero(&ns->kref) ? ns : NULL;
+	rcu_read_unlock();
+
+	return ns;
 }
 
 static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
@@ -3684,9 +3680,19 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 		}
 	}
 
-	down_write(&ctrl->namespaces_rwsem);
-	list_add_tail(&ns->list, &ctrl->namespaces);
-	up_write(&ctrl->namespaces_rwsem);
+	ret = xa_insert(&ctrl->namespaces, nsid, ns, GFP_KERNEL);
+	if (ret) {
+		switch (ret) {
+		case -ENOMEM:
+			dev_err(ctrl->device,
+				"xa insert memory allocation\n");
+			break;
+		case -EBUSY:
+			dev_err(ctrl->device,
+				"xa insert entry already present\n");
+			break;
+		}
+	}
 
 	nvme_get_ctrl(ctrl);
 
@@ -3718,6 +3724,9 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 
 static void nvme_ns_remove(struct nvme_ns *ns)
 {
+	struct xarray *xa = &ns->ctrl->namespaces;
+	bool free;
+
 	if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags))
 		return;
 
@@ -3740,12 +3749,14 @@ static void nvme_ns_remove(struct nvme_ns *ns)
 			blk_integrity_unregister(ns->disk);
 	}
 
-	down_write(&ns->ctrl->namespaces_rwsem);
-	list_del_init(&ns->list);
-	up_write(&ns->ctrl->namespaces_rwsem);
+	xa_lock(xa);
+	__xa_erase(xa, ns->head->ns_id);
+	free = refcount_dec_and_test(&ns->kref.refcount) ? true : false;
+	xa_unlock(xa);
 
 	nvme_mpath_check_last_path(ns);
-	nvme_put_ns(ns);
+	if (free)
+		__nvme_free_ns(ns);
 }
 
 static void nvme_ns_remove_by_nsid(struct nvme_ctrl *ctrl, u32 nsid)
@@ -3774,19 +3785,38 @@ static void nvme_validate_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
 					unsigned nsid)
 {
-	struct nvme_ns *ns, *next;
-	LIST_HEAD(rm_list);
+	struct xarray *namespaces = &ctrl->namespaces;
+	struct xarray rm_array;
+	unsigned long tnsid;
+	struct nvme_ns *ns;
+	unsigned long idx;
+	int ret;
 
-	down_write(&ctrl->namespaces_rwsem);
-	list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) {
-		if (ns->head->ns_id > nsid || test_bit(NVME_NS_DEAD, &ns->flags))
-			list_move_tail(&ns->list, &rm_list);
+	xa_init(&rm_array);
+
+	xa_lock(namespaces);
+	xa_for_each(namespaces, idx, ns) {
+		tnsid = ns->head->ns_id;
+		if (tnsid > nsid || test_bit(NVME_NS_DEAD, &ns->flags)) {
+			xa_unlock(namespaces);
+			xa_erase(namespaces, tnsid);
+			/* Even if insert fails keep going */
+			ret = xa_insert(&rm_array, nsid, ns, GFP_KERNEL);
+			switch (ret) {
+			case -ENOMEM:
+				pr_err("xa insert memory allocation failed\n");
+				break;
+			case -EBUSY:
+				pr_err("xa insert entry already present\n");
+				break;
+			}
+			xa_lock(namespaces);
+		}
 	}
-	up_write(&ctrl->namespaces_rwsem);
+	xa_unlock(namespaces);
 
-	list_for_each_entry_safe(ns, next, &rm_list, list)
+	xa_for_each(&rm_array, idx, ns)
 		nvme_ns_remove(ns);
-
 }
 
 static int nvme_scan_ns_list(struct nvme_ctrl *ctrl)
@@ -3884,10 +3914,6 @@ static void nvme_scan_work(struct work_struct *work)
 	if (nvme_scan_ns_list(ctrl) != 0)
 		nvme_scan_ns_sequential(ctrl);
 	mutex_unlock(&ctrl->scan_lock);
-
-	down_write(&ctrl->namespaces_rwsem);
-	list_sort(NULL, &ctrl->namespaces, ns_cmp);
-	up_write(&ctrl->namespaces_rwsem);
 }
 
 /*
@@ -3897,8 +3923,13 @@ static void nvme_scan_work(struct work_struct *work)
  */
 void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
 {
-	struct nvme_ns *ns, *next;
-	LIST_HEAD(ns_list);
+	struct xarray rm_array;
+	unsigned long tnsid;
+	struct nvme_ns *ns;
+	unsigned long idx;
+	int ret;
+
+	xa_init(&rm_array);
 
 	/*
 	 * make sure to requeue I/O to all namespaces as these
@@ -3919,11 +3950,30 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
 	if (ctrl->state == NVME_CTRL_DEAD)
 		nvme_kill_queues(ctrl);
 
-	down_write(&ctrl->namespaces_rwsem);
-	list_splice_init(&ctrl->namespaces, &ns_list);
-	up_write(&ctrl->namespaces_rwsem);
+	xa_lock(&ctrl->namespaces);
+	xa_for_each(&ctrl->namespaces, idx, ns) {
+		tnsid = ns->head->ns_id;
+		xa_unlock(&ctrl->namespaces);
+		xa_erase(&ctrl->namespaces, tnsid);
+		/* Even if insert fails keep going */
+		ret = xa_insert(&rm_array, tnsid, ns, GFP_KERNEL);
+		if (ret) {
+			switch (ret) {
+			case -ENOMEM:
+				dev_err(ctrl->device,
+					"xa insert memory allocation\n");
+				break;
+			case -EBUSY:
+				dev_err(ctrl->device,
+					"xa insert entry already present\n");
+				break;
+			}
+		}
+		xa_lock(&ctrl->namespaces);
+	}
+	xa_unlock(&ctrl->namespaces);
 
-	list_for_each_entry_safe(ns, next, &ns_list, list)
+	xa_for_each(&rm_array, idx, ns)
 		nvme_ns_remove(ns);
 }
 EXPORT_SYMBOL_GPL(nvme_remove_namespaces);
@@ -4144,6 +4194,9 @@ static void nvme_free_ctrl(struct device *dev)
 	if (subsys && ctrl->instance != subsys->instance)
 		ida_simple_remove(&nvme_instance_ida, ctrl->instance);
 
+	WARN_ON_ONCE(!xa_empty(&ctrl->namespaces));
+
+	xa_destroy(&ctrl->namespaces);
 	kfree(ctrl->effects);
 	nvme_mpath_uninit(ctrl);
 	__free_page(ctrl->discard_page);
@@ -4174,8 +4227,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
 	ctrl->state = NVME_CTRL_NEW;
 	spin_lock_init(&ctrl->lock);
 	mutex_init(&ctrl->scan_lock);
-	INIT_LIST_HEAD(&ctrl->namespaces);
-	init_rwsem(&ctrl->namespaces_rwsem);
+	xa_init(&ctrl->namespaces);
 	ctrl->dev = dev;
 	ctrl->ops = ops;
 	ctrl->quirks = quirks;
@@ -4255,98 +4307,87 @@ EXPORT_SYMBOL_GPL(nvme_init_ctrl);
 void nvme_kill_queues(struct nvme_ctrl *ctrl)
 {
 	struct nvme_ns *ns;
-
-	down_read(&ctrl->namespaces_rwsem);
+	unsigned long idx;
 
 	/* Forcibly unquiesce queues to avoid blocking dispatch */
 	if (ctrl->admin_q && !blk_queue_dying(ctrl->admin_q))
 		blk_mq_unquiesce_queue(ctrl->admin_q);
 
-	list_for_each_entry(ns, &ctrl->namespaces, list)
+	xa_for_each(&ctrl->namespaces, idx, ns)
 		nvme_set_queue_dying(ns);
-
-	up_read(&ctrl->namespaces_rwsem);
 }
 EXPORT_SYMBOL_GPL(nvme_kill_queues);
 
 void nvme_unfreeze(struct nvme_ctrl *ctrl)
 {
 	struct nvme_ns *ns;
+	unsigned long idx;
 
-	down_read(&ctrl->namespaces_rwsem);
-	list_for_each_entry(ns, &ctrl->namespaces, list)
+	xa_for_each(&ctrl->namespaces, idx, ns)
 		blk_mq_unfreeze_queue(ns->queue);
-	up_read(&ctrl->namespaces_rwsem);
 }
 EXPORT_SYMBOL_GPL(nvme_unfreeze);
 
 void nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout)
 {
 	struct nvme_ns *ns;
+	unsigned long idx;
 
-	down_read(&ctrl->namespaces_rwsem);
-	list_for_each_entry(ns, &ctrl->namespaces, list) {
+	xa_for_each(&ctrl->namespaces, idx, ns) {
 		timeout = blk_mq_freeze_queue_wait_timeout(ns->queue, timeout);
 		if (timeout <= 0)
 			break;
 	}
-	up_read(&ctrl->namespaces_rwsem);
 }
 EXPORT_SYMBOL_GPL(nvme_wait_freeze_timeout);
 
 void nvme_wait_freeze(struct nvme_ctrl *ctrl)
 {
 	struct nvme_ns *ns;
+	unsigned long idx;
 
-	down_read(&ctrl->namespaces_rwsem);
-	list_for_each_entry(ns, &ctrl->namespaces, list)
+	xa_for_each(&ctrl->namespaces, idx, ns)
 		blk_mq_freeze_queue_wait(ns->queue);
-	up_read(&ctrl->namespaces_rwsem);
 }
 EXPORT_SYMBOL_GPL(nvme_wait_freeze);
 
 void nvme_start_freeze(struct nvme_ctrl *ctrl)
 {
 	struct nvme_ns *ns;
+	unsigned long idx;
 
-	down_read(&ctrl->namespaces_rwsem);
-	list_for_each_entry(ns, &ctrl->namespaces, list)
+	xa_for_each(&ctrl->namespaces, idx, ns)
 		blk_freeze_queue_start(ns->queue);
-	up_read(&ctrl->namespaces_rwsem);
 }
 EXPORT_SYMBOL_GPL(nvme_start_freeze);
 
 void nvme_stop_queues(struct nvme_ctrl *ctrl)
 {
 	struct nvme_ns *ns;
+	unsigned long idx;
 
-	down_read(&ctrl->namespaces_rwsem);
-	list_for_each_entry(ns, &ctrl->namespaces, list)
+	xa_for_each(&ctrl->namespaces, idx, ns)
 		blk_mq_quiesce_queue(ns->queue);
-	up_read(&ctrl->namespaces_rwsem);
 }
 EXPORT_SYMBOL_GPL(nvme_stop_queues);
 
 void nvme_start_queues(struct nvme_ctrl *ctrl)
 {
 	struct nvme_ns *ns;
+	unsigned long idx;
 
-	down_read(&ctrl->namespaces_rwsem);
-	list_for_each_entry(ns, &ctrl->namespaces, list)
+	xa_for_each(&ctrl->namespaces, idx, ns)
 		blk_mq_unquiesce_queue(ns->queue);
-	up_read(&ctrl->namespaces_rwsem);
 }
 EXPORT_SYMBOL_GPL(nvme_start_queues);
 
-
 void nvme_sync_queues(struct nvme_ctrl *ctrl)
 {
 	struct nvme_ns *ns;
+	unsigned long idx;
 
-	down_read(&ctrl->namespaces_rwsem);
-	list_for_each_entry(ns, &ctrl->namespaces, list)
+	xa_for_each(&ctrl->namespaces, idx, ns)
 		blk_sync_queue(ns->queue);
-	up_read(&ctrl->namespaces_rwsem);
 
 	if (ctrl->admin_q)
 		blk_sync_queue(ctrl->admin_q);
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 18d084ed497e..18674735c4bc 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -115,13 +115,12 @@ bool nvme_failover_req(struct request *req)
 void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
 {
 	struct nvme_ns *ns;
+	unsigned long idx;
 
-	down_read(&ctrl->namespaces_rwsem);
-	list_for_each_entry(ns, &ctrl->namespaces, list) {
+	xa_for_each(&ctrl->namespaces, idx, ns) {
 		if (ns->head->disk)
 			kblockd_schedule_work(&ns->head->requeue_work);
 	}
-	up_read(&ctrl->namespaces_rwsem);
 }
 
 static const char *nvme_ana_state_names[] = {
@@ -155,13 +154,12 @@ bool nvme_mpath_clear_current_path(struct nvme_ns *ns)
 void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl)
 {
 	struct nvme_ns *ns;
+	unsigned long idx;
 
 	mutex_lock(&ctrl->scan_lock);
-	down_read(&ctrl->namespaces_rwsem);
-	list_for_each_entry(ns, &ctrl->namespaces, list)
+	xa_for_each(&ctrl->namespaces, idx, ns)
 		if (nvme_mpath_clear_current_path(ns))
 			kblockd_schedule_work(&ns->head->requeue_work);
-	up_read(&ctrl->namespaces_rwsem);
 	mutex_unlock(&ctrl->scan_lock);
 }
 
@@ -497,6 +495,7 @@ static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
 	u32 nr_nsids = le32_to_cpu(desc->nnsids), n = 0;
 	unsigned *nr_change_groups = data;
 	struct nvme_ns *ns;
+	unsigned long idx;
 
 	dev_dbg(ctrl->device, "ANA group %d: %s.\n",
 			le32_to_cpu(desc->grpid),
@@ -508,8 +507,7 @@ static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
 	if (!nr_nsids)
 		return 0;
 
-	down_read(&ctrl->namespaces_rwsem);
-	list_for_each_entry(ns, &ctrl->namespaces, list) {
+	xa_for_each(&ctrl->namespaces, idx, ns) {
 		unsigned nsid = le32_to_cpu(desc->nsids[n]);
 
 		if (ns->head->ns_id < nsid)
@@ -519,7 +517,6 @@ static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
 		if (++n == nr_nsids)
 			break;
 	}
-	up_read(&ctrl->namespaces_rwsem);
 	return 0;
 }
 
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 2ef8d501e2a8..cff40e567bee 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -206,8 +206,7 @@ struct nvme_ctrl {
 	int numa_node;
 	struct blk_mq_tag_set *tagset;
 	struct blk_mq_tag_set *admin_tagset;
-	struct list_head namespaces;
-	struct rw_semaphore namespaces_rwsem;
+	struct xarray namespaces;
 	struct device ctrl_device;
 	struct device *device;	/* char device */
 	struct cdev cdev;
@@ -376,8 +375,6 @@ enum nvme_ns_features {
 };
 
 struct nvme_ns {
-	struct list_head list;
-
 	struct nvme_ctrl *ctrl;
 	struct request_queue *queue;
 	struct gendisk *disk;
-- 
2.26.0




More information about the Linux-nvme mailing list