[RFC PATCH 3/3] nvme: implement SQ associations
Andrey Nikitin
nikitina at amazon.com
Fri Sep 24 14:08:09 PDT 2021
This patch implements the SQ association feature for the PCI host.
If the controller indicates support for NVM sets and SQ associations,
and supports a sufficient number of queues, each NVM set gets its own
blk-mq tagset and queues are distributed uniformly across the NVM
sets (see the worked example after the diffstat below).
If the number of MSI-X vectors supported by the controller is less
than the total number of queues across all NVM sets, vectors are
shared by queues associated with different NVM sets and assigned in
a round-robin fashion, as sketched below.
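For illustration, the resulting queue-to-vector mapping can be
modeled as follows (a standalone sketch of the logic this patch adds
to nvme_create_queue(), not itself part of the diff; qid and num_vecs
mirror the driver's qid and dev->num_vecs):

	/*
	 * Vector 0 serves the admin queue; I/O queue `qid` (1-based)
	 * round-robins over the remaining num_vecs - 1 vectors, so
	 * with e.g. num_vecs = 4, qids 1,2,3,4,5 map to vectors
	 * 1,2,3,1,2.
	 */
	static int sketch_vector_for_qid(int qid, int num_vecs)
	{
		if (num_vecs == 1)
			return 0;	/* single-vector controller */
		return ((qid - 1) % (num_vecs - 1)) + 1;
	}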
Signed-off-by: Andrey Nikitin <nikitina at amazon.com>
---
drivers/nvme/host/core.c | 17 ++-
drivers/nvme/host/fc.c | 1 +
drivers/nvme/host/nvme.h | 2 +
drivers/nvme/host/pci.c | 215 ++++++++++++++++++++++++++++---------
drivers/nvme/host/rdma.c | 1 +
drivers/nvme/host/tcp.c | 1 +
drivers/nvme/target/loop.c | 1 +
7 files changed, 182 insertions(+), 56 deletions(-)
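As a worked example of the uniform distribution and the
namespace-to-tagset mapping (a sketch mirroring the arithmetic in
nvme_create_io_queues() and nvme_alloc_ns() below; the helper names
are illustrative, not part of the patch):

	/*
	 * With `max` I/O queues split across `nsets` NVM sets, queue i
	 * (1-based) is associated with set ((i - 1) / (max / nsets)) + 1:
	 * e.g. max = 8, nsets = 2 puts queues 1-4 in set 1 and queues
	 * 5-8 in set 2. nvme_setup_io_queues() rounds the queue count
	 * down to a multiple of nsets, so max / nsets >= 1 here.
	 */
	static int sketch_qassoc(int i, int max, int nsets)
	{
		return ((i - 1) / (max / nsets)) + 1;
	}

	/*
	 * A namespace reporting NVM Set Identifier `nvmsetid` is served
	 * by tagset nvmsetid - 1; a zero or out-of-range identifier
	 * falls back to the first tagset.
	 */
	static int sketch_tagset_idx(int nvmsetid, int nr_tagsets)
	{
		if (nvmsetid && nvmsetid <= nr_tagsets)
			return nvmsetid - 1;
		return 0;
	}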
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 63d6a2162e72..6aa6163ec407 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -412,10 +412,16 @@ EXPORT_SYMBOL_GPL(nvme_cancel_request);
void nvme_cancel_tagset(struct nvme_ctrl *ctrl)
{
- if (ctrl->tagset) {
- blk_mq_tagset_busy_iter(ctrl->tagset,
+ int i;
+
+ if (!ctrl->tagset) {
+ return;
+ }
+
+ for (i = 0; i < ctrl->nr_tagsets; i++) {
+ blk_mq_tagset_busy_iter(&ctrl->tagset[i],
nvme_cancel_request, ctrl);
- blk_mq_tagset_wait_completed_request(ctrl->tagset);
+ blk_mq_tagset_wait_completed_request(&ctrl->tagset[i]);
}
}
EXPORT_SYMBOL_GPL(nvme_cancel_tagset);
@@ -3716,6 +3722,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid,
struct gendisk *disk;
struct nvme_id_ns *id;
int node = ctrl->numa_node;
+ int tagset_idx;
if (nvme_identify_ns(ctrl, nsid, ids, &id))
return;
@@ -3724,7 +3731,9 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid,
if (!ns)
goto out_free_id;
- disk = blk_mq_alloc_disk(ctrl->tagset, ns);
+ tagset_idx = id->nvmsetid && le16_to_cpu(id->nvmsetid) <= ctrl->nr_tagsets ?
+ le16_to_cpu(id->nvmsetid) - 1 : 0;
+ disk = blk_mq_alloc_disk(&ctrl->tagset[tagset_idx], ns);
if (IS_ERR(disk))
goto out_free_ns;
disk->fops = &nvme_bdev_ops;
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index b08a61ca283f..e3eaee68d436 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -2889,6 +2889,7 @@ nvme_fc_create_io_queues(struct nvme_fc_ctrl *ctrl)
return ret;
ctrl->ctrl.tagset = &ctrl->tag_set;
+ ctrl->ctrl.nr_tagsets = 1;
ctrl->ctrl.connect_q = blk_mq_init_queue(&ctrl->tag_set);
if (IS_ERR(ctrl->ctrl.connect_q)) {
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 03be6c913216..ceb3a2c4f99b 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -235,6 +235,8 @@ struct nvme_ctrl {
struct device *dev;
int instance;
int numa_node;
+ int nr_tagsets;
+ int nr_tagset_queues;
struct blk_mq_tag_set *tagset;
struct blk_mq_tag_set *admin_tagset;
struct list_head namespaces;
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 37de7ac79b3f..0d6f33399a26 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -114,6 +114,8 @@ static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode);
*/
struct nvme_queue {
struct nvme_dev *dev;
+ int tagset_idx;
+ int hctx_idx;
spinlock_t sq_lock;
void *sq_cmds;
/* only used for poll queues: */
@@ -147,7 +149,8 @@ struct nvme_queue {
*/
struct nvme_dev {
struct nvme_queue *queues;
- struct blk_mq_tag_set tagset;
+ struct blk_mq_tag_set *tagset;
+ unsigned int tagset_hw_queues;
struct nvme_queue admin_queue;
struct blk_mq_tag_set admin_tagset;
u32 __iomem *dbs;
@@ -368,6 +371,13 @@ static bool nvme_dbbuf_update_and_check_event(u16 value, u32 *dbbuf_db,
return true;
}
+static bool nvme_check_qassoc(struct nvme_dev *dev, unsigned int nr_io_queues)
+{
+ return nvme_is_qassoc_supported(&dev->ctrl) &&
+ !dev->nr_write_queues && !dev->nr_poll_queues &&
+ nr_io_queues >= dev->ctrl.nsetidmax;
+}
+
/*
* Will slightly overestimate the number of pages needed. This is OK
* as it only leads to a small amount of wasted memory for the lifetime of
@@ -415,9 +425,23 @@ static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
unsigned int hctx_idx)
{
struct nvme_dev *dev = data;
- struct nvme_queue *nvmeq = &dev->queues[hctx_idx];
+ struct nvme_queue *nvmeq = NULL;
+ int i;
- WARN_ON(dev->tagset.tags[hctx_idx] != hctx->tags);
+ for (i = 0; i < dev->ctrl.nr_tagsets; i++)
+ if (dev->tagset[i].tags[hctx_idx] == hctx->tags) {
+ nvmeq = &dev->queues[i * dev->tagset_hw_queues + hctx_idx];
+ break;
+ }
+
+ if (!nvmeq) {
+ dev_warn(dev->ctrl.device,
+ "Device queue not found for hctx %d\n", hctx_idx);
+ return -EINVAL;
+ }
+
+ WARN_ON(dev->tagset[nvmeq->tagset_idx].tags[hctx_idx] != hctx->tags);
+ nvmeq->hctx_idx = hctx_idx;
hctx->driver_data = nvmeq;
return 0;
}
@@ -442,8 +466,18 @@ static int nvme_init_request(struct blk_mq_tag_set *set, struct request *req,
{
struct nvme_dev *dev = set->driver_data;
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
- int queue_idx = hctx_idx;
- struct nvme_queue *nvmeq = &dev->queues[queue_idx];
+ int tagset_idx, queue_idx;
+ struct nvme_queue *nvmeq;
+
+ if (set < dev->tagset || set >= &dev->tagset[dev->ctrl.nr_tagsets]) {
+ dev_warn(dev->ctrl.device,
+ "Invalid tagset for request init %p\n", set);
+ return -EINVAL;
+ }
+
+ tagset_idx = set - dev->tagset;
+ queue_idx = tagset_idx * dev->tagset_hw_queues + hctx_idx;
+ nvmeq = &dev->queues[queue_idx];
BUG_ON(!nvmeq);
iod->nvmeq = nvmeq;
@@ -471,7 +505,8 @@ static int nvme_pci_map_queues(struct blk_mq_tag_set *set)
for (i = 0, qoff = 0; i < set->nr_maps; i++) {
struct blk_mq_queue_map *map = &set->map[i];
- map->nr_queues = dev->io_queues[i];
+ map->nr_queues = dev->ctrl.nr_tagsets > 1 && i == 0 ?
+ dev->tagset_hw_queues : dev->io_queues[i];
if (!map->nr_queues) {
BUG_ON(i == HCTX_TYPE_DEFAULT);
continue;
@@ -1013,7 +1048,7 @@ static inline struct blk_mq_tags *nvme_queue_tagset(struct nvme_queue *nvmeq)
{
if (!nvmeq->qid)
return nvmeq->dev->admin_tagset.tags[0];
- return nvmeq->dev->tagset.tags[nvmeq->qid - 1];
+ return nvmeq->dev->tagset[nvmeq->tagset_idx].tags[nvmeq->hctx_idx];
}
static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx)
@@ -1172,7 +1207,7 @@ static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
}
static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
- struct nvme_queue *nvmeq)
+ struct nvme_queue *nvmeq, u16 qassoc)
{
struct nvme_ctrl *ctrl = &dev->ctrl;
struct nvme_command c = { };
@@ -1196,6 +1231,7 @@ static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
c.create_sq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
c.create_sq.sq_flags = cpu_to_le16(flags);
c.create_sq.cqid = cpu_to_le16(qid);
+ c.create_sq.nvmsetid = cpu_to_le16(qassoc);
return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
}
@@ -1598,7 +1634,8 @@ static int nvme_setup_io_queues_trylock(struct nvme_dev *dev)
return 0;
}
-static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled)
+static int nvme_create_queue(struct nvme_queue *nvmeq, int qid,
+ bool polled, int qassoc)
{
struct nvme_dev *dev = nvmeq->dev;
int result;
@@ -1607,11 +1644,12 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled)
clear_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags);
/*
- * A queue's vector matches the queue identifier unless the controller
- * has only one vector available.
+ * A queue's vector matches the queue identifier (modulo number of
+ * vectors) unless the controller has only one vector available.
*/
if (!polled)
- vector = dev->num_vecs == 1 ? 0 : qid;
+ vector = dev->num_vecs == 1 ? 0 :
+ ((qid - 1) % (dev->num_vecs - 1)) + 1;
else
set_bit(NVMEQ_POLLED, &nvmeq->flags);
@@ -1619,13 +1657,14 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled)
if (result)
return result;
- result = adapter_alloc_sq(dev, qid, nvmeq);
+ result = adapter_alloc_sq(dev, qid, nvmeq, qassoc);
if (result < 0)
return result;
if (result)
goto release_cq;
nvmeq->cq_vector = vector;
+ nvmeq->tagset_idx = qassoc ? qassoc - 1 : 0;
result = nvme_setup_io_queues_trylock(dev);
if (result)
@@ -1796,7 +1835,8 @@ static int nvme_pci_configure_admin_queue(struct nvme_dev *dev)
static int nvme_create_io_queues(struct nvme_dev *dev)
{
unsigned i, max, rw_queues;
- int ret = 0;
+ int ret = 0, queues_per_set;
+ bool qassoc_en;
for (i = dev->ctrl.queue_count; i <= dev->max_qid; i++) {
if (nvme_alloc_queue(dev, i, dev->q_depth)) {
@@ -1813,10 +1853,14 @@ static int nvme_create_io_queues(struct nvme_dev *dev)
rw_queues = max;
}
+ qassoc_en = nvme_check_qassoc(dev, max);
+ queues_per_set = qassoc_en ? max / dev->ctrl.nsetidmax : 0;
+
for (i = dev->online_queues; i <= max; i++) {
bool polled = i > rw_queues;
+ int qassoc = qassoc_en ? ((i - 1) / queues_per_set) + 1 : 0;
- ret = nvme_create_queue(nvme_get_queue(dev, i), i, polled);
+ ret = nvme_create_queue(nvme_get_queue(dev, i), i, polled, qassoc);
if (ret)
break;
}
@@ -2243,8 +2287,10 @@ static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues)
* vector.
*/
irq_queues = 1;
- if (!(dev->ctrl.quirks & NVME_QUIRK_SINGLE_VECTOR))
+ if (!(dev->ctrl.quirks & NVME_QUIRK_SINGLE_VECTOR)) {
irq_queues += (nr_io_queues - poll_queues);
+ }
+
return pci_alloc_irq_vectors_affinity(pdev, 1, irq_queues,
PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd);
}
@@ -2255,6 +2301,11 @@ static void nvme_disable_io_queues(struct nvme_dev *dev)
__nvme_disable_io_queues(dev, nvme_admin_delete_cq);
}
+static unsigned int nvme_default_io_queues(struct nvme_dev *dev)
+{
+ return num_possible_cpus() + dev->nr_write_queues + dev->nr_poll_queues;
+}
+
static unsigned int nvme_max_io_queues(struct nvme_dev *dev)
{
/*
@@ -2263,7 +2314,13 @@ static unsigned int nvme_max_io_queues(struct nvme_dev *dev)
*/
if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS)
return 1;
- return num_possible_cpus() + dev->nr_write_queues + dev->nr_poll_queues;
+
+ if (nvme_check_qassoc(dev, NVME_MAX_QID)) {
+ return min(num_possible_cpus() * dev->ctrl.nsetidmax,
+ (unsigned int)NVME_MAX_QID);
+ }
+
+ return nvme_default_io_queues(dev);
}
static int nvme_setup_io_queues(struct nvme_dev *dev)
@@ -2282,6 +2339,15 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
dev->nr_poll_queues = poll_queues;
nr_io_queues = nvme_max_io_queues(dev);
+
+ result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues);
+ if (result < 0)
+ return result;
+
+ nr_io_queues = nvme_check_qassoc(dev, nr_io_queues) ?
+ rounddown(nr_io_queues, dev->ctrl.nsetidmax) :
+ min(nr_io_queues, nvme_default_io_queues(dev));
+
if (nr_io_queues != dev->nr_allocated_queues) {
dev->nr_allocated_queues = nr_io_queues;
@@ -2293,10 +2359,6 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
return -ENOMEM;
}
- result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues);
- if (result < 0)
- return result;
-
if (nr_io_queues == 0)
return 0;
@@ -2353,7 +2415,9 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
dev->num_vecs = result;
result = max(result - 1, 1);
- dev->max_qid = result + dev->io_queues[HCTX_TYPE_POLL];
+ dev->max_qid = nvme_check_qassoc(dev, nr_io_queues) ?
+ min((int)nr_io_queues, result * dev->ctrl.nsetidmax) :
+ result + dev->io_queues[HCTX_TYPE_POLL];
/*
* Should investigate if there's a performance win from allocating
@@ -2460,39 +2524,74 @@ static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode)
static void nvme_dev_add(struct nvme_dev *dev)
{
- int ret;
+ int ret, i;
- if (!dev->ctrl.tagset) {
- dev->tagset.ops = &nvme_mq_ops;
- dev->tagset.nr_hw_queues = dev->online_queues - 1;
- dev->tagset.nr_maps = 2; /* default + read */
- if (dev->io_queues[HCTX_TYPE_POLL])
- dev->tagset.nr_maps++;
- dev->tagset.timeout = NVME_IO_TIMEOUT;
- dev->tagset.numa_node = dev->ctrl.numa_node;
- dev->tagset.queue_depth = min_t(unsigned int, dev->q_depth,
- BLK_MQ_MAX_DEPTH) - 1;
- dev->tagset.cmd_size = sizeof(struct nvme_iod);
- dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
- dev->tagset.driver_data = dev;
+ dev->tagset_hw_queues = dev->online_queues - 1;
- /*
- * Some Apple controllers requires tags to be unique
- * across admin and IO queue, so reserve the first 32
- * tags of the IO queue.
- */
- if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS)
- dev->tagset.reserved_tags = NVME_AQ_DEPTH;
+ if (nvme_check_qassoc(dev, dev->tagset_hw_queues)) {
+ if (dev->ctrl.nsetidmax != dev->ctrl.nr_tagsets) {
+ kfree(dev->tagset);
+ dev->ctrl.tagset = NULL;
+ dev->ctrl.nr_tagsets = dev->ctrl.nsetidmax;
+ }
+
+ dev->tagset_hw_queues /= dev->ctrl.nsetidmax;
+ } else if (dev->ctrl.nr_tagsets != 1) {
+ kfree(dev->tagset);
+ dev->ctrl.tagset = NULL;
+ dev->ctrl.nr_tagsets = 1;
+ }
- ret = blk_mq_alloc_tag_set(&dev->tagset);
- if (ret) {
+ if (!dev->ctrl.tagset) {
+ dev->tagset = kcalloc_node(dev->ctrl.nr_tagsets,
+ sizeof(struct blk_mq_tag_set), GFP_KERNEL,
+ dev_to_node(dev->dev));
+ if (!dev->tagset) {
dev_warn(dev->ctrl.device,
- "IO queues tagset allocation failed %d\n", ret);
+ "Device %d tagsets allocation failed\n",
+ dev->ctrl.nr_tagsets);
+ dev->ctrl.nr_tagsets = 0;
return;
}
- dev->ctrl.tagset = &dev->tagset;
+
+ for (i = 0; i < dev->ctrl.nr_tagsets; i++) {
+ struct blk_mq_tag_set *tagset = &dev->tagset[i];
+
+ tagset->ops = &nvme_mq_ops;
+ tagset->nr_hw_queues = dev->tagset_hw_queues;
+ tagset->nr_maps = 2; /* default + read */
+ if (dev->io_queues[HCTX_TYPE_POLL])
+ tagset->nr_maps++;
+ tagset->timeout = NVME_IO_TIMEOUT;
+ tagset->numa_node = dev->ctrl.numa_node;
+ tagset->queue_depth = min_t(unsigned int, dev->q_depth,
+ BLK_MQ_MAX_DEPTH) - 1;
+ tagset->cmd_size = sizeof(struct nvme_iod);
+ tagset->flags = BLK_MQ_F_SHOULD_MERGE;
+ tagset->driver_data = dev;
+
+ /*
+ * Some Apple controllers requires tags to be unique
+ * across admin and IO queue, so reserve the first 32
+ * tags of the IO queue.
+ */
+ if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS)
+ tagset->reserved_tags = NVME_AQ_DEPTH;
+
+ ret = blk_mq_alloc_tag_set(tagset);
+ if (ret) {
+ dev_warn(dev->ctrl.device,
+ "IO queues tagset allocation failed %d\n",
+ ret);
+ return;
+ }
+ }
+
+ dev->ctrl.tagset = dev->tagset;
} else {
- blk_mq_update_nr_hw_queues(&dev->tagset, dev->online_queues - 1);
+ for (i = 0; i < dev->ctrl.nr_tagsets; i++)
+ blk_mq_update_nr_hw_queues(&dev->tagset[i],
+ dev->tagset_hw_queues);
/* Free previously allocated queues that are no longer usable */
nvme_free_queues(dev, dev->online_queues);
@@ -2612,6 +2711,7 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
{
bool dead = true, freeze = false;
struct pci_dev *pdev = to_pci_dev(dev->dev);
+ int i;
mutex_lock(&dev->shutdown_lock);
if (pci_is_enabled(pdev)) {
@@ -2644,9 +2744,13 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
nvme_pci_disable(dev);
nvme_reap_pending_cqes(dev);
- blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl);
+ for (i = 0; i < dev->ctrl.nr_tagsets; i++) {
+ blk_mq_tagset_busy_iter(&dev->tagset[i], nvme_cancel_request,
+ &dev->ctrl);
+ blk_mq_tagset_wait_completed_request(&dev->tagset[i]);
+ }
+
blk_mq_tagset_busy_iter(&dev->admin_tagset, nvme_cancel_request, &dev->ctrl);
- blk_mq_tagset_wait_completed_request(&dev->tagset);
blk_mq_tagset_wait_completed_request(&dev->admin_tagset);
/*
@@ -2696,9 +2800,15 @@ static void nvme_release_prp_pools(struct nvme_dev *dev)
static void nvme_free_tagset(struct nvme_dev *dev)
{
- if (dev->tagset.tags)
- blk_mq_free_tag_set(&dev->tagset);
+ int i;
+
+ for (i = 0; i < dev->ctrl.nr_tagsets; i++) {
+ if (dev->tagset[i].tags)
+ blk_mq_free_tag_set(&dev->tagset[i]);
+ }
+
dev->ctrl.tagset = NULL;
+ dev->ctrl.nr_tagsets = 0;
}
static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl)
@@ -2713,6 +2823,7 @@ static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl)
mempool_destroy(dev->iod_mempool);
put_device(dev->dev);
kfree(dev->queues);
+ kfree(dev->tagset);
kfree(dev);
}
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index a68704e39084..47d29512d96b 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -977,6 +977,7 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new)
goto out_free_io_queues;
}
+ ctrl->ctrl.nr_tagsets = 1;
ctrl->ctrl.connect_q = blk_mq_init_queue(&ctrl->tag_set);
if (IS_ERR(ctrl->ctrl.connect_q)) {
ret = PTR_ERR(ctrl->ctrl.connect_q);
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 645025620154..979164bca237 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -1803,6 +1803,7 @@ static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new)
goto out_free_io_queues;
}
+ ctrl->nr_tagsets = 1;
ctrl->connect_q = blk_mq_init_queue(ctrl->tagset);
if (IS_ERR(ctrl->connect_q)) {
ret = PTR_ERR(ctrl->connect_q);
diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c
index 0285ccc7541f..2df8aa11f46a 100644
--- a/drivers/nvme/target/loop.c
+++ b/drivers/nvme/target/loop.c
@@ -536,6 +536,7 @@ static int nvme_loop_create_io_queues(struct nvme_loop_ctrl *ctrl)
ctrl->tag_set.nr_hw_queues = ctrl->ctrl.queue_count - 1;
ctrl->tag_set.timeout = NVME_IO_TIMEOUT;
ctrl->ctrl.tagset = &ctrl->tag_set;
+ ctrl->ctrl.nr_tagsets = 1;
ret = blk_mq_alloc_tag_set(&ctrl->tag_set);
if (ret)
--
2.32.0