[PATCH 7/7] NVMe: Share interrupt vectors among IO queues
Keith Busch
keith.busch at intel.com
Fri Jan 24 18:50:54 EST 2014
The driver code contained a comment proposing we investigate whether there's
a win from allocating more IO queues than interrupt vectors. The answer
is "yes": it's a win most of the time, it's never a loss, and the win was
large enough for me to post this patch proposal.
For the most extreme case, I used a 32-core machine, forced allocation of
a single MSI-X interrupt per nvme device, and compared a single NVMe IO
queue against 32 IO queues for various IO workloads. The 32-queue results
were always greater than or equal to the single-queue results, and I
measured a whopping 22% improvement in IOPS from sharing the vector among
multiple queues in the workload below.
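To make the mapping concrete: the patch assigns IO queue qid to interrupt
vector qid % nr_vecs, so with fewer vectors than queues, several completion
queues simply share a vector. A standalone sketch of the fold (illustrative
queue/vector counts, not driver code):

/*
 * Illustration only: with 32 IO queues and 1 vector, everything shares
 * vector 0; with 8 vectors, queues 1, 9, 17 and 25 share vector 1,
 * queues 8, 16, 24 and 32 fold onto vector 0, and so on.  qid 0 is the
 * admin queue, so IO queues start at 1.
 */
#include <stdio.h>

int main(void)
{
	unsigned nr_queues = 32, nr_vecs = 8, qid;

	for (qid = 1; qid <= nr_queues; qid++)
		printf("io queue %2u -> vector %u\n", qid, qid % nr_vecs);
	return 0;
}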
For reference, this is the fio profile that showed the biggest win
(not listing all 32 jobs; the pattern should be obvious):
; per-cpu 4k O_DIRECT async rand read
[global]
rw=randread
bs=4k
direct=1
filename=/dev/nvme0n1
ioengine=libaio
iodepth=64
[cpu0]
cpus_allowed=0
[cpu1]
cpus_allowed=1
...
[cpu30]
cpus_allowed=30
[cpu31]
cpus_allowed=31
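For reviewers following the cpumask arithmetic in the patch: a vector's
affinity hint becomes the union of the cpu_masks of every queue sharing it
(cpumask_or in nvme_assign_io_queues), and suspending a queue subtracts its
cpus back out (cpumask_andnot in nvme_suspend_queue), re-hinting only when
cpus remain. A rough userspace sketch of that bookkeeping, with a plain
unsigned long standing in for cpumask_t (masks and vector numbers are made
up for illustration):

#include <stdio.h>

#define NR_VECS 2

static unsigned long affinity_masks[NR_VECS];

/* assign: OR a queue's cpus into its shared vector's mask */
static void assign_queue(unsigned vector, unsigned long queue_cpus)
{
	affinity_masks[vector] |= queue_cpus;		/* cpumask_or() */
}

/* suspend: remove the queue's cpus; re-hint only if any cpus remain */
static void suspend_queue(unsigned vector, unsigned long queue_cpus)
{
	affinity_masks[vector] &= ~queue_cpus;		/* cpumask_andnot() */
	if (affinity_masks[vector])			/* weight != 0 */
		printf("vector %u hint -> %#lx\n",
		       vector, affinity_masks[vector]);
}

int main(void)
{
	assign_queue(0, 0x0fUL);	/* queue 1 owns cpus 0-3 */
	assign_queue(0, 0xf0UL);	/* queue 2 owns cpus 4-7, same vector */
	suspend_queue(0, 0x0fUL);	/* hint shrinks to cpus 4-7 */
	return 0;
}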
Signed-off-by: Keith Busch <keith.busch at intel.com>
---
drivers/block/nvme-core.c | 42 +++++++++++++++++++++++++++---------------
include/linux/nvme.h | 2 ++
2 files changed, 29 insertions(+), 15 deletions(-)
diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index f59af3f..fb009a2 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -1186,7 +1186,8 @@ static void nvme_free_queues(struct nvme_dev *dev, int lowest)
*/
static int nvme_suspend_queue(struct nvme_queue *nvmeq)
{
- int vector = nvmeq->dev->entry[nvmeq->cq_vector].vector;
+ struct nvme_dev *dev = nvmeq->dev;
+ int vector = dev->entry[nvmeq->cq_vector].vector;
spin_lock_irq(&nvmeq->q_lock);
if (nvmeq->q_suspended) {
@@ -1199,7 +1200,13 @@ static int nvme_suspend_queue(struct nvme_queue *nvmeq)
irq_set_affinity_hint(vector, NULL);
free_irq(vector, nvmeq);
- nvmeq->dev->online_queues--;
+ cpumask_andnot(&dev->affinity_masks[vector],
+ &dev->affinity_masks[vector],
+ &nvmeq->cpu_mask);
+ if (cpumask_weight(&dev->affinity_masks[vector]))
+ irq_set_affinity_hint(vector, &dev->affinity_masks[vector]);
+
+ dev->online_queues--;
return 0;
}
@@ -1927,13 +1934,14 @@ static void nvme_add_cpus(cpumask_t *mask, const cpumask_t *unassigned_cpus,
}
}
+/* XXX: How to share irqs among queues before we've assigned them to cpus? */
static void nvme_create_io_queues(struct nvme_dev *dev)
{
unsigned i, max;
max = min(dev->max_qid, num_online_cpus());
for (i = dev->queue_count; i <= max; i++)
- if (!nvme_alloc_queue(dev, i, dev->q_depth, i - 1))
+ if (!nvme_alloc_queue(dev, i, dev->q_depth, i % dev->nr_vecs))
break;
max = min(dev->queue_count - 1, num_online_cpus());
@@ -1953,6 +1961,8 @@ static void nvme_assign_io_queues(struct nvme_dev *dev)
unsigned cpu, cpus_per_queue, queues, remainder, i;
cpumask_t unassigned_cpus;
+ for (i = 0; i < dev->nr_vecs; i++)
+ cpumask_clear(&dev->affinity_masks[i]);
nvme_create_io_queues(dev);
queues = min(dev->online_queues - 1, num_online_cpus());
@@ -2001,9 +2011,9 @@ static void nvme_assign_io_queues(struct nvme_dev *dev)
"nvme%d qid:%d mis-matched queue-to-cpu assignment\n",
dev->instance, i);
- irq_set_affinity_hint(dev->entry[nvmeq->cq_vector].vector,
- &nvmeq->cpu_mask);
-
+ cpumask_or(&dev->affinity_masks[nvmeq->cq_vector],
+ &dev->affinity_masks[nvmeq->cq_vector],
+ &nvmeq->cpu_mask);
cpumask_andnot(&unassigned_cpus, &unassigned_cpus,
&nvmeq->cpu_mask);
@@ -2088,7 +2098,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
/* Deregister the admin queue's interrupt */
free_irq(dev->entry[0].vector, dev->queues[0]);
- vecs = nr_io_queues;
+ dev->max_qid = vecs = nr_io_queues;
for (i = 0; i < vecs; i++)
dev->entry[i].entry = i;
for (;;) {
@@ -2116,14 +2126,10 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
}
}
- /*
- * Should investigate if there's a performance win from allocating
- * more queues than interrupt vectors; it might allow the submission
- * path to scale better, even if the receive path is limited by the
- * number of interrupts.
- */
- nr_io_queues = vecs;
- dev->max_qid = nr_io_queues;
+ dev->nr_vecs = vecs;
+ for (i = 0; i < vecs; i++)
+ irq_set_affinity_hint(dev->entry[i].vector,
+ &dev->affinity_masks[i]);
result = queue_request_irq(dev, dev->queues[0], "nvme admin");
if (result) {
@@ -2519,6 +2525,7 @@ static void nvme_free_dev(struct kref *kref)
nvme_free_namespaces(dev);
free_percpu(dev->io_queues);
+ kfree(dev->affinity_masks);
kfree(dev->queues);
kfree(dev->entry);
kfree(dev);
@@ -2664,6 +2671,10 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
GFP_KERNEL);
if (!dev->queues)
goto free;
+ dev->affinity_masks = kcalloc(num_possible_cpus() + 1,
+ sizeof(*dev->affinity_masks), GFP_KERNEL);
+ if (!dev->affinity_masks)
+ goto free;
dev->io_queues = alloc_percpu(struct nvme_queue *);
if (!dev->io_queues)
goto free;
@@ -2715,6 +2726,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
nvme_release_instance(dev);
free:
free_percpu(dev->io_queues);
+ kfree(dev->affinity_masks);
kfree(dev->queues);
kfree(dev->entry);
kfree(dev);
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index beeddee..b5f0352 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -83,10 +83,12 @@ struct nvme_dev {
unsigned queue_count;
unsigned online_queues;
unsigned max_qid;
+ unsigned nr_vecs;
int q_depth;
u32 db_stride;
u32 ctrl_config;
struct msix_entry *entry;
+ cpumask_t *affinity_masks;
struct nvme_bar __iomem *bar;
struct list_head namespaces;
struct kref kref;
--
1.7.10.4