Linux NVME Driver - Added Single MSI support

Ramachandra Rao Gajula rama at fastorsystems.com
Thu Jan 24 11:40:11 EST 2013


-----
Fixes include:
1. Support for a single MSI interrupt: if MSI-X setup fails, a single
MSI message is enabled instead.
 - The old code did not enable MSI-X at initial admin queue setup
time; it only enabled it later, when setting up all the queues. The
new code enables one MSI-X vector for admin queue setup, disables it,
and then enables all the vectors needed for the number of queues when
the I/O queues are set up (see the sketch below). I am not sure this
code is correct for MSI-X and it needs review.

2. Free all the queues before deleting the nvme devices at driver
unload/shutdown time.
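
For reviewers, the interrupt-setup policy of point 1 boils down to the
following condensed sketch of nvme_enable_interrupt() from the diff
below (vector-table setup kept, error logging trimmed):

	static int nvme_enable_interrupt(struct nvme_dev *dev, int *count)
	{
		int i, result;

		/* First choice: one MSI-X vector per queue */
		for (i = 0; i < *count; i++)
			dev->entry[i].entry = i;
		dev->int_method = NVME_INTERRUPT_MSI_X;
		result = pci_enable_msix(dev->pci_dev, dev->entry, *count);
		if (result > 0) {
			/* Fewer vectors available than requested; retry */
			*count = result;
			result = pci_enable_msix(dev->pci_dev, dev->entry, *count);
		}
		if (result == 0)
			return 0;

		/* Fallback: one MSI message shared by all queues */
		dev->int_method = NVME_INTERRUPT_MSI;
		if (pci_enable_msi(dev->pci_dev) == 0)
			return 0;

		/* Neither worked; run with a single queue */
		dev->int_method = 0;
		*count = 1;
		return result;
	}

Note that the single-MSI path deliberately leaves *count unchanged:
one shared vector serves all of the queues, and the nvme_msi() handler
sweeps every completion queue on each interrupt.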
------

diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c

------

@@ -41,10 +41,12 @@
 #include <linux/types.h>
 #include <linux/version.h>

+
 #define NVME_Q_DEPTH 1024
 #define SQ_SIZE(depth)		(depth * sizeof(struct nvme_command))
 #define CQ_SIZE(depth)		(depth * sizeof(struct nvme_completion))
 #define NVME_MINORS 64
+
 #define NVME_IO_TIMEOUT	(5 * HZ)
 #define ADMIN_TIMEOUT	(60 * HZ)

@@ -72,6 +74,8 @@ struct nvme_dev {
 	int queue_count;
 	int db_stride;
 	u32 ctrl_config;
+	u32 int_method;
+	u32 msi_messages;
 	struct msix_entry *entry;
 	struct nvme_bar __iomem *bar;
 	struct list_head namespaces;
@@ -81,6 +85,12 @@ struct nvme_dev {
 	u32 max_hw_sectors;
 };

+/*
+ * Interrupt method used for this NVME device - set in int_method
+ * based on what is available/enabled by the driver.
+ */
+#define NVME_INTERRUPT_MSI_X	1
+#define NVME_INTERRUPT_MSI	2
+
 /*
  * An NVM Express namespace is equivalent to a SCSI LUN
  */
@@ -678,6 +688,27 @@ static irqreturn_t nvme_process_cq(struct nvme_queue *nvmeq)
 	return IRQ_HANDLED;
 }

+static irqreturn_t nvme_msi(int irq, void *data)
+{
+	struct nvme_dev *dev = data;
+	int i;
+
+	/*
+	 * The single MSI message covers every queue on this device, so
+	 * sweep all completion queues on each interrupt.  We are already
+	 * in hard-irq context, so plain spin_lock() suffices here;
+	 * spin_unlock_irq() would wrongly re-enable interrupts.
+	 */
+	for (i = 0; i < dev->queue_count; i++) {
+		struct nvme_queue *nvmeq = dev->queues[i];
+
+		if (!nvmeq)
+			continue;
+		spin_lock(&nvmeq->q_lock);
+		nvme_process_cq(nvmeq);
+		spin_unlock(&nvmeq->q_lock);
+	}
+	return IRQ_HANDLED;
+}
+
 static irqreturn_t nvme_irq(int irq, void *data)
 {
 	irqreturn_t result;
@@ -697,6 +728,7 @@ static irqreturn_t nvme_irq_check(int irq, void *data)
 	return IRQ_WAKE_THREAD;
 }

+
 static void nvme_abort_command(struct nvme_queue *nvmeq, int cmdid)
 {
 	spin_lock_irq(&nvmeq->q_lock);
@@ -913,8 +945,11 @@ static void nvme_free_queue(struct nvme_dev *dev, int qid)
 	nvme_cancel_ios(nvmeq, false);
 	spin_unlock_irq(&nvmeq->q_lock);

-	irq_set_affinity_hint(vector, NULL);
-	free_irq(vector, nvmeq);
+	/*
+	 * Free the interrupt vector/message only when we use multiple
+	 * vectors/messages - one per queue (MSI-X).
+	 */
+	if (dev->int_method == NVME_INTERRUPT_MSI_X) {
+		irq_set_affinity_hint(vector, NULL);
+		free_irq(vector, nvmeq);
+	}

 	/* Don't tell the adapter to delete the admin queue */
 	if (qid) {
@@ -971,15 +1006,29 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
 static int queue_request_irq(struct nvme_dev *dev, struct nvme_queue *nvmeq,
 							const char *name)
 {
-	if (use_threaded_interrupts)
-		return request_threaded_irq(dev->entry[nvmeq->cq_vector].vector,
+	int result = 0;
+
+	if (dev->int_method == NVME_INTERRUPT_MSI_X) {
+		if (use_threaded_interrupts)
+			result = request_threaded_irq(dev->entry[nvmeq->cq_vector].vector,
 					nvme_irq_check, nvme_irq,
 					IRQF_DISABLED | IRQF_SHARED,
 					name, nvmeq);
-	return request_irq(dev->entry[nvmeq->cq_vector].vector, nvme_irq,
+		else
+			result = request_irq(dev->entry[nvmeq->cq_vector].vector, nvme_irq,
 				IRQF_DISABLED | IRQF_SHARED, name, nvmeq);
+	} else if (dev->int_method == NVME_INTERRUPT_MSI && !dev->msi_messages) {
+		/* The single MSI vector serves every queue; request it once */
+		result = request_irq(dev->pci_dev->irq, nvme_msi,
+				IRQF_DISABLED | IRQF_SHARED, name, dev);
+		if (!result)
+			dev->msi_messages++;
+	}
+	return result;
 }

+
 static __devinit struct nvme_queue *nvme_create_queue(struct nvme_dev *dev,
 					int qid, int cq_size, int vector)
 {
@@ -1016,6 +1065,40 @@ static __devinit struct nvme_queue *nvme_create_queue(struct nvme_dev *dev,
 	return ERR_PTR(result);
 }

+static int nvme_enable_interrupt(struct nvme_dev *dev, int *count)
+{
+	int result;
+	int i;
+
+	/*
+	 * First try MSI-X; if it fails, fall back to a single MSI
+	 * message interrupt.  Initialize the vector number table for
+	 * MSI-X.
+	 */
+	for (i = 0; i < *count; i++)
+		dev->entry[i].entry = i;
+
+	dev->int_method = NVME_INTERRUPT_MSI_X;
+	result = pci_enable_msix(dev->pci_dev, dev->entry, *count);
+	if (result == 0)
+		return result;
+	if (result > 0) {
+		/* Fewer vectors are available; retry with that count */
+		*count = result;
+		result = pci_enable_msix(dev->pci_dev, dev->entry, *count);
+		if (result == 0)
+			return result;
+		/*
+		 * Since we asked for the advertised vector count, result
+		 * cannot be > 0 at this point; it can only be a negative
+		 * error code, so fall through.
+		 */
+	}
+
+	/* pci_enable_msix failed (result < 0); try a single MSI message */
+	dev->int_method = NVME_INTERRUPT_MSI;
+	result = pci_enable_msi(dev->pci_dev);
+	if (result == 0)
+		return result;
+
+	/* Neither MSI-X nor MSI; all we do is report a queue count of 1 */
+	dev->int_method = 0;
+	/* printk(KERN_WARNING "nvme: failed to enable MSI-X or MSI\n"); */
+	*count = 1;
+	return result;
+}
+
+
 static int __devinit nvme_configure_admin_queue(struct nvme_dev *dev)
 {
 	int result = 0;
@@ -1023,6 +1106,7 @@ static int __devinit nvme_configure_admin_queue(struct nvme_dev *dev)
 	u64 cap;
 	unsigned long timeout;
 	struct nvme_queue *nvmeq;
+	int admin_int_count = 1;

 	dev->dbs = ((void __iomem *)dev->bar) + 4096;

@@ -1064,7 +1148,12 @@ static int __devinit nvme_configure_admin_queue(struct nvme_dev *dev)
 		return result;
 	}

-	result = queue_request_irq(dev, nvmeq, "nvme admin");
+	result = nvme_enable_interrupt(dev, &admin_int_count);
+
+	if (result == 0) /* request the irq only if MSI-X/MSI came up */
+		result = queue_request_irq(dev, nvmeq, "nvme admin");
+	/* XXX: TODO: handle errors here */
+
 	dev->queues[0] = nvmeq;
 	return result;
 }
@@ -1295,6 +1384,7 @@ static int nvme_kthread(void *data)
 				struct nvme_queue *nvmeq = dev->queues[i];
 				if (!nvmeq)
 					continue;
+
 				spin_lock_irq(&nvmeq->q_lock);
 				if (nvme_process_cq(nvmeq))
 					printk("process_cq did something\n");
@@ -1415,7 +1505,7 @@ static int set_queue_count(struct nvme_dev *dev, int count)

 static int __devinit nvme_setup_io_queues(struct nvme_dev *dev)
 {
-	int result, cpu, i, nr_io_queues, db_bar_size, q_depth;
+	int result, i, nr_io_queues, db_bar_size, q_depth;

 	nr_io_queues = num_online_cpus();
 	result = set_queue_count(dev, nr_io_queues);
@@ -1425,7 +1515,15 @@ static int __devinit nvme_setup_io_queues(struct nvme_dev *dev)
 		nr_io_queues = result;

 	/* Deregister the admin queue's interrupt */
-	free_irq(dev->entry[0].vector, dev->queues[0]);
+	if (dev->int_method == NVME_INTERRUPT_MSI_X) {
+		free_irq(dev->entry[0].vector, dev->queues[0]);
+		pci_disable_msix(dev->pci_dev);
+	} else if (dev->int_method == NVME_INTERRUPT_MSI) {
+		free_irq(dev->pci_dev->irq, dev);
+		dev->msi_messages = 0;
+		pci_disable_msi(dev->pci_dev);
+	}

 	db_bar_size = 4096 + ((nr_io_queues + 1) << (dev->db_stride + 3));
 	if (db_bar_size > 8192) {
@@ -1436,29 +1534,19 @@ static int __devinit nvme_setup_io_queues(struct nvme_dev *dev)
 		dev->queues[0]->q_db = dev->dbs;
 	}

-	for (i = 0; i < nr_io_queues; i++)
-		dev->entry[i].entry = i;
-	for (;;) {
-		result = pci_enable_msix(dev->pci_dev, dev->entry,
-								nr_io_queues);
-		if (result == 0) {
-			break;
-		} else if (result > 0) {
-			nr_io_queues = result;
-			continue;
-		} else {
-			nr_io_queues = 1;
-			break;
-		}
-	}
+	result = nvme_enable_interrupt(dev, &nr_io_queues);

 	result = queue_request_irq(dev, dev->queues[0], "nvme admin");
+
 	/* XXX: handle failure here */

-	cpu = cpumask_first(cpu_online_mask);
-	for (i = 0; i < nr_io_queues; i++) {
-		irq_set_affinity_hint(dev->entry[i].vector, get_cpu_mask(cpu));
-		cpu = cpumask_next(cpu, cpu_online_mask);
+	if (dev->int_method == NVME_INTERRUPT_MSI_X) {
+		/* Assign CPU affinity only for MSI-X interrupts */
+		int cpu = cpumask_first(cpu_online_mask);
+		for (i = 0; i < nr_io_queues; i++) {
+			irq_set_affinity_hint(dev->entry[i].vector, get_cpu_mask(cpu));
+			cpu = cpumask_next(cpu, cpu_online_mask);
+		}
 	}

 	q_depth = min_t(int, NVME_CAP_MQES(readq(&dev->bar->cap)) + 1,
@@ -1484,6 +1572,13 @@ static void nvme_free_queues(struct nvme_dev *dev)

 	for (i = dev->queue_count - 1; i >= 0; i--)
 		nvme_free_queue(dev, i);
+
+	/* If MSI was used, free the single vector after freeing all queues */
+	if (dev->int_method == NVME_INTERRUPT_MSI) {
+		free_irq(dev->pci_dev->irq, dev);
+		dev->msi_messages = 0;
+	}
 }

 static int __devinit nvme_dev_add(struct nvme_dev *dev)
@@ -1556,6 +1651,8 @@ static int nvme_dev_remove(struct nvme_dev *dev)
 {
 	struct nvme_ns *ns, *next;

+	nvme_free_queues(dev);
+
 	spin_lock(&dev_list_lock);
 	list_del(&dev->node);
 	spin_unlock(&dev_list_lock);
@@ -1566,7 +1663,7 @@ static int nvme_dev_remove(struct nvme_dev *dev)
 		nvme_ns_free(ns);
 	}

-	nvme_free_queues(dev);
+	//nvme_free_queues(dev);

 	return 0;
 }
@@ -1694,7 +1791,10 @@ static int __devinit nvme_probe(struct pci_dev *pdev,
  unmap:
 	iounmap(dev->bar);
  disable_msix:
-	pci_disable_msix(pdev);
+	if (dev->int_method == NVME_INTERRUPT_MSI_X)
+		pci_disable_msix(pdev);
+	else if (dev->int_method == NVME_INTERRUPT_MSI)
+		pci_disable_msi(pdev);
 	nvme_release_instance(dev);
 	nvme_release_prp_pools(dev);
  disable:
@@ -1711,7 +1811,12 @@ static void __devexit nvme_remove(struct pci_dev *pdev)
 {
 	struct nvme_dev *dev = pci_get_drvdata(pdev);
 	nvme_dev_remove(dev);
-	pci_disable_msix(pdev);
+
+	if (dev->int_method == NVME_INTERRUPT_MSI_X)
+		pci_disable_msix(pdev);
+	else if (dev->int_method == NVME_INTERRUPT_MSI)
+		pci_disable_msi(pdev);
+
 	iounmap(dev->bar);
 	nvme_release_instance(dev);
 	nvme_release_prp_pools(dev);


