[RFCv2] nvme-pci: adjust tagset parameters to match b/w

Keith Busch kbusch at kernel.org
Wed Oct 13 08:21:36 PDT 2021


See v1 for background:

  http://lists.infradead.org/pipermail/linux-nvme/2021-October/027998.html

Instead of auto-adjusting the timeout to cope with the worst-case
scenario, this version adjusts the IO depth and max transfer size so that
the worst case fits within the driver's timeout tolerance.

I also fixed the b/w units since v1, as the reported link bandwidth is in
megabits per second, not megabytes.

I have encoded seemingly arbitrary lower bounds for the depth and transfer
size. The values were selected based on anecdotal/empirical observations
where going lower can negatively impact performance.
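
To illustrate with purely hypothetical numbers: a Gen3 x4 link reports
roughly 31,500 Mb/s, which the 86% efficiency estimate and the conversion
to MiB/s reduce to about 3,400 MiB/s of usable host data bandwidth. A
controller exposing 64 IO queues of depth 1023 with a 4 MiB max transfer
could then have

  64 * 1023 * 4 MiB ~= 256 GiB

outstanding in the worst case, or roughly 77 seconds at that rate. Doubled
to account for device overhead, that is well past the default 30 second IO
timeout, so the adjustment loop halves the transfer size and queue depth
until the estimate fits.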

Signed-off-by: Keith Busch <kbusch at kernel.org>
---
 drivers/nvme/host/pci.c | 89 ++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 87 insertions(+), 2 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 7fc992a99624..02a69f4ed6ba 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -16,6 +16,7 @@
 #include <linux/io.h>
 #include <linux/mm.h>
 #include <linux/module.h>
+#include <linux/msi.h>
 #include <linux/mutex.h>
 #include <linux/once.h>
 #include <linux/pci.h>
@@ -2424,11 +2425,84 @@ static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode)
 	return true;
 }
 
+static void nvme_adjust_tagset_parms(struct nvme_dev *dev)
+{
+	static const u32 min_bytes = 128 * 1024;
+	static const u32 min_depth = 128;
+
+	u32 timeout, max_bytes, max_prps, max_prp_lists, max_prp_list_size,
+		total_depth, max_xfer, bw, queue_depth;
+
+	/* bw is returned in Mb/s units */
+	bw = pcie_bandwidth_available(to_pci_dev(dev->dev), NULL, NULL, NULL);
+
+	/*
+	 * PCIe DLLP/TLP overhead based on worst case MPS (128B) achieves
+	 * roughly 86% link efficiency for host data. Also, convert to MiB/s
+	 * from megabits/s.
+	 *
+	 * XXX: Calculate efficiency from current MPS?
+	 */
+	bw = ((bw * 86) / 100) / 8;
+	if (!bw)
+		return;
+
+retry:
+	max_bytes = dev->ctrl.max_hw_sectors << SECTOR_SHIFT;
+	max_prps = DIV_ROUND_UP(max_bytes, NVME_CTRL_PAGE_SIZE);
+	max_prp_lists = DIV_ROUND_UP(max_prps * sizeof(__le64),
+					 NVME_CTRL_PAGE_SIZE);
+	max_prp_list_size = NVME_CTRL_PAGE_SIZE * max_prp_lists;
+	queue_depth = dev->tagset.queue_depth;
+	total_depth = dev->tagset.nr_hw_queues * queue_depth;
+
+	/* Max outstanding NVMe protocol transfer in MiB */
+	max_xfer = (total_depth * (max_bytes + max_prp_list_size +
+			   sizeof(struct nvme_command) +
+			   sizeof(struct nvme_completion) +
+			   sizeof(struct msi_msg))) >> 20;
+
+	timeout = DIV_ROUND_UP(max_xfer, bw);
+
+	/*
+	 * Double the time to generously allow for device side overhead and
+	 * link sharing.
+	 *
+	 * XXX: Calculate link sharing?
+	 */
+	timeout = (2 * timeout) * HZ;
+
+	if (timeout > NVME_IO_TIMEOUT &&
+	    (max_bytes > min_bytes ||
+	     queue_depth > min_depth)) {
+		if (max_bytes / 2 > min_bytes)
+			dev->ctrl.max_hw_sectors = DIV_ROUND_UP(
+						dev->ctrl.max_hw_sectors, 2);
+		else
+			dev->ctrl.max_hw_sectors = min_t(u32,
+					min_bytes >> SECTOR_SHIFT,
+					dev->ctrl.max_hw_sectors);
+
+		if (queue_depth / 2 > min_depth)
+			dev->tagset.queue_depth = DIV_ROUND_UP(
+						dev->tagset.queue_depth, 2);
+		else
+			dev->tagset.queue_depth = min_t(u32, min_depth,
+						dev->tagset.queue_depth);
+
+		goto retry;
+	}
+}
+
 static void nvme_dev_add(struct nvme_dev *dev)
 {
 	int ret;
 
 	if (!dev->ctrl.tagset) {
+		u32 queue_depth = min_t(unsigned int, dev->q_depth,
+					BLK_MQ_MAX_DEPTH) - 1;
+		u32 max_hw_sectors = dev->ctrl.max_hw_sectors;
+
 		dev->tagset.ops = &nvme_mq_ops;
 		dev->tagset.nr_hw_queues = dev->online_queues - 1;
 		dev->tagset.nr_maps = 2; /* default + read */
@@ -2436,12 +2510,23 @@ static void nvme_dev_add(struct nvme_dev *dev)
 			dev->tagset.nr_maps++;
 		dev->tagset.timeout = NVME_IO_TIMEOUT;
 		dev->tagset.numa_node = dev->ctrl.numa_node;
-		dev->tagset.queue_depth = min_t(unsigned int, dev->q_depth,
-						BLK_MQ_MAX_DEPTH) - 1;
+		dev->tagset.queue_depth = queue_depth;
 		dev->tagset.cmd_size = sizeof(struct nvme_iod);
 		dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
 		dev->tagset.driver_data = dev;
 
+		nvme_adjust_tagset_parms(dev);
+
+		if (dev->tagset.queue_depth != queue_depth ||
+		    dev->ctrl.max_hw_sectors != max_hw_sectors) {
+			dev_warn(dev->ctrl.device,
+				 "qdepth (%u) and max sectors (%u) exceed driver timeout tolerance (%ums)\n"
+				 "nvme ctrl qdepth and sectors adjusted to %u %u\n",
+				 queue_depth, max_hw_sectors, jiffies_to_msecs(NVME_IO_TIMEOUT),
+				 dev->tagset.queue_depth,
+				 dev->ctrl.max_hw_sectors);
+		}
+
 		/*
 		 * Some Apple controllers requires tags to be unique
 		 * across admin and IO queue, so reserve the first 32
-- 
2.25.4