[PATCH] nvme: Add weighted-round-robin arbitration support
Kanchan Joshi
joshi.k at samsung.com
Thu Jan 4 07:02:09 PST 2018
This patch enables support for Weighted-Round-Robin (WRR) arbitration, so
that applications can make use of the prioritization capabilities natively
present in the NVMe controller.
- It links the existing ionice classes (real-time, best-effort, none, idle)
to NVMe priorities (urgent, high, medium, low). This is done through the
'request->ioprio' field inside the 'queue_rq' function (see the userspace
sketch after this list).
- The current driver has a 1:1 mapping (1 SQ, 1 CQ) per cpu, encapsulated in
the 'nvmeq' structure. This patch refactors the code so that an N:1 mapping
per cpu can be created; 'nvmeq' has been changed to contain a variable number
of SQ-related fields. For WRR, 4 submission-queues (one per queue priority)
need to be created on each cpu.
- When the 'enable_wrr' module param is passed, it creates a 4:1 mapping and
enables the controller in WRR mode. Otherwise, it continues to retain the 1:1
mapping and the controller remains in RR mode.
- The NVMe device may have fewer queues than required for a 4:1 mapping per
cpu. For example, when num_possible_cpus is 64, 256 submission-queues are
required for a 4:1 mapping while the device may support, say, only 128.
This case is handled by creating 32 queue-pairs which are shared among the
64 cpus. Another way to handle this could have been reducing to a 3:1 or 2:1
mapping (and remapping the 4 ionice classes accordingly).
- The admin queue retains a 1:1 mapping irrespective of the mode (RR or WRR)
used.
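
For reference, here is a minimal userspace sketch (not part of this patch,
and assuming the block layer propagates the task's ionice class into
request->ioprio) of how an application would opt into the real-time class so
that, with enable_wrr=1, its I/O is routed to the urgent-priority SQ:

/* hypothetical example, not part of this patch */
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

#define IOPRIO_CLASS_SHIFT	13
#define IOPRIO_CLASS_RT		1	/* mapped to NVME_SQ_PRIO_URGENT */
#define IOPRIO_WHO_PROCESS	1

int main(void)
{
	/* ionice class RT, data 0; ioprio_set has no glibc wrapper */
	int ioprio = IOPRIO_CLASS_RT << IOPRIO_CLASS_SHIFT;

	if (syscall(SYS_ioprio_set, IOPRIO_WHO_PROCESS, 0, ioprio))
		perror("ioprio_set");

	/* subsequent I/O issued by this process carries IOPRIO_CLASS_RT */
	return 0;
}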
Earlier I had collected results on the 4.10 kernel, which indicate that the
I/O distribution happens as per the weights applied. Please refer to
section 5 of this paper -
http://www.usenix.org/system/files/conference/hotstorage17/hotstorage17-paper-joshi.pdf
I see similar results in the current kernel as well.
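
The relative weights themselves are programmed through the Arbitration
feature (Feature Identifier 01h); this patch only routes I/O to the
prioritized SQs, so the weights would still need to be configured
separately. Below is a hedged sketch of how that could be done from the
driver, assuming the spec's dword11 layout (AB[2:0], LPW[15:8], MPW[23:16],
HPW[31:24]) and reusing the existing nvme_set_features() helper and
NVME_FEAT_ARBITRATION definition:

/* hypothetical helper, not part of this patch */
static int nvme_set_wrr_weights(struct nvme_ctrl *ctrl, u8 hpw, u8 mpw,
				u8 lpw, u8 burst)
{
	/* each weight field is 0's based (value + 1 credits) */
	u32 dword11 = ((u32)hpw << 24) | ((u32)mpw << 16) |
		      ((u32)lpw << 8) | (burst & 0x7);

	return nvme_set_features(ctrl, NVME_FEAT_ARBITRATION, dword11,
				 NULL, 0, NULL);
}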
Signed-off-by: Kanchan Joshi <joshi.k at samsung.com>
---
drivers/nvme/host/core.c | 4 +-
drivers/nvme/host/pci.c | 310 +++++++++++++++++++++++++++++++----------------
include/linux/nvme.h | 1 +
3 files changed, 210 insertions(+), 105 deletions(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 1e46e60..6920bdf 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1649,9 +1649,9 @@ int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap)
ctrl->page_size = 1 << page_shift;
- ctrl->ctrl_config = NVME_CC_CSS_NVM;
+ ctrl->ctrl_config |= NVME_CC_CSS_NVM;
ctrl->ctrl_config |= (page_shift - 12) << NVME_CC_MPS_SHIFT;
- ctrl->ctrl_config |= NVME_CC_AMS_RR | NVME_CC_SHN_NONE;
+ ctrl->ctrl_config |= NVME_CC_SHN_NONE;
ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;
ctrl->ctrl_config |= NVME_CC_ENABLE;
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index f5800c3..5f99ee5e 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -37,6 +37,13 @@
#define SGES_PER_PAGE (PAGE_SIZE / sizeof(struct nvme_sgl_desc))
+#define SQ_PER_CORE_RR 1
+#define SQ_PER_CORE_WRR 4
+
+static bool enable_wrr = false;
+module_param(enable_wrr, bool, 0644);
+MODULE_PARM_DESC(enable_wrr, "enable WRR arbitration among I/O SQs");
+
static int use_threaded_interrupts;
module_param(use_threaded_interrupts, int, 0);
@@ -111,6 +118,9 @@ struct nvme_dev {
dma_addr_t host_mem_descs_dma;
struct nvme_host_mem_buf_desc *host_mem_descs;
void **host_mem_desc_bufs;
+
+ /* 1 for RR, 4 for WRR */
+ u8 sq_per_core;
};
static int io_queue_depth_set(const char *val, const struct kernel_param *kp)
@@ -147,24 +157,31 @@ struct nvme_queue {
struct device *q_dmadev;
struct nvme_dev *dev;
spinlock_t q_lock;
- struct nvme_command *sq_cmds;
- struct nvme_command __iomem *sq_cmds_io;
volatile struct nvme_completion *cqes;
struct blk_mq_tags **tags;
- dma_addr_t sq_dma_addr;
dma_addr_t cq_dma_addr;
- u32 __iomem *q_db;
+ u32 __iomem *cq_db;
u16 q_depth;
s16 cq_vector;
- u16 sq_tail;
u16 cq_head;
- u16 qid;
+ u16 cq_id;
u8 cq_phase;
u8 cqe_seen;
- u32 *dbbuf_sq_db;
u32 *dbbuf_cq_db;
- u32 *dbbuf_sq_ei;
u32 *dbbuf_cq_ei;
+ /* sq related fields start here */
+ u8 nr_sq;
+ struct sq_data {
+ struct nvme_command *sq_cmds;
+ struct nvme_command __iomem *sq_cmds_io;
+ dma_addr_t sq_dma_addr;
+ u32 __iomem *sq_db;
+ u16 id;
+ u16 sq_tail;
+ u32 *dbbuf_sq_db;
+ u32 *dbbuf_sq_ei;
+ } sq[];
+
};
/*
@@ -181,6 +198,7 @@ struct nvme_iod {
int npages; /* In the PRP list. 0 means small pool in use */
int nents; /* Used in scatterlist */
int length; /* Of data, in bytes */
+ int sq_indx;
dma_addr_t first_dma;
struct scatterlist meta_sg; /* metadata requires single contiguous buffer */
struct scatterlist *sg;
@@ -207,14 +225,14 @@ static inline void _nvme_check_size(void)
BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64);
}
-static inline unsigned int nvme_dbbuf_size(u32 stride)
+static inline unsigned int nvme_dbbuf_size(u32 stride, u8 sq_per_core)
{
- return ((num_possible_cpus() + 1) * 8 * stride);
+ return ((sq_per_core * num_possible_cpus() + 1) * 8 * stride);
}
static int nvme_dbbuf_dma_alloc(struct nvme_dev *dev)
{
- unsigned int mem_size = nvme_dbbuf_size(dev->db_stride);
+ unsigned int mem_size = nvme_dbbuf_size(dev->db_stride, dev->sq_per_core);
if (dev->dbbuf_dbs)
return 0;
@@ -239,7 +257,7 @@ static int nvme_dbbuf_dma_alloc(struct nvme_dev *dev)
static void nvme_dbbuf_dma_free(struct nvme_dev *dev)
{
- unsigned int mem_size = nvme_dbbuf_size(dev->db_stride);
+ unsigned int mem_size = nvme_dbbuf_size(dev->db_stride, dev->sq_per_core);
if (dev->dbbuf_dbs) {
dma_free_coherent(dev->dev, mem_size,
@@ -256,13 +274,17 @@ static void nvme_dbbuf_dma_free(struct nvme_dev *dev)
static void nvme_dbbuf_init(struct nvme_dev *dev,
struct nvme_queue *nvmeq, int qid)
{
+ int i;
if (!dev->dbbuf_dbs || !qid)
return;
-
- nvmeq->dbbuf_sq_db = &dev->dbbuf_dbs[sq_idx(qid, dev->db_stride)];
- nvmeq->dbbuf_cq_db = &dev->dbbuf_dbs[cq_idx(qid, dev->db_stride)];
- nvmeq->dbbuf_sq_ei = &dev->dbbuf_eis[sq_idx(qid, dev->db_stride)];
- nvmeq->dbbuf_cq_ei = &dev->dbbuf_eis[cq_idx(qid, dev->db_stride)];
+ for (i = 0; i < nvmeq->nr_sq; i++) {
+ nvmeq->sq[i].dbbuf_sq_db =
+ &dev->dbbuf_dbs[sq_idx(nvmeq->sq[i].id, dev->db_stride)];
+ nvmeq->sq[i].dbbuf_sq_ei =
+ &dev->dbbuf_eis[sq_idx(nvmeq->sq[i].id, dev->db_stride)];
+ }
+ nvmeq->dbbuf_cq_db = &dev->dbbuf_dbs[cq_idx(nvmeq->cq_id, dev->db_stride)];
+ nvmeq->dbbuf_cq_ei = &dev->dbbuf_eis[cq_idx(nvmeq->cq_id, dev->db_stride)];
}
static void nvme_dbbuf_set(struct nvme_dev *dev)
@@ -425,21 +447,22 @@ static int nvme_pci_map_queues(struct blk_mq_tag_set *set)
* Safe to use from interrupt context
*/
static void __nvme_submit_cmd(struct nvme_queue *nvmeq,
- struct nvme_command *cmd)
+ struct nvme_command *cmd,
+ int idx)
{
- u16 tail = nvmeq->sq_tail;
+ u16 tail = nvmeq->sq[idx].sq_tail;
- if (nvmeq->sq_cmds_io)
- memcpy_toio(&nvmeq->sq_cmds_io[tail], cmd, sizeof(*cmd));
+ if (nvmeq->sq[idx].sq_cmds_io)
+ memcpy_toio(&nvmeq->sq[idx].sq_cmds_io[tail], cmd, sizeof(*cmd));
else
- memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));
+ memcpy(&nvmeq->sq[idx].sq_cmds[tail], cmd, sizeof(*cmd));
if (++tail == nvmeq->q_depth)
tail = 0;
- if (nvme_dbbuf_update_and_check_event(tail, nvmeq->dbbuf_sq_db,
- nvmeq->dbbuf_sq_ei))
- writel(tail, nvmeq->q_db);
- nvmeq->sq_tail = tail;
+ if (nvme_dbbuf_update_and_check_event(tail, nvmeq->sq[idx].dbbuf_sq_db,
+ nvmeq->sq[idx].dbbuf_sq_ei))
+ writel(tail, nvmeq->sq[idx].sq_db);
+ nvmeq->sq[idx].sq_tail = tail;
}
static void **nvme_pci_iod_list(struct request *req)
@@ -448,7 +471,8 @@ static void **nvme_pci_iod_list(struct request *req)
return (void **)(iod->sg + blk_rq_nr_phys_segments(req));
}
-static blk_status_t nvme_init_iod(struct request *rq, struct nvme_dev *dev)
+static blk_status_t nvme_init_iod(struct request *rq, struct nvme_dev *dev,
+ int sq_indx)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(rq);
int nseg = blk_rq_nr_phys_segments(rq);
@@ -469,6 +493,7 @@ static blk_status_t nvme_init_iod(struct request *rq, struct nvme_dev *dev)
iod->npages = -1;
iod->nents = 0;
iod->length = size;
+ iod->sq_indx = sq_indx;
return BLK_STS_OK;
}
@@ -780,7 +805,7 @@ static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req)
if (!(dev->ctrl.sgls & ((1 << 0) | (1 << 1))))
return false;
- if (!iod->nvmeq->qid)
+ if (!iod->nvmeq->cq_id)
return false;
if (!sgl_threshold || avg_seg_size < sgl_threshold)
return false;
@@ -859,6 +884,12 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
nvme_free_iod(dev, req);
}
+static inline int ioprio_to_sqindx(struct nvme_queue *nvmeq, struct request *req)
+{
+ int ioprio_class;
+ ioprio_class = req->ioprio >> IOPRIO_CLASS_SHIFT;
+ return (ioprio_class % nvmeq->nr_sq);
+}
/*
* NOTE: ns is NULL when called on the admin queue.
*/
@@ -871,12 +902,18 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
struct request *req = bd->rq;
struct nvme_command cmnd;
blk_status_t ret;
+ int sq_indx = 0;
+ /*
+ * No need to check the I/O priority for the admin queue, or when in RR mode
+ */
+ if (nvmeq->nr_sq > SQ_PER_CORE_RR)
+ sq_indx = ioprio_to_sqindx(nvmeq, req);
ret = nvme_setup_cmd(ns, req, &cmnd);
if (ret)
return ret;
- ret = nvme_init_iod(req, dev);
+ ret = nvme_init_iod(req, dev, sq_indx);
if (ret)
goto out_free_cmd;
@@ -894,7 +931,7 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
spin_unlock_irq(&nvmeq->q_lock);
goto out_cleanup_iod;
}
- __nvme_submit_cmd(nvmeq, &cmnd);
+ __nvme_submit_cmd(nvmeq, &cmnd, sq_indx);
nvme_process_cq(nvmeq);
spin_unlock_irq(&nvmeq->q_lock);
return BLK_STS_OK;
@@ -927,7 +964,7 @@ static inline void nvme_ring_cq_doorbell(struct nvme_queue *nvmeq)
if (likely(nvmeq->cq_vector >= 0)) {
if (nvme_dbbuf_update_and_check_event(head, nvmeq->dbbuf_cq_db,
nvmeq->dbbuf_cq_ei))
- writel(head, nvmeq->q_db + nvmeq->dev->db_stride);
+ writel(head, nvmeq->cq_db);
}
}
@@ -935,7 +972,6 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq,
struct nvme_completion *cqe)
{
struct request *req;
-
if (unlikely(cqe->command_id >= nvmeq->q_depth)) {
dev_warn(nvmeq->dev->ctrl.device,
"invalid id %d completed on queue %d\n",
@@ -949,7 +985,7 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq,
* aborts. We don't even bother to allocate a struct request
* for them but rather special case them here.
*/
- if (unlikely(nvmeq->qid == 0 &&
+ if (unlikely(nvmeq->cq_id == 0 &&
cqe->command_id >= NVME_AQ_BLK_MQ_DEPTH)) {
nvme_complete_async_event(&nvmeq->dev->ctrl,
cqe->status, &cqe->result);
@@ -1054,7 +1090,7 @@ static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl)
c.common.command_id = NVME_AQ_BLK_MQ_DEPTH;
spin_lock_irq(&nvmeq->q_lock);
- __nvme_submit_cmd(nvmeq, &c);
+ __nvme_submit_cmd(nvmeq, &c, 0);
spin_unlock_irq(&nvmeq->q_lock);
}
@@ -1086,28 +1122,36 @@ static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
c.create_cq.cq_flags = cpu_to_le16(flags);
c.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector);
-
return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
}
-static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
+static int adapter_alloc_sq(struct nvme_dev *dev, int sq_indx,
struct nvme_queue *nvmeq)
{
struct nvme_command c;
int flags = NVME_QUEUE_PHYS_CONTIG;
+ if (enable_wrr) {
+ /*
+ * Note: io-prio class to nvme priority mapping
+ * none -> medium, realtime -> urgent, best-effort -> high,
+ * idle -> low
+ */
+ int prio[] = {NVME_SQ_PRIO_MEDIUM, NVME_SQ_PRIO_URGENT,
+ NVME_SQ_PRIO_HIGH, NVME_SQ_PRIO_LOW};
+ flags |= prio[sq_indx];
+ }
/*
* Note: we (ab)use the fact that the prp fields survive if no data
* is attached to the request.
*/
memset(&c, 0, sizeof(c));
c.create_sq.opcode = nvme_admin_create_sq;
- c.create_sq.prp1 = cpu_to_le64(nvmeq->sq_dma_addr);
- c.create_sq.sqid = cpu_to_le16(qid);
+ c.create_sq.prp1 = cpu_to_le64(nvmeq->sq[sq_indx].sq_dma_addr);
+ c.create_sq.sqid = cpu_to_le16(nvmeq->sq[sq_indx].id);
c.create_sq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
c.create_sq.sq_flags = cpu_to_le16(flags);
- c.create_sq.cqid = cpu_to_le16(qid);
-
+ c.create_sq.cqid = cpu_to_le16(nvmeq->cq_id);
return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
}
@@ -1202,7 +1246,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
if (__nvme_poll(nvmeq, req->tag)) {
dev_warn(dev->ctrl.device,
"I/O %d QID %d timeout, completion polled\n",
- req->tag, nvmeq->qid);
+ req->tag, nvmeq->cq_id);
return BLK_EH_HANDLED;
}
@@ -1215,7 +1259,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
if (dev->ctrl.state == NVME_CTRL_RESETTING) {
dev_warn(dev->ctrl.device,
"I/O %d QID %d timeout, disable controller\n",
- req->tag, nvmeq->qid);
+ req->tag, nvmeq->cq_id);
nvme_dev_disable(dev, false);
nvme_req(req)->flags |= NVME_REQ_CANCELLED;
return BLK_EH_HANDLED;
@@ -1226,10 +1270,10 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
* command was already aborted once before and still hasn't been
* returned to the driver, or if this is the admin queue.
*/
- if (!nvmeq->qid || iod->aborted) {
+ if (!nvmeq->cq_id || iod->aborted) {
dev_warn(dev->ctrl.device,
"I/O %d QID %d timeout, reset controller\n",
- req->tag, nvmeq->qid);
+ req->tag, nvmeq->cq_id);
nvme_dev_disable(dev, false);
nvme_reset_ctrl(&dev->ctrl);
@@ -1250,11 +1294,11 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
memset(&cmd, 0, sizeof(cmd));
cmd.abort.opcode = nvme_admin_abort_cmd;
cmd.abort.cid = req->tag;
- cmd.abort.sqid = cpu_to_le16(nvmeq->qid);
+ cmd.abort.sqid = cpu_to_le16(iod->sq_indx);
dev_warn(nvmeq->dev->ctrl.device,
"I/O %d QID %d timeout, aborting\n",
- req->tag, nvmeq->qid);
+ req->tag, nvmeq->cq_id);
abort_req = nvme_alloc_request(dev->ctrl.admin_q, &cmd,
BLK_MQ_REQ_NOWAIT, NVME_QID_ANY);
@@ -1277,11 +1321,17 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
static void nvme_free_queue(struct nvme_queue *nvmeq)
{
+ unsigned idx = 0;
dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
(void *)nvmeq->cqes, nvmeq->cq_dma_addr);
- if (nvmeq->sq_cmds)
- dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
- nvmeq->sq_cmds, nvmeq->sq_dma_addr);
+ for (idx = 0; idx < nvmeq->nr_sq; idx++) {
+ if (nvmeq->sq[idx].sq_cmds)
+ dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
+ nvmeq->sq[idx].sq_cmds,
+ nvmeq->sq[idx].sq_dma_addr);
+
+
+ }
kfree(nvmeq);
}
@@ -1315,7 +1365,7 @@ static int nvme_suspend_queue(struct nvme_queue *nvmeq)
nvmeq->cq_vector = -1;
spin_unlock_irq(&nvmeq->q_lock);
- if (!nvmeq->qid && nvmeq->dev->ctrl.admin_q)
+ if (!nvmeq->cq_id && nvmeq->dev->ctrl.admin_q)
blk_mq_quiesce_queue(nvmeq->dev->ctrl.admin_q);
pci_free_irq(to_pci_dev(nvmeq->dev->dev), vector, nvmeq);
@@ -1367,17 +1417,18 @@ static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues,
}
static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq,
- int qid, int depth)
+ int sq_indx, int depth)
{
+ int qid = nvmeq->sq[sq_indx].id;
if (qid && dev->cmb && use_cmb_sqes && NVME_CMB_SQS(dev->cmbsz)) {
unsigned offset = (qid - 1) * roundup(SQ_SIZE(depth),
dev->ctrl.page_size);
- nvmeq->sq_dma_addr = dev->cmb_bus_addr + offset;
- nvmeq->sq_cmds_io = dev->cmb + offset;
+ nvmeq->sq[sq_indx].sq_dma_addr = dev->cmb_bus_addr + offset;
+ nvmeq->sq[sq_indx].sq_cmds_io = dev->cmb + offset;
} else {
- nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth),
- &nvmeq->sq_dma_addr, GFP_KERNEL);
- if (!nvmeq->sq_cmds)
+ nvmeq->sq[sq_indx].sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth),
+ &nvmeq->sq[sq_indx].sq_dma_addr, GFP_KERNEL);
+ if (!nvmeq->sq[sq_indx].sq_cmds)
return -ENOMEM;
}
@@ -1385,36 +1436,51 @@ static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq,
}
static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
- int depth, int node)
+ int depth, int node,
+ int nr_sq)
{
- struct nvme_queue *nvmeq = kzalloc_node(sizeof(*nvmeq), GFP_KERNEL,
+ struct nvme_queue *nvmeq = kzalloc_node(sizeof(*nvmeq) + \
+ (nr_sq * sizeof(struct sq_data)), GFP_KERNEL,
node);
+ int cq_id, i;
if (!nvmeq)
return NULL;
-
nvmeq->cqes = dma_zalloc_coherent(dev->dev, CQ_SIZE(depth),
&nvmeq->cq_dma_addr, GFP_KERNEL);
if (!nvmeq->cqes)
goto free_nvmeq;
- if (nvme_alloc_sq_cmds(dev, nvmeq, qid, depth))
- goto free_cqdma;
+ nvmeq->nr_sq = nr_sq;
+ cq_id = (qid * nr_sq) - nr_sq + 1;
+ nvmeq->cq_id = cq_id;
+ nvmeq->cq_db = &dev->dbs[cq_idx(nvmeq->cq_id, dev->db_stride)];
+ for (i = 0; i < nr_sq; i++) {
+ nvmeq->sq[i].id = cq_id++;
+ if (nvme_alloc_sq_cmds(dev, nvmeq, i, depth))
+ goto free_cqdma;
+
+ nvmeq->sq[i].sq_db = &dev->dbs[sq_idx(nvmeq->sq[i].id, dev->db_stride)];
+ }
nvmeq->q_dmadev = dev->dev;
nvmeq->dev = dev;
spin_lock_init(&nvmeq->q_lock);
nvmeq->cq_head = 0;
nvmeq->cq_phase = 1;
- nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
nvmeq->q_depth = depth;
- nvmeq->qid = qid;
nvmeq->cq_vector = -1;
dev->queues[qid] = nvmeq;
dev->ctrl.queue_count++;
-
return nvmeq;
free_cqdma:
+ for (i = 0; i < nr_sq; i++) {
+ if (nvmeq->sq[i].sq_cmds) {
+ dma_free_coherent(dev->dev, SQ_SIZE(depth),
+ nvmeq->sq[i].sq_cmds,
+ nvmeq->sq[i].sq_dma_addr);
+ }
+ }
dma_free_coherent(dev->dev, CQ_SIZE(depth), (void *)nvmeq->cqes,
nvmeq->cq_dma_addr);
free_nvmeq:
@@ -1429,22 +1495,26 @@ static int queue_request_irq(struct nvme_queue *nvmeq)
if (use_threaded_interrupts) {
return pci_request_irq(pdev, nvmeq->cq_vector, nvme_irq_check,
- nvme_irq, nvmeq, "nvme%dq%d", nr, nvmeq->qid);
+ nvme_irq, nvmeq, "nvme%dq%d", nr, nvmeq->cq_id);
} else {
return pci_request_irq(pdev, nvmeq->cq_vector, nvme_irq,
- NULL, nvmeq, "nvme%dq%d", nr, nvmeq->qid);
+ NULL, nvmeq, "nvme%dq%d", nr, nvmeq->cq_id);
}
}
static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
{
struct nvme_dev *dev = nvmeq->dev;
-
+ int i;
spin_lock_irq(&nvmeq->q_lock);
- nvmeq->sq_tail = 0;
+ for (i = 0; i < nvmeq->nr_sq; i++) {
+ nvmeq->sq[i].sq_tail = 0;
+ nvmeq->sq[i].sq_db = &dev->dbs[sq_idx(nvmeq->sq[i].id,
+ dev->db_stride)];
+ }
+ nvmeq->cq_db = &dev->dbs[cq_idx(nvmeq->cq_id, dev->db_stride)];
nvmeq->cq_head = 0;
nvmeq->cq_phase = 1;
- nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth));
nvme_dbbuf_init(dev, nvmeq, qid);
dev->online_queues++;
@@ -1454,16 +1524,16 @@ static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
static int nvme_create_queue(struct nvme_queue *nvmeq, int qid)
{
struct nvme_dev *dev = nvmeq->dev;
- int result;
-
+ int result, i;
nvmeq->cq_vector = qid - 1;
- result = adapter_alloc_cq(dev, qid, nvmeq);
+ result = adapter_alloc_cq(dev, nvmeq->cq_id, nvmeq);
if (result < 0)
return result;
-
- result = adapter_alloc_sq(dev, qid, nvmeq);
- if (result < 0)
- goto release_cq;
+ for (i = 0; i < nvmeq->nr_sq; i++) {
+ result = adapter_alloc_sq(dev, i, nvmeq);
+ if (result < 0)
+ goto release_cq;
+ }
nvme_init_queue(nvmeq, qid);
result = queue_request_irq(nvmeq);
@@ -1473,9 +1543,12 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid)
return result;
release_sq:
- adapter_delete_sq(dev, qid);
+ while (i) {
+ adapter_delete_sq(dev, nvmeq->sq[i].id);
+ --i;
+ }
release_cq:
- adapter_delete_cq(dev, qid);
+ adapter_delete_cq(dev, nvmeq->cq_id);
return result;
}
@@ -1595,7 +1668,7 @@ static int nvme_pci_configure_admin_queue(struct nvme_dev *dev)
nvmeq = dev->queues[0];
if (!nvmeq) {
nvmeq = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH,
- dev_to_node(dev->dev));
+ dev_to_node(dev->dev), 1);
if (!nvmeq)
return -ENOMEM;
}
@@ -1604,13 +1677,12 @@ static int nvme_pci_configure_admin_queue(struct nvme_dev *dev)
aqa |= aqa << 16;
writel(aqa, dev->bar + NVME_REG_AQA);
- lo_hi_writeq(nvmeq->sq_dma_addr, dev->bar + NVME_REG_ASQ);
+ lo_hi_writeq(nvmeq->sq[0].sq_dma_addr, dev->bar + NVME_REG_ASQ);
lo_hi_writeq(nvmeq->cq_dma_addr, dev->bar + NVME_REG_ACQ);
result = nvme_enable_ctrl(&dev->ctrl, dev->ctrl.cap);
if (result)
return result;
-
nvmeq->cq_vector = 0;
nvme_init_queue(nvmeq, 0);
result = queue_request_irq(nvmeq);
@@ -1626,11 +1698,11 @@ static int nvme_create_io_queues(struct nvme_dev *dev)
{
unsigned i, max;
int ret = 0;
-
for (i = dev->ctrl.queue_count; i <= dev->max_qid; i++) {
/* vector == qid - 1, match nvme_create_queue */
if (!nvme_alloc_queue(dev, i, dev->q_depth,
- pci_irq_get_node(to_pci_dev(dev->dev), i - 1))) {
+ pci_irq_get_node(to_pci_dev(dev->dev), i - 1),
+ dev->sq_per_core)) {
ret = -ENOMEM;
break;
}
@@ -1896,19 +1968,18 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
{
struct nvme_queue *adminq = dev->queues[0];
struct pci_dev *pdev = to_pci_dev(dev->dev);
- int result, nr_io_queues;
+ int result, nr_io_sqes, nr_io_cqes;
unsigned long size;
- nr_io_queues = num_present_cpus();
- result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues);
+ nr_io_sqes = num_present_cpus() * dev->sq_per_core;
+ result = nvme_set_queue_count(&dev->ctrl, &nr_io_sqes);
if (result < 0)
return result;
- if (nr_io_queues == 0)
+ if (nr_io_sqes == 0)
return 0;
-
if (dev->cmb && NVME_CMB_SQS(dev->cmbsz)) {
- result = nvme_cmb_qdepth(dev, nr_io_queues,
+ result = nvme_cmb_qdepth(dev, nr_io_sqes,
sizeof(struct nvme_command));
if (result > 0)
dev->q_depth = result;
@@ -1917,14 +1988,18 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
}
do {
- size = db_bar_size(dev, nr_io_queues);
+ size = db_bar_size(dev, nr_io_sqes);
result = nvme_remap_bar(dev, size);
if (!result)
break;
- if (!--nr_io_queues)
+ nr_io_sqes -= dev->sq_per_core;
+ if (!nr_io_sqes)
return -ENOMEM;
} while (1);
- adminq->q_db = dev->dbs;
+ adminq->sq[0].sq_db = dev->dbs;
+ adminq->cq_db = &dev->dbs[dev->db_stride];
+
+ nr_io_cqes = nr_io_sqes / dev->sq_per_core;
/* Deregister the admin queue's interrupt */
pci_free_irq(pdev, 0, adminq);
@@ -1934,11 +2009,15 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
* setting up the full range we need.
*/
pci_free_irq_vectors(pdev);
- nr_io_queues = pci_alloc_irq_vectors(pdev, 1, nr_io_queues,
+ nr_io_cqes = pci_alloc_irq_vectors(pdev, 1, nr_io_cqes,
PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY);
- if (nr_io_queues <= 0)
+ if (nr_io_cqes <= 0)
return -EIO;
- dev->max_qid = nr_io_queues;
+ /*
+ * Recalculate the SQ count, in case nr_io_cqes was reduced by the above call
+ */
+ nr_io_sqes = nr_io_cqes * dev->sq_per_core;
+ dev->max_qid = nr_io_cqes;
/*
* Should investigate if there's a performance win from allocating
@@ -1984,7 +2063,7 @@ static void nvme_del_cq_end(struct request *req, blk_status_t error)
nvme_del_queue_end(req, error);
}
-static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode)
+static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode, u16 qid)
{
struct request_queue *q = nvmeq->dev->ctrl.admin_q;
struct request *req;
@@ -1992,7 +2071,7 @@ static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode)
memset(&cmd, 0, sizeof(cmd));
cmd.delete_queue.opcode = opcode;
- cmd.delete_queue.qid = cpu_to_le16(nvmeq->qid);
+ cmd.delete_queue.qid = cpu_to_le16(qid);
req = nvme_alloc_request(q, &cmd, BLK_MQ_REQ_NOWAIT, NVME_QID_ANY);
if (IS_ERR(req))
@@ -2009,20 +2088,34 @@ static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode)
static void nvme_disable_io_queues(struct nvme_dev *dev, int queues)
{
- int pass;
+ int pass, err;
unsigned long timeout;
u8 opcode = nvme_admin_delete_sq;
for (pass = 0; pass < 2; pass++) {
- int sent = 0, i = queues;
+ int sent = 0, i = queues, j;
reinit_completion(&dev->ioq_wait);
retry:
timeout = ADMIN_TIMEOUT;
- for (; i > 0; i--, sent++)
- if (nvme_delete_queue(dev->queues[i], opcode))
- break;
+ if (opcode == nvme_admin_delete_cq) {
+ for (; i > 0; i--, sent++)
+ if (nvme_delete_queue(dev->queues[i], opcode,
+ dev->queues[i]->cq_id))
+ break;
+ } else {
+ for (; i > 0; i--) {
+ for (j = 0; j < dev->sq_per_core; j++) {
+ err = nvme_delete_queue(dev->queues[i],
+ opcode,
+ dev->queues[i]->sq[j].id);
+ if (err)
+ break;
+ ++sent;
+ }
+ }
+ }
while (sent--) {
timeout = wait_for_completion_io_timeout(&dev->ioq_wait, timeout);
if (timeout == 0)
@@ -2106,7 +2199,6 @@ static int nvme_pci_enable(struct nvme_dev *dev)
io_queue_depth);
dev->db_stride = 1 << NVME_CAP_STRIDE(dev->ctrl.cap);
dev->dbs = dev->bar + 4096;
-
/*
* Temporary fix for the Apple controller found in the MacBook8,1 and
* some MacBook7,1 to avoid controller resets and data loss.
@@ -2306,6 +2398,18 @@ static void nvme_reset_work(struct work_struct *work)
if (result)
goto out;
+ dev->sq_per_core = SQ_PER_CORE_RR;
+ if (enable_wrr) {
+ if (NVME_CAP_WRR(dev->ctrl.cap)) {
+ dev->sq_per_core = SQ_PER_CORE_WRR;
+ dev->ctrl.ctrl_config = NVME_CC_AMS_WRRU;
+ dev_info(dev->ctrl.device,
+ "enabling wrr, %u sq per core\n",
+ dev->sq_per_core);
+ } else
+ dev_warn(dev->ctrl.device, "does not support WRR\n");
+ }
+
result = nvme_pci_configure_admin_queue(dev);
if (result)
goto out;
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index aea87f0d..7b33a47 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -116,6 +116,7 @@ enum {
};
#define NVME_CAP_MQES(cap) ((cap) & 0xffff)
+#define NVME_CAP_WRR(cap) (((cap) >> 17) & 0x1)
#define NVME_CAP_TIMEOUT(cap) (((cap) >> 24) & 0xff)
#define NVME_CAP_STRIDE(cap) (((cap) >> 32) & 0xf)
#define NVME_CAP_NSSRC(cap) (((cap) >> 36) & 0x1)
--
2.7.4