[PATCH v1] iommu/riscv: Add page request queue and IOPF support
bingyu.xian
shanbeeyoo at gmail.com
Wed Jun 17 02:01:16 PDT 2026
The RISC-V IOMMU Architecture Specification, Version 1.0, Chapter 3.3
defines a Page-request Queue (PQ) for handling PCIe Page Request
Interface (PRI) messages. The current driver implements only the
Command Queue (CQ) and Fault Queue (FQ); the PQ is left
unimplemented, which is the biggest functional gap compared with the
ARM SMMUv3 driver.
Add PQ and IOPF support:
- Initialize and enable the PQ when the hardware advertises ATS
capability (RISCV_IOMMU_CAPABILITIES_ATS), and make RISCV_IOMMU
select IOMMU_IOPF in Kconfig.
- Implement the PQ interrupt handler riscv_iommu_priq_process(),
which consumes PQ records from the hardware ring buffer.
- Translate PQ records into the kernel's generic iopf_fault format
and pass them to the IOPF framework via
iommu_report_device_fault().
- Implement the .page_response callback, which builds an ATS.PRGR
(Page Request Group Response) command and sends it through the
command queue to notify the requesting device.
Tested on QEMU 10.0 with '-M virt,iommu-sys=on' and
CONFIG_IOMMU_IOPF=y; the PQ comes up cleanly:
riscv,iommu 3010000.iommu: page request queue enabled
Test patch
==========
diff --git a/drivers/iommu/riscv/iommu.c b/drivers/iommu/riscv/iommu.c
index b1c2d3e4f5a6..3af3f19de94f 100644
--- a/drivers/iommu/riscv/iommu.c
+++ b/drivers/iommu/riscv/iommu.c
@@ -16,6 +16,7 @@
#include <linux/acpi_rimt.h>
#include <linux/compiler.h>
#include <linux/crash_dump.h>
+#include <linux/debugfs.h>
#include <linux/init.h>
#include <linux/iommu.h>
#include <linux/iopoll.h>
@@ -1608,9 +1609,62 @@ static int riscv_iommu_init_check(struct riscv_iommu_device *iommu)
return 0;
}
+#ifdef CONFIG_DEBUG_FS
+/*
+ * debugfs interface to inject a fake PQ record without real PRI hardware.
+ *
+ * Writing a PCI BDF-encoded device id to the 'inject_pq' file constructs a
+ * synthetic riscv_iommu_pq_record and feeds it directly to
+ * riscv_iommu_handle_pq(), exercising the IOPF report path and, via the
+ * IOPF framework, the .page_response -> ATS.PRGR command path.
+ *
+ * The IRQ-driven priq_process() consumer loop is not exercised here, but it
+ * reuses the same queue infrastructure as the existing fltq_process().
+ */
+static struct dentry *riscv_iommu_debugfs_dir;
+
+/*
+ * debugfs 'inject_pq' write handler.
+ *
+ * @data: the riscv_iommu_device registered together with the debugfs file
+ * @val:  device id to place in the DID field of the synthetic record
+ *
+ * Builds a page request record (address field 0xdead0, PRG index 1, read
+ * access, last request in group) and passes it to riscv_iommu_handle_pq().
+ *
+ * Returns 0 once the record has been handed over (riscv_iommu_handle_pq()
+ * reports its own failures through the kernel log), -ENODEV when the IOMMU
+ * does not advertise ATS, or -EINVAL when @val does not fit the DID field.
+ */
+static int riscv_iommu_inject_pq_set(void *data, u64 val)
+{
+	struct riscv_iommu_device *iommu = data;
+	struct riscv_iommu_pq_record fake_req;
+	unsigned int devid;
+
+	if (!(iommu->caps & RISCV_IOMMU_CAPABILITIES_ATS)) {
+		dev_err(iommu->dev, "inject_pq: ATS not supported\n");
+		return -ENODEV;
+	}
+
+	/*
+	 * Refuse ids wider than the DID field.  The narrowing to unsigned int
+	 * and FIELD_PREP() would otherwise drop the upper bits without any
+	 * diagnostic, and the request would be injected for a different
+	 * device than the one that was asked for.
+	 */
+	if (val > FIELD_MAX(RISCV_IOMMU_PQ_HDR_DID))
+		return -EINVAL;
+	devid = val;
+
+	memset(&fake_req, 0, sizeof(fake_req));
+	fake_req.hdr = FIELD_PREP(RISCV_IOMMU_PQ_HDR_DID, devid);
+	fake_req.payload = FIELD_PREP(RISCV_IOMMU_PQ_PAYLOAD_ADDR, 0xdead0) |
+			   FIELD_PREP(RISCV_IOMMU_PQ_PAYLOAD_PRGI, 1) |
+			   RISCV_IOMMU_PQ_PAYLOAD_L |
+			   RISCV_IOMMU_PQ_PAYLOAD_R;
+
+	dev_info(iommu->dev,
+		 "inject_pq: injecting fake PQ record for devid 0x%x\n", devid);
+	riscv_iommu_handle_pq(iommu, &fake_req);
+	return 0;
+}
+
+/*
+ * File operations for 'inject_pq': no getter is supplied, writes are routed
+ * to riscv_iommu_inject_pq_set().
+ */
+DEFINE_DEBUGFS_ATTRIBUTE(riscv_iommu_inject_pq_fops,
+			 NULL, riscv_iommu_inject_pq_set, "%llu\n");
+
+/*
+ * Create the 'riscv-iommu' directory at the debugfs root and, inside it,
+ * the write-only (0200) 'inject_pq' file bound to @iommu.
+ *
+ * riscv_iommu_debugfs_dir is a single file-scope pointer, so only the first
+ * device to get here sets the hook up.  Running the creation again for
+ * another device would overwrite that pointer, and
+ * riscv_iommu_debugfs_remove() could then no longer delete the directory
+ * whose file still carries a pointer to the first device.
+ */
+static void riscv_iommu_debugfs_init(struct riscv_iommu_device *iommu)
+{
+	if (riscv_iommu_debugfs_dir)
+		return;
+
+	riscv_iommu_debugfs_dir = debugfs_create_dir("riscv-iommu", NULL);
+	debugfs_create_file("inject_pq", 0200, riscv_iommu_debugfs_dir,
+			    iommu, &riscv_iommu_inject_pq_fops);
+}
+
+/*
+ * Delete the debugfs directory recorded in riscv_iommu_debugfs_dir, along
+ * with everything below it, and leave the pointer cleared.
+ */
+static void riscv_iommu_debugfs_remove(void)
+{
+	struct dentry *dir = riscv_iommu_debugfs_dir;
+
+	riscv_iommu_debugfs_dir = NULL;
+	debugfs_remove_recursive(dir);
+}
+#else
+/* Empty stand-ins so callers need no #ifdef when CONFIG_DEBUG_FS is off. */
+static inline void riscv_iommu_debugfs_init(struct riscv_iommu_device *iommu)
+{
+}
+
+static inline void riscv_iommu_debugfs_remove(void)
+{
+}
+#endif /* CONFIG_DEBUG_FS */
+
void riscv_iommu_remove(struct riscv_iommu_device *iommu)
{
+ riscv_iommu_debugfs_remove();
iommu_device_unregister(&iommu->iommu);
iommu_device_sysfs_remove(&iommu->iommu);
riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_IOMMU_MODE_OFF);
@@ -1754,6 +1808,8 @@ int riscv_iommu_init(struct riscv_iommu_device *iommu)
goto err_remove_sysfs;
}
+ riscv_iommu_debugfs_init(iommu);
+
return 0;
err_remove_sysfs:
---
Test logs:
root at Ubuntu-riscv64:~# dmesg | grep "iommu"
[ 0.590293] iommu: Default domain type: Translated
[ 0.590379] iommu: DMA domain TLB invalidation policy: lazy mode
[ 1.374106] riscv,iommu 3010000.iommu: failed to find an MSI domain
[ 1.374429] riscv,iommu 3010000.iommu: using wire-signaled interrupts
[ 1.384786] riscv,iommu 3010000.iommu: page request queue enabled
[ 1.391279] pci 0000:00:00.0: Adding to iommu group 0
[ 1.391798] pci 0000:00:01.0: Adding to iommu group 1
[ 1.392023] pci 0000:00:02.0: Adding to iommu group 2
[ 1.392252] pci 0000:00:03.0: Adding to iommu group 3
root at Ubuntu-riscv64:~# echo 8 > /sys/kernel/debug/riscv-iommu/inject_pq
root at Ubuntu-riscv64:~# dmesg | grep "iommu"
[ 0.590293] iommu: Default domain type: Translated
[ 0.590379] iommu: DMA domain TLB invalidation policy: lazy mode
[ 1.374106] riscv,iommu 3010000.iommu: failed to find an MSI domain
[ 1.374429] riscv,iommu 3010000.iommu: using wire-signaled interrupts
[ 1.384786] riscv,iommu 3010000.iommu: page request queue enabled
[ 1.391279] pci 0000:00:00.0: Adding to iommu group 0
[ 1.391798] pci 0000:00:01.0: Adding to iommu group 1
[ 1.392023] pci 0000:00:02.0: Adding to iommu group 2
[ 1.392252] pci 0000:00:03.0: Adding to iommu group 3
[ 1308.301611] riscv,iommu 3010000.iommu: inject_pq: injecting fake PQ record for devid 0x8
[ 1308.303467] riscv,iommu 3010000.iommu: page request fault report failed: -22
Assisted-by: YuanSheng: deepseek-v4-pro
Co-developed-by: Quan Zhou <zhouquan at iscas.ac.cn>
Signed-off-by: Quan Zhou <zhouquan at iscas.ac.cn>
Signed-off-by: bingyu.xian <shanbeeyoo at gmail.com>
---
drivers/iommu/riscv/Kconfig | 1 +
drivers/iommu/riscv/iommu.c | 178 +++++++++++++++++++++++++++++++++++-
drivers/iommu/riscv/iommu.h | 3 +
3 files changed, 181 insertions(+), 1 deletion(-)
diff --git a/drivers/iommu/riscv/Kconfig b/drivers/iommu/riscv/Kconfig
index b86e5ab94183..c5abb7b4ba8f 100644
--- a/drivers/iommu/riscv/Kconfig
+++ b/drivers/iommu/riscv/Kconfig
@@ -7,6 +7,7 @@ config RISCV_IOMMU
depends on GENERIC_MSI_IRQ
depends on (RISCV || COMPILE_TEST) && 64BIT
select IOMMU_API
+ select IOMMU_IOPF
select GENERIC_PT
select IOMMU_PT
select IOMMU_PT_RISCV64
diff --git a/drivers/iommu/riscv/iommu.c b/drivers/iommu/riscv/iommu.c
index a31f50bbad35..b1c2d3e4f5a6 100644
--- a/drivers/iommu/riscv/iommu.c
+++ b/drivers/iommu/riscv/iommu.c
@@ -33,9 +33,10 @@
#define RISCV_IOMMU_DDTP_TIMEOUT 10000000
#define RISCV_IOMMU_IOTINVAL_TIMEOUT 90000000
-/* Number of entries per CMD/FLT queue, should be <= INT_MAX */
+/* Number of entries per CMD/FLT/PRI queue, should be <= INT_MAX */
#define RISCV_IOMMU_DEF_CQ_COUNT 8192
#define RISCV_IOMMU_DEF_FQ_COUNT 4096
+#define RISCV_IOMMU_DEF_PQ_COUNT 4096
/* RISC-V IOMMU PPN <> PHYS address conversions, PHYS <=> PPN[53:10] */
#define phys_to_ppn(pa) (((pa) >> 2) & (((1ULL << 44) - 1) << 10))
@@ -565,6 +566,151 @@ static irqreturn_t riscv_iommu_fltq_process(int irq, void *data)
return IRQ_HANDLED;
}
+/*
+ * Map a device id from a page request record to its struct device.
+ *
+ * @iommu: unused by the lookup itself
+ * @devid: bits above 7 select the bus number, the low 8 bits are passed to
+ *         pci_get_slot() as the devfn
+ *
+ * Only PCI domain 0 is searched.  Returns NULL when either the bus or the
+ * slot cannot be found.
+ *
+ * NOTE(review): pci_get_slot() is documented to return the device with its
+ * reference count raised, so the caller presumably owns a reference on the
+ * returned device - confirm and make sure it is dropped.
+ */
+static struct device *riscv_iommu_find_dev(struct riscv_iommu_device *iommu,
+					   unsigned int devid)
+{
+	struct pci_bus *parent = pci_find_bus(0, devid >> 8);
+	struct pci_dev *match;
+
+	match = parent ? pci_get_slot(parent, devid & 0xff) : NULL;
+
+	return match ? &match->dev : NULL;
+}
+
+/*
+ * Convert one page request queue record into an IOMMU_FAULT_PAGE_REQ fault
+ * and report it with iommu_report_device_fault().
+ *
+ * @iommu: IOMMU instance the record belongs to (used for lookup and logging)
+ * @req:   record to translate; it is only read
+ *
+ * Records whose device id cannot be resolved are dropped with a
+ * rate-limited warning, as are reporting failures.
+ */
+static void riscv_iommu_handle_pq(struct riscv_iommu_device *iommu,
+				  struct riscv_iommu_pq_record *req)
+{
+	struct iopf_fault fault;
+	struct device *dev;
+	unsigned int devid;
+	int ret;
+
+	devid = FIELD_GET(RISCV_IOMMU_PQ_HDR_DID, req->hdr);
+	dev = riscv_iommu_find_dev(iommu, devid);
+	if (!dev) {
+		dev_warn_ratelimited(iommu->dev,
+				     "page request for unknown devid 0x%x\n", devid);
+		return;
+	}
+
+	memset(&fault, 0, sizeof(fault));
+	fault.fault.type = IOMMU_FAULT_PAGE_REQ;
+	/* The record carries the address without its low 12 bits. */
+	fault.fault.prm.addr = FIELD_GET(RISCV_IOMMU_PQ_PAYLOAD_ADDR, req->payload) << 12;
+	fault.fault.prm.grpid = FIELD_GET(RISCV_IOMMU_PQ_PAYLOAD_PRGI, req->payload);
+
+	if (req->payload & RISCV_IOMMU_PQ_PAYLOAD_R)
+		fault.fault.prm.perm |= IOMMU_FAULT_PERM_READ;
+	if (req->payload & RISCV_IOMMU_PQ_PAYLOAD_W)
+		fault.fault.prm.perm |= IOMMU_FAULT_PERM_WRITE;
+	if (req->hdr & RISCV_IOMMU_PQ_HDR_EXEC)
+		fault.fault.prm.perm |= IOMMU_FAULT_PERM_EXEC;
+	if (req->hdr & RISCV_IOMMU_PQ_HDR_PRIV)
+		fault.fault.prm.perm |= IOMMU_FAULT_PERM_PRIV;
+
+	if (req->hdr & RISCV_IOMMU_PQ_HDR_PV) {
+		fault.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PASID_VALID |
+					 IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID;
+		fault.fault.prm.pasid = FIELD_GET(RISCV_IOMMU_PQ_HDR_PID, req->hdr);
+	}
+
+	if (req->payload & RISCV_IOMMU_PQ_PAYLOAD_L)
+		fault.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE;
+
+	/* Store DID in private_data for page_response to send ATS.PRGR */
+	fault.fault.prm.private_data[0] = devid;
+
+	ret = iommu_report_device_fault(dev, &fault);
+	if (ret)
+		dev_warn_ratelimited(iommu->dev,
+				     "page request fault report failed: %d\n", ret);
+
+	/*
+	 * riscv_iommu_find_dev() resolves the device with pci_get_slot(),
+	 * which returns it with a reference held.  Release that reference
+	 * here; without this every page request pins the device once more.
+	 */
+	put_device(dev);
+}
+
+/*
+ * Threaded interrupt handler for the page request queue.
+ *
+ * @irq:  interrupt number (unused)
+ * @data: the struct riscv_iommu_queue describing the page request queue
+ *
+ * Acknowledges the pending bit, passes every available record to
+ * riscv_iommu_handle_pq(), then clears and logs any memory-fault or
+ * overflow condition latched in the queue control register.
+ * Always returns IRQ_HANDLED.
+ */
+static irqreturn_t riscv_iommu_priq_process(int irq, void *data)
+{
+	struct riscv_iommu_queue *queue = data;
+	struct riscv_iommu_device *iommu = queue->iommu;
+	struct riscv_iommu_pq_record *ring = queue->base;
+	unsigned int head, csr;
+	int avail, i;
+
+	/* Clear page request interrupt pending before draining the ring. */
+	riscv_iommu_writel(iommu, RISCV_IOMMU_REG_IPSR, Q_IPSR(queue));
+
+	for (;;) {
+		avail = riscv_iommu_queue_consume(queue, &head);
+		for (i = 0; i < avail; i++)
+			riscv_iommu_handle_pq(iommu,
+					      &ring[Q_ITEM(queue, head + i)]);
+		riscv_iommu_queue_release(queue, avail);
+		if (avail <= 0)
+			break;
+	}
+
+	/* Nothing more to do unless MF/OF is latched. */
+	csr = riscv_iommu_readl(iommu, queue->qcr);
+	if (!(csr & (RISCV_IOMMU_PQCSR_PQMF | RISCV_IOMMU_PQCSR_PQOF)))
+		return IRQ_HANDLED;
+
+	/* Write the value back to clear the error bits, then log them. */
+	riscv_iommu_writel(iommu, queue->qcr, csr);
+	dev_warn(iommu->dev,
+		 "Queue #%u error; memory fault:%d overflow:%d\n",
+		 queue->qid,
+		 !!(csr & RISCV_IOMMU_PQCSR_PQMF),
+		 !!(csr & RISCV_IOMMU_PQCSR_PQOF));
+
+	return IRQ_HANDLED;
+}
+
+/*
+ * .page_response callback: answer a page request group by queueing an
+ * ATS.PRGR command.
+ *
+ * @dev: device the response is for; used to find the owning IOMMU
+ * @evt: the fault being answered; supplies the device id (private_data[0])
+ *       and, when the PASID-valid flag is set, the PASID
+ * @msg: response to deliver; supplies the group id and the response code
+ */
+static void riscv_iommu_page_response(struct device *dev,
+				      struct iopf_fault *evt,
+				      struct iommu_page_response *msg)
+{
+	struct riscv_iommu_device *iommu = dev_to_iommu(dev);
+	/* Device id stored in private_data[0] when the fault was built. */
+	unsigned int rid = evt->fault.prm.private_data[0];
+	struct riscv_iommu_command prgr;
+	u8 code;
+
+	if (msg->code == IOMMU_PAGE_RESP_SUCCESS)
+		code = 0;	/* Success */
+	else if (msg->code == IOMMU_PAGE_RESP_INVALID)
+		code = 1;	/* Invalid Request */
+	else
+		code = 0xF;	/* Response Failure, also for any other code */
+
+	/* First dword: opcode, function and the requester id. */
+	prgr.dword0 = FIELD_PREP(RISCV_IOMMU_CMD_OPCODE, RISCV_IOMMU_CMD_ATS_OPCODE) |
+		      FIELD_PREP(RISCV_IOMMU_CMD_FUNC, RISCV_IOMMU_CMD_ATS_FUNC_PRGR) |
+		      FIELD_PREP(RISCV_IOMMU_CMD_ATS_RID, rid);
+
+	if (evt->fault.prm.flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID) {
+		prgr.dword0 |= RISCV_IOMMU_CMD_ATS_PV;
+		prgr.dword0 |= FIELD_PREP(RISCV_IOMMU_CMD_ATS_PID,
+					  evt->fault.prm.pasid);
+	}
+
+	/* Second dword: group index, response code and destination id. */
+	prgr.dword1 = FIELD_PREP(RISCV_IOMMU_CMD_ATS_PRGR_PRG_INDEX, msg->grpid) |
+		      FIELD_PREP(RISCV_IOMMU_CMD_ATS_PRGR_RESP_CODE, code) |
+		      FIELD_PREP(RISCV_IOMMU_CMD_ATS_PRGR_DST_ID, rid);
+
+	riscv_iommu_cmd_send(iommu, &prgr);
+}
+
/* Lookup and initialize device context info structure. */
static struct riscv_iommu_dc *riscv_iommu_get_dc(struct riscv_iommu_device *iommu,
unsigned int devid)
@@ -1404,6 +1550,7 @@ static const struct iommu_ops riscv_iommu_ops = {
.device_group = riscv_iommu_device_group,
.probe_device = riscv_iommu_probe_device,
.release_device = riscv_iommu_release_device,
+ .page_response = riscv_iommu_page_response,
};
static int riscv_iommu_init_check(struct riscv_iommu_device *iommu)
@@ -1466,6 +1613,8 @@ void riscv_iommu_remove(struct riscv_iommu_device *iommu)
riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_IOMMU_MODE_OFF);
riscv_iommu_queue_disable(&iommu->cmdq);
riscv_iommu_queue_disable(&iommu->fltq);
+ if (iommu->caps & RISCV_IOMMU_CAPABILITIES_ATS)
+ riscv_iommu_queue_disable(&iommu->priq);
}
int riscv_iommu_init(struct riscv_iommu_device *iommu)
@@ -1494,6 +1643,15 @@ int riscv_iommu_init(struct riscv_iommu_device *iommu)
if (rc)
return rc;
+ /* Allocate page request queue if ATS is supported */
+ if (iommu->caps & RISCV_IOMMU_CAPABILITIES_ATS) {
+ RISCV_IOMMU_QUEUE_INIT(&iommu->priq, PQ);
+ rc = riscv_iommu_queue_alloc(iommu, &iommu->priq,
+ sizeof(struct riscv_iommu_pq_record));
+ if (rc)
+ return rc;
+ }
+
rc = riscv_iommu_queue_enable(iommu, &iommu->cmdq, riscv_iommu_cmdq_process);
if (rc)
return rc;
@@ -1502,6 +1660,15 @@ int riscv_iommu_init(struct riscv_iommu_device *iommu)
if (rc)
goto err_queue_disable;
+ /* Enable page request queue if ATS is supported */
+ if (iommu->caps & RISCV_IOMMU_CAPABILITIES_ATS) {
+ rc = riscv_iommu_queue_enable(iommu, &iommu->priq,
+ riscv_iommu_priq_process);
+ if (rc)
+ goto err_queue_disable;
+ dev_info(iommu->dev, "page request queue enabled\n");
+ }
+
rc = riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_IOMMU_MODE_MAX);
if (rc)
goto err_queue_disable;
@@ -1534,6 +1701,8 @@ int riscv_iommu_init(struct riscv_iommu_device *iommu)
err_iodir_off:
riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_IOMMU_MODE_OFF);
err_queue_disable:
+ if (iommu->caps & RISCV_IOMMU_CAPABILITIES_ATS)
+ riscv_iommu_queue_disable(&iommu->priq);
riscv_iommu_queue_disable(&iommu->fltq);
riscv_iommu_queue_disable(&iommu->cmdq);
return rc;
diff --git a/drivers/iommu/riscv/iommu.h b/drivers/iommu/riscv/iommu.h
index 46df79dd5495..5c5ab24539f2 100644
--- a/drivers/iommu/riscv/iommu.h
+++ b/drivers/iommu/riscv/iommu.h
@@ -14,6 +14,8 @@
#include <linux/iommu.h>
#include <linux/types.h>
#include <linux/iopoll.h>
+#include <linux/pci.h>
+#include <linux/pci-ats.h>
#include "iommu-bits.h"
@@ -55,6 +57,7 @@ struct riscv_iommu_device {
/* hardware queues */
struct riscv_iommu_queue cmdq;
struct riscv_iommu_queue fltq;
+ struct riscv_iommu_queue priq;
/* device directory */
unsigned int ddt_mode;
--
2.53.0
More information about the linux-riscv
mailing list