[PATCH v1] iommu/riscv: Add page request queue and IOPF support

Gong Shuai gsh517025 at gmail.com
Wed Jun 17 07:49:10 PDT 2026


On 6/17/2026 5:01 PM, bingyu.xian wrote:
> The RISC-V IOMMU Architecture Specification, Version 1.0, Chapter 3.3
> defines a Page-request Queue (PQ) for handling PCIe Page Request
> Interface (PRI) messages. The current driver implements only the
> Command Queue (CQ) and Fault Queue (FQ); the PQ is left
> unimplemented, which is the biggest functional gap compared with the
> ARM SMMUv3 driver.
> 
> Add PQ and IOPF support:
> 
> - Initialize and enable the PQ when the hardware advertises ATS
>    capability (RISCV_IOMMU_CAPABILITIES_ATS), and add an IOMMU_IOPF
>    Kconfig dependency.
> - Implement the PQ interrupt handler riscv_iommu_priq_process(),
>    which consumes PQ records from the hardware ring buffer.
> - Translate PQ records into the kernel's generic iopf_fault format
>    and pass them to the IOPF framework via
>    iommu_report_device_fault().
> - Implement the .page_response callback, which builds an ATS.PRGR
>    (Page Request Group Response) command and sends it through the
>    command queue to notify the requesting device.
> 


Hi, bingyu

Thanks for your work.

Tomasz already has a complete ATS/PRI/SVA support series on GitHub [1],
which hasn't been posted to the mailing list yet.

[1] https://github.com/tjeznach/linux/commits/riscv_iommu.next


> Tested on QEMU 10.0 with '-M virt,iommu-sys=on' and
> CONFIG_IOMMU_IOPF=y; the PQ comes up cleanly:
> 
>    riscv,iommu 3010000.iommu: page request queue enabled
>    
> Test patch
> ==========


It would be better to split the test helper out into its own patch rather
than mixing it with the core implementation; as posted, the combined patch
is hard to apply directly.

Regards,
Shuai


> 
> diff --git a/drivers/iommu/riscv/iommu.c b/drivers/iommu/riscv/iommu.c
> index b1c2d3e4f5a6..3af3f19de94f 100644
> --- a/drivers/iommu/riscv/iommu.c
> +++ b/drivers/iommu/riscv/iommu.c
> @@ -16,6 +16,7 @@
>   #include <linux/acpi_rimt.h>
>   #include <linux/compiler.h>
>   #include <linux/crash_dump.h>
> +#include <linux/debugfs.h>
>   #include <linux/init.h>
>   #include <linux/iommu.h>
>   #include <linux/iopoll.h>
> @@ -1608,9 +1609,62 @@ static int riscv_iommu_init_check(struct riscv_iommu_device *iommu)
>   	return 0;
>   }
> 
> +#ifdef CONFIG_DEBUG_FS
> +/*
> + * debugfs interface to inject a fake PQ record without real PRI hardware.
> + *
> + * Writing a PCI BDF-encoded device id to the 'inject_pq' file constructs a
> + * synthetic riscv_iommu_pq_record and feeds it directly to
> + * riscv_iommu_handle_pq(), exercising the IOPF report path and, via the
> + * IOPF framework, the .page_response -> ATS.PRGR command path.
> + *
> + * The IRQ-driven priq_process() consumer loop is not exercised here, but it
> + * reuses the same queue infrastructure as the existing fltq_process().
> + */
> +static struct dentry *riscv_iommu_debugfs_dir;
> +
> +static int riscv_iommu_inject_pq_set(void *data, u64 val)
> +{
> +	struct riscv_iommu_device *iommu = data;
> +	struct riscv_iommu_pq_record fake_req;
> +	unsigned int devid = (unsigned int)val;
> +
> +	if (!(iommu->caps & RISCV_IOMMU_CAPABILITIES_ATS)) {
> +		dev_err(iommu->dev, "inject_pq: ATS not supported\n");
> +		return -ENODEV;
> +	}
> +
> +	memset(&fake_req, 0, sizeof(fake_req));
> +	fake_req.hdr = FIELD_PREP(RISCV_IOMMU_PQ_HDR_DID, devid);
> +	fake_req.payload = FIELD_PREP(RISCV_IOMMU_PQ_PAYLOAD_ADDR, 0xdead0) |
> +			   FIELD_PREP(RISCV_IOMMU_PQ_PAYLOAD_PRGI, 1) |
> +			   RISCV_IOMMU_PQ_PAYLOAD_L |
> +			   RISCV_IOMMU_PQ_PAYLOAD_R;
> +
> +	dev_info(iommu->dev,
> +		 "inject_pq: injecting fake PQ record for devid 0x%x\n", devid);
> +	riscv_iommu_handle_pq(iommu, &fake_req);
> +	return 0;
> +}
> +
> +DEFINE_DEBUGFS_ATTRIBUTE(riscv_iommu_inject_pq_fops, NULL,
> +			 riscv_iommu_inject_pq_set, "%llu\n");
> +
> +static void riscv_iommu_debugfs_init(struct riscv_iommu_device *iommu)
> +{
> +	riscv_iommu_debugfs_dir = debugfs_create_dir("riscv-iommu", NULL);
> +	debugfs_create_file("inject_pq", 0200, riscv_iommu_debugfs_dir,
> +			    iommu, &riscv_iommu_inject_pq_fops);
> +}
> +
> +static void riscv_iommu_debugfs_remove(void)
> +{
> +	debugfs_remove_recursive(riscv_iommu_debugfs_dir);
> +	riscv_iommu_debugfs_dir = NULL;
> +}
> +#else
> +static inline void riscv_iommu_debugfs_init(struct riscv_iommu_device *iommu) { }
> +static inline void riscv_iommu_debugfs_remove(void) { }
> +#endif /* CONFIG_DEBUG_FS */
> +
>   void riscv_iommu_remove(struct riscv_iommu_device *iommu)
>   {
> +	riscv_iommu_debugfs_remove();
>   	iommu_device_unregister(&iommu->iommu);
>   	iommu_device_sysfs_remove(&iommu->iommu);
>   	riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_IOMMU_MODE_OFF);
> @@ -1754,6 +1808,8 @@ int riscv_iommu_init(struct riscv_iommu_device *iommu)
>   		goto err_remove_sysfs;
>   	}
> 
> +	riscv_iommu_debugfs_init(iommu);
> +
>   	return 0;
> 
>   err_remove_sysfs:
> ---
> 
> Test logs:
>    root at Ubuntu-riscv64:~# dmesg | grep "iommu"
>    [    0.590293] iommu: Default domain type: Translated
>    [    0.590379] iommu: DMA domain TLB invalidation policy: lazy mode
>    [    1.374106] riscv,iommu 3010000.iommu: failed to find an MSI domain
>    [    1.374429] riscv,iommu 3010000.iommu: using wire-signaled interrupts
>    [    1.384786] riscv,iommu 3010000.iommu: page request queue enabled
>    [    1.391279] pci 0000:00:00.0: Adding to iommu group 0
>    [    1.391798] pci 0000:00:01.0: Adding to iommu group 1
>    [    1.392023] pci 0000:00:02.0: Adding to iommu group 2
>    [    1.392252] pci 0000:00:03.0: Adding to iommu group 3
> 
>    root at Ubuntu-riscv64:~# echo 8 > /sys/kernel/debug/riscv-iommu/inject_pq
> 
>    root at Ubuntu-riscv64:~# dmesg | grep "iommu"
>    [    0.590293] iommu: Default domain type: Translated
>    [    0.590379] iommu: DMA domain TLB invalidation policy: lazy mode
>    [    1.374106] riscv,iommu 3010000.iommu: failed to find an MSI domain
>    [    1.374429] riscv,iommu 3010000.iommu: using wire-signaled interrupts
>    [    1.384786] riscv,iommu 3010000.iommu: page request queue enabled
>    [    1.391279] pci 0000:00:00.0: Adding to iommu group 0
>    [    1.391798] pci 0000:00:01.0: Adding to iommu group 1
>    [    1.392023] pci 0000:00:02.0: Adding to iommu group 2
>    [    1.392252] pci 0000:00:03.0: Adding to iommu group 3
>    [ 1308.301611] riscv,iommu 3010000.iommu: inject_pq: injecting fake PQ record for devid 0x8
>    [ 1308.303467] riscv,iommu 3010000.iommu: page request fault report failed: -22
> 
> Assisted-by: YuanSheng: deepseek-v4-pro
> Co-developed-by: Quan Zhou <zhouquan at iscas.ac.cn>
> Signed-off-by: Quan Zhou <zhouquan at iscas.ac.cn>
> Signed-off-by: bingyu.xian <shanbeeyoo at gmail.com>
> ---
>   drivers/iommu/riscv/Kconfig |   1 +
>   drivers/iommu/riscv/iommu.c | 178 +++++++++++++++++++++++++++++++++++-
>   drivers/iommu/riscv/iommu.h |   3 +
>   3 files changed, 181 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/iommu/riscv/Kconfig b/drivers/iommu/riscv/Kconfig
> index b86e5ab94183..c5abb7b4ba8f 100644
> --- a/drivers/iommu/riscv/Kconfig
> +++ b/drivers/iommu/riscv/Kconfig
> @@ -7,6 +7,7 @@ config RISCV_IOMMU
>   	depends on GENERIC_MSI_IRQ
>   	depends on (RISCV || COMPILE_TEST) && 64BIT
>   	select IOMMU_API
> +	select IOMMU_IOPF
>   	select GENERIC_PT
>   	select IOMMU_PT
>   	select IOMMU_PT_RISCV64
> diff --git a/drivers/iommu/riscv/iommu.c b/drivers/iommu/riscv/iommu.c
> index a31f50bbad35..b1c2d3e4f5a6 100644
> --- a/drivers/iommu/riscv/iommu.c
> +++ b/drivers/iommu/riscv/iommu.c
> @@ -33,9 +33,10 @@
>   #define RISCV_IOMMU_DDTP_TIMEOUT	10000000
>   #define RISCV_IOMMU_IOTINVAL_TIMEOUT	90000000
> 
> -/* Number of entries per CMD/FLT queue, should be <= INT_MAX */
> +/* Number of entries per CMD/FLT/PRI queue, should be <= INT_MAX */
>   #define RISCV_IOMMU_DEF_CQ_COUNT	8192
>   #define RISCV_IOMMU_DEF_FQ_COUNT	4096
> +#define RISCV_IOMMU_DEF_PQ_COUNT	4096
> 
>   /* RISC-V IOMMU PPN <> PHYS address conversions, PHYS <=> PPN[53:10] */
>   #define phys_to_ppn(pa)  (((pa) >> 2) & (((1ULL << 44) - 1) << 10))
> @@ -565,6 +566,151 @@ static irqreturn_t riscv_iommu_fltq_process(int irq, void *data)
>   	return IRQ_HANDLED;
>   }
> 
> 
> +static struct device *riscv_iommu_find_dev(struct riscv_iommu_device *iommu,
> +					   unsigned int devid)
> +{
> +	struct pci_bus *bus;
> +	struct pci_dev *pdev;
> +
> +	bus = pci_find_bus(0, devid >> 8);
> +	if (!bus)
> +		return NULL;
> +
> +	pdev = pci_get_slot(bus, devid & 0xff);
> +	if (!pdev)
> +		return NULL;
> +
> +	return &pdev->dev;
> +}
> +
> +static void riscv_iommu_handle_pq(struct riscv_iommu_device *iommu,
> +				  struct riscv_iommu_pq_record *req)
> +{
> +	struct iopf_fault fault;
> +	struct device *dev;
> +	unsigned int devid;
> +	int ret;
> +
> +	devid = FIELD_GET(RISCV_IOMMU_PQ_HDR_DID, req->hdr);
> +	dev = riscv_iommu_find_dev(iommu, devid);
> +	if (!dev) {
> +		dev_warn_ratelimited(iommu->dev,
> +				     "page request for unknown devid 0x%x\n", devid);
> +		return;
> +	}
> +
> +	memset(&fault, 0, sizeof(fault));
> +	fault.fault.type = IOMMU_FAULT_PAGE_REQ;
> +	fault.fault.prm.addr = FIELD_GET(RISCV_IOMMU_PQ_PAYLOAD_ADDR, req->payload) << 12;
> +	fault.fault.prm.grpid = FIELD_GET(RISCV_IOMMU_PQ_PAYLOAD_PRGI, req->payload);
> +
> +	if (req->payload & RISCV_IOMMU_PQ_PAYLOAD_R)
> +		fault.fault.prm.perm |= IOMMU_FAULT_PERM_READ;
> +	if (req->payload & RISCV_IOMMU_PQ_PAYLOAD_W)
> +		fault.fault.prm.perm |= IOMMU_FAULT_PERM_WRITE;
> +	if (req->hdr & RISCV_IOMMU_PQ_HDR_EXEC)
> +		fault.fault.prm.perm |= IOMMU_FAULT_PERM_EXEC;
> +	if (req->hdr & RISCV_IOMMU_PQ_HDR_PRIV)
> +		fault.fault.prm.perm |= IOMMU_FAULT_PERM_PRIV;
> +
> +	if (req->hdr & RISCV_IOMMU_PQ_HDR_PV) {
> +		fault.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PASID_VALID |
> +					 IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID;
> +		fault.fault.prm.pasid = FIELD_GET(RISCV_IOMMU_PQ_HDR_PID, req->hdr);
> +	}
> +
> +	if (req->payload & RISCV_IOMMU_PQ_PAYLOAD_L)
> +		fault.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE;
> +
> +	/* Store DID in private_data for page_response to send ATS.PRGR */
> +	fault.fault.prm.private_data[0] = devid;
> +
> +	ret = iommu_report_device_fault(dev, &fault);
> +	if (ret) {
> +		dev_warn_ratelimited(iommu->dev,
> +				     "page request fault report failed: %d\n", ret);
> +	}
> +}
> +
> +/* Page request queue interrupt handler thread function */
> +static irqreturn_t riscv_iommu_priq_process(int irq, void *data)
> +{
> +	struct riscv_iommu_queue *queue = (struct riscv_iommu_queue *)data;
> +	struct riscv_iommu_device *iommu = queue->iommu;
> +	struct riscv_iommu_pq_record *requests;
> +	unsigned int ctrl, idx;
> +	int cnt, len;
> +
> +	requests = (struct riscv_iommu_pq_record *)queue->base;
> +
> +	/* Clear page request interrupt pending and process all records. */
> +	riscv_iommu_writel(iommu, RISCV_IOMMU_REG_IPSR, Q_IPSR(queue));
> +
> +	do {
> +		cnt = riscv_iommu_queue_consume(queue, &idx);
> +		for (len = 0; len < cnt; idx++, len++)
> +			riscv_iommu_handle_pq(iommu, &requests[Q_ITEM(queue, idx)]);
> +		riscv_iommu_queue_release(queue, cnt);
> +	} while (cnt > 0);
> +
> +	/* Clear MF/OF errors */
> +	ctrl = riscv_iommu_readl(iommu, queue->qcr);
> +	if (ctrl & (RISCV_IOMMU_PQCSR_PQMF | RISCV_IOMMU_PQCSR_PQOF)) {
> +		riscv_iommu_writel(iommu, queue->qcr, ctrl);
> +		dev_warn(iommu->dev,
> +			 "Queue #%u error; memory fault:%d overflow:%d\n",
> +			 queue->qid,
> +			 !!(ctrl & RISCV_IOMMU_PQCSR_PQMF),
> +			 !!(ctrl & RISCV_IOMMU_PQCSR_PQOF));
> +	}
> +
> +	return IRQ_HANDLED;
> +}
> +
> +/* Send ATS.PRGR page response through the command queue */
> +static void riscv_iommu_page_response(struct device *dev,
> +				      struct iopf_fault *evt,
> +				      struct iommu_page_response *msg)
> +{
> +	struct riscv_iommu_device *iommu = dev_to_iommu(dev);
> +	struct riscv_iommu_command cmd;
> +	unsigned int devid;
> +	u8 resp_code;
> +
> +	/* Recover DID from private_data stored during PQ processing */
> +	devid = evt->fault.prm.private_data[0];
> +
> +	switch (msg->code) {
> +	case IOMMU_PAGE_RESP_SUCCESS:
> +		resp_code = 0; /* Success */
> +		break;
> +	case IOMMU_PAGE_RESP_INVALID:
> +		resp_code = 1; /* Invalid Request */
> +		break;
> +	case IOMMU_PAGE_RESP_FAILURE:
> +	default:
> +		resp_code = 0xF; /* Response Failure */
> +		break;
> +	}
> +
> +	/* Build ATS.PRGR command */
> +	cmd.dword0 = FIELD_PREP(RISCV_IOMMU_CMD_OPCODE, RISCV_IOMMU_CMD_ATS_OPCODE) |
> +		     FIELD_PREP(RISCV_IOMMU_CMD_FUNC, RISCV_IOMMU_CMD_ATS_FUNC_PRGR) |
> +		     FIELD_PREP(RISCV_IOMMU_CMD_ATS_RID, devid);
> +
> +	if (evt->fault.prm.flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID)
> +		cmd.dword0 |= FIELD_PREP(RISCV_IOMMU_CMD_ATS_PID, evt->fault.prm.pasid) |
> +			      RISCV_IOMMU_CMD_ATS_PV;
> +
> +	cmd.dword1 = FIELD_PREP(RISCV_IOMMU_CMD_ATS_PRGR_PRG_INDEX, msg->grpid) |
> +		     FIELD_PREP(RISCV_IOMMU_CMD_ATS_PRGR_RESP_CODE, resp_code) |
> +		     FIELD_PREP(RISCV_IOMMU_CMD_ATS_PRGR_DST_ID, devid);
> +
> +	riscv_iommu_cmd_send(iommu, &cmd);
> +}
> +
>   /* Lookup and initialize device context info structure. */
>   static struct riscv_iommu_dc *riscv_iommu_get_dc(struct riscv_iommu_device *iommu,
>   						 unsigned int devid)
> @@ -1404,6 +1550,7 @@ static const struct iommu_ops riscv_iommu_ops = {
>   	.device_group = riscv_iommu_device_group,
>   	.probe_device = riscv_iommu_probe_device,
>   	.release_device	= riscv_iommu_release_device,
> +	.page_response = riscv_iommu_page_response,
>   };
> 
>   static int riscv_iommu_init_check(struct riscv_iommu_device *iommu)
> @@ -1466,6 +1613,8 @@ void riscv_iommu_remove(struct riscv_iommu_device *iommu)
>   	riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_IOMMU_MODE_OFF);
>   	riscv_iommu_queue_disable(&iommu->cmdq);
>   	riscv_iommu_queue_disable(&iommu->fltq);
> +	if (iommu->caps & RISCV_IOMMU_CAPABILITIES_ATS)
> +		riscv_iommu_queue_disable(&iommu->priq);
>   }
> 
>   int riscv_iommu_init(struct riscv_iommu_device *iommu)
> @@ -1494,6 +1643,15 @@ int riscv_iommu_init(struct riscv_iommu_device *iommu)
>   	if (rc)
>   		return rc;
> 
> +	/* Allocate page request queue if ATS is supported */
> +	if (iommu->caps & RISCV_IOMMU_CAPABILITIES_ATS) {
> +		RISCV_IOMMU_QUEUE_INIT(&iommu->priq, PQ);
> +		rc = riscv_iommu_queue_alloc(iommu, &iommu->priq,
> +					     sizeof(struct riscv_iommu_pq_record));
> +		if (rc)
> +			return rc;
> +	}
> +
>   	rc = riscv_iommu_queue_enable(iommu, &iommu->cmdq, riscv_iommu_cmdq_process);
>   	if (rc)
>   		return rc;
> @@ -1502,6 +1660,15 @@ int riscv_iommu_init(struct riscv_iommu_device *iommu)
>   	if (rc)
>   		goto err_queue_disable;
> 
> +	/* Enable page request queue if ATS is supported */
> +	if (iommu->caps & RISCV_IOMMU_CAPABILITIES_ATS) {
> +		rc = riscv_iommu_queue_enable(iommu, &iommu->priq,
> +					      riscv_iommu_priq_process);
> +		if (rc)
> +			goto err_queue_disable;
> +		dev_info(iommu->dev, "page request queue enabled\n");
> +	}
> +
>   	rc = riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_IOMMU_MODE_MAX);
>   	if (rc)
>   		goto err_queue_disable;
> @@ -1534,6 +1701,8 @@ int riscv_iommu_init(struct riscv_iommu_device *iommu)
>   err_iodir_off:
>   	riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_IOMMU_MODE_OFF);
>   err_queue_disable:
> +	if (iommu->caps & RISCV_IOMMU_CAPABILITIES_ATS)
> +		riscv_iommu_queue_disable(&iommu->priq);
>   	riscv_iommu_queue_disable(&iommu->fltq);
>   	riscv_iommu_queue_disable(&iommu->cmdq);
>   	return rc;
> diff --git a/drivers/iommu/riscv/iommu.h b/drivers/iommu/riscv/iommu.h
> index 46df79dd5495..5c5ab24539f2 100644
> --- a/drivers/iommu/riscv/iommu.h
> +++ b/drivers/iommu/riscv/iommu.h
> @@ -14,6 +14,8 @@
>   #include <linux/iommu.h>
>   #include <linux/types.h>
>   #include <linux/iopoll.h>
> +#include <linux/pci.h>
> +#include <linux/pci-ats.h>
> 
>   #include "iommu-bits.h"
> 
> @@ -55,6 +57,7 @@ struct riscv_iommu_device {
>   	/* hardware queues */
>   	struct riscv_iommu_queue cmdq;
>   	struct riscv_iommu_queue fltq;
> +	struct riscv_iommu_queue priq;
> 
>   	/* device directory */
>   	unsigned int ddt_mode;
> --
> 2.53.0
> 
> 
> _______________________________________________
> linux-riscv mailing list
> linux-riscv at lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-riscv




More information about the linux-riscv mailing list