[PATCH v4 11/24] iommu: Add iommu_report_device_broken() to quarantine a broken device

Mon May 18 20:38:54 PDT 2026

When IOMMU hardware detects an error caused by a faulty device (e.g. an ATS
invalidation timeout), the IOMMU driver may quarantine the device by
disabling specific hardware features or dropping translation capabilities.

However, the core-level state of the faulty device is then out of sync: the
device can still be attached to a translation domain, or even be moved to a
new domain, which might overwrite the driver-level quarantine.

Given that such an error can likely be triggered from an ISR, introduce an
asynchronous broken_work per group_device, and provide a helper function to
allow the driver to initiate a quarantine in the core. __dev_to_gdev_rcu()
is required here to safely iterate the group_device list.

Note: gdev and gdev->group teardown will be blocked by disable_work_sync(),
so it's completely safe for the worker thread to access the group pointer.

Signed-off-by: Nicolin Chen <nicolinc at nvidia.com>
---
 include/linux/iommu.h |  11 +++-
 drivers/iommu/iommu.c | 139 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 149 insertions(+), 1 deletion(-)

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 6c124e9e9af8b..c088c8e8c1e2b 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -631,7 +631,10 @@ __iommu_copy_struct_to_user(const struct iommu_user_data *dst_data,
  *                  group and attached to the groups domain
  * @reset_device_done: Notify the driver that a device has reset successfully.
  *                     Note that the core invokes the callback function while
- *                     holding the group->mutex
+ *                     holding the group->mutex. Before returning, the driver
+ *                     must drain or filter pre-reset fault reports so that
+ *                     subsequent calls to iommu_report_device_broken() will
+ *                     reflect only post-reset faults.
  * @device_group: find iommu group for a particular device
  * @get_resv_regions: Request list of reserved regions for a device
  * @of_xlate: add OF master IDs to iommu grouping
@@ -896,6 +899,8 @@ static inline struct iommu_device *__iommu_get_iommu_dev(struct device *dev)
 #define iommu_get_iommu_dev(dev, type, member) \
 	container_of(__iommu_get_iommu_dev(dev), type, member)
 
+void iommu_report_device_broken(struct device *dev);
+
 static inline void iommu_iotlb_gather_init(struct iommu_iotlb_gather *gather)
 {
 	*gather = (struct iommu_iotlb_gather) {
@@ -1211,6 +1216,10 @@ struct iommu_iotlb_gather {};
 struct iommu_dirty_bitmap {};
 struct iommu_dirty_ops {};
 
+static inline void iommu_report_device_broken(struct device *dev)
+{
+}
+
 static inline bool device_iommu_capable(struct device *dev, enum iommu_cap cap)
 {
 	return false;
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index b150d22d8015f..6e5a7e38c5e67 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -82,6 +82,7 @@ enum gdev_blocked {
 	BLOCKED_NO = 0, /* Not blocked */
 	BLOCKED_RESETTING, /* PCI reset in flight */
 	BLOCKED_RESET_FAILED, /* PCI reset failed */
+	BLOCKED_BROKEN, /* Driver flagged a fault */
 };
 
 struct group_device {
@@ -95,6 +96,9 @@ struct group_device {
 	 */
 	enum gdev_blocked blocked;
 	unsigned int reset_depth;
+	struct work_struct broken_work;
+	/* Report pending: set by the reporter, cleared by worker or reset */
+	bool broken_pending;
 };
 
 /* Iterate over each struct group_device in a struct iommu_group */
@@ -117,6 +121,25 @@ static struct group_device *__dev_to_gdev(struct device *dev)
 	return NULL;
 }
 
+/* Look up @dev's group_device or NULL; caller must hold rcu_read_lock(). */
+static struct group_device *__dev_to_gdev_rcu(struct device *dev)
+{
+	struct iommu_group *group;
+	struct group_device *gdev;
+
+	lockdep_assert(rcu_read_lock_held());
+
+	group = rcu_dereference(dev_iommu_group_rcu(dev));
+	if (!group)
+		return NULL;
+
+	for_each_group_device(group, gdev) {
+		if (gdev->dev == dev)
+			return gdev;
+	}
+	return NULL;
+}
+
 struct iommu_group_attribute {
 	struct attribute attr;
 	ssize_t (*show)(struct iommu_group *group, char *buf);
@@ -180,6 +203,7 @@ static ssize_t iommu_group_store_type(struct iommu_group *group,
 static struct group_device *iommu_group_alloc_device(struct iommu_group *group,
 						     struct device *dev);
 static void __iommu_group_free_device(struct group_device *grp_dev);
+static void iommu_group_broken_worker(struct work_struct *work);
 static void iommu_domain_init(struct iommu_domain *domain, unsigned int type,
 			      const struct iommu_ops *ops);
 
@@ -762,6 +786,12 @@ static void __iommu_group_free_device(struct group_device *grp_dev)
 	sysfs_remove_link(group->devices_kobj, grp_dev->name);
 	sysfs_remove_link(&dev->kobj, "iommu_group");
 
+	/*
+	 * Disable (not just cancel) broken_work to prevent UAF; otherwise a
+	 * concurrent schedule_work() from iommu_report_device_broken() would
+	 * queue onto an about-to-be-freed gdev.
+	 */
+	disable_work_sync(&grp_dev->broken_work);
 	trace_remove_device_from_group(group->id, dev);
 
 	kfree(grp_dev->name);
@@ -1308,6 +1338,7 @@ static struct group_device *iommu_group_alloc_device(struct iommu_group *group,
 	device->group = group;
 	/* Keep dev alive for any in-flight RCU reader of grp_dev->dev. */
 	get_device(dev);
+	INIT_WORK(&device->broken_work, iommu_group_broken_worker);
 
 	ret = sysfs_create_link(&dev->kobj, &group->kobj, "iommu_group");
 	if (ret)
@@ -4301,6 +4332,14 @@ void pci_dev_reset_iommu_done(struct pci_dev *pdev, int reset_result)
 	 */
 	if (ops->reset_device_done)
 		ops->reset_device_done(&pdev->dev);
+	/*
+	 * Clear broken_pending after the reset_device_done callback to
+	 * neutralize any stale pre-reset report. Until this moment, a
+	 * driver may call iommu_report_device_broken() on a pre-reset
+	 * fault. Legitimate post-reset reports can only fire after the
+	 * re-attach below, so this clear cannot hide them.
+	 */
+	WRITE_ONCE(gdev->broken_pending, false);
 
 	/*
 	 * Re-attach RID domain back to group->domain
@@ -4346,6 +4385,106 @@ void pci_dev_reset_iommu_done(struct pci_dev *pdev, int reset_result)
 }
 EXPORT_SYMBOL_GPL(pci_dev_reset_iommu_done);
 
+static void iommu_group_broken_worker(struct work_struct *work)
+{
+	struct group_device *gdev =
+		container_of(work, struct group_device, broken_work);
+	struct iommu_group *group = gdev->group;
+	struct device *dev = gdev->dev;
+
+	guard(mutex)(&group->mutex);
+
+	/*
+	 * iommu_deinit_device() frees dev->iommu under group->mutex, which is
+	 * held here. Bail out if the device has left IOMMU handling already.
+	 */
+	if (!dev_has_iommu(dev))
+		return;
+
+	/*
+	 * pci_dev_reset_iommu_done() clears broken_pending on a successful
+	 * reset. If that ran after schedule_work(), this report is stale.
+	 */
+	if (!READ_ONCE(gdev->broken_pending))
+		return;
+
+	/*
+	 * Quarantine the device completely. For a PCI device, the quarantine is
+	 * lifted by a pci_dev_reset_iommu_done(pdev, reset_result=0) call,
+	 * which indicates a device recovery.
+	 *
+	 * A non-PCI device currently has no recovery framework tied to the
+	 * IOMMU subsystem, so it stays quarantined indefinitely until a
+	 * recovery path is introduced.
+	 */
+	if (WARN_ON(__iommu_group_block_device(gdev, BLOCKED_BROKEN)))
+		return;
+
+	dev_warn(dev, "IOMMU has quarantined the device\n");
+	WRITE_ONCE(gdev->broken_pending, false);
+}
+
+/**
+ * iommu_report_device_broken() - Report a broken device to quarantine it
+ * @dev: Device that has encountered an unrecoverable IOMMU-related error
+ *
+ * When an IOMMU driver detects a critical error caused by a device (e.g. an ATC
+ * invalidation timeout), this function should be used to quarantine the device
+ * at the IOMMU core level.
+ *
+ * The quarantine moves the device's RID and PASIDs to group->blocking_domain to
+ * prevent any further DMA/ATS activity that can potentially corrupt the system
+ * memory due to stale device cache entries.
+ *
+ * This function must not be called from a reset_device_done callback, because
+ * pci_dev_reset_iommu_done() clears broken_pending right after that callback
+ * and would discard the report. Otherwise, it is safe to call from any context
+ * (including interrupt handlers), as the actual quarantine is done in an
+ * asynchronous work item. The caller should have already taken driver-level
+ * measures (e.g. ATS disabled in HW) to promptly contain the fault beforehand.
+ *
+ * An asynchronous reset can occur while the driver is handling an IOMMU-related
+ * error. The driver is responsible for calling this only for post-reset faults.
+ * A queued or delayed pre-reset fault must be drained or filtered by the driver
+ * before delivery. Otherwise, a stale report would falsely quarantine a freshly
+ * recovered device.
+ *
+ * For PCI devices, the quarantine will be lifted by a successful device reset
+ * via pci_dev_reset_iommu_done(). For non-PCI devices, the quarantine remains
+ * in effect indefinitely until a recovery mechanism is introduced.
+ *
+ * If the device is concurrently being removed or has already been removed from
+ * the IOMMU subsystem, this function will silently return without any action.
+ */
+void iommu_report_device_broken(struct device *dev)
+{
+	struct group_device *gdev;
+
+	/*
+	 * group->mutex cannot be taken here (callers may be in an ISR), so rely
+	 * on iommu_group_broken_worker() to validate dev_has_iommu(). The group
+	 * is freed via kfree_rcu() in iommu_group_release() and group->devices
+	 * is an RCU list, so the whole lookup runs under rcu_read_lock().
+	 *
+	 * Note the device might have been concurrently removed from the group
+	 * (list_del_rcu) before iommu_deinit_device() cleared the dev->iommu.
+	 */
+	rcu_read_lock();
+	gdev = __dev_to_gdev_rcu(dev);
+	if (gdev) {
+		/*
+		 * There is a narrow window where broken_pending is set again right
+		 * after a concurrent worker cleared it. This is benign: the work
+		 * queued below reads it as true and clears it again (skipping if
+		 * already blocked); a successful reset clears it as well.
+		 */
+		WRITE_ONCE(gdev->broken_pending, true);
+		schedule_work(&gdev->broken_work);
+	}
+	rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(iommu_report_device_broken);
+
 #if IS_ENABLED(CONFIG_IRQ_MSI_IOMMU)
 /**
  * iommu_dma_prepare_msi() - Map the MSI page in the IOMMU domain
-- 
2.43.0