[PATCH v3 07/11] iommu: Add iommu_report_device_broken() to quarantine a broken device

Nicolin Chen nicolinc at nvidia.com
Thu Apr 16 16:28:36 PDT 2026


When IOMMU hardware detects an error caused by a faulty device (e.g. an ATS
invalidation timeout), the IOMMU driver may quarantine the device by disabling
specific hardware features or dropping translation capabilities.

However, the core-level state of the faulty device is then out of sync: the
device can still be attached to a translation domain, or even be moved to a
new domain, which might overwrite the driver-level quarantine.

Given that such an error is likely to be reported from an ISR, introduce an
asynchronous broken_work per group_device, and provide a helper function that
allows a driver to initiate a quarantine in the core.

Note that the worker function must not use dev->iommu_group, which is NULLed
by iommu_deinit_device() while holding group->mutex. cancel_work_sync() only
gets called afterwards, outside the mutex, so a worker running in between
would dereference a NULL pointer. Add a stable group backpointer to struct
group_device instead.

Signed-off-by: Nicolin Chen <nicolinc at nvidia.com>
---
 include/linux/iommu.h |   6 +++
 drivers/iommu/iommu.c | 100 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 106 insertions(+)

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 3c5c5fa5cdc6a..97d0e5b90c58f 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -893,6 +893,8 @@ static inline struct iommu_device *__iommu_get_iommu_dev(struct device *dev)
 #define iommu_get_iommu_dev(dev, type, member) \
 	container_of(__iommu_get_iommu_dev(dev), type, member)
 
+void iommu_report_device_broken(struct device *dev);
+
 static inline void iommu_iotlb_gather_init(struct iommu_iotlb_gather *gather)
 {
 	*gather = (struct iommu_iotlb_gather) {
@@ -1207,6 +1209,10 @@ struct iommu_iotlb_gather {};
 struct iommu_dirty_bitmap {};
 struct iommu_dirty_ops {};
 
+static inline void iommu_report_device_broken(struct device *dev)
+{
+}
+
 static inline bool device_iommu_capable(struct device *dev, enum iommu_cap cap)
 {
 	return false;
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 810e7b94a1ae2..bb00918e1b70d 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -73,6 +73,7 @@ struct iommu_group {
 };
 
 struct group_device {
+	struct iommu_group *group;
 	struct list_head list;
 	struct device *dev;
 	char *name;
@@ -81,10 +82,12 @@ struct group_device {
 	 * retained. This can happen when:
 	 *  - Device is undergoing a reset
 	 *  - Device failed the last reset
+	 *  - Device is broken and quarantined
 	 */
 	bool blocked;
 	unsigned int reset_depth;
 	struct rcu_head rcu;
+	struct work_struct broken_work;
 };
 
 /* Iterate over each struct group_device in a struct iommu_group */
@@ -170,6 +173,7 @@ static struct group_device *iommu_group_alloc_device(struct iommu_group *group,
 						     struct device *dev);
 static void __iommu_group_free_device(struct iommu_group *group,
 				      struct group_device *grp_dev);
+static void iommu_group_broken_worker(struct work_struct *work);
 static void iommu_domain_init(struct iommu_domain *domain, unsigned int type,
 			      const struct iommu_ops *ops);
 
@@ -752,6 +756,8 @@ static void __iommu_group_free_device(struct iommu_group *group,
 	sysfs_remove_link(group->devices_kobj, grp_dev->name);
 	sysfs_remove_link(&dev->kobj, "iommu_group");
 
+	/* Must wait for broken_work to prevent UAF */
+	cancel_work_sync(&grp_dev->broken_work);
 	trace_remove_device_from_group(group->id, dev);
 
 	kfree(grp_dev->name);
@@ -1284,6 +1290,8 @@ static struct group_device *iommu_group_alloc_device(struct iommu_group *group,
 		return ERR_PTR(-ENOMEM);
 
 	device->dev = dev;
+	device->group = group;
+	INIT_WORK(&device->broken_work, iommu_group_broken_worker);
 
 	ret = sysfs_create_link(&dev->kobj, &group->kobj, "iommu_group");
 	if (ret)
@@ -4178,6 +4186,98 @@ void pci_dev_reset_iommu_done(struct pci_dev *pdev, bool reset_succeeds)
 }
 EXPORT_SYMBOL_GPL(pci_dev_reset_iommu_done);
 
+static void iommu_group_broken_worker(struct work_struct *work)
+{
+	struct group_device *gdev =
+		container_of(work, struct group_device, broken_work);
+	struct iommu_group *group = gdev->group;
+	struct device *dev = gdev->dev;
+
+	mutex_lock(&group->mutex);
+
+	/*
+	 * Use gdev->group, as iommu_deinit_device() NULLs dev->iommu_group and
+	 * frees dev->iommu under group->mutex. Bail out if that already ran.
+	 */
+	if (!dev_has_iommu(dev))
+		goto out_unlock;
+
+	if (gdev->blocked) {
+		dev_dbg(dev, "IOMMU has already quarantined the device\n");
+		goto out_unlock;
+	}
+
+	/*
+	 * Quarantine the device completely. For a PCI device, it will be lifted
+	 * by a pci_dev_reset_iommu_done(pdev, reset_succeeds=true) call, which
+	 * indicates a device recovery.
+	 *
+	 * For a non-PCI device, currently it has no recovery framework tied to
+	 * the IOMMU subsystem. Quarantine it indefinitely until a recovery path
+	 * is introduced.
+	 */
+	if (!WARN_ON(__iommu_group_block_device(group, gdev)))
+		dev_warn(dev, "IOMMU has quarantined the device\n");
+
+out_unlock:
+	mutex_unlock(&group->mutex);
+	iommu_group_put(group);
+}
+
+/**
+ * iommu_report_device_broken() - Report a broken device to quarantine it
+ * @dev: Device that has encountered an unrecoverable IOMMU-related error
+ *
+ * When an IOMMU driver detects a critical error caused by a device (e.g. an ATC
+ * invalidation timeout), this function should be used to quarantine the device
+ * at the IOMMU core level.
+ *
+ * The quarantine moves the device's RID and PASIDs to group->blocking_domain to
+ * prevent any further DMA/ATS activity that can potentially corrupt the system
+ * memory due to stale device cache entries.
+ *
+ * This function is safe to call from any context, including interrupt handlers,
+ * as it schedules the actual quarantine work asynchronously. The caller should
+ * have already taken driver-level measures (e.g., disabling ATS in hardware) to
+ * contain the fault immediately, before calling this function.
+ *
+ * For PCI devices, the quarantine will be lifted by a successful device reset
+ * via pci_dev_reset_iommu_done(). For non-PCI devices, the quarantine remains
+ * in effect indefinitely until a recovery mechanism is introduced.
+ *
+ * If the device is concurrently being removed or has already been removed from
+ * the IOMMU subsystem, this function will silently return without any action.
+ */
+void iommu_report_device_broken(struct device *dev)
+{
+	struct iommu_group *group = iommu_group_get(dev);
+	struct group_device *gdev;
+	bool scheduled = false;
+
+	if (!group)
+		return;
+	if (!dev_has_iommu(dev))
+		goto out;
+
+	rcu_read_lock();
+	/*
+	 * The lookup can miss: the device may be concurrently removed from the
+	 * group (list_del_rcu) before iommu_deinit_device() clears dev->iommu.
+	 */
+	list_for_each_entry_rcu(gdev, &group->devices, list) {
+		if (gdev->dev != dev)
+			continue;
+		/* If queued, iommu_group_broken_worker() puts the group ref */
+		scheduled = schedule_work(&gdev->broken_work);
+		break;
+	}
+	rcu_read_unlock();
+out:
+	if (!scheduled)
+		iommu_group_put(group);
+}
+EXPORT_SYMBOL_GPL(iommu_report_device_broken);
+
 #if IS_ENABLED(CONFIG_IRQ_MSI_IOMMU)
 /**
  * iommu_dma_prepare_msi() - Map the MSI page in the IOMMU domain
-- 
2.43.0




More information about the linux-arm-kernel mailing list