[PATCH v4 11/24] iommu: Add iommu_report_device_broken() to quarantine a broken device
Nicolin Chen
nicolinc at nvidia.com
Mon May 18 20:38:54 PDT 2026
When IOMMU hardware detects an error due to a faulty device (e.g. an ATS
invalidation timeout), IOMMU drivers may quarantine the device by disabling
specific hardware features or dropping translation capabilities.
However, the core-level state of the faulty device is then out of sync: the
device can still be attached to a translation domain, or even be moved to a
new domain, which might overwrite the driver-level quarantine.
Given that such an error can likely be triggered from an ISR, introduce an
asynchronous broken_work per group_device, and provide a helper function to
allow the driver to initiate a quarantine in the core. As the helper may run
in interrupt context and thus cannot take group->mutex, __dev_to_gdev_rcu()
is added to look up the group_device safely under rcu_read_lock().
Note: gdev and gdev->group teardown will be blocked by disable_work_sync(),
so it's completely safe for the worker thread to access the group pointer.
Signed-off-by: Nicolin Chen <nicolinc at nvidia.com>
---
include/linux/iommu.h | 11 +++-
drivers/iommu/iommu.c | 139 ++++++++++++++++++++++++++++++++++++++++++
2 files changed, 149 insertions(+), 1 deletion(-)
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 6c124e9e9af8b..c088c8e8c1e2b 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -631,7 +631,10 @@ __iommu_copy_struct_to_user(const struct iommu_user_data *dst_data,
* group and attached to the groups domain
* @reset_device_done: Notify the driver that a device has reset successfully.
* Note that the core invokes the callback function while
- * holding the group->mutex
+ * holding the group->mutex. Before returning, the driver
+ * must drain or filter pre-reset fault reports so that
+ * subsequent calls to iommu_report_device_broken() will
+ * reflect only post-reset faults.
* @device_group: find iommu group for a particular device
* @get_resv_regions: Request list of reserved regions for a device
* @of_xlate: add OF master IDs to iommu grouping
@@ -896,6 +899,8 @@ static inline struct iommu_device *__iommu_get_iommu_dev(struct device *dev)
#define iommu_get_iommu_dev(dev, type, member) \
container_of(__iommu_get_iommu_dev(dev), type, member)
+void iommu_report_device_broken(struct device *dev);
+
static inline void iommu_iotlb_gather_init(struct iommu_iotlb_gather *gather)
{
*gather = (struct iommu_iotlb_gather) {
@@ -1211,6 +1216,10 @@ struct iommu_iotlb_gather {};
struct iommu_dirty_bitmap {};
struct iommu_dirty_ops {};
+static inline void iommu_report_device_broken(struct device *dev)
+{
+} /* no-op stub: nothing is reported or quarantined */
+
static inline bool device_iommu_capable(struct device *dev, enum iommu_cap cap)
{
return false;
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index b150d22d8015f..6e5a7e38c5e67 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -82,6 +82,7 @@ enum gdev_blocked {
BLOCKED_NO = 0, /* Not blocked */
BLOCKED_RESETTING, /* PCI reset in flight */
BLOCKED_RESET_FAILED, /* PCI reset failed */
+ BLOCKED_BROKEN, /* Driver flagged a fault */
};
struct group_device {
@@ -95,6 +96,9 @@ struct group_device {
*/
enum gdev_blocked blocked;
unsigned int reset_depth;
+ struct work_struct broken_work;
+ /* Transient state before broken_worker thread starts */
+ bool broken_pending;
};
/* Iterate over each struct group_device in a struct iommu_group */
@@ -117,6 +121,25 @@ static struct group_device *__dev_to_gdev(struct device *dev)
return NULL;
}
+/* Find @dev's group_device, or NULL. Caller must be inside rcu_read_lock(). */
+static struct group_device *__dev_to_gdev_rcu(struct device *dev)
+{
+ struct iommu_group *group;
+ struct group_device *gdev;
+
+ lockdep_assert(rcu_read_lock_held());
+
+ group = rcu_dereference(dev_iommu_group_rcu(dev));
+ if (!group)
+ return NULL; /* @dev has no iommu_group */
+
+ for_each_group_device(group, gdev) {
+ if (gdev->dev == dev)
+ return gdev;
+ }
+ return NULL; /* @dev is not on the group's device list */
+}
+
struct iommu_group_attribute {
struct attribute attr;
ssize_t (*show)(struct iommu_group *group, char *buf);
@@ -180,6 +203,7 @@ static ssize_t iommu_group_store_type(struct iommu_group *group,
static struct group_device *iommu_group_alloc_device(struct iommu_group *group,
struct device *dev);
static void __iommu_group_free_device(struct group_device *grp_dev);
+static void iommu_group_broken_worker(struct work_struct *work);
static void iommu_domain_init(struct iommu_domain *domain, unsigned int type,
const struct iommu_ops *ops);
@@ -762,6 +786,12 @@ static void __iommu_group_free_device(struct group_device *grp_dev)
sysfs_remove_link(group->devices_kobj, grp_dev->name);
sysfs_remove_link(&dev->kobj, "iommu_group");
+ /*
+ * Disable (not just cancel) broken_work to prevent UAF; otherwise a
+ * concurrent schedule_work() from iommu_report_device_broken() would
+ * queue onto an about-to-be-freed gdev.
+ */
+ disable_work_sync(&grp_dev->broken_work);
trace_remove_device_from_group(group->id, dev);
kfree(grp_dev->name);
@@ -1308,6 +1338,7 @@ static struct group_device *iommu_group_alloc_device(struct iommu_group *group,
device->group = group;
/* Keep dev alive for any in-flight RCU reader of grp_dev->dev. */
get_device(dev);
+ INIT_WORK(&device->broken_work, iommu_group_broken_worker);
ret = sysfs_create_link(&dev->kobj, &group->kobj, "iommu_group");
if (ret)
@@ -4301,6 +4332,14 @@ void pci_dev_reset_iommu_done(struct pci_dev *pdev, int reset_result)
*/
if (ops->reset_device_done)
ops->reset_device_done(&pdev->dev);
+ /*
+ * Clear broken_pending after the reset_device_done callback to
+ * neutralize any stale pre-reset report. Until this moment, a
+ * driver may call iommu_report_device_broken() on a pre-reset
+ * fault. Legitimate post-reset reports can only fire after the
+ * re-attach below, so this clear cannot hide them.
+ */
+ WRITE_ONCE(gdev->broken_pending, false);
/*
* Re-attach RID domain back to group->domain
@@ -4346,6 +4385,106 @@ void pci_dev_reset_iommu_done(struct pci_dev *pdev, int reset_result)
}
EXPORT_SYMBOL_GPL(pci_dev_reset_iommu_done);
+static void iommu_group_broken_worker(struct work_struct *work)
+{
+ struct group_device *gdev =
+ container_of(work, struct group_device, broken_work);
+ struct iommu_group *group = gdev->group;
+ struct device *dev = gdev->dev;
+
+ guard(mutex)(&group->mutex); /* scoped lock: dropped on every return */
+
+ /*
+ * iommu_deinit_device() frees dev->iommu under group->mutex. Bail
+ * out if the device has already been removed from IOMMU handling.
+ */
+ if (!dev_has_iommu(dev))
+ return;
+
+ /*
+ * A successful reset between schedule_work() and now would have cleared
+ * broken_pending. Skip as the device has already been recovered.
+ */
+ if (!READ_ONCE(gdev->broken_pending))
+ return;
+
+ /*
+ * Quarantine the device completely. For a PCI device, it will be lifted
+ * upon a pci_dev_reset_iommu_done(pdev, reset_result=0) call indicating
+ * a device recovery.
+ *
+ * For a non-PCI device, currently it has no recovery framework tied to
+ * the IOMMU subsystem. Quarantine it indefinitely until a recovery path
+ * is introduced.
+ */
+ if (WARN_ON(__iommu_group_block_device(gdev, BLOCKED_BROKEN)))
+ return; /* broken_pending is left set when blocking fails */
+
+ dev_warn(dev, "IOMMU has quarantined the device\n");
+ WRITE_ONCE(gdev->broken_pending, false); /* report has been handled */
+}
+
+/**
+ * iommu_report_device_broken() - Report a broken device to quarantine it
+ * @dev: Device that has encountered an unrecoverable IOMMU-related error
+ *
+ * When an IOMMU driver detects a critical error caused by a device (e.g. an ATC
+ * invalidation timeout), this function should be used to quarantine the device
+ * at the IOMMU core level.
+ *
+ * The quarantine moves the device's RID and PASIDs to group->blocking_domain to
+ * prevent any further DMA/ATS activity that can potentially corrupt the system
+ * memory due to stale device cache entries.
+ *
+ * This function must not be called from a reset_device_done callback: setting
+ * broken_pending there would race the clear in pci_dev_reset_iommu_done() that
+ * follows. Otherwise, this function is safe to call from any context (including
+ * interrupt handlers), as the actual quarantine is done in an asynchronous work
+ * thread. The caller should have already taken driver-level measures (e.g., ATS
+ * disabled in HW) to contain the fault promptly before calling this function.
+ *
+ * An asynchronous reset can occur while the driver is handling an IOMMU-related
+ * error. The driver is responsible for calling this only for post-reset faults.
+ * A queued or delayed pre-reset fault must be drained or filtered by the driver
+ * before delivery. Otherwise, a stale report would falsely quarantine a freshly
+ * recovered device.
+ *
+ * For PCI devices, the quarantine will be lifted by a successful device reset
+ * via pci_dev_reset_iommu_done(). For non-PCI devices, the quarantine remains
+ * in effect indefinitely until a recovery mechanism is introduced.
+ *
+ * If the device is concurrently being removed or has already been removed from
+ * the IOMMU subsystem, this function will silently return without any action.
+ */
+void iommu_report_device_broken(struct device *dev)
+{
+ struct group_device *gdev;
+
+ /*
+ * group->mutex cannot be taken here (the caller may be in an ISR), so
+ * iommu_group_broken_worker() validates dev_has_iommu() later. The
+ * iommu_group is freed via kfree_rcu() in iommu_group_release() and
+ * group->devices is an RCU list, so rcu_read_lock() covers the lookup.
+ *
+ * Note the device might have been concurrently removed from the group
+ * (list_del_rcu) before iommu_deinit_device() cleared the dev->iommu.
+ */
+ rcu_read_lock();
+ gdev = __dev_to_gdev_rcu(dev);
+ if (gdev) {
+ /*
+ * Narrow race: this may re-set broken_pending right after a concurrent
+ * worker cleared it. That is benign: the worker queued below will
+ * read it true and clear it again (skipping if already blocked);
+ * pci_dev_reset_iommu_done() also clears it on a successful reset.
+ */
+ WRITE_ONCE(gdev->broken_pending, true);
+ schedule_work(&gdev->broken_work);
+ }
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(iommu_report_device_broken);
+
#if IS_ENABLED(CONFIG_IRQ_MSI_IOMMU)
/**
* iommu_dma_prepare_msi() - Map the MSI page in the IOMMU domain
--
2.43.0
More information about the linux-arm-kernel
mailing list