[PATCH v1 2/2] iommu/arm-smmu-v3: Recover ATC invalidate timeouts
Robin Murphy
robin.murphy at arm.com
Thu Mar 5 07:24:30 PST 2026
On 2026-03-05 5:21 am, Nicolin Chen wrote:
> Currently, when GERROR_CMDQ_ERR occurs, the arm_smmu_cmdq_skip_err() won't
> do anything for the CMDQ_ERR_CERROR_ATC_INV_IDX.
>
> When a device wasn't responsive to an ATC invalidation request, this often
> results in constant CMDQ errors:
> unexpected global error reported (0x00000001), this could be serious
> CMDQ error (cons 0x0302bb84): ATC invalidate timeout
> unexpected global error reported (0x00000001), this could be serious
> CMDQ error (cons 0x0302bb88): ATC invalidate timeout
> unexpected global error reported (0x00000001), this could be serious
> CMDQ error (cons 0x0302bb8c): ATC invalidate timeout
> ...
>
> An ATC invalidation timeout indicates that the device failed to respond to
> a protocol-critical coherency request, which means that device's internal
> ATS state is desynchronized from the SMMU.
>
> Furthermore, ignoring the timeout leaves the system in an unsafe state, as
> the device cache may retain stale ATC entries for memory pages that the OS
> has already reclaimed and reassigned. This might lead to data corruption.
>
> The only safe recovery action is to issue a PCI reset, which guarantees to
> flush all internal device caches and recover the device.
>
> Read the ATC_INV command that led to the timeouts, and schedule a recovery
> worker to reset the device corresponding to the Stream ID. If reset fails,
> keep the device in the resetting/blocking domain to avoid data corruption.
>
> Though it'd be ideal to block it immediately in the ISR, it cannot be done
> because an STE update would require another CFIG_STE command that couldn't
> finish in the context of an ISR handling a CMDQ error.
Why not? As soon as we've acked GERRORN.CMDQ_ERR, command consumption
will resume and we're free to do whatever we fancy. Admittedly this
probably represents more work than we *want* to be doing in the SMMU's
IRQ handler (arguably even in a thread, since all the PCI housekeeping
isn't really the SMMU driver's own problem), but I would say the
workqueue is a definite design choice, not a functional requirement.
> Signed-off-by: Nicolin Chen <nicolinc at nvidia.com>
> ---
> drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 5 +
> drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 131 +++++++++++++++++++-
> 2 files changed, 132 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
> index 3c6d65d36164f..8789cf8294504 100644
> --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
> +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
> @@ -803,6 +803,11 @@ struct arm_smmu_device {
>
> struct rb_root streams;
> struct mutex streams_mutex;
> +
> + struct {
> + struct list_head list;
> + spinlock_t lock; /* Lock the list */
> + } atc_recovery;
> };
>
> struct arm_smmu_stream {
> diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> index 4d00d796f0783..de182c27c77c4 100644
> --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> @@ -106,6 +106,8 @@ static const char * const event_class_str[] = {
> [3] = "Reserved",
> };
>
> +static struct arm_smmu_master *
> +arm_smmu_find_master(struct arm_smmu_device *smmu, u32 sid);
> static int arm_smmu_alloc_cd_tables(struct arm_smmu_master *master);
>
> static void parse_driver_options(struct arm_smmu_device *smmu)
> @@ -174,6 +176,13 @@ static void queue_inc_cons(struct arm_smmu_ll_queue *q)
> q->cons = Q_OVF(q->cons) | Q_WRP(q, cons) | Q_IDX(q, cons);
> }
>
> +static u32 queue_prev_cons(struct arm_smmu_ll_queue *q, u32 cons)
> +{
> + u32 idx_wrp = (Q_WRP(q, cons) | Q_IDX(q, cons)) - 1;
> +
> + return Q_OVF(cons) | Q_WRP(q, idx_wrp) | Q_IDX(q, idx_wrp);
> +}
> +
> static void queue_sync_cons_ovf(struct arm_smmu_queue *q)
> {
> struct arm_smmu_ll_queue *llq = &q->llq;
> @@ -410,6 +419,97 @@ static void arm_smmu_cmdq_build_sync_cmd(u64 *cmd, struct arm_smmu_device *smmu,
> u64p_replace_bits(cmd, CMDQ_SYNC_0_CS_NONE, CMDQ_SYNC_0_CS);
> }
>
> +/* ATC recovery upon ATC invalidation timeout */
> +struct arm_smmu_atc_recovery_param {
> + struct arm_smmu_device *smmu;
> + struct pci_dev *pdev;
> + u32 sid;
> +
> + struct work_struct work;
> + struct list_head node;
> +};
> +
> +static void arm_smmu_atc_recovery_worker(struct work_struct *work)
> +{
> + struct arm_smmu_atc_recovery_param *param =
> + container_of(work, struct arm_smmu_atc_recovery_param, work);
> + struct pci_dev *pdev;
> +
> + scoped_guard(mutex, ¶m->smmu->streams_mutex) {
> + struct arm_smmu_master *master;
> +
> + master = arm_smmu_find_master(param->smmu, param->sid);
> + if (!master || WARN_ON(!dev_is_pci(master->dev)))
> + goto free_param;
> + pdev = to_pci_dev(master->dev);
> + pci_dev_get(pdev);
> + }
> +
> + scoped_guard(spinlock_irqsave, ¶m->smmu->atc_recovery.lock) {
> + struct arm_smmu_atc_recovery_param *e;
> +
> + list_for_each_entry(e, ¶m->smmu->atc_recovery.list, node) {
> + /* Device is already being recovered */
> + if (e->pdev == pdev)
> + goto put_pdev;
> + }
> + param->pdev = pdev;
> + list_add(¶m->node, ¶m->smmu->atc_recovery.list);
> + }
> +
> + /*
> + * Stop DMA (PCI) and block ATS (IOMMU) immediately, to prevent memory
> + * corruption. This must take pci_dev_lock to prevent any racy unplug.
> + *
> + * If pci_dev_reset_iommu_prepare() fails, pci_reset_function will call
> + * it again internally.
> + */
> + pci_dev_lock(pdev);
> + pci_clear_master(pdev);
> + if (pci_dev_reset_iommu_prepare(pdev))
> + pci_err(pdev, "failed to block ATS!\n");
> + pci_dev_unlock(pdev);
> +
> + /*
> + * ATC timeout indicates the device has stopped responding to coherence
> + * protocol requests. The only safe recovery is a reset to flush stale
> + * cached translations. Note that pci_reset_function() internally calls
> + * pci_dev_reset_iommu_prepare/done() as well and ensures to block ATS
> + * if PCI-level reset fails.
> + */
> + if (!pci_reset_function(pdev)) {
> + /*
> + * If reset succeeds, set BME back. Otherwise, fence the system
> + * from a faulty device, in which case user will have to replug
> + * the device to invoke pci_set_master().
> + */
> + pci_dev_lock(pdev);
> + pci_set_master(pdev);
> + pci_dev_unlock(pdev);
> + }
> + scoped_guard(spinlock_irqsave, ¶m->smmu->atc_recovery.lock)
> + list_del(¶m->node);
> +put_pdev:
> + pci_dev_put(pdev);
> +free_param:
> + kfree(param);
> +}
The only thing SMMU-specific about this seems to be the use of
arm_smmu_find_master() to resolve the device, which could just as well
be done upon submission anyway - why isn't this a generic IOMMU/IOPF
mechanism?
> +
> +static int arm_smmu_sched_atc_recovery(struct arm_smmu_device *smmu, u32 sid)
> +{
> + struct arm_smmu_atc_recovery_param *param;
> +
> + param = kzalloc_obj(*param, GFP_ATOMIC);
> + if (!param)
> + return -ENOMEM;
> + param->smmu = smmu;
> + param->sid = sid;
> +
> + INIT_WORK(¶m->work, arm_smmu_atc_recovery_worker);
> + queue_work(system_unbound_wq, ¶m->work);
Might it make more sense to have a single work item associated with the
list_head and use the latter as an actual queue, such that the work runs
until the list is empty, then here at submisison time we do the
list_add() and schedule_work()? That could perhaps even be a global
queue, since ATS timeouts can hardly be expected to be a
highly-contended high-perfoamnce concern.
Right now it seems the list is barely doing anything - a "deduplication"
mechanism that only works if multiple resets for the same device happen
to have their work scheduled concurrently seems pretty ineffective.
> + return 0;
> +}
> +
> void __arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu,
> struct arm_smmu_cmdq *cmdq)
> {
> @@ -441,11 +541,10 @@ void __arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu,
> case CMDQ_ERR_CERROR_ATC_INV_IDX:
> /*
> * ATC Invalidation Completion timeout. CONS is still pointing
> - * at the CMD_SYNC. Attempt to complete other pending commands
> - * by repeating the CMD_SYNC, though we might well end up back
> - * here since the ATC invalidation may still be pending.
> + * at the CMD_SYNC. Rewind it to read the ATC_INV command.
> */
> - return;
> + cons = queue_prev_cons(&q->llq, cons);
> + fallthrough;
> case CMDQ_ERR_CERROR_ILL_IDX:
> default:
> break;
> @@ -456,6 +555,27 @@ void __arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu,
> * not to touch any of the shadow cmdq state.
> */
> queue_read(cmd, Q_ENT(q, cons), q->ent_dwords);
> +
> + if (idx == CMDQ_ERR_CERROR_ATC_INV_IDX) {
> + /*
> + * Since commands can be issued in batch making it difficult to
> + * identify which CMDQ_OP_ATC_INV actually timed out, the driver
> + * must ensure only CMDQ_OP_ATC_INV commands for the same device
> + * can be batched.
But this *is* "the driver" - arm_smmu_atc_inv_domain() is literally
further down this same C file, and does not do what this comment is
saying it must do, so how are you expecting this to work correctly?
Thanks,
Robin.
> + */
> + WARN_ON(FIELD_GET(CMDQ_0_OP, cmd[0]) != CMDQ_OP_ATC_INV);
> +
> + /*
> + * If we failed to schedule a recovery worker, we would well end
> + * up back here since the ATC invalidation may still be pending.
> + * This gives us another chance to reschedule a recovery worker.
> + */
> + arm_smmu_sched_atc_recovery(smmu,
> + FIELD_GET(CMDQ_ATC_0_SID, cmd[0]));
> + return;
> + }
> +
> + /* idx == CMDQ_ERR_CERROR_ILL_IDX */
> dev_err(smmu->dev, "skipping command in error state:\n");
> for (i = 0; i < ARRAY_SIZE(cmd); ++i)
> dev_err(smmu->dev, "\t0x%016llx\n", (unsigned long long)cmd[i]);
> @@ -3942,6 +4062,9 @@ static int arm_smmu_init_structures(struct arm_smmu_device *smmu)
> {
> int ret;
>
> + INIT_LIST_HEAD(&smmu->atc_recovery.list);
> + spin_lock_init(&smmu->atc_recovery.lock);
> +
> mutex_init(&smmu->streams_mutex);
> smmu->streams = RB_ROOT;
>
More information about the linux-arm-kernel
mailing list