[PATCH v2 7/7] iommu/arm-smmu-v3: Block ATS upon an ATC invalidation timeout
Shuai Xue
xueshuai at linux.alibaba.com
Wed Mar 18 19:56:43 PDT 2026
On 3/18/26 3:15 AM, Nicolin Chen wrote:
> Currently, when GERROR_CMDQ_ERR occurs, the arm_smmu_cmdq_skip_err() won't
> do anything for the CMDQ_ERR_CERROR_ATC_INV_IDX.
>
> When a device wasn't responsive to an ATC invalidation request, this often
> results in constant CMDQ errors:
> unexpected global error reported (0x00000001), this could be serious
> CMDQ error (cons 0x0302bb84): ATC invalidate timeout
> unexpected global error reported (0x00000001), this could be serious
> CMDQ error (cons 0x0302bb88): ATC invalidate timeout
> unexpected global error reported (0x00000001), this could be serious
> CMDQ error (cons 0x0302bb8c): ATC invalidate timeout
> ...
>
> An ATC invalidation timeout indicates that the device failed to respond to
> a protocol-critical coherency request, which means that device's internal
> ATS state is desynchronized from the SMMU.
>
> Furthermore, ignoring the timeout leaves the system in an unsafe state, as
> the device cache may retain stale ATC entries for memory pages that the OS
> has already reclaimed and reassigned. This might lead to data corruption.
>
> Isolate the device that is confirmed to be unresponsive by a surgical STE
> update to unset its EATS bit so as to reject any further ATS transaction,
> which could corrupt the memory.
>
> Also, set the master->ats_broken flag that is revertible after the device
> completes a reset. This flag avoids further ATS requests and invalidations
> from happening.
>
> Finally, report this broken device to the IOMMU core to isolate the device
> in the core level too.
>
> For batched ATC_INV commands, SMMU hardware only reports a timeout at the
> CMD_SYNC, which could follow the batch issued for multiple devices. So, it
> isn't straightforward to identify which command in a batch resulted in the
> timeout. Fortunately, the invs array has a sorted list of ATC entries. So,
> the issued batch must be sorted as well. This makes it possible to bisect
> the batch to retry the command per Stream ID and identify the master.
Nit: The implementation is a linear per-SID retry, not a binary
search / bisection. Suggest rewording to:
"retry the ATC_INV command for each unique Stream ID in the batch
to identify the unresponsive master"
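On a related note, the dedup check in arm_smmu_cmdq_batch_retry() only
compares against the previously retried command, which is sufficient
only because the batch is built from the sorted invs array (so ATC_INV
commands for the same SID end up adjacent). Perhaps worth a comment
there, e.g. something like:

	/*
	 * The batch is built from the sorted invs array, so ATC_INV
	 * commands with the same SID are adjacent; comparing against
	 * the last retried command is enough to dedup per Stream ID.
	 */
	sid = FIELD_GET(CMDQ_ATC_0_SID, cmds->cmds[i]);
	if (atc[0] && sid == FIELD_GET(CMDQ_ATC_0_SID, atc[0]))
		continue;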
>
> Signed-off-by: Nicolin Chen <nicolinc at nvidia.com>
> ---
> drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 92 ++++++++++++++++++++-
> 1 file changed, 89 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> index 366d812668011..418ee2f40d96d 100644
> --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> @@ -107,6 +107,8 @@ static const char * const event_class_str[] = {
> [3] = "Reserved",
> };
>
> +static struct arm_smmu_ste *
> +arm_smmu_get_step_for_sid(struct arm_smmu_device *smmu, u32 sid);
> static int arm_smmu_alloc_cd_tables(struct arm_smmu_master *master);
>
> static void parse_driver_options(struct arm_smmu_device *smmu)
> @@ -2495,10 +2497,43 @@ arm_smmu_atc_inv_to_cmd(int ssid, unsigned long iova, size_t size,
> cmd->atc.size = log2_span;
> }
>
> +static void arm_smmu_disable_eats_for_sid(struct arm_smmu_device *smmu,
> + struct arm_smmu_cmdq *cmdq, u32 sid)
> +{
> + struct arm_smmu_cmdq_ent ent = {
> + .opcode = CMDQ_OP_CFGI_STE,
> + .cfgi = {
> + .sid = sid,
> + .leaf = true,
> + },
> + };
> + struct arm_smmu_ste *step;
> + u64 cmd[CMDQ_ENT_DWORDS];
> +
> + step = arm_smmu_get_step_for_sid(smmu, sid);
> + WRITE_ONCE(step->data[1],
> + READ_ONCE(step->data[1]) & cpu_to_le64(~STRTAB_STE_1_EATS));
This non-atomic read-modify-write on step->data[1] can race with the
normal STE installation path (arm_smmu_write_entry → entry_set →
WRITE_ONCE).
The error path runs from:

  __arm_smmu_domain_inv_range()            (data path, no group->mutex)
    → arm_smmu_cmdq_batch_retry()
      → arm_smmu_master_disable_ats()
        → arm_smmu_disable_eats_for_sid()  ← NO locks on STE

The normal STE path runs from:

  iommu_attach_device()
    → mutex_lock(&group->mutex)
      → arm_smmu_attach_dev()
        → mutex_lock(&arm_smmu_asid_lock)
          → arm_smmu_install_ste_for_dev()
            → arm_smmu_write_entry()       ← holds both mutexes

Since the error path holds neither group->mutex nor arm_smmu_asid_lock,
the following race is possible:

  CPU A (error path):                 CPU B (attach path):

  READ data[1] = X
                                      WRITE data[1] = Y (new STE config)
  WRITE data[1] = X & ~EATS
  // Y is lost

This could clobber a concurrent STE update from the attach path.
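If this path can't take group->mutex (likely, since it runs on the
invalidation path), one option would be to turn the RMW into a cmpxchg
loop, so that a concurrent writer's store to this qword is never
silently overwritten. A rough sketch (untested):

	__le64 old, new;

	do {
		old = READ_ONCE(step->data[1]);
		new = old & cpu_to_le64(~STRTAB_STE_1_EATS);
	} while (cmpxchg(&step->data[1], old, new) != old);

That only closes the lost-update window on this one qword, though; it
still doesn't serialize against the multi-qword update sequence in
arm_smmu_write_entry(), so the attach path could still end up with an
EATS value it didn't set. Serializing with arm_smmu_asid_lock (if this
context can sleep) or deferring the EATS clear to a context that can
take group->mutex might be more robust.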
> +
> + arm_smmu_cmdq_build_cmd(cmd, &ent);
> + arm_smmu_cmdq_issue_cmdlist(smmu, cmdq, cmd, 1, true);
> +}
> +
> +static void arm_smmu_master_disable_ats(struct arm_smmu_master *master,
> + struct arm_smmu_cmdq *cmdq)
> +{
> + int i;
> +
> + for (i = 0; i < master->num_streams; i++)
> + arm_smmu_disable_eats_for_sid(master->smmu, cmdq,
> + master->streams[i].id);
> + WRITE_ONCE(master->ats_broken, true);
> + iommu_report_device_broken(master->dev);
> +}
> +
> static int arm_smmu_atc_inv_master(struct arm_smmu_master *master,
> ioasid_t ssid)
> {
> - int i;
> + int i, ret;
> struct arm_smmu_cmdq_ent cmd;
> struct arm_smmu_cmdq_batch cmds;
>
> @@ -2514,7 +2549,10 @@ static int arm_smmu_atc_inv_master(struct arm_smmu_master *master,
> arm_smmu_cmdq_batch_add(master->smmu, &cmds, &cmd);
> }
>
> - return arm_smmu_cmdq_batch_submit(master->smmu, &cmds);
> + ret = arm_smmu_cmdq_batch_submit(master->smmu, &cmds);
> + if (ret)
> + arm_smmu_master_disable_ats(master, cmds.cmdq);
> + return ret;
> }
>
> /* IO_PGTABLE API */
> @@ -2661,6 +2699,53 @@ static inline bool arm_smmu_invs_end_batch(struct arm_smmu_inv *cur,
> return false;
> }
>
> +static void arm_smmu_invs_disable_ats(struct arm_smmu_invs *invs,
> + struct arm_smmu_cmdq *cmdq,
> + struct arm_smmu_device *smmu, u32 sid)
> +{
> + struct arm_smmu_inv *cur;
> + size_t i;
> +
> + arm_smmu_invs_for_each_entry(invs, i, cur) {
> + if (cur->master->smmu == smmu && arm_smmu_inv_is_ats(cur) &&
> + cur->id == sid) {
> + arm_smmu_master_disable_ats(cur->master, cmdq);
> + break;
> + }
> + }
> +}
> +
> +static void arm_smmu_cmdq_batch_retry(struct arm_smmu_device *smmu,
> + struct arm_smmu_invs *invs,
> + struct arm_smmu_cmdq_batch *cmds)
> +{
> + u64 atc[CMDQ_ENT_DWORDS] = {0};
> + int i;
> +
> + /* Only a timed out ATC_INV command needs a retry */
> + if (!invs->has_ats)
> + return;
> +
> + for (i = 0; i < cmds->num * CMDQ_ENT_DWORDS; i += CMDQ_ENT_DWORDS) {
> + struct arm_smmu_cmdq *cmdq = cmds->cmdq;
> + u32 sid;
> +
> + /* Only need to retry ATC invalidations */
> + if (FIELD_GET(CMDQ_0_OP, cmds->cmds[i]) != CMDQ_OP_ATC_INV)
> + continue;
> +
> + /* Only need to retry with one ATC_INV per Stream ID (device) */
> + sid = FIELD_GET(CMDQ_ATC_0_SID, cmds->cmds[i]);
> + if (atc[0] && sid == FIELD_GET(CMDQ_ATC_0_SID, atc[0]))
> + continue;
> +
> + atc[0] = cmds->cmds[i];
> + atc[1] = cmds->cmds[i + 1];
> + if (arm_smmu_cmdq_issue_cmdlist(smmu, cmdq, atc, 1, true))
> + arm_smmu_invs_disable_ats(invs, cmdq, smmu, sid);
> + }
> +}
> +
> static void __arm_smmu_domain_inv_range(struct arm_smmu_invs *invs,
> unsigned long iova, size_t size,
> unsigned int granule, bool leaf)
> @@ -2739,7 +2824,8 @@ static void __arm_smmu_domain_inv_range(struct arm_smmu_invs *invs,
>
> if (cmds.num &&
> (next == end || arm_smmu_invs_end_batch(cur, next))) {
> - arm_smmu_cmdq_batch_submit(smmu, &cmds);
> + if (arm_smmu_cmdq_batch_submit(smmu, &cmds))
> + arm_smmu_cmdq_batch_retry(smmu, invs, &cmds);
> cmds.num = 0;
> }
> cur = next;