[PATCH v2 7/7] iommu/arm-smmu-v3: Block ATS upon an ATC invalidation timeout

Shuai Xue xueshuai at linux.alibaba.com
Wed Mar 18 19:56:43 PDT 2026



On 3/18/26 3:15 AM, Nicolin Chen wrote:
> Currently, when GERROR_CMDQ_ERR occurs, the arm_smmu_cmdq_skip_err() won't
> do anything for the CMDQ_ERR_CERROR_ATC_INV_IDX.
> 
> When a device wasn't responsive to an ATC invalidation request, this often
> results in constant CMDQ errors:
>    unexpected global error reported (0x00000001), this could be serious
>    CMDQ error (cons 0x0302bb84): ATC invalidate timeout
>    unexpected global error reported (0x00000001), this could be serious
>    CMDQ error (cons 0x0302bb88): ATC invalidate timeout
>    unexpected global error reported (0x00000001), this could be serious
>    CMDQ error (cons 0x0302bb8c): ATC invalidate timeout
>    ...
> 
> An ATC invalidation timeout indicates that the device failed to respond to
> a protocol-critical coherency request, which means that device's internal
> ATS state is desynchronized from the SMMU.
> 
> Furthermore, ignoring the timeout leaves the system in an unsafe state, as
> the device cache may retain stale ATC entries for memory pages that the OS
> has already reclaimed and reassigned. This might lead to data corruption.
> 
> Isolate the device that is confirmed to be unresponsive by a surgical STE
> update to unset its EATS bit so as to reject any further ATS transaction,
> which could corrupt the memory.
> 
> Also, set the master->ats_broken flag that is revertible after the device
> completes a reset. This flag avoids further ATS requests and invalidations
> from happening.
> 
> Finally, report this broken device to the IOMMU core to isolate the device
> in the core level too.
> 
> For batched ATC_INV commands, SMMU hardware only reports a timeout at the
> CMD_SYNC, which could follow the batch issued for multiple devices. So, it
> isn't straightforward to identify which command in a batch resulted in the
> timeout. Fortunately, the invs array has a sorted list of ATC entries. So,
> the issued batch must be sorted as well. This makes it possible to bisect
> the batch to retry the command per Stream ID and identify the master.

Nit: the commit message says "bisect the batch", but the implementation
is a linear per-SID retry, not a binary search / bisection. Suggest
rewording to:

   "retry the ATC_INV command for each unique Stream ID in the batch
    to identify the unresponsive master"


> 
> Signed-off-by: Nicolin Chen <nicolinc at nvidia.com>
> ---
>   drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 92 ++++++++++++++++++++-
>   1 file changed, 89 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> index 366d812668011..418ee2f40d96d 100644
> --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> @@ -107,6 +107,8 @@ static const char * const event_class_str[] = {
>   	[3] = "Reserved",
>   };
>   
> +static struct arm_smmu_ste *
> +arm_smmu_get_step_for_sid(struct arm_smmu_device *smmu, u32 sid);
>   static int arm_smmu_alloc_cd_tables(struct arm_smmu_master *master);
>   
>   static void parse_driver_options(struct arm_smmu_device *smmu)
> @@ -2495,10 +2497,43 @@ arm_smmu_atc_inv_to_cmd(int ssid, unsigned long iova, size_t size,
>   	cmd->atc.size	= log2_span;
>   }
>   
> +static void arm_smmu_disable_eats_for_sid(struct arm_smmu_device *smmu,
> +					  struct arm_smmu_cmdq *cmdq, u32 sid)
> +{
> +	struct arm_smmu_cmdq_ent ent = {
> +		.opcode = CMDQ_OP_CFGI_STE,
> +		.cfgi	= {
> +			.sid = sid,
> +			.leaf = true,
> +		},
> +	};
> +	struct arm_smmu_ste *step;
> +	u64 cmd[CMDQ_ENT_DWORDS];
> +
> +	step = arm_smmu_get_step_for_sid(smmu, sid);
> +	WRITE_ONCE(step->data[1],
> +		   READ_ONCE(step->data[1]) & cpu_to_le64(~STRTAB_STE_1_EATS));


This non-atomic read-modify-write on step->data[1] can race with the
normal STE installation path (arm_smmu_write_entry → entry_set →
WRITE_ONCE).

The error path runs from:

   __arm_smmu_domain_inv_range()  (data path, no group->mutex)
     → arm_smmu_cmdq_batch_retry()
       → arm_smmu_master_disable_ats()
         → arm_smmu_disable_eats_for_sid()   ← NO locks on STE

The normal STE path runs from:

   iommu_attach_device()
     → mutex_lock(&group->mutex)
       → arm_smmu_attach_dev()
         → mutex_lock(&arm_smmu_asid_lock)
           → arm_smmu_install_ste_for_dev()
             → arm_smmu_write_entry()        ← holds both mutexes

Since the error path holds neither group->mutex nor arm_smmu_asid_lock,
the following race is possible:

   CPU A (error path):             CPU B (attach path):
   READ data[1] = X
                                   WRITE data[1] = Y (new STE config)
   WRITE data[1] = X & ~EATS
                                   // Y is lost

This could clobber a concurrent STE update from the attach path.


> +
> +	arm_smmu_cmdq_build_cmd(cmd, &ent);
> +	arm_smmu_cmdq_issue_cmdlist(smmu, cmdq, cmd, 1, true);
> +}
> +
> +static void arm_smmu_master_disable_ats(struct arm_smmu_master *master,
> +					struct arm_smmu_cmdq *cmdq)
> +{
> +	int i;
> +
> +	for (i = 0; i < master->num_streams; i++)
> +		arm_smmu_disable_eats_for_sid(master->smmu, cmdq,
> +					      master->streams[i].id);
> +	WRITE_ONCE(master->ats_broken, true);
> +	iommu_report_device_broken(master->dev);
> +}
> +
>   static int arm_smmu_atc_inv_master(struct arm_smmu_master *master,
>   				   ioasid_t ssid)
>   {
> -	int i;
> +	int i, ret;
>   	struct arm_smmu_cmdq_ent cmd;
>   	struct arm_smmu_cmdq_batch cmds;
>   
> @@ -2514,7 +2549,10 @@ static int arm_smmu_atc_inv_master(struct arm_smmu_master *master,
>   		arm_smmu_cmdq_batch_add(master->smmu, &cmds, &cmd);
>   	}
>   
> -	return arm_smmu_cmdq_batch_submit(master->smmu, &cmds);
> +	ret = arm_smmu_cmdq_batch_submit(master->smmu, &cmds);
> +	if (ret)
> +		arm_smmu_master_disable_ats(master, cmds.cmdq);
> +	return ret;
>   }
>   
>   /* IO_PGTABLE API */
> @@ -2661,6 +2699,53 @@ static inline bool arm_smmu_invs_end_batch(struct arm_smmu_inv *cur,
>   	return false;
>   }
>   
> +static void arm_smmu_invs_disable_ats(struct arm_smmu_invs *invs,
> +				      struct arm_smmu_cmdq *cmdq,
> +				      struct arm_smmu_device *smmu, u32 sid)
> +{
> +	struct arm_smmu_inv *cur;
> +	size_t i;
> +
> +	arm_smmu_invs_for_each_entry(invs, i, cur) {
> +		if (cur->master->smmu == smmu && arm_smmu_inv_is_ats(cur) &&
> +		    cur->id == sid) {
> +			arm_smmu_master_disable_ats(cur->master, cmdq);
> +			break;
> +		}
> +	}
> +}
> +
> +static void arm_smmu_cmdq_batch_retry(struct arm_smmu_device *smmu,
> +				      struct arm_smmu_invs *invs,
> +				      struct arm_smmu_cmdq_batch *cmds)
> +{
> +	u64 atc[CMDQ_ENT_DWORDS] = {0};
> +	int i;
> +
> +	/* Only a timed out ATC_INV command needs a retry */
> +	if (!invs->has_ats)
> +		return;
> +
> +	for (i = 0; i < cmds->num * CMDQ_ENT_DWORDS; i += CMDQ_ENT_DWORDS) {
> +		struct arm_smmu_cmdq *cmdq = cmds->cmdq;
> +		u32 sid;
> +
> +		/* Only need to retry ATC invalidations */
> +		if (FIELD_GET(CMDQ_0_OP, cmds->cmds[i]) != CMDQ_OP_ATC_INV)
> +			continue;
> +
> +		/* Only need to retry with one ATC_INV per Stream ID (device) */
> +		sid = FIELD_GET(CMDQ_ATC_0_SID, cmds->cmds[i]);
> +		if (atc[0] && sid == FIELD_GET(CMDQ_ATC_0_SID, atc[0]))
> +			continue;
> +
> +		atc[0] = cmds->cmds[i];
> +		atc[1] = cmds->cmds[i + 1];
> +		if (arm_smmu_cmdq_issue_cmdlist(smmu, cmdq, atc, 1, true))
> +			arm_smmu_invs_disable_ats(invs, cmdq, smmu, sid);
> +	}
> +}
> +
>   static void __arm_smmu_domain_inv_range(struct arm_smmu_invs *invs,
>   					unsigned long iova, size_t size,
>   					unsigned int granule, bool leaf)
> @@ -2739,7 +2824,8 @@ static void __arm_smmu_domain_inv_range(struct arm_smmu_invs *invs,
>   
>   		if (cmds.num &&
>   		    (next == end || arm_smmu_invs_end_batch(cur, next))) {
> -			arm_smmu_cmdq_batch_submit(smmu, &cmds);
> +			if (arm_smmu_cmdq_batch_submit(smmu, &cmds))
> +				arm_smmu_cmdq_batch_retry(smmu, invs, &cmds);
>   			cmds.num = 0;
>   		}
>   		cur = next;




More information about the linux-arm-kernel mailing list