An ATC invalidation timeout is a fatal error. While the SMMUv3 hardware is
aware of the timeout via a GERROR interrupt, the driver thread issuing the
commands lacks a direct mechanism to verify whether its specific batch was
the cause or not, as polling the CMD_SYNC status doesn't natively return a
failure code, making it very difficult to coordinate per-device recovery.
Introduce an atc_sync_timeouts bitmap in the cmdq structure to bridge this
gap. When the ISR detects an ATC timeout, set the bit corresponding to the
physical CMDQ index of the faulting CMD_SYNC command.
On the issuer side, after polling completes (or times out), test and clear
its dedicated bit. If set, return -EIO to trigger device quarantine.
Signed-off-by: Nicolin Chen <nicolinc at nvidia.com>
---
drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 1 +
drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 42 ++++++++++++++++++++-
2 files changed, 42 insertions(+), 1 deletion(-)
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
index 16353596e08ad..46f9e292a1cc8 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
@@ -700,6 +700,7 @@ struct arm_smmu_cmdq {
atomic_long_t *valid_map;
atomic_t owner_prod;
atomic_t lock;
+ unsigned long *atc_sync_timeouts;
bool (*supports_cmd)(struct arm_smmu_cmd *cmd);
};
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index 9be589d14a3bd..1065301a54eeb 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -343,7 +343,10 @@ void __arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu,
* at the CMD_SYNC. Attempt to complete other pending commands
* by repeating the CMD_SYNC, though we might well end up back
* here since the ATC invalidation may still be pending.
+ *
+ * Mark the faulty batch in the bitmap for the issuer to match.
*/
+ set_bit(Q_IDX(&q->llq, cons), cmdq->atc_sync_timeouts);
return;
case CMDQ_ERR_CERROR_ILL_IDX:
default:
@@ -750,6 +753,14 @@ int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu,
queue_write(Q_ENT(&cmdq->q, prod), cmd_sync.data,
ARRAY_SIZE(cmd_sync.data));
+ /*
+ * Clear any stale ATC-timeout bit left in the slot from a prior
+ * wraparound, before the slot becomes visible to the SMMU. Must
+ * do this prior to step 3 to prevent potentially races with the
+ * GERROR ISR calling set_bit() for our own CMD_SYNC.
+ */
+ clear_bit(Q_IDX(&llq, prod), cmdq->atc_sync_timeouts);
+
/*
* In order to determine completion of our CMD_SYNC, we must
* ensure that the queue can't wrap twice without us noticing.
@@ -796,9 +807,33 @@ int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu,
/* 5. If we are inserting a CMD_SYNC, we must wait for it to complete */
if (sync) {
+ u32 sync_prod;
+
llq.prod = queue_inc_prod_n(&llq, n);
+ sync_prod = llq.prod;
ret = arm_smmu_cmdq_poll_until_sync(smmu, cmdq, &llq);
- if (ret) {
+
+ /*
+ * Test atc_sync_timeouts first and see if there is ATC timeout
+ * resulted from this cmdlist. Return -EIO to separate from the
+ * ARM_SMMU_POLL_TIMEOUT_US software timeout.
+ *
+ * FIXME possible unhandled ATC invalidation timeout scenario:
+ * PCI Completion Timeout can be set to a range longer than the
+ * ARM_SMMU_POLL_TIMEOUT_US software timeout. -ETIMEDOUT can be
+ * returned by arm_smmu_cmdq_poll_until_sync() while the ATC_INV
+ * is still pending and not yet reflected in GERROR, so the bit
+ * on atc_sync_timeouts is not set. In this case, we can hardly
+ * do anything here, since the command queue HW is still pending
+ * on the ATC command.
+ */
+ if (test_and_clear_bit(Q_IDX(&llq, sync_prod),
+ cmdq->atc_sync_timeouts)) {
+ dev_err_ratelimited(smmu->dev,
+ "CMD_SYNC for ATC_INV timeout at prod=0x%08x\n",
+ sync_prod);
+ ret = -EIO;
+ } else if (ret) {
dev_err_ratelimited(smmu->dev,
"CMD_SYNC timeout at 0x%08x [hwprod 0x%08x, hwcons 0x%08x]\n",
llq.prod,
@@ -4332,6 +4367,11 @@ int arm_smmu_cmdq_init(struct arm_smmu_device *smmu,
if (!cmdq->valid_map)
return -ENOMEM;
+ cmdq->atc_sync_timeouts =
+ devm_bitmap_zalloc(smmu->dev, nents, GFP_KERNEL);
+ if (!cmdq->atc_sync_timeouts)
+ return -ENOMEM;
+
return 0;
}
--
2.43.0