[PATCH 5/8] iommu/arm-smmu-v3: Precompute the invalidation commands
Jason Gunthorpe
jgg at nvidia.com
Mon May 18 12:43:42 PDT 2026
Store the required cmd data in the tlbi and just copy it out when
processing each item in the invs list. The cmd form depends only on
whether the instance supports RIL; otherwise it is always the same.
This avoids redundant calculations for each invs entry.
Signed-off-by: Jason Gunthorpe <jgg at nvidia.com>
---
drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 132 +++++++++++---------
drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 11 ++
2 files changed, 81 insertions(+), 62 deletions(-)
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index 3b0b273fcde829..9aa08f782e8986 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -2398,32 +2398,15 @@ static bool arm_smmu_ttl_addr_aligned(u64 address, unsigned int tg,
return !(address & GENMASK_U64(pgsz_lg2 - 1, 0));
}
-static void arm_smmu_cmdq_batch_add_ril(struct arm_smmu_device *smmu,
- struct arm_smmu_cmdq_batch *cmds,
- struct arm_smmu_cmd *cmd, bool leaf,
- u64 address, unsigned int num,
- unsigned int scale, u8 ttl, u8 tg_enc)
-{
- cmd->data[0] |= FIELD_PREP(CMDQ_TLBI_0_NUM, num) |
- FIELD_PREP(CMDQ_TLBI_0_SCALE, scale);
- cmd->data[1] = FIELD_PREP(CMDQ_TLBI_1_LEAF, leaf) |
- FIELD_PREP(CMDQ_TLBI_1_TTL, ttl) |
- FIELD_PREP(CMDQ_TLBI_1_TG, tg_enc) | address;
- arm_smmu_cmdq_batch_add_cmd_p(smmu, cmds, cmd);
-}
-
/*
- * Issue a single range TLBI command covering [iova, iova+size). Returns true if
- * successful, false if the range is too large for a single command.
+ * Generate a single range TLBI command covering [iova, iova+size). Sets
+ * use_full_inv if the range is too large for a single command.
*
* The algorithm finds the smallest SCALE where the range (in tg-sized pages)
* fits in the 5-bit NUM field (max 32 units of 2^SCALE pages). This may widen
* the invalidation range.
*/
-static bool arm_smmu_cmdq_batch_add_range(struct arm_smmu_device *smmu,
- struct arm_smmu_cmdq_batch *cmds,
- struct arm_smmu_cmd *cmd,
- struct arm_smmu_tlbi *tlbi)
+static void arm_smmu_tlbi_calc_range(struct arm_smmu_tlbi *tlbi)
{
unsigned int tg_lg2 = tlbi->smmu_domain->tgsz_lg2;
u64 cur_tg = tlbi->iova >> tg_lg2;
@@ -2433,9 +2416,6 @@ static bool arm_smmu_cmdq_batch_add_range(struct arm_smmu_device *smmu,
unsigned int scale;
u8 ttl = 0;
- if (!tlbi->size)
- return false;
-
/*
* Determine what level the granule is at. For non-leaf, both
* io-pgtable and SVA pass a nominal last-level granule because they
@@ -2452,10 +2432,13 @@ static bool arm_smmu_cmdq_batch_add_range(struct arm_smmu_device *smmu,
if (num_tg == 1) {
if (!ttl)
ttl = 3;
- arm_smmu_cmdq_batch_add_ril(smmu, cmds, cmd, tlbi->leaf_only,
- cur_tg << tg_lg2, 0, 0, ttl,
- tg_enc);
- return true;
+ tlbi->range.data0 = 0;
+ tlbi->range.data1 =
+ FIELD_PREP(CMDQ_TLBI_1_LEAF, tlbi->leaf_only) |
+ FIELD_PREP(CMDQ_TLBI_1_TTL, ttl) |
+ FIELD_PREP(CMDQ_TLBI_1_TG, tg_enc) |
+ (cur_tg << tg_lg2);
+ return;
}
/*
@@ -2474,7 +2457,8 @@ static bool arm_smmu_cmdq_batch_add_range(struct arm_smmu_device *smmu,
/*
* Range too large for a single command, use full invalidation.
*/
- return false;
+ tlbi->range.use_full_inv = true;
+ return;
}
/* 16K granule TTL=1 is reserved (Section 4.4.1) */
@@ -2485,38 +2469,31 @@ static bool arm_smmu_cmdq_batch_add_range(struct arm_smmu_device *smmu,
if (ttl && !arm_smmu_ttl_addr_aligned(cur_tg << tg_lg2, tg_lg2, ttl))
ttl = 0;
- arm_smmu_cmdq_batch_add_ril(smmu, cmds, cmd, tlbi->leaf_only,
- cur_tg << tg_lg2,
- DIV_ROUND_UP_ULL(num_tg, 1ULL << scale) - 1,
- scale, ttl, tg_enc);
- return true;
+ tlbi->range.data0 =
+ FIELD_PREP(CMDQ_TLBI_0_NUM,
+ DIV_ROUND_UP_ULL(num_tg, 1ULL << scale) - 1) |
+ FIELD_PREP(CMDQ_TLBI_0_SCALE, scale);
+ tlbi->range.data1 = FIELD_PREP(CMDQ_TLBI_1_LEAF, tlbi->leaf_only) |
+ FIELD_PREP(CMDQ_TLBI_1_TTL, ttl) |
+ FIELD_PREP(CMDQ_TLBI_1_TG, tg_enc) |
+ (cur_tg << tg_lg2);
}
/*
* One TLBI command per IOTLB entry, assuming the entries are all at least
- * iopte_granule sized. Returns false if too many commands would be needed which
- * indicates too high a latency. The threshold is similar to MAX_DVM_OPS in
- * arch/arm64/include/asm/tlbflush.h for the 4k PAGE_SIZE.
+ * iopte_granule sized. Sets use_full_inv if too many commands would be needed
+ * which indicates too high a latency. The threshold is similar to MAX_DVM_OPS
+ * in arch/arm64/include/asm/tlbflush.h for the 4k PAGE_SIZE.
*/
-static bool arm_smmu_cmdq_batch_add_single(struct arm_smmu_device *smmu,
- struct arm_smmu_cmdq_batch *cmds,
- struct arm_smmu_cmd *cmd,
- struct arm_smmu_tlbi *tlbi)
+static void arm_smmu_tlbi_calc_single(struct arm_smmu_tlbi *tlbi)
{
unsigned long num_ops = tlbi->size / tlbi->iopte_granule;
- unsigned long iova = tlbi->iova;
- unsigned long i;
- if (!num_ops || num_ops > 512)
- return false;
-
- for (i = 0; i < num_ops; i++) {
- cmd->data[1] = FIELD_PREP(CMDQ_TLBI_1_LEAF, tlbi->leaf_only) |
- (iova & ~GENMASK_U64(11, 0));
- arm_smmu_cmdq_batch_add_cmd_p(smmu, cmds, cmd);
- iova += tlbi->iopte_granule;
+ if (!num_ops || num_ops > 512) {
+ tlbi->single.use_full_inv = true;
+ return;
}
- return true;
+ tlbi->single.num = num_ops;
}
static void arm_smmu_inv_all_cmd(struct arm_smmu_inv *inv,
@@ -2536,16 +2513,32 @@ static bool arm_smmu_inv_to_cmdq_batch(struct arm_smmu_inv *inv,
struct arm_smmu_cmd *cmd,
struct arm_smmu_tlbi *tlbi)
{
+ u64 iova = tlbi->iova;
+ unsigned int i;
+
if (inv->smmu->features & ARM_SMMU_FEAT_RANGE_INV) {
- if (arm_smmu_cmdq_batch_add_range(inv->smmu, cmds, cmd, tlbi))
- return false;
- } else {
- if (arm_smmu_cmdq_batch_add_single(inv->smmu, cmds, cmd, tlbi))
- return false;
+ if (tlbi->range.use_full_inv) {
+ arm_smmu_inv_all_cmd(inv, cmds, cmd);
+ return true;
+ }
+ cmd->data[0] |= tlbi->range.data0;
+ cmd->data[1] = tlbi->range.data1;
+ arm_smmu_cmdq_batch_add_cmd_p(inv->smmu, cmds, cmd);
+ return false;
}
- arm_smmu_inv_all_cmd(inv, cmds, cmd);
- return true;
+ if (tlbi->single.use_full_inv) {
+ arm_smmu_inv_all_cmd(inv, cmds, cmd);
+ return true;
+ }
+
+ for (i = 0; i < tlbi->single.num; i++) {
+ cmd->data[1] = FIELD_PREP(CMDQ_TLBI_1_LEAF, tlbi->leaf_only) |
+ (iova & ~GENMASK_U64(11, 0));
+ iova += tlbi->iopte_granule;
+ arm_smmu_cmdq_batch_add_cmd_p(inv->smmu, cmds, cmd);
+ }
+ return false;
}
static inline bool arm_smmu_invs_end_batch(struct arm_smmu_inv *cur,
@@ -2564,8 +2557,8 @@ static inline bool arm_smmu_invs_end_batch(struct arm_smmu_inv *cur,
return false;
}
-static void __arm_smmu_domain_inv_range(struct arm_smmu_tlbi *tlbi,
- struct arm_smmu_invs *invs)
+static void arm_smmu_domain_tlbi_inv(struct arm_smmu_tlbi *tlbi,
+ struct arm_smmu_invs *invs)
{
struct arm_smmu_cmdq_batch cmds = {};
bool used_s12_vmall = false;
@@ -2662,6 +2655,13 @@ void arm_smmu_domain_inv_range(struct arm_smmu_domain *smmu_domain,
};
struct arm_smmu_invs *invs;
+ if (!size || size == SIZE_MAX) {
+ tlbi.single.use_full_inv = true;
+ tlbi.range.use_full_inv = true;
+ } else {
+ arm_smmu_tlbi_calc_single(&tlbi);
+ }
+
/*
* An invalidation request must follow some IOPTE change and then load
* an invalidation array. In the meantime, a domain attachment mutates
@@ -2692,6 +2692,14 @@ void arm_smmu_domain_inv_range(struct arm_smmu_domain *smmu_domain,
rcu_read_lock();
invs = rcu_dereference(smmu_domain->invs);
+ /* Only precalculate RIL if it will be used. */
+ if (invs->has_range_inv) {
+ if (!tlbi.range.use_full_inv)
+ arm_smmu_tlbi_calc_range(&tlbi);
+ } else {
+ tlbi.range.use_full_inv = true;
+ }
+
/*
* Avoid locking unless ATS is being used. No ATC invalidation can be
* going on after a domain is detached.
@@ -2700,10 +2708,10 @@ void arm_smmu_domain_inv_range(struct arm_smmu_domain *smmu_domain,
unsigned long flags;
read_lock_irqsave(&invs->rwlock, flags);
- __arm_smmu_domain_inv_range(&tlbi, invs);
+ arm_smmu_domain_tlbi_inv(&tlbi, invs);
read_unlock_irqrestore(&invs->rwlock, flags);
} else {
- __arm_smmu_domain_inv_range(&tlbi, invs);
+ arm_smmu_domain_tlbi_inv(&tlbi, invs);
}
rcu_read_unlock();
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
index d58fe91a96325f..b5e214b428d644 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
@@ -809,6 +809,17 @@ struct arm_smmu_tlbi {
size_t size;
unsigned int iopte_granule;
bool leaf_only;
+
+ struct {
+ bool use_full_inv;
+ u16 num;
+ } single;
+
+ struct {
+ bool use_full_inv;
+ u32 data0;
+ u64 data1;
+ } range;
};
struct arm_smmu_evtq {
--
2.43.0
More information about the linux-arm-kernel
mailing list