[PATCH v6 24/25] iommu/arm-smmu-v3-kvm: Enable nesting
Mostafa Saleh
smostafa at google.com
Fri May 1 04:19:26 PDT 2026
Now the hypervisor controls the command queue and the stream table,
and shadows the stage-2 page table.
Enable stage-2 in case the host puts an STE in bypass or stage-1.
Signed-off-by: Mostafa Saleh <smostafa at google.com>
---
.../iommu/arm/arm-smmu-v3/pkvm/arm-smmu-v3.c | 108 ++++++++++++++++--
1 file changed, 101 insertions(+), 7 deletions(-)
diff --git a/drivers/iommu/arm/arm-smmu-v3/pkvm/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/pkvm/arm-smmu-v3.c
index b73a2462f0dd..3d727d6dfbf0 100644
--- a/drivers/iommu/arm/arm-smmu-v3/pkvm/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/pkvm/arm-smmu-v3.c
@@ -411,6 +411,57 @@ static int smmu_init_cmdq(struct hyp_arm_smmu_v3_device *smmu)
return 0;
}
+static int smmu_attach_stage_2(struct arm_smmu_ste *ste)
+{
+ unsigned long vttbr;
+ unsigned long ts, sl, ic, oc, sh, tg, ps;
+ unsigned long cfg;
+ struct io_pgtable_cfg *pgt_cfg = &idmap_pgtable->cfg;
+
+ cfg = FIELD_GET(STRTAB_STE_0_CFG, le64_to_cpu(ste->data[0]));
+ if (!FIELD_GET(STRTAB_STE_0_V, le64_to_cpu(ste->data[0])) ||
+ (cfg == STRTAB_STE_0_CFG_ABORT)) {
+ ste->data[2] = 0;
+ ste->data[3] = 0;
+ return 0;
+ }
+ /* S2 is not advertised to the host, so nesting should never be attempted. */
+ if (cfg == STRTAB_STE_0_CFG_NESTED)
+ return -EINVAL;
+ vttbr = pgt_cfg->arm_lpae_s2_cfg.vttbr;
+ ps = pgt_cfg->arm_lpae_s2_cfg.vtcr.ps;
+ tg = pgt_cfg->arm_lpae_s2_cfg.vtcr.tg;
+ sh = pgt_cfg->arm_lpae_s2_cfg.vtcr.sh;
+ oc = pgt_cfg->arm_lpae_s2_cfg.vtcr.orgn;
+ ic = pgt_cfg->arm_lpae_s2_cfg.vtcr.irgn;
+ sl = pgt_cfg->arm_lpae_s2_cfg.vtcr.sl;
+ ts = pgt_cfg->arm_lpae_s2_cfg.vtcr.tsz;
+
+ ste->data[1] &= ~cpu_to_le64(STRTAB_STE_1_SHCFG);
+ ste->data[1] |= cpu_to_le64(FIELD_PREP(STRTAB_STE_1_SHCFG, STRTAB_STE_1_SHCFG_INCOMING));
+
+ /* The host shouldn't write dwords 2 and 3, overwrite them. */
+ ste->data[2] = cpu_to_le64(FIELD_PREP(STRTAB_STE_2_VTCR,
+ FIELD_PREP(STRTAB_STE_2_VTCR_S2PS, ps) |
+ FIELD_PREP(STRTAB_STE_2_VTCR_S2TG, tg) |
+ FIELD_PREP(STRTAB_STE_2_VTCR_S2SH0, sh) |
+ FIELD_PREP(STRTAB_STE_2_VTCR_S2OR0, oc) |
+ FIELD_PREP(STRTAB_STE_2_VTCR_S2IR0, ic) |
+ FIELD_PREP(STRTAB_STE_2_VTCR_S2SL0, sl) |
+ FIELD_PREP(STRTAB_STE_2_VTCR_S2T0SZ, ts)) |
+ FIELD_PREP(STRTAB_STE_2_S2VMID, 0) |
+ STRTAB_STE_2_S2AA64 | STRTAB_STE_2_S2R |
+#ifdef __BIG_ENDIAN
+ STRTAB_STE_2_S2ENDI |
+#endif
+ STRTAB_STE_2_S2PTW);
+
+ ste->data[3] = cpu_to_le64(vttbr & STRTAB_STE_3_S2TTB_MASK);
+ /* Convert S1 => nested and bypass => S2 */
+ ste->data[0] |= cpu_to_le64(FIELD_PREP(STRTAB_STE_0_CFG, cfg | BIT(1)));
+ return 0;
+}
+
static int smmu_get_host_l2_ste(struct hyp_arm_smmu_v3_device *smmu, u32 sid,
struct arm_smmu_ste *host_ste_out)
{
@@ -440,9 +491,18 @@ static int smmu_get_host_l2_ste(struct hyp_arm_smmu_v3_device *smmu, u32 sid,
static int smmu_reshadow_ste(struct hyp_arm_smmu_v3_device *smmu, u32 sid, bool leaf)
{
struct arm_smmu_strtab_cfg *cfg = &smmu->strtab_cfg;
- struct arm_smmu_ste *hyp_ste_ptr, *host_ste_ptr, host_ste_copy;
+ struct arm_smmu_ste *hyp_ste_ptr;
u64 *hyp_ste_base = strtab_hyp_base(smmu);
- int ret;
+ struct arm_smmu_ste target = {};
+ struct arm_smmu_cmdq_ent cfgi_cmd = {
+ .opcode = CMDQ_OP_CFGI_STE,
+ .cfgi = {
+ .sid = sid,
+ .leaf = true,
+ },
+ };
+ bool cur_valid, target_valid;
+ int i, ret;
/*
* Linux only uses leaf = 1, when leaf is 0, we need to verify that this
@@ -463,7 +523,7 @@ static int smmu_reshadow_ste(struct hyp_arm_smmu_v3_device *smmu, u32 sid, bool
return -E2BIG;
hyp_ste_ptr = &hyp_table[sid];
- host_ste_ptr = &host_table[sid];
+ memcpy(target.data, host_table[sid].data, STRTAB_STE_DWORDS << 3);
} else {
struct arm_smmu_strtab_l1 *l1tab = (struct arm_smmu_strtab_l1 *)hyp_ste_base;
u32 l1_idx = arm_smmu_strtab_l1_idx(sid);
@@ -472,8 +532,7 @@ static int smmu_reshadow_ste(struct hyp_arm_smmu_v3_device *smmu, u32 sid, bool
if (l1_idx >= cfg->l2.num_l1_ents)
return -E2BIG;
- host_ste_ptr = &host_ste_copy;
- ret = smmu_get_host_l2_ste(smmu, sid, host_ste_ptr);
+ ret = smmu_get_host_l2_ste(smmu, sid, &target);
if (ret)
return ret;
@@ -491,9 +550,44 @@ static int smmu_reshadow_ste(struct hyp_arm_smmu_v3_device *smmu, u32 sid, bool
hyp_ste_ptr = &l2ptr->stes[arm_smmu_strtab_l2_idx(sid)];
}
- memcpy(hyp_ste_ptr->data, host_ste_ptr->data, STRTAB_STE_DWORDS << 3);
- return 0;
+ /*
+ * Summary of each host emulated state vs real HW.
+ * | Host | HW |
+ * ==============================
+ * | V=0 | V=0 |
+ * | Abort | Abort |
+ * | Bypass | S2 |
+ * | S1 | S1+S2 |
+ *
+ * For the host, any V=0 transition is not hitless, all other permutations of
+ * (abort, bypass, S1) transitions are hitless.
+ * For the HW state, any V=0 transition is not hitless, as all the S2 config is
+ * always the same (ttbr, vtcr...), all other transitions should be hitless too.
+ * However, the host is not trusted, which means that for any V=0 <=> V=1 transition
+ * we need to enforce the STE write order and issue a CFGI.
+ */
+ cur_valid = FIELD_GET(STRTAB_STE_0_V, le64_to_cpu(hyp_ste_ptr->data[0]));
+ ret = smmu_attach_stage_2(&target);
+ if (ret)
+ return ret;
+ target_valid = FIELD_GET(STRTAB_STE_0_V, le64_to_cpu(target.data[0]));
+ if (cur_valid && !target_valid) {
+ WRITE_ONCE(hyp_ste_ptr->data[0], target.data[0]);
+ WARN_ON(smmu_send_cmd(smmu, &cfgi_cmd));
+ for (i = 1; i < STRTAB_STE_DWORDS; i++)
+ WRITE_ONCE(hyp_ste_ptr->data[i], target.data[i]);
+ } else if (!cur_valid && target_valid) {
+ for (i = 1; i < STRTAB_STE_DWORDS; i++)
+ WRITE_ONCE(hyp_ste_ptr->data[i], target.data[i]);
+ WARN_ON(smmu_send_cmd(smmu, &cfgi_cmd));
+ WRITE_ONCE(hyp_ste_ptr->data[0], target.data[0]);
+ } else {
+ for (i = 0; i < STRTAB_STE_DWORDS; i++)
+ WRITE_ONCE(hyp_ste_ptr->data[i], target.data[i]);
+ }
+
+ return smmu_send_cmd(smmu, &cfgi_cmd);
}
static int smmu_init_strtab(struct hyp_arm_smmu_v3_device *smmu)
--
2.54.0.545.g6539524ca2-goog
More information about the linux-arm-kernel
mailing list