[PATCH 04/19] iommu/arm-smmu-v3: Make STE programming independent of the callers
Michael Shavit
mshavit at google.com
Wed Oct 18 03:54:10 PDT 2023
On Wed, Oct 11, 2023 at 8:33 AM Jason Gunthorpe <jgg at nvidia.com> wrote:
>
> As the comment in arm_smmu_write_strtab_ent() explains, this routine has
> been limited to only work correctly in certain scenarios that the caller
> must ensure. Generally the caller must put the STE into ABORT or BYPASS
> before attempting to program it to something else.
>
> The next patches/series are going to start removing some of this logic
> from the callers, and add more complex state combinations than currently.
>
> Thus, consolidate all the complexity here. Callers do not have to care
> about what STE transition they are doing, this function will handle
> everything optimally.
>
> Revise arm_smmu_write_strtab_ent() so it algorithmically computes the
> required programming sequence to avoid creating an incoherent 'torn' STE
> in the HW caches. The update algorithm follows the same design that the
> driver already uses: it is safe to change bits that HW doesn't currently
> use and then do a single 64 bit update, with syncs in between.
>
> The basic idea is to express in a bitmask what bits the HW is actually
> using based on the V and CFG bits. Based on that mask we know what STE
> changes are safe and which are disruptive. We can count how many 64 bit
> QWORDS need a disruptive update and know if a step with V=0 is required.
>
> This gives two basic flows through the algorithm.
>
> If only a single 64 bit quantity needs disruptive replacement:
> - Write the target value into all currently unused bits
> - Write the single 64 bit quantity
> - Zero the remaining different bits
>
> If multiple 64 bit quantities need disruptive replacement then do:
> - Write V=0 to QWORD 0
> - Write the entire STE except QWORD 0
> - Write QWORD 0
>
> A HW flush is issued after each step, and can be skipped if the STE didn't
> change in that step.
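Just to confirm my reading of how the algorithm picks between the two flows:
a qword needs a disruptive update when a bit that both the current and the
target configuration use has to change value. A minimal sketch of that count
(kernel-style C with a hypothetical helper name, ignoring the __le64
handling for brevity; not part of the patch):

	/*
	 * Count qwords where a bit used by both the current and the target
	 * configuration must change. 0 or 1 such qwords allow the hitless
	 * single-qword flow; more than 1 forces the V=0 flow.
	 */
	static unsigned int count_disruptive_qwords(const u64 *cur,
						    const u64 *cur_used,
						    const u64 *target,
						    const u64 *target_used,
						    unsigned int len)
	{
		unsigned int i, n = 0;

		for (i = 0; i != len; i++)
			if ((cur[i] ^ target[i]) & cur_used[i] & target_used[i])
				n++;
		return n;
	}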
>
> At this point it generates the same sequence of updates as the current
> code, except that zeroing the VMID on entry to BYPASS/ABORT will do an
> extra sync (this seems to be an existing bug).
>
> Going forward this will use a V=0 transition instead of cycling through
> ABORT when a disruptive (non-hitless) change is required. This seems more
> appropriate: ABORT fails DMAs without any logging, whereas a DMA dropped
> during a transient V=0 probably signals a bug, so the C_BAD_STE event it
> raises is valuable.
>
> Signed-off-by: Jason Gunthorpe <jgg at nvidia.com>
> ---
> drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 247 +++++++++++++++-----
> 1 file changed, 183 insertions(+), 64 deletions(-)
>
> diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> index bf7218adbc2822..6e6b1ebb5ac0ef 100644
> --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> @@ -971,6 +971,69 @@ void arm_smmu_tlb_inv_asid(struct arm_smmu_device *smmu, u16 asid)
> arm_smmu_cmdq_issue_cmd_with_sync(smmu, &cmd);
> }
>
> +/*
> + * Do one step along the coherent update algorithm. Each step either changes
> + * only bits that the HW isn't using or entirely changes 1 qword. It may take
> + * several iterations of this routine to make the full change.
> + */
> +static bool arm_smmu_write_entry_step(__le64 *cur, const __le64 *cur_used,
> + const __le64 *target,
> + const __le64 *target_used, __le64 *step,
> + __le64 v_bit,
> + unsigned int len)
> +{
> + u8 step_used_diff = 0;
> + u8 step_change = 0;
> + unsigned int i;
> +
> + /*
> + * Compute a step that has all the bits currently unused by HW set to
> + * their target values.
> + */
> + for (i = 0; i != len; i++) {
> + step[i] = (cur[i] & cur_used[i]) | (target[i] & ~cur_used[i]);
> + if (cur[i] != step[i])
> + step_change |= 1 << i;
> + /*
> + * Each bit indicates if the step is incorrect compared to the
> + * target, considering only the used bits in the target
> + */
> + if ((step[i] & target_used[i]) != (target[i] & target_used[i]))
> + step_used_diff |= 1 << i;
> + }
> +
> + if (hweight8(step_used_diff) > 1) {
> + /*
> + * More than 1 qword is mismatched, this cannot be done without
> + * a break. Clear the V bit and go again.
> + */
> + step[0] &= ~v_bit;
> + } else if (!step_change && step_used_diff) {
> + /*
> + * Have exactly one critical qword, all the other qwords are set
> + * correctly, so we can set this qword now.
> + */
> + i = ffs(step_used_diff) - 1;
> + step[i] = target[i];
> + } else if (!step_change) {
> + /* cur == target, so all done */
> + if (memcmp(cur, target, sizeof(*cur)) == 0)
> + return true;
Shouldn't this be len * sizeof(*cur)?
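i.e., presumably:

	if (memcmp(cur, target, len * sizeof(*cur)) == 0)
		return true;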
> +
> + /*
> + * All the used HW bits match, but unused bits are different.
> + * Set them as well. Technically this isn't necessary but it
> + * brings the entry to the full target state, so if there are
> + * bugs in the mask calculation this will obscure them.
> + */
> + memcpy(step, target, len * sizeof(*step));
> + }
> +
> + for (i = 0; i != len; i++)
> + WRITE_ONCE(cur[i], step[i]);
> + return false;
> +}
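To check my understanding of how this converges when more than one qword has
a used-bit mismatch (say qwords 0 and 1 both do), the iterations of the
caller's loop should go:

	1. hweight8(step_used_diff) > 1, so qword 0 is written with V
	   cleared (bits the HW isn't reading also move to their target
	   values), caller syncs
	2. with V=0 only the V bit counts as used, so everything except V
	   is written to its target value, caller syncs
	3. only qword 0 still differs on a used bit, so target qword 0 is
	   written with V=1, caller syncs
	4. cur == target, the function returns true and the loop exits

Which matches the V=0 flow described in the commit message.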
> +
> static void arm_smmu_sync_cd(struct arm_smmu_master *master,
> int ssid, bool leaf)
> {
> @@ -1248,37 +1311,122 @@ static void arm_smmu_sync_ste_for_sid(struct arm_smmu_device *smmu, u32 sid)
> arm_smmu_cmdq_issue_cmd_with_sync(smmu, &cmd);
> }
>
> +/*
> + * Based on the value of ent, report which bits of the STE the HW will access. It
> + * would be nice if this was complete according to the spec, but minimally it
> + * has to capture the bits this driver uses.
> + */
> +static void arm_smmu_get_ste_used(const struct arm_smmu_ste *ent,
> + struct arm_smmu_ste *used_bits)
> +{
> + memset(used_bits, 0, sizeof(*used_bits));
> +
> + used_bits->data[0] = cpu_to_le64(STRTAB_STE_0_V);
> + if (!(ent->data[0] & cpu_to_le64(STRTAB_STE_0_V)))
> + return;
> +
> + used_bits->data[0] |= cpu_to_le64(STRTAB_STE_0_CFG);
> + switch (FIELD_GET(STRTAB_STE_0_CFG, le64_to_cpu(ent->data[0]))) {
> + case STRTAB_STE_0_CFG_ABORT:
> + break;
> + case STRTAB_STE_0_CFG_BYPASS:
> + used_bits->data[1] |= cpu_to_le64(STRTAB_STE_1_SHCFG);
> + break;
> + case STRTAB_STE_0_CFG_S1_TRANS:
> + used_bits->data[0] |= cpu_to_le64(STRTAB_STE_0_S1FMT |
> + STRTAB_STE_0_S1CTXPTR_MASK |
> + STRTAB_STE_0_S1CDMAX);
> + used_bits->data[1] |=
> + cpu_to_le64(STRTAB_STE_1_S1DSS | STRTAB_STE_1_S1CIR |
> + STRTAB_STE_1_S1COR | STRTAB_STE_1_S1CSH |
> + STRTAB_STE_1_S1STALLD | STRTAB_STE_1_STRW);
> + used_bits->data[1] |= cpu_to_le64(STRTAB_STE_1_EATS);
> +
> + if (FIELD_GET(STRTAB_STE_1_S1DSS, le64_to_cpu(ent->data[1])) ==
> + STRTAB_STE_1_S1DSS_BYPASS)
> + used_bits->data[1] |= cpu_to_le64(STRTAB_STE_1_SHCFG);
Although the driver only explicitly sets SHCFG for bypass streams, my
reading of the spec is that it is also accessed for S1 and S2 STEs:
"The SMMU might convey attributes input from a device through this
process, so that the device might influence the final transaction
access, and input attributes might be overridden on a per-device basis
using the MTCFG/MemAttr, SHCFG, ALLOCCFG STE fields. The input
attribute, modified by these fields, is primarily useful for setting
the resulting output access attribute when both stage 1 and stage 2
translation is bypassed (no translation table descriptors to determine
attribute) but can also be useful for stage 2-only configurations in
which a device stream might have finer knowledge about the required
access behavior than the general virtual machine-global stage 2
translation tables."
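If that reading is right, I'd expect something like the below (sketch only,
a fragment of this patch's arm_smmu_get_ste_used()) so SHCFG is also tracked
for the translating configs, e.g. for S2:

	case STRTAB_STE_0_CFG_S2_TRANS:
		used_bits->data[1] |= cpu_to_le64(STRTAB_STE_1_EATS |
						  STRTAB_STE_1_SHCFG);
		...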
> + break;
> + case STRTAB_STE_0_CFG_S2_TRANS:
> + used_bits->data[1] |= cpu_to_le64(STRTAB_STE_1_EATS);
> + used_bits->data[2] |=
> + cpu_to_le64(STRTAB_STE_2_S2VMID | STRTAB_STE_2_VTCR |
> + STRTAB_STE_2_S2AA64 | STRTAB_STE_2_S2ENDI |
> + STRTAB_STE_2_S2PTW | STRTAB_STE_2_S2R);
> + used_bits->data[3] |= cpu_to_le64(STRTAB_STE_3_S2TTB_MASK);
> + break;
> +
> + default:
> + memset(used_bits, 0xFF, sizeof(*used_bits));
Can we consider a WARN here, since this driver only ever uses one of the
above 4 values and we probably have a programming error if we see anything
else?
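Something like:

	default:
		memset(used_bits, 0xFF, sizeof(*used_bits));
		WARN_ON(1); /* a CFG value this driver never programs */
		break;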
> + }
> +}
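For reference, a concrete example of the output: a valid bypass STE ends up
with only

	used_bits->data[0] == cpu_to_le64(STRTAB_STE_0_V | STRTAB_STE_0_CFG)
	used_bits->data[1] == cpu_to_le64(STRTAB_STE_1_SHCFG)

marked as used (data[2] and data[3] stay zero), which is what lets the
update algorithm rewrite the other qwords of a bypass STE without a V=0
step.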
> +
> +static bool arm_smmu_write_ste_step(struct arm_smmu_ste *cur,
> + const struct arm_smmu_ste *target,
> + const struct arm_smmu_ste *target_used)
> +{
> + struct arm_smmu_ste cur_used;
> + struct arm_smmu_ste step;
> +
> + arm_smmu_get_ste_used(cur, &cur_used);
> + return arm_smmu_write_entry_step(cur->data, cur_used.data, target->data,
> + target_used->data, step.data,
What's up with requiring callers to allocate and provide step.data if
it's not used by any of the arm_smmu_write_entry_step callers?
> + cpu_to_le64(STRTAB_STE_0_V),
This also looks a bit strange at this stage, since CD entries aren't yet
supported... but sure.
> + ARRAY_SIZE(cur->data));
> +}
> +
> +/*
> + * This algorithm updates any STE to any value without creating a situation
> + * where the HW can perceive a corrupted STE. HW is only required to have a 64
> + * bit atomicity with stores.
> + *
> + * In the most general case we can make any update by disrupting the STE (making
> + * it abort, or clearing the V bit) using a single qword store. Then all the
> + * other qwords can be written safely, and finally the full STE written.
> + *
> + * However this disrupts the HW while it is happening. There are several
> + * interesting cases where a STE can be updated without disturbing the HW
> + * because only a small number of bits are changing (S1DSS, CONFIG, etc) or
> + * because the used bits don't intersect.
> + */
> +static void arm_smmu_write_ste(struct arm_smmu_device *smmu, u32 sid,
> + struct arm_smmu_ste *ste,
> + const struct arm_smmu_ste *target)
> +{
> + struct arm_smmu_ste target_used;
> + int i;
> +
> + arm_smmu_get_ste_used(target, &target_used);
> + /* Masks in arm_smmu_get_ste_used() are up to date */
> + for (i = 0; i != ARRAY_SIZE(target->data); i++)
> + WARN_ON_ONCE(target->data[i] & ~target_used.data[i]);
> +
> + while (true) {
> + if (arm_smmu_write_ste_step(ste, target, &target_used))
> + break;
> + arm_smmu_sync_ste_for_sid(smmu, sid);
> + }
> +
> + /* It's likely that we'll want to use the new STE soon */
> + if (!(smmu->options & ARM_SMMU_OPT_SKIP_PREFETCH)) {
> + struct arm_smmu_cmdq_ent
> + prefetch_cmd = { .opcode = CMDQ_OP_PREFETCH_CFG,
> + .prefetch = {
> + .sid = sid,
> + } };
> +
> + arm_smmu_cmdq_issue_cmd(smmu, &prefetch_cmd);
> + }
> +}
> +
> static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid,
> struct arm_smmu_ste *dst)
> {
> - /*
> - * This is hideously complicated, but we only really care about
> - * three cases at the moment:
> - *
> - * 1. Invalid (all zero) -> bypass/fault (init)
> - * 2. Bypass/fault -> translation/bypass (attach)
> - * 3. Translation/bypass -> bypass/fault (detach)
> - *
> - * Given that we can't update the STE atomically and the SMMU
> - * doesn't read the thing in a defined order, that leaves us
> - * with the following maintenance requirements:
> - *
> - * 1. Update Config, return (init time STEs aren't live)
> - * 2. Write everything apart from dword 0, sync, write dword 0, sync
> - * 3. Update Config, sync
> - */
> - u64 val = le64_to_cpu(dst->data[0]);
> - bool ste_live = false;
> + u64 val;
> struct arm_smmu_device *smmu = master->smmu;
> struct arm_smmu_ctx_desc_cfg *cd_table = NULL;
> struct arm_smmu_s2_cfg *s2_cfg = NULL;
> struct arm_smmu_domain *smmu_domain = master->domain;
> - struct arm_smmu_cmdq_ent prefetch_cmd = {
> - .opcode = CMDQ_OP_PREFETCH_CFG,
> - .prefetch = {
> - .sid = sid,
> - },
> - };
> + struct arm_smmu_ste target = {};
>
> if (smmu_domain) {
> switch (smmu_domain->stage) {
> @@ -1293,22 +1441,6 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid,
> }
> }
>
> - if (val & STRTAB_STE_0_V) {
> - switch (FIELD_GET(STRTAB_STE_0_CFG, val)) {
> - case STRTAB_STE_0_CFG_BYPASS:
> - break;
> - case STRTAB_STE_0_CFG_S1_TRANS:
> - case STRTAB_STE_0_CFG_S2_TRANS:
> - ste_live = true;
> - break;
> - case STRTAB_STE_0_CFG_ABORT:
> - BUG_ON(!disable_bypass);
> - break;
> - default:
> - BUG(); /* STE corruption */
> - }
> - }
> -
> /* Nuke the existing STE_0 value, as we're going to rewrite it */
> val = STRTAB_STE_0_V;
>
> @@ -1319,16 +1451,11 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid,
> else
> val |= FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_BYPASS);
>
> - dst->data[0] = cpu_to_le64(val);
> - dst->data[1] = cpu_to_le64(FIELD_PREP(STRTAB_STE_1_SHCFG,
> + target.data[0] = cpu_to_le64(val);
> + target.data[1] = cpu_to_le64(FIELD_PREP(STRTAB_STE_1_SHCFG,
> STRTAB_STE_1_SHCFG_INCOMING));
> - dst->data[2] = 0; /* Nuke the VMID */
> - /*
> - * The SMMU can perform negative caching, so we must sync
> - * the STE regardless of whether the old value was live.
> - */
> - if (smmu)
> - arm_smmu_sync_ste_for_sid(smmu, sid);
> + target.data[2] = 0; /* Nuke the VMID */
> + arm_smmu_write_ste(smmu, sid, dst, &target);
> return;
> }
>
> @@ -1336,8 +1463,7 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid,
> u64 strw = smmu->features & ARM_SMMU_FEAT_E2H ?
> STRTAB_STE_1_STRW_EL2 : STRTAB_STE_1_STRW_NSEL1;
>
> - BUG_ON(ste_live);
> - dst->data[1] = cpu_to_le64(
> + target.data[1] = cpu_to_le64(
> FIELD_PREP(STRTAB_STE_1_S1DSS, STRTAB_STE_1_S1DSS_SSID0) |
> FIELD_PREP(STRTAB_STE_1_S1CIR, STRTAB_STE_1_S1C_CACHE_WBRA) |
> FIELD_PREP(STRTAB_STE_1_S1COR, STRTAB_STE_1_S1C_CACHE_WBRA) |
> @@ -1346,7 +1472,7 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid,
>
> if (smmu->features & ARM_SMMU_FEAT_STALLS &&
> !master->stall_enabled)
> - dst->data[1] |= cpu_to_le64(STRTAB_STE_1_S1STALLD);
> + target.data[1] |= cpu_to_le64(STRTAB_STE_1_S1STALLD);
>
> val |= (cd_table->cdtab_dma & STRTAB_STE_0_S1CTXPTR_MASK) |
> FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_S1_TRANS) |
> @@ -1355,8 +1481,7 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid,
> }
>
> if (s2_cfg) {
> - BUG_ON(ste_live);
> - dst->data[2] = cpu_to_le64(
> + target.data[2] = cpu_to_le64(
> FIELD_PREP(STRTAB_STE_2_S2VMID, s2_cfg->vmid) |
> FIELD_PREP(STRTAB_STE_2_VTCR, s2_cfg->vtcr) |
> #ifdef __BIG_ENDIAN
> @@ -1365,23 +1490,17 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid,
> STRTAB_STE_2_S2PTW | STRTAB_STE_2_S2AA64 |
> STRTAB_STE_2_S2R);
>
> - dst->data[3] = cpu_to_le64(s2_cfg->vttbr & STRTAB_STE_3_S2TTB_MASK);
> + target.data[3] = cpu_to_le64(s2_cfg->vttbr & STRTAB_STE_3_S2TTB_MASK);
>
> val |= FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_S2_TRANS);
> }
>
> if (master->ats_enabled)
> - dst->data[1] |= cpu_to_le64(FIELD_PREP(STRTAB_STE_1_EATS,
> + target.data[1] |= cpu_to_le64(FIELD_PREP(STRTAB_STE_1_EATS,
> STRTAB_STE_1_EATS_TRANS));
>
> - arm_smmu_sync_ste_for_sid(smmu, sid);
> - /* See comment in arm_smmu_write_ctx_desc() */
> - WRITE_ONCE(dst->data[0], cpu_to_le64(val));
> - arm_smmu_sync_ste_for_sid(smmu, sid);
> -
> - /* It's likely that we'll want to use the new STE soon */
> - if (!(smmu->options & ARM_SMMU_OPT_SKIP_PREFETCH))
> - arm_smmu_cmdq_issue_cmd(smmu, &prefetch_cmd);
> + target.data[0] = cpu_to_le64(val);
> + arm_smmu_write_ste(smmu, sid, dst, &target);
> }
>
> static void arm_smmu_init_bypass_stes(struct arm_smmu_ste *strtab,
> --
> 2.42.0
>