[PATCH 04/19] iommu/arm-smmu-v3: Make STE programming independent of the callers

Michael Shavit mshavit at google.com
Tue Jan 2 00:13:28 PST 2024


On Tue, Dec 19, 2023 at 9:42 PM Michael Shavit <mshavit at google.com> wrote:
...
> +       if (hweight8(entry_qwords_used_diff) > 1) {
> +               /*
> +                * If transitioning to the target entry with a single qword
> +                * write isn't possible, then we must first transition to an
> +                * intermediate entry. The intermediate entry may either be an
> +                * entry that melds bits of the target entry into the current
> +                * entry without disrupting the hardware, or a breaking entry if
> +                * a hitless transition to the target is impossible.
> +                */
> +
> +               /*
> +                * Compute a staging entry that has all the bits currently
> +                * unused by HW set to their target values, such that committing
> +                * it to the entry table wouldn't disrupt the hardware.
> +                */
> +               memcpy(staging_entry, cur, writer->entry_length * sizeof(*cur));
> +               writer->ops.set_unused_bits(staging_entry, target);
> +
> +               entry_qwords_used_diff =
> +                       writer->ops.get_used_qword_diff_indexes(staging_entry,
> +                                                               target);
> +               if (hweight8(entry_qwords_used_diff) > 1) {
> +                       /*
> +                        * More than 1 qword is mismatched between the staging
> +                        * and target entry. A hitless transition to the target
> +                        * entry is not possible. Set the staging entry to be
> +                        * equal to the target entry, apart from the V bit's
> +                        * qword. As long as the V bit is cleared first then
> +                        * writes to the subsequent qwords will not further
> +                        * disrupt the hardware.
> +                        */
> +                       memcpy(staging_entry, target, writer->entry_length * sizeof(*target));
> +                       staging_entry[0] &= ~writer->v_bit;
> +                       /*
> +                        * After committing the staging entry, only the 0th qword
> +                        * will differ from the target.
> +                        */
> +                       entry_qwords_used_diff = 1;
> +               }
> +
> +               /*
> +                * Commit the staging entry. Note that the iteration order
> +                * matters, as we may be committing a breaking entry in the
> +                * non-hitless case. The 0th qword which holds the valid bit
> +                * must be written first in that case.
> +                */
> +               for (i = 0; i != writer->entry_length; i++)
> +                       WRITE_ONCE(cur[i], staging_entry[i]);
> +               writer->ops.sync_entry(writer);

Realized while replying to your latest email that this is wrong (and
the unit test as well!). It's not enough to just write the 0th qword
first when committing a breaking entry; we must also sync after that
0th qword write before writing the remaining qwords.
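
Roughly, I think the commit step needs to look something like the
sketch below (untested, written against the helpers in the patch
quoted underneath; the is_breaking local is only for illustration and
isn't something the patch currently computes):

        /*
         * Sketch: when the staging entry is a breaking (V=0) entry, commit
         * and sync qword 0 on its own first, so the HW never observes any
         * of the other qwords changing while the old entry is still valid.
         */
        if (is_breaking) {
                WRITE_ONCE(cur[0], staging_entry[0]);
                writer->ops.sync_entry(writer);
        }
        for (i = 0; i != writer->entry_length; i++)
                WRITE_ONCE(cur[i], staging_entry[i]);
        writer->ops.sync_entry(writer);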

On Tue, Dec 19, 2023 at 9:42 PM Michael Shavit <mshavit at google.com> wrote:
>
> On Mon, Dec 18, 2023 at 8:35 PM Michael Shavit <mshavit at google.com> wrote:
> >
> > On Sun, Dec 17, 2023 at 9:03 PM Jason Gunthorpe <jgg at nvidia.com> wrote:
> > >
> > > On Sat, Dec 16, 2023 at 04:26:48AM +0800, Michael Shavit wrote:
> > >
> > > > Ok, I took a proper stab at trying to unroll the loop on the github
> > > > version of this patch (v3+)
> > > > As you suspected, it's not easy to re-use the unrolled version for
> > > > both STE and CD writing as we'd have to pass in callbacks for syncing
> > > > the STE/CD and recomputing arm_smmu_{get_ste/cd}_used.
> > >
> > > Yes, that is why I structured it as an iterator
> >
> > On second thought, perhaps defining a helper class implementing
> > entry_sync() and entry_get_used_bits() might not be so bad?
> > It's a little bit more verbose, but avoids duplication of the
> > complicated parts.
>
> Gave this a try so that we have something more concrete to compare.
> Consider the following two patches as alternatives to this patch and
> patch "Make CD programming use arm_smmu_write_entry_step" from the
> next part of the patch series.
>
> STE programming patch
> ---
> diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> index b120d83668...1e17bff37f 100644
> --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> @@ -971,6 +971,174 @@
>         arm_smmu_cmdq_issue_cmd_with_sync(smmu, &cmd);
>  }
>
> +struct arm_smmu_entry_writer;
> +
> +/**
> + * struct arm_smmu_entry_writer_ops - Helper class for writing a CD/STE entry.
> + * @sync_entry: sync entry to the hardware after writing to it.
> + * @set_unused_bits: Make bits of the entry that aren't in use by the hardware
> + *                   equal to the target's bits.
> + * @get_used_qword_diff_indexes: Compute the list of qwords in the entry that
> + *                               are incorrect compared to the target,
> + *                               considering only the used bits in the target.
> + *                               The set bits in the return value represent the
> + *                               indexes of those qwords.
> + */
> +struct arm_smmu_entry_writer_ops {
> +       void (*sync_entry)(struct arm_smmu_entry_writer *);
> +       void (*set_unused_bits)(__le64 *entry, const __le64 *target);
> +       u8 (*get_used_qword_diff_indexes)(__le64 *entry, const __le64 *target);
> +};
> +
> +struct arm_smmu_entry_writer {
> +       struct arm_smmu_entry_writer_ops ops;
> +       __le64 v_bit;
> +       unsigned int entry_length;
> +};
> +
> +static void arm_smmu_entry_set_unused_bits(__le64 *entry, const __le64 *target,
> +                                          const __le64 *entry_used,
> +                                          unsigned int length)
> +{
> +       int i = 0;
> +
> +       for (i = 0; i < length; i++)
> +               entry[i] = (entry[i] & entry_used[i]) |
> +                          (target[i] & ~entry_used[i]);
> +}
> +
> +static u8 arm_smmu_entry_used_qword_diff_indexes(__le64 *entry,
> +                                                const __le64 *target,
> +                                                const __le64 *target_used,
> +                                                unsigned int length)
> +{
> +       u8 qword_diff_indexes = 0;
> +       int i = 0;
> +
> +       for (i = 0; i < length; i++) {
> +               if ((entry[i] & target_used[i]) != (target[i] & target_used[i]))
> +                       qword_diff_indexes |= 1 << i;
> +       }
> +       return qword_diff_indexes;
> +}
> +
> +/*
> + * Update the STE/CD to the target configuration. The transition from the current
> + * entry to the target entry takes place over multiple steps that attempt to make
> + * the transition hitless if possible. This function takes care not to create a
> + * situation where the HW can perceive a corrupted entry. HW is only required to
> + * have a 64 bit atomicity with stores from the CPU, while entries are many 64
> + * bit values big.
> + *
> + * The algorithm works by evolving the entry toward the target in a series of
> + * steps. Each step synchronizes with the HW so that the HW can not see an entry
> + * torn across two steps. During each step the HW can observe a torn entry that
> + * has any combination of the step's old/new 64 bit words. The algorithm
> + * objective is for the HW behavior to always be one of current behavior, V=0,
> + * or new behavior.
> + *
> + * In the most general case we can make any update in three steps:
> + *  - Disrupting the entry (V=0)
> + *  - Fill now unused bits, all bits except V
> + *  - Make valid (V=1), single 64 bit store
> + *
> + * However this disrupts the HW while it is happening. There are several
> + * interesting cases where a STE/CD can be updated without disturbing the HW
> + * because only a small number of bits are changing (S1DSS, CONFIG, etc) or
> + * because the used bits don't intersect. We can detect this by calculating how
> + * many 64 bit values need update after adjusting the unused bits and skip the
> + * V=0 process. This relies on the IGNORED behavior described in the
> + * specification
> + */
> +static void arm_smmu_write_entry(struct arm_smmu_entry_writer *writer,
> +                                __le64 *cur, const __le64 *target,
> +                                __le64 *staging_entry)
> +{
> +       bool cleanup_sync_required = false;
> +       u8 entry_qwords_used_diff = 0;
> +       int i = 0;
> +
> +       entry_qwords_used_diff =
> +               writer->ops.get_used_qword_diff_indexes(cur, target);
> +       if (WARN_ON_ONCE(entry_qwords_used_diff == 0))
> +               return;
> +
> +       if (hweight8(entry_qwords_used_diff) > 1) {
> +               /*
> +                * If transitioning to the target entry with a single qword
> +                * write isn't possible, then we must first transition to an
> +                * intermediate entry. The intermediate entry may either be an
> +                * entry that melds bits of the target entry into the current
> +                * entry without disrupting the hardware, or a breaking entry if
> +                * a hitless transition to the target is impossible.
> +                */
> +
> +               /*
> +                * Compute a staging entry that has all the bits currently
> +                * unused by HW set to their target values, such that committing
> +                * it to the entry table wouldn't disrupt the hardware.
> +                */
> +               memcpy(staging_entry, cur, writer->entry_length * sizeof(*cur));
> +               writer->ops.set_unused_bits(staging_entry, target);
> +
> +               entry_qwords_used_diff =
> +                       writer->ops.get_used_qword_diff_indexes(staging_entry,
> +                                                               target);
> +               if (hweight8(entry_qwords_used_diff) > 1) {
> +                       /*
> +                        * More than 1 qword is mismatched between the staging
> +                        * and target entry. A hitless transition to the target
> +                        * entry is not possible. Set the staging entry to be
> +                        * equal to the target entry, apart from the V bit's
> +                        * qword. As long as the V bit is cleared first then
> +                        * writes to the subsequent qwords will not further
> +                        * disrupt the hardware.
> +                        */
> +                       memcpy(staging_entry, target, writer->entry_length * sizeof(*target));
> +                       staging_entry[0] &= ~writer->v_bit;
> +                       /*
> +                        * After committing the staging entry, only the 0th qword
> +                        * will differ from the target.
> +                        */
> +                       entry_qwords_used_diff = 1;
> +               }
> +
> +               /*
> +                * Commit the staging entry. Note that the iteration order
> +                * matters, as we may be committing a breaking entry in the
> +                * non-hitless case. The 0th qword which holds the valid bit
> +                * must be written first in that case.
> +                */
> +               for (i = 0; i != writer->entry_length; i++)
> +                       WRITE_ONCE(cur[i], staging_entry[i]);
> +               writer->ops.sync_entry(writer);
> +       }
> +
> +       /*
> +        * It's now possible to switch to the target configuration with a write
> +        * to a single qword. Make that switch now.
> +        */
> +       i = ffs(entry_qwords_used_diff) - 1;
> +       WRITE_ONCE(cur[i], target[i]);
> +       writer->ops.sync_entry(writer);
> +
> +       /*
> +        * Some of the bits set under the previous configuration but unused
> +        * under the target configuration might still be set. Clear them as
> +        * well. Technically this isn't necessary but it brings the entry to
> +        * the full target state, so if there are bugs in the mask calculation
> +        * this will obscure them.
> +        */
> +       for (i = 0; i != writer->entry_length; i++) {
> +               if (cur[i] != target[i]) {
> +                       WRITE_ONCE(cur[i], target[i]);
> +                       cleanup_sync_required = true;
> +               }
> +       }
> +       if (cleanup_sync_required)
> +               writer->ops.sync_entry(writer);
> +}
> +
>  static void arm_smmu_sync_cd(struct arm_smmu_master *master,
>                              int ssid, bool leaf)
>  {
> @@ -1248,37 +1416,142 @@
>         arm_smmu_cmdq_issue_cmd_with_sync(smmu, &cmd);
>  }
>
> +/*
> + * Based on the value of ent report which bits of the STE the HW will access. It
> + * would be nice if this was complete according to the spec, but minimally it
> + * has to capture the bits this driver uses.
> + */
> +static void arm_smmu_get_ste_used(const __le64 *ent,
> +                                 struct arm_smmu_ste *used_bits)
> +{
> +       memset(used_bits, 0, sizeof(*used_bits));
> +
> +       used_bits->data[0] = cpu_to_le64(STRTAB_STE_0_V);
> +       if (!(ent[0] & cpu_to_le64(STRTAB_STE_0_V)))
> +               return;
> +
> +       /*
> +        * If S1 is enabled S1DSS is valid, see 13.5 Summary of
> +        * attribute/permission configuration fields for the SHCFG behavior.
> +        */
> +       if (FIELD_GET(STRTAB_STE_0_CFG, le64_to_cpu(ent[0])) & 1 &&
> +           FIELD_GET(STRTAB_STE_1_S1DSS, le64_to_cpu(ent[1])) ==
> +                   STRTAB_STE_1_S1DSS_BYPASS)
> +               used_bits->data[1] |= cpu_to_le64(STRTAB_STE_1_SHCFG);
> +
> +       used_bits->data[0] |= cpu_to_le64(STRTAB_STE_0_CFG);
> +       switch (FIELD_GET(STRTAB_STE_0_CFG, le64_to_cpu(ent[0]))) {
> +       case STRTAB_STE_0_CFG_ABORT:
> +               break;
> +       case STRTAB_STE_0_CFG_BYPASS:
> +               used_bits->data[1] |= cpu_to_le64(STRTAB_STE_1_SHCFG);
> +               break;
> +       case STRTAB_STE_0_CFG_S1_TRANS:
> +               used_bits->data[0] |= cpu_to_le64(STRTAB_STE_0_S1FMT |
> +                                                 STRTAB_STE_0_S1CTXPTR_MASK |
> +                                                 STRTAB_STE_0_S1CDMAX);
> +               used_bits->data[1] |=
> +                       cpu_to_le64(STRTAB_STE_1_S1DSS | STRTAB_STE_1_S1CIR |
> +                                   STRTAB_STE_1_S1COR | STRTAB_STE_1_S1CSH |
> +                                   STRTAB_STE_1_S1STALLD | STRTAB_STE_1_STRW);
> +               used_bits->data[1] |= cpu_to_le64(STRTAB_STE_1_EATS);
> +               break;
> +       case STRTAB_STE_0_CFG_S2_TRANS:
> +               used_bits->data[1] |=
> +                       cpu_to_le64(STRTAB_STE_1_EATS | STRTAB_STE_1_SHCFG);
> +               used_bits->data[2] |=
> +                       cpu_to_le64(STRTAB_STE_2_S2VMID | STRTAB_STE_2_VTCR |
> +                                   STRTAB_STE_2_S2AA64 | STRTAB_STE_2_S2ENDI |
> +                                   STRTAB_STE_2_S2PTW | STRTAB_STE_2_S2R);
> +               used_bits->data[3] |= cpu_to_le64(STRTAB_STE_3_S2TTB_MASK);
> +               break;
> +
> +       default:
> +               memset(used_bits, 0xFF, sizeof(*used_bits));
> +               WARN_ON(true);
> +       }
> +}
> +
> +struct arm_smmu_ste_writer {
> +       struct arm_smmu_entry_writer writer;
> +       struct arm_smmu_device *smmu;
> +       u32 sid;
> +};
> +
> +static void arm_smmu_ste_set_unused_bits(__le64 *entry, const __le64 *target)
> +{
> +       struct arm_smmu_ste entry_used;
> +       arm_smmu_get_ste_used(entry, &entry_used);
> +
> +       arm_smmu_entry_set_unused_bits(entry, target, entry_used.data,
> +                                      ARRAY_SIZE(entry_used.data));
> +}
> +
> +static u8 arm_smmu_ste_used_qword_diff_indexes(__le64 *cur,
> +                                              const __le64 *target)
> +{
> +       struct arm_smmu_ste target_used;
> +
> +       arm_smmu_get_ste_used(target, &target_used);
> +       return arm_smmu_entry_used_qword_diff_indexes(
> +               cur, target, target_used.data, ARRAY_SIZE(target_used.data));
> +}
> +
> +static void arm_smmu_ste_writer_sync_entry(struct arm_smmu_entry_writer *writer)
> +{
> +       struct arm_smmu_ste_writer *ste_writer =
> +               container_of(writer, struct arm_smmu_ste_writer, writer);
> +
> +       arm_smmu_sync_ste_for_sid(ste_writer->smmu, ste_writer->sid);
> +}
> +
> +static const struct arm_smmu_entry_writer_ops arm_smmu_ste_writer_ops = {
> +       .sync_entry = arm_smmu_ste_writer_sync_entry,
> +       .set_unused_bits = arm_smmu_ste_set_unused_bits,
> +       .get_used_qword_diff_indexes = arm_smmu_ste_used_qword_diff_indexes,
> +};
> +
> +static void arm_smmu_write_ste(struct arm_smmu_device *smmu, u32 sid,
> +                              struct arm_smmu_ste *ste,
> +                              const struct arm_smmu_ste *target)
> +{
> +       struct arm_smmu_ste preallocated_staging_ste = {0};
> +       struct arm_smmu_ste_writer ste_writer = {
> +               .writer = {
> +                       .ops = arm_smmu_ste_writer_ops,
> +                       .v_bit = cpu_to_le64(STRTAB_STE_0_V),
> +                       .entry_length = ARRAY_SIZE(ste->data),
> +               },
> +               .smmu = smmu,
> +               .sid = sid,
> +       };
> +
> +       arm_smmu_write_entry(&ste_writer.writer,
> +                              ste->data,
> +                              target->data,
> +                              preallocated_staging_ste.data);
> +
> +       /* It's likely that we'll want to use the new STE soon */
> +       if (!(smmu->options & ARM_SMMU_OPT_SKIP_PREFETCH)) {
> +               struct arm_smmu_cmdq_ent
> +                       prefetch_cmd = { .opcode = CMDQ_OP_PREFETCH_CFG,
> +                                        .prefetch = {
> +                                                .sid = sid,
> +                                        } };
> +
> +               arm_smmu_cmdq_issue_cmd(smmu, &prefetch_cmd);
> +       }
> +}
> +
>  static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid,
>                                       struct arm_smmu_ste *dst)
>  {
> -       /*
> -        * This is hideously complicated, but we only really care about
> -        * three cases at the moment:
> -        *
> -        * 1. Invalid (all zero) -> bypass/fault (init)
> -        * 2. Bypass/fault -> translation/bypass (attach)
> -        * 3. Translation/bypass -> bypass/fault (detach)
> -        *
> -        * Given that we can't update the STE atomically and the SMMU
> -        * doesn't read the thing in a defined order, that leaves us
> -        * with the following maintenance requirements:
> -        *
> -        * 1. Update Config, return (init time STEs aren't live)
> -        * 2. Write everything apart from dword 0, sync, write dword 0, sync
> -        * 3. Update Config, sync
> -        */
> -       u64 val = le64_to_cpu(dst->data[0]);
> -       bool ste_live = false;
> +       u64 val;
>         struct arm_smmu_device *smmu = master->smmu;
>         struct arm_smmu_ctx_desc_cfg *cd_table = NULL;
>         struct arm_smmu_s2_cfg *s2_cfg = NULL;
>         struct arm_smmu_domain *smmu_domain = master->domain;
> -       struct arm_smmu_cmdq_ent prefetch_cmd = {
> -               .opcode         = CMDQ_OP_PREFETCH_CFG,
> -               .prefetch       = {
> -                       .sid    = sid,
> -               },
> -       };
> +       struct arm_smmu_ste target = {};
>
>         if (smmu_domain) {
>                 switch (smmu_domain->stage) {
> @@ -1293,22 +1566,6 @@
>                 }
>         }
>
> -       if (val & STRTAB_STE_0_V) {
> -               switch (FIELD_GET(STRTAB_STE_0_CFG, val)) {
> -               case STRTAB_STE_0_CFG_BYPASS:
> -                       break;
> -               case STRTAB_STE_0_CFG_S1_TRANS:
> -               case STRTAB_STE_0_CFG_S2_TRANS:
> -                       ste_live = true;
> -                       break;
> -               case STRTAB_STE_0_CFG_ABORT:
> -                       BUG_ON(!disable_bypass);
> -                       break;
> -               default:
> -                       BUG(); /* STE corruption */
> -               }
> -       }
> -
>         /* Nuke the existing STE_0 value, as we're going to rewrite it */
>         val = STRTAB_STE_0_V;
>
> @@ -1319,16 +1576,11 @@
>                 else
>                         val |= FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_BYPASS);
>
> -               dst->data[0] = cpu_to_le64(val);
> -               dst->data[1] = cpu_to_le64(FIELD_PREP(STRTAB_STE_1_SHCFG,
> +               target.data[0] = cpu_to_le64(val);
> +               target.data[1] = cpu_to_le64(FIELD_PREP(STRTAB_STE_1_SHCFG,
>                                                 STRTAB_STE_1_SHCFG_INCOMING));
> -               dst->data[2] = 0; /* Nuke the VMID */
> -               /*
> -                * The SMMU can perform negative caching, so we must sync
> -                * the STE regardless of whether the old value was live.
> -                */
> -               if (smmu)
> -                       arm_smmu_sync_ste_for_sid(smmu, sid);
> +               target.data[2] = 0; /* Nuke the VMID */
> +               arm_smmu_write_ste(smmu, sid, dst, &target);
>                 return;
>         }
>
> @@ -1336,8 +1588,7 @@
>                 u64 strw = smmu->features & ARM_SMMU_FEAT_E2H ?
>                         STRTAB_STE_1_STRW_EL2 : STRTAB_STE_1_STRW_NSEL1;
>
> -               BUG_ON(ste_live);
> -               dst->data[1] = cpu_to_le64(
> +               target.data[1] = cpu_to_le64(
>                          FIELD_PREP(STRTAB_STE_1_S1DSS, STRTAB_STE_1_S1DSS_SSID0) |
>                          FIELD_PREP(STRTAB_STE_1_S1CIR, STRTAB_STE_1_S1C_CACHE_WBRA) |
>                          FIELD_PREP(STRTAB_STE_1_S1COR, STRTAB_STE_1_S1C_CACHE_WBRA) |
> @@ -1346,7 +1597,7 @@
>
>                 if (smmu->features & ARM_SMMU_FEAT_STALLS &&
>                     !master->stall_enabled)
> -                       dst->data[1] |= cpu_to_le64(STRTAB_STE_1_S1STALLD);
> +                       target.data[1] |= cpu_to_le64(STRTAB_STE_1_S1STALLD);
>
>                 val |= (cd_table->cdtab_dma & STRTAB_STE_0_S1CTXPTR_MASK) |
>                         FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_S1_TRANS) |
> @@ -1355,8 +1606,7 @@
>         }
>
>         if (s2_cfg) {
> -               BUG_ON(ste_live);
> -               dst->data[2] = cpu_to_le64(
> +               target.data[2] = cpu_to_le64(
>                          FIELD_PREP(STRTAB_STE_2_S2VMID, s2_cfg->vmid) |
>                          FIELD_PREP(STRTAB_STE_2_VTCR, s2_cfg->vtcr) |
>  #ifdef __BIG_ENDIAN
> @@ -1365,23 +1615,17 @@
>                          STRTAB_STE_2_S2PTW | STRTAB_STE_2_S2AA64 |
>                          STRTAB_STE_2_S2R);
>
> -               dst->data[3] = cpu_to_le64(s2_cfg->vttbr & STRTAB_STE_3_S2TTB_MASK);
> +               target.data[3] = cpu_to_le64(s2_cfg->vttbr & STRTAB_STE_3_S2TTB_MASK);
>
>                 val |= FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_S2_TRANS);
>         }
>
>         if (master->ats_enabled)
> -               dst->data[1] |= cpu_to_le64(FIELD_PREP(STRTAB_STE_1_EATS,
> +               target.data[1] |= cpu_to_le64(FIELD_PREP(STRTAB_STE_1_EATS,
>                                                  STRTAB_STE_1_EATS_TRANS));
>
> -       arm_smmu_sync_ste_for_sid(smmu, sid);
> -       /* See comment in arm_smmu_write_ctx_desc() */
> -       WRITE_ONCE(dst->data[0], cpu_to_le64(val));
> -       arm_smmu_sync_ste_for_sid(smmu, sid);
> -
> -       /* It's likely that we'll want to use the new STE soon */
> -       if (!(smmu->options & ARM_SMMU_OPT_SKIP_PREFETCH))
> -               arm_smmu_cmdq_issue_cmd(smmu, &prefetch_cmd);
> +       target.data[0] = cpu_to_le64(val);
> +       arm_smmu_write_ste(smmu, sid, dst, &target);
>  }
>
>  static void arm_smmu_init_bypass_stes(struct arm_smmu_ste *strtab,
>
>
>
>
>
> ---
> CD programming
> ---
> diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> index 55703a5d62...c849b26c43 100644
> --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> @@ -1219,6 +1219,86 @@
>         return &l1_desc->l2ptr[idx];
>  }
>
> +static void arm_smmu_get_cd_used(const __le64 *ent,
> +                                struct arm_smmu_cd *used_bits)
> +{
> +       memset(used_bits, 0, sizeof(*used_bits));
> +
> +       used_bits->data[0] = cpu_to_le64(CTXDESC_CD_0_V);
> +       if (!(ent[0] & cpu_to_le64(CTXDESC_CD_0_V)))
> +               return;
> +       memset(used_bits, 0xFF, sizeof(*used_bits));
> +
> +       /* EPD0 means T0SZ/TG0/IR0/OR0/SH0/TTB0 are IGNORED */
> +       if (ent[0] & cpu_to_le64(CTXDESC_CD_0_TCR_EPD0)) {
> +               used_bits->data[0] &= ~cpu_to_le64(
> +                       CTXDESC_CD_0_TCR_T0SZ | CTXDESC_CD_0_TCR_TG0 |
> +                       CTXDESC_CD_0_TCR_IRGN0 | CTXDESC_CD_0_TCR_ORGN0 |
> +                       CTXDESC_CD_0_TCR_SH0);
> +               used_bits->data[1] &= ~cpu_to_le64(CTXDESC_CD_1_TTB0_MASK);
> +       }
> +}
> +
> +struct arm_smmu_cd_writer {
> +       struct arm_smmu_entry_writer writer;
> +       struct arm_smmu_master *master;
> +       int ssid;
> +};
> +
> +static void arm_smmu_cd_set_unused_bits(__le64 *entry, const __le64 *target)
> +{
> +       struct arm_smmu_cd entry_used;
> +       arm_smmu_get_cd_used(entry, &entry_used);
> +
> +       arm_smmu_entry_set_unused_bits(entry, target, entry_used.data,
> +                                      ARRAY_SIZE(entry_used.data));
> +}
> +
> +static u8 arm_smmu_cd_used_qword_diff_indexes(__le64 *cur,
> +                                              const __le64 *target)
> +{
> +       struct arm_smmu_cd target_used;
> +
> +       arm_smmu_get_cd_used(target, &target_used);
> +       return arm_smmu_entry_used_qword_diff_indexes(
> +               cur, target, target_used.data, ARRAY_SIZE(target_used.data));
> +}
> +
> +static void arm_smmu_cd_writer_sync_entry(struct arm_smmu_entry_writer *writer)
> +{
> +       struct arm_smmu_cd_writer *cd_writer =
> +               container_of(writer, struct arm_smmu_cd_writer, writer);
> +
> +       arm_smmu_sync_cd(cd_writer->master, cd_writer->ssid, true);
> +}
> +
> +static const struct arm_smmu_entry_writer_ops arm_smmu_cd_writer_ops = {
> +       .sync_entry = arm_smmu_cd_writer_sync_entry,
> +       .set_unused_bits = arm_smmu_cd_set_unused_bits,
> +       .get_used_qword_diff_indexes = arm_smmu_cd_used_qword_diff_indexes,
> +};
> +
> +static void arm_smmu_write_cd_entry(struct arm_smmu_master *master, int ssid,
> +                                   struct arm_smmu_cd *cdptr,
> +                                   const struct arm_smmu_cd *target)
> +{
> +       struct arm_smmu_cd preallocated_staging_cd = {0};
> +       struct arm_smmu_cd_writer cd_writer = {
> +               .writer = {
> +                       .ops = arm_smmu_cd_writer_ops,
> +                       .v_bit = cpu_to_le64(CTXDESC_CD_0_V),
> +                       .entry_length = ARRAY_SIZE(cdptr->data),
> +               },
> +               .master = master,
> +               .ssid = ssid,
> +       };
> +
> +       arm_smmu_write_entry(&cd_writer.writer,
> +                              cdptr->data,
> +                              target->data,
> +                              preallocated_staging_cd.data);
> +}
> +
>  int arm_smmu_write_ctx_desc(struct arm_smmu_master *master, int ssid,
>                             struct arm_smmu_ctx_desc *cd)
>  {
> @@ -1235,16 +1315,19 @@
>          */
>         u64 val;
>         bool cd_live;
> -       struct arm_smmu_cd *cdptr;
> +       struct arm_smmu_cd target;
> +       struct arm_smmu_cd *cdptr = &target;
> +       struct arm_smmu_cd *cd_table_entry;
>         struct arm_smmu_ctx_desc_cfg *cd_table = &master->cd_table;
>
>         if (WARN_ON(ssid >= (1 << cd_table->s1cdmax)))
>                 return -E2BIG;
>
> -       cdptr = arm_smmu_get_cd_ptr(master, ssid);
> -       if (!cdptr)
> +       cd_table_entry = arm_smmu_get_cd_ptr(master, ssid);
> +       if (!cd_table_entry)
>                 return -ENOMEM;
>
> +       target = *cd_table_entry;
>         val = le64_to_cpu(cdptr->data[0]);
>         cd_live = !!(val & CTXDESC_CD_0_V);
>
> @@ -1264,13 +1347,6 @@
>                 cdptr->data[2] = 0;
>                 cdptr->data[3] = cpu_to_le64(cd->mair);
>
> -               /*
> -                * STE may be live, and the SMMU might read dwords of this CD in any
> -                * order. Ensure that it observes valid values before reading
> -                * V=1.
> -                */
> -               arm_smmu_sync_cd(master, ssid, true);
> -
>                 val = cd->tcr |
>  #ifdef __BIG_ENDIAN
>                         CTXDESC_CD_0_ENDI |
> @@ -1284,18 +1360,8 @@
>                 if (cd_table->stall_enabled)
>                         val |= CTXDESC_CD_0_S;
>         }
> -
> -       /*
> -        * The SMMU accesses 64-bit values atomically. See IHI0070Ca 3.21.3
> -        * "Configuration structures and configuration invalidation completion"
> -        *
> -        *   The size of single-copy atomic reads made by the SMMU is
> -        *   IMPLEMENTATION DEFINED but must be at least 64 bits. Any single
> -        *   field within an aligned 64-bit span of a structure can be altered
> -        *   without first making the structure invalid.
> -        */
> -       WRITE_ONCE(cdptr->data[0], cpu_to_le64(val));
> -       arm_smmu_sync_cd(master, ssid, true);
> +       cdptr->data[0] = cpu_to_le64(val);
> +       arm_smmu_write_cd_entry(master, ssid, cd_table_entry, &target);
>         return 0;
>  }


