[PATCH v5 24/27] iommu/arm-smmu-v3: Bring back SVA BTM support

Michael Shavit mshavit at google.com
Tue Mar 19 10:07:01 PDT 2024


On Tue, Mar 5, 2024 at 7:44 AM Jason Gunthorpe <jgg at nvidia.com> wrote:
>
> BTM support is a feature where the CPU TLB invalidation can be forwarded
> to the IOMMU and also invalidate the IOTLB. For this to work the CPU and
> IOMMU ASID must be the same.
>
> Retain the prior SVA design here of keeping the ASID allocator for the
> IOMMU private to SMMU and force SVA domains to set an ASID that matches
> the CPU ASID.
>
> This requires changing the ASID assigned to a S1 domain if it happens to
> overlap with the required CPU ASID. We hold on to the CPU ASID for as long
> as the SVA iommu_domain exists, so an SVA domain conflict is not possible.
>
> With the ASID allocator now per-SMMU, we no longer have the problem that
> two per-SMMU iommu_domains would need to share a CPU ASID entry in the
> IOMMU's xarray.
>
> Use the same ASID move algorithm for the S1 domains as before, with some
> streamlining around how the xarray is used. Do not synchronize the ASIDs
> if BTM mode is not supported; just leave BTM features off everywhere.
>
> Audit all the places that touch cd->asid and think carefully about how the
> locking works when the move algorithm changes cd->asid. Use the xarray's
> internal locking during xa_alloc() instead of double locking. Add a note
> that concurrent S1 invalidation doesn't fully work.
>
> Note that this is all still dead code, ARM_SMMU_FEAT_BTM is never set.

Clearly a lot of work and thought has gone into this patch, but I have to
wonder whether this complexity is worth carrying forward for a feature that
has never been enabled... Is moving this to part 3, or dropping it
altogether, an option?
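
For context when weighing that complexity, here is the ASID allocation flow
the patch adds, condensed into one hypothetical helper that uses the same
names as the quoted hunks. This is only a simplified sketch (locking and
error unwinding are trimmed), not the patch itself:

static int sva_alloc_asid_sketch(struct arm_smmu_device *smmu,
                                 struct arm_smmu_domain *sva_domain,
                                 struct mm_struct *mm)
{
        struct arm_smmu_domain *old_s1;

        if (!(smmu->features & ARM_SMMU_FEAT_BTM))
                /* No BTM: any private IOMMU ASID will do */
                return xa_alloc(&smmu->asid_map, &sva_domain->cd.asid,
                                sva_domain,
                                XA_LIMIT(1, (1 << smmu->asid_bits) - 1),
                                GFP_KERNEL);

        /* BTM: the IOMMU ASID must equal the mm's CPU ASID */
        sva_domain->cd.asid = arm64_mm_context_get(mm);
        old_s1 = xa_store(&smmu->asid_map, sva_domain->cd.asid, sva_domain,
                          GFP_KERNEL);
        if (xa_err(old_s1))
                return xa_err(old_s1);
        if (old_s1)
                /* A S1 paging domain already holds this ASID: move it off */
                return arm_smmu_realloc_s1_domain_asid(smmu, old_s1);
        return 0;
}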

>
> Tested-by: Nicolin Chen <nicolinc at nvidia.com>
> Signed-off-by: Jason Gunthorpe <jgg at nvidia.com>
> ---
>  .../iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c   | 133 ++++++++++++++++--
>  drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c   |  15 +-
>  drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h   |   2 +-
>  3 files changed, 129 insertions(+), 21 deletions(-)
>
> diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
> index 3acd699433b7d8..0b5aeaa3a85575 100644
> --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
> +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
> @@ -15,12 +15,33 @@
>
>  static DEFINE_MUTEX(sva_lock);
>
> -static void __maybe_unused
> -arm_smmu_update_s1_domain_cd_entry(struct arm_smmu_domain *smmu_domain)
> +static int arm_smmu_realloc_s1_domain_asid(struct arm_smmu_device *smmu,
> +                                          struct arm_smmu_domain *smmu_domain)
>  {
>         struct arm_smmu_master_domain *master_domain;
> +       u32 old_asid = smmu_domain->cd.asid;
>         struct arm_smmu_cd target_cd;
>         unsigned long flags;
> +       int ret;
> +
> +       lockdep_assert_held(&smmu->asid_lock);
> +
> +       /*
> +        * FIXME: The unmap and invalidation path doesn't take any locks but
> +        * this is not fully safe. Since updating the CD tables is not atomic
> +        * there is always a hole where invalidating only one ASID of two active
> +        * ASIDs during unmap will cause the IOTLB to become stale.
> +        *
> +        * This approach is to hopefully shift the racing CPUs to the new ASID
> +        * before we start programming the HW. This increases the chance that
> +        * racing IOPTE changes will pick up an invalidation for the new ASID
> +        * and we achieve eventual consistency. For the brief period where the
> +        * old ASID is still in the CD entries it will become incoherent.
> +        */
> +       ret = xa_alloc(&smmu->asid_map, &smmu_domain->cd.asid, smmu_domain,
> +                      XA_LIMIT(1, (1 << smmu->asid_bits) - 1), GFP_KERNEL);
> +       if (ret)
> +               return ret;
>
>         spin_lock_irqsave(&smmu_domain->devices_lock, flags);
>         list_for_each_entry(master_domain, &smmu_domain->devices, devices_elm) {
> @@ -36,6 +57,10 @@ arm_smmu_update_s1_domain_cd_entry(struct arm_smmu_domain *smmu_domain)
>                                         &target_cd);
>         }
>         spin_unlock_irqrestore(&smmu_domain->devices_lock, flags);
> +
> +       /* Clean the ASID we are about to assign to a new translation */
> +       arm_smmu_tlb_inv_asid(smmu, old_asid);
> +       return 0;
>  }
>
>  static u64 page_size_to_cd(void)
> @@ -148,12 +173,12 @@ static void arm_smmu_mm_arch_invalidate_secondary_tlbs(struct mmu_notifier *mn,
>         }
>
>         if (!smmu_domain->btm_invalidation) {
> +               ioasid_t asid = READ_ONCE(smmu_domain->cd.asid);
> +
>                 if (!size)
> -                       arm_smmu_tlb_inv_asid(smmu_domain->smmu,
> -                                             smmu_domain->cd.asid);
> +                       arm_smmu_tlb_inv_asid(smmu_domain->smmu, asid);
>                 else
> -                       arm_smmu_tlb_inv_range_asid(start, size,
> -                                                   smmu_domain->cd.asid,
> +                       arm_smmu_tlb_inv_range_asid(start, size, asid,
>                                                     PAGE_SIZE, false,
>                                                     smmu_domain);
>         }
> @@ -182,6 +207,8 @@ static void arm_smmu_mm_release(struct mmu_notifier *mn, struct mm_struct *mm)
>                 cdptr = arm_smmu_get_cd_ptr(master, master_domain->ssid);
>                 if (WARN_ON(!cdptr))
>                         continue;
> +
> +               /* An SVA ASID never changes, no asid_lock required */
>                 arm_smmu_make_sva_cd(&target, master, NULL,
>                                      smmu_domain->cd.asid,
>                                      smmu_domain->btm_invalidation);
> @@ -388,6 +415,8 @@ static void arm_smmu_sva_domain_free(struct iommu_domain *domain)
>          * unnecessary invalidation.
>          */
>         arm_smmu_domain_free_id(smmu_domain);
> +       if (smmu_domain->btm_invalidation)
> +               arm64_mm_context_put(domain->mm);
>
>         /*
>          * Actual free is defered to the SRCU callback
> @@ -401,13 +430,97 @@ static const struct iommu_domain_ops arm_smmu_sva_domain_ops = {
>         .free                   = arm_smmu_sva_domain_free
>  };
>
> +static int arm_smmu_share_asid(struct arm_smmu_device *smmu,
> +                              struct arm_smmu_domain *smmu_domain,
> +                              struct mm_struct *mm)
> +{
> +       struct arm_smmu_domain *old_s1_domain;
> +       int ret;
> +
> +       /*
> +        * Notice that BTM is never currently enabled, this is all dead code.
> +        * The specification cautions:
> +        *
> +        * Note: Arm expects that SMMU stage 2 address spaces are generally
> +        * shared with their respective PE virtual machine stage 2
> +        * configuration. If broadcast invalidation is required to be avoided
> +        * for a particular SMMU stage 2 address space, Arm recommends that a
> +        * hypervisor configures the STE with a VMID that is not allocated for
> +        * virtual machine use on the PEs
> +        *
> +        * However, in Linux, both KVM and SMMU think they own the VMID pool.
> +        * Unfortunately the ARM design is problematic for Linux as we do not
> +        * currently share the S2 table with KVM. This creates a situation where
> +        * the S2 needs to have the same VMID as KVM in order to allow the guest
> +        * to use BTM, however we must still invalidate the S2 directly since it
> +        * is a different radix tree. What Linux would like is something like
> +        * ASET for the STE to disable BTM only for the S2.
> +        *
> +        * Arguably in a system with BTM the driver should prefer to use a S1
> +        * table in all cases except when explicitly asked to create a nesting
> +        * parent. Then it should use the VMID of KVM to enable BTM in the
> +        * guest. We cannot optimize away the resulting double invalidation of
> +        * the S2 :( Or we simply ignore BTM entirely as we are doing now.
> +        */
> +       if (!(smmu_domain->smmu->features & ARM_SMMU_FEAT_BTM))
> +               return xa_alloc(&smmu->asid_map, &smmu_domain->cd.asid,
> +                               smmu_domain,
> +                               XA_LIMIT(1, (1 << smmu->asid_bits) - 1),
> +                               GFP_KERNEL);
> +
> +       /* At this point the caller ensures we have a mmget() */
> +       smmu_domain->cd.asid = arm64_mm_context_get(mm);
> +
> +       mutex_lock(&smmu->asid_lock);
> +       old_s1_domain = xa_store(&smmu->asid_map, smmu_domain->cd.asid,
> +                                smmu_domain, GFP_KERNEL);
> +       if (xa_err(old_s1_domain)) {
> +               ret = xa_err(old_s1_domain);
> +               goto out_put_asid;
> +       }
> +
> +       /*
> +        * In BTM mode the CPU ASID and the IOMMU ASID have to be the same.
> +        * Unfortunately we run separate allocators for this and the IOMMU
> +        * ASID can already have been assigned to a S1 domain. SVA domains
> +        * always align to their CPU ASIDs. In this case we change
> +        * the S1 domain's ASID, update the CD entry and flush the caches.
> +        *
> +        * This is a bit tricky, all the places writing to a S1 CD, reading the
> +        * S1 ASID, or doing xa_erase must hold the asid_lock or xa_lock to
> +        * avoid IOTLB incoherence.
> +        */
> +       if (old_s1_domain) {
> +               if (WARN_ON(old_s1_domain->domain.type == IOMMU_DOMAIN_SVA)) {
> +                       ret = -EINVAL;
> +                       goto out_restore_s1;
> +               }
> +               ret = arm_smmu_realloc_s1_domain_asid(smmu, old_s1_domain);
> +               if (ret)
> +                       goto out_restore_s1;
> +       }
> +
> +       smmu_domain->btm_invalidation = true;
> +
> +       ret = 0;
> +       goto out_unlock;
> +
> +out_restore_s1:
> +       xa_store(&smmu->asid_map, smmu_domain->cd.asid, old_s1_domain,
> +                GFP_KERNEL);
> +out_put_asid:
> +       arm64_mm_context_put(mm);
> +out_unlock:
> +       mutex_unlock(&smmu->asid_lock);
> +       return ret;
> +}
> +
>  struct iommu_domain *arm_smmu_sva_domain_alloc(struct device *dev,
>                                                struct mm_struct *mm)
>  {
>         struct arm_smmu_master *master = dev_iommu_priv_get(dev);
>         struct arm_smmu_device *smmu = master->smmu;
>         struct arm_smmu_domain *smmu_domain;
> -       u32 asid;
>         int ret;
>
>         smmu_domain = arm_smmu_domain_alloc();
> @@ -418,12 +531,10 @@ struct iommu_domain *arm_smmu_sva_domain_alloc(struct device *dev,
>         smmu_domain->domain.ops = &arm_smmu_sva_domain_ops;
>         smmu_domain->smmu = smmu;
>
> -       ret = xa_alloc(&smmu->asid_map, &asid, smmu_domain,
> -                      XA_LIMIT(1, (1 << smmu->asid_bits) - 1), GFP_KERNEL);
> +       ret = arm_smmu_share_asid(smmu, smmu_domain, mm);
>         if (ret)
>                 goto err_free;
>
> -       smmu_domain->cd.asid = asid;
>         smmu_domain->mmu_notifier.ops = &arm_smmu_mmu_notifier_ops;
>         ret = mmu_notifier_register(&smmu_domain->mmu_notifier, mm);
>         if (ret)
> @@ -433,6 +544,8 @@ struct iommu_domain *arm_smmu_sva_domain_alloc(struct device *dev,
>
>  err_asid:
>         arm_smmu_domain_free_id(smmu_domain);
> +       if (smmu_domain->btm_invalidation)
> +               arm64_mm_context_put(mm);
>  err_free:
>         kfree(smmu_domain);
>         return ERR_PTR(ret);
> diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> index 1a72dd63e0ca14..01737a0fc82828 100644
> --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> @@ -1314,6 +1314,8 @@ void arm_smmu_make_s1_cd(struct arm_smmu_cd *target,
>         typeof(&pgtbl_cfg->arm_lpae_s1_cfg.tcr) tcr =
>                 &pgtbl_cfg->arm_lpae_s1_cfg.tcr;
>
> +       lockdep_assert_held(&master->smmu->asid_lock);
> +
>         memset(target, 0, sizeof(*target));
>
>         target->data[0] = cpu_to_le64(
> @@ -2089,7 +2091,7 @@ static void arm_smmu_tlb_inv_context(void *cookie)
>          * careful, 007.
>          */
>         if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) {
> -               arm_smmu_tlb_inv_asid(smmu, smmu_domain->cd.asid);
> +               arm_smmu_tlb_inv_asid(smmu, READ_ONCE(smmu_domain->cd.asid));
>         } else {
>                 cmd.opcode      = CMDQ_OP_TLBI_S12_VMALL;
>                 cmd.tlbi.vmid   = smmu_domain->s2_cfg.vmid;
> @@ -2334,17 +2336,10 @@ static void arm_smmu_domain_free(struct iommu_domain *domain)
>  static int arm_smmu_domain_finalise_s1(struct arm_smmu_device *smmu,
>                                        struct arm_smmu_domain *smmu_domain)
>  {
> -       int ret;
> -       u32 asid;
>         struct arm_smmu_ctx_desc *cd = &smmu_domain->cd;
>
> -       /* Prevent SVA from modifying the ASID until it is written to the CD */
> -       mutex_lock(&smmu->asid_lock);
> -       ret = xa_alloc(&smmu->asid_map, &asid, smmu_domain,
> -                      XA_LIMIT(1, (1 << smmu->asid_bits) - 1), GFP_KERNEL);
> -       cd->asid        = (u16)asid;
> -       mutex_unlock(&smmu->asid_lock);
> -       return ret;
> +       return xa_alloc(&smmu->asid_map, &cd->asid, smmu_domain,
> +                       XA_LIMIT(1, (1 << smmu->asid_bits) - 1), GFP_KERNEL);
>  }
>
>  static int arm_smmu_domain_finalise_s2(struct arm_smmu_device *smmu,
> diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
> index 6becdbae905598..a23593f1830106 100644
> --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
> +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
> @@ -586,7 +586,7 @@ struct arm_smmu_strtab_l1_desc {
>  };
>
>  struct arm_smmu_ctx_desc {
> -       u16                             asid;
> +       u32                             asid;
>  };
>
>  struct arm_smmu_l1_ctx_desc {
> --
> 2.43.2
>
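
One more note for anyone comparing this against the resulting tree: the
lockless invalidation paths now snapshot the ASID with READ_ONCE(), since
the move algorithm can rewrite a S1 paging domain's cd->asid (under
asid_lock) while an invalidation is already running. A minimal sketch of
the pattern, condensed from the arm_smmu_tlb_inv_context() hunk above and
not the exact code:

static void tlb_inv_s1_context_sketch(struct arm_smmu_domain *smmu_domain)
{
        /* Snapshot once; the ASID mover may change cd->asid concurrently */
        u32 asid = READ_ONCE(smmu_domain->cd.asid);

        arm_smmu_tlb_inv_asid(smmu_domain->smmu, asid);
}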


