[PATCH v7 06/15] iommupt: Add unmap_pages op

Samiullah Khawaja skhawaja at google.com
Mon Oct 27 10:03:24 PDT 2025


On Thu, Oct 23, 2025 at 11:21 AM Jason Gunthorpe <jgg at nvidia.com> wrote:
>
> unmap_pages removes mappings, and any fully contained interior tables,
> from the given range. This follows the now-standard iommu_domain API
> definition where it does not split larger page sizes into smaller ones.
> The caller must unmap only ranges created by map, or it must have
> otherwise determined safe cut points (e.g. iommufd/vfio use iova_to_phys
> to scan for them).
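>
> To make the cut-point idea concrete, here is a minimal hedged sketch
> (not part of this patch; find_safe_cut_point() is a hypothetical helper
> and stepping by PAGE_SIZE is an illustrative assumption): a caller can
> walk iommu_iova_to_phys() until physical contiguity ends, which
> approximates a boundary between mappings:
>
>   static dma_addr_t find_safe_cut_point(struct iommu_domain *domain,
>                                         dma_addr_t iova, size_t max_len)
>   {
>           phys_addr_t phys = iommu_iova_to_phys(domain, iova);
>           size_t len = PAGE_SIZE;
>
>           /* Extend while the IOVA stays physically contiguous */
>           while (len < max_len &&
>                  iommu_iova_to_phys(domain, iova + len) == phys + len)
>                   len += PAGE_SIZE;
>
>           /* First IOVA past the contiguous run; a safe place to cut */
>           return iova + len;
>   }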
>
> Future work will provide 'cut', which explicitly performs the page size
> split when the HW can support it.
>
> unmap is implemented with a recursive descent of the tree. If the caller
> provides a VA range that spans an entire table item then the table memory
> can be freed as well.
>
> If an entire table item can be freed then this version will also check
> the leaf-only level of the tree to ensure that all entries are present,
> generating -EINVAL if they are not. Many of the existing drivers don't do
> this extra check.
>
> This version sits under the iommu_domain_ops as unmap_pages() but does not
> require the external page size calculation. The implementation is actually
> unmap_range() and can do arbitrary ranges, internally handling all the
> validation and supporting any arrangement of page sizes. A future series
> can optimize __iommu_unmap() to take advantage of this.
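>
> As a hedged illustration of that future optimization (not implemented
> here): because the internal unmap_range() validates arbitrary ranges
> itself, __iommu_unmap() could in principle submit the whole span in a
> single call rather than looping over page-size chunks, e.g.:
>
>   /* illustrative only: pgsize * pgcount spans the entire range */
>   unmapped = ops->unmap_pages(domain, iova, size, 1, &iotlb_gather);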
>
> Freed page table memory is batched up in the gather and will be freed in
> the driver's iotlb_sync() callback after the IOTLB flush completes.
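>
> For context, a minimal sketch of the driver side; fmt_flush_iotlb_range()
> is a hypothetical HW invalidation helper, while iommu_put_pages_list()
> is the real free routine. The gathered table pages are only returned
> after the invalidation completes, so the HW can never walk freed memory:
>
>   static void fmt_iotlb_sync(struct iommu_domain *domain,
>                              struct iommu_iotlb_gather *gather)
>   {
>           /* hypothetical: invalidate the IOTLB over the gathered range */
>           fmt_flush_iotlb_range(domain, gather->start,
>                                 gather->end - gather->start + 1);
>
>           /* now it is safe to free the unlinked page table memory */
>           iommu_put_pages_list(&gather->freelist);
>   }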
>
> Tested-by: Alejandro Jimenez <alejandro.j.jimenez at oracle.com>
> Reviewed-by: Kevin Tian <kevin.tian at intel.com>
> Signed-off-by: Jason Gunthorpe <jgg at nvidia.com>
> ---
>  drivers/iommu/generic_pt/iommu_pt.h | 156 ++++++++++++++++++++++++++++
>  include/linux/generic_pt/iommu.h    |  10 +-
>  2 files changed, 164 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/iommu/generic_pt/iommu_pt.h b/drivers/iommu/generic_pt/iommu_pt.h
> index 5ff1b887928a46..e3d1b272723db0 100644
> --- a/drivers/iommu/generic_pt/iommu_pt.h
> +++ b/drivers/iommu/generic_pt/iommu_pt.h
> @@ -14,6 +14,29 @@
>  #include <linux/export.h>
>  #include <linux/iommu.h>
>  #include "../iommu-pages.h"
> +#include <linux/cleanup.h>
> +#include <linux/dma-mapping.h>
> +
> +static void gather_range_pages(struct iommu_iotlb_gather *iotlb_gather,
> +                              struct pt_iommu *iommu_table, pt_vaddr_t iova,
> +                              pt_vaddr_t len,
> +                              struct iommu_pages_list *free_list)
> +{
> +       struct pt_common *common = common_from_iommu(iommu_table);
> +
> +       if (pt_feature(common, PT_FEAT_FLUSH_RANGE_NO_GAPS) &&
> +           iommu_iotlb_gather_is_disjoint(iotlb_gather, iova, len)) {
> +               iommu_iotlb_sync(&iommu_table->domain, iotlb_gather);
> +               /*
> +                * Note that the sync frees the gather's free list, so we must
> +                * not have any pages on that list that are covered by iova/len
> +                */
> +       } else if (pt_feature(common, PT_FEAT_FLUSH_RANGE)) {
> +               iommu_iotlb_gather_add_range(iotlb_gather, iova, len);
> +       }
> +
> +       iommu_pages_list_splice(free_list, &iotlb_gather->freelist);
> +}
>
>  #define DOMAIN_NS(op) CONCATENATE(CONCATENATE(pt_iommu_, PTPFX), op)
>
> @@ -164,6 +187,139 @@ static inline struct pt_table_p *table_alloc_top(struct pt_common *common,
>                 log2_to_int(pt_top_memsize_lg2(common, top_of_table)));
>  }
>
> +struct pt_unmap_args {
> +       struct iommu_pages_list free_list;
> +       pt_vaddr_t unmapped;
> +};
> +
> +static __maybe_unused int __unmap_range(struct pt_range *range, void *arg,
> +                                       unsigned int level,
> +                                       struct pt_table_p *table)
> +{
> +       struct pt_state pts = pt_init(range, level, table);
> +       struct pt_unmap_args *unmap = arg;
> +       unsigned int num_oas = 0;
> +       unsigned int start_index;
> +       int ret = 0;
> +
> +       _pt_iter_first(&pts);
> +       start_index = pts.index;
> +       pts.type = pt_load_entry_raw(&pts);
> +       /*
> +        * A starting index is in the middle of a contiguous entry
> +        *
> +        * The IOMMU API does not require drivers to support unmapping parts of
> +        * large pages. Long ago VFIO would try to split maps but the current
> +        * version never does.
> +        *
> +        * Instead, when unmap reaches a partial unmap of the start of a large
> +        * IOPTE, it should remove the entire IOPTE and return that size to the
> +        * caller.
> +        */
> +       if (pts.type == PT_ENTRY_OA) {
> +               if (log2_mod(range->va, pt_entry_oa_lg2sz(&pts)))
> +                       return -EINVAL;
> +               /* Micro optimization */
> +               goto start_oa;
> +       }
> +
> +       do {
> +               if (pts.type != PT_ENTRY_OA) {
> +                       bool fully_covered;
> +
> +                       if (pts.type != PT_ENTRY_TABLE) {
> +                               ret = -EINVAL;
> +                               break;
> +                       }
> +
> +                       if (pts.index != start_index)
> +                               pt_index_to_va(&pts);
> +                       pts.table_lower = pt_table_ptr(&pts);
> +
> +                       fully_covered = pt_entry_fully_covered(
> +                               &pts, pt_table_item_lg2sz(&pts));
> +
> +                       ret = pt_descend(&pts, arg, __unmap_range);
> +                       if (ret)
> +                               break;
> +
> +                       /*
> +                        * If the unmapping range fully covers the table then we
> +                        * can free it as well. The clear is delayed until we
> +                        * succeed in clearing the lower table levels.
> +                        */
> +                       if (fully_covered) {
> +                               iommu_pages_list_add(&unmap->free_list,
> +                                                    pts.table_lower);
> +                               pt_clear_entries(&pts, ilog2(1));
> +                       }
> +                       pts.index++;
> +               } else {
> +                       unsigned int num_contig_lg2;
> +start_oa:
> +                       /*
> +                        * If the caller requested a last address that falls within a
> +                        * single entry then the entire entry is unmapped and
> +                        * the length returned will be larger than requested.
> +                        */
> +                       num_contig_lg2 = pt_entry_num_contig_lg2(&pts);
> +                       pt_clear_entries(&pts, num_contig_lg2);
> +                       num_oas += log2_to_int(num_contig_lg2);
> +                       pts.index += log2_to_int(num_contig_lg2);
> +               }
> +               if (pts.index >= pts.end_index)
> +                       break;
> +               pts.type = pt_load_entry_raw(&pts);
> +       } while (true);
> +
> +       unmap->unmapped += log2_mul(num_oas, pt_table_item_lg2sz(&pts));
> +       return ret;
> +}
> +
> +/**
> + * unmap_pages() - Make a range of IOVA empty/not present
> + * @domain: Domain to manipulate
> + * @iova: IO virtual address to start
> + * @pgsize: Length of each page
> + * @pgcount: Length of the range in pgsize units starting from @iova
> + * @iotlb_gather: Gather struct that must be flushed on return
> + *
> + * unmap_pages() will remove a translation created by map_pages(). It cannot
> + * subdivide a mapping created by map_pages(), so it should be called with IOVA
> + * ranges that match those passed to map_pages(). The IOVA range can aggregate
> + * contiguous map_pages() calls so long as no individual range is split.
> + *
> + * Context: The caller must hold a write range lock that includes
> + * the whole range.
> + *
> + * Returns: Number of bytes of VA unmapped. iova + res will be the point
> + * where unmapping stopped.
> + */
> +size_t DOMAIN_NS(unmap_pages)(struct iommu_domain *domain, unsigned long iova,
> +                             size_t pgsize, size_t pgcount,
> +                             struct iommu_iotlb_gather *iotlb_gather)
> +{
> +       struct pt_iommu *iommu_table =
> +               container_of(domain, struct pt_iommu, domain);
> +       struct pt_unmap_args unmap = { .free_list = IOMMU_PAGES_LIST_INIT(
> +                                              unmap.free_list) };
> +       pt_vaddr_t len = pgsize * pgcount;
> +       struct pt_range range;
> +       int ret;
> +
> +       ret = make_range(common_from_iommu(iommu_table), &range, iova, len);
> +       if (ret)
> +               return 0;
> +
> +       pt_walk_range(&range, __unmap_range, &unmap);
> +
> +       gather_range_pages(iotlb_gather, iommu_table, iova, len,
> +                          &unmap.free_list);
> +
> +       return unmap.unmapped;
> +}
> +EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(unmap_pages), "GENERIC_PT_IOMMU");
> +
>  static void NS(get_info)(struct pt_iommu *iommu_table,
>                          struct pt_iommu_info *info)
>  {
> diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h
> index 5622856e199881..ceb6bc9cea37cd 100644
> --- a/include/linux/generic_pt/iommu.h
> +++ b/include/linux/generic_pt/iommu.h
> @@ -9,6 +9,7 @@
>  #include <linux/iommu.h>
>  #include <linux/mm_types.h>
>
> +struct iommu_iotlb_gather;
>  struct pt_iommu_ops;
>
>  /**
> @@ -119,6 +120,10 @@ struct pt_iommu_cfg {
>  #define IOMMU_PROTOTYPES(fmt)                                                  \
>         phys_addr_t pt_iommu_##fmt##_iova_to_phys(struct iommu_domain *domain, \
>                                                   dma_addr_t iova);            \
> +       size_t pt_iommu_##fmt##_unmap_pages(                                   \
> +               struct iommu_domain *domain, unsigned long iova,               \
> +               size_t pgsize, size_t pgcount,                                 \
> +               struct iommu_iotlb_gather *iotlb_gather);                      \
>         int pt_iommu_##fmt##_init(struct pt_iommu_##fmt *table,                \
>                                   const struct pt_iommu_##fmt##_cfg *cfg,      \
>                                   gfp_t gfp);                                  \
> @@ -135,8 +140,9 @@ struct pt_iommu_cfg {
>   * A driver uses IOMMU_PT_DOMAIN_OPS to populate the iommu_domain_ops for the
>   * iommu_pt
>   */
> -#define IOMMU_PT_DOMAIN_OPS(fmt) \
> -       .iova_to_phys = &pt_iommu_##fmt##_iova_to_phys,
> +#define IOMMU_PT_DOMAIN_OPS(fmt)                        \
> +       .iova_to_phys = &pt_iommu_##fmt##_iova_to_phys, \
> +       .unmap_pages = &pt_iommu_##fmt##_unmap_pages
>
>  /*
>   * The driver should setup its domain struct like
> --
> 2.43.0
>
>

Reviewed-by: Samiullah Khawaja <skhawaja at google.com>


