[PATCH v7 06/15] iommupt: Add unmap_pages op
Samiullah Khawaja
skhawaja at google.com
Mon Oct 27 10:03:24 PDT 2025
On Thu, Oct 23, 2025 at 11:21 AM Jason Gunthorpe <jgg at nvidia.com> wrote:
>
> unmap_pages removes mappings, and any fully contained interior tables, from
> the given range. This follows the now-standard iommu_domain API definition
> where it does not split up larger page sizes into smaller ones. The caller
> must perform unmap only on ranges created by map, or it must have otherwise
> determined safe cut points (e.g. iommufd/vfio use iova_to_phys to scan for
> them).
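
A few drive-by notes inline, none blocking. First, a quick illustration
of that contract for anyone reading along (my sketch, not from the
series; the base constant is made up and the return values follow my
reading of the walker below):

    #include <linux/iommu.h>
    #include <linux/sizes.h>

    /* dom is assumed to hold a single 2MB IOPTE at base. */
    static void unmap_contract_example(struct iommu_domain *dom,
                                       unsigned long base)
    {
            size_t n;

            /* Start of the IOPTE: the whole entry is removed and the
             * larger length is returned, even though only 4K was
             * requested. */
            n = iommu_unmap(dom, base, SZ_4K);          /* n == SZ_2M */

            /* Start in the middle of the IOPTE: rejected, nothing is
             * unmapped. */
            n = iommu_unmap(dom, base + SZ_4K, SZ_4K);  /* n == 0 */
    }
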
>
> Future work will provide a 'cut' operation which explicitly does the page
> size split when the HW can support it.
>
> unmap is implemented with a recursive descent of the tree. If the caller
> provides a VA range that spans an entire table item then the table memory
> can be freed as well.
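
The descent is easy to get lost in, so here is the tiny userspace model
I used to convince myself of the free-the-interior-table logic (toy
code, two fixed levels, nothing like the real pts machinery; in the
patch the freed table goes on unmap->free_list instead of free() so it
outlives the IOTLB flush):

    #include <stdio.h>
    #include <stdlib.h>

    #define SLOTS 16

    struct table {
            struct table *child[SLOTS];   /* lower-level table, or NULL */
            int present[SLOTS];           /* leaf mapping present? */
    };

    /* Unmap pages [start, end); each slot at this level spans
     * pages_per_slot pages beginning at base. Returns pages unmapped. */
    static unsigned unmap_range(struct table *t, unsigned start,
                                unsigned end, unsigned pages_per_slot,
                                unsigned base)
    {
            unsigned unmapped = 0;

            for (unsigned i = 0; i < SLOTS; i++) {
                    unsigned lo = base + i * pages_per_slot;
                    unsigned hi = lo + pages_per_slot;

                    if (hi <= start || lo >= end)
                            continue;
                    if (t->child[i]) {
                            unmapped += unmap_range(t->child[i], start, end,
                                                    pages_per_slot / SLOTS,
                                                    lo);
                            /* Range fully covers this table item, so the
                             * interior table can be freed as well. */
                            if (start <= lo && end >= hi) {
                                    free(t->child[i]);
                                    t->child[i] = NULL;
                            }
                    } else if (t->present[i]) {
                            t->present[i] = 0;
                            unmapped += pages_per_slot;
                    }
            }
            return unmapped;
    }

    int main(void)
    {
            struct table *top = calloc(1, sizeof(*top));
            struct table *lower = calloc(1, sizeof(*lower));

            top->child[0] = lower;
            for (int i = 0; i < SLOTS; i++)
                    lower->present[i] = 1;  /* 16 pages mapped */

            /* Covers every slot of 'lower', so 'lower' is freed too. */
            printf("unmapped %u pages\n",
                   unmap_range(top, 0, SLOTS, SLOTS, 0));
            free(top);
            return 0;
    }
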
>
> If an entire table item can be freed then this version will also check the
> leaf-only level of the tree to ensure that all entries are present,
> generating -EINVAL if any are not. Many of the existing drivers don't do
> this extra check.
>
> This version sits under the iommu_domain_ops as unmap_pages() but does not
> require the external page size calculation. The implementation is actually
> unmap_range() and can do arbitrary ranges, internally handling all the
> validation and supporting any arrangement of page sizes. A future series
> can optimize __iommu_unmap() to take advantage of this.
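
For context, the core loop this could eventually simplify looks roughly
like the below (paraphrased from drivers/iommu/iommu.c from memory, not
verbatim):

    /* Today: the core computes a driver-supported page size on every
     * iteration before calling the op. */
    while (unmapped < size) {
            size_t count;
            size_t pgsize = iommu_pgsize(domain, iova + unmapped,
                                         iova + unmapped,
                                         size - unmapped, &count);
            size_t n = ops->unmap_pages(domain, iova + unmapped,
                                        pgsize, count, iotlb_gather);
            if (!n)
                    break;
            unmapped += n;
    }

    /* With a range-based implementation like this one, the whole loop
     * could in principle collapse to a single call. */
    unmapped = ops->unmap_pages(domain, iova, size, 1, iotlb_gather);
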
>
> Freed page table memory is batched up in the gather and will be freed in
> the driver's iotlb_sync() callback after the IOTLB flush completes.
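
And for completeness, the driver-side half of that lifecycle as I
understand it (my sketch; my_hw_flush_iotlb() is hypothetical,
iommu_put_pages_list() is the real helper from iommu-pages.h):

    static void my_iotlb_sync(struct iommu_domain *domain,
                              struct iommu_iotlb_gather *gather)
    {
            /* Make sure no walker can still be caching the freed
             * tables... */
            my_hw_flush_iotlb(domain, gather->start, gather->end);
            /* ...then it is safe to free what unmap_pages() queued. */
            iommu_put_pages_list(&gather->freelist);
    }
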
>
> Tested-by: Alejandro Jimenez <alejandro.j.jimenez at oracle.com>
> Reviewed-by: Kevin Tian <kevin.tian at intel.com>
> Signed-off-by: Jason Gunthorpe <jgg at nvidia.com>
> ---
> drivers/iommu/generic_pt/iommu_pt.h | 156 ++++++++++++++++++++++++++++
> include/linux/generic_pt/iommu.h | 10 +-
> 2 files changed, 164 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/iommu/generic_pt/iommu_pt.h b/drivers/iommu/generic_pt/iommu_pt.h
> index 5ff1b887928a46..e3d1b272723db0 100644
> --- a/drivers/iommu/generic_pt/iommu_pt.h
> +++ b/drivers/iommu/generic_pt/iommu_pt.h
> @@ -14,6 +14,29 @@
> #include <linux/export.h>
> #include <linux/iommu.h>
> #include "../iommu-pages.h"
> +#include <linux/cleanup.h>
> +#include <linux/dma-mapping.h>
> +
> +static void gather_range_pages(struct iommu_iotlb_gather *iotlb_gather,
> + struct pt_iommu *iommu_table, pt_vaddr_t iova,
> + pt_vaddr_t len,
> + struct iommu_pages_list *free_list)
> +{
> + struct pt_common *common = common_from_iommu(iommu_table);
> +
> + if (pt_feature(common, PT_FEAT_FLUSH_RANGE_NO_GAPS) &&
> + iommu_iotlb_gather_is_disjoint(iotlb_gather, iova, len)) {
> + iommu_iotlb_sync(&iommu_table->domain, iotlb_gather);
> + /*
> + * Note that the sync frees the gather's free list, so we must
> + * not have any pages on that list that are covered by iova/len
> + */
> + } else if (pt_feature(common, PT_FEAT_FLUSH_RANGE)) {
> + iommu_iotlb_gather_add_range(iotlb_gather, iova, len);
> + }
> +
> + iommu_pages_list_splice(free_list, &iotlb_gather->freelist);
> +}
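
The NO_GAPS branch took me a second. A toy model of the disjointness
test, matching my reading of iommu_iotlb_gather_is_disjoint() in
linux/iommu.h (the real helper takes the gather and iova/size, this is
just the predicate):

    /* A gather accumulating [gstart, gend] treats a new range as
     * disjoint when it neither overlaps nor abuts the accumulated
     * span. Syncing first keeps a NO_GAPS flush from having to span
     * unrelated IOVA in between. */
    static bool gather_is_disjoint(unsigned long gstart, unsigned long gend,
                                   unsigned long iova, size_t size)
    {
            return iova + size < gstart || iova > gend + 1;
    }

The free list ordering then falls out: the sync frees whatever was
queued before, and only afterwards is this range's free_list spliced in.
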
>
> #define DOMAIN_NS(op) CONCATENATE(CONCATENATE(pt_iommu_, PTPFX), op)
>
> @@ -164,6 +187,139 @@ static inline struct pt_table_p *table_alloc_top(struct pt_common *common,
> log2_to_int(pt_top_memsize_lg2(common, top_of_table)));
> }
>
> +struct pt_unmap_args {
> + struct iommu_pages_list free_list;
> + pt_vaddr_t unmapped;
> +};
> +
> +static __maybe_unused int __unmap_range(struct pt_range *range, void *arg,
> + unsigned int level,
> + struct pt_table_p *table)
> +{
> + struct pt_state pts = pt_init(range, level, table);
> + struct pt_unmap_args *unmap = arg;
> + unsigned int num_oas = 0;
> + unsigned int start_index;
> + int ret = 0;
> +
> + _pt_iter_first(&pts);
> + start_index = pts.index;
> + pts.type = pt_load_entry_raw(&pts);
> + /*
> + * The starting index may land in the middle of a contiguous entry:
> + *
> + * The IOMMU API does not require drivers to support unmapping parts of
> + * large pages. Long ago VFIO would try to split maps but the current
> + * version never does.
> + *
> + * Instead when unmap reaches a partial unmap of the start of a large
> + * IOPTE it should remove the entire IOPTE and return that size to the
> + * caller.
> + */
> + if (pts.type == PT_ENTRY_OA) {
> + if (log2_mod(range->va, pt_entry_oa_lg2sz(&pts)))
> + return -EINVAL;
> + /* Micro optimization */
> + goto start_oa;
> + }
> +
> + do {
> + if (pts.type != PT_ENTRY_OA) {
> + bool fully_covered;
> +
> + if (pts.type != PT_ENTRY_TABLE) {
> + ret = -EINVAL;
> + break;
> + }
> +
> + if (pts.index != start_index)
> + pt_index_to_va(&pts);
> + pts.table_lower = pt_table_ptr(&pts);
> +
> + fully_covered = pt_entry_fully_covered(
> + &pts, pt_table_item_lg2sz(&pts));
> +
> + ret = pt_descend(&pts, arg, __unmap_range);
> + if (ret)
> + break;
> +
> + /*
> + * If the unmapping range fully covers the table then we
> + * can free it as well. The clear is delayed until we
> + * succeed in clearing the lower table levels.
> + */
> + if (fully_covered) {
> + iommu_pages_list_add(&unmap->free_list,
> + pts.table_lower);
> + pt_clear_entries(&pts, ilog2(1));
> + }
> + pts.index++;
> + } else {
> + unsigned int num_contig_lg2;
> +start_oa:
> + /*
> + * If the caller requested an end that falls within a
> + * single entry then the entire entry is unmapped and
> + * the length returned will be larger than requested.
> + */
> + num_contig_lg2 = pt_entry_num_contig_lg2(&pts);
> + pt_clear_entries(&pts, num_contig_lg2);
> + num_oas += log2_to_int(num_contig_lg2);
> + pts.index += log2_to_int(num_contig_lg2);
> + }
> + if (pts.index >= pts.end_index)
> + break;
> + pts.type = pt_load_entry_raw(&pts);
> + } while (true);
> +
> + unmap->unmapped += log2_mul(num_oas, pt_table_item_lg2sz(&pts));
> + return ret;
> +}
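
The num_oas accounting maybe deserves a worked example (values mine):

    /* A contiguous OA entry built from 16 adjacent items
     * (num_contig_lg2 == 4) with a 64K item size:
     *
     *   pt_clear_entries(&pts, 4);    clears all 16 items at once
     *   num_oas += 1 << 4;            num_oas == 16
     *   pts.index += 1 << 4;          skip past the whole entry
     *
     * so unmapped grows by 16 << 16 == 1M, which is how a partial
     * request over a large IOPTE returns more than was asked for. */
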
> +
> +/**
> + * unmap_pages() - Make a range of IOVA empty/not present
> + * @domain: Domain to manipulate
> + * @iova: IO virtual address to start
> + * @pgsize: Length of each page
> + * @pgcount: Length of the range in pgsize units starting from @iova
> + * @iotlb_gather: Gather struct that must be flushed on return
> + *
> + * unmap_pages() will remove a translation created by map_pages(). It cannot
> + * subdivide a mapping created by map_pages(), so it should be called with IOVA
> + * ranges that match those passed to map_pages(). The IOVA range can aggregate
> + * contiguous map_pages() calls so long as no individual range is split.
> + *
> + * Context: The caller must hold a write range lock that includes
> + * the whole range.
> + *
> + * Returns: Number of bytes of VA unmapped. iova + res will be the point at
> + * which unmapping stopped.
> + */
> +size_t DOMAIN_NS(unmap_pages)(struct iommu_domain *domain, unsigned long iova,
> + size_t pgsize, size_t pgcount,
> + struct iommu_iotlb_gather *iotlb_gather)
> +{
> + struct pt_iommu *iommu_table =
> + container_of(domain, struct pt_iommu, domain);
> + struct pt_unmap_args unmap = { .free_list = IOMMU_PAGES_LIST_INIT(
> + unmap.free_list) };
> + pt_vaddr_t len = pgsize * pgcount;
> + struct pt_range range;
> + int ret;
> +
> + ret = make_range(common_from_iommu(iommu_table), &range, iova, len);
> + if (ret)
> + return 0;
> +
> + pt_walk_range(&range, __unmap_range, &unmap);
> +
> + gather_range_pages(iotlb_gather, iommu_table, iova, len,
> + &unmap.free_list);
> +
> + return unmap.unmapped;
> +}
> +EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(unmap_pages), "GENERIC_PT_IOMMU");
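
For anyone tracing how this gets invoked: the core pattern that ends up
here is roughly (simplified from iommu_unmap() in the core, from
memory):

    struct iommu_iotlb_gather gather;
    size_t unmapped;

    iommu_iotlb_gather_init(&gather);
    unmapped = __iommu_unmap(domain, iova, size, &gather);
    iommu_iotlb_sync(domain, &gather);  /* flush, then free the queued
                                         * table pages */
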
> +
> static void NS(get_info)(struct pt_iommu *iommu_table,
> struct pt_iommu_info *info)
> {
> diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h
> index 5622856e199881..ceb6bc9cea37cd 100644
> --- a/include/linux/generic_pt/iommu.h
> +++ b/include/linux/generic_pt/iommu.h
> @@ -9,6 +9,7 @@
> #include <linux/iommu.h>
> #include <linux/mm_types.h>
>
> +struct iommu_iotlb_gather;
> struct pt_iommu_ops;
>
> /**
> @@ -119,6 +120,10 @@ struct pt_iommu_cfg {
> #define IOMMU_PROTOTYPES(fmt) \
> phys_addr_t pt_iommu_##fmt##_iova_to_phys(struct iommu_domain *domain, \
> dma_addr_t iova); \
> + size_t pt_iommu_##fmt##_unmap_pages( \
> + struct iommu_domain *domain, unsigned long iova, \
> + size_t pgsize, size_t pgcount, \
> + struct iommu_iotlb_gather *iotlb_gather); \
> int pt_iommu_##fmt##_init(struct pt_iommu_##fmt *table, \
> const struct pt_iommu_##fmt##_cfg *cfg, \
> gfp_t gfp); \
> @@ -135,8 +140,9 @@ struct pt_iommu_cfg {
> * A driver uses IOMMU_PT_DOMAIN_OPS to populate the iommu_domain_ops for the
> * iommu_pt
> */
> -#define IOMMU_PT_DOMAIN_OPS(fmt) \
> - .iova_to_phys = &pt_iommu_##fmt##_iova_to_phys,
> +#define IOMMU_PT_DOMAIN_OPS(fmt) \
> + .iova_to_phys = &pt_iommu_##fmt##_iova_to_phys, \
> + .unmap_pages = &pt_iommu_##fmt##_unmap_pages
>
> /*
> * The driver should setup its domain struct like
> --
> 2.43.0
>
>
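One last note for driver writers: with the macro change above, the glue
in a driver ends up looking roughly like this (I'm borrowing the
series' amdv1 format name; my_iotlb_sync is hypothetical):

    static const struct iommu_domain_ops my_domain_ops = {
            IOMMU_PT_DOMAIN_OPS(amdv1),
            .iotlb_sync = my_iotlb_sync,
            /* .map_pages, .attach_dev, etc. */
    };
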
Reviewed-by: Samiullah Khawaja <skhawaja at google.com>