[PATCH v7 03/15] iommupt: Add the basic structure of the iommu implementation
Samiullah Khawaja
skhawaja at google.com
Mon Oct 27 09:40:48 PDT 2025
On Thu, Oct 23, 2025 at 11:21 AM Jason Gunthorpe <jgg at nvidia.com> wrote:
>
> The existing IOMMU page table implementations duplicate all of the working
> algorithms for each format. By using the generic page table API a single C
> version of the IOMMU algorithms can be created and re-used for all of the
> different formats used in the drivers. The implementation will provide a
> single C version of the iommu domain operations: iova_to_phys, map, unmap,
> and read_and_clear_dirty.
>
> Further, adding new algorithms and techniques becomes easy to do across
> the entire fleet of drivers and formats.
>
> The C functions are drop-in compatible with the existing iommu_domain_ops
> using the IOMMU_PT_DOMAIN_OPS() macro. Each per-format implementation
> compilation unit will produce exported symbols following the pattern
> pt_iommu_FMT_map_pages(), which the macro directly maps to the
> iommu_domain_ops members. This avoids the additional function pointer
> indirection that io-pgtable has.
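The IOMMU_PT_DOMAIN_OPS() macro itself lands later in the series, but for
readers following along, a minimal sketch of the wiring the commit message
describes might look like this (the expansion shown here is illustrative,
not the macro's actual definition):

    /* Illustrative sketch: direct calls, no extra indirection layer */
    #define IOMMU_PT_DOMAIN_OPS(fmt)                          \
            .iova_to_phys = &pt_iommu_##fmt##_iova_to_phys,   \
            .map_pages = &pt_iommu_##fmt##_map_pages,         \
            .unmap_pages = &pt_iommu_##fmt##_unmap_pages

    static const struct iommu_domain_ops amdv1_domain_ops = {
            IOMMU_PT_DOMAIN_OPS(amdv1),
            /* driver-specific ops follow */
    };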
>
> The top level struct used by the drivers is pt_iommu_table_FMT. It
> contains the other structs to allow container_of() to move between the
> driver, iommu page table, generic page table, and generic format layers.
>
> struct pt_iommu_table_amdv1 {
>         struct pt_iommu {
>                 struct iommu_domain domain;
>         } iommu;
>         struct pt_amdv1 {
>                 struct pt_common common;
>         } amdpt;
> };
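Since every layer is physically nested, container_of() can walk back out
from the innermost generic struct; a minimal sketch, assuming the amdv1
format above (the helper name is illustrative):

    static inline struct pt_iommu_table_amdv1 *
    amdv1_from_common(struct pt_common *common)
    {
            /* pt_common sits inside amdpt, which sits inside the top struct */
            return container_of(common, struct pt_iommu_table_amdv1,
                                amdpt.common);
    }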
>
> The driver is expected to union the pt_iommu_table_FMT with its own
> existing domain struct:
>
> struct driver_domain {
>         union {
>                 struct iommu_domain domain;
>                 struct pt_iommu_table_amdv1 amdv1;
>         };
> };
> PT_IOMMU_CHECK_DOMAIN(struct driver_domain, amdv1.iommu, domain);
>
> This creates an alias so that 'domain' does not have to be renamed in a
> lot of existing driver code.
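That is, a driver's existing cast helper keeps working unchanged, e.g.
(hypothetical helper):

    static struct driver_domain *to_driver_domain(struct iommu_domain *domain)
    {
            /* &d->domain and &d->amdv1.iommu.domain are the same address */
            return container_of(domain, struct driver_domain, domain);
    }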
>
> This allows all the layers to access all the necessary functions to
> implement their different roles with no change to any of the existing
> iommu core code.
>
> Implement the basic starting point: pt_iommu_init(), get_info() and
> deinit().
>
> Tested-by: Alejandro Jimenez <alejandro.j.jimenez at oracle.com>
> Reviewed-by: Kevin Tian <kevin.tian at intel.com>
> Signed-off-by: Jason Gunthorpe <jgg at nvidia.com>
> ---
>  drivers/iommu/generic_pt/Kconfig              |  13 +
>  drivers/iommu/generic_pt/fmt/iommu_template.h |  39 +++
>  drivers/iommu/generic_pt/iommu_pt.h           | 259 ++++++++++++++++++
>  include/linux/generic_pt/iommu.h              | 150 ++++++++++
>  4 files changed, 461 insertions(+)
> create mode 100644 drivers/iommu/generic_pt/fmt/iommu_template.h
> create mode 100644 drivers/iommu/generic_pt/iommu_pt.h
> create mode 100644 include/linux/generic_pt/iommu.h
>
> diff --git a/drivers/iommu/generic_pt/Kconfig b/drivers/iommu/generic_pt/Kconfig
> index fb0f431ddba0a8..a81dfdd72ca016 100644
> --- a/drivers/iommu/generic_pt/Kconfig
> +++ b/drivers/iommu/generic_pt/Kconfig
> @@ -17,4 +17,17 @@ config DEBUG_GENERIC_PT
>            kernels.
> 
>            The kunit tests require this to be enabled to get full coverage.
> +
> +config IOMMU_PT
> +        tristate "IOMMU Page Tables"
> +        select IOMMU_API
> +        depends on IOMMU_SUPPORT
> +        depends on GENERIC_PT
> +        help
> +          Generic library for building IOMMU page tables
> +
> +          IOMMU_PT provides an implementation of the page table operations
> +          related to struct iommu_domain using GENERIC_PT. It provides a single
> +          implementation of the page table operations that can be shared by
> +          multiple drivers.
>  endif
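For context, a driver adopting this library would pull it in from its own
Kconfig entry, roughly like the following (hypothetical driver symbol; the
per-format config symbols appear later in the series):

    config SOME_IOMMU_DRIVER
            tristate "Some IOMMU driver"
            depends on IOMMU_SUPPORT
            select IOMMU_PT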
> diff --git a/drivers/iommu/generic_pt/fmt/iommu_template.h b/drivers/iommu/generic_pt/fmt/iommu_template.h
> new file mode 100644
> index 00000000000000..5b631bc07cbc16
> --- /dev/null
> +++ b/drivers/iommu/generic_pt/fmt/iommu_template.h
> @@ -0,0 +1,39 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/*
> + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
> + *
> + * Template to build the iommu module and kunit from the format and
> + * implementation headers.
> + *
> + * The format should have:
> + * #define PT_FMT <name>
> + * #define PT_SUPPORTED_FEATURES (BIT(PT_FEAT_xx) | BIT(PT_FEAT_yy))
> + * And optionally:
> + * #define PT_FORCE_ENABLED_FEATURES ..
> + * #define PT_FMT_VARIANT <suffix>
> + */
> +#include <linux/args.h>
> +#include <linux/stringify.h>
> +
> +#ifdef PT_FMT_VARIANT
> +#define PTPFX_RAW \
> +        CONCATENATE(CONCATENATE(PT_FMT, _), PT_FMT_VARIANT)
> +#else
> +#define PTPFX_RAW PT_FMT
> +#endif
> +
> +#define PTPFX CONCATENATE(PTPFX_RAW, _)
> +
> +#define _PT_FMT_H PT_FMT.h
> +#define PT_FMT_H __stringify(_PT_FMT_H)
> +
> +#define _PT_DEFS_H CONCATENATE(defs_, _PT_FMT_H)
> +#define PT_DEFS_H __stringify(_PT_DEFS_H)
> +
> +#include <linux/generic_pt/common.h>
> +#include PT_DEFS_H
> +#include "../pt_defs.h"
> +#include PT_FMT_H
> +#include "../pt_common.h"
> +
> +#include "../iommu_pt.h"
> diff --git a/drivers/iommu/generic_pt/iommu_pt.h b/drivers/iommu/generic_pt/iommu_pt.h
> new file mode 100644
> index 00000000000000..564f2d3a6e11e1
> --- /dev/null
> +++ b/drivers/iommu/generic_pt/iommu_pt.h
> @@ -0,0 +1,259 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/*
> + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
> + *
> + * "Templated C code" for implementing the iommu operations for page tables.
> + * This is compiled multiple times, over all the page table formats to pick up
> + * the per-format definitions.
> + */
> +#ifndef __GENERIC_PT_IOMMU_PT_H
> +#define __GENERIC_PT_IOMMU_PT_H
> +
> +#include "pt_iter.h"
> +
> +#include <linux/export.h>
> +#include <linux/iommu.h>
> +#include "../iommu-pages.h"
> +
> +#define DOMAIN_NS(op) CONCATENATE(CONCATENATE(pt_iommu_, PTPFX), op)
> +
> +struct pt_iommu_collect_args {
> +        struct iommu_pages_list free_list;
> +};
> +
> +static int __collect_tables(struct pt_range *range, void *arg,
> +                            unsigned int level, struct pt_table_p *table)
> +{
> +        struct pt_state pts = pt_init(range, level, table);
> +        struct pt_iommu_collect_args *collect = arg;
> +        int ret;
> +
> +        if (!pt_can_have_table(&pts))
> +                return 0;
> +
> +        for_each_pt_level_entry(&pts) {
> +                if (pts.type == PT_ENTRY_TABLE) {
> +                        iommu_pages_list_add(&collect->free_list, pts.table_lower);
> +                        ret = pt_descend(&pts, arg, __collect_tables);
> +                        if (ret)
> +                                return ret;
> +                        continue;
> +                }
> +        }
> +        return 0;
> +}
> +
> +static inline struct pt_table_p *table_alloc_top(struct pt_common *common,
> +                                                 uintptr_t top_of_table,
> +                                                 gfp_t gfp)
> +{
> +        struct pt_iommu *iommu_table = iommu_from_common(common);
> +
> +        /*
> +         * The top doesn't need the free list or other metadata, so it
> +         * technically doesn't need to use iommu pages. Use the API anyhow
> +         * to keep things simple; the top is usually not below PAGE_SIZE.
> +         */
> +        return iommu_alloc_pages_node_sz(
> +                iommu_table->nid, gfp,
> +                log2_to_int(pt_top_memsize_lg2(common, top_of_table)));
> +}
> +
> +static void NS(get_info)(struct pt_iommu *iommu_table,
> +                         struct pt_iommu_info *info)
> +{
> +        struct pt_common *common = common_from_iommu(iommu_table);
> +        struct pt_range range = pt_top_range(common);
> +        struct pt_state pts = pt_init_top(&range);
> +        pt_vaddr_t pgsize_bitmap = 0;
> +
> +        if (pt_feature(common, PT_FEAT_DYNAMIC_TOP)) {
> +                for (pts.level = 0; pts.level <= PT_MAX_TOP_LEVEL;
> +                     pts.level++) {
> +                        if (pt_table_item_lg2sz(&pts) >= common->max_vasz_lg2)
> +                                break;
> +                        pgsize_bitmap |= pt_possible_sizes(&pts);
> +                }
> +        } else {
> +                for (pts.level = 0; pts.level <= range.top_level; pts.level++)
> +                        pgsize_bitmap |= pt_possible_sizes(&pts);
> +        }
> +
> +        /* Hide page sizes larger than the maximum OA */
> +        info->pgsize_bitmap = oalog2_mod(pgsize_bitmap, common->max_oasz_lg2);
> +}
> +
> +static void NS(deinit)(struct pt_iommu *iommu_table)
> +{
> +        struct pt_common *common = common_from_iommu(iommu_table);
> +        struct pt_range range = pt_all_range(common);
> +        struct pt_iommu_collect_args collect = {
> +                .free_list = IOMMU_PAGES_LIST_INIT(collect.free_list),
> +        };
> +
> +        iommu_pages_list_add(&collect.free_list, range.top_table);
> +        pt_walk_range(&range, __collect_tables, &collect);
> +
> +        /*
> +         * The driver must already have fenced the HW access to the page
> +         * table and invalidated any caching referring to this memory.
> +         */
> +        iommu_put_pages_list(&collect.free_list);
> +}
> +
> +static const struct pt_iommu_ops NS(ops) = {
> +        .get_info = NS(get_info),
> +        .deinit = NS(deinit),
> +};
> +
> +static int pt_init_common(struct pt_common *common)
> +{
> +        struct pt_range top_range = pt_top_range(common);
> +
> +        if (PT_WARN_ON(top_range.top_level > PT_MAX_TOP_LEVEL))
> +                return -EINVAL;
> +
> +        if (top_range.top_level == PT_MAX_TOP_LEVEL ||
> +            common->max_vasz_lg2 == top_range.max_vasz_lg2)
> +                common->features &= ~BIT(PT_FEAT_DYNAMIC_TOP);
> +
> +        if (top_range.max_vasz_lg2 == PT_VADDR_MAX_LG2)
> +                common->features |= BIT(PT_FEAT_FULL_VA);
> +
> +        /* Requested features must match features compiled into this format */
> +        if ((common->features & ~(unsigned int)PT_SUPPORTED_FEATURES) ||
> +            (!IS_ENABLED(CONFIG_DEBUG_GENERIC_PT) &&
> +             (common->features & PT_FORCE_ENABLED_FEATURES) !=
> +                     PT_FORCE_ENABLED_FEATURES))
> +                return -EOPNOTSUPP;
> +
> +        if (common->max_oasz_lg2 == 0)
> +                common->max_oasz_lg2 = pt_max_oa_lg2(common);
> +        else
> +                common->max_oasz_lg2 = min(common->max_oasz_lg2,
> +                                           pt_max_oa_lg2(common));
> +        return 0;
> +}
> +
> +static int pt_iommu_init_domain(struct pt_iommu *iommu_table,
> +                                struct iommu_domain *domain)
> +{
> +        struct pt_common *common = common_from_iommu(iommu_table);
> +        struct pt_iommu_info info;
> +        struct pt_range range;
> +
> +        NS(get_info)(iommu_table, &info);
> +
> +        domain->type = __IOMMU_DOMAIN_PAGING;
> +        domain->pgsize_bitmap = info.pgsize_bitmap;
> +
> +        if (pt_feature(common, PT_FEAT_DYNAMIC_TOP))
> +                range = _pt_top_range(common,
> +                                      _pt_top_set(NULL, PT_MAX_TOP_LEVEL));
> +        else
> +                range = pt_top_range(common);
> +
> +        /* A 64-bit high address space table on a 32-bit system cannot work. */
> +        domain->geometry.aperture_start = (unsigned long)range.va;
> +        if ((pt_vaddr_t)domain->geometry.aperture_start != range.va)
> +                return -EOVERFLOW;
> +
> +        /*
> +         * The aperture is limited to what the API can do after considering all
> +         * the different types dma_addr_t/unsigned long/pt_vaddr_t that are used
> +         * to store a VA. Set the aperture to something that is valid for all
> +         * cases. Saturate instead of truncate the end if the types are smaller
> +         * than the top range. aperture_end should be called aperture_last.
> +         */
> +        domain->geometry.aperture_end = (unsigned long)range.last_va;
> +        if ((pt_vaddr_t)domain->geometry.aperture_end != range.last_va) {
> +                domain->geometry.aperture_end = ULONG_MAX;
> +                domain->pgsize_bitmap &= ULONG_MAX;
> +        }
> +        domain->geometry.force_aperture = true;
> +
> +        return 0;
> +}
> +
> +static void pt_iommu_zero(struct pt_iommu_table *fmt_table)
> +{
> +        struct pt_iommu *iommu_table = &fmt_table->iommu;
> +        struct pt_iommu cfg = *iommu_table;
> +
> +        static_assert(offsetof(struct pt_iommu_table, iommu.domain) == 0);
> +        memset_after(fmt_table, 0, iommu.domain);
> +
> +        /* The caller can initialize some of these values */
> +        iommu_table->nid = cfg.nid;
> +}
> +
> +#define pt_iommu_table_cfg CONCATENATE(pt_iommu_table, _cfg)
> +#define pt_iommu_init CONCATENATE(CONCATENATE(pt_iommu_, PTPFX), init)
> +
> +int pt_iommu_init(struct pt_iommu_table *fmt_table,
> +                  const struct pt_iommu_table_cfg *cfg, gfp_t gfp)
> +{
> +        struct pt_iommu *iommu_table = &fmt_table->iommu;
> +        struct pt_common *common = common_from_iommu(iommu_table);
> +        struct pt_table_p *table_mem;
> +        int ret;
> +
> +        if (cfg->common.hw_max_vasz_lg2 > PT_MAX_VA_ADDRESS_LG2 ||
> +            !cfg->common.hw_max_vasz_lg2 || !cfg->common.hw_max_oasz_lg2)
> +                return -EINVAL;
> +
> +        pt_iommu_zero(fmt_table);
> +        common->features = cfg->common.features;
> +        common->max_vasz_lg2 = cfg->common.hw_max_vasz_lg2;
> +        common->max_oasz_lg2 = cfg->common.hw_max_oasz_lg2;
> +        ret = pt_iommu_fmt_init(fmt_table, cfg);
> +        if (ret)
> +                return ret;
> +
> +        if (cfg->common.hw_max_oasz_lg2 > pt_max_oa_lg2(common))
> +                return -EINVAL;
> +
> +        ret = pt_init_common(common);
> +        if (ret)
> +                return ret;
> +
> +        if (pt_feature(common, PT_FEAT_SIGN_EXTEND) &&
> +            (pt_feature(common, PT_FEAT_FULL_VA) ||
> +             pt_feature(common, PT_FEAT_DYNAMIC_TOP)))
> +                return -EINVAL;
> +
> +        ret = pt_iommu_init_domain(iommu_table, &iommu_table->domain);
> +        if (ret)
> +                return ret;
> +
> +        table_mem = table_alloc_top(common, common->top_of_table, gfp);
> +        if (IS_ERR(table_mem))
> +                return PTR_ERR(table_mem);
> +        pt_top_set(common, table_mem, pt_top_get_level(common));
> +
> +        /* Must be last, see pt_iommu_deinit() */
> +        iommu_table->ops = &NS(ops);
> +        return 0;
> +}
> +EXPORT_SYMBOL_NS_GPL(pt_iommu_init, "GENERIC_PT_IOMMU");
> +
> +#ifdef pt_iommu_fmt_hw_info
> +#define pt_iommu_table_hw_info CONCATENATE(pt_iommu_table, _hw_info)
> +#define pt_iommu_hw_info CONCATENATE(CONCATENATE(pt_iommu_, PTPFX), hw_info)
> +void pt_iommu_hw_info(struct pt_iommu_table *fmt_table,
> +                      struct pt_iommu_table_hw_info *info)
> +{
> +        struct pt_iommu *iommu_table = &fmt_table->iommu;
> +        struct pt_common *common = common_from_iommu(iommu_table);
> +        struct pt_range top_range = pt_top_range(common);
> +
> +        pt_iommu_fmt_hw_info(fmt_table, &top_range, info);
> +}
> +EXPORT_SYMBOL_NS_GPL(pt_iommu_hw_info, "GENERIC_PT_IOMMU");
> +#endif
> +
> +MODULE_LICENSE("GPL");
> +MODULE_DESCRIPTION("IOMMU Page table implementation for " __stringify(PTPFX_RAW));
> +MODULE_IMPORT_NS("GENERIC_PT");
> +
> +#endif /* __GENERIC_PT_IOMMU_PT_H */
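A quick worked example of the get_info() result: pgsize_bitmap follows the
usual iommu convention where bit k set means a 2^k byte leaf size is
supported, so a format with 4K/2M/1G leaves (illustrative) would report:

    pt_vaddr_t pgsize_bitmap = SZ_4K | SZ_2M | SZ_1G;  /* 0x40201000 */

before the max-OA masking at the end of NS(get_info) hides any sizes
larger than the maximum output address.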
> diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h
> new file mode 100644
> index 00000000000000..defa96abc49781
> --- /dev/null
> +++ b/include/linux/generic_pt/iommu.h
> @@ -0,0 +1,150 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/*
> + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
> + */
> +#ifndef __GENERIC_PT_IOMMU_H
> +#define __GENERIC_PT_IOMMU_H
> +
> +#include <linux/generic_pt/common.h>
> +#include <linux/iommu.h>
> +#include <linux/mm_types.h>
> +
> +struct pt_iommu_ops;
> +
> +/**
> + * DOC: IOMMU Radix Page Table
> + *
> + * The IOMMU implementation of the Generic Page Table provides an ops struct
> + * that is useful to go with an iommu_domain to serve the DMA API, IOMMUFD and
> + * the generic map/unmap interface.
> + *
> + * This interface uses a caller provided locking approach. The caller must have
> + * a VA range lock concept that prevents concurrent threads from calling ops on
> + * the same VA. Generally the range lock must be at least as large as a single
> + * map call.
> + */
> +
> +/**
> + * struct pt_iommu - Base structure for IOMMU page tables
> + *
> + * The format-specific struct will include this as the first member.
> + */
> +struct pt_iommu {
> +        /**
> +         * @domain: The core IOMMU domain. The driver should use a union to
> +         * overlay this memory with its previously existing domain struct to
> +         * create an alias.
> +         */
> +        struct iommu_domain domain;
> +
> +        /**
> +         * @ops: Function pointers to access the API
> +         */
> +        const struct pt_iommu_ops *ops;
> +
> +        /**
> +         * @nid: Node ID to use for table memory allocations. The IOMMU driver
> +         * may want to set the NID to the device's NID, if there are multiple
> +         * table walkers.
> +         */
> +        int nid;
> +};
> +
> +/**
> + * struct pt_iommu_info - Details about the IOMMU page table
> + *
> + * Returned from pt_iommu_ops->get_info()
> + */
> +struct pt_iommu_info {
> +        /**
> +         * @pgsize_bitmap: A bitmask where each set bit indicates
> +         * a page size that can be natively stored in the page table.
> +         */
> +        u64 pgsize_bitmap;
> +};
> +
> +struct pt_iommu_ops {
> +        /**
> +         * @get_info: Return the pt_iommu_info structure
> +         * @iommu_table: Table to query
> +         *
> +         * Return some basic static information about the page table.
> +         */
> +        void (*get_info)(struct pt_iommu *iommu_table,
> +                         struct pt_iommu_info *info);
> +
> +        /**
> +         * @deinit: Undo a format specific init operation
> +         * @iommu_table: Table to destroy
> +         *
> +         * Release all of the memory. The caller must have already removed the
> +         * table from all HW access and all caches.
> +         */
> +        void (*deinit)(struct pt_iommu *iommu_table);
> +};
> +
> +static inline void pt_iommu_deinit(struct pt_iommu *iommu_table)
> +{
> +        /*
> +         * It is safe to call pt_iommu_deinit() before an init, or if init
> +         * fails. The ops pointer will only become non-NULL if deinit needs
> +         * to be run.
> +         */
> +        if (iommu_table->ops)
> +                iommu_table->ops->deinit(iommu_table);
> +}
> +
> +/**
> + * struct pt_iommu_cfg - Common configuration values for all formats
> + */
> +struct pt_iommu_cfg {
> +        /**
> +         * @features: Features required. Only these features will be turned on.
> +         * The feature list should reflect what the IOMMU HW is capable of.
> +         */
> +        unsigned int features;
> +        /**
> +         * @hw_max_vasz_lg2: Maximum VA the IOMMU HW can support. This will
> +         * imply the top level of the table.
> +         */
> +        u8 hw_max_vasz_lg2;
> +        /**
> +         * @hw_max_oasz_lg2: Maximum OA the IOMMU HW can support. The format
> +         * might select a lower maximum OA.
> +         */
> +        u8 hw_max_oasz_lg2;
> +};
> +
> +/* Generate the exported function signatures from iommu_pt.h */
> +#define IOMMU_PROTOTYPES(fmt)                                             \
> +        int pt_iommu_##fmt##_init(struct pt_iommu_##fmt *table,           \
> +                                  const struct pt_iommu_##fmt##_cfg *cfg, \
> +                                  gfp_t gfp);                             \
> +        void pt_iommu_##fmt##_hw_info(struct pt_iommu_##fmt *table,       \
> +                                      struct pt_iommu_##fmt##_hw_info *info)
> +#define IOMMU_FORMAT(fmt, member)                                         \
> +        struct pt_iommu_##fmt {                                           \
> +                struct pt_iommu iommu;                                    \
> +                struct pt_##fmt member;                                   \
> +        };                                                                \
> +        IOMMU_PROTOTYPES(fmt)
> +
> +/*
> + * The driver should set up its domain struct like:
> + *        union {
> + *                struct iommu_domain domain;
> + *                struct pt_iommu_xxx xx;
> + *        };
> + *        PT_IOMMU_CHECK_DOMAIN(struct mock_iommu_domain, xx.iommu, domain);
> + *
> + * This creates an alias between driver_domain.domain and
> + * driver_domain.xx.iommu.domain, which avoids a mass rename of existing
> + * driver_domain.domain users.
> + */
> +#define PT_IOMMU_CHECK_DOMAIN(s, pt_iommu_memb, domain_memb) \
> +        static_assert(offsetof(s, pt_iommu_memb.domain) ==    \
> +                      offsetof(s, domain_memb))
> +
> +#undef IOMMU_PROTOTYPES
> +#undef IOMMU_FORMAT
> +#endif
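Tying the API together, a rough sketch of the expected driver flow, using
the commit message's amdv1 naming (the cfg type, field values, and NUMA
node choice are illustrative only):

    struct driver_domain *drv = kzalloc(sizeof(*drv), GFP_KERNEL);
    struct pt_iommu_amdv1_cfg cfg = {
            .common = {
                    .features = 0,          /* no optional features */
                    .hw_max_vasz_lg2 = 48,  /* illustrative HW limits */
                    .hw_max_oasz_lg2 = 52,
            },
    };
    int ret;

    if (!drv)
            return -ENOMEM;
    drv->amdv1.iommu.nid = NUMA_NO_NODE;    /* nid may be set before init */
    ret = pt_iommu_amdv1_init(&drv->amdv1, &cfg, GFP_KERNEL);
    if (ret) {
            pt_iommu_deinit(&drv->amdv1.iommu); /* safe even if init failed */
            kfree(drv);
            return ret;
    }
    /* map/unmap on the domain must be covered by the driver's VA range lock */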
> --
> 2.43.0
>
>
Reviewed-by: Samiullah Khawaja <skhawaja at google.com>