[PATCH v8 13/15] iommu/amd: Use the generic iommu page table

Wed Nov 5 08:01:53 PST 2025

On Tue, Nov 04, 2025 at 02:30:11PM -0400, Jason Gunthorpe wrote:
> From: Alejandro Jimenez <alejandro.j.jimenez at oracle.com>
> 
> Replace the io_pgtable versions with pt_iommu versions. The v2 page table
> uses the x86 implementation that will be eventually shared with VT-d.
> 
> This supports the same special features as the original code:
>  - increase_top for the v1 format to allow scaling from 3 to 6 levels
>  - non-present flushing
>  - Dirty tracking for v1 only
>  - __sme_set() to adjust the PTEs for CC
>  - Optimization for flushing with virtualization to minimize the range
>  - amd_iommu_pgsize_bitmap override of the native page sizes
>  - page tables allocate from the device's NUMA node
> 
> Rework the domain ops so that v1/v2 get their own ops. Make dedicated
> allocation functions for v1 and v2. Hook up invalidation for a top change
> to struct pt_iommu_flush_ops. Delete some of the iopgtable related code
> that becomes unused in this patch. The next patch will delete the rest of
> it.
> 
> This fixes a race bug in AMD's increase_address_space() implementation. It
> stores the top level and top pointer in different memory, which prevents
> other threads from reading a coherent version:
> 
>    increase_address_space()   alloc_pte()
>                                 level = pgtable->mode - 1;
> 	pgtable->root  = pte;
> 	pgtable->mode += 1;
>                                 pte = &pgtable->root[PM_LEVEL_INDEX(level, address)];
> 
> The iommupt version is careful to put mode and root under a single
> READ_ONCE and then is careful to only READ_ONCE a single time per
> walk.
> 
> Signed-off-by: Alejandro Jimenez <alejandro.j.jimenez at oracle.com>
> Reviewed-by: Vasant Hegde <vasant.hegde at amd.com>
> Tested-by: Alejandro Jimenez <alejandro.j.jimenez at oracle.com>
> Tested-by: Pasha Tatashin <pasha.tatashin at soleen.com>
> Signed-off-by: Jason Gunthorpe <jgg at nvidia.com>

Tested-by: Ankit Soni <Ankit.Soni at amd.com>

I am little late, I had tested this patch series in both v1 and v2 page table 
modes using multiple benchmark tools (FIO, netperf, etc). The changes work as 
expected, and I observed no regressions.

> ---
>  drivers/iommu/amd/Kconfig           |   5 +-
>  drivers/iommu/amd/amd_iommu.h       |   1 -
>  drivers/iommu/amd/amd_iommu_types.h |  12 +-
>  drivers/iommu/amd/io_pgtable.c      |   2 -
>  drivers/iommu/amd/iommu.c           | 538 ++++++++++++++--------------
>  5 files changed, 282 insertions(+), 276 deletions(-)
> 
> diff --git a/drivers/iommu/amd/Kconfig b/drivers/iommu/amd/Kconfig
> index ecef69c11144db..f2acf471cb5d9f 100644
> --- a/drivers/iommu/amd/Kconfig
> +++ b/drivers/iommu/amd/Kconfig
> @@ -11,10 +11,13 @@ config AMD_IOMMU
>  	select MMU_NOTIFIER
>  	select IOMMU_API
>  	select IOMMU_IOVA
> -	select IOMMU_IO_PGTABLE
>  	select IOMMU_SVA
>  	select IOMMU_IOPF
>  	select IOMMUFD_DRIVER if IOMMUFD
> +	select GENERIC_PT
> +	select IOMMU_PT
> +	select IOMMU_PT_AMDV1
> +	select IOMMU_PT_X86_64
>  	depends on X86_64 && PCI && ACPI && HAVE_CMPXCHG_DOUBLE
>  	help
>  	  With this option you can enable support for AMD IOMMU hardware in
> diff --git a/drivers/iommu/amd/amd_iommu.h b/drivers/iommu/amd/amd_iommu.h
> index 9b4b589a54b57e..25044d28f28a8d 100644
> --- a/drivers/iommu/amd/amd_iommu.h
> +++ b/drivers/iommu/amd/amd_iommu.h
> @@ -88,7 +88,6 @@ int amd_iommu_complete_ppr(struct device *dev, u32 pasid, int status, int tag);
>   * the IOMMU used by this driver.
>   */
>  void amd_iommu_flush_all_caches(struct amd_iommu *iommu);
> -void amd_iommu_update_and_flush_device_table(struct protection_domain *domain);
>  void amd_iommu_domain_flush_pages(struct protection_domain *domain,
>  				  u64 address, size_t size);
>  void amd_iommu_dev_flush_pasid_pages(struct iommu_dev_data *dev_data,
> diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h
> index a698a2e7ce2a6e..d90a285b44eb3a 100644
> --- a/drivers/iommu/amd/amd_iommu_types.h
> +++ b/drivers/iommu/amd/amd_iommu_types.h
> @@ -19,6 +19,7 @@
>  #include <linux/pci.h>
>  #include <linux/irqreturn.h>
>  #include <linux/io-pgtable.h>
> +#include <linux/generic_pt/iommu.h>
>  
>  /*
>   * Maximum number of IOMMUs supported
> @@ -589,9 +590,13 @@ struct pdom_iommu_info {
>   * independent of their use.
>   */
>  struct protection_domain {
> +	union {
> +		struct iommu_domain domain;
> +		struct pt_iommu iommu;
> +		struct pt_iommu_amdv1 amdv1;
> +		struct pt_iommu_x86_64 amdv2;
> +	};
>  	struct list_head dev_list; /* List of all devices in this domain */
> -	struct iommu_domain domain; /* generic domain handle used by
> -				       iommu core code */
>  	struct amd_io_pgtable iop;
>  	spinlock_t lock;	/* mostly used to lock the page table*/
>  	u16 id;			/* the domain id written to the device table */
> @@ -602,6 +607,9 @@ struct protection_domain {
>  	struct mmu_notifier mn;	/* mmu notifier for the SVA domain */
>  	struct list_head dev_data_list; /* List of pdom_dev_data */
>  };
> +PT_IOMMU_CHECK_DOMAIN(struct protection_domain, iommu, domain);
> +PT_IOMMU_CHECK_DOMAIN(struct protection_domain, amdv1.iommu, domain);
> +PT_IOMMU_CHECK_DOMAIN(struct protection_domain, amdv2.iommu, domain);
>  
>  /*
>   * This structure contains information about one PCI segment in the system.
> diff --git a/drivers/iommu/amd/io_pgtable.c b/drivers/iommu/amd/io_pgtable.c
> index 70c2f5b1631b05..f64244938c9af7 100644
> --- a/drivers/iommu/amd/io_pgtable.c
> +++ b/drivers/iommu/amd/io_pgtable.c
> @@ -136,8 +136,6 @@ static bool increase_address_space(struct amd_io_pgtable *pgtable,
>  	pgtable->mode += 1;
>  	write_seqcount_end(&pgtable->seqcount);
>  
> -	amd_iommu_update_and_flush_device_table(domain);
> -
>  	pte = NULL;
>  	ret = true;
>  
> diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
> index 2e1865daa1cee8..0dc4d9682000bf 100644
> --- a/drivers/iommu/amd/iommu.c
> +++ b/drivers/iommu/amd/iommu.c
> @@ -30,7 +30,6 @@
>  #include <linux/msi.h>
>  #include <linux/irqdomain.h>
>  #include <linux/percpu.h>
> -#include <linux/io-pgtable.h>
>  #include <linux/cc_platform.h>
>  #include <asm/irq_remapping.h>
>  #include <asm/io_apic.h>
> @@ -41,9 +40,9 @@
>  #include <asm/gart.h>
>  #include <asm/dma.h>
>  #include <uapi/linux/iommufd.h>
> +#include <linux/generic_pt/iommu.h>
>  
>  #include "amd_iommu.h"
> -#include "../dma-iommu.h"
>  #include "../irq_remapping.h"
>  #include "../iommu-pages.h"
>  
> @@ -60,7 +59,6 @@ LIST_HEAD(hpet_map);
>  LIST_HEAD(acpihid_map);
>  
>  const struct iommu_ops amd_iommu_ops;
> -static const struct iommu_dirty_ops amd_dirty_ops;
>  
>  int amd_iommu_max_glx_val = -1;
>  
> @@ -74,11 +72,18 @@ static int amd_iommu_attach_device(struct iommu_domain *dom,
>  				   struct device *dev);
>  
>  static void set_dte_entry(struct amd_iommu *iommu,
> -			  struct iommu_dev_data *dev_data);
> +			  struct iommu_dev_data *dev_data,
> +			  phys_addr_t top_paddr, unsigned int top_level);
> +
> +static void amd_iommu_change_top(struct pt_iommu *iommu_table,
> +				 phys_addr_t top_paddr, unsigned int top_level);
>  
>  static void iommu_flush_dte_sync(struct amd_iommu *iommu, u16 devid);
>  
>  static struct iommu_dev_data *find_dev_data(struct amd_iommu *iommu, u16 devid);
> +static bool amd_iommu_enforce_cache_coherency(struct iommu_domain *domain);
> +static int amd_iommu_set_dirty_tracking(struct iommu_domain *domain,
> +					bool enable);
>  
>  /****************************************************************************
>   *
> @@ -1756,42 +1761,6 @@ static void dev_flush_pasid_all(struct iommu_dev_data *dev_data,
>  					CMD_INV_IOMMU_ALL_PAGES_ADDRESS);
>  }
>  
> -/* Flush the not present cache if it exists */
> -static void domain_flush_np_cache(struct protection_domain *domain,
> -		dma_addr_t iova, size_t size)
> -{
> -	if (unlikely(amd_iommu_np_cache)) {
> -		unsigned long flags;
> -
> -		spin_lock_irqsave(&domain->lock, flags);
> -		amd_iommu_domain_flush_pages(domain, iova, size);
> -		spin_unlock_irqrestore(&domain->lock, flags);
> -	}
> -}
> -
> -
> -/*
> - * This function flushes the DTEs for all devices in domain
> - */
> -void amd_iommu_update_and_flush_device_table(struct protection_domain *domain)
> -{
> -	struct iommu_dev_data *dev_data;
> -
> -	lockdep_assert_held(&domain->lock);
> -
> -	list_for_each_entry(dev_data, &domain->dev_list, list) {
> -		struct amd_iommu *iommu = rlookup_amd_iommu(dev_data->dev);
> -
> -		set_dte_entry(iommu, dev_data);
> -		clone_aliases(iommu, dev_data->dev);
> -	}
> -
> -	list_for_each_entry(dev_data, &domain->dev_list, list)
> -		device_flush_dte(dev_data);
> -
> -	domain_flush_complete(domain);
> -}
> -
>  int amd_iommu_complete_ppr(struct device *dev, u32 pasid, int status, int tag)
>  {
>  	struct iommu_dev_data *dev_data;
> @@ -2051,7 +2020,8 @@ static void set_dte_gcr3_table(struct amd_iommu *iommu,
>  }
>  
>  static void set_dte_entry(struct amd_iommu *iommu,
> -			  struct iommu_dev_data *dev_data)
> +			  struct iommu_dev_data *dev_data,
> +			  phys_addr_t top_paddr, unsigned int top_level)
>  {
>  	u16 domid;
>  	u32 old_domid;
> @@ -2060,19 +2030,36 @@ static void set_dte_entry(struct amd_iommu *iommu,
>  	struct protection_domain *domain = dev_data->domain;
>  	struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info;
>  	struct dev_table_entry *dte = &get_dev_table(iommu)[dev_data->devid];
> -
> -	if (gcr3_info && gcr3_info->gcr3_tbl)
> -		domid = dev_data->gcr3_info.domid;
> -	else
> -		domid = domain->id;
> +	struct pt_iommu_amdv1_hw_info pt_info;
>  
>  	make_clear_dte(dev_data, dte, &new);
>  
> -	if (domain->iop.mode != PAGE_MODE_NONE)
> -		new.data[0] |= iommu_virt_to_phys(domain->iop.root);
> +	if (gcr3_info && gcr3_info->gcr3_tbl)
> +		domid = dev_data->gcr3_info.domid;
> +	else {
> +		domid = domain->id;
>  
> -	new.data[0] |= (domain->iop.mode & DEV_ENTRY_MODE_MASK)
> -		    << DEV_ENTRY_MODE_SHIFT;
> +		if (domain->domain.type & __IOMMU_DOMAIN_PAGING) {
> +			/*
> +			 * When updating the IO pagetable, the new top and level
> +			 * are provided as parameters. For other operations i.e.
> +			 * device attach, retrieve the current pagetable info
> +			 * via the IOMMU PT API.
> +			 */
> +			if (top_paddr) {
> +				pt_info.host_pt_root = top_paddr;
> +				pt_info.mode = top_level + 1;
> +			} else {
> +				WARN_ON(top_paddr || top_level);
> +				pt_iommu_amdv1_hw_info(&domain->amdv1,
> +						       &pt_info);
> +			}
> +
> +			new.data[0] |= __sme_set(pt_info.host_pt_root) |
> +				       (pt_info.mode & DEV_ENTRY_MODE_MASK)
> +					       << DEV_ENTRY_MODE_SHIFT;
> +		}
> +	}
>  
>  	new.data[0] |= DTE_FLAG_IR | DTE_FLAG_IW;
>  
> @@ -2138,7 +2125,7 @@ static void dev_update_dte(struct iommu_dev_data *dev_data, bool set)
>  	struct amd_iommu *iommu = get_amd_iommu_from_dev(dev_data->dev);
>  
>  	if (set)
> -		set_dte_entry(iommu, dev_data);
> +		set_dte_entry(iommu, dev_data, 0, 0);
>  	else
>  		clear_dte_entry(iommu, dev_data);
>  
> @@ -2156,6 +2143,7 @@ static int init_gcr3_table(struct iommu_dev_data *dev_data,
>  {
>  	struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data);
>  	int max_pasids = dev_data->max_pasids;
> +	struct pt_iommu_x86_64_hw_info pt_info;
>  	int ret = 0;
>  
>  	 /*
> @@ -2178,7 +2166,8 @@ static int init_gcr3_table(struct iommu_dev_data *dev_data,
>  	if (!pdom_is_v2_pgtbl_mode(pdom))
>  		return ret;
>  
> -	ret = update_gcr3(dev_data, 0, iommu_virt_to_phys(pdom->iop.pgd), true);
> +	pt_iommu_x86_64_hw_info(&pdom->amdv2, &pt_info);
> +	ret = update_gcr3(dev_data, 0, __sme_set(pt_info.gcr3_pt), true);
>  	if (ret)
>  		free_gcr3_table(&dev_data->gcr3_info);
>  
> @@ -2500,54 +2489,6 @@ struct protection_domain *protection_domain_alloc(void)
>  	return domain;
>  }
>  
> -static int pdom_setup_pgtable(struct protection_domain *domain,
> -			      struct device *dev)
> -{
> -	struct io_pgtable_ops *pgtbl_ops;
> -	enum io_pgtable_fmt fmt;
> -
> -	switch (domain->pd_mode) {
> -	case PD_MODE_V1:
> -		fmt = AMD_IOMMU_V1;
> -		break;
> -	case PD_MODE_V2:
> -		fmt = AMD_IOMMU_V2;
> -		break;
> -	case PD_MODE_NONE:
> -		WARN_ON_ONCE(1);
> -		return -EPERM;
> -	}
> -
> -	domain->iop.pgtbl.cfg.amd.nid = dev_to_node(dev);
> -	pgtbl_ops = alloc_io_pgtable_ops(fmt, &domain->iop.pgtbl.cfg, domain);
> -	if (!pgtbl_ops)
> -		return -ENOMEM;
> -
> -	return 0;
> -}
> -
> -static inline u64 dma_max_address(enum protection_domain_mode pgtable)
> -{
> -	if (pgtable == PD_MODE_V1)
> -		return PM_LEVEL_SIZE(amd_iommu_hpt_level);
> -
> -	/*
> -	 * V2 with 4/5 level page table. Note that "2.2.6.5 AMD64 4-Kbyte Page
> -	 * Translation" shows that the V2 table sign extends the top of the
> -	 * address space creating a reserved region in the middle of the
> -	 * translation, just like the CPU does. Further Vasant says the docs are
> -	 * incomplete and this only applies to non-zero PASIDs. If the AMDv2
> -	 * page table is assigned to the 0 PASID then there is no sign extension
> -	 * check.
> -	 *
> -	 * Since the IOMMU must have a fixed geometry, and the core code does
> -	 * not understand sign extended addressing, we have to chop off the high
> -	 * bit to get consistent behavior with attachments of the domain to any
> -	 * PASID.
> -	 */
> -	return ((1ULL << (PM_LEVEL_SHIFT(amd_iommu_gpt_level) - 1)) - 1);
> -}
> -
>  static bool amd_iommu_hd_support(struct amd_iommu *iommu)
>  {
>  	if (amd_iommu_hatdis)
> @@ -2556,38 +2497,229 @@ static bool amd_iommu_hd_support(struct amd_iommu *iommu)
>  	return iommu && (iommu->features & FEATURE_HDSUP);
>  }
>  
> -static struct iommu_domain *
> -do_iommu_domain_alloc(struct device *dev, u32 flags,
> -		      enum protection_domain_mode pgtable)
> +static spinlock_t *amd_iommu_get_top_lock(struct pt_iommu *iommupt)
>  {
> -	bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
> -	struct amd_iommu *iommu = get_amd_iommu_from_dev(dev);
> +	struct protection_domain *pdom =
> +		container_of(iommupt, struct protection_domain, iommu);
> +
> +	return &pdom->lock;
> +}
> +
> +/*
> + * Update all HW references to the domain with a new pgtable configuration.
> + */
> +static void amd_iommu_change_top(struct pt_iommu *iommu_table,
> +				 phys_addr_t top_paddr, unsigned int top_level)
> +{
> +	struct protection_domain *pdom =
> +		container_of(iommu_table, struct protection_domain, iommu);
> +	struct iommu_dev_data *dev_data;
> +
> +	lockdep_assert_held(&pdom->lock);
> +
> +	/* Update the DTE for all devices attached to this domain */
> +	list_for_each_entry(dev_data, &pdom->dev_list, list) {
> +		struct amd_iommu *iommu = rlookup_amd_iommu(dev_data->dev);
> +
> +		/* Update the HW references with the new level and top ptr */
> +		set_dte_entry(iommu, dev_data, top_paddr, top_level);
> +		clone_aliases(iommu, dev_data->dev);
> +	}
> +
> +	list_for_each_entry(dev_data, &pdom->dev_list, list)
> +		device_flush_dte(dev_data);
> +
> +	domain_flush_complete(pdom);
> +}
> +
> +/*
> + * amd_iommu_iotlb_sync_map() is used to generate flushes for non-present to
> + * present (ie mapping) operations. It is a NOP if the IOMMU doesn't have non
> + * present caching (like hypervisor shadowing).
> + */
> +static int amd_iommu_iotlb_sync_map(struct iommu_domain *dom,
> +				    unsigned long iova, size_t size)
> +{
> +	struct protection_domain *domain = to_pdomain(dom);
> +	unsigned long flags;
> +
> +	if (likely(!amd_iommu_np_cache))
> +		return 0;
> +
> +	spin_lock_irqsave(&domain->lock, flags);
> +	amd_iommu_domain_flush_pages(domain, iova, size);
> +	spin_unlock_irqrestore(&domain->lock, flags);
> +	return 0;
> +}
> +
> +static void amd_iommu_flush_iotlb_all(struct iommu_domain *domain)
> +{
> +	struct protection_domain *dom = to_pdomain(domain);
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&dom->lock, flags);
> +	amd_iommu_domain_flush_all(dom);
> +	spin_unlock_irqrestore(&dom->lock, flags);
> +}
> +
> +static void amd_iommu_iotlb_sync(struct iommu_domain *domain,
> +				 struct iommu_iotlb_gather *gather)
> +{
> +	struct protection_domain *dom = to_pdomain(domain);
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&dom->lock, flags);
> +	amd_iommu_domain_flush_pages(dom, gather->start,
> +				     gather->end - gather->start + 1);
> +	spin_unlock_irqrestore(&dom->lock, flags);
> +	iommu_put_pages_list(&gather->freelist);
> +}
> +
> +static const struct pt_iommu_driver_ops amd_hw_driver_ops_v1 = {
> +	.get_top_lock = amd_iommu_get_top_lock,
> +	.change_top = amd_iommu_change_top,
> +};
> +
> +static const struct iommu_domain_ops amdv1_ops = {
> +	IOMMU_PT_DOMAIN_OPS(amdv1),
> +	.iotlb_sync_map = amd_iommu_iotlb_sync_map,
> +	.flush_iotlb_all = amd_iommu_flush_iotlb_all,
> +	.iotlb_sync = amd_iommu_iotlb_sync,
> +	.attach_dev = amd_iommu_attach_device,
> +	.free = amd_iommu_domain_free,
> +	.enforce_cache_coherency = amd_iommu_enforce_cache_coherency,
> +};
> +
> +static const struct iommu_dirty_ops amdv1_dirty_ops = {
> +	IOMMU_PT_DIRTY_OPS(amdv1),
> +	.set_dirty_tracking = amd_iommu_set_dirty_tracking,
> +};
> +
> +static struct iommu_domain *amd_iommu_domain_alloc_paging_v1(struct device *dev,
> +							     u32 flags)
> +{
> +	struct pt_iommu_amdv1_cfg cfg = {};
>  	struct protection_domain *domain;
>  	int ret;
>  
> +	if (amd_iommu_hatdis)
> +		return ERR_PTR(-EOPNOTSUPP);
> +
>  	domain = protection_domain_alloc();
>  	if (!domain)
>  		return ERR_PTR(-ENOMEM);
>  
> -	domain->pd_mode = pgtable;
> -	ret = pdom_setup_pgtable(domain, dev);
> +	domain->pd_mode = PD_MODE_V1;
> +	domain->iommu.driver_ops = &amd_hw_driver_ops_v1;
> +	domain->iommu.nid = dev_to_node(dev);
> +	if (flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING)
> +		domain->domain.dirty_ops = &amdv1_dirty_ops;
> +
> +	/*
> +	 * Someday FORCE_COHERENCE should be set by
> +	 * amd_iommu_enforce_cache_coherency() like VT-d does.
> +	 */
> +	cfg.common.features = BIT(PT_FEAT_DYNAMIC_TOP) |
> +			      BIT(PT_FEAT_AMDV1_ENCRYPT_TABLES) |
> +			      BIT(PT_FEAT_AMDV1_FORCE_COHERENCE);
> +
> +	/*
> +	 * AMD's IOMMU can flush as many pages as necessary in a single flush.
> +	 * Unless we run in a virtual machine, which can be inferred according
> +	 * to whether "non-present cache" is on, it is probably best to prefer
> +	 * (potentially) too extensive TLB flushing (i.e., more misses) over
> +	 * multiple TLB flushes (i.e., more flushes). For virtual machines the
> +	 * hypervisor needs to synchronize the host IOMMU PTEs with those of
> +	 * the guest, and the trade-off is different: unnecessary TLB flushes
> +	 * should be avoided.
> +	 */
> +	if (amd_iommu_np_cache)
> +		cfg.common.features |= BIT(PT_FEAT_FLUSH_RANGE_NO_GAPS);
> +	else
> +		cfg.common.features |= BIT(PT_FEAT_FLUSH_RANGE);
> +
> +	cfg.common.hw_max_vasz_lg2 =
> +		min(64, (amd_iommu_hpt_level - 1) * 9 + 21);
> +	cfg.common.hw_max_oasz_lg2 = 52;
> +	cfg.starting_level = 2;
> +	domain->domain.ops = &amdv1_ops;
> +
> +	ret = pt_iommu_amdv1_init(&domain->amdv1, &cfg, GFP_KERNEL);
>  	if (ret) {
> -		pdom_id_free(domain->id);
> -		kfree(domain);
> +		amd_iommu_domain_free(&domain->domain);
>  		return ERR_PTR(ret);
>  	}
>  
> -	domain->domain.geometry.aperture_start = 0;
> -	domain->domain.geometry.aperture_end   = dma_max_address(pgtable);
> -	domain->domain.geometry.force_aperture = true;
> -	domain->domain.pgsize_bitmap = domain->iop.pgtbl.cfg.pgsize_bitmap;
> +	/*
> +	 * Narrow the supported page sizes to those selected by the kernel
> +	 * command line.
> +	 */
> +	domain->domain.pgsize_bitmap &= amd_iommu_pgsize_bitmap;
> +	return &domain->domain;
> +}
>  
> -	domain->domain.type = IOMMU_DOMAIN_UNMANAGED;
> -	domain->domain.ops = iommu->iommu.ops->default_domain_ops;
> +static const struct iommu_domain_ops amdv2_ops = {
> +	IOMMU_PT_DOMAIN_OPS(x86_64),
> +	.iotlb_sync_map = amd_iommu_iotlb_sync_map,
> +	.flush_iotlb_all = amd_iommu_flush_iotlb_all,
> +	.iotlb_sync = amd_iommu_iotlb_sync,
> +	.attach_dev = amd_iommu_attach_device,
> +	.free = amd_iommu_domain_free,
> +	/*
> +	 * Note the AMDv2 page table format does not support a Force Coherency
> +	 * bit, so enforce_cache_coherency should not be set. However VFIO is
> +	 * not prepared to handle a case where some domains will support
> +	 * enforcement and others do not. VFIO and iommufd will have to be fixed
> +	 * before it can fully use the V2 page table. See the comment in
> +	 * iommufd_hwpt_paging_alloc(). For now leave things as they have
> +	 * historically been and lie about enforce_cache_coherencey.
> +	 */
> +	.enforce_cache_coherency = amd_iommu_enforce_cache_coherency,
> +};
>  
> -	if (dirty_tracking)
> -		domain->domain.dirty_ops = &amd_dirty_ops;
> +static struct iommu_domain *amd_iommu_domain_alloc_paging_v2(struct device *dev,
> +							     u32 flags)
> +{
> +	struct pt_iommu_x86_64_cfg cfg = {};
> +	struct protection_domain *domain;
> +	int ret;
>  
> +	if (!amd_iommu_v2_pgtbl_supported())
> +		return ERR_PTR(-EOPNOTSUPP);
> +
> +	domain = protection_domain_alloc();
> +	if (!domain)
> +		return ERR_PTR(-ENOMEM);
> +
> +	domain->pd_mode = PD_MODE_V2;
> +	domain->iommu.nid = dev_to_node(dev);
> +
> +	cfg.common.features = BIT(PT_FEAT_X86_64_AMD_ENCRYPT_TABLES);
> +	if (amd_iommu_np_cache)
> +		cfg.common.features |= BIT(PT_FEAT_FLUSH_RANGE_NO_GAPS);
> +	else
> +		cfg.common.features |= BIT(PT_FEAT_FLUSH_RANGE);
> +
> +	/*
> +	 * The v2 table behaves differently if it is attached to PASID 0 vs a
> +	 * non-zero PASID. On PASID 0 it has no sign extension and the full
> +	 * 57/48 bits decode the lower addresses. Otherwise it behaves like a
> +	 * normal sign extended x86 page table. Since we want the domain to work
> +	 * in both modes the top bit is removed and PT_FEAT_SIGN_EXTEND is not
> +	 * set which creates a table that is compatible in both modes.
> +	 */
> +	if (amd_iommu_gpt_level == PAGE_MODE_5_LEVEL)
> +		cfg.common.hw_max_vasz_lg2 = 56;
> +	else
> +		cfg.common.hw_max_vasz_lg2 = 47;
> +	cfg.common.hw_max_oasz_lg2 = 52;
> +	domain->domain.ops = &amdv2_ops;
> +
> +	ret = pt_iommu_x86_64_init(&domain->amdv2, &cfg, GFP_KERNEL);
> +	if (ret) {
> +		amd_iommu_domain_free(&domain->domain);
> +		return ERR_PTR(ret);
> +	}
>  	return &domain->domain;
>  }
>  
> @@ -2608,15 +2740,27 @@ amd_iommu_domain_alloc_paging_flags(struct device *dev, u32 flags,
>  		/* Allocate domain with v1 page table for dirty tracking */
>  		if (!amd_iommu_hd_support(iommu))
>  			break;
> -		return do_iommu_domain_alloc(dev, flags, PD_MODE_V1);
> +		return amd_iommu_domain_alloc_paging_v1(dev, flags);
>  	case IOMMU_HWPT_ALLOC_PASID:
>  		/* Allocate domain with v2 page table if IOMMU supports PASID. */
>  		if (!amd_iommu_pasid_supported())
>  			break;
> -		return do_iommu_domain_alloc(dev, flags, PD_MODE_V2);
> -	case 0:
> +		return amd_iommu_domain_alloc_paging_v2(dev, flags);
> +	case 0: {
> +		struct iommu_domain *ret;
> +
>  		/* If nothing specific is required use the kernel commandline default */
> -		return do_iommu_domain_alloc(dev, 0, amd_iommu_pgtable);
> +		if (amd_iommu_pgtable == PD_MODE_V1) {
> +			ret = amd_iommu_domain_alloc_paging_v1(dev, flags);
> +			if (ret != ERR_PTR(-EOPNOTSUPP))
> +				return ret;
> +			return amd_iommu_domain_alloc_paging_v2(dev, flags);
> +		}
> +		ret = amd_iommu_domain_alloc_paging_v2(dev, flags);
> +		if (ret != ERR_PTR(-EOPNOTSUPP))
> +			return ret;
> +		return amd_iommu_domain_alloc_paging_v1(dev, flags);
> +	}
>  	default:
>  		break;
>  	}
> @@ -2628,8 +2772,7 @@ void amd_iommu_domain_free(struct iommu_domain *dom)
>  	struct protection_domain *domain = to_pdomain(dom);
>  
>  	WARN_ON(!list_empty(&domain->dev_list));
> -	if (domain->domain.type & __IOMMU_DOMAIN_PAGING)
> -		free_io_pgtable_ops(&domain->iop.pgtbl.ops);
> +	pt_iommu_deinit(&domain->iommu);
>  	pdom_id_free(domain->id);
>  	kfree(domain);
>  }
> @@ -2734,93 +2877,6 @@ static int amd_iommu_attach_device(struct iommu_domain *dom,
>  	return ret;
>  }
>  
> -static int amd_iommu_iotlb_sync_map(struct iommu_domain *dom,
> -				    unsigned long iova, size_t size)
> -{
> -	struct protection_domain *domain = to_pdomain(dom);
> -	struct io_pgtable_ops *ops = &domain->iop.pgtbl.ops;
> -
> -	if (ops->map_pages)
> -		domain_flush_np_cache(domain, iova, size);
> -	return 0;
> -}
> -
> -static int amd_iommu_map_pages(struct iommu_domain *dom, unsigned long iova,
> -			       phys_addr_t paddr, size_t pgsize, size_t pgcount,
> -			       int iommu_prot, gfp_t gfp, size_t *mapped)
> -{
> -	struct protection_domain *domain = to_pdomain(dom);
> -	struct io_pgtable_ops *ops = &domain->iop.pgtbl.ops;
> -	int prot = 0;
> -	int ret = -EINVAL;
> -
> -	if ((domain->pd_mode == PD_MODE_V1) &&
> -	    (domain->iop.mode == PAGE_MODE_NONE))
> -		return -EINVAL;
> -
> -	if (iommu_prot & IOMMU_READ)
> -		prot |= IOMMU_PROT_IR;
> -	if (iommu_prot & IOMMU_WRITE)
> -		prot |= IOMMU_PROT_IW;
> -
> -	if (ops->map_pages) {
> -		ret = ops->map_pages(ops, iova, paddr, pgsize,
> -				     pgcount, prot, gfp, mapped);
> -	}
> -
> -	return ret;
> -}
> -
> -static void amd_iommu_iotlb_gather_add_page(struct iommu_domain *domain,
> -					    struct iommu_iotlb_gather *gather,
> -					    unsigned long iova, size_t size)
> -{
> -	/*
> -	 * AMD's IOMMU can flush as many pages as necessary in a single flush.
> -	 * Unless we run in a virtual machine, which can be inferred according
> -	 * to whether "non-present cache" is on, it is probably best to prefer
> -	 * (potentially) too extensive TLB flushing (i.e., more misses) over
> -	 * mutliple TLB flushes (i.e., more flushes). For virtual machines the
> -	 * hypervisor needs to synchronize the host IOMMU PTEs with those of
> -	 * the guest, and the trade-off is different: unnecessary TLB flushes
> -	 * should be avoided.
> -	 */
> -	if (amd_iommu_np_cache &&
> -	    iommu_iotlb_gather_is_disjoint(gather, iova, size))
> -		iommu_iotlb_sync(domain, gather);
> -
> -	iommu_iotlb_gather_add_range(gather, iova, size);
> -}
> -
> -static size_t amd_iommu_unmap_pages(struct iommu_domain *dom, unsigned long iova,
> -				    size_t pgsize, size_t pgcount,
> -				    struct iommu_iotlb_gather *gather)
> -{
> -	struct protection_domain *domain = to_pdomain(dom);
> -	struct io_pgtable_ops *ops = &domain->iop.pgtbl.ops;
> -	size_t r;
> -
> -	if ((domain->pd_mode == PD_MODE_V1) &&
> -	    (domain->iop.mode == PAGE_MODE_NONE))
> -		return 0;
> -
> -	r = (ops->unmap_pages) ? ops->unmap_pages(ops, iova, pgsize, pgcount, NULL) : 0;
> -
> -	if (r)
> -		amd_iommu_iotlb_gather_add_page(dom, gather, iova, r);
> -
> -	return r;
> -}
> -
> -static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
> -					  dma_addr_t iova)
> -{
> -	struct protection_domain *domain = to_pdomain(dom);
> -	struct io_pgtable_ops *ops = &domain->iop.pgtbl.ops;
> -
> -	return ops->iova_to_phys(ops, iova);
> -}
> -
>  static bool amd_iommu_capable(struct device *dev, enum iommu_cap cap)
>  {
>  	switch (cap) {
> @@ -2887,28 +2943,6 @@ static int amd_iommu_set_dirty_tracking(struct iommu_domain *domain,
>  	return 0;
>  }
>  
> -static int amd_iommu_read_and_clear_dirty(struct iommu_domain *domain,
> -					  unsigned long iova, size_t size,
> -					  unsigned long flags,
> -					  struct iommu_dirty_bitmap *dirty)
> -{
> -	struct protection_domain *pdomain = to_pdomain(domain);
> -	struct io_pgtable_ops *ops = &pdomain->iop.pgtbl.ops;
> -	unsigned long lflags;
> -
> -	if (!ops || !ops->read_and_clear_dirty)
> -		return -EOPNOTSUPP;
> -
> -	spin_lock_irqsave(&pdomain->lock, lflags);
> -	if (!pdomain->dirty_tracking && dirty->bitmap) {
> -		spin_unlock_irqrestore(&pdomain->lock, lflags);
> -		return -EINVAL;
> -	}
> -	spin_unlock_irqrestore(&pdomain->lock, lflags);
> -
> -	return ops->read_and_clear_dirty(ops, iova, size, flags, dirty);
> -}
> -
>  static void amd_iommu_get_resv_regions(struct device *dev,
>  				       struct list_head *head)
>  {
> @@ -2978,28 +3012,6 @@ static bool amd_iommu_is_attach_deferred(struct device *dev)
>  	return dev_data->defer_attach;
>  }
>  
> -static void amd_iommu_flush_iotlb_all(struct iommu_domain *domain)
> -{
> -	struct protection_domain *dom = to_pdomain(domain);
> -	unsigned long flags;
> -
> -	spin_lock_irqsave(&dom->lock, flags);
> -	amd_iommu_domain_flush_all(dom);
> -	spin_unlock_irqrestore(&dom->lock, flags);
> -}
> -
> -static void amd_iommu_iotlb_sync(struct iommu_domain *domain,
> -				 struct iommu_iotlb_gather *gather)
> -{
> -	struct protection_domain *dom = to_pdomain(domain);
> -	unsigned long flags;
> -
> -	spin_lock_irqsave(&dom->lock, flags);
> -	amd_iommu_domain_flush_pages(dom, gather->start,
> -				     gather->end - gather->start + 1);
> -	spin_unlock_irqrestore(&dom->lock, flags);
> -}
> -
>  static int amd_iommu_def_domain_type(struct device *dev)
>  {
>  	struct iommu_dev_data *dev_data;
> @@ -3034,11 +3046,6 @@ static bool amd_iommu_enforce_cache_coherency(struct iommu_domain *domain)
>  	return true;
>  }
>  
> -static const struct iommu_dirty_ops amd_dirty_ops = {
> -	.set_dirty_tracking = amd_iommu_set_dirty_tracking,
> -	.read_and_clear_dirty = amd_iommu_read_and_clear_dirty,
> -};
> -
>  const struct iommu_ops amd_iommu_ops = {
>  	.capable = amd_iommu_capable,
>  	.blocked_domain = &blocked_domain,
> @@ -3053,17 +3060,6 @@ const struct iommu_ops amd_iommu_ops = {
>  	.is_attach_deferred = amd_iommu_is_attach_deferred,
>  	.def_domain_type = amd_iommu_def_domain_type,
>  	.page_response = amd_iommu_page_response,
> -	.default_domain_ops = &(const struct iommu_domain_ops) {
> -		.attach_dev	= amd_iommu_attach_device,
> -		.map_pages	= amd_iommu_map_pages,
> -		.unmap_pages	= amd_iommu_unmap_pages,
> -		.iotlb_sync_map	= amd_iommu_iotlb_sync_map,
> -		.iova_to_phys	= amd_iommu_iova_to_phys,
> -		.flush_iotlb_all = amd_iommu_flush_iotlb_all,
> -		.iotlb_sync	= amd_iommu_iotlb_sync,
> -		.free		= amd_iommu_domain_free,
> -		.enforce_cache_coherency = amd_iommu_enforce_cache_coherency,
> -	}
>  };
>  
>  #ifdef CONFIG_IRQ_REMAP
> @@ -4072,3 +4068,5 @@ int amd_iommu_create_irq_domain(struct amd_iommu *iommu)
>  	return 0;
>  }
>  #endif
> +
> +MODULE_IMPORT_NS("GENERIC_PT_IOMMU");
> -- 
> 2.43.0
>