[PATCH v2 2/4] iommu/io-pgtable-arm: Add read_and_clear_dirty() support

Ryan Roberts ryan.roberts at arm.com
Tue Apr 23 08:56:06 PDT 2024


On 22/02/2024 09:49, Shameer Kolothum wrote:
> .read_and_clear_dirty() IOMMU domain op takes care of reading the dirty
> bits (i.e. PTE has both DBM and AP[2] set) and marshalling into a bitmap of

Err... a PTE is HW-dirty if DBM is set and AP[2] is *clear*, right? (AP[2] is the "RDONLY" bit; it is initially set, then the HW clears it on the first write. See pte_hw_dirty() in linux/arch/arm64/include/asm/pgtable.h.)
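For an iopte that condition would look something like this (just a sketch; iopte_hw_dirty() is an illustrative name, not an existing helper, built from the DBM/RDONLY definitions in this patch):

static inline bool iopte_hw_dirty(arm_lpae_iopte pte)
{
	/* dirty == DBM set and AP[2] (RDONLY) clear */
	return (pte & (ARM_LPAE_PTE_DBM | ARM_LPAE_PTE_AP_RDONLY)) ==
	       ARM_LPAE_PTE_DBM;
}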

> a given page size.
> 
> While reading the dirty bits we also clear the PTE AP[2] bit to mark it as
> writable-clean depending on read_and_clear_dirty() flags.

You would need to *set* AP[2] to mark it clean.

> 
> Structure it in a way that the IOPTE walker is generic, and so we pass a
> function pointer over what to do on a per-PTE basis.
> 
> Signed-off-by: Joao Martins <joao.m.martins at oracle.com>
> Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi at huawei.com>
> ---
>  drivers/iommu/io-pgtable-arm.c | 128 ++++++++++++++++++++++++++++++++-
>  1 file changed, 127 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
> index f7828a7aad41..1ce7b7a3c1e8 100644
> --- a/drivers/iommu/io-pgtable-arm.c
> +++ b/drivers/iommu/io-pgtable-arm.c
> @@ -75,6 +75,7 @@
>  
>  #define ARM_LPAE_PTE_NSTABLE		(((arm_lpae_iopte)1) << 63)
>  #define ARM_LPAE_PTE_XN			(((arm_lpae_iopte)3) << 53)
> +#define ARM_LPAE_PTE_DBM		(((arm_lpae_iopte)1) << 51)
>  #define ARM_LPAE_PTE_AF			(((arm_lpae_iopte)1) << 10)
>  #define ARM_LPAE_PTE_SH_NS		(((arm_lpae_iopte)0) << 8)
>  #define ARM_LPAE_PTE_SH_OS		(((arm_lpae_iopte)2) << 8)
> @@ -84,7 +85,7 @@
>  
>  #define ARM_LPAE_PTE_ATTR_LO_MASK	(((arm_lpae_iopte)0x3ff) << 2)
>  /* Ignore the contiguous bit for block splitting */
> -#define ARM_LPAE_PTE_ATTR_HI_MASK	(((arm_lpae_iopte)6) << 52)
> +#define ARM_LPAE_PTE_ATTR_HI_MASK	(((arm_lpae_iopte)0xd) << 51)

Perhaps this is easier to understand:

#define ARM_LPAE_PTE_ATTR_HI_MASK	(ARM_LPAE_PTE_XN | ARM_LPAE_PTE_DBM)

>  #define ARM_LPAE_PTE_ATTR_MASK		(ARM_LPAE_PTE_ATTR_LO_MASK |	\
>  					 ARM_LPAE_PTE_ATTR_HI_MASK)
>  /* Software bit for solving coherency races */
> @@ -93,6 +94,9 @@
>  /* Stage-1 PTE */
>  #define ARM_LPAE_PTE_AP_UNPRIV		(((arm_lpae_iopte)1) << 6)
>  #define ARM_LPAE_PTE_AP_RDONLY		(((arm_lpae_iopte)2) << 6)
> +#define ARM_LPAE_PTE_AP_RDONLY_BIT	7

Perhaps:

#define ARM_LPAE_PTE_AP_RDONLY_BIT	7
#define ARM_LPAE_PTE_AP_RDONLY		(((arm_lpae_iopte)1) << ARM_LPAE_PTE_AP_RDONLY_BIT)

> +#define ARM_LPAE_PTE_AP_WRITABLE	(ARM_LPAE_PTE_AP_RDONLY | \
> +					 ARM_LPAE_PTE_DBM)

Perhaps ARM_LPAE_PTE_AP_WRITABLE_CLEAN?


>  #define ARM_LPAE_PTE_ATTRINDX_SHIFT	2
>  #define ARM_LPAE_PTE_nG			(((arm_lpae_iopte)1) << 11)
>  
> @@ -729,6 +733,127 @@ static phys_addr_t arm_lpae_iova_to_phys(struct io_pgtable_ops *ops,
>  	return iopte_to_paddr(pte, data) | iova;
>  }
>  
> +struct arm_lpae_iopte_read_dirty {
> +	unsigned long flags;
> +	struct iommu_dirty_bitmap *dirty;
> +};
> +
> +typedef int (*io_pgtable_visit_fn_t)(unsigned long iova, size_t size,
> +				     arm_lpae_iopte *ptep, void *opaque);
> +struct io_pgtable_walk_data {
> +	const io_pgtable_visit_fn_t	fn;
> +	void				*opaque;
> +	u64				addr;
> +	const u64			end;
> +};
> +
> +static int __arm_lpae_iopte_walk(struct arm_lpae_io_pgtable *data,
> +				 struct io_pgtable_walk_data *walk_data,
> +				 arm_lpae_iopte *ptep,
> +				 int lvl);
> +
> +static int __arm_lpae_read_and_clear_dirty(unsigned long iova, size_t size,
> +					   arm_lpae_iopte *ptep, void *opaque)
> +{
> +	struct arm_lpae_iopte_read_dirty *arg = opaque;
> +	struct iommu_dirty_bitmap *dirty = arg->dirty;
> +	arm_lpae_iopte pte;
> +
> +	pte = READ_ONCE(*ptep);
> +	if (WARN_ON(!pte))
> +		return -EINVAL;

You've already read the pte and checked it's not zero in io_pgtable_visit(). Either the walker is considered generic and probably shouldn't care if the pte is 0 (so just do the check here), or the walker is specific to this use case, in which case there is no need for the function pointer callback inefficiencies (and you've already READ_ONCE()d the ptep, so no need to repeat it; multiple ptep_get()s do make a meaningful performance impact in the core-mm).
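If you do keep the callback, one option (sketch only; the signature change is illustrative) is to pass along the value you've already read, e.g.:

typedef int (*io_pgtable_visit_fn_t)(unsigned long iova, size_t size,
				     arm_lpae_iopte *ptep, arm_lpae_iopte pte,
				     void *opaque);

Then io_pgtable_visit() hands over the pte it READ_ONCE()d, and the callback can drop its own READ_ONCE()/WARN_ON (it still needs ptep for the set_bit()).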

> +
> +	if ((pte & ARM_LPAE_PTE_AP_WRITABLE) == ARM_LPAE_PTE_AP_WRITABLE)
> +		return 0;

What about RO ptes? This check only returns early for writable-clean ptes, so there are 2 remaining possibilities: writable-dirty and read-only. As written, a read-only pte falls through and gets reported as dirty. Suggest:

if (pte_hw_dirty(pte)) {
	iommu_dirty_bitmap_record(dirty, iova, size);
	if (!(arg->flags & IOMMU_DIRTY_NO_CLEAR))
		set_bit(ARM_LPAE_PTE_AP_RDONLY_BIT, (unsigned long *)ptep);
}

> +
> +	iommu_dirty_bitmap_record(dirty, iova, size);
> +	if (!(arg->flags & IOMMU_DIRTY_NO_CLEAR))
> +		set_bit(ARM_LPAE_PTE_AP_RDONLY_BIT, (unsigned long *)ptep);
> +	return 0;
> +}
> +
> +static int io_pgtable_visit(struct arm_lpae_io_pgtable *data,
> +			    struct io_pgtable_walk_data *walk_data,
> +			    arm_lpae_iopte *ptep, int lvl)
> +{
> +	struct io_pgtable *iop = &data->iop;
> +	arm_lpae_iopte pte = READ_ONCE(*ptep);
> +
> +	if (WARN_ON(!pte))
> +		return -EINVAL;
> +
> +	if (iopte_leaf(pte, lvl, iop->fmt)) {
> +		size_t size = ARM_LPAE_BLOCK_SIZE(lvl, data);
> +
> +		walk_data->fn(walk_data->addr, size, ptep, walk_data->opaque);
> +		walk_data->addr += size;
> +		return 0;
> +	}
> +
> +	ptep = iopte_deref(pte, data);
> +	return __arm_lpae_iopte_walk(data, walk_data, ptep, lvl + 1);
> +}
> +
> +static int __arm_lpae_iopte_walk(struct arm_lpae_io_pgtable *data,
> +				 struct io_pgtable_walk_data *walk_data,
> +				 arm_lpae_iopte *ptep,
> +				 int lvl)
> +{
> +	u32 idx;
> +	int max_entries, ret;
> +
> +	if (WARN_ON(lvl == ARM_LPAE_MAX_LEVELS))
> +		return -EINVAL;
> +
> +	if (lvl == data->start_level)
> +		max_entries = ARM_LPAE_PGD_SIZE(data) / sizeof(arm_lpae_iopte);
> +	else
> +		max_entries = ARM_LPAE_PTES_PER_TABLE(data);
> +
> +	for (idx = ARM_LPAE_LVL_IDX(walk_data->addr, lvl, data);
> +	     (idx < max_entries) && (walk_data->addr < walk_data->end); ++idx) {
> +		arm_lpae_iopte *pteref = &ptep[idx];

nit: Personally I would get rid of this and just pass `ptep + idx` to io_pgtable_visit()
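i.e. something like (sketch):

	for (idx = ARM_LPAE_LVL_IDX(walk_data->addr, lvl, data);
	     (idx < max_entries) && (walk_data->addr < walk_data->end); ++idx) {
		ret = io_pgtable_visit(data, walk_data, ptep + idx, lvl);
		if (ret)
			return ret;
	}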

> +
> +		ret = io_pgtable_visit(data, walk_data, pteref, lvl);
> +		if (ret)
> +			return ret;
> +	}
> +
> +	return 0;
> +}
> +
> +static int arm_lpae_read_and_clear_dirty(struct io_pgtable_ops *ops,
> +					 unsigned long iova, size_t size,
> +					 unsigned long flags,
> +					 struct iommu_dirty_bitmap *dirty)
> +{
> +	struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops);
> +	struct io_pgtable_cfg *cfg = &data->iop.cfg;
> +	struct arm_lpae_iopte_read_dirty arg = {
> +		.flags = flags, .dirty = dirty,
> +	};
> +	struct io_pgtable_walk_data walk_data = {
> +		.fn = __arm_lpae_read_and_clear_dirty,
> +		.opaque = &arg,

Do you have plans to reuse the walker? If not, then I wonder whether separating out the argument and callback function is valuable. It will certainly be more efficient without the per-pte indirect call.
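E.g. (sketch only, if the walker ends up specific to dirty tracking) the walk data could just carry the arguments and io_pgtable_visit() could call the handler directly instead of going through a function pointer:

struct io_pgtable_walk_data {
	struct iommu_dirty_bitmap	*dirty;
	unsigned long			flags;
	u64				addr;
	const u64			end;
};

with io_pgtable_visit() calling __arm_lpae_read_and_clear_dirty() on each leaf it finds.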

> +		.addr = iova,
> +		.end = iova + size - 1,

Bug?: You are initializing end to be inclusive (i.e. last valid address in range). But __arm_lpae_iopte_walk() is using it as exclusive (i.e. first address after range) - see "walk_data->addr < walk_data->end".

mm code usually treats end as exclusive, so suggest removing the "- 1".
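i.e. (sketch):

		.end = iova + size,

which then matches the exclusive "walk_data->addr < walk_data->end" test.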

> +	};
> +	arm_lpae_iopte *ptep = data->pgd;
> +	int lvl = data->start_level;
> +	long iaext = (s64)iova >> cfg->ias;

Why cast this to signed? And why is the cast s64 and the result long? Suggest they should both be consistent at least. But probably clearer to just do:

WARN_ON((iova + size - 1) & ~(BIT(cfg->ias) - 1))

in the below if()? That way you are checking the full range too.
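i.e. something like (sketch, replacing the iaext check below):

	if (WARN_ON((iova + size - 1) & ~(BIT(cfg->ias) - 1)))
		return -EINVAL;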

> +
> +	if (WARN_ON(!size))
> +		return -EINVAL;
> +
> +	if (WARN_ON(iaext))
> +		return -EINVAL;
> +
> +	if (data->iop.fmt != ARM_64_LPAE_S1)
> +		return -EINVAL;
> +
> +	return __arm_lpae_iopte_walk(data, &walk_data, ptep, lvl);
> +}
> +
>  static void arm_lpae_restrict_pgsizes(struct io_pgtable_cfg *cfg)
>  {
>  	unsigned long granule, page_sizes;
> @@ -807,6 +932,7 @@ arm_lpae_alloc_pgtable(struct io_pgtable_cfg *cfg)
>  		.map_pages	= arm_lpae_map_pages,
>  		.unmap_pages	= arm_lpae_unmap_pages,
>  		.iova_to_phys	= arm_lpae_iova_to_phys,
> +		.read_and_clear_dirty = arm_lpae_read_and_clear_dirty,
>  	};
>  
>  	return data;



