[PATCH 5/5] iommu/io-pgtable: Add ARMv7 short descriptor support

Yong Wu yong.wu at mediatek.com
Tue Dec 8 00:58:11 PST 2015


Hi Robin,

   Thanks very much for your rewrite; it looks much cleaner.
   
   This works well in my selftest, and I will push it to our Chrome
branch and ask our video/display teams to test it further.

   Only a few small comments below.

On Fri, 2015-12-04 at 17:53 +0000, Robin Murphy wrote:
> Add a nearly-complete ARMv7 short descriptor implementation, omitting
> only a few legacy and CPU-centric aspects which shouldn't be necessary
> for IOMMU API use anyway.
> 
> Signed-off-by: Yong Wu <yong.wu at mediatek.com>
> Signed-off-by: Robin Murphy <robin.murphy at arm.com>
> ---
[...]
> +/* PTE type bits: these are all mixed up with XN/PXN bits in most cases */
> +#define ARM_V7S_PTE_TYPE_TABLE		0x1
> +#define ARM_V7S_PTE_TYPE_PAGE		0x2
> +#define ARM_V7S_PTE_TYPE_CONT_PAGE	0x1

From the spec, this is the Large page descriptor type. Do we need to
add a /* Large page */ comment here for readability, and a
/* Supersection */ comment for ARM_V7S_CONT_SECTION too?
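i.e. something like (values taken from the defines quoted here):

#define ARM_V7S_PTE_TYPE_CONT_PAGE	0x1	/* Large page */
#define ARM_V7S_CONT_SECTION		BIT(18)	/* Supersection */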

> +
> +#define ARM_V7S_PTE_IS_VALID(pte)	(((pte) & 0x3) != 0)
> +#define ARM_V7S_PTE_IS_TABLE(pte, lvl)	(lvl < 2 && ((pte) & ARM_V7S_PTE_TYPE_TABLE))
> +
> +/* Page table bits */
> +#define ARM_V7S_ATTR_XN(lvl)		BIT(4 * (2 - (lvl)))
> +#define ARM_V7S_ATTR_B			BIT(2)
> +#define ARM_V7S_ATTR_C			BIT(3)
> +#define ARM_V7S_ATTR_NS_TABLE		BIT(3)
> +#define ARM_V7S_ATTR_NS_SECTION		BIT(19)
> +
> +#define ARM_V7S_CONT_SECTION		BIT(18)
> +#define ARM_V7S_CONT_PAGE_XN_SHIFT	15
> +
[...]
> +static void *__arm_v7s_alloc_table(int lvl, gfp_t gfp,
> +				   struct arm_v7s_io_pgtable *data)
> +{
> +	struct device *dev = data->iop.cfg.iommu_dev;
> +	dma_addr_t dma;
> +	size_t size = ARM_V7S_TABLE_SIZE(lvl);
> +	void *table = NULL;
> +
> +	if (lvl == 1)
> +		table = (void *)__get_dma_pages(__GFP_ZERO, get_order(size));
> +	else if (lvl == 2)
> +		table = kmem_cache_zalloc(data->l2_tables, gfp);
> +	if (table && !selftest_running) {
> +		dma = dma_map_single(dev, table, size, DMA_TO_DEVICE);
> +		if (dma_mapping_error(dev, dma))
> +			goto out_free;
> +		/*
> +		 * We depend on the IOMMU being able to work with any physical
> +		 * address directly, so if the DMA layer suggests otherwise by
> +		 * translating or truncating them, that bodes very badly...
> +		 */
> +		if (dma != virt_to_phys(table))
> +			goto out_unmap;
> +	}

There is something special when we use kmem_cache here: we store the
physical address of the table in the pagetable, then recover the
virtual address via phys_to_virt when we free it. That differs from the
normal case of storing the VA and freeing the VA, so kmemleak cannot
track these objects. Do we need to add kmemleak_ignore() here?
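A rough, untested sketch of what I mean (assuming <linux/kmemleak.h>
is included), right after the level-2 allocation:

	else if (lvl == 2) {
		table = kmem_cache_zalloc(data->l2_tables, gfp);
		/*
		 * The only reference to this table will be the physical
		 * address stored in the L1 entry, which kmemleak cannot
		 * see, so stop it reporting these objects as leaks.
		 */
		if (table)
			kmemleak_ignore(table);
	}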

> +	return table;
> +
> +out_unmap:
> +	dev_err(dev, "Cannot accommodate DMA translation for IOMMU page tables\n");
> +	dma_unmap_single(dev, dma, size, DMA_TO_DEVICE);
> +out_free:
> +	if (lvl == 1)
> +		free_pages((unsigned long)table, get_order(size));
> +	else
> +		kmem_cache_free(data->l2_tables, table);
> +	return NULL;
> +}
> +
[...]
> +static int arm_v7s_init_pte(struct arm_v7s_io_pgtable *data,
> +			    unsigned long iova, phys_addr_t paddr, int prot,
> +			    int lvl, int num_entries, arm_v7s_iopte *ptep)
> +{
> +	struct io_pgtable_cfg *cfg = &data->iop.cfg;
> +	arm_v7s_iopte pte = arm_v7s_prot_to_pte(prot, lvl, cfg);
> +	int i;
> +
> +	for (i = 0; i < num_entries; i++)
> +		if (ARM_V7S_PTE_IS_TABLE(ptep[i], lvl)) {
> +			/*
> +			 * We need to unmap and free the old table before
> +			 * overwriting it with a block entry.
> +			 */
> +			arm_v7s_iopte *tblp;
> +			size_t sz = ARM_V7S_BLOCK_SIZE(lvl);
> +
> +			tblp = ptep + i - ARM_V7S_LVL_IDX(iova, lvl);
> +			if (WARN_ON(__arm_v7s_unmap(data, iova, sz, lvl, tblp)
> +					!= sz))
> +				return -EINVAL;

This looks like it comes from Will's "iommu/io-pgtable-arm: Unmap and
free table when overwriting with block".

But if we have IO_PGTABLE_QUIRK_TLBI_ON_MAP, condition (1) in that
commit's comment cannot occur. So we don't need to care whether the
existing entry is a pagetable or not; could we always return -EEXIST
here?
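i.e. a rough, untested sketch of the suggestion, at the top of the
loop body:

	if (cfg->quirks & IO_PGTABLE_QUIRK_TLBI_ON_MAP && ptep[i]) {
		/*
		 * With TLBI_ON_MAP no partial walks can stay cached,
		 * so any live entry simply means a missing unmap.
		 */
		WARN_ON(!selftest_running);
		return -EEXIST;
	}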

> +		} else if (ptep[i]) {
> +			/* We require an unmap first */
> +			WARN_ON(!selftest_running);
> +			return -EEXIST;
> +		}
> +

[...]

> +static int __arm_v7s_unmap(struct arm_v7s_io_pgtable *data,
> +			    unsigned long iova, size_t size, int lvl,
> +			    arm_v7s_iopte *ptep)
> +{
> +	arm_v7s_iopte pte[ARM_V7S_CONT_PAGES];
> +	struct io_pgtable_cfg *cfg = &data->iop.cfg;
> +	const struct iommu_gather_ops *tlb = cfg->tlb;
> +	void *cookie = data->iop.cookie;
> +	int idx, i = 0, num_entries = size >> ARM_V7S_LVL_SHIFT(lvl);
> +
> +	/* Something went horribly wrong and we ran out of page table */
> +	if (WARN_ON(lvl > 2))
> +		return 0;
> +
> +	idx = ARM_V7S_LVL_IDX(iova, lvl);
> +	ptep += idx;
> +	do {
> +		if (WARN_ON(!ARM_V7S_PTE_IS_VALID(ptep[i])))
> +			return 0;
> +		pte[i] = ptep[i];
> +	} while (++i < num_entries);
> +
> +	/*
> +	 * If we've hit a contiguous 'large page' entry at this level, it
> +	 * needs splitting first, unless we're unmapping the whole lot.
> +	 */
> +	if (num_entries <= 1 && arm_v7s_pte_is_cont(pte[0], lvl))
> +		arm_v7s_split_cont(data, iova, idx, lvl, ptep);
> +
> +	/* If the size matches this level, we're in the right place */
> +	if (num_entries) {
> +		size_t blk_size = ARM_V7S_BLOCK_SIZE(lvl);
> +
> +		__arm_v7s_set_pte(ptep, 0, num_entries, cfg);
> +
> +		for (i = 0; i < num_entries; i++) {
> +			if (ARM_V7S_PTE_IS_TABLE(pte[i], lvl)) {
> +				/* Also flush any partial walks */
> +				tlb->tlb_add_flush(iova, blk_size,
> +						   ARM_V7S_BLOCK_SIZE(2),
> +						   false, cookie);
> +				tlb->tlb_sync(cookie);

The MTK IOMMU driver hits a warning here in my test.

There is a tlb_sync here, and another one in arm_v7s_unmap, so the flow
is:

 tlb->tlb_add_flush(xxx)
 tlb->tlb_sync()
 tlb->tlb_sync()


In the MTK tlb_sync implementation, the code is:

static void mtk_iommu_tlb_sync(void *cookie)
{
	struct mtk_iommu_data *data = cookie;
	int ret;
	u32 tmp;

	ret = readl_poll_timeout_atomic(data->base + REG_MMU_CPE_DONE, tmp,
					tmp != 0, 10, 100000);
	if (ret) {
		dev_warn(data->dev,
			 "Partial TLB flush timed out, falling back to full flush\n");
		mtk_iommu_tlb_flush_all(cookie);
	}
	/* Clear the CPE status */
	writel_relaxed(0, data->base + REG_MMU_CPE_DONE);
}

At the end of this function, we have to write 0 to clear the CPE status
so that the HW can report the status correctly next time.

So if we call tlb_sync twice in a row, the second call times out and we
see this log:

Partial TLB flush timed out, falling back to full flush

I don't know whether this is special behaviour of our HW; is this
back-to-back sync sequence OK in arm-smmu.c? Is there any suggestion
for handling it?
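One possible workaround on our side might be to skip the wait when no
flush has actually been queued since the last sync. A rough sketch,
where tlb_flush_active is a hypothetical new flag in struct
mtk_iommu_data that would be set in our tlb_add_flush callback:

static void mtk_iommu_tlb_sync(void *cookie)
{
	struct mtk_iommu_data *data = cookie;
	int ret;
	u32 tmp;

	/* Nothing queued since the last sync, so nothing to wait for */
	if (!data->tlb_flush_active)
		return;

	ret = readl_poll_timeout_atomic(data->base + REG_MMU_CPE_DONE, tmp,
					tmp != 0, 10, 100000);
	if (ret) {
		dev_warn(data->dev,
			 "Partial TLB flush timed out, falling back to full flush\n");
		mtk_iommu_tlb_flush_all(cookie);
	}
	/* Clear the CPE status */
	writel_relaxed(0, data->base + REG_MMU_CPE_DONE);
	data->tlb_flush_active = false;
}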


> +				ptep = iopte_deref(pte[i], lvl);
> +				__arm_v7s_free_table(ptep, lvl + 1, data);
> +			} else {
> +				tlb->tlb_add_flush(iova, blk_size, blk_size,
> +						   true, cookie);
> +			}
> +			iova += blk_size;
> +		}
> +		return size;
> +	} else if (lvl == 1 && !ARM_V7S_PTE_IS_TABLE(pte[0], lvl)) {
> +		/*
> +		 * Insert a table at the next level to map the old region,
> +		 * minus the part we want to unmap
> +		 */
> +		return arm_v7s_split_blk_unmap(data, iova, size, ptep);
> +	}
> +
> +	/* Keep on walkin' */
> +	ptep = iopte_deref(pte[0], lvl);
> +	return __arm_v7s_unmap(data, iova, size, lvl + 1, ptep);
> +}
> +
> +static int arm_v7s_unmap(struct io_pgtable_ops *ops, unsigned long iova,
> +			 size_t size)
> +{
> +	size_t unmapped;
> +	struct arm_v7s_io_pgtable *data = io_pgtable_ops_to_data(ops);
> +	struct io_pgtable *iop = &data->iop;
> +
> +	unmapped = __arm_v7s_unmap(data, iova, size, 1, data->pgd);
> +	if (unmapped)
> +		iop->cfg.tlb->tlb_sync(iop->cookie);
> +
> +	return unmapped;
> +}
> +
> +static phys_addr_t arm_v7s_iova_to_phys(struct io_pgtable_ops *ops,
> +					unsigned long iova)
> +{
> +	struct arm_v7s_io_pgtable *data = io_pgtable_ops_to_data(ops);
> +	arm_v7s_iopte *ptep = data->pgd, pte = ARM_V7S_PTE_TYPE_TABLE;
> +	int lvl = 0;
> +	u32 mask;
> +
> +	while (ARM_V7S_PTE_IS_TABLE(pte, lvl)) {
> +		pte = ptep[ARM_V7S_LVL_IDX(iova, ++lvl)];
> +		ptep = iopte_deref(pte, lvl);
> +	}

Since we always want to enter this loop at least once, could we use a
do {} while here instead? Then we wouldn't need to initialize pte.
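i.e. something like:

	arm_v7s_iopte *ptep = data->pgd, pte;
	int lvl = 0;

	do {
		pte = ptep[ARM_V7S_LVL_IDX(iova, ++lvl)];
		ptep = iopte_deref(pte, lvl);
	} while (ARM_V7S_PTE_IS_TABLE(pte, lvl));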

Also, in this file the valid lvl values are 1 and 2, but here "lvl" is
initialized to 0. Should we add an enum for the levels, to be safer and
more readable?
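e.g. (hypothetical names):

	enum arm_v7s_lvl {
		ARM_V7S_LVL_1 = 1,
		ARM_V7S_LVL_2,
	};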

> +	if (!ARM_V7S_PTE_IS_VALID(pte))
> +		return 0;
> +
> +	mask = ARM_V7S_LVL_MASK(lvl);
> +	if (arm_v7s_pte_is_cont(pte, lvl))
> +		mask *= ARM_V7S_CONT_PAGES;
> +	return (pte & mask) | (iova & ~mask);
> +}
> +
[...]




