[PATCH RFC RESEND 5/6] iommu/riscv: support nested iommu for creating domains owned by userspace

Jason Gunthorpe jgg at ziepe.ca
Tue May 7 08:07:08 PDT 2024


On Tue, May 07, 2024 at 10:25:59PM +0800, Zong Li wrote:
> This patch implements .domain_alloc_user operation for creating domains
> owend by userspace, e.g. through IOMMUFD. Add s2 domain for parent
> domain for second stage, s1 domain will be the first stage.
> 
> Don't remove IOMMU private data of dev in blocked domain, because it
> holds the user data of device, which is used when attaching device into
> s1 domain.
> 
> Signed-off-by: Zong Li <zong.li at sifive.com>
> ---
>  drivers/iommu/riscv/iommu.c  | 227 ++++++++++++++++++++++++++++++++++-
>  include/uapi/linux/iommufd.h |  17 +++
>  2 files changed, 243 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/iommu/riscv/iommu.c b/drivers/iommu/riscv/iommu.c
> index 072251f6ad85..7eda850df475 100644
> --- a/drivers/iommu/riscv/iommu.c
> +++ b/drivers/iommu/riscv/iommu.c
> @@ -827,6 +827,7 @@ static int riscv_iommu_iodir_set_mode(struct riscv_iommu_device *iommu,
>  
>  /* This struct contains protection domain specific IOMMU driver data. */
>  struct riscv_iommu_domain {
> +	struct riscv_iommu_domain *s2;
>  	struct iommu_domain domain;
>  	struct list_head bonds;
>  	spinlock_t lock;		/* protect bonds list updates. */
> @@ -844,6 +845,7 @@ struct riscv_iommu_domain {
>  /* Private IOMMU data for managed devices, dev_iommu_priv_* */
>  struct riscv_iommu_info {
>  	struct riscv_iommu_domain *domain;
> +	struct riscv_iommu_dc dc_user;
>  };
>  
>  /* Linkage between an iommu_domain and attached devices. */
> @@ -1454,7 +1456,6 @@ static int riscv_iommu_attach_blocking_domain(struct iommu_domain *iommu_domain,
>  
>  	riscv_iommu_iodir_update(iommu, dev, RISCV_IOMMU_FSC_BARE, 0, 0);
>  	riscv_iommu_bond_unlink(info->domain, dev);
> -	info->domain = NULL;

Nope, whatever is going on here, this can't be correct.

Once a domain is detached the driver must drop all references to it
immediately.

>  	return 0;
>  }
> @@ -1486,6 +1487,229 @@ static struct iommu_domain riscv_iommu_identity_domain = {
>  	}
>  };
>  
> +/**
> + * Nested IOMMU operations
> + */
> +
> +static int riscv_iommu_attach_dev_nested(struct iommu_domain *domain, struct device *dev)
> +{
> +	struct riscv_iommu_domain *riscv_domain = iommu_domain_to_riscv(domain);
> +	struct riscv_iommu_device *iommu = dev_to_iommu(dev);
> +	struct riscv_iommu_info *info = dev_iommu_priv_get(dev);
> +
> +	if (riscv_domain->numa_node == NUMA_NO_NODE)
> +		riscv_domain->numa_node = dev_to_node(iommu->dev);

Why? The kernel doesn't do any memory allocation for nested domains,
does it?

> +	riscv_iommu_bond_unlink(info->domain, dev);
> +
> +	if (riscv_iommu_bond_link(riscv_domain, dev))
> +		return -ENOMEM;
> +
> +	riscv_iommu_iotlb_inval(riscv_domain, 0, ULONG_MAX);

Why? The cache tags should be in good shape before they are assigned
to the device. Wiping it on every attach is just needless flushing
that isn't useful.

> +	riscv_iommu_iodir_update(iommu, dev, info->dc_user.fsc, info->dc_user.ta,
> +				 info->dc_user.iohgatp);

This isn't ideal. Ideally the driver should make changing from S2 to
S2+S1 and back to be hitless. Wiping the valid bit as part of the
programming is not good.

As I said before, this driver probably needs the programming sequencer
from ARM.

> +	info->domain = riscv_domain;
> +
> +	return 0;
> +}
> +
> +static void riscv_iommu_domain_free_nested(struct iommu_domain *domain)
> +{
> +	struct riscv_iommu_domain *riscv_domain = iommu_domain_to_riscv(domain);
> +
> +	kfree(riscv_domain);
> +}

Doesn't the driver already have this function?
> +static struct iommu_domain *
> +riscv_iommu_domain_alloc_nested(struct device *dev,
> +				struct iommu_domain *parent,
> +				const struct iommu_user_data *user_data)
> +{
> +	struct riscv_iommu_domain *s2_domain = iommu_domain_to_riscv(parent);
> +	struct riscv_iommu_domain *s1_domain;
> +	struct riscv_iommu_device *iommu = dev_to_iommu(dev);
> +	struct iommu_hwpt_riscv_iommu arg;
> +	int ret, va_bits;
> +
> +	if (user_data->type != IOMMU_HWPT_DATA_RISCV_IOMMU)
> +		return ERR_PTR(-EOPNOTSUPP);
> +
> +	if (parent->type != IOMMU_DOMAIN_UNMANAGED)
> +		return ERR_PTR(-EINVAL);
> +
> +	ret = iommu_copy_struct_from_user(&arg,
> +					  user_data,
> +					  IOMMU_HWPT_DATA_RISCV_IOMMU,
> +					  out_event_uptr);
> +	if (ret)
> +		return ERR_PTR(ret);
> +
> +	s1_domain = kzalloc(sizeof(*s1_domain), GFP_KERNEL);
> +	if (!s1_domain)
> +		return ERR_PTR(-ENOMEM);
> +
> +	spin_lock_init(&s1_domain->lock);
> +	INIT_LIST_HEAD_RCU(&s1_domain->bonds);
> +
> +	s1_domain->pscid = ida_alloc_range(&riscv_iommu_pscids, 1,
> +					   RISCV_IOMMU_MAX_PSCID, GFP_KERNEL);
> +	if (s1_domain->pscid < 0) {
> +		iommu_free_page(s1_domain->pgd_root);
> +		kfree(s1_domain);
> +		return ERR_PTR(-ENOMEM);
> +	}
> +
> +	/* Get device context of stage-1 from user*/
> +	ret = riscv_iommu_get_dc_user(dev, &arg);
> +	if (ret) {
> +		kfree(s1_domain);
> +		return ERR_PTR(-EINVAL);
> +	}
> +
> +	if (!iommu) {
> +		va_bits = VA_BITS;
> +	} else if (iommu->caps & RISCV_IOMMU_CAP_S_SV57) {
> +		va_bits = 57;
> +	} else if (iommu->caps & RISCV_IOMMU_CAP_S_SV48) {
> +		va_bits = 48;
> +	} else if (iommu->caps & RISCV_IOMMU_CAP_S_SV39) {
> +		va_bits = 39;
> +	} else {
> +		dev_err(dev, "cannot find supported page table mode\n");
> +		return ERR_PTR(-ENODEV);
> +	}
> +
> +	/*
> +	 * The ops->domain_alloc_user could be directly called by the iommufd core,
> +	 * instead of iommu core. So, this function need to set the default value of
> +	 * following data member:
> +	 *  - domain->pgsize_bitmap
> +	 *  - domain->geometry
> +	 *  - domain->type
> +	 *  - domain->ops
> +	 */
> +	s1_domain->s2 = s2_domain;
> +	s1_domain->domain.type = IOMMU_DOMAIN_NESTED;
> +	s1_domain->domain.ops = &riscv_iommu_nested_domain_ops;

> +	s1_domain->domain.pgsize_bitmap = SZ_4K;
> +	s1_domain->domain.geometry.aperture_start = 0;
> +	s1_domain->domain.geometry.aperture_end = DMA_BIT_MASK(va_bits - 1);
> +	s1_domain->domain.geometry.force_aperture = true;

nested domains don't support paging/map so they don't use any of
this. Don't set it. Will remove the confusing va_bit stuff.


> +static struct iommu_domain *
> +riscv_iommu_domain_alloc_user(struct device *dev, u32 flags,
> +			      struct iommu_domain *parent,
> +			      const struct iommu_user_data *user_data)
> +{
> +	struct iommu_domain *domain;
> +	struct riscv_iommu_domain *riscv_domain;
> +
> +	/* Allocate stage-1 domain if it has stage-2 parent domain */
> +	if (parent)
> +		return riscv_iommu_domain_alloc_nested(dev, parent, user_data);
> +
> +	if (flags & ~((IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING)))
> +		return ERR_PTR(-EOPNOTSUPP);

This if should be moved up to the top and this driver does not support
diry_tracking, so don't test it.

> +	if (user_data)
> +		return ERR_PTR(-EINVAL);

Really? Don't you need to ask for the s2?

> +	/* domain_alloc_user op needs to be fully initialized */
> +	domain = iommu_domain_alloc(dev->bus);

Please no, structure your driver properly so you can call only
internal functions. This was a hack for intel only.

> +	if (!domain)
> +		return ERR_PTR(-ENOMEM);
> +
> +	/*
> +	 * We assume that nest-parent or g-stage only will come here

No... That isn't right. Normal non-virtualization paging domains come
here too.

The default of this function, with an empty user_data, should be to
create the same domain as alloc_domain_paging.


> +	 * TODO: Shadow page table doesn't be supported now.
> +	 *       We currently can't distinguish g-stage and shadow
> +	 *       page table here. Shadow page table shouldn't be
> +	 *       put at stage-2.
> +	 */
> +	riscv_domain = iommu_domain_to_riscv(domain);

What is a shadow page table?

> +	/* pgd_root may be allocated in .domain_alloc_paging */
> +	if (riscv_domain->pgd_root)
> +		iommu_free_page(riscv_domain->pgd_root);

And this is gross, structure your driver right so you don't have to
undo what prior functions did.
 
> +	riscv_domain->pgd_root = iommu_alloc_pages_node(riscv_domain->numa_node,
> +							GFP_KERNEL_ACCOUNT,
> +							2);
> +	if (!riscv_domain->pgd_root)
> +		return ERR_PTR(-ENOMEM);
> +
> +	riscv_domain->gscid = ida_alloc_range(&riscv_iommu_gscids, 1,
> +					      RISCV_IOMMU_MAX_GSCID, GFP_KERNEL);
> +	if (riscv_domain->gscid < 0) {
> +		iommu_free_pages(riscv_domain->pgd_root, 2);
> +		kfree(riscv_domain);
> +		return ERR_PTR(-ENOMEM);
> +	}
> +
> +	return domain;
> +}
> +
>  static void *riscv_iommu_hw_info(struct device *dev, u32 *length, u32 *type)
>  {
>  	struct riscv_iommu_device *iommu = dev_to_iommu(dev);
> @@ -1587,6 +1811,7 @@ static const struct iommu_ops riscv_iommu_ops = {
>  	.blocked_domain = &riscv_iommu_blocking_domain,
>  	.release_domain = &riscv_iommu_blocking_domain,
>  	.domain_alloc_paging = riscv_iommu_alloc_paging_domain,
> +	.domain_alloc_user = riscv_iommu_domain_alloc_user,
>  	.def_domain_type = riscv_iommu_device_domain_type,
>  	.device_group = riscv_iommu_device_group,
>  	.probe_device = riscv_iommu_probe_device,
> diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
> index ec9aafd7d373..e10b6e236647 100644
> --- a/include/uapi/linux/iommufd.h
> +++ b/include/uapi/linux/iommufd.h
> @@ -390,14 +390,31 @@ struct iommu_hwpt_vtd_s1 {
>  	__u32 __reserved;
>  };
>  
> +/**
> + * struct iommu_hwpt_riscv_iommu - RISCV IOMMU stage-1 device context table
> + *                                 info (IOMMU_HWPT_TYPE_RISCV_IOMMU)
> + * @dc_len: Length of device context
> + * @dc_uptr: User pointer to the address of device context
> + * @event_len: Length of an event record
> + * @out_event_uptr: User pointer to the address of event record
> + */
> +struct iommu_hwpt_riscv_iommu {
> +	__aligned_u64 dc_len;
> +	__aligned_u64 dc_uptr;

So far other drivers have been inlining their "device context" in this
struct, why do you need a pointer and length?

> +	__aligned_u64 event_len;
> +	__aligned_u64 out_event_uptr;

Weird? Why?

Jason



More information about the linux-riscv mailing list