[PATCH v9 5/6] iommu/dma: Allow a single FQ in addition to per-CPU FQs

Tue May 23 05:02:47 PDT 2023

On Mon, 2023-05-22 at 17:26 +0100, Robin Murphy wrote:
> On 2023-05-15 10:15, Niklas Schnelle wrote:
> > In some virtualized environments, including s390 paged memory guests,
> > IOTLB flushes are used to update IOMMU shadow tables. Due to this, they
> > are much more expensive than in typical bare metal environments or
> > non-paged s390 guests. In addition they may parallelize more poorly in
> > virtualized environments. This changes the trade off for flushing IOVAs
> > such that minimizing the number of IOTLB flushes trumps any benefit of
> > cheaper queuing operations or increased paralellism.
> > 
> > In this scenario per-CPU flush queues pose several problems. Firstly
> > per-CPU memory is often quite limited prohibiting larger queues.
> > Secondly collecting IOVAs per-CPU but flushing via a global timeout
> > reduces the number of IOVAs flushed for each timeout especially on s390
> > where PCI interrupts may not be bound to a specific CPU.
> > 
> > Let's introduce a single flush queue mode that reuses the same queue
> > logic but only allocates a single global queue. This mode can be
> > selected as a flag bit in a new dma_iommu_options struct which can be
> > modified from its defaults by IOMMU drivers implementing a new
> > ops.tune_dma_iommu() callback. As a first user the s390 IOMMU driver
> > selects the single queue mode if IOTLB flushes are needed on map which
> > indicates shadow table use. With the unchanged small FQ size and
> > timeouts this setting is worse than per-CPU queues but a follow up patch
> > will make the FQ size and timeout variable. Together this allows the
> > common IOVA flushing code to more closely resemble the global flush
> > behavior used on s390's previous internal DMA API implementation.
> > 
> > Link: https://lore.kernel.org/linux-iommu/3e402947-61f9-b7e8-1414-fde006257b6f@arm.com/
> > Reviewed-by: Matthew Rosato <mjrosato at linux.ibm.com> #s390
> > Signed-off-by: Niklas Schnelle <schnelle at linux.ibm.com>
> > ---
> >   drivers/iommu/dma-iommu.c  | 163 ++++++++++++++++++++++++++++++++++-----------
> >   drivers/iommu/dma-iommu.h  |   4 +-
> >   drivers/iommu/iommu.c      |  18 +++--
> >   drivers/iommu/s390-iommu.c |  10 +++
> >   include/linux/iommu.h      |  21 ++++++
> >   5 files changed, 169 insertions(+), 47 deletions(-)
> > 
> > diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
> > index 7a9f0b0bddbd..be4cab6b4fe4 100644
> > --- a/drivers/iommu/dma-iommu.c
> > +++ b/drivers/iommu/dma-iommu.c
> > @@ -49,8 +49,11 @@ struct iommu_dma_cookie {
> >   		/* Full allocator for IOMMU_DMA_IOVA_COOKIE */
> >   		struct {
> >   			struct iova_domain	iovad;
> > -
> > -			struct iova_fq __percpu *fq;	/* Flush queue */
> > +			/* Flush queue */
> > +			union {
> > +				struct iova_fq	*single_fq;
> > +				struct iova_fq	__percpu *percpu_fq;
> > +			};
> >   			/* Number of TLB flushes that have been started */
> >   			atomic64_t		fq_flush_start_cnt;
> >   			/* Number of TLB flushes that have been finished */
> > @@ -67,6 +70,8 @@ struct iommu_dma_cookie {
> >   
> >   	/* Domain for flush queue callback; NULL if flush queue not in use */
> >   	struct iommu_domain		*fq_domain;
> > +	/* Options for dma-iommu use */
> > +	struct dma_iommu_options	options;
> >   	struct mutex			mutex;
> >   };
> >   
> > @@ -152,25 +157,44 @@ static void fq_flush_iotlb(struct iommu_dma_cookie *cookie)
> >   	atomic64_inc(&cookie->fq_flush_finish_cnt);
> >   }
> >   
> > -static void fq_flush_timeout(struct timer_list *t)
> > +static void fq_flush_percpu(struct iommu_dma_cookie *cookie)
> >   {
> > -	struct iommu_dma_cookie *cookie = from_timer(cookie, t, fq_timer);
> >   	int cpu;
> >   
> > -	atomic_set(&cookie->fq_timer_on, 0);
> > -	fq_flush_iotlb(cookie);
> > -
> >   	for_each_possible_cpu(cpu) {
> >   		unsigned long flags;
> >   		struct iova_fq *fq;
> >   
> > -		fq = per_cpu_ptr(cookie->fq, cpu);
> > +		fq = per_cpu_ptr(cookie->percpu_fq, cpu);
> >   		spin_lock_irqsave(&fq->lock, flags);
> >   		fq_ring_free(cookie, fq);
> >   		spin_unlock_irqrestore(&fq->lock, flags);
> >   	}
> >   }
> >   
> > +static void fq_flush_single(struct iommu_dma_cookie *cookie)
> > +{
> > +	struct iova_fq *fq = cookie->single_fq;
> > +	unsigned long flags;
> > +
> > +	spin_lock_irqsave(&fq->lock, flags);
> > +	fq_ring_free(cookie, fq);
> > +	spin_unlock_irqrestore(&fq->lock, flags)
> 
> Nit: this should clearly just be a self-locked version of fq_ring_free() 
> that takes fq as an argument, then both the new case and the existing 
> loop body become trivial one-line calls.

Sure will do. Just one question about names. As an example
pci_reset_function_locked() means that the relevant lock is already
taken with pci_reset_function() adding the lock/unlock. In your wording
the implied function names sound the other way around. I can't find
anything similar in drivers/iommu so would you mind going the PCI way
and having:

fq_ring_free_locked(): Called in queue_iova() with the lock held
fr_ring_free(): Called in fq_flush_timeout() takes the lock itself

Or maybe I'm just biased because I've used the PCI ..locked() functions
before and there is a better convention.

> 
> > +}
> > +
> > +static void fq_flush_timeout(struct timer_list *t)
> > +{
> > +	struct iommu_dma_cookie *cookie = from_timer(cookie, t, fq_timer);
> > +
> > +	atomic_set(&cookie->fq_timer_on, 0);
> > +	fq_flush_iotlb(cookie);
> > +
> > +	if (cookie->options.flags & IOMMU_DMA_OPTS_SINGLE_QUEUE)
> > +		fq_flush_single(cookie);
> > +	else
> > +		fq_flush_percpu(cookie);
> > +}
> > +
> >   static void queue_iova(struct iommu_dma_cookie *cookie,
> >   		unsigned long pfn, unsigned long pages,
> >   		struct list_head *freelist)
> > @@ -188,7 +212,11 @@ static void queue_iova(struct iommu_dma_cookie *cookie,
> >   	 */
> >   	smp_mb();
> >   
> > -	fq = raw_cpu_ptr(cookie->fq);
> > +	if (cookie->options.flags & IOMMU_DMA_OPTS_SINGLE_QUEUE)
> > +		fq = cookie->single_fq;
> > +	else
> > +		fq = raw_cpu_ptr(cookie->percpu_fq);
> > +
> >   	spin_lock_irqsave(&fq->lock, flags);
> >   
> >   	/*
> > @@ -219,58 +247,114 @@ static void queue_iova(struct iommu_dma_cookie *cookie,
> >   			  jiffies + msecs_to_jiffies(IOVA_FQ_TIMEOUT));
> >   }
> >   
> > -static void iommu_dma_free_fq(struct iommu_dma_cookie *cookie)
> > +static void iommu_dma_free_fq_single(struct iova_fq *fq)
> > +{
> > +	int idx;
> > +
> > +	if (!fq)
> > +		return;
> > +	fq_ring_for_each(idx, fq)
> > +		put_pages_list(&fq->entries[idx].freelist);
> > +	vfree(fq);
> > +}
> > +
> > +static void iommu_dma_free_fq_percpu(struct iova_fq __percpu *percpu_fq)
> >   {
> >   	int cpu, idx;
> >   
> > -	if (!cookie->fq)
> > -		return;
> > -
> > -	del_timer_sync(&cookie->fq_timer);
> >   	/* The IOVAs will be torn down separately, so just free our queued pages */
> >   	for_each_possible_cpu(cpu) {
> > -		struct iova_fq *fq = per_cpu_ptr(cookie->fq, cpu);
> > +		struct iova_fq *fq = per_cpu_ptr(percpu_fq, cpu);
> >   
> >   		fq_ring_for_each(idx, fq)
> >   			put_pages_list(&fq->entries[idx].freelist);
> >   	}
> >   
> > -	free_percpu(cookie->fq);
> > +	free_percpu(percpu_fq);
> > +}
> > +
> > +static void iommu_dma_free_fq(struct iommu_dma_cookie *cookie)
> > +{
> > +	if (!cookie->fq_domain)
> > +		return;
> > +
> > +	del_timer_sync(&cookie->fq_timer);
> > +	if (cookie->options.flags & IOMMU_DMA_OPTS_SINGLE_QUEUE)
> > +		iommu_dma_free_fq_single(cookie->single_fq);
> > +	else
> > +		iommu_dma_free_fq_percpu(cookie->percpu_fq);
> > +}
> > +
> > +
> > +static void iommu_dma_init_one_fq(struct iova_fq *fq)
> > +{
> > +	int i;
> > +
> > +	fq->head = 0;
> > +	fq->tail = 0;
> > +
> > +	spin_lock_init(&fq->lock);
> > +
> > +	for (i = 0; i < IOVA_FQ_SIZE; i++)
> > +		INIT_LIST_HEAD(&fq->entries[i].freelist);
> > +}
> > +
> > +static int iommu_dma_init_fq_single(struct iommu_dma_cookie *cookie)
> > +{
> > +	struct iova_fq *queue;
> > +
> > +	queue = vzalloc(sizeof(*queue));
> > +	if (!queue)
> > +		return -ENOMEM;
> > +	iommu_dma_init_one_fq(queue);
> > +	cookie->single_fq = queue;
> > +
> > +	return 0;
> > +}
> > +
> > +static int iommu_dma_init_fq_percpu(struct iommu_dma_cookie *cookie)
> > +{
> > +	struct iova_fq __percpu *queue;
> > +	int cpu;
> > +
> > +	queue = alloc_percpu(struct iova_fq);
> > +	if (!queue)
> > +		return -ENOMEM;
> > +
> > +	for_each_possible_cpu(cpu)
> > +		iommu_dma_init_one_fq(per_cpu_ptr(queue, cpu));
> > +	cookie->percpu_fq = queue;
> > +	return 0;
> >   }
> >   
> >   /* sysfs updates are serialised by the mutex of the group owning @domain */
> > -int iommu_dma_init_fq(struct iommu_domain *domain)
> > +int iommu_dma_init_fq(struct device *dev, struct iommu_domain *domain)
> >   {
> >   	struct iommu_dma_cookie *cookie = domain->iova_cookie;
> > -	struct iova_fq __percpu *queue;
> > -	int i, cpu;
> > +	const struct iommu_ops *ops = dev_iommu_ops(dev);
> > +	int rc;
> >   
> >   	if (cookie->fq_domain)
> >   		return 0;
> >   
> > +	if (ops->tune_dma_iommu)
> > +		ops->tune_dma_iommu(dev, &cookie->options);
> > +
> >   	atomic64_set(&cookie->fq_flush_start_cnt,  0);
> >   	atomic64_set(&cookie->fq_flush_finish_cnt, 0);
> >   
> > -	queue = alloc_percpu(struct iova_fq);
> > -	if (!queue) {
> > +	if (cookie->options.flags & IOMMU_DMA_OPTS_SINGLE_QUEUE)
> > +		rc = iommu_dma_init_fq_single(cookie);
> > +	else
> > +		rc = iommu_dma_init_fq_percpu(cookie);
> > +
> > +	if (rc) {
> >   		pr_warn("iova flush queue initialization failed\n");
> > -		return -ENOMEM;
> > +		/* fall back to strict mode */
> > +		domain->type = IOMMU_DOMAIN_DMA;
> 
> Why move this? It doesn't logically belong to FQ initialisation itself.

Ah yes this is not needed anymore. Previously when I had a new domain
type I think I needed to set domain->type in here and moved the
fallback for consistency. Will remove that change.

> 
> > +		return rc;
> >   	}
> >   
> > -	for_each_possible_cpu(cpu) {
> > -		struct iova_fq *fq = per_cpu_ptr(queue, cpu);
> > -
> > -		fq->head = 0;
> > -		fq->tail = 0;
> > -
> > -		spin_lock_init(&fq->lock);
> > -
> > -		for (i = 0; i < IOVA_FQ_SIZE; i++)
> > -			INIT_LIST_HEAD(&fq->entries[i].freelist);
> > -	}
> > -
> > -	cookie->fq = queue;
> > -
> >   	timer_setup(&cookie->fq_timer, fq_flush_timeout, 0);
> >   	atomic_set(&cookie->fq_timer_on, 0);
> >   	/*
> > 
---8<---
> >   static struct iommu_device *s390_iommu_probe_device(struct device *dev)
> >   {
> >   	struct zpci_dev *zdev;
> > @@ -793,6 +802,7 @@ static const struct iommu_ops s390_iommu_ops = {
> >   	.device_group = generic_device_group,
> >   	.pgsize_bitmap = SZ_4K,
> >   	.get_resv_regions = s390_iommu_get_resv_regions,
> > +	.tune_dma_iommu = s390_iommu_tune_dma_iommu,
> >   	.default_domain_ops = &(const struct iommu_domain_ops) {
> >   		.attach_dev	= s390_iommu_attach_device,
> >   		.map_pages	= s390_iommu_map_pages,
> > diff --git a/include/linux/iommu.h b/include/linux/iommu.h
> > index 58891eddc2c4..3649a17256a5 100644
> > --- a/include/linux/iommu.h
> > +++ b/include/linux/iommu.h
> > @@ -219,6 +219,21 @@ struct iommu_iotlb_gather {
> >   	bool			queued;
> >   };
> >   
> > +/**
> > + * struct dma_iommu_options - Options for dma-iommu
> > + *
> > + * @flags: Flag bits for enabling/disabling dma-iommu settings
> > + *
> > + * This structure is intended to provide IOMMU drivers a way to influence the
> > + * behavior of the dma-iommu DMA API implementation. This allows optimizing for
> > + * example for a virtualized environment with slow IOTLB flushes.
> > + */
> > +struct dma_iommu_options {
> > +#define IOMMU_DMA_OPTS_PER_CPU_QUEUE	(0L << 0)
> > +#define IOMMU_DMA_OPTS_SINGLE_QUEUE	(1L << 0)
> > +	u64	flags;
> > +};
> 
> I think for now this can just use a bit in dev_iommu to indicate that 
> the device will prefer a global flush queue; s390 can set that in 
> .probe_device, then iommu_dma_init_domain() can propagate it to an 
> equivalent flag in the cookie (possibly even a new cookie type?) that 
> iommu_dma_init_fq() can then consume. Then just make the s390 parameters 
> from patch #6 the standard parameters for a global queue.
> 
> Thanks,
> Robin.

Sounds good.

> 
> > +
> >   /**
> >    * struct iommu_ops - iommu ops and capabilities
> >    * @capable: check capability
> > @@ -242,6 +257,9 @@ struct iommu_iotlb_gather {
> >    *		- IOMMU_DOMAIN_IDENTITY: must use an identity domain
> >    *		- IOMMU_DOMAIN_DMA: must use a dma domain
> >    *		- 0: use the default setting
> > + * @tune_dma_iommu: Allows the IOMMU driver to modify the default
> > + *		    options of the dma-iommu layer for a specific
> > + *		    device.
> >    * @default_domain_ops: the default ops for domains
> >    * @remove_dev_pasid: Remove any translation configurations of a specific
> >    *                    pasid, so that any DMA transactions with this pasid
> > @@ -278,6 +296,9 @@ struct iommu_ops {
> >   	int (*def_domain_type)(struct device *dev);
> >   	void (*remove_dev_pasid)(struct device *dev, ioasid_t pasid);
> >   
> > +	void (*tune_dma_iommu)(struct device *dev,
> > +			       struct dma_iommu_options *options);
> > +
> >   	const struct iommu_domain_ops *default_domain_ops;
> >   	unsigned long pgsize_bitmap;
> >   	struct module *owner;
> > 
>