[PATCH v5 2/2] vfio/pci: add msi interrupt affinity support

Alex Williamson alex.williamson at redhat.com
Mon Jun 10 12:32:07 PDT 2024


On Mon, 10 Jun 2024 12:57:08 +0000
Fred Griffoul <fgriffo at amazon.co.uk> wrote:

> The usual way to configure a device interrupt from userland is to write
> the /proc/irq/<irq>/smp_affinity or smp_affinity_list files. When using
> vfio to implement a device driver or a virtual machine monitor, this may
> not be ideal: the process managing the vfio device interrupts may not be
> granted root privilege, for security reasons. Thus it cannot directly
> control the interrupt affinity and has to rely on an external command.
> 
> This patch extends the VFIO_DEVICE_SET_IRQS ioctl() with a new data flag
> to specify the affinity of interrupts of a vfio pci device.
> 
> The CPU affinity mask argument must be a subset of the process cpuset,
> otherwise an error -EPERM is returned.
> 
> The vfio_irq_set argument shall be set-up in the following way:
> 
> - the 'flags' field have the new flag VFIO_IRQ_SET_DATA_AFFINITY set
> as well as VFIO_IRQ_SET_ACTION_TRIGGER.
> 
> - the variable-length 'data' field is a cpu_set_t structure, as
> for the sched_setaffinity() syscall, the size of which is derived
> from 'argsz'.
> 
> Signed-off-by: Fred Griffoul <fgriffo at amazon.co.uk>
> ---
>  drivers/vfio/pci/vfio_pci_core.c  | 27 +++++++++++++++++----
>  drivers/vfio/pci/vfio_pci_intrs.c | 39 +++++++++++++++++++++++++++++++
>  drivers/vfio/vfio_main.c          | 13 +++++++----
>  include/uapi/linux/vfio.h         | 10 +++++++-
>  4 files changed, 80 insertions(+), 9 deletions(-)
> 
> diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
> index 80cae87fff36..2e3419560480 100644
> --- a/drivers/vfio/pci/vfio_pci_core.c
> +++ b/drivers/vfio/pci/vfio_pci_core.c
> @@ -1192,6 +1192,7 @@ static int vfio_pci_ioctl_set_irqs(struct vfio_pci_core_device *vdev,
>  {
>  	unsigned long minsz = offsetofend(struct vfio_irq_set, count);
>  	struct vfio_irq_set hdr;
> +	cpumask_var_t mask;
>  	u8 *data = NULL;
>  	int max, ret = 0;
>  	size_t data_size = 0;
> @@ -1207,9 +1208,22 @@ static int vfio_pci_ioctl_set_irqs(struct vfio_pci_core_device *vdev,
>  		return ret;
>  
>  	if (data_size) {
> -		data = memdup_user(&arg->data, data_size);
> -		if (IS_ERR(data))
> -			return PTR_ERR(data);
> +		if (hdr.flags & VFIO_IRQ_SET_DATA_AFFINITY) {
> +			if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
> +				return -ENOMEM;
> +
> +			if (copy_from_user(mask, &arg->data, data_size)) {
> +				ret = -EFAULT;
> +				goto out;
> +			}
> +
> +			data = (u8 *)mask;

Seems like this could just use the memdup_user() path, why do we care
to copy it into a cpumask_var_t here?  If we do care, wouldn't we
implement something like get_user_cpu_mask() used by
sched_setaffinity(2)?

> +
> +		} else {
> +			data = memdup_user(&arg->data, data_size);
> +			if (IS_ERR(data))
> +				return PTR_ERR(data);
> +		}
>  	}
>  
>  	mutex_lock(&vdev->igate);
> @@ -1218,7 +1232,12 @@ static int vfio_pci_ioctl_set_irqs(struct vfio_pci_core_device *vdev,
>  				      hdr.count, data);
>  
>  	mutex_unlock(&vdev->igate);
> -	kfree(data);
> +
> +out:
> +	if (hdr.flags & VFIO_IRQ_SET_DATA_AFFINITY && data_size)
> +		free_cpumask_var(mask);
> +	else
> +		kfree(data);
>  
>  	return ret;
>  }
> diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c
> index 8382c5834335..fe01303cf94e 100644
> --- a/drivers/vfio/pci/vfio_pci_intrs.c
> +++ b/drivers/vfio/pci/vfio_pci_intrs.c
> @@ -19,6 +19,7 @@
>  #include <linux/vfio.h>
>  #include <linux/wait.h>
>  #include <linux/slab.h>
> +#include <linux/cpuset.h>
>  
>  #include "vfio_pci_priv.h"
>  
> @@ -675,6 +676,41 @@ static int vfio_pci_set_intx_trigger(struct vfio_pci_core_device *vdev,
>  	return 0;
>  }
>  
> +static int vfio_pci_set_msi_affinity(struct vfio_pci_core_device *vdev,
> +				     unsigned int start, unsigned int count,
> +				     struct cpumask *irq_mask)

Aside from the name, what makes this unique to MSI vectors?

> +{
> +	struct vfio_pci_irq_ctx *ctx;
> +	cpumask_var_t allowed_mask;
> +	unsigned int i;
> +	int err = 0;
> +
> +	if (!alloc_cpumask_var(&allowed_mask, GFP_KERNEL))
> +		return -ENOMEM;
> +
> +	cpuset_cpus_allowed(current, allowed_mask);
> +	if (!cpumask_subset(irq_mask, allowed_mask)) {
> +		err = -EPERM;
> +		goto finish;
> +	}
> +
> +	for (i = start; i < start + count; i++) {
> +		ctx = vfio_irq_ctx_get(vdev, i);
> +		if (!ctx) {
> +			err = -EINVAL;
> +			break;
> +		}
> +
> +		err = irq_set_affinity(ctx->producer.irq, irq_mask);
> +		if (err)
> +			break;
> +	}

Is this typical/userful behavior to set a series of vectors to the same
cpu_set_t?  It's unusual behavior for this ioctl to apply the same data
across multiple vectors.  Should the DATA_AFFINITY case support an
array of cpu_set_t?

> +
> +finish:
> +	free_cpumask_var(allowed_mask);
> +	return err;
> +}
> +
>  static int vfio_pci_set_msi_trigger(struct vfio_pci_core_device *vdev,
>  				    unsigned index, unsigned start,
>  				    unsigned count, uint32_t flags, void *data)
> @@ -713,6 +749,9 @@ static int vfio_pci_set_msi_trigger(struct vfio_pci_core_device *vdev,
>  	if (!irq_is(vdev, index))
>  		return -EINVAL;
>  
> +	if (flags & VFIO_IRQ_SET_DATA_AFFINITY)
> +		return vfio_pci_set_msi_affinity(vdev, start, count, data);
> +
>  	for (i = start; i < start + count; i++) {
>  		ctx = vfio_irq_ctx_get(vdev, i);
>  		if (!ctx)
> diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
> index e97d796a54fb..e75c5d66681c 100644
> --- a/drivers/vfio/vfio_main.c
> +++ b/drivers/vfio/vfio_main.c
> @@ -1505,23 +1505,28 @@ int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
>  		size = 0;
>  		break;
>  	case VFIO_IRQ_SET_DATA_BOOL:
> -		size = sizeof(uint8_t);
> +		size = hdr->count * sizeof(uint8_t);
>  		break;
>  	case VFIO_IRQ_SET_DATA_EVENTFD:
> -		size = sizeof(int32_t);
> +		size = size_mul(hdr->count, sizeof(int32_t));

Why use size_mul() in one place and not the other? 

> +		break;
> +	case VFIO_IRQ_SET_DATA_AFFINITY:
> +		size = hdr->argsz - minsz;
> +		if (size > cpumask_size())
> +			size = cpumask_size();

Or just set size = (hdr->argsz - minsz) / count?

Generate an error if (hdr->argsz - minsz) % count?

It seems like all the cpumask'items can be contained to the set affinity
function.

>  		break;
>  	default:
>  		return -EINVAL;
>  	}
>  
>  	if (size) {
> -		if (hdr->argsz - minsz < hdr->count * size)
> +		if (hdr->argsz - minsz < size)
>  			return -EINVAL;
>  
>  		if (!data_size)
>  			return -EINVAL;
>  
> -		*data_size = hdr->count * size;
> +		*data_size = size;
>  	}
>  
>  	return 0;
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index 2b68e6cdf190..5ba2ca223550 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -580,6 +580,12 @@ struct vfio_irq_info {
>   *
>   * Note that ACTION_[UN]MASK specify user->kernel signaling (irqfds) while
>   * ACTION_TRIGGER specifies kernel->user signaling.
> + *
> + * DATA_AFFINITY specifies the affinity for the range of interrupt vectors.
> + * It must be set with ACTION_TRIGGER in 'flags'. The variable-length 'data'
> + * array is a CPU affinity mask 'cpu_set_t' structure, as for the
> + * sched_setaffinity() syscall argument: the 'argsz' field is used to check
> + * the actual cpu_set_t size.
>   */

DATA_CPUSET?

The IRQ_INFO ioctl should probably also report support for this
feature.

Is there any proposed userspace code that takes advantage of this
interface?  Thanks,

Alex

>  struct vfio_irq_set {
>  	__u32	argsz;
> @@ -587,6 +593,7 @@ struct vfio_irq_set {
>  #define VFIO_IRQ_SET_DATA_NONE		(1 << 0) /* Data not present */
>  #define VFIO_IRQ_SET_DATA_BOOL		(1 << 1) /* Data is bool (u8) */
>  #define VFIO_IRQ_SET_DATA_EVENTFD	(1 << 2) /* Data is eventfd (s32) */
> +#define VFIO_IRQ_SET_DATA_AFFINITY	(1 << 6) /* Data is cpu_set_t */
>  #define VFIO_IRQ_SET_ACTION_MASK	(1 << 3) /* Mask interrupt */
>  #define VFIO_IRQ_SET_ACTION_UNMASK	(1 << 4) /* Unmask interrupt */
>  #define VFIO_IRQ_SET_ACTION_TRIGGER	(1 << 5) /* Trigger interrupt */
> @@ -599,7 +606,8 @@ struct vfio_irq_set {
>  
>  #define VFIO_IRQ_SET_DATA_TYPE_MASK	(VFIO_IRQ_SET_DATA_NONE | \
>  					 VFIO_IRQ_SET_DATA_BOOL | \
> -					 VFIO_IRQ_SET_DATA_EVENTFD)
> +					 VFIO_IRQ_SET_DATA_EVENTFD | \
> +					 VFIO_IRQ_SET_DATA_AFFINITY)
>  #define VFIO_IRQ_SET_ACTION_TYPE_MASK	(VFIO_IRQ_SET_ACTION_MASK | \
>  					 VFIO_IRQ_SET_ACTION_UNMASK | \
>  					 VFIO_IRQ_SET_ACTION_TRIGGER)




More information about the linux-arm-kernel mailing list