[RFC PATCH 1/4] vfio-nvme: add vfio-nvme lm driver infrastructure
Shameerali Kolothum Thodi
shameerali.kolothum.thodi at huawei.com
Mon Aug 4 08:20:34 PDT 2025
> -----Original Message-----
> From: Chaitanya Kulkarni <kch at nvidia.com>
> Sent: Sunday, August 3, 2025 3:47 AM
> To: kbusch at kernel.org; axboe at fb.com; hch at lst.de; sagi at grimberg.me;
> alex.williamson at redhat.com; cohuck at redhat.com; jgg at ziepe.ca;
> yishaih at nvidia.com; Shameerali Kolothum Thodi
> <shameerali.kolothum.thodi at huawei.com>; kevin.tian at intel.com;
> mjrosato at linux.ibm.com; mgurtovoy at nvidia.com
> Cc: linux-nvme at lists.infradead.org; kvm at vger.kernel.org;
> Konrad.wilk at oracle.com; martin.petersen at oracle.com;
> jmeneghi at redhat.com; arnd at arndb.de; schnelle at linux.ibm.com;
> bhelgaas at google.com; joao.m.martins at oracle.com; Chaitanya Kulkarni
> <kch at nvidia.com>; Lei Rao <lei.rao at intel.com>
> Subject: [RFC PATCH 1/4] vfio-nvme: add vfio-nvme lm driver infrastructure
>
> Add foundational infrastructure for vfio-nvme, enabling support for live
> migration of NVMe devices via the VFIO framework. The following
> components are included:
>
> - Core driver skeleton for vfio-nvme support under drivers/vfio/pci/nvme/
> - Definitions of basic data structures used in live migration
> (e.g., nvmevf_pci_core_device and nvmevf_migration_file)
> - Implementation of helper routines for managing migration file state
> - Integration of PCI driver callbacks and error handling logic
> - Registration with vfio-pci-core through nvmevf_pci_ops
> - Initial support for VFIO migration states and device open/close flows
>
> Subsequent patches will build upon this base to implement actual live
> migration commands and complete the vfio device state handling logic.
>
> Signed-off-by: Lei Rao <lei.rao at intel.com>
> Signed-off-by: Max Gurtovoy <mgurtovoy at nvidia.com>
> Signed-off-by: Chaitanya Kulkarni <kch at nvidia.com>
> ---
> drivers/vfio/pci/Kconfig | 2 +
> drivers/vfio/pci/Makefile | 2 +
> drivers/vfio/pci/nvme/Kconfig | 10 ++
> drivers/vfio/pci/nvme/Makefile | 3 +
> drivers/vfio/pci/nvme/nvme.c | 196
> +++++++++++++++++++++++++++++++++
> drivers/vfio/pci/nvme/nvme.h | 36 ++++++
> 6 files changed, 249 insertions(+)
> create mode 100644 drivers/vfio/pci/nvme/Kconfig
> create mode 100644 drivers/vfio/pci/nvme/Makefile
> create mode 100644 drivers/vfio/pci/nvme/nvme.c
> create mode 100644 drivers/vfio/pci/nvme/nvme.h
>
> diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig
> index 2b0172f54665..8f94429e7adc 100644
> --- a/drivers/vfio/pci/Kconfig
> +++ b/drivers/vfio/pci/Kconfig
> @@ -67,4 +67,6 @@ source "drivers/vfio/pci/nvgrace-gpu/Kconfig"
>
> source "drivers/vfio/pci/qat/Kconfig"
>
> +source "drivers/vfio/pci/nvme/Kconfig"
> +
> endmenu
> diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile
> index cf00c0a7e55c..be8c4b5ee0ba 100644
> --- a/drivers/vfio/pci/Makefile
> +++ b/drivers/vfio/pci/Makefile
> @@ -10,6 +10,8 @@ obj-$(CONFIG_VFIO_PCI) += vfio-pci.o
>
> obj-$(CONFIG_MLX5_VFIO_PCI) += mlx5/
>
> +obj-$(CONFIG_NVME_VFIO_PCI) += nvme/
> +
> obj-$(CONFIG_HISI_ACC_VFIO_PCI) += hisilicon/
>
> obj-$(CONFIG_PDS_VFIO_PCI) += pds/
> diff --git a/drivers/vfio/pci/nvme/Kconfig b/drivers/vfio/pci/nvme/Kconfig
> new file mode 100644
> index 000000000000..12e0eaba0de1
> --- /dev/null
> +++ b/drivers/vfio/pci/nvme/Kconfig
> @@ -0,0 +1,10 @@
> +# SPDX-License-Identifier: GPL-2.0-only
> +config NVME_VFIO_PCI
> + tristate "VFIO support for NVMe PCI devices"
> + depends on NVME_CORE
> + depends on VFIO_PCI_CORE
> + help
> + This provides migration support for NVMe devices using the
> + VFIO framework.
> +
> + If you don't know what to do here, say N.
> diff --git a/drivers/vfio/pci/nvme/Makefile b/drivers/vfio/pci/nvme/Makefile
> new file mode 100644
> index 000000000000..2f4a0ad3d9cf
> --- /dev/null
> +++ b/drivers/vfio/pci/nvme/Makefile
> @@ -0,0 +1,3 @@
> +# SPDX-License-Identifier: GPL-2.0-only
> +obj-$(CONFIG_NVME_VFIO_PCI) += nvme-vfio-pci.o
> +nvme-vfio-pci-y := nvme.o
> diff --git a/drivers/vfio/pci/nvme/nvme.c b/drivers/vfio/pci/nvme/nvme.c
> new file mode 100644
> index 000000000000..08bee3274207
> --- /dev/null
> +++ b/drivers/vfio/pci/nvme/nvme.c
> @@ -0,0 +1,196 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Copyright (c) 2022, INTEL CORPORATION. All rights reserved
> + * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved
> + */
> +
> +#include <linux/device.h>
> +#include <linux/eventfd.h>
> +#include <linux/file.h>
> +#include <linux/interrupt.h>
> +#include <linux/module.h>
> +#include <linux/mutex.h>
> +#include <linux/pci.h>
> +#include <linux/types.h>
> +#include <linux/vfio.h>
> +#include <linux/anon_inodes.h>
> +#include <linux/kernel.h>
> +#include <linux/vfio_pci_core.h>
> +
> +#include "nvme.h"
> +
> +static void nvmevf_disable_fd(struct nvmevf_migration_file *migf)
> +{
> + mutex_lock(&migf->lock);
> +
> + /* release the device states buffer */
> + kvfree(migf->vf_data);
> + migf->vf_data = NULL;
> + migf->disabled = true;
> + migf->total_length = 0;
> + migf->filp->f_pos = 0;
> + mutex_unlock(&migf->lock);
Maybe we could use guard(mutex) here and elsewhere in this driver now?
> +}
> +
> +static void nvmevf_disable_fds(struct nvmevf_pci_core_device
> *nvmevf_dev)
> +{
> + if (nvmevf_dev->resuming_migf) {
> + nvmevf_disable_fd(nvmevf_dev->resuming_migf);
> + fput(nvmevf_dev->resuming_migf->filp);
> + nvmevf_dev->resuming_migf = NULL;
> + }
> +
> + if (nvmevf_dev->saving_migf) {
> + nvmevf_disable_fd(nvmevf_dev->saving_migf);
> + fput(nvmevf_dev->saving_migf->filp);
> + nvmevf_dev->saving_migf = NULL;
> + }
> +}
> +
> +static void nvmevf_state_mutex_unlock(struct nvmevf_pci_core_device
> *nvmevf_dev)
> +{
> + lockdep_assert_held(&nvmevf_dev->state_mutex);
> +again:
> + spin_lock(&nvmevf_dev->reset_lock);
> + if (nvmevf_dev->deferred_reset) {
> + nvmevf_dev->deferred_reset = false;
> + spin_unlock(&nvmevf_dev->reset_lock);
> + nvmevf_dev->mig_state = VFIO_DEVICE_STATE_RUNNING;
> + nvmevf_disable_fds(nvmevf_dev);
> + goto again;
> + }
> + mutex_unlock(&nvmevf_dev->state_mutex);
> + spin_unlock(&nvmevf_dev->reset_lock);
> +}
> +
> +static struct nvmevf_pci_core_device *nvmevf_drvdata(struct pci_dev
> *pdev)
> +{
> + struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev-
> >dev);
> +
> + return container_of(core_device, struct nvmevf_pci_core_device,
> + core_device);
> +}
> +
> +static int nvmevf_pci_open_device(struct vfio_device *core_vdev)
> +{
> + struct nvmevf_pci_core_device *nvmevf_dev;
> + struct vfio_pci_core_device *vdev;
> + int ret;
> +
> + nvmevf_dev = container_of(core_vdev, struct
> nvmevf_pci_core_device,
> + core_device.vdev);
> + vdev = &nvmevf_dev->core_device;
> +
> + ret = vfio_pci_core_enable(vdev);
> + if (ret)
> + return ret;
> +
> + if (nvmevf_dev->migrate_cap)
> + nvmevf_dev->mig_state = VFIO_DEVICE_STATE_RUNNING;
> + vfio_pci_core_finish_enable(vdev);
> + return 0;
> +}
> +
> +static void nvmevf_pci_close_device(struct vfio_device *core_vdev)
> +{
> + struct nvmevf_pci_core_device *nvmevf_dev;
> +
> + nvmevf_dev = container_of(core_vdev, struct
> nvmevf_pci_core_device,
> + core_device.vdev);
> +
> + if (nvmevf_dev->migrate_cap) {
> + mutex_lock(&nvmevf_dev->state_mutex);
> + nvmevf_disable_fds(nvmevf_dev);
> + nvmevf_state_mutex_unlock(nvmevf_dev);
> + }
> +
> + vfio_pci_core_close_device(core_vdev);
> +}
> +
> +static const struct vfio_device_ops nvmevf_pci_ops = {
> + .name = "nvme-vfio-pci",
> + .release = vfio_pci_core_release_dev,
> + .open_device = nvmevf_pci_open_device,
> + .close_device = nvmevf_pci_close_device,
> + .ioctl = vfio_pci_core_ioctl,
> + .device_feature = vfio_pci_core_ioctl_feature,
> + .read = vfio_pci_core_read,
> + .write = vfio_pci_core_write,
> + .mmap = vfio_pci_core_mmap,
> + .request = vfio_pci_core_request,
> + .match = vfio_pci_core_match,
Is there any reason why this driver doesn't need the iommufd-related callbacks
(bind_iommufd/unbind_iommufd, etc.)?
> +};
> +
> +static int nvmevf_pci_probe(struct pci_dev *pdev,
> + const struct pci_device_id *id)
> +{
> + struct nvmevf_pci_core_device *nvmevf_dev;
> + int ret;
> +
> + nvmevf_dev = vfio_alloc_device(nvmevf_pci_core_device,
> core_device.vdev,
> + &pdev->dev, &nvmevf_pci_ops);
> + if (IS_ERR(nvmevf_dev))
> + return PTR_ERR(nvmevf_dev);
> +
> + dev_set_drvdata(&pdev->dev, &nvmevf_dev->core_device);
> + ret = vfio_pci_core_register_device(&nvmevf_dev->core_device);
> + if (ret)
> + goto out_put_dev;
> +
> + return 0;
> +
> +out_put_dev:
> + vfio_put_device(&nvmevf_dev->core_device.vdev);
> + return ret;
> +}
> +
> +static void nvmevf_pci_remove(struct pci_dev *pdev)
> +{
> + struct nvmevf_pci_core_device *nvmevf_dev =
> nvmevf_drvdata(pdev);
> +
> + vfio_pci_core_unregister_device(&nvmevf_dev->core_device);
> + vfio_put_device(&nvmevf_dev->core_device.vdev);
> +}
> +
> +static void nvmevf_pci_aer_reset_done(struct pci_dev *pdev)
> +{
> + struct nvmevf_pci_core_device *nvmevf_dev =
> nvmevf_drvdata(pdev);
> +
> + if (!nvmevf_dev->migrate_cap)
> + return;
> +
> + /*
> + * As the higher VFIO layers are holding locks across reset and using
> + * those same locks with the mm_lock we need to prevent ABBA
> deadlock
> + * with the state_mutex and mm_lock.
> + * In case the state_mutex was taken already we defer the cleanup
> work
> + * to the unlock flow of the other running context.
> + */
I am not sure this is relevant for this driver. This deferred-reset logic was added to avoid
a circular locking issue w.r.t. the copy_to/from_user() functions taking mm_lock
while holding the state mutex.
See this:
https://lore.kernel.org/all/20240229091152.56664-1-shameerali.kolothum.thodi@huawei.com/
Looking at patch #4, it doesn't look like this driver has that issue — though maybe I missed it.
Please double-check.
Thanks,
Shameer
> + spin_lock(&nvmevf_dev->reset_lock);
> + nvmevf_dev->deferred_reset = true;
> + if (!mutex_trylock(&nvmevf_dev->state_mutex)) {
> + spin_unlock(&nvmevf_dev->reset_lock);
> + return;
> + }
> + spin_unlock(&nvmevf_dev->reset_lock);
> + nvmevf_state_mutex_unlock(nvmevf_dev);
> +}
> +
> +static const struct pci_error_handlers nvmevf_err_handlers = {
> + .reset_done = nvmevf_pci_aer_reset_done,
> + .error_detected = vfio_pci_core_aer_err_detected,
> +};
> +
> +static struct pci_driver nvmevf_pci_driver = {
> + .name = KBUILD_MODNAME,
> + .probe = nvmevf_pci_probe,
> + .remove = nvmevf_pci_remove,
> + .err_handler = &nvmevf_err_handlers,
> + .driver_managed_dma = true,
> +};
> +
> +module_pci_driver(nvmevf_pci_driver);
> +
> +MODULE_LICENSE("GPL");
> +MODULE_AUTHOR("Chaitanya Kulkarni <kch at nvidia.com>");
> +MODULE_DESCRIPTION("NVMe VFIO PCI - VFIO PCI driver with live
> migration support for NVMe");
> diff --git a/drivers/vfio/pci/nvme/nvme.h b/drivers/vfio/pci/nvme/nvme.h
> new file mode 100644
> index 000000000000..ee602254679e
> --- /dev/null
> +++ b/drivers/vfio/pci/nvme/nvme.h
> @@ -0,0 +1,36 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/*
> + * Copyright (c) 2022, INTEL CORPORATION. All rights reserved
> + * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved
> + */
> +
> +#ifndef NVME_VFIO_PCI_H
> +#define NVME_VFIO_PCI_H
> +
> +#include <linux/kernel.h>
> +#include <linux/vfio_pci_core.h>
> +#include <linux/nvme.h>
> +
> +struct nvmevf_migration_file {
> + struct file *filp;
> + struct mutex lock;
> + bool disabled;
> + u8 *vf_data;
> + size_t total_length;
> +};
> +
> +struct nvmevf_pci_core_device {
> + struct vfio_pci_core_device core_device;
> + int vf_id;
> + u8 migrate_cap:1;
> + u8 deferred_reset:1;
> + /* protect migration state */
> + struct mutex state_mutex;
> + enum vfio_device_mig_state mig_state;
> + /* protect the reset_done flow */
> + spinlock_t reset_lock;
> + struct nvmevf_migration_file *resuming_migf;
> + struct nvmevf_migration_file *saving_migf;
> +};
> +
> +#endif /* NVME_VFIO_PCI_H */
> --
> 2.40.0
More information about the Linux-nvme
mailing list