[RFC PATCH 1/4] vfio-nvme: add vfio-nvme lm driver infrastructure
Chaitanya Kulkarni
kch at nvidia.com
Sat Aug 2 19:47:02 PDT 2025
Add foundational infrastructure for vfio-nvme, enabling support for live
migration of NVMe devices via the VFIO framework. The following
components are included:
- Core driver skeleton for vfio-nvme support under drivers/vfio/pci/nvme/
- Definitions of basic data structures used in live migration
(e.g., nvmevf_pci_core_device and nvmevf_migration_file)
- Implementation of helper routines for managing migration file state
- Integration of PCI driver callbacks and error handling logic
- Registration with vfio-pci-core through nvmevf_pci_ops
- Initial support for VFIO migration states and device open/close flows
Subsequent patches will build upon this base to implement actual live
migration commands and complete the vfio device state handling logic.
Signed-off-by: Lei Rao <lei.rao at intel.com>
Signed-off-by: Max Gurtovoy <mgurtovoy at nvidia.com>
Signed-off-by: Chaitanya Kulkarni <kch at nvidia.com>
---
drivers/vfio/pci/Kconfig | 2 +
drivers/vfio/pci/Makefile | 2 +
drivers/vfio/pci/nvme/Kconfig | 10 ++
drivers/vfio/pci/nvme/Makefile | 3 +
drivers/vfio/pci/nvme/nvme.c | 196 +++++++++++++++++++++++++++++++++
drivers/vfio/pci/nvme/nvme.h | 36 ++++++
6 files changed, 249 insertions(+)
create mode 100644 drivers/vfio/pci/nvme/Kconfig
create mode 100644 drivers/vfio/pci/nvme/Makefile
create mode 100644 drivers/vfio/pci/nvme/nvme.c
create mode 100644 drivers/vfio/pci/nvme/nvme.h
diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig
index 2b0172f54665..8f94429e7adc 100644
--- a/drivers/vfio/pci/Kconfig
+++ b/drivers/vfio/pci/Kconfig
@@ -67,4 +67,6 @@ source "drivers/vfio/pci/nvgrace-gpu/Kconfig"
source "drivers/vfio/pci/qat/Kconfig"
+source "drivers/vfio/pci/nvme/Kconfig"
+
endmenu
diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile
index cf00c0a7e55c..be8c4b5ee0ba 100644
--- a/drivers/vfio/pci/Makefile
+++ b/drivers/vfio/pci/Makefile
@@ -10,6 +10,8 @@ obj-$(CONFIG_VFIO_PCI) += vfio-pci.o
obj-$(CONFIG_MLX5_VFIO_PCI) += mlx5/
+obj-$(CONFIG_NVME_VFIO_PCI) += nvme/
+
obj-$(CONFIG_HISI_ACC_VFIO_PCI) += hisilicon/
obj-$(CONFIG_PDS_VFIO_PCI) += pds/
diff --git a/drivers/vfio/pci/nvme/Kconfig b/drivers/vfio/pci/nvme/Kconfig
new file mode 100644
index 000000000000..12e0eaba0de1
--- /dev/null
+++ b/drivers/vfio/pci/nvme/Kconfig
@@ -0,0 +1,10 @@
+# SPDX-License-Identifier: GPL-2.0-only
+config NVME_VFIO_PCI
+ tristate "VFIO support for NVMe PCI devices"
+ depends on NVME_CORE
+ depends on VFIO_PCI_CORE
+ help
+ This provides migration support for NVMe devices using the
+ VFIO framework.
+
+ If you don't know what to do here, say N.
diff --git a/drivers/vfio/pci/nvme/Makefile b/drivers/vfio/pci/nvme/Makefile
new file mode 100644
index 000000000000..2f4a0ad3d9cf
--- /dev/null
+++ b/drivers/vfio/pci/nvme/Makefile
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0-only
+obj-$(CONFIG_NVME_VFIO_PCI) += nvme-vfio-pci.o
+nvme-vfio-pci-y := nvme.o
diff --git a/drivers/vfio/pci/nvme/nvme.c b/drivers/vfio/pci/nvme/nvme.c
new file mode 100644
index 000000000000..08bee3274207
--- /dev/null
+++ b/drivers/vfio/pci/nvme/nvme.c
@@ -0,0 +1,196 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2022, INTEL CORPORATION. All rights reserved
+ * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved
+ */
+
+#include <linux/device.h>
+#include <linux/eventfd.h>
+#include <linux/file.h>
+#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/pci.h>
+#include <linux/types.h>
+#include <linux/vfio.h>
+#include <linux/anon_inodes.h>
+#include <linux/kernel.h>
+#include <linux/vfio_pci_core.h>
+
+#include "nvme.h"
+
+/*
+ * Invalidate one migration file: free its device-state buffer and mark
+ * the fd disabled so later read/write attempts can fail cleanly.
+ * Serialized against concurrent fd users via migf->lock.
+ */
+static void nvmevf_disable_fd(struct nvmevf_migration_file *migf)
+{
+ mutex_lock(&migf->lock);
+
+ /* release the device states buffer */
+ kvfree(migf->vf_data);
+ migf->vf_data = NULL;
+ migf->disabled = true;
+ migf->total_length = 0;
+ /* rewind the file offset so a stale position is never observed */
+ migf->filp->f_pos = 0;
+ mutex_unlock(&migf->lock);
+}
+
+/*
+ * Tear down both migration directions: disable and drop the file
+ * reference for the resuming (LOAD) and saving (SAVE) migration files,
+ * if present.  Both callers in this file invoke it with state_mutex
+ * held.
+ */
+static void nvmevf_disable_fds(struct nvmevf_pci_core_device *nvmevf_dev)
+{
+ if (nvmevf_dev->resuming_migf) {
+ nvmevf_disable_fd(nvmevf_dev->resuming_migf);
+ fput(nvmevf_dev->resuming_migf->filp);
+ nvmevf_dev->resuming_migf = NULL;
+ }
+
+ if (nvmevf_dev->saving_migf) {
+ nvmevf_disable_fd(nvmevf_dev->saving_migf);
+ fput(nvmevf_dev->saving_migf->filp);
+ nvmevf_dev->saving_migf = NULL;
+ }
+}
+
+/*
+ * Unlock state_mutex while completing any reset work that was deferred
+ * by nvmevf_pci_aer_reset_done() because the mutex was already held.
+ * The loop re-checks deferred_reset after each cleanup pass since
+ * reset_done may fire again while we process the previous request.
+ * Note the unlock order: state_mutex is released under reset_lock so a
+ * concurrent reset_done either gets the trylock or is observed here.
+ */
+static void nvmevf_state_mutex_unlock(struct nvmevf_pci_core_device *nvmevf_dev)
+{
+ lockdep_assert_held(&nvmevf_dev->state_mutex);
+again:
+ spin_lock(&nvmevf_dev->reset_lock);
+ if (nvmevf_dev->deferred_reset) {
+ nvmevf_dev->deferred_reset = false;
+ spin_unlock(&nvmevf_dev->reset_lock);
+ /* reset leaves the device RUNNING with no migration in flight */
+ nvmevf_dev->mig_state = VFIO_DEVICE_STATE_RUNNING;
+ nvmevf_disable_fds(nvmevf_dev);
+ goto again;
+ }
+ mutex_unlock(&nvmevf_dev->state_mutex);
+ spin_unlock(&nvmevf_dev->reset_lock);
+}
+
+/*
+ * Map a pci_dev back to its containing nvmevf device.  drvdata is set
+ * to &nvmevf_dev->core_device in nvmevf_pci_probe().
+ */
+static struct nvmevf_pci_core_device *nvmevf_drvdata(struct pci_dev *pdev)
+{
+ struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev);
+
+ return container_of(core_device, struct nvmevf_pci_core_device,
+ core_device);
+}
+
+/*
+ * vfio_device_ops.open_device: enable the vfio-pci core device and,
+ * when migration is supported, start the migration state machine in
+ * RUNNING.  Returns 0 or a negative errno from the core enable path.
+ */
+static int nvmevf_pci_open_device(struct vfio_device *core_vdev)
+{
+ struct nvmevf_pci_core_device *nvmevf_dev;
+ struct vfio_pci_core_device *vdev;
+ int ret;
+
+ nvmevf_dev = container_of(core_vdev, struct nvmevf_pci_core_device,
+ core_device.vdev);
+ vdev = &nvmevf_dev->core_device;
+
+ ret = vfio_pci_core_enable(vdev);
+ if (ret)
+ return ret;
+
+ /* a freshly opened device always starts in the RUNNING state */
+ if (nvmevf_dev->migrate_cap)
+ nvmevf_dev->mig_state = VFIO_DEVICE_STATE_RUNNING;
+ vfio_pci_core_finish_enable(vdev);
+ return 0;
+}
+
+/*
+ * vfio_device_ops.close_device: drop any in-flight migration files
+ * (under state_mutex, honoring deferred resets via the unlock helper)
+ * before handing off to the vfio-pci core close path.
+ */
+static void nvmevf_pci_close_device(struct vfio_device *core_vdev)
+{
+ struct nvmevf_pci_core_device *nvmevf_dev;
+
+ nvmevf_dev = container_of(core_vdev, struct nvmevf_pci_core_device,
+ core_device.vdev);
+
+ if (nvmevf_dev->migrate_cap) {
+ mutex_lock(&nvmevf_dev->state_mutex);
+ nvmevf_disable_fds(nvmevf_dev);
+ nvmevf_state_mutex_unlock(nvmevf_dev);
+ }
+
+ vfio_pci_core_close_device(core_vdev);
+}
+
+static const struct vfio_device_ops nvmevf_pci_ops = {
+ .name = "nvme-vfio-pci",
+ /*
+ * .init is required: vfio_alloc_device() relies on it to set up the
+ * embedded vfio-pci core state.  Without it the core_device inside
+ * nvmevf_pci_core_device is never initialized before registration.
+ */
+ .init = vfio_pci_core_init_dev,
+ .release = vfio_pci_core_release_dev,
+ .open_device = nvmevf_pci_open_device,
+ .close_device = nvmevf_pci_close_device,
+ .ioctl = vfio_pci_core_ioctl,
+ .device_feature = vfio_pci_core_ioctl_feature,
+ .read = vfio_pci_core_read,
+ .write = vfio_pci_core_write,
+ .mmap = vfio_pci_core_mmap,
+ .request = vfio_pci_core_request,
+ .match = vfio_pci_core_match,
+ /*
+ * Wire up iommufd like the other vfio-pci variant drivers so the
+ * device is usable through the iommufd backend as well.
+ */
+ .bind_iommufd = vfio_iommufd_physical_bind,
+ .unbind_iommufd = vfio_iommufd_physical_unbind,
+ .attach_ioas = vfio_iommufd_physical_attach_ioas,
+ .detach_ioas = vfio_iommufd_physical_detach_ioas,
+};
+
+/*
+ * PCI probe: allocate the vfio device, initialize the migration locks,
+ * and register with vfio-pci core.  On registration failure the device
+ * reference taken by vfio_alloc_device() is dropped.
+ */
+static int nvmevf_pci_probe(struct pci_dev *pdev,
+ const struct pci_device_id *id)
+{
+ struct nvmevf_pci_core_device *nvmevf_dev;
+ int ret;
+
+ nvmevf_dev = vfio_alloc_device(nvmevf_pci_core_device, core_device.vdev,
+ &pdev->dev, &nvmevf_pci_ops);
+ if (IS_ERR(nvmevf_dev))
+ return PTR_ERR(nvmevf_dev);
+
+ /*
+ * Initialize the migration locks before the device becomes visible:
+ * close_device and the AER reset_done handler take them, and nothing
+ * else in this patch initializes them.
+ */
+ mutex_init(&nvmevf_dev->state_mutex);
+ spin_lock_init(&nvmevf_dev->reset_lock);
+
+ dev_set_drvdata(&pdev->dev, &nvmevf_dev->core_device);
+ ret = vfio_pci_core_register_device(&nvmevf_dev->core_device);
+ if (ret)
+ goto out_put_dev;
+
+ return 0;
+
+out_put_dev:
+ vfio_put_device(&nvmevf_dev->core_device.vdev);
+ return ret;
+}
+
+/*
+ * PCI remove: unregister from vfio-pci core and release the device
+ * reference taken at probe time.
+ */
+static void nvmevf_pci_remove(struct pci_dev *pdev)
+{
+ struct nvmevf_pci_core_device *nvmevf_dev = nvmevf_drvdata(pdev);
+
+ vfio_pci_core_unregister_device(&nvmevf_dev->core_device);
+ vfio_put_device(&nvmevf_dev->core_device.vdev);
+}
+
+/*
+ * AER reset_done handler: after a reset the migration state must be
+ * returned to RUNNING and any migration files dropped.  If another
+ * context currently owns state_mutex, the cleanup is deferred to that
+ * context's nvmevf_state_mutex_unlock() instead of blocking here.
+ */
+static void nvmevf_pci_aer_reset_done(struct pci_dev *pdev)
+{
+ struct nvmevf_pci_core_device *nvmevf_dev = nvmevf_drvdata(pdev);
+
+ if (!nvmevf_dev->migrate_cap)
+ return;
+
+ /*
+ * As the higher VFIO layers are holding locks across reset and using
+ * those same locks with the mm_lock we need to prevent ABBA deadlock
+ * with the state_mutex and mm_lock.
+ * In case the state_mutex was taken already we defer the cleanup work
+ * to the unlock flow of the other running context.
+ */
+ spin_lock(&nvmevf_dev->reset_lock);
+ nvmevf_dev->deferred_reset = true;
+ if (!mutex_trylock(&nvmevf_dev->state_mutex)) {
+ spin_unlock(&nvmevf_dev->reset_lock);
+ return;
+ }
+ spin_unlock(&nvmevf_dev->reset_lock);
+ /* we won the trylock: perform the deferred cleanup ourselves */
+ nvmevf_state_mutex_unlock(nvmevf_dev);
+}
+
+/* PCI error handlers: core AER detection plus our reset_done hook. */
+static const struct pci_error_handlers nvmevf_err_handlers = {
+ .reset_done = nvmevf_pci_aer_reset_done,
+ .error_detected = vfio_pci_core_aer_err_detected,
+};
+
+/*
+ * NOTE(review): no .id_table is set, so this driver cannot bind to any
+ * device as-is -- presumably a later patch in the series adds the match
+ * table (and MODULE_DEVICE_TABLE); confirm before merging.
+ */
+static struct pci_driver nvmevf_pci_driver = {
+ .name = KBUILD_MODNAME,
+ .probe = nvmevf_pci_probe,
+ .remove = nvmevf_pci_remove,
+ .err_handler = &nvmevf_err_handlers,
+ .driver_managed_dma = true,
+};
+
+module_pci_driver(nvmevf_pci_driver);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Chaitanya Kulkarni <kch at nvidia.com>");
+MODULE_DESCRIPTION("NVMe VFIO PCI - VFIO PCI driver with live migration support for NVMe");
diff --git a/drivers/vfio/pci/nvme/nvme.h b/drivers/vfio/pci/nvme/nvme.h
new file mode 100644
index 000000000000..ee602254679e
--- /dev/null
+++ b/drivers/vfio/pci/nvme/nvme.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2022, INTEL CORPORATION. All rights reserved
+ * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved
+ */
+
+#ifndef NVME_VFIO_PCI_H
+#define NVME_VFIO_PCI_H
+
+#include <linux/kernel.h>
+#include <linux/vfio_pci_core.h>
+#include <linux/nvme.h>
+
+/* Per-direction (save or resume) migration data file. */
+struct nvmevf_migration_file {
+ /* backing file exposed to userspace for the migration data fd */
+ struct file *filp;
+ /* protects vf_data, total_length and disabled */
+ struct mutex lock;
+ /* set once the fd has been invalidated; buffer is gone */
+ bool disabled;
+ /* kvmalloc'd buffer holding the serialized device state */
+ u8 *vf_data;
+ /* number of valid bytes in vf_data */
+ size_t total_length;
+};
+
+/* vfio-pci variant device with NVMe live-migration state. */
+struct nvmevf_pci_core_device {
+ /* must be first: container_of() from the core device relies on it */
+ struct vfio_pci_core_device core_device;
+ /* VF number of this device -- TODO(review): not yet set in this patch */
+ int vf_id;
+ /* device supports live migration */
+ u8 migrate_cap:1;
+ /* reset_done ran while state_mutex was held; cleanup deferred */
+ u8 deferred_reset:1;
+ /* protect migration state */
+ struct mutex state_mutex;
+ enum vfio_device_mig_state mig_state;
+ /* protect the reset_done flow */
+ spinlock_t reset_lock;
+ /* migration file for the RESUMING (load) direction */
+ struct nvmevf_migration_file *resuming_migf;
+ /* migration file for the saving (STOP_COPY) direction */
+ struct nvmevf_migration_file *saving_migf;
+};
+
+#endif /* NVME_VFIO_PCI_H */
--
2.40.0
More information about the Linux-nvme
mailing list