[PATCH 07/37] iommu: Add a page fault handler

Jean-Philippe Brucker jean-philippe.brucker at arm.com
Mon Feb 12 10:33:22 PST 2018


Some systems allow devices to handle IOMMU translation faults in the core
mm. For example systems supporting the PCI PRI extension or Arm SMMU stall
model. Infrastructure for reporting such recoverable page faults was
recently added to the IOMMU core, for SVA virtualization. Extend
iommu_report_device_fault() to handle host page faults as well.

* IOMMU drivers instantiate a fault workqueue, using
  iommu_fault_queue_register() and iommu_fault_queue_unregister().

* When it receives a fault event, supposedly in an IRQ handler, the IOMMU
  driver reports the fault using iommu_report_device_fault().

* If the device driver registered a handler (e.g. VFIO), pass down the
  fault event. Otherwise submit it to the fault queue, to be handled in a
  thread.

* When the fault corresponds to an io_mm, call the mm fault handler on it
  (in next patch).

* Once the fault is handled, the mm wrapper or the device driver reports
  success or failure with iommu_page_response(). The translation is either
  retried or aborted, depending on the response code.

Signed-off-by: Jean-Philippe Brucker <jean-philippe.brucker at arm.com>
---
 drivers/iommu/Kconfig      |  10 ++
 drivers/iommu/Makefile     |   1 +
 drivers/iommu/io-pgfault.c | 282 +++++++++++++++++++++++++++++++++++++++++++++
 drivers/iommu/iommu-sva.c  |   3 -
 drivers/iommu/iommu.c      |  31 ++---
 include/linux/iommu.h      |  34 +++++-
 6 files changed, 339 insertions(+), 22 deletions(-)
 create mode 100644 drivers/iommu/io-pgfault.c

diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index 146eebe9a4bb..e751bb9958ba 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -85,6 +85,15 @@ config IOMMU_SVA
 
 	  If unsure, say N here.
 
+config IOMMU_FAULT
+	bool "Fault handler for the IOMMU API"
+	select IOMMU_API
+	help
+	  Enable the generic fault handler for the IOMMU API, which handles
+	  recoverable page faults or injects them into guests.
+
+	  If unsure, say N here.
+
 config FSL_PAMU
 	bool "Freescale IOMMU support"
 	depends on PCI
@@ -156,6 +165,7 @@ config INTEL_IOMMU
 	select IOMMU_API
 	select IOMMU_IOVA
 	select DMAR_TABLE
+	select IOMMU_FAULT
 	help
 	  DMA remapping (DMAR) devices support enables independent address
 	  translations for Direct Memory Access (DMA) from devices.
diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile
index 1dbcc89ebe4c..f4324e29035e 100644
--- a/drivers/iommu/Makefile
+++ b/drivers/iommu/Makefile
@@ -4,6 +4,7 @@ obj-$(CONFIG_IOMMU_API) += iommu-traces.o
 obj-$(CONFIG_IOMMU_API) += iommu-sysfs.o
 obj-$(CONFIG_IOMMU_DMA) += dma-iommu.o
 obj-$(CONFIG_IOMMU_SVA) += iommu-sva.o
+obj-$(CONFIG_IOMMU_FAULT) += io-pgfault.o
 obj-$(CONFIG_IOMMU_IO_PGTABLE) += io-pgtable.o
 obj-$(CONFIG_IOMMU_IO_PGTABLE_ARMV7S) += io-pgtable-arm-v7s.o
 obj-$(CONFIG_IOMMU_IO_PGTABLE_LPAE) += io-pgtable-arm.o
diff --git a/drivers/iommu/io-pgfault.c b/drivers/iommu/io-pgfault.c
new file mode 100644
index 000000000000..33309ed316d2
--- /dev/null
+++ b/drivers/iommu/io-pgfault.c
@@ -0,0 +1,282 @@
+/*
+ * Handle device page faults
+ *
+ * Copyright (C) 2018 ARM Ltd.
+ * Author: Jean-Philippe Brucker <jean-philippe.brucker at arm.com>
+ *
+ * SPDX-License-Identifier: GPL-2.0
+ */
+
+#include <linux/iommu.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+
+static struct workqueue_struct *iommu_fault_queue;
+static DECLARE_RWSEM(iommu_fault_queue_sem);
+static refcount_t iommu_fault_queue_refs = REFCOUNT_INIT(0);
+static BLOCKING_NOTIFIER_HEAD(iommu_fault_queue_flush_notifiers);
+
+/* Non-last faults of incomplete groups, parked until the last one arrives */
+static LIST_HEAD(iommu_partial_faults);
+static DEFINE_SPINLOCK(iommu_partial_faults_lock); /* guards the list */
+
+struct iommu_fault_context {
+	struct device			*dev;	/* device that faulted */
+	struct iommu_fault_event	evt;	/* copy of the reported event */
+	struct list_head		head;	/* entry in a fault list */
+};
+
+struct iommu_fault_group {
+	struct iommu_domain		*domain;	/* used to respond */
+	struct iommu_fault_context	last_fault;	/* last request */
+	struct list_head		faults;		/* all group faults */
+	struct work_struct		work;		/* runs the handler */
+};
+
+/*
+ * iommu_fault_complete() - Finish handling fault @evt raised by @dev
+ * @status is a negative errno or an IOMMU_PAGE_RESP_* code. Returns 0 or an
+ * errno, or the result of page_response() when a response has to be sent.
+ */
+static int iommu_fault_complete(struct iommu_domain *domain, struct device *dev,
+				struct iommu_fault_event *evt, int status)
+{
+	struct page_response_msg resp = {
+		.addr		= evt->addr,
+		.pasid		= evt->pasid,
+		.pasid_present	= evt->pasid_valid,
+		.page_req_group_id = evt->page_req_group_id,
+		.type		= IOMMU_PAGE_GROUP_RESP,
+		.private_data	= evt->iommu_private,
+	};
+
+	/*
+	 * There is no "handling" an unrecoverable fault, so the only valid
+	 * return values are 0 or an error.
+	 */
+	if (evt->type == IOMMU_FAULT_DMA_UNRECOV)
+		return status > 0 ? 0 : status;
+
+	/* Someone took ownership of the fault and will complete it later */
+	if (status == IOMMU_PAGE_RESP_HANDLED)
+		return 0;
+
+	/*
+	 * There was an internal error with handling the recoverable fault. Try
+	 * to complete the fault if possible.
+	 */
+	if (status < 0)
+		status = IOMMU_PAGE_RESP_INVALID;
+
+	if (WARN_ON(!domain->ops->page_response))
+		/*
+		 * The IOMMU driver shouldn't have submitted recoverable faults
+		 * if it cannot receive a response.
+		 */
+		return -EINVAL;
+
+	resp.resp_code = status;
+	return domain->ops->page_response(domain, dev, &resp);
+}
+
+static int iommu_fault_handle_single(struct iommu_fault_context *fault)
+{
+	/* TODO: stub until a real fault handler is added; fails every fault */
+	return -ENODEV;
+}
+
+static void iommu_fault_handle_group(struct work_struct *work)
+{
+	struct iommu_fault_group *group;
+	struct iommu_fault_context *fault, *next;
+	int status = IOMMU_PAGE_RESP_SUCCESS;
+
+	group = container_of(work, struct iommu_fault_group, work);
+	/* Work item: walk the group's faults, then send a single response */
+	list_for_each_entry_safe(fault, next, &group->faults, head) {
+		struct iommu_fault_event *evt = &fault->evt;
+		/*
+		 * Errors are sticky: don't handle subsequent faults in the
+		 * group if there is an error.
+		 */
+		if (status == IOMMU_PAGE_RESP_SUCCESS)
+			status = iommu_fault_handle_single(fault);
+
+		if (!evt->last_req)
+			kfree(fault); /* the last fault lives inside @group */
+	}
+	/* One response for the whole group, built from its last request */
+	iommu_fault_complete(group->domain, group->last_fault.dev,
+			     &group->last_fault.evt, status);
+	kfree(group);
+}
+
+static int iommu_queue_fault(struct iommu_domain *domain, struct device *dev,
+			     struct iommu_fault_event *evt)
+{
+	struct iommu_fault_group *group;
+	struct iommu_fault_context *fault, *next;
+	/* Park non-last requests; on the last one, build and queue the group */
+	if (!iommu_fault_queue)
+		return -ENOSYS;	/* no fault queue registered */
+	/* NOTE(review): GFP_KERNEL below may sleep; confirm callers can block */
+	if (!evt->last_req) {
+		fault = kzalloc(sizeof(*fault), GFP_KERNEL);
+		if (!fault)
+			return -ENOMEM;
+
+		fault->evt = *evt;
+		fault->dev = dev;
+
+		/* Non-last request of a group. Postpone until the last one */
+		spin_lock(&iommu_partial_faults_lock);
+		list_add_tail(&fault->head, &iommu_partial_faults);
+		spin_unlock(&iommu_partial_faults_lock);
+
+		return IOMMU_PAGE_RESP_HANDLED;
+	}
+
+	group = kzalloc(sizeof(*group), GFP_KERNEL);
+	if (!group)
+		return -ENOMEM;
+
+	group->last_fault.evt = *evt;
+	group->last_fault.dev = dev;
+	group->domain = domain;
+	INIT_LIST_HEAD(&group->faults);
+	list_add(&group->last_fault.head, &group->faults);
+	INIT_WORK(&group->work, iommu_fault_handle_group);
+
+	/* Collect parked faults with the same device and group ID */
+	spin_lock(&iommu_partial_faults_lock);
+	list_for_each_entry_safe(fault, next, &iommu_partial_faults, head) {
+		if (fault->evt.page_req_group_id == evt->page_req_group_id &&
+		    fault->dev == dev) {
+			list_del(&fault->head);
+			/* Head insert: lands *before* the last fault */
+			list_add(&fault->head, &group->faults);
+		}
+	}
+	spin_unlock(&iommu_partial_faults_lock);
+
+	queue_work(iommu_fault_queue, &group->work);
+
+	/* Completion is postponed to iommu_fault_handle_group() */
+	return IOMMU_PAGE_RESP_HANDLED;
+}
+
+/**
+ * iommu_report_device_fault() - Handle fault @evt of @dev in driver or mm
+ * If a device fault handler is installed, call it and return its result.
+ * Otherwise queue page requests for the workqueue and complete any other
+ * fault right away; the return value is then iommu_fault_complete()'s.
+ */
+int iommu_report_device_fault(struct device *dev, struct iommu_fault_event *evt)
+{
+	int ret = -ENOSYS;	/* kept when the fault isn't a page request */
+	struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
+
+	if (!domain)
+		return -ENODEV;
+
+	/*
+	 * If upper layers showed interest and installed a fault handler,
+	 * invoke it.
+	 */
+	if (iommu_has_device_fault_handler(dev)) {
+		struct iommu_fault_param *param = dev->iommu_param->fault_param;
+
+		return param->handler(evt, param->data);
+	}
+
+	/* No handler: page requests are queued and handled in the workqueue */
+	if (evt->type == IOMMU_FAULT_PAGE_REQ)
+		ret = iommu_queue_fault(domain, dev, evt);
+
+	return iommu_fault_complete(domain, dev, evt, ret);
+}
+EXPORT_SYMBOL_GPL(iommu_report_device_fault);
+
+/**
+ * iommu_fault_queue_register() - register an IOMMU driver to the fault queue
+ * @flush_notifier: optional notifier block called before the fault queue is
+ * flushed. The IOMMU driver should commit all faults pending in its low-level
+ * queues into the fault queue. The notifier gets a device pointer hinting which
+ * endpoint causes the flush; when it is NULL, all faults should be committed.
+ * Return: 0 on success, -ENOMEM if the workqueue could not be allocated.
+ */
+int iommu_fault_queue_register(struct notifier_block *flush_notifier)
+{
+	/*
+	 * The WQ is unordered because the low-level handler enqueues faults by
+	 * group. PRI requests within a group have to be ordered, but once
+	 * that's dealt with, the high-level function can handle groups out of
+	 * order.
+	 */
+	down_write(&iommu_fault_queue_sem);
+	if (!iommu_fault_queue) {
+		iommu_fault_queue = alloc_workqueue("iommu_fault_queue",
+						    WQ_UNBOUND, 0);
+		if (iommu_fault_queue)
+			refcount_set(&iommu_fault_queue_refs, 1);
+	} else {	/* queue already exists: one more user */
+		refcount_inc(&iommu_fault_queue_refs);
+	}
+	up_write(&iommu_fault_queue_sem);
+	/* NOTE(review): read outside the semaphore; confirm this cannot race */
+	if (!iommu_fault_queue)
+		return -ENOMEM;
+
+	if (flush_notifier)
+		blocking_notifier_chain_register(&iommu_fault_queue_flush_notifiers,
+						 flush_notifier);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(iommu_fault_queue_register);
+
+/**
+ * iommu_fault_queue_flush() - Ensure that all queued faults have been
+ * processed.
+ * @dev: the endpoint whose faults need to be flushed, passed as a hint to the
+ *       flush notifiers. If NULL, flush all pending faults.
+ *
+ * Users must call this function when releasing a PASID, to ensure that all
+ * pending faults affecting this PASID have been handled, and won't affect the
+ * address space of a subsequent process that reuses this PASID.
+ */
+void iommu_fault_queue_flush(struct device *dev)
+{
+	blocking_notifier_call_chain(&iommu_fault_queue_flush_notifiers, 0, dev);
+	/* Read side: unregister takes the write side to destroy the queue */
+	down_read(&iommu_fault_queue_sem);
+	/*
+	 * Don't flush the partial faults list. All PRGs with the PASID are
+	 * complete and have been submitted to the queue.
+	 */
+	if (iommu_fault_queue)
+		flush_workqueue(iommu_fault_queue);
+	up_read(&iommu_fault_queue_sem);
+}
+EXPORT_SYMBOL_GPL(iommu_fault_queue_flush);
+
+/**
+ * iommu_fault_queue_unregister() - Unregister an IOMMU driver from the fault
+ * queue.
+ * @flush_notifier: the notifier given to iommu_fault_queue_register(), or NULL
+ */
+void iommu_fault_queue_unregister(struct notifier_block *flush_notifier)
+{
+	down_write(&iommu_fault_queue_sem);
+	if (refcount_dec_and_test(&iommu_fault_queue_refs)) { /* last user */
+		destroy_workqueue(iommu_fault_queue);
+		iommu_fault_queue = NULL;
+	}
+	up_write(&iommu_fault_queue_sem);
+
+	if (flush_notifier)
+		blocking_notifier_chain_unregister(&iommu_fault_queue_flush_notifiers,
+						   flush_notifier);
+}
+EXPORT_SYMBOL_GPL(iommu_fault_queue_unregister);
diff --git a/drivers/iommu/iommu-sva.c b/drivers/iommu/iommu-sva.c
index 4bc2a8c12465..d7b231cd7355 100644
--- a/drivers/iommu/iommu-sva.c
+++ b/drivers/iommu/iommu-sva.c
@@ -102,9 +102,6 @@
  * the device table and PASID 0 would be available to the allocator.
  */
 
-/* TODO: stub for the fault queue. Remove later. */
-#define iommu_fault_queue_flush(...)
-
 struct iommu_bond {
 	struct io_mm		*io_mm;
 	struct device		*dev;
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 1d60b32a6744..c475893ec7dc 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -798,6 +798,17 @@ int iommu_group_unregister_notifier(struct iommu_group *group,
 }
 EXPORT_SYMBOL_GPL(iommu_group_unregister_notifier);
 
+/**
+ * iommu_register_device_fault_handler() - Register a device fault handler
+ * @dev: the device
+ * @handler: the fault handler
+ * @data: private data passed as argument to the callback
+ *
+ * When an IOMMU fault event is received, call this handler with the fault event
+ * and data as argument.
+ *
+ * Return 0 if the fault handler was installed successfully, or an error.
+ */
 int iommu_register_device_fault_handler(struct device *dev,
 					iommu_dev_fault_handler_t handler,
 					void *data)
@@ -825,6 +836,13 @@ int iommu_register_device_fault_handler(struct device *dev,
 }
 EXPORT_SYMBOL_GPL(iommu_register_device_fault_handler);
 
+/**
+ * iommu_unregister_device_fault_handler() - Unregister the device fault handler
+ * @dev: the device
+ *
+ * Remove the device fault handler installed with
+ * iommu_register_device_fault_handler().
+ */
 int iommu_unregister_device_fault_handler(struct device *dev)
 {
 	struct iommu_param *idata = dev->iommu_param;
@@ -840,19 +858,6 @@ int iommu_unregister_device_fault_handler(struct device *dev)
 }
 EXPORT_SYMBOL_GPL(iommu_unregister_device_fault_handler);
 
-
-int iommu_report_device_fault(struct device *dev, struct iommu_fault_event *evt)
-{
-	/* we only report device fault if there is a handler registered */
-	if (!dev->iommu_param || !dev->iommu_param->fault_param ||
-		!dev->iommu_param->fault_param->handler)
-		return -ENOSYS;
-
-	return dev->iommu_param->fault_param->handler(evt,
-						dev->iommu_param->fault_param->data);
-}
-EXPORT_SYMBOL_GPL(iommu_report_device_fault);
-
 /**
  * iommu_group_id - Return ID for a group
  * @group: the group to ID
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 226ab4f3ae0e..65e56f28e0ce 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -205,6 +205,7 @@ struct page_response_msg {
 	u32 resp_code:4;
 #define IOMMU_PAGE_RESP_SUCCESS	0
 #define IOMMU_PAGE_RESP_INVALID	1
+#define IOMMU_PAGE_RESP_HANDLED	2
 #define IOMMU_PAGE_RESP_FAILURE	0xF
 
 	u32 pasid_present:1;
@@ -534,7 +535,6 @@ extern int iommu_register_device_fault_handler(struct device *dev,
 
 extern int iommu_unregister_device_fault_handler(struct device *dev);
 
-extern int iommu_report_device_fault(struct device *dev, struct iommu_fault_event *evt);
 extern int iommu_page_response(struct iommu_domain *domain, struct device *dev,
 			       struct page_response_msg *msg);
 
@@ -836,11 +836,6 @@ static inline bool iommu_has_device_fault_handler(struct device *dev)
 	return false;
 }
 
-static inline int iommu_report_device_fault(struct device *dev, struct iommu_fault_event *evt)
-{
-	return 0;
-}
-
 static inline int iommu_page_response(struct iommu_domain *domain, struct device *dev,
 				      struct page_response_msg *msg)
 {
@@ -1005,4 +1000,31 @@ static inline struct mm_struct *iommu_sva_find(int pasid)
 }
 #endif /* CONFIG_IOMMU_SVA */
 
+#ifdef CONFIG_IOMMU_FAULT
+extern int iommu_fault_queue_register(struct notifier_block *flush_notifier);
+extern void iommu_fault_queue_flush(struct device *dev);
+extern void iommu_fault_queue_unregister(struct notifier_block *flush_notifier);
+extern int iommu_report_device_fault(struct device *dev,
+				     struct iommu_fault_event *evt);
+#else /* CONFIG_IOMMU_FAULT */
+static inline int iommu_fault_queue_register(struct notifier_block *flush_notifier)
+{
+	return -ENODEV;	/* stub: no fault queue in this configuration */
+}
+
+static inline void iommu_fault_queue_flush(struct device *dev)
+{
+}
+
+static inline void iommu_fault_queue_unregister(struct notifier_block *flush_notifier)
+{
+}
+
+static inline int iommu_report_device_fault(struct device *dev,
+					    struct iommu_fault_event *evt)
+{
+	return 0;	/* NOTE(review): returns 0 though nothing ran; confirm */
+}
+#endif /* CONFIG_IOMMU_FAULT */
+
 #endif /* __LINUX_IOMMU_H */
-- 
2.15.1




More information about the linux-arm-kernel mailing list