[bug report] iommu/arm-smmu-v3: Event cannot be printed in some scenarios

Pranjal Shrivastava praan at google.com
Thu Aug 8 06:50:17 PDT 2024


On Wed, Aug 07, 2024 at 01:35:03PM +0800, Baolu Lu wrote:
> On 2024/8/6 23:58, Pranjal Shrivastava wrote:
> > On Tue, Aug 06, 2024 at 09:49:43AM -0300, Jason Gunthorpe wrote:
> > > On Mon, Aug 05, 2024 at 03:32:50PM +0000, Pranjal Shrivastava wrote:
> > > > Here's the updated diff:
> > > > diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> > > > index a31460f9f3d4..ed2b106e02dd 100644
> > > > --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> > > > +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> > > > @@ -1777,7 +1777,7 @@ static int arm_smmu_handle_evt(struct arm_smmu_device *smmu, u64 *evt)
> > > >   		goto out_unlock;
> > > >   	}
> > > > -	iommu_report_device_fault(master->dev, &fault_evt);
> > > > +	ret = iommu_report_device_fault(master->dev, &fault_evt);
> > > >   out_unlock:
> > > >   	mutex_unlock(&smmu->streams_mutex);
> > > >   	return ret;
> > > > diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c
> > > > index 0e3a9b38bef2..7684e7562584 100644
> > > > --- a/drivers/iommu/intel/svm.c
> > > > +++ b/drivers/iommu/intel/svm.c
> > > > @@ -532,6 +532,9 @@ void intel_svm_page_response(struct device *dev, struct iopf_fault *evt,
> > > >   	bool last_page;
> > > >   	u16 sid;
> > > > +	if (!evt)
> > > > +		return;
> > > > +
> > > > I'm not sure this makes sense?
> > > 
> > > The point of this path is for the driver to retire the fault with a
> > > > failure. This prevents that from happening on Intel and we are back to
> > > > losing track of a fault.
> > > 
> > > All calls to iommu_report_device_fault() must result in
> > > page_response() properly retiring whatever the event was.
> > > 
> > > > +static void iopf_error_response(struct device *dev, struct iommu_fault *fault)
> > > > +{
> > > > +	const struct iommu_ops *ops = dev_iommu_ops(dev);
> > > > +	struct iommu_page_response resp = {
> > > > +		.pasid = fault->prm.pasid,
> > > > +		.grpid = fault->prm.grpid,
> > > > +		.code = IOMMU_PAGE_RESP_INVALID
> > > > +	};
> > > > +
> > > > +	ops->page_response(dev, NULL, &resp);
> > > > +}
> > > The issue originates here, why is this NULL?
> > > 
> > > void iommu_report_device_fault(struct device *dev, struct iopf_fault *evt)
> > > {
> > > 
> > > The caller has an evt? I think we should pass it down.
> > > Hmm, I agree, I don't see `iommu_report_device_fault` being called anywhere
> > > with a NULL evt. Hence, it does make sense to pass the evt down and
> > ensure we don't lose track of the event.
> > 
> > > I'm assuming that we retired the if (!evt) check from intel->page_response
> > > since we didn't have any callers of intel->page_response
> > > with a NULL evt. (At least, for now, I don't see that happen).
> > 
> > Lu, Will -- Any additional comments/suggestions for this?
> 
> No. If evt is passed down in the above code, there is no need to add
> such check anymore.
>

Ack. I've addressed the review comments.
Jason -- I hope this handles report_partial_fault()'s -ENOMEM
return to your liking?

Kunkun -- Could you please try this diff and check whether it fixes the problem?

> Thanks,
> baolu

Thanks,
Pranjal

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index a31460f9f3d4..ed2b106e02dd 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -1777,7 +1777,7 @@ static int arm_smmu_handle_evt(struct arm_smmu_device *smmu, u64 *evt)
 		goto out_unlock;
 	}
 
-	iommu_report_device_fault(master->dev, &fault_evt);
+	ret = iommu_report_device_fault(master->dev, &fault_evt);
 out_unlock:
 	mutex_unlock(&smmu->streams_mutex);
 	return ret;
diff --git a/drivers/iommu/io-pgfault.c b/drivers/iommu/io-pgfault.c
index 7c9011992d3f..4c56da1373cc 100644
--- a/drivers/iommu/io-pgfault.c
+++ b/drivers/iommu/io-pgfault.c
@@ -113,6 +113,59 @@ static struct iopf_group *iopf_group_alloc(struct iommu_fault_param *iopf_param,
 	return group;
 }
 
+static struct iommu_attach_handle *find_fault_handler(struct device *dev,
+						     struct iopf_fault *evt)
+{
+	struct iommu_fault *fault = &evt->fault;
+	struct iommu_attach_handle *attach_handle;
+
+	if (fault->prm.flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID) {
+		attach_handle = iommu_attach_handle_get(dev->iommu_group,
+				fault->prm.pasid, 0);
+		if (IS_ERR(attach_handle)) {
+			const struct iommu_ops *ops = dev_iommu_ops(dev);
+
+			if (!ops->user_pasid_table)
+				return NULL;
+			/*
+			 * The iommu driver for this device supports user-
+			 * managed PASID table. Therefore page faults for
+			 * any PASID should go through the NESTING domain
+			 * attached to the device RID.
+			 */
+			attach_handle = iommu_attach_handle_get(
+					dev->iommu_group, IOMMU_NO_PASID,
+					IOMMU_DOMAIN_NESTED);
+			if (IS_ERR(attach_handle))
+				return NULL;
+		}
+	} else {
+		attach_handle = iommu_attach_handle_get(dev->iommu_group,
+				IOMMU_NO_PASID, 0);
+
+		if (IS_ERR(attach_handle))
+			return NULL;
+	}
+
+	if (!attach_handle->domain->iopf_handler)
+		return NULL;
+
+	return attach_handle;
+}
+
+static void iopf_error_response(struct device *dev, struct iopf_fault *evt)
+{
+	const struct iommu_ops *ops = dev_iommu_ops(dev);
+	struct iommu_fault *fault = &evt->fault;
+	struct iommu_page_response resp = {
+		.pasid = fault->prm.pasid,
+		.grpid = fault->prm.grpid,
+		.code = IOMMU_PAGE_RESP_INVALID
+	};
+
+	ops->page_response(dev, evt, &resp);
+}
+
 /**
  * iommu_report_device_fault() - Report fault event to device driver
  * @dev: the device
@@ -153,21 +206,38 @@ static struct iopf_group *iopf_group_alloc(struct iommu_fault_param *iopf_param,
  * hardware has been set to block the page faults) and the pending page faults
  * have been flushed.
  */
-void iommu_report_device_fault(struct device *dev, struct iopf_fault *evt)
+int iommu_report_device_fault(struct device *dev, struct iopf_fault *evt)
 {
+	struct iommu_attach_handle *attach_handle;
 	struct iommu_fault *fault = &evt->fault;
 	struct iommu_fault_param *iopf_param;
 	struct iopf_group abort_group = {};
 	struct iopf_group *group;
+	int ret = 0;
 
+	attach_handle = find_fault_handler(dev, evt);
+	if (!attach_handle) {
+		ret = -EINVAL;
+		goto err_bad_iopf;
+	}
+
+	/*
+	 * Something has gone wrong if a fault capable domain is attached but no
+	 * iopf_param is setup
+	 */
 	iopf_param = iopf_get_dev_fault_param(dev);
-	if (WARN_ON(!iopf_param))
-		return;
+	if (WARN_ON(!iopf_param)) {
+		ret = -EINVAL;
+		goto err_bad_iopf;
+	}
 
 	if (!(fault->prm.flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE)) {
-		report_partial_fault(iopf_param, fault);
+		ret = report_partial_fault(iopf_param, fault);
 		iopf_put_dev_fault_param(iopf_param);
 		/* A request that is not the last does not need to be ack'd */
+
+		if (ret)
+			goto err_bad_iopf;
 	}
 
 	/*
@@ -182,38 +252,7 @@ void iommu_report_device_fault(struct device *dev, struct iopf_fault *evt)
 	if (group == &abort_group)
 		goto err_abort;
 
-	if (fault->prm.flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID) {
-		group->attach_handle = iommu_attach_handle_get(dev->iommu_group,
-							       fault->prm.pasid,
-							       0);
-		if (IS_ERR(group->attach_handle)) {
-			const struct iommu_ops *ops = dev_iommu_ops(dev);
-
-			if (!ops->user_pasid_table)
-				goto err_abort;
-
-			/*
-			 * The iommu driver for this device supports user-
-			 * managed PASID table. Therefore page faults for
-			 * any PASID should go through the NESTING domain
-			 * attached to the device RID.
-			 */
-			group->attach_handle =
-				iommu_attach_handle_get(dev->iommu_group,
-							IOMMU_NO_PASID,
-							IOMMU_DOMAIN_NESTED);
-			if (IS_ERR(group->attach_handle))
-				goto err_abort;
-		}
-	} else {
-		group->attach_handle =
-			iommu_attach_handle_get(dev->iommu_group, IOMMU_NO_PASID, 0);
-		if (IS_ERR(group->attach_handle))
-			goto err_abort;
-	}
-
-	if (!group->attach_handle->domain->iopf_handler)
-		goto err_abort;
+	group->attach_handle = attach_handle;
 
 	/*
 	 * On success iopf_handler must call iopf_group_response() and
@@ -222,7 +261,7 @@ void iommu_report_device_fault(struct device *dev, struct iopf_fault *evt)
 	if (group->attach_handle->domain->iopf_handler(group))
 		goto err_abort;
 
-	return;
+	return 0;
 
 err_abort:
 	dev_warn_ratelimited(dev, "iopf with pasid %d aborted\n",
@@ -232,6 +271,14 @@ void iommu_report_device_fault(struct device *dev, struct iopf_fault *evt)
 		__iopf_free_group(group);
 	else
 		iopf_free_group(group);
+
+	return 0;
+
+err_bad_iopf:
+	if (fault->type == IOMMU_FAULT_PAGE_REQ)
+		iopf_error_response(dev, evt);
+
+	return ret;
 }
 EXPORT_SYMBOL_GPL(iommu_report_device_fault);
 
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index d87f9cbfc01e..062156a8d87b 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -1561,7 +1561,7 @@ struct iopf_queue *iopf_queue_alloc(const char *name);
 void iopf_queue_free(struct iopf_queue *queue);
 int iopf_queue_discard_partial(struct iopf_queue *queue);
 void iopf_free_group(struct iopf_group *group);
-void iommu_report_device_fault(struct device *dev, struct iopf_fault *evt);
+int iommu_report_device_fault(struct device *dev, struct iopf_fault *evt);
 void iopf_group_response(struct iopf_group *group,
 			 enum iommu_page_response_code status);
 #else
@@ -1599,9 +1599,10 @@ static inline void iopf_free_group(struct iopf_group *group)
 {
 }
 
-static inline void
+static inline int
 iommu_report_device_fault(struct device *dev, struct iopf_fault *evt)
 {
+	return -ENODEV;
 }
 
 static inline void iopf_group_response(struct iopf_group *group,



More information about the linux-arm-kernel mailing list