[PATCH v6 6/6] iommu/tegra241-cmdqv: Limit CMDs for guest owned VINTF

Jason Gunthorpe jgg at nvidia.com
Tue Apr 30 10:06:55 PDT 2024


On Mon, Apr 29, 2024 at 09:43:49PM -0700, Nicolin Chen wrote:
> -struct arm_smmu_cmdq *tegra241_cmdqv_get_cmdq(struct arm_smmu_device *smmu)
> +static bool tegra241_vintf_support_cmds(struct tegra241_vintf *vintf,
> +					u64 *cmds, int n)
> +{
> +	int i;
> +
> +	/* VINTF owned by hypervisor can execute any command */
> +	if (vintf->hyp_own)
> +		return true;
> +
> +	/* A guest-owned VINTF must check against the list of supported CMDs */
> +	for (i = 0; i < n; i++) {
> +		switch (FIELD_GET(CMDQ_0_OP, cmds[i * CMDQ_ENT_DWORDS])) {
> +		case CMDQ_OP_TLBI_NH_ASID:
> +		case CMDQ_OP_TLBI_NH_VA:
> +		case CMDQ_OP_ATC_INV:

So the guest CMDQ only works if ARM_SMMU_FEAT_E2H is not in use, since
the EL2 TLBI opcodes are missing from this list? Probably worth
mentioning that too along with the discussion about HYP ownership.
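
For reference, the core code already picks the EL2 opcodes when E2H is
on - e.g. the existing arm_smmu_tlb_inv_asid() in arm-smmu-v3.c - and
those would land in the default: case above:

	void arm_smmu_tlb_inv_asid(struct arm_smmu_device *smmu, u16 asid)
	{
		struct arm_smmu_cmdq_ent cmd = {
			.opcode	= smmu->features & ARM_SMMU_FEAT_E2H ?
				CMDQ_OP_TLBI_EL2_ASID : CMDQ_OP_TLBI_NH_ASID,
			.tlbi.asid = asid,
		};

		arm_smmu_cmdq_issue_cmd_with_sync(smmu, &cmd);
	}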


> +			continue;
> +		default:
> +			return false;
> +		}
> +	}
> +
> +	return true;
> +}

For a performance path this looping seems disappointing. The callers
don't actually mix different command types. Is there something
preventing adding a parameter at the callers?
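
E.g. with a per-batch hint like the enum in the diff below, the whole
loop collapses to a single test. Something like this sketch (names are
mine):

	static bool tegra241_vintf_support_cmds(struct tegra241_vintf *vintf,
						enum required_cmds required)
	{
		/* A VINTF owned by the hypervisor can execute any command */
		if (vintf->hyp_own)
			return true;

		/* A guest-owned VINTF is limited to invalidations */
		return required == CMDS_INVALIDATION;
	}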

Actually, looking at this more closely, isn't the command queue
selection in the wrong place?

I.e. this batch stuff:

static void arm_smmu_cmdq_batch_add(struct arm_smmu_device *smmu,
				    struct arm_smmu_cmdq_batch *cmds,
				    struct arm_smmu_cmdq_ent *cmd)
{
	int index;

	if (cmds->num == CMDQ_BATCH_ENTRIES - 1 &&
	    (smmu->options & ARM_SMMU_OPT_CMDQ_FORCE_SYNC)) {
		arm_smmu_cmdq_issue_cmdlist(smmu, cmds->cmds, cmds->num, true);
		cmds->num = 0;
	}

	if (cmds->num == CMDQ_BATCH_ENTRIES) {
		arm_smmu_cmdq_issue_cmdlist(smmu, cmds->cmds, cmds->num, false);
		cmds->num = 0;
	}

	index = cmds->num * CMDQ_ENT_DWORDS;
	if (unlikely(arm_smmu_cmdq_build_cmd(&cmds->cmds[index], cmd))) {
		dev_warn(smmu->dev, "ignoring unknown CMDQ opcode 0x%x\n",
			 cmd->opcode);
		return;
	}

Has to push everything, across all the iterations of add/submit, onto
the same CMDQ, otherwise the SYNC won't properly flush the earlier
commands?

But each arm_smmu_cmdq_issue_cmdlist() call selects its own queue. Yes,
they probably return the same Q since we are probably still on the same
CPU, but it seems logically wrong (and slower!) to organize it like
this.
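
Concretely, if the thread migrates between two issue_cmdlist() calls, a
per-call queue selection could split one logical batch (hypothetical
trace, assuming the VCMDQ is picked per-CPU):

	arm_smmu_cmdq_batch_add()    -> issue_cmdlist() on VCMDQ0, no sync
	   ... scheduler migrates us to another CPU ...
	arm_smmu_cmdq_batch_submit() -> issue_cmdlist() on VCMDQ1, with sync

The SYNC pushed to VCMDQ1 does not order the commands still sitting in
VCMDQ0.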

I would expect the Q to be selected when the struct arm_smmu_cmdq_batch
is allocated on the stack, and to be the same for the entire batch
operation. Not only do we spend less time computing which Q to use, we
also get a built-in guarantee that every command will be on the same Q
as the fencing SYNC.

Something sort of like this as another patch?

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index 268da20baa4e9c..d8c9597878315a 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -357,11 +357,22 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent)
 	return 0;
 }
 
-static struct arm_smmu_cmdq *arm_smmu_get_cmdq(struct arm_smmu_device *smmu,
-					       u64 *cmds, int n)
+enum required_cmds {
+	CMDS_ALL,
+	/*
+	 * Commands will be one of:
+	 *  CMDQ_OP_ATC_INV, CMDQ_OP_TLBI_EL2_VA, CMDQ_OP_TLBI_NH_VA,
+	 *  CMDQ_OP_TLBI_EL2_ASID, CMDQ_OP_TLBI_NH_ASID, CMDQ_OP_TLBI_S2_IPA,
+	 *  CMDQ_OP_TLBI_S12_VMALL, CMDQ_OP_SYNC
+	 */
+	CMDS_INVALIDATION,
+};
+
+static struct arm_smmu_cmdq *
+arm_smmu_get_cmdq(struct arm_smmu_device *smmu, enum required_cmds required)
 {
 	if (smmu->tegra241_cmdqv)
-		return tegra241_cmdqv_get_cmdq(smmu, cmds, n);
+		return tegra241_cmdqv_get_cmdq(smmu, required);
 
 	return &smmu->cmdq;
 }
@@ -766,13 +777,13 @@ static void arm_smmu_cmdq_write_entries(struct arm_smmu_cmdq *cmdq, u64 *cmds,
  *   CPU will appear before any of the commands from the other CPU.
  */
 static int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu,
-				       u64 *cmds, int n, bool sync)
+				       struct arm_smmu_cmdq *cmdq, u64 *cmds,
+				       int n, bool sync)
 {
 	u64 cmd_sync[CMDQ_ENT_DWORDS];
 	u32 prod;
 	unsigned long flags;
 	bool owner;
-	struct arm_smmu_cmdq *cmdq = arm_smmu_get_cmdq(smmu, cmds, n);
 	struct arm_smmu_ll_queue llq, head;
 	int ret = 0;
 
@@ -897,7 +908,8 @@ static int __arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu,
 		return -EINVAL;
 	}
 
-	return arm_smmu_cmdq_issue_cmdlist(smmu, cmd, 1, sync);
+	return arm_smmu_cmdq_issue_cmdlist(
+		smmu, arm_smmu_get_cmdq(smmu, CMDS_ALL), cmd, 1, sync);
 }
 
 static int arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu,
@@ -912,6 +924,14 @@ static int arm_smmu_cmdq_issue_cmd_with_sync(struct arm_smmu_device *smmu,
 	return __arm_smmu_cmdq_issue_cmd(smmu, ent, true);
 }
 
+static void arm_smmu_cmdq_batch_init(struct arm_smmu_device *smmu,
+				     struct arm_smmu_cmdq_batch *cmds,
+				     enum required_cmds required)
+{
+	cmds->num = 0;
+	cmds->q = arm_smmu_get_cmdq(smmu, required);
+}
+
 static void arm_smmu_cmdq_batch_add(struct arm_smmu_device *smmu,
 				    struct arm_smmu_cmdq_batch *cmds,
 				    struct arm_smmu_cmdq_ent *cmd)
@@ -920,12 +940,14 @@ static void arm_smmu_cmdq_batch_add(struct arm_smmu_device *smmu,
 
 	if (cmds->num == CMDQ_BATCH_ENTRIES - 1 &&
 	    (smmu->options & ARM_SMMU_OPT_CMDQ_FORCE_SYNC)) {
-		arm_smmu_cmdq_issue_cmdlist(smmu, cmds->cmds, cmds->num, true);
+		arm_smmu_cmdq_issue_cmdlist(smmu, cmds->q, cmds->cmds,
+					    cmds->num, true);
 		cmds->num = 0;
 	}
 
 	if (cmds->num == CMDQ_BATCH_ENTRIES) {
-		arm_smmu_cmdq_issue_cmdlist(smmu, cmds->cmds, cmds->num, false);
+		arm_smmu_cmdq_issue_cmdlist(smmu, cmds->q, cmds->cmds,
+					    cmds->num, false);
 		cmds->num = 0;
 	}
 
@@ -942,7 +964,8 @@ static void arm_smmu_cmdq_batch_add(struct arm_smmu_device *smmu,
 static int arm_smmu_cmdq_batch_submit(struct arm_smmu_device *smmu,
 				      struct arm_smmu_cmdq_batch *cmds)
 {
-	return arm_smmu_cmdq_issue_cmdlist(smmu, cmds->cmds, cmds->num, true);
+	return arm_smmu_cmdq_issue_cmdlist(smmu, cmds->q, cmds->cmds, cmds->num,
+					   true);
 }
 
 static void arm_smmu_page_response(struct device *dev, struct iopf_fault *unused,
@@ -1181,7 +1204,7 @@ static void arm_smmu_sync_cd(struct arm_smmu_master *master,
 		},
 	};
 
-	cmds.num = 0;
+	arm_smmu_cmdq_batch_init(smmu, &cmds, CMDS_ALL);
 	for (i = 0; i < master->num_streams; i++) {
 		cmd.cfgi.sid = master->streams[i].id;
 		arm_smmu_cmdq_batch_add(smmu, &cmds, &cmd);
@@ -2045,7 +2068,7 @@ static int arm_smmu_atc_inv_master(struct arm_smmu_master *master,
 
 	arm_smmu_atc_inv_to_cmd(ssid, 0, 0, &cmd);
 
-	cmds.num = 0;
+	arm_smmu_cmdq_batch_init(master->smmu, &cmds, CMDS_INVALIDATION);
 	for (i = 0; i < master->num_streams; i++) {
 		cmd.atc.sid = master->streams[i].id;
 		arm_smmu_cmdq_batch_add(master->smmu, &cmds, &cmd);
@@ -2083,7 +2106,7 @@ int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain,
 	if (!atomic_read(&smmu_domain->nr_ats_masters))
 		return 0;
 
-	cmds.num = 0;
+	arm_smmu_cmdq_batch_init(smmu_domain->smmu, &cmds, CMDS_INVALIDATION);
 
 	spin_lock_irqsave(&smmu_domain->devices_lock, flags);
 	list_for_each_entry(master_domain, &smmu_domain->devices,
@@ -2161,7 +2184,7 @@ static void __arm_smmu_tlb_inv_range(struct arm_smmu_cmdq_ent *cmd,
 			num_pages++;
 	}
 
-	cmds.num = 0;
+	arm_smmu_cmdq_batch_init(smmu_domain->smmu, &cmds, CMDS_INVALIDATION);
 
 	while (iova < end) {
 		if (smmu->features & ARM_SMMU_FEAT_RANGE_INV) {
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
index 9412fa4ff5e045..5651ea2541a0a2 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
@@ -576,6 +576,7 @@ struct arm_smmu_cmdq {
 
 struct arm_smmu_cmdq_batch {
 	u64				cmds[CMDQ_BATCH_ENTRIES * CMDQ_ENT_DWORDS];
+	struct arm_smmu_cmdq		*q;
 	int				num;
 };
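
The tegra side would then hook up trivially. Something like this sketch
(the vintf lookup and the VCMDQ pick are whatever the driver already
does; the names below are made up):

	static struct arm_smmu_cmdq *
	tegra241_cmdqv_get_cmdq(struct arm_smmu_device *smmu,
				enum required_cmds required)
	{
		struct tegra241_cmdqv *cmdqv = smmu->tegra241_cmdqv;
		struct tegra241_vintf *vintf = cmdqv->vintf; /* hypothetical field */

		/* A guest-owned VINTF can only accelerate invalidations */
		if (!vintf->hyp_own && required != CMDS_INVALIDATION)
			return &smmu->cmdq;

		/* hypothetical helper: pick a (e.g. per-CPU) VCMDQ */
		return tegra241_vintf_get_vcmdq(vintf);
	}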
 


