[PATCH RFC v5 15/18] riscv_cbqri: resctrl: Add mbm_total_bytes bandwidth monitoring

Drew Fustini fustini at kernel.org
Sun May 24 16:55:45 PDT 2026


Expose the CBQRI bandwidth controller's combined read+write counter as
the L3 mbm_total_bytes event. A software accumulator keeps the 64-bit
byte total monotonic when the 62-bit hardware counter wraps.

mbm_local_bytes is not supported because the CBQRI spec has no way
to distinguish local from total traffic. mbm_total_bytes is enabled only
when the platform exposes exactly one mon-capable bandwidth
controller and exactly one L3 domain. Pairing a single BC with
multiple L3 domains would make standard userspace tools overcount
system bandwidth, because they sum the same counter across domains.

Assisted-by: Claude:claude-opus-4-7
Co-developed-by: Adrien Ricciardi <aricciardi at baylibre.com>
Signed-off-by: Adrien Ricciardi <aricciardi at baylibre.com>
Signed-off-by: Drew Fustini <fustini at kernel.org>
---
 drivers/resctrl/cbqri_resctrl.c | 191 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 187 insertions(+), 4 deletions(-)

diff --git a/drivers/resctrl/cbqri_resctrl.c b/drivers/resctrl/cbqri_resctrl.c
index ba764bc6ef17..f11709d7e479 100644
--- a/drivers/resctrl/cbqri_resctrl.c
+++ b/drivers/resctrl/cbqri_resctrl.c
@@ -29,6 +29,13 @@ struct cbqri_resctrl_res {
 struct cbqri_resctrl_dom {
 	struct rdt_ctrl_domain  resctrl_ctrl_dom;
 	struct cbqri_controller *hw_ctrl;
+	/*
+	 * For an L3 capacity controller paired with a bandwidth controller
+	 * of matching topology, paired_bc caches that BC so mbm_total_bytes
+	 * reads / resets don't have to walk cbqri_controllers on every hit.
+	 * NULL for non-L3 domains and L3s without a paired BC.
+	 */
+	struct cbqri_controller *paired_bc;
 };
 
 static struct cbqri_resctrl_res cbqri_resctrl_resources[RDT_NUM_RESOURCES];
@@ -37,7 +44,7 @@ static struct cbqri_resctrl_res cbqri_resctrl_resources[RDT_NUM_RESOURCES];
  * Per-event controller table. Only events CBQRI can back occupy a
  * slot, so other events do not bloat the array.
  */
-#define CBQRI_MAX_EVENT QOS_L3_OCCUP_EVENT_ID
+#define CBQRI_MAX_EVENT QOS_L3_MBM_TOTAL_EVENT_ID
 static struct cbqri_controller *cbqri_resctrl_counters[CBQRI_MAX_EVENT + 1];
 
 static bool exposed_alloc_capable;
@@ -228,6 +235,36 @@ void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_l3_mon_domain *d
 		mutex_unlock(&ctrl->lock);
 		break;
 
+	case QOS_L3_MBM_TOTAL_EVENT_ID: {
+		struct cbqri_controller *bc;
+
+		cd = cbqri_find_ctrl_domain(&r->ctrl_domains, d->hdr.id);
+		if (!cd)
+			break;
+		hw_dom = container_of(cd, struct cbqri_resctrl_dom, resctrl_ctrl_dom);
+		bc = hw_dom->paired_bc;
+		if (!bc)
+			break;
+		if (WARN_ON_ONCE(!bc->mbm_total_states))
+			break;
+		if (rmid >= bc->mcid_count)
+			break;
+
+		mutex_lock(&bc->lock);
+		/*
+		 * CONFIG_EVENT both resets and re-arms. Skip the accumulator
+		 * memset on failure: a stale hardware counter X paired with
+		 * prev_ctr=0 would inject overflow(0, X) on the next read.
+		 */
+		if (!cbqri_mon_op(bc, CBQRI_BC_MON_CTL_OFF,
+				  CBQRI_BC_MON_CTL_OP_CONFIG_EVENT, rmid,
+				  CBQRI_BC_EVT_ID_TOTAL_READ_WRITE, NULL))
+			memset(&bc->mbm_total_states[rmid], 0,
+			       sizeof(*bc->mbm_total_states));
+		mutex_unlock(&bc->lock);
+		break;
+	}
+
 	default:
 		break;
 	}
@@ -240,8 +277,10 @@ void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_l3_mon_domai
 	int i;
 
 	/* Bound by max_rmid (system-wide minimum mcid_count). */
-	for (i = 0; i < max_rmid; i++)
+	for (i = 0; i < max_rmid; i++) {
 		resctrl_arch_reset_rmid(r, d, 0, i, QOS_L3_OCCUP_EVENT_ID);
+		resctrl_arch_reset_rmid(r, d, 0, i, QOS_L3_MBM_TOTAL_EVENT_ID);
+	}
 }
 
 int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain_hdr *hdr,
@@ -305,6 +344,82 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain_hdr *hdr,
 		mutex_unlock(&ctrl->lock);
 		break;
 
+	case QOS_L3_MBM_TOTAL_EVENT_ID: {
+		struct cbqri_controller *bc;
+
+		/*
+		 * The L3 monitoring domain's id is the L3 cache id. The
+		 * matching ctrl domain's hw_dom->paired_bc was cached at
+		 * add time to avoid walking cbqri_controllers on every read.
+		 */
+		d = cbqri_find_ctrl_domain(&r->ctrl_domains, hdr->id);
+		if (!d) {
+			err = -ENOENT;
+			break;
+		}
+		hw_dom = container_of(d, struct cbqri_resctrl_dom, resctrl_ctrl_dom);
+		bc = hw_dom->paired_bc;
+		if (!bc) {
+			err = -ENOENT;
+			break;
+		}
+		if (WARN_ON_ONCE(!bc->mbm_total_states)) {
+			err = -EIO;
+			break;
+		}
+		if (rmid >= bc->mcid_count) {
+			err = -ERANGE;
+			break;
+		}
+
+		mutex_lock(&bc->lock);
+		/* Pass EVT_ID explicitly. Same reason as the CC path above. */
+		err = cbqri_mon_op(bc, CBQRI_BC_MON_CTL_OFF,
+				   CBQRI_BC_MON_CTL_OP_READ_COUNTER, rmid,
+				   CBQRI_BC_EVT_ID_TOTAL_READ_WRITE, NULL);
+		if (err)
+			goto out_bc;
+
+		ctr_val = ioread64(bc->base + CBQRI_BC_MON_CTR_VAL_OFF);
+
+		if (ctr_val & CBQRI_BC_MON_CTR_VAL_INVALID) {
+			/*
+			 * Return the last good total and leave prev_ctr so
+			 * the next valid sample resumes from there.
+			 */
+			*val = bc->mbm_total_states[rmid].chunks;
+		} else if (ctr_val & CBQRI_BC_MON_CTR_VAL_OVF) {
+			/*
+			 * OVF is sticky until next CONFIG_EVENT.
+			 * cbqri_bc_mon_overflow() can recover at most
+			 * one wrap. With OVF set, the count is unknown,
+			 * so re-arm and re-anchor prev_ctr=0.
+			 */
+			struct cbqri_bc_mon_state *s = &bc->mbm_total_states[rmid];
+
+			pr_warn_ratelimited("BC@%pa MCID %u: bandwidth counter overflow\n",
+					    &bc->addr, rmid);
+			err = cbqri_mon_op(bc, CBQRI_BC_MON_CTL_OFF,
+					   CBQRI_BC_MON_CTL_OP_CONFIG_EVENT, rmid,
+					   CBQRI_BC_EVT_ID_TOTAL_READ_WRITE, NULL);
+			if (err)
+				goto out_bc;
+
+			s->prev_ctr = 0;
+			*val = s->chunks;
+		} else {
+			struct cbqri_bc_mon_state *s = &bc->mbm_total_states[rmid];
+			u64 cur = ctr_val & CBQRI_BC_MON_CTR_VAL_CTR_MASK;
+
+			s->chunks  += cbqri_bc_mon_overflow(s->prev_ctr, cur);
+			s->prev_ctr = cur;
+			*val        = s->chunks;
+		}
+out_bc:
+		mutex_unlock(&bc->lock);
+		break;
+	}
+
 	default:
 		err = -EINVAL;
 		break;
@@ -738,6 +853,15 @@ static int cbqri_resctrl_control_init(struct cbqri_resctrl_res *cbqri_res)
 			res->mon.num_rmid = ctrl->mcid_count;
 			resctrl_enable_mon_event(QOS_L3_OCCUP_EVENT_ID,
 						 false, 0, NULL);
+
+			/*
+			 * Expose BC bandwidth monitoring as the L3's
+			 * mbm_total_bytes when they share topology.
+			 */
+			if (cbqri_resctrl_counters[QOS_L3_MBM_TOTAL_EVENT_ID])
+				resctrl_enable_mon_event(QOS_L3_MBM_TOTAL_EVENT_ID,
+							 false, 0, NULL);
+
 			res->mon_capable = true;
 		}
 		break;
@@ -824,15 +948,54 @@ static int cbqri_resctrl_pick_bw_alloc(void)
 }
 
 /*
- * Pick one controller per monitoring event.  L3 OCCUP comes from the
- * picked L3 CC (if mon_capable).
+ * Pick one controller per monitoring event. L3 OCCUP comes from the
+ * picked L3 CC if it is mon_capable. MBM_TOTAL comes from the only
+ * mon-capable BC, but only when the system exposes exactly one L3
+ * cache. Pairing a single BC with multiple L3 domains would overcount
+ * system bandwidth by a factor equal to the L3 domain count.
  */
 static void cbqri_resctrl_pick_counters(void)
 {
 	struct cbqri_resctrl_res *l3 = &cbqri_resctrl_resources[RDT_RESOURCE_L3];
+	struct cbqri_controller *ctrl, *prev;
+	unsigned int l3_count = 0;
 
 	if (l3->ctrl && l3->ctrl->mon_capable)
 		cbqri_resctrl_counters[QOS_L3_OCCUP_EVENT_ID] = l3->ctrl;
+
+	/* Count distinct L3 cache_ids */
+	list_for_each_entry(ctrl, &cbqri_controllers, list) {
+		bool seen = false;
+
+		if (ctrl->type != CBQRI_CONTROLLER_TYPE_CAPACITY)
+			continue;
+		if (ctrl->cache.cache_level != 3)
+			continue;
+
+		list_for_each_entry(prev, &cbqri_controllers, list) {
+			if (prev == ctrl)
+				break;
+			if (prev->type != CBQRI_CONTROLLER_TYPE_CAPACITY)
+				continue;
+			if (prev->cache.cache_level != 3)
+				continue;
+			if (prev->cache.cache_id == ctrl->cache.cache_id) {
+				seen = true;
+				break;
+			}
+		}
+		if (!seen)
+			l3_count++;
+	}
+
+	if (l3_count > 1) {
+		pr_warn_once("multiple L3 domains (%u) detected. mbm_total_bytes disabled\n",
+			     l3_count);
+		return;
+	}
+
+	cbqri_resctrl_counters[QOS_L3_MBM_TOTAL_EVENT_ID] =
+		cbqri_find_only_mon_bc();
 }
 
 static void cbqri_resctrl_accumulate_caps(void)
@@ -948,6 +1111,26 @@ static int cbqri_attach_cpu_to_l3_mon(struct cbqri_controller *ctrl,
 	else
 		list_add_tail(&mon_dom->hdr.list, &res->mon_domains);
 
+	/*
+	 * Pair this L3 domain with the system's mon-capable BC and
+	 * initialise the BC's per-MCID software accumulators before
+	 * resctrl_online_mon_domain() exposes the domain to userspace.
+	 * A concurrent sysfs read of mbm_total_bytes between online and
+	 * BC init would otherwise pass the !bc->mbm_total_states check
+	 * with a half-initialised pointer.
+	 */
+	hw_dom = container_of(ctrl_dom, struct cbqri_resctrl_dom, resctrl_ctrl_dom);
+
+	hw_dom->paired_bc = cbqri_find_only_mon_bc();
+	if (hw_dom->paired_bc) {
+		err = cbqri_init_bc_mon_counters(hw_dom->paired_bc);
+		if (err) {
+			pr_err("BC @%pa: mon init failed (%d)\n", &hw_dom->paired_bc->addr, err);
+			hw_dom->paired_bc = NULL;
+			goto err_listdel;
+		}
+	}
+
 	err = resctrl_online_mon_domain(res, &mon_dom->hdr);
 	if (err)
 		goto err_listdel;

-- 
2.43.0




More information about the linux-riscv mailing list