[PATCH RFC v6 12/18] riscv_cbqri: resctrl: Add L3 cache occupancy monitoring

Drew Fustini fustini at kernel.org
Mon Jun 1 13:36:06 PDT 2026


Expose QOS_L3_OCCUP_EVENT_ID so userspace can read per-MCID
llc_occupancy. The result is converted from capacity blocks to bytes
using cache_size and ncblks.

Each MCID is armed once with the Occupancy event by
cbqri_init_mon_counters() and free-runs thereafter. The resctrl core
only ever reads occupancy, including on the limbo recycle path, and
never resets it, so resctrl_arch_reset_rmid() and
resctrl_arch_reset_rmid_all() have no per-rmid software state to clear.

The L3 mon_domain is created lazily when the first CPU of a cache_id is
attached, and only if the ctrl_domain with the same id already exists.

Assisted-by: Claude:claude-opus-4-7
Co-developed-by: Adrien Ricciardi <aricciardi at baylibre.com>
Signed-off-by: Adrien Ricciardi <aricciardi at baylibre.com>
Signed-off-by: Drew Fustini <fustini at kernel.org>
---
 drivers/resctrl/cbqri_resctrl.c | 286 ++++++++++++++++++++++++++++++++++++++--
 1 file changed, 274 insertions(+), 12 deletions(-)

diff --git a/drivers/resctrl/cbqri_resctrl.c b/drivers/resctrl/cbqri_resctrl.c
index fb6d82aa3ffc..f379058b0114 100644
--- a/drivers/resctrl/cbqri_resctrl.c
+++ b/drivers/resctrl/cbqri_resctrl.c
@@ -10,6 +10,7 @@
 #include <linux/cpuhotplug.h>
 #include <linux/err.h>
 #include <linux/init.h>
+#include <linux/io.h>
 #include <linux/resctrl.h>
 #include <linux/slab.h>
 #include <linux/types.h>
@@ -33,6 +34,10 @@ struct cbqri_resctrl_dom {
 static struct cbqri_resctrl_res cbqri_resctrl_resources[RDT_NUM_RESOURCES];
 
 static bool exposed_alloc_capable;
+static bool exposed_mon_capable;
+
+/* Used by resctrl_arch_system_num_rmid_idx(). Narrowed by accumulate_caps. */
+static u32 max_rmid = U32_MAX;
 
 /* Protects ctrl_domain list mutations across CPU hotplug. */
 static DEFINE_MUTEX(cbqri_domain_list_lock);
@@ -45,6 +50,14 @@ cbqri_find_ctrl_domain(struct list_head *h, int id)
 	return hdr ? container_of(hdr, struct rdt_ctrl_domain, hdr) : NULL;
 }
 
+static struct rdt_l3_mon_domain *
+cbqri_find_l3_mon_domain(struct list_head *h, int id)
+{
+	struct rdt_domain_hdr *hdr = resctrl_find_domain(h, id, NULL);
+
+	return hdr ? container_of(hdr, struct rdt_l3_mon_domain, hdr) : NULL;
+}
+
 static int cbqri_apply_cache_config_dom(struct cbqri_resctrl_dom *hw_dom,
 					struct rdt_resource *r,
 					u32 closid, enum resctrl_conf_type t,
@@ -68,7 +81,7 @@ bool resctrl_arch_alloc_capable(void)
 
 bool resctrl_arch_mon_capable(void)
 {
-	return false;
+	return exposed_mon_capable;
 }
 
 bool resctrl_arch_get_cdp_enabled(enum resctrl_res_level rid)
@@ -168,20 +181,89 @@ void resctrl_arch_mon_event_config_write(void *info)
 {
 }
 
-void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_l3_mon_domain *d)
+void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_l3_mon_domain *d,
+			     u32 unused, u32 rmid, enum resctrl_event_id eventid)
 {
+	/*
+	 * Occupancy MCIDs are armed once by cbqri_init_mon_counters() and
+	 * free run thereafter. The core only ever reads occupancy, including
+	 * on the limbo recycle path, and never resets it, so there is no
+	 * per-rmid software state to clear here.
+	 */
 }
 
-void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_l3_mon_domain *d,
-			     u32 unused, u32 rmid, enum resctrl_event_id eventid)
+void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_l3_mon_domain *d)
 {
+	/* Occupancy counters free run, so there is no state to reset. */
 }
 
 int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain_hdr *hdr,
 			   u32 closid, u32 rmid, enum resctrl_event_id eventid,
 			   void *arch_priv, u64 *val, void *arch_mon_ctx)
 {
-	return -ENODATA;
+	struct cbqri_resctrl_dom *hw_dom;
+	struct cbqri_controller *ctrl;
+	struct rdt_ctrl_domain *d;
+	u64 ctr_val;
+	int err = 0;
+
+	resctrl_arch_rmid_read_context_check();
+
+	/*
+	 * This path takes sleeping mutexes (cbqri_domain_list_lock and
+	 * ctrl->lock) and cbqri_mon_op() polls BUSY for up to 1 ms, none
+	 * of which is safe under irqs_disabled().
+	 */
+	if (irqs_disabled())
+		return -EIO;
+
+	/*
+	 * cbqri_domain_list_lock serialises the list walk against
+	 * cbqri_detach_cpu_from_ctrl_domains().
+	 */
+	mutex_lock(&cbqri_domain_list_lock);
+
+	switch (eventid) {
+	case QOS_L3_OCCUP_EVENT_ID:
+		d = cbqri_find_ctrl_domain(&r->ctrl_domains, hdr->id);
+		if (!d) {
+			err = -ENOENT;
+			break;
+		}
+
+		hw_dom = container_of(d, struct cbqri_resctrl_dom, resctrl_ctrl_dom);
+		ctrl = hw_dom->hw_ctrl;
+
+		mutex_lock(&ctrl->lock);
+
+		/*
+		 * MCIDs are armed with Occupancy once at init and free run.
+		 * Pass EVT_ID explicitly as the CBQRI spec does not guarantee
+		 * sticky-last-configured-event for READ_COUNTER.
+		 */
+		err = cbqri_mon_op(ctrl, CBQRI_CC_MON_CTL_OFF,
+				   CBQRI_CC_MON_CTL_OP_READ_COUNTER,
+				   rmid, CBQRI_CC_EVT_ID_OCCUPANCY, NULL);
+		if (!err) {
+			ctr_val = ioread64(ctrl->base + CBQRI_CC_MON_CTL_VAL_OFF);
+
+			/*
+			 * Capacity blocks to bytes. Multiply before divide
+			 * so a non-power-of-2 ncblks doesn't truncate.
+			 */
+			*val = (u64)ctrl->cache.cache_size * ctr_val /
+			       ctrl->cc.ncblks;
+		}
+		mutex_unlock(&ctrl->lock);
+		break;
+
+	default:
+		err = -EINVAL;
+		break;
+	}
+
+	mutex_unlock(&cbqri_domain_list_lock);
+	return err;
 }
 
 /*
@@ -203,7 +285,7 @@ u32 resctrl_arch_get_num_closid(struct rdt_resource *res)
 
 u32 resctrl_arch_system_num_rmid_idx(void)
 {
-	return 1;
+	return max_rmid;
 }
 
 u32 resctrl_arch_rmid_idx_encode(u32 closid, u32 rmid)
@@ -500,6 +582,13 @@ static int cbqri_resctrl_control_init(struct cbqri_resctrl_res *cbqri_res)
 		res->alloc_capable = ctrl->alloc_capable;
 		INIT_LIST_HEAD(&res->ctrl_domains);
 		INIT_LIST_HEAD(&res->mon_domains);
+
+		if (ctrl->mon_capable && res->rid == RDT_RESOURCE_L3) {
+			res->mon_scope = RESCTRL_L3_CACHE;
+			resctrl_enable_mon_event(QOS_L3_OCCUP_EVENT_ID,
+						 false, 0, NULL);
+			res->mon_capable = true;
+		}
 		break;
 	default:
 		break;
@@ -510,6 +599,7 @@ static int cbqri_resctrl_control_init(struct cbqri_resctrl_res *cbqri_res)
 
 static void cbqri_resctrl_accumulate_caps(void)
 {
+	struct cbqri_controller *l3_ctrl;
 	int rid;
 
 	for (rid = 0; rid < RDT_NUM_RESOURCES; rid++) {
@@ -519,7 +609,30 @@ static void cbqri_resctrl_accumulate_caps(void)
 			continue;
 		if (hw_res->ctrl->alloc_capable)
 			exposed_alloc_capable = true;
+		if (hw_res->ctrl->mon_capable)
+			exposed_mon_capable = true;
+	}
+
+	/*
+	 * Narrow max_rmid against the picked occupancy source (the L3 CC)
+	 * only. A mon-capable controller that is not exposed as a counter
+	 * source must not clamp the rmid space.
+	 */
+	l3_ctrl = cbqri_resctrl_resources[RDT_RESOURCE_L3].ctrl;
+	if (l3_ctrl && l3_ctrl->mon_capable)
+		max_rmid = min(max_rmid, l3_ctrl->mcid_count);
+
+	if (!exposed_mon_capable) {
+		max_rmid = 1;
+		return;
 	}
+
+	/*
+	 * num_rmid is the user-visible bound for the L3 monitoring rmid
+	 * space. Track max_rmid (the picked-source minimum) so userspace is
+	 * not told more RMIDs than can be allocated.
+	 */
+	cbqri_resctrl_resources[RDT_RESOURCE_L3].resctrl_res.mon.num_rmid = max_rmid;
 }
 
 /*
@@ -560,13 +673,89 @@ static struct rdt_ctrl_domain *cbqri_create_ctrl_domain(struct cbqri_controller
 	return domain;
 }
 
+static int cbqri_attach_cpu_to_l3_mon(struct cbqri_controller *ctrl,
+				      struct rdt_resource *res, unsigned int cpu)
+{
+	struct rdt_l3_mon_domain *mon_dom;
+	struct rdt_ctrl_domain *ctrl_dom;
+	struct list_head *mon_pos = NULL;
+	int dom_id = ctrl->cache.cache_id;
+	int err;
+
+	lockdep_assert_held(&cbqri_domain_list_lock);
+
+	mon_dom = cbqri_find_l3_mon_domain(&res->mon_domains, dom_id);
+	if (mon_dom) {
+		cpumask_set_cpu(cpu, &mon_dom->hdr.cpu_mask);
+		return 0;
+	}
+
+	ctrl_dom = cbqri_find_ctrl_domain(&res->ctrl_domains, dom_id);
+	if (!ctrl_dom) {
+		pr_err("L3 mon attach for cpu %u: no ctrl_domain id %d\n",
+		       cpu, dom_id);
+		return -EINVAL;
+	}
+
+	mon_dom = kzalloc_obj(*mon_dom, GFP_KERNEL);
+	if (!mon_dom)
+		return -ENOMEM;
+
+	mon_dom->hdr.id = dom_id;
+	mon_dom->hdr.type = RESCTRL_MON_DOMAIN;
+	mon_dom->hdr.rid = RDT_RESOURCE_L3;
+	cpumask_set_cpu(cpu, &mon_dom->hdr.cpu_mask);
+	INIT_LIST_HEAD(&mon_dom->hdr.list);
+
+	if (resctrl_find_domain(&res->mon_domains, dom_id, &mon_pos)) {
+		pr_err("duplicate L3 mon_domain id %d\n", dom_id);
+		err = -EEXIST;
+		goto err_free;
+	}
+	if (mon_pos)
+		list_add_tail(&mon_dom->hdr.list, mon_pos);
+	else
+		list_add_tail(&mon_dom->hdr.list, &res->mon_domains);
+
+	err = resctrl_online_mon_domain(res, &mon_dom->hdr);
+	if (err)
+		goto err_listdel;
+
+	err = cbqri_init_mon_counters(ctrl);
+	if (err)
+		goto err_offline;
+
+	return 0;
+
+err_offline:
+	/*
+	 * The non-sync cancel_delayed_work() avoids deadlocking against the
+	 * cqm_limbo worker, which takes cpus_read_lock while this hotplug
+	 * callback already holds cpus_write_lock. mbm_over is only
+	 * INIT_DELAYED_WORK'd when MBM_TOTAL was enabled, so gate the
+	 * cancel on the same condition to avoid touching a zeroed work
+	 * struct.
+	 */
+	cancel_delayed_work(&mon_dom->cqm_limbo);
+	if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID))
+		cancel_delayed_work(&mon_dom->mbm_over);
+	resctrl_offline_mon_domain(res, &mon_dom->hdr);
+err_listdel:
+	list_del(&mon_dom->hdr.list);
+err_free:
+	kfree(mon_dom);
+	return err;
+}
+
 static int cbqri_attach_cpu_to_cap_ctrl(struct cbqri_controller *ctrl,
 					unsigned int cpu)
 {
 	struct cbqri_resctrl_res *hw_res;
 	struct rdt_ctrl_domain *domain;
 	struct rdt_resource *res;
+	bool new_domain = false;
 	int dom_id;
+	int err;
 
 	if (ctrl->cache.cache_level == 2)
 		hw_res = &cbqri_resctrl_resources[RDT_RESOURCE_L2];
@@ -584,14 +773,68 @@ static int cbqri_attach_cpu_to_cap_ctrl(struct cbqri_controller *ctrl,
 	domain = cbqri_find_ctrl_domain(&res->ctrl_domains, dom_id);
 	if (domain) {
 		cpumask_set_cpu(cpu, &domain->hdr.cpu_mask);
-		return 0;
+	} else {
+		domain = cbqri_create_ctrl_domain(ctrl, res, cpu, dom_id);
+		if (IS_ERR(domain))
+			return PTR_ERR(domain);
+		new_domain = true;
 	}
 
-	domain = cbqri_create_ctrl_domain(ctrl, res, cpu, dom_id);
-	if (IS_ERR(domain))
-		return PTR_ERR(domain);
+	if (ctrl->mon_capable && ctrl->cache.cache_level == 3) {
+		err = cbqri_attach_cpu_to_l3_mon(ctrl, res, cpu);
+		if (err)
+			goto err_undo_ctrl_dom;
+	}
 
 	return 0;
+
+err_undo_ctrl_dom:
+	/*
+	 * The cpuhp core only rolls back states that successfully ran their
+	 * startup. The L3 mon attach failure happens inside this state's
+	 * startup, so its own offline callback is not invoked. Undo the
+	 * cpumask_set and, if this attach created the ctrl_domain, tear it
+	 * down so a retry sees a clean slate.
+	 */
+	cpumask_clear_cpu(cpu, &domain->hdr.cpu_mask);
+	if (new_domain) {
+		resctrl_offline_ctrl_domain(res, domain);
+		list_del(&domain->hdr.list);
+		kfree(container_of(domain, struct cbqri_resctrl_dom,
+				   resctrl_ctrl_dom));
+	}
+	return err;
+}
+
+static void cbqri_detach_cpu_from_l3_mon(struct rdt_resource *res,
+					 unsigned int cpu)
+{
+	struct rdt_l3_mon_domain *mon_dom, *tmp;
+
+	lockdep_assert_held(&cbqri_domain_list_lock);
+
+	list_for_each_entry_safe(mon_dom, tmp, &res->mon_domains, hdr.list) {
+		if (!cpumask_test_cpu(cpu, &mon_dom->hdr.cpu_mask))
+			continue;
+		cpumask_clear_cpu(cpu, &mon_dom->hdr.cpu_mask);
+		if (cpumask_empty(&mon_dom->hdr.cpu_mask)) {
+			/*
+			 * This runs as a cpuhp offline callback under
+			 * cpus_write_lock. The cqm_limbo and mbm_over workers
+			 * take cpus_read_lock before touching a domain, so
+			 * neither can run or re-queue here. A non-sync cancel
+			 * thus reliably dequeues any pending work before kfree,
+			 * and cancel_delayed_work_sync() would instead deadlock
+			 * against that cpus_read_lock.
+			 */
+			cancel_delayed_work(&mon_dom->cqm_limbo);
+			if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID))
+				cancel_delayed_work(&mon_dom->mbm_over);
+			resctrl_offline_mon_domain(res, &mon_dom->hdr);
+			list_del(&mon_dom->hdr.list);
+			kfree(mon_dom);
+		}
+	}
 }
 
 static void cbqri_detach_cpu_from_ctrl_domains(struct rdt_resource *res,
@@ -630,6 +873,8 @@ static void cbqri_detach_cpu_from_all_ctrls(unsigned int cpu)
 		if (!hw_res->ctrl)
 			continue;
 		cbqri_detach_cpu_from_ctrl_domains(&hw_res->resctrl_res, cpu);
+		if (rid == RDT_RESOURCE_L3 && hw_res->ctrl->mon_capable)
+			cbqri_detach_cpu_from_l3_mon(&hw_res->resctrl_res, cpu);
 	}
 }
 
@@ -683,6 +928,8 @@ static void cbqri_resctrl_teardown(void)
 		hw_res->cdp_enabled = false;
 	}
 	exposed_alloc_capable = false;
+	exposed_mon_capable = false;
+	max_rmid = U32_MAX;
 	cbqri_resctrl_inited = false;
 }
 
@@ -706,14 +953,29 @@ static int cbqri_resctrl_setup(void)
 
 	cbqri_resctrl_accumulate_caps();
 
-	if (!exposed_alloc_capable) {
+	if (!exposed_alloc_capable && !exposed_mon_capable) {
 		pr_debug("no resctrl-capable CBQRI controllers found\n");
 		return -ENODEV;
 	}
 
 	err = resctrl_init();
-	if (err)
+	if (err) {
+		/*
+		 * resctrl_init() failed before we set cbqri_resctrl_inited,
+		 * so cbqri_resctrl_teardown() would no-op. Roll back the
+		 * exposed_*_capable flags and the resource picks directly
+		 * so resctrl_arch_alloc_capable() / _mon_capable() do not
+		 * lie to callers after this returns.
+		 */
+		for (rid = 0; rid < RDT_NUM_RESOURCES; rid++) {
+			cbqri_resctrl_resources[rid].ctrl = NULL;
+			cbqri_resctrl_resources[rid].cdp_enabled = false;
+		}
+		exposed_alloc_capable = false;
+		exposed_mon_capable = false;
+		max_rmid = U32_MAX;
 		return err;
+	}
 
 	cbqri_resctrl_inited = true;
 	return 0;

-- 
2.43.0




More information about the linux-riscv mailing list