[PATCH RFC v2 08/17] RISC-V: QoS: add resctrl interface for CBQRI controllers

Reinette Chatre reinette.chatre at intel.com
Fri Feb 13 15:30:44 PST 2026


Hi Drew,

On 1/28/26 12:27 PM, Drew Fustini wrote:

...

> +
> +struct rdt_domain_hdr *resctrl_arch_find_domain(struct list_head *domain_list, int id)
> +{
> +	struct rdt_domain_hdr *hdr;
> +
> +	lockdep_assert_cpus_held();
> +
> +	list_for_each_entry(hdr, domain_list, list) {
> +		if (hdr->id == id)
> +			return hdr;
> +	}
> +
> +	return NULL;
> +}

This indeed looks like a duplicate of resctrl_find_domain(). From what I can tell it
is not used in this series, nor is it an arch callback invoked by resctrl fs, so it can be dropped?


> +
> +void resctrl_arch_rmid_idx_decode(u32 idx, u32 *closid, u32 *rmid)
> +{
> +	*closid = ((u32)~0); /* refer to X86_RESCTRL_BAD_CLOSID */

The name is actually X86_RESCTRL_EMPTY_CLOSID - and if RISC-V also needs it we could
make it generally available.

> +	*rmid = idx;
> +}
> +
> +/* RISC-V resctrl interface does not maintain a default srmcfg value for a given CPU */

This means that when user space uses resctrl fs to assign a CPU to a resource group and
then runs a task belonging to the default resource group on that CPU, the task will not
obtain the allocations that the user assigned to that resource group. Here is what the resctrl
docs currently contain wrt "Resource allocation rules"

	Resource allocation rules                                                       
	-------------------------                                                       
                                                                                
	When a task is running the following rules define which resources are           
	available to it:                                                                
                                                                                
	1) If the task is a member of a non-default group, then the schemata            
	   for that group is used.                                                      
                                                                                
	2) Else if the task belongs to the default group, but is running on a           
	   CPU that is assigned to some specific group, then the schemata for the       
	   CPU's group is used.                                                         
                                                                                
	3) Otherwise the schemata for the default group is used.         

If I understand correctly RISC-V thus does not support CPU assignment but user space cannot
tell. That is, user may write to the cpus/cpus_list file and resctrl will show that it
succeeds and actually display the new cpumask but underneath it all the actual allocations will
not reflect that?

> +void resctrl_arch_set_cpu_default_closid_rmid(int cpu, u32 closid, u32 rmid) { }
> +
> +void resctrl_arch_sched_in(struct task_struct *tsk)
> +{
> +	__switch_to_srmcfg(tsk);
> +}
> +
> +void resctrl_arch_set_closid_rmid(struct task_struct *tsk, u32 closid, u32 rmid)
> +{
> +	u32 srmcfg;
> +
> +	WARN_ON_ONCE((closid & SRMCFG_RCID_MASK) != closid);
> +	WARN_ON_ONCE((rmid & SRMCFG_MCID_MASK) != rmid);
> +
> +	srmcfg = rmid << SRMCFG_MCID_SHIFT;
> +	srmcfg |= closid;
> +	WRITE_ONCE(tsk->thread.srmcfg, srmcfg);
> +}
> +
> +void resctrl_arch_sync_cpu_closid_rmid(void *info)
> +{
> +	struct resctrl_cpu_defaults *r = info;
> +
> +	lockdep_assert_preemption_disabled();
> +
> +	if (r) {
> +		resctrl_arch_set_cpu_default_closid_rmid(smp_processor_id(),
> +							 r->closid, r->rmid);

This just calls the empty function above?

> +	}
> +
> +	resctrl_arch_sched_in(current);
> +}
> +
> +bool resctrl_arch_match_closid(struct task_struct *tsk, u32 closid)
> +{
> +	u32 srmcfg;
> +	bool match;
> +
> +	srmcfg = READ_ONCE(tsk->thread.srmcfg);
> +	match = (srmcfg & SRMCFG_RCID_MASK) == closid;
> +	return match;
> +}
> +
> +bool resctrl_arch_match_rmid(struct task_struct *tsk, u32 closid, u32 rmid)
> +{
> +	u32 tsk_rmid;
> +
> +	tsk_rmid = READ_ONCE(tsk->thread.srmcfg);
> +	tsk_rmid >>= SRMCFG_MCID_SHIFT;
> +	tsk_rmid &= SRMCFG_MCID_MASK;
> +
> +	return tsk_rmid == rmid;
> +}
> +
> +int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d,
> +			   u32 closid, u32 rmid, enum resctrl_event_id eventid,
> +			   u64 *val, void *arch_mon_ctx)
> +{
> +	/*
> +	 * The current Qemu implementation of CBQRI capacity and bandwidth
> +	 * controllers do not emulate the utilization of resources over
> +	 * time. Therefore, Qemu currently sets the invalid bit in
> +	 * cc_mon_ctr_val and bc_mon_ctr_val, and there is no meaningful
> +	 * value other than 0 to return for reading an RMID (e.g. MCID in
> +	 * CBQRI terminology)
> +	 */
> +
> +	return 0;
> +}
> +
> +void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d,
> +			     u32 closid, u32 rmid, enum resctrl_event_id eventid)
> +{
> +	/* not implemented for the RISC-V resctrl interface */
> +}
> +
> +void resctrl_arch_mon_event_config_read(void *info)
> +{
> +	/* not implemented for the RISC-V resctrl interface */
> +}
> +
> +void resctrl_arch_mon_event_config_write(void *info)
> +{
> +	/* not implemented for the RISC-V resctrl interface */
> +}
> +
> +void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain *d)
> +{
> +	/* not implemented for the RISC-V resctrl implementation */
> +}
> +
> +void resctrl_arch_reset_all_ctrls(struct rdt_resource *r)
> +{
> +	/* not implemented for the RISC-V resctrl implementation */
> +}
> +
> +/* Set capacity block mask (cc_block_mask) */
> +static void cbqri_set_cbm(struct cbqri_controller *ctrl, u64 cbm)
> +{
> +	int reg_offset;
> +	u64 reg;
> +
> +	reg_offset = CBQRI_CC_BLOCK_MASK_OFF;
> +	reg = ioread64(ctrl->base + reg_offset);
> +
> +	reg = cbm;
> +	iowrite64(reg, ctrl->base + reg_offset);

This just writes the new 64bit value without any modifications. Is it necessary to
read it first?

> +}

...

> +static int cbqri_apply_cache_config(struct cbqri_resctrl_dom *hw_dom, u32 closid,
> +				    enum resctrl_conf_type type, struct cbqri_config *cfg)
> +{
> +	struct cbqri_controller *ctrl = hw_dom->hw_ctrl;
> +	int reg_offset;
> +	int err = 0;
> +	u64 reg;
> +
> +	if (cfg->cbm != hw_dom->ctrl_val[closid]) {
> +		/* Store the new cbm in the ctrl_val array for this closid in this domain */
> +		hw_dom->ctrl_val[closid] = cfg->cbm;

How this hw_dom->ctrl_val[] is used is not clear ... it almost seems unnecessary? It seems to
resemble the x86 rdt_hw_ctrl_domain::ctrl_val that essentially contains a copy of the values
set on hardware but its use during config read and write does not reflect that.

In cbqri_apply_cache_config() hw_dom->ctrl_val[closid] is set before any attempt is made to
write it to hardware, and below it is clear that the code doing the writing may fail. Does this mean
that the driver may think that it set the control value correctly (because it will not retry
based on the cfg->cbm != hw_dom->ctrl_val[closid] check) but that is actually not the case?

Jumping ahead to the partner code in resctrl_arch_get_config() that reads the current
configuration value it is unexpected that the implementation refers to the hardware and
not hw_dom->ctrl_val[closid] ... but actually sets hw_dom->ctrl_val[closid] there also
to reflect the hardware when the configuration is *read*?

As I understand there can be two possibilities, either cache the hardware value or don't:
One possibility could thus be to move hw_dom->ctrl_val[closid] assignment in this function to
be after hardware is configured so that it reflects accurate state and then resctrl_arch_get_config()
could just get the value from it instead of hardware.
Another possibility may be to drop hw_dom->ctrl_val[] entirely and just read from/write to
hardware every time — something that is much cheaper with this design since it does not require IPIs.

This implementation seems to be a bit of both?

> +
> +		/* Set capacity block mask (cc_block_mask) */
> +		cbqri_set_cbm(ctrl, cfg->cbm);
> +
> +		/* Capacity config limit operation */
> +		err = cbqri_cc_alloc_op(ctrl, CBQRI_CC_ALLOC_CTL_OP_CONFIG_LIMIT, closid, type);
> +		if (err < 0) {
> +			pr_err("%s(): operation failed: err = %d", __func__, err);
> +			return err;
> +		}
> +
> +		/* Clear cc_block_mask before read limit to verify op works*/
> +		cbqri_set_cbm(ctrl, 0);
> +
> +		/* Performa capacity read limit operation to verify blockmask */
> +		err = cbqri_cc_alloc_op(ctrl, CBQRI_CC_ALLOC_CTL_OP_READ_LIMIT, closid, type);
> +		if (err < 0) {
> +			pr_err("%s(): operation failed: err = %d", __func__, err);
> +			return err;
> +		}
> +
> +		/* Read capacity blockmask to verify it matches the requested config */
> +		reg_offset = CBQRI_CC_BLOCK_MASK_OFF;
> +		reg = ioread64(ctrl->base + reg_offset);
> +		if (reg != cfg->cbm) {
> +			pr_warn("%s(): failed to verify allocation (reg:%llx != cbm:%llx)",
> +				__func__, reg, cfg->cbm);
> +			return -EIO;
> +		}
> +	}
> +
> +	return err;
> +}
> +

...

> +static int cbqri_apply_bw_config(struct cbqri_resctrl_dom *hw_dom, u32 closid,
> +				 enum resctrl_conf_type type, struct cbqri_config *cfg)
> +{
> +	struct cbqri_controller *ctrl = hw_dom->hw_ctrl;
> +	int ret = 0;
> +	u64 reg;
> +
> +	if (cfg->rbwb != hw_dom->ctrl_val[closid]) {
> +		/* Store the new rbwb in the ctrl_val array for this closid in this domain */
> +		hw_dom->ctrl_val[closid] = cfg->rbwb;

(similar comment as above about ctrl_val[])

> +
> +		/* Set reserved bandwidth blocks */
> +		cbqri_set_rbwb(ctrl, cfg->rbwb);
> +
> +		/* Bandwidth config limit operation */
> +		ret = cbqri_bc_alloc_op(ctrl, CBQRI_CC_ALLOC_CTL_OP_CONFIG_LIMIT, closid);
> +		if (ret < 0) {
> +			pr_err("%s(): operation failed: ret = %d", __func__, ret);
> +			return ret;
> +		}
> +
> +		/* Clear rbwb before read limit to verify op works*/
> +		cbqri_set_rbwb(ctrl, 0);
> +
> +		/* Bandwidth allocation read limit operation to verify */
> +		ret = cbqri_bc_alloc_op(ctrl, CBQRI_CC_ALLOC_CTL_OP_READ_LIMIT, closid);
> +		if (ret < 0) {
> +			pr_err("%s(): operation failed: ret = %d", __func__, ret);
> +			return ret;
> +		}
> +
> +		/* Read bandwidth allocation to verify it matches the requested config */
> +		reg = cbqri_get_rbwb(ctrl);
> +		if (reg != cfg->rbwb) {
> +			pr_warn("%s(): failed to verify allocation (reg:%llx != rbwb:%llu)",
> +				__func__, reg, cfg->rbwb);
> +			return -EIO;
> +		}
> +	}
> +
> +	return ret;
> +}

...

> +
> +u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d,
> +			    u32 closid, enum resctrl_conf_type type)
> +{
> +	struct cbqri_resctrl_dom *hw_dom;
> +	struct cbqri_controller *ctrl;
> +	int reg_offset;
> +	u32 percent;
> +	u32 rbwb;
> +	u64 reg;
> +	int err;
> +
> +	hw_dom = container_of(d, struct cbqri_resctrl_dom, resctrl_ctrl_dom);
> +
> +	ctrl = hw_dom->hw_ctrl;
> +
> +	if (!r->alloc_capable)
> +		return resctrl_get_default_ctrl(r);
> +
> +	switch (r->rid) {
> +	case RDT_RESOURCE_L2:
> +	case RDT_RESOURCE_L3:
> +		/* Clear cc_block_mask before read limit operation */
> +		cbqri_set_cbm(ctrl, 0);
> +
> +		/* Capacity read limit operation for RCID (closid) */
> +		err = cbqri_cc_alloc_op(ctrl, CBQRI_CC_ALLOC_CTL_OP_READ_LIMIT, type, closid);
> +		if (err < 0) {
> +			pr_err("%s(): operation failed: err = %d", __func__, err);
> +			return resctrl_get_default_ctrl(r);
> +		}
> +
> +		/* Read capacity block mask for RCID (closid) */
> +		reg_offset = CBQRI_CC_BLOCK_MASK_OFF;
> +		reg = ioread64(ctrl->base + reg_offset);
> +
> +		/* Update the config value for the closid in this domain */
> +		hw_dom->ctrl_val[closid] = reg;

This is what I refer to above, why is it necessary to read from hardware here and not
just return hw_dom->ctrl_val[closid] directly?

> +		return hw_dom->ctrl_val[closid];
> +
> +	case RDT_RESOURCE_MBA:
> +		/* Capacity read limit operation for RCID (closid) */
> +		err = cbqri_bc_alloc_op(ctrl, CBQRI_CC_ALLOC_CTL_OP_READ_LIMIT, closid);
> +		if (err < 0) {
> +			pr_err("%s(): operation failed: err = %d", __func__, err);
> +			return resctrl_get_default_ctrl(r);
> +		}
> +
> +		hw_dom->ctrl_val[closid] = cbqri_get_rbwb(ctrl);
> +
> +		/* Convert from bandwidth blocks to percent */
> +		rbwb = hw_dom->ctrl_val[closid];
> +		rbwb *= 100;
> +		percent = rbwb / ctrl->bc.nbwblks;
> +		if (rbwb % ctrl->bc.nbwblks)
> +			percent++;
> +		return percent;
> +
> +	default:
> +		return resctrl_get_default_ctrl(r);
> +	}
> +}
> +

...

> +
> +/*
> + * Note: for the purposes of the CBQRI proof-of-concept, debug logging
> + * has been left in this function that detects the properties of CBQRI
> + * capable controllers in the system. pr_info calls would be removed
> + * before submitting non-RFC patches.
> + */
> +static int cbqri_probe_controller(struct cbqri_controller_info *ctrl_info,
> +				  struct cbqri_controller *ctrl)
> +{
> +	int err = 0, status;
> +	u64 reg;
> +
> +	pr_info("controller info: type=%d addr=0x%lx size=%lu max-rcid=%u max-mcid=%u",
> +		ctrl_info->type, ctrl_info->addr, ctrl_info->size,
> +		ctrl_info->rcid_count, ctrl_info->mcid_count);
> +
> +	/* max_rmid is used by resctrl_arch_system_num_rmid_idx() */
> +	max_rmid = ctrl_info->mcid_count;

It looks like the max is just the MCID count of the last probed controller as opposed to
the maximum among all controllers.

...

> +
> +static int qos_resctrl_add_controller_domain(struct cbqri_controller *ctrl, int *id)
> +{
> +	struct rdt_ctrl_domain *domain = NULL;
> +	struct cbqri_resctrl_res *cbqri_res = NULL;
> +	struct rdt_resource *res = NULL;
> +	int internal_id = *id;
> +	int err = 0;
> +
> +	domain = qos_new_domain(ctrl);
> +	if (!domain)
> +		return -ENOSPC;
> +	if (ctrl->ctrl_info->type == CBQRI_CONTROLLER_TYPE_CAPACITY) {
> +		cpumask_copy(&domain->hdr.cpu_mask, &ctrl->ctrl_info->cache.cpu_mask);
> +		if (ctrl->ctrl_info->cache.cache_level == 2) {
> +			cbqri_res = &cbqri_resctrl_resources[RDT_RESOURCE_L2];
> +			cbqri_res->max_rcid = ctrl->ctrl_info->rcid_count;
> +			cbqri_res->max_mcid = ctrl->ctrl_info->mcid_count;
> +			res = &cbqri_res->resctrl_res;
> +			res->mon.num_rmid = ctrl->ctrl_info->mcid_count;
> +			res->rid = RDT_RESOURCE_L2;
> +			res->name = "L2";
> +			res->alloc_capable = ctrl->alloc_capable;
> +			res->mon_capable = ctrl->mon_capable;
> +			res->schema_fmt = RESCTRL_SCHEMA_BITMAP;
> +			res->ctrl_scope = RESCTRL_L2_CACHE;
> +			res->cache.arch_has_sparse_bitmasks = false;
> +			res->cache.arch_has_per_cpu_cfg = false;
> +			res->cache.cbm_len = ctrl->cc.ncblks;
> +			res->cache.shareable_bits = resctrl_get_default_ctrl(res);
> +			res->cache.min_cbm_bits = 1;
> +		} else if (ctrl->ctrl_info->cache.cache_level == 3) {
> +			cbqri_res = &cbqri_resctrl_resources[RDT_RESOURCE_L3];
> +			cbqri_res->max_rcid = ctrl->ctrl_info->rcid_count;
> +			cbqri_res->max_mcid = ctrl->ctrl_info->mcid_count;
> +			res = &cbqri_res->resctrl_res;
> +			res->mon.num_rmid = ctrl->ctrl_info->mcid_count;
> +			res->rid = RDT_RESOURCE_L3;
> +			res->name = "L3";
> +			res->schema_fmt = RESCTRL_SCHEMA_BITMAP;
> +			res->ctrl_scope = RESCTRL_L3_CACHE;
> +			res->alloc_capable = ctrl->alloc_capable;
> +			res->mon_capable = ctrl->mon_capable;
> +			res->cache.arch_has_sparse_bitmasks = false;
> +			res->cache.arch_has_per_cpu_cfg = false;
> +			res->cache.cbm_len = ctrl->cc.ncblks;
> +			res->cache.shareable_bits = resctrl_get_default_ctrl(res);
> +			res->cache.min_cbm_bits = 1;
> +		} else {
> +			pr_warn("%s(): unknown cache level %d", __func__,
> +				ctrl->ctrl_info->cache.cache_level);
> +			err = -ENODEV;
> +			goto err_free_domain;
> +		}
> +	} else if (ctrl->ctrl_info->type == CBQRI_CONTROLLER_TYPE_BANDWIDTH) {
> +		if (ctrl->alloc_capable) {
> +			cbqri_res = &cbqri_resctrl_resources[RDT_RESOURCE_MBA];
> +			cbqri_res->max_rcid = ctrl->ctrl_info->rcid_count;
> +			cbqri_res->max_mcid = ctrl->ctrl_info->mcid_count;
> +			res = &cbqri_res->resctrl_res;
> +			res->mon.num_rmid = ctrl->ctrl_info->mcid_count;
> +			res->rid = RDT_RESOURCE_MBA;
> +			res->name = "MB";
> +			res->schema_fmt = RESCTRL_SCHEMA_RANGE;
> +			res->ctrl_scope = RESCTRL_L3_CACHE;
> +			res->alloc_capable = ctrl->alloc_capable;
> +			res->mon_capable = false;
> +			res->membw.delay_linear = true;
> +			res->membw.arch_needs_linear = true;
> +			res->membw.throttle_mode = THREAD_THROTTLE_UNDEFINED;
> +			// The minimum percentage allowed by the CBQRI spec
> +			res->membw.min_bw = 1;
> +			// The maximum percentage allowed by the CBQRI spec
> +			res->membw.max_bw = 80;
> +			res->membw.bw_gran = 1;
> +		}
> +	} else {
> +		pr_warn("%s(): unknown resource %d", __func__, ctrl->ctrl_info->type);
> +		err = -ENODEV;
> +		goto err_free_domain;
> +	}
> +
> +	domain->hdr.id = internal_id;

I am missing something here. For the cache resources I expected the ID to be initialized
to ctrl->ctrl_info->cache_id instead (which is only introduced later in patch 15 though). 
When interacting with the L2 and L3 resources the domain ID should be the cache ID since that
is the ID printed to user space where the cache ID is expected. 

How is this "internal id" used?

Also please note there are a couple of other fields in the header that need initializing.

> +	err = qos_init_domain_ctrlval(res, domain);
> +	if (err)
> +		goto err_free_domain;
> +
> +	if (cbqri_res) {
> +		list_add_tail(&domain->hdr.list, &cbqri_res->resctrl_res.ctrl_domains);

There is an expectation that the domain list be sorted. For reference, resctrl_find_domain().

> +		*id = internal_id;
> +		err = resctrl_online_ctrl_domain(res, domain);
> +		if (err) {
> +			pr_warn("%s(): failed to online cbqri_res domain", __func__);
> +			goto err_free_domain;
> +		}
> +	}

Reinette




More information about the linux-riscv mailing list