[PATCH v14 10/44] arm64: RMI: Add support for SRO

Thu Jun 4 08:19:40 PDT 2026

On 21/05/2026 15:35, Marc Zyngier wrote:
> On Wed, 13 May 2026 14:17:18 +0100,
> Steven Price <steven.price at arm.com> wrote:
>>
>> RMM v2.0 introduces the concept of "Stateful RMI Operations" (SRO). This
>> means that an SMC can return with an operation still in progress. The
>> host is expected to continue the operation until it reaches a conclusion
>> (either success or failure). During this process the RMM can request
>> additional memory ('donate') or hand memory back to the host
>> ('reclaim'). The host can request that an in-progress operation be cancelled,
>> but must still continue the operation until it has completed (otherwise the
>> incomplete operation may cause future RMM operations to fail).
>>
>> The SRO is tracked using a struct rmi_sro_state object which keeps track
>> of any memory which has been allocated but not yet consumed by the RMM
>> or reclaimed from the RMM. This allows the memory to be reused in a
>> future request within the same operation. It will also permit an
>> operation to be done in a context where memory allocation may be
>> difficult (e.g. atomic context) with the option to abort the operation
>> and retry the memory allocation outside of the atomic context. The
>> memory stored in the struct rmi_sro_state object can then be reused on
>> the subsequent attempt.
>>
>> Signed-off-by: Steven Price <steven.price at arm.com>
>> ---
>> v14:
>>  * SRO support has improved, although it is still not fully complete. The
>>    infrastructure has been moved out of KVM.
>> ---
>>  arch/arm64/include/asm/rmi_cmds.h |   1 +
>>  arch/arm64/kernel/rmi.c           | 359 ++++++++++++++++++++++++++++++
>>  2 files changed, 360 insertions(+)
>>
>> diff --git a/arch/arm64/include/asm/rmi_cmds.h b/arch/arm64/include/asm/rmi_cmds.h
>> index eb213c8e6f26..1a7b0c8f1e38 100644
>> --- a/arch/arm64/include/asm/rmi_cmds.h
>> +++ b/arch/arm64/include/asm/rmi_cmds.h
>> @@ -35,6 +35,7 @@ struct rmi_sro_state {
>>  
>>  int rmi_delegate_range(phys_addr_t phys, unsigned long size);
>>  int rmi_undelegate_range(phys_addr_t phys, unsigned long size);
>> +int free_delegated_page(phys_addr_t phys);
>>  
>>  static inline int rmi_delegate_page(phys_addr_t phys)
>>  {
>> diff --git a/arch/arm64/kernel/rmi.c b/arch/arm64/kernel/rmi.c
>> index 08cef54acadb..a8107ca9bb6d 100644
>> --- a/arch/arm64/kernel/rmi.c
>> +++ b/arch/arm64/kernel/rmi.c
>> @@ -48,6 +48,365 @@ int rmi_undelegate_range(phys_addr_t phys, unsigned long size)
>>  	return ret;
>>  }
>>  
>> +static unsigned long donate_req_to_size(unsigned long donatereq)
>> +{
>> +	unsigned long unit_size = RMI_DONATE_SIZE(donatereq);
>> +
>> +	switch (unit_size) {
>> +	case 0:
>> +		return PAGE_SIZE;
>> +	case 1:
>> +		return PMD_SIZE;
>> +	case 2:
>> +		return PUD_SIZE;
>> +	case 3:
>> +		return P4D_SIZE;
> 
> How does this work when we have folded levels? If this is supposed to
> be the architected size, then it should actively express that:
> 
> 	return BIT(unit_size * (PAGE_SHIFT - 3) + PAGE_SHIFT);

It doesn't work (as Gavin also pointed out). There's an existing macro
to make this even cleaner:

return BIT(ARM64_HW_PGTABLE_LEVEL_SHIFT(3 - unit_size));

>> +	}
>> +	unreachable();
>> +}
>> +
>> +static void rmi_smccc_invoke(struct arm_smccc_1_2_regs *regs_in,
>> +			     struct arm_smccc_1_2_regs *regs_out)
>> +{
>> +	struct arm_smccc_1_2_regs regs = *regs_in;
>> +	unsigned long status;
>> +
>> +	do {
>> +		arm_smccc_1_2_invoke(&regs, regs_out);
>> +		status = RMI_RETURN_STATUS(regs_out->a0);
>> +	} while (status == RMI_BUSY || status == RMI_BLOCKED);
>> +}
>> +
>> +int free_delegated_page(phys_addr_t phys)
>> +{
>> +	if (WARN_ON(rmi_undelegate_page(phys))) {
> 
> Please drop this WARN_ON(). Or at least make it ONCE. Everywhere.

Happy to change to WARN_ON_ONCE(). I think we should keep a WARN of some
sort as this is causing Linux to leak pages - it's definitely something
the sysadmin would want to know about.

>> +		/* Undelegate failed: leak the page */
>> +		return -EBUSY;
>> +	}
>> +
>> +	free_page((unsigned long)phys_to_virt(phys));
>> +
>> +	return 0;
>> +}
>> +
>> +static int rmi_sro_ensure_capacity(struct rmi_sro_state *sro,
>> +				   unsigned long count)
>> +{
>> +	if (WARN_ON_ONCE(sro->addr_count > RMI_MAX_ADDR_LIST))
>> +		return -EOVERFLOW;
>> +
>> +	if (count > RMI_MAX_ADDR_LIST - sro->addr_count)
>> +		return -ENOSPC;
>> +
>> +	return 0;
>> +}
>> +
>> +static int rmi_sro_donate_contig(struct rmi_sro_state *sro,
>> +				 unsigned long sro_handle,
>> +				 unsigned long donatereq,
>> +				 struct arm_smccc_1_2_regs *out_regs,
>> +				 gfp_t gfp)
>> +{
>> +	unsigned long unit_size = RMI_DONATE_SIZE(donatereq);
>> +	unsigned long unit_size_bytes = donate_req_to_size(donatereq);
>> +	unsigned long count = RMI_DONATE_COUNT(donatereq);
>> +	unsigned long state = RMI_DONATE_STATE(donatereq);
>> +	unsigned long size = unit_size_bytes * count;
>> +	unsigned long addr_range;
>> +	int ret;
>> +	void *virt;
>> +	phys_addr_t phys;
>> +	struct arm_smccc_1_2_regs regs = {
>> +		SMC_RMI_OP_MEM_DONATE,
>> +		sro_handle
>> +	};
>> +
>> +	for (int i = 0; i < sro->addr_count; i++) {
>> +		unsigned long entry = sro->addr_list[i];
>> +
>> +		if (RMI_ADDR_RANGE_SIZE(entry) == unit_size &&
>> +		    RMI_ADDR_RANGE_COUNT(entry) == count &&
>> +		    RMI_ADDR_RANGE_STATE(entry) == state) {
>> +			sro->addr_count--;
>> +			swap(sro->addr_list[sro->addr_count],
>> +			     sro->addr_list[i]);
>> +
>> +			goto out;
>> +		}
>> +	}
>> +
>> +	ret = rmi_sro_ensure_capacity(sro, 1);
>> +	if (ret)
>> +		return ret;
>> +
>> +	virt = alloc_pages_exact(size, gfp);
>> +	if (!virt)
>> +		return -ENOMEM;
>> +	phys = virt_to_phys(virt);
>> +
>> +	if (state == RMI_OP_MEM_DELEGATED) {
>> +		if (rmi_delegate_range(phys, size)) {
>> +			free_pages_exact(virt, size);
>> +			return -ENXIO;
>> +		}
>> +	}
>> +
>> +	addr_range = phys & RMI_ADDR_RANGE_ADDR_MASK;
>> +	FIELD_MODIFY(RMI_ADDR_RANGE_SIZE_MASK, &addr_range, unit_size);
>> +	FIELD_MODIFY(RMI_ADDR_RANGE_COUNT_MASK, &addr_range, count);
>> +	FIELD_MODIFY(RMI_ADDR_RANGE_STATE_MASK, &addr_range, state);
>> +
>> +	sro->addr_list[sro->addr_count] = addr_range;
>> +
> 
> Shouldn't this be moved to a helper that ensures capacity, and returns
> an error otherwise?

I'm not sure quite what you are suggesting. I already have a
rmi_sro_ensure_capacity() helper. By this point we know there's space.

>> +out:
>> +	regs.a2 = virt_to_phys(&sro->addr_list[sro->addr_count]);
>> +	regs.a3 = 1;
> 
> This could really do with context specific helpers that populate regs
> based on a set of parameters. I have no idea what this 1 here is, and
> the init is spread over too much code. Think of the children!
> 
> That's valid for the whole patch.

That's a good point. SRO is a bit tricky because I wanted the actual SMC
call to be done in one place so we can handle all the RMI_INCOMPLETE
cases together. But I could certainly add some helpers to set up the
registers rather than assigning directly to regs.a<n>.

Thanks,
Steve

> 	M.
>> +	rmi_smccc_invoke(&regs, out_regs);
>> +
>> +	unsigned long donated_granules = out_regs->a1;
>> +	unsigned long donated_size = donated_granules << PAGE_SHIFT;
>> +
>> +	if (donated_granules == 0) {
>> +		/* No pages used by the RMM */
>> +		sro->addr_count++;
>> +	} else if (donated_size < size) {
>> +		phys = sro->addr_list[sro->addr_count] & RMI_ADDR_RANGE_ADDR_MASK;
>> +
>> +		/* Not all granules used by the RMM, free the remaining pages */
>> +		for (long i = donated_size; i < size; i += PAGE_SIZE) {
>> +			if (state == RMI_OP_MEM_DELEGATED)
>> +				free_delegated_page(phys + i);
>> +			else
>> +				__free_page(phys_to_page(phys + i));
>> +		}
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>> +static int rmi_sro_donate_noncontig(struct rmi_sro_state *sro,
>> +				    unsigned long sro_handle,
>> +				    unsigned long donatereq,
>> +				    struct arm_smccc_1_2_regs *out_regs,
>> +				    gfp_t gfp)
>> +{
>> +	unsigned long unit_size = RMI_DONATE_SIZE(donatereq);
>> +	unsigned long unit_size_bytes = donate_req_to_size(donatereq);
>> +	unsigned long count = RMI_DONATE_COUNT(donatereq);
>> +	unsigned long state = RMI_DONATE_STATE(donatereq);
>> +	unsigned long found = 0;
>> +	unsigned long addr_list_start = sro->addr_count;
>> +	int ret;
>> +	struct arm_smccc_1_2_regs regs = {
>> +		SMC_RMI_OP_MEM_DONATE,
>> +		sro_handle
>> +	};
>> +
>> +	for (int i = 0; i < addr_list_start && found < count; i++) {
>> +		unsigned long entry = sro->addr_list[i];
>> +
>> +		if (RMI_ADDR_RANGE_SIZE(entry) == unit_size &&
>> +		    RMI_ADDR_RANGE_COUNT(entry) == 1 &&
>> +		    RMI_ADDR_RANGE_STATE(entry) == state) {
>> +			addr_list_start--;
>> +			swap(sro->addr_list[addr_list_start],
>> +			     sro->addr_list[i]);
>> +			found++;
>> +			i--;
>> +		}
>> +	}
>> +
>> +	ret = rmi_sro_ensure_capacity(sro, count - found);
>> +	if (ret)
>> +		return ret;
>> +
>> +	while (found < count) {
>> +		unsigned long addr_range;
>> +		void *virt = alloc_pages_exact(unit_size_bytes, gfp);
>> +		phys_addr_t phys;
>> +
>> +		if (!virt)
>> +			return -ENOMEM;
>> +
>> +		phys = virt_to_phys(virt);
>> +
>> +		if (state == RMI_OP_MEM_DELEGATED) {
>> +			if (rmi_delegate_range(phys, unit_size_bytes)) {
>> +				free_pages_exact(virt, unit_size_bytes);
>> +				return -ENXIO;
>> +			}
>> +		}
>> +
>> +		addr_range = phys & RMI_ADDR_RANGE_ADDR_MASK;
>> +		FIELD_MODIFY(RMI_ADDR_RANGE_SIZE_MASK, &addr_range, unit_size);
>> +		FIELD_MODIFY(RMI_ADDR_RANGE_COUNT_MASK, &addr_range, 1);
>> +		FIELD_MODIFY(RMI_ADDR_RANGE_STATE_MASK, &addr_range, state);
>> +
>> +		sro->addr_list[sro->addr_count++] = addr_range;
>> +		found++;
>> +	}
>> +
>> +	regs.a2 = virt_to_phys(&sro->addr_list[addr_list_start]);
>> +	regs.a3 = found;
>> +	rmi_smccc_invoke(&regs, out_regs);
>> +
>> +	unsigned long donated_granules = out_regs->a1;
>> +
>> +	if (WARN_ON(donated_granules & ((unit_size_bytes >> PAGE_SHIFT) - 1))) {
>> +		/*
>> +		 * FIXME: RMM has only consumed part of a huge page, this leaks
>> +		 * the rest of the huge page
>> +		 */
>> +		donated_granules = ALIGN(donated_granules,
>> +					 (unit_size_bytes >> PAGE_SHIFT));
>> +	}
>> +	unsigned long donated_blocks = donated_granules / (unit_size_bytes >> PAGE_SHIFT);
>> +
>> +	if (WARN_ON(donated_blocks > found))
>> +		donated_blocks = found;
>> +
>> +	unsigned long undonated_blocks = found - donated_blocks;
>> +
>> +	while (donated_blocks && undonated_blocks) {
>> +		sro->addr_count--;
>> +		swap(sro->addr_list[addr_list_start],
>> +		     sro->addr_list[sro->addr_count]);
>> +		addr_list_start++;
>> +
>> +		donated_blocks--;
>> +		undonated_blocks--;
>> +	}
>> +	sro->addr_count -= donated_blocks;
>> +
>> +	return 0;
>> +}
>> +
>> +static int rmi_sro_donate(struct rmi_sro_state *sro,
>> +			  unsigned long sro_handle,
>> +			  unsigned long donatereq,
>> +			  struct arm_smccc_1_2_regs *regs,
>> +			  gfp_t gfp)
>> +{
>> +	unsigned long count = RMI_DONATE_COUNT(donatereq);
>> +
>> +	if (WARN_ON(!count))
>> +		return 0;
>> +
>> +	if (RMI_DONATE_CONTIG(donatereq)) {
>> +		return rmi_sro_donate_contig(sro, sro_handle, donatereq,
>> +					     regs, gfp);
>> +	} else {
>> +		return rmi_sro_donate_noncontig(sro, sro_handle, donatereq,
>> +						regs, gfp);
>> +	}
>> +}
>> +
>> +static int rmi_sro_reclaim(struct rmi_sro_state *sro,
>> +			   unsigned long sro_handle,
>> +			   struct arm_smccc_1_2_regs *out_regs)
>> +{
>> +	unsigned long capacity;
>> +	struct arm_smccc_1_2_regs regs;
>> +	int ret;
>> +
>> +	ret = rmi_sro_ensure_capacity(sro, 1);
>> +	if (ret)
>> +		rmi_sro_free(sro);
>> +
>> +	capacity = RMI_MAX_ADDR_LIST - sro->addr_count;
>> +
>> +	regs = (struct arm_smccc_1_2_regs){
>> +		SMC_RMI_OP_MEM_RECLAIM,
>> +		sro_handle,
>> +		virt_to_phys(&sro->addr_list[sro->addr_count]),
>> +		capacity
>> +	};
>> +	rmi_smccc_invoke(&regs, out_regs);
>> +
>> +	if (WARN_ON_ONCE(out_regs->a1 > capacity))
>> +		out_regs->a1 = capacity;
>> +
>> +	sro->addr_count += out_regs->a1;
>> +
>> +	return 0;
>> +}
>> +
>> +void rmi_sro_free(struct rmi_sro_state *sro)
>> +{
>> +	for (int i = 0; i < sro->addr_count; i++) {
>> +		unsigned long entry = sro->addr_list[i];
>> +		unsigned long addr = RMI_ADDR_RANGE_ADDR(entry);
>> +		unsigned long unit_size = RMI_ADDR_RANGE_SIZE(entry);
>> +		unsigned long count = RMI_ADDR_RANGE_COUNT(entry);
>> +		unsigned long state = RMI_ADDR_RANGE_STATE(entry);
>> +		unsigned long size = donate_req_to_size(unit_size) * count;
>> +
>> +		if (state == RMI_OP_MEM_DELEGATED) {
>> +			if (WARN_ON(rmi_undelegate_range(addr, size))) {
>> +				/* Leak the pages */
>> +				continue;
>> +			}
>> +		}
>> +		free_pages_exact(phys_to_virt(addr), size);
>> +	}
>> +
>> +	sro->addr_count = 0;
>> +}
>> +
>> +unsigned long rmi_sro_execute(struct rmi_sro_state *sro, gfp_t gfp)
>> +{
>> +	unsigned long sro_handle;
>> +	struct arm_smccc_1_2_regs regs;
>> +	struct arm_smccc_1_2_regs *regs_in = &sro->regs;
>> +
>> +	rmi_smccc_invoke(regs_in, &regs);
>> +
>> +	sro_handle = regs.a1;
>> +
>> +	while (RMI_RETURN_STATUS(regs.a0) == RMI_INCOMPLETE) {
>> +		bool can_cancel = RMI_RETURN_CAN_CANCEL(regs.a0);
>> +		int ret;
>> +
>> +		switch (RMI_RETURN_MEMREQ(regs.a0)) {
>> +		case RMI_OP_MEM_REQ_NONE:
>> +			regs = (struct arm_smccc_1_2_regs){
>> +				SMC_RMI_OP_CONTINUE, sro_handle, 0
>> +			};
>> +			rmi_smccc_invoke(&regs, &regs);
>> +			break;
>> +		case RMI_OP_MEM_REQ_DONATE:
>> +			ret = rmi_sro_donate(sro, sro_handle, regs.a2, &regs,
>> +					     gfp);
>> +			break;
>> +		case RMI_OP_MEM_REQ_RECLAIM:
>> +			ret = rmi_sro_reclaim(sro, sro_handle, &regs);
>> +			break;
>> +		default:
>> +			ret = WARN_ON(1);
>> +			break;
>> +		}
>> +
>> +		if (ret) {
>> +			if (can_cancel) {
>> +				/*
>> +				 * FIXME: Handle cancelling properly!
>> +				 *
>> +				 * If the operation has failed due to memory
>> +				 * allocation failure then the information on
>> +				 * the memory allocation should be saved, so
>> +				 * that the allocation can be repeated outside
>> +				 * of any context which prevented the
>> +				 * allocation.
> 
> Honestly, this is the sort of stuff that I'd expect to be solved
> *before* posting this code. Since this is so central to the whole
> memory management, it needs to be correct from day-1.
> 
> If you can't make it work in time, then tone the supported features
> down. But FIXMEs and WARN_ONs are not the way to go.
> 
> 	M.
>