[PATCH v3 02/16] KVM: arm64: nv: Implement nested Stage-2 page table walk logic

Zenghui Yu zenghui.yu at linux.dev
Wed Feb 4 00:28:57 PST 2026


Hi Marc,

[ chewing through the NV code.. ;-) ]

On 6/14/24 10:45 PM, Marc Zyngier wrote:
> From: Christoffer Dall <christoffer.dall at linaro.org>
> 
> Based on the pseudo-code in the ARM ARM, implement a stage 2 software
> page table walker.

[...]

> +static u32 compute_fsc(int level, u32 fsc)
> +{
> +	return fsc | (level & 0x3);
> +}
> +
> +static int get_ia_size(struct s2_walk_info *wi)
> +{
> +	return 64 - wi->t0sz;
> +}
> +
> +static int check_base_s2_limits(struct s2_walk_info *wi,
> +				int level, int input_size, int stride)
> +{
> +	int start_size, ia_size;
> +
> +	ia_size = get_ia_size(wi);
> +
> +	/* Check translation limits */
> +	switch (BIT(wi->pgshift)) {
> +	case SZ_64K:
> +		if (level == 0 || (level == 1 && ia_size <= 42))

It looks broken as the pseudocode checks the limits based on
*implemented PA size*, rather than on ia_size, which is essentially the
input address size (64 - T0SZ) programmed by L1 hypervisor. They're
different.

We can probably get the implemented PA size by:

AArch64.PAMax()
{
	parange = get_idreg_field_enum(kvm, ID_AA64MMFR0_EL1, PARANGE);
	return id_aa64mmfr0_parange_to_phys_shift(parange);
}

Not sure if I've read the spec correctly.

> +			return -EFAULT;
> +		break;
> +	case SZ_16K:
> +		if (level == 0 || (level == 1 && ia_size <= 40))
> +			return -EFAULT;
> +		break;
> +	case SZ_4K:
> +		if (level < 0 || (level == 0 && ia_size <= 42))
> +			return -EFAULT;
> +		break;
> +	}
> +
> +	/* Check input size limits */
> +	if (input_size > ia_size)

This is always false for the current code. ;-)

> +		return -EFAULT;
> +
> +	/* Check number of entries in starting level table */
> +	start_size = input_size - ((3 - level) * stride + wi->pgshift);
> +	if (start_size < 1 || start_size > stride + 4)
> +		return -EFAULT;
> +
> +	return 0;
> +}
> +
> +/* Check if output is within boundaries */
> +static int check_output_size(struct s2_walk_info *wi, phys_addr_t output)
> +{
> +	unsigned int output_size = wi->max_oa_bits;
> +
> +	if (output_size != 48 && (output & GENMASK_ULL(47, output_size)))
> +		return -1;
> +
> +	return 0;
> +}
> +
> +/*
> + * This is essentially a C-version of the pseudo code from the ARM ARM
> + * AArch64.TranslationTableWalk  function.  I strongly recommend looking at
> + * that pseudocode in trying to understand this.
> + *
> + * Must be called with the kvm->srcu read lock held
> + */
> +static int walk_nested_s2_pgd(phys_addr_t ipa,
> +			      struct s2_walk_info *wi, struct kvm_s2_trans *out)
> +{
> +	int first_block_level, level, stride, input_size, base_lower_bound;
> +	phys_addr_t base_addr;
> +	unsigned int addr_top, addr_bottom;
> +	u64 desc;  /* page table entry */
> +	int ret;
> +	phys_addr_t paddr;
> +
> +	switch (BIT(wi->pgshift)) {
> +	default:
> +	case SZ_64K:
> +	case SZ_16K:
> +		level = 3 - wi->sl;
> +		first_block_level = 2;
> +		break;
> +	case SZ_4K:
> +		level = 2 - wi->sl;
> +		first_block_level = 1;
> +		break;
> +	}
> +
> +	stride = wi->pgshift - 3;
> +	input_size = get_ia_size(wi);
> +	if (input_size > 48 || input_size < 25)
> +		return -EFAULT;
> +
> +	ret = check_base_s2_limits(wi, level, input_size, stride);
> +	if (WARN_ON(ret))
> +		return ret;
> +
> +	base_lower_bound = 3 + input_size - ((3 - level) * stride +
> +			   wi->pgshift);
> +	base_addr = wi->baddr & GENMASK_ULL(47, base_lower_bound);
> +
> +	if (check_output_size(wi, base_addr)) {
> +		out->esr = compute_fsc(level, ESR_ELx_FSC_ADDRSZ);

With a wrongly programmed base address, we should report the ADDRSZ
fault at level 0 (as per R_BFHQH and the pseudocode). It's easy to fix.

> +static void vtcr_to_walk_info(u64 vtcr, struct s2_walk_info *wi)
> +{
> +	wi->t0sz = vtcr & TCR_EL2_T0SZ_MASK;
> +
> +	switch (vtcr & VTCR_EL2_TG0_MASK) {
> +	case VTCR_EL2_TG0_4K:
> +		wi->pgshift = 12;	 break;
> +	case VTCR_EL2_TG0_16K:
> +		wi->pgshift = 14;	 break;
> +	case VTCR_EL2_TG0_64K:
> +	default:	    /* IMPDEF: treat any other value as 64k */
> +		wi->pgshift = 16;	 break;
> +	}
> +
> +	wi->sl = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr);
> +	/* Global limit for now, should eventually be per-VM */
> +	wi->max_oa_bits = min(get_kvm_ipa_limit(),
                              ^^^

Should we use AArch64.PAMax() instead? As the output address size is
never larger than the implemented PA size.

Now I'm wondering if we can let kvm_get_pa_bits() just return PAMax for
(based on the exposed (to-L1) AA64MFR0.PARange value) and use it when
possible.

Thanks,
Zenghui



More information about the linux-arm-kernel mailing list