[PATCH v4 06/12] KVM: arm64: Use LPA2 page-tables for stage2 and hyp stage1

Marc Zyngier maz at kernel.org
Fri Oct 20 02:16:10 PDT 2023


On Mon, 09 Oct 2023 19:50:02 +0100,
Ryan Roberts <ryan.roberts at arm.com> wrote:
> 
> Implement a simple policy whereby if the HW supports FEAT_LPA2 for the
> page size we are using, always use LPA2-style page-tables for stage 2
> and hyp stage 1, regardless of the VMM-requested IPA size or
> HW-implemented PA size. When in use we can now support up to 52-bit IPA
> and PA sizes.

Maybe worth stating that the hyp stage 1 part only applies to the
standalone EL2 portion (nVHE/protected), and not to the VHE S1
mappings, which are shared with the host kernel.

> 
> We use the previously created cpu feature to track whether LPA2 is
> supported for deciding whether to use the LPA2 or classic pte format.
> 
> Note that FEAT_LPA2 brings support for bigger block mappings (512GB with
> 4KB, 64GB with 16KB). We explicitly don't enable these in the library
> because stage2_apply_range() works on batch sizes of the largest used
> block mapping, and increasing the size of the batch would lead to soft
> lockups. See commit 5994bc9e05c2 ("KVM: arm64: Limit
> stage2_apply_range() batch size to largest block").
> 
> Signed-off-by: Ryan Roberts <ryan.roberts at arm.com>
> ---
>  arch/arm64/include/asm/kvm_pgtable.h | 47 +++++++++++++++++++++-------
>  arch/arm64/kvm/arm.c                 |  2 ++
>  arch/arm64/kvm/hyp/nvhe/tlb.c        |  3 +-
>  arch/arm64/kvm/hyp/pgtable.c         | 15 +++++++--
>  arch/arm64/kvm/hyp/vhe/tlb.c         |  3 +-
>  5 files changed, 54 insertions(+), 16 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
> index d3e354bb8351..b240158e1218 100644
> --- a/arch/arm64/include/asm/kvm_pgtable.h
> +++ b/arch/arm64/include/asm/kvm_pgtable.h
> @@ -25,12 +25,22 @@
>  #define KVM_PGTABLE_MIN_BLOCK_LEVEL	2U
>  #endif
>  
> +static inline u64 kvm_get_parange_max(void)
> +{
> +	if (system_supports_lpa2() ||
> +	   (IS_ENABLED(CONFIG_ARM64_PA_BITS_52) && PAGE_SIZE == SZ_64K))

nit: the rest of the code uses PAGE_SHIFT instead of PAGE_SIZE. Not a
big deal, but being consistent might help the reader.
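
i.e. something like this (untested, just to illustrate the shape;
PAGE_SHIFT == 16 is the 64K case):

	if (system_supports_lpa2() ||
	    (IS_ENABLED(CONFIG_ARM64_PA_BITS_52) && PAGE_SHIFT == 16))
		return ID_AA64MMFR0_EL1_PARANGE_52;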

> +		return ID_AA64MMFR0_EL1_PARANGE_52;
> +	else
> +		return ID_AA64MMFR0_EL1_PARANGE_48;
> +}
> +
>  static inline u64 kvm_get_parange(u64 mmfr0)
>  {
> +	u64 parange_max = kvm_get_parange_max();
>  	u64 parange = cpuid_feature_extract_unsigned_field(mmfr0,
>  				ID_AA64MMFR0_EL1_PARANGE_SHIFT);
> -	if (parange > ID_AA64MMFR0_EL1_PARANGE_MAX)
> -		parange = ID_AA64MMFR0_EL1_PARANGE_MAX;
> +	if (parange > parange_max)
> +		parange = parange_max;
>  
>  	return parange;
>  }
> @@ -41,6 +51,8 @@ typedef u64 kvm_pte_t;
>  
>  #define KVM_PTE_ADDR_MASK		GENMASK(47, PAGE_SHIFT)
>  #define KVM_PTE_ADDR_51_48		GENMASK(15, 12)
> +#define KVM_PTE_ADDR_MASK_LPA2		GENMASK(49, PAGE_SHIFT)
> +#define KVM_PTE_ADDR_51_50_LPA2		GENMASK(9, 8)
>  
>  #define KVM_PHYS_INVALID		(-1ULL)
>  
> @@ -51,21 +63,34 @@ static inline bool kvm_pte_valid(kvm_pte_t pte)
>  
>  static inline u64 kvm_pte_to_phys(kvm_pte_t pte)
>  {
> -	u64 pa = pte & KVM_PTE_ADDR_MASK;
> -
> -	if (PAGE_SHIFT == 16)
> -		pa |= FIELD_GET(KVM_PTE_ADDR_51_48, pte) << 48;
> +	u64 pa;
> +
> +	if (system_supports_lpa2()) {
> +		pa = pte & KVM_PTE_ADDR_MASK_LPA2;
> +		pa |= FIELD_GET(KVM_PTE_ADDR_51_50_LPA2, pte) << 50;
> +	} else {
> +		pa = pte & KVM_PTE_ADDR_MASK;
> +		if (PAGE_SHIFT == 16)
> +			pa |= FIELD_GET(KVM_PTE_ADDR_51_48, pte) << 48;
> +	}
>  
>  	return pa;
>  }
>  
>  static inline kvm_pte_t kvm_phys_to_pte(u64 pa)
>  {
> -	kvm_pte_t pte = pa & KVM_PTE_ADDR_MASK;
> -
> -	if (PAGE_SHIFT == 16) {
> -		pa &= GENMASK(51, 48);
> -		pte |= FIELD_PREP(KVM_PTE_ADDR_51_48, pa >> 48);
> +	kvm_pte_t pte;
> +
> +	if (system_supports_lpa2()) {
> +		pte = pa & KVM_PTE_ADDR_MASK_LPA2;
> +		pa &= GENMASK(51, 50);
> +		pte |= FIELD_PREP(KVM_PTE_ADDR_51_50_LPA2, pa >> 50);
> +	} else {
> +		pte = pa & KVM_PTE_ADDR_MASK;
> +		if (PAGE_SHIFT == 16) {
> +			pa &= GENMASK(51, 48);
> +			pte |= FIELD_PREP(KVM_PTE_ADDR_51_48, pa >> 48);
> +		}
>  	}
>  
>  	return pte;
> diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
> index 4866b3f7b4ea..73cc67c2a8a7 100644
> --- a/arch/arm64/kvm/arm.c
> +++ b/arch/arm64/kvm/arm.c
> @@ -1747,6 +1747,8 @@ static void __init cpu_prepare_hyp_mode(int cpu, u32 hyp_va_bits)
>  	}
>  	tcr &= ~TCR_T0SZ_MASK;
>  	tcr |= TCR_T0SZ(hyp_va_bits);
> +	if (system_supports_lpa2())
> +		tcr |= TCR_EL2_DS;
>  	params->tcr_el2 = tcr;
>  
>  	params->pgd_pa = kvm_mmu_get_httbr();
> diff --git a/arch/arm64/kvm/hyp/nvhe/tlb.c b/arch/arm64/kvm/hyp/nvhe/tlb.c
> index d42b72f78a9b..c3cd16c6f95f 100644
> --- a/arch/arm64/kvm/hyp/nvhe/tlb.c
> +++ b/arch/arm64/kvm/hyp/nvhe/tlb.c
> @@ -198,7 +198,8 @@ void __kvm_tlb_flush_vmid_range(struct kvm_s2_mmu *mmu,
>  	/* Switch to requested VMID */
>  	__tlb_switch_to_guest(mmu, &cxt, false);
>  
> -	__flush_s2_tlb_range_op(ipas2e1is, start, pages, stride, 0, false);
> +	__flush_s2_tlb_range_op(ipas2e1is, start, pages, stride, 0,
> +				system_supports_lpa2());

At this stage in the series, I'd fully expect the lpa2 flag to have
been subsumed into the helper itself, rather than having each caller
pass system_supports_lpa2() by hand...
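
Something along these lines (untested, and assuming the helper keeps
wrapping __flush_tlb_range_op() with the argument list introduced
earlier in this series) would mean callers never have to care about
LPA2:

#define __flush_s2_tlb_range_op(op, start, pages, stride, tlb_level)	\
	__flush_tlb_range_op(op, start, pages, stride, 0, tlb_level,	\
			     false, system_supports_lpa2())

and the two call sites would simply become:

	__flush_s2_tlb_range_op(ipas2e1is, start, pages, stride, 0);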

>  
>  	dsb(ish);
>  	__tlbi(vmalle1is);
> diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
> index f155b8c9e98c..062eb7bcdb8a 100644
> --- a/arch/arm64/kvm/hyp/pgtable.c
> +++ b/arch/arm64/kvm/hyp/pgtable.c
> @@ -79,7 +79,10 @@ static bool kvm_pgtable_walk_skip_cmo(const struct kvm_pgtable_visit_ctx *ctx)
>  
>  static bool kvm_phys_is_valid(u64 phys)
>  {
> -	return phys < BIT(id_aa64mmfr0_parange_to_phys_shift(ID_AA64MMFR0_EL1_PARANGE_MAX));
> +	u64 parange_max = kvm_get_parange_max();
> +	u8 shift = id_aa64mmfr0_parange_to_phys_shift(parange_max);
> +
> +	return phys < BIT(shift);
>  }
>  
>  static bool kvm_block_mapping_supported(const struct kvm_pgtable_visit_ctx *ctx, u64 phys)
> @@ -408,7 +411,8 @@ static int hyp_set_prot_attr(enum kvm_pgtable_prot prot, kvm_pte_t *ptep)
>  	}
>  
>  	attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_AP, ap);
> -	attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_SH, sh);
> +	if (!system_supports_lpa2())
> +		attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_SH, sh);
>  	attr |= KVM_PTE_LEAF_ATTR_LO_S1_AF;
>  	attr |= prot & KVM_PTE_LEAF_ATTR_HI_SW;
>  	*ptep = attr;
> @@ -654,6 +658,9 @@ u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift)
>  		vtcr |= VTCR_EL2_HA;
>  #endif /* CONFIG_ARM64_HW_AFDBM */
>  
> +	if (system_supports_lpa2())
> +		vtcr |= VTCR_EL2_DS;
> +
>  	/* Set the vmid bits */
>  	vtcr |= (get_vmid_bits(mmfr1) == 16) ?
>  		VTCR_EL2_VS_16BIT :
> @@ -711,7 +718,9 @@ static int stage2_set_prot_attr(struct kvm_pgtable *pgt, enum kvm_pgtable_prot p
>  	if (prot & KVM_PGTABLE_PROT_W)
>  		attr |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W;
>  
> -	attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S2_SH, sh);
> +	if (!system_supports_lpa2())
> +		attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S2_SH, sh);
> +
>  	attr |= KVM_PTE_LEAF_ATTR_LO_S2_AF;
>  	attr |= prot & KVM_PTE_LEAF_ATTR_HI_SW;
>  	*ptep = attr;
> diff --git a/arch/arm64/kvm/hyp/vhe/tlb.c b/arch/arm64/kvm/hyp/vhe/tlb.c
> index 6041c6c78984..40cea2482a76 100644
> --- a/arch/arm64/kvm/hyp/vhe/tlb.c
> +++ b/arch/arm64/kvm/hyp/vhe/tlb.c
> @@ -161,7 +161,8 @@ void __kvm_tlb_flush_vmid_range(struct kvm_s2_mmu *mmu,
>  	/* Switch to requested VMID */
>  	__tlb_switch_to_guest(mmu, &cxt);
>  
> -	__flush_s2_tlb_range_op(ipas2e1is, start, pages, stride, 0, false);
> +	__flush_s2_tlb_range_op(ipas2e1is, start, pages, stride, 0,
> +				system_supports_lpa2());
>  
>  	dsb(ish);
>  	__tlbi(vmalle1is);

One thing I don't see here is an update to the tcr_compute_pa_size
macro that is used for the initial nVHE setup; as it stands, it is
now inconsistent with the kvm_get_parange_max() helper.
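
For reference, that macro (in arch/arm64/include/asm/assembler.h;
quoting from memory, so the details may be off) still clamps PARange
to the build-time maximum:

	.macro	tcr_compute_pa_size, tcr, pos, tmp0, tmp1
	mrs	\tmp0, ID_AA64MMFR0_EL1
	// Narrow PARange to fit the PS field in TCR_ELx
	ubfx	\tmp0, \tmp0, #ID_AA64MMFR0_EL1_PARANGE_SHIFT, #3
	mov	\tmp1, #ID_AA64MMFR0_EL1_PARANGE_MAX
	cmp	\tmp0, \tmp1
	csel	\tmp0, \tmp1, \tmp0, ls
	bfi	\tcr, \tmp0, \pos, #3
	.endm

and ID_AA64MMFR0_EL1_PARANGE_MAX is still 48 bits for 4K/16K pages,
while kvm_get_parange_max() now reports 52 when LPA2 is supported.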

Thanks,

	M.

-- 
Without deviation from the norm, progress is not possible.


