[PATCH v4 06/12] KVM: arm64: Use LPA2 page-tables for stage2 and hyp stage1

Fri Oct 20 08:06:50 PDT 2023

On 20/10/2023 10:16, Marc Zyngier wrote:
> On Mon, 09 Oct 2023 19:50:02 +0100,
> Ryan Roberts <ryan.roberts at arm.com> wrote:
>>
>> Implement a simple policy whereby if the HW supports FEAT_LPA2 for the
>> page size we are using, always use LPA2-style page-tables for stage 2
>> and hyp stage 1, regardless of the VMM-requested IPA size or
>> HW-implemented PA size. When in use we can now support up to 52-bit IPA
>> and PA sizes.
> 
> Maybe worth stating that this S1 comment only applies to the
> standalone EL2 portion, and not the VHE S1 mappings.

ACK.

> 
>>
>> We use the previously created cpu feature to track whether LPA2 is
>> supported for deciding whether to use the LPA2 or classic pte format.
>>
>> Note that FEAT_LPA2 brings support for bigger block mappings (512GB with
>> 4KB, 64GB with 16KB). We explicitly don't enable these in the library
>> because stage2_apply_range() works on batch sizes of the largest used
>> block mapping, and increasing the size of the batch would lead to soft
>> lockups. See commit 5994bc9e05c2 ("KVM: arm64: Limit
>> stage2_apply_range() batch size to largest block").
>>
>> Signed-off-by: Ryan Roberts <ryan.roberts at arm.com>
>> ---
>>  arch/arm64/include/asm/kvm_pgtable.h | 47 +++++++++++++++++++++-------
>>  arch/arm64/kvm/arm.c                 |  2 ++
>>  arch/arm64/kvm/hyp/nvhe/tlb.c        |  3 +-
>>  arch/arm64/kvm/hyp/pgtable.c         | 15 +++++++--
>>  arch/arm64/kvm/hyp/vhe/tlb.c         |  3 +-
>>  5 files changed, 54 insertions(+), 16 deletions(-)
>>
>> diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
>> index d3e354bb8351..b240158e1218 100644
>> --- a/arch/arm64/include/asm/kvm_pgtable.h
>> +++ b/arch/arm64/include/asm/kvm_pgtable.h
>> @@ -25,12 +25,22 @@
>>  #define KVM_PGTABLE_MIN_BLOCK_LEVEL	2U
>>  #endif
>>  
>> +static inline u64 kvm_get_parange_max(void)
>> +{
>> +	if (system_supports_lpa2() ||
>> +	   (IS_ENABLED(CONFIG_ARM64_PA_BITS_52) && PAGE_SIZE == SZ_64K))
> 
> nit: the rest of the code uses PAGE_SHIFT instead of PAGE_SIZE. Not a
> big deal, but being consistent might help the reader.

ACK.

> 
>> +		return ID_AA64MMFR0_EL1_PARANGE_52;
>> +	else
>> +		return ID_AA64MMFR0_EL1_PARANGE_48;
>> +}
>> +
>>  static inline u64 kvm_get_parange(u64 mmfr0)
>>  {
>> +	u64 parange_max = kvm_get_parange_max();
>>  	u64 parange = cpuid_feature_extract_unsigned_field(mmfr0,
>>  				ID_AA64MMFR0_EL1_PARANGE_SHIFT);
>> -	if (parange > ID_AA64MMFR0_EL1_PARANGE_MAX)
>> -		parange = ID_AA64MMFR0_EL1_PARANGE_MAX;
>> +	if (parange > parange_max)
>> +		parange = parange_max;
>>  
>>  	return parange;
>>  }
>> @@ -41,6 +51,8 @@ typedef u64 kvm_pte_t;
>>  
>>  #define KVM_PTE_ADDR_MASK		GENMASK(47, PAGE_SHIFT)
>>  #define KVM_PTE_ADDR_51_48		GENMASK(15, 12)
>> +#define KVM_PTE_ADDR_MASK_LPA2		GENMASK(49, PAGE_SHIFT)
>> +#define KVM_PTE_ADDR_51_50_LPA2		GENMASK(9, 8)
>>  
>>  #define KVM_PHYS_INVALID		(-1ULL)
>>  
>> @@ -51,21 +63,34 @@ static inline bool kvm_pte_valid(kvm_pte_t pte)
>>  
>>  static inline u64 kvm_pte_to_phys(kvm_pte_t pte)
>>  {
>> -	u64 pa = pte & KVM_PTE_ADDR_MASK;
>> -
>> -	if (PAGE_SHIFT == 16)
>> -		pa |= FIELD_GET(KVM_PTE_ADDR_51_48, pte) << 48;
>> +	u64 pa;
>> +
>> +	if (system_supports_lpa2()) {
>> +		pa = pte & KVM_PTE_ADDR_MASK_LPA2;
>> +		pa |= FIELD_GET(KVM_PTE_ADDR_51_50_LPA2, pte) << 50;
>> +	} else {
>> +		pa = pte & KVM_PTE_ADDR_MASK;
>> +		if (PAGE_SHIFT == 16)
>> +			pa |= FIELD_GET(KVM_PTE_ADDR_51_48, pte) << 48;
>> +	}
>>  
>>  	return pa;
>>  }
>>  
>>  static inline kvm_pte_t kvm_phys_to_pte(u64 pa)
>>  {
>> -	kvm_pte_t pte = pa & KVM_PTE_ADDR_MASK;
>> -
>> -	if (PAGE_SHIFT == 16) {
>> -		pa &= GENMASK(51, 48);
>> -		pte |= FIELD_PREP(KVM_PTE_ADDR_51_48, pa >> 48);
>> +	kvm_pte_t pte;
>> +
>> +	if (system_supports_lpa2()) {
>> +		pte = pa & KVM_PTE_ADDR_MASK_LPA2;
>> +		pa &= GENMASK(51, 50);
>> +		pte |= FIELD_PREP(KVM_PTE_ADDR_51_50_LPA2, pa >> 50);
>> +	} else {
>> +		pte = pa & KVM_PTE_ADDR_MASK;
>> +		if (PAGE_SHIFT == 16) {
>> +			pa &= GENMASK(51, 48);
>> +			pte |= FIELD_PREP(KVM_PTE_ADDR_51_48, pa >> 48);
>> +		}
>>  	}
>>  
>>  	return pte;
>> diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
>> index 4866b3f7b4ea..73cc67c2a8a7 100644
>> --- a/arch/arm64/kvm/arm.c
>> +++ b/arch/arm64/kvm/arm.c
>> @@ -1747,6 +1747,8 @@ static void __init cpu_prepare_hyp_mode(int cpu, u32 hyp_va_bits)
>>  	}
>>  	tcr &= ~TCR_T0SZ_MASK;
>>  	tcr |= TCR_T0SZ(hyp_va_bits);
>> +	if (system_supports_lpa2())
>> +		tcr |= TCR_EL2_DS;
>>  	params->tcr_el2 = tcr;
>>  
>>  	params->pgd_pa = kvm_mmu_get_httbr();
>> diff --git a/arch/arm64/kvm/hyp/nvhe/tlb.c b/arch/arm64/kvm/hyp/nvhe/tlb.c
>> index d42b72f78a9b..c3cd16c6f95f 100644
>> --- a/arch/arm64/kvm/hyp/nvhe/tlb.c
>> +++ b/arch/arm64/kvm/hyp/nvhe/tlb.c
>> @@ -198,7 +198,8 @@ void __kvm_tlb_flush_vmid_range(struct kvm_s2_mmu *mmu,
>>  	/* Switch to requested VMID */
>>  	__tlb_switch_to_guest(mmu, &cxt, false);
>>  
>> -	__flush_s2_tlb_range_op(ipas2e1is, start, pages, stride, 0, false);
>> +	__flush_s2_tlb_range_op(ipas2e1is, start, pages, stride, 0,
>> +				system_supports_lpa2());
> 
> At this stage, I'd fully expect the flag to have been subsumed into
> the helper...

ACK. I'm planning to have has_lpa2() always return false for now. Then once
Ard's changes are in, we can change it to report the system status. Then we can
move this inside __flush_s2_tlb_range_op(). Does that work for you?

> 
>>  
>>  	dsb(ish);
>>  	__tlbi(vmalle1is);
>> diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
>> index f155b8c9e98c..062eb7bcdb8a 100644
>> --- a/arch/arm64/kvm/hyp/pgtable.c
>> +++ b/arch/arm64/kvm/hyp/pgtable.c
>> @@ -79,7 +79,10 @@ static bool kvm_pgtable_walk_skip_cmo(const struct kvm_pgtable_visit_ctx *ctx)
>>  
>>  static bool kvm_phys_is_valid(u64 phys)
>>  {
>> -	return phys < BIT(id_aa64mmfr0_parange_to_phys_shift(ID_AA64MMFR0_EL1_PARANGE_MAX));
>> +	u64 parange_max = kvm_get_parange_max();
>> +	u8 shift = id_aa64mmfr0_parange_to_phys_shift(parange_max);
>> +
>> +	return phys < BIT(shift);
>>  }
>>  
>>  static bool kvm_block_mapping_supported(const struct kvm_pgtable_visit_ctx *ctx, u64 phys)
>> @@ -408,7 +411,8 @@ static int hyp_set_prot_attr(enum kvm_pgtable_prot prot, kvm_pte_t *ptep)
>>  	}
>>  
>>  	attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_AP, ap);
>> -	attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_SH, sh);
>> +	if (!system_supports_lpa2())
>> +		attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_SH, sh);
>>  	attr |= KVM_PTE_LEAF_ATTR_LO_S1_AF;
>>  	attr |= prot & KVM_PTE_LEAF_ATTR_HI_SW;
>>  	*ptep = attr;
>> @@ -654,6 +658,9 @@ u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift)
>>  		vtcr |= VTCR_EL2_HA;
>>  #endif /* CONFIG_ARM64_HW_AFDBM */
>>  
>> +	if (system_supports_lpa2())
>> +		vtcr |= VTCR_EL2_DS;
>> +
>>  	/* Set the vmid bits */
>>  	vtcr |= (get_vmid_bits(mmfr1) == 16) ?
>>  		VTCR_EL2_VS_16BIT :
>> @@ -711,7 +718,9 @@ static int stage2_set_prot_attr(struct kvm_pgtable *pgt, enum kvm_pgtable_prot p
>>  	if (prot & KVM_PGTABLE_PROT_W)
>>  		attr |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W;
>>  
>> -	attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S2_SH, sh);
>> +	if (!system_supports_lpa2())
>> +		attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S2_SH, sh);
>> +
>>  	attr |= KVM_PTE_LEAF_ATTR_LO_S2_AF;
>>  	attr |= prot & KVM_PTE_LEAF_ATTR_HI_SW;
>>  	*ptep = attr;
>> diff --git a/arch/arm64/kvm/hyp/vhe/tlb.c b/arch/arm64/kvm/hyp/vhe/tlb.c
>> index 6041c6c78984..40cea2482a76 100644
>> --- a/arch/arm64/kvm/hyp/vhe/tlb.c
>> +++ b/arch/arm64/kvm/hyp/vhe/tlb.c
>> @@ -161,7 +161,8 @@ void __kvm_tlb_flush_vmid_range(struct kvm_s2_mmu *mmu,
>>  	/* Switch to requested VMID */
>>  	__tlb_switch_to_guest(mmu, &cxt);
>>  
>> -	__flush_s2_tlb_range_op(ipas2e1is, start, pages, stride, 0, false);
>> +	__flush_s2_tlb_range_op(ipas2e1is, start, pages, stride, 0,
>> +				system_supports_lpa2());
>>  
>>  	dsb(ish);
>>  	__tlbi(vmalle1is);
> 
> One thing I don't see here is how you update the tcr_compute_pa_size
> macro that is used on the initial nVHE setup, which is inconsistent
> with the kvm_get_parange_max() helper.

As you saw, it's in a separate patch.

> 
> Thanks,
> 
> 	M.
>