[PATCH 1/2] arm64: KVM: Implement 48 VA support for KVM EL2 and Stage-2
Catalin Marinas
catalin.marinas at arm.com
Tue Sep 30 05:39:47 PDT 2014
Hi Christoffer,
On Thu, Sep 25, 2014 at 08:42:53PM +0100, Christoffer Dall wrote:
> diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
> index 7796051..048f37f 100644
> --- a/arch/arm/kvm/arm.c
> +++ b/arch/arm/kvm/arm.c
> @@ -409,7 +409,7 @@ static void update_vttbr(struct kvm *kvm)
> kvm_next_vmid++;
>
> /* update vttbr to be used with the new vmid */
> - pgd_phys = virt_to_phys(kvm->arch.pgd);
> + pgd_phys = kvm_get_hwpgd(kvm);
> BUG_ON(pgd_phys & ~VTTBR_BADDR_MASK);
> vmid = ((u64)(kvm->arch.vmid) << VTTBR_VMID_SHIFT) & VTTBR_VMID_MASK;
> kvm->arch.vttbr = pgd_phys | vmid;
> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
> index bb06f76..4532f5f 100644
> --- a/arch/arm/kvm/mmu.c
> +++ b/arch/arm/kvm/mmu.c
> @@ -33,6 +33,7 @@
>
> extern char __hyp_idmap_text_start[], __hyp_idmap_text_end[];
>
> +static int kvm_prealloc_levels;
Do you have plans to determine this dynamically? In this patch, I can
only see it being assigned to KVM_PREALLOC_LEVELS. I guess you could
simply use the macro.
> @@ -521,6 +554,7 @@ int create_hyp_io_mappings(void *from, void *to, phys_addr_t phys_addr)
> */
> int kvm_alloc_stage2_pgd(struct kvm *kvm)
> {
> + int ret;
> pgd_t *pgd;
>
> if (kvm->arch.pgd != NULL) {
> @@ -533,9 +567,17 @@ int kvm_alloc_stage2_pgd(struct kvm *kvm)
> return -ENOMEM;
>
> memset(pgd, 0, PTRS_PER_S2_PGD * sizeof(pgd_t));
I don't think you need to allocate a full page here (not shown here in
the diff context) but it may not be trivial since you use get_page() to
keep track of when page tables could be freed (do you do this for the
pgd as well?).
> +
> + ret = kvm_prealloc_hwpgd(kvm, pgd);
> + if (ret)
> + goto out;
> +
> kvm_clean_pgd(pgd);
> kvm->arch.pgd = pgd;
> -
> + ret = 0;
> +out:
> + if (ret)
> + free_pages((unsigned long)pgd, S2_PGD_ORDER);
> return 0;
> }
Does this always return 0, even in case of error? Would it look better
as:
ret = kvm_prealloc_hwpgd(kvm, pgd);
if (ret)
goto out;
kvm_clean_pgd(pgd);
kvm->arch.pgd = pgd;
return 0;
out:
free_pages((unsigned long)pgd, S2_PGD_ORDER);
return ret;
> @@ -572,19 +614,36 @@ void kvm_free_stage2_pgd(struct kvm *kvm)
> return;
>
> unmap_stage2_range(kvm, 0, KVM_PHYS_SIZE);
> + kvm_free_hwpgd(kvm);
> free_pages((unsigned long)kvm->arch.pgd, S2_PGD_ORDER);
> kvm->arch.pgd = NULL;
> }
>
> -static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
> +static pud_t *stage2_get_pud(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
> phys_addr_t addr)
> {
> pgd_t *pgd;
> pud_t *pud;
> - pmd_t *pmd;
>
> pgd = kvm->arch.pgd + pgd_index(addr);
> - pud = pud_offset(pgd, addr);
> + if (pgd_none(*pgd)) {
> + if (!cache)
> + return NULL;
> + pud = mmu_memory_cache_alloc(cache);
> + pgd_populate(NULL, pgd, pud);
> + get_page(virt_to_page(pgd));
> + }
> +
> + return pud_offset(pgd, addr);
> +}
This function looks correct, though we would not expect pgd_none() to
ever be true as we pre-populate it. You could put a WARN_ON or something
until we actually have hardware 4-level stage 2 (though I don't see a
need yet).
> +static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
> + phys_addr_t addr)
> +{
> + pud_t *pud;
> + pmd_t *pmd;
> +
> + pud = stage2_get_pud(kvm, cache, addr);
> if (pud_none(*pud)) {
> if (!cache)
> return NULL;
> @@ -630,7 +689,7 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
> pmd_t *pmd;
> pte_t *pte, old_pte;
>
> - /* Create stage-2 page table mapping - Level 1 */
> + /* Create stage-2 page table mapping - Levels 0 and 1 */
> pmd = stage2_get_pmd(kvm, cache, addr);
> if (!pmd) {
> /*
> @@ -688,7 +747,8 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
> for (addr = guest_ipa; addr < end; addr += PAGE_SIZE) {
> pte_t pte = pfn_pte(pfn, PAGE_S2_DEVICE);
>
> - ret = mmu_topup_memory_cache(&cache, 2, 2);
> + ret = mmu_topup_memory_cache(&cache, KVM_MMU_CACHE_MIN_PAGES,
> + KVM_MMU_CACHE_MIN_PAGES);
BTW, why do you need to preallocate the pages in KVM's own cache? Would
GFP_ATOMIC not work within the kvm->mmu_lock region?
> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> index fd4e81a..0f3e0a9 100644
> --- a/arch/arm64/Kconfig
> +++ b/arch/arm64/Kconfig
> @@ -200,7 +200,6 @@ config ARM64_VA_BITS_42
>
> config ARM64_VA_BITS_48
> bool "48-bit"
> - depends on BROKEN
I'm ok with removing BROKEN here but I think at the same time we need to
disable the SMMU driver, which has similar issues with (not) supporting
4-level page tables for stage 2 (stage 1 should be fine).
> diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
> index a030d16..313c6f9 100644
> --- a/arch/arm64/include/asm/kvm_mmu.h
> +++ b/arch/arm64/include/asm/kvm_mmu.h
[...]
> @@ -118,13 +128,123 @@ static inline bool kvm_page_empty(void *ptr)
> }
>
> #define kvm_pte_table_empty(ptep) kvm_page_empty(ptep)
> -#ifndef CONFIG_ARM64_64K_PAGES
> -#define kvm_pmd_table_empty(pmdp) kvm_page_empty(pmdp)
> -#else
> +#if CONFIG_ARM64_PGTABLE_LEVELS == 2
> #define kvm_pmd_table_empty(pmdp) (0)
> -#endif
> #define kvm_pud_table_empty(pudp) (0)
> +#elif CONFIG_ARM64_PGTABLE_LEVELS == 3
> +#define kvm_pmd_table_empty(pmdp) kvm_page_empty(pmdp)
> +#define kvm_pud_table_empty(pudp) (0)
> +#elif CONFIG_ARM64_PGTABLE_LEVELS == 4
> +#define kvm_pmd_table_empty(pmdp) kvm_page_empty(pmdp)
> +#define kvm_pud_table_empty(pudp) kvm_page_empty(pudp)
> +#endif
Not sure whether it's clearer as (up to you):
#ifdef __PAGETABLE_PMD_FOLDED
#define kvm_pmd_table_empty(pmdp) (0)
#else
#define kvm_pmd_table_empty(pmdp) kvm_page_empty(pmdp)
#endif
#ifdef __PAGETABLE_PUD_FOLDED
#define kvm_pud_table_empty(pudp) (0)
#else
#define kvm_pud_table_empty(pudp) kvm_page_empty(pudp)
#endif
> +
> +/**
> + * kvm_prealloc_hwpgd - allocate inital table for VTTBR
> + * @kvm: The KVM struct pointer for the VM.
> + * @pgd: The kernel pseudo pgd
"hwpgd" doesn't sound like the right name, since it's actually a fake
pgd that is being allocated. Maybe "swpgd"?
> + *
> + * When the kernel uses more levels of page tables than the guest, we allocate
> + * a fake PGD and pre-populate it to point to the next-level page table, which
> + * will be the real initial page table pointed to by the VTTBR.
> + *
> + * When KVM_PREALLOC_PMD is defined, we allocate a single page for the PMD and
> + * the kernel will use folded pud. When KVM_PREALLOC_PUD is defined, we
> + * allocate 2 consecutive PUD pages.
> + */
> +#if defined(CONFIG_ARM64_64K_PAGES) && CONFIG_ARM64_PGTABLE_LEVELS == 3
> +#define KVM_PREALLOC_LEVELS 2
> +#define PTRS_PER_S2_PGD 1
> +#define S2_PGD_ORDER get_order(PTRS_PER_S2_PGD * sizeof(pgd_t))
> +
> +
> +static inline int kvm_prealloc_hwpgd(struct kvm *kvm, pgd_t *pgd)
> +{
> + pud_t *pud;
> + pmd_t *pmd;
> +
> + pud = pud_offset(pgd, 0);
> + pmd = (pmd_t *)__get_free_pages(GFP_KERNEL, 0);
> +
> + if (!pmd)
> + return -ENOMEM;
> + memset(pmd, 0, PAGE_SIZE);
You could use __GFP_ZERO.
> + pud_populate(NULL, pud, pmd);
> + get_page(virt_to_page(pud));
> +
> + return 0;
> +}
>
> +static inline void kvm_free_hwpgd(struct kvm *kvm)
> +{
> + pgd_t *pgd = kvm->arch.pgd;
> + pud_t *pud = pud_offset(pgd, 0);
> + pmd_t *pmd = pmd_offset(pud, 0);
> + free_pages((unsigned long)pmd, 0);
> + put_page(virt_to_page(pud));
> +}
> +
> +static inline phys_addr_t kvm_get_hwpgd(struct kvm *kvm)
> +{
> + pgd_t *pgd = kvm->arch.pgd;
> + pud_t *pud = pud_offset(pgd, 0);
> + pmd_t *pmd = pmd_offset(pud, 0);
> + return virt_to_phys(pmd);
> +
> +}
> +#elif defined(CONFIG_ARM64_4K_PAGES) && CONFIG_ARM64_PGTABLE_LEVELS == 4
> +#define KVM_PREALLOC_LEVELS 1
> +#define PTRS_PER_S2_PGD 2
> +#define S2_PGD_ORDER get_order(PTRS_PER_S2_PGD * sizeof(pgd_t))
> +
> +static inline int kvm_prealloc_hwpgd(struct kvm *kvm, pgd_t *pgd)
> +{
> + pud_t *pud;
> +
> + pud = (pud_t *)__get_free_pages(GFP_KERNEL, 1);
> + if (!pud)
> + return -ENOMEM;
> + memset(pud, 0, 2 * PAGE_SIZE);
> + pgd_populate(NULL, pgd, pud);
> + pgd_populate(NULL, pgd + 1, pud + PTRS_PER_PUD);
> + get_page(virt_to_page(pgd));
> + get_page(virt_to_page(pgd));
> +
> + return 0;
> +}
> +
> +static inline void kvm_free_hwpgd(struct kvm *kvm)
> +{
> + pgd_t *pgd = kvm->arch.pgd;
> + pud_t *pud = pud_offset(pgd, 0);
> + free_pages((unsigned long)pud, 1);
> + put_page(virt_to_page(pgd));
> + put_page(virt_to_page(pgd));
> +}
> +
> +static inline phys_addr_t kvm_get_hwpgd(struct kvm *kvm)
> +{
> + pgd_t *pgd = kvm->arch.pgd;
> + pud_t *pud = pud_offset(pgd, 0);
> + return virt_to_phys(pud);
> +}
> +#else
> +#define KVM_PREALLOC_LEVELS 0
> +#define PTRS_PER_S2_PGD (1 << (KVM_PHYS_SHIFT - PGDIR_SHIFT))
> +#define S2_PGD_ORDER get_order(PTRS_PER_S2_PGD * sizeof(pgd_t))
> +
> +static inline int kvm_prealloc_hwpgd(struct kvm *kvm, pgd_t *pgd)
> +{
> + return 0;
> +}
> +
> +static inline void kvm_free_hwpgd(struct kvm *kvm) { }
> +
> +static inline phys_addr_t kvm_get_hwpgd(struct kvm *kvm)
> +{
> + return virt_to_phys(kvm->arch.pgd);
> +}
> +#endif
I think all the above could be squashed into a single set of function
implementations with the right macros defined before.
For the KVM levels, you could use something like (we exchanged emails in
private on this topic and how I got to these macros but they need better
checking):
#define KVM_PGTABLE_LEVELS ((KVM_PHYS_SHIFT - PAGE_SHIFT - 5) / (PAGE_SHIFT - 3) + 1)
#define KVM_PREALLOC_LEVELS (CONFIG_ARM64_PGTABLE_LEVELS - KVM_PGTABLE_LEVELS)
#if PGDIR_SHIFT > KVM_PHYS_SHIFT
#define PTRS_PER_S2_PGD (1)
#else
#define PTRS_PER_S2_PGD (1 << (KVM_PHYS_SHIFT - PGDIR_SHIFT))
#endif
--
Catalin
More information about the linux-arm-kernel
mailing list