[PATCH 1/2] arm64: KVM: Implement 48 VA support for KVM EL2 and Stage-2

Tue Sep 30 05:39:47 PDT 2014

Hi Christoffer,

On Thu, Sep 25, 2014 at 08:42:53PM +0100, Christoffer Dall wrote:
> diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
> index 7796051..048f37f 100644
> --- a/arch/arm/kvm/arm.c
> +++ b/arch/arm/kvm/arm.c
> @@ -409,7 +409,7 @@ static void update_vttbr(struct kvm *kvm)
>         kvm_next_vmid++;
>
>         /* update vttbr to be used with the new vmid */
> -       pgd_phys = virt_to_phys(kvm->arch.pgd);
> +       pgd_phys = kvm_get_hwpgd(kvm);
>         BUG_ON(pgd_phys & ~VTTBR_BADDR_MASK);
>         vmid = ((u64)(kvm->arch.vmid) << VTTBR_VMID_SHIFT) & VTTBR_VMID_MASK;
>         kvm->arch.vttbr = pgd_phys | vmid;
> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
> index bb06f76..4532f5f 100644
> --- a/arch/arm/kvm/mmu.c
> +++ b/arch/arm/kvm/mmu.c
> @@ -33,6 +33,7 @@
>
>  extern char  __hyp_idmap_text_start[], __hyp_idmap_text_end[];
>
> +static int kvm_prealloc_levels;

Do you have plans to determine this dynamically? In this patch, I can
only see it being assigned to KVM_PREALLOC_LEVELS. I guess you could
simply use the macro.

> @@ -521,6 +554,7 @@ int create_hyp_io_mappings(void *from, void *to, phys_addr_t phys_addr)
>   */
>  int kvm_alloc_stage2_pgd(struct kvm *kvm)
>  {
> +       int ret;
>         pgd_t *pgd;
>
>         if (kvm->arch.pgd != NULL) {
> @@ -533,9 +567,17 @@ int kvm_alloc_stage2_pgd(struct kvm *kvm)
>                 return -ENOMEM;
>
>         memset(pgd, 0, PTRS_PER_S2_PGD * sizeof(pgd_t));

I don't think you need to allocate a full page here (not shown here in
the diff context) but it may not be trivial since you use get_page() to
keep track of when page tables could be freed (do you do this for the
pgd as well?).

> +
> +       ret = kvm_prealloc_hwpgd(kvm, pgd);
> +       if (ret)
> +               goto out;
> +
>         kvm_clean_pgd(pgd);
>         kvm->arch.pgd = pgd;
> -
> +       ret = 0;
> +out:
> +       if (ret)
> +               free_pages((unsigned long)pgd, S2_PGD_ORDER);
>         return 0;
>  }

Does this always return 0, even in case of error? Does it look better
as:

        ret = kvm_prealloc_hwpgd(kvm, pgd);
        if (ret)
                goto out;

        kvm_clean_pgd(pgd);
        kvm->arch.pgd = pgd;

        return 0;

out:
        free_pages((unsigned long)pgd, S2_PGD_ORDER);
        return ret;

> @@ -572,19 +614,36 @@ void kvm_free_stage2_pgd(struct kvm *kvm)
>                 return;
>
>         unmap_stage2_range(kvm, 0, KVM_PHYS_SIZE);
> +       kvm_free_hwpgd(kvm);
>         free_pages((unsigned long)kvm->arch.pgd, S2_PGD_ORDER);
>         kvm->arch.pgd = NULL;
>  }
>
> -static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
> +static pud_t *stage2_get_pud(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
>                              phys_addr_t addr)
>  {
>         pgd_t *pgd;
>         pud_t *pud;
> -       pmd_t *pmd;
>
>         pgd = kvm->arch.pgd + pgd_index(addr);
> -       pud = pud_offset(pgd, addr);
> +       if (pgd_none(*pgd)) {
> +               if (!cache)
> +                       return NULL;
> +               pud = mmu_memory_cache_alloc(cache);
> +               pgd_populate(NULL, pgd, pud);
> +               get_page(virt_to_page(pgd));
> +       }
> +
> +       return pud_offset(pgd, addr);
> +}

This function looks correct, though we would not expect pgd_none() to
ever be true as we pre-populate it. You could but a WARN_ON or something
until we actually have hardware 4-levels stage 2 (though I don't see a
need yet).

> +static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
> +                            phys_addr_t addr)
> +{
> +       pud_t *pud;
> +       pmd_t *pmd;
> +
> +       pud = stage2_get_pud(kvm, cache, addr);
>         if (pud_none(*pud)) {
>                 if (!cache)
>                         return NULL;
> @@ -630,7 +689,7 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
>         pmd_t *pmd;
>         pte_t *pte, old_pte;
>
> -       /* Create stage-2 page table mapping - Level 1 */
> +       /* Create stage-2 page table mapping - Levels 0 and 1 */
>         pmd = stage2_get_pmd(kvm, cache, addr);
>         if (!pmd) {
>                 /*
> @@ -688,7 +747,8 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
>         for (addr = guest_ipa; addr < end; addr += PAGE_SIZE) {
>                 pte_t pte = pfn_pte(pfn, PAGE_S2_DEVICE);
>
> -               ret = mmu_topup_memory_cache(&cache, 2, 2);
> +               ret = mmu_topup_memory_cache(&cache, KVM_MMU_CACHE_MIN_PAGES,
> +                                               KVM_MMU_CACHE_MIN_PAGES);

BTW, why do you need to preallocate the pages in KVM's own cache? Would
GFP_ATOMIC not work within the kvm->mmu_lock region?

> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> index fd4e81a..0f3e0a9 100644
> --- a/arch/arm64/Kconfig
> +++ b/arch/arm64/Kconfig
> @@ -200,7 +200,6 @@ config ARM64_VA_BITS_42
>
>  config ARM64_VA_BITS_48
>         bool "48-bit"
> -       depends on BROKEN

I'm ok with removing BROKEN here but I think at the same time we need to
disable the SMMU driver which has similar issues with (not) supporting
4-levels page tables for stage 2 (stage 1 should be fine).

> diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
> index a030d16..313c6f9 100644
> --- a/arch/arm64/include/asm/kvm_mmu.h
> +++ b/arch/arm64/include/asm/kvm_mmu.h
[...]
> @@ -118,13 +128,123 @@ static inline bool kvm_page_empty(void *ptr)
>  }
>
>  #define kvm_pte_table_empty(ptep) kvm_page_empty(ptep)
> -#ifndef CONFIG_ARM64_64K_PAGES
> -#define kvm_pmd_table_empty(pmdp) kvm_page_empty(pmdp)
> -#else
> +#if CONFIG_ARM64_PGTABLE_LEVELS == 2
>  #define kvm_pmd_table_empty(pmdp) (0)
> -#endif
>  #define kvm_pud_table_empty(pudp) (0)
> +#elif CONFIG_ARM64_PGTABLE_LEVELS == 3
> +#define kvm_pmd_table_empty(pmdp) kvm_page_empty(pmdp)
> +#define kvm_pud_table_empty(pudp) (0)
> +#elif CONFIG_ARM64_PGTABLE_LEVELS == 4
> +#define kvm_pmd_table_empty(pmdp) kvm_page_empty(pmdp)
> +#define kvm_pud_table_empty(pudp) kvm_page_empty(pudp)
> +#endif

Not sure whether it's clearer as (up to you):

#ifdef __PAGETABLE_PMD_FOLDED
#define kvm_pmd_table_empty(pmdp)       (0)
#else
#define kvm_pmd_table_empty(pmdp)       kvm_page_empty(pmdp)
#endif

#ifdef __PAGETABLE_PUD_FOLDED
#define kvm_pud_table_empty(pudp)       (0)
#else
#define kvm_pud_table_empty(pudp)       kvm_page_empty(pudp)
#endif

> +
> +/**
> + * kvm_prealloc_hwpgd - allocate inital table for VTTBR
> + * @kvm:       The KVM struct pointer for the VM.
> + * @pgd:       The kernel pseudo pgd

"hwpgd" doesn't sound like the right name since it's actually a fake pgd
that it is allocating. Maybe "swpgd"?

> + *
> + * When the kernel uses more levels of page tables than the guest, we allocate
> + * a fake PGD and pre-populate it to point to the next-level page table, which
> + * will be the real initial page table pointed to by the VTTBR.
> + *
> + * When KVM_PREALLOC_PMD is defined, we allocate a single page for the PMD and
> + * the kernel will use folded pud.  When KVM_PREALLOC_PUD is defined, we
> + * allocate 2 consecutive PUD pages.
> + */
> +#if defined(CONFIG_ARM64_64K_PAGES) && CONFIG_ARM64_PGTABLE_LEVELS == 3
> +#define KVM_PREALLOC_LEVELS    2
> +#define PTRS_PER_S2_PGD                1
> +#define S2_PGD_ORDER           get_order(PTRS_PER_S2_PGD * sizeof(pgd_t))
> +
> +
> +static inline int kvm_prealloc_hwpgd(struct kvm *kvm, pgd_t *pgd)
> +{
> +       pud_t *pud;
> +       pmd_t *pmd;
> +
> +       pud = pud_offset(pgd, 0);
> +       pmd = (pmd_t *)__get_free_pages(GFP_KERNEL, 0);
> +
> +       if (!pmd)
> +               return -ENOMEM;
> +       memset(pmd, 0, PAGE_SIZE);

You could use __GFP_ZERO.

> +       pud_populate(NULL, pud, pmd);
> +       get_page(virt_to_page(pud));
> +
> +       return 0;
> +}
>
> +static inline void kvm_free_hwpgd(struct kvm *kvm)
> +{
> +       pgd_t *pgd = kvm->arch.pgd;
> +       pud_t *pud = pud_offset(pgd, 0);
> +       pmd_t *pmd = pmd_offset(pud, 0);
> +       free_pages((unsigned long)pmd, 0);
> +       put_page(virt_to_page(pud));
> +}
> +
> +static inline phys_addr_t kvm_get_hwpgd(struct kvm *kvm)
> +{
> +       pgd_t *pgd = kvm->arch.pgd;
> +       pud_t *pud = pud_offset(pgd, 0);
> +       pmd_t *pmd = pmd_offset(pud, 0);
> +       return virt_to_phys(pmd);
> +
> +}
> +#elif defined(CONFIG_ARM64_4K_PAGES) && CONFIG_ARM64_PGTABLE_LEVELS == 4
> +#define KVM_PREALLOC_LEVELS    1
> +#define PTRS_PER_S2_PGD                2
> +#define S2_PGD_ORDER           get_order(PTRS_PER_S2_PGD * sizeof(pgd_t))
> +
> +static inline int kvm_prealloc_hwpgd(struct kvm *kvm, pgd_t *pgd)
> +{
> +       pud_t *pud;
> +
> +       pud = (pud_t *)__get_free_pages(GFP_KERNEL, 1);
> +       if (!pud)
> +               return -ENOMEM;
> +       memset(pud, 0, 2 * PAGE_SIZE);
> +       pgd_populate(NULL, pgd, pud);
> +       pgd_populate(NULL, pgd + 1, pud + PTRS_PER_PUD);
> +       get_page(virt_to_page(pgd));
> +       get_page(virt_to_page(pgd));
> +
> +       return 0;
> +}
> +
> +static inline void kvm_free_hwpgd(struct kvm *kvm)
> +{
> +       pgd_t *pgd = kvm->arch.pgd;
> +       pud_t *pud = pud_offset(pgd, 0);
> +       free_pages((unsigned long)pud, 1);
> +       put_page(virt_to_page(pgd));
> +       put_page(virt_to_page(pgd));
> +}
> +
> +static inline phys_addr_t kvm_get_hwpgd(struct kvm *kvm)
> +{
> +       pgd_t *pgd = kvm->arch.pgd;
> +       pud_t *pud = pud_offset(pgd, 0);
> +       return virt_to_phys(pud);
> +}
> +#else
> +#define KVM_PREALLOC_LEVELS    0
> +#define PTRS_PER_S2_PGD                (1 << (KVM_PHYS_SHIFT - PGDIR_SHIFT))
> +#define S2_PGD_ORDER           get_order(PTRS_PER_S2_PGD * sizeof(pgd_t))
> +
> +static inline int kvm_prealloc_hwpgd(struct kvm *kvm, pgd_t *pgd)
> +{
> +       return 0;
> +}
> +
> +static inline void kvm_free_hwpgd(struct kvm *kvm) { }
> +
> +static inline phys_addr_t kvm_get_hwpgd(struct kvm *kvm)
> +{
> +       return virt_to_phys(kvm->arch.pgd);
> +}
> +#endif

I think all the above could be squashed into a single set of function
implementations with the right macros defined before.

For the KVM levels, you could use something like (we exchanged emails in
private on this topic and how I got to these macros but they need better
checking):

#define KVM_PGTABLE_LEVELS      ((KVM_PHYS_SHIFT - PAGE_SHIFT - 5) / (PAGE_SHIFT - 3) + 1)
#define KVM_PREALLOC_LEVELS     (CONFIG_ARM64_PGTABLE_LEVELS - KVM_PGTABLE_LEVELS)
#if PGDIR_SHIFT > KVM_PHYS_SHIFT
#define PTRS_PER_S2_PGD		(1)
#else
#define PTRS_PER_S2_PGD		(1 << (KVM_PHYS_SHIFT - PGDIR_SHIFT))
#endif

--
Catalin