[PATCH] kvm: arm64: Enable hardware updates of the Access Flag for Stage 2 page tables
Christoffer Dall
christoffer.dall at linaro.org
Mon May 9 08:33:10 PDT 2016
On Wed, Apr 13, 2016 at 05:57:37PM +0100, Catalin Marinas wrote:
> The ARMv8.1 architecture extensions introduce support for hardware
> updates of the access and dirty information in page table entries. With
> VTCR_EL2.HA enabled (bit 21), when the CPU accesses an IPA with the
> PTE_AF bit cleared in the stage 2 page table, instead of raising an
> Access Flag fault to EL2, the CPU sets bit 10 in the page table entry
> itself. To ensure that kernel modifications to the page table do not
> inadvertently revert a bit set by hardware updates, certain Stage 2
> software pte/pmd operations must be performed atomically.
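
For illustration, a minimal sketch (not part of the patch) of the
lost-update problem this atomicity requirement avoids; cmpxchg() and
READ_ONCE() stand in for the ldxr/stxr loops used further down:

	/*
	 * Non-atomic read-modify-write: if the CPU sets the AF bit in
	 * *ptep between the load and the store below, the store silently
	 * clears it again and the young information is lost.
	 */
	static void s2_set_readonly_racy(pte_t *ptep)
	{
		pteval_t old = pte_val(*ptep);		/* load */
		/* ... hardware may set PTE_AF in *ptep here ... */
		pte_val(*ptep) = (old & ~PTE_S2_RDWR) | PTE_S2_RDONLY;
	}

	/* Atomic variant: retry until no concurrent update intervened. */
	static void s2_set_readonly_atomic(pte_t *ptep)
	{
		pteval_t old, new;

		do {
			old = READ_ONCE(pte_val(*ptep));
			new = (old & ~PTE_S2_RDWR) | PTE_S2_RDONLY;
		} while (cmpxchg(&pte_val(*ptep), old, new) != old);
	}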
>
> The main user of the AF bit is the kvm_age_hva() mechanism. The
> kvm_age_hva_handler() function performs a "test and clear young" action
> on the pte/pmd. This needs to be atomic with respect to automatic hardware
> updates of the AF bit. Since the AF bit is in the same position for both
> Stage 1 and Stage 2, the patch reuses the existing
> ptep_test_and_clear_young() functionality if
> __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG is defined. Otherwise, the
> existing pte_young/pte_mkold mechanism is preserved.
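
For reference, the shared bit that makes this reuse safe: the Access Flag
is bit 10 of the descriptor at both stages (from the arm64
pgtable-hwdef.h definitions), so the stage 1 helper applied to a stage 2
entry examines and clears exactly the right bit:

	#define PTE_AF		(_AT(pteval_t, 1) << 10)	/* Access Flag */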
>
> The kvm_set_s2pte_readonly() function (and the corresponding pmd
> equivalent) has to perform atomic modifications in order to avoid a race
> with updates of the AF bit. The arm64 implementation has been rewritten
> using exclusives.
>
> Currently, kvm_set_s2pte_writable() (and its pmd equivalent) takes a pointer
> argument and modify the pte/pmd in place. However, these functions are
> only used on local variables rather than actual page table entries, so
> it makes more sense to follow the pte_mkwrite() approach for stage 1
> attributes. The change to kvm_s2pte_mkwrite() makes it clear that these
> functions do not modify the actual page table entries.
so if one CPU takes a permission fault and is in the process of updating
the page table, what prevents another CPU from setting the access flag
(on a read, done by HW) on that entry between us reading the old pte and
replacing it with the new pte?
Don't we lose the AF information in that case too?
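
Concretely, the window I have in mind in stage2_set_pte() (simplified
from the existing code):

	old_pte = *pte;			/* read the old entry */

	/* <-- the HW walker on another CPU can set PTE_AF in *pte here --> */

	kvm_set_pte(pte, new_pte);	/* unconditional write: an AF bit set
					   in the window goes out with the
					   old entry */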
>
> The (pte|pmd)_mkyoung() calls on Stage 2 entries (which set the AF bit
> explicitly) do not need to be modified, since hardware updates of the
> dirty status are not supported by KVM, so there is no possibility of
> losing such information.
>
> Signed-off-by: Catalin Marinas <catalin.marinas at arm.com>
> Cc: Christoffer Dall <christoffer.dall at linaro.org>
> Cc: Marc Zyngier <marc.zyngier at arm.com>
> Cc: Paolo Bonzini <pbonzini at redhat.com>
> ---
>
> After some digging through the KVM code, I concluded that hardware DBM
> (dirty bit management) support is not feasible for Stage 2. A potential
> user would be dirty logging, but this requires a different bitmap exposed
> to Qemu and, to avoid races, the stage 2 mappings need to be mapped
> read-only on clean, writable on fault. This assumption simplifies the
> hardware Stage 2 AF support.
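
For context, the write-fault path in user_mem_abort() below is what
already harvests the dirty state under that scheme, which is why a
HW-managed dirty bit would add nothing for stage 2:

	if (writable) {
		new_pte = kvm_s2pte_mkwrite(new_pte);
		kvm_set_pfn_dirty(pfn);
		mark_page_dirty(kvm, gfn);	/* dirty bitmap read by Qemu */
	}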
>
> Tested on a model: handle_access_fault() is no longer called when the
> hardware AF is present and enabled. Triggering the page aging requires
> enabling swap and applying light memory pressure.
>
> arch/arm/include/asm/kvm_mmu.h | 10 +++++----
> arch/arm/kvm/mmu.c | 46 +++++++++++++++++++++++++---------------
> arch/arm64/include/asm/kvm_arm.h | 2 ++
> arch/arm64/include/asm/kvm_mmu.h | 27 +++++++++++++++++------
> arch/arm64/include/asm/pgtable.h | 13 ++++++++----
> arch/arm64/kvm/hyp/s2-setup.c | 8 +++++++
> 6 files changed, 74 insertions(+), 32 deletions(-)
>
> diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
> index da44be9db4fa..bff1ca42a97e 100644
> --- a/arch/arm/include/asm/kvm_mmu.h
> +++ b/arch/arm/include/asm/kvm_mmu.h
> @@ -105,14 +105,16 @@ static inline void kvm_clean_pte(pte_t *pte)
> clean_pte_table(pte);
> }
>
> -static inline void kvm_set_s2pte_writable(pte_t *pte)
> +static inline pte_t kvm_s2pte_mkwrite(pte_t pte)
> {
> - pte_val(*pte) |= L_PTE_S2_RDWR;
> + pte_val(pte) |= L_PTE_S2_RDWR;
> + return pte;
> }
>
> -static inline void kvm_set_s2pmd_writable(pmd_t *pmd)
> +static inline pmd_t kvm_s2pmd_mkwrite(pmd_t pmd)
> {
> - pmd_val(*pmd) |= L_PMD_S2_RDWR;
> + pmd_val(pmd) |= L_PMD_S2_RDWR;
> + return pmd;
> }
>
> static inline void kvm_set_s2pte_readonly(pte_t *pte)
> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
> index 58dbd5c439df..b6736ea85722 100644
> --- a/arch/arm/kvm/mmu.c
> +++ b/arch/arm/kvm/mmu.c
> @@ -955,6 +955,27 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
> return 0;
> }
>
> +#ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
> +static int stage2_ptep_test_and_clear_young(pte_t *pte)
> +{
> + if (pte_young(*pte)) {
> + *pte = pte_mkold(*pte);
> + return 1;
> + }
> + return 0;
> +}
> +#else
> +static int stage2_ptep_test_and_clear_young(pte_t *pte)
> +{
> + return __ptep_test_and_clear_young(pte);
> +}
> +#endif
> +
> +static int stage2_pmdp_test_and_clear_young(pmd_t *pmd)
> +{
> + return stage2_ptep_test_and_clear_young((pte_t *)pmd);
> +}
> +
> /**
> * kvm_phys_addr_ioremap - map a device range to guest IPA
> *
> @@ -978,7 +999,7 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
> pte_t pte = pfn_pte(pfn, PAGE_S2_DEVICE);
>
> if (writable)
> - kvm_set_s2pte_writable(&pte);
> + pte = kvm_s2pte_mkwrite(pte);
>
> ret = mmu_topup_memory_cache(&cache, KVM_MMU_CACHE_MIN_PAGES,
> KVM_NR_MEM_OBJS);
> @@ -1320,7 +1341,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> pmd_t new_pmd = pfn_pmd(pfn, mem_type);
> new_pmd = pmd_mkhuge(new_pmd);
> if (writable) {
> - kvm_set_s2pmd_writable(&new_pmd);
> + new_pmd = kvm_s2pmd_mkwrite(new_pmd);
> kvm_set_pfn_dirty(pfn);
> }
> coherent_cache_guest_page(vcpu, pfn, PMD_SIZE, fault_ipa_uncached);
> @@ -1329,7 +1350,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> pte_t new_pte = pfn_pte(pfn, mem_type);
>
> if (writable) {
> - kvm_set_s2pte_writable(&new_pte);
> + new_pte = kvm_s2pte_mkwrite(new_pte);
> kvm_set_pfn_dirty(pfn);
> mark_page_dirty(kvm, gfn);
> }
> @@ -1348,6 +1369,8 @@ out_unlock:
> * Resolve the access fault by making the page young again.
> * Note that because the faulting entry is guaranteed not to be
> * cached in the TLB, we don't need to invalidate anything.
> + * Only the HW Access Flag updates are supported for Stage 2 (no DBM),
> + * so there is no need for atomic (pte|pmd)_mkyoung operations.
> */
> static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
> {
> @@ -1588,25 +1611,14 @@ static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, void *data)
> if (!pmd || pmd_none(*pmd)) /* Nothing there */
> return 0;
>
> - if (kvm_pmd_huge(*pmd)) { /* THP, HugeTLB */
> - if (pmd_young(*pmd)) {
> - *pmd = pmd_mkold(*pmd);
> - return 1;
> - }
> -
> - return 0;
> - }
> + if (kvm_pmd_huge(*pmd)) /* THP, HugeTLB */
> + return stage2_pmdp_test_and_clear_young(pmd);
>
> pte = pte_offset_kernel(pmd, gpa);
> if (pte_none(*pte))
> return 0;
>
> - if (pte_young(*pte)) {
> - *pte = pte_mkold(*pte); /* Just a page... */
> - return 1;
> - }
> -
> - return 0;
> + return stage2_ptep_test_and_clear_young(pte);
> }
>
> static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, void *data)
> diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h
> index 4150fd8bae01..bbed53c2735a 100644
> --- a/arch/arm64/include/asm/kvm_arm.h
> +++ b/arch/arm64/include/asm/kvm_arm.h
> @@ -110,6 +110,8 @@
>
> /* VTCR_EL2 Registers bits */
> #define VTCR_EL2_RES1 (1 << 31)
> +#define VTCR_EL2_HD (1 << 22)
> +#define VTCR_EL2_HA (1 << 21)
> #define VTCR_EL2_PS_MASK (7 << 16)
> #define VTCR_EL2_TG0_MASK (1 << 14)
> #define VTCR_EL2_TG0_4K (0 << 14)
> diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
> index 22732a5e3119..28bdf9ddaa0c 100644
> --- a/arch/arm64/include/asm/kvm_mmu.h
> +++ b/arch/arm64/include/asm/kvm_mmu.h
> @@ -121,19 +121,32 @@ static inline void kvm_clean_pmd_entry(pmd_t *pmd) {}
> static inline void kvm_clean_pte(pte_t *pte) {}
> static inline void kvm_clean_pte_entry(pte_t *pte) {}
>
> -static inline void kvm_set_s2pte_writable(pte_t *pte)
> +static inline pte_t kvm_s2pte_mkwrite(pte_t pte)
> {
> - pte_val(*pte) |= PTE_S2_RDWR;
> + pte_val(pte) |= PTE_S2_RDWR;
> + return pte;
> }
>
> -static inline void kvm_set_s2pmd_writable(pmd_t *pmd)
> +static inline pmd_t kvm_s2pmd_mkwrite(pmd_t pmd)
> {
> - pmd_val(*pmd) |= PMD_S2_RDWR;
> + pmd_val(pmd) |= PMD_S2_RDWR;
> + return pmd;
> }
>
> static inline void kvm_set_s2pte_readonly(pte_t *pte)
> {
> - pte_val(*pte) = (pte_val(*pte) & ~PTE_S2_RDWR) | PTE_S2_RDONLY;
> + pteval_t pteval;
> + unsigned long tmp;
> +
> + asm volatile("// kvm_set_s2pte_readonly\n"
> + " prfm pstl1strm, %2\n"
> + "1: ldxr %0, %2\n"
> + " and %0, %0, %3 // clear PTE_S2_RDWR\n"
> + " orr %0, %0, %4 // set PTE_S2_RDONLY\n"
> + " stxr %w1, %0, %2\n"
> + " cbnz %w1, 1b\n"
> + : "=&r" (pteval), "=&r" (tmp), "+Q" (pte_val(*pte))
So "+Q" means "pass the memory location of the value, and note that the
content (not the address itself) can be updated by the assembly code"?
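
If so, a minimal stand-alone example of the same pattern (hypothetical
helper, assuming GCC inline asm on arm64) would be:

	/*
	 * "+Q": operand %2 is the memory word itself, addressed through a
	 * single base register, and '+' marks it read-write so the
	 * compiler knows the asm may update it and won't cache *p across
	 * the statement.
	 */
	static inline void set_mask_atomic(unsigned long *p, unsigned long mask)
	{
		unsigned long val, tmp;

		asm volatile(
		"1:	ldxr	%0, %2\n"
		"	orr	%0, %0, %3\n"
		"	stxr	%w1, %0, %2\n"
		"	cbnz	%w1, 1b\n"
		: "=&r" (val), "=&r" (tmp), "+Q" (*p)
		: "r" (mask));
	}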
> + : "L" (~PTE_S2_RDWR), "L" (PTE_S2_RDONLY));
> }
>
> static inline bool kvm_s2pte_readonly(pte_t *pte)
> @@ -143,12 +156,12 @@ static inline bool kvm_s2pte_readonly(pte_t *pte)
>
> static inline void kvm_set_s2pmd_readonly(pmd_t *pmd)
> {
> - pmd_val(*pmd) = (pmd_val(*pmd) & ~PMD_S2_RDWR) | PMD_S2_RDONLY;
> + kvm_set_s2pte_readonly((pte_t *)pmd);
> }
>
> static inline bool kvm_s2pmd_readonly(pmd_t *pmd)
> {
> - return (pmd_val(*pmd) & PMD_S2_RDWR) == PMD_S2_RDONLY;
> + return kvm_s2pte_readonly((pte_t *)pmd);
> }
>
>
> diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
> index 989fef16d461..8d29e6eb33f7 100644
> --- a/arch/arm64/include/asm/pgtable.h
> +++ b/arch/arm64/include/asm/pgtable.h
> @@ -530,14 +530,12 @@ static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
> * Atomic pte/pmd modifications.
> */
> #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
> -static inline int ptep_test_and_clear_young(struct vm_area_struct *vma,
> - unsigned long address,
> - pte_t *ptep)
> +static inline int __ptep_test_and_clear_young(pte_t *ptep)
> {
> pteval_t pteval;
> unsigned int tmp, res;
>
> - asm volatile("// ptep_test_and_clear_young\n"
> + asm volatile("// __ptep_test_and_clear_young\n"
> " prfm pstl1strm, %2\n"
> "1: ldxr %0, %2\n"
> " ubfx %w3, %w0, %5, #1 // extract PTE_AF (young)\n"
> @@ -550,6 +548,13 @@ static inline int ptep_test_and_clear_young(struct vm_area_struct *vma,
> return res;
> }
>
> +static inline int ptep_test_and_clear_young(struct vm_area_struct *vma,
> + unsigned long address,
> + pte_t *ptep)
> +{
> + return __ptep_test_and_clear_young(ptep);
> +}
> +
> #ifdef CONFIG_TRANSPARENT_HUGEPAGE
> #define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
> static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
> diff --git a/arch/arm64/kvm/hyp/s2-setup.c b/arch/arm64/kvm/hyp/s2-setup.c
> index 5a9f3bf542b0..110267af0b83 100644
> --- a/arch/arm64/kvm/hyp/s2-setup.c
> +++ b/arch/arm64/kvm/hyp/s2-setup.c
> @@ -33,6 +33,14 @@ void __hyp_text __init_stage2_translation(void)
> val |= (read_sysreg(id_aa64mmfr0_el1) & 7) << 16;
>
> /*
> + * Check the availability of Hardware Access Flag / Dirty Bit
> + * Management in ID_AA64MMFR1_EL1 and enable the feature in VTCR_EL2.
> + */
> + tmp = (read_sysreg(id_aa64mmfr1_el1) >> ID_AA64MMFR1_HADBS_SHIFT) & 0xf;
> + if (IS_ENABLED(CONFIG_ARM64_HW_AFDBM) && tmp)
> + val |= VTCR_EL2_HA;
> +
> + /*
> * Read the VMIDBits bits from ID_AA64MMFR1_EL1 and set the VS
> * bit in VTCR_EL2.
> */