[RFC PATCH v3 1/2] KVM: arm64: Move CMOs from user_mem_abort to the fault handlers
Alexandru Elisei
alexandru.elisei at arm.com
Thu Apr 8 16:59:12 BST 2021
Hi Yanan,
On 4/8/21 10:23 AM, wangyanan (Y) wrote:
> Hi Alex,
>
> On 2021/4/7 23:31, Alexandru Elisei wrote:
>> Hi Yanan,
>>
>> On 3/26/21 3:16 AM, Yanan Wang wrote:
>>> We currently uniformly perform CMOs of D-cache and I-cache in function
>>> user_mem_abort before calling the fault handlers. If we get concurrent
>>> guest faults (e.g. translation faults, permission faults) or some really
>>> unnecessary guest faults caused by BBM, CMOs for the first vcpu are
>> I can't figure out what BBM means.
> Just as Will has explained, it's the Break-Before-Make rule. When we need to
> replace an old table entry with a new one, we should first invalidate
> the old table entry (Break) before installing the new entry (Make).
Got it, thank you and Will for the explanation.
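So for my own understanding, the map path sequence is roughly the following
(stage2_replace_pte is a made-up name just for illustration, the real logic is
spread across the stage 2 walkers in pgtable.c):

static void stage2_replace_pte(kvm_pte_t *ptep, kvm_pte_t new,
                               struct kvm_s2_mmu *mmu, u64 addr, u32 level)
{
        /* Break: invalidate the old entry... */
        kvm_set_invalid_pte(ptep);
        /* ...and make sure no stale translation survives in the TLBs. */
        kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, addr, level);
        /* Make: only now install the new entry. */
        smp_store_release(ptep, new);
}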
>
>
> And I think this patch mainly introduces benefits in two specific scenarios:
> 1) During VM startup, it will improve the efficiency of handling the page faults
> incurred by vCPUs when initially populating the stage 2 page tables.
> 2) After live migration, the heavy workload is resumed on the destination VM,
> but all the stage 2 page tables need to be rebuilt.
>>> necessary, while the later ones are not.
>>>
>>> By moving CMOs to the fault handlers, we can easily identify conditions
>>> where they are really needed and avoid the unnecessary ones. As performing
>>> CMOs is a time consuming process, especially when flushing a block range,
>>> this solution reduces the load on KVM and improves the efficiency of the
>>> page table code.
>>>
>>> So let's move both the clean of D-cache and the invalidation of I-cache to
>>> the map path, and move only the invalidation of I-cache to the permission
>>> path. Since the original APIs for CMOs in mmu.c are only called in function
>>> user_mem_abort, we now also move them to pgtable.c.
>>>
>>> Signed-off-by: Yanan Wang <wangyanan55 at huawei.com>
>>> ---
>>> arch/arm64/include/asm/kvm_mmu.h | 31 ---------------
>>> arch/arm64/kvm/hyp/pgtable.c | 68 +++++++++++++++++++++++++-------
>>> arch/arm64/kvm/mmu.c | 23 ++---------
>>> 3 files changed, 57 insertions(+), 65 deletions(-)
>>>
>>> diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
>>> index 90873851f677..c31f88306d4e 100644
>>> --- a/arch/arm64/include/asm/kvm_mmu.h
>>> +++ b/arch/arm64/include/asm/kvm_mmu.h
>>> @@ -177,37 +177,6 @@ static inline bool vcpu_has_cache_enabled(struct kvm_vcpu *vcpu)
>>> return (vcpu_read_sys_reg(vcpu, SCTLR_EL1) & 0b101) == 0b101;
>>> }
>>> -static inline void __clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long size)
>>> -{
>>> - void *va = page_address(pfn_to_page(pfn));
>>> -
>>> - /*
>>> - * With FWB, we ensure that the guest always accesses memory using
>>> - * cacheable attributes, and we don't have to clean to PoC when
>>> - * faulting in pages. Furthermore, FWB implies IDC, so cleaning to
>>> - * PoU is not required either in this case.
>>> - */
>>> - if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
>>> - return;
>>> -
>>> - kvm_flush_dcache_to_poc(va, size);
>>> -}
>>> -
>>> -static inline void __invalidate_icache_guest_page(kvm_pfn_t pfn,
>>> - unsigned long size)
>>> -{
>>> - if (icache_is_aliasing()) {
>>> - /* any kind of VIPT cache */
>>> - __flush_icache_all();
>>> - } else if (is_kernel_in_hyp_mode() || !icache_is_vpipt()) {
>>> - /* PIPT or VPIPT at EL2 (see comment in __kvm_tlb_flush_vmid_ipa) */
>>> - void *va = page_address(pfn_to_page(pfn));
>>> -
>>> - invalidate_icache_range((unsigned long)va,
>>> - (unsigned long)va + size);
>>> - }
>>> -}
>>> -
>>> void kvm_set_way_flush(struct kvm_vcpu *vcpu);
>>> void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled);
>>> diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
>>> index 4d177ce1d536..829a34eea526 100644
>>> --- a/arch/arm64/kvm/hyp/pgtable.c
>>> +++ b/arch/arm64/kvm/hyp/pgtable.c
>>> @@ -464,6 +464,43 @@ static int stage2_map_set_prot_attr(enum kvm_pgtable_prot prot,
>>> return 0;
>>> }
>>> +static bool stage2_pte_cacheable(kvm_pte_t pte)
>>> +{
>>> + u64 memattr = pte & KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR;
>>> + return memattr == PAGE_S2_MEMATTR(NORMAL);
>>> +}
>>> +
>>> +static bool stage2_pte_executable(kvm_pte_t pte)
>>> +{
>>> + return !(pte & KVM_PTE_LEAF_ATTR_HI_S2_XN);
>>> +}
>>> +
>>> +static void stage2_flush_dcache(void *addr, u64 size)
>>> +{
>>> + /*
>>> + * With FWB, we ensure that the guest always accesses memory using
>>> + * cacheable attributes, and we don't have to clean to PoC when
>>> + * faulting in pages. Furthermore, FWB implies IDC, so cleaning to
>>> + * PoU is not required either in this case.
>>> + */
>>> + if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
>>> + return;
>>> +
>>> + __flush_dcache_area(addr, size);
>>> +}
>>> +
>>> +static void stage2_invalidate_icache(void *addr, u64 size)
>>> +{
>>> + if (icache_is_aliasing()) {
>>> + /* Flush any kind of VIPT icache */
>>> + __flush_icache_all();
>>> + } else if (is_kernel_in_hyp_mode() || !icache_is_vpipt()) {
>>> + /* PIPT or VPIPT at EL2 */
>>> + invalidate_icache_range((unsigned long)addr,
>>> + (unsigned long)addr + size);
>>> + }
>>> +}
>>> +
>>> static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level,
>>> kvm_pte_t *ptep,
>>> struct stage2_map_data *data)
>>> @@ -495,6 +532,13 @@ static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level,
>>> put_page(page);
>>> }
>>> + /* Perform CMOs before installation of the new PTE */
>>> + if (!kvm_pte_valid(old) || stage2_pte_cacheable(old))
>> I'm not sure why the stage2_pte_cacheable(old) condition is needed.
>>
>> kvm_handle_guest_abort() handles three types of stage 2 data or instruction
>> aborts: translation faults (fault_status == FSC_FAULT), access faults
>> (fault_status == FSC_ACCESS) and permission faults (fault_status == FSC_PERM).
>>
>> Access faults are handled in handle_access_fault(), which means user_mem_abort()
>> handles translation and permission faults.
> Yes, and we are certain that it's a translation fault here in
> stage2_map_walker_try_leaf.
>> The original code did the dcache clean
>> + inval when not a permission fault, which means the CMO was done only on a
>> translation fault. Translation faults mean that the IPA was not mapped, so the old
>> entry will always be invalid. Even if we're coalescing multiple last level leaf
>> entries into a block mapping, the table entry which is replaced is invalid
>> because it's marked as such in stage2_map_walk_table_pre().
>>
>> Is there something I'm missing?
> I originally thought that we could possibly have a translation fault on a valid
> stage 2 table descriptor due to some special cases, and that's the reason the
> stage2_pte_cacheable(old) condition exists, but I can't imagine any scenario
> like this.
>
> I think your above explanation is right, maybe I should just drop that condition.
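If you drop it, I guess the hunk in stage2_map_walker_try_leaf() simply becomes
something like this (just a sketch, untested):

        /* Perform CMOs before installation of the new PTE */
        if (!kvm_pte_valid(old))
                stage2_flush_dcache(__va(phys), granule);

        if (stage2_pte_executable(new))
                stage2_invalidate_icache(__va(phys), granule);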
>>
>>> + stage2_flush_dcache(__va(phys), granule);
>>> +
>>> + if (stage2_pte_executable(new))
>>> + stage2_invalidate_icache(__va(phys), granule);
>> This, together with the stage2_attr_walker() changes below, looks identical to the
>> current code in user_mem_abort(). The executable permission is set on an exec
>> fault (instruction abort not on a stage 2 translation table walk), and as a result
>> of the fault we either need to map a new page here, or relax permissions in
>> kvm_pgtable_stage2_relax_perms() -> stage2_attr_walker() below.
> I agree.
> Do you mean this part of the change is right?
Yes, I was trying to explain that the behaviour of this patch with regard to icache
invalidation is identical to the current behaviour of user_mem_abort()
(without this patch).
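To spell it out, with this patch user_mem_abort() ends up doing roughly the
following (heavily trimmed, from memory):

        if (exec_fault)
                prot |= KVM_PGTABLE_PROT_X;
        [...]
        if (fault_status == FSC_PERM && ...)
                ret = kvm_pgtable_stage2_relax_perms(pgt, fault_ipa, prot);
        else
                ret = kvm_pgtable_stage2_map(pgt, fault_ipa, vma_pagesize,
                                             __pfn_to_phys(pfn), prot,
                                             memcache);

so an exec fault either takes the map path, where stage2_map_walker_try_leaf()
invalidates the icache for the new executable mapping, or the permission path,
where stage2_attr_walker() invalidates it before adding the executable
permission - the same two cases the old code handled directly in
user_mem_abort().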
Thanks,
Alex
>
> Thanks,
> Yanan
>> Thanks,
>>
>> Alex
>>
>>> +
>>> smp_store_release(ptep, new);
>>> get_page(page);
>>> data->phys += granule;
>>> @@ -651,20 +695,6 @@ int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
>>> return ret;
>>> }
>>> -static void stage2_flush_dcache(void *addr, u64 size)
>>> -{
>>> - if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
>>> - return;
>>> -
>>> - __flush_dcache_area(addr, size);
>>> -}
>>> -
>>> -static bool stage2_pte_cacheable(kvm_pte_t pte)
>>> -{
>>> - u64 memattr = pte & KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR;
>>> - return memattr == PAGE_S2_MEMATTR(NORMAL);
>>> -}
>>> -
>>> static int stage2_unmap_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
>>> enum kvm_pgtable_walk_flags flag,
>>> void * const arg)
>>> @@ -743,8 +773,16 @@ static int stage2_attr_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
>>> * but worst-case the access flag update gets lost and will be
>>> * set on the next access instead.
>>> */
>>> - if (data->pte != pte)
>>> + if (data->pte != pte) {
>>> + /*
>>> + * Invalidate the instruction cache before updating
>>> + * if we are going to add the executable permission.
>>> + */
>>> + if (!stage2_pte_executable(*ptep) && stage2_pte_executable(pte))
>>> + stage2_invalidate_icache(kvm_pte_follow(pte),
>>> + kvm_granule_size(level));
>>> WRITE_ONCE(*ptep, pte);
>>> + }
>>> return 0;
>>> }
>>> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
>>> index 77cb2d28f2a4..1eec9f63bc6f 100644
>>> --- a/arch/arm64/kvm/mmu.c
>>> +++ b/arch/arm64/kvm/mmu.c
>>> @@ -609,16 +609,6 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
>>> kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
>>> }
>>> -static void clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long size)
>>> -{
>>> - __clean_dcache_guest_page(pfn, size);
>>> -}
>>> -
>>> -static void invalidate_icache_guest_page(kvm_pfn_t pfn, unsigned long size)
>>> -{
>>> - __invalidate_icache_guest_page(pfn, size);
>>> -}
>>> -
>>> static void kvm_send_hwpoison_signal(unsigned long address, short lsb)
>>> {
>>> send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current);
>>> @@ -882,13 +872,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>>> if (writable)
>>> prot |= KVM_PGTABLE_PROT_W;
>>> - if (fault_status != FSC_PERM && !device)
>>> - clean_dcache_guest_page(pfn, vma_pagesize);
>>> -
>>> - if (exec_fault) {
>>> + if (exec_fault)
>>> prot |= KVM_PGTABLE_PROT_X;
>>> - invalidate_icache_guest_page(pfn, vma_pagesize);
>>> - }
>>> if (device)
>>> prot |= KVM_PGTABLE_PROT_DEVICE;
>>> @@ -1144,10 +1129,10 @@ int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
>>> trace_kvm_set_spte_hva(hva);
>>> /*
>>> - * We've moved a page around, probably through CoW, so let's treat it
>>> - * just like a translation fault and clean the cache to the PoC.
>>> + * We've moved a page around, probably through CoW, so let's treat
>>> + * it just like a translation fault and the map handler will clean
>>> + * the cache to the PoC.
>>> */
>>> - clean_dcache_guest_page(pfn, PAGE_SIZE);
>>> handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &pfn);
>>> return 0;
>>> }