[PATCH v2 2/2] KVM/arm: enable enhanced armv7 fp/simd lazy switch
Mario Smarduch
m.smarduch at samsung.com
Tue Oct 20 18:10:59 PDT 2015
On 10/20/2015 12:24 AM, Christoffer Dall wrote:
> On Mon, Oct 19, 2015 at 04:25:04PM -0700, Mario Smarduch wrote:
>>
>>
>> On 10/19/2015 3:14 AM, Christoffer Dall wrote:
>>> On Sat, Sep 26, 2015 at 04:43:29PM -0700, Mario Smarduch wrote:
>>>> This patch enhances current lazy vfp/simd hardware switch. In addition to
>>>> current lazy switch, it tracks vfp/simd hardware state with a vcpu
>>>> lazy flag.
>>>>
>>>> vcpu lazy flag is set on guest access and trap to vfp/simd hardware switch
>>>> handler. On vm-enter if lazy flag is set skip trap enable and saving
>>>> host fpexc. On vm-exit if flag is set skip hardware context switch
>>>> and return to host with guest context.
>>>>
>>>> On vcpu_put check if vcpu lazy flag is set, and execute a hardware context
>>>> switch to restore host.
>>>>
>>>> Signed-off-by: Mario Smarduch <m.smarduch at samsung.com>
>>>> ---
>>>> arch/arm/include/asm/kvm_asm.h | 1 +
>>>> arch/arm/kvm/arm.c | 17 ++++++++++++
>>>> arch/arm/kvm/interrupts.S | 60 +++++++++++++++++++++++++++++++-----------
>>>> arch/arm/kvm/interrupts_head.S | 12 ++++++---
>>>> 4 files changed, 71 insertions(+), 19 deletions(-)
>>>>
>>>> diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h
>>>> index 194c91b..4b45d79 100644
>>>> --- a/arch/arm/include/asm/kvm_asm.h
>>>> +++ b/arch/arm/include/asm/kvm_asm.h
>>>> @@ -97,6 +97,7 @@ extern char __kvm_hyp_code_end[];
>>>> extern void __kvm_flush_vm_context(void);
>>>> extern void __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa);
>>>> extern void __kvm_tlb_flush_vmid(struct kvm *kvm);
>>>> +extern void __kvm_restore_host_vfp_state(struct kvm_vcpu *vcpu);
>>>>
>>>> extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu);
>>>> #endif
>>>> diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
>>>> index ce404a5..79f49c7 100644
>>>> --- a/arch/arm/kvm/arm.c
>>>> +++ b/arch/arm/kvm/arm.c
>>>> @@ -105,6 +105,20 @@ void kvm_arch_check_processor_compat(void *rtn)
>>>> *(int *)rtn = 0;
>>>> }
>>>>
>>>> +/**
>>>> + * kvm_switch_fp_regs() - switch guest/host VFP/SIMD registers
>>>> + * @vcpu: pointer to vcpu structure.
>>>> + *
>>>
>>> nit: stray blank line
>> ok
>>>
>>>> + */
>>>> +static void kvm_switch_fp_regs(struct kvm_vcpu *vcpu)
>>>> +{
>>>> +#ifdef CONFIG_ARM
>>>> + if (vcpu->arch.vfp_lazy == 1) {
>>>> + kvm_call_hyp(__kvm_restore_host_vfp_state, vcpu);
>>>
>>> why do you have to do this in HYP mode ?
>> Calling it directly works fine. I moved the function outside the hyp start/end
>> range in interrupts.S. I wasn't thinking outside the box - I just figured they
>> should all be hyp calls.
>>
>>>
>>>> + vcpu->arch.vfp_lazy = 0;
>>>> + }
>>>> +#endif
>>>
>>> we've tried to put stuff like this in header files to avoid the ifdefs
>>> so far. Could that be done here as well?
>>
>> That was on my to-do list, but I didn't get around to it.
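To make that concrete for v3, something along these lines could move into the
arch headers so the shared arm.c stays ifdef-free - just a sketch, exact header
placement and naming open to suggestions:

/* arch/arm/include/asm/kvm_host.h -- sketch only */
static inline void kvm_switch_fp_regs(struct kvm_vcpu *vcpu)
{
	if (vcpu->arch.vfp_lazy == 1) {
		/* kvm_call_hyp() here, or a direct call as discussed below */
		kvm_call_hyp(__kvm_restore_host_vfp_state, vcpu);
		vcpu->arch.vfp_lazy = 0;
	}
}

/* arch/arm64/include/asm/kvm_host.h -- empty until the arm64 patch
 * adds its own lazy switch */
static inline void kvm_switch_fp_regs(struct kvm_vcpu *vcpu) {}

Then kvm_arch_vcpu_put() in arm.c can call kvm_switch_fp_regs(vcpu) without any
#ifdef CONFIG_ARM.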
>>>
>>>> +}
>>>>
>>>> /**
>>>> * kvm_arch_init_vm - initializes a VM data structure
>>>> @@ -295,6 +309,9 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
>>>>
>>>> void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
>>>> {
>>>> + /* Check if Guest accessed VFP registers */
>>>
>>> misleading comment: this function does more than checking
>> Yep sure does, will change.
>>>
>>>> + kvm_switch_fp_regs(vcpu);
>>>> +
>>>> /*
>>>> * The arch-generic KVM code expects the cpu field of a vcpu to be -1
>>>> * if the vcpu is no longer assigned to a cpu. This is used for the
>>>> diff --git a/arch/arm/kvm/interrupts.S b/arch/arm/kvm/interrupts.S
>>>> index 900ef6d..6d98232 100644
>>>> --- a/arch/arm/kvm/interrupts.S
>>>> +++ b/arch/arm/kvm/interrupts.S
>>>> @@ -96,6 +96,29 @@ ENTRY(__kvm_flush_vm_context)
>>>> bx lr
>>>> ENDPROC(__kvm_flush_vm_context)
>>>>
>>>> +/**
>>>> + * void __kvm_restore_host_vfp_state(struct vcpu *vcpu) - Executes a lazy
>>>> + * fp/simd switch, saves the guest, restores host.
>>>> + *
>>>
>>> nit: stray blank line
>> ok.
>>>
>>>> + */
>>>> +ENTRY(__kvm_restore_host_vfp_state)
>>>> +#ifdef CONFIG_VFPv3
>>>> + push {r3-r7}
>>>> +
>>>> + add r7, r0, #VCPU_VFP_GUEST
>>>> + store_vfp_state r7
>>>> +
>>>> + add r7, r0, #VCPU_VFP_HOST
>>>> + ldr r7, [r7]
>>>> + restore_vfp_state r7
>>>> +
>>>> + ldr r3, [vcpu, #VCPU_VFP_FPEXC]
>>>
>>> either use r0 or vcpu throughout this function please
>> Yeah, that's bad - mixing both in the same function. Will fix it to use one
>> throughout.
>>>
>>>> + VFPFMXR FPEXC, r3
>>>> +
>>>> + pop {r3-r7}
>>>> +#endif
>>>> + bx lr
>>>> +ENDPROC(__kvm_restore_host_vfp_state)
>>>>
>>>> /********************************************************************
>>>> * Hypervisor world-switch code
>>>> @@ -119,11 +142,15 @@ ENTRY(__kvm_vcpu_run)
>>>> @ If the host kernel has not been configured with VFPv3 support,
>>>> @ then it is safer if we deny guests from using it as well.
>>>> #ifdef CONFIG_VFPv3
>>>> + @ r7 must be preserved until next vfp lazy check
>>>
>>> I don't understand this comment
>>>
>>>> + vfp_inlazy_mode r7, skip_save_host_fpexc
>>>> +
>>>> @ Set FPEXC_EN so the guest doesn't trap floating point instructions
>>>> VFPFMRX r2, FPEXC @ VMRS
>>>> - push {r2}
>>>> + str r2, [vcpu, #VCPU_VFP_FPEXC]
>>>> orr r2, r2, #FPEXC_EN
>>>> VFPFMXR FPEXC, r2 @ VMSR
>>>> +skip_save_host_fpexc:
>>>> #endif
>>>>
>>>> @ Configure Hyp-role
>>>> @@ -131,7 +158,14 @@ ENTRY(__kvm_vcpu_run)
>>>>
>>>> @ Trap coprocessor CRx accesses
>>>> set_hstr vmentry
>>>> - set_hcptr vmentry, (HCPTR_TTA | HCPTR_TCP(10) | HCPTR_TCP(11))
>>>> + set_hcptr vmentry, (HCPTR_TTA)
>>>> +
>>>> + @ check if vfp_lazy flag set
>>>> + cmp r7, #1
>>>
>>> if you meant that you need to preserve r7 down to here, could you
>>> instead just move the VFP logic above down here and do the whole VFP
>>> logic in one coherent block?
>>
>> I reworked the code so that the fpexc save and the trap enable are handled in
>> one block.
>>>
>>>> + beq skip_guest_vfp_trap
>>>> + set_hcptr vmentry, (HCPTR_TCP(10) | HCPTR_TCP(11))
>>>> +skip_guest_vfp_trap:
>>>> +
>>>> set_hdcr vmentry
>>>>
>>>> @ Write configured ID register into MIDR alias
>>>> @@ -170,22 +204,14 @@ __kvm_vcpu_return:
>>>> @ Don't trap coprocessor accesses for host kernel
>>>> set_hstr vmexit
>>>> set_hdcr vmexit
>>>> - set_hcptr vmexit, (HCPTR_TTA | HCPTR_TCP(10) | HCPTR_TCP(11)), after_vfp_restore
>>>> + set_hcptr vmexit, (HCPTR_TTA | HCPTR_TCP(10) | HCPTR_TCP(11))
>>>>
>>>> #ifdef CONFIG_VFPv3
>>>> - @ Switch VFP/NEON hardware state to the host's
>>>> - add r7, vcpu, #VCPU_VFP_GUEST
>>>> - store_vfp_state r7
>>>> - add r7, vcpu, #VCPU_VFP_HOST
>>>> - ldr r7, [r7]
>>>> - restore_vfp_state r7
>>>> -
>>>> -after_vfp_restore:
>>>> - @ Restore FPEXC_EN which we clobbered on entry
>>>> - pop {r2}
>>>> + vfp_inlazy_mode r2, skip_restore_host_fpexc
>>>> + @ If vfp_lazy is not set, restore FPEXC_EN which we clobbered on entry
>>>> + ldr r2, [vcpu, #VCPU_VFP_FPEXC]
>>>
>>> so we do this restore if, since we scheduled this VCPU thread, the guest
>>> has not touched any VFP regs, is that correct?
>> That's right.
>>>
>>> Did you measure how often we schedule the guest and run it until we
>>> schedule another process without the guest touching any VFP regs? I'm
>>> just wondering if this complexity is worth it, or if we should just
>>> switch the VFP regs on vcpu_load/vcpu_put instead?
>>
>> The loads I've been running - a mix of fp operations and lmbench mmu tests -
>> show a huge decrease in fp save/restore activity, from ~30-50% of exits down
>> to ~2%. I measured all exits and fp save/restores for both scenarios, so yes,
>> it does make a difference. It will of course depend on the load, but it should
>> never be worse than it is now.
>
> True, and with the renaming the complexity shouldn't be that bad.
>
>>>
>>> Also, what do other architectures do here?
>>
>> x86 does a similar thing in its kvm_arch_vcpu_put().
>>
>
> ok.
>
>>>
>>>> VFPFMXR FPEXC, r2
>>>> -#else
>>>> -after_vfp_restore:
>>>> +skip_restore_host_fpexc:
>>>> #endif
>>>>
>>>> @ Reset Hyp-role
>>>> @@ -485,6 +511,10 @@ switch_to_guest_vfp:
>>>> @ NEON/VFP used. Turn on VFP access.
>>>> set_hcptr vmtrap, (HCPTR_TCP(10) | HCPTR_TCP(11))
>>>>
>>>> + @ set lazy mode flag, switch hardware context on vcpu_put
>>>> + mov r1, #1
>>>> + str r1, [vcpu, #VCPU_VFP_LAZY]
>>>> +
>>>> @ Switch VFP/NEON hardware state to the guest's
>>>> add r7, r0, #VCPU_VFP_HOST
>>>> ldr r7, [r7]
>>>> diff --git a/arch/arm/kvm/interrupts_head.S b/arch/arm/kvm/interrupts_head.S
>>>> index 702740d..4561171 100644
>>>> --- a/arch/arm/kvm/interrupts_head.S
>>>> +++ b/arch/arm/kvm/interrupts_head.S
>>>> @@ -594,7 +594,7 @@ ARM_BE8(rev r6, r6 )
>>>> * If a label is specified with vmexit, it is branched to if VFP wasn't
>>>> * enabled.
>>>> */
>>>> -.macro set_hcptr operation, mask, label = none
>>>> +.macro set_hcptr operation, mask
>>>> mrc p15, 4, r2, c1, c1, 2
>>>> ldr r3, =\mask
>>>> .if \operation == vmentry
>>>> @@ -609,13 +609,17 @@ ARM_BE8(rev r6, r6 )
>>>> beq 1f
>>>> .endif
>>>> isb
>>>> - .if \label != none
>>>> - b \label
>>>> - .endif
>>>> 1:
>>>> .endif
>>>> .endm
>>>>
>>>> +/* Checks if VFP/SIMD lazy flag is set, if it is branch to label. */
>>>
>>> I don't easily understand the semantics of the lazy flag. When set,
>>> does it mean we've switched the hardware to the guest state?
>>>
>
> The conclusion here is probably that the lazy flag should instead be
> called the dirty flag or something where a value of true has some more
> intuitive meaning.
>
> Thanks,
> -Christoffer
So to summarize: the arm patches will be reworked to include your latest comments,
arm64 will directly call the host EL1 function in vcpu_put, and both will be
retested. Any cutoff dates in mind?
Thanks.
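
For reference, the reworked vcpu_put path I have in mind looks roughly like this -
a sketch only, untested, using the "dirty" naming suggested above (final field
name TBD), and assuming __kvm_restore_host_vfp_state is moved outside the
__kvm_hyp_code_start/__kvm_hyp_code_end range so it can be called directly
without kvm_call_hyp():

void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
{
	/* restore host VFP/SIMD state if the guest dirtied the hardware */
	if (vcpu->arch.vfp_dirty) {
		/* plain call, runs in SVC mode (EL1 equivalent on arm64) */
		__kvm_restore_host_vfp_state(vcpu);
		vcpu->arch.vfp_dirty = 0;
	}

	/* existing vcpu->cpu = -1 handling stays as is */
}

arm64 would do the equivalent with its host EL1 restore routine.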
>
>>>> +.macro vfp_inlazy_mode, reg, label
>>>> + ldr \reg, [vcpu, #VCPU_VFP_LAZY]
>>>> + cmp \reg, #1
>>>> + beq \label
>>>> +.endm
>>>> +
>>>> /* Configures the HDCR (Hyp Debug Configuration Register) on entry/return
>>>> * (hardware reset value is 0) */
>>>> .macro set_hdcr operation
>>>> --
>>>> 1.9.1
>>>>