Kexec and KVM not working gracefully together

Frediano Ziglio freddy77 at gmail.com
Fri Feb 6 04:14:14 PST 2015


2015-02-06 10:56 GMT+00:00 AKASHI Takahiro <takahiro.akashi at linaro.org>:
> On 02/06/2015 06:50 PM, Frediano Ziglio wrote:
>>
>> 2015-02-06 4:13 GMT+00:00 AKASHI Takahiro <takahiro.akashi at linaro.org>:
>>>
>>> Hi Frediano
>>> [add Geoff to cc]
>>>
>>> I still have to compare my code with yours, but please note that
>>> some files in arch/arm/kvm are shared with arm64, especially
>>> arm.c, mmu.c and related headers.
>>>
>>
>> Didn't know.
>>
>>> So can you please
>>> - submit your new patch without including old e-mail threads.
>>>    (don't forget RFC or PATCH.)
>>> - break it up into two pieces, one for common, the other for arm
>>>
>>
>> I'll do for next.
>
>
> Thanks
>
>
>>> and a few more comments below:
>>>
>>> On 02/06/2015 01:56 AM, Frediano Ziglio wrote:
>>>
>>> [snip]
>>>
>>>> New version. Changes:
>>>> - removed saved value and test again
>>>> - remove vm flush, useless
>>>> - correct compile check on header
>>>> - try KVM enabled and not, compile link and tests
>>>> - rewrite comments, add more where necessary
>>>> - remove code to free and allocate again boot PGD and related, keep in
>>>> memory if KEXEC is enabled
>>>>
>>>> Still not trying SMP. Still to be considered RFC. I tried different
>>>> compile options (KEXEC and KVM turned on/off). I realized that I
>>>> should test if kernel is started with SVC mode code still works
>>>> correctly. ARM_VIRT_EXT is always turned on for V7 CPU.
>>>>
>>>>
>>>> diff --git a/arch/arm/include/asm/kvm_asm.h
>>>> b/arch/arm/include/asm/kvm_asm.h
>>>> index 3a67bec..985f0a3 100644
>>>> --- a/arch/arm/include/asm/kvm_asm.h
>>>> +++ b/arch/arm/include/asm/kvm_asm.h
>>>> @@ -97,7 +97,19 @@ extern char __kvm_hyp_code_end[];
>>>>    extern void __kvm_flush_vm_context(void);
>>>>    extern void __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t
>>>> ipa);
>>>>
>>>> +extern void __kvm_set_vectors(unsigned long phys_vector_base);
>>>> +
>>>>    extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu);
>>>> +
>>>> +#ifndef CONFIG_ARM_VIRT_EXT
>>>
>>>
>>>
>>> Arm64 doesn't have CONFIG_ARM_VIRT_EXT. Why don't use CONFIG_KVM?
>>> I'd rather put this conditional into the place where the function is
>>> actually called for flexible implementation. (See below)
>>>
>>
>> In ARMv7 you can have ARM_VIRT_EXT but not KVM. ARM_VIRT_EXT means the
>> processor can support virtualization while KVM means you want to
>> compile kvm in the kernel. So code should be defined if either are
>> defined.
>
>
> CONFIG_KVM exists in arch/arm/kvm/Kconfig.
>

I know. On ARMv7 CONFIG_ARM_VIRT_EXT can be defined while CONFIG_KVM
is not defined. In this case the firmware can launch the kernel in HYP
mode. The kernel will switch to SVC mode ASAP and work entirely in SVC
mode, just ignoring the HYP mode (it installs the stub HYP code).
Probably on ARMv8 virtualization is always supported, so
CONFIG_ARM_VIRT_EXT is not defined. However, in head.S (arm64) the
mode is switched ASAP to SVC too.

>
>>>
>>>> +static void kvm_cpu_reset(void (*phys_reset)(void *), void *addr)
>>>> +{
>>>> +    phys_reset(addr);
>>>> +}
>>>> +#else
>>>> +extern void kvm_cpu_reset(void (*phys_reset)(void *), void *addr);
>>>> +#endif
>>>> +
>>>>    #endif
>>>>
>>>>    #endif /* __ARM_KVM_ASM_H__ */
>>>> diff --git a/arch/arm/kernel/hyp-stub.S b/arch/arm/kernel/hyp-stub.S
>>>> index 2a55373..30339ad 100644
>>>> --- a/arch/arm/kernel/hyp-stub.S
>>>> +++ b/arch/arm/kernel/hyp-stub.S
>>>> @@ -164,13 +164,63 @@ ARM_BE8(orr    r7, r7, #(1 << 25))     @ HSCTLR.EE
>>>>        bx    lr            @ The boot CPU mode is left in r4.
>>>>    ENDPROC(__hyp_stub_install_secondary)
>>>>
>>>> +#undef CPU_RESET
>>>> +#if defined(CONFIG_ARM_VIRT_EXT) && !defined(CONFIG_KVM) &&
>>>> !defined(ZIMAGE)
>>>> +#  define CPU_RESET 1
>>>> +#endif
>>>> +
>>>>    __hyp_stub_do_trap:
>>>> +#ifdef CPU_RESET
>>>> +    cmp    r0, #-2
>>>> +    bne    1f
>>>> +
>>>> +    mrc    p15, 4, r0, c1, c0, 0    @ HSCR
>>>> +    ldr    r2, =0x40003807
>>>> +    bic    r0, r0, r2
>>>> +    mcr    p15, 4, r0, c1, c0, 0    @ HSCR
>>>> +    isb
>>>> +
>>>> +    @ Disable MMU, caches and Thumb in SCTLR
>>>> +    mrc    p15, 0, r0, c1, c0, 0    @ SCTLR
>>>> +    bic    r0, r0, r2
>>>> +    mcr    p15, 0, r0, c1, c0, 0
>>>> +
>>>> +    bx    r1
>>>> +    .ltorg
>>>> +1:
>>>> +#endif
>>>>        cmp    r0, #-1
>>>>        mrceq    p15, 4, r0, c12, c0, 0    @ get HVBAR
>>>>        mcrne    p15, 4, r0, c12, c0, 0    @ set HVBAR
>>>>        __ERET
>>>>    ENDPROC(__hyp_stub_do_trap)
>>>>
>>>> +#ifdef CPU_RESET
>>>> +/*
>>>> + * r0 cpu_reset function
>>>> + * r1 address to jump to
>>>> + */
>>>> +ENTRY(kvm_cpu_reset)
>>>> +    mov    r2, r0
>>>> +
>>>> +    @ __boot_cpu_mode should be HYP_MODE
>>>> +    adr    r0, .L__boot_cpu_mode_offset
>>>> +    ldr    r3, [r0]
>>>> +    ldr    r0, [r0, r3]
>>>> +    cmp    r0, #HYP_MODE
>>>> +    beq    reset_to_hyp
>>>> +
>>>> +    @ Call SVC mode cpu_reset
>>>> +    mov    r0, r1
>>>> +THUMB(    orr    r2, r2, 1    )
>>>> +    bx    r2
>>>> +
>>>> +reset_to_hyp:
>>>> +    mov    r0, #-2
>>>> +    b    __hyp_set_vectors
>>>> +ENDPROC(kvm_cpu_reset)
>>>> +#endif
>>>> +
>>>>    /*
>>>>     * __hyp_set_vectors: Call this after boot to set the initial
>>>> hypervisor
>>>>     * vectors as part of hypervisor installation.  On an SMP system,
>>>> this
>>>> should
>>>> diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c
>>>> index fdfa3a7..c018719 100644
>>>> --- a/arch/arm/kernel/process.c
>>>> +++ b/arch/arm/kernel/process.c
>>>> @@ -41,6 +41,7 @@
>>>>    #include <asm/system_misc.h>
>>>>    #include <asm/mach/time.h>
>>>>    #include <asm/tls.h>
>>>> +#include <asm/kvm_asm.h>
>>>>
>>>>    #ifdef CONFIG_CC_STACKPROTECTOR
>>>>    #include <linux/stackprotector.h>
>>>> @@ -60,7 +61,7 @@ static const char *isa_modes[] __maybe_unused = {
>>>>    };
>>>>
>>>>    extern void call_with_stack(void (*fn)(void *), void *arg, void *sp);
>>>> -typedef void (*phys_reset_t)(unsigned long);
>>>> +typedef void (*phys_reset_t)(void *);
>>>>
>>>>    /*
>>>>     * A temporary stack to use for CPU reset. This is static so that we
>>>> @@ -89,7 +90,8 @@ static void __soft_restart(void *addr)
>>>>
>>>>        /* Switch to the identity mapping. */
>>>>        phys_reset = (phys_reset_t)(unsigned
>>>> long)virt_to_phys(cpu_reset);
>>>> -    phys_reset((unsigned long)addr);
>>>> +
>>>> +    kvm_cpu_reset(phys_reset, addr);
>>>
>>>
>>>
>>> How about this?
>>> #ifdef CONFIG_KVM
>>>          kvm_cpu_reset(...);
>>> #endif
>>>          phys_reset(addr);
>>>
>>> It will clearify the purpose of kvm_cpu_reset().
>>> The name of kvm_cpu_reset() might better be cpu_*de*init_hyp_mode or
>>> similar
>>> given that the function would be called in hyp_init_cpu_notify()
>>> once kvm supports cpu hotplug in the future.
>>>
>>
>> the purpose of kvm_cpu_reset is to reset state and jump to newer
>> kernel initialization code. Is a no return function like cpu_reset. So
>> perhaps code can be
>>
>> #if defined(CONFIG_KVM) || defined(CONFIG_ARM_VIRT_EXT)
>>          kvm_cpu_reset(...);
>> #else
>>          phys_reset(addr);
>> #endif
>
>
> I meant that kvm_cpu_reset() should be responsible for restoring only
> kvm-related status.
>

No, kernel code is meant to run in SVC mode, so that function can't
safely return. For instance, on ARMv7 even setting HTTBR to TTBR0 and
returning back would execute cpu_v7_reset, which tries to change its
SCTLR register (but it cannot, as it has to change HSCTLR now).

The code that switches from HYP to SVC is meant to be run without the
MMU. It would work if a cpu_deinit_hyp_mode would just:
- disable cache
- disable mmu
- restore HYP stub code
- allow to do a sort of cpu_reset, but in HYP mode instead of SVC mode.

In this case this cpu_deinit_hyp_mode could be called freely from
soft_restart (not __soft_restart) and cpu unplugging. In this case a
kvm_cpu_reset could be implemented like:

void kvm_cpu_reset(void (*cpu_reset)(void*,void*), void* addr)
{
    if (we_have_hyp_mode())
       cpu_reset(hyp_physical_reset, addr);
    cpu_reset(addr, NULL);
}

yes, cpu_reset has an added parameter to be passed to
hyp_physical_reset. hyp_physical_reset would be an identity-mapped
function which just does an HVC to perform a reset from HYP.

Does it make sense?

>>>>        /* Should never get here. */
>>>>        BUG();
>>>> diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
>>>> index 0b0d58a..e4d4a2b 100644
>>>> --- a/arch/arm/kvm/arm.c
>>>> +++ b/arch/arm/kvm/arm.c
>>>> @@ -1009,7 +1009,7 @@ static int init_hyp_mode(void)
>>>>        if (err)
>>>>            goto out_free_mappings;
>>>>
>>>> -#ifndef CONFIG_HOTPLUG_CPU
>>>> +#if !defined(CONFIG_HOTPLUG_CPU) && !defined(CONFIG_KEXEC)
>>>>        free_boot_hyp_pgd();
>>>>    #endif
>>>>
>>>> @@ -1079,6 +1079,53 @@ out_err:
>>>>        return err;
>>>>    }
>>>>
>>>> +void kvm_cpu_reset(void (*phys_reset)(void *), void *addr)
>>>
>>>
>>>
>>> I'm not sure, but it might be better to put this code into
>>> arm/kernel/init.S
>>> as it is a counterpart of _do_hyp_init from cpu_init_hyp_mode().
>>>
>>
>> init.S is written in assembly, kvm_cpu_reset for kvm is written in C.
>> There is another version if kvm is disabled.
>
>
> For arm64, it seems easy to implement such function in asm.
>
>> The kernel code is supposed (at least in ARMv7) to run in SVC mode,
>> not on HYP mode. One reason is that code that change SCTLR executed at
>> HYP instead of SVC would not change current mode system control
>> register. The function does not revert to HYP for this reason.
>
>
> I need understand what you mentioned here, but for arm64, Geoff's kexec lets
> the boot cpu in hyp mode at soft_restart().
>
> -Takahiro AKASHI
>

Where's Geoff's code? Note, however, that it is not maintainable this
way, as people have to consider that future code could execute in
either HYP or SVC mode. Did you test that all code called from
soft_restart could do it?

It's similar to the kernel returning to user space with a jump instead
of a proper system return. User-space code would execute in kernel
mode; now imagine if this code does another syscall. You can't be 100%
sure that current and future code works correctly, or at least you
should check and state that these functions have these restrictions.

I was going to add a kvm_call_hyp_ext function which on either arm32
or arm64 could pass up to 8 register parameters. Does it make sense?

Frediano

>
>>>
>>>> +{
>>>> +    unsigned long vector;
>>>> +
>>>> +    if (!is_hyp_mode_available()) {
>>>> +        phys_reset(addr);
>>>> +        return;
>>>> +    }
>>>> +
>>>> +    vector = (unsigned long) kvm_get_idmap_vector();
>>>> +
>>>> +    /*
>>>> +     * Set vectors so we call initialization routines.
>>>> +     * trampoline is mapped at TRAMPOLINE_VA but not at its physical
>>>> +     * address so we don't have an identity map to be able to
>>>> +     * disable MMU. We need to set exception vector at trampoline
>>>> +     * at TRAMPOLINE_VA address which is mapped.
>>>> +     */
>>>> +    kvm_call_hyp(__kvm_set_vectors,(unsigned long) (TRAMPOLINE_VA +
>>>> (vector & (PAGE_SIZE-1))));
>>>> +
>>>> +    /*
>>>> +     * Set HVBAR to physical address, page table to identity to do the
>>>> switch
>>>> +     */
>>>> +    kvm_call_hyp((void*) 4, (unsigned long) vector,
>>>> kvm_mmu_get_boot_httbr());
>>>> +
>>>> +    /*
>>>> +     * Flush to make sure code after the cache are disabled see updated
>>>> values
>>>> +     */
>>>> +    flush_cache_all();
>>>> +
>>>> +    /*
>>>> +     * Turn off caching on Hypervisor mode
>>>> +     */
>>>> +    kvm_call_hyp((void*) 1);
>>>> +
>>>> +    /*
>>>> +     * Flush to make sude code don't see old cached values after cache
>>>> is
>>>> +     * enabled
>>>> +     */
>>>> +    flush_cache_all();
>>>> +
>>>> +    /*
>>>> +     * Restore execution
>>>> +     */
>>>> +    kvm_call_hyp((void*) 3, addr);
>>>> +}
>>>> +
>>>
>>>
>>>
>>> If you like kvm_call_hyp-like sequences, please specify what each
>>> kvm_call_hyp() should do.
>>> (we can do all in one kvm_call_hyp() though.)
>>>
>>
>> kvm_call_hyp support a restricted number of arguments as they must be
>> on registers and I actually need more registers. I found quite
>> confusing that every vector table and HVC call have it's interface.
>> One for stub (-1 get HVBAR otherwise set), one for kvm (-1 get HVBAR
>> otherwise function pointer) and another for kvm initialization (0
>> first initialization, otherwise stack pointer, HVBAR and HTTBR).
>> Wouldn't be much nicer to pass a index for a function, something like
>> (in C pseudocode)
>>
>> typedef unsigned (*hvc_func_t)(unsigned a1, unsigned a2, unsigned a3);
>>
>> hvc_func_t hvc_funcs[MAX_HVC_FUNC];
>>
>> #define HVC_FUNC_GET_HVBAR 0
>> #define HVC_FUNC_SET_HVBAR 1
>> #define HVC_FUNC_FLUSH_xxx 2
>> ...
>>
>> and in assembly we could just use these indexes ?? Of course some HVC
>> implementation should return not implemented.
>>
>> I could disable cache on first call (the one I set just HVBAR) and
>> doing the flush than call second one with:
>> - vector in physical address (to jump to trampoline before disabling
>> MMU) with bit 0 set to distinguish from stack/0;
>> - address of newer kernel
>> - boot HTTBR (it takes 2 register as it's 64 bit)
>>
>> I'll have a look at arm64 code.
>>
>> Thank you,
>>     Frediano
>>
>>> Thanks,
>>> -Takahiro AKASHI
>>>
>>>
>>>>    /* NOP: Compiling as a module not supported */
>>>>    void kvm_arch_exit(void)
>>>>    {
>>>> diff --git a/arch/arm/kvm/init.S b/arch/arm/kvm/init.S
>>>> index 3988e72..c2f1d4d 100644
>>>> --- a/arch/arm/kvm/init.S
>>>> +++ b/arch/arm/kvm/init.S
>>>> @@ -68,6 +68,12 @@ __kvm_hyp_init:
>>>>        W(b)    .
>>>>
>>>>    __do_hyp_init:
>>>> +    @ check for special values, odd numbers can't be a stack pointer
>>>> +    cmp    r0, #1
>>>> +    beq    cpu_proc_fin
>>>> +    cmp    r0, #3
>>>> +    beq    restore
>>>> +
>>>>        cmp    r0, #0            @ We have a SP?
>>>>        bne    phase2            @ Yes, second stage init
>>>>
>>>> @@ -151,6 +157,33 @@ target:    @ We're now in the trampoline code,
>>>> switch page tables
>>>>
>>>>        eret
>>>>
>>>> +cpu_proc_fin:
>>>> +    mrc    p15, 4, r0, c1, c0, 0        @ ctrl register
>>>> +    bic    r0, r0, #0x1000            @ ...i............
>>>> +    bic    r0, r0, #0x0006            @ .............ca.
>>>> +    mcr    p15, 4, r0, c1, c0, 0        @ disable caches
>>>> +    eret
>>>> +
>>>> +restore:
>>>> +    @ Disable MMU, caches and Thumb in HSCTLR
>>>> +    mrc    p15, 4, r0, c1, c0, 0    @ HSCR
>>>> +    ldr    r2, =0x40003807
>>>> +    bic    r0, r0, r2
>>>> +    mcr    p15, 4, r0, c1, c0, 0    @ HSCR
>>>> +    isb
>>>> +
>>>> +    @ Invalidate old TLBs
>>>> +    mcr    p15, 4, r0, c8, c7, 0    @ TLBIALLH
>>>> +    isb
>>>> +    dsb    ish
>>>> +
>>>> +    @ Disable MMU, caches and Thumb in SCTLR
>>>> +    mrc    p15, 0, r0, c1, c0, 0   @ SCTLR
>>>> +    bic    r0, r0, r2
>>>> +    mcr    p15, 0, r0, c1, c0, 0
>>>> +
>>>> +    bx    r1
>>>> +
>>>>        .ltorg
>>>>
>>>>        .globl __kvm_hyp_init_end
>>>> diff --git a/arch/arm/kvm/interrupts.S b/arch/arm/kvm/interrupts.S
>>>> index 01dcb0e..449e7dd 100644
>>>> --- a/arch/arm/kvm/interrupts.S
>>>> +++ b/arch/arm/kvm/interrupts.S
>>>> @@ -87,6 +87,18 @@ ENDPROC(__kvm_flush_vm_context)
>>>>
>>>>
>>>>    /********************************************************************
>>>> + * Set HVBAR
>>>> + *
>>>> + * void __kvm_set_vectors(unsigned long phys_vector_base);
>>>> + */
>>>> +ENTRY(__kvm_set_vectors)
>>>> +    mcr    p15, 4, r0, c12, c0, 0    @ set HVBAR
>>>> +    dsb    ish
>>>> +    isb
>>>> +    bx    lr
>>>> +ENDPROC(__kvm_set_vectors)
>>>> +
>>>> +/********************************************************************
>>>>     *  Hypervisor world-switch code
>>>>     *
>>>>     *
>>>> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
>>>> index 1366625..f853858 100644
>>>> --- a/arch/arm/kvm/mmu.c
>>>> +++ b/arch/arm/kvm/mmu.c
>>>> @@ -369,6 +369,11 @@ void free_boot_hyp_pgd(void)
>>>>        free_page((unsigned long)init_bounce_page);
>>>>        init_bounce_page = NULL;
>>>>
>>>> +    /* avoid to reuse possibly invalid values if bounce page is freed
>>>> */
>>>> +    hyp_idmap_start = 0;
>>>> +    hyp_idmap_end = 0;
>>>> +    hyp_idmap_vector = 0;
>>>> +
>>>>        mutex_unlock(&kvm_hyp_pgd_mutex);
>>>>    }
>>>>
>>>>
>>>> Frediano
>>>>
>>>
>



More information about the linux-arm-kernel mailing list