Kexec and KVM not working gracefully together

Frediano Ziglio freddy77 at gmail.com
Thu Feb 5 02:51:58 PST 2015


2015-02-05 10:27 GMT+00:00 AKASHI Takahiro <takahiro.akashi at linaro.org>:
> On 02/05/2015 06:43 PM, Marc Zyngier wrote:
>>
>> On Thu, 5 Feb 2015 06:17:04 +0000
>> AKASHI Takahiro <takahiro.akashi at linaro.org> wrote:
>>
>>> Frediano
>>> Cc: Marc
>>>
>>> Are you going to fix this issue on arm?
>>> As I'm now working on the same issue on arm64,
>>> we should share the idea and some code.
>>
>>
>> Just to be perfectly clear: KVM support for Kexec on arm and arm64
>> should be exactly the same, with the exception of some (very
>> limited) assembly code for the MMU teardown.
>
>
> Yep, that's exactly what I meant.
>
>> And I'd like to see both architectures addressed at the same time,
>> without statements such as "we'll do that as a next step". As such, I'd
>> very much recommend that Frediano and you work together to address this
>> problem.
>
>
> I hope so.
>
> Thanks,
> -Takahiro AKASHI
>
>> Thanks,
>>
>>          M.
>>



Well, let's let the code speak. I'm actually testing it on an ARM32 platform
with a single CPU (the platform has 16 cores, but I'm using a non-SMP
kernel for my tests for unrelated reasons; I should be able to test SMP
in a couple of days).

The key point is the switch-back code (the kvm_cpu_reset function in arm.c). The idea:
- at boot time I save HSCTLR value
- restore boot page tables and trampoline before cpu reset
- call kvm_cpu_reset instead of cpu_reset (this will call cpu_reset if
kvm is disabled)
- set hvbar to trampoline (virtual memory)
- set hvbar to the physical trampoline and boot page tables (with the
trampoline mapped at the same virtual and physical address, i.e. identity-mapped)
- flush memory while disabling caching in hypervisor mode (not doing so
causes a lot of instability; with this in place I have now rebooted more
than 100 times in a row)
- call hypervisor mode to do the reset cpu restoring the boot HSCTLR value


Hope gmail does not screw my patch...


diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h
index 3a67bec..71380a1 100644
--- a/arch/arm/include/asm/kvm_asm.h
+++ b/arch/arm/include/asm/kvm_asm.h
@@ -97,7 +97,24 @@ extern char __kvm_hyp_code_end[];
 extern void __kvm_flush_vm_context(void);
 extern void __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa);

+extern void __kvm_set_vectors(unsigned long phys_vector_base);
+
 extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu);
+
+#ifdef CONFIG_HAVE_KVM
+static void kvm_cpu_reset(void (*phys_reset)(void *), void *addr)
+{
+    phys_reset(addr);
+}
+static inline int kvm_mmu_reset_prepare(void)
+{
+    return 0;
+}
+#else
+extern void kvm_cpu_reset(void (*phys_reset)(void *), void *addr);
+extern int kvm_mmu_reset_prepare(void);
+#endif
+
 #endif

 #endif /* __ARM_KVM_ASM_H__ */
diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
index 04b4ea0..c0b6c20 100644
--- a/arch/arm/include/asm/kvm_host.h
+++ b/arch/arm/include/asm/kvm_host.h
@@ -192,6 +192,8 @@ int kvm_arm_coproc_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *);
 int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run,
         int exception_index);

+extern unsigned kvm_saved_sctlr;
+
 static inline void __cpu_init_hyp_mode(phys_addr_t boot_pgd_ptr,
                        phys_addr_t pgd_ptr,
                        unsigned long hyp_stack_ptr,
@@ -213,7 +215,7 @@ static inline void __cpu_init_hyp_mode(phys_addr_t boot_pgd_ptr,
      * PCS!).
      */

-    kvm_call_hyp(NULL, 0, boot_pgd_ptr);
+    kvm_saved_sctlr = (unsigned) kvm_call_hyp(NULL, 0, boot_pgd_ptr);

     kvm_call_hyp((void*)hyp_stack_ptr, vector_ptr, pgd_ptr);
 }
diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c
index fdfa3a7..026a1e4 100644
--- a/arch/arm/kernel/process.c
+++ b/arch/arm/kernel/process.c
@@ -41,6 +41,7 @@
 #include <asm/system_misc.h>
 #include <asm/mach/time.h>
 #include <asm/tls.h>
+#include <asm/kvm_asm.h>

 #ifdef CONFIG_CC_STACKPROTECTOR
 #include <linux/stackprotector.h>
@@ -60,7 +61,7 @@ static const char *isa_modes[] __maybe_unused = {
 };

 extern void call_with_stack(void (*fn)(void *), void *arg, void *sp);
-typedef void (*phys_reset_t)(unsigned long);
+typedef void (*phys_reset_t)(void *);

 /*
  * A temporary stack to use for CPU reset. This is static so that we
@@ -89,7 +90,7 @@ static void __soft_restart(void *addr)

     /* Switch to the identity mapping. */
     phys_reset = (phys_reset_t)(unsigned long)virt_to_phys(cpu_reset);
-    phys_reset((unsigned long)addr);
+    kvm_cpu_reset(phys_reset, addr);

     /* Should never get here. */
     BUG();
@@ -107,6 +108,8 @@ void soft_restart(unsigned long addr)
     if (num_online_cpus() == 1)
         outer_disable();

+    kvm_mmu_reset_prepare();
+
     /* Change to the new stack and continue with the reset. */
     call_with_stack(__soft_restart, (void *)addr, (void *)stack);

diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index 0b0d58a..7727678 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -63,6 +63,8 @@ static DEFINE_SPINLOCK(kvm_vmid_lock);

 static bool vgic_present;

+unsigned kvm_saved_sctlr;
+
 static void kvm_arm_set_running_vcpu(struct kvm_vcpu *vcpu)
 {
     BUG_ON(preemptible());
@@ -1079,6 +1081,37 @@ out_err:
     return err;
 }

+void kvm_cpu_reset(void (*phys_reset)(void *), void *addr)
+{
+    unsigned long vector;
+
+    if (!is_hyp_mode_available()) {
+        phys_reset(addr);
+        return;
+    }
+
+    vector = (unsigned long) kvm_get_idmap_vector();
+
+    kvm_call_hyp(__kvm_flush_vm_context);
+
+    /* set vectors so we call initialization routines */
+    /* trampoline is still on memory, physical ram is not mapped */
+    kvm_call_hyp(__kvm_set_vectors,(unsigned long) (TRAMPOLINE_VA + (vector & (PAGE_SIZE-1))));
+
+    /* set HVBAR to physical, page table to identity to do the switch */
+    kvm_call_hyp((void*) 0x100, (unsigned long) vector, kvm_mmu_get_boot_httbr());
+
+    flush_cache_all();
+
+    /* Turn off caching on Hypervisor mode */
+    kvm_call_hyp((void*) 1);
+
+    flush_cache_all();
+
+    /* restore execution */
+    kvm_call_hyp((void*) 3, kvm_saved_sctlr, addr);
+}
+
 /* NOP: Compiling as a module not supported */
 void kvm_arch_exit(void)
 {
diff --git a/arch/arm/kvm/init.S b/arch/arm/kvm/init.S
index 3988e72..79f435a 100644
--- a/arch/arm/kvm/init.S
+++ b/arch/arm/kvm/init.S
@@ -68,12 +68,21 @@ __kvm_hyp_init:
     W(b)    .

 __do_hyp_init:
+    @ check for special values, odd numbers can't be a stack pointer
+    cmp    r0, #1
+    beq    cpu_proc_fin
+    cmp    r0, #3
+    beq    restore
+
     cmp    r0, #0            @ We have a SP?
     bne    phase2            @ Yes, second stage init

     @ Set the HTTBR to point to the hypervisor PGD pointer passed
     mcrr    p15, 4, rr_lo_hi(r2, r3), c2

+    @ Save HSCR to be able to return it, save and restore
+    mrc    p15, 4, r3, c1, c0, 0    @ HSCR
+
     @ Set the HTCR and VTCR to the same shareability and cacheability
     @ settings as the non-secure TTBCR and with T0SZ == 0.
     mrc    p15, 4, r0, c2, c0, 2    @ HTCR
@@ -125,6 +134,9 @@ __do_hyp_init:
     isb
     mcr    p15, 4, r0, c1, c0, 0    @ HSCR

+    @ Return initial HSCR register
+    mov    r0, r3
+
     @ End of init phase-1
     eret

@@ -151,6 +163,29 @@ target:    @ We're now in the trampoline code, switch page tables

     eret

+cpu_proc_fin:
+    mrc    p15, 4, r0, c1, c0, 0        @ ctrl register
+    bic    r0, r0, #0x1000            @ ...i............
+    bic    r0, r0, #0x0006            @ .............ca.
+    mcr    p15, 4, r0, c1, c0, 0        @ disable caches
+    eret
+
+restore:
+    isb
+    mcr    p15, 4, r1, c1, c0, 0    @ HSCR
+    isb
+    mcr    p15, 4, r0, c8, c7, 0    @ TLBIALLH
+    isb
+    dsb    ish
+
+    @ Disable MMU, caches and Thumb in SCTLR
+    mrc    p15, 0, r1, c1, c0, 0   @ SCTLR
+    ldr    r0, =0x40003807
+    bic    r1, r1, r0
+    mcr    p15, 0, r1, c1, c0, 0
+
+    bx    r2
+
     .ltorg

     .globl __kvm_hyp_init_end
diff --git a/arch/arm/kvm/interrupts.S b/arch/arm/kvm/interrupts.S
index 01dcb0e..449e7dd 100644
--- a/arch/arm/kvm/interrupts.S
+++ b/arch/arm/kvm/interrupts.S
@@ -87,6 +87,18 @@ ENDPROC(__kvm_flush_vm_context)


 /********************************************************************
+ * Set HVBAR
+ *
+ * void __kvm_set_vectors(unsigned long phys_vector_base);
+ */
+ENTRY(__kvm_set_vectors)
+    mcr    p15, 4, r0, c12, c0, 0    @ set HVBAR
+    dsb    ish
+    isb
+    bx    lr
+ENDPROC(__kvm_set_vectors)
+
+/********************************************************************
  *  Hypervisor world-switch code
  *
  *
diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index 1366625..9149ae5 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -46,6 +46,8 @@ static phys_addr_t hyp_idmap_vector;

 #define kvm_pmd_huge(_x)    (pmd_huge(_x) || pmd_trans_huge(_x))

+static int kvm_mmu_boot_init(void);
+
 static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
 {
     /*
@@ -369,6 +371,11 @@ void free_boot_hyp_pgd(void)
     free_page((unsigned long)init_bounce_page);
     init_bounce_page = NULL;

+    /* avoid to reuse possibly invalid values if bounce page is freed */
+    hyp_idmap_start = 0;
+    hyp_idmap_end = 0;
+    hyp_idmap_vector = 0;
+
     mutex_unlock(&kvm_hyp_pgd_mutex);
 }

@@ -1252,7 +1259,28 @@ phys_addr_t kvm_get_idmap_vector(void)
     return hyp_idmap_vector;
 }

-int kvm_mmu_init(void)
+/* assure we have MMU setup correctly */
+int kvm_mmu_reset_prepare(void)
+{
+    int err = 0;
+
+    if (boot_hyp_pgd)
+        goto out;
+
+    err = kvm_mmu_boot_init();
+    if (err)
+        goto out;
+
+    err =     __create_hyp_mappings(hyp_pgd,
+                      TRAMPOLINE_VA, TRAMPOLINE_VA + PAGE_SIZE,
+                      __phys_to_pfn(hyp_idmap_start),
+                      PAGE_HYP);
+
+out:
+    return err;
+}
+
+static int kvm_mmu_boot_init(void)
 {
     int err;

@@ -1294,10 +1322,9 @@ int kvm_mmu_init(void)
              (unsigned long)phys_base);
     }

-    hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, hyp_pgd_order);
     boot_hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, hyp_pgd_order);

-    if (!hyp_pgd || !boot_hyp_pgd) {
+    if (!boot_hyp_pgd) {
         kvm_err("Hyp mode PGD not allocated\n");
         err = -ENOMEM;
         goto out;
@@ -1326,6 +1353,27 @@ int kvm_mmu_init(void)
         goto out;
     }

+    return 0;
+out:
+    free_boot_hyp_pgd();
+    return err;
+}
+
+int kvm_mmu_init(void)
+{
+    int err;
+
+    err = kvm_mmu_boot_init();
+    if (err)
+        goto out0;
+
+    hyp_pgd = kzalloc(PTRS_PER_PGD * sizeof(pgd_t), GFP_KERNEL);
+    if (!hyp_pgd) {
+        kvm_err("Hyp mode PGD not allocated\n");
+        err = -ENOMEM;
+        goto out;
+    }
+
     /* Map the same page again into the runtime page tables */
     err =     __create_hyp_mappings(hyp_pgd,
                       TRAMPOLINE_VA, TRAMPOLINE_VA + PAGE_SIZE,
@@ -1340,6 +1388,7 @@ int kvm_mmu_init(void)
     return 0;
 out:
     free_hyp_pgds();
+out0:
     return err;
 }


Frediano



More information about the linux-arm-kernel mailing list