[PATCH 1/3] kexec: extend hypercall with improved load/unload ops

David Vrabel david.vrabel at citrix.com
Wed Jan 16 11:29:04 EST 2013


From: David Vrabel <david.vrabel at citrix.com>

In the existing kexec hypercall, the load and unload ops depend on
internals of the Linux kernel (the page list and code page provided by
the kernel).  The code page is used to transition between Xen context
and the image so using kernel code doesn't make sense and will not
work for PVH guests.

Add replacement KEXEC_CMD_kexec_load_v2 and KEXEC_CMD_kexec_unload_v2
ops that no longer require a code page to be provided by the guest --
Xen now provides the code for calling the image directly.

The load_v2 op looks similar to the Linux kexec_load system call and
allows the guest to provide the image data to be loaded into the crash
kernel memory region.  The guest may also specify whether the image is
64-bit or 32-bit.
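
For illustration, a caller of the new load op might look like the
following sketch (hypothetical: the example_load helper is invented
here, and the HYPERVISOR_kexec_op / set_xen_guest_handle conventions
shown are those of a guest kernel, not necessarily the eventual
toolstack path):

    static int example_load(void *buf, uint32_t size,
                            uint64_t dest_maddr, uint64_t entry_maddr)
    {
        xen_kexec_segment_t seg;
        xen_kexec_load_v2_t load;

        /* One contiguous chunk of image data and its destination,
           which must lie within the crash region. */
        set_xen_guest_handle(seg.buf, buf);
        seg.size = size;
        seg.dest_maddr = dest_maddr;

        load.type = KEXEC_TYPE_CRASH;
        load.class = KEXEC_CLASS_32;    /* or KEXEC_CLASS_64 */
        load.nr_segments = 1;
        set_xen_guest_handle(load.segments, &seg);
        load.entry_maddr = entry_maddr; /* must be inside a segment */

        return HYPERVISOR_kexec_op(KEXEC_CMD_kexec_load_v2, &load);
    }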

The toolstack can now load images without kernel involvement.  This is
required for supporting kexec of crash kernels from PV-ops kernels.

Note: This also changes the behaviour of the kexec op when an image
is loaded with the old ABI.  The code page will no longer be used,
which may result in incorrect behaviour in non-Linux guests.  This
allows the code to be simpler, and support for the old ABI is being
removed in a subsequent patch anyway.

[ This is a prototype and has the following limitations:

- no compat implementation for kexec_load_v2.
- 64-bit images are not supported.
- 32-bit images are called with paging enabled (Linux starts 32-bit
  images with paging disabled). ]

Signed-off-by: David Vrabel <david.vrabel at citrix.com>
---
 xen/arch/x86/machine_kexec.c       |   73 ++------------
 xen/arch/x86/x86_64/compat_kexec.S |   25 -----
 xen/common/kexec.c                 |  204 ++++++++++++++++++++++++++++++++++--
 xen/include/public/kexec.h         |   44 ++++++++
 xen/include/xen/kexec.h            |   18 ++--
 5 files changed, 255 insertions(+), 109 deletions(-)

diff --git a/xen/arch/x86/machine_kexec.c b/xen/arch/x86/machine_kexec.c
index 8191ef1..7131d20 100644
--- a/xen/arch/x86/machine_kexec.c
+++ b/xen/arch/x86/machine_kexec.c
@@ -12,62 +12,16 @@
 #include <asm/fixmap.h>
 #include <asm/hpet.h>
 
-typedef void (*relocate_new_kernel_t)(
-                unsigned long indirection_page,
-                unsigned long *page_list,
-                unsigned long start_address,
-                unsigned int preserve_context);
-
-int machine_kexec_load(int type, int slot, xen_kexec_image_t *image)
+int machine_kexec_load(struct kexec_image *image)
 {
-    unsigned long prev_ma = 0;
-    int fix_base = FIX_KEXEC_BASE_0 + (slot * (KEXEC_XEN_NO_PAGES >> 1));
-    int k;
-
-    /* setup fixmap to point to our pages and record the virtual address
-     * in every odd index in page_list[].
-     */
-
-    for ( k = 0; k < KEXEC_XEN_NO_PAGES; k++ )
-    {
-        if ( (k & 1) == 0 )
-        {
-            /* Even pages: machine address. */
-            prev_ma = image->page_list[k];
-        }
-        else
-        {
-            /* Odd pages: va for previous ma. */
-            if ( is_pv_32on64_domain(dom0) )
-            {
-                /*
-                 * The compatability bounce code sets up a page table
-                 * with a 1-1 mapping of the first 1G of memory so
-                 * VA==PA here.
-                 *
-                 * This Linux purgatory code still sets up separate
-                 * high and low mappings on the control page (entries
-                 * 0 and 1) but it is harmless if they are equal since
-                 * that PT is not live at the time.
-                 */
-                image->page_list[k] = prev_ma;
-            }
-            else
-            {
-                set_fixmap(fix_base + (k >> 1), prev_ma);
-                image->page_list[k] = fix_to_virt(fix_base + (k >> 1));
-            }
-        }
-    }
-
     return 0;
 }
 
-void machine_kexec_unload(int type, int slot, xen_kexec_image_t *image)
+void machine_kexec_unload(struct kexec_image *image)
 {
 }
 
-void machine_reboot_kexec(xen_kexec_image_t *image)
+void machine_reboot_kexec(struct kexec_image *image)
 {
     BUG_ON(smp_processor_id() != 0);
     smp_send_stop();
@@ -75,7 +29,7 @@ void machine_reboot_kexec(xen_kexec_image_t *image)
     BUG();
 }
 
-void machine_kexec(xen_kexec_image_t *image)
+void machine_kexec(struct kexec_image *image)
 {
     struct desc_ptr gdt_desc = {
         .base = (unsigned long)(boot_cpu_gdt_table - FIRST_RESERVED_GDT_ENTRY),
@@ -116,22 +70,11 @@ void machine_kexec(xen_kexec_image_t *image)
      */
     asm volatile ( "lgdt %0" : : "m" (gdt_desc) );
 
-    if ( is_pv_32on64_domain(dom0) )
-    {
-        compat_machine_kexec(image->page_list[1],
-                             image->indirection_page,
-                             image->page_list,
-                             image->start_address);
-    }
+    if ( image->class == KEXEC_CLASS_32 )
+        compat_machine_kexec(image->entry_maddr);
     else
-    {
-        relocate_new_kernel_t rnk;
-
-        rnk = (relocate_new_kernel_t) image->page_list[1];
-        (*rnk)(image->indirection_page, image->page_list,
-               image->start_address,
-               0 /* preserve_context */);
-    }
+        /* FIXME */
+        panic("KEXEC_CLASS_64 not yet supported\n");
 }
 
 int machine_kexec_get(xen_kexec_range_t *range)
diff --git a/xen/arch/x86/x86_64/compat_kexec.S b/xen/arch/x86/x86_64/compat_kexec.S
index fc92af9..d853231 100644
--- a/xen/arch/x86/x86_64/compat_kexec.S
+++ b/xen/arch/x86/x86_64/compat_kexec.S
@@ -36,21 +36,6 @@
 ENTRY(compat_machine_kexec)
         /* x86/64                        x86/32  */
         /* %rdi - relocate_new_kernel_t  CALL    */
-        /* %rsi - indirection page       4(%esp) */
-        /* %rdx - page_list              8(%esp) */
-        /* %rcx - start address         12(%esp) */
-        /*        cpu has pae           16(%esp) */
-
-        /* Shim the 64 bit page_list into a 32 bit page_list. */
-        mov $12,%r9
-        lea compat_page_list(%rip), %rbx
-1:      dec %r9
-        movl (%rdx,%r9,8),%eax
-        movl %eax,(%rbx,%r9,4)
-        test %r9,%r9
-        jnz 1b
-
-        RELOCATE_SYM(compat_page_list,%rdx)
 
         /* Relocate compatibility mode entry point address. */
         RELOCATE_MEM(compatibility_mode_far,%eax)
@@ -118,12 +103,6 @@ compatibility_mode:
         movl %eax, %gs
         movl %eax, %ss
 
-        /* Push arguments onto stack. */
-        pushl $0   /* 20(%esp) - preserve context */
-        pushl $1   /* 16(%esp) - cpu has pae */
-        pushl %ecx /* 12(%esp) - start address */
-        pushl %edx /*  8(%esp) - page list */
-        pushl %esi /*  4(%esp) - indirection page */
         pushl %edi /*  0(%esp) - CALL */
 
         /* Disable paging and therefore leave 64 bit mode. */
@@ -153,10 +132,6 @@ compatibility_mode:
         ud2
 
         .data
-        .align 4
-compat_page_list:
-        .fill 12,4,0
-
         .align 32,0
 
         /*
diff --git a/xen/common/kexec.c b/xen/common/kexec.c
index 25ebd6a..56bf8b4 100644
--- a/xen/common/kexec.c
+++ b/xen/common/kexec.c
@@ -45,7 +45,7 @@ static Elf_Note *xen_crash_note;
 
 static cpumask_t crash_saved_cpus;
 
-static xen_kexec_image_t kexec_image[KEXEC_IMAGE_NR];
+static struct kexec_image kexec_image[KEXEC_IMAGE_NR];
 
 #define KEXEC_FLAG_DEFAULT_POS   (KEXEC_IMAGE_NR + 0)
 #define KEXEC_FLAG_CRASH_POS     (KEXEC_IMAGE_NR + 1)
@@ -316,7 +316,7 @@ void kexec_crash(void)
 
 static long kexec_reboot(void *_image)
 {
-    xen_kexec_image_t *image = _image;
+    struct kexec_image *image = _image;
 
     kexecing = TRUE;
 
@@ -732,9 +732,19 @@ static void crash_save_vmcoreinfo(void)
 #endif
 }
 
+static void kexec_unload_slot(int slot)
+{
+    struct kexec_image *image = &kexec_image[slot];
+
+    if ( test_and_clear_bit(slot, &kexec_flags) )
+    {
+        machine_kexec_unload(image);
+    }
+}
+
 static int kexec_load_unload_internal(unsigned long op, xen_kexec_load_t *load)
 {
-    xen_kexec_image_t *image;
+    struct kexec_image *image;
     int base, bit, pos;
     int ret = 0;
 
@@ -750,9 +760,13 @@ static int kexec_load_unload_internal(unsigned long op, xen_kexec_load_t *load)
 
         BUG_ON(test_bit((base + !pos), &kexec_flags)); /* must be free */
 
-        memcpy(image, &load->image, sizeof(*image));
+        if ( is_pv_32on64_domain(dom0) )
+            image->class = KEXEC_CLASS_32;
+        else
+            image->class = KEXEC_CLASS_64;
+        image->entry_maddr = load->image.start_address;
 
-        if ( !(ret = machine_kexec_load(load->type, base + !pos, image)) )
+        if ( !(ret = machine_kexec_load(image)) )
         {
             /* Set image present bit */
             set_bit((base + !pos), &kexec_flags);
@@ -767,11 +781,7 @@ static int kexec_load_unload_internal(unsigned long op, xen_kexec_load_t *load)
     /* Unload the old image if present and load successful */
     if ( ret == 0 && !test_bit(KEXEC_FLAG_IN_PROGRESS, &kexec_flags) )
     {
-        if ( test_and_clear_bit((base + pos), &kexec_flags) )
-        {
-            image = &kexec_image[base + pos];
-            machine_kexec_unload(load->type, base + pos, image);
-        }
+        kexec_unload_slot(base + pos);
     }
 
     return ret;
@@ -816,7 +826,7 @@ static int kexec_load_unload_compat(unsigned long op,
 static int kexec_exec(XEN_GUEST_HANDLE_PARAM(void) uarg)
 {
     xen_kexec_exec_t exec;
-    xen_kexec_image_t *image;
+    struct kexec_image *image;
     int base, bit, pos, ret = -EINVAL;
 
     if ( unlikely(copy_from_guest(&exec, uarg, 1)) )
@@ -845,6 +855,162 @@ static int kexec_exec(XEN_GUEST_HANDLE_PARAM(void) uarg)
     return -EINVAL; /* never reached */
 }
 
+static int kexec_load_segments(xen_kexec_load_v2_t *load)
+{
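+    /* Validate each segment against the crash region and copy its
+       data from the guest into the destination frames. */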
+    unsigned s;
+    bool_t valid_entry = 0;
+
+    for ( s = 0; s < load->nr_segments; s++ )
+    {
+        xen_kexec_segment_t seg;
+        unsigned long to_copy;
+        unsigned long src_offset;
+        unsigned long dest;
+
+        if ( copy_from_guest_offset(&seg, load->segments, s, 1) )
+            return -EFAULT;
+
+        /* The caller is responsible for partitioning the crash space
+           between multiple users of the load call.  Xen only
+           validates that each segment lies within the region. */
+
+        if ( seg.dest_maddr < kexec_crash_area.start
+             || seg.dest_maddr + seg.size
+                > kexec_crash_area.start + kexec_crash_area.size )
+            return -EINVAL;
+
+        if ( load->entry_maddr >= seg.dest_maddr
+             && load->entry_maddr < seg.dest_maddr + seg.size )
+            valid_entry = 1;
+
+        to_copy = seg.size;
+        src_offset = 0;
+        dest = seg.dest_maddr;
+
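+        /* Copy to the destination one frame at a time, temporarily
+           mapping each destination frame with vmap(). */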
+        while ( to_copy )
+        {
+            unsigned long dest_mfn;
+            size_t dest_off;
+            void *dest_va;
+            size_t size;
+
+            dest_mfn = dest >> PAGE_SHIFT;
+            dest_off = dest & ~PAGE_MASK;
+
+            size = min(PAGE_SIZE - dest_off, to_copy);
+
+            dest_va = vmap(&dest_mfn, 1);
+            if ( dest_va == NULL )
+                return -EINVAL;
+
+            if ( copy_from_guest_offset(dest_va, seg.buf,
+                                        src_offset, size) )
+            {
+                vunmap(dest_va);
+                return -EFAULT;
+            }
+
+            vunmap(dest_va);
+
+            to_copy -= size;
+            src_offset += size;
+            dest += size;
+        }
+    }
+
+    /* Entry point is somewhere in a loaded segment? */
+    if ( !valid_entry )
+        return -EINVAL;
+
+    return 0;
+}
+
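+/* Each image type owns a pair of slots; the type's position bit in
+   kexec_flags records which slot of the pair is the active one. */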
+static int slot_to_pos_bit(int slot)
+{
+    return KEXEC_IMAGE_NR + slot / 2;
+}
+
+static int kexec_load_slot(int slot, xen_kexec_load_v2_t *load)
+{
+    struct kexec_image *image = &kexec_image[slot];
+    int ret;
+
+    BUG_ON(test_bit(slot, &kexec_flags)); /* must be free */
+
+    /* validate and load each segment. */
+    ret = kexec_load_segments(load);
+    if ( ret < 0 )
+        return ret;
+
+    image->entry_maddr = load->entry_maddr;
+
+    ret = machine_kexec_load(image);
+    if ( ret < 0 )
+        return ret;
+
+    /* Set image present bit */
+    set_bit(slot, &kexec_flags);
+
+    /* Make new image the active one */
+    change_bit(slot_to_pos_bit(slot), &kexec_flags);
+
+    crash_save_vmcoreinfo();
+
+    return ret;
+}
+
+static int kexec_load_v2(XEN_GUEST_HANDLE_PARAM(void) uarg)
+{
+    xen_kexec_load_v2_t load;
+    int base, bit, pos, slot;
+    struct kexec_image *image;
+    int ret;
+
+    if ( unlikely(copy_from_guest(&load, uarg, 1)) )
+        return -EFAULT;
+
+    if ( kexec_load_get_bits(load.type, &base, &bit) )
+        return -EINVAL;
+
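+    /* Load into the currently inactive slot of the pair for this
+       type; kexec_load_slot() flips the position bit to make the
+       new image active. */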
+    pos = (test_bit(bit, &kexec_flags) != 0);
+    slot = base + !pos;
+    image = &kexec_image[slot];
+
+    switch ( load.class )
+    {
+    case KEXEC_CLASS_32:
+    case KEXEC_CLASS_64:
+        image->class = load.class;
+        break;
+    default:
+        return -EINVAL;
+    }
+
+    ret = kexec_load_slot(slot, &load);
+
+    /* Unload the old image if present and load successful */
+    if ( ret == 0 && !test_bit(KEXEC_FLAG_IN_PROGRESS, &kexec_flags) )
+    {
+        kexec_unload_slot(slot ^ 0x1);
+    }
+
+    return ret;
+}
+
+static int kexec_unload_v2(XEN_GUEST_HANDLE_PARAM(void) uarg)
+{
+    xen_kexec_unload_v2_t unload;
+    int base, bit, pos, slot;
+
+    if ( unlikely(copy_from_guest(&unload, uarg, 1)) )
+        return -EFAULT;
+
+    if ( kexec_load_get_bits(unload.type, &base, &bit) )
+        return -EINVAL;
+
+    pos = (test_bit(bit, &kexec_flags) != 0);
+    slot = base + pos; /* the currently loaded slot for this type. */
+
+    kexec_unload_slot(slot);
+
+    return 0;
+}
+
 static int do_kexec_op_internal(unsigned long op,
                                 XEN_GUEST_HANDLE_PARAM(void) uarg,
                                 bool_t compat)
@@ -882,6 +1048,22 @@ static int do_kexec_op_internal(unsigned long op,
     case KEXEC_CMD_kexec:
         ret = kexec_exec(uarg);
         break;
+    case KEXEC_CMD_kexec_load_v2:
+        spin_lock_irqsave(&kexec_lock, flags);
+        if ( !test_bit(KEXEC_FLAG_IN_PROGRESS, &kexec_flags) )
+            ret = kexec_load_v2(uarg);
+        else
+            ret = -EBUSY;
+        spin_unlock_irqrestore(&kexec_lock, flags);
+        break;
+    case KEXEC_CMD_kexec_unload_v2:
+        spin_lock_irqsave(&kexec_lock, flags);
+        if ( !test_bit(KEXEC_FLAG_IN_PROGRESS, &kexec_flags) )
+            ret = kexec_unload_v2(uarg);
+        else
+            ret = -EBUSY;
+        spin_unlock_irqrestore(&kexec_lock, flags);
+        break;
     }
 
     return ret;
diff --git a/xen/include/public/kexec.h b/xen/include/public/kexec.h
index 61a8d7d..4b7d637 100644
--- a/xen/include/public/kexec.h
+++ b/xen/include/public/kexec.h
@@ -83,6 +83,8 @@
 #define KEXEC_TYPE_DEFAULT 0
 #define KEXEC_TYPE_CRASH   1
 
+#define KEXEC_CLASS_32   1 /* 32-bit image. */
+#define KEXEC_CLASS_64   2 /* 64-bit image. */
 
 /* The kexec implementation for Xen allows the user to load two
  * types of kernels, KEXEC_TYPE_DEFAULT and KEXEC_TYPE_CRASH.
@@ -152,6 +154,48 @@ typedef struct xen_kexec_range {
     unsigned long start;
 } xen_kexec_range_t;
 
+/*
+ * A contiguous chunk of a kexec image and its destination machine
+ * address.
+ */
+typedef struct xen_kexec_segment {
+    XEN_GUEST_HANDLE(const_void) buf;
+    uint32_t size;
+    uint64_t dest_maddr;
+} xen_kexec_segment_t;
+DEFINE_XEN_GUEST_HANDLE(xen_kexec_segment_t);
+
+/*
+ * Load a kexec image into memory.
+ *
+ * Each segment of the image must reside in the memory region reserved
+ * for kexec (KEXEC_RANGE_MA_CRASH) and the entry point must be within
+ * the image.
+ *
+ * The caller is responsible for ensuring that multiple images do not
+ * overlap.
+ */
+#define KEXEC_CMD_kexec_load_v2        4
+typedef struct xen_kexec_load_v2 {
+    uint8_t type;  /* One of KEXEC_TYPE_* */
+    uint8_t class; /* One of KEXEC_CLASS_* */
+    uint32_t nr_segments;
+    XEN_GUEST_HANDLE(xen_kexec_segment_t) segments;
+    uint64_t entry_maddr; /* image entry point machine address. */
+} xen_kexec_load_v2_t;
+DEFINE_XEN_GUEST_HANDLE(xen_kexec_load_v2_t);
+
+/*
+ * Unload a kexec image.
+ *
+ * Type must be one of KEXEC_TYPE_DEFAULT or KEXEC_TYPE_CRASH.
+ */
+#define KEXEC_CMD_kexec_unload_v2      5
+typedef struct xen_kexec_unload_v2 {
+    uint32_t type;
+} xen_kexec_unload_v2_t;
+DEFINE_XEN_GUEST_HANDLE(xen_kexec_unload_v2_t);
+
 #endif /* _XEN_PUBLIC_KEXEC_H */
 
 /*
diff --git a/xen/include/xen/kexec.h b/xen/include/xen/kexec.h
index b3ca8b0..5c13af8 100644
--- a/xen/include/xen/kexec.h
+++ b/xen/include/xen/kexec.h
@@ -27,6 +27,11 @@ void set_kexec_crash_area_size(u64 system_ram);
 #define KEXEC_IMAGE_CRASH_BASE   2
 #define KEXEC_IMAGE_NR           4
 
+struct kexec_image {
+    uint8_t class;
+    unsigned long entry_maddr;
+};
+
 enum low_crashinfo {
     LOW_CRASHINFO_INVALID = 0,
     LOW_CRASHINFO_NONE = 1,
@@ -40,11 +45,11 @@ extern enum low_crashinfo low_crashinfo_mode;
 extern paddr_t crashinfo_maxaddr_bits;
 void kexec_early_calculations(void);
 
-int machine_kexec_load(int type, int slot, xen_kexec_image_t *image);
-void machine_kexec_unload(int type, int slot, xen_kexec_image_t *image);
+int machine_kexec_load(struct kexec_image *image);
+void machine_kexec_unload(struct kexec_image *image);
 void machine_kexec_reserved(xen_kexec_reserve_t *reservation);
-void machine_reboot_kexec(xen_kexec_image_t *image);
-void machine_kexec(xen_kexec_image_t *image);
+void machine_reboot_kexec(struct kexec_image *image);
+void machine_kexec(struct kexec_image *image);
 void kexec_crash(void);
 void kexec_crash_save_cpu(void);
 crash_xen_info_t *kexec_crash_save_info(void);
@@ -52,10 +57,7 @@ void machine_crash_shutdown(void);
 int machine_kexec_get(xen_kexec_range_t *range);
 int machine_kexec_get_xen(xen_kexec_range_t *range);
 
-void compat_machine_kexec(unsigned long rnk,
-                          unsigned long indirection_page,
-                          unsigned long *page_list,
-                          unsigned long start_address);
+void compat_machine_kexec(unsigned long start_address);
 
 /* vmcoreinfo stuff */
 #define VMCOREINFO_BYTES           (4096)
-- 
1.7.2.5



