[PATCH 1/3] kexec: extend hypercall with improved load/unload ops

Daniel Kiper daniel.kiper at oracle.com
Thu Jan 17 07:28:03 EST 2013


On Wed, Jan 16, 2013 at 04:29:04PM +0000, David Vrabel wrote:
> From: David Vrabel <david.vrabel at citrix.com>
>
> In the existing kexec hypercall, the load and unload ops depend on
> internals of the Linux kernel (the page list and code page provided by
> the kernel).  The code page is used to transition between Xen context
> and the image so using kernel code doesn't make sense and will not
> work for PVH guests.
>
> Add replacement KEXEC_CMD_kexec_load_v2 and KEXEC_CMD_kexec_unload_v2
> ops that no longer require a code page to be provided by the guest --
> Xen now provides the code for calling the image directly.
>
> The load_v2 op looks similar to the Linux kexec_load system call and
> allows the guest to provide the image data to be loaded into the crash
> kernel memory region.  The guest may also specify whether the image is
> 64-bit or 32-bit.
>
> The toolstack can now load images without kernel involvement.  This is
> required for supporting kexec of crash kernels from PV-ops kernels.
>
> Note: This also changes the behaviour of the kexec op when an image is
> loaded with the old ABI.  The code page will no longer be used, which
> may result in incorrect behaviour in non-Linux guests.  This allowed
> the code to be simpler and support for the old ABI is being removed in
> a subsequent patch anyway.
>
> [ This is a prototype and has the following limitations:
>
> - no compat implementation for kexec_load_v2.
> - 64-bit images are not supported.
> - 32-bit images are called with paging enabled (Linux starts 32-bit
>   images with paging disabled). ]
>
> Signed-off-by: David Vrabel <david.vrabel at citrix.com>
> ---
>  xen/arch/x86/machine_kexec.c       |   73 ++------------
>  xen/arch/x86/x86_64/compat_kexec.S |   25 -----
>  xen/common/kexec.c                 |  204 ++++++++++++++++++++++++++++++++++--
>  xen/include/public/kexec.h         |   44 ++++++++
>  xen/include/xen/kexec.h            |   18 ++--
>  5 files changed, 255 insertions(+), 109 deletions(-)
>
> diff --git a/xen/arch/x86/machine_kexec.c b/xen/arch/x86/machine_kexec.c
> index 8191ef1..7131d20 100644
> --- a/xen/arch/x86/machine_kexec.c
> +++ b/xen/arch/x86/machine_kexec.c
> @@ -12,62 +12,16 @@
>  #include <asm/fixmap.h>
>  #include <asm/hpet.h>
>
> -typedef void (*relocate_new_kernel_t)(
> -                unsigned long indirection_page,
> -                unsigned long *page_list,
> -                unsigned long start_address,
> -                unsigned int preserve_context);
> -
> -int machine_kexec_load(int type, int slot, xen_kexec_image_t *image)
> +int machine_kexec_load(struct kexec_image *image)
>  {
> -    unsigned long prev_ma = 0;
> -    int fix_base = FIX_KEXEC_BASE_0 + (slot * (KEXEC_XEN_NO_PAGES >> 1));
> -    int k;
> -
> -    /* setup fixmap to point to our pages and record the virtual address
> -     * in every odd index in page_list[].
> -     */
> -
> -    for ( k = 0; k < KEXEC_XEN_NO_PAGES; k++ )
> -    {
> -        if ( (k & 1) == 0 )
> -        {
> -            /* Even pages: machine address. */
> -            prev_ma = image->page_list[k];
> -        }
> -        else
> -        {
> -            /* Odd pages: va for previous ma. */
> -            if ( is_pv_32on64_domain(dom0) )
> -            {
> -                /*
> -                 * The compatability bounce code sets up a page table
> -                 * with a 1-1 mapping of the first 1G of memory so
> -                 * VA==PA here.
> -                 *
> -                 * This Linux purgatory code still sets up separate
> -                 * high and low mappings on the control page (entries
> -                 * 0 and 1) but it is harmless if they are equal since
> -                 * that PT is not live at the time.
> -                 */
> -                image->page_list[k] = prev_ma;
> -            }
> -            else
> -            {
> -                set_fixmap(fix_base + (k >> 1), prev_ma);
> -                image->page_list[k] = fix_to_virt(fix_base + (k >> 1));
> -            }
> -        }
> -    }
> -
>      return 0;
>  }
>
> -void machine_kexec_unload(int type, int slot, xen_kexec_image_t *image)
> +void machine_kexec_unload(struct kexec_image *image)
>  {
>  }
>
> -void machine_reboot_kexec(xen_kexec_image_t *image)
> +void machine_reboot_kexec(struct kexec_image *image)
>  {
>      BUG_ON(smp_processor_id() != 0);
>      smp_send_stop();
> @@ -75,7 +29,7 @@ void machine_reboot_kexec(xen_kexec_image_t *image)
>      BUG();
>  }
>
> -void machine_kexec(xen_kexec_image_t *image)
> +void machine_kexec(struct kexec_image *image)
>  {
>      struct desc_ptr gdt_desc = {
>          .base = (unsigned long)(boot_cpu_gdt_table - FIRST_RESERVED_GDT_ENTRY),
> @@ -116,22 +70,11 @@ void machine_kexec(xen_kexec_image_t *image)
>       */
>      asm volatile ( "lgdt %0" : : "m" (gdt_desc) );
>
> -    if ( is_pv_32on64_domain(dom0) )
> -    {
> -        compat_machine_kexec(image->page_list[1],
> -                             image->indirection_page,
> -                             image->page_list,
> -                             image->start_address);
> -    }
> +    if ( image->class == KEXEC_CLASS_32 )
> +        compat_machine_kexec(image->entry_maddr);

Why do you need that?

>      else
> -    {
> -        relocate_new_kernel_t rnk;
> -
> -        rnk = (relocate_new_kernel_t) image->page_list[1];
> -        (*rnk)(image->indirection_page, image->page_list,
> -               image->start_address,
> -               0 /* preserve_context */);
> -    }
> +        /* FIXME */
> +        panic("KEXEC_CLASS_64 not yet supported\n");
>  }
>
>  int machine_kexec_get(xen_kexec_range_t *range)
> diff --git a/xen/arch/x86/x86_64/compat_kexec.S b/xen/arch/x86/x86_64/compat_kexec.S
> index fc92af9..d853231 100644
> --- a/xen/arch/x86/x86_64/compat_kexec.S
> +++ b/xen/arch/x86/x86_64/compat_kexec.S
> @@ -36,21 +36,6 @@
>  ENTRY(compat_machine_kexec)
>          /* x86/64                        x86/32  */
>          /* %rdi - relocate_new_kernel_t  CALL    */
> -        /* %rsi - indirection page       4(%esp) */
> -        /* %rdx - page_list              8(%esp) */
> -        /* %rcx - start address         12(%esp) */
> -        /*        cpu has pae           16(%esp) */
> -
> -        /* Shim the 64 bit page_list into a 32 bit page_list. */
> -        mov $12,%r9
> -        lea compat_page_list(%rip), %rbx
> -1:      dec %r9
> -        movl (%rdx,%r9,8),%eax
> -        movl %eax,(%rbx,%r9,4)
> -        test %r9,%r9
> -        jnz 1b
> -
> -        RELOCATE_SYM(compat_page_list,%rdx)
>
>          /* Relocate compatibility mode entry point address. */
>          RELOCATE_MEM(compatibility_mode_far,%eax)
> @@ -118,12 +103,6 @@ compatibility_mode:
>          movl %eax, %gs
>          movl %eax, %ss
>
> -        /* Push arguments onto stack. */
> -        pushl $0   /* 20(%esp) - preserve context */
> -        pushl $1   /* 16(%esp) - cpu has pae */
> -        pushl %ecx /* 12(%esp) - start address */
> -        pushl %edx /*  8(%esp) - page list */
> -        pushl %esi /*  4(%esp) - indirection page */
>          pushl %edi /*  0(%esp) - CALL */
>
>          /* Disable paging and therefore leave 64 bit mode. */
> @@ -153,10 +132,6 @@ compatibility_mode:
>          ud2
>
>          .data
> -        .align 4
> -compat_page_list:
> -        .fill 12,4,0
> -
>          .align 32,0
>
>          /*
> diff --git a/xen/common/kexec.c b/xen/common/kexec.c
> index 25ebd6a..56bf8b4 100644
> --- a/xen/common/kexec.c
> +++ b/xen/common/kexec.c
> @@ -45,7 +45,7 @@ static Elf_Note *xen_crash_note;
>
>  static cpumask_t crash_saved_cpus;
>
> -static xen_kexec_image_t kexec_image[KEXEC_IMAGE_NR];
> +static struct kexec_image kexec_image[KEXEC_IMAGE_NR];
>
>  #define KEXEC_FLAG_DEFAULT_POS   (KEXEC_IMAGE_NR + 0)
>  #define KEXEC_FLAG_CRASH_POS     (KEXEC_IMAGE_NR + 1)
> @@ -316,7 +316,7 @@ void kexec_crash(void)
>
>  static long kexec_reboot(void *_image)
>  {
> -    xen_kexec_image_t *image = _image;
> +    struct kexec_image *image = _image;
>
>      kexecing = TRUE;
>
> @@ -732,9 +732,19 @@ static void crash_save_vmcoreinfo(void)
>  #endif
>  }
>
> +static void kexec_unload_slot(int slot)
> +{
> +    struct kexec_image *image = &kexec_image[slot];
> +
> +    if ( test_and_clear_bit(slot, &kexec_flags) )
> +    {
> +        machine_kexec_unload(image);
> +    }
> +}
> +
>  static int kexec_load_unload_internal(unsigned long op, xen_kexec_load_t *load)
>  {
> -    xen_kexec_image_t *image;
> +    struct kexec_image *image;
>      int base, bit, pos;
>      int ret = 0;
>
> @@ -750,9 +760,13 @@ static int kexec_load_unload_internal(unsigned long op, xen_kexec_load_t *load)
>
>          BUG_ON(test_bit((base + !pos), &kexec_flags)); /* must be free */
>
> -        memcpy(image, &load->image, sizeof(*image));
> +        if ( is_pv_32on64_domain(dom0) )
> +            image->class = KEXEC_CLASS_32;
> +        else
> +            image->class = KEXEC_CLASS_64;

Ditto.

> +        image->entry_maddr = load->image.start_address;
>
> -        if ( !(ret = machine_kexec_load(load->type, base + !pos, image)) )
> +        if ( !(ret = machine_kexec_load(image)) )
>          {
>              /* Set image present bit */
>              set_bit((base + !pos), &kexec_flags);
> @@ -767,11 +781,7 @@ static int kexec_load_unload_internal(unsigned long op, xen_kexec_load_t *load)
>      /* Unload the old image if present and load successful */
>      if ( ret == 0 && !test_bit(KEXEC_FLAG_IN_PROGRESS, &kexec_flags) )
>      {
> -        if ( test_and_clear_bit((base + pos), &kexec_flags) )
> -        {
> -            image = &kexec_image[base + pos];
> -            machine_kexec_unload(load->type, base + pos, image);
> -        }
> +        kexec_unload_slot(base + pos);
>      }
>
>      return ret;
> @@ -816,7 +826,7 @@ static int kexec_load_unload_compat(unsigned long op,
>  static int kexec_exec(XEN_GUEST_HANDLE_PARAM(void) uarg)
>  {
>      xen_kexec_exec_t exec;
> -    xen_kexec_image_t *image;
> +    struct kexec_image *image;
>      int base, bit, pos, ret = -EINVAL;
>
>      if ( unlikely(copy_from_guest(&exec, uarg, 1)) )
> @@ -845,6 +855,162 @@ static int kexec_exec(XEN_GUEST_HANDLE_PARAM(void) uarg)
>      return -EINVAL; /* never reached */
>  }
>
> +static int kexec_load_segments(xen_kexec_load_v2_t *load)
> +{
> +    unsigned s;
> +    bool_t valid_entry = 0;
> +
> +    for ( s = 0; s < load->nr_segments; s++ )
> +    {
> +        xen_kexec_segment_t seg;
> +        unsigned long to_copy;
> +        unsigned long src_offset;
> +        unsigned long dest;
> +
> +        if ( copy_from_guest_offset(&seg, load->segments, s, 1) )
> +            return -EFAULT;
> +
> +        /* Caller is responsible for ensuring the crash space is
> +           shared between multiple users of the load call.  Xen just
> +           validates the load is to somewhere within the region. */
> +
> +        if ( seg.dest_maddr < kexec_crash_area.start
> +             || seg.dest_maddr + seg.size > kexec_crash_area.start + kexec_crash_area.size)
> +            return -EINVAL;

This way you are breaking regular kexec support, which
does not use a preallocated area. As I said earlier, you
should use the kexec code from the Linux kernel (with the
relevant changes). It has everything that is needed, so you
do not need to reinvent the wheel.

> +
> +        if ( load->entry_maddr >= seg.dest_maddr
> +             && load->entry_maddr < seg.dest_maddr + seg.size)
> +            valid_entry = 1;
> +
> +        to_copy = seg.size;
> +        src_offset = 0;
> +        dest = seg.dest_maddr;
> +
> +        while ( to_copy )
> +        {
> +            unsigned long dest_mfn;
> +            size_t dest_off;
> +            void *dest_va;
> +            size_t size;
> +
> +            dest_mfn = dest >> PAGE_SHIFT;
> +            dest_off = dest & ~PAGE_MASK;
> +
> +            size = min(PAGE_SIZE - dest_off, to_copy);
> +
> +            dest_va = vmap(&dest_mfn, 1);
> +            if ( dest_va == NULL )
> +                return -EINVAL;
> +
> +            copy_from_guest_offset(dest_va, seg.buf, src_offset, size);
> +
> +            vunmap(dest_va);
> +
> +            to_copy -= size;
> +            src_offset += size;
> +            dest += size;
> +        }
> +    }
> +
> +    /* Entry point is somewhere in a loaded segment? */
> +    if ( !valid_entry )
> +        return -EINVAL;
> +
> +    return 0;
> +}
> +
> +static int slot_to_pos_bit(int slot)
> +{
> +    return KEXEC_IMAGE_NR + slot / 2;
> +}
> +
> +static int kexec_load_slot(int slot, xen_kexec_load_v2_t *load)
> +{
> +    struct kexec_image *image = &kexec_image[slot];
> +    int ret;
> +
> +    BUG_ON(test_bit(slot, &kexec_flags)); /* must be free */
> +
> +    /* validate and load each segment. */
> +    ret = kexec_load_segments(load);
> +    if ( ret < 0 )
> +        return ret;
> +
> +    image->entry_maddr = load->entry_maddr;
> +
> +    ret = machine_kexec_load(image);
> +    if ( ret < 0 )
> +        return ret;
> +
> +    /* Set image present bit */
> +    set_bit(slot, &kexec_flags);
> +
> +    /* Make new image the active one */
> +    change_bit(slot_to_pos_bit(slot), &kexec_flags);
> +
> +    crash_save_vmcoreinfo();
> +
> +    return ret;
> +}
> +
> +
> +static int kexec_load_v2(XEN_GUEST_HANDLE_PARAM(void) uarg)
> +{
> +    xen_kexec_load_v2_t load;
> +    int base, bit, pos, slot;
> +    struct kexec_image *image;
> +    int ret;
> +
> +    if ( unlikely(copy_from_guest(&load, uarg, 1)) )
> +        return -EFAULT;
> +
> +    if ( kexec_load_get_bits(load.type, &base, &bit) )
> +        return -EINVAL;
> +
> +    pos = (test_bit(bit, &kexec_flags) != 0);
> +    slot = base + !pos;
> +    image = &kexec_image[slot];
> +
> +    switch ( load.class )
> +    {
> +    case KEXEC_CLASS_32:
> +    case KEXEC_CLASS_64:
> +        image->class = load.class;
> +        break;
> +    default:
> +        return -EINVAL;
> +    }
> +
> +    ret = kexec_load_slot(slot, &load);
> +
> +    /* Unload the old image if present and load successful */
> +    if ( ret == 0 && !test_bit(KEXEC_FLAG_IN_PROGRESS, &kexec_flags) )
> +    {
> +        kexec_unload_slot(slot ^ 0x1);
> +    }
> +
> +    return ret;
> +}
> +
> +static int kexec_unload_v2(XEN_GUEST_HANDLE_PARAM(void) uarg)
> +{
> +    xen_kexec_unload_v2_t unload;
> +    int base, bit, pos, slot;
> +
> +    if ( unlikely(copy_from_guest(&unload, uarg, 1)) )
> +        return -EFAULT;
> +
> +    if ( kexec_load_get_bits(unload.type, &base, &bit) )
> +        return -EINVAL;
> +
> +    pos = (test_bit(bit, &kexec_flags) != 0);
> +    slot = base + !pos;
> +
> +    kexec_unload_slot(slot);
> +
> +    return 0;
> +}
> +
>  static int do_kexec_op_internal(unsigned long op,
>                                  XEN_GUEST_HANDLE_PARAM(void) uarg,
>                                  bool_t compat)
> @@ -882,6 +1048,22 @@ static int do_kexec_op_internal(unsigned long op,
>      case KEXEC_CMD_kexec:
>          ret = kexec_exec(uarg);
>          break;
> +    case KEXEC_CMD_kexec_load_v2:
> +        spin_lock_irqsave(&kexec_lock, flags);
> +        if ( !test_bit(KEXEC_FLAG_IN_PROGRESS, &kexec_flags) )
> +            ret = kexec_load_v2(uarg);
> +        else
> +            ret = -EBUSY;
> +        spin_unlock_irqrestore(&kexec_lock, flags);
> +        break;
> +    case KEXEC_CMD_kexec_unload_v2:
> +        spin_lock_irqsave(&kexec_lock, flags);
> +        if ( !test_bit(KEXEC_FLAG_IN_PROGRESS, &kexec_flags) )
> +            ret = kexec_unload_v2(uarg);
> +        else
> +            ret = -EBUSY;
> +        spin_unlock_irqrestore(&kexec_lock, flags);
> +        break;
>      }
>
>      return ret;
> diff --git a/xen/include/public/kexec.h b/xen/include/public/kexec.h
> index 61a8d7d..4b7d637 100644
> --- a/xen/include/public/kexec.h
> +++ b/xen/include/public/kexec.h
> @@ -83,6 +83,8 @@
>  #define KEXEC_TYPE_DEFAULT 0
>  #define KEXEC_TYPE_CRASH   1
>
> +#define KEXEC_CLASS_32   1 /* 32-bit image. */
> +#define KEXEC_CLASS_64   2 /* 64-bit image. */

???

>
>  /* The kexec implementation for Xen allows the user to load two
>   * types of kernels, KEXEC_TYPE_DEFAULT and KEXEC_TYPE_CRASH.
> @@ -152,6 +154,48 @@ typedef struct xen_kexec_range {
>      unsigned long start;
>  } xen_kexec_range_t;
>
> +/*
> + * A contiguous chunk of a kexec image and its destination machine
> + * address.
> + */
> +typedef struct xen_kexec_segment {
> +    XEN_GUEST_HANDLE(const_void) buf;
> +    uint32_t size;
> +    uint64_t dest_maddr;
> +} xen_kexec_segment_t;
> +DEFINE_XEN_GUEST_HANDLE(xen_kexec_segment_t);
> +
> +/*
> + * Load a kexec image into memory.
> + *
> + * Each segment of the image must reside in the memory region reserved
> + * for kexec (KEXEC_RANGE_MA_CRASH) and the entry point must be within
> + * the image.
> + *
> + * The caller is responsible for ensuring that multiple images do not
> + * overlap.
> + */
> +#define KEXEC_CMD_kexec_load_v2        4
> +typedef struct xen_kexec_load_v2 {
> +    uint8_t type;  /* One of KEXEC_TYPE_* */
> +    uint8_t class; /* One of KEXEC_CLASS_* */

Why not use a single member called flags (uint32_t or uint64_t)?
That way you could easily add new flags in the future.

Daniel



More information about the kexec mailing list