[PATCH v3 06/11] x86/xen: Add i386 kexec/kdump implementation

Daniel Kiper daniel.kiper at oracle.com
Wed Dec 26 21:18:55 EST 2012


Add i386 kexec/kdump implementation.

v2 - suggestions/fixes:
   - allocate transition page table pages below 4 GiB
     (suggested by Jan Beulich).
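
   The 4 GiB limit is expressed through the address_bits argument of
   xen_create_contiguous_region(). A minimal sketch of how a caller might
   request such a page via the helper added below (a sketch only; the
   hooking of the helper into kimage allocation happens elsewhere in this
   series; on i386, a limit of ULONG_MAX yields address_bits ==
   BITS_PER_LONG == 32, i.e. a machine address below 4 GiB):

	struct page *page;

	/* One page (order 0) whose machine address fits in 32 bits. */
	page = mf_kexec_kimage_alloc_pages(GFP_KERNEL, 0, ULONG_MAX);
	if (!page)
		return -ENOMEM;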

Signed-off-by: Daniel Kiper <daniel.kiper at oracle.com>
---
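
Note: xen_relocate_kernel maps the control page twice in the transition
page table, once at its kernel virtual address and once at its machine
address (identity mapped), so execution can continue across the %cr3
switch. Roughly (PAE layout, hence the two PMD/PTE pairs, because the
two addresses may fall under different page directory entries):

	pgd -+- pmd0 -- pte0 -- control page at XK_VA_CONTROL_PAGE
	     `- pmd1 -- pte1 -- control page at XK_MA_CONTROL_PAGE (1:1)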
 arch/x86/xen/machine_kexec_32.c   |  226 ++++++++++++++++++++++++++
 arch/x86/xen/relocate_kernel_32.S |  323 +++++++++++++++++++++++++++++++++++++
 2 files changed, 549 insertions(+), 0 deletions(-)
 create mode 100644 arch/x86/xen/machine_kexec_32.c
 create mode 100644 arch/x86/xen/relocate_kernel_32.S
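
For reference, the copy loop in relocate_kernel_32.S walks the standard
kexec indirection page format. A rough C rendering of that walk (a
sketch only, not the in-kernel code; the IND_* flags come from
include/linux/kexec.h and match the 0x1/0x2/0x4/0x8 tests in the
assembly; 'head' stands for image->head):

	static void walk_indirection(unsigned long head)
	{
		unsigned long entry = head, dest = 0;
		unsigned long *ind = NULL;

		while (!(entry & IND_DONE)) {			/* 0x4: stop. */
			unsigned long addr = entry & PAGE_MASK;

			if (entry & IND_DESTINATION)		/* 0x1: set copy dest. */
				dest = addr;
			else if (entry & IND_INDIRECTION)	/* 0x2: chain to next table. */
				ind = (unsigned long *)addr;
			else if (entry & IND_SOURCE) {		/* 0x8: copy one page. */
				memcpy((void *)dest, (void *)addr, PAGE_SIZE);
				dest += PAGE_SIZE;
			}
			entry = *ind++;
		}
	}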

diff --git a/arch/x86/xen/machine_kexec_32.c b/arch/x86/xen/machine_kexec_32.c
new file mode 100644
index 0000000..011a5e8
--- /dev/null
+++ b/arch/x86/xen/machine_kexec_32.c
@@ -0,0 +1,226 @@
+/*
+ * Copyright (c) 2011 Daniel Kiper
+ * Copyright (c) 2012 Daniel Kiper, Oracle Corporation
+ *
+ * kexec/kdump implementation for Xen was written by Daniel Kiper.
+ * Initial work on it was sponsored by Google under Google Summer
+ * of Code 2011 program and Citrix. Konrad Rzeszutek Wilk from Oracle
+ * was the mentor for this project.
+ *
+ * Some ideas are taken from:
+ *   - native kexec/kdump implementation,
+ *   - kexec/kdump implementation for Xen Linux Kernel Ver. 2.6.18,
+ *   - PV-GRUB.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/kexec.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+
+#include <xen/xen.h>
+#include <xen/xen-ops.h>
+
+#include <asm/xen/hypercall.h>
+#include <asm/xen/kexec.h>
+#include <asm/xen/page.h>
+
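+/* Machine (host) address of a kernel virtual address. */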
+#define __ma(vaddr)	(virt_to_machine(vaddr).maddr)
+
+static void *alloc_pgtable_page(struct kimage *image)
+{
+	struct page *page;
+
+	page = firmware_kimage_alloc_control_pages(image, 0);
+
+	if (!page || !page_address(page))
+		return NULL;
+
+	memset(page_address(page), 0, PAGE_SIZE);
+
+	return page_address(page);
+}
+
+static int alloc_transition_pgtable(struct kimage *image)
+{
+	image->arch.pgd = alloc_pgtable_page(image);
+
+	if (!image->arch.pgd)
+		return -ENOMEM;
+
+	image->arch.pmd0 = alloc_pgtable_page(image);
+
+	if (!image->arch.pmd0)
+		return -ENOMEM;
+
+	image->arch.pmd1 = alloc_pgtable_page(image);
+
+	if (!image->arch.pmd1)
+		return -ENOMEM;
+
+	image->arch.pte0 = alloc_pgtable_page(image);
+
+	if (!image->arch.pte0)
+		return -ENOMEM;
+
+	image->arch.pte1 = alloc_pgtable_page(image);
+
+	if (!image->arch.pte1)
+		return -ENOMEM;
+
+	return 0;
+}
+
+struct page *mf_kexec_kimage_alloc_pages(gfp_t gfp_mask,
+						unsigned int order,
+						unsigned long limit)
+{
+	struct page *pages;
+	unsigned int address_bits, i;
+
+	pages = alloc_pages(gfp_mask, order);
+
+	if (!pages)
+		return NULL;
+
+	address_bits = (limit == ULONG_MAX) ? BITS_PER_LONG : ilog2(limit);
+
+	/* Relocate set of pages below given limit. */
+	if (xen_create_contiguous_region((unsigned long)page_address(pages),
+							order, address_bits)) {
+		__free_pages(pages, order);
+		return NULL;
+	}
+
+	BUG_ON(PagePrivate(pages));
+
+	pages->mapping = NULL;
+	set_page_private(pages, order);
+
+	for (i = 0; i < (1 << order); ++i)
+		SetPageReserved(pages + i);
+
+	return pages;
+}
+
+void mf_kexec_kimage_free_pages(struct page *page)
+{
+	unsigned int i, order;
+
+	order = page_private(page);
+
+	for (i = 0; i < (1 << order); ++i)
+		ClearPageReserved(page + i);
+
+	xen_destroy_contiguous_region((unsigned long)page_address(page), order);
+	__free_pages(page, order);
+}
+
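+/*
+ * kexec-tools and the hypervisor operate on machine addresses, so the
+ * hooks below translate between the kernel's pseudo-physical address
+ * space and the machine address space.
+ */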
+unsigned long mf_kexec_page_to_pfn(struct page *page)
+{
+	return pfn_to_mfn(page_to_pfn(page));
+}
+
+struct page *mf_kexec_pfn_to_page(unsigned long mfn)
+{
+	return pfn_to_page(mfn_to_pfn(mfn));
+}
+
+unsigned long mf_kexec_virt_to_phys(volatile void *address)
+{
+	return virt_to_machine(address).maddr;
+}
+
+void *mf_kexec_phys_to_virt(unsigned long address)
+{
+	return phys_to_virt(machine_to_phys(XMADDR(address)).paddr);
+}
+
+int mf_kexec_prepare(struct kimage *image)
+{
+#ifdef CONFIG_KEXEC_JUMP
+	if (image->preserve_context) {
+		pr_info_once("kexec: Context preservation is not supported in Xen domains.\n");
+		return -ENOSYS;
+	}
+#endif
+
+	return alloc_transition_pgtable(image);
+}
+
+int mf_kexec_load(struct kimage *image)
+{
+	void *control_page;
+	struct xen_kexec_load xkl = {};
+
+	/* Image is unloaded, nothing to do. */
+	if (!image)
+		return 0;
+
+	control_page = page_address(image->control_code_page);
+	memcpy(control_page, xen_relocate_kernel, xen_kexec_control_code_size);
+
+	xkl.type = image->type;
+	xkl.image.page_list[XK_MA_CONTROL_PAGE] = __ma(control_page);
+	xkl.image.page_list[XK_MA_TABLE_PAGE] = 0; /* Unused. */
+	xkl.image.page_list[XK_MA_PGD_PAGE] = __ma(image->arch.pgd);
+	xkl.image.page_list[XK_MA_PUD0_PAGE] = 0; /* Unused. */
+	xkl.image.page_list[XK_MA_PUD1_PAGE] = 0; /* Unused. */
+	xkl.image.page_list[XK_MA_PMD0_PAGE] = __ma(image->arch.pmd0);
+	xkl.image.page_list[XK_MA_PMD1_PAGE] = __ma(image->arch.pmd1);
+	xkl.image.page_list[XK_MA_PTE0_PAGE] = __ma(image->arch.pte0);
+	xkl.image.page_list[XK_MA_PTE1_PAGE] = __ma(image->arch.pte1);
+	xkl.image.indirection_page = image->head;
+	xkl.image.start_address = image->start;
+
+	return HYPERVISOR_kexec_op(KEXEC_CMD_kexec_load, &xkl);
+}
+
+void mf_kexec_cleanup(struct kimage *image)
+{
+}
+
+void mf_kexec_unload(struct kimage *image)
+{
+	int rc;
+	struct xen_kexec_load xkl = {};
+
+	if (!image)
+		return;
+
+	xkl.type = image->type;
+	rc = HYPERVISOR_kexec_op(KEXEC_CMD_kexec_unload, &xkl);
+
+	WARN(rc, "kexec: %s: HYPERVISOR_kexec_op(): %i\n", __func__, rc);
+}
+
+void mf_kexec_shutdown(void)
+{
+}
+
+void mf_kexec(struct kimage *image)
+{
+	int rc;
+	struct xen_kexec_exec xke = {};
+
+	xke.type = image->type;
+	rc = HYPERVISOR_kexec_op(KEXEC_CMD_kexec, &xke);
+
+	pr_emerg("kexec: %s: HYPERVISOR_kexec_op(): %i\n", __func__, rc);
+	BUG();
+}
diff --git a/arch/x86/xen/relocate_kernel_32.S b/arch/x86/xen/relocate_kernel_32.S
new file mode 100644
index 0000000..0e81830
--- /dev/null
+++ b/arch/x86/xen/relocate_kernel_32.S
@@ -0,0 +1,323 @@
+/*
+ * Copyright (c) 2002-2005 Eric Biederman <ebiederm at xmission.com>
+ * Copyright (c) 2011 Daniel Kiper
+ * Copyright (c) 2012 Daniel Kiper, Oracle Corporation
+ *
+ * kexec/kdump implementation for Xen was written by Daniel Kiper.
+ * Initial work on it was sponsored by Google under Google Summer
+ * of Code 2011 program and Citrix. Konrad Rzeszutek Wilk from Oracle
+ * was the mentor for this project.
+ *
+ * Some ideas are taken from:
+ *   - native kexec/kdump implementation,
+ *   - kexec/kdump implementation for Xen Linux Kernel Ver. 2.6.18,
+ *   - PV-GRUB.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <asm/cache.h>
+#include <asm/page_types.h>
+#include <asm/pgtable_types.h>
+#include <asm/processor-flags.h>
+
+#include <asm/xen/kexec.h>
+
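+/* Stack offsets of the arguments passed to xen_relocate_kernel(). */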
+#define ARG_INDIRECTION_PAGE	0x4
+#define ARG_PAGE_LIST		0x8
+#define ARG_START_ADDRESS	0xc
+
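+/* Byte offset of entry x in the 32-bit page_list array. */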
+#define PTR(x)	((x) << 2)
+
+	.text
+	.align	PAGE_SIZE
+	.globl	xen_kexec_control_code_size, xen_relocate_kernel
+
+xen_relocate_kernel:
+	/*
+	 * Must be relocatable PIC code callable as a C function.
+	 *
+	 * This function is called by Xen, but by this point the
+	 * hypervisor is dead. We are running on bare metal.
+	 *
+	 * Every machine address passed to this function through
+	 * page_list (e.g. XK_MA_CONTROL_PAGE) is established by
+	 * dom0 during the kexec load phase.
+	 *
+	 * Every virtual address passed to this function through page_list
+	 * (e.g. XK_VA_CONTROL_PAGE) is established by the hypervisor during
+	 * the HYPERVISOR_kexec_op(KEXEC_CMD_kexec_load) hypercall.
+	 *
+	 * 0x4(%esp) - indirection_page,
+	 * 0x8(%esp) - page_list,
+	 * 0xc(%esp) - start_address,
+	 * 0x10(%esp) - cpu_has_pae (ignored),
+	 * 0x14(%esp) - preserve_context (ignored).
+	 */
+
+	/* Zero out flags, and disable interrupts. */
+	pushl	$0
+	popfl
+
+	/* Get page_list address. */
+	movl	ARG_PAGE_LIST(%esp), %esi
+
+	/*
+	 * Map the control page at its virtual address
+	 * in the transition page table.
+	 */
+	movl	PTR(XK_VA_CONTROL_PAGE)(%esi), %eax
+
+	/* Get PGD address and PGD entry index. */
+	movl	PTR(XK_VA_PGD_PAGE)(%esi), %ebx
+	movl	%eax, %ecx
+	shrl	$PGDIR_SHIFT, %ecx
+	andl	$(PTRS_PER_PGD - 1), %ecx
+
+	/* Fill PGD entry with PMD0 reference. */
+	movl	PTR(XK_MA_PMD0_PAGE)(%esi), %edx
+	orl	$_PAGE_PRESENT, %edx
+	movl	%edx, (%ebx, %ecx, 8)
+
+	/* Get PMD0 address and PMD0 entry index. */
+	movl	PTR(XK_VA_PMD0_PAGE)(%esi), %ebx
+	movl	%eax, %ecx
+	shrl	$PMD_SHIFT, %ecx
+	andl	$(PTRS_PER_PMD - 1), %ecx
+
+	/* Fill PMD0 entry with PTE0 reference. */
+	movl	PTR(XK_MA_PTE0_PAGE)(%esi), %edx
+	orl	$_KERNPG_TABLE, %edx
+	movl	%edx, (%ebx, %ecx, 8)
+
+	/* Get PTE0 address and PTE0 entry index. */
+	movl	PTR(XK_VA_PTE0_PAGE)(%esi), %ebx
+	movl	%eax, %ecx
+	shrl	$PAGE_SHIFT, %ecx
+	andl	$(PTRS_PER_PTE - 1), %ecx
+
+	/* Fill PTE0 entry with control page reference. */
+	movl	PTR(XK_MA_CONTROL_PAGE)(%esi), %edx
+	orl	$__PAGE_KERNEL_EXEC, %edx
+	movl	%edx, (%ebx, %ecx, 8)
+
+	/*
+	 * Identity map the control page at its machine address
+	 * in the transition page table.
+	 */
+	movl	PTR(XK_MA_CONTROL_PAGE)(%esi), %eax
+
+	/* Get PGD address and PGD entry index. */
+	movl	PTR(XK_VA_PGD_PAGE)(%esi), %ebx
+	movl	%eax, %ecx
+	shrl	$PGDIR_SHIFT, %ecx
+	andl	$(PTRS_PER_PGD - 1), %ecx
+
+	/* Fill PGD entry with PMD1 reference. */
+	movl	PTR(XK_MA_PMD1_PAGE)(%esi), %edx
+	orl	$_PAGE_PRESENT, %edx
+	movl	%edx, (%ebx, %ecx, 8)
+
+	/* Get PMD1 address and PMD1 entry index. */
+	movl	PTR(XK_VA_PMD1_PAGE)(%esi), %ebx
+	movl	%eax, %ecx
+	shrl	$PMD_SHIFT, %ecx
+	andl	$(PTRS_PER_PMD - 1), %ecx
+
+	/* Fill PMD1 entry with PTE1 reference. */
+	movl	PTR(XK_MA_PTE1_PAGE)(%esi), %edx
+	orl	$_KERNPG_TABLE, %edx
+	movl	%edx, (%ebx, %ecx, 8)
+
+	/* Get PTE1 address and PTE1 entry index. */
+	movl	PTR(XK_VA_PTE1_PAGE)(%esi), %ebx
+	movl	%eax, %ecx
+	shrl	$PAGE_SHIFT, %ecx
+	andl	$(PTRS_PER_PTE - 1), %ecx
+
+	/* Fill PTE1 entry with control page reference. */
+	movl	PTR(XK_MA_CONTROL_PAGE)(%esi), %edx
+	orl	$__PAGE_KERNEL_EXEC, %edx
+	movl	%edx, (%ebx, %ecx, 8)
+
+	/*
+	 * Get the machine address of the control page now;
+	 * this is impossible after the page table switch.
+	 */
+	movl	PTR(XK_MA_CONTROL_PAGE)(%esi), %ebx
+
+	/* Get the machine address of the transition page table now too. */
+	movl	PTR(XK_MA_PGD_PAGE)(%esi), %ecx
+
+	/* Get start_address too. */
+	movl	ARG_START_ADDRESS(%esp), %edx
+
+	/* Get indirection_page address too. */
+	movl	ARG_INDIRECTION_PAGE(%esp), %edi
+
+	/* Switch to transition page table. */
+	movl	%ecx, %cr3
+
+	/* Load IDT. */
+	lidtl	(idt_48 - xen_relocate_kernel)(%ebx)
+
+	/* Load GDT. */
+	leal	(gdt - xen_relocate_kernel)(%ebx), %eax
+	movl	%eax, (gdt_48 - xen_relocate_kernel + 2)(%ebx)
+	lgdtl	(gdt_48 - xen_relocate_kernel)(%ebx)
+
+	/* Load data segment registers. */
+	movl	$(gdt_ds - gdt), %eax
+	movl	%eax, %ds
+	movl	%eax, %es
+	movl	%eax, %fs
+	movl	%eax, %gs
+	movl	%eax, %ss
+
+	/* Set up a new stack at the end of the control page (machine address). */
+	leal	PAGE_SIZE(%ebx), %esp
+
+	/* Store start_address on the stack. */
+	pushl	%edx
+
+	/* Jump to identity mapped page. */
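+	/* iretl will pop %eip (identity_mapped), %cs (gdt_cs) and %eflags (0). */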
+	pushl	$0
+	pushl	$(gdt_cs - gdt)
+	addl	$(identity_mapped - xen_relocate_kernel), %ebx
+	pushl	%ebx
+	iretl
+
+identity_mapped:
+	/*
+	 * Set %cr0 to a known state:
+	 *   - disable alignment check,
+	 *   - disable floating point emulation,
+	 *   - disable paging,
+	 *   - no task switch,
+	 *   - disable write protect,
+	 *   - enable protected mode.
+	 */
+	movl	%cr0, %eax
+	andl	$~(X86_CR0_AM | X86_CR0_EM | X86_CR0_PG | X86_CR0_TS | X86_CR0_WP), %eax
+	orl	$(X86_CR0_PE), %eax
+	movl	%eax, %cr0
+
+	/* Set %cr4 to a known state. */
+	xorl	%eax, %eax
+	movl	%eax, %cr4
+
+	jmp	1f
+
+1:
+	/* Flush the TLB (needed?). */
+	movl	%eax, %cr3
+
+	/* Do the copies. */
+	movl	%edi, %ecx	/* Put the indirection_page in %ecx. */
+	xorl	%edi, %edi
+	xorl	%esi, %esi
+	jmp	1f
+
+0:
+	/*
+	 * Top of loop: read another doubleword from the indirection page.
+	 * The indirection page is an array of source and destination
+	 * address pairs. If all pairs do not fit in one page, the last
+	 * entry of a given indirection page points to the next one.
+	 * Copying stops when the done indicator is found in the
+	 * indirection page.
+	 */
+	movl	(%ebx), %ecx
+	addl	$4, %ebx
+
+1:
+	testl	$0x1, %ecx	/* Is it a destination page? */
+	jz	2f
+
+	movl	%ecx, %edi
+	andl	$PAGE_MASK, %edi
+	jmp	0b
+
+2:
+	testl	$0x2, %ecx	/* Is it an indirection page? */
+	jz	2f
+
+	movl	%ecx, %ebx
+	andl	$PAGE_MASK, %ebx
+	jmp	0b
+
+2:
+	testl	$0x4, %ecx	/* Is it the done indicator? */
+	jz	2f
+	jmp	3f
+
+2:
+	testl	$0x8, %ecx	/* Is it the source indicator? */
+	jz	0b		/* Ignore it otherwise. */
+
+	movl	%ecx, %esi
+	andl	$PAGE_MASK, %esi
+	movl	$1024, %ecx
+
+	/* Copy page. */
+	rep	movsl
+	jmp	0b
+
+3:
+	/*
+	 * To be certain of avoiding problems with self-modifying code,
+	 * we need to execute a serializing instruction here. Flushing
+	 * the TLB by reloading %cr3 does that; it is handy and not
+	 * processor dependent.
+	 */
+	xorl	%eax, %eax
+	movl	%eax, %cr3
+
+	/*
+	 * Set all of the registers to known values.
+	 * Leave %esp alone.
+	 */
+	xorl	%ebx, %ebx
+	xorl	%ecx, %ecx
+	xorl	%edx, %edx
+	xorl	%esi, %esi
+	xorl	%edi, %edi
+	xorl	%ebp, %ebp
+
+	/* Jump to start_address. */
+	retl
+
+	.align	L1_CACHE_BYTES
+
+gdt:
+	.quad	0x0000000000000000	/* NULL descriptor. */
+
+gdt_cs:
+	.quad	0x00cf9a000000ffff	/* 4 GiB code segment at 0x00000000. */
+
+gdt_ds:
+	.quad	0x00cf92000000ffff	/* 4 GiB data segment at 0x00000000. */
+gdt_end:
+
+gdt_48:
+	.word	gdt_end - gdt - 1	/* GDT limit. */
+	.long	0			/* GDT base - filled in by code above. */
+
+idt_48:
+	.word	0			/* IDT limit. */
+	.long	0			/* IDT base. */
+
+xen_kexec_control_code_size:
+	.long	. - xen_relocate_kernel
-- 
1.5.6.5