[Xen-devel] incorrect layout of globals from head_64.S during kexec boot

Olaf Hering olaf at aepfle.de
Fri Jul 6 10:14:19 EDT 2012


On Fri, Jul 06, Olaf Hering wrote:

> I will cleanup my debug changes and post the output.

What I see is that the content of the uncompressed vmlinux is
apparently already corrupted after decompress().  After I made small
changes to arch/x86/boot/compressed/misc.c and arch/x86/kernel/head_64.S
the offset in memory changed from 0x2c to 0x8.

This could mean that the unzip code is broken, but this is rather
unlikely. The odd thing is, if the first kernel is forced to return
false in xen_hvm_platform() to disable the PVonHVM features then kexec
works ok.

Could it be that some code tweaks the stack content used by decompress()
in some odd way? But that would most likely lead to a crash, not to
unexpected uncompressing results.

I will study the code some more.


This is the readelf output from vmlinux:

Program Headers:
  Type           Offset   VirtAddr           PhysAddr           FileSiz  MemSiz   Flg Align
0 LOAD           0x200000 0xffffffff81000000 0x0000000001000000 0xa3b000 0xa3b000 R E 0x200000
1 LOAD           0xe00000 0xffffffff81c00000 0x0000000001c00000 0x05b0e8 0x05b0e8 RW  0x200000
2 LOAD           0x1000000 0x0000000000000000 0x0000000001c5c000 0x012c40 0x012c40 RW  0x200000
3 LOAD           0x106f000 0xffffffff81c6f000 0x0000000001c6f000 0x087000 0x702000 RWE 0x200000
4 NOTE           0x82d5bc 0xffffffff8162d5bc 0x000000000162d5bc 0x00017c 0x00017c     0x4

Dump of the Program Header sections:

#0
dd if=../O/x86_64-O-3.5/vmlinux bs=1 skip=$(( 0x200000 )) count=$(( 0xa3b000 )) | xxd -a -g 8 | head -n 12
0000000: 4c8b0d2940c1004c 8b152a40c1004c8b  L..)@..L..*@..L.
0000010: 1d2b40c1004c8b25 3440c1004c8b2d35  .+@..L.%4@..L.-5
0000020: 40c1004c8b353640 c1004c8b3d3740c1  @..L.56@..L.=7@.
0000030: 00488d2dc8ffffff 4881ed0000000148  .H.-....H......H
0000040: 89e825ffff1f0085 c00f855b01000048  ..%........[...H
0000050: 8d15aaffffff48b8 0000000080000000  ......H.........
0000060: 4839c20f83410100 0048012d90bfc000  H9...A...H.-....
0000070: 48012d09c8c00048 012d7acfc0004801  H.-....H.-z...H.
0000080: 2d7bcfc00048012d 64efc00048012d65  -{...H.-d...H.-e
0000090: efc00048012d36ff c000488d3d5fffff  ...H.-6...H.=_..
00000a0: ff4881e70000e0ff 4889f848c1e81e48  .H......H..H...H
00000b0: 25ff010000743148 8d956330c101488d  %....t1H..c0..H.

#1
dd if=../O/x86_64-O-3.5/vmlinux bs=1 skip=$(( 0xe00000 )) count=$(( 0x05b0e8 )) | xxd -a -g 8 | head -n 12
0000000: 8044c181ffffffff 60acc181ffffffff  .D......`.......
0000010: 0000000000000000 0000000001000010  ................
0000020: ffffffffffffffff c0d70481ffffffff  ................
0000030: 0000000000000000 0000000000000000  ................
*
0002000: 48c7c0600000000f 05c3cccccccccccc  H..`............
0002010: cccccccccccccccc cccccccccccccccc  ................
0002020: cccccccccccccccc cccccccccccccccc  ................
0002030: cccccccccccccccc cccccccccccccccc  ................
0002040: cccccccccccccccc cccccccccccccccc  ................
0002050: cccccccccccccccc cccccccccccccccc  ................
0002060: cccccccccccccccc cccccccccccccccc  ................

#2
 dd if=../O/x86_64-O-3.5/vmlinux bs=1 skip=$(( 0x1000000 )) count=$(( 0x012c40 )) | xxd -a -g 8 | head -n 12
0000000: 0000000000000000 0000000000000000  ................
*
0004000: 0000000000000000 ffff0000009bcf00  ................
0004010: ffff0000009baf00 ffff00000093cf00  ................
0004020: ffff000000fbcf00 ffff000000f3cf00  ................
0004030: ffff000000fbaf00 0000000000000000  ................
0004040: 0000000000000000 0000000000000000  ................
*
000be80: ffffffff00000000 0000000000000000  ................
000be90: 0000000000000000 0000000000000000  ................
*
000bf00: ffffffffffffffff ffffffffffffffff  ................

#3
dd if=../O/x86_64-O-3.5/vmlinux bs=1 skip=$(( 0x106f000 )) count=$(( 0x087000 )) | xxd -a -g 8 | head -n 12
0000000: 6a006a00e9170100 006a006a01e90e01  j.j......j.j....
0000010: 00006a006a02e905 0100006a006a03e9  ..j.j......j.j..
0000020: fc0000006a006a04 e9f30000006a006a  ....j.j......j.j
0000030: 05e9ea0000006a00 6a06e9e10000006a  ......j.j......j
0000040: 006a07e9d8000000 66906a08e9cf0000  .j......f.j.....
0000050: 006a006a09e9c600 000066906a0ae9bd  .j.j......f.j...
0000060: 00000066906a0be9 b400000066906a0c  ...f.j......f.j.
0000070: e9ab00000066906a 0de9a20000006690  .....f.j......f.
0000080: 6a0ee9990000006a 006a0fe990000000  j......j.j......
0000090: 6a006a10e9870000 0066906a11e97e00  j.j......f.j..~.
00000a0: 00006a006a12e975 0000006a006a13e9  ..j.j..u...j.j..
00000b0: 6c0000006a006a14 e9630000006a006a  l...j.j..c...j.j

#4
dd if=../O/x86_64-O-3.5/vmlinux bs=1 skip=$(( 0x82d5bc )) count=$(( 0x00017c )) 2>/dev/null | xxd -a -g 8 | head -n 12 
0000000: 0400000006000000 0600000058656e00  ............Xen.
0000010: 6c696e7578000000 0400000004000000  linux...........
0000020: 0700000058656e00 322e360004000000  ....Xen.2.6.....
0000030: 0800000005000000 58656e0078656e2d  ........Xen.xen-
0000040: 332e300004000000 0800000003000000  3.0.............
0000050: 58656e0000000080 ffffffff04000000  Xen.............
0000060: 0800000001000000 58656e0010f2c681  ........Xen.....
0000070: ffffffff04000000 0800000002000000  ................
0000080: 58656e0000100081 ffffffff04000000  Xen.............
0000090: 2a0000000a000000 58656e0021777269  *.......Xen.!wri
00000a0: 7461626c655f7061 67655f7461626c65  table_page_table
00000b0: 737c7061655f7067 6469725f61626f76  s|pae_pgdir_abov


Dump of the place where phys_base is stored in the .data section:
(5555555555555555 should have been at offset 0x14020, but in fact it is
stored at 0x14028, as shown in the output of the guest below.)
*
0014000: 7f000000c681ffff ffff000000000000  ................
0014010: 0000000000000000 0000000000000000  ................
0014020: 5555555555555555 4444444444444444  UUUUUUUUDDDDDDDD
0014030: 3333333333333333 2222222222222222  33333333""""""""
0014040: 1111111111111111 9090909090909090  ................
0014050: 0000000000000000 9999999999999999  ................
0014060: 8888888888888888 7777777777777777  ........wwwwwwww
0014070: 0000000000000000 0000000000000000  ................
0014080: 7b599781ffffffff 82599781ffffffff  {Y.......Y......
0014090: 0000000000000000 0000000000000000  ................
*


That's what I get in the guest:

...
[   29.590290] Starting new kernel
I'm in purgatory
early console in decompress_kernel

Decompressing Linux... Parsing ELF...
output 0000000001000000 phdrs 000000000210e040 ehdr.e_phoff 0000000000000040 ehdr.e_phnum 0000000000000005
i 0000000000000000 p 0000000001200000 4c8b0d2940c1004c8b152a40c1004c8b1d2b40c1004c8b253440c1004c8b2d3540c1004c8b353640c1004c8b3d3740c100488d2dc8ffffff4881ed0000000148
i 0000000000000001 p 0000000001e00000 00000000000000008044c181ffffffff60acc181ffffffff00000000000000000000000001000010ffffffffffffffffc0d70481ffffffff0000000000000000
i 0000000000000001 &p[j] 0000000001e14028 5555555555555555444444444444444433333333333333332222222222222222111111111111111190909090909090900000000000000000999999999999999988888888888888887777777777777777
i 0000000000000002 p 0000000002000000 00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
i 0000000000000003 p 000000000206f000 00000000000000006a006a00e9170100006a006a01e90e0100006a006a02e9050100006a006a03e9fc0000006a006a04e9f30000006a006a05e9ea0000006a00
i 0000000000000004 p 000000000182d5bc 6631d2e9c9fdfdff04ec300006ec300006ec300058656e006c696e7578ec300004ec300004ec300007ec300058656e00322e360004ec30000830ec3005ec3000

i 0000000000000000 dest 0000000001000000 phdr->p_paddr 0000000001000000 phdr->p_offset 0000000000200000 phdr->p_filesz 0000000000a3b000
i 0000000000000000 dest 0000000001000000 output + phdr->p_offset 0000000001200000 phdr->p_filesz 0000000000a3b000
overlap 000000000083b000
i 0000000000000001 dest 0000000001c00000 phdr->p_paddr 0000000001c00000 phdr->p_offset 0000000000e00000 phdr->p_filesz 000000000005b0e8
i 0000000000000001 dest 0000000001c00000 output + phdr->p_offset 0000000001e00000 phdr->p_filesz 000000000005b0e8
i 0000000000000002 dest 0000000001c5c000 phdr->p_paddr 0000000001c5c000 phdr->p_offset 0000000001000000 phdr->p_filesz 0000000000012c40
i 0000000000000002 dest 0000000001c5c000 output + phdr->p_offset 0000000002000000 phdr->p_filesz 0000000000012c40
i 0000000000000003 dest 0000000001c6f000 phdr->p_paddr 0000000001c6f000 phdr->p_offset 000000000106f000 phdr->p_filesz 0000000000087000
i 0000000000000003 dest 0000000001c6f000 output + phdr->p_offset 000000000206f000 phdr->p_filesz 0000000000087000
done.
Booting the kernel.
<crash>


xenctx shows this:
rip: 0000000001000136
flags: 00010086 rf s nz p
rsp: 000000000211a040
rax: 9090909092515090   rcx: 00000000000003d5   rdx: 0000000001000000
rbx: 0000000001cac000   rsi: 0000000000003000   rdi: 0000000001c13000
rbp: 0000000000000000    r8: 0000000001c13000    r9: 4444444444444444
r10: 3333333333333333   r11: 2222222222222222   r12: 9090909090909090
r13: 0000000000000000   r14: 9999999999999999   r15: 8888888888888888
 cs: 0010        ss: 0000        ds: 0000        es: 0000
 fs: 0000 @ 0000000000000000
 gs: 0000 @ 0000000000000000/0000000000000000

cr0: 80000011
cr2: ffffffffff600400
cr3: 0211b000
cr4: 000000a0

dr0: 00000000
dr1: 00000000
dr2: 00000000
dr3: 00000000
dr6: ffff0ff0
dr7: 00000400
Code (instr addr 01000136)
a0 00 00 00 0f 22 e0 48 c7 c0 00 c0 c0 01 48 03 05 1a 3f c1 00 <0f> 22 d8 48 c7 c0 42 01 00 81 ff


The debug patch for 3.5-rc5:

diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 7116dcb..14e77cf 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -273,6 +273,36 @@ static void error(char *x)
 		asm("hlt");
 }
 
+static void putbyte(unsigned char val)
+{
+	static const char digits[] = "0123456789abcdef";
+	char str[(sizeof(unsigned char) * 2) + 1];
+	int i = sizeof(str), c = sizeof(unsigned char) * 2;
+	str[--i] = 0;
+	while(c--) {
+		str[--i] = digits[val & 0xf];
+		val >>=4;
+	}
+	putstr(str);
+}
+static void __putval(char *s, unsigned long val)
+{
+	static const char digits[] = "0123456789abcdef";
+	char str[(sizeof(unsigned long) * 2) + 3];
+	int i = sizeof(str), c = sizeof(unsigned long) * 2;
+	str[--i] = 0;
+	str[--i] = ' ';
+	while(c--) {
+		str[--i] = digits[val & 0xf];
+		val >>=4;
+	}
+	str[--i] = ' ';
+
+	putstr(s);
+	putstr(str);
+}
+#define putval(x) __putval(#x, (unsigned long)(x))
+
 static void parse_elf(void *output)
 {
 #ifdef CONFIG_X86_64
@@ -284,6 +314,9 @@ static void parse_elf(void *output)
 #endif
 	void *dest;
 	int i;
+	unsigned long j, c;
+	unsigned char *p;
+	signed long overlap;
 
 	memcpy(&ehdr, output, sizeof(ehdr));
 	if (ehdr.e_ident[EI_MAG0] != ELFMAG0 ||
@@ -303,6 +336,33 @@ static void parse_elf(void *output)
 
 	memcpy(phdrs, output + ehdr.e_phoff, sizeof(*phdrs) * ehdr.e_phnum);
 
+	putstr("\n");
+	putval(output);
+	putval(phdrs);
+	putval(ehdr.e_phoff);
+	putval(ehdr.e_phnum);
+	putstr("\n");
+	for (i = 0; i < ehdr.e_phnum; i++) {
+		phdr = &phdrs[i];
+		p = output + phdr->p_offset;
+		putval(i);
+		putval(p);
+		for (j = 0; j < 64; j++)
+			putbyte(p[j]);
+		putstr("\n");
+		if (i == 1)
+			for (j = 0; j < phdr->p_filesz; j++) {
+				if (p[j + 0] == 0x55 && p[j + 1] == 0x55 && p[j + 2] == 0x55 && p[j + 3] == 0x55 && p[j + 4] == 0x55 && p[j + 5] == 0x55 && p[j + 6] == 0x55 && p[j + 7] == 0x55) {
+					putval(i);
+					putval(&p[j]);
+					for (c = 0; c < 10 * sizeof(unsigned long); c++)
+						putbyte(p[j +c]);
+					putstr("\n");
+					break;
+				}
+			}
+	}
+	putstr("\n");
 	for (i = 0; i < ehdr.e_phnum; i++) {
 		phdr = &phdrs[i];
 
@@ -314,6 +374,22 @@ static void parse_elf(void *output)
 #else
 			dest = (void *)(phdr->p_paddr);
 #endif
+			putval(i);
+			putval(dest);
+			putval(phdr->p_paddr);
+			putval(phdr->p_offset);
+			putval(phdr->p_filesz);
+			putstr("\n");
+			putval(i);
+			putval(dest);
+			putval(output + phdr->p_offset);
+			putval(phdr->p_filesz);
+			putstr("\n");
+			overlap = ((long)dest + (long)phdr->p_filesz) - ((long)output + (long)phdr->p_offset);
+			if (overlap > 0) {
+				putval(overlap);
+				putstr("\n");
+			}
 			memcpy(dest,
 			       output + phdr->p_offset,
 			       phdr->p_filesz);
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 94bf9cc..42f1836 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -69,6 +69,15 @@ startup_64:
 	/* Compute the delta between the address I am compiled to run at and the
 	 * address I am actually running at.
 	 */
+#if 1
+	movq	phys_base_minus3(%rip),%r9
+	movq	phys_base_minus2(%rip),%r10
+	movq	phys_base_minus1(%rip),%r11
+	movq	phys_base(%rip),%r12
+	movq	phys_base_plus1(%rip),%r13
+	movq	phys_base_plus2(%rip),%r14
+	movq	phys_base_plus3(%rip),%r15
+#endif
 	leaq	_text(%rip), %rbp
 	subq	$_text - __START_KERNEL_map, %rbp
 
@@ -166,6 +175,9 @@ ENTRY(secondary_startup_64)
 	/* Setup early boot stage 4 level pagetables. */
 	movq	$(init_level4_pgt - __START_KERNEL_map), %rax
 	addq	phys_base(%rip), %rax
+#if 0
+	ud2a
+#endif
 	movq	%rax, %cr3
 
 	/* Ensure I am executing from virtual addresses */
@@ -439,10 +451,27 @@ early_gdt_descr:
 	.word	GDT_ENTRIES*8-1
 early_gdt_descr_base:
 	.quad	INIT_PER_CPU_VAR(gdt_page)
+	.align 32
+phys_base_minus5:
+	.quad	0x5555555555555555
+phys_base_minus4:
+	.quad	0x4444444444444444
+phys_base_minus3:
+	.quad	0x3333333333333333
+phys_base_minus2:
+	.quad	0x2222222222222222
+phys_base_minus1:
+	.quad	0x1111111111111111
 
 ENTRY(phys_base)
 	/* This must match the first entry in level2_kernel_pgt */
 	.quad   0x0000000000000000
+phys_base_plus1:
+	.quad	0x9999999999999999
+phys_base_plus2:
+	.quad	0x8888888888888888
+phys_base_plus3:
+	.quad	0x7777777777777777
 
 #include "../../x86/xen/xen-head.S"
 	



More information about the kexec mailing list