[PATCH 1/3] makedumpfile: Xen4: determine Xen version

Petr Tesarik ptesarik at suse.cz
Tue Jul 24 16:45:57 EDT 2012


Hi Norbert et al,

please find my comments below.

Dne Pá 22. června 2012 17:46:39 Norbert Trapp napsal(a):
> The core file contains information about the xen version. The
> Xen version is determined and different functions will be
> called for different Xen versions. Additionally a Xen3 problem
> with the page_info _domain offset for x86_64 gets repaired.
> 
> In Xen3 xen/common/kexec.c the page_info _domain offset was set with:
>     VMCOREINFO_OFFSET_ALIAS(page_info, u, _domain);
> which is wrong for x86_in because the _domain is in union v.
> (xen/include/asm-x86/mm.h)
> For Xen3 in makedumpfile.c the offset is simply modified to 24
> for x86_64.
> 
> For Xen4 we asked Xen development to use:
>     #ifdef __ia64__
> 	VMCOREINFO_OFFSET_SUB(page_info, u.inuse, _domain);
>     #else
> 	VMCOREINFO_OFFSET_SUB(page_info, v.inuse, _domain);
>     #endif
> 
> Signed-off-by: Norbert Trapp <norbert.trapp at ts.fujitsu.com>
> ---
>  elf_info.c     |    2 +
>  makedumpfile.c |   90
> +++++++++++++++++++++++++++++++++++++++++++------------- makedumpfile.h | 
>   3 ++
>  3 files changed, 74 insertions(+), 21 deletions(-)
> 
> diff --git a/elf_info.c b/elf_info.c
> index cf476a3..e723720 100644
> --- a/elf_info.c
> +++ b/elf_info.c
> @@ -159,6 +159,7 @@ check_elf_format(int fd, char *filename, int *phnum,
> unsigned int *num_load) if ((ehdr64.e_ident[EI_CLASS] == ELFCLASS64)
>  	    && (ehdr32.e_ident[EI_CLASS] != ELFCLASS32)) {
>  		(*phnum) = ehdr64.e_phnum;
> +		info->elf_machine = ehdr64.e_machine;
>  		for (i = 0; i < ehdr64.e_phnum; i++) {
>  			if (!get_elf64_phdr(fd, filename, i, &load64)) {
>  				ERRMSG("Can't find Phdr %d.\n", i);
> @@ -172,6 +173,7 @@ check_elf_format(int fd, char *filename, int *phnum,
> unsigned int *num_load) } else if ((ehdr64.e_ident[EI_CLASS] !=
> ELFCLASS64)
>  	    && (ehdr32.e_ident[EI_CLASS] == ELFCLASS32)) {
>  		(*phnum) = ehdr32.e_phnum;
> +		info->elf_machine = ehdr32.e_machine;
>  		for (i = 0; i < ehdr32.e_phnum; i++) {
>  			if (!get_elf32_phdr(fd, filename, i, &load32)) {
>  				ERRMSG("Can't find Phdr %d.\n", i);
> diff --git a/makedumpfile.c b/makedumpfile.c
> index d024e95..7398f17 100644
> --- a/makedumpfile.c
> +++ b/makedumpfile.c
> @@ -5300,10 +5300,7 @@ get_structure_info_xen(void)
>  {
>  	SIZE_INIT(page_info, "page_info");
>  	OFFSET_INIT(page_info.count_info, "page_info", "count_info");
> -	/*
> -	 * _domain is the first member of union u
> -	 */
> -	OFFSET_INIT(page_info._domain, "page_info", "u");
> +	OFFSET_IN_UNION_INIT(page_info._domain, "page_info", "_domain");

This is clever, because it also works for ia64, which has kept the _domain 
field in union "u". Good!

> 
>  	SIZE_INIT(domain, "domain");
>  	OFFSET_INIT(domain.domain_id, "domain", "domain_id");
> @@ -5313,6 +5310,41 @@ get_structure_info_xen(void)
>  }
> 
>  int
> +get_xen_version(void)
> +{
> +	unsigned long   xen_major_version;
> +	unsigned long   xen_minor_version;
> +	off_t           offset_xen_crash_info;
> +	unsigned long   size_xen_crash_info;
> +	const off_t     failed = (off_t)-1;
> +
> +	get_xen_crash_info(&offset_xen_crash_info, &size_xen_crash_info);
> +	if (info->xen_major_version && info->xen_minor_version)
> +		return TRUE;
> +
> +	if (lseek(info->fd_memory, offset_xen_crash_info, SEEK_SET) == failed) {
> +		ERRMSG("Can't seek the dump memory(%s). %s\n",
> +		    info->name_memory, strerror(errno));
> +		return FALSE;
> +	}
> +	if (read(info->fd_memory, &xen_major_version, sizeof(xen_major_version))
> +	    != sizeof(xen_major_version)) {
> +		ERRMSG("Can't read the dump memory(%s). %s\n",
> +		    info->name_memory, strerror(errno));
> +		return FALSE;
> +	}
> +	if (read(info->fd_memory, &xen_minor_version, sizeof(xen_minor_version))
> +	    != sizeof(xen_major_version)) {
> +		ERRMSG("Can't read the dump memory(%s). %s\n",
> +		    info->name_memory, strerror(errno));
> +		return FALSE;
> +	}
> +	info->xen_major_version = xen_major_version;
> +	info->xen_minor_version = xen_minor_version;
> +	return TRUE;
> +}

My patch does this differently. I somehow feel that it might be better to 
describe the definition of crash_xen_info_t with a struct, because it helps 
other people understand the "magic" behind the crash notes. On the other hand, 
I ended up with typecasting a void*, so I don't have a strong opinion. I'll 
try rewriting my patch with a union and see if that looks nicer.

No matter how we do it, we must check the Xen version from different places, 
so it should be read and stored somewhere.

> +
> +int
>  get_xen_phys_start(void)
>  {
>  	off_t offset, offset_xen_crash_info;
> @@ -5350,23 +5382,25 @@ get_xen_info(void)
>  	unsigned int domain_id;
>  	int num_domain;
> 
> -	if (SYMBOL(alloc_bitmap) == NOT_FOUND_SYMBOL) {
> -		ERRMSG("Can't get the symbol of alloc_bitmap.\n");
> -		return FALSE;
> -	}
> -	if (!readmem(VADDR_XEN, SYMBOL(alloc_bitmap), &info->alloc_bitmap,
> -	      sizeof(info->alloc_bitmap))) {
> -		ERRMSG("Can't get the value of alloc_bitmap.\n");
> -		return FALSE;
> -	}
> -	if (SYMBOL(max_page) == NOT_FOUND_SYMBOL) {
> -		ERRMSG("Can't get the symbol of max_page.\n");
> -		return FALSE;
> -	}
> -	if (!readmem(VADDR_XEN, SYMBOL(max_page), &info->max_page,
> -	    sizeof(info->max_page))) {
> -		ERRMSG("Can't get the value of max_page.\n");
> -		return FALSE;
> +	if (info->xen_major_version <= 3) {
> +		if (SYMBOL(alloc_bitmap) == NOT_FOUND_SYMBOL) {
> +			ERRMSG("Can't get the symbol of alloc_bitmap.\n");
> +			return FALSE;
> +		}
> +		if (!readmem(VADDR_XEN, SYMBOL(alloc_bitmap), &info->alloc_bitmap,
> +		      sizeof(info->alloc_bitmap))) {
> +			ERRMSG("Can't get the value of alloc_bitmap.\n");
> +			return FALSE;
> +		}
> +		if (SYMBOL(max_page) == NOT_FOUND_SYMBOL) {
> +			ERRMSG("Can't get the symbol of max_page.\n");
> +			return FALSE;
> +		}
> +		if (!readmem(VADDR_XEN, SYMBOL(max_page), &info->max_page,
> +		    sizeof(info->max_page))) {
> +			ERRMSG("Can't get the value of max_page.\n");
> +			return FALSE;
> +		}

My patch does this differently. Instead of checking the Xen version number, 
I'm checking the existence of the "alloc_bitmap" symbol. Checking the version 
number looks cleaner, because we know what to expect.

>  	}
> 
>  	/*
> @@ -5503,6 +5537,8 @@ show_data_xen(void)
>  	MSG("OFFSET(domain.next_in_list): %ld\n", OFFSET(domain.next_in_list));
> 
>  	MSG("\n");
> +	MSG("xen_major_version: %lx\n", info->xen_major_version);
> +	MSG("xen_minor_version: %lx\n", info->xen_minor_version);

I didn't include this debugging info in my patch, but it may be very useful if 
we get the Xen version wrong for any reason.

>  	MSG("xen_phys_start: %lx\n", info->xen_phys_start);
>  	MSG("frame_table_vaddr: %lx\n", info->frame_table_vaddr);
>  	MSG("xen_heap_start: %lx\n", info->xen_heap_start);
> @@ -5640,6 +5676,12 @@ read_vmcoreinfo_xen(void)
> 
>  	READ_MEMBER_OFFSET("page_info.count_info", page_info.count_info);
>  	READ_MEMBER_OFFSET("page_info._domain", page_info._domain);
> +	if (info->elf_machine == EM_X86_64) {
> +		if (info->xen_major_version < 4) {
> +			MSG("modified offset_table.page_info._domain from %ld to 24\n",
> offset_table.page_info._domain); +			
offset_table.page_info._domain = 24;
> +		}
> +	}

This doesn't correctly cover the range of Xen versions where this was broken:

1. page_info members were re-arranged by changeset 162cdb596b9a, i.e. after 
xen-3.3.0 was branched and before xen-3.4.0 was released.
2. For the xen-3.4 branch, this bug was never fixed.
3. The bug was spotted during xen-4.1 development (changeset cb756381087c), so 
all xen-4.1 releases are fixed.
4. This changeset was also backported to xen-4.0 branch as changeset 
3b90a5353f20, which first went into xen-4.0.2.

To sum it up:
1. all Xen versions up to and including xen-3.3 put the right number in here
2. Xen-4.1 and above is also fine
3. Xen-3.4 is always broken
4. Xen-4.0 is broken up to 4.0.2

Obviously, an OS vendor may also have backported the patch to the otherwise 
broken versions, so even the above summary may not be reliable.

In my opinion, adding a workaround here for a bug in another package is wrong:

A. Standalone users should upgrade xen.
B. Users of distro packages should report it as a bug to their vendor, and the 
vendor should backport the patch.
C. Users who need an immediate workaround should generate a vmcoreinfo file 
using makedumpfile, which will be correct after applying this patch.

>  	READ_MEMBER_OFFSET("domain.domain_id", domain.domain_id);
>  	READ_MEMBER_OFFSET("domain.next_in_list", domain.next_in_list);
> 
> @@ -5763,6 +5805,12 @@ initial_xen(void)
>  	off_t offset;
>  	unsigned long size;
> 
> +	if (!get_xen_version())
> +		return FALSE;
> +	if ((info->elf_machine != EM_X86_64) && (info->xen_major_version >= 4)) {
> +		MSG("Xen major version %lu is not supported yet for this machine.\n",
> info->xen_major_version); +		return FALSE;
> +	}

I would have expected an "#if defined
__x86_64_)" here to be consistent with all the other arch-dependent code.

Anyway, my patches seem to cover the remaining architectures, although I have 
really only tested x86_64...

Petr Tesarik
SUSE Linux

>  #if defined(__powerpc64__) || defined(__powerpc32__)
>  	MSG("\n");
>  	MSG("Xen is not supported on powerpc.\n");
> diff --git a/makedumpfile.h b/makedumpfile.h
> index 6f5489d..d8d7779 100644
> --- a/makedumpfile.h
> +++ b/makedumpfile.h
> @@ -843,6 +843,7 @@ struct DumpInfo {
>  	/*
>  	 * ELF header info:
>  	 */
> +	int			elf_machine;
>  	unsigned int		num_load_dumpfile;
>  	size_t			offset_load_dumpfile;
> 
> @@ -902,6 +903,8 @@ struct DumpInfo {
>  	/*
>  	 * for Xen extraction
>  	 */
> +	unsigned long		xen_major_version;
> +	unsigned long		xen_minor_version;
>  	unsigned long long	dom0_mapnr;  /* The number of page in domain-0.
>  					      * Different from max_mapnr.
>  					      * max_mapnr is the number of page



More information about the kexec mailing list