[PATCH v4 4/4] x86: Pass memory range via E820 for kdump

WANG Chao chaowang at redhat.com
Fri Mar 28 00:52:54 EDT 2014


On 03/27/14 at 06:50pm, Vivek Goyal wrote:
> On Wed, Mar 19, 2014 at 04:04:01PM +0800, WANG Chao wrote:
> > command line size is restricted by kernel, sometimes memmap=exactmap has
> > too many memory ranges to pass to cmdline. A better approach, to pass the
> > memory ranges for crash kernel to boot into, is filling the memory
> > ranges into E820.
> > 
> > boot_params only got 128 slots for E820 map to fit in, when the number of
> > memory map exceeds 128, use setup_data to pass the rest as extended E820
> > memory map.
> > 
> > kexec boot could also benefit from setup_data in case E820 memory map
> > exceeds 128.
> > 
> > Now this new approach becomes default instead of memmap=exactmap.
> > saved_max_pfn users can specify --pass-memmap-cmdline to use the
> > exactmap approach.
> 
> I think it is worth to also mention that kaslr enabled kernel does not
> work with memmap=exactmap.

Sure. Will do.

> 
> This patch in general looks good. Two minor nits below.
> 
> > 
> > Signed-off-by: WANG Chao <chaowang at redhat.com>
> > Tested-by: Linn Crosetto <linn at hp.com>
> > Reviewed-by: Linn Crosetto <linn at hp.com>
> > ---
> >  kexec/arch/i386/crashdump-x86.c   |  25 +++---
> >  kexec/arch/i386/crashdump-x86.h   |   1 +
> >  kexec/arch/i386/x86-linux-setup.c | 171 +++++++++++++++++++++++++-------------
> >  3 files changed, 130 insertions(+), 67 deletions(-)
> > 
> > diff --git a/kexec/arch/i386/crashdump-x86.c b/kexec/arch/i386/crashdump-x86.c
> > index c55a6b1..cb19e7d 100644
> > --- a/kexec/arch/i386/crashdump-x86.c
> > +++ b/kexec/arch/i386/crashdump-x86.c
> > @@ -182,6 +182,8 @@ static int exclude_region(int *nr_ranges, uint64_t start, uint64_t end);
> >  struct memory_range crash_memory_range[CRASH_MAX_MEMORY_RANGES];
> >  int crash_memory_ranges;
> >  
> > +int pass_memmap_cmdline;
> > +
> >  /* Memory region reserved for storing panic kernel and other data. */
> >  #define CRASH_RESERVED_MEM_NR	8
> >  static struct memory_range crash_reserved_mem[CRASH_RESERVED_MEM_NR];
> > @@ -947,20 +949,23 @@ int load_crashdump_segments(struct kexec_info *info, char* mod_cmdline,
> >  	dbgprintf("Created elf header segment at 0x%lx\n", elfcorehdr);
> >  	if (delete_memmap(crash_memory_range, &crash_memory_ranges, elfcorehdr, memsz) < 0)
> >  		return -1;
> > -	cmdline_add_memmap(mod_cmdline, crash_memory_range);
> >  	if (!bzImage_support_efi_boot)
> >  		cmdline_add_efi(mod_cmdline);
> >  	cmdline_add_elfcorehdr(mod_cmdline, elfcorehdr);
> >  
> > -	/* Inform second kernel about the presence of ACPI tables. */
> > -	for (i = 0; i < CRASH_MAX_MEMORY_RANGES; i++) {
> > -		unsigned long start, end;
> > -		if ( !( mem_range[i].type == RANGE_ACPI
> > -			|| mem_range[i].type == RANGE_ACPI_NVS) )
> > -			continue;
> > -		start = mem_range[i].start;
> > -		end = mem_range[i].end;
> > -		cmdline_add_memmap_acpi(mod_cmdline, start, end);
> > +	pass_memmap_cmdline = arch_options.pass_memmap_cmdline;
> > +	if (pass_memmap_cmdline) {
> > +		cmdline_add_memmap(mod_cmdline, crash_memory_range);
> > +		/* Inform second kernel about the presence of ACPI tables. */
> > +		for (i = 0; i < CRASH_MAX_MEMORY_RANGES; i++) {
> > +			unsigned long start, end;
> > +			if ( !( mem_range[i].type == RANGE_ACPI
> > +						|| mem_range[i].type == RANGE_ACPI_NVS) )
> > +				continue;
> > +			start = mem_range[i].start;
> > +			end = mem_range[i].end;
> > +			cmdline_add_memmap_acpi(mod_cmdline, start, end);
> > +		}
> >  	}
> >  
> >  	return 0;
> > diff --git a/kexec/arch/i386/crashdump-x86.h b/kexec/arch/i386/crashdump-x86.h
> > index 633ee0e..e68b626 100644
> > --- a/kexec/arch/i386/crashdump-x86.h
> > +++ b/kexec/arch/i386/crashdump-x86.h
> > @@ -30,5 +30,6 @@ int load_crashdump_segments(struct kexec_info *info, char *mod_cmdline,
> >  
> >  extern struct memory_range crash_memory_range[CRASH_MAX_MEMORY_RANGES];
> >  extern int crash_memory_ranges;
> > +extern int pass_memmap_cmdline;
> >  
> >  #endif /* CRASHDUMP_X86_H */
> > diff --git a/kexec/arch/i386/x86-linux-setup.c b/kexec/arch/i386/x86-linux-setup.c
> > index 5884f4d..e8865e1 100644
> > --- a/kexec/arch/i386/x86-linux-setup.c
> > +++ b/kexec/arch/i386/x86-linux-setup.c
> > @@ -35,8 +35,7 @@
> >  #include "kexec-x86.h"
> >  #include "x86-linux-setup.h"
> >  #include "../../kexec/kexec-syscall.h"
> > -
> > -#define SETUP_EFI	4
> > +#include "crashdump-x86.h"
> >  
> >  void init_linux_parameters(struct x86_linux_param_header *real_mode)
> >  {
> > @@ -502,6 +501,11 @@ struct efi_setup_data {
> >  struct setup_data {
> >  	uint64_t next;
> >  	uint32_t type;
> > +#define SETUP_NONE	0
> > +#define SETUP_E820_EXT	1
> > +#define SETUP_DTB	2
> > +#define SETUP_PCI	3
> > +#define SETUP_EFI	4
> >  	uint32_t len;
> >  	uint8_t data[0];
> >  } __attribute__((packed));
> > @@ -602,6 +606,17 @@ struct efi_info {
> >  	uint32_t efi_memmap_hi;
> >  };
> >  
> > +static void add_setup_data(struct kexec_info *info,
> > +			   struct x86_linux_param_header *real_mode,
> > +			   struct setup_data *sd)
> > +{
> 
> What is setup_data? A little comment above function will make it easy
> to read. Is it that list of elements which contains extra memory map
> entries?

Not exactly. All extra memory map entries (the SETUP_E820_EXT type) are
packed into a single setup_data structure. setup_data structures of
different types are then linked together in a list.

setup_data can be used to pass extra data for boot, for example EFI
data (SETUP_EFI), an extended E820 map (SETUP_E820_EXT), SETUP_PCI and
SETUP_DTB. These types are defined alongside struct setup_data.

It's officially documented in Documentation/x86/boot.txt:

Field name:	setup_data
Type:		write (special)
Offset/size:	0x250/8
Protocol:	2.09+

  The 64-bit physical pointer to NULL terminated single linked list of
  struct setup_data. This is used to define a more extensible boot
  parameters passing mechanism. The definition of struct setup_data
  is as follow:

  struct setup_data {
	  u64 next;
	  u32 type;
	  u32 len;
	  u8  data[0];
  };

  Where, the next is a 64-bit physical pointer to the next node of
  linked list, the next field of the last node is 0; the type is used
  to identify the contents of data; the len is the length of data
  field; the data holds the real payload.

  This list may be modified at a number of points during the bootup
  process. Therefore, when modifying this list one should always make
  sure to consider the case where the linked list already contains
  entries.

I think I would comment add_setup_data as follows:

/*
 * Add another instance to the singly linked list of struct setup_data.
 * Please refer to kernel Documentation/x86/boot.txt for more details
 * about setup_data structure.
 */

> 
> > +	int sdsize = sizeof(struct setup_data) + sd->len;
> > +
> > +	sd->next = real_mode->setup_data;
> > +	real_mode->setup_data = add_buffer(info, sd, sdsize, sdsize, getpagesize(),
> > +			    0x100000, ULONG_MAX, INT_MAX);
> > +}
> > +
> >  /*
> >   * setup_efi_data will collect below data and pass them to 2nd kernel.
> >   * 1) SMBIOS, fw_vendor, runtime, config_table, they are passed via x86
> > @@ -611,11 +626,11 @@ struct efi_info {
> >  static int setup_efi_data(struct kexec_info *info,
> >  			  struct x86_linux_param_header *real_mode)
> >  {
> > -	int64_t setup_data_paddr, memmap_paddr;
> > +	int64_t memmap_paddr;
> >  	struct setup_data *sd;
> >  	struct efi_setup_data *esd;
> >  	struct efi_mem_descriptor *maps;
> > -	int nr_maps, size, sdsize, ret = 0;
> > +	int nr_maps, size, ret = 0;
> >  	struct efi_info *ei = (struct efi_info *)real_mode->efi_info;
> >  
> >  	ret = access("/sys/firmware/efi/systab", F_OK);
> > @@ -648,10 +663,8 @@ static int setup_efi_data(struct kexec_info *info,
> >  	sd->len = sizeof(*esd);
> >  	memcpy(sd->data, esd, sizeof(*esd));
> >  	free(esd);
> > -	sdsize = sd->len + sizeof(struct setup_data);
> > -	setup_data_paddr = add_buffer(info, sd, sdsize, sdsize, getpagesize(),
> > -					0x100000, ULONG_MAX, INT_MAX);
> > -	real_mode->setup_data = setup_data_paddr;
> > +
> > +	add_setup_data(info, real_mode, sd);
> >  
> >  	size = nr_maps * sizeof(struct efi_mem_descriptor);
> >  	memmap_paddr = add_buffer(info, maps, size, size, getpagesize(),
> > @@ -669,6 +682,98 @@ out:
> >  	return ret;
> >  }
> >  
> > +static void add_e820_map_from_mr(struct x86_linux_param_header *real_mode,
> > +			struct e820entry *e820, struct memory_range *range, int nr_range)
> > +{
> > +	int i;
> > +
> > +	for (i = 0; i < nr_range; i++) {
> > +		e820[i].addr = range[i].start;
> > +		e820[i].size = range[i].end - range[i].start;
> > +		switch (range[i].type) {
> > +			case RANGE_RAM:
> > +				e820[i].type = E820_RAM;
> > +				break;
> > +			case RANGE_ACPI:
> > +				e820[i].type = E820_ACPI;
> > +				break;
> > +			case RANGE_ACPI_NVS:
> > +				e820[i].type = E820_NVS;
> > +				break;
> > +			default:
> > +			case RANGE_RESERVED:
> > +				e820[i].type = E820_RESERVED;
> > +				break;
> > +		}
> > +		dbgprintf("%016lx-%016lx (%d)\n",
> > +				e820[i].addr,
> > +				e820[i].addr + e820[i].size - 1,
> > +				e820[i].type);
> > +
> > +		if (range[i].type != RANGE_RAM)
> > +			continue;
> > +		if ((range[i].start <= 0x100000) && range[i].end > 0x100000) {
> > +			unsigned long long mem_k = (range[i].end >> 10) - (0x100000 >> 10);
> > +			real_mode->ext_mem_k = mem_k;
> > +			real_mode->alt_mem_k = mem_k;
> > +			if (mem_k > 0xfc00) {
> > +				real_mode->ext_mem_k = 0xfc00; /* 64M */
> > +			}
> > +			if (mem_k > 0xffffffff) {
> > +				real_mode->alt_mem_k = 0xffffffff;
> > +			}
> > +		}
> > +	}
> > +}
> > +
> > +static void setup_e820_ext(struct kexec_info *info, struct x86_linux_param_header *real_mode,
> > +			   struct memory_range *range, int nr_range)
> > +{
> > +	struct setup_data *sd;
> > +	struct e820entry *e820;
> > +	int nr_range_ext;
> > +
> > +	nr_range_ext = nr_range - E820MAX;
> > +	sd = xmalloc(sizeof(struct setup_data) + nr_range_ext * sizeof(struct e820entry));
> > +	sd->next = 0;
> > +	sd->len = nr_range_ext * sizeof(struct e820entry);
> > +	sd->type = SETUP_E820_EXT;
> > +
> > +	e820 = (struct e820entry *) sd->data;
> > +	dbgprintf("Extended E820 via setup_data:\n");
> > +	add_e820_map_from_mr(real_mode, e820, range + E820MAX, nr_range_ext);
> > +	add_setup_data(info, real_mode, sd);
> > +}
> > +
> > +static void setup_e820(struct kexec_info *info, struct x86_linux_param_header *real_mode)
> > +{
> > +	struct memory_range *range;
> > +	int nr_range, nr_range_saved;
> > +
> > +
> > +	if (info->kexec_flags & KEXEC_ON_CRASH && !pass_memmap_cmdline) {
> > +		range = crash_memory_range;
> > +		nr_range = crash_memory_ranges;
> 
> You know what, it might be a good idea to store the pointer to
> crash_memory_range in kexec_info too, (like memory_range and
> memory_ranges).

Will do.

Thanks for your review.

WANG Chao
> 
> > +	} else {
> > +		range = info->memory_range;
> > +		nr_range = info->memory_ranges;
> > +	}
> > +
> > +	nr_range_saved = nr_range;
> > +	if (nr_range > E820MAX) {
> > +		nr_range = E820MAX;
> > +	}
> > +
> > +	real_mode->e820_map_nr = nr_range;
> > +	dbgprintf("E820 memmap:\n");
> > +	add_e820_map_from_mr(real_mode, real_mode->e820_map, range, nr_range);
> > +
> > +	if (nr_range_saved > E820MAX) {
> > +		dbgprintf("extra E820 memmap are passed via setup_data\n");
> > +		setup_e820_ext(info, real_mode, range, nr_range_saved);
> > +	}
> > +}
> > +
> >  static int
> >  get_efi_mem_desc_version(struct x86_linux_param_header *real_mode)
> >  {
> > @@ -702,10 +807,6 @@ static void setup_efi_info(struct kexec_info *info,
> >  void setup_linux_system_parameters(struct kexec_info *info,
> >  				   struct x86_linux_param_header *real_mode)
> >  {
> > -	/* Fill in information the BIOS would usually provide */
> > -	struct memory_range *range;
> > -	int i, ranges;
> > -
> >  	/* get subarch from running kernel */
> >  	setup_subarch(real_mode);
> >  	if (bzImage_support_efi_boot)
> > @@ -746,51 +847,7 @@ void setup_linux_system_parameters(struct kexec_info *info,
> >  	/* another safe default */
> >  	real_mode->aux_device_info = 0;
> >  
> > -	range = info->memory_range;
> > -	ranges = info->memory_ranges;
> > -	if (ranges > E820MAX) {
> > -		if (!(info->kexec_flags & KEXEC_ON_CRASH))
> > -			/*
> > -			 * this e820 not used for capture kernel, see
> > -			 * do_bzImage_load()
> > -			 */
> > -			fprintf(stderr,
> > -				"Too many memory ranges, truncating...\n");
> > -		ranges = E820MAX;
> > -	}
> > -	real_mode->e820_map_nr = ranges;
> > -	for(i = 0; i < ranges; i++) {
> > -		real_mode->e820_map[i].addr = range[i].start;
> > -		real_mode->e820_map[i].size = range[i].end - range[i].start;
> > -		switch (range[i].type) {
> > -		case RANGE_RAM:
> > -			real_mode->e820_map[i].type = E820_RAM; 
> > -			break;
> > -		case RANGE_ACPI:
> > -			real_mode->e820_map[i].type = E820_ACPI; 
> > -			break;
> > -		case RANGE_ACPI_NVS:
> > -			real_mode->e820_map[i].type = E820_NVS;
> > -			break;
> > -		default:
> > -		case RANGE_RESERVED:
> > -			real_mode->e820_map[i].type = E820_RESERVED; 
> > -			break;
> > -		}
> > -		if (range[i].type != RANGE_RAM)
> > -			continue;
> > -		if ((range[i].start <= 0x100000) && range[i].end > 0x100000) {
> > -			unsigned long long mem_k = (range[i].end >> 10) - (0x100000 >> 10);
> > -			real_mode->ext_mem_k = mem_k;
> > -			real_mode->alt_mem_k = mem_k;
> > -			if (mem_k > 0xfc00) {
> > -				real_mode->ext_mem_k = 0xfc00; /* 64M */
> > -			}
> > -			if (mem_k > 0xffffffff) {
> > -				real_mode->alt_mem_k = 0xffffffff;
> > -			}
> > -		}
> > -	}
> > +	setup_e820(info, real_mode);
> >  
> >  	/* fill the EDD information */
> >  	setup_edd_info(real_mode);
> > -- 
> > 1.8.5.3
> > 
> > 
> > _______________________________________________
> > kexec mailing list
> > kexec at lists.infradead.org
> > http://lists.infradead.org/mailman/listinfo/kexec



More information about the kexec mailing list