[PATCH v2] makedumpfile: request the kernel do page scans

Lisa Mitchell lisa.mitchell at hp.com
Wed Jan 16 07:15:29 EST 2013


On Fri, 2013-01-04 at 16:20 +0000, Cliff Wickman wrote:
> From: Cliff Wickman <cpw at sgi.com>
> 
> This version of the patch improves the consolidation of the mem_map table
> that is passed to the kernel.  See make_kernel_mmap().
> In particular, it improves handling of the seemingly duplicate pfn
> ranges generated on an older (2.6.32-based, RHEL 6) kernel.
> 
> 
> 
> I've been experimenting with asking the kernel to scan the page tables
> instead of reading all those page structures through /proc/vmcore.
> The results are rather dramatic.
> On a small, idle UV: about 4 sec. versus about 40 sec.
> On an 8TB UV the scan for unnecessary pages takes 4 minutes, vs. about
> 200 minutes through /proc/vmcore.
> 
> This patch incorporates this scheme into version 1.5.1, so that the cyclic
> processing can use the kernel scans.
> It also uses the page_is_buddy logic to speed the finding of free pages.
> It also allows makedumpfile to work as before with a kernel that does
> not provide /proc/vmcore_pfn_lists.
> 
> This patch:
>   - writes requests to new kernel file /proc/vmcore_pfn_lists
>   - makes request PL_REQUEST_MEMMAP to pass the crash kernel information about
>     the boot kernel
>   - makes requests PL_REQUEST_FREE and PL_REQUEST_EXCLUDE, asking the kernel
>     to return lists of PFNs
>   - adds page scan timing options -n, -o and -t
>   - still has a debugging option -a
> 
> This patch depends on a kernel patch.
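> 
> As a rough sketch of the user-side protocol (using the structures this
> patch adds to makedumpfile.h; error handling and the one-time
> PL_REQUEST_MEMMAP negotiation are omitted):
> 
> 	struct pfn_list_request request;
> 	struct pfn_reply reply;
> 	struct pfn_element pfn_list[MAX_PFN_LIST];
> 	int fd;
> 
> 	fd = open("/proc/vmcore_pfn_lists", O_WRONLY);
> 	memset(&request, 0, sizeof(request));
> 	memset(&reply, 0, sizeof(reply));
> 	request.request = PL_REQUEST_FREE;
> 	request.reply_ptr = (void *)&reply;	 /* kernel writes reply here */
> 	request.pfn_list_ptr = (void *)pfn_list; /* kernel fills this array */
> 	do {
> 		request.more = reply.more;  /* resume indices elided here */
> 		write(fd, &request, sizeof(request));
> 		/* reply.in_pfn_list (pfn, order) pairs now sit in pfn_list[] */
> 	} while (reply.more);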
> 
> Diffed against the released makedumpfile-1.5.1
> 
> Signed-off-by: Cliff Wickman <cpw at sgi.com>
> ---
>  dwarf_info.c   |    2 
>  makedumpfile.c |  587 ++++++++++++++++++++++++++++++++++++++++++++++++++++++---
>  makedumpfile.h |   95 +++++++++
>  print_info.c   |    5 
>  4 files changed, 665 insertions(+), 24 deletions(-)
> 
> 
> Index: makedumpfile-1.5.1.released/makedumpfile.h
> ===================================================================
> --- makedumpfile-1.5.1.released.orig/makedumpfile.h
> +++ makedumpfile-1.5.1.released/makedumpfile.h
> @@ -86,6 +86,8 @@ int get_mem_type(void);
>  #define LSEEKED_PDESC	(2)
>  #define LSEEKED_PDATA	(3)
>  
> +#define EXTRA_MEMMAPS	100
> +
>  /*
>   * Xen page flags
>   */
> @@ -418,7 +420,7 @@ do { \
>  #define KVER_MIN_SHIFT 16
>  #define KERNEL_VERSION(x,y,z) (((x) << KVER_MAJ_SHIFT) | ((y) << KVER_MIN_SHIFT) | (z))
>  #define OLDEST_VERSION		KERNEL_VERSION(2, 6, 15)/* linux-2.6.15 */
> -#define LATEST_VERSION		KERNEL_VERSION(3, 6, 7)/* linux-3.6.7 */
> +#define LATEST_VERSION		KERNEL_VERSION(3, 7, 8)/* linux-3.7.8 */
>  
>  /*
>   * vmcoreinfo in /proc/vmcore
> @@ -794,11 +796,25 @@ typedef struct {
>  } xen_crash_info_v2_t;
>  
>  struct mem_map_data {
> +	/*
> +	 * pfn_start/pfn_end are the pfn's represented by this mem_map entry.
> +	 * mem_map is the virtual address of the array of page structures
> +	 * that represent these pages.
> +	 * paddr is the physical address of that array of structures.
> +	 * ending_paddr would be paddr + (pfn_end - pfn_start) * sizeof(struct page).
> +	 * section_vaddr is the address we get from ioremap_cache().
> +	 */
>  	unsigned long long	pfn_start;
>  	unsigned long long	pfn_end;
> -	unsigned long	mem_map;
> +	unsigned long		mem_map;
> +	unsigned long long	paddr;		/* filled in by makedumpfile */
> +	long			virtual_offset;	/* filled in by kernel */
> +	unsigned long long	ending_paddr;	/* filled in by kernel */
> +	unsigned long		mapped_size;	/* filled in by kernel */
> +	void 			*section_vaddr;	/* filled in by kernel */
>  };
>  
> +
>  struct dump_bitmap {
>  	int		fd;
>  	int		no_block;
> @@ -875,6 +891,7 @@ struct DumpInfo {
>  	int		flag_rearrange;      /* flag of creating dumpfile from
>  						flattened format */
>  	int		flag_split;	     /* splitting vmcore */
> +	int		flag_use_kernel_lists;
>    	int		flag_cyclic;	     /* cyclic processing to keep memory consumption */
>  	int		flag_reassemble;     /* reassemble multiple dumpfiles into one */
>  	int		flag_refiltering;    /* refilter from kdump-compressed file */
> @@ -1384,6 +1401,80 @@ struct domain_list {
>  	unsigned int  pickled_id;
>  };
>  
> +#define PL_REQUEST_FREE		1	/* request for a list of free pages */
> +#define PL_REQUEST_EXCLUDE	2	/* request for a list of excludable
> +					   pages */
> +#define PL_REQUEST_MEMMAP	3	/* request to pass in the makedumpfile
> +					   mem_map_data table */
> +/*
> + * limit the size of the pfn list to this many pfn_element structures
> + */
> +#define MAX_PFN_LIST 10000
> +
> +/*
> + * one element in the pfn_list
> + */
> +struct pfn_element {
> +	unsigned long pfn;
> +	unsigned long order;
> +};
> +
> +/*
> + * a request for finding pfn's that can be excluded from the dump;
> + * they may be pages of particular types or free pages
> + */
> +struct pfn_list_request {
> +	int request;		/* PL_REQUEST_FREE, PL_REQUEST_EXCLUDE, or */
> +				/* PL_REQUEST_MEMMAP */
> +	int debug;
> +	unsigned long paddr;	/* mem_map address for PL_REQUEST_EXCLUDE */
> +	unsigned long pfn_start;/* pfn represented by paddr */
> +	unsigned long pgdat_paddr; /* for PL_REQUEST_FREE */
> +	unsigned long pgdat_vaddr; /* for PL_REQUEST_FREE */
> +	int node;		/* for PL_REQUEST_FREE */
> +	int exclude_bits;	/* for PL_REQUEST_EXCLUDE */
> +	int count;		/* for PL_REQUEST_EXCLUDE */
> +	void *reply_ptr;	/* address of user's pfn_reply, for reply */
> +	void *pfn_list_ptr;	/* address of user's pfn array (*pfn_list) */
> +	int map_count;		/* for PL_REQUEST_MEMMAP; elements */
> +	int map_size;		/* for PL_REQUEST_MEMMAP; bytes in table */
> +	void *map_ptr;		/* for PL_REQUEST_MEMMAP; address of table */
> +	long list_size;		/* for PL_REQUEST_MEMMAP negotiation */
> +	/* resume info: */
> +	int more;		/* 0 for done, 1 for "there's more" */
> +				/* PL_REQUEST_EXCLUDE: */
> +	int map_index;		/* slot in the mem_map array of page structs */
> +				/* PL_REQUEST_FREE: */
> +	int zone_index;		/* zone within the node's pgdat_list */
> +	int freearea_index;	/* free_area within the zone */
> +	int type_index;		/* free_list within the free_area */
> +	int list_ct;		/* page within the list */
> +};
> +
> +/*
> + * the reply from a pfn_list_request
> + * the list of pfn's itself is pointed to by pfn_list
> + */
> +struct pfn_reply {
> +	long pfn_list_elements;	/* negotiated on PL_REQUEST_MEMMAP */
> +	long in_pfn_list;	/* returned by PL_REQUEST_EXCLUDE and
> +				   PL_REQUEST_FREE */
> +	/* resume info */
> +	int more;		/* 0 == done, 1 == there is more */
> +				/* PL_REQUEST_MEMMAP: */
> +	int map_index;		/* slot in the mem_map array of page structs */
> +				/* PL_REQUEST_FREE: */
> +	int zone_index;		/* zone within the node's pgdat_list */
> +	int freearea_index;	/* free_area within the zone */
> +	int type_index;		/* free_list within the free_area */
> +	int list_ct;		/* page within the list */
> +	/* statistic counters: */
> +	unsigned long long pfn_cache;		/* PL_REQUEST_EXCLUDE */
> +	unsigned long long pfn_cache_private;	/* PL_REQUEST_EXCLUDE */
> +	unsigned long long pfn_user;		/* PL_REQUEST_EXCLUDE */
> +	unsigned long long pfn_free;		/* PL_REQUEST_FREE */
> +};
> +
>  #define PAGES_PER_MAPWORD 	(sizeof(unsigned long) * 8)
>  #define MFNS_PER_FRAME		(info->page_size / sizeof(unsigned long))
>  
> Index: makedumpfile-1.5.1.released/dwarf_info.c
> ===================================================================
> --- makedumpfile-1.5.1.released.orig/dwarf_info.c
> +++ makedumpfile-1.5.1.released/dwarf_info.c
> @@ -324,6 +324,8 @@ get_data_member_location(Dwarf_Die *die,
>  	return TRUE;
>  }
>  
> +int dwarf_formref(Dwarf_Attribute *, Dwarf_Off *);
> +
>  static int
>  get_die_type(Dwarf_Die *die, Dwarf_Die *die_type)
>  {
> Index: makedumpfile-1.5.1.released/print_info.c
> ===================================================================
> --- makedumpfile-1.5.1.released.orig/print_info.c
> +++ makedumpfile-1.5.1.released/print_info.c
> @@ -244,6 +244,11 @@ print_usage(void)
>  	MSG("  [-f]:\n");
>  	MSG("      Overwrite DUMPFILE even if it already exists.\n");
>  	MSG("\n");
> +	MSG("  [-o]:\n");
> +	MSG("      Read page structures from /proc/vmcore in the scan for\n");
> +	MSG("      free and excluded pages regardless of whether\n");
> +	MSG("      /proc/vmcore_pfn_lists is present.\n");
> +	MSG("\n");
>  	MSG("  [-h]:\n");
>  	MSG("      Show help message and LZO/snappy support status (enabled/disabled).\n");
>  	MSG("\n");
> Index: makedumpfile-1.5.1.released/makedumpfile.c
> ===================================================================
> --- makedumpfile-1.5.1.released.orig/makedumpfile.c
> +++ makedumpfile-1.5.1.released/makedumpfile.c
> @@ -13,6 +13,8 @@
>   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>   * GNU General Public License for more details.
>   */
> +#define _GNU_SOURCE
> +#include <stdio.h>
>  #include "makedumpfile.h"
>  #include "print_info.h"
>  #include "dwarf_info.h"
> @@ -31,6 +33,14 @@ struct srcfile_table	srcfile_table;
>  
>  struct vm_table		vt = { 0 };
>  struct DumpInfo		*info = NULL;
> +int pfn_list_fd;
> +struct pfn_element *pfn_list;
> +int nflag = 0;
> +int oflag = 0;
> +int tflag = 0;
> +int aflag = 0;
> +struct timeval scan_start;
> +int max_pfn_list;
>  
>  char filename_stdout[] = FILENAME_STDOUT;
>  
> @@ -2415,6 +2425,22 @@ get_mm_sparsemem(void)
>  	unsigned long long pfn_start, pfn_end;
>  	unsigned long section, mem_map;
>  	unsigned long *mem_sec = NULL;
> +	unsigned long vaddr;
> +	unsigned long paddr;
> +	unsigned long lastvaddr;
> +	unsigned long lastpaddr;
> +	unsigned long diff;
> +	long j;
> +	int i;
> +	int npfns;
> +	int pagesize;
> +	int num_mem_map;
> +	int num_added = 0;
> +	struct mem_map_data *mmd;
> +	struct mem_map_data *curmmd;
> +	struct mem_map_data *work1mmd;
> +	struct mem_map_data *work2mmd;
> +	struct mem_map_data *lastmmd;
>  
>  	int ret = FALSE;
>  
> @@ -2441,7 +2467,8 @@ get_mm_sparsemem(void)
>  	}
>  	info->num_mem_map = num_section;
>  	if ((info->mem_map_data = (struct mem_map_data *)
> -	    malloc(sizeof(struct mem_map_data)*info->num_mem_map)) == NULL) {
> +	    malloc(sizeof(struct mem_map_data) *
> +		   		(EXTRA_MEMMAPS + info->num_mem_map))) == NULL) {
>  		ERRMSG("Can't allocate memory for the mem_map_data. %s\n",
>  		    strerror(errno));
>  		goto out;
> @@ -2459,6 +2486,71 @@ get_mm_sparsemem(void)
>  		dump_mem_map(pfn_start, pfn_end, mem_map, section_nr);
>  	}
>  	ret = TRUE;
> +
> +	/* add paddr to the table */
> +	mmd = &info->mem_map_data[0];
> +	num_mem_map = info->num_mem_map;
> +	lastmmd = mmd + num_mem_map;
> +	for (i = 0; i < num_mem_map; i++) {
> +		if (mmd[i].mem_map == 0) {
> +			mmd[i].paddr = 0;
> +		} else {
> +			mmd[i].paddr = vaddr_to_paddr(mmd[i].mem_map);
> +			if (mmd[i].paddr == 0) {
> +				printf("! can't translate %#lx to paddr\n",
> +					mmd[i].mem_map);
> +				exit(1);
> +			}
> +			/*
> +			 * When we pass a mem_map and its paddr to the kernel
> +			 * it will be remapped assuming the entire range
> +			 * of pfn's are consecutive. If they are not then
> +			 * we need to split the range into two.
> +			 */
> +			pagesize = SIZE(page);
> +			npfns = mmd[i].pfn_end - mmd[i].pfn_start;
> +			vaddr = (unsigned long)mmd[i].mem_map;
> +			paddr = vaddr_to_paddr(vaddr);
> +			diff = vaddr - paddr;
> +			lastvaddr = vaddr + (pagesize * (npfns-1));
> +			lastpaddr = vaddr_to_paddr(lastvaddr);
> +			if (lastvaddr - lastpaddr != diff) {
> +			/* there is a break in vtop somewhere in this range */
> +			/* we need to split it */
> +			  for (j = 0; j < npfns; j++) {
> +			    paddr = vaddr_to_paddr(vaddr);
> +			    if (vaddr - paddr != diff) {
> +				diff = vaddr - paddr;
> +				/* insert a new entry if we have room */
> +				if (num_added < EXTRA_MEMMAPS) {
> +					curmmd = &info->mem_map_data[i];
> +					num_added++;
> +					work1mmd = lastmmd - 1;
> +					for (work2mmd = lastmmd;
> +						work2mmd > curmmd; work2mmd--) {
> +						work1mmd = work2mmd - 1;
> +						*work2mmd = *work1mmd;
> +					}
> +					work2mmd = work1mmd + 1;
> +					work2mmd->mem_map =
> +					  work1mmd->mem_map + (pagesize * j);
> +					lastmmd++;
> +					num_mem_map++;
> +					info->num_mem_map++;
> +					/*
> +					 * need only 1 split, the new
> +					 * one will be checked also.
> +					 */
> +					break;
> +				} else
> +					printf("warn: out of EXTRA_MEMMAPS\n");
> +			    }
> +			    vaddr += pagesize;
> +			  }
> +			}
> +		}
> +	}
> +
>  out:
>  	if (mem_sec != NULL)
>  		free(mem_sec);
> @@ -2571,6 +2663,172 @@ initialize_bitmap_memory(void)
>  	return TRUE;
>  }
>  
> +/*
> + * construct a version of the mem_map_data table to pass to the kernel
> + */
> +void *
> +make_kernel_mmap(int *kmap_elements, int *kmap_size)
> +{
> +	int i, j;
> +	int elements = 0;
> +	int page_structs;
> +	int elem;
> +	long l;
> +	unsigned long base_end_pfn;
> +	unsigned long end_paddr;
> +	unsigned long v1;
> +	unsigned long v2;
> +	unsigned long end_page_pfns;
> +	unsigned long hpagesize = 0x200000UL;	/* assumes 2MB huge pages */
> +	unsigned long hpageoffset = hpagesize - 1;
> +	struct mem_map_data *mmdo, *mmdn;
> +	struct mem_map_data *mmdbase, *mmdnext, *mmdend, *mmdwork;
> +	struct mem_map_data temp_mmd;
> +	struct mem_map_data *mmap;
> +
> +	mmap = malloc(info->num_mem_map * sizeof(struct mem_map_data));
> +	if (mmap == NULL) {
> +		ERRMSG("Can't allocate memory for kernel mmap\n");
> +		return NULL;
> +	}
> +
> +	/* condense them down to the valid ones */
> +	for (i = 0, mmdn = mmap, mmdo = &info->mem_map_data[0];
> +				i < info->num_mem_map; i++, mmdo++) {
> +		if (mmdo->mem_map && mmdo->paddr) {
> +			*mmdn = *mmdo;
> +			mmdn++;
> +			elements++;
> +		}
> +	}
> +
> +	/* make sure it is sorted by mem_map (it should be already) */
> +	mmdn = mmap;
> +	for (i = 0; i < elements - 1; i++) {
> +		for (j = i + 1; j < elements; j++) {
> +			if (mmdn[j].mem_map < mmdn[i].mem_map) {
> +				temp_mmd = mmdn[j];
> +				mmdn[j] = mmdn[i];
> +				mmdn[i] = temp_mmd;
> +			}
> +		}
> +	}
> +
> +	if (aflag) {
> +		mmdn = mmap;
> +		printf("entire mem_map:\n");
> +		for (i = 0; i < elements; i++) {
> +			l = (mmdn[i].pfn_end - mmdn[i].pfn_start) * SIZE(page);
> +			printf(
> +			 "[%d] pfn %#llx-%llx mem_map %#lx paddr %#llx-%llx\n",
> +				i, mmdn[i].pfn_start, mmdn[i].pfn_end,
> +				mmdn[i].mem_map, mmdn[i].paddr,
> +				mmdn[i].paddr + l);
> +		}
> +	}
> +
> +	/*
> +	 * a first pass to split overlapping pfn entries like this:
> +	 * pfn 0x1248000-1250000 mem_map 0xffffea003ffc0000 paddr 0x10081c0000
> +	 * pfn 0x1248000-1250000 mem_map 0xffffea0040000030 paddr 0x1008400030
> +	 */
> +	mmdbase = mmap;
> +	mmdnext = mmap + 1;
> +	mmdend = mmap + elements;
> +	/* test each mmdbase/mmdnext pair */
> +	while (mmdnext < mmdend) {  /* mmdnext is the one after mmdbase */
> +		page_structs = (mmdbase->pfn_end - mmdbase->pfn_start);
> +		/* mmdwork scans from mmdnext to the end */
> +		if ((mmdbase->pfn_start == mmdnext->pfn_start) &&
> +		    (mmdbase->pfn_end == mmdnext->pfn_end)) {
> +			/* overlapping pfns, we need a fix */
> +			v1 = mmdnext->mem_map - mmdbase->mem_map;
> +			v2 = mmdnext->paddr - mmdbase->paddr;
> +			if (v1 != (v2 & hpageoffset))
> +				printf("virt to phys is wrong %#lx %#lx\n",
> +					v1, v2);
> +			l = mmdbase->pfn_end - mmdbase->pfn_start;
> +			end_page_pfns = l - (((hpagesize -
> +				(hpageoffset & mmdbase->paddr)) +
> +				  SIZE(page) - 1) / SIZE(page));
> +			mmdbase->pfn_end -= end_page_pfns;
> +			mmdnext->pfn_start = mmdbase->pfn_end;
> +		} else if ((mmdbase->pfn_start == mmdnext->pfn_start) ||
> +		    (mmdbase->pfn_end == mmdnext->pfn_end)) {
> +			printf("warning: unfixed overlap\n");
> +		}
> +		mmdbase++;
> +		mmdnext++;
> +	}
> +
> +	/*
> +	 * consolidate those mem_map's occupying consecutive physical
> +	 * addresses
> +	 *  pages represented by these pages structs:       addr of page struct
> +	 * pfns 0x1000000-1008000 mem_map 0xffffea0038000000 paddr 0x11f7e00000
> +	 * pfns 0x1008000-1010000 mem_map 0xffffea00381c0000 paddr 0x11f7fc0000
> +	 * pfns 0x1010000-1018000 mem_map 0xffffea0038380000 paddr 0x11f8180000
> +	 *           8000 increments                             inc's:  1c0000
> +	 *        8000000 of memory (128M)                    8000 page structs
> +	 */
> +	mmdbase = mmap;
> +	mmdnext = mmap + 1;
> +	mmdend = mmap + elements;
> +	while (mmdnext < mmdend) {
> +		elem = mmdend - mmdnext;
> +		/*  test mmdbase vs. mmdwork and onward: */
> +		for (i = 0, mmdwork = mmdnext; i < elem; i++, mmdwork++) {
> +			base_end_pfn = mmdbase->pfn_end;
> +			if (base_end_pfn == mmdwork->pfn_start) {
> +				page_structs = (mmdbase->pfn_end -
> +							mmdbase->pfn_start);
> +				end_paddr = (page_structs * SIZE(page)) +
> +							mmdbase->paddr;
> +				if (mmdwork->paddr == end_paddr) {
> +					/* extend base by the work one */
> +					mmdbase->pfn_end = mmdwork->pfn_end;
> +					/* next is where to begin next time */
> +					mmdnext = mmdwork + 1;
> +				} else {
> +					/* gap in address of page
> +					   structs; end of section */
> +					mmdbase++;
> +					if (mmdwork - mmdbase > 0)
> +						*mmdbase = *mmdwork;
> +					mmdnext = mmdwork + 1;
> +					break;
> +				}
> +			} else {
> +				/* gap in pfns; end of section */
> +				mmdbase++;
> +				if (mmdwork - mmdbase > 0)
> +					*mmdbase = *mmdwork;
> +				mmdnext = mmdwork + 1;
> +				break;
> +			}
> +		}
> +	}
> +	elements = (mmdbase - mmap) + 1;
> +
> +	if (aflag) {
> +		printf("user mmap for kernel:\n");
> +		for (i = 0, mmdwork = mmap; i < elements; i++, mmdwork++) {
> +			l = mmdwork->pfn_end - mmdwork->pfn_start;
> +			printf(
> +		      "[%d] user pfn %#llx-%llx paddr %#llx-%llx vaddr %#lx\n",
> +				i, mmdwork->pfn_start, mmdwork->pfn_end,
> +				mmdwork->paddr,
> +				mmdwork->paddr + (l * SIZE(page)),
> +				mmdwork->mem_map);
> +		}
> +	}
> +
> +	*kmap_elements = elements;
> +	*kmap_size = elements * sizeof(struct mem_map_data);
> +
> +	return mmap;
> +}
> +
>  int
>  initial(void)
>  {
> @@ -2833,7 +3091,14 @@ out:
>  	if (!get_value_for_old_linux())
>  		return FALSE;
>  
> -	if (info->flag_cyclic && (info->dump_level & DL_EXCLUDE_FREE))
> +	/*
> +	 * page_is_buddy will tell us whether free pages can be identified
> +	 * by flags and counts in the page structure without making an extra
> +	 * pass through the free lists.
> +	 * This is applicable to using /proc/vmcore or using the kernel.
> +	 *   force all old (-o) forms to search free lists
> +	 */
> +	if (info->dump_level & DL_EXCLUDE_FREE)
>  		setup_page_is_buddy();
>  
>  	return TRUE;
> @@ -3549,6 +3814,65 @@ out:
>  	return ret;
>  }
>  
> +/*
> + * let the kernel find excludable pages from one node
> + */
> +void
> +__exclude_free_pages_kernel(unsigned long pgdat, int node)
> +{
> +	int i, j, ret, pages;
> +	unsigned long pgdat_paddr;
> +	struct pfn_list_request request;
> +	struct pfn_reply reply;
> +	struct pfn_element *pe;
> +
> +	if ((pgdat_paddr = vaddr_to_paddr(pgdat)) == NOT_PADDR) {
> +		ERRMSG("Can't convert virtual address(%#lx) to physical.\n",
> +			pgdat);
> +		return;
> +	}
> +
> +	/*
> +	 * Get the list of free pages.
> +	 * This may be broken up into lists of at most MAX_PFN_LIST PFNs.
> +	 */
> +	memset(&request, 0, sizeof(request));
> +	request.request = PL_REQUEST_FREE;
> +	request.node = node;
> +	request.pgdat_paddr = pgdat_paddr;
> +	request.pgdat_vaddr = pgdat;
> +	request.reply_ptr = (void *)&reply;
> +	request.pfn_list_ptr = (void *)pfn_list;
> +	memset(&reply, 0, sizeof(reply));
> +
> +	do {
> +		request.more = 0;
> +		if (reply.more) {
> +			/* this is to be a continuation of the last request */
> +			request.more = 1;
> +			request.zone_index = reply.zone_index;
> +			request.freearea_index = reply.freearea_index;
> +			request.type_index = reply.type_index;
> +			request.list_ct = reply.list_ct;
> +		}
> +		ret = write(pfn_list_fd, &request, sizeof(request));
> +		if (ret != sizeof(request)) {
> +			printf("PL_REQUEST_FREE failed\n");
> +			return;
> +		}
> +		pfn_free += reply.pfn_free;
> +
> +		for (i = 0; i < reply.in_pfn_list; i++) {
> +			pe = &pfn_list[i];
> +			pages = (1 << pe->order);
> +			for (j = 0; j < pages; j++) {
> +				clear_bit_on_2nd_bitmap_for_kernel(pe->pfn + j);
> +			}
> +		}
> +	} while (reply.more);
> +
> +	return;
> +}
>  
>  int
>  _exclude_free_page(void)
> @@ -3568,7 +3892,24 @@ _exclude_free_page(void)
>  	gettimeofday(&tv_start, NULL);
>  
>  	for (num_nodes = 1; num_nodes <= vt.numnodes; num_nodes++) {
> -
> +		if (!info->flag_cyclic && info->flag_use_kernel_lists) {
> +			node_zones = pgdat + OFFSET(pglist_data.node_zones);
> +			if (!readmem(VADDR,
> +				pgdat + OFFSET(pglist_data.nr_zones),
> +				&nr_zones, sizeof(nr_zones))) {
> +				ERRMSG("Can't get nr_zones.\n");
> +				return FALSE;
> +			}
> +			print_progress(PROGRESS_FREE_PAGES, num_nodes - 1,
> +								vt.numnodes);
> +			/* ask the kernel to do one node */
> +			__exclude_free_pages_kernel(pgdat, node);
> +			goto next_pgdat;
> +		}
> +		/*
> +		 * kernel does not have the pfn_list capability
> +		 * use the old way
> +		 */
>  		print_progress(PROGRESS_FREE_PAGES, num_nodes - 1, vt.numnodes);
>  
>  		node_zones = pgdat + OFFSET(pglist_data.node_zones);
> @@ -3595,6 +3936,7 @@ _exclude_free_page(void)
>  			if (!reset_bitmap_of_free_pages(zone))
>  				return FALSE;
>  		}
> +	next_pgdat:
>  		if (num_nodes < vt.numnodes) {
>  			if ((node = next_online_node(node + 1)) < 0) {
>  				ERRMSG("Can't get next online node.\n");
> @@ -3612,6 +3954,8 @@ _exclude_free_page(void)
>  	 */
>  	print_progress(PROGRESS_FREE_PAGES, vt.numnodes, vt.numnodes);
>  	print_execution_time(PROGRESS_FREE_PAGES, &tv_start);
> +	if (tflag)
> +		print_execution_time("Total time", &scan_start);
>  
>  	return TRUE;
>  }
> @@ -3755,7 +4099,6 @@ setup_page_is_buddy(void)
>  		}
>  	} else
>  		info->page_is_buddy = page_is_buddy_v2;
> -
>  out:
>  	if (!info->page_is_buddy)
>  		DEBUG_MSG("Can't select page_is_buddy handler; "
> @@ -3964,10 +4307,89 @@ exclude_zero_pages(void)
>  	return TRUE;
>  }
>  
> +/*
> + * let the kernel find excludable pages from one mem_section
> + */
> +int
> +__exclude_unnecessary_pages_kernel(int mm, struct mem_map_data *mmd)
> +{
> +	unsigned long long pfn_start = mmd->pfn_start;
> +	unsigned long long pfn_end = mmd->pfn_end;
> +	int i, j, ret, pages, flag;
> +	struct pfn_list_request request;
> +	struct pfn_reply reply;
> +	struct pfn_element *pe;
> +
> +	/*
> +	 * Get the list of to-be-excluded pages in this section.
> +	 * It may be broken up by groups of max_pfn_list size.
> +	 */
> +	memset(&request, 0, sizeof(request));
> +	request.request = PL_REQUEST_EXCLUDE;
> +	request.paddr = mmd->paddr; /* phys addr of mem_map */
> +	request.reply_ptr = (void *)&reply;
> +	request.pfn_list_ptr = (void *)pfn_list;
> +	request.exclude_bits = 0;
> +	request.pfn_start = pfn_start;
> +	request.count = pfn_end - pfn_start;
> +	if (info->dump_level & DL_EXCLUDE_CACHE)
> +		request.exclude_bits |= DL_EXCLUDE_CACHE;
> +	if (info->dump_level & DL_EXCLUDE_CACHE_PRI)
> +		request.exclude_bits |= DL_EXCLUDE_CACHE_PRI;
> +	if (info->dump_level & DL_EXCLUDE_USER_DATA)
> +		request.exclude_bits |= DL_EXCLUDE_USER_DATA;
> +	/* if we ask for free pages from the free lists then we don't
> +	 * need to ask here for 'buddy' pages */
> +	if (info->dump_level & DL_EXCLUDE_FREE)
> +		request.exclude_bits |= DL_EXCLUDE_FREE;
> +	memset(&reply, 0, sizeof(reply));
> +
> +	do {
> +		/* pfn represented by paddr */
> +		request.more = 0;
> +		if (reply.more) {
> +			/* this is to be a continuation of the last request */
> +			request.more = 1;
> +			request.map_index = reply.map_index;
> +		}
> +
> +		ret = write(pfn_list_fd, &request, sizeof(request));
> +		if (ret != sizeof(request))
> +			return FALSE;
> +
> +		pfn_cache += reply.pfn_cache;
> +		pfn_cache_private += reply.pfn_cache_private;
> +		pfn_user += reply.pfn_user;
> +		pfn_free += reply.pfn_free;
> +
> +		flag = 0;
> +		for (i = 0; i < reply.in_pfn_list && !flag; i++) {
> +			pe = &pfn_list[i];
> +			pages = (1 << pe->order);
> +			for (j = 0; j < pages; j++) {
> +				if (clear_bit_on_2nd_bitmap_for_kernel(
> +							pe->pfn + j) == FALSE) {
> +					flag = 1;
> +					break;
> +				}
> +			}
> +		}
> +	} while (reply.more);
> +
> +	return TRUE;
> +}
> +
>  int
> -__exclude_unnecessary_pages(unsigned long mem_map,
> -    unsigned long long pfn_start, unsigned long long pfn_end)
> +__exclude_unnecessary_pages(int mm, struct mem_map_data *mmd)
>  {
> +	unsigned long long pfn_start = mmd->pfn_start;
> +	unsigned long long pfn_end = mmd->pfn_end;
> +	unsigned long mem_map = mmd->mem_map;
>  	unsigned long long pfn, pfn_mm, maddr;
>  	unsigned long long pfn_read_start, pfn_read_end, index_pg;
>  	unsigned char page_cache[SIZE(page) * PGMM_CACHED];
> @@ -3975,6 +4397,12 @@ __exclude_unnecessary_pages(unsigned lon
>  	unsigned int _count, _mapcount = 0;
>  	unsigned long flags, mapping, private = 0;
>  
> +	if (info->flag_use_kernel_lists) {
> +		if (__exclude_unnecessary_pages_kernel(mm, mmd) == FALSE)
> +			return FALSE;
> +		return TRUE;
> +	}
> +
>  	/*
>  	 * Refresh the buffer of struct page, when changing mem_map.
>  	 */
> @@ -4012,7 +4440,6 @@ __exclude_unnecessary_pages(unsigned lon
>  				pfn_mm = PGMM_CACHED - index_pg;
>  			else
>  				pfn_mm = pfn_end - pfn;
> -
>  			if (!readmem(VADDR, mem_map,
>  			    page_cache + (index_pg * SIZE(page)),
>  			    SIZE(page) * pfn_mm)) {
> @@ -4036,7 +4463,6 @@ __exclude_unnecessary_pages(unsigned lon
>  		 * Exclude the free page managed by a buddy
>  		 */
>  		if ((info->dump_level & DL_EXCLUDE_FREE)
> -		    && info->flag_cyclic
>  		    && info->page_is_buddy
>  		    && info->page_is_buddy(flags, _mapcount, private, _count)) {
>  			int i;
> @@ -4085,19 +4511,78 @@ __exclude_unnecessary_pages(unsigned lon
>  	return TRUE;
>  }
>  
> +/*
> + * Pass in the mem_map_data table.
> + * Must do this once, and before doing PL_REQUEST_FREE or PL_REQUEST_EXCLUDE.
> + */
> +int
> +setup_kernel_mmap()
> +{
> +	int ret;
> +	int kmap_elements, kmap_size;
> +	long malloc_size;
> +	void *kmap_addr;
> +	struct pfn_list_request request;
> +	struct pfn_reply reply;
> +
> +	kmap_addr = make_kernel_mmap(&kmap_elements, &kmap_size);
> +	if (kmap_addr == NULL)
> +		return FALSE;
> +	memset(&request, 0, sizeof(request));
> +	request.request = PL_REQUEST_MEMMAP;
> +	request.map_ptr = kmap_addr;
> +	request.reply_ptr = (void *)&reply;
> +	request.map_count = kmap_elements;
> +	request.map_size = kmap_size;
> +	request.list_size = MAX_PFN_LIST;
> +
> +	ret = write(pfn_list_fd, &request, sizeof(request));
> +	if (ret < 0) {
> +		fprintf(stderr, "PL_REQUEST_MEMMAP returned %d\n", ret);
> +		return FALSE;
> +	}
> +	/* the reply tells us how long the kernel's list actually is */
> +	max_pfn_list = reply.pfn_list_elements;
> +	if (max_pfn_list <= 0) {
> +		fprintf(stderr,
> +			"PL_REQUEST_MEMMAP returned max_pfn_list %d\n",
> +			max_pfn_list);
> +		return FALSE;
> +	}
> +	if (max_pfn_list < MAX_PFN_LIST) {
> +		printf("length of pfn list dropped from %d to %d\n",
> +			MAX_PFN_LIST, max_pfn_list);
> +	}
> +	free(kmap_addr);
> +	/*
> +	 * Allocate the buffer for the PFN list (just once).
> +	 */
> +	malloc_size = max_pfn_list * sizeof(struct pfn_element);
> +	if ((pfn_list = (struct pfn_element *)malloc(malloc_size)) == NULL) {
> +		ERRMSG("Can't allocate pfn_list of %ld\n", malloc_size);
> +		return FALSE;
> +	}
> +	return TRUE;
> +}
> +
>  int
>  exclude_unnecessary_pages(void)
>  {
>  	unsigned int mm;
>  	struct mem_map_data *mmd;
>  	struct timeval tv_start;
>  
>  	if (is_xen_memory() && !info->dom0_mapnr) {
>  		ERRMSG("Can't get max domain-0 PFN for excluding pages.\n");
>  		return FALSE;
>  	}
>  
> +	if (!info->flag_cyclic && info->flag_use_kernel_lists) {
> +		if (setup_kernel_mmap() == FALSE)
> +			return FALSE;
> +	}
>  	gettimeofday(&tv_start, NULL);
> +	gettimeofday(&scan_start, NULL);
>  
>  	for (mm = 0; mm < info->num_mem_map; mm++) {
>  		print_progress(PROGRESS_UNN_PAGES, mm, info->num_mem_map);
> @@ -4106,9 +4591,9 @@ exclude_unnecessary_pages(void)
>  
>  		if (mmd->mem_map == NOT_MEMMAP_ADDR)
>  			continue;
> -
> -		if (!__exclude_unnecessary_pages(mmd->mem_map,
> -						 mmd->pfn_start, mmd->pfn_end))
> +		if (mmd->paddr == 0)
> +			continue;
> +		if (!__exclude_unnecessary_pages(mm, mmd))
>  			return FALSE;
>  	}
>  
> @@ -4139,7 +4624,11 @@ exclude_unnecessary_pages_cyclic(void)
>  	 */
>  	copy_bitmap_cyclic();
>  
> -	if ((info->dump_level & DL_EXCLUDE_FREE) && !info->page_is_buddy)
> +	/*
> +	 * If free pages cannot be identified with the buddy flag and/or
> +	 * count then we have to search free lists.
> +	 */
> +	if ((info->dump_level & DL_EXCLUDE_FREE) && (!info->page_is_buddy))
>  		if (!exclude_free_page())
>  			return FALSE;
>  
> @@ -4164,8 +4653,7 @@ exclude_unnecessary_pages_cyclic(void)
>  
>  			if (mmd->pfn_end >= info->cyclic_start_pfn &&
>  			    mmd->pfn_start <= info->cyclic_end_pfn) {
> -				if (!__exclude_unnecessary_pages(mmd->mem_map,
> -								 mmd->pfn_start, mmd->pfn_end))
> +				if (!__exclude_unnecessary_pages(mm, mmd))
>  					return FALSE;
>  			}
>  		}
> @@ -4195,7 +4683,7 @@ update_cyclic_region(unsigned long long 
>  	if (!create_1st_bitmap_cyclic())
>  		return FALSE;
>  
> -	if (!exclude_unnecessary_pages_cyclic())
> +	if (exclude_unnecessary_pages_cyclic() == FALSE)
>  		return FALSE;
>  
>  	return TRUE;
> @@ -4255,7 +4743,7 @@ create_2nd_bitmap(void)
>  	if (info->dump_level & DL_EXCLUDE_CACHE ||
>  	    info->dump_level & DL_EXCLUDE_CACHE_PRI ||
>  	    info->dump_level & DL_EXCLUDE_USER_DATA) {
> -		if (!exclude_unnecessary_pages()) {
> +		if (exclude_unnecessary_pages() == FALSE) {
>  			ERRMSG("Can't exclude unnecessary pages.\n");
>  			return FALSE;
>  		}
> @@ -4263,8 +4751,10 @@ create_2nd_bitmap(void)
>  
>  	/*
>  	 * Exclude free pages.
> +	 * If free pages cannot be identified with the buddy flag and/or
> +	 * count then we have to search free lists.
>  	 */
> -	if (info->dump_level & DL_EXCLUDE_FREE)
> +	if ((info->dump_level & DL_EXCLUDE_FREE) && (!info->page_is_buddy))
>  		if (!exclude_free_page())
>  			return FALSE;
>  
> @@ -4395,6 +4885,10 @@ create_dump_bitmap(void)
>  	int ret = FALSE;
>  
>  	if (info->flag_cyclic) {
> +		if (info->flag_use_kernel_lists) {
> +			if (setup_kernel_mmap() == FALSE)
> +				goto out;
> +		}
>  		if (!prepare_bitmap_buffer_cyclic())
>  			goto out;
>  
> @@ -4872,6 +5366,7 @@ get_num_dumpable_cyclic(void)
>  {
>  	unsigned long long pfn, num_dumpable=0;
>  
> +	gettimeofday(&scan_start, NULL);
>  	for (pfn = 0; pfn < info->max_mapnr; pfn++) {
>  		if (!update_cyclic_region(pfn))
>  			return FALSE;
> @@ -5201,7 +5696,7 @@ get_loads_dumpfile_cyclic(void)
>  	info->cyclic_end_pfn = info->pfn_cyclic;
>  	if (!create_1st_bitmap_cyclic())
>  		return FALSE;
> -	if (!exclude_unnecessary_pages_cyclic())
> +	if (exclude_unnecessary_pages_cyclic() == FALSE)
>  		return FALSE;
>  
>  	if (!(phnum = get_phnum_memory()))
> @@ -5613,6 +6108,10 @@ write_kdump_pages(struct cache_data *cd_
>  			pfn_zero++;
>  			continue;
>  		}
> +
> +		if (nflag)
> +			continue;
> +
>  		/*
>  		 * Compress the page data.
>  		 */
> @@ -5786,11 +6286,17 @@ write_kdump_pages_cyclic(struct cache_da
>  		 */
>  		if ((info->dump_level & DL_EXCLUDE_ZERO)
>  		    && is_zero_page(buf, info->page_size)) {
> +		    if (!nflag) {
>  			if (!write_cache(cd_header, pd_zero, sizeof(page_desc_t)))
>  				goto out;
> +		    }
>  			pfn_zero++;
>  			continue;
>  		}
> +
> +		if (nflag)
> +			continue;
> +
>  		/*
>  		 * Compress the page data.
>  		 */
> @@ -6208,6 +6714,8 @@ write_kdump_pages_and_bitmap_cyclic(stru
>  		if (!update_cyclic_region(pfn))
>                          return FALSE;
>  
> +		if (tflag)
> +			print_execution_time("Total time", &scan_start);
>  		if (!write_kdump_pages_cyclic(cd_header, cd_page, &pd_zero, &offset_data))
>  			return FALSE;
>  
> @@ -8231,6 +8739,22 @@ static struct option longopts[] = {
>  	{0, 0, 0, 0}
>  };
>  
> +/*
> + * test for the presence of capability in the kernel to provide lists
> + * of pfn's:
> + *   /proc/vmcore_pfn_lists
> + * return 1 for present
> + * return 0 for not present
> + */
> +int
> +test_kernel_pfn_lists(void)
> +{
> +	if ((pfn_list_fd = open("/proc/vmcore_pfn_lists", O_WRONLY)) < 0) {
> +		return 0;
> +	}
> +	return 1;
> +}
> +
>  int
>  main(int argc, char *argv[])
>  {
> @@ -8256,9 +8780,12 @@ main(int argc, char *argv[])
>  	
>  	info->block_order = DEFAULT_ORDER;
>  	message_level = DEFAULT_MSG_LEVEL;
> -	while ((opt = getopt_long(argc, argv, "b:cDd:EFfg:hi:lMpRrsvXx:", longopts,
> +	while ((opt = getopt_long(argc, argv, "ab:cDd:EFfg:hi:lMnopRrstvXx:", longopts,
>  	    NULL)) != -1) {
>  		switch (opt) {
> +		case 'a':
> +			aflag = 1;
> +			break;
>  		case 'b':
>  			info->block_order = atoi(optarg);
>  			break;
> @@ -8314,6 +8841,13 @@ main(int argc, char *argv[])
>  		case 'M':
>  			info->flag_dmesg = 1;
>  			break;
> +		case 'n':
> +			/* -n undocumented, for testing page scanning time */
> +			nflag = 1;
> +			break;
> +		case 'o':
> +			oflag = 1;
> +			break;
>  		case 'p':
>  			info->flag_compress = DUMP_DH_COMPRESSED_SNAPPY;
>  			break;
> @@ -8329,6 +8863,9 @@ main(int argc, char *argv[])
>  		case 'r':
>  			info->flag_reassemble = 1;
>  			break;
> +		case 't':
> +			tflag = 1;
> +			break;
>  		case 'V':
>  			info->vaddr_for_vtop = strtoul(optarg, NULL, 0);
>  			break;
> @@ -8360,6 +8897,12 @@ main(int argc, char *argv[])
>  			goto out;
>  		}
>  	}
> +
> +	if (oflag)
> +		info->flag_use_kernel_lists = 0;
> +	else
> +		info->flag_use_kernel_lists = test_kernel_pfn_lists();
> +
>  	if (flag_debug)
>  		message_level |= ML_PRINT_DEBUG_MSG;
>  
> 
> _______________________________________________
> kexec mailing list
> kexec at lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/kexec

Cliff, I tried your patch above on makedumpfile v1.5.1 (built dynamically
on the same DL980 I was running the test on), with all the RHEL 6
versions of the kernel patches you gave me from 1207, plus the kernel
patch to kexec recommended for makedumpfile v1.5.1, built on top of a
preliminary RHEL 6.4 kernel source (a higher patch level of the 2.6.32
kernel). This time I tested on a 1 TB memory system (we have lost access
to a 4 TB memory system for some time now). On this same system, regular
makedumpfile v1.5.1 worked fine to produce a dump, but the makedumpfile
with the patches above could not even start the dump, and printed:

Saving vmcore-dmesg.txt
Saved vmcore-dmesg.txt
PL_REQUEST_MEMMAP returned -1
Restarting system.

This happened both with a crashkernel size of 200M, which would have
invoked cyclic buffer mode, and with a larger one, 384M, which should not
have needed cyclic mode. I neither set nor turned off cyclic buffer mode
on the makedumpfile command line; I was just recording memory usage with:

core_collector makedumpfile -c --message-level 31 -d 31
debug_mem_level 2


The message comes from this code in the patch's setup_kernel_mmap():

	ret = write(pfn_list_fd, &request, sizeof(request));
	if (ret < 0) {
		fprintf(stderr, "PL_REQUEST_MEMMAP returned %d\n", ret);
		return FALSE;
	}
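
Since write(2) returns -1 with errno set when the kernel side rejects the
request, one quick debugging aid would be to include strerror(errno) in
that message (a hypothetical tweak, not part of the posted patch;
makedumpfile already uses errno and strerror elsewhere):

	ret = write(pfn_list_fd, &request, sizeof(request));
	if (ret < 0) {
		fprintf(stderr, "PL_REQUEST_MEMMAP returned %d (%s)\n",
			ret, strerror(errno));
		return FALSE;
	}

The errno value should at least narrow down where the kernel side failed.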

Any ideas what might have caused this? Am I missing a patch? Do I have
the wrong kernel patches? Any tips for debugging?

I am attaching the kernel patches you sent me earlier that I used, on
top of:

https://lkml.org/lkml/2012/11/21/90, with the tweak below for RHEL
2.6.32 kernels applied on top of it:

NOTE: The patch above is for the latest kernel, so you need to fix it as
      below if your kernel version is between v2.6.18 and v2.6.37:

diff --git a/kernel/kexec.c b/kernel/kexec.c
index 511151b..56583a4 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1490,7 +1490,6 @@ static int __init crash_save_vmcoreinfo_init(void)
	VMCOREINFO_OFFSET(page, flags);
	VMCOREINFO_OFFSET(page, _count);
	VMCOREINFO_OFFSET(page, mapping);
-	VMCOREINFO_OFFSET(page, _mapcount);
	VMCOREINFO_OFFSET(page, private);
	VMCOREINFO_OFFSET(page, lru);
	VMCOREINFO_OFFSET(pglist_data, node_zones);
@@ -1515,8 +1514,7 @@ static int __init crash_save_vmcoreinfo_init(void)
	VMCOREINFO_NUMBER(PG_lru);
	VMCOREINFO_NUMBER(PG_private);
	VMCOREINFO_NUMBER(PG_swapcache);
-	VMCOREINFO_NUMBER(PG_slab);
-	VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
+	VMCOREINFO_NUMBER(PG_buddy);

	arch_crash_save_vmcoreinfo();
	update_vmcoreinfo_note();
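
For background on why this tweak is needed: kernels older than v2.6.38
mark a free buddy page with the PG_buddy page flag, while v2.6.38 and
later dropped PG_buddy and instead store PAGE_BUDDY_MAPCOUNT_VALUE (-128)
in the page's _mapcount. makedumpfile's two page_is_buddy handlers encode
that difference; a simplified sketch of the checks (abbreviated from
makedumpfile's page_is_buddy_v1/v2, which also validate other fields):

	/* kernels < v2.6.38: a free page carries the PG_buddy flag */
	static int
	page_is_buddy_v1(unsigned long flags, unsigned int _mapcount,
			 unsigned long private, unsigned int _count)
	{
		return !!((flags >> NUMBER(PG_buddy)) & 1);
	}

	/* kernels >= v2.6.38: PG_buddy is gone; a free page is marked by
	   _mapcount == PAGE_BUDDY_MAPCOUNT_VALUE */
	static int
	page_is_buddy_v2(unsigned long flags, unsigned int _mapcount,
			 unsigned long private, unsigned int _count)
	{
		return (int)_mapcount == (int)NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
	}

So a v2.6.32-based RHEL 6 kernel has to export PG_buddy in its
vmcoreinfo, which is what the tweak above restores.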


-------------- next part --------------
An embedded message was scrubbed...
Subject: [PATCH] scan page tables for makedumpfile
Date: Wed, 16 Jan 2013 05:00:37 -0700
Size: 21801
URL: <http://lists.infradead.org/pipermail/kexec/attachments/20130116/4cbb470c/attachment-0001.mht>

