[PATCH v2] makedumpfile: request the kernel do page scans

Cliff Wickman cpw at sgi.com
Fri Jan 4 11:20:14 EST 2013


From: Cliff Wickman <cpw at sgi.com>

This version of the patch improves the consolidation of the mem_map table
that is passed to the kernel; see make_kernel_mmap().  In particular, it
handles the seemingly duplicate pfn ranges generated by an older
(2.6.32-based, rhel6) kernel.



I've been experimenting with asking the kernel to scan its own page
structures instead of reading them all through /proc/vmcore.
The results are rather dramatic.
On a small, idle UV: about 4 seconds versus about 40 seconds.
On an 8TB UV the scan for unnecessary pages takes 4 minutes, versus about
200 minutes through /proc/vmcore.

This patch incorporates the scheme into version 1.5.1, so that the cyclic
processing can also use the kernel scans.
It also uses the page_is_buddy logic to speed the identification of free
pages, and it still allows makedumpfile to work as before with a kernel
that does not provide /proc/vmcore_pfn_lists.
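
As a reminder, the page_is_buddy test recognizes a free page directly from
its page struct during the scan, rather than by a separate walk of the
zone free lists.  A condensed fragment of the scan loop in
__exclude_unnecessary_pages() below (the buddy order of a free block is
kept in the page's private field; the real loop also clears the bitmap
bits for the whole block):

    if ((info->dump_level & DL_EXCLUDE_FREE)
        && info->page_is_buddy
        && info->page_is_buddy(flags, _mapcount, private, _count))
            pfn_free += 1 << private;    /* a whole 2^order block is free */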

This patch:
  - writes requests to the new kernel file /proc/vmcore_pfn_lists
    (the protocol is sketched below, after this list)
  - makes request PL_REQUEST_MEMMAP to pass the crash kernel information
    about the boot kernel
  - makes requests PL_REQUEST_FREE and PL_REQUEST_EXCLUDE, asking the kernel
    to return lists of PFNs
  - adds page scan timing options -n, -o and -t
  - still has a debugging option -a
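
To illustrate the protocol, here is a minimal sketch of one
PL_REQUEST_EXCLUDE exchange, condensed from
__exclude_unnecessary_pages_kernel() below.  Error handling and the
PL_REQUEST_MEMMAP setup that must come first are omitted, and clear_pfn()
is a stand-in for clear_bit_on_2nd_bitmap_for_kernel():

    struct pfn_list_request request;
    struct pfn_reply reply;
    int i;

    memset(&request, 0, sizeof(request));
    request.request = PL_REQUEST_EXCLUDE;
    request.paddr = mmd->paddr;              /* phys addr of this mem_map */
    request.pfn_start = mmd->pfn_start;
    request.count = mmd->pfn_end - mmd->pfn_start;
    request.exclude_bits = DL_EXCLUDE_CACHE | DL_EXCLUDE_USER_DATA;
    request.reply_ptr = (void *)&reply;
    request.pfn_list_ptr = (void *)pfn_list; /* shared pfn array */
    memset(&reply, 0, sizeof(reply));
    do {
            request.more = 0;
            if (reply.more) {        /* resume where the kernel left off */
                    request.more = 1;
                    request.map_index = reply.map_index;
            }
            /* one write() both submits the request and fills the list */
            if (write(pfn_list_fd, &request, sizeof(request))
                                            != sizeof(request))
                    break;
            for (i = 0; i < reply.in_pfn_list; i++)
                    clear_pfn(pfn_list[i].pfn, 1UL << pfn_list[i].order);
    } while (reply.more);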

This patch depends on a kernel patch.

Diffed against the released makedumpfile-1.5.1
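
For example (hypothetical invocations; -t reports the total scan time,
and -o forces the old /proc/vmcore page scan for comparison):

    makedumpfile -c -d 31 -t /proc/vmcore dumpfile    # kernel-assisted scan
    makedumpfile -c -d 31 -o /proc/vmcore dumpfile    # old page scan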

Signed-off-by: Cliff Wickman <cpw at sgi.com>
---
 dwarf_info.c   |    2 
 makedumpfile.c |  587 ++++++++++++++++++++++++++++++++++++++++++++++++++++++---
 makedumpfile.h |   95 +++++++++
 print_info.c   |    5 
 4 files changed, 665 insertions(+), 24 deletions(-)


Index: makedumpfile-1.5.1.released/makedumpfile.h
===================================================================
--- makedumpfile-1.5.1.released.orig/makedumpfile.h
+++ makedumpfile-1.5.1.released/makedumpfile.h
@@ -86,6 +86,8 @@ int get_mem_type(void);
 #define LSEEKED_PDESC	(2)
 #define LSEEKED_PDATA	(3)
 
+#define EXTRA_MEMMAPS	100
+
 /*
  * Xen page flags
  */
@@ -418,7 +420,7 @@ do { \
 #define KVER_MIN_SHIFT 16
 #define KERNEL_VERSION(x,y,z) (((x) << KVER_MAJ_SHIFT) | ((y) << KVER_MIN_SHIFT) | (z))
 #define OLDEST_VERSION		KERNEL_VERSION(2, 6, 15)/* linux-2.6.15 */
-#define LATEST_VERSION		KERNEL_VERSION(3, 6, 7)/* linux-3.6.7 */
+#define LATEST_VERSION		KERNEL_VERSION(3, 7, 8)/* linux-3.7.8 */
 
 /*
  * vmcoreinfo in /proc/vmcore
@@ -794,11 +796,24 @@ typedef struct {
 } xen_crash_info_v2_t;
 
 struct mem_map_data {
+	/*
+	 * pfn_start/pfn_end are the pfn's represented by this mem_map entry.
+	 * mem_map is the virtual address of the array of page structures
+	 * that represent these pages.
+	 * paddr is the physical address of that array of structures.
+	 * ending_paddr is paddr + (pfn_end - pfn_start) * sizeof(struct page).
+	 * section_vaddr is the address we get from ioremap_cache().
+	 */
 	unsigned long long	pfn_start;
 	unsigned long long	pfn_end;
-	unsigned long	mem_map;
+	unsigned long		mem_map;
+	unsigned long long	paddr;		/* filled in by makedumpfile */
+	long			virtual_offset;	/* filled in by kernel */
+	unsigned long long	ending_paddr;	/* filled in by kernel */
+	unsigned long		mapped_size;	/* filled in by kernel */
+	void 			*section_vaddr;	/* filled in by kernel */
 };
 
 struct dump_bitmap {
 	int		fd;
 	int		no_block;
@@ -875,6 +891,7 @@ struct DumpInfo {
 	int		flag_rearrange;      /* flag of creating dumpfile from
 						flattened format */
 	int		flag_split;	     /* splitting vmcore */
+	int		flag_use_kernel_lists;
   	int		flag_cyclic;	     /* cyclic processing to keep memory consumption */
 	int		flag_reassemble;     /* reassemble multiple dumpfiles into one */
 	int		flag_refiltering;    /* refilter from kdump-compressed file */
@@ -1384,6 +1401,80 @@ struct domain_list {
 	unsigned int  pickled_id;
 };
 
+#define PL_REQUEST_FREE		1	/* request for a list of free pages */
+#define PL_REQUEST_EXCLUDE	2	/* request for a list of excludable
+					   pages */
+#define PL_REQUEST_MEMMAP	3	/* request to pass in the makedumpfile
+					   mem_map_data table */
+/*
+ * limit the size of the pfn list to this many pfn_element structures
+ */
+#define MAX_PFN_LIST 10000
+
+/*
+ * one element in the pfn_list
+ */
+struct pfn_element {
+	unsigned long pfn;
+	unsigned long order;
+};
+
+/*
+ * a request for finding pfn's that can be excluded from the dump
+ * they may be pages of particular types or free pages
+ */
+struct pfn_list_request {
+	int request;		/* PL_REQUEST_FREE, PL_REQUEST_EXCLUDE, */
+				/* or PL_REQUEST_MEMMAP */
+	int debug;
+	unsigned long paddr;	/* phys addr of mem_map for PL_REQUEST_EXCLUDE */
+	unsigned long pfn_start;/* pfn represented by paddr */
+	unsigned long pgdat_paddr; /* for PL_REQUEST_FREE */
+	unsigned long pgdat_vaddr; /* for PL_REQUEST_FREE */
+	int node;		/* for PL_REQUEST_FREE */
+	int exclude_bits;	/* for PL_REQUEST_EXCLUDE */
+	int count;		/* for PL_REQUEST_EXCLUDE */
+	void *reply_ptr;	/* address of user's pfn_reply, for reply */
+	void *pfn_list_ptr;	/* address of user's pfn array (*pfn_list) */
+	int map_count;		/* for PL_REQUEST_MEMMAP; elements */
+	int map_size;		/* for PL_REQUEST_MEMMAP; bytes in table */
+	void *map_ptr;		/* for PL_REQUEST_MEMMAP; address of table */
+	long list_size;		/* for PL_REQUEST_MEMMAP negotiation */
+	/* resume info: */
+	int more;		/* 0 for done, 1 for "there's more" */
+				/* PL_REQUEST_EXCLUDE: */
+	int map_index;		/* slot in the mem_map array of page structs */
+				/* PL_REQUEST_FREE: */
+	int zone_index;		/* zone within the node's pgdat_list */
+	int freearea_index;	/* free_area within the zone */
+	int type_index;		/* free_list within the free_area */
+	int list_ct;		/* page within the list */
+};
+
+/*
+ * the reply from a pfn_list_request
+ * the list of pfn's itself is pointed to by pfn_list
+ */
+struct pfn_reply {
+	long pfn_list_elements;	/* negotiated on PL_REQUEST_MEMMAP */
+	long in_pfn_list;	/* returned by PL_REQUEST_EXCLUDE and
+				   PL_REQUEST_FREE */
+	/* resume info */
+	int more;		/* 0 == done, 1 == there is more */
+				/* PL_REQUEST_MEMMAP: */
+	int map_index;		/* slot in the mem_map array of page structs */
+				/* PL_REQUEST_FREE: */
+	int zone_index;		/* zone within the node's pgdat_list */
+	int freearea_index;	/* free_area within the zone */
+	int type_index;		/* free_list within the free_area */
+	int list_ct;		/* page within the list */
+	/* statistic counters: */
+	unsigned long long pfn_cache;		/* PL_REQUEST_EXCLUDE */
+	unsigned long long pfn_cache_private;	/* PL_REQUEST_EXCLUDE */
+	unsigned long long pfn_user;		/* PL_REQUEST_EXCLUDE */
+	unsigned long long pfn_free;		/* PL_REQUEST_FREE */
+};
+
 #define PAGES_PER_MAPWORD 	(sizeof(unsigned long) * 8)
 #define MFNS_PER_FRAME		(info->page_size / sizeof(unsigned long))
 
Index: makedumpfile-1.5.1.released/dwarf_info.c
===================================================================
--- makedumpfile-1.5.1.released.orig/dwarf_info.c
+++ makedumpfile-1.5.1.released/dwarf_info.c
@@ -324,6 +324,8 @@ get_data_member_location(Dwarf_Die *die,
 	return TRUE;
 }
 
+int dwarf_formref(Dwarf_Attribute *, Dwarf_Off *);
+
 static int
 get_die_type(Dwarf_Die *die, Dwarf_Die *die_type)
 {
Index: makedumpfile-1.5.1.released/print_info.c
===================================================================
--- makedumpfile-1.5.1.released.orig/print_info.c
+++ makedumpfile-1.5.1.released/print_info.c
@@ -244,6 +244,11 @@ print_usage(void)
 	MSG("  [-f]:\n");
 	MSG("      Overwrite DUMPFILE even if it already exists.\n");
 	MSG("\n");
+	MSG("  [-o]:\n");
+	MSG("      Read page structures from /proc/vmcore in the scan for\n");
+	MSG("      free and excluded pages regardless of whether\n");
+	MSG("      /proc/vmcore_pfn_lists is present.\n");
+	MSG("\n");
 	MSG("  [-h]:\n");
 	MSG("      Show help message and LZO/snappy support status (enabled/disabled).\n");
 	MSG("\n");
Index: makedumpfile-1.5.1.released/makedumpfile.c
===================================================================
--- makedumpfile-1.5.1.released.orig/makedumpfile.c
+++ makedumpfile-1.5.1.released/makedumpfile.c
@@ -13,6 +13,8 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  */
+#define _GNU_SOURCE
+#include <stdio.h>
 #include "makedumpfile.h"
 #include "print_info.h"
 #include "dwarf_info.h"
@@ -31,6 +33,14 @@ struct srcfile_table	srcfile_table;
 
 struct vm_table		vt = { 0 };
 struct DumpInfo		*info = NULL;
+int pfn_list_fd;
+struct pfn_element *pfn_list;
+int nflag = 0;
+int oflag = 0;
+int tflag = 0;
+int aflag = 0;
+struct timeval scan_start;
+int max_pfn_list;
 
 char filename_stdout[] = FILENAME_STDOUT;
 
@@ -2415,6 +2425,22 @@ get_mm_sparsemem(void)
 	unsigned long long pfn_start, pfn_end;
 	unsigned long section, mem_map;
 	unsigned long *mem_sec = NULL;
+	unsigned long vaddr;
+	unsigned long paddr;
+	unsigned long lastvaddr;
+	unsigned long lastpaddr;
+	unsigned long diff;
+	long j;
+	int i;
+	int npfns;
+	int pagesize;
+	int num_mem_map;
+	int num_added = 0;
+	struct mem_map_data *mmd;
+	struct mem_map_data *curmmd;
+	struct mem_map_data *work1mmd;
+	struct mem_map_data *work2mmd;
+	struct mem_map_data *lastmmd;
 
 	int ret = FALSE;
 
@@ -2441,7 +2467,8 @@ get_mm_sparsemem(void)
 	}
 	info->num_mem_map = num_section;
 	if ((info->mem_map_data = (struct mem_map_data *)
-	    malloc(sizeof(struct mem_map_data)*info->num_mem_map)) == NULL) {
+	    malloc(sizeof(struct mem_map_data) *
+		   		(EXTRA_MEMMAPS + info->num_mem_map))) == NULL) {
 		ERRMSG("Can't allocate memory for the mem_map_data. %s\n",
 		    strerror(errno));
 		goto out;
@@ -2459,6 +2486,71 @@ get_mm_sparsemem(void)
 		dump_mem_map(pfn_start, pfn_end, mem_map, section_nr);
 	}
 	ret = TRUE;
+
+	/* add paddr to the table */
+	mmd = &info->mem_map_data[0];
+	num_mem_map = info->num_mem_map;
+	lastmmd = mmd + num_mem_map;
+	for (i = 0; i < num_mem_map; i++) {
+		if (mmd[i].mem_map == 0) {
+			mmd[i].paddr = 0;
+		} else {
+			mmd[i].paddr = vaddr_to_paddr(mmd[i].mem_map);
+			if (mmd[i].paddr == 0) {
+				ERRMSG("Can't translate %#lx to paddr\n",
+					mmd[i].mem_map);
+				exit(1);
+			}
+			/*
+			 * When we pass a mem_map and its paddr to the kernel
+			 * it will be remapped assuming the entire range
+			 * of pfn's are consecutive. If they are not then
+			 * we need to split the range into two.
+			 */
+			pagesize = SIZE(page);	/* size of a page struct */
+			npfns = mmd[i].pfn_end - mmd[i].pfn_start;
+			vaddr = (unsigned long)mmd[i].mem_map;
+			paddr = vaddr_to_paddr(vaddr);
+			diff = vaddr - paddr;
+			lastvaddr = vaddr + (pagesize * (npfns-1));
+			lastpaddr = vaddr_to_paddr(lastvaddr);
+			if (lastvaddr - lastpaddr != diff) {
+				/* there is a break in vtop somewhere in
+				   this range; we need to split it */
+			  for (j = 0; j < npfns; j++) {
+			    paddr = vaddr_to_paddr(vaddr);
+			    if (vaddr - paddr != diff) {
+				diff = vaddr - paddr;
+				/* insert a new entry if we have room */
+				if (num_added < EXTRA_MEMMAPS) {
+					curmmd = &info->mem_map_data[i];
+					num_added++;
+					work1mmd = lastmmd - 1;
+					for (work2mmd = lastmmd;
+						work2mmd > curmmd; work2mmd--) {
+						work1mmd = work2mmd - 1;
+						*work2mmd = *work1mmd;
+					}
+					work2mmd = work1mmd + 1;
+					work2mmd->mem_map =
+					  work1mmd->mem_map + (pagesize * j);
+					lastmmd++;
+					num_mem_map++;
+					info->num_mem_map++;
+					/*
+					 * need only 1 split, the new
+					 * one will be checked also.
+					 */
+					break;
+				} else
+					printf("warn: out of EXTRA_MEMMAPS\n");
+			    }
+				vaddr += pagesize;
+			  }
+			}
+		}
+	}
+
 out:
 	if (mem_sec != NULL)
 		free(mem_sec);
@@ -2571,6 +2663,172 @@ initialize_bitmap_memory(void)
 	return TRUE;
 }
 
+/*
+ * construct a version of the mem_map_data table to pass to the kernel
+ */
+void *
+make_kernel_mmap(int *kmap_elements, int *kmap_size)
+{
+	int i, j;
+	int elements = 0;
+	int page_structs;
+	int elem;
+	long l;
+	unsigned long base_end_pfn;
+	unsigned long end_paddr;
+	unsigned long v1;
+	unsigned long v2;
+	unsigned long end_page_pfns;
+	unsigned long hpagesize = 0x200000UL;	/* assume 2MB huge pages */
+	unsigned long hpageoffset = hpagesize - 1;
+	struct mem_map_data *mmdo, *mmdn;
+	struct mem_map_data *mmdbase, *mmdnext, *mmdend, *mmdwork;
+	struct mem_map_data temp_mmd;
+	struct mem_map_data *mmap;
+
+	mmap = malloc(info->num_mem_map * sizeof(struct mem_map_data));
+	if (mmap == NULL) {
+		ERRMSG("Can't allocate memory kernel map\n");
+		return NULL;
+	}
+
+	/* condense them down to the valid ones */
+	for (i = 0, mmdn = mmap, mmdo = &info->mem_map_data[0];
+				i < info->num_mem_map; i++, mmdo++) {
+		if (mmdo->mem_map && mmdo->paddr) {
+			*mmdn = *mmdo;
+			mmdn++;
+			elements++;
+		}
+	}
+
+	/* make sure it is sorted by mem_map (it should be already) */
+	mmdn = mmap;
+	for (i = 0; i < elements - 1; i++) {
+		for (j = i + 1; j < elements; j++) {
+			if (mmdn[j].mem_map < mmdn[i].mem_map) {
+				temp_mmd = mmdn[j];
+				mmdn[j] = mmdn[i];
+				mmdn[i] = temp_mmd;
+			}
+		}
+	}
+
+	if (aflag) {
+		mmdn = mmap;
+		printf("entire mem_map:\n");
+		for (i = 0; i < elements; i++) {
+			l = (mmdn[i].pfn_end - mmdn[i].pfn_start) * SIZE(page);
+			printf(
+			 "[%d] pfn %#llx-%llx mem_map %#lx paddr %#llx-%llx\n",
+				i, mmdn[i].pfn_start, mmdn[i].pfn_end,
+				mmdn[i].mem_map, mmdn[i].paddr,
+				mmdn[i].paddr + l);
+		}
+	}
+
+	/*
+	 * a first pass to split overlapping pfn entries like this:
+	 * pfn 0x1248000-1250000 mem_map 0xffffea003ffc0000 paddr 0x10081c0000
+	 * pfn 0x1248000-1250000 mem_map 0xffffea0040000030 paddr 0x1008400030
+	 */
+	mmdbase = mmap;
+	mmdnext = mmap + 1;
+	mmdend = mmap + elements;
+	/* test each mmdbase/mmdnext pair */
+	while (mmdnext < mmdend) {  /* mmdnext is the one after mmdbase */
+		page_structs = (mmdbase->pfn_end - mmdbase->pfn_start);
+		/* mmdwork scans from mmdnext to the end */
+		if ((mmdbase->pfn_start == mmdnext->pfn_start) &&
+		    (mmdbase->pfn_end == mmdnext->pfn_end)) {
+			/* overlapping pfns, we need a fix */
+			v1 = mmdnext->mem_map - mmdbase->mem_map;
+			v2 = mmdnext->paddr - mmdbase->paddr;
+			if (v1 != (v2 & hpageoffset))
+				printf("virt to phys is wrong %#lx %#lx\n",
+					v1, v2);
+			l = mmdbase->pfn_end - mmdbase->pfn_start;
+			end_page_pfns = l - (((hpagesize -
+				(hpageoffset & mmdbase->paddr)) +
+				  SIZE(page) - 1) / SIZE(page));
+			mmdbase->pfn_end -= end_page_pfns;
+			mmdnext->pfn_start = mmdbase->pfn_end;
+		} else if ((mmdbase->pfn_start == mmdnext->pfn_start) ||
+		    (mmdbase->pfn_end == mmdnext->pfn_end)) {
+			printf("warning: unfixed overlap\n");
+		}
+		mmdbase++;
+		mmdnext++;
+	}
+
+	/*
+	 * consolidate those mem_maps that occupy consecutive physical
+	 * addresses
+	 *  pages represented by these page structs:       addr of page struct
+	 * pfns 0x1000000-1008000 mem_map 0xffffea0038000000 paddr 0x11f7e00000
+	 * pfns 0x1008000-1010000 mem_map 0xffffea00381c0000 paddr 0x11f7fc0000
+	 * pfns 0x1010000-1018000 mem_map 0xffffea0038380000 paddr 0x11f8180000
+	 *           8000 increments                             inc's:  1c0000
+	 *        8000000 of memory (128M)                    8000 page structs
+	 */
+	mmdbase = mmap;
+	mmdnext = mmap + 1;
+	mmdend = mmap + elements;
+	while (mmdnext < mmdend) {
+		elem = mmdend - mmdnext;
+		/*  test mmdbase vs. mmdwork and onward: */
+		for (i = 0, mmdwork = mmdnext; i < elem; i++, mmdwork++) {
+			base_end_pfn = mmdbase->pfn_end;
+			if (base_end_pfn == mmdwork->pfn_start) {
+				page_structs = (mmdbase->pfn_end -
+							mmdbase->pfn_start);
+				end_paddr = (page_structs * SIZE(page)) +
+							mmdbase->paddr;
+				if (mmdwork->paddr == end_paddr) {
+					/* extend base by the work one */
+					mmdbase->pfn_end = mmdwork->pfn_end;
+					/* next is where to begin next time */
+					mmdnext = mmdwork + 1;
+				} else {
+					/* gap in address of page
+					   structs; end of section */
+					mmdbase++;
+					if (mmdwork - mmdbase > 0)
+						*mmdbase = *mmdwork;
+					mmdnext = mmdwork + 1;
+					break;
+				}
+			} else {
+				/* gap in pfns; end of section */
+				mmdbase++;
+				if (mmdwork - mmdbase > 0)
+					*mmdbase = *mmdwork;
+				mmdnext = mmdwork + 1;
+				break;
+			}
+		}
+	}
+	elements = (mmdbase - mmap) + 1;
+
+	if (aflag) {
+		printf("user mmap for kernel:\n");
+		for (i = 0, mmdwork = mmap; i < elements; i++, mmdwork++) {
+			l = mmdwork->pfn_end - mmdwork->pfn_start;
+			printf(
+		      "[%d] user pfn %#llx-%llx paddr %#llx-%llx vaddr %#lx\n",
+				i, mmdwork->pfn_start, mmdwork->pfn_end,
+				mmdwork->paddr,
+				mmdwork->paddr + (l * SIZE(page)),
+				mmdwork->mem_map);
+		}
+	}
+
+	*kmap_elements = elements;
+	*kmap_size = elements * sizeof(struct mem_map_data);
+
+	return mmap;
+}
+
 int
 initial(void)
 {
@@ -2833,7 +3091,14 @@ out:
 	if (!get_value_for_old_linux())
 		return FALSE;
 
-	if (info->flag_cyclic && (info->dump_level & DL_EXCLUDE_FREE))
+	/*
+	 * page_is_buddy will tell us whether free pages can be identified
+	 * by flags and counts in the page structure, without making an
+	 * extra pass through the free lists.
+	 * This applies whether we scan through /proc/vmcore or ask the
+	 * kernel; without it, both forms must search the free lists.
+	 */
+	if (info->dump_level & DL_EXCLUDE_FREE)
 		setup_page_is_buddy();
 
 	return TRUE;
@@ -3549,6 +3814,65 @@ out:
 	return ret;
 }
 
+/*
+ * let the kernel find excludable pages from one node
+ */
+void
+__exclude_free_pages_kernel(unsigned long pgdat, int node)
+{
+	int i, j, ret, pages;
+	unsigned long pgdat_paddr;
+	struct pfn_list_request request;
+	struct pfn_reply reply;
+	struct pfn_element *pe;
+
+	if ((pgdat_paddr = vaddr_to_paddr(pgdat)) == NOT_PADDR) {
+		ERRMSG("Can't convert virtual address(%#lx) to physical.\n",
+			pgdat);
+		return;
+	}
+
+	/*
+	 * Get the list of free pages.
+	 * It may be broken up into lists of at most MAX_PFN_LIST PFNs.
+	 */
+	memset(&request, 0, sizeof(request));
+	request.request = PL_REQUEST_FREE;
+	request.node = node;
+	request.pgdat_paddr = pgdat_paddr;
+	request.pgdat_vaddr = pgdat;
+	request.reply_ptr = (void *)&reply;
+	request.pfn_list_ptr = (void *)pfn_list;
+	memset(&reply, 0, sizeof(reply));
+
+	do {
+		request.more = 0;
+		if (reply.more) {
+			/* this is to be a continuation of the last request */
+			request.more = 1;
+			request.zone_index = reply.zone_index;
+			request.freearea_index = reply.freearea_index;
+			request.type_index = reply.type_index;
+			request.list_ct = reply.list_ct;
+		}
+		ret = write(pfn_list_fd, &request, sizeof(request));
+		if (ret != sizeof(request)) {
+			printf("PL_REQUEST_FREE failed\n");
+			return;
+		}
+		pfn_free += reply.pfn_free;
+
+		for (i = 0; i < reply.in_pfn_list; i++) {
+			pe = &pfn_list[i];
+			pages = (1 << pe->order);
+			for (j = 0; j < pages; j++) {
+				clear_bit_on_2nd_bitmap_for_kernel(pe->pfn + j);
+			}
+		}
+	} while (reply.more);
+
+	return;
+}
 
 int
 _exclude_free_page(void)
@@ -3568,7 +3892,24 @@ _exclude_free_page(void)
 	gettimeofday(&tv_start, NULL);
 
 	for (num_nodes = 1; num_nodes <= vt.numnodes; num_nodes++) {
-
+		if (!info->flag_cyclic && info->flag_use_kernel_lists) {
+			node_zones = pgdat + OFFSET(pglist_data.node_zones);
+			if (!readmem(VADDR,
+				pgdat + OFFSET(pglist_data.nr_zones),
+				&nr_zones, sizeof(nr_zones))) {
+					ERRMSG("Can't get nr_zones.\n");
+				return FALSE;
+			}
+			print_progress(PROGRESS_FREE_PAGES, num_nodes - 1,
+								vt.numnodes);
+			/* ask the kernel to do one node */
+			__exclude_free_pages_kernel(pgdat, node);
+			goto next_pgdat;
+		}
+		/*
+		 * kernel does not have the pfn_list capability
+		 * use the old way
+		 */
 		print_progress(PROGRESS_FREE_PAGES, num_nodes - 1, vt.numnodes);
 
 		node_zones = pgdat + OFFSET(pglist_data.node_zones);
@@ -3595,6 +3936,7 @@ _exclude_free_page(void)
 			if (!reset_bitmap_of_free_pages(zone))
 				return FALSE;
 		}
+	next_pgdat:
 		if (num_nodes < vt.numnodes) {
 			if ((node = next_online_node(node + 1)) < 0) {
 				ERRMSG("Can't get next online node.\n");
@@ -3612,6 +3954,8 @@ _exclude_free_page(void)
 	 */
 	print_progress(PROGRESS_FREE_PAGES, vt.numnodes, vt.numnodes);
 	print_execution_time(PROGRESS_FREE_PAGES, &tv_start);
+	if (tflag)
+		print_execution_time("Total time", &scan_start);
 
 	return TRUE;
 }
@@ -3755,7 +4099,6 @@ setup_page_is_buddy(void)
 		}
 	} else
 		info->page_is_buddy = page_is_buddy_v2;
-
 out:
 	if (!info->page_is_buddy)
 		DEBUG_MSG("Can't select page_is_buddy handler; "
@@ -3964,10 +4307,84 @@ exclude_zero_pages(void)
 	return TRUE;
 }
 
+/*
+ * let the kernel find excludable pages from one mem_section
+ */
+int
+__exclude_unnecessary_pages_kernel(int mm, struct mem_map_data *mmd)
+{
+	unsigned long long pfn_start = mmd->pfn_start;
+	unsigned long long pfn_end = mmd->pfn_end;
+	int i, j, ret, pages;
+	struct pfn_list_request request;
+	struct pfn_reply reply;
+	struct pfn_element *pe;
+
+	/*
+	 * Get the list of to-be-excluded pages in this section.
+	 * It may be broken up by groups of max_pfn_list size.
+	 */
+	memset(&request, 0, sizeof(request));
+	request.request = PL_REQUEST_EXCLUDE;
+	request.paddr = mmd->paddr; /* phys addr of mem_map */
+	request.reply_ptr = (void *)&reply;
+	request.pfn_list_ptr = (void *)pfn_list;
+	request.exclude_bits = 0;
+	request.pfn_start = pfn_start;
+	request.count = pfn_end - pfn_start;
+	if (info->dump_level & DL_EXCLUDE_CACHE)
+	 	request.exclude_bits |= DL_EXCLUDE_CACHE;
+	if (info->dump_level & DL_EXCLUDE_CACHE_PRI)
+	 	request.exclude_bits |= DL_EXCLUDE_CACHE_PRI;
+	if (info->dump_level & DL_EXCLUDE_USER_DATA)
+	 	request.exclude_bits |= DL_EXCLUDE_USER_DATA;
+	/* if we try for free pages from the freelists then we don't need
+           to ask here for 'buddy' pages */
+	if (info->dump_level & DL_EXCLUDE_FREE)
+	 	request.exclude_bits |= DL_EXCLUDE_FREE;
+	memset(&reply, 0, sizeof(reply));
+
+	do {
+		/* pfn represented by paddr */
+		request.more = 0;
+		if (reply.more) {
+			/* this is to be a continuation of the last request */
+			request.more = 1;
+			request.map_index = reply.map_index;
+		}
+
+		ret = write(pfn_list_fd, &request, sizeof(request));
+		if (ret != sizeof(request))
+			return FALSE;
+
+		pfn_cache += reply.pfn_cache;
+		pfn_cache_private += reply.pfn_cache_private;
+		pfn_user += reply.pfn_user;
+		pfn_free += reply.pfn_free;
+
+		for (i = 0; i < reply.in_pfn_list; i++) {
+			pe = &pfn_list[i];
+			pages = (1 << pe->order);
+			for (j = 0; j < pages; j++) {
+				if (clear_bit_on_2nd_bitmap_for_kernel(
+						pe->pfn + j) == FALSE) {
+					/* couldn't clear this bit; skip
+					   the rest of this element */
+					break;
+				}
+			}
+		}
+	} while (reply.more);
+
+	return TRUE;
+}
+
 int
-__exclude_unnecessary_pages(unsigned long mem_map,
-    unsigned long long pfn_start, unsigned long long pfn_end)
+__exclude_unnecessary_pages(int mm, struct mem_map_data *mmd)
 {
+	unsigned long long pfn_start = mmd->pfn_start;
+	unsigned long long pfn_end = mmd->pfn_end;
+	unsigned long mem_map = mmd->mem_map;
 	unsigned long long pfn, pfn_mm, maddr;
 	unsigned long long pfn_read_start, pfn_read_end, index_pg;
 	unsigned char page_cache[SIZE(page) * PGMM_CACHED];
@@ -3975,6 +4397,12 @@ __exclude_unnecessary_pages(unsigned lon
 	unsigned int _count, _mapcount = 0;
 	unsigned long flags, mapping, private = 0;
 
+	if (info->flag_use_kernel_lists) {
+		if (__exclude_unnecessary_pages_kernel(mm, mmd) == FALSE)
+			return FALSE;
+		return TRUE;
+	}
+
 	/*
 	 * Refresh the buffer of struct page, when changing mem_map.
 	 */
@@ -4012,7 +4440,6 @@ __exclude_unnecessary_pages(unsigned lon
 				pfn_mm = PGMM_CACHED - index_pg;
 			else
 				pfn_mm = pfn_end - pfn;
-
 			if (!readmem(VADDR, mem_map,
 			    page_cache + (index_pg * SIZE(page)),
 			    SIZE(page) * pfn_mm)) {
@@ -4036,7 +4463,6 @@ __exclude_unnecessary_pages(unsigned lon
 		 * Exclude the free page managed by a buddy
 		 */
 		if ((info->dump_level & DL_EXCLUDE_FREE)
-		    && info->flag_cyclic
 		    && info->page_is_buddy
 		    && info->page_is_buddy(flags, _mapcount, private, _count)) {
 			int i;
@@ -4085,19 +4511,78 @@ __exclude_unnecessary_pages(unsigned lon
 	return TRUE;
 }
 
+/*
+ * Pass in the mem_map_data table.
+ * Must do this once, and before doing PL_REQUEST_FREE or PL_REQUEST_EXCLUDE.
+ */
+int
+setup_kernel_mmap(void)
+{
+	int ret;
+	int kmap_elements, kmap_size;
+	long malloc_size;
+	void *kmap_addr;
+	struct pfn_list_request request;
+	struct pfn_reply reply;
+
+	kmap_addr = make_kernel_mmap(&kmap_elements, &kmap_size);
+	if (kmap_addr == NULL)
+		return FALSE;
+	memset(&request, 0, sizeof(request));
+	request.request = PL_REQUEST_MEMMAP;
+	request.map_ptr = kmap_addr;
+	request.reply_ptr = (void *)&reply;
+	request.map_count = kmap_elements;
+	request.map_size = kmap_size;
+	request.list_size = MAX_PFN_LIST;
+
+	ret = write(pfn_list_fd, &request, sizeof(request));
+	if (ret < 0) {
+		fprintf(stderr, "PL_REQUEST_MEMMAP returned %d\n", ret);
+		return FALSE;
+	}
+	/* the reply tells us how long the kernel's list actually is */
+	max_pfn_list = reply.pfn_list_elements;
+	if (max_pfn_list <= 0) {
+		fprintf(stderr,
+			"PL_REQUEST_MEMMAP returned max_pfn_list %d\n",
+			max_pfn_list);
+		return FALSE;
+	}
+	if (max_pfn_list < MAX_PFN_LIST) {
+		printf("length of pfn list dropped from %d to %d\n",
+			MAX_PFN_LIST, max_pfn_list);
+	}
+	free(kmap_addr);
+	/*
+	 * Allocate the buffer for the PFN list (just once).
+	 */
+	malloc_size = max_pfn_list * sizeof(struct pfn_element);
+	if ((pfn_list = (struct pfn_element *)malloc(malloc_size)) == NULL) {
+		ERRMSG("Can't allocate pfn_list of %ld\n", malloc_size);
+		return FALSE;
+	}
+	return TRUE;
+}
+
 int
 exclude_unnecessary_pages(void)
 {
 	unsigned int mm;
 	struct mem_map_data *mmd;
 	struct timeval tv_start;
 
 	if (is_xen_memory() && !info->dom0_mapnr) {
 		ERRMSG("Can't get max domain-0 PFN for excluding pages.\n");
 		return FALSE;
 	}
 
+	if (!info->flag_cyclic && info->flag_use_kernel_lists) {
+		if (setup_kernel_mmap() == FALSE)
+			return FALSE;
+	}
 	gettimeofday(&tv_start, NULL);
+	gettimeofday(&scan_start, NULL);
 
 	for (mm = 0; mm < info->num_mem_map; mm++) {
 		print_progress(PROGRESS_UNN_PAGES, mm, info->num_mem_map);
@@ -4106,9 +4591,9 @@ exclude_unnecessary_pages(void)
 
 		if (mmd->mem_map == NOT_MEMMAP_ADDR)
 			continue;
-
-		if (!__exclude_unnecessary_pages(mmd->mem_map,
-						 mmd->pfn_start, mmd->pfn_end))
+		if (mmd->paddr == 0)
+			continue;
+		if (!__exclude_unnecessary_pages(mm, mmd))
 			return FALSE;
 	}
 
@@ -4139,7 +4624,11 @@ exclude_unnecessary_pages_cyclic(void)
 	 */
 	copy_bitmap_cyclic();
 
-	if ((info->dump_level & DL_EXCLUDE_FREE) && !info->page_is_buddy)
+	/*
+	 * If free pages cannot be identified with the buddy flag and/or
+	 * count then we have to search free lists.
+	 */
+	if ((info->dump_level & DL_EXCLUDE_FREE) && (!info->page_is_buddy))
 		if (!exclude_free_page())
 			return FALSE;
 
@@ -4164,8 +4653,7 @@ exclude_unnecessary_pages_cyclic(void)
 
 			if (mmd->pfn_end >= info->cyclic_start_pfn &&
 			    mmd->pfn_start <= info->cyclic_end_pfn) {
-				if (!__exclude_unnecessary_pages(mmd->mem_map,
-								 mmd->pfn_start, mmd->pfn_end))
+				if (!__exclude_unnecessary_pages(mm, mmd))
 					return FALSE;
 			}
 		}
@@ -4195,7 +4683,7 @@ update_cyclic_region(unsigned long long 
 	if (!create_1st_bitmap_cyclic())
 		return FALSE;
 
-	if (!exclude_unnecessary_pages_cyclic())
+	if (exclude_unnecessary_pages_cyclic() == FALSE)
 		return FALSE;
 
 	return TRUE;
@@ -4255,7 +4743,7 @@ create_2nd_bitmap(void)
 	if (info->dump_level & DL_EXCLUDE_CACHE ||
 	    info->dump_level & DL_EXCLUDE_CACHE_PRI ||
 	    info->dump_level & DL_EXCLUDE_USER_DATA) {
-		if (!exclude_unnecessary_pages()) {
+		if (exclude_unnecessary_pages() == FALSE) {
 			ERRMSG("Can't exclude unnecessary pages.\n");
 			return FALSE;
 		}
@@ -4263,8 +4751,10 @@ create_2nd_bitmap(void)
 
 	/*
 	 * Exclude free pages.
+	 * If free pages cannot be identified with the buddy flag and/or
+	 * count then we have to search free lists.
 	 */
-	if (info->dump_level & DL_EXCLUDE_FREE)
+	if ((info->dump_level & DL_EXCLUDE_FREE) && (!info->page_is_buddy))
 		if (!exclude_free_page())
 			return FALSE;
 
@@ -4395,6 +4885,10 @@ create_dump_bitmap(void)
 	int ret = FALSE;
 
 	if (info->flag_cyclic) {
+		if (info->flag_use_kernel_lists) {
+			if (setup_kernel_mmap() == FALSE)
+				goto out;
+		}
 		if (!prepare_bitmap_buffer_cyclic())
 			goto out;
 
@@ -4872,6 +5366,7 @@ get_num_dumpable_cyclic(void)
 {
 	unsigned long long pfn, num_dumpable=0;
 
+	gettimeofday(&scan_start, NULL);
 	for (pfn = 0; pfn < info->max_mapnr; pfn++) {
 		if (!update_cyclic_region(pfn))
 			return FALSE;
@@ -5201,7 +5696,7 @@ get_loads_dumpfile_cyclic(void)
 	info->cyclic_end_pfn = info->pfn_cyclic;
 	if (!create_1st_bitmap_cyclic())
 		return FALSE;
-	if (!exclude_unnecessary_pages_cyclic())
+	if (exclude_unnecessary_pages_cyclic() == FALSE)
 		return FALSE;
 
 	if (!(phnum = get_phnum_memory()))
@@ -5613,6 +6108,10 @@ write_kdump_pages(struct cache_data *cd_
 			pfn_zero++;
 			continue;
 		}
+
+		if (nflag)
+			continue;
+
 		/*
 		 * Compress the page data.
 		 */
@@ -5786,11 +6286,17 @@ write_kdump_pages_cyclic(struct cache_da
 		 */
 		if ((info->dump_level & DL_EXCLUDE_ZERO)
 		    && is_zero_page(buf, info->page_size)) {
+		    if (!nflag) {
 			if (!write_cache(cd_header, pd_zero, sizeof(page_desc_t)))
 				goto out;
+		    }
 			pfn_zero++;
 			continue;
 		}
+
+		if (nflag)
+			continue;
+
 		/*
 		 * Compress the page data.
 		 */
@@ -6208,6 +6714,8 @@ write_kdump_pages_and_bitmap_cyclic(stru
 		if (!update_cyclic_region(pfn))
                         return FALSE;
 
+		if (tflag)
+			print_execution_time("Total time", &scan_start);
 		if (!write_kdump_pages_cyclic(cd_header, cd_page, &pd_zero, &offset_data))
 			return FALSE;
 
@@ -8231,6 +8739,22 @@ static struct option longopts[] = {
 	{0, 0, 0, 0}
 };
 
+/*
+ * test for the presence of capability in the kernel to provide lists
+ * of pfn's:
+ *   /proc/vmcore_pfn_lists
+ * return 1 for present
+ * return 0 for not present
+ */
+int
+test_kernel_pfn_lists(void)
+{
+	if ((pfn_list_fd = open("/proc/vmcore_pfn_lists", O_WRONLY)) < 0) {
+		return 0;
+	}
+	return 1;
+}
+
 int
 main(int argc, char *argv[])
 {
@@ -8256,9 +8780,12 @@ main(int argc, char *argv[])
 	
 	info->block_order = DEFAULT_ORDER;
 	message_level = DEFAULT_MSG_LEVEL;
-	while ((opt = getopt_long(argc, argv, "b:cDd:EFfg:hi:lMpRrsvXx:", longopts,
+	while ((opt = getopt_long(argc, argv, "ab:cDd:EFfg:hi:lMnopRrstvXx:", longopts,
 	    NULL)) != -1) {
 		switch (opt) {
+		case 'a':
+			aflag = 1;
+			break;
 		case 'b':
 			info->block_order = atoi(optarg);
 			break;
@@ -8314,6 +8841,13 @@ main(int argc, char *argv[])
 		case 'M':
 			info->flag_dmesg = 1;
 			break;
+		case 'n':
+			/* -n undocumented, for testing page scanning time */
+			nflag = 1;
+			break;
+		case 'o':
+			oflag = 1;
+			break;
 		case 'p':
 			info->flag_compress = DUMP_DH_COMPRESSED_SNAPPY;
 			break;
@@ -8329,6 +8863,9 @@ main(int argc, char *argv[])
 		case 'r':
 			info->flag_reassemble = 1;
 			break;
+		case 't':
+			tflag = 1;
+			break;
 		case 'V':
 			info->vaddr_for_vtop = strtoul(optarg, NULL, 0);
 			break;
@@ -8360,6 +8897,12 @@ main(int argc, char *argv[])
 			goto out;
 		}
 	}
+
+	if (oflag)
+		info->flag_use_kernel_lists = 0;
+	else
+		info->flag_use_kernel_lists = test_kernel_pfn_lists();
+
 	if (flag_debug)
 		message_level |= ML_PRINT_DEBUG_MSG;
 


