[PATCH v2] makedumpfile: request the kernel do page scans
Lisa Mitchell
lisa.mitchell at hp.com
Wed Jan 16 07:15:29 EST 2013
On Fri, 2013-01-04 at 16:20 +0000, Cliff Wickman wrote:
> From: Cliff Wickman <cpw at sgi.com>
>
> This version of the patch improves the consolidation of the mem_map table
> that is passed to the kernel (see make_kernel_mmap()), particularly its
> handling of the seemingly duplicate pfn ranges generated on an older
> (2.6.32-based, RHEL 6) kernel.
>
>
>
> I've been experimenting with asking the kernel to scan the page tables
> instead of reading all those page structures through /proc/vmcore.
> The results are rather dramatic.
> On a small, idle UV it takes about 4 seconds, versus about 40 seconds.
> On an 8 TB UV the scan for unnecessary pages takes 4 minutes, versus
> about 200 minutes through /proc/vmcore.
>
> This patch incorporates this scheme into version 1.5.1, so that the cyclic
> processing can use the kernel scans.
> It also uses the page_is_buddy logic to speed up the finding of free
> pages, and it allows makedumpfile to work as before with a kernel that
> does not provide /proc/vmcore_pfn_lists.
>
> This patch:
> - writes requests to new kernel file /proc/vmcore_pfn_lists
> - makes request PL_REQUEST_MEMMAP to pass the crash kernel the mem_map
>   information of the boot kernel
> - makes requests PL_REQUEST_FREE and PL_REQUEST_EXCLUDE, asking the kernel
> to return lists of PFNs
> - adds page-scan timing options -n, -o, and -t
> - still has a debugging option -a
>
> This patch depends on a kernel patch.
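For orientation, the user-side protocol introduced here is a single
control file: makedumpfile writes a request structure to
/proc/vmcore_pfn_lists, and before write() returns the kernel has filled
in the reply and PFN-list buffers whose addresses are embedded in the
request. A minimal sketch of the round trip, using the structures and
request codes added to makedumpfile.h below (error handling and the
request-specific fields mostly omitted):

	struct pfn_list_request req;
	struct pfn_reply rep;
	struct pfn_element list[MAX_PFN_LIST];
	int fd = open("/proc/vmcore_pfn_lists", O_WRONLY);

	memset(&req, 0, sizeof(req));
	req.request = PL_REQUEST_FREE;    /* or PL_REQUEST_EXCLUDE */
	req.reply_ptr = (void *)&rep;     /* kernel writes the reply here */
	req.pfn_list_ptr = (void *)list;  /* kernel writes PFNs here */
	/* ...plus per-request fields: node/pgdat for FREE, paddr/bits for EXCLUDE */
	do {
		if (write(fd, &req, sizeof(req)) != sizeof(req))
			break;            /* request rejected by the kernel */
		/* consume rep.in_pfn_list entries of list[] here */
		req.more = rep.more;      /* rep also carries the resume indices */
	} while (rep.more);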
>
> Diffed against the released makedumpfile-1.5.1
>
> Signed-off-by: Cliff Wickman <cpw at sgi.com>
> ---
> dwarf_info.c | 2
> makedumpfile.c | 587 ++++++++++++++++++++++++++++++++++++++++++++++++++++++---
> makedumpfile.h | 95 +++++++++
> print_info.c | 5
> 4 files changed, 665 insertions(+), 24 deletions(-)
>
>
> Index: makedumpfile-1.5.1.released/makedumpfile.h
> ===================================================================
> --- makedumpfile-1.5.1.released.orig/makedumpfile.h
> +++ makedumpfile-1.5.1.released/makedumpfile.h
> @@ -86,6 +86,8 @@ int get_mem_type(void);
> #define LSEEKED_PDESC (2)
> #define LSEEKED_PDATA (3)
>
> +#define EXTRA_MEMMAPS 100
> +
> /*
> * Xen page flags
> */
> @@ -418,7 +420,7 @@ do { \
> #define KVER_MIN_SHIFT 16
> #define KERNEL_VERSION(x,y,z) (((x) << KVER_MAJ_SHIFT) | ((y) << KVER_MIN_SHIFT) | (z))
> #define OLDEST_VERSION KERNEL_VERSION(2, 6, 15)/* linux-2.6.15 */
> -#define LATEST_VERSION KERNEL_VERSION(3, 6, 7)/* linux-3.6.7 */
> +#define LATEST_VERSION KERNEL_VERSION(3, 7, 8)/* linux-3.7.8 */
>
> /*
> * vmcoreinfo in /proc/vmcore
> @@ -794,11 +796,25 @@ typedef struct {
> } xen_crash_info_v2_t;
>
> struct mem_map_data {
> + /*
> + * pfn_start/pfn_end are the pfn's represented by this mem_map entry.
> + * mem_map is the virtual address of the array of page structures
> + * that represent these pages.
> + * paddr is the physical address of that array of structures.
> + * ending_paddr would be paddr + (pfn_end - pfn_start) * sizeof(struct page).
> + * section_vaddr is the address we get from ioremap_cache().
> + */
> unsigned long long pfn_start;
> unsigned long long pfn_end;
> - unsigned long mem_map;
> + unsigned long mem_map;
> + unsigned long long paddr; /* filled in by makedumpfile */
> + long virtual_offset; /* filled in by kernel */
> + unsigned long long ending_paddr; /* filled in by kernel */
> + unsigned long mapped_size; /* filled in by kernel */
> + void *section_vaddr; /* filled in by kernel */
> };
>
> +
> struct dump_bitmap {
> int fd;
> int no_block;
> @@ -875,6 +891,7 @@ struct DumpInfo {
> int flag_rearrange; /* flag of creating dumpfile from
> flattened format */
> int flag_split; /* splitting vmcore */
> + int flag_use_kernel_lists; /* kernel provides /proc/vmcore_pfn_lists */
> int flag_cyclic; /* cyclic processing to keep memory consumption */
> int flag_reassemble; /* reassemble multiple dumpfiles into one */
> int flag_refiltering; /* refilter from kdump-compressed file */
> @@ -1384,6 +1401,80 @@ struct domain_list {
> unsigned int pickled_id;
> };
>
> +#define PL_REQUEST_FREE 1 /* request for a list of free pages */
> +#define PL_REQUEST_EXCLUDE 2 /* request for a list of excludable
> + pages */
> +#define PL_REQUEST_MEMMAP 3 /* request to pass in the makedumpfile
> + mem_map_data table */
> +/*
> + * limit the size of the pfn list to this many pfn_element structures
> + */
> +#define MAX_PFN_LIST 10000
> +
> +/*
> + * one element in the pfn_list
> + */
> +struct pfn_element {
> + unsigned long pfn;
> + unsigned long order;
> +};
> +
> +/*
> + * a request for finding pfn's that can be excluded from the dump;
> + * they may be pages of particular types or free pages
> + */
> +struct pfn_list_request {
> + int request; /* PL_REQUEST_FREE, PL_REQUEST_EXCLUDE, or */
> + /* PL_REQUEST_MEMMAP */
> + int debug;
> + unsigned long paddr; /* mem_map address for PL_REQUEST_EXCLUDE */
> + unsigned long pfn_start;/* pfn represented by paddr */
> + unsigned long pgdat_paddr; /* for PL_REQUEST_FREE */
> + unsigned long pgdat_vaddr; /* for PL_REQUEST_FREE */
> + int node; /* for PL_REQUEST_FREE */
> + int exclude_bits; /* for PL_REQUEST_EXCLUDE */
> + int count; /* for PL_REQUEST_EXCLUDE */
> + void *reply_ptr; /* address of user's pfn_reply, for reply */
> + void *pfn_list_ptr; /* address of user's pfn array (*pfn_list) */
> + int map_count; /* for PL_REQUEST_MEMMAP; elements */
> + int map_size; /* for PL_REQUEST_MEMMAP; bytes in table */
> + void *map_ptr; /* for PL_REQUEST_MEMMAP; address of table */
> + long list_size; /* for PL_REQUEST_MEMMAP negotiation */
> + /* resume info: */
> + int more; /* 0 for done, 1 for "there's more" */
> + /* PL_REQUEST_EXCLUDE: */
> + int map_index; /* slot in the mem_map array of page structs */
> + /* PL_REQUEST_FREE: */
> + int zone_index; /* zone within the node's pgdat_list */
> + int freearea_index; /* free_area within the zone */
> + int type_index; /* free_list within the free_area */
> + int list_ct; /* page within the list */
> +};
> +
> +/*
> + * the reply to a pfn_list_request;
> + * the list of pfn's itself is pointed to by pfn_list
> + */
> +struct pfn_reply {
> + long pfn_list_elements; /* negotiated on PL_REQUEST_MEMMAP */
> + long in_pfn_list; /* returned by PL_REQUEST_EXCLUDE and
> + PL_REQUEST_FREE */
> + /* resume info */
> + int more; /* 0 == done, 1 == there is more */
> + /* PL_REQUEST_MEMMAP: */
> + int map_index; /* slot in the mem_map array of page structs */
> + /* PL_REQUEST_FREE: */
> + int zone_index; /* zone within the node's pgdat_list */
> + int freearea_index; /* free_area within the zone */
> + int type_index; /* free_list within the free_area */
> + int list_ct; /* page within the list */
> + /* statistic counters: */
> + unsigned long long pfn_cache; /* PL_REQUEST_EXCLUDE */
> + unsigned long long pfn_cache_private; /* PL_REQUEST_EXCLUDE */
> + unsigned long long pfn_user; /* PL_REQUEST_EXCLUDE */
> + unsigned long long pfn_free; /* PL_REQUEST_FREE */
> +};
> +
> #define PAGES_PER_MAPWORD (sizeof(unsigned long) * 8)
> #define MFNS_PER_FRAME (info->page_size / sizeof(unsigned long))
>
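A note on the encoding of these lists: each pfn_element stands for a
block of 2^order contiguous pfns, so a single element can describe a
whole buddy block. Consumers expand each element into individual pages,
as the exclusion loops later in this patch do:

	int i;
	unsigned long j, pages;
	struct pfn_element *pe;

	for (i = 0; i < reply.in_pfn_list; i++) {
		pe = &pfn_list[i];
		pages = 1UL << pe->order;
		/* clear one dump-bitmap bit per page in the block */
		for (j = 0; j < pages; j++)
			clear_bit_on_2nd_bitmap_for_kernel(pe->pfn + j);
	}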
> Index: makedumpfile-1.5.1.released/dwarf_info.c
> ===================================================================
> --- makedumpfile-1.5.1.released.orig/dwarf_info.c
> +++ makedumpfile-1.5.1.released/dwarf_info.c
> @@ -324,6 +324,8 @@ get_data_member_location(Dwarf_Die *die,
> return TRUE;
> }
>
> +int dwarf_formref(Dwarf_Attribute *, Dwarf_Off *);
> +
> static int
> get_die_type(Dwarf_Die *die, Dwarf_Die *die_type)
> {
> Index: makedumpfile-1.5.1.released/print_info.c
> ===================================================================
> --- makedumpfile-1.5.1.released.orig/print_info.c
> +++ makedumpfile-1.5.1.released/print_info.c
> @@ -244,6 +244,11 @@ print_usage(void)
> MSG(" [-f]:\n");
> MSG(" Overwrite DUMPFILE even if it already exists.\n");
> MSG("\n");
> + MSG(" [-o]:\n");
> + MSG(" Read page structures from /proc/vmcore in the scan for\n");
> + MSG(" free and excluded pages regardless of whether\n");
> + MSG(" /proc/vmcore_pfn_lists is present.\n");
> + MSG("\n");
> MSG(" [-h]:\n");
> MSG(" Show help message and LZO/snappy support status (enabled/disabled).\n");
> MSG("\n");
> Index: makedumpfile-1.5.1.released/makedumpfile.c
> ===================================================================
> --- makedumpfile-1.5.1.released.orig/makedumpfile.c
> +++ makedumpfile-1.5.1.released/makedumpfile.c
> @@ -13,6 +13,8 @@
> * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> * GNU General Public License for more details.
> */
> +#define _GNU_SOURCE
> +#include <stdio.h>
> #include "makedumpfile.h"
> #include "print_info.h"
> #include "dwarf_info.h"
> @@ -31,6 +33,14 @@ struct srcfile_table srcfile_table;
>
> struct vm_table vt = { 0 };
> struct DumpInfo *info = NULL;
> +int pfn_list_fd;		/* fd for /proc/vmcore_pfn_lists */
> +struct pfn_element *pfn_list;	/* buffer for kernel-returned PFNs */
> +int nflag = 0;		/* -n: scan but write no page data */
> +int oflag = 0;		/* -o: force the old /proc/vmcore scan */
> +int tflag = 0;		/* -t: print total page-scan time */
> +int aflag = 0;		/* -a: debug prints of the mem_map table */
> +struct timeval scan_start;	/* start of the page scan, for -t */
> +int max_pfn_list;		/* negotiated length of pfn_list */
>
> char filename_stdout[] = FILENAME_STDOUT;
>
> @@ -2415,6 +2425,22 @@ get_mm_sparsemem(void)
> unsigned long long pfn_start, pfn_end;
> unsigned long section, mem_map;
> unsigned long *mem_sec = NULL;
> + unsigned long vaddr;
> + unsigned long paddr;
> + unsigned long lastvaddr;
> + unsigned long lastpaddr;
> + unsigned long diff;
> + long j;
> + int i;
> + int npfns;
> + int pagesize;
> + int num_mem_map;
> + int num_added = 0;
> + struct mem_map_data *mmd;
> + struct mem_map_data *curmmd;
> + struct mem_map_data *work1mmd;
> + struct mem_map_data *work2mmd;
> + struct mem_map_data *lastmmd;
>
> int ret = FALSE;
>
> @@ -2441,7 +2467,8 @@ get_mm_sparsemem(void)
> }
> info->num_mem_map = num_section;
> if ((info->mem_map_data = (struct mem_map_data *)
> - malloc(sizeof(struct mem_map_data)*info->num_mem_map)) == NULL) {
> + malloc(sizeof(struct mem_map_data) *
> + (EXTRA_MEMMAPS + info->num_mem_map))) == NULL) {
> ERRMSG("Can't allocate memory for the mem_map_data. %s\n",
> strerror(errno));
> goto out;
> @@ -2459,6 +2486,71 @@ get_mm_sparsemem(void)
> dump_mem_map(pfn_start, pfn_end, mem_map, section_nr);
> }
> ret = TRUE;
> +
> + /* add paddr to the table */
> + mmd = &info->mem_map_data[0];
> + num_mem_map = info->num_mem_map;
> + lastmmd = mmd + num_mem_map;
> + for (i = 0; i < num_mem_map; i++) {
> + if (mmd[i].mem_map == 0) {
> + mmd[i].paddr = 0;
> + } else {
> + mmd[i].paddr = vaddr_to_paddr(mmd[i].mem_map);
> + if (mmd[i].paddr == 0) {
> + printf("! can't translate %#lx to paddr\n",
> + mmd[i].mem_map);
> + exit(1);
> + }
> + /*
> + * When we pass a mem_map and its paddr to the kernel
> + * it will be remapped assuming the entire range
> + * of pfn's is contiguous. If it is not, then
> + * we need to split the range into two.
> + */
> + pagesize = SIZE(page);
> + npfns = mmd[i].pfn_end - mmd[i].pfn_start;
> + vaddr = (unsigned long)mmd[i].mem_map;
> + paddr = vaddr_to_paddr(vaddr);
> + diff = vaddr - paddr;
> + lastvaddr = vaddr + (pagesize * (npfns-1));
> + lastpaddr = vaddr_to_paddr(lastvaddr);
> + if (lastvaddr - lastpaddr != diff) {
> + /* there is a break in vtop somewhere in this range */
> + /* we need to split it */
> + for (j = 0; j < npfns; j++) {
> + paddr = vaddr_to_paddr(vaddr);
> + if (vaddr - paddr != diff) {
> + diff = vaddr - paddr;
> + /* insert a new entry if we have room */
> + if (num_added < EXTRA_MEMMAPS) {
> + curmmd = &info->mem_map_data[i];
> + num_added++;
> + work1mmd = lastmmd - 1;
> + for (work2mmd = lastmmd;
> + work2mmd > curmmd; work2mmd--) {
> + work1mmd = work2mmd - 1;
> + *work2mmd = *work1mmd;
> + }
> + work2mmd = work1mmd + 1;
> + work2mmd->mem_map =
> + work1mmd->mem_map + (pagesize * j);
> + lastmmd++;
> + num_mem_map++;
> + info->num_mem_map++;
> + /*
> + * need only 1 split, the new
> + * one will be checked also.
> + */
> + break;
> + } else
> + printf("warn: out of EXTRA_MEMMAPS\n");
> + }
> + vaddr += pagesize;
> + }
> + }
> + }
> + }
> +
> out:
> if (mem_sec != NULL)
> free(mem_sec);
> @@ -2571,6 +2663,172 @@ initialize_bitmap_memory(void)
> return TRUE;
> }
>
> +/*
> + * construct a version of the mem_map_data table to pass to the kernel
> + */
> +void *
> +make_kernel_mmap(int *kmap_elements, int *kmap_size)
> +{
> + int i, j;
> + int elements = 0;
> + int page_structs;
> + int elem;
> + long l;
> + unsigned long base_end_pfn;
> + unsigned long end_paddr;
> + unsigned long v1;
> + unsigned long v2;
> + unsigned long end_page_pfns;
> + unsigned long hpagesize = 0x200000UL;
> + unsigned long hpageoffset = hpagesize - 1;
> + struct mem_map_data *mmdo, *mmdn;
> + struct mem_map_data *mmdbase, *mmdnext, *mmdend, *mmdwork;
> + struct mem_map_data temp_mmd;
> + struct mem_map_data *mmap;
> +
> + mmap = malloc(info->num_mem_map * sizeof(struct mem_map_data));
> + if (mmap == NULL) {
> + ERRMSG("Can't allocate memory kernel map\n");
> + return NULL;
> + }
> +
> + /* condense them down to the valid ones */
> + for (i = 0, mmdn = mmap, mmdo = &info->mem_map_data[0];
> + i < info->num_mem_map; i++, mmdo++) {
> + if (mmdo->mem_map && mmdo->paddr) {
> + *mmdn = *mmdo;
> + mmdn++;
> + elements++;
> + }
> + }
> +
> + /* make sure it is sorted by mem_map (it should be already) */
> + mmdn = mmap;
> + for (i = 0; i < elements - 1; i++) {
> + for (j = i + 1; j < elements; j++) {
> + if (mmdn[j].mem_map < mmdn[i].mem_map) {
> + temp_mmd = mmdn[j];
> + mmdn[j] = mmdn[i];
> + mmdn[i] = temp_mmd;
> + }
> + }
> + }
> +
> + if (aflag) {
> + mmdn = mmap;
> + printf("entire mem_map:\n");
> + for (i = 0; i < elements; i++) {
> + l = (mmdn[i].pfn_end - mmdn[i].pfn_start) * SIZE(page);
> + printf(
> + "[%d] pfn %#llx-%llx mem_map %#lx paddr %#llx-%llx\n",
> + i, mmdn[i].pfn_start, mmdn[i].pfn_end,
> + mmdn[i].mem_map, mmdn[i].paddr,
> + mmdn[i].paddr + l);
> + }
> + }
> +
> + /*
> + * a first pass to split overlapping pfn entries like this:
> + * pfn 0x1248000-1250000 mem_map 0xffffea003ffc0000 paddr 0x10081c0000
> + * pfn 0x1248000-1250000 mem_map 0xffffea0040000030 paddr 0x1008400030
> + */
> + mmdbase = mmap;
> + mmdnext = mmap + 1;
> + mmdend = mmap + elements;
> + /* test each mmdbase/mmdnext pair */
> + while (mmdnext < mmdend) { /* mmdnext is the one after mmdbase */
> + page_structs = (mmdbase->pfn_end - mmdbase->pfn_start);
> + /* mmdwork scans from mmdnext to the end */
> + if ((mmdbase->pfn_start == mmdnext->pfn_start) &&
> + (mmdbase->pfn_end == mmdnext->pfn_end)) {
> + /* overlapping pfns, we need a fix */
> + v1 = mmdnext->mem_map - mmdbase->mem_map;
> + v2 = mmdnext->paddr - mmdbase->paddr;
> + if (v1 != (v2 & hpageoffset))
> + printf("virt to phys is wrong %#lx %#lx\n",
> + v1, v2);
> + l = mmdbase->pfn_end - mmdbase->pfn_start;
> + end_page_pfns = l - (((hpagesize -
> + (hpageoffset & mmdbase->paddr)) +
> + SIZE(page) - 1) / SIZE(page));
> + mmdbase->pfn_end -= end_page_pfns;
> + mmdnext->pfn_start = mmdbase->pfn_end;
> + } else if ((mmdbase->pfn_start == mmdnext->pfn_start) ||
> + (mmdbase->pfn_end == mmdnext->pfn_end)) {
> + printf("warning: unfixed overlap\n");
> + }
> + mmdbase++;
> + mmdnext++;
> + }
> +
> + /*
> + * consolidate those mem_map entries whose page structs occupy
> + * consecutive physical addresses
> + * pages represented:                address of their page structs:
> + * pfns 0x1000000-1008000 mem_map 0xffffea0038000000 paddr 0x11f7e00000
> + * pfns 0x1008000-1010000 mem_map 0xffffea00381c0000 paddr 0x11f7fc0000
> + * pfns 0x1010000-1018000 mem_map 0xffffea0038380000 paddr 0x11f8180000
> + * 0x8000 pfns per entry; mem_map/paddr increments of 0x1c0000
> + * (0x8000000 of memory (128M) per entry; 0x8000 page structs)
> + */
> + mmdbase = mmap;
> + mmdnext = mmap + 1;
> + mmdend = mmap + elements;
> + while (mmdnext < mmdend) {
> + elem = mmdend - mmdnext;
> + /* test mmdbase vs. mmdwork and onward: */
> + for (i = 0, mmdwork = mmdnext; i < elem; i++, mmdwork++) {
> + base_end_pfn = mmdbase->pfn_end;
> + if (base_end_pfn == mmdwork->pfn_start) {
> + page_structs = (mmdbase->pfn_end -
> + mmdbase->pfn_start);
> + end_paddr = (page_structs * SIZE(page)) +
> + mmdbase->paddr;
> + if (mmdwork->paddr == end_paddr) {
> + /* extend base by the work one */
> + mmdbase->pfn_end = mmdwork->pfn_end;
> + /* next is where to begin next time */
> + mmdnext = mmdwork + 1;
> + } else {
> + /* gap in address of page
> + structs; end of section */
> + mmdbase++;
> + if (mmdwork - mmdbase > 0)
> + *mmdbase = *mmdwork;
> + mmdnext = mmdwork + 1;
> + break;
> + }
> + } else {
> + /* gap in pfns; end of section */
> + mmdbase++;
> + if (mmdwork - mmdbase > 0)
> + *mmdbase = *mmdwork;
> + mmdnext = mmdwork + 1;
> + break;
> + }
> + }
> + }
> + elements = (mmdbase - mmap) + 1;
> +
> + if (aflag) {
> + printf("user mmap for kernel:\n");
> + for (i = 0, mmdwork = mmap; i < elements; i++, mmdwork++) {
> + l = mmdwork->pfn_end - mmdwork->pfn_start;
> + printf(
> + "[%d] user pfn %#llx-%llx paddr %#llx-%llx vaddr %#lx\n",
> + i, mmdwork->pfn_start, mmdwork->pfn_end,
> + mmdwork->paddr,
> + mmdwork->paddr + (l * SIZE(page)),
> + mmdwork->mem_map);
> + }
> + }
> +
> + *kmap_elements = elements;
> + *kmap_size = elements * sizeof(struct mem_map_data);
> +
> + return mmap;
> +}
> +
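As a sanity check on the consolidation arithmetic in the comment above:
assuming a 56-byte struct page (a typical SIZE(page) on this x86_64
kernel), a section of 0x8000 pfns needs 0x8000 * 56 = 0x1c0000 bytes of
page structs, which is exactly the mem_map/paddr stride between the
consecutive entries in the example, so all three merge into one range.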
> int
> initial(void)
> {
> @@ -2833,7 +3091,14 @@ out:
> if (!get_value_for_old_linux())
> return FALSE;
>
> - if (info->flag_cyclic && (info->dump_level & DL_EXCLUDE_FREE))
> + /*
> + * page_is_buddy will tell us whether free pages can be identified
> + * by flags and counts in the page structure without making an extra
> + * pass through the free lists.
> + * This applies whether we read /proc/vmcore or use the kernel lists.
> + * Force all old (-o) forms to search the free lists.
> + */
> + if (info->dump_level & DL_EXCLUDE_FREE)
> setup_page_is_buddy();
>
> return TRUE;
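For context on the handler being set up here: page_is_buddy identifies
a free page directly from its struct page fields, and
setup_page_is_buddy() picks a variant according to how the crashed
kernel marks buddy pages. Roughly, and only as a paraphrase of the
1.5.1 variants rather than the exact source:

	/* older kernels (e.g. RHEL 6's 2.6.32): a dedicated PG_buddy flag */
	buddy = (flags >> NUMBER(PG_buddy)) & 1;

	/* newer kernels: a sentinel value in _mapcount (PG_slab excluded,
	   since slab reuses that field) */
	buddy = (_mapcount == (int)NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE));

This is also why the RHEL 2.6.32 tweak at the end of this mail exports
PG_buddy in vmcoreinfo instead of PG_slab and PAGE_BUDDY_MAPCOUNT_VALUE.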
> @@ -3549,6 +3814,65 @@ out:
> return ret;
> }
>
> +/*
> + * let the kernel find excludable pages from one node
> + */
> +void
> +__exclude_free_pages_kernel(unsigned long pgdat, int node)
> +{
> + int i, j, ret, pages;
> + unsigned long pgdat_paddr;
> + struct pfn_list_request request;
> + struct pfn_reply reply;
> + struct pfn_element *pe;
> +
> + if ((pgdat_paddr = vaddr_to_paddr(pgdat)) == NOT_PADDR) {
> + ERRMSG("Can't convert virtual address(%#lx) to physical.\n",
> + pgdat);
> + return;
> + }
> +
> + /*
> + * Get the list of free pages.
> + * This may be broken up into MAX_PFN_LIST-sized arrays of PFNs.
> + */
> + memset(&request, 0, sizeof(request));
> + request.request = PL_REQUEST_FREE;
> + request.node = node;
> + request.pgdat_paddr = pgdat_paddr;
> + request.pgdat_vaddr = pgdat;
> + request.reply_ptr = (void *)&reply;
> + request.pfn_list_ptr = (void *)pfn_list;
> + memset(&reply, 0, sizeof(reply));
> +
> + do {
> + request.more = 0;
> + if (reply.more) {
> + /* this is to be a continuation of the last request */
> + request.more = 1;
> + request.zone_index = reply.zone_index;
> + request.freearea_index = reply.freearea_index;
> + request.type_index = reply.type_index;
> + request.list_ct = reply.list_ct;
> + }
> + ret = write(pfn_list_fd, &request, sizeof(request));
> + if (ret != sizeof(request)) {
> + printf("PL_REQUEST_FREE failed\n");
> + return;
> + }
> + pfn_free += reply.pfn_free;
> +
> + for (i = 0; i < reply.in_pfn_list; i++) {
> + pe = &pfn_list[i];
> + pages = (1 << pe->order);
> + for (j = 0; j < pages; j++) {
> + clear_bit_on_2nd_bitmap_for_kernel(pe->pfn + j);
> + }
> + }
> + } while (reply.more);
> +
> + return;
> +}
>
> int
> _exclude_free_page(void)
> @@ -3568,7 +3892,24 @@ _exclude_free_page(void)
> gettimeofday(&tv_start, NULL);
>
> for (num_nodes = 1; num_nodes <= vt.numnodes; num_nodes++) {
> -
> + if (!info->flag_cyclic && info->flag_use_kernel_lists) {
> + node_zones = pgdat + OFFSET(pglist_data.node_zones);
> + if (!readmem(VADDR,
> + pgdat + OFFSET(pglist_data.nr_zones),
> + &nr_zones, sizeof(nr_zones))) {
> + ERRMSG("Can't get nr_zones.\n");
> + return FALSE;
> + }
> + print_progress(PROGRESS_FREE_PAGES, num_nodes - 1,
> + vt.numnodes);
> + /* ask the kernel to do one node */
> + __exclude_free_pages_kernel(pgdat, node);
> + goto next_pgdat;
> + }
> + /*
> + * kernel does not have the pfn_list capability
> + * use the old way
> + */
> print_progress(PROGRESS_FREE_PAGES, num_nodes - 1, vt.numnodes);
>
> node_zones = pgdat + OFFSET(pglist_data.node_zones);
> @@ -3595,6 +3936,7 @@ _exclude_free_page(void)
> if (!reset_bitmap_of_free_pages(zone))
> return FALSE;
> }
> + next_pgdat:
> if (num_nodes < vt.numnodes) {
> if ((node = next_online_node(node + 1)) < 0) {
> ERRMSG("Can't get next online node.\n");
> @@ -3612,6 +3954,8 @@ _exclude_free_page(void)
> */
> print_progress(PROGRESS_FREE_PAGES, vt.numnodes, vt.numnodes);
> print_execution_time(PROGRESS_FREE_PAGES, &tv_start);
> + if (tflag)
> + print_execution_time("Total time", &scan_start);
>
> return TRUE;
> }
> @@ -3755,7 +4099,6 @@ setup_page_is_buddy(void)
> }
> } else
> info->page_is_buddy = page_is_buddy_v2;
> -
> out:
> if (!info->page_is_buddy)
> DEBUG_MSG("Can't select page_is_buddy handler; "
> @@ -3964,10 +4307,89 @@ exclude_zero_pages(void)
> return TRUE;
> }
>
> +/*
> + * let the kernel find excludable pages from one mem_section
> + */
> +int
> +__exclude_unnecessary_pages_kernel(int mm, struct mem_map_data *mmd)
> +{
> + unsigned long long pfn_start = mmd->pfn_start;
> + unsigned long long pfn_end = mmd->pfn_end;
> + int i, j, ret, pages, flag;
> + struct pfn_list_request request;
> + struct pfn_reply reply;
> + struct pfn_element *pe;
> +
> + /*
> + * Get the list of to-be-excluded pages in this section.
> + * It may be broken up into groups of max_pfn_list size.
> + */
> + memset(&request, 0, sizeof(request));
> + request.request = PL_REQUEST_EXCLUDE;
> + request.paddr = mmd->paddr; /* phys addr of mem_map */
> + request.reply_ptr = (void *)&reply;
> + request.pfn_list_ptr = (void *)pfn_list;
> + request.exclude_bits = 0;
> + request.pfn_start = pfn_start;
> + request.count = pfn_end - pfn_start;
> + if (info->dump_level & DL_EXCLUDE_CACHE)
> + request.exclude_bits |= DL_EXCLUDE_CACHE;
> + if (info->dump_level & DL_EXCLUDE_CACHE_PRI)
> + request.exclude_bits |= DL_EXCLUDE_CACHE_PRI;
> + if (info->dump_level & DL_EXCLUDE_USER_DATA)
> + request.exclude_bits |= DL_EXCLUDE_USER_DATA;
> + /* if we try for free pages from the freelists then we don't need
> + to ask here for 'buddy' pages */
> + if (info->dump_level & DL_EXCLUDE_FREE)
> + request.exclude_bits |= DL_EXCLUDE_FREE;
> + memset(&reply, 0, sizeof(reply));
> +
> + do {
> + /* pfn represented by paddr */
> + request.more = 0;
> + if (reply.more) {
> + /* this is to be a continuation of the last request */
> + request.more = 1;
> + request.map_index = reply.map_index;
> + }
> +
> + ret = write(pfn_list_fd, &request, sizeof(request));
> + if (ret != sizeof(request))
> + return FALSE;
> +
> + pfn_cache += reply.pfn_cache;
> + pfn_cache_private += reply.pfn_cache_private;
> + pfn_user += reply.pfn_user;
> + pfn_free += reply.pfn_free;
> +
> + flag = 0;
> + for (i = 0; i < reply.in_pfn_list; i++) {
> + pe = &pfn_list[i];
> + pages = (1 << pe->order);
> + for (j = 0; j < pages; j++) {
> + if (clear_bit_on_2nd_bitmap_for_kernel(
> + pe->pfn + j) == FALSE) {
> + // printf("fail: mm %d slot %d pfn %#lx\n",
> + // mm, i, pe->pfn + j);
> + // printf("paddr %#llx pfn %#llx-%#llx mem_map %#lx\n",
> + // mmd->paddr, mmd->pfn_start, mmd->pfn_end, mmd->mem_map);
> + flag = 1;
> + break;
> + }
> + }
> + if (flag) break; /* a clear failed; stop this reply */
> + }
> + } while (reply.more);
> +
> + return TRUE;
> +}
> +
> int
> -__exclude_unnecessary_pages(unsigned long mem_map,
> - unsigned long long pfn_start, unsigned long long pfn_end)
> +__exclude_unnecessary_pages(int mm, struct mem_map_data *mmd)
> {
> + unsigned long long pfn_start = mmd->pfn_start;
> + unsigned long long pfn_end = mmd->pfn_end;
> + unsigned long mem_map = mmd->mem_map;
> unsigned long long pfn, pfn_mm, maddr;
> unsigned long long pfn_read_start, pfn_read_end, index_pg;
> unsigned char page_cache[SIZE(page) * PGMM_CACHED];
> @@ -3975,6 +4397,12 @@ __exclude_unnecessary_pages(unsigned lon
> unsigned int _count, _mapcount = 0;
> unsigned long flags, mapping, private = 0;
>
> + if (info->flag_use_kernel_lists) {
> + if (__exclude_unnecessary_pages_kernel(mm, mmd) == FALSE)
> + return FALSE;
> + return TRUE;
> + }
> +
> /*
> * Refresh the buffer of struct page, when changing mem_map.
> */
> @@ -4012,7 +4440,6 @@ __exclude_unnecessary_pages(unsigned lon
> pfn_mm = PGMM_CACHED - index_pg;
> else
> pfn_mm = pfn_end - pfn;
> -
> if (!readmem(VADDR, mem_map,
> page_cache + (index_pg * SIZE(page)),
> SIZE(page) * pfn_mm)) {
> @@ -4036,7 +4463,6 @@ __exclude_unnecessary_pages(unsigned lon
> * Exclude the free page managed by a buddy
> */
> if ((info->dump_level & DL_EXCLUDE_FREE)
> - && info->flag_cyclic
> && info->page_is_buddy
> && info->page_is_buddy(flags, _mapcount, private, _count)) {
> int i;
> @@ -4085,19 +4511,78 @@ __exclude_unnecessary_pages(unsigned lon
> return TRUE;
> }
>
> +/*
> + * Pass in the mem_map_data table.
> + * Must do this once, and before doing PL_REQUEST_FREE or PL_REQUEST_EXCLUDE.
> + */
> +int
> +setup_kernel_mmap(void)
> +{
> + int ret;
> + int kmap_elements, kmap_size;
> + long malloc_size;
> + void *kmap_addr;
> + struct pfn_list_request request;
> + struct pfn_reply reply;
> +
> + kmap_addr = make_kernel_mmap(&kmap_elements, &kmap_size);
> + if (kmap_addr == NULL)
> + return FALSE;
> + memset(&request, 0, sizeof(request));
> + request.request = PL_REQUEST_MEMMAP;
> + request.map_ptr = kmap_addr;
> + request.reply_ptr = (void *)&reply;
> + request.map_count = kmap_elements;
> + request.map_size = kmap_size;
> + request.list_size = MAX_PFN_LIST;
> +
> + ret = write(pfn_list_fd, &request, sizeof(request));
> + if (ret < 0) {
> + fprintf(stderr, "PL_REQUEST_MEMMAP returned %d\n", ret);
> + return FALSE;
> + }
> + /* the reply tells us how long the kernel's list actually is */
> + max_pfn_list = reply.pfn_list_elements;
> + if (max_pfn_list <= 0) {
> + fprintf(stderr,
> + "PL_REQUEST_MEMMAP returned max_pfn_list %d\n",
> + max_pfn_list);
> + return FALSE;
> + }
> + if (max_pfn_list < MAX_PFN_LIST) {
> + printf("length of pfn list dropped from %d to %d\n",
> + MAX_PFN_LIST, max_pfn_list);
> + }
> + free(kmap_addr);
> + /*
> + * Allocate the buffer for the PFN list (just once).
> + */
> + malloc_size = max_pfn_list * sizeof(struct pfn_element);
> + if ((pfn_list = (struct pfn_element *)malloc(malloc_size)) == NULL) {
> + ERRMSG("Can't allocate pfn_list of %ld\n", malloc_size);
> + return FALSE;
> + }
> + return TRUE;
> +}
> +
> int
> exclude_unnecessary_pages(void)
> {
> - unsigned int mm;
> - struct mem_map_data *mmd;
> - struct timeval tv_start;
> + unsigned int mm;
> + struct mem_map_data *mmd;
> + struct timeval tv_start;
>
> if (is_xen_memory() && !info->dom0_mapnr) {
> ERRMSG("Can't get max domain-0 PFN for excluding pages.\n");
> return FALSE;
> }
>
> + if (!info->flag_cyclic && info->flag_use_kernel_lists) {
> + if (setup_kernel_mmap() == FALSE)
> + return FALSE;
> + }
> gettimeofday(&tv_start, NULL);
> + gettimeofday(&scan_start, NULL);
>
> for (mm = 0; mm < info->num_mem_map; mm++) {
> print_progress(PROGRESS_UNN_PAGES, mm, info->num_mem_map);
> @@ -4106,9 +4591,9 @@ exclude_unnecessary_pages(void)
>
> if (mmd->mem_map == NOT_MEMMAP_ADDR)
> continue;
> -
> - if (!__exclude_unnecessary_pages(mmd->mem_map,
> - mmd->pfn_start, mmd->pfn_end))
> + if (mmd->paddr == 0)
> + continue;
> + if (!__exclude_unnecessary_pages(mm, mmd))
> return FALSE;
> }
>
> @@ -4139,7 +4624,11 @@ exclude_unnecessary_pages_cyclic(void)
> */
> copy_bitmap_cyclic();
>
> - if ((info->dump_level & DL_EXCLUDE_FREE) && !info->page_is_buddy)
> + /*
> + * If free pages cannot be identified with the buddy flag and/or
> + * count then we have to search free lists.
> + */
> + if ((info->dump_level & DL_EXCLUDE_FREE) && (!info->page_is_buddy))
> if (!exclude_free_page())
> return FALSE;
>
> @@ -4164,8 +4653,7 @@ exclude_unnecessary_pages_cyclic(void)
>
> if (mmd->pfn_end >= info->cyclic_start_pfn &&
> mmd->pfn_start <= info->cyclic_end_pfn) {
> - if (!__exclude_unnecessary_pages(mmd->mem_map,
> - mmd->pfn_start, mmd->pfn_end))
> + if (!__exclude_unnecessary_pages(mm, mmd))
> return FALSE;
> }
> }
> @@ -4195,7 +4683,7 @@ update_cyclic_region(unsigned long long
> if (!create_1st_bitmap_cyclic())
> return FALSE;
>
> - if (!exclude_unnecessary_pages_cyclic())
> + if (exclude_unnecessary_pages_cyclic() == FALSE)
> return FALSE;
>
> return TRUE;
> @@ -4255,7 +4743,7 @@ create_2nd_bitmap(void)
> if (info->dump_level & DL_EXCLUDE_CACHE ||
> info->dump_level & DL_EXCLUDE_CACHE_PRI ||
> info->dump_level & DL_EXCLUDE_USER_DATA) {
> - if (!exclude_unnecessary_pages()) {
> + if (exclude_unnecessary_pages() == FALSE) {
> ERRMSG("Can't exclude unnecessary pages.\n");
> return FALSE;
> }
> @@ -4263,8 +4751,10 @@ create_2nd_bitmap(void)
>
> /*
> * Exclude free pages.
> + * If free pages cannot be identified with the buddy flag and/or
> + * count then we have to search free lists.
> */
> - if (info->dump_level & DL_EXCLUDE_FREE)
> + if ((info->dump_level & DL_EXCLUDE_FREE) && (!info->page_is_buddy))
> if (!exclude_free_page())
> return FALSE;
>
> @@ -4395,6 +4885,10 @@ create_dump_bitmap(void)
> int ret = FALSE;
>
> if (info->flag_cyclic) {
> + if (info->flag_use_kernel_lists) {
> + if (setup_kernel_mmap() == FALSE)
> + goto out;
> + }
> if (!prepare_bitmap_buffer_cyclic())
> goto out;
>
> @@ -4872,6 +5366,7 @@ get_num_dumpable_cyclic(void)
> {
> unsigned long long pfn, num_dumpable=0;
>
> + gettimeofday(&scan_start, NULL);
> for (pfn = 0; pfn < info->max_mapnr; pfn++) {
> if (!update_cyclic_region(pfn))
> return FALSE;
> @@ -5201,7 +5696,7 @@ get_loads_dumpfile_cyclic(void)
> info->cyclic_end_pfn = info->pfn_cyclic;
> if (!create_1st_bitmap_cyclic())
> return FALSE;
> - if (!exclude_unnecessary_pages_cyclic())
> + if (exclude_unnecessary_pages_cyclic() == FALSE)
> return FALSE;
>
> if (!(phnum = get_phnum_memory()))
> @@ -5613,6 +6108,10 @@ write_kdump_pages(struct cache_data *cd_
> pfn_zero++;
> continue;
> }
> +
> + if (nflag)
> + continue;
> +
> /*
> * Compress the page data.
> */
> @@ -5768,6 +6267,7 @@ write_kdump_pages_cyclic(struct cache_da
> for (pfn = start_pfn; pfn < end_pfn; pfn++) {
>
> if ((num_dumped % per) == 0)
> +
> print_progress(PROGRESS_COPY, num_dumped, info->num_dumpable);
>
> /*
> @@ -5786,11 +6286,17 @@ write_kdump_pages_cyclic(struct cache_da
> */
> if ((info->dump_level & DL_EXCLUDE_ZERO)
> && is_zero_page(buf, info->page_size)) {
> + if (!nflag) {
> if (!write_cache(cd_header, pd_zero, sizeof(page_desc_t)))
> goto out;
> + }
> pfn_zero++;
> continue;
> }
> +
> + if (nflag)
> + continue;
> +
> /*
> * Compress the page data.
> */
> @@ -6208,6 +6714,8 @@ write_kdump_pages_and_bitmap_cyclic(stru
> if (!update_cyclic_region(pfn))
> return FALSE;
>
> + if (tflag)
> + print_execution_time("Total time", &scan_start);
> if (!write_kdump_pages_cyclic(cd_header, cd_page, &pd_zero, &offset_data))
> return FALSE;
>
> @@ -8231,6 +8739,22 @@ static struct option longopts[] = {
> {0, 0, 0, 0}
> };
>
> +/*
> + * test for the presence of capability in the kernel to provide lists
> + * of pfn's:
> + * /proc/vmcore_pfn_lists
> + * return 1 for present
> + * return 0 for not present
> + */
> +int
> +test_kernel_pfn_lists(void)
> +{
> + if ((pfn_list_fd = open("/proc/vmcore_pfn_lists", O_WRONLY)) < 0) {
> + return 0;
> + }
> + return 1;
> +}
> +
> int
> main(int argc, char *argv[])
> {
> @@ -8256,9 +8780,12 @@ main(int argc, char *argv[])
>
> info->block_order = DEFAULT_ORDER;
> message_level = DEFAULT_MSG_LEVEL;
> - while ((opt = getopt_long(argc, argv, "b:cDd:EFfg:hi:lMpRrsvXx:", longopts,
> + while ((opt = getopt_long(argc, argv, "ab:cDd:EFfg:hi:lMnopRrstVvXx:Y", longopts,
> NULL)) != -1) {
> switch (opt) {
> + case 'a':
> + aflag = 1;
> + break;
> case 'b':
> info->block_order = atoi(optarg);
> break;
> @@ -8314,6 +8841,13 @@ main(int argc, char *argv[])
> case 'M':
> info->flag_dmesg = 1;
> break;
> + case 'n':
> + /* -n undocumented, for testing page scanning time */
> + nflag = 1;
> + break;
> + case 'o':
> + oflag = 1;
> + break;
> case 'p':
> info->flag_compress = DUMP_DH_COMPRESSED_SNAPPY;
> break;
> @@ -8329,6 +8863,9 @@ main(int argc, char *argv[])
> case 'r':
> info->flag_reassemble = 1;
> break;
> + case 't':
> + tflag = 1;
> + break;
> case 'V':
> info->vaddr_for_vtop = strtoul(optarg, NULL, 0);
> break;
> @@ -8360,6 +8897,12 @@ main(int argc, char *argv[])
> goto out;
> }
> }
> +
> + if (oflag)
> + info->flag_use_kernel_lists = 0;
> + else
> + info->flag_use_kernel_lists = test_kernel_pfn_lists();
> +
> if (flag_debug)
> message_level |= ML_PRINT_DEBUG_MSG;
>
>
Cliff, I tried your patch above on makedumpfile v1.5.1 (built dynamically
on the same DL980 I was running the test on), with all the RHEL 6
versions of the kernel patches you gave me from 1207, plus the kexec
kernel patch recommended for makedumpfile v1.5.1, built on top of a
preliminary RHEL 6.4 kernel source (a higher patch level of the 2.6.32
kernel). This time the test was on a 1 TB memory system (we have lost
access to a 4 TB memory system for some time now).

On this same system, regular makedumpfile v1.5.1 worked fine to produce
a dump. But the makedumpfile with the patches above could not even start
the dump, and printed:

    Saving vmcore-dmesg.txt
    Saved vmcore-dmesg.txt
    PL_REQUEST_MEMMAP returned -1
    Restarting system.
This happened both with a crashkernel size of 200M, which would have
invoked cyclic buffer mode, and with a larger one, 384M, which should
not have needed cyclic mode. I had cyclic buffer mode neither set nor
turned off on the makedumpfile command line; I was just recording memory
usage with:

    core_collector makedumpfile -c --message-level 31 -d 31
    debug_mem_level 2
The message comes from this code in setup_kernel_mmap() in your patch:

    ret = write(pfn_list_fd, &request, sizeof(request));
    if (ret < 0) {
            fprintf(stderr, "PL_REQUEST_MEMMAP returned %d\n", ret);
            return FALSE;
Any ideas what might have caused this? Am I missing a patch? Do I have
the wrong kernel patches? Any tips for debugging?
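One more data point that might help whoever debugs this: the -1 alone
hides errno, so a small tweak to the failing call (just a debugging
suggestion, not part of the patch; it needs <errno.h> and <string.h>)
would show whether the kernel rejected the request with EINVAL, ENOMEM,
or something else:

	ret = write(pfn_list_fd, &request, sizeof(request));
	if (ret < 0) {
		/* report why the kernel rejected PL_REQUEST_MEMMAP */
		fprintf(stderr, "PL_REQUEST_MEMMAP returned %d: %s\n",
			ret, strerror(errno));
		return FALSE;
	}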
I am attaching the kernel patches you sent me earlier that I used. They
were applied on top of https://lkml.org/lkml/2012/11/21/90, with the
tweak below for RHEL 2.6.32 kernels applied on top of it:

NOTE: The patch above is for the latest kernel, so you need to fix it as
below if your kernel version is between v2.6.18 and v2.6.37:
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 511151b..56583a4 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1490,7 +1490,6 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_OFFSET(page, flags);
VMCOREINFO_OFFSET(page, _count);
VMCOREINFO_OFFSET(page, mapping);
- VMCOREINFO_OFFSET(page, _mapcount);
VMCOREINFO_OFFSET(page, private);
VMCOREINFO_OFFSET(page, lru);
VMCOREINFO_OFFSET(pglist_data, node_zones);
@@ -1515,8 +1514,7 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_NUMBER(PG_lru);
VMCOREINFO_NUMBER(PG_private);
VMCOREINFO_NUMBER(PG_swapcache);
- VMCOREINFO_NUMBER(PG_slab);
- VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
+ VMCOREINFO_NUMBER(PG_buddy);
arch_crash_save_vmcoreinfo();
update_vmcoreinfo_note();
-------------- next part --------------
An embedded message was scrubbed...
Subject: [PATCH] scan page tables for makedumpfile
Date: Wed, 16 Jan 2013 05:00:37 -0700
Size: 21801
URL: <http://lists.infradead.org/pipermail/kexec/attachments/20130116/4cbb470c/attachment-0001.mht>