[PATCH 1/1] exclude page structures of non-dumped pages
cpw
cpw at sgi.com
Fri Aug 28 10:06:54 PDT 2015
From: Cliff Wickman <cpw at sgi.com>
This patch has been submitted before (see Jun29 2015), but as part of a
2-patch set. That set included a direct i/o option, but that idea has been
dropped as unnecessary.
I have been testing on large memory systems to demonstrate the importance
of this feature to such systems. See some numbers below.
The most dramatic demonstration was on a 32TB system where the patch
reduced the process from 2 hours to 26 minutes. The size of the dump
would probably have been over 30GB (but I ran out of disk space). It was
reduced to 5.4GB.
Applies to version 1.5.8
This patch adds a -e option to makedumpfile.
The -e option excludes kernel pages that contain nothing but kernel page
structures for pages that are not being included in the dump.
A page structure (56 bytes) exists for every 4096-byte page.
This amounts to 3.67 million pages, or about 14GB, per terabyte of system memory!
Without -e an idle 2-terabyte system can be dumped (compressed) to a file of
about 3.6G.
With -e that is reduced to about 456M. And the time and space savings
multiply for each additional terabyte of memory in the system.
Experimental time/size results: (basically idle systems)
Memory Size With -e Without -e
(sec.) (sec.)
(using a sles11sp3 kernel that does not provide mmap of /proc/vmcore:)
1TB 52 244M 257 1.7G
2TB 128 456M 526 3.6G
8TB 780 1.6G 3400 13.8G
16TB 2600 3.1G 9800 (extrapolated, 2:40 is too long to wait)
32TB 6000 5.4G not done
(using a sles11sp3 kernel that provides mmap of /proc/vmcore:)
32TB 1600 5.4G 7300 (extrapolated)
(ran out of 19G space before 1/2 done)
The only disadvantage is that various options of the crash 'kmem' command (that
walk lists of page structures) will not work.
There is a corresponding patch for crash to issue a warning about such commands
when the dump is flagged DUMP_DH_EXCLUDED_VMEMMAP.
The -e option only works in non-cyclic mode; specifying -e therefore forces non-cyclic operation.
---
diskdump_mod.h | 1
makedumpfile.c | 708 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
makedumpfile.h | 59 ++++
print_info.c | 7
4 files changed, 764 insertions(+), 11 deletions(-)
Index: makedumpfile/print_info.c
===================================================================
--- makedumpfile.orig/print_info.c
+++ makedumpfile/print_info.c
@@ -58,7 +58,7 @@ print_usage(void)
MSG("\n");
MSG("Usage:\n");
MSG(" Creating DUMPFILE:\n");
- MSG(" # makedumpfile [-c|-l|-p|-E] [-d DL] [-x VMLINUX|-i VMCOREINFO] VMCORE\n");
+ MSG(" # makedumpfile [-c|-l|-p|-E] [-d DL] [-e] [-x VMLINUX|-i VMCOREINFO] VMCORE\n");
MSG(" DUMPFILE\n");
MSG("\n");
MSG(" Creating DUMPFILE with filtered kernel data specified through filter config\n");
@@ -108,6 +108,10 @@ print_usage(void)
MSG(" -E option, because the ELF format does not support compressed data.\n");
MSG(" THIS IS ONLY FOR THE CRASH UTILITY.\n");
MSG("\n");
+ MSG(" [-e]:\n");
+ MSG(" Exclude page structures (vmemmap) for unused pages.\n");
+ MSG(" This greatly shortens the dump of a very large memory system.\n");
+ MSG("\n");
MSG(" [-d DL]:\n");
MSG(" Specify the type of unnecessary page for analysis.\n");
MSG(" Pages of the specified type are not copied to DUMPFILE. The page type\n");
@@ -359,4 +363,3 @@ print_execution_time(char *step_name, st
REPORT_MSG("STEP [%s] : %ld.%06ld seconds\n",
step_name, diff_sec, diff_usec);
}
-
Index: makedumpfile/makedumpfile.h
===================================================================
--- makedumpfile.orig/makedumpfile.h
+++ makedumpfile/makedumpfile.h
@@ -43,6 +43,9 @@
#include "diskdump_mod.h"
#include "sadump_mod.h"
+#define VMEMMAPSTART 0xffffea0000000000UL
+#define BITS_PER_WORD 64
+
/*
* Result of command
*/
@@ -482,6 +485,7 @@ do { \
#define VMALLOC_END (info->vmalloc_end)
#define VMEMMAP_START (info->vmemmap_start)
#define VMEMMAP_END (info->vmemmap_end)
+#define PMASK (0x7ffffffffffff000UL)
#ifdef __arm__
#define KVBASE_MASK (0xffff)
@@ -568,15 +572,20 @@ do { \
#define PGDIR_SIZE (1UL << PGDIR_SHIFT)
#define PGDIR_MASK (~(PGDIR_SIZE - 1))
#define PTRS_PER_PGD (512)
+#define PGD_SHIFT (39)
+#define PUD_SHIFT (30)
#define PMD_SHIFT (21)
#define PMD_SIZE (1UL << PMD_SHIFT)
#define PMD_MASK (~(PMD_SIZE - 1))
+#define PTRS_PER_PUD (512)
#define PTRS_PER_PMD (512)
#define PTRS_PER_PTE (512)
#define PTE_SHIFT (12)
#define pml4_index(address) (((address) >> PML4_SHIFT) & (PTRS_PER_PML4 - 1))
#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
+#define pgd4_index(address) (((address) >> PGD_SHIFT) & (PTRS_PER_PGD - 1))
+#define pud_index(address) (((address) >> PUD_SHIFT) & (PTRS_PER_PUD - 1))
#define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))
#define pte_index(address) (((address) >> PTE_SHIFT) & (PTRS_PER_PTE - 1))
@@ -742,7 +751,6 @@ do { \
/*
* 4 Levels paging
*/
-#define PUD_SHIFT (PMD_SHIFT + PTRS_PER_PTD_SHIFT)
#define PGDIR_SHIFT_4L (PUD_SHIFT + PTRS_PER_PTD_SHIFT)
#define MASK_PUD ((1UL << REGION_SHIFT) - 1) & (~((1UL << PUD_SHIFT) - 1))
@@ -1564,6 +1572,51 @@ struct srcfile_table {
char pud_t[LEN_SRCFILE];
};
+/*
+ * This structure records where the vmemmap page structures reside, and which
+ * pfn's are represented by those page structures.
+ * The actual pages containing the page structures are 2MB pages, so their pfn's
+ * will all be multiples of 0x200.
+ * The page structures are 7 64-bit words in length (0x38) so they overlap the
+ * 2MB boundaries. Each page structure represents a 4k page.
+ * A 4k page is here defined to be represented on a 2MB page if its page structure
+ * 'ends' on that page (even if it began on the page before).
+ */
+struct vmap_pfns {
+ struct vmap_pfns *next;
+ struct vmap_pfns *prev;
+ /*
+ * These (start/end) are literal pfns of 2MB pages on which the page
+ * structures reside, not start and end+1.
+ */
+ unsigned long vmap_pfn_start;
+ unsigned long vmap_pfn_end;
+ /*
+ * These (start/end) are literal pfns represented on these pages, not
+ * start and end+1.
+ * The starting page struct is at least partly on the first page; the
+ * ending page struct is entirely on the last page.
+ */
+ unsigned long rep_pfn_start;
+ unsigned long rep_pfn_end;
+};
+
+/* for saving a list of pfns to a buffer, and then to a file if necessary */
+struct save_control {
+ int sc_fd;
+ char *sc_filename;
+ char *sc_buf;
+ long sc_buflen; /* length of buffer never changes */
+ long sc_bufposition; /* offset of next slot for write, or next to be read */
+ long sc_filelen; /* length of valid data written */
+ long sc_fileposition; /* offset in file of next entry to be read */
+};
+/* one entry in the buffer and file */
+struct sc_entry {
+ unsigned long startpfn;
+ unsigned long numpfns;
+};
+
extern struct symbol_table symbol_table;
extern struct size_table size_table;
extern struct offset_table offset_table;
@@ -1723,6 +1776,9 @@ int get_xen_info_ia64(void);
#define get_xen_info_arch(X) FALSE
#endif /* s390x */
+#define PAGESHFT 12 /* assuming a 4k page */
+#define PSE 128 /* bit 7 */
+
struct cycle {
mdf_pfn_t start_pfn;
mdf_pfn_t end_pfn;
@@ -1869,6 +1925,7 @@ struct elf_prstatus {
#define OPT_DEBUG 'D'
#define OPT_DUMP_LEVEL 'd'
#define OPT_ELF_DUMPFILE 'E'
+#define OPT_EXCLUDE_UNUSED_VM 'e'
#define OPT_FLATTEN 'F'
#define OPT_FORCE 'f'
#define OPT_GENERATE_VMCOREINFO 'g'
Index: makedumpfile/makedumpfile.c
===================================================================
--- makedumpfile.orig/makedumpfile.c
+++ makedumpfile/makedumpfile.c
@@ -32,12 +32,16 @@ struct offset_table offset_table;
struct array_table array_table;
struct number_table number_table;
struct srcfile_table srcfile_table;
+struct save_control sc;
struct vm_table vt = { 0 };
struct DumpInfo *info = NULL;
struct SplitBlock *splitblock = NULL;
+struct vmap_pfns *gvmem_pfns;
+int nr_gvmem_pfns;
char filename_stdout[] = FILENAME_STDOUT;
+char *root_dir = "/tmp";
/* Cache statistics */
static unsigned long long cache_hit;
@@ -85,8 +89,10 @@ mdf_pfn_t pfn_free;
mdf_pfn_t pfn_hwpoison;
mdf_pfn_t num_dumped;
+long blocksize;
int retcd = FAILED; /* return code */
+int excludevmflag = 0;
#define INITIALIZE_LONG_TABLE(table, value) \
do { \
@@ -1031,19 +1037,14 @@ int
open_dump_bitmap(void)
{
int i, fd;
- char *tmpname;
-
- tmpname = getenv("TMPDIR");
- if (!tmpname)
- tmpname = "/tmp";
if ((info->name_bitmap = (char *)malloc(sizeof(FILENAME_BITMAP) +
- strlen(tmpname) + 1)) == NULL) {
+ strlen(root_dir) + 1)) == NULL) {
ERRMSG("Can't allocate memory for the filename. %s\n",
strerror(errno));
return FALSE;
}
- strcpy(info->name_bitmap, tmpname);
+ strcpy(info->name_bitmap, root_dir);
strcat(info->name_bitmap, "/");
strcat(info->name_bitmap, FILENAME_BITMAP);
if ((fd = mkstemp(info->name_bitmap)) < 0) {
@@ -3342,10 +3343,12 @@ out:
if (info->flag_usemmap == MMAP_TRY && initialize_mmap()) {
DEBUG_MSG("mmap() is available on the kernel.\n");
+printf("cpw: mmap() is available on the kernel.\n");
info->flag_usemmap = MMAP_ENABLE;
} else {
DEBUG_MSG("The kernel doesn't support mmap(),");
DEBUG_MSG("read() will be used instead.\n");
+printf("cpw: mmap() is NOT available on the kernel. using read\n");
info->flag_usemmap = MMAP_DISABLE;
}
@@ -5205,6 +5208,320 @@ copy_bitmap(void)
return TRUE;
}
+/*
+ * Given a range of unused pfn's, check whether we can drop the vmemmap pages
+ * that represent them.
+ * (pfn ranges are literally start and end, not start and end+1)
+ * see the array of vmemmap pfns and the pfns they represent: gvmem_pfns
+ * Return 1 for delete, 0 for not to delete.
+ */
+int
+find_vmemmap_pages(unsigned long startpfn, unsigned long endpfn, unsigned long *vmappfn,
+ unsigned long *nmapnpfns)
+{
+ int i;
+ long npfns_offset, vmemmap_offset, vmemmap_pfns, start_vmemmap_pfn;
+ long npages, end_vmemmap_pfn;
+ struct vmap_pfns *vmapp;
+ int pagesize = info->page_size;
+
+ for (i = 0; i < nr_gvmem_pfns; i++) {
+ vmapp = gvmem_pfns + i;
+ if ((startpfn >= vmapp->rep_pfn_start) &&
+ (endpfn <= vmapp->rep_pfn_end)) {
+ npfns_offset = startpfn - vmapp->rep_pfn_start;
+ vmemmap_offset = npfns_offset * size_table.page;
+ // round up to a page boundary
+ if (vmemmap_offset % pagesize)
+ vmemmap_offset += (pagesize - (vmemmap_offset % pagesize));
+ vmemmap_pfns = vmemmap_offset / pagesize;
+ start_vmemmap_pfn = vmapp->vmap_pfn_start + vmemmap_pfns;
+ *vmappfn = start_vmemmap_pfn;
+
+ npfns_offset = endpfn - vmapp->rep_pfn_start;
+ vmemmap_offset = npfns_offset * size_table.page;
+ // round down to page boundary
+ vmemmap_offset -= (vmemmap_offset % pagesize);
+ vmemmap_pfns = vmemmap_offset / pagesize;
+ end_vmemmap_pfn = vmapp->vmap_pfn_start + vmemmap_pfns;
+ npages = end_vmemmap_pfn - start_vmemmap_pfn;
+ if (npages == 0)
+ return 0;
+ *nmapnpfns = npages;
+ return 1;
+ }
+ }
+ return 0;
+}
+
+/*
+ * Finalize the structure for saving pfn's to be deleted.
+ */
+void
+finalize_save_control()
+{
+ free(sc.sc_buf);
+ close(sc.sc_fd);
+ return;
+}
+
+/*
+ * Reset the structure for saving pfn's to be deleted so that it can be read
+ */
+void
+reset_save_control()
+{
+ int i;
+ if (sc.sc_bufposition == 0)
+ return;
+
+ i = write(sc.sc_fd, sc.sc_buf, sc.sc_buflen);
+ if (i != sc.sc_buflen) {
+ fprintf(stderr, "reset: Can't write a page to %s\n",
+ sc.sc_filename);
+ exit(1);
+ }
+ sc.sc_filelen += sc.sc_bufposition;
+
+ if (lseek(sc.sc_fd, 0, SEEK_SET) < 0) {
+ fprintf(stderr, "Can't seek the pfn file %s).", sc.sc_filename);
+ exit(1);
+ }
+ sc.sc_fileposition = 0;
+ sc.sc_bufposition = sc.sc_buflen; /* trigger 1st read */
+ return;
+}
+
+/*
+ * Initialize the structure for saving pfn's to be deleted.
+ */
+void
+init_save_control()
+{
+ int flags;
+ char *filename;
+
+ filename = malloc(50);
+ *filename = '\0';
+ strcpy(filename, root_dir);
+ strcat(filename, "/");
+ strcat(filename, "makedumpfilepfns");
+ sc.sc_filename = filename;
+ flags = O_RDWR|O_CREAT|O_TRUNC;
+ if ((sc.sc_fd = open(sc.sc_filename, flags, S_IRUSR|S_IWUSR)) < 0) {
+ fprintf(stderr, "Can't open the pfn file %s.\n",
+ sc.sc_filename);
+ exit(1);
+ }
+ unlink(sc.sc_filename);
+
+ sc.sc_buf= malloc(blocksize);
+ if (!sc.sc_buf) {
+ fprintf(stderr, "Can't allocate a page for pfn buf.\n");
+ exit(1);
+ }
+ sc.sc_buflen = blocksize;
+ sc.sc_bufposition = 0;
+ sc.sc_fileposition = 0;
+ sc.sc_filelen = 0;
+}
+
+/*
+ * Save a starting pfn and number of pfns for later delete from bitmap.
+ */
+void
+save_deletes(unsigned long startpfn, unsigned long numpfns)
+{
+ int i;
+ struct sc_entry *scp;
+
+ if (sc.sc_bufposition == sc.sc_buflen) {
+ i = write(sc.sc_fd, sc.sc_buf, sc.sc_buflen);
+ if (i != sc.sc_buflen) {
+ fprintf(stderr, "save: Can't write a page to %s\n",
+ sc.sc_filename);
+ exit(1);
+ }
+ sc.sc_filelen += sc.sc_buflen;
+ sc.sc_bufposition = 0;
+ }
+ scp = (struct sc_entry *)(sc.sc_buf + sc.sc_bufposition);
+ scp->startpfn = startpfn;
+ scp->numpfns = numpfns;
+ sc.sc_bufposition += sizeof(struct sc_entry);
+}
+
+/*
+ * Get a starting pfn and number of pfns for delete from bitmap.
+ * Return 0 for success, 1 for 'no more'
+ */
+int
+get_deletes(unsigned long *startpfn, unsigned long *numpfns)
+{
+ int i;
+ struct sc_entry *scp;
+
+ if (sc.sc_fileposition >= sc.sc_filelen) {
+ return 1;
+ }
+
+ if (sc.sc_bufposition == sc.sc_buflen) {
+ i = read(sc.sc_fd, sc.sc_buf, sc.sc_buflen);
+ if (i <= 0) {
+ fprintf(stderr, "Can't read a page from %s.\n", sc.sc_filename);
+ exit(1);
+ }
+ sc.sc_bufposition = 0;
+ }
+ scp = (struct sc_entry *)(sc.sc_buf + sc.sc_bufposition);
+ *startpfn = scp->startpfn;
+ *numpfns = scp->numpfns;
+ sc.sc_bufposition += sizeof(struct sc_entry);
+ sc.sc_fileposition += sizeof(struct sc_entry);
+ return 0;
+}
+
+/*
+ * Find the big holes in bitmap2; they represent ranges for which
+ * we do not need page structures.
+ * Bitmap1 is a map of dumpable (i.e existing) pages.
+ * They must only be pages that exist, so they will be 0 bits
+ * in the 2nd bitmap but 1 bits in the 1st bitmap.
+ * For speed, only worry about whole words full of bits.
+ */
+void
+find_unused_vmemmap_pages(void)
+{
+ struct dump_bitmap *bitmap1 = info->bitmap1;
+ struct dump_bitmap *bitmap2 = info->bitmap2;
+ unsigned long long pfn;
+ unsigned long *lp1, *lp2, startpfn, endpfn;
+ unsigned long vmapstartpfn, vmapnumpfns;
+ int i, sz, numpages=0, did_deletes;
+ int startword, numwords, do_break=0;
+ long deleted_pages = 0;
+ off_t new_offset1, new_offset2;
+
+ /* read each block of both bitmaps */
+ for (pfn = 0; pfn < info->max_mapnr; pfn += PFN_BUFBITMAP) { /* size in bits */
+ numpages++;
+ did_deletes = 0;
+ new_offset1 = bitmap1->offset + BUFSIZE_BITMAP * (pfn / PFN_BUFBITMAP);
+ if (lseek(bitmap1->fd, new_offset1, SEEK_SET) < 0 ) {
+ ERRMSG("Can't seek the bitmap(%s). %s\n",
+ bitmap1->file_name, strerror(errno));
+ return;
+ }
+ if (read(bitmap1->fd, bitmap1->buf, BUFSIZE_BITMAP) != BUFSIZE_BITMAP) {
+ ERRMSG("Can't read the bitmap(%s). %s\n",
+ bitmap1->file_name, strerror(errno));
+ return;
+ }
+ bitmap1->no_block = pfn / PFN_BUFBITMAP;
+
+ new_offset2 = bitmap2->offset + BUFSIZE_BITMAP * (pfn / PFN_BUFBITMAP);
+ if (lseek(bitmap2->fd, new_offset2, SEEK_SET) < 0 ) {
+ ERRMSG("Can't seek the bitmap(%s). %s\n",
+ bitmap2->file_name, strerror(errno));
+ return;
+ }
+ if (read(bitmap2->fd, bitmap2->buf, BUFSIZE_BITMAP) != BUFSIZE_BITMAP) {
+ ERRMSG("Can't read the bitmap(%s). %s\n",
+ bitmap2->file_name, strerror(errno));
+ return;
+ }
+ bitmap2->no_block = pfn / PFN_BUFBITMAP;
+
+ /* process this one page of both bitmaps at a time */
+ lp1 = (unsigned long *)bitmap1->buf;
+ lp2 = (unsigned long *)bitmap2->buf;
+ /* sz is words in the block */
+ sz = BUFSIZE_BITMAP / sizeof(unsigned long);
+ startword = -1;
+ for (i = 0; i < sz; i++, lp1++, lp2++) {
+ /* for each whole word in the block */
+ /* deal in full 64-page chunks only */
+ if (*lp1 == 0xffffffffffffffffUL) {
+ if (*lp2 == 0) {
+ /* we are in a series we want */
+ if (startword == -1) {
+ /* starting a new group */
+ startword = i;
+ }
+ } else {
+ /* we hit a used page */
+ if (startword >= 0)
+ do_break = 1;
+ }
+ } else {
+ /* we hit a hole in real memory, or part of one */
+ if (startword >= 0)
+ do_break = 1;
+ }
+ if (do_break) {
+ do_break = 0;
+ if (startword >= 0) {
+ numwords = i - startword;
+ /* 64 bits represents 64 page structs, which
+ are not even one page of them (takes
+ at least 73) */
+ if (numwords > 1) {
+ startpfn = pfn +
+ (startword * BITS_PER_WORD);
+ /* pfn ranges are literally start and end,
+ not start and end + 1 */
+ endpfn = startpfn +
+ (numwords * BITS_PER_WORD) - 1;
+ if (find_vmemmap_pages(startpfn, endpfn,
+ &vmapstartpfn, &vmapnumpfns)) {
+ save_deletes(vmapstartpfn,
+ vmapnumpfns);
+ deleted_pages += vmapnumpfns;
+ did_deletes = 1;
+ }
+ }
+ }
+ startword = -1;
+ }
+ }
+ if (startword >= 0) {
+ numwords = i - startword;
+ if (numwords > 1) {
+ startpfn = pfn + (startword * BITS_PER_WORD);
+ /* pfn ranges are literally start and end,
+ not start and end + 1 */
+ endpfn = startpfn + (numwords * BITS_PER_WORD) - 1;
+ if (find_vmemmap_pages(startpfn, endpfn,
+ &vmapstartpfn, &vmapnumpfns)) {
+ save_deletes(vmapstartpfn, vmapnumpfns);
+ deleted_pages += vmapnumpfns;
+ did_deletes = 1;
+ }
+ }
+ }
+ }
+ PROGRESS_MSG("\nExcluded %ld unused vmemmap pages\n", deleted_pages);
+
+ return;
+}
+
+/*
+ * Retrieve the list of pfn's and delete them from bitmap2;
+ */
+void
+delete_unused_vmemmap_pages(void)
+{
+ unsigned long startpfn, numpfns, pfn, i;
+
+ while (!get_deletes(&startpfn, &numpfns)) {
+ for (i = 0, pfn = startpfn; i < numpfns; i++, pfn++) {
+ clear_bit_on_2nd_bitmap_for_kernel(pfn, (struct cycle *)0);
+ // note that this is never to be used in cyclic mode!
+ }
+ }
+ return;
+}
+
int
create_2nd_bitmap(void)
{
@@ -5276,6 +5593,15 @@ create_2nd_bitmap(void)
if (!sync_2nd_bitmap())
return FALSE;
+ /* --exclude-unused-vm means exclude vmemmap page structures for unused pages */
+ if (excludevmflag) {
+ init_save_control();
+ find_unused_vmemmap_pages();
+ reset_save_control();
+ delete_unused_vmemmap_pages();
+ finalize_save_control();
+ }
+
return TRUE;
}
@@ -5792,8 +6118,13 @@ write_kdump_header(void)
dh->max_mapnr = MIN(info->max_mapnr, UINT_MAX);
dh->nr_cpus = get_nr_cpus();
dh->bitmap_blocks = divideup(info->len_bitmap, dh->block_size);
+ blocksize = dh->block_size;
memcpy(&dh->timestamp, &info->timestamp, sizeof(dh->timestamp));
memcpy(&dh->utsname, &info->system_utsname, sizeof(dh->utsname));
+
+ if (excludevmflag)
+ dh->status |= DUMP_DH_EXCLUDED_VMEMMAP;
+
if (info->flag_compress & DUMP_DH_COMPRESSED_ZLIB)
dh->status |= DUMP_DH_COMPRESSED_ZLIB;
#ifdef USELZO
@@ -8624,6 +8955,315 @@ writeout_multiple_dumpfiles(void)
return ret;
}
+/*
+ * Scan the kernel page table for the pfn's of the page structs
+ * Place them in array gvmem_pfns[nr_gvmem_pfns]
+ */
+void
+find_vmemmap()
+{
+ int i, verbose = 0;
+ int pgd_index, pud_index;
+ int start_range = 1;
+ int num_pmds=0, num_pmds_valid=0;
+ int break_in_valids, break_after_invalids;
+ int do_break, done = 0;
+ int last_valid=0, last_invalid=0;
+ int pagestructsize, structsperhpage, hugepagesize;
+ long page_structs_per_pud;
+ long num_puds, groups = 0;
+ long pgdindex, pudindex, pmdindex;
+ long vaddr, vaddr_base;
+ long rep_pfn_start = 0, rep_pfn_end = 0;
+ unsigned long init_level4_pgt;
+ unsigned long max_paddr, high_pfn;
+ unsigned long pgd_addr, pud_addr, pmd_addr;
+ unsigned long *pgdp, *pudp, *pmdp;
+ unsigned long pud_page[PTRS_PER_PUD];
+ unsigned long pmd_page[PTRS_PER_PMD];
+ unsigned long vmap_offset_start = 0, vmap_offset_end = 0;
+ unsigned long pmd, tpfn;
+ unsigned long pvaddr = 0;
+ unsigned long data_addr = 0, last_data_addr = 0, start_data_addr = 0;
+ /*
+ * data_addr is the paddr of the page holding the page structs.
+ * We keep lists of contiguous pages and the pfn's that their
+ * page structs represent.
+ * start_data_addr and last_data_addr mark start/end of those
+ * contiguous areas.
+ * An area descriptor is vmap start/end pfn and rep start/end
+ * of the pfn's represented by the vmap start/end.
+ */
+ struct vmap_pfns *vmapp, *vmaphead = NULL, *cur, *tail;
+
+ init_level4_pgt = SYMBOL(init_level4_pgt);
+ if (init_level4_pgt == NOT_FOUND_SYMBOL) {
+ fprintf(stderr, "init_level4_pgt not found\n");
+ return;
+ }
+ pagestructsize = size_table.page;
+ hugepagesize = PTRS_PER_PMD * info->page_size;
+ vaddr_base = info->vmemmap_start;
+ vaddr = vaddr_base;
+ max_paddr = get_max_paddr();
+ /*
+ * the page structures are mapped at VMEMMAP_START (info->vmemmap_start)
+ * for max_paddr >> 12 page structures
+ */
+ high_pfn = max_paddr >> 12;
+ pgd_index = pgd4_index(vaddr_base);
+ pud_index = pud_index(vaddr_base);
+ pgd_addr = vaddr_to_paddr(init_level4_pgt); /* address of pgd */
+ pgd_addr += pgd_index * sizeof(unsigned long);
+ page_structs_per_pud = (PTRS_PER_PUD * PTRS_PER_PMD * info->page_size) /
+ pagestructsize;
+ num_puds = (high_pfn + page_structs_per_pud - 1) / page_structs_per_pud;
+ pvaddr = VMEMMAP_START;
+ structsperhpage = hugepagesize / pagestructsize;
+
+ /* outer loop is for pud entries in the pgd */
+ for (pgdindex = 0, pgdp = (unsigned long *)pgd_addr; pgdindex < num_puds;
+ pgdindex++, pgdp++) {
+ /* read the pgd one word at a time, into pud_addr */
+ if (!readmem(PADDR, (unsigned long long)pgdp, (void *)&pud_addr,
+ sizeof(unsigned long))) {
+ ERRMSG("Can't get pgd entry for slot %d.\n", pgd_index);
+ return;
+ }
+ /* mask the pgd entry for the address of the pud page */
+ pud_addr &= PMASK;
+ /* read the entire pud page */
+ if (!readmem(PADDR, (unsigned long long)pud_addr, (void *)pud_page,
+ PTRS_PER_PUD * sizeof(unsigned long))) {
+ ERRMSG("Can't get pud entry for pgd slot %ld.\n", pgdindex);
+ return;
+ }
+ /* step thru each pmd address in the pud page */
+ /* pudp points to an entry in the pud page */
+ for (pudp = (unsigned long *)pud_page, pudindex = 0;
+ pudindex < PTRS_PER_PUD; pudindex++, pudp++) {
+ pmd_addr = *pudp & PMASK;
+ /* read the entire pmd page */
+ if (!readmem(PADDR, pmd_addr, (void *)pmd_page,
+ PTRS_PER_PMD * sizeof(unsigned long))) {
+ ERRMSG("Can't get pud entry for slot %ld.\n", pudindex);
+ return;
+ }
+ /* pmdp points to an entry in the pmd */
+ for (pmdp = (unsigned long *)pmd_page, pmdindex = 0;
+ pmdindex < PTRS_PER_PMD; pmdindex++, pmdp++) {
+ /* linear page position in this page table: */
+ pmd = *pmdp;
+ num_pmds++;
+ tpfn = (pvaddr - VMEMMAP_START) /
+ pagestructsize;
+ if (tpfn >= high_pfn) {
+ done = 1;
+ break;
+ }
+ /*
+ * vmap_offset_start:
+ * Starting logical position in the
+ * vmemmap array for the group stays
+ * constant until a hole in the table
+ * or a break in contiguousness.
+ */
+
+ /*
+ * Ending logical position in the
+ * vmemmap array:
+ */
+ vmap_offset_end += hugepagesize;
+ do_break = 0;
+ break_in_valids = 0;
+ break_after_invalids = 0;
+ /*
+ * We want breaks either when:
+ * - we hit a hole (invalid)
+ * - we discontiguous page is a string of valids
+ */
+ if (pmd) {
+ data_addr = (pmd & PMASK);
+ if (start_range) {
+ /* first-time kludge */
+ start_data_addr = data_addr;
+ last_data_addr = start_data_addr
+ - hugepagesize;
+ start_range = 0;
+ }
+ if (last_invalid) {
+ /* end of a hole */
+ start_data_addr = data_addr;
+ last_data_addr = start_data_addr
+ - hugepagesize;
+ /* trigger update of offset */
+ do_break = 1;
+ }
+ last_valid = 1;
+ last_invalid = 0;
+ /*
+ * we have a gap in physical
+ * contiguousness in the table.
+ */
+ /* ?? consecutive holes will have
+ same data_addr */
+ if (data_addr !=
+ last_data_addr + hugepagesize) {
+ do_break = 1;
+ break_in_valids = 1;
+ }
+ if (verbose)
+ printf("valid: pud %ld pmd %ld pfn %#lx"
+ " pvaddr %#lx pfns %#lx-%lx"
+ " start %#lx end %#lx\n",
+ pudindex, pmdindex,
+ data_addr >> 12,
+ pvaddr, tpfn,
+ tpfn + structsperhpage - 1,
+ vmap_offset_start,
+ vmap_offset_end);
+ num_pmds_valid++;
+ if (!(pmd & PSE)) {
+ printf("vmemmap pmd not huge, abort\n");
+ exit(1);
+ }
+ } else {
+ if (last_valid) {
+ /* this a hole after some valids */
+ do_break = 1;
+ break_in_valids = 1;
+ break_after_invalids = 0;
+ }
+ last_valid = 0;
+ last_invalid = 1;
+ /*
+ * There are holes in this sparsely
+ * populated table; they are 2MB gaps
+ * represented by null pmd entries.
+ */
+ if (verbose)
+ printf("invalid: pud %ld pmd %ld %#lx"
+ " pfns %#lx-%lx start %#lx end"
+ " %#lx\n", pudindex, pmdindex,
+ pvaddr, tpfn,
+ tpfn + structsperhpage - 1,
+ vmap_offset_start,
+ vmap_offset_end);
+ }
+ if (do_break) {
+ /* The end of a hole is not summarized.
+ * It must be the start of a hole or
+ * hitting a discontiguous series.
+ */
+ if (break_in_valids || break_after_invalids) {
+ /*
+ * calculate that pfns
+ * represented by the current
+ * offset in the vmemmap.
+ */
+ /* page struct even partly on this page */
+ rep_pfn_start = vmap_offset_start /
+ pagestructsize;
+ /* ending page struct entirely on
+ this page */
+ rep_pfn_end = ((vmap_offset_end -
+ hugepagesize) / pagestructsize);
+ if (verbose)
+ printf("vmap pfns %#lx-%lx "
+ "represent pfns %#lx-%lx\n\n",
+ start_data_addr >> PAGESHFT,
+ last_data_addr >> PAGESHFT,
+ rep_pfn_start, rep_pfn_end);
+ groups++;
+ vmapp = (struct vmap_pfns *)malloc(
+ sizeof(struct vmap_pfns));
+ /* pfn of this 2MB page of page structs */
+ vmapp->vmap_pfn_start = start_data_addr
+ >> PTE_SHIFT;
+ vmapp->vmap_pfn_end = last_data_addr
+ >> PTE_SHIFT;
+ /* these (start/end) are literal pfns
+ * on this page, not start and end+1 */
+ vmapp->rep_pfn_start = rep_pfn_start;
+ vmapp->rep_pfn_end = rep_pfn_end;
+
+ if (!vmaphead) {
+ vmaphead = vmapp;
+ vmapp->next = vmapp;
+ vmapp->prev = vmapp;
+ } else {
+ tail = vmaphead->prev;
+ vmaphead->prev = vmapp;
+ tail->next = vmapp;
+ vmapp->next = vmaphead;
+ vmapp->prev = tail;
+ }
+ }
+
+ /* update logical position at every break */
+ vmap_offset_start =
+ vmap_offset_end - hugepagesize;
+ start_data_addr = data_addr;
+ }
+
+ last_data_addr = data_addr;
+ pvaddr += hugepagesize;
+ /*
+ * pvaddr is current virtual address
+ * eg 0xffffea0004200000 if
+ * vmap_offset_start is 4200000
+ */
+ }
+ }
+ tpfn = (pvaddr - VMEMMAP_START) / pagestructsize;
+ if (tpfn >= high_pfn) {
+ done = 1;
+ break;
+ }
+ }
+ rep_pfn_start = vmap_offset_start / pagestructsize;
+ rep_pfn_end = (vmap_offset_end - hugepagesize) / pagestructsize;
+ if (verbose)
+ printf("vmap pfns %#lx-%lx represent pfns %#lx-%lx\n\n",
+ start_data_addr >> PAGESHFT, last_data_addr >> PAGESHFT,
+ rep_pfn_start, rep_pfn_end);
+ groups++;
+ vmapp = (struct vmap_pfns *)malloc(sizeof(struct vmap_pfns));
+ vmapp->vmap_pfn_start = start_data_addr >> PTE_SHIFT;
+ vmapp->vmap_pfn_end = last_data_addr >> PTE_SHIFT;
+ vmapp->rep_pfn_start = rep_pfn_start;
+ vmapp->rep_pfn_end = rep_pfn_end;
+ if (!vmaphead) {
+ vmaphead = vmapp;
+ vmapp->next = vmapp;
+ vmapp->prev = vmapp;
+ } else {
+ tail = vmaphead->prev;
+ vmaphead->prev = vmapp;
+ tail->next = vmapp;
+ vmapp->next = vmaphead;
+ vmapp->prev = tail;
+ }
+ if (verbose)
+ printf("num_pmds: %d num_pmds_valid %d\n", num_pmds, num_pmds_valid);
+
+ /* transfer the linked list to an array */
+ cur = vmaphead;
+ gvmem_pfns = (struct vmap_pfns *)malloc(sizeof(struct vmap_pfns) * groups);
+ i = 0;
+ do {
+ vmapp = gvmem_pfns + i;
+ vmapp->vmap_pfn_start = cur->vmap_pfn_start;
+ vmapp->vmap_pfn_end = cur->vmap_pfn_end;
+ vmapp->rep_pfn_start = cur->rep_pfn_start;
+ vmapp->rep_pfn_end = cur->rep_pfn_end;
+ cur = cur->next;
+ free(cur->prev);
+ i++;
+ } while (cur != vmaphead);
+ nr_gvmem_pfns = i;
+}
+
int
create_dumpfile(void)
{
@@ -8636,9 +9276,16 @@ create_dumpfile(void)
if (!get_elf_info(info->fd_memory, info->name_memory))
return FALSE;
}
+ blocksize = info->page_size;
+ if (!blocksize)
+ blocksize = sysconf(_SC_PAGE_SIZE);
if (!initial())
return FALSE;
+ /* create an array of translations from pfn to vmemmap pages */
+ if (excludevmflag)
+ find_vmemmap();
+
print_vtop();
num_retry = 0;
@@ -9795,9 +10442,47 @@ static struct option longopts[] = {
{"non-mmap", no_argument, NULL, OPT_NON_MMAP},
{"mem-usage", no_argument, NULL, OPT_MEM_USAGE},
{"splitblock-size", required_argument, NULL, OPT_SPLITBLOCK_SIZE},
+ {"exclude-unused-vm", no_argument, NULL, OPT_EXCLUDE_UNUSED_VM},
{0, 0, 0, 0}
};
+/* for bitmaps and other scratch files, use the same directory as the dump file */
+void
+set_root_dir()
+{
+ int len1, len2;
+ char *cp1, *cp2;
+
+ len1 = (int)strlen(info->name_dumpfile);
+ if (len1 == 0) {
+ printf("Error: empty info->name_dumpfile: %s\n", info->name_dumpfile);
+ root_dir = "/tmp";
+ return;
+ }
+ root_dir = malloc(len1);
+ cp1 = info->name_dumpfile;
+ cp2 = NULL;
+ while (*cp1 != '\0') {
+ if (*cp1 == '/')
+ cp2 = cp1;
+ cp1++;
+ }
+ if (cp2 == NULL) {
+ /* if path is not absolute, use something reasonable */
+ root_dir = getenv("TMPDIR");
+ if (!root_dir)
+ root_dir = "/tmp";
+ } else {
+ len2 = cp2 - info->name_dumpfile;
+ if (len2 == 0)
+ len2 = 1;
+ if (len2 > len1)
+ printf("ERROR: len2 %d len1 %d\n", len1, len2);
+ strncpy(root_dir, info->name_dumpfile, len2);
+ *(root_dir + len2) = '\0';
+ }
+}
+
int
main(int argc, char *argv[])
{
@@ -9828,7 +10513,7 @@ main(int argc, char *argv[])
info->block_order = DEFAULT_ORDER;
message_level = DEFAULT_MSG_LEVEL;
- while ((opt = getopt_long(argc, argv, "b:cDd:EFfg:hi:lpRvXx:", longopts,
+ while ((opt = getopt_long(argc, argv, "b:cDd:eEFfg:hi:lpRvXx:", longopts,
NULL)) != -1) {
switch (opt) {
case OPT_BLOCK_ORDER:
@@ -9872,6 +10557,10 @@ main(int argc, char *argv[])
info->flag_read_vmcoreinfo = 1;
info->name_vmcoreinfo = optarg;
break;
+ case OPT_EXCLUDE_UNUSED_VM:
+ excludevmflag = 1; /* exclude unused vmemmap pages */
+ info->flag_cyclic = FALSE; /* force create_2nd_bitmap */
+ break;
case OPT_DISKSET:
if (!sadump_add_diskset_info(optarg))
goto out;
@@ -9963,6 +10652,7 @@ main(int argc, char *argv[])
ERRMSG("Elf library out of date!\n");
goto out;
}
+
if (info->flag_generate_vmcoreinfo) {
if (!check_param_for_generating_vmcoreinfo(argc, argv)) {
MSG("Commandline parameter is invalid.\n");
@@ -10057,6 +10747,8 @@ main(int argc, char *argv[])
goto out;
}
+ set_root_dir();
+
if (!create_dumpfile())
goto out;
Index: makedumpfile/diskdump_mod.h
===================================================================
--- makedumpfile.orig/diskdump_mod.h
+++ makedumpfile/diskdump_mod.h
@@ -97,6 +97,7 @@ struct kdump_sub_header {
/* paged is compressed with snappy */
#define DUMP_DH_COMPRESSED_INCOMPLETE 0x8
/* indicate an incomplete dumpfile */
+#define DUMP_DH_EXCLUDED_VMEMMAP 0x10 /* unused vmemmap pages are excluded */
/* descriptor of each page for vmcore */
typedef struct page_desc {
More information about the kexec
mailing list