[PATCH] makedumpfile: Petr Tesarik's hugepage filtering

Atsushi Kumagai kumagai-atsushi at mxc.nes.nec.co.jp
Thu May 23 21:11:21 EDT 2013


On Wed, 15 May 2013 13:43:59 -0500
Cliff Wickman <cpw at sgi.com> wrote:

> From: Cliff Wickman <cpw at sgi.com>
> 
> This fix comes from Petr Tesarik <ptesarik at suse.cz>, and was applied
> to the SuSE version of makedumpfile.
> I don't see it in the mmap version.
> It is important to filtering out a great deal of user memory.
> 
> But this from Petr on 5/15:
> > This patch depends on exporting the relevant PG_* flags from the
> > kernel (in VMCOREINFO), and that's where I got stuck, because depending
> > on the number of available bits for the page flags, the kernel either
> > has PG_head and PG_tail, or only PG_compound, so I needed a #ifdef, and
> > the kernel maintainers didn't like the conditional.
> > I can restart the discussion with kernel maintainers and see what I can do
> So I'm not sending to the kexec list until he's ready.

I send this mail to kexec list to open the discussion below.

> His description:
> 
> > This filters out pages that have the compound page flag set.
> > The precise position of the flag(s) depends on the kernel version
> > and configuration, so an additional patch is needed for the kernel.
> > 
> > It is still possible to filter out huge pages on unpatched kernels
> > if the debuginfo is available. The page flag can also be saved
> > into a generated vmcoreinfo file.
> > 
> > To sum it up:
> > 
> > a. any kernel with the appropriate patch includes the PG_xxx values in
> >    VMCOERINFO, and these are automatically picked up by makedumpfile
> >    (planned for a regular SLES11 kernel maint update)
> > b. for unpatched 2.6.26+ kernels, the info can be pre-generated with
> >    "makedumpfile -g", or you can pass the "-x" option with the kernel
> >    debuginfo file when filtering
> >    NOTE: running "makedumpfile" without either "-g" or "-x" will NOT
> >    filter out huge pages; a warning message should be issued
> > c. for pre-2.6.26 kernels, makedumpfile will assume the legacy value of
> >    14 for PG_compound (this also covers SLES10 kernels)
> > d. ancient pre-2.6.13 kernels will not be supported; no change here
> > 
> > Signed-off-by: Petr Tesarik <ptesarik at suse.cz>
> ---
>  makedumpfile.c |   43 ++++++++++++++++++++++++++++++++++++++++++-
>  makedumpfile.h |    8 ++++++++
>  2 files changed, 50 insertions(+), 1 deletion(-)
> 
> Index: makedumpfile.mmap/makedumpfile.c
> ===================================================================
> --- makedumpfile.mmap.orig/makedumpfile.c
> +++ makedumpfile.mmap/makedumpfile.c
> @@ -173,6 +173,13 @@ is_in_same_page(unsigned long vaddr1, un
>  	return FALSE;
>  }
>  
> +/* Test whether flags belong to a huge page */
> +static inline int
> +isHugePage(unsigned long flags)
> +{
> +	return flags & info->hugepage_flags;
> +}
> +
>  #define BITMAP_SECT_LEN 4096
>  static inline int is_dumpable(struct dump_bitmap *, unsigned long long);
>  static inline int is_dumpable_cyclic(char *bitmap, unsigned long long);
> @@ -1092,11 +1099,19 @@ get_structure_info(void)
>  
>  	ENUM_NUMBER_INIT(PG_lru, "PG_lru");
>  	ENUM_NUMBER_INIT(PG_private, "PG_private");
> +	ENUM_NUMBER_INIT(PG_head, "PG_head");
> +	ENUM_NUMBER_INIT(PG_tail, "PG_tail");
> +	ENUM_NUMBER_INIT(PG_compound, "PG_compound");
>  	ENUM_NUMBER_INIT(PG_swapcache, "PG_swapcache");
>  	ENUM_NUMBER_INIT(PG_buddy, "PG_buddy");
>  	ENUM_NUMBER_INIT(PG_slab, "PG_slab");
>  	ENUM_NUMBER_INIT(PG_hwpoison, "PG_hwpoison");
>  
> +	if (NUMBER(PG_head) == NOT_FOUND_NUMBER &&
> +	    NUMBER(PG_compound) == NOT_FOUND_NUMBER)
> +		/* Pre-2.6.26 kernels did not have pageflags */
> +		NUMBER(PG_compound) = PG_compound_ORIGINAL;
> +
>  	ENUM_TYPE_SIZE_INIT(pageflags, "pageflags");
>  
>  	TYPEDEF_SIZE_INIT(nodemask_t, "nodemask_t");
> @@ -1537,6 +1552,12 @@ write_vmcoreinfo_data(void)
>  
>  	WRITE_NUMBER("PG_lru", PG_lru);
>  	WRITE_NUMBER("PG_private", PG_private);
> +	if (NUMBER(PG_head) != NOT_FOUND_NUMBER)
> +		WRITE_NUMBER("PG_head", PG_head);
> +	if (NUMBER(PG_tail) != NOT_FOUND_NUMBER)
> +		WRITE_NUMBER("PG_tail", PG_tail);
> +	if (NUMBER(PG_compound) != NOT_FOUND_NUMBER)
> +		WRITE_NUMBER("PG_compound", PG_compound);
>  	WRITE_NUMBER("PG_swapcache", PG_swapcache);
>  	WRITE_NUMBER("PG_buddy", PG_buddy);
>  	WRITE_NUMBER("PG_slab", PG_slab);
> @@ -1839,6 +1860,9 @@ read_vmcoreinfo(void)
>  
>  	READ_NUMBER("PG_lru", PG_lru);
>  	READ_NUMBER("PG_private", PG_private);
> +	READ_NUMBER("PG_head", PG_head);
> +	READ_NUMBER("PG_tail", PG_tail);
> +	READ_NUMBER("PG_compound", PG_compound);
>  	READ_NUMBER("PG_swapcache", PG_swapcache);
>  	READ_NUMBER("PG_slab", PG_slab);
>  	READ_NUMBER("PG_buddy", PG_buddy);
> @@ -3079,6 +3103,23 @@ sync_bitmap(struct dump_bitmap *bitmap)
>  	off_t offset;
>  	offset = bitmap->offset + BUFSIZE_BITMAP * bitmap->no_block;
>  
> +	/* Get page flags for compound pages */
> +	if (NUMBER(PG_head) != NOT_FOUND_NUMBER) {
> +		info->hugepage_flags = 1UL << NUMBER(PG_head);
> +		if (NUMBER(PG_tail) != NOT_FOUND_NUMBER)
> +			info->hugepage_flags |= 1UL << NUMBER(PG_tail);
> +		else
> +			/* If PG_tail is not explicitly saved, then assume
> +			 * that it immediately follows PG_head.
> +			 */
> +			info->hugepage_flags |= info->hugepage_flags << 1;
> +	} else if (NUMBER(PG_compound) != NOT_FOUND_NUMBER) {
> +		info->hugepage_flags = 1UL << NUMBER(PG_compound);
> +	} else if (info->dump_level & DL_EXCLUDE_USER_DATA) {
> +		MSG("Compound page bit could not be determined: ");
> +		MSG("huge pages will NOT be filtered.\n");
> +	}
> +
>  	/*
>  	 * The bitmap buffer is not dirty, and it is not necessary
>  	 * to write out it.
> @@ -4345,7 +4386,7 @@ __exclude_unnecessary_pages(unsigned lon
>  		 * Exclude the data page of the user process.
>  		 */
>  		else if ((info->dump_level & DL_EXCLUDE_USER_DATA)
> -		    && isAnon(mapping)) {
> +		    && (isAnon(mapping) || isHugePage(flags))) {
>  			if (clear_bit_on_2nd_bitmap_for_kernel(pfn))
>  				pfn_user++;

This code will filter out both transparent huge pages and hugetlbfs pages.
But I think hugetlbfs pages shouldn't be filtered out as anonymous pages
because anonymous pages and hugetlbfs pages are certainly different kind
of pages.

Therefore, I plan to add a new dump level to filter out hugetlbfs pages.
My basic idea is like below:


 		 * Exclude the data page of the user process.
 		 */
                else if ((info->dump_level & DL_EXCLUDE_USER_DATA)
                    && isAnon(mapping)) {
                        if (clear_bit_on_2nd_bitmap_for_kernel(pfn))
                                pfn_user++;
+
+                       if (isPageHead(flags)) {
+                               int i, nr_pages = 1 << compound_order;
+
+                               for (i = 1; i < nr_pages; ++i) {
+                                       if (clear_bit_on_2nd_bitmap_for_kernel(pfn + i))
+                                               pfn_user++;
+                               }
+                               pfn += nr_pages - 2;
+                               mem_map += (nr_pages - 1) * SIZE(page);
+                       }
+               }
+               /*
+                * Exclude the hugetlbfs pages.
+                */
+               else if ((info->dump_level & DL_EXCLUDE_HUGETLBFS_DATA)
+                        && (!isAnon(mapping) && isPageHead(flags))) {
+                       int i, nr_pages = 1 << compound_order;
+
+                       for (i = 0; i < nr_pages; ++i) {
+                               if (clear_bit_on_2nd_bitmap_for_kernel(pfn + i))
+                                       pfn_hugetlb++;
+                       }
+                       pfn += nr_pages - 1;
+                       mem_map += (nr_pages - 1) * SIZE(page);
                }
                /*
                 * Exclude the hwpoison page.


But before that, I would like to hear the opinions about the new dump level
to make sure that the level is worthy or not.


Thanks
Atsushi Kumagai

>  		}
> Index: makedumpfile.mmap/makedumpfile.h
> ===================================================================
> --- makedumpfile.mmap.orig/makedumpfile.h
> +++ makedumpfile.mmap/makedumpfile.h
> @@ -74,6 +74,7 @@ int get_mem_type(void);
>  #define PG_lru_ORIGINAL	 	(5)
>  #define PG_slab_ORIGINAL	(7)
>  #define PG_private_ORIGINAL	(11)	/* Has something at ->private */
> +#define PG_compound_ORIGINAL	(14)	/* Is part of a compound page */
>  #define PG_swapcache_ORIGINAL	(15)	/* Swap page: swp_entry_t in private */
>  
>  #define PAGE_BUDDY_MAPCOUNT_VALUE_v2_6_38	(-2)
> @@ -140,6 +141,9 @@ test_bit(int nr, unsigned long addr)
>  
>  #define isLRU(flags)		test_bit(NUMBER(PG_lru), flags)
>  #define isPrivate(flags)	test_bit(NUMBER(PG_private), flags)
> +#define isHead(flags)		test_bit(NUMBER(PG_head), flags)
> +#define isTail(flags)		test_bit(NUMBER(PG_tail), flags)
> +#define isCompound(flags)	test_bit(NUMBER(PG_compound), flags)
>  #define isSwapCache(flags)	test_bit(NUMBER(PG_swapcache), flags)
>  #define isHWPOISON(flags)	(test_bit(NUMBER(PG_hwpoison), flags) \
>  				&& (NUMBER(PG_hwpoison) != NOT_FOUND_NUMBER))
> @@ -908,6 +912,7 @@ struct DumpInfo {
>  	unsigned long   vmalloc_end;
>  	unsigned long	vmemmap_start;
>  	unsigned long	vmemmap_end;
> +	unsigned long	hugepage_flags;	     /* page flags for a compound page */
>  
>  	/*
>  	 * Filter config file containing filter commands to filter out kernel
> @@ -1377,6 +1382,9 @@ struct number_table {
>  	 */
>  	long	PG_lru;
>  	long	PG_private;
> +	long	PG_head;
> +	long	PG_tail;
> +	long	PG_compound;
>  	long	PG_swapcache;
>  	long	PG_buddy;
>  	long	PG_slab;



More information about the kexec mailing list