a bug of "arch/x86_64.c"

Atsushi Kumagai kumagai-atsushi at mxc.nes.nec.co.jp
Wed May 16 00:39:31 EDT 2012


Hello Norbert,

On Mon, 14 May 2012 16:28:49 +0200
"Trapp, Norbert" <norbert.trapp at ts.fujitsu.com> wrote:

> Dear Kumagai-San,
> 
> the relevant functions for xen4 in makedumpfile
> for saving just the xen and dom0 pages are
> exclude_xen4_user_domain and kvtop_xen_x86_64.
> Our current version of kvtop_xen_x86_64 also
> tries to check for 1GB pages and reads the page
> list faster like the crash utility does.
> I am sending you our current versions.
> 
> As was suggested I sent the implementation to the
> xen mailing list last year but didn't get any answer.

I'm sorry for the late reply.

> We check the core file data to find out whether
> it is a xen3 or a xen4 dump and then the xen3
> or xen4 functions get called. Alternatively
> there could be #ifdefs but I do not know how
> the makedumpfile rpms would then be produced.
> Probably the xen3 code should not yet be removed.
> 
> Also we only implemented xen4 for x86_64.
> Maybe there should be an error message for
> other architectures as long as nobody hands in
> an implementation.

Thank you for your explanation.

I think your idea is good, but I can't review the details of your code.
So, could you repost the whole of your current version to kexec-ML as a
series of separate patches, so that you can get advice from the list?

  kexec-ML:
    http://lists.infradead.org/pipermail/kexec/


Thanks
Atsushi Kumagai
 
> Greetings
> Norbert
> 
> /*
>  * Word-size parameters.  The page_info flag bits below are positioned
>  * relative to the most significant bit of an unsigned long, so they
>  * depend on BITS_PER_LONG.
>  */
> #if defined(__i386__)
> #define BITS_PER_LONG 32
> #define BYTES_PER_LONG 4
> #define LONG_BYTEORDER 2
> #elif defined(__x86_64__)
> #define BITS_PER_LONG 64
> #define BYTES_PER_LONG 8
> #define LONG_BYTEORDER 3
> #endif
> 
> /* PG_shift(idx): bit position that lies idx bits below BITS_PER_LONG. */
> #define PG_shift(idx) (BITS_PER_LONG - (idx))
> /* PG_mask(x, idx): the constant x, as unsigned long, shifted to PG_shift(idx). */
> #define PG_mask(x, idx) (x ## UL << PG_shift(idx))
> 
> /* Flags tested against page_info.count_info. */
> #define PGC_allocated       PG_mask(1, 1)
> #define PGC_xen_heap        PG_mask(1, 2)
> #define PGC_page_table      PG_mask(1, 3)
> #define PGC_broken          PG_mask(1, 7)
> #define is_xen_heap_page(page) ((page)->count_info & PGC_xen_heap)
> 
> /* Two-bit page state field of count_info and its four values. */
> #define PGC_state           PG_mask(3, 9)
> #define PGC_state_inuse     PG_mask(0, 9)
> #define PGC_state_offlining PG_mask(1, 9)
> #define PGC_state_offlined  PG_mask(2, 9)
> #define PGC_state_free      PG_mask(3, 9)
> 
> /* Everything below the state field is the reference count. */
> #define PGC_count_width     PG_shift(9)
> #define PGC_count_mask ((1UL<<PGC_count_width)-1)
> 
> /*
>  * PGT_* values: presumably flags of page_info.type_info as in Xen's mm.h
>  * (TODO confirm); none of them is referenced by the code in this mail.
>  */
> #define PGT_none          PG_mask(0, 4)  /* no special uses of this page   */
> #define PGT_l1_page_table PG_mask(1, 4)  /* using as an L1 page table?     */
> #define PGT_l2_page_table PG_mask(2, 4)  /* using as an L2 page table?     */
> #define PGT_l3_page_table PG_mask(3, 4)  /* using as an L3 page table?     */
> #define PGT_l4_page_table PG_mask(4, 4)  /* using as an L4 page table?     */
> #define PGT_seg_desc_page PG_mask(5, 4)  /* using this page in a GDT/LDT?  */
> #define PGT_writable_page PG_mask(7, 4)  /* has writable mappings?         */
> #define PGT_shared_page   PG_mask(8, 4)  /* CoW sharable page              */
> #define PGT_type_mask     PG_mask(15, 4) /* Bits 28-31 or 60-63.           */
> #define PGT_pinned        PG_mask(1, 5)
> #define PGT_validated     PG_mask(1, 6)
> #define PGT_pae_xen_l2    PG_mask(1, 7)
> #define PGT_partial       PG_mask(1, 8)
> #define PGT_locked        PG_mask(1, 9)
> #define PGT_count_width   PG_shift(9)
> #define PGT_count_mask    ((1UL<<PGT_count_width)-1)
> 
> /*
>  * Integer percentage of part relative to whole.  Yields 0 when whole is 0
>  * so that an empty LOAD segment (or an empty dump) cannot trigger a
>  * division by zero in the statistics output.
>  */
> static unsigned long long
> xen4_percent(unsigned long long part, unsigned long long whole)
> {
>         return whole ? part * 100 / whole : 0;
> }
> 
> /*
>  * Walk every pfn of every PT_LOAD segment, read its page_info entry from
>  * the frame table in the dump and clear the pfn in the 2nd bitmap when the
>  * page is:
>  *   - unreadable (address translation, seek or read failed),
>  *   - allocated to a domain other than 0 that is_select_domain() rejects,
>  *   - in state PGC_state_free, or
>  *   - flagged PGC_broken.
>  * Xen heap pages, domain 0 pages, selected-domain pages, in-use pages and
>  * pages of unknown type are kept.
>  *
>  * Per-segment and total statistics are written with DEBUG_MSG.
>  *
>  * Returns TRUE.
>  */
> int
> exclude_xen4_user_domain(void)
> {
>         int i;
>         unsigned long deleted_pages, total_deleted_pages = 0;
>         unsigned long state_free, total_state_free = 0;
>         unsigned long xen_heap, total_xen_heap = 0;
>         unsigned long allocated, total_allocated = 0;
>         unsigned long selected_domain, total_selected_domain = 0;
>         unsigned long not_selected_domain, total_not_selected_domain = 0;
>         unsigned long not_a_page, total_not_a_page = 0;
>         unsigned long page_not_readable, total_page_not_readable = 0;
>         unsigned long unknown_page_type, total_unknown_page_type = 0;
>         unsigned long not_a_page_offset, total_not_a_page_offset = 0;
>         unsigned long broken_pages, total_broken_pages = 0;
>         unsigned long page_in_use, total_page_in_use = 0;
>         unsigned long count_info;
>         unsigned int  _domain;
>         unsigned long page_info_addr;
>         unsigned long long pfn, pfn_end;
>         unsigned long long num_pages, total_num_pages, num_pfn_done, num_one_percent_pfn;
>         struct pt_load_segment *pls;
>         char page_info_local[SIZE(page_info)];
>         int retval;
>         unsigned long long paddr;
>         off_t offset = 0;
>         const off_t failed = (off_t)-1;
> 
>         /*
>          * NOTE: the first half of bitmap is not used for Xen extraction
>          */
>         print_progress(PROGRESS_XEN_DOMAIN, 0, 1);
>         DEBUG_MSG("\nmakedumpfile: exclude_xen4_user_domain start\n");
>         DEBUG_MSG("XEN_VIRT_START       : 0x%016lx\n", XEN_VIRT_START);
>         DEBUG_MSG("XEN_VIRT_END         : 0x%016lx\n", XEN_VIRT_END);
>         DEBUG_MSG("DIRECTMAP_VIRT_START : 0x%016lx\n", DIRECTMAP_VIRT_START);
>         DEBUG_MSG("DIRECTMAP_VIRT_END   : 0x%016lx\n", DIRECTMAP_VIRT_END);
>         DEBUG_MSG("FRAMETABLE_VIRT_START: 0x%016lx\n", FRAMETABLE_VIRT_START);
>         DEBUG_MSG("FRAMETABLE_VIRT_END  : 0x%016lx\n", FRAMETABLE_VIRT_END);
>         DEBUG_MSG("FRAMETABLE_SIZE      : 0x%016lx\n", FRAMETABLE_SIZE);
>         DEBUG_MSG("frame_table_vaddr    : 0x%016lx\n", info->frame_table_vaddr);
>         DEBUG_MSG("SIZE(page_info)      : 0x%016lx\n", SIZE(page_info));
>         DEBUG_MSG("PAGESIZE()           : 0x%016lx\n", PAGESIZE());
>         DEBUG_MSG("_2MB_PAGE_MASK       : 0x%08x\n"  , _2MB_PAGE_MASK);
>         DEBUG_MSG("_PAGE_PSE            : 0x%08x\n"  , _PAGE_PSE);
>         DEBUG_MSG("ENTRY_MASK           : 0x%016llx\n", ENTRY_MASK);
>         DEBUG_MSG("PHYSICAL_PAGE_MASK   : 0x%016lx\n", PHYSICAL_PAGE_MASK);
>         DEBUG_MSG("PGC_state_inuse      : 0x%016lx\n", PGC_state_inuse);
>         DEBUG_MSG("PGC_count_mask       : 0x%016lx\n", PGC_count_mask);
>         DEBUG_MSG("PGC_state            : 0x%016lx\n", PGC_state);
>         DEBUG_MSG("PGC_state_free       : 0x%016lx\n", PGC_state_free);
>         DEBUG_MSG("PGC_allocated        : 0x%016lx\n", PGC_allocated);
>         DEBUG_MSG("PGC_broken           : 0x%016lx\n", PGC_broken);
>         num_pfn_done = 0;
>         total_num_pages = 0;
> 
>         /* First pass: count the pages so that progress can be reported. */
>         DEBUG_MSG("exclude_xen4_user_domain: %d memory LOAD sections\n", info->num_load_memory);
>         DEBUG_MSG("section phys_start   phys_end pfn_start  pfn_end  num_pfn\n");
>         for (i = 0; i < info->num_load_memory; i++) {
>                 pls = &info->pt_load_segments[i];
>                 pfn     = pls->phys_start >> PAGESHIFT();
>                 pfn_end = pls->phys_end >> PAGESHIFT();
>                 total_num_pages += pfn_end - pfn;
>                 DEBUG_MSG("%3d 0x%016llx 0x%016llx %10llu %10llu %10llu\n",
>                         i, pls->phys_start, pls->phys_end, pfn, pfn_end, pfn_end - pfn);
>         }
>         DEBUG_MSG("exclude_xen4_user_domain total_num_pages: %llu\n", total_num_pages);
>         DEBUG_MSG("exclude_xen4_user_domain total size of pages: 0x%llx\n", total_num_pages * SIZE(page_info));
> 
>         /*
>          * Progress is refreshed once per percent.  With fewer than 100
>          * pages the step would be 0 and the modulo below would divide by
>          * zero, so fall back to a step of one page.
>          */
>         num_one_percent_pfn = total_num_pages / 100;
>         if (num_one_percent_pfn == 0)
>                 num_one_percent_pfn = 1;
> 
>         /* Second pass: classify every page of every segment. */
>         for (i = 0; i < info->num_load_memory; i++) {
>                 pls = &info->pt_load_segments[i];
>                 pfn     = pls->phys_start >> PAGESHIFT();
>                 pfn_end = pls->phys_end >> PAGESHIFT();
>                 num_pages    = pfn_end - pfn;
>                 deleted_pages = 0;
>                 state_free = 0;
>                 page_in_use = 0;
>                 xen_heap = 0;
>                 allocated = 0;
>                 selected_domain = 0;
>                 not_selected_domain = 0;
>                 not_a_page = 0;
>                 not_a_page_offset = 0;
>                 page_not_readable = 0;
>                 unknown_page_type = 0;
>                 broken_pages = 0;
> 
>                 DEBUG_MSG("exclude_xen4_user_domain: i: %d/%d pfn_start: 0x%llx pfn_end: 0x%llx num_pfn: %llu\n",
>                         i, info->num_load_memory, pfn, pfn_end, pfn_end - pfn);
>                 for (; pfn < pfn_end; pfn++) {
>                         num_pfn_done++;
>                         if (((message_level & ML_PRINT_DEBUG_MSG) == 0) && ((num_pfn_done % num_one_percent_pfn) == 0)) {
>                                 print_progress(PROGRESS_XEN_DOMAIN, num_pfn_done, total_num_pages);
>                         }
> 
>                         /*
>                          * Fetch this pfn's page_info entry: translate its
>                          * frame-table address, locate it in the dump file
>                          * and read it.  The first failing step is counted
>                          * and leaves retval FALSE.
>                          */
>                         page_info_addr = info->frame_table_vaddr + pfn * SIZE(page_info);
>                         retval = FALSE;
>                         paddr = kvtop_xen(page_info_addr);
>                         if (paddr == NOT_PADDR) {
>                                 ERRMSG("NOT a physical address(%llx) for pfn %llu\n", paddr, pfn);
>                                 not_a_page++;
>                         } else if (!(offset = paddr_to_offset(paddr))) {
>                                 ERRMSG("Can't convert a physical address(%llx) to offset.\n", paddr);
>                                 not_a_page_offset++;
>                         } else if (lseek(info->fd_memory, offset, SEEK_SET) == failed) {
>                                 ERRMSG("Can't seek the dump memory(%s). %s\n", info->name_memory, strerror(errno));
>                                 page_not_readable++;
>                         } else if (read(info->fd_memory, page_info_local, SIZE(page_info)) != SIZE(page_info)) {
>                                 ERRMSG("Can't read the dump memory(%s). %s\n", info->name_memory, strerror(errno));
>                                 page_not_readable++;
>                         } else {
>                                 retval = TRUE;
>                         }
>                         if (retval == FALSE) {
>                                 /* Without page_info the page cannot be classified: drop it. */
>                                 ERRMSG("retval == False\n");
>                                 deleted_pages++;
>                                 clear_bit_on_2nd_bitmap(pfn);
>                                 continue;
>                         }
> 
>                         count_info = *((unsigned long *)(page_info_local + OFFSET(page_info.count_info)));
>                         _domain = *((unsigned int *)(page_info_local + OFFSET(page_info._domain)));
> 
>                         /* The order of the checks below decides the classification. */
>                         if (count_info & PGC_xen_heap) {
>                                 xen_heap++;
>                                 continue;
>                         }
>                         if (count_info & PGC_allocated) {
>                                 allocated++;
>                                 if (_domain == 0)
>                                         continue;
>                                 if (is_select_domain(_domain)) {
>                                         selected_domain++;
>                                         continue;
>                                 }
>                                 not_selected_domain++;
>                                 clear_bit_on_2nd_bitmap(pfn);
>                                 deleted_pages++;
>                                 continue;
>                         }
>                         if ((count_info & PGC_state) == PGC_state_inuse) {
>                                 page_in_use++;
>                                 continue;
>                         }
>                         if ((count_info & PGC_state) == PGC_state_free) {
>                                 state_free++;
>                                 clear_bit_on_2nd_bitmap(pfn);
>                                 deleted_pages++;
>                                 continue;
>                         }
>                         if (count_info & PGC_broken) {
>                                 clear_bit_on_2nd_bitmap(pfn);
>                                 broken_pages++;
>                                 deleted_pages++;
>                                 continue;
>                         }
>                         /* Unknown page type: kept in the dump. */
>                         unknown_page_type++;
>                 }
>                 total_deleted_pages += deleted_pages;
>                 total_not_a_page += not_a_page;
>                 total_not_a_page_offset += not_a_page_offset;
>                 total_page_not_readable += page_not_readable;
>                 total_state_free += state_free;
>                 total_page_in_use += page_in_use;
>                 total_xen_heap += xen_heap;
>                 total_allocated += allocated;
>                 total_selected_domain += selected_domain;
>                 total_not_selected_domain += not_selected_domain;
>                 total_unknown_page_type += unknown_page_type;
>                 total_broken_pages += broken_pages;
>                 DEBUG_MSG("deleted pages               : %10lu of %10llu %3llu%%\n",
>                         deleted_pages, num_pages, xen4_percent(deleted_pages, num_pages));
>                 DEBUG_MSG("       unused page          : %10lu\n", state_free);
>                 DEBUG_MSG("       not dom0 domain page : %10lu\n", not_selected_domain);
>                 DEBUG_MSG("       page address invalid : %10lu\n", not_a_page);
>                 DEBUG_MSG("       not a page offset    : %10lu\n", not_a_page_offset);
>                 DEBUG_MSG("       page not readable    : %10lu\n", page_not_readable);
>                 DEBUG_MSG("       broken page          : %10lu\n", broken_pages);
>                 DEBUG_MSG("saved pages                 : %10llu of %10llu %3llu%%\n",
>                         num_pages - deleted_pages, num_pages, xen4_percent(num_pages - deleted_pages, num_pages));
>                 DEBUG_MSG("       page in use          : %10lu\n", page_in_use);
>                 DEBUG_MSG("       xen heap page        : %10lu\n", xen_heap);
>                 DEBUG_MSG("       dom0 page            : %10lu\n", selected_domain);
>                 DEBUG_MSG("       unknown type page    : %10lu\n", unknown_page_type);
>         }
>         /*
>          * print [100 %]
>          */
>         print_progress(PROGRESS_XEN_DOMAIN, 1, 1);
>         DEBUG_MSG("\n");
>         DEBUG_MSG("total deleted pages               : %10lu of %10llu %3llu%%\n",
>                 total_deleted_pages, total_num_pages, xen4_percent(total_deleted_pages, total_num_pages));
>         DEBUG_MSG("       total unused page          : %10lu\n", total_state_free);
>         DEBUG_MSG("       total not dom0 domain page : %10lu\n", total_not_selected_domain);
>         DEBUG_MSG("       total page address invalid : %10lu\n", total_not_a_page);
>         DEBUG_MSG("       total not a page offset    : %10lu\n", total_not_a_page_offset);
>         DEBUG_MSG("       total page not readable    : %10lu\n", total_page_not_readable);
>         DEBUG_MSG("       total broken page          : %10lu\n", total_broken_pages);
>         DEBUG_MSG("total saved pages                 : %10llu of %10llu %3llu%%\n",
>                 total_num_pages - total_deleted_pages, total_num_pages,
>                 xen4_percent(total_num_pages - total_deleted_pages, total_num_pages));
>         DEBUG_MSG("       total page in use          : %10lu\n", total_page_in_use);
>         DEBUG_MSG("       total xen heap page        : %10lu\n", total_xen_heap);
>         DEBUG_MSG("       total dom0 page            : %10lu\n", total_selected_domain);
>         DEBUG_MSG("       total unknown type page    : %10lu\n", total_unknown_page_type);
>         return TRUE;
> }
> 
> /*
>  * for Xen extraction
>  */
> 
> /*
>  * One-page caches for the page-table walk in kvtop_xen_x86_64().
>  * pml4_page is filled once (pml4_page_read != 0 means it is valid).  For
>  * the three lower levels, last_*_read holds the address of the table
>  * currently stored in the matching buffer; 0 means "nothing cached".
>  */
> int pml4_page_read = 0;
> char pml4_page[4096];
> char pgd_page[4096];
> unsigned long long last_pgd_read = 0;
> char pmd_page[4096];
> unsigned long long last_pmd_read = 0;
> char pte_page[4096];
> unsigned long long last_pte_read = 0;
> 
> /* Fetch the 64-bit entry number idx from a cached page-table page. */
> static unsigned long long
> xen_pt_entry(const char *table, unsigned long long idx)
> {
>         return *(const unsigned long long *)(table + idx * sizeof(unsigned long long));
> }
> 
> /*
>  * Translate a Xen virtual address into an address readable with
>  * readmem(MADDR_XEN, ...).
>  *
>  * Addresses accepted by is_xen_text() or is_direct() are converted
>  * arithmetically, with different bases for Xen major versions below 4.
>  * Every other address is resolved by walking the page tables that start
>  * at SYMBOL(pgd_l4); entries with _PAGE_PSE set at the pgd or pmd level
>  * end the walk early (1GB and 2MB pages).
>  *
>  * Returns the translated address, or NOT_PADDR when kvaddr is not a Xen
>  * address, a table cannot be read or an entry is not present.  The
>  * failing step is reported through DEBUG_MSG as "reason" 1-9.
>  */
> unsigned long long
> kvtop_xen_x86_64(unsigned long kvaddr)
> {
>         unsigned long long entry = 0;
>         unsigned long long pml4_entry, pml4_dirp;
>         unsigned long long pgd_entry, pgd_dirp;
>         unsigned long long pmd_entry, pmd_dirp;
>         int reason;
> 
>         if (!is_xen_vaddr(kvaddr)) {
>                 reason = 1;
>                 goto not_paddr;
>         }
>         if (is_xen_text(kvaddr)) {
>                 if (info->xen_major_version < 4)
>                         return kvaddr - XEN_VIRT_START_XEN3 + info->xen_phys_start;
>                 return kvaddr - XEN_VIRT_START + info->xen_phys_start;
>         }
>         if (is_direct(kvaddr)) {
>                 if (info->xen_major_version < 4)
>                         return kvaddr - DIRECTMAP_VIRT_START_XEN3;
>                 return kvaddr - DIRECTMAP_VIRT_START;
>         }
> 
>         /* Level 4: the top-level table is read once and then reused. */
>         if (pml4_page_read == 0) {
>                 if (!readmem(MADDR_XEN, kvtop_xen_x86_64(SYMBOL(pgd_l4)), pml4_page, PAGESIZE())) {
>                         reason = 2;
>                         goto not_paddr;
>                 }
>                 pml4_page_read = 1;
>         }
>         pml4_entry = xen_pt_entry(pml4_page, pml4_index(kvaddr));
>         if (!(pml4_entry & _PAGE_PRESENT)) {
>                 entry = pml4_entry;
>                 reason = 3;
>                 goto not_paddr;
>         }
> 
>         /* Level 3 */
>         pml4_dirp = pml4_entry & ENTRY_MASK;
>         if (pml4_dirp != last_pgd_read) {
>                 if (!readmem(MADDR_XEN, pml4_dirp, pgd_page, PAGESIZE())) {
>                         /* The buffer may be partly overwritten: forget it. */
>                         last_pgd_read = 0;
>                         reason = 4;
>                         goto not_paddr;
>                 }
>                 last_pgd_read = pml4_dirp;
>         }
>         pgd_entry = xen_pt_entry(pgd_page, pgd_index(kvaddr));
>         if (!(pgd_entry & _PAGE_PRESENT)) {
>                 entry = pgd_entry;
>                 reason = 5;
>                 goto not_paddr;
>         }
>         if (pgd_entry & _PAGE_PSE) { // 1GB page
>                 return (pgd_entry & ENTRY_MASK) + (kvaddr & ((1UL << PGDIR_SHIFT) - 1));
>         }
> 
>         /* Level 2 */
>         pgd_dirp = pgd_entry & ENTRY_MASK;
>         if (pgd_dirp != last_pmd_read) {
>                 if (!readmem(MADDR_XEN, pgd_dirp, pmd_page, PAGESIZE())) {
>                         last_pmd_read = 0;
>                         reason = 6;
>                         goto not_paddr;
>                 }
>                 last_pmd_read = pgd_dirp;
>         }
>         pmd_entry = xen_pt_entry(pmd_page, pmd_index(kvaddr));
>         if (!(pmd_entry & _PAGE_PRESENT)) {
>                 entry = pmd_entry;
>                 reason = 7;
>                 goto not_paddr;
>         }
>         if (pmd_entry & _PAGE_PSE) { // 2MB page
>                 return (PAGEBASE(pmd_entry) & ENTRY_MASK) + (kvaddr & ~_2MB_PAGE_MASK);
>         }
> 
>         /* Level 1 */
>         pmd_dirp = pmd_entry & ENTRY_MASK;
>         if (pmd_dirp != last_pte_read) {
>                 if (!readmem(MADDR_XEN, pmd_dirp, pte_page, PAGESIZE())) {
>                         last_pte_read = 0;
>                         reason = 8;
>                         goto not_paddr;
>                 }
>                 /* Remember the table so that neighbouring addresses reuse it. */
>                 last_pte_read = pmd_dirp;
>         }
>         entry = xen_pt_entry(pte_page, pte_index(kvaddr));
>         if (!(entry & _PAGE_PRESENT)) {
>                 reason = 9;
>                 goto not_paddr;
>         }
> 
>         return (entry & ENTRY_MASK) + (kvaddr & ((1UL << PTE_SHIFT) - 1));
> 
> not_paddr:
>         /* entry is the non-present table entry, or 0 for the other reasons. */
>         DEBUG_MSG("kvtop_xen: NOT_PADDR page 0x%llx from kavaddr: 0x%lx reason: %d\n",
>                 entry, kvaddr, reason);
>         return NOT_PADDR;
> }
> 
> With kind regards
> 
> Norbert Trapp
> PDG ES&S SWE OS 6
> 
> FUJITSU
> Fujitsu Technology Solutions GmbH
> Domagkstraße 28, D-80807 München, Germany
> Tel.: ...
> E-mail: Norbert.Trapp at ts.fujitsu.com
> Web: ts.fujitsu.com
> Company details: ts.fujitsu.com/imprint
> This communication contains information that is confidential, proprietary in nature and/or privileged.  It is for the exclusive use of the intended recipient(s). If you are not the intended recipient(s) or the person responsible for delivering it to the intended recipient(s), please note that any form of dissemination, distribution or copying of this communication is strictly prohibited and may be unlawful. If you have received this communication in error, please immediately notify the sender and delete the original communication. Thank you for your cooperation.
> Please be advised that neither Fujitsu, its affiliates, its employees or agents accept liability for any errors, omissions or damages caused by delays of receipt or by any virus infection in this message or its attachments, or which may otherwise arise as a result of this e-mail transmission.
> 



More information about the kexec mailing list