<br/><br/>Sent from my android device.<br/><br/>-----Original Message-----<br/>From: Takao Indoh &lt;indou.takao@jp.fujitsu.com&gt;<br/>To: linux-kernel@vger.kernel.org; kexec@lists.infradead.org<br/>Cc: ebiederm@xmission.com; tglx@linutronix.de; mingo@redhat.com; hpa@zytor.com; vgoyal@redhat.com; nhorman@tuxdriver.com<br/>Sent: Fri, 13 Aug 2010 03:18 PM<br/>Subject: [PATCH][EFI] Run EFI in physical mode<br/><br/>Hi all,<br/><br/>The attached patch enables EFI to run in physical mode.<br/><br/>Basically EFI is in physical mode at first and it&#039;s switched to virtual<br/>mode after calling SetVirtualAddressMap. By applying this patch, you can<br/>always run EFI in physical mode. And you can also specify &quot;virtefi&quot; as a<br/>kernel boot parameter to run EFI in virtual mode as before. Note that<br/>this patch supports only x86_64.<br/><br/>This is needed to run kexec/kdump on an EFI-booted system. The following is<br/>the original discussion. In this thread, I explained that kdump does not<br/>work because the EFI system table is modified by SetVirtualAddressMap. And<br/>the idea to run EFI in physical mode was proposed. This patch implements<br/>it.<br/><br/>http://marc.info/?l=linux-kernel&amp;m=128018221820234&amp;w=2<br/>&gt; When the 1st kernel boots, EFI system table(efi_system_table_t) is<br/>&gt; modified by SetVirtualAddressMap, which is one of EFI runtime service.<br/>&gt; This runtime changes physical address in EFI system table to virtual<br/>&gt; address.<br/>&gt; <br/>&gt; When the 2nd kernel boots, it also receives the same EFI system table,<br/>&gt; and the address included in it is already virtual address(1st kernel<br/>&gt; rewrote it).  But 2nd kernel does not know that, 2nd kernel thinks it is<br/>&gt; a physical address. This causes problems.<br/><br/>The basic idea of this patch is to create EFI&#039;s own pagetable. This pagetable<br/>maps each physical address of the EFI runtime to the virtual address with the<br/>same value so that we can call it directly. 
For example, physical <br/>address 0x800000 is mapped to virtual address 0x800000. Before calling<br/>EFI runtime, cr3 register is switched to this pagetable, and restored<br/>when we come back from EFI.<br/><br/>Any comments would be appreciated.<br/><br/>Signed-off-by: Takao Indoh &lt;indou.takao@jp.fujitsu.com&gt;<br/><br/>Looks good<br/>Acked-by:  Neil Horman &lt;nhorman@tuxdriver.com&gt;<br/><br/><br/>---<br/> arch/x86/include/asm/efi.h |    3<br/> arch/x86/kernel/efi.c      |  142 ++++++++++++++++++++++++++++++++++-<br/> arch/x86/kernel/efi_32.c   |    4<br/> arch/x86/kernel/efi_64.c   |   92 ++++++++++++++++++++++<br/> include/linux/efi.h        |    1<br/> include/linux/init.h       |    1<br/> init/main.c                |   16 +++<br/> 7 files changed, 254 insertions(+), 5 deletions(-)<br/><br/>diff -Nurp linux-2.6.35.org/arch/x86/include/asm/efi.h linux-2.6.35/arch/x86/include/asm/efi.h<br/>--- linux-2.6.35.org/arch/x86/include/asm/efi.h        2010-08-01 18:11:14.000000000 -0400<br/>+++ linux-2.6.35/arch/x86/include/asm/efi.h        2010-08-13 14:39:25.817104994 -0400<br/>@@ -93,6 +93,9 @@ extern int add_efi_memmap;<br/> extern void efi_reserve_early(void);<br/> extern void efi_call_phys_prelog(void);<br/> extern void efi_call_phys_epilog(void);<br/>+extern void efi_call_phys_prelog_in_physmode(void);<br/>+extern void efi_call_phys_epilog_in_physmode(void);<br/>+extern void efi_pagetable_init(void);<br/> <br/> #ifndef CONFIG_EFI<br/> /*<br/>diff -Nurp linux-2.6.35.org/arch/x86/kernel/efi.c linux-2.6.35/arch/x86/kernel/efi.c<br/>--- linux-2.6.35.org/arch/x86/kernel/efi.c        2010-08-01 18:11:14.000000000 -0400<br/>+++ linux-2.6.35/arch/x86/kernel/efi.c        2010-08-13 14:39:25.819105004 -0400<br/>@@ -57,6 +57,7 @@ struct efi_memory_map memmap;<br/> <br/> static struct efi efi_phys __initdata;<br/> static efi_system_table_t efi_systab __initdata;<br/>+static efi_runtime_services_t phys_runtime;<br/> <br/> static int __init setup_noefi(char *arg)<br/> 
{<br/>@@ -171,7 +172,7 @@ static efi_status_t __init phys_efi_set_<br/>         return status;<br/> }<br/> <br/>-static efi_status_t __init phys_efi_get_time(efi_time_t *tm,<br/>+static efi_status_t __init phys_efi_get_time_early(efi_time_t *tm,<br/>                                              efi_time_cap_t *tc)<br/> {<br/>         efi_status_t status;<br/>@@ -182,6 +183,112 @@ static efi_status_t __init phys_efi_get_<br/>         return status;<br/> }<br/> <br/>+static efi_status_t phys_efi_get_time(efi_time_t *tm,<br/>+                                      efi_time_cap_t *tc)<br/>+{<br/>+        efi_status_t status;<br/>+<br/>+        efi_call_phys_prelog_in_physmode();<br/>+        status = efi_call_phys2((void*)phys_runtime.get_time, tm, tc);<br/>+        efi_call_phys_epilog_in_physmode();<br/>+        return status;<br/>+}<br/>+<br/>+static efi_status_t __init phys_efi_set_time(efi_time_t *tm)<br/>+{<br/>+        efi_status_t status;<br/>+<br/>+        efi_call_phys_prelog_in_physmode();<br/>+        status = efi_call_phys1((void*)phys_runtime.set_time, tm);<br/>+        efi_call_phys_epilog_in_physmode();<br/>+        return status;<br/>+}<br/>+<br/>+static efi_status_t phys_efi_get_wakeup_time(efi_bool_t *enabled,<br/>+                                             efi_bool_t *pending,<br/>+                                             efi_time_t *tm)<br/>+{<br/>+        efi_status_t status;<br/>+<br/>+        efi_call_phys_prelog_in_physmode();<br/>+        status = efi_call_phys3((void*)phys_runtime.get_wakeup_time, enabled,<br/>+                                pending, tm);<br/>+        efi_call_phys_epilog_in_physmode();<br/>+        return status;<br/>+}<br/>+<br/>+static efi_status_t phys_efi_set_wakeup_time(efi_bool_t enabled, efi_time_t *tm)<br/>+{<br/>+        efi_status_t status;<br/>+        efi_call_phys_prelog_in_physmode();<br/>+        status = efi_call_phys2((void*)phys_runtime.set_wakeup_time, enabled,<br/>+                                
tm);<br/>+        efi_call_phys_epilog_in_physmode();<br/>+        return status;<br/>+}<br/>+<br/>+static efi_status_t phys_efi_get_variable(efi_char16_t *name,<br/>+                                          efi_guid_t *vendor,<br/>+                                          u32 *attr,<br/>+                                          unsigned long *data_size,<br/>+                                          void *data)<br/>+{<br/>+        efi_status_t status;<br/>+        efi_call_phys_prelog_in_physmode();<br/>+        status = efi_call_phys5((void*)phys_runtime.get_variable, name, vendor,<br/>+                                attr, data_size, data);<br/>+        efi_call_phys_epilog_in_physmode();<br/>+        return status;<br/>+}<br/>+<br/>+static efi_status_t phys_efi_get_next_variable(unsigned long *name_size,<br/>+                                               efi_char16_t *name,<br/>+                                               efi_guid_t *vendor)<br/>+{<br/>+        efi_status_t status;<br/>+<br/>+        efi_call_phys_prelog_in_physmode();<br/>+        status = efi_call_phys3((void*)phys_runtime.get_next_variable,<br/>+                                name_size, name, vendor);<br/>+        efi_call_phys_epilog_in_physmode();<br/>+        return status;<br/>+}<br/>+<br/>+static efi_status_t phys_efi_set_variable(efi_char16_t *name,<br/>+                                          efi_guid_t *vendor,<br/>+                                          unsigned long attr,<br/>+                                          unsigned long data_size,<br/>+                                          void *data)<br/>+{<br/>+        efi_status_t status;<br/>+        efi_call_phys_prelog_in_physmode();<br/>+        status = efi_call_phys5((void*)phys_runtime.set_variable, name,<br/>+                                vendor, attr, data_size, data);<br/>+        efi_call_phys_epilog_in_physmode();<br/>+        return status;<br/>+}<br/>+<br/>+static efi_status_t 
phys_efi_get_next_high_mono_count(u32 *count)<br/>+{<br/>+        efi_status_t status;<br/>+        efi_call_phys_prelog_in_physmode();<br/>+        status = efi_call_phys1((void*)phys_runtime.get_next_high_mono_count,<br/>+                                count);<br/>+        efi_call_phys_epilog_in_physmode();<br/>+        return status;<br/>+}<br/>+<br/>+static void phys_efi_reset_system(int reset_type,<br/>+                                  efi_status_t status,<br/>+                                  unsigned long data_size,<br/>+                                  efi_char16_t *data)<br/>+{<br/>+        efi_call_phys_prelog_in_physmode();<br/>+        efi_call_phys4((void*)phys_runtime.reset_system, reset_type, status,<br/>+                                data_size, data);<br/>+        efi_call_phys_epilog_in_physmode();<br/>+}<br/>+<br/> int efi_set_rtc_mmss(unsigned long nowtime)<br/> {<br/>         int real_seconds, real_minutes;<br/>@@ -434,7 +541,9 @@ void __init efi_init(void)<br/>                  * Make efi_get_time can be called before entering<br/>                  * virtual mode.<br/>                  */<br/>-                efi.get_time = phys_efi_get_time;<br/>+                efi.get_time = phys_efi_get_time_early;<br/>+<br/>+                memcpy(&amp;phys_runtime, runtime, sizeof(efi_runtime_services_t));<br/>         } else<br/>                 printk(KERN_ERR &quot;Could not map the EFI runtime service &quot;<br/>                        &quot;table!&#092;n&quot;);<br/>@@ -465,6 +574,14 @@ void __init efi_init(void)<br/> #if EFI_DEBUG<br/>         print_efi_memmap();<br/> #endif<br/>+<br/>+#ifndef CONFIG_X86_64<br/>+        /*<br/>+         * Only x86_64 supports physical mode as of now. 
Use virtual mode<br/>+         * forcibly.<br/>+         */<br/>+        usevirtefi = 1;<br/>+#endif<br/> }<br/> <br/> static void __init runtime_code_page_mkexec(void)<br/>@@ -578,6 +695,27 @@ void __init efi_enter_virtual_mode(void)<br/>         memmap.map = NULL;<br/> }<br/> <br/>+void __init efi_setup_physical_mode(void)<br/>+{<br/>+#ifdef CONFIG_X86_64<br/>+        efi_pagetable_init();<br/>+#endif<br/>+        efi.get_time = phys_efi_get_time;<br/>+        efi.set_time = phys_efi_set_time;<br/>+        efi.get_wakeup_time = phys_efi_get_wakeup_time;<br/>+        efi.set_wakeup_time = phys_efi_set_wakeup_time;<br/>+        efi.get_variable = phys_efi_get_variable;<br/>+        efi.get_next_variable = phys_efi_get_next_variable;<br/>+        efi.set_variable = phys_efi_set_variable;<br/>+        efi.get_next_high_mono_count =<br/>+                phys_efi_get_next_high_mono_count;<br/>+        efi.reset_system = phys_efi_reset_system;<br/>+        efi.set_virtual_address_map = NULL; /* Not needed */<br/>+<br/>+        early_iounmap(memmap.map, memmap.nr_map * memmap.desc_size);<br/>+        memmap.map = NULL;<br/>+}<br/>+<br/> /*<br/>  * Convenience functions to obtain memory types and attributes<br/>  */<br/>diff -Nurp linux-2.6.35.org/arch/x86/kernel/efi_32.c linux-2.6.35/arch/x86/kernel/efi_32.c<br/>--- linux-2.6.35.org/arch/x86/kernel/efi_32.c        2010-08-01 18:11:14.000000000 -0400<br/>+++ linux-2.6.35/arch/x86/kernel/efi_32.c        2010-08-13 14:39:25.819105004 -0400<br/>@@ -110,3 +110,7 @@ void efi_call_phys_epilog(void)<br/> <br/>         local_irq_restore(efi_rt_eflags);<br/> }<br/>+<br/>+void efi_call_phys_prelog_in_physmode(void) { /* Not supported */ }<br/>+void efi_call_phys_epilog_in_physmode(void) { /* Not supported */ }<br/>+<br/>diff -Nurp linux-2.6.35.org/arch/x86/kernel/efi_64.c linux-2.6.35/arch/x86/kernel/efi_64.c<br/>--- linux-2.6.35.org/arch/x86/kernel/efi_64.c        2010-08-01 18:11:14.000000000 -0400<br/>+++ 
linux-2.6.35/arch/x86/kernel/efi_64.c        2010-08-13 14:39:25.819105004 -0400<br/>@@ -39,7 +39,9 @@<br/> #include &lt;asm/fixmap.h&gt;<br/> <br/> static pgd_t save_pgd __initdata;<br/>-static unsigned long efi_flags __initdata;<br/>+static unsigned long efi_flags;<br/>+static pgd_t efi_pgd[PTRS_PER_PGD] __page_aligned_bss;<br/>+static unsigned long save_cr3;<br/> <br/> static void __init early_mapping_set_exec(unsigned long start,<br/>                                           unsigned long end,<br/>@@ -98,6 +100,19 @@ void __init efi_call_phys_epilog(void)<br/>         early_runtime_code_mapping_set_exec(0);<br/> }<br/> <br/>+void efi_call_phys_prelog_in_physmode(void)<br/>+{<br/>+        local_irq_save(efi_flags);<br/>+        save_cr3 = read_cr3();<br/>+        write_cr3(virt_to_phys(efi_pgd));<br/>+}<br/>+<br/>+void efi_call_phys_epilog_in_physmode(void)<br/>+{<br/>+        write_cr3(save_cr3);<br/>+        local_irq_restore(efi_flags);<br/>+}<br/>+<br/> void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size,<br/>                                  u32 type)<br/> {<br/>@@ -112,3 +127,78 @@ void __iomem *__init efi_ioremap(unsigne<br/> <br/>         return (void __iomem *)__va(phys_addr);<br/> }<br/>+<br/>+static pud_t *fill_pud(pgd_t *pgd, unsigned long vaddr)<br/>+{<br/>+        if (pgd_none(*pgd)) {<br/>+                pud_t *pud = (pud_t *)get_zeroed_page(GFP_ATOMIC);<br/>+                set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(pud)));<br/>+                if (pud != pud_offset(pgd, 0))<br/>+                        printk(KERN_ERR &quot;EFI PAGETABLE BUG #00! 
%p &lt;-&gt; %p&#092;n&quot;,<br/>+                               pud, pud_offset(pgd, 0));<br/>+        }<br/>+        return pud_offset(pgd, vaddr);<br/>+}<br/>+<br/>+static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr)<br/>+{<br/>+        if (pud_none(*pud)) {<br/>+                pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_ATOMIC);<br/>+                set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd)));<br/>+                if (pmd != pmd_offset(pud, 0))<br/>+                        printk(KERN_ERR &quot;EFI PAGETABLE BUG #01! %p &lt;-&gt; %p&#092;n&quot;,<br/>+                               pmd, pmd_offset(pud, 0));<br/>+        }<br/>+        return pmd_offset(pud, vaddr);<br/>+}<br/>+<br/>+static pte_t *fill_pte(pmd_t *pmd, unsigned long vaddr)<br/>+{<br/>+        if (pmd_none(*pmd)) {<br/>+                pte_t *pte = (pte_t *)get_zeroed_page(GFP_ATOMIC);<br/>+                set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte)));<br/>+                if (pte != pte_offset_kernel(pmd, 0))<br/>+                        printk(KERN_ERR &quot;EFI PAGETABLE BUG #02!&#092;n&quot;);<br/>+        }<br/>+        return pte_offset_kernel(pmd, vaddr);<br/>+}<br/>+<br/>+void __init efi_pagetable_init(void)<br/>+{<br/>+        efi_memory_desc_t *md;<br/>+        unsigned long size;<br/>+        u64 start_pfn, end_pfn, pfn, vaddr;<br/>+        void *p;<br/>+        pgd_t *pgd;<br/>+        pud_t *pud;<br/>+        pmd_t *pmd;<br/>+        pte_t *pte;<br/>+<br/>+        memset(efi_pgd, 0, sizeof(efi_pgd));<br/>+        for (p = memmap.map; p &lt; memmap.map_end; p += memmap.desc_size) {<br/>+                md = p;<br/>+                if (!(md-&gt;type &amp; EFI_RUNTIME_SERVICES_CODE) &amp;&amp;<br/>+                    !(md-&gt;type &amp; EFI_RUNTIME_SERVICES_DATA))<br/>+                        continue;<br/>+<br/>+                start_pfn = md-&gt;phys_addr &gt;&gt; PAGE_SHIFT;<br/>+                size = md-&gt;num_pages &lt;&lt; EFI_PAGE_SHIFT;<br/>+                end_pfn = 
PFN_UP(md-&gt;phys_addr + size);<br/>+<br/>+                for (pfn = start_pfn; pfn &lt;= end_pfn; pfn++) {<br/>+                        vaddr = pfn &lt;&lt; PAGE_SHIFT;<br/>+                        pgd = efi_pgd + pgd_index(vaddr);<br/>+                        pud = fill_pud(pgd, vaddr);<br/>+                        pmd = fill_pmd(pud, vaddr);<br/>+                        pte = fill_pte(pmd, vaddr);<br/>+                        if (md-&gt;type &amp; EFI_RUNTIME_SERVICES_CODE)<br/>+                                set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC));<br/>+                        else<br/>+                                set_pte(pte, pfn_pte(pfn, PAGE_KERNEL));<br/>+                }<br/>+        }<br/>+        pgd = efi_pgd + pgd_index(PAGE_OFFSET);<br/>+        set_pgd(pgd, *pgd_offset_k(PAGE_OFFSET));<br/>+        pgd = efi_pgd + pgd_index(__START_KERNEL_map);<br/>+        set_pgd(pgd, *pgd_offset_k(__START_KERNEL_map));<br/>+}<br/>diff -Nurp linux-2.6.35.org/include/linux/efi.h linux-2.6.35/include/linux/efi.h<br/>--- linux-2.6.35.org/include/linux/efi.h        2010-08-01 18:11:14.000000000 -0400<br/>+++ linux-2.6.35/include/linux/efi.h        2010-08-13 14:39:25.820105006 -0400<br/>@@ -290,6 +290,7 @@ extern void efi_map_pal_code (void);<br/> extern void efi_memmap_walk (efi_freemem_callback_t callback, void *arg);<br/> extern void efi_gettimeofday (struct timespec *ts);<br/> extern void efi_enter_virtual_mode (void);        /* switch EFI to virtual mode, if possible */<br/>+extern void efi_setup_physical_mode(void);<br/> extern u64 efi_get_iobase (void);<br/> extern u32 efi_mem_type (unsigned long phys_addr);<br/> extern u64 efi_mem_attributes (unsigned long phys_addr);<br/>diff -Nurp linux-2.6.35.org/include/linux/init.h linux-2.6.35/include/linux/init.h<br/>--- linux-2.6.35.org/include/linux/init.h        2010-08-01 18:11:14.000000000 -0400<br/>+++ linux-2.6.35/include/linux/init.h        2010-08-13 14:39:25.820105006 -0400<br/>@@ -142,6 +142,7 @@ 
extern int do_one_initcall(initcall_t fn<br/> extern char __initdata boot_command_line[];<br/> extern char *saved_command_line;<br/> extern unsigned int reset_devices;<br/>+extern unsigned int usevirtefi;<br/> <br/> /* used by init/main.c */<br/> void setup_arch(char **);<br/>diff -Nurp linux-2.6.35.org/init/main.c linux-2.6.35/init/main.c<br/>--- linux-2.6.35.org/init/main.c        2010-08-01 18:11:14.000000000 -0400<br/>+++ linux-2.6.35/init/main.c        2010-08-13 14:39:25.820105006 -0400<br/>@@ -200,6 +200,14 @@ static int __init set_reset_devices(char<br/> <br/> __setup(&quot;reset_devices&quot;, set_reset_devices);<br/> <br/>+unsigned int usevirtefi;<br/>+static int __init set_virt_efi(char *str)<br/>+{<br/>+        usevirtefi = 1;<br/>+        return 1;<br/>+}<br/>+__setup(&quot;virtefi&quot;, set_virt_efi);<br/>+<br/> static char * argv_init[MAX_INIT_ARGS+2] = { &quot;init&quot;, NULL, };<br/> char * envp_init[MAX_INIT_ENVS+2] = { &quot;HOME=/&quot;, &quot;TERM=linux&quot;, NULL, };<br/> static const char *panic_later, *panic_param;<br/>@@ -676,8 +684,12 @@ asmlinkage void __init start_kernel(void<br/>         pidmap_init();<br/>         anon_vma_init();<br/> #ifdef CONFIG_X86<br/>-        if (efi_enabled)<br/>-                efi_enter_virtual_mode();<br/>+        if (efi_enabled) {<br/>+                if (usevirtefi)<br/>+                        efi_enter_virtual_mode();<br/>+                else<br/>+                        efi_setup_physical_mode();<br/>+        }<br/> #endif<br/>         thread_info_cache_init();<br/>         cred_init();<br/><br/>