[RFC PATCH] kexec, x86/boot: map systab region in identity mapping before accessing it

Baoquan He bhe at redhat.com
Fri Apr 19 01:58:04 PDT 2019


On 04/19/19 at 04:34pm, Kairui Song wrote:
>  /* Locates and clears a region for a new top level page table. */
>  void initialize_identity_maps(void)
>  {
> -	/* If running as an SEV guest, the encryption mask is required. */
> -	set_sev_encryption_mask();
> -
> -	/* Exclude the encryption mask from __PHYSICAL_MASK */
> -	physical_mask &= ~sme_me_mask;
> -
> -	/* Init mapping_info with run-time function/buffer pointers. */
> -	mapping_info.alloc_pgt_page = alloc_pgt_page;
> -	mapping_info.context = &pgt_data;
> -	mapping_info.page_flag = __PAGE_KERNEL_LARGE_EXEC | sme_me_mask;
> -	mapping_info.kernpg_flag = _KERNPG_TABLE;
> -
> -	/*
> -	 * It should be impossible for this not to already be true,
> -	 * but since calling this a second time would rewind the other
> -	 * counters, let's just make sure this is reset too.
> -	 */
> -	pgt_data.pgt_buf_offset = 0;
> -
> -	/*
> -	 * If we came here via startup_32(), cr3 will be _pgtable already
> -	 * and we must append to the existing area instead of entirely
> -	 * overwriting it.
> -	 *
> -	 * With 5-level paging, we use '_pgtable' to allocate the p4d page table,
> -	 * the top-level page table is allocated separately.
> -	 *
> -	 * p4d_offset(top_level_pgt, 0) would cover both the 4- and 5-level
> -	 * cases. On 4-level paging it's equal to 'top_level_pgt'.
> -	 */
> -	top_level_pgt = read_cr3_pa();
> -	if (p4d_offset((pgd_t *)top_level_pgt, 0) == (p4d_t *)_pgtable) {
> -		debug_putstr("booted via startup_32()\n");
> -		pgt_data.pgt_buf = _pgtable + BOOT_INIT_PGT_SIZE;
> -		pgt_data.pgt_buf_size = BOOT_PGT_SIZE - BOOT_INIT_PGT_SIZE;
> -		memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size);
> -	} else {
> -		debug_putstr("booted via startup_64()\n");
> -		pgt_data.pgt_buf = _pgtable;
> -		pgt_data.pgt_buf_size = BOOT_PGT_SIZE;
> -		memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size);
> +	top_level_pgt = early_boot_top_pgt;
> +	if ((p4d_t *)top_level_pgt != (p4d_t *)_pgtable)
>  		top_level_pgt = (unsigned long)alloc_pgt_page(&pgt_data);

Kairui, will you make a patchset to include these changes separately
later on? I don't get the purposes of code changes. E.g here, I
don't know why you introduce a new variable early_boot_top_pgt, and
allocate the page table, even though they have been done in the old 
initialize_identity_maps().

Thanks
Baoquan

> -	}
>  }
>  
>  /*
> @@ -141,8 +41,7 @@ void add_identity_map(unsigned long start, unsigned long size)
>  		return;
>  
>  	/* Build the mapping. */
> -	kernel_ident_mapping_init(&mapping_info, (pgd_t *)top_level_pgt,
> -				  start, end);
> +	add_identity_map_pgd(start, end, top_level_pgt);
>  }
>  
>  /*
> diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
> index c0d6c560df69..6b3548080d15 100644
> --- a/arch/x86/boot/compressed/misc.c
> +++ b/arch/x86/boot/compressed/misc.c
> @@ -345,6 +345,8 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap,
>  	const unsigned long kernel_total_size = VO__end - VO__text;
>  	unsigned long virt_addr = LOAD_PHYSICAL_ADDR;
>  
> +	initialize_pgtable_alloc();
> +
>  	/* Retain x86 boot parameters pointer passed from startup_32/64. */
>  	boot_params = rmode;
>  
> diff --git a/arch/x86/boot/compressed/pgtable.h b/arch/x86/boot/compressed/pgtable.h
> index 6ff7e81b5628..443df2b65fbf 100644
> --- a/arch/x86/boot/compressed/pgtable.h
> +++ b/arch/x86/boot/compressed/pgtable.h
> @@ -16,5 +16,16 @@ extern unsigned long *trampoline_32bit;
>  
>  extern void trampoline_32bit_src(void *return_ptr);
>  
> +extern struct alloc_pgt_data pgt_data;
> +
> +extern unsigned long early_boot_top_pgt;
> +
> +void *alloc_pgt_page(void *context);
> +
> +int add_identity_map_pgd(unsigned long pstart,
> +			 unsigned long pend, unsigned long pgd);
> +
> +void initialize_pgtable_alloc(void);
> +
>  #endif /* __ASSEMBLER__ */
>  #endif /* BOOT_COMPRESSED_PAGETABLE_H */
> diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c
> index f8debf7aeb4c..cd36cf9e6a5c 100644
> --- a/arch/x86/boot/compressed/pgtable_64.c
> +++ b/arch/x86/boot/compressed/pgtable_64.c
> @@ -1,9 +1,30 @@
> +/*
> + * Since we're dealing with identity mappings, physical and virtual
> + * addresses are the same, so override these defines which are ultimately
> + * used by the headers in misc.h.
> + */
> +#define __pa(x)  ((unsigned long)(x))
> +#define __va(x)  ((void *)((unsigned long)(x)))
> +
> +/* No PAGE_TABLE_ISOLATION support needed either: */
> +#undef CONFIG_PAGE_TABLE_ISOLATION
> +
> +#include "misc.h"
> +#include "pgtable.h"
> +#include "../string.h"
> +
>  #include <linux/efi.h>
>  #include <asm/e820/types.h>
>  #include <asm/processor.h>
>  #include <asm/efi.h>
> -#include "pgtable.h"
> -#include "../string.h"
> +
> +/* For handling early ident mapping */
> +#include <asm/init.h>
> +#include <asm/pgtable.h>
> +/* Use the static base for this part of the boot process */
> +#undef __PAGE_OFFSET
> +#define __PAGE_OFFSET __PAGE_OFFSET_BASE
> +#include "../../mm/ident_map.c"
>  
>  /*
>   * __force_order is used by special_insns.h asm code to force instruction
> @@ -14,6 +35,28 @@
>   */
>  unsigned long __force_order;
>  
> +/* Used to track our page table allocation area. */
> +struct alloc_pgt_data {
> +	unsigned char *pgt_buf;
> +	unsigned long pgt_buf_size;
> +	unsigned long pgt_buf_offset;
> +};
> +
> +/* Used to track our allocated page tables. */
> +struct alloc_pgt_data pgt_data;
> +
> +/* Track the first loaded boot page table. */
> +unsigned long early_boot_top_pgt;
> +
> +phys_addr_t physical_mask = (1ULL << __PHYSICAL_MASK_SHIFT) - 1;
> +
> +/*
> + * Mapping information structure passed to kernel_ident_mapping_init().
> + * Due to relocation, pointers must be assigned at run time not build time.
> + */
> +static struct x86_mapping_info mapping_info;
> +
> +/* For handling trampoline. */
>  #define BIOS_START_MIN		0x20000U	/* 128K, less than this is insane */
>  #define BIOS_START_MAX		0x9f000U	/* 640K, absolute maximum */
>  
> @@ -202,3 +245,87 @@ void cleanup_trampoline(void *pgtable)
>  	/* Restore trampoline memory */
>  	memcpy(trampoline_32bit, trampoline_save, TRAMPOLINE_32BIT_SIZE);
>  }
> +
> +/*
> + * Allocates space for a page table entry, using struct alloc_pgt_data
> + * above. Besides the local callers, this is used as the allocation
> + * callback in mapping_info below.
> + */
> +void *alloc_pgt_page(void *context)
> +{
> +	struct alloc_pgt_data *pages = (struct alloc_pgt_data *)context;
> +	unsigned char *entry;
> +
> +	/* Validate there is space available for a new page. */
> +	if (pages->pgt_buf_offset >= pages->pgt_buf_size) {
> +		debug_putstr("out of pgt_buf in " __FILE__ "!?\n");
> +		debug_putaddr(pages->pgt_buf_offset);
> +		debug_putaddr(pages->pgt_buf_size);
> +		return NULL;
> +	}
> +
> +	entry = pages->pgt_buf + pages->pgt_buf_offset;
> +	pages->pgt_buf_offset += PAGE_SIZE;
> +
> +	return entry;
> +}
> +
> +/* Locates and clears a region for update or create page table. */
> +void initialize_pgtable_alloc(void)
> +{
> +	/* If running as an SEV guest, the encryption mask is required. */
> +	set_sev_encryption_mask();
> +
> +	/* Exclude the encryption mask from __PHYSICAL_MASK */
> +	physical_mask &= ~sme_me_mask;
> +
> +	/* Init mapping_info with run-time function/buffer pointers. */
> +	mapping_info.alloc_pgt_page = alloc_pgt_page;
> +	mapping_info.context = &pgt_data;
> +	mapping_info.page_flag = __PAGE_KERNEL_LARGE_EXEC | sme_me_mask;
> +	mapping_info.kernpg_flag = _KERNPG_TABLE;
> +
> +	/*
> +	 * It should be impossible for this not to already be true,
> +	 * but since calling this a second time would rewind the other
> +	 * counters, let's just make sure this is reset too.
> +	 */
> +	pgt_data.pgt_buf_offset = 0;
> +
> +	/*
> +	 * If we came here via startup_32(), cr3 will be _pgtable already
> +	 * and we must append to the existing area instead of entirely
> +	 * overwriting it.
> +	 *
> +	 * With 5-level paging, we use '_pgtable' to allocate the p4d page
> +	 * table, the top-level page table is allocated separately.
> +	 *
> +	 * p4d_offset(early_boot_top_pgt, 0) would cover both the 4- and 5-level
> +	 * cases. On 4-level paging it's equal to 'early_boot_top_pgt'.
> +	 */
> +
> +	early_boot_top_pgt = read_cr3_pa();
> +	early_boot_top_pgt = (unsigned long)p4d_offset(
> +			(pgd_t *)early_boot_top_pgt, 0);
> +	if ((p4d_t *)early_boot_top_pgt == (p4d_t *)_pgtable) {
> +		debug_putstr("booted via startup_32()\n");
> +		pgt_data.pgt_buf = _pgtable + BOOT_INIT_PGT_SIZE;
> +		pgt_data.pgt_buf_size = BOOT_PGT_SIZE - BOOT_INIT_PGT_SIZE;
> +		memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size);
> +	} else {
> +		debug_putstr("booted via startup_64()\n");
> +		pgt_data.pgt_buf = _pgtable;
> +		pgt_data.pgt_buf_size = BOOT_PGT_SIZE;
> +		memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size);
> +	}
> +}
> +
> +/*
> + * Helper for mapping extra memory region in very early stage
> + * before extract and execute the actual kernel
> + */
> +int add_identity_map_pgd(unsigned long pstart, unsigned long pend,
> +			 unsigned long pgd)
> +{
> +	kernel_ident_mapping_init(&mapping_info, (pgd_t *)pgd, pstart, pend);
> +}
> diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h
> index 680c320363db..fb37eb98b65d 100644
> --- a/arch/x86/include/asm/boot.h
> +++ b/arch/x86/include/asm/boot.h
> @@ -33,6 +33,8 @@
>  #ifdef CONFIG_X86_64
>  # define BOOT_STACK_SIZE	0x4000
>  
> +/* Reserve one page for possible extra mapping requirement */
> +# define BOOT_EXTRA_PGT_SIZE	(1*4096)
>  # define BOOT_INIT_PGT_SIZE	(6*4096)
>  # ifdef CONFIG_RANDOMIZE_BASE
>  /*
> @@ -43,12 +45,12 @@
>   * Total is 19 pages.
>   */
>  #  ifdef CONFIG_X86_VERBOSE_BOOTUP
> -#   define BOOT_PGT_SIZE	(19*4096)
> +#   define BOOT_PGT_SIZE	((19 * 4096) + BOOT_EXTRA_PGT_SIZE)
>  #  else /* !CONFIG_X86_VERBOSE_BOOTUP */
> -#   define BOOT_PGT_SIZE	(17*4096)
> +#   define BOOT_PGT_SIZE	((17 * 4096) + BOOT_EXTRA_PGT_SIZE)
>  #  endif
>  # else /* !CONFIG_RANDOMIZE_BASE */
> -#  define BOOT_PGT_SIZE		BOOT_INIT_PGT_SIZE
> +#  define BOOT_PGT_SIZE		(BOOT_INIT_PGT_SIZE + BOOT_EXTRA_PGT_SIZE)
>  # endif
>  
>  #else /* !CONFIG_X86_64 */
> -- 
> 2.20.1
> 



More information about the kexec mailing list