[PATCH] arm64: export memblock_reserve()d regions via /proc/iomem

Tyler Baicar tbaicar at codeaurora.org
Thu Apr 26 10:44:04 PDT 2018


On 4/25/2018 9:22 AM, James Morse wrote:
> There has been some confusion around what is necessary to prevent kexec
> overwriting important memory regions. memblock: reserve, or nomap?
> Only memblock nomap regions are reported via /proc/iomem, kexec's
> user-space doesn't know about memblock_reserve()d regions.
>
> Until commit f56ab9a5b73ca ("efi/arm: Don't mark ACPI reclaim memory
> as MEMBLOCK_NOMAP") the ACPI tables were nomap, now they are reserved
> and thus possible for kexec to overwrite with the new kernel or initrd.
> But this was always broken, as the UEFI memory map is also reserved
> and not marked as nomap.
>
> It turns out that while kexec-tools will pick up reserved sections in
> iomem that look like:
> | 80000000-dfffffff : System RAM
> |   81000000-8158ffff : reserved
>
> The reserved section is ignored by its 'locate_hole()' code. To fix
> this, we need to describe memblock_reserved() and nomap regions as
> 'reserved' at the top level:
> | 80000000-80ffffff : System RAM
> | 81000000-8158ffff : reserved
> | 81590000-dfffffff : System RAM
>
> To complicate matters, our existing named sections are described as
> being part of 'System RAM', but they are also memblock_reserve()d.
> We need to keep this in-case something is depending on it. To do this
> involves walking memblock multiple times:
>
> First add the 'System RAM' sections that are memory and not-reserved.
> These may be smaller than a page if part of the page is reserved. In
> this case we want to describe the page as reserved, so we round these
> regions down to the smallest page-size region, which may be empty.
> (We round-up the memblock_reserved() regions to fill in the gaps).
>
> The boundaries for kernel_data are changed because paging_init() punches
> holes in the _sdata -> _edata region, and this code can't add a named
> region that crosses memblock_reserve()d<->normal-memory regions. The
> new helpers will catch any more overlapping regions that occur.
>
> Lastly, we add the memblock_reserved() regions using
> reserve_region_with_split(), which will fill in the gaps between the
> existing named regions. (e.g. the regions occupied by the __init code).
> This call uses the slab allocator, so has to run from an initcall.
>
> Reported-by: Bhupesh Sharma <bhupesh.linux at gmail.com>
> Reported-by: Tyler Baicar <tbaicar at codeaurora.org>
> Suggested-by: Akashi Takahiro <takahiro.akashi at linaro.org>
> Signed-off-by: James Morse <james.morse at arm.com>
> CC: Ard Biesheuvel <ard.biesheuvel at linaro.org>
> CC: Mark Rutland <mark.rutland at arm.com>
Tested-by: Tyler Baicar <tbaicar at codeaurora.org>

This patch resolves the overwriting of the ESRT memory region without the need
to edit/update kexec.

Thanks!
>
> ---
> If we do send this to stable:
> Fixes: d28f6df1305a ("arm64/kexec: Add core kexec support")
>
> If we're happy to modify user-sapce, we can do much neater things.
>
> It looks like UEFI's careful 'memory map not mapped' code had me convinced
> it was nomap.
>
>   arch/arm64/kernel/setup.c | 136 ++++++++++++++++++++++++++++++++++++++--------
>   1 file changed, 113 insertions(+), 23 deletions(-)
>
> diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c
> index 30ad2f085d1f..e82c0d5c70f8 100644
> --- a/arch/arm64/kernel/setup.c
> +++ b/arch/arm64/kernel/setup.c
> @@ -202,45 +202,135 @@ static void __init setup_machine_fdt(phys_addr_t dt_phys)
>   	dump_stack_set_arch_desc("%s (DT)", name);
>   }
>   
> +static struct resource * __init add_standard_resources(phys_addr_t start,
> +						       phys_addr_t end,
> +						       bool reserved)
> +{
> +	struct resource *res;
> +
> +	res = alloc_bootmem_low(sizeof(*res));
> +
> +	if (reserved) {
> +		res->name  = "reserved";
> +		res->flags = IORESOURCE_MEM;
> +	} else {
> +		res->name  = "System RAM";
> +		res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
> +	}
> +	res->start = start;
> +	res->end = end;
> +
> +	if (request_resource_conflict(&iomem_resource, res)) {
> +		pr_warn_once("Attempted to add overlapping resources\n");
> +		return NULL;
> +	}
> +
> +	return res;
> +}
> +
> +static void __init add_named_resources(struct resource *named_resource)
> +{
> +	phys_addr_t start, end;
> +	struct resource *res;
> +
> +	start = __pfn_to_phys(PFN_DOWN(named_resource->start));
> +	end = __pfn_to_phys(PFN_UP(named_resource->end)) - 1;
> +	res = add_standard_resources(start, end, false);
> +	if (res)
> +		request_resource(res, named_resource);
> +}
> +
>   static void __init request_standard_resources(void)
>   {
> +	phys_addr_t start, end;
>   	struct memblock_region *region;
>   	struct resource *res;
> +	u64 i;
> +	int num_res = 0;
>   
>   	kernel_code.start   = __pa_symbol(_text);
>   	kernel_code.end     = __pa_symbol(__init_begin - 1);
>   	kernel_data.start   = __pa_symbol(_sdata);
> -	kernel_data.end     = __pa_symbol(_end - 1);
> +	kernel_data.end     = __pa_symbol(_edata - 1);
>   
> -	for_each_memblock(memory, region) {
> -		res = alloc_bootmem_low(sizeof(*res));
> -		if (memblock_is_nomap(region)) {
> -			res->name  = "reserved";
> -			res->flags = IORESOURCE_MEM;
> -		} else {
> -			res->name  = "System RAM";
> -			res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
> -		}
> -		res->start = __pfn_to_phys(memblock_region_memory_base_pfn(region));
> -		res->end = __pfn_to_phys(memblock_region_memory_end_pfn(region)) - 1;
> +	/*
> +	 * We can't allocate memory while walking free memory, count the number
> +	 * of struct resource's we will need. Round start/end to the smallest
> +	 * page-size region as we round the reserved regions up.
> +	 */
> +	for_each_free_mem_range(i, NUMA_NO_NODE, 0, &start, &end, NULL) {
> +		start = ALIGN(start, PAGE_SIZE);
> +		end = ALIGN_DOWN(end, PAGE_SIZE) - 1;
> +		if (end > start)
> +			num_res++;
> +	}
> +
> +	/* our allocation may split a free memblock */
> +	num_res++;
> +	res = alloc_bootmem_low(num_res * sizeof(*res));
>   
> -		request_resource(&iomem_resource, res);
> +	/*
> +	 * Add the non-reserved memory regions. flag=0 means we skip nomap
> +	 * regions too.
> +	 */
> +	for_each_free_mem_range(i, NUMA_NO_NODE, 0, &start, &end, NULL) {
> +		if (WARN_ON(!num_res))
> +			return;
> +
> +		res->name  = "System RAM";
> +		res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
> +
> +		res->start = ALIGN(start, PAGE_SIZE);
> +		res->end = ALIGN_DOWN(end, PAGE_SIZE) - 1;
> +		if (res->end > res->start) {
> +			request_resource(&iomem_resource, res);
> +			res++;
> +			num_res--;
> +		}
> +	}
>   
> -		if (kernel_code.start >= res->start &&
> -		    kernel_code.end <= res->end)
> -			request_resource(res, &kernel_code);
> -		if (kernel_data.start >= res->start &&
> -		    kernel_data.end <= res->end)
> -			request_resource(res, &kernel_data);
> +	/* Add the named reserved regions and their system-ram parents */
> +	add_named_resources(&kernel_code);
> +	add_named_resources(&kernel_data);
>   #ifdef CONFIG_KEXEC_CORE
> -		/* Userspace will find "Crash kernel" region in /proc/iomem. */
> -		if (crashk_res.end && crashk_res.start >= res->start &&
> -		    crashk_res.end <= res->end)
> -			request_resource(res, &crashk_res);
> +	if (crashk_res.end)
> +		add_named_resources(&crashk_res);
>   #endif
> +
> +	/* Add the nomap regions */
> +	for_each_memblock(memory, region) {
> +		if (!memblock_is_nomap(region))
> +			continue;
> +
> +		start = __pfn_to_phys(memblock_region_memory_base_pfn(region));
> +		end = __pfn_to_phys(memblock_region_memory_end_pfn(region)) - 1;
> +		add_standard_resources(start, end, true);
>   	}
>   }
>   
> +static int __init reserve_memblock_reserved_regions(void)
> +{
> +	phys_addr_t start, end, roundup_end = 0;
> +	u64 i;
> +
> +	for_each_reserved_mem_region(i, &start, &end) {
> +		if (end <= roundup_end)
> +			continue; /* done already */
> +
> +		start = __pfn_to_phys(PFN_DOWN(start));
> +		end = __pfn_to_phys(PFN_UP(end)) - 1;
> +		roundup_end = end;
> +
> +		reserve_region_with_split(&iomem_resource, start, end,
> +					  "reserved");
> +	}
> +
> +	return 0;
> +}
> +/* reserve_region_with_split() requires the slab allocator: */
> +arch_initcall(reserve_memblock_reserved_regions);
> +
> +
>   u64 __cpu_logical_map[NR_CPUS] = { [0 ... NR_CPUS-1] = INVALID_HWID };
>   
>   void __init setup_arch(char **cmdline_p)

-- 
Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm Technologies, Inc.
Qualcomm Technologies, Inc. is a member of the Code Aurora Forum,
a Linux Foundation Collaborative Project.




More information about the linux-arm-kernel mailing list