[PATCH] arm64: module: Widen module region to 2 GiB

Mark Rutland mark.rutland at arm.com
Tue Apr 4 08:49:21 PDT 2023


On Tue, Apr 04, 2023 at 03:54:37PM +0200, Ard Biesheuvel wrote:
> Shanker reports that, after loading a 110+ MiB kernel module, no other
> modules can be loaded, in spite of the fact that module PLTs have been
> enabled in the build.
> 
> This is due to the fact, even with module PLTs enabled, the module
> region is dimensioned to be a fixed 128 MiB region, as we simply never
> anticipated the need for supporting modules that huge.
> 
> So let's increase the size of the statically allocated module region to
> 2 GiB, and update the module loading logic so that we prefer loading
> modules in the vicinity of the kernel text, where relative branches can
> be resolved without the need for PLTs. Only when we run out of space
> here (or when CONFIG_RANDOMIZE_MODULE_REGION_FULL is enabled), we will
> fall back to the larger window and allocate from there.
> 
> While at it, let's try to simplify the logic a bit, to make it easier to
> follow:
> - remove the special cases for KASAN, which are no longer needed now
>   that KASAN_VMALLOC is always enabled when KASAN is configured;
> - instead of defining a global module base address, define a global
>   module limit, and work our way down from it.

I find this last change a bit confusing, and I think it'd be preferable to have
explicit start and end variables; more on that below.

> 
> Cc: Shanker Donthineni <sdonthineni at nvidia.com>
> Signed-off-by: Ard Biesheuvel <ardb at kernel.org>
> ---
>  Documentation/arm64/memory.rst  |  8 ++---
>  arch/arm64/include/asm/memory.h |  2 +-
>  arch/arm64/include/asm/module.h | 10 ++++--
>  arch/arm64/kernel/kaslr.c       | 38 +++++++++++------------
>  arch/arm64/kernel/module.c      | 54 ++++++++++++++++-----------------
>  5 files changed, 59 insertions(+), 53 deletions(-)
> 
> diff --git a/Documentation/arm64/memory.rst b/Documentation/arm64/memory.rst
> index 2a641ba7be3b717a..55a55f30eed8a6ce 100644
> --- a/Documentation/arm64/memory.rst
> +++ b/Documentation/arm64/memory.rst
> @@ -33,8 +33,8 @@ AArch64 Linux memory layout with 4KB pages + 4 levels (48-bit)::
>    0000000000000000	0000ffffffffffff	 256TB		user
>    ffff000000000000	ffff7fffffffffff	 128TB		kernel logical memory map
>   [ffff600000000000	ffff7fffffffffff]	  32TB		[kasan shadow region]
> -  ffff800000000000	ffff800007ffffff	 128MB		modules
> -  ffff800008000000	fffffbffefffffff	 124TB		vmalloc
> +  ffff800000000000	ffff80007fffffff	   2GB		modules
> +  ffff800080000000	fffffbffefffffff	 124TB		vmalloc
>    fffffbfff0000000	fffffbfffdffffff	 224MB		fixed mappings (top down)
>    fffffbfffe000000	fffffbfffe7fffff	   8MB		[guard region]
>    fffffbfffe800000	fffffbffff7fffff	  16MB		PCI I/O space
> @@ -50,8 +50,8 @@ AArch64 Linux memory layout with 64KB pages + 3 levels (52-bit with HW support):
>    0000000000000000	000fffffffffffff	   4PB		user
>    fff0000000000000	ffff7fffffffffff	  ~4PB		kernel logical memory map
>   [fffd800000000000	ffff7fffffffffff]	 512TB		[kasan shadow region]
> -  ffff800000000000	ffff800007ffffff	 128MB		modules
> -  ffff800008000000	fffffbffefffffff	 124TB		vmalloc
> +  ffff800000000000	ffff80007fffffff	   2GB		modules
> +  ffff800080000000	fffffbffefffffff	 124TB		vmalloc
>    fffffbfff0000000	fffffbfffdffffff	 224MB		fixed mappings (top down)
>    fffffbfffe000000	fffffbfffe7fffff	   8MB		[guard region]
>    fffffbfffe800000	fffffbffff7fffff	  16MB		PCI I/O space
> diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h
> index 78e5163836a0ab95..b58c3127323e16c8 100644
> --- a/arch/arm64/include/asm/memory.h
> +++ b/arch/arm64/include/asm/memory.h
> @@ -46,7 +46,7 @@
>  #define KIMAGE_VADDR		(MODULES_END)
>  #define MODULES_END		(MODULES_VADDR + MODULES_VSIZE)
>  #define MODULES_VADDR		(_PAGE_END(VA_BITS_MIN))
> -#define MODULES_VSIZE		(SZ_128M)
> +#define MODULES_VSIZE		(SZ_2G)
>  #define VMEMMAP_START		(-(UL(1) << (VA_BITS - VMEMMAP_SHIFT)))
>  #define VMEMMAP_END		(VMEMMAP_START + VMEMMAP_SIZE)
>  #define PCI_IO_END		(VMEMMAP_START - SZ_8M)
> diff --git a/arch/arm64/include/asm/module.h b/arch/arm64/include/asm/module.h
> index 18734fed3bdd7609..98dae9f87b521f07 100644
> --- a/arch/arm64/include/asm/module.h
> +++ b/arch/arm64/include/asm/module.h
> @@ -31,9 +31,15 @@ u64 module_emit_veneer_for_adrp(struct module *mod, Elf64_Shdr *sechdrs,
>  				void *loc, u64 val);
>  
>  #ifdef CONFIG_RANDOMIZE_BASE
> -extern u64 module_alloc_base;
> +extern u64 module_alloc_limit;
>  #else
> -#define module_alloc_base	((u64)_etext - MODULES_VSIZE)
> +#define module_alloc_limit	MODULE_REF_END
> +#endif
> +
> +#ifdef CONFIG_ARM64_MODULE_PLTS
> +#define MODULE_REF_END		((u64)_end)
> +#else
> +#define MODULE_REF_END		((u64)_etext)
>  #endif

I was initially a bit confused by this. I think it's a bit misleading for the
!CONFIG_ARM64_MODULE_PLTS case, since modules can still have data references
between _etext and _end, it's just that we're (hopefully) unlikely to have the
text be <128M with >2G of subsequent data.

I'd find this clearer if we could express the two constraints separately. e.g.
have something like:

#define MODULE_DATA_REF_END		((u64)_end)
#define MODULE_TEXT_REF_END		((u64)_etext)

That'd allow us to do something like the below, which I think would be clearer.

u64 module_alloc_end;
u64 module_alloc_base_noplt;
u64 module_alloc_base_plt;

/*
 * Call this somewhere after choosing the KASLR limits
 */
void module_limits_init(void)
{
	module_alloc_end = (u64)_stext;

	/*
	 * 32-bit relative data references must always fall within 2G of the
	 * end of the kernel image.
	 */
	module_alloc_base_plt = max(MODULES_VADDR, MODULE_DATA_REF_END - SZ_2G);

	/*
	 * Direct branches must be within 128M of the end of the kernel text.
	 */
	module_alloc_base_noplt = max(module_alloc_base_plt,
				      MODULE_TEXT_REF_END - SZ_128M);
}

void *module_alloc(unsigned long size)
{
	gfp_t gfp_mask = GFP_KERNEL;
	void *p = NULL;

	if (IS_ENABLED(CONFIG_ARM64_MODULE_PLTS))
		gfp_mask |= __GFP_NOWARN;

	/*
	 * By default, prefer to place modules such that they don't require
	 * PLTs. With CONFIG_RANDOMIZE_MODULE_REGION_FULL=y we'll prefer to use
	 * the entire range possible with PLTs, to get as much entropy as possible.
	 */
	if (!IS_ENABLED(CONFIG_RANDOMIZE_MODULE_REGION_FULL)) {
		p = __vmalloc_node_range(size, MODULE_ALIGN,
					 module_alloc_base_noplt, module_alloc_end,
					 gfp_mask, PAGE_KERNEL,
					 VM_DEFER_KMEMLEAK, NUMA_NO_NODE,
					 __builtin_return_address(0));
	}

	if (!p && IS_ENABLED(CONFIG_ARM64_MODULE_PLTS)) {
		p = __vmalloc_node_range(size, MODULE_ALIGN,
					 module_alloc_base_plt, module_alloc_end,
					 gfp_mask, PAGE_KERNEL,
					 VM_DEFER_KMEMLEAK, NUMA_NO_NODE,
					 __builtin_return_address(0));
	}

	[... play with KASAN here ...]
}

Is that viable, or am I missing something?

Thanks,
Mark.

>  struct plt_entry {
> diff --git a/arch/arm64/kernel/kaslr.c b/arch/arm64/kernel/kaslr.c
> index e7477f21a4c9d062..14e96c3f707a74a3 100644
> --- a/arch/arm64/kernel/kaslr.c
> +++ b/arch/arm64/kernel/kaslr.c
> @@ -8,6 +8,7 @@
>  #include <linux/init.h>
>  #include <linux/libfdt.h>
>  #include <linux/mm_types.h>
> +#include <linux/module.h>
>  #include <linux/sched.h>
>  #include <linux/types.h>
>  #include <linux/pgtable.h>
> @@ -17,10 +18,11 @@
>  #include <asm/kernel-pgtable.h>
>  #include <asm/memory.h>
>  #include <asm/mmu.h>
> +#include <asm/module.h>
>  #include <asm/sections.h>
>  #include <asm/setup.h>
>  
> -u64 __ro_after_init module_alloc_base;
> +u64 __ro_after_init module_alloc_limit = MODULE_REF_END;
>  u16 __initdata memstart_offset_seed;
>  
>  struct arm64_ftr_override kaslr_feature_override __initdata;
> @@ -30,12 +32,6 @@ static int __init kaslr_init(void)
>  	u64 module_range;
>  	u32 seed;
>  
> -	/*
> -	 * Set a reasonable default for module_alloc_base in case
> -	 * we end up running with module randomization disabled.
> -	 */
> -	module_alloc_base = (u64)_etext - MODULES_VSIZE;
> -
>  	if (kaslr_feature_override.val & kaslr_feature_override.mask & 0xf) {
>  		pr_info("KASLR disabled on command line\n");
>  		return 0;
> @@ -69,24 +65,28 @@ static int __init kaslr_init(void)
>  		 * resolved via PLTs. (Branches between modules will be
>  		 * resolved normally.)
>  		 */
> -		module_range = SZ_2G - (u64)(_end - _stext);
> -		module_alloc_base = max((u64)_end - SZ_2G, (u64)MODULES_VADDR);
> +		module_range = SZ_2G;
>  	} else {
>  		/*
> -		 * Randomize the module region by setting module_alloc_base to
> -		 * a PAGE_SIZE multiple in the range [_etext - MODULES_VSIZE,
> -		 * _stext) . This guarantees that the resulting region still
> -		 * covers [_stext, _etext], and that all relative branches can
> -		 * be resolved without veneers unless this region is exhausted
> -		 * and we fall back to a larger 2GB window in module_alloc()
> -		 * when ARM64_MODULE_PLTS is enabled.
> +		 * Randomize the module region over a 128 MB window covering
> +		 * the kernel text. This guarantees that the resulting region
> +		 * still covers [_stext, _etext], and that all relative
> +		 * branches can be resolved without veneers unless this region
> +		 * is exhausted and we fall back to a larger 2GB window in
> +		 * module_alloc() when ARM64_MODULE_PLTS is enabled.
>  		 */
> -		module_range = MODULES_VSIZE - (u64)(_etext - _stext);
> +		module_range = SZ_128M;
>  	}
>  
> +	/*
> +	 * Subtract the size of the core kernel region that must be in range
> +	 * for all loaded modules.
> +	 */
> +	module_range -= MODULE_REF_END - (u64)_stext;
> +
>  	/* use the lower 21 bits to randomize the base of the module region */
> -	module_alloc_base += (module_range * (seed & ((1 << 21) - 1))) >> 21;
> -	module_alloc_base &= PAGE_MASK;
> +	module_alloc_limit += (module_range * (seed & ((1 << 21) - 1))) >> 21;
> +	module_alloc_limit &= PAGE_MASK;
>  
>  	return 0;
>  }
> diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c
> index 5af4975caeb58ff7..aa61493957c010b2 100644
> --- a/arch/arm64/kernel/module.c
> +++ b/arch/arm64/kernel/module.c
> @@ -24,7 +24,6 @@
>  
>  void *module_alloc(unsigned long size)
>  {
> -	u64 module_alloc_end = module_alloc_base + MODULES_VSIZE;
>  	gfp_t gfp_mask = GFP_KERNEL;
>  	void *p;
>  
> @@ -32,33 +31,34 @@ void *module_alloc(unsigned long size)
>  	if (IS_ENABLED(CONFIG_ARM64_MODULE_PLTS))
>  		gfp_mask |= __GFP_NOWARN;
>  
> -	if (IS_ENABLED(CONFIG_KASAN_GENERIC) ||
> -	    IS_ENABLED(CONFIG_KASAN_SW_TAGS))
> -		/* don't exceed the static module region - see below */
> -		module_alloc_end = MODULES_END;
> -
> -	p = __vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base,
> -				module_alloc_end, gfp_mask, PAGE_KERNEL, VM_DEFER_KMEMLEAK,
> -				NUMA_NO_NODE, __builtin_return_address(0));
> +	/*
> +	 * First, try to allocate from the 128 MB region just below the limit.
> +	 * If KASLR is disabled, or CONFIG_RANDOMIZE_MODULE_REGION_FULL is not
> +	 * set, this will produce an allocation that allows all relative
> +	 * branches into the kernel text to be resolved without the need for
> +	 * veneers (PLTs). If CONFIG_RANDOMIZE_MODULE_REGION_FULL is set, this
> +	 * 128 MB window might not cover the kernel text, but branches between
> +	 * modules will still be in relative branching range.
> +	 */
> +	p = __vmalloc_node_range(size, MODULE_ALIGN,
> +				 module_alloc_limit - SZ_128M,
> +				 module_alloc_limit, gfp_mask, PAGE_KERNEL,
> +				 VM_DEFER_KMEMLEAK, NUMA_NO_NODE,
> +				 __builtin_return_address(0));
>  
> -	if (!p && IS_ENABLED(CONFIG_ARM64_MODULE_PLTS) &&
> -	    (IS_ENABLED(CONFIG_KASAN_VMALLOC) ||
> -	     (!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
> -	      !IS_ENABLED(CONFIG_KASAN_SW_TAGS))))
> -		/*
> -		 * KASAN without KASAN_VMALLOC can only deal with module
> -		 * allocations being served from the reserved module region,
> -		 * since the remainder of the vmalloc region is already
> -		 * backed by zero shadow pages, and punching holes into it
> -		 * is non-trivial. Since the module region is not randomized
> -		 * when KASAN is enabled without KASAN_VMALLOC, it is even
> -		 * less likely that the module region gets exhausted, so we
> -		 * can simply omit this fallback in that case.
> -		 */
> -		p = __vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base,
> -				module_alloc_base + SZ_2G, GFP_KERNEL,
> -				PAGE_KERNEL, 0, NUMA_NO_NODE,
> -				__builtin_return_address(0));
> +	/*
> +	 * If the prior allocation failed, and we have configured support for
> +	 * fixing up out-of-range relative branches through the use of PLTs,
> +	 * fall back to a 2 GB window for module allocations. This is the
> +	 * maximum we can support, due to the use of 32-bit place relative
> +	 * symbol references, which cannot be fixed up using PLTs.
> +	 */
> +	if (!p && IS_ENABLED(CONFIG_ARM64_MODULE_PLTS))
> +		p = __vmalloc_node_range(size, MODULE_ALIGN,
> +					 module_alloc_limit - SZ_2G,
> +					 module_alloc_limit, GFP_KERNEL,
> +					 PAGE_KERNEL, 0, NUMA_NO_NODE,
> +					 __builtin_return_address(0));
>  
>  	if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) {
>  		vfree(p);
> -- 
> 2.39.2
> 



More information about the linux-arm-kernel mailing list