[PoC PATCH] arm: allow modules outside of bl range

Ard Biesheuvel ard.biesheuvel at linaro.org
Fri Nov 21 07:46:22 PST 2014


On 21 November 2014 11:34, Ard Biesheuvel <ard.biesheuvel at linaro.org> wrote:
> On 20 November 2014 20:14, Nicolas Pitre <nicolas.pitre at linaro.org> wrote:
>> On Thu, 20 Nov 2014, Ard Biesheuvel wrote:
>>
>>> Loading modules far away from the kernel in memory is problematic because
>>> the 'bl' instruction only has limited reach, and modules are not built
>>> with PLTs. Instead of using the -mlong-calls option (which affects *all*
>>> emitted bl instructions), this patch allocates some additional space at
>>> module load time, and populates it with PLT-like entries when encountering
>>> relocations that are out of reach.
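To illustrate the veneer shape used below: each PLT slot is a single
'ldr pc, [pc, #imm]' instruction whose branch target sits in a literal
word exactly one cache line further on. A stand-alone sketch (userspace
C, not part of the patch; the 64-byte cache line is an assumption) that
reproduces the PLT_ENTRY_LDR opcode from the patch:

    #include <stdio.h>
    #include <stdint.h>

    /* Sketch of the opcode behind PLT_ENTRY_LDR below.
     * 0xe59ff000 encodes 'ldr pc, [pc, #0]'; on ARM the pc reads as
     * the slot address + 8, and the matching literal lives one stride
     * (one cache line) past the slot, so the immediate is stride - 8. */
    static uint32_t plt_entry_ldr(uint32_t stride)
    {
            return 0xe59ff000 | (stride - 8);
    }

    int main(void)
    {
            /* With an assumed 64-byte L1 cache line this prints
             * e59ff038, i.e. 'ldr pc, [pc, #56]'. */
            printf("%08x\n", plt_entry_ldr(64));
            return 0;
    }
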
>>>
>>> Note that this patch is a proof of concept, and thus removes the implementation
>>> of module_alloc() so that all modules are relocated using PLT entries.
>>> Ideally, we would switch into PLT mode and start using the vmalloc area only
>>> after we have exhausted the ordinary module space.
>>>
>>> This should work with all relocations against symbols exported by the kernel,
>>> including those resulting from GCC generated function calls for ftrace etc.
>>>
>>> This is largely based on the ia64 implementation.
>>> Thumb-2 kernels are currently unsupported.
>>>
>>> Signed-off-by: Ard Biesheuvel <ard.biesheuvel at linaro.org>
>>
>> Looks on the right track to me.
>>
>> BTW it might be necessary to use PLT mode even from the primary module
>> area if e.g. the kernel gets too big to be reachable (we've seen that
>> already), or a module from the primary area wants to branch to a symbol
>> located in a larger module that ended up in the vmalloc area.  So you
>
> Indeed.
>
>> might need to estimate the worst case for the number of PLTs and end up
>> not using all of them or even none at all. Would be good to free the
>> unused pages in that case (only for the non init section obviously).
>> Looks like the module_finalize() hook might be used for that.
>>
>
> This code already establishes an upper bound for the number of
> required PLT entries, but allocates the memory unconditionally, which
> is indeed somewhat of a waste as 'no PLT entries' is obviously the
> general case as long as the primary module area has not been
> exhausted.
>
> I can easily round up the core PLT section to PAGE_SIZE size and
> alignment, but I haven't figured out how to punch a hole into an area
> returned by vmalloc(), and it is desirable to have the PLT region and
> the module region itself be part of the same allocation to begin with,
> or the PLT region may end up out of range itself, which kind of
> defeats the purpose. Or perhaps, some way to at least release the
> physical pages while retaining the single vmap_area.
>

It turns out, looking at the actual numbers (a random sample of 46
modules), that the typical size overhead of the core PLT is about 5%,
and that it rarely causes the number of pages needed to increase.
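
For a sense of scale: each PLT slot costs PLT_ENTRY_SIZE == 8 bytes (a
4-byte ldr plus its 4-byte literal), so even a module with a hundred
distinct out-of-range branch targets only pays about 800 bytes of core
PLT.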

-- 
Ard.

>>
>>>  arch/arm/Makefile             |   1 +
>>>  arch/arm/include/asm/module.h |   2 +
>>>  arch/arm/kernel/module.c      | 172 ++++++++++++++++++++++++++++++++++++++++--
>>>  arch/arm/kernel/module.lds    |   4 +
>>>  4 files changed, 173 insertions(+), 6 deletions(-)
>>>  create mode 100644 arch/arm/kernel/module.lds
>>>
>>> diff --git a/arch/arm/Makefile b/arch/arm/Makefile
>>> index 034a94904d69..dfb7ef1f2cc5 100644
>>> --- a/arch/arm/Makefile
>>> +++ b/arch/arm/Makefile
>>> @@ -12,6 +12,7 @@
>>>
>>>  # Ensure linker flags are correct
>>>  LDFLAGS              :=
>>> +LDFLAGS_MODULE       += -T $(srctree)/arch/arm/kernel/module.lds
>>>
>>>  LDFLAGS_vmlinux      :=-p --no-undefined -X
>>>  ifeq ($(CONFIG_CPU_ENDIAN_BE8),y)
>>> diff --git a/arch/arm/include/asm/module.h b/arch/arm/include/asm/module.h
>>> index ed690c49ef93..4c6927976469 100644
>>> --- a/arch/arm/include/asm/module.h
>>> +++ b/arch/arm/include/asm/module.h
>>> @@ -19,6 +19,8 @@ enum {
>>>
>>>  struct mod_arch_specific {
>>>       struct unwind_table *unwind[ARM_SEC_MAX];
>>> +     struct elf32_shdr   *core_plt;
>>> +     struct elf32_shdr   *init_plt;
>>>  };
>>>  #endif
>>>
>>> diff --git a/arch/arm/kernel/module.c b/arch/arm/kernel/module.c
>>> index 6a4dffefd357..5ec70c15a881 100644
>>> --- a/arch/arm/kernel/module.c
>>> +++ b/arch/arm/kernel/module.c
>>> @@ -37,14 +37,62 @@
>>>  #define MODULES_VADDR        (((unsigned long)_etext + ~PMD_MASK) & PMD_MASK)
>>>  #endif
>>>
>>> -#ifdef CONFIG_MMU
>>> -void *module_alloc(unsigned long size)
>>> +#define PLT_ENTRY_STRIDE     L1_CACHE_BYTES
>>> +#define PLT_ENTRY_COUNT              (PLT_ENTRY_STRIDE / sizeof(u32))
>>> +#define PLT_ENTRY_SIZE               (sizeof(struct plt_entries) / PLT_ENTRY_COUNT)
>>> +#define PLT_ENTRY_LDR                __opcode_to_mem_arm(0xe59ff000 | (PLT_ENTRY_STRIDE - 8))
>>> +
>>> +struct plt_entries {
>>> +     u32     ldr[PLT_ENTRY_COUNT];
>>> +     u32     lit[PLT_ENTRY_COUNT];
>>> +};
>>> +
>>> +static inline int in_init(const struct module *mod, u32 addr)
>>>  {
>>> -     return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
>>> -                             GFP_KERNEL, PAGE_KERNEL_EXEC, NUMA_NO_NODE,
>>> -                             __builtin_return_address(0));
>>> +     return addr - (u32)mod->module_init < mod->init_size;
>>> +}
>>> +
>>> +static inline int in_core(const struct module *mod, u32 addr)
>>> +{
>>> +     return addr - (u32)mod->module_core < mod->core_size;
>>> +}
>>> +
>>> +static u32 get_plt(struct module *mod, unsigned long loc, Elf32_Addr val)
>>> +{
>>> +     struct plt_entries *plt, *plt_end;
>>> +
>>> +     if (in_init(mod, loc)) {
>>> +             plt = (void *)mod->arch.init_plt->sh_addr;
>>> +             plt_end = (void *)plt + mod->arch.init_plt->sh_size;
>>> +     } else {
>>> +             plt = (void *)mod->arch.core_plt->sh_addr;
>>> +             plt_end = (void *)plt + mod->arch.core_plt->sh_size;
>>> +     }
>>> +
>>> +     /* Look for an existing entry pointing to 'val' */
>>> +     while (plt < plt_end) {
>>> +             int i;
>>> +
>>> +             if (*plt->ldr != PLT_ENTRY_LDR) {
>>> +                     /* Populate a new set of entries */
>>> +                     *plt = (struct plt_entries){
>>> +                             { [0 ... PLT_ENTRY_COUNT-1] = PLT_ENTRY_LDR, },
>>> +                             { val, }
>>> +                     };
>>> +                     return (u32)plt->ldr;
>>> +             }
>>> +             for (i = 0; i < PLT_ENTRY_COUNT; i++) {
>>> +                     if (!plt->lit[i])
>>> +                             plt->lit[i] = val;
>>> +                     else if (plt->lit[i] != val)
>>> +                             continue;
>>> +                     return (u32)&plt->ldr[i];
>>> +             }
>>> +             plt++;
>>> +     }
>>> +     BUG();
>>> +     return 0;
>>>  }
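
To spell out the lookup loop above: a group whose first word is not
PLT_ENTRY_LDR has never been populated and is claimed wholesale for
'val'; in a populated group, each literal slot is reused when it
already holds 'val', claimed when it is still zero, and skipped
otherwise. Falling off the end means count_plts() below undercounted,
hence the BUG().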
>>> -#endif
>>>
>>>  int
>>>  apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex,
>>> @@ -107,6 +155,22 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex,
>>>                       if (offset & 3 ||
>>>                           offset <= (s32)0xfe000000 ||
>>>                           offset >= (s32)0x02000000) {
>>> +
>>> +                             /*
>>> +                              * Route this call through a PLT entry that we
>>> +                              * populate on the fly in the PLT section that
>>> +                              * is part of the module memory area.
>>> +                              * Note that 'offset + loc + 8' contains the
>>> +                              * absolute jump target, i.e., @sym + addend,
>>> +                              * corrected for the -8 PC bias.
>>> +                              */
>>> +                             offset = get_plt(module, loc, offset + loc + 8)
>>> +                                      - loc - 8;
>>> +                     }
>>> +
>>> +                     if (offset & 3 ||
>>> +                         offset <= (s32)0xfe000000 ||
>>> +                         offset >= (s32)0x02000000) {
>>>                               pr_err("%s: section %u reloc %u sym '%s': relocation %u out of range (%#lx -> %#x)\n",
>>>                                      module->name, relindex, i, symname,
>>>                                      ELF32_R_TYPE(rel->r_info), loc,
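
A worked example with made-up addresses: for a call at loc ==
0xf0001000 to a kernel symbol at 0xc0008000, the original offset of
0xc0008000 - 0xf0001008 is far below the (s32)0xfe000000 lower bound.
get_plt() then returns a veneer inside the module itself, say at
0xf0010000, and the rewritten offset 0xf0010000 - 0xf0001008 == 0xeff8
easily passes the range check, which is simply re-run to catch the
pathological case of a PLT entry that is itself out of range.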
>>> @@ -354,3 +418,99 @@ module_arch_cleanup(struct module *mod)
>>>                       unwind_table_del(mod->arch.unwind[i]);
>>>  #endif
>>>  }
>>> +
>>> +static int duplicate_reloc(Elf32_Addr base, const Elf32_Rel *rel, int num,
>>> +                        u32 mask)
>>> +{
>>> +     u32 *loc1, *loc2;
>>> +     int i;
>>> +
>>> +     for (i = 0; i < num; i++) {
>>> +             if (rel[i].r_info != rel[num].r_info)
>>> +                     continue;
>>> +
>>> +             /*
>>> +              * Identical relocation types against identical symbols can
>>> +              * still result in different PLT entries if the addend in the
>>> +              * place is different. So resolve the target of the relocation
>>> +              * to compare the values.
>>> +              */
>>> +             loc1 = (u32 *)(base + rel[i].r_offset);
>>> +             loc2 = (u32 *)(base + rel[num].r_offset);
>>> +             if (((*loc1 ^ *loc2) & mask) == 0)
>>> +                     return 1;
>>> +     }
>>> +     return 0;
>>> +}
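
The 0x00ffffff mask the caller below passes in isolates the 24-bit
immediate shared by the ARM branch encodings handled here, so two
relocations only count as duplicates when both the symbol (r_info) and
the addend encoded in the instruction match. (The Thumb-2 encodings
lay out their addend bits differently, but those are declared
unsupported above.)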
>>> +
>>> +/* Count how many PLT entries we may need */
>>> +static unsigned int count_plts(Elf32_Addr base, const Elf32_Rel *rel, int num)
>>> +{
>>> +     unsigned int ret = 0;
>>> +     int i;
>>> +
>>> +     /*
>>> +      * Sure, this is order(n^2), but it's usually short, and not
>>> +      * time critical
>>> +      */
>>> +     for (i = 0; i < num; i++) {
>>> +             switch (ELF32_R_TYPE(rel[i].r_info)) {
>>> +             case R_ARM_CALL:
>>> +             case R_ARM_PC24:
>>> +             case R_ARM_JUMP24:
>>> +             case R_ARM_THM_CALL:
>>> +             case R_ARM_THM_JUMP24:
>>> +                     if (!duplicate_reloc(base, rel, i, 0x00ffffff))
>>> +                             ret++;
>>> +             }
>>> +     }
>>> +     return ret;
>>> +}
>>> +
>>> +int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
>>> +                           char *secstrings, struct module *mod)
>>> +{
>>> +     unsigned long core_plts = 0, init_plts = 0;
>>> +     Elf32_Shdr *s, *sechdrs_end = sechdrs + ehdr->e_shnum;
>>> +
>>> +     /*
>>> +      * To store the PLTs, we expand the .text section for core module code
>>> +      * and the .init.text section for initialization code.
>>> +      */
>>> +     for (s = sechdrs; s < sechdrs_end; ++s)
>>> +             if (strcmp(".core.plt", secstrings + s->sh_name) == 0)
>>> +                     mod->arch.core_plt = s;
>>> +             else if (strcmp(".init.plt", secstrings + s->sh_name) == 0)
>>> +                     mod->arch.init_plt = s;
>>> +
>>> +     if (!mod->arch.core_plt || !mod->arch.init_plt) {
>>> +             printk(KERN_ERR "%s: sections missing\n", mod->name);
>>> +             return -ENOEXEC;
>>> +     }
>>> +
>>> +     for (s = sechdrs + 1; s < sechdrs_end; ++s) {
>>> +             const Elf32_Rel *rels = (void *)ehdr + s->sh_offset;
>>> +             int numrels = s->sh_size / sizeof(Elf32_Rel);
>>> +             Elf32_Shdr *dstsec = sechdrs + s->sh_info;
>>> +
>>> +             if (s->sh_type != SHT_REL)
>>> +                     continue;
>>> +
>>> +             if (strstr(secstrings + s->sh_name, ".init"))
>>> +                     init_plts += count_plts(dstsec->sh_addr, rels, numrels);
>>> +             else
>>> +                     core_plts += count_plts(dstsec->sh_addr, rels, numrels);
>>> +     }
>>> +
>>> +     mod->arch.core_plt->sh_type = SHT_NOBITS;
>>> +     mod->arch.core_plt->sh_flags = SHF_EXECINSTR | SHF_ALLOC;
>>> +     mod->arch.core_plt->sh_addralign = L1_CACHE_BYTES;
>>> +     mod->arch.core_plt->sh_size = round_up(core_plts * PLT_ENTRY_SIZE,
>>> +                                            sizeof(struct plt_entries));
>>> +     mod->arch.init_plt->sh_type = SHT_NOBITS;
>>> +     mod->arch.init_plt->sh_flags = SHF_EXECINSTR | SHF_ALLOC;
>>> +     mod->arch.init_plt->sh_addralign = L1_CACHE_BYTES;
>>> +     mod->arch.init_plt->sh_size = round_up(init_plts * PLT_ENTRY_SIZE,
>>> +                                            sizeof(struct plt_entries));
>>> +     pr_debug("%s: core.plt=%x, init.plt=%x\n", __func__,
>>> +              mod->arch.core_plt->sh_size, mod->arch.init_plt->sh_size);
>>> +     return 0;
>>> +}
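
Size arithmetic, assuming 64-byte L1 cache lines: PLT_ENTRY_COUNT is
then 16, sizeof(struct plt_entries) is 128 and PLT_ENTRY_SIZE is 8, so
e.g. 20 counted core relocations give round_up(160, 128) == 256 bytes,
i.e. two groups of 16 slots.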
>>> diff --git a/arch/arm/kernel/module.lds b/arch/arm/kernel/module.lds
>>> new file mode 100644
>>> index 000000000000..3682fa107918
>>> --- /dev/null
>>> +++ b/arch/arm/kernel/module.lds
>>> @@ -0,0 +1,4 @@
>>> +SECTIONS {
>>> +        .core.plt : { BYTE(0) }
>>> +        .init.plt : { BYTE(0) }
>>> +}
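
(The single BYTE(0) in each output section is there purely so that
every module's ELF file contains .core.plt and .init.plt section
headers for module_frob_arch_sections() to find; that hook then turns
them into SHT_NOBITS sections of the computed size.)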
>>> --
>>> 1.8.3.2
>>>
>>>