[PATCH] kprobes: Enable tracing for mololithic kernel images

Christophe Leroy christophe.leroy at csgroup.eu
Thu Jun 9 01:30:12 PDT 2022



Le 08/06/2022 à 01:59, Jarkko Sakkinen a écrit :
> [You don't often get email from jarkko at profian.com. Learn why this is important at https://aka.ms/LearnAboutSenderIdentification ]
> 
> Tracing with kprobes while running a monolithic kernel is currently
> impossible because CONFIG_KPROBES is dependent of CONFIG_MODULES.  This
> dependency is a result of kprobes code using the module allocator for the
> trampoline code.
> 
> Detaching kprobes from modules helps to squeeze down the user space,
> e.g. when developing new core kernel features, while still having all
> the nice tracing capabilities.

Nice idea, could also be nice to have BPF without MODULES.

> 
> For kernel/ and arch/*, move module_alloc() and module_memfree() to
> module_alloc.c, and compile as part of vmlinux when either CONFIG_MODULES
> or CONFIG_KPROBES is enabled.  In addition, flag kernel module specific
> code with CONFIG_MODULES.

Nice, but that's not enough. You have to audit every peace of code that 
depends on CONFIG_MODULES and see if it needs to be activated for your 
case as well. For instance some powerpc configurations don't honor exec 
page faults on kernel pages when CONFIG_MODULES is not selected.

> 
> As the result, kprobes can be used with a monolithic kernel.
> 
> Signed-off-by: Jarkko Sakkinen <jarkko at profian.com>

I think this patch should be split in a several patches, one (or even 
one per architectures ?) to make modules_alloc() independant of 
CONFIG_MODULES, then a patch to make CONFIG_KPROBES independant on 
CONFIG_MOUDLES.


> ---
> Tested with the help of BuildRoot and QEMU:
> - arm (function tracer)
> - arm64 (function tracer)
> - mips (function tracer)
> - powerpc (function tracer)
> - riscv (function tracer)
> - s390 (function tracer)
> - sparc (function tracer)
> - x86 (function tracer)
> - sh (function tracer, for the "pure" kernel/modules_alloc.c path)
> ---
>   arch/Kconfig                       |  1 -
>   arch/arm/kernel/Makefile           |  5 +++
>   arch/arm/kernel/module.c           | 32 ----------------
>   arch/arm/kernel/module_alloc.c     | 42 ++++++++++++++++++++
>   arch/arm64/kernel/Makefile         |  5 +++
>   arch/arm64/kernel/module.c         | 47 -----------------------
>   arch/arm64/kernel/module_alloc.c   | 57 ++++++++++++++++++++++++++++
>   arch/mips/kernel/Makefile          |  5 +++
>   arch/mips/kernel/module.c          |  9 -----
>   arch/mips/kernel/module_alloc.c    | 18 +++++++++
>   arch/parisc/kernel/Makefile        |  5 +++
>   arch/parisc/kernel/module.c        | 11 ------
>   arch/parisc/kernel/module_alloc.c  | 23 +++++++++++
>   arch/powerpc/kernel/Makefile       |  5 +++
>   arch/powerpc/kernel/module.c       | 37 ------------------
>   arch/powerpc/kernel/module_alloc.c | 47 +++++++++++++++++++++++

You are missing necessary changes for powerpc.

On powerpc 8xx or powerpc 603, software TLB handlers don't honor 
instruction TLB miss when CONFIG_MODULES are not set, look into 
head_8xx.S and head_book3s_32.S

On powerpc book3s/32, all kernel space is set to NX except the module 
segment. When CONFIG_MODULES is all space is set NX. See 
mmu_mark_initmem_nx() and is_module_segment().


>   arch/riscv/kernel/Makefile         |  5 +++
>   arch/riscv/kernel/module.c         | 10 -----
>   arch/riscv/kernel/module_alloc.c   | 19 ++++++++++
>   arch/s390/kernel/Makefile          |  5 +++
>   arch/s390/kernel/module.c          | 17 ---------
>   arch/s390/kernel/module_alloc.c    | 33 ++++++++++++++++
>   arch/sparc/kernel/Makefile         |  5 +++
>   arch/sparc/kernel/module.c         | 30 ---------------
>   arch/sparc/kernel/module_alloc.c   | 39 +++++++++++++++++++
>   arch/x86/kernel/Makefile           |  5 +++
>   arch/x86/kernel/module.c           | 50 ------------------------
>   arch/x86/kernel/module_alloc.c     | 61 ++++++++++++++++++++++++++++++
>   kernel/Makefile                    |  5 +++
>   kernel/kprobes.c                   | 10 +++++
>   kernel/module/main.c               | 17 ---------
>   kernel/module_alloc.c              | 26 +++++++++++++
>   kernel/trace/trace_kprobe.c        | 10 ++++-
>   33 files changed, 434 insertions(+), 262 deletions(-)
>   create mode 100644 arch/arm/kernel/module_alloc.c
>   create mode 100644 arch/arm64/kernel/module_alloc.c
>   create mode 100644 arch/mips/kernel/module_alloc.c
>   create mode 100644 arch/parisc/kernel/module_alloc.c
>   create mode 100644 arch/powerpc/kernel/module_alloc.c
>   create mode 100644 arch/riscv/kernel/module_alloc.c
>   create mode 100644 arch/s390/kernel/module_alloc.c
>   create mode 100644 arch/sparc/kernel/module_alloc.c
>   create mode 100644 arch/x86/kernel/module_alloc.c
>   create mode 100644 kernel/module_alloc.c
> 
> diff --git a/arch/Kconfig b/arch/Kconfig
> index fcf9a41a4ef5..e8e3e7998a2e 100644
> --- a/arch/Kconfig
> +++ b/arch/Kconfig
> @@ -39,7 +39,6 @@ config GENERIC_ENTRY
> 
>   config KPROBES
>          bool "Kprobes"
> -       depends on MODULES
>          depends on HAVE_KPROBES
>          select KALLSYMS
>          select TASKS_RCU if PREEMPTION

> diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
> index 2e2a2a9bcf43..5a811cdf230b 100644
> --- a/arch/powerpc/kernel/Makefile
> +++ b/arch/powerpc/kernel/Makefile
> @@ -103,6 +103,11 @@ obj-$(CONFIG_HIBERNATION)  += swsusp_$(BITS).o
>   endif
>   obj64-$(CONFIG_HIBERNATION)    += swsusp_asm64.o
>   obj-$(CONFIG_MODULES)          += module.o module_$(BITS).o
> +ifeq ($(CONFIG_MODULES),y)
> +obj-y                          += module_alloc.o
> +else
> +obj-$(CONFIG_KPROBES)          += module_alloc.o
> +endif

Why not just do:

obj-$(CONFIG_MODULES)          += module_alloc.o
obj-$(CONFIG_KPROBES)          += module_alloc.o

However, a new hidden config item (eg: CONFIG_DYNAMIC_TEXT) selected by 
both CONFIG_MODULES and CONFIG_KPROBES would make like easier when 
you'll come to do the changes required.



>   obj-$(CONFIG_44x)              += cpu_setup_44x.o
>   obj-$(CONFIG_PPC_FSL_BOOK3E)   += cpu_setup_fsl_booke.o
>   obj-$(CONFIG_PPC_DOORBELL)     += dbell.o
> diff --git a/arch/powerpc/kernel/module.c b/arch/powerpc/kernel/module.c
> index f6d6ae0a1692..b30e00964a60 100644
> --- a/arch/powerpc/kernel/module.c
> +++ b/arch/powerpc/kernel/module.c
> @@ -88,40 +88,3 @@ int module_finalize(const Elf_Ehdr *hdr,
> 
>          return 0;
>   }
> -
> -static __always_inline void *
> -__module_alloc(unsigned long size, unsigned long start, unsigned long end, bool nowarn)
> -{
> -       pgprot_t prot = strict_module_rwx_enabled() ? PAGE_KERNEL : PAGE_KERNEL_EXEC;
> -       gfp_t gfp = GFP_KERNEL | (nowarn ? __GFP_NOWARN : 0);
> -
> -       /*
> -        * Don't do huge page allocations for modules yet until more testing
> -        * is done. STRICT_MODULE_RWX may require extra work to support this
> -        * too.
> -        */
> -       return __vmalloc_node_range(size, 1, start, end, gfp, prot,
> -                                   VM_FLUSH_RESET_PERMS,
> -                                   NUMA_NO_NODE, __builtin_return_address(0));
> -}
> -
> -void *module_alloc(unsigned long size)
> -{
> -#ifdef MODULES_VADDR
> -       unsigned long limit = (unsigned long)_etext - SZ_32M;
> -       void *ptr = NULL;
> -
> -       BUILD_BUG_ON(TASK_SIZE > MODULES_VADDR);
> -
> -       /* First try within 32M limit from _etext to avoid branch trampolines */
> -       if (MODULES_VADDR < PAGE_OFFSET && MODULES_END > limit)
> -               ptr = __module_alloc(size, limit, MODULES_END, true);
> -
> -       if (!ptr)
> -               ptr = __module_alloc(size, MODULES_VADDR, MODULES_END, false);
> -
> -       return ptr;
> -#else
> -       return __module_alloc(size, VMALLOC_START, VMALLOC_END, false);
> -#endif
> -}
> diff --git a/arch/powerpc/kernel/module_alloc.c b/arch/powerpc/kernel/module_alloc.c
> new file mode 100644
> index 000000000000..48541c27ce46
> --- /dev/null
> +++ b/arch/powerpc/kernel/module_alloc.c
> @@ -0,0 +1,47 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +/*
> + *  Kernel module help for powerpc.
> + *  Copyright (C) 2001, 2003 Rusty Russell IBM Corporation.
> + *  Copyright (C) 2008 Freescale Semiconductor, Inc.
> + */
> +
> +#include <linux/mm.h>
> +#include <linux/moduleloader.h>
> +#include <linux/vmalloc.h>
> +
> +static __always_inline void *
> +__module_alloc(unsigned long size, unsigned long start, unsigned long end, bool nowarn)
> +{
> +       pgprot_t prot = strict_module_rwx_enabled() ? PAGE_KERNEL : PAGE_KERNEL_EXEC;
> +       gfp_t gfp = GFP_KERNEL | (nowarn ? __GFP_NOWARN : 0);
> +
> +       /*
> +        * Don't do huge page allocations for modules yet until more testing
> +        * is done. STRICT_MODULE_RWX may require extra work to support this
> +        * too.
> +        */
> +       return __vmalloc_node_range(size, 1, start, end, gfp, prot,
> +                                   VM_FLUSH_RESET_PERMS,
> +                                   NUMA_NO_NODE, __builtin_return_address(0));
> +}
> +
> +void *module_alloc(unsigned long size)
> +{
> +#ifdef MODULES_VADDR

Is MODULES_VADDR defined even when CONFIG_MODULES is not ?

> +       unsigned long limit = (unsigned long)_etext - SZ_32M;
> +       void *ptr = NULL;
> +
> +       BUILD_BUG_ON(TASK_SIZE > MODULES_VADDR);
> +
> +       /* First try within 32M limit from _etext to avoid branch trampolines */
> +       if (MODULES_VADDR < PAGE_OFFSET && MODULES_END > limit)
> +               ptr = __module_alloc(size, limit, MODULES_END, true);
> +
> +       if (!ptr)
> +               ptr = __module_alloc(size, MODULES_VADDR, MODULES_END, false);
> +
> +       return ptr;
> +#else
> +       return __module_alloc(size, VMALLOC_START, VMALLOC_END, false);
> +#endif
> +}

> diff --git a/kernel/Makefile b/kernel/Makefile
> index 318789c728d3..2981fe42060d 100644
> --- a/kernel/Makefile
> +++ b/kernel/Makefile
> @@ -53,6 +53,11 @@ obj-y += livepatch/
>   obj-y += dma/
>   obj-y += entry/
>   obj-$(CONFIG_MODULES) += module/
> +ifeq ($(CONFIG_MODULES),y)
> +obj-y += module_alloc.o
> +else
> +obj-$(CONFIG_KPROBES) += module_alloc.o
> +endif

Same comment, could be:

obj-$(CONFIG_MODULES)          += module_alloc.o
obj-$(CONFIG_KPROBES)          += module_alloc.o

> 
>   obj-$(CONFIG_KCMP) += kcmp.o
>   obj-$(CONFIG_FREEZER) += freezer.o
> diff --git a/kernel/kprobes.c b/kernel/kprobes.c
> index f214f8c088ed..3f9876374cd3 100644
> --- a/kernel/kprobes.c
> +++ b/kernel/kprobes.c
> @@ -1569,6 +1569,7 @@ static int check_kprobe_address_safe(struct kprobe *p,
>                  goto out;
>          }
> 
> +#ifdef CONFIG_MODULES
>          /* Check if 'p' is probing a module. */
>          *probed_mod = __module_text_address((unsigned long) p->addr);
>          if (*probed_mod) {
> @@ -1592,6 +1593,8 @@ static int check_kprobe_address_safe(struct kprobe *p,
>                          ret = -ENOENT;
>                  }
>          }
> +#endif
> +
>   out:
>          preempt_enable();
>          jump_label_unlock();
> @@ -2475,6 +2478,7 @@ int kprobe_add_area_blacklist(unsigned long start, unsigned long end)
>          return 0;
>   }
> 
> +#ifdef CONFIG_MODULES
>   /* Remove all symbols in given area from kprobe blacklist */
>   static void kprobe_remove_area_blacklist(unsigned long start, unsigned long end)
>   {
> @@ -2492,6 +2496,7 @@ static void kprobe_remove_ksym_blacklist(unsigned long entry)
>   {
>          kprobe_remove_area_blacklist(entry, entry + 1);
>   }
> +#endif /* CONFIG_MODULES */
> 
>   int __weak arch_kprobe_get_kallsym(unsigned int *symnum, unsigned long *value,
>                                     char *type, char *sym)
> @@ -2557,6 +2562,7 @@ static int __init populate_kprobe_blacklist(unsigned long *start,
>          return ret ? : arch_populate_kprobe_blacklist();
>   }
> 
> +#ifdef CONFIG_MODULES
>   static void add_module_kprobe_blacklist(struct module *mod)
>   {
>          unsigned long start, end;
> @@ -2658,6 +2664,7 @@ static struct notifier_block kprobe_module_nb = {
>          .notifier_call = kprobes_module_callback,
>          .priority = 0
>   };
> +#endif /* CONFIG_MODULES */
> 
>   void kprobe_free_init_mem(void)
>   {
> @@ -2717,8 +2724,11 @@ static int __init init_kprobes(void)
>          err = arch_init_kprobes();
>          if (!err)
>                  err = register_die_notifier(&kprobe_exceptions_nb);
> +
> +#ifdef CONFIG_MODULES
>          if (!err)
>                  err = register_module_notifier(&kprobe_module_nb);
> +#endif
> 
>          kprobes_initialized = (err == 0);
>          kprobe_sysctls_init();
> diff --git a/kernel/module/main.c b/kernel/module/main.c
> index fed58d30725d..7fa182b78550 100644
> --- a/kernel/module/main.c
> +++ b/kernel/module/main.c
> @@ -1121,16 +1121,6 @@ resolve_symbol_wait(struct module *mod,
>          return ksym;
>   }
> 
> -void __weak module_memfree(void *module_region)
> -{
> -       /*
> -        * This memory may be RO, and freeing RO memory in an interrupt is not
> -        * supported by vmalloc.
> -        */
> -       WARN_ON(in_interrupt());
> -       vfree(module_region);
> -}
> -
>   void __weak module_arch_cleanup(struct module *mod)
>   {
>   }
> @@ -1606,13 +1596,6 @@ static void dynamic_debug_remove(struct module *mod, struct _ddebug *debug)
>                  ddebug_remove_module(mod->name);
>   }
> 
> -void * __weak module_alloc(unsigned long size)
> -{
> -       return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
> -                       GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS,
> -                       NUMA_NO_NODE, __builtin_return_address(0));
> -}
> -
>   bool __weak module_init_section(const char *name)
>   {
>          return strstarts(name, ".init");
> diff --git a/kernel/module_alloc.c b/kernel/module_alloc.c
> new file mode 100644
> index 000000000000..26a4c60998ad
> --- /dev/null
> +++ b/kernel/module_alloc.c
> @@ -0,0 +1,26 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +/*
> + * Copyright (C) 2002 Richard Henderson
> + * Copyright (C) 2001 Rusty Russell, 2002, 2010 Rusty Russell IBM.
> + */
> +
> +#include <linux/mm.h>
> +#include <linux/moduleloader.h>
> +#include <linux/vmalloc.h>
> +
> +void * __weak module_alloc(unsigned long size)
> +{
> +       return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
> +                       GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS,
> +                       NUMA_NO_NODE, __builtin_return_address(0));
> +}
> +
> +void __weak module_memfree(void *module_region)
> +{
> +       /*
> +        * This memory may be RO, and freeing RO memory in an interrupt is not
> +        * supported by vmalloc.
> +        */
> +       WARN_ON(in_interrupt());
> +       vfree(module_region);
> +}
> diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
> index 93507330462c..050b2975332e 100644
> --- a/kernel/trace/trace_kprobe.c
> +++ b/kernel/trace/trace_kprobe.c
> @@ -101,6 +101,7 @@ static nokprobe_inline bool trace_kprobe_has_gone(struct trace_kprobe *tk)
>          return kprobe_gone(&tk->rp.kp);
>   }
> 
> +#ifdef CONFIG_MODULES
>   static nokprobe_inline bool trace_kprobe_within_module(struct trace_kprobe *tk,
>                                                   struct module *mod)
>   {
> @@ -109,11 +110,13 @@ static nokprobe_inline bool trace_kprobe_within_module(struct trace_kprobe *tk,
> 
>          return strncmp(module_name(mod), name, len) == 0 && name[len] == ':';
>   }
> +#endif /* CONFIG_MODULES */
> 
>   static nokprobe_inline bool trace_kprobe_module_exist(struct trace_kprobe *tk)
>   {
> +       bool ret = false;
> +#ifdef CONFIG_MODULES
>          char *p;
> -       bool ret;
> 
>          if (!tk->symbol)
>                  return false;
> @@ -125,6 +128,7 @@ static nokprobe_inline bool trace_kprobe_module_exist(struct trace_kprobe *tk)
>          ret = !!find_module(tk->symbol);
>          rcu_read_unlock_sched();
>          *p = ':';
> +#endif /* CONFIG_MODULES */
> 
>          return ret;
>   }
> @@ -668,6 +672,7 @@ static int register_trace_kprobe(struct trace_kprobe *tk)
>          return ret;
>   }
> 
> +#ifdef CONFIG_MODULES
>   /* Module notifier call back, checking event on the module */
>   static int trace_kprobe_module_callback(struct notifier_block *nb,
>                                         unsigned long val, void *data)
> @@ -702,6 +707,7 @@ static struct notifier_block trace_kprobe_module_nb = {
>          .notifier_call = trace_kprobe_module_callback,
>          .priority = 1   /* Invoked after kprobe module callback */
>   };
> +#endif /* CONFIG_MODULES */
> 
>   static int __trace_kprobe_create(int argc, const char *argv[])
>   {
> @@ -1896,8 +1902,10 @@ static __init int init_kprobe_trace_early(void)
>          if (ret)
>                  return ret;
> 
> +#ifdef CONFIG_MODULES
>          if (register_module_notifier(&trace_kprobe_module_nb))
>                  return -EINVAL;
> +#endif /* CONFIG_MODULES */
> 
>          return 0;
>   }
> --
> 2.36.1
> 


More information about the linux-riscv mailing list