[RFC PATCH -next v2 3/4] arm64/ftrace: support dynamically allocated trampolines

Mark Rutland mark.rutland at arm.com
Thu Apr 21 06:10:04 PDT 2022


On Wed, Mar 16, 2022 at 06:01:31PM +0800, Wang ShaoBo wrote:
> From: Cheng Jian <cj.chengjian at huawei.com>
> 
> When multiple functions are traced with custom ftrace_ops, the list
> function is called from ftrace_(regs)_caller, which makes every traced
> function recheck the hash of each ftrace_ops whenever tracing happens;
> this is clearly inefficient.

... and when does that actually matter? Who does this and why?

> We noticed that x86_64 already provides dynamically allocated
> trampolines which call the callback directly to solve this problem.
> This patch introduces similar trampolines for ARM64, though we also
> have to handle long jumps differently.
> 
> Similar to x86_64, the local ftrace_ops is saved at the end of the
> trampoline. The ftrace trampoline layout:
> 
>          ftrace_(regs_)caller_tramp            low
>                    `--> +---------------------+ ^
>                         | ftrace_regs_entry:  | |
>                         | ...                 | |
>                         +---------------------+ |
>                         | ftrace_common:      | |
>                         | ...                 | |
>                         | ldr x2, <ftrace_ops>| |
>        ftrace callsite  | ...                 | |
>                    `--> +---------------------+ |
>                         | nop                  >> bl <callback>
>  ftrace graph callsite  | PLT entries (TODO)  | |
>                    `--> +---------------------+ |
>                         | nop                  >> bl <graph callback>
>                         | PLT entries (TODO)  | |
>                         +---------------------+ |
>                         | ftrace_regs_restore:| |
>                         | ...                 | |
>                         +---------------------+ |
>                         | ftrace_ops          | |
>                         +---------------------+ |
>                                                high

Chengming Zhou has patches to unify the two calls in the trampoline:

  https://lore.kernel.org/linux-arm-kernel/20220420160006.17880-1-zhouchengming@bytedance.com/

... and I'd much prefer that we do that *before* we copy-paste that code
for dynamic trampolines.

> Adopting the new dynamic trampoline is beneficial when only one
> callback is registered for a function: when tracing happens, that
> ftrace_ops gets its own callback directly from its allocated
> trampoline.
> 
> To compare the efficiency of calling schedule() with traditional
> dynamic ftrace (using ftrace_ops_list) against ftrace with a dynamic
> trampoline, we wrote a module that registers multiple ftrace_ops, of
> which only one (filtering 'schedule' in our testcase) is used to
> measure the performance on the call path used by UnixBench.
> 
> This is the relevant part of the code:
> ```
>      static struct ftrace_ops *ops_array[100];
>      static struct ftrace_ops ops = {
>              .func                    = my_callback_func,
>              .flags                   = FTRACE_OPS_FL_PID,
>      };
>      static int register_ftrace_test_init(void)
>      {
>              int i, ret;
> 
>              if (cnt > 100 || cnt < 0)
>                      return -1;
> 
>              for (i = 0; i < cnt; i++) {
>                      ops_array[i] = kzalloc(sizeof(struct ftrace_ops), GFP_KERNEL);
>                      if (!ops_array[i])
>                              return -1;
> 
>                      *ops_array[i] = ops;
> 
>                      ret = ftrace_set_filter(ops_array[i], "cpuset_write_u64",
>                                  strlen("cpuset_write_u64"), 1);
>                      if (ret)
>                              return ret;
> 
>                      ret = register_ftrace_function(ops_array[i]);
>                      if (ret)
>                              return ret;
>              }
> 
>              ret = ftrace_set_filter(&ops, "schedule", strlen("schedule"), 1);
>              if (ret)
>                      return ret;
>              return register_ftrace_function(&ops);
>      }
> ```
> 
> So in our test, ftrace_ops_list can be illustrated:
> 
> HEAD(ftrace_ops_list)
> `-> +--------+    +--------+    +--------+     +------------+    +--------+
>     +  ops0  + -> +  ops1  + -> +  ops2  + ... +  ops(cnt-1)+ -> +   ops  +
>     +--------+    +--------+    +--------+     +------------+    +--------+ \_END
>         `-------------cpuset_write_u64----------------`           `schedule`
> 
> These are the results measured on a Kunpeng 920 with
> CONFIG_RANDOMIZE_BASE=n (an extra 'NA' row, with no function traced at
> all, is included for comparison):
> 
>     command: taskset -c 1 ./Run -c 1 context1
>     +------------------UnixBench context1--------------------+
>     +---------+--------------+-----------------+-------------+
>     +         +            Score               + improvement +
>     +         +--------------+-----------------+-------------+
>     +         +  traditional +  dynamic tramp  +     +/-     +
>     +---------+--------------+-----------------+-------------+
>     +   NA    +    554.6     +     553.6       +      -      +
>     +---------+--------------+-----------------+-------------+
>     +  cnt=0  +    542.4     +     549.7       +    +1.2%    +
>     +---------+--------------+-----------------+-------------+
>     +  cnt=3  +    528.7     +     549.7       +    +3.9%    +
>     +---------+--------------+-----------------+-------------+
>     +  cnt=6  +    523.9     +     549.8       +    +4.9%    +
>     +---------+--------------+-----------------+-------------+
>     +  cnt=15 +    504.2     +     549.0       +    +8.9%    +
>     +---------+--------------+-----------------+-------------+
>     +  cnt=20 +    492.6     +     551.1       +    +11.9%   +
>     +---------+--------------+-----------------+-------------+
> 
> References:
> - https://lore.kernel.org/linux-arm-kernel/20200109142736.1122-1-cj.chengjian@huawei.com/
> 
> Signed-off-by: Cheng Jian <cj.chengjian at huawei.com>
> Signed-off-by: Wang ShaoBo <bobo.shaobowang at huawei.com>
> ---
>  arch/arm64/kernel/ftrace.c | 286 +++++++++++++++++++++++++++++++++++++
>  1 file changed, 286 insertions(+)
> 
> diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c
> index 4506c4a90ac1..d35a042baf75 100644
> --- a/arch/arm64/kernel/ftrace.c
> +++ b/arch/arm64/kernel/ftrace.c
> @@ -10,12 +10,14 @@
>  #include <linux/module.h>
>  #include <linux/swab.h>
>  #include <linux/uaccess.h>
> +#include <linux/vmalloc.h>
>  
>  #include <asm/cacheflush.h>
>  #include <asm/debug-monitors.h>
>  #include <asm/ftrace.h>
>  #include <asm/insn.h>
>  #include <asm/patching.h>
> +#include <asm/set_memory.h>
>  
>  #ifdef CONFIG_DYNAMIC_FTRACE
>  /*
> @@ -48,6 +50,290 @@ static int ftrace_modify_code(unsigned long pc, u32 old, u32 new,
>  	return 0;
>  }
>  
> +/* ftrace dynamic trampolines */
> +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS
> +#ifdef CONFIG_MODULES
> +#include <linux/moduleloader.h>
> +
> +static inline void *alloc_tramp(unsigned long size)
> +{
> +	return module_alloc(size);
> +}
> +
> +static inline void tramp_free(void *tramp)
> +{
> +	module_memfree(tramp);
> +}
> +#else
> +static inline void *alloc_tramp(unsigned long size)
> +{
> +	return NULL;
> +}
> +
> +static inline void tramp_free(void *tramp) {}
> +#endif
> +
> +/* ftrace_caller trampoline template */
> +extern void ftrace_caller_tramp(void);
> +extern void ftrace_caller_tramp_end(void);
> +extern void ftrace_caller_tramp_call(void);
> +extern void ftrace_caller_tramp_graph_call(void);
> +extern void ftrace_caller_tramp_ops(void);
> +
> +/* ftrace_regs_caller trampoline template */
> +extern void ftrace_regs_caller_tramp(void);
> +extern void ftrace_regs_caller_tramp_end(void);
> +extern void ftrace_regs_caller_tramp_call(void);
> +extern void ftrace_regs_caller_tramp_graph_call(void);
> +extern void ftrace_regs_caller_tramp_ops(void);
> +
> +
> +/*
> + * The ARM64 ftrace trampoline layout:
> + *
> + *         ftrace_(regs_)caller_tramp            low
> + *                   `--> +---------------------+ ^
> + *                        | ftrace_regs_entry:  | |
> + *                        | ...                 | |
> + *                        +---------------------+ |
> + *                        | ftrace_common:      | |
> + *                        | ...                 | |
> + *                        | ldr x2, <ftrace_ops>| |
> + *       ftrace callsite  | ...                 | |
> + *                   `--> +---------------------+ |
> + *                        | nop                  >> bl <callback>
> + * ftrace graph callsite  | PLT entries (TODO)  | |
> + *                   `--> +---------------------+ |
> + *                        | nop                  >> bl <graph callback>
> + *                        | PLT entries (TODO)  | |
> + *                        +---------------------+ |
> + *                        | ftrace_regs_restore:| |
> + *                        | ...                 | |
> + *                        +---------------------+ |
> + *                        | ftrace_ops          | |
> + *                        +---------------------+ |
> + *                                               high
> + */

As above, I'd prefer we unify the callsite and graph callsite first.

> +
> +static unsigned long
> +create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
> +{
> +	unsigned long start_offset, end_offset, ops_offset;
> +	unsigned long caller_size, size, offset;
> +	unsigned long pc, npages;
> +	unsigned long *ptr;
> +	void *trampoline;
> +	u32 load;
> +	int ret;
> +
> +	if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) {
> +		start_offset = (unsigned long)ftrace_regs_caller_tramp;
> +		end_offset = (unsigned long)ftrace_regs_caller_tramp_end;
> +		ops_offset = (unsigned long)ftrace_regs_caller_tramp_ops;
> +	} else {
> +		start_offset = (unsigned long)ftrace_caller_tramp;
> +		end_offset = (unsigned long)ftrace_caller_tramp_end;
> +		ops_offset = (unsigned long)ftrace_caller_tramp_ops;
> +	}
> +
> +	caller_size = end_offset - start_offset + AARCH64_INSN_SIZE;
> +	size = caller_size + sizeof(void *);
> +
> +	trampoline = alloc_tramp(size);
> +	if (!trampoline)
> +		return 0;
> +
> +	*tramp_size = size;
> +	npages = DIV_ROUND_UP(*tramp_size, PAGE_SIZE);
> +
> +	/*
> +	 * copy ftrace_caller/ftrace_regs_caller trampoline template
> +	 * onto the trampoline memory
> +	 */
> +	ret = copy_from_kernel_nofault(trampoline, (void *)start_offset, caller_size);
> +	if (WARN_ON(ret < 0))
> +		goto free;

Why is this using copy_from_kernel_nofault() ?

There's no reason whatsoever that this should legitimately fault, so it
should be fine to use memcpy().
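
i.e. (untested) the copy could simply be:

  /*
   * Both the template and the freshly allocated trampoline are plain
   * kernel mappings, so this cannot legitimately fault.
   */
  memcpy(trampoline, (void *)start_offset, caller_size);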

> +
> +	/*
> +	 * Store the ftrace_ops at the end of the trampoline.
> +	 * This will be used to load the third parameter for the callback.
> +	 * Basically, that location at the end of the trampoline takes the
> +	 * place of the global function_trace_op variable.
> +	 */
> +	ptr = (unsigned long *)(trampoline + caller_size);
> +	*ptr = (unsigned long)ops;
> +
> +	/*
> +	 * Update the trampoline ops REF
> +	 *	ldr x2, <ftrace_ops>
> +	 */
> +	ops_offset -= start_offset;
> +	pc = (unsigned long)trampoline + ops_offset;
> +	offset = (unsigned long)ptr - pc;
> +	if (WARN_ON(offset % AARCH64_INSN_SIZE != 0))
> +		goto free;
> +
> +	load = aarch64_insn_gen_load_literal(AARCH64_INSN_REG_2,
> +		AARCH64_INSN_VARIANT_64BIT, offset);
> +	if (ftrace_modify_code(pc, 0, load, false))
> +		goto free;
> +
> +	ops->flags |= FTRACE_OPS_FL_ALLOC_TRAMP;
> +
> +	set_vm_flush_reset_perms(trampoline);
> +
> +	/*
> +	 * Complete the module allocation by making the trampoline
> +	 * read-only and executable; any further modification must go
> +	 * through the instruction patching interface.
> +	 */
> +	set_memory_ro((unsigned long)trampoline, npages);
> +	set_memory_x((unsigned long)trampoline, npages);
> +

Where is the cache maintenance performed?
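
As a strawman (untested): after the instructions have been written and
patched, and before the trampoline can ever be branched to, I'd expect
something along the lines of:

  /*
   * Make the copied and patched instructions visible to the I-side
   * before the trampoline is executed.
   */
  flush_icache_range((unsigned long)trampoline,
                     (unsigned long)trampoline + size);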

> +	return (unsigned long)trampoline;
> +
> +free:
> +	tramp_free(trampoline);
> +	return 0;
> +}
> +
> +static unsigned long calc_trampoline_call_offset(bool save_regs, bool is_graph)
> +{
> +	unsigned long start_offset, call_offset;
> +
> +	if (save_regs) {
> +		start_offset = (unsigned long)ftrace_regs_caller_tramp;
> +		if (is_graph)
> +			call_offset = (unsigned long)ftrace_regs_caller_tramp_graph_call;
> +		else
> +			call_offset = (unsigned long)ftrace_regs_caller_tramp_call;
> +	} else {
> +		start_offset = (unsigned long)ftrace_caller_tramp;
> +		if (is_graph)
> +			call_offset = (unsigned long)ftrace_caller_tramp_graph_call;
> +		else
> +			call_offset = (unsigned long)ftrace_caller_tramp_call;
> +	}

This should be simplified by unifying the call sites.

Since we need the offsets in a few places, we should factor that out.
For example, have a structure with those offsets, and a helper which
returns it filled with either the regular offsets or the regs offsets.
More ideally, fill those in at compile-time.
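
e.g. a completely untested sketch (the struct and helper names below are
invented; it reuses the existing template labels):

  struct tramp_offsets {
          unsigned long start;            /* template base address */
          unsigned long end;              /* offset of the end label */
          unsigned long call;             /* offset of the callback bl site */
          unsigned long graph_call;       /* offset of the graph bl site */
          unsigned long ops;              /* offset of the ldr x2, <ftrace_ops> */
  };

  static void get_tramp_offsets(struct tramp_offsets *offs, bool save_regs)
  {
          unsigned long start;

          if (save_regs) {
                  start = (unsigned long)ftrace_regs_caller_tramp;
                  offs->end = (unsigned long)ftrace_regs_caller_tramp_end - start;
                  offs->call = (unsigned long)ftrace_regs_caller_tramp_call - start;
                  offs->graph_call = (unsigned long)ftrace_regs_caller_tramp_graph_call - start;
                  offs->ops = (unsigned long)ftrace_regs_caller_tramp_ops - start;
          } else {
                  start = (unsigned long)ftrace_caller_tramp;
                  offs->end = (unsigned long)ftrace_caller_tramp_end - start;
                  offs->call = (unsigned long)ftrace_caller_tramp_call - start;
                  offs->graph_call = (unsigned long)ftrace_caller_tramp_graph_call - start;
                  offs->ops = (unsigned long)ftrace_caller_tramp_ops - start;
          }

          offs->start = start;
  }

... so that create_trampoline(), calc_trampoline_call_offset(), and
arch_ftrace_trampoline_func() can all share one lookup rather than
open-coding the SAVE_REGS checks.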

> +
> +	return call_offset - start_offset;
> +}
> +
> +static inline ftrace_func_t
> +ftrace_trampoline_get_func(struct ftrace_ops *ops, bool is_graph)
> +{
> +	ftrace_func_t func;
> +
> +	if (!is_graph)
> +		func = ftrace_ops_get_func(ops);
> +#ifdef CONFIG_FUNCTION_GRAPH_TRACER
> +	else
> +		func = (ftrace_func_t)ftrace_graph_caller;
> +#endif
> +
> +	return func;
> +}
> +
> +static int
> +ftrace_trampoline_modify_call(struct ftrace_ops *ops, bool is_graph, bool active)
> +{
> +	unsigned long offset;
> +	unsigned long pc;
> +	ftrace_func_t func;
> +	u32 nop, branch;
> +
> +	offset = calc_trampoline_call_offset(ops->flags &
> +				FTRACE_OPS_FL_SAVE_REGS, is_graph);
> +	pc = ops->trampoline + offset;
> +
> +	func = ftrace_trampoline_get_func(ops, is_graph);
> +	nop = aarch64_insn_gen_nop();
> +	branch = aarch64_insn_gen_branch_imm(pc, (unsigned long)func,
> +				AARCH64_INSN_BRANCH_LINK);
> +
> +	if (active)
> +		return ftrace_modify_code(pc, 0, branch, false);
> +	else
> +		return ftrace_modify_code(pc, 0, nop, false);
> +}
> +
> +extern int ftrace_graph_active;
> +void arch_ftrace_update_trampoline(struct ftrace_ops *ops)
> +{
> +	unsigned int size;
> +
> +	/*
> +	 * If KASLR is enabled, the trampoline and the ftrace callsite may
> +	 * be too far apart for a direct branch, so long jump support is
> +	 * required.
> +	 */
> +	if (IS_ENABLED(CONFIG_RANDOMIZE_BASE))
> +		return;

IIUC this can be required in some unusual configurations *today* even
without KASLR, so we probably want to add the long jump support first.
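
For reference, the long jump will probably want a PLT-style veneer
("ldr x16, <literal>; br x16") placed within LDR-literal range of the
callsite. A rough and untested sketch only (the helper name is invented,
and it reuses the aarch64_insn_gen_load_literal() form used earlier in
this series); allocating the veneer and its literal close enough to the
callsite is the hard part, and isn't shown:

  /*
   * Emit "ldr x16, <literal>; br x16", with the trampoline address in
   * the literal. x16/x17 are the intra-procedure-call scratch
   * registers, so x16 is safe for a veneer to clobber. The literal
   * must live within +/-1MiB of the LDR for this encoding.
   */
  static void emit_long_jump(u32 *insn, u64 *literal, unsigned long tramp)
  {
          unsigned long pc = (unsigned long)insn;

          *literal = tramp;
          insn[0] = aarch64_insn_gen_load_literal(AARCH64_INSN_REG_16,
                                                  AARCH64_INSN_VARIANT_64BIT,
                                                  (unsigned long)literal - pc);
          insn[1] = aarch64_insn_gen_branch_reg(AARCH64_INSN_REG_16,
                                                AARCH64_INSN_BRANCH_NOLINK);
  }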

Thanks,
Mark.

> +
> +	if (!ops->trampoline) {
> +		ops->trampoline = create_trampoline(ops, &size);
> +		if (!ops->trampoline)
> +			return;
> +		ops->trampoline_size = size;
> +	}
> +
> +	/* There is no static trampoline on ARM64 */
> +	WARN_ON(!(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP));
> +
> +	/* atomic modify trampoline: <call func> */
> +	WARN_ON(ftrace_trampoline_modify_call(ops, false, true));
> +#ifdef CONFIG_FUNCTION_GRAPH_TRACER
> +	/* atomic modify trampoline: <call graph func> */
> +	WARN_ON(ftrace_trampoline_modify_call(ops, true, ftrace_graph_active));
> +#endif
> +}
> +
> +static void *addr_from_call(void *ptr)
> +{
> +	u32 insn;
> +	unsigned long offset;
> +
> +	if (aarch64_insn_read(ptr, &insn))
> +		return NULL;
> +
> +	/* Make sure this is a call */
> +	if (!aarch64_insn_is_bl(insn)) {
> +		pr_warn("Expected a bl instruction\n");
> +		return NULL;
> +	}
> +
> +	offset = aarch64_get_branch_offset(insn);
> +
> +	return (void *)ptr + AARCH64_INSN_SIZE + offset;
> +}
> +
> +void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent,
> +			   unsigned long frame_pointer);
> +
> +void *arch_ftrace_trampoline_func(struct ftrace_ops *ops, struct dyn_ftrace *rec)
> +{
> +	unsigned long offset;
> +
> +	/* If we didn't allocate this trampoline, consider it static */
> +	if (!ops || !(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP))
> +		return NULL;
> +
> +	offset = calc_trampoline_call_offset(ops->flags & FTRACE_OPS_FL_SAVE_REGS,
> +					ftrace_graph_active);
> +
> +	return addr_from_call((void *)ops->trampoline + offset);
> +}
> +
> +void arch_ftrace_trampoline_free(struct ftrace_ops *ops)
> +{
> +	if (!ops || !(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP))
> +		return;
> +
> +	tramp_free((void *)ops->trampoline);
> +	ops->trampoline = 0;
> +	ops->trampoline_size = 0;
> +}
> +#endif /* CONFIG_DYNAMIC_FTRACE_WITH_REGS */
> +
>  /*
>   * Replace tracer function in ftrace_caller()
>   */
> -- 
> 2.25.1
> 


