[RFC PATCH RESEND bpf-next 4/4] riscv, bpf: Add bpf trampoline support for RV64

Björn Töpel bjorn at kernel.org
Tue Jan 3 11:16:03 PST 2023


Pu Lehui <pulehui at huaweicloud.com> writes:

> From: Pu Lehui <pulehui at huawei.com>
>
> BPF trampoline is the critical infrastructure of the bpf
> subsystem, acting as a mediator between kernel functions
> and BPF programs. Numerous important features, such as
> using ebpf program for zero overhead kernel introspection,
> rely on this key component. We can't wait to support bpf
> trampoline on RV64. The implementation of bpf trampoline
> was closely to x86 and arm64 for future development. The
> related tests have passed, as well as the test_verifier
> with no new failure ceses.
>
> Signed-off-by: Pu Lehui <pulehui at huawei.com>
> ---
>  arch/riscv/net/bpf_jit_comp64.c | 322 ++++++++++++++++++++++++++++++++
>  1 file changed, 322 insertions(+)
>
> diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c
> index fa8b03c52463..11c001782e7b 100644
> --- a/arch/riscv/net/bpf_jit_comp64.c
> +++ b/arch/riscv/net/bpf_jit_comp64.c
> @@ -738,6 +738,328 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type poke_type,
>  	       bpf_text_poke_jump(ip, old_addr, new_addr);
>  }
>  
> +static void store_args(int nregs, int args_off, struct rv_jit_context *ctx)
> +{
> +	int i;
> +
> +	for (i = 0; i < nregs; i++) {
> +		emit_sd(RV_REG_FP, -args_off, RV_REG_A0 + i, ctx);
> +		args_off -= 8;
> +	}
> +}
> +
> +static void restore_args(int nregs, int args_off, struct rv_jit_context *ctx)
> +{
> +	int i;
> +
> +	for (i = 0; i < nregs; i++) {
> +		emit_ld(RV_REG_A0 + i, -args_off, RV_REG_FP, ctx);
> +		args_off -= 8;
> +	}
> +}
> +
> +static int invoke_bpf_prog(struct bpf_tramp_link *l, int args_off, int retval_off,
> +			   int run_ctx_off, bool save_ret, struct rv_jit_context *ctx)
> +{
> +	u32 insn;
> +	int ret, branch_off, offset;
> +	struct bpf_prog *p = l->link.prog;
> +	int cookie_off = offsetof(struct bpf_tramp_run_ctx, bpf_cookie);
> +
> +	if (l->cookie) {
> +		emit_imm(RV_REG_T1, l->cookie, ctx);
> +		emit_sd(RV_REG_FP, -run_ctx_off + cookie_off, RV_REG_T1, ctx);
> +	} else {
> +		emit_sd(RV_REG_FP, -run_ctx_off + cookie_off, RV_REG_ZERO, ctx);
> +	}
> +
> +	/* arg1: prog */
> +	emit_imm(RV_REG_A0, (const s64)p, ctx);
> +	/* arg2: &run_ctx */
> +	emit_addi(RV_REG_A1, RV_REG_FP, -run_ctx_off, ctx);
> +	ret = emit_call((const u64)bpf_trampoline_enter(p), true, ctx);
> +	if (ret)
> +		return ret;
> +
> +	/* if (__bpf_prog_enter(prog) == 0)
> +	 *	goto skip_exec_of_prog;
> +	 */
> +	branch_off = ctx->ninsns;
> +	/* nop reserved for conditional jump */
> +	emit(rv_nop(), ctx);
> +
> +	/* store prog start time */
> +	emit_mv(RV_REG_S1, RV_REG_A0, ctx);
> +
> +	/* arg1: &args_off */
> +	emit_addi(RV_REG_A0, RV_REG_FP, -args_off, ctx);
> +	if (!p->jited)
> +		/* arg2: progs[i]->insnsi for interpreter */
> +		emit_imm(RV_REG_A1, (const s64)p->insnsi, ctx);
> +	ret = emit_call((const u64)p->bpf_func, true, ctx);
> +	if (ret)
> +		return ret;
> +
> +	if (save_ret)
> +		emit_sd(RV_REG_FP, -retval_off, regmap[BPF_REG_0], ctx);
> +
> +	/* update branch with beqz */
> +	offset = ninsns_rvoff(ctx->ninsns - branch_off);
> +	insn = rv_beq(RV_REG_A0, RV_REG_ZERO, offset >> 1);
> +	*(u32 *)(ctx->insns + branch_off) = insn;
> +
> +	/* arg1: prog */
> +	emit_imm(RV_REG_A0, (const s64)p, ctx);
> +	/* arg2: prog start time */
> +	emit_mv(RV_REG_A1, RV_REG_S1, ctx);
> +	/* arg3: &run_ctx */
> +	emit_addi(RV_REG_A2, RV_REG_FP, -run_ctx_off, ctx);
> +	ret = emit_call((const u64)bpf_trampoline_exit(p), true, ctx);
> +
> +	return ret;
> +}
> +
> +static int invoke_bpf_mod_ret(struct bpf_tramp_links *tl, int args_off, int retval_off,
> +			      int run_ctx_off, int *branches_off, struct rv_jit_context *ctx)
> +{
> +	int i, ret;
> +
> +	/* cleanup to avoid garbage return value confusion */
> +	emit_sd(RV_REG_FP, -retval_off, RV_REG_ZERO, ctx);
> +	for (i = 0; i < tl->nr_links; i++) {
> +		ret = invoke_bpf_prog(tl->links[i], args_off, retval_off,
> +				run_ctx_off, true, ctx);
> +		if (ret)
> +			return ret;
> +		emit_ld(RV_REG_T1, -retval_off, RV_REG_FP, ctx);
> +		branches_off[i] = ctx->ninsns;
> +		/* nop reserved for conditional jump */
> +		emit(rv_nop(), ctx);
> +	}
> +
> +	return 0;
> +}
> +
> +static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im,
> +					 const struct btf_func_model *m,
> +					 struct bpf_tramp_links *tlinks,
> +					 void *func_addr, u32 flags,
> +					 struct rv_jit_context *ctx)
> +{
> +	int i, ret, offset;
> +	int *branches_off = NULL;
> +	int stack_size = 0, nregs = m->nr_args;
> +	int retaddr_off, fp_off, retval_off, args_off;
> +	int nregs_off, ip_off, run_ctx_off, sreg_off;
> +	struct bpf_tramp_links *fentry = &tlinks[BPF_TRAMP_FENTRY];
> +	struct bpf_tramp_links *fexit = &tlinks[BPF_TRAMP_FEXIT];
> +	struct bpf_tramp_links *fmod_ret = &tlinks[BPF_TRAMP_MODIFY_RETURN];
> +	void *orig_call = func_addr;
> +	bool save_ret;
> +	u32 insn;
> +
> +	/* Generated trampoline stack layout:
> +	 *
> +	 * FP - 8	    [ RA of parent func	] return address of parent
> +	 *					  function
> +	 * FP - retaddr_off [ RA of traced func	] return address of traced
> +	 *					  function
> +	 * FP - fp_off	    [ FP of parent func ]
> +	 *
> +	 * FP - retval_off  [ return value      ] BPF_TRAMP_F_CALL_ORIG or
> +	 *					  BPF_TRAMP_F_RET_FENTRY_RET
> +	 *                  [ argN              ]
> +	 *                  [ ...               ]
> +	 * FP - args_off    [ arg1              ]
> +	 *
> +	 * FP - nregs_off   [ regs count        ]
> +	 *
> +	 * FP - ip_off      [ traced func	] BPF_TRAMP_F_IP_ARG
> +	 *
> +	 * FP - run_ctx_off [ bpf_tramp_run_ctx ]
> +	 *
> +	 * FP - sreg_off    [ callee saved reg	]
> +	 *
> +	 *		    [ pads              ] pads for 16 bytes alignment
> +	 */
> +
> +	if (flags & (BPF_TRAMP_F_ORIG_STACK | BPF_TRAMP_F_SHARE_IPMODIFY))
> +		return -ENOTSUPP;
> +
> +	/* extra regiters for struct arguments */
> +	for (i = 0; i < m->nr_args; i++)
> +		if (m->arg_flags[i] & BTF_FMODEL_STRUCT_ARG)
> +			nregs += round_up(m->arg_size[i], 8) / 8 - 1;
> +
> +	/* 8 arguments passed by registers */
> +	if (nregs > 8)
> +		return -ENOTSUPP;
> +
> +	/* room for parent function return address */
> +	stack_size += 8;
> +
> +	stack_size += 8;
> +	retaddr_off = stack_size;
> +
> +	stack_size += 8;
> +	fp_off = stack_size;
> +
> +	save_ret = flags & (BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_RET_FENTRY_RET);
> +	if (save_ret) {
> +		stack_size += 8;
> +		retval_off = stack_size;
> +	}
> +
> +	stack_size += nregs * 8;
> +	args_off = stack_size;
> +
> +	stack_size += 8;
> +	nregs_off = stack_size;
> +
> +	if (flags & BPF_TRAMP_F_IP_ARG) {
> +		stack_size += 8;
> +		ip_off = stack_size;
> +	}
> +
> +	stack_size += round_up(sizeof(struct bpf_tramp_run_ctx), 8);
> +	run_ctx_off = stack_size;
> +
> +	stack_size += 8;
> +	sreg_off = stack_size;
> +
> +	stack_size = round_up(stack_size, 16);
> +
> +	emit_addi(RV_REG_SP, RV_REG_SP, -stack_size, ctx);
> +
> +	emit_sd(RV_REG_SP, stack_size - retaddr_off, RV_REG_RA, ctx);
> +	emit_sd(RV_REG_SP, stack_size - fp_off, RV_REG_FP, ctx);
> +
> +	emit_addi(RV_REG_FP, RV_REG_SP, stack_size, ctx);
> +
> +	/* callee saved register S1 to pass start time */
> +	emit_sd(RV_REG_FP, -sreg_off, RV_REG_S1, ctx);
> +
> +	/* store ip address of the traced function */
> +	if (flags & BPF_TRAMP_F_IP_ARG) {
> +		emit_imm(RV_REG_T1, (const s64)func_addr, ctx);
> +		emit_sd(RV_REG_FP, -ip_off, RV_REG_T1, ctx);
> +	}
> +
> +	emit_li(RV_REG_T1, nregs, ctx);
> +	emit_sd(RV_REG_FP, -nregs_off, RV_REG_T1, ctx);
> +
> +	store_args(nregs, args_off, ctx);
> +
> +	/* skip to actual body of traced function */
> +	if (flags & BPF_TRAMP_F_SKIP_FRAME)
> +		orig_call += 16;
> +
> +	if (flags & BPF_TRAMP_F_CALL_ORIG) {
> +		emit_imm(RV_REG_A0, (const s64)im, ctx);
> +		ret = emit_call((const u64)__bpf_tramp_enter, true, ctx);
> +		if (ret)
> +			return ret;
> +	}
> +
> +	for (i = 0; i < fentry->nr_links; i++) {
> +		ret = invoke_bpf_prog(fentry->links[i], args_off, retval_off, run_ctx_off,
> +				      flags & BPF_TRAMP_F_RET_FENTRY_RET, ctx);
> +		if (ret)
> +			return ret;
> +	}
> +
> +	if (fmod_ret->nr_links) {
> +		branches_off = kcalloc(fmod_ret->nr_links, sizeof(int), GFP_KERNEL);
> +		if (!branches_off)
> +			return -ENOMEM;
> +
> +		ret = invoke_bpf_mod_ret(fmod_ret, args_off, retval_off, run_ctx_off,
> +					 branches_off, ctx);
> +		if (ret)
> +			return ret;
> +	}
> +
> +	if (flags & BPF_TRAMP_F_CALL_ORIG) {
> +		restore_args(nregs, args_off, ctx);
> +		ret = emit_call((const u64)orig_call, true, ctx);
> +		if (ret)
> +			return ret;
> +		emit_sd(RV_REG_FP, -retval_off, RV_REG_A0, ctx);
> +		/* nop reserved for bpf_tramp_image_put */
> +		im->ip_after_call = ctx->insns + ctx->ninsns;
> +		emit(rv_nop(), ctx);
> +	}
> +
> +	/* update branches saved in invoke_bpf_mod_ret with bnez */
> +	for (i = 0; i < fmod_ret->nr_links; i++) {
> +		offset = ninsns_rvoff(ctx->ninsns - branches_off[i]);
> +		insn = rv_bne(RV_REG_T1, RV_REG_ZERO, offset >> 1);
> +		*(u32 *)(ctx->insns + branches_off[i]) = insn;
> +	}
> +
> +	for (i = 0; i < fexit->nr_links; i++) {
> +		ret = invoke_bpf_prog(fexit->links[i], args_off, retval_off,
> +				      run_ctx_off, false, ctx);
> +		if (ret)
> +			return ret;
> +	}
> +
> +	if (flags & BPF_TRAMP_F_CALL_ORIG) {
> +		im->ip_epilogue = ctx->insns + ctx->ninsns;
> +		emit_imm(RV_REG_A0, (const s64)im, ctx);
> +		ret = emit_call((const u64)__bpf_tramp_exit, true, ctx);
> +		if (ret)
> +			return ret;
> +	}
> +
> +	if (flags & BPF_TRAMP_F_RESTORE_REGS)
> +		restore_args(nregs, args_off, ctx);
> +
> +	if (save_ret)
> +		emit_ld(RV_REG_A0, -retval_off, RV_REG_FP, ctx);
> +
> +	emit_ld(RV_REG_S1, -sreg_off, RV_REG_FP, ctx);
> +
> +	if (flags & BPF_TRAMP_F_SKIP_FRAME)
> +		/* return address of parent function */
> +		emit_ld(RV_REG_RA, stack_size - 8, RV_REG_SP, ctx);
> +	else
> +		/* return address of traced function */
> +		emit_ld(RV_REG_RA, stack_size - retaddr_off, RV_REG_SP, ctx);
> +
> +	emit_ld(RV_REG_FP, stack_size - fp_off, RV_REG_SP, ctx);
> +	emit_addi(RV_REG_SP, RV_REG_SP, stack_size, ctx);
> +
> +	emit_jalr(RV_REG_ZERO, RV_REG_RA, 0, ctx);
> +
> +	bpf_flush_icache(ctx->insns, ctx->insns + ctx->ninsns);
> +
> +	kfree(branches_off);
> +
> +	return ctx->ninsns;
> +
> +}
> +
> +int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image,
> +				void *image_end, const struct btf_func_model *m,
> +				u32 flags, struct bpf_tramp_links *tlinks,
> +				void *func_addr)
> +{
> +	int ret;
> +	struct rv_jit_context ctx;
> +
> +	ctx.ninsns = 0;
> +	ctx.insns = image;
> +	ret = __arch_prepare_bpf_trampoline(im, m, tlinks, func_addr, flags, &ctx);
> +	if (ret < 0)
> +		return ret;
> +
> +	if (ninsns_rvoff(ret) > (long)image_end - (long)image)
> +		return -EFBIG;

This looks risky! First you generate the image, and here you realize
that you already wrote in all the wrong places?!

> +
> +	return ninsns_rvoff(ret);

Ok, this was a bit subtle to me. The return value of the this function
is used in kernel/bpf/bpf_struct_ops.c. Now I know! :-)


Thanks!
Björn



More information about the linux-riscv mailing list