[PATCH bpf-next v3 3/3] bpf, riscv: use prog pack allocator in the BPF JIT

Pu Lehui pulehui at huawei.com
Mon Aug 28 19:54:48 PDT 2023



On 2023/8/29 0:59, Puranjay Mohan wrote:
> Use bpf_jit_binary_pack_alloc() for memory management of JIT binaries in
> the RISCV BPF JIT. bpf_jit_binary_pack_alloc() creates a pair of RW and RX
> buffers. The JIT writes the program into the RW buffer. When the JIT is
> done, the program is copied to the final RX buffer with
> bpf_jit_binary_pack_finalize().
> 
> Implement bpf_arch_text_copy() and bpf_arch_text_invalidate() for RISCV
> JIT as these functions are required by bpf_jit_binary_pack allocator.
> 
> Signed-off-by: Puranjay Mohan <puranjay12 at gmail.com>
> Reviewed-by: Song Liu <song at kernel.org>
> ---
>   arch/riscv/net/bpf_jit.h        |   3 +
>   arch/riscv/net/bpf_jit_comp64.c |  60 ++++++++++++++----
>   arch/riscv/net/bpf_jit_core.c   | 106 +++++++++++++++++++++++++++-----
>   3 files changed, 141 insertions(+), 28 deletions(-)
> 
> diff --git a/arch/riscv/net/bpf_jit.h b/arch/riscv/net/bpf_jit.h
> index d21c6c92a683..a5ce1ab76ece 100644
> --- a/arch/riscv/net/bpf_jit.h
> +++ b/arch/riscv/net/bpf_jit.h
> @@ -68,6 +68,7 @@ static inline bool is_creg(u8 reg)
>   struct rv_jit_context {
>   	struct bpf_prog *prog;
>   	u16 *insns;		/* RV insns */
> +	u16 *ro_insns;
>   	int ninsns;
>   	int prologue_len;
>   	int epilogue_offset;
> @@ -85,7 +86,9 @@ static inline int ninsns_rvoff(int ninsns)
>   
>   struct rv_jit_data {
>   	struct bpf_binary_header *header;
> +	struct bpf_binary_header *ro_header;
>   	u8 *image;
> +	u8 *ro_image;
>   	struct rv_jit_context ctx;
>   };
>   
> diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c
> index 8423f4ddf8f5..ecd3ae6f4116 100644
> --- a/arch/riscv/net/bpf_jit_comp64.c
> +++ b/arch/riscv/net/bpf_jit_comp64.c
> @@ -144,7 +144,11 @@ static bool in_auipc_jalr_range(s64 val)
>   /* Emit fixed-length instructions for address */
>   static int emit_addr(u8 rd, u64 addr, bool extra_pass, struct rv_jit_context *ctx)
>   {
> -	u64 ip = (u64)(ctx->insns + ctx->ninsns);
> +	/*
> +	 * Use the ro_insns(RX) to calculate the offset as the BPF program will
> +	 * finally run from this memory region.
> +	 */
> +	u64 ip = (u64)(ctx->ro_insns + ctx->ninsns);
>   	s64 off = addr - ip;
>   	s64 upper = (off + (1 << 11)) >> 12;
>   	s64 lower = off & 0xfff;
> @@ -464,8 +468,12 @@ static int emit_call(u64 addr, bool fixed_addr, struct rv_jit_context *ctx)
>   	s64 off = 0;
>   	u64 ip;
>   
> -	if (addr && ctx->insns) {
> -		ip = (u64)(long)(ctx->insns + ctx->ninsns);
> +	if (addr && ctx->insns && ctx->ro_insns) {
> +		/*
> +		 * Use the ro_insns(RX) to calculate the offset as the BPF
> +		 * program will finally run from this memory region.
> +		 */
> +		ip = (u64)(long)(ctx->ro_insns + ctx->ninsns);
>   		off = addr - ip;
>   	}
>   
> @@ -578,9 +586,10 @@ static int add_exception_handler(const struct bpf_insn *insn,
>   {
>   	struct exception_table_entry *ex;
>   	unsigned long pc;
> -	off_t offset;
> +	off_t ins_offset;
> +	off_t fixup_offset;
>   
> -	if (!ctx->insns || !ctx->prog->aux->extable ||
> +	if (!ctx->insns || !ctx->ro_insns || !ctx->prog->aux->extable ||
>   	    (BPF_MODE(insn->code) != BPF_PROBE_MEM && BPF_MODE(insn->code) != BPF_PROBE_MEMSX))
>   		return 0;
>   
> @@ -594,12 +603,17 @@ static int add_exception_handler(const struct bpf_insn *insn,
>   		return -EINVAL;
>   
>   	ex = &ctx->prog->aux->extable[ctx->nexentries];
> -	pc = (unsigned long)&ctx->insns[ctx->ninsns - insn_len];
> +	pc = (unsigned long)&ctx->ro_insns[ctx->ninsns - insn_len];
>   
> -	offset = pc - (long)&ex->insn;
> -	if (WARN_ON_ONCE(offset >= 0 || offset < INT_MIN))
> +	/*
> +	 * This is the offset of the instruction that may fault, relative to
> +	 * its exception table entry. This will be written to the exception
> +	 * table and if this instruction faults, the destination register will
> +	 * be set to '0' and the execution will jump to the next instruction.
> +	 */
> +	ins_offset = pc - (long)&ex->insn;
> +	if (WARN_ON_ONCE(ins_offset >= 0 || ins_offset < INT_MIN))
>   		return -ERANGE;
> -	ex->insn = offset;
>   
>   	/*
>   	 * Since the extable follows the program, the fixup offset is always
> @@ -608,12 +622,25 @@ static int add_exception_handler(const struct bpf_insn *insn,
>   	 * bits. We don't need to worry about buildtime or runtime sort
>   	 * modifying the upper bits because the table is already sorted, and
>   	 * isn't part of the main exception table.
> +	 *
> +	 * The fixup_offset is set to the next instruction from the instruction
> +	 * that may fault. The execution will jump to this after handling the
> +	 * fault.
>   	 */
> -	offset = (long)&ex->fixup - (pc + insn_len * sizeof(u16));
> -	if (!FIELD_FIT(BPF_FIXUP_OFFSET_MASK, offset))
> +	fixup_offset = (long)&ex->fixup - (pc + insn_len * sizeof(u16));
> +	if (!FIELD_FIT(BPF_FIXUP_OFFSET_MASK, fixup_offset))
>   		return -ERANGE;
>   
> -	ex->fixup = FIELD_PREP(BPF_FIXUP_OFFSET_MASK, offset) |
> +	/*
> +	 * The offsets above have been calculated using the RO buffer, but the
> +	 * writes must go through the R/W buffer. Switch ex to the matching
> +	 * entry in the R/W buffer before writing.
> +	 */
> +	ex = (void *)ctx->insns + ((void *)ex - (void *)ctx->ro_insns);
> +
> +	ex->insn = ins_offset;
> +
> +	ex->fixup = FIELD_PREP(BPF_FIXUP_OFFSET_MASK, fixup_offset) |
>   		FIELD_PREP(BPF_FIXUP_REG_MASK, dst_reg);
>   	ex->type = EX_TYPE_BPF;
>   
> @@ -1007,6 +1034,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image,
>   
>   	ctx.ninsns = 0;
>   	ctx.insns = NULL;
> +	ctx.ro_insns = NULL;
>   	ret = __arch_prepare_bpf_trampoline(im, m, tlinks, func_addr, flags, &ctx);
>   	if (ret < 0)
>   		return ret;
> @@ -1015,7 +1043,15 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image,
>   		return -EFBIG;
>   
>   	ctx.ninsns = 0;
> +	/*
> +	 * The bpf_int_jit_compile() uses a RW buffer (ctx.insns) to write the
> +	 * JITed instructions and later copies it to a RX region (ctx.ro_insns).
> +	 * It also uses ctx.ro_insns to calculate offsets for jumps etc. As the
> +	 * trampoline image uses the same memory area for writing and execution,
> +	 * both ctx.insns and ctx.ro_insns can be set to image.
> +	 */
>   	ctx.insns = image;
> +	ctx.ro_insns = image;
>   	ret = __arch_prepare_bpf_trampoline(im, m, tlinks, func_addr, flags, &ctx);
>   	if (ret < 0)
>   		return ret;
> diff --git a/arch/riscv/net/bpf_jit_core.c b/arch/riscv/net/bpf_jit_core.c
> index 7a26a3e1c73c..7b70ccb7fec3 100644
> --- a/arch/riscv/net/bpf_jit_core.c
> +++ b/arch/riscv/net/bpf_jit_core.c
> @@ -8,6 +8,8 @@
>   
>   #include <linux/bpf.h>
>   #include <linux/filter.h>
> +#include <linux/memory.h>
> +#include <asm/patch.h>
>   #include "bpf_jit.h"
>   
>   /* Number of iterations to try until offsets converge. */
> @@ -117,16 +119,24 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
>   				sizeof(struct exception_table_entry);
>   			prog_size = sizeof(*ctx->insns) * ctx->ninsns;
>   
> -			jit_data->header =
> -				bpf_jit_binary_alloc(prog_size + extable_size,
> -						     &jit_data->image,
> -						     sizeof(u32),
> -						     bpf_fill_ill_insns);
> -			if (!jit_data->header) {
> +			jit_data->ro_header =
> +				bpf_jit_binary_pack_alloc(prog_size + extable_size,
> +							  &jit_data->ro_image, sizeof(u32),
> +							  &jit_data->header, &jit_data->image,
> +							  bpf_fill_ill_insns);
> +			if (!jit_data->ro_header) {
>   				prog = orig_prog;
>   				goto out_offset;
>   			}
>   
> +			/*
> +			 * Use the image(RW) for writing the JITed instructions. But also save
> +			 * the ro_image(RX) for calculating the offsets in the image. The RW
> +			 * image will be later copied to the RX image from where the program
> +			 * will run. The bpf_jit_binary_pack_finalize() will do this copy in the
> +			 * final step.
> +			 */
> +			ctx->ro_insns = (u16 *)jit_data->ro_image;
>   			ctx->insns = (u16 *)jit_data->image;
>   			/*
>   			 * Now, when the image is allocated, the image can
> @@ -138,14 +148,12 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
>   
>   	if (i == NR_JIT_ITERATIONS) {
>   		pr_err("bpf-jit: image did not converge in <%d passes!\n", i);
> -		if (jit_data->header)
> -			bpf_jit_binary_free(jit_data->header);
>   		prog = orig_prog;
> -		goto out_offset;
> +		goto out_free_hdr;
>   	}
>   
>   	if (extable_size)
> -		prog->aux->extable = (void *)ctx->insns + prog_size;
> +		prog->aux->extable = (void *)ctx->ro_insns + prog_size;
>   
>   skip_init_ctx:
>   	pass++;
> @@ -154,23 +162,33 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
>   
>   	bpf_jit_build_prologue(ctx);
>   	if (build_body(ctx, extra_pass, NULL)) {
> -		bpf_jit_binary_free(jit_data->header);
>   		prog = orig_prog;
> -		goto out_offset;
> +		goto out_free_hdr;
>   	}
>   	bpf_jit_build_epilogue(ctx);
>   
>   	if (bpf_jit_enable > 1)
>   		bpf_jit_dump(prog->len, prog_size, pass, ctx->insns);
>   
> -	prog->bpf_func = (void *)ctx->insns;
> +	prog->bpf_func = (void *)ctx->ro_insns;
>   	prog->jited = 1;
>   	prog->jited_len = prog_size;
>   
> -	bpf_flush_icache(jit_data->header, ctx->insns + ctx->ninsns);
> -
>   	if (!prog->is_func || extra_pass) {
> -		bpf_jit_binary_lock_ro(jit_data->header);
> +		if (WARN_ON(bpf_jit_binary_pack_finalize(prog, jit_data->ro_header,
> +							 jit_data->header))) {
> +			/* ro_header has been freed */
> +			jit_data->ro_header = NULL;
> +			prog = orig_prog;
> +			goto out_offset;
> +		}
> +		/*
> +		 * The instructions have now been copied to the ROX region from
> +		 * where they will execute.
> +		 * Write any modified data cache blocks out to memory and
> +		 * invalidate the corresponding blocks in the instruction cache.
> +		 */
> +		bpf_flush_icache(jit_data->ro_header, ctx->ro_insns + ctx->ninsns);
>   		for (i = 0; i < prog->len; i++)
>   			ctx->offset[i] = ninsns_rvoff(ctx->offset[i]);
>   		bpf_prog_fill_jited_linfo(prog, ctx->offset);
> @@ -185,6 +203,14 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
>   		bpf_jit_prog_release_other(prog, prog == orig_prog ?
>   					   tmp : orig_prog);
>   	return prog;
> +
> +out_free_hdr:
> +	if (jit_data->header) {
> +		bpf_arch_text_copy(&jit_data->ro_header->size, &jit_data->header->size,
> +				   sizeof(jit_data->header->size));
> +		bpf_jit_binary_pack_free(jit_data->ro_header, jit_data->header);
> +	}
> +	goto out_offset;
>   }
>   
>   u64 bpf_jit_alloc_exec_limit(void)
> @@ -204,3 +230,51 @@ void bpf_jit_free_exec(void *addr)
>   {
>   	return vfree(addr);
>   }
> +
> +void *bpf_arch_text_copy(void *dst, void *src, size_t len)
> +{
> +	int ret;
> +
> +	mutex_lock(&text_mutex);
> +	ret = patch_text_nosync(dst, src, len);
> +	mutex_unlock(&text_mutex);
> +
> +	if (ret)
> +		return ERR_PTR(-EINVAL);
> +
> +	return dst;
> +}
> +
> +int bpf_arch_text_invalidate(void *dst, size_t len)
> +{
> +	int ret;
> +
> +	mutex_lock(&text_mutex);
> +	ret = patch_text_set_nosync(dst, 0, len);
> +	mutex_unlock(&text_mutex);
> +
> +	return ret;
> +}
> +
> +void bpf_jit_free(struct bpf_prog *prog)
> +{
> +	if (prog->jited) {
> +		struct rv_jit_data *jit_data = prog->aux->jit_data;
> +		struct bpf_binary_header *hdr;
> +
> +		/*
> +		 * If we fail the final pass of JIT (from jit_subprogs),
> +		 * the program may not be finalized yet. Call finalize here
> +		 * before freeing it.
> +		 */
> +		if (jit_data) {
> +			bpf_jit_binary_pack_finalize(prog, jit_data->ro_header, jit_data->header);
> +			kfree(jit_data);
> +		}
> +		hdr = bpf_jit_binary_pack_hdr(prog);
> +		bpf_jit_binary_pack_free(hdr, NULL);
> +		WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(prog));
> +	}
> +
> +	bpf_prog_unlock_free(prog);
> +}

Reviewed-by: Pu Lehui <pulehui at huawei.com>



More information about the linux-riscv mailing list