[PATCH bpf-next v2 3/3] bpf, riscv: use prog pack allocator in the BPF JIT

Pu Lehui pulehui at huawei.com
Fri Aug 25 00:34:45 PDT 2023



On 2023/8/25 15:09, Pu Lehui wrote:
> Hi Puranjay,
> 
> Happy to see the RV64 pack allocator implementation.

RV32 also (the bpf_jit_core.c changes are shared between RV32 and RV64).

> 
> On 2023/8/24 21:31, Puranjay Mohan wrote:
>> Use bpf_jit_binary_pack_alloc() for memory management of JIT binaries in
>> RISCV BPF JIT. The bpf_jit_binary_pack_alloc creates a pair of RW and RX
>> buffers. The JIT writes the program into the RW buffer. When the JIT is
>> done, the program is copied to the final RX buffer with
>> bpf_jit_binary_pack_finalize.
>>
>> Implement bpf_arch_text_copy() and bpf_arch_text_invalidate() for RISCV
>> JIT as these functions are required by bpf_jit_binary_pack allocator.
>>
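For readers new to the pack allocator, the flow described above is roughly 
the following. This is a condensed sketch based on the calls in this patch; 
error handling is omitted and `size` stands in for prog_size + extable_size:

	struct bpf_binary_header *ro_header, *header;
	u8 *ro_image, *image;

	/* Allocate a pair: an RX region inside the shared program pack,
	 * plus a normal RW scratch buffer with the same layout.
	 */
	ro_header = bpf_jit_binary_pack_alloc(size, &ro_image, sizeof(u32),
					      &header, &image,
					      bpf_fill_ill_insns);

	/* The JIT writes into the RW buffer but computes all pc-relative
	 * offsets against the RX addresses the program will run from.
	 */
	ctx->insns = (u16 *)image;
	ctx->ro_insns = (u16 *)ro_image;

	/* Copy RW -> RX (using bpf_arch_text_copy() under the hood) and
	 * free the RW scratch buffer.
	 */
	bpf_jit_binary_pack_finalize(prog, ro_header, header);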
>> Signed-off-by: Puranjay Mohan <puranjay12 at gmail.com>
>> ---
>>   arch/riscv/net/bpf_jit.h        |   3 +
>>   arch/riscv/net/bpf_jit_comp64.c |  56 +++++++++++++---
>>   arch/riscv/net/bpf_jit_core.c   | 113 +++++++++++++++++++++++++++-----
>>   3 files changed, 146 insertions(+), 26 deletions(-)
>>
>> diff --git a/arch/riscv/net/bpf_jit.h b/arch/riscv/net/bpf_jit.h
>> index 2717f5490428..ad69319c8ea7 100644
>> --- a/arch/riscv/net/bpf_jit.h
>> +++ b/arch/riscv/net/bpf_jit.h
>> @@ -68,6 +68,7 @@ static inline bool is_creg(u8 reg)
>>   struct rv_jit_context {
>>       struct bpf_prog *prog;
>>       u16 *insns;        /* RV insns */
>> +    u16 *ro_insns;

In fact, the split between the ro_-prefixed and unprefixed fields still 
looks a bit confusing. Maybe it is better not to change the current 
convention: today `image` is the final executable RX image, and the 
trampoline code treats `image` the same way. It might be cleaner to add a 
new RW field instead, such as `rw_image`, so that we keep the existing 
framework and do not have to add so many comments. A rough sketch of what 
I mean follows.
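Purely illustrative (`rw_insns` is just a suggested name):

	struct rv_jit_context {
		struct bpf_prog *prog;
		u16 *insns;	/* final RX image, meaning unchanged */
		u16 *rw_insns;	/* new: RW buffer the JIT writes into */
		...
	};

That way `insns`/`image` keep their current meaning everywhere, and only 
the write paths need to know about the RW copy.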

All the other parts look great to me. 😄

>>       int ninsns;
>>       int prologue_len;
>>       int epilogue_offset;
>> @@ -85,7 +86,9 @@ static inline int ninsns_rvoff(int ninsns)
>>   struct rv_jit_data {
>>       struct bpf_binary_header *header;
>> +    struct bpf_binary_header *ro_header;
>>       u8 *image;
>> +    u8 *ro_image;
>>       struct rv_jit_context ctx;
>>   };
>> diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c
>> index 0ca4f5c0097c..d77b16338ba2 100644
>> --- a/arch/riscv/net/bpf_jit_comp64.c
>> +++ b/arch/riscv/net/bpf_jit_comp64.c
>> @@ -144,7 +144,11 @@ static bool in_auipc_jalr_range(s64 val)
>>   /* Emit fixed-length instructions for address */
>>   static int emit_addr(u8 rd, u64 addr, bool extra_pass, struct rv_jit_context *ctx)
>>   {
>> -    u64 ip = (u64)(ctx->insns + ctx->ninsns);
>> +    /*
>> +     * Use the ro_insns(RX) to calculate the offset as the BPF program will
>> +     * finally run from this memory region.
>> +     */
>> +    u64 ip = (u64)(ctx->ro_insns + ctx->ninsns);
>>       s64 off = addr - ip;
>>       s64 upper = (off + (1 << 11)) >> 12;
>>       s64 lower = off & 0xfff;
>> @@ -465,7 +469,11 @@ static int emit_call(u64 addr, bool fixed_addr, struct rv_jit_context *ctx)
>>       u64 ip;
>>       if (addr && ctx->insns) {
> 
> The ctx->insns check here needs to stay in sync with ctx->ro_insns, 
> since ip is now derived from ctx->ro_insns.
> 
>> -        ip = (u64)(long)(ctx->insns + ctx->ninsns);
>> +        /*
>> +         * Use the ro_insns(RX) to calculate the offset as the BPF
>> +         * program will finally run from this memory region.
>> +         */
>> +        ip = (u64)(long)(ctx->ro_insns + ctx->ninsns);
>>           off = addr - ip;
>>       }
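This hunk (and the emit_addr() one above) is the crux of the series: 
auipc+jalr encode a pc-relative displacement, so ip must be the address 
the instruction will eventually execute from, not the address it is 
written at. Annotated:

	u64 ip  = (u64)(long)(ctx->ro_insns + ctx->ninsns); /* RX address */
	s64 off = addr - ip;	/* displacement that auipc+jalr encode */

	/* Taking ip from ctx->insns (the RW buffer) instead would make the
	 * encoded displacement wrong by exactly (rw_base - rx_base) once
	 * the program runs from the RX region.
	 */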
>> @@ -578,7 +586,8 @@ static int add_exception_handler(const struct bpf_insn *insn,
>>   {
>>       struct exception_table_entry *ex;
>>       unsigned long pc;
>> -    off_t offset;
>> +    off_t ins_offset;
>> +    off_t fixup_offset;
>>       if (!ctx->insns || !ctx->prog->aux->extable || BPF_MODE(insn->code) != BPF_PROBE_MEM)
> 
> ctx->ro_insns needs to be checked here as well.
> 
>>           return 0;
>> @@ -593,12 +602,17 @@ static int add_exception_handler(const struct bpf_insn *insn,
>>           return -EINVAL;
>>       ex = &ctx->prog->aux->extable[ctx->nexentries];
>> -    pc = (unsigned long)&ctx->insns[ctx->ninsns - insn_len];
>> +    pc = (unsigned long)&ctx->ro_insns[ctx->ninsns - insn_len];
>> -    offset = pc - (long)&ex->insn;
>> -    if (WARN_ON_ONCE(offset >= 0 || offset < INT_MIN))
>> +    /*
>> +     * This is the relative offset of the instruction that may fault from
>> +     * the exception table itself. This will be written to the exception
>> +     * table and if this instruction faults, the destination register will
>> +     * be set to '0' and the execution will jump to the next instruction.
>> +     */
>> +    ins_offset = pc - (long)&ex->insn;
>> +    if (WARN_ON_ONCE(ins_offset >= 0 || ins_offset < INT_MIN))
>>           return -ERANGE;
>> -    ex->insn = offset;
>>       /*
>>        * Since the extable follows the program, the fixup offset is always
>> @@ -607,12 +621,25 @@ static int add_exception_handler(const struct bpf_insn *insn,
>>        * bits. We don't need to worry about buildtime or runtime sort
>>        * modifying the upper bits because the table is already sorted, and
>>        * isn't part of the main exception table.
>> +     *
>> +     * The fixup_offset is set to the next instruction from the instruction
>> +     * that may fault. The execution will jump to this after handling the
>> +     * fault.
>>        */
>> -    offset = (long)&ex->fixup - (pc + insn_len * sizeof(u16));
>> -    if (!FIELD_FIT(BPF_FIXUP_OFFSET_MASK, offset))
>> +    fixup_offset = (long)&ex->fixup - (pc + insn_len * sizeof(u16));
>> +    if (!FIELD_FIT(BPF_FIXUP_OFFSET_MASK, fixup_offset))
>>           return -ERANGE;
>> -    ex->fixup = FIELD_PREP(BPF_FIXUP_OFFSET_MASK, offset) |
>> +    /*
>> +     * The offsets above have been calculated using the RO buffer but we
>> +     * need to use the R/W buffer for writes.
>> +     * switch ex to rw buffer for writing.
>> +     */
>> +    ex = (void *)ctx->insns + ((void *)ex - (void *)ctx->ro_insns);
>> +
>> +    ex->insn = ins_offset;
>> +
>> +    ex->fixup = FIELD_PREP(BPF_FIXUP_OFFSET_MASK, fixup_offset) |
>>           FIELD_PREP(BPF_FIXUP_REG_MASK, dst_reg);
>>       ex->type = EX_TYPE_BPF;
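The ex rebasing above is plain offset arithmetic between the two 
identically laid out buffers; spelled out:

	/* ex points into the RO image (the extable sits right after the
	 * program there); find its offset from the RO base ...
	 */
	ptrdiff_t off_in_image = (void *)ex - (void *)ctx->ro_insns;
	/* ... and pick the same slot in the RW buffer for the stores. */
	ex = (void *)ctx->insns + off_in_image;

So the offsets stored in the entry stay RX-relative while the writes 
themselves land in writable memory.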
>> @@ -1006,6 +1033,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image,
>>       ctx.ninsns = 0;
>>       ctx.insns = NULL;
>> +    ctx.ro_insns = NULL;
>>       ret = __arch_prepare_bpf_trampoline(im, m, tlinks, func_addr, flags, &ctx);
>>       if (ret < 0)
>>           return ret;
>> @@ -1014,7 +1042,15 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image,
>>           return -EFBIG;
>>       ctx.ninsns = 0;
>> +    /*
>> +     * The bpf_int_jit_compile() uses a RW buffer (ctx.insns) to write the
>> +     * JITed instructions and later copies it to a RX region (ctx.ro_insns).
>> +     * It also uses ctx.ro_insns to calculate offsets for jumps etc. As the
>> +     * trampoline image uses the same memory area for writing and execution,
>> +     * both ctx.insns and ctx.ro_insns can be set to image.
>> +     */
>>       ctx.insns = image;
>> +    ctx.ro_insns = image;
>>       ret = __arch_prepare_bpf_trampoline(im, m, tlinks, func_addr, flags, &ctx);
>>       if (ret < 0)
>>           return ret;
>> diff --git a/arch/riscv/net/bpf_jit_core.c b/arch/riscv/net/bpf_jit_core.c
>> index 7a26a3e1c73c..4c8dffc09368 100644
>> --- a/arch/riscv/net/bpf_jit_core.c
>> +++ b/arch/riscv/net/bpf_jit_core.c
>> @@ -8,6 +8,8 @@
>>   #include <linux/bpf.h>
>>   #include <linux/filter.h>
>> +#include <linux/memory.h>
>> +#include <asm/patch.h>
>>   #include "bpf_jit.h"
>>   /* Number of iterations to try until offsets converge. */
>> @@ -117,16 +119,27 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
>>                   sizeof(struct exception_table_entry);
>>               prog_size = sizeof(*ctx->insns) * ctx->ninsns;
>> -            jit_data->header =
>> -                bpf_jit_binary_alloc(prog_size + extable_size,
>> -                             &jit_data->image,
>> -                             sizeof(u32),
>> -                             bpf_fill_ill_insns);
>> -            if (!jit_data->header) {
>> +            jit_data->ro_header =
>> +                bpf_jit_binary_pack_alloc(prog_size +
>> +                              extable_size,
>> +                              &jit_data->ro_image,
>> +                              sizeof(u32),
>> +                              &jit_data->header,
>> +                              &jit_data->image,
>> +                              bpf_fill_ill_insns);
>> +            if (!jit_data->ro_header) {
>>                   prog = orig_prog;
>>                   goto out_offset;
>>               }
>> +            /*
>> +             * Use the image(RW) for writing the JITed instructions. But also save
>> +             * the ro_image(RX) for calculating the offsets in the image. The RW
>> +             * image will be later copied to the RX image from where the program
>> +             * will run. The bpf_jit_binary_pack_finalize() will do this copy in the
>> +             * final step.
>> +             */
>> +            ctx->ro_insns = (u16 *)jit_data->ro_image;
>>               ctx->insns = (u16 *)jit_data->image;
>>               /*
>>                * Now, when the image is allocated, the image can
>> @@ -138,14 +151,12 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
>>       if (i == NR_JIT_ITERATIONS) {
>>           pr_err("bpf-jit: image did not converge in <%d passes!\n", i);
>> -        if (jit_data->header)
>> -            bpf_jit_binary_free(jit_data->header);
>>           prog = orig_prog;
>> -        goto out_offset;
>> +        goto out_free_hdr;
>>       }
>>       if (extable_size)
>> -        prog->aux->extable = (void *)ctx->insns + prog_size;
>> +        prog->aux->extable = (void *)ctx->ro_insns + prog_size;
>>   skip_init_ctx:
>>       pass++;
>> @@ -154,23 +165,35 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
>>       bpf_jit_build_prologue(ctx);
>>       if (build_body(ctx, extra_pass, NULL)) {
>> -        bpf_jit_binary_free(jit_data->header);
>>           prog = orig_prog;
>> -        goto out_offset;
>> +        goto out_free_hdr;
>>       }
>>       bpf_jit_build_epilogue(ctx);
>>       if (bpf_jit_enable > 1)
>>           bpf_jit_dump(prog->len, prog_size, pass, ctx->insns);
>> -    prog->bpf_func = (void *)ctx->insns;
>> +    prog->bpf_func = (void *)ctx->ro_insns;
>>       prog->jited = 1;
>>       prog->jited_len = prog_size;
>> -    bpf_flush_icache(jit_data->header, ctx->insns + ctx->ninsns);
>> -
>>       if (!prog->is_func || extra_pass) {
>> -        bpf_jit_binary_lock_ro(jit_data->header);
>> +        if (WARN_ON(bpf_jit_binary_pack_finalize(prog,
>> +                             jit_data->ro_header,
>> +                             jit_data->header))) {
>> +            /* ro_header has been freed */
>> +            jit_data->ro_header = NULL;
>> +            prog = orig_prog;
>> +            goto out_offset;
>> +        }
>> +        /*
>> +         * The instructions have now been copied to the ROX region from
>> +         * where they will execute.
>> +         * Write any modified data cache blocks out to memory and
>> +         * invalidate the corresponding blocks in the instruction cache.
>> +         */
>> +        bpf_flush_icache(jit_data->ro_header,
>> +                 ctx->ro_insns + ctx->ninsns);
>>           for (i = 0; i < prog->len; i++)
>>               ctx->offset[i] = ninsns_rvoff(ctx->offset[i]);
>>           bpf_prog_fill_jited_linfo(prog, ctx->offset);
>> @@ -185,6 +208,15 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
>>           bpf_jit_prog_release_other(prog, prog == orig_prog ?
>>                          tmp : orig_prog);
>>       return prog;
>> +
>> +out_free_hdr:
>> +    if (jit_data->header) {
>> +        bpf_arch_text_copy(&jit_data->ro_header->size,
>> +                   &jit_data->header->size,
>> +                   sizeof(jit_data->header->size));
>> +        bpf_jit_binary_pack_free(jit_data->ro_header, jit_data->header);
>> +    }
>> +    goto out_offset;
>>   }
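One subtlety in the new out_free_hdr path that might deserve a comment in 
the code: if I read the core allocator right, bpf_jit_binary_pack_free() 
takes the region size from ro_header->size, but finalize never ran on this 
path, so nothing was ever copied into the RO region and that field still 
holds the illegal-instruction fill. Hence the copy of just the size field 
before freeing. A comment sketch, if wanted:

	/* Finalize never ran, so the RO header is still unpopulated;
	 * bpf_jit_binary_pack_free() reads ro_header->size, so copy
	 * that one field across before freeing.
	 */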
>>   u64 bpf_jit_alloc_exec_limit(void)
>> @@ -204,3 +236,52 @@ void bpf_jit_free_exec(void *addr)
>>   {
>>       return vfree(addr);
>>   }
>> +
>> +void *bpf_arch_text_copy(void *dst, void *src, size_t len)
>> +{
>> +    int ret;
>> +
>> +    mutex_lock(&text_mutex);
>> +    ret = patch_text_nosync(dst, src, len);
>> +    mutex_unlock(&text_mutex);
>> +
>> +    if (ret)
>> +        return ERR_PTR(-EINVAL);
>> +
>> +    return dst;
>> +}
>> +
>> +int bpf_arch_text_invalidate(void *dst, size_t len)
>> +{
>> +    int ret = 0;
> 
> No need to initialize it; it is assigned unconditionally below.
> 
>> +
>> +    mutex_lock(&text_mutex);
>> +    ret = patch_text_set_nosync(dst, 0, len);
>> +    mutex_unlock(&text_mutex);
>> +
>> +    return ret;
>> +}
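Also worth noting: zero-filling via patch_text_set_nosync() works out 
nicely here, because the all-zeros pattern is defined as an illegal 
instruction by the RISC-V ISA, so an invalidated region that is ever 
executed will trap cleanly.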
>> +
>> +void bpf_jit_free(struct bpf_prog *prog)
>> +{
>> +    if (prog->jited) {
>> +        struct rv_jit_data *jit_data = prog->aux->jit_data;
>> +        struct bpf_binary_header *hdr;
>> +
>> +        /*
>> +         * If we fail the final pass of JIT (from jit_subprogs),
>> +         * the program may not be finalized yet. Call finalize here
>> +         * before freeing it.
>> +         */
>> +        if (jit_data) {
>> +            bpf_jit_binary_pack_finalize(prog, jit_data->ro_header,
>> +                             jit_data->header);
>> +            kfree(jit_data);
>> +        }
>> +        hdr = bpf_jit_binary_pack_hdr(prog);
>> +        bpf_jit_binary_pack_free(hdr, NULL);
>> +        WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(prog));
>> +    }
>> +
>> +    bpf_prog_unlock_free(prog);
>> +}
> 
> 


