[PATCH 2/2] x86/cfi,bpf: Fix BPF JIT call
Alexei Starovoitov
alexei.starovoitov at gmail.com
Tue Nov 21 18:18:17 PST 2023
On Mon, Nov 20, 2023 at 03:46:44PM +0100, Peter Zijlstra wrote:
> +
> +#ifdef CONFIG_CFI_CLANG
> +struct bpf_insn;
> +
> +extern unsigned int bpf_func_proto(const void *ctx,
> + const struct bpf_insn *insn);
To make it more obvious what is going on could you rename it to
__bpf_prog_runX()
and add a comment that its prototype should match exactly
bpf interpreters created by DEFINE_BPF_PROG_RUN() macro,
otherwise cfi will explode.
> +
> +__ADDRESSABLE(bpf_func_proto);
> +
> +asm (
> +" .pushsection .data..ro_after_init,\"aw\", at progbits \n"
> +" .type cfi_bpf_hash, at object \n"
> +" .globl cfi_bpf_hash \n"
> +" .p2align 2, 0x0 \n"
> +"cfi_bpf_hash: \n"
> +" .long __kcfi_typeid_bpf_func_proto \n"
Took me some time to grok this.
Cannot you use __CFI_TYPE() macro here ?
> +" .size cfi_bpf_hash, 4 \n"
> +" .popsection \n"
> +);
> +#endif
...
> +static int emit_fineibt(u8 **pprog)
> +{
> + u8 *prog = *pprog;
> +
> + EMIT_ENDBR();
> + EMIT3_off32(0x41, 0x81, 0xea, cfi_bpf_hash);
> + EMIT2(0x74, 0x07);
> + EMIT2(0x0f, 0x0b);
> + EMIT1(0x90);
> + EMIT_ENDBR_POISON();
Please add comments what this asm does. No one can read hex.
> +
> + *pprog = prog;
> + return 16;
16 means "the caller of this code will jump to endbr_poison", right?
> +}
> +
> +static int emit_kcfi(u8 **pprog)
> +{
> + u8 *prog = *pprog;
> + int offset = 5;
> +
> + EMIT1_off32(0xb8, cfi_bpf_hash);
and here too.
> +#ifdef CONFIG_CALL_PADDING
> + EMIT1(0x90);
> + EMIT1(0x90);
> + EMIT1(0x90);
> + EMIT1(0x90);
> + EMIT1(0x90);
> + EMIT1(0x90);
> + EMIT1(0x90);
> + EMIT1(0x90);
> + EMIT1(0x90);
> + EMIT1(0x90);
> + EMIT1(0x90);
> + offset += 11;
> +#endif
> + EMIT_ENDBR();
> +
> + *pprog = prog;
> + return offset;
5 or 16 would mean "jump to endbr" ?
> +}
> +
> +static int emit_cfi(u8 **pprog)
> +{
> + u8 *prog = *pprog;
> + int offset = 0;
> +
> + switch (cfi_mode) {
> + case CFI_FINEIBT:
> + offset = emit_fineibt(&prog);
> + break;
> +
> + case CFI_KCFI:
> + offset = emit_kcfi(&prog);
> + break;
> +
> + default:
> + EMIT_ENDBR();
> + break;
> + }
> +
> + *pprog = prog;
> + return offset;
> +}
> +
> /*
> * Emit x86-64 prologue code for BPF program.
> * bpf_tail_call helper will skip the first X86_TAIL_CALL_OFFSET bytes
> * while jumping to another program
> */
> -static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf,
> - bool tail_call_reachable, bool is_subprog,
> - bool is_exception_cb)
> +static int emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf,
> + bool tail_call_reachable, bool is_subprog,
> + bool is_exception_cb)
> {
> u8 *prog = *pprog;
> + int offset;
>
> + offset = emit_cfi(&prog);
I'm not sure doing cfi_bpf_hash check in JITed code is completely solving the problem.
>From bpf_dispatcher_*_func() calling into JITed will work,
but this emit_prologue() is doing the same job for all bpf progs.
Some bpf progs call each other directly and indirectly.
bpf_dispatcher_*_func() -> JITed_BPF_A -> JITed_BPF_B.
A into B can be a direct call (which cfi doesn't care about) and
indirect via emit_bpf_tail_call_indirect()->emit_indirect_jump().
Should we care about fineibt/kcfi there too?
> /* BPF trampoline can be made to work without these nops,
> * but let's waste 5 bytes for now and optimize later
> */
> - EMIT_ENDBR();
> memcpy(prog, x86_nops[5], X86_PATCH_SIZE);
> prog += X86_PATCH_SIZE;
> if (!ebpf_from_cbpf) {
> @@ -357,6 +426,8 @@ static void emit_prologue(u8 **pprog, u3
> if (tail_call_reachable)
> EMIT1(0x50); /* push rax */
> *pprog = prog;
> +
> + return offset;
> }
>
> static int emit_patch(u8 **pprog, void *func, void *ip, u8 opcode)
> @@ -1083,8 +1154,8 @@ static int do_jit(struct bpf_prog *bpf_p
> bool tail_call_seen = false;
> bool seen_exit = false;
> u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY];
> - int i, excnt = 0;
> int ilen, proglen = 0;
> + int i, excnt = 0;
> u8 *prog = temp;
> int err;
>
> @@ -1094,9 +1165,12 @@ static int do_jit(struct bpf_prog *bpf_p
> /* tail call's presence in current prog implies it is reachable */
> tail_call_reachable |= tail_call_seen;
>
> - emit_prologue(&prog, bpf_prog->aux->stack_depth,
> - bpf_prog_was_classic(bpf_prog), tail_call_reachable,
> - bpf_is_subprog(bpf_prog), bpf_prog->aux->exception_cb);
> + ctx->prog_offset = emit_prologue(&prog, bpf_prog->aux->stack_depth,
> + bpf_prog_was_classic(bpf_prog),
> + tail_call_reachable,
> + bpf_is_subprog(bpf_prog),
> + bpf_prog->aux->exception_cb);
> +
> /* Exception callback will clobber callee regs for its own use, and
> * restore the original callee regs from main prog's stack frame.
> */
> @@ -2935,9 +3009,9 @@ struct bpf_prog *bpf_int_jit_compile(str
> jit_data->header = header;
> jit_data->rw_header = rw_header;
> }
> - prog->bpf_func = (void *)image;
> + prog->bpf_func = (void *)image + ctx.prog_offset;
I don't understand this.
prog->bpf_func is the main entry point. Everything jumps there.
Are you trying to skip all of cfi code in the prologue and let
xdp_dispatcher jump to endbr or endbr_poison (depending on fineibt vs kcfi) ?
Then what is the point of earlier asm bits?
Is it a some clang thing that knows to offset indirect jump by
exactly that many hard coded bytes ?
Something in the clang does ptr -= 16 in case of fineibt and just
jumps there ? and ptr -= 5 for kcfi ?
If so, please add a giant comment explaining that.
No one should be reverse engineering such intricate details.
> prog->jited = 1;
> - prog->jited_len = proglen;
> + prog->jited_len = proglen - ctx.prog_offset; // XXX?
jited_len is used later to cover the whole generated code.
See bpf_prog_ksym_set_addr():
prog->aux->ksym.start = (unsigned long) prog->bpf_func;
prog->aux->ksym.end = prog->aux->ksym.start + prog->jited_len;
we definitely want ksym [start, end] to cover every useful byte
of JITed code in case IRQ happens on that byte.
Without covering cfi prologue the stack dump will be wrong for that frame.
I guess if xdp_dispatcher with fineibt=on jumps into prog->bpf_func - 16
and IRQ fires we don't care that much about accurate stack of last frame ?
I guess it's acceptable, but a comment is necessary.
More information about the linux-riscv
mailing list