[PATCH bpf-next v1 3/3] bpf, riscv: Use compressed instructions in the rv64 JIT

Tue Jul 21 03:25:22 EDT 2020

On Tue, 21 Jul 2020 at 04:52, Luke Nelson <lukenels at cs.washington.edu> wrote:
>
> This patch uses the RVC support and encodings from bpf_jit.h to optimize
> the rv64 jit.
>
> The optimizations work by replacing emit(rv_X(...)) with a call to a
> helper function emit_X, which will emit a compressed version of the
> instruction when possible, and when RVC is enabled.
>
> The JIT continues to pass all tests in lib/test_bpf.c, and introduces
> no new failures to test_verifier; both with and without RVC being enabled.
>
> Most changes are straightforward replacements of emit(rv_X(...), ctx)
> with emit_X(..., ctx), with the following exceptions bearing mention;
>
> * Change emit_imm to sign-extend the value in "lower", since the
> checks for RVC (and the instructions themselves) treat the value as
> signed. Otherwise, small negative immediates will not be recognized as
> encodable using an RVC instruction. For example, without this change,
> emit_imm(rd, -1, ctx) would cause lower to become 4095, which is not a
> 6b int even though a "c.li rd, -1" instruction suffices.
>
> * For {BPF_MOV,BPF_ADD} BPF_X, drop using addiw,addw in the 32-bit
> cases since the values are zero-extended into the upper 32 bits in
> the following instructions anyways, and the addition commutes with
> zero-extension. (BPF_SUB BPF_X must still use subw since subtraction
> does not commute with zero-extension.)
>
> This patch avoids optimizing branches and jumps to use RVC instructions
> since surrounding code often makes assumptions about the sizes of
> emitted instructions. Optimizing these will require changing these
> functions (e.g., emit_branch) to dynamically compute jump offsets.
>
> The following are examples of the JITed code for the verifier selftest
> "direct packet read test#3 for CGROUP_SKB OK", without and with RVC
> enabled, respectively. The former uses 178 bytes, and the latter uses 112,
> for a ~37% reduction in code size for this example.
>
> Without RVC:
>
>    0: 02000813    addi  a6,zero,32
>    4: fd010113    addi  sp,sp,-48
>    8: 02813423    sd    s0,40(sp)
>    c: 02913023    sd    s1,32(sp)
>   10: 01213c23    sd    s2,24(sp)
>   14: 01313823    sd    s3,16(sp)
>   18: 01413423    sd    s4,8(sp)
>   1c: 03010413    addi  s0,sp,48
>   20: 03056683    lwu   a3,48(a0)
>   24: 02069693    slli  a3,a3,0x20
>   28: 0206d693    srli  a3,a3,0x20
>   2c: 03456703    lwu   a4,52(a0)
>   30: 02071713    slli  a4,a4,0x20
>   34: 02075713    srli  a4,a4,0x20
>   38: 03856483    lwu   s1,56(a0)
>   3c: 02049493    slli  s1,s1,0x20
>   40: 0204d493    srli  s1,s1,0x20
>   44: 03c56903    lwu   s2,60(a0)
>   48: 02091913    slli  s2,s2,0x20
>   4c: 02095913    srli  s2,s2,0x20
>   50: 04056983    lwu   s3,64(a0)
>   54: 02099993    slli  s3,s3,0x20
>   58: 0209d993    srli  s3,s3,0x20
>   5c: 09056a03    lwu   s4,144(a0)
>   60: 020a1a13    slli  s4,s4,0x20
>   64: 020a5a13    srli  s4,s4,0x20
>   68: 00900313    addi  t1,zero,9
>   6c: 006a7463    bgeu  s4,t1,0x74
>   70: 00000a13    addi  s4,zero,0
>   74: 02d52823    sw    a3,48(a0)
>   78: 02e52a23    sw    a4,52(a0)
>   7c: 02952c23    sw    s1,56(a0)
>   80: 03252e23    sw    s2,60(a0)
>   84: 05352023    sw    s3,64(a0)
>   88: 00000793    addi  a5,zero,0
>   8c: 02813403    ld    s0,40(sp)
>   90: 02013483    ld    s1,32(sp)
>   94: 01813903    ld    s2,24(sp)
>   98: 01013983    ld    s3,16(sp)
>   9c: 00813a03    ld    s4,8(sp)
>   a0: 03010113    addi  sp,sp,48
>   a4: 00078513    addi  a0,a5,0
>   a8: 00008067    jalr  zero,0(ra)
>
> With RVC:
>
>    0:   02000813    addi    a6,zero,32
>    4:   7179        c.addi16sp  sp,-48
>    6:   f422        c.sdsp  s0,40(sp)
>    8:   f026        c.sdsp  s1,32(sp)
>    a:   ec4a        c.sdsp  s2,24(sp)
>    c:   e84e        c.sdsp  s3,16(sp)
>    e:   e452        c.sdsp  s4,8(sp)
>   10:   1800        c.addi4spn  s0,sp,48
>   12:   03056683    lwu     a3,48(a0)
>   16:   1682        c.slli  a3,0x20
>   18:   9281        c.srli  a3,0x20
>   1a:   03456703    lwu     a4,52(a0)
>   1e:   1702        c.slli  a4,0x20
>   20:   9301        c.srli  a4,0x20
>   22:   03856483    lwu     s1,56(a0)
>   26:   1482        c.slli  s1,0x20
>   28:   9081        c.srli  s1,0x20
>   2a:   03c56903    lwu     s2,60(a0)
>   2e:   1902        c.slli  s2,0x20
>   30:   02095913    srli    s2,s2,0x20
>   34:   04056983    lwu     s3,64(a0)
>   38:   1982        c.slli  s3,0x20
>   3a:   0209d993    srli    s3,s3,0x20
>   3e:   09056a03    lwu     s4,144(a0)
>   42:   1a02        c.slli  s4,0x20
>   44:   020a5a13    srli    s4,s4,0x20
>   48:   4325        c.li    t1,9
>   4a:   006a7363    bgeu    s4,t1,0x50
>   4e:   4a01        c.li    s4,0
>   50:   d914        c.sw    a3,48(a0)
>   52:   d958        c.sw    a4,52(a0)
>   54:   dd04        c.sw    s1,56(a0)
>   56:   03252e23    sw      s2,60(a0)
>   5a:   05352023    sw      s3,64(a0)
>   5e:   4781        c.li    a5,0
>   60:   7422        c.ldsp  s0,40(sp)
>   62:   7482        c.ldsp  s1,32(sp)
>   64:   6962        c.ldsp  s2,24(sp)
>   66:   69c2        c.ldsp  s3,16(sp)
>   68:   6a22        c.ldsp  s4,8(sp)
>   6a:   6145        c.addi16sp  sp,48
>   6c:   853e        c.mv    a0,a5
>   6e:   8082        c.jr    ra
>
> Cc: Björn Töpel <bjorn.topel at gmail.com>
> Signed-off-by: Luke Nelson <luke.r.nels at gmail.com>
> ---
> Björn: I added you as Cc instead of Acked-by for this patch so you
> would have a chance to review the further changes I made in emit_imm
> (described above).

Thanks! LGTM! I'll add the Acked-by/Reviewed-by to the cover letter!

Really nice work!
Björn

> ---
>  arch/riscv/net/bpf_jit_comp64.c | 281 +++++++++++++++++---------------
>  1 file changed, 147 insertions(+), 134 deletions(-)
>
> diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c
> index 55861269da2a..8a56b5293117 100644
> --- a/arch/riscv/net/bpf_jit_comp64.c
> +++ b/arch/riscv/net/bpf_jit_comp64.c
> @@ -132,19 +132,23 @@ static void emit_imm(u8 rd, s64 val, struct rv_jit_context *ctx)
>          *
>          * This also means that we need to process LSB to MSB.
>          */
> -       s64 upper = (val + (1 << 11)) >> 12, lower = val & 0xfff;
> +       s64 upper = (val + (1 << 11)) >> 12;
> +       /* Sign-extend lower 12 bits to 64 bits since immediates for li, addiw,
> +        * and addi are signed and RVC checks will perform signed comparisons.
> +        */
> +       s64 lower = ((val & 0xfff) << 52) >> 52;
>         int shift;
>
>         if (is_32b_int(val)) {
>                 if (upper)
> -                       emit(rv_lui(rd, upper), ctx);
> +                       emit_lui(rd, upper, ctx);
>
>                 if (!upper) {
> -                       emit(rv_addi(rd, RV_REG_ZERO, lower), ctx);
> +                       emit_li(rd, lower, ctx);
>                         return;
>                 }
>
> -               emit(rv_addiw(rd, rd, lower), ctx);
> +               emit_addiw(rd, rd, lower, ctx);
>                 return;
>         }
>
> @@ -154,9 +158,9 @@ static void emit_imm(u8 rd, s64 val, struct rv_jit_context *ctx)
>
>         emit_imm(rd, upper, ctx);
>
> -       emit(rv_slli(rd, rd, shift), ctx);
> +       emit_slli(rd, rd, shift, ctx);
>         if (lower)
> -               emit(rv_addi(rd, rd, lower), ctx);
> +               emit_addi(rd, rd, lower, ctx);
>  }
>
>  static void __build_epilogue(bool is_tail_call, struct rv_jit_context *ctx)
> @@ -164,43 +168,43 @@ static void __build_epilogue(bool is_tail_call, struct rv_jit_context *ctx)
>         int stack_adjust = ctx->stack_size, store_offset = stack_adjust - 8;
>
>         if (seen_reg(RV_REG_RA, ctx)) {
> -               emit(rv_ld(RV_REG_RA, store_offset, RV_REG_SP), ctx);
> +               emit_ld(RV_REG_RA, store_offset, RV_REG_SP, ctx);
>                 store_offset -= 8;
>         }
> -       emit(rv_ld(RV_REG_FP, store_offset, RV_REG_SP), ctx);
> +       emit_ld(RV_REG_FP, store_offset, RV_REG_SP, ctx);
>         store_offset -= 8;
>         if (seen_reg(RV_REG_S1, ctx)) {
> -               emit(rv_ld(RV_REG_S1, store_offset, RV_REG_SP), ctx);
> +               emit_ld(RV_REG_S1, store_offset, RV_REG_SP, ctx);
>                 store_offset -= 8;
>         }
>         if (seen_reg(RV_REG_S2, ctx)) {
> -               emit(rv_ld(RV_REG_S2, store_offset, RV_REG_SP), ctx);
> +               emit_ld(RV_REG_S2, store_offset, RV_REG_SP, ctx);
>                 store_offset -= 8;
>         }
>         if (seen_reg(RV_REG_S3, ctx)) {
> -               emit(rv_ld(RV_REG_S3, store_offset, RV_REG_SP), ctx);
> +               emit_ld(RV_REG_S3, store_offset, RV_REG_SP, ctx);
>                 store_offset -= 8;
>         }
>         if (seen_reg(RV_REG_S4, ctx)) {
> -               emit(rv_ld(RV_REG_S4, store_offset, RV_REG_SP), ctx);
> +               emit_ld(RV_REG_S4, store_offset, RV_REG_SP, ctx);
>                 store_offset -= 8;
>         }
>         if (seen_reg(RV_REG_S5, ctx)) {
> -               emit(rv_ld(RV_REG_S5, store_offset, RV_REG_SP), ctx);
> +               emit_ld(RV_REG_S5, store_offset, RV_REG_SP, ctx);
>                 store_offset -= 8;
>         }
>         if (seen_reg(RV_REG_S6, ctx)) {
> -               emit(rv_ld(RV_REG_S6, store_offset, RV_REG_SP), ctx);
> +               emit_ld(RV_REG_S6, store_offset, RV_REG_SP, ctx);
>                 store_offset -= 8;
>         }
>
> -       emit(rv_addi(RV_REG_SP, RV_REG_SP, stack_adjust), ctx);
> +       emit_addi(RV_REG_SP, RV_REG_SP, stack_adjust, ctx);
>         /* Set return value. */
>         if (!is_tail_call)
> -               emit(rv_addi(RV_REG_A0, RV_REG_A5, 0), ctx);
> -       emit(rv_jalr(RV_REG_ZERO, is_tail_call ? RV_REG_T3 : RV_REG_RA,
> -                    is_tail_call ? 4 : 0), /* skip TCC init */
> -            ctx);
> +               emit_mv(RV_REG_A0, RV_REG_A5, ctx);
> +       emit_jalr(RV_REG_ZERO, is_tail_call ? RV_REG_T3 : RV_REG_RA,
> +                 is_tail_call ? 4 : 0, /* skip TCC init */
> +                 ctx);
>  }
>
>  static void emit_bcc(u8 cond, u8 rd, u8 rs, int rvoff,
> @@ -280,8 +284,8 @@ static void emit_branch(u8 cond, u8 rd, u8 rs, int rvoff,
>
>  static void emit_zext_32(u8 reg, struct rv_jit_context *ctx)
>  {
> -       emit(rv_slli(reg, reg, 32), ctx);
> -       emit(rv_srli(reg, reg, 32), ctx);
> +       emit_slli(reg, reg, 32, ctx);
> +       emit_srli(reg, reg, 32, ctx);
>  }
>
>  static int emit_bpf_tail_call(int insn, struct rv_jit_context *ctx)
> @@ -310,7 +314,7 @@ static int emit_bpf_tail_call(int insn, struct rv_jit_context *ctx)
>         /* if (TCC-- < 0)
>          *     goto out;
>          */
> -       emit(rv_addi(RV_REG_T1, tcc, -1), ctx);
> +       emit_addi(RV_REG_T1, tcc, -1, ctx);
>         off = ninsns_rvoff(tc_ninsn - (ctx->ninsns - start_insn));
>         emit_branch(BPF_JSLT, tcc, RV_REG_ZERO, off, ctx);
>
> @@ -318,12 +322,12 @@ static int emit_bpf_tail_call(int insn, struct rv_jit_context *ctx)
>          * if (!prog)
>          *     goto out;
>          */
> -       emit(rv_slli(RV_REG_T2, RV_REG_A2, 3), ctx);
> -       emit(rv_add(RV_REG_T2, RV_REG_T2, RV_REG_A1), ctx);
> +       emit_slli(RV_REG_T2, RV_REG_A2, 3, ctx);
> +       emit_add(RV_REG_T2, RV_REG_T2, RV_REG_A1, ctx);
>         off = offsetof(struct bpf_array, ptrs);
>         if (is_12b_check(off, insn))
>                 return -1;
> -       emit(rv_ld(RV_REG_T2, off, RV_REG_T2), ctx);
> +       emit_ld(RV_REG_T2, off, RV_REG_T2, ctx);
>         off = ninsns_rvoff(tc_ninsn - (ctx->ninsns - start_insn));
>         emit_branch(BPF_JEQ, RV_REG_T2, RV_REG_ZERO, off, ctx);
>
> @@ -331,8 +335,8 @@ static int emit_bpf_tail_call(int insn, struct rv_jit_context *ctx)
>         off = offsetof(struct bpf_prog, bpf_func);
>         if (is_12b_check(off, insn))
>                 return -1;
> -       emit(rv_ld(RV_REG_T3, off, RV_REG_T2), ctx);
> -       emit(rv_addi(RV_REG_TCC, RV_REG_T1, 0), ctx);
> +       emit_ld(RV_REG_T3, off, RV_REG_T2, ctx);
> +       emit_mv(RV_REG_TCC, RV_REG_T1, ctx);
>         __build_epilogue(true, ctx);
>         return 0;
>  }
> @@ -360,9 +364,9 @@ static void init_regs(u8 *rd, u8 *rs, const struct bpf_insn *insn,
>
>  static void emit_zext_32_rd_rs(u8 *rd, u8 *rs, struct rv_jit_context *ctx)
>  {
> -       emit(rv_addi(RV_REG_T2, *rd, 0), ctx);
> +       emit_mv(RV_REG_T2, *rd, ctx);
>         emit_zext_32(RV_REG_T2, ctx);
> -       emit(rv_addi(RV_REG_T1, *rs, 0), ctx);
> +       emit_mv(RV_REG_T1, *rs, ctx);
>         emit_zext_32(RV_REG_T1, ctx);
>         *rd = RV_REG_T2;
>         *rs = RV_REG_T1;
> @@ -370,15 +374,15 @@ static void emit_zext_32_rd_rs(u8 *rd, u8 *rs, struct rv_jit_context *ctx)
>
>  static void emit_sext_32_rd_rs(u8 *rd, u8 *rs, struct rv_jit_context *ctx)
>  {
> -       emit(rv_addiw(RV_REG_T2, *rd, 0), ctx);
> -       emit(rv_addiw(RV_REG_T1, *rs, 0), ctx);
> +       emit_addiw(RV_REG_T2, *rd, 0, ctx);
> +       emit_addiw(RV_REG_T1, *rs, 0, ctx);
>         *rd = RV_REG_T2;
>         *rs = RV_REG_T1;
>  }
>
>  static void emit_zext_32_rd_t1(u8 *rd, struct rv_jit_context *ctx)
>  {
> -       emit(rv_addi(RV_REG_T2, *rd, 0), ctx);
> +       emit_mv(RV_REG_T2, *rd, ctx);
>         emit_zext_32(RV_REG_T2, ctx);
>         emit_zext_32(RV_REG_T1, ctx);
>         *rd = RV_REG_T2;
> @@ -386,7 +390,7 @@ static void emit_zext_32_rd_t1(u8 *rd, struct rv_jit_context *ctx)
>
>  static void emit_sext_32_rd(u8 *rd, struct rv_jit_context *ctx)
>  {
> -       emit(rv_addiw(RV_REG_T2, *rd, 0), ctx);
> +       emit_addiw(RV_REG_T2, *rd, 0, ctx);
>         *rd = RV_REG_T2;
>  }
>
> @@ -432,7 +436,7 @@ static int emit_call(bool fixed, u64 addr, struct rv_jit_context *ctx)
>         if (ret)
>                 return ret;
>         rd = bpf_to_rv_reg(BPF_REG_0, ctx);
> -       emit(rv_addi(rd, RV_REG_A0, 0), ctx);
> +       emit_mv(rd, RV_REG_A0, ctx);
>         return 0;
>  }
>
> @@ -458,7 +462,7 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx,
>                         emit_zext_32(rd, ctx);
>                         break;
>                 }
> -               emit(is64 ? rv_addi(rd, rs, 0) : rv_addiw(rd, rs, 0), ctx);
> +               emit_mv(rd, rs, ctx);
>                 if (!is64 && !aux->verifier_zext)
>                         emit_zext_32(rd, ctx);
>                 break;
> @@ -466,31 +470,35 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx,
>         /* dst = dst OP src */
>         case BPF_ALU | BPF_ADD | BPF_X:
>         case BPF_ALU64 | BPF_ADD | BPF_X:
> -               emit(is64 ? rv_add(rd, rd, rs) : rv_addw(rd, rd, rs), ctx);
> +               emit_add(rd, rd, rs, ctx);
>                 if (!is64 && !aux->verifier_zext)
>                         emit_zext_32(rd, ctx);
>                 break;
>         case BPF_ALU | BPF_SUB | BPF_X:
>         case BPF_ALU64 | BPF_SUB | BPF_X:
> -               emit(is64 ? rv_sub(rd, rd, rs) : rv_subw(rd, rd, rs), ctx);
> +               if (is64)
> +                       emit_sub(rd, rd, rs, ctx);
> +               else
> +                       emit_subw(rd, rd, rs, ctx);
> +
>                 if (!is64 && !aux->verifier_zext)
>                         emit_zext_32(rd, ctx);
>                 break;
>         case BPF_ALU | BPF_AND | BPF_X:
>         case BPF_ALU64 | BPF_AND | BPF_X:
> -               emit(rv_and(rd, rd, rs), ctx);
> +               emit_and(rd, rd, rs, ctx);
>                 if (!is64 && !aux->verifier_zext)
>                         emit_zext_32(rd, ctx);
>                 break;
>         case BPF_ALU | BPF_OR | BPF_X:
>         case BPF_ALU64 | BPF_OR | BPF_X:
> -               emit(rv_or(rd, rd, rs), ctx);
> +               emit_or(rd, rd, rs, ctx);
>                 if (!is64 && !aux->verifier_zext)
>                         emit_zext_32(rd, ctx);
>                 break;
>         case BPF_ALU | BPF_XOR | BPF_X:
>         case BPF_ALU64 | BPF_XOR | BPF_X:
> -               emit(rv_xor(rd, rd, rs), ctx);
> +               emit_xor(rd, rd, rs, ctx);
>                 if (!is64 && !aux->verifier_zext)
>                         emit_zext_32(rd, ctx);
>                 break;
> @@ -534,8 +542,7 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx,
>         /* dst = -dst */
>         case BPF_ALU | BPF_NEG:
>         case BPF_ALU64 | BPF_NEG:
> -               emit(is64 ? rv_sub(rd, RV_REG_ZERO, rd) :
> -                    rv_subw(rd, RV_REG_ZERO, rd), ctx);
> +               emit_sub(rd, RV_REG_ZERO, rd, ctx);
>                 if (!is64 && !aux->verifier_zext)
>                         emit_zext_32(rd, ctx);
>                 break;
> @@ -544,8 +551,8 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx,
>         case BPF_ALU | BPF_END | BPF_FROM_LE:
>                 switch (imm) {
>                 case 16:
> -                       emit(rv_slli(rd, rd, 48), ctx);
> -                       emit(rv_srli(rd, rd, 48), ctx);
> +                       emit_slli(rd, rd, 48, ctx);
> +                       emit_srli(rd, rd, 48, ctx);
>                         break;
>                 case 32:
>                         if (!aux->verifier_zext)
> @@ -558,51 +565,51 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx,
>                 break;
>
>         case BPF_ALU | BPF_END | BPF_FROM_BE:
> -               emit(rv_addi(RV_REG_T2, RV_REG_ZERO, 0), ctx);
> +               emit_li(RV_REG_T2, 0, ctx);
>
> -               emit(rv_andi(RV_REG_T1, rd, 0xff), ctx);
> -               emit(rv_add(RV_REG_T2, RV_REG_T2, RV_REG_T1), ctx);
> -               emit(rv_slli(RV_REG_T2, RV_REG_T2, 8), ctx);
> -               emit(rv_srli(rd, rd, 8), ctx);
> +               emit_andi(RV_REG_T1, rd, 0xff, ctx);
> +               emit_add(RV_REG_T2, RV_REG_T2, RV_REG_T1, ctx);
> +               emit_slli(RV_REG_T2, RV_REG_T2, 8, ctx);
> +               emit_srli(rd, rd, 8, ctx);
>                 if (imm == 16)
>                         goto out_be;
>
> -               emit(rv_andi(RV_REG_T1, rd, 0xff), ctx);
> -               emit(rv_add(RV_REG_T2, RV_REG_T2, RV_REG_T1), ctx);
> -               emit(rv_slli(RV_REG_T2, RV_REG_T2, 8), ctx);
> -               emit(rv_srli(rd, rd, 8), ctx);
> +               emit_andi(RV_REG_T1, rd, 0xff, ctx);
> +               emit_add(RV_REG_T2, RV_REG_T2, RV_REG_T1, ctx);
> +               emit_slli(RV_REG_T2, RV_REG_T2, 8, ctx);
> +               emit_srli(rd, rd, 8, ctx);
>
> -               emit(rv_andi(RV_REG_T1, rd, 0xff), ctx);
> -               emit(rv_add(RV_REG_T2, RV_REG_T2, RV_REG_T1), ctx);
> -               emit(rv_slli(RV_REG_T2, RV_REG_T2, 8), ctx);
> -               emit(rv_srli(rd, rd, 8), ctx);
> +               emit_andi(RV_REG_T1, rd, 0xff, ctx);
> +               emit_add(RV_REG_T2, RV_REG_T2, RV_REG_T1, ctx);
> +               emit_slli(RV_REG_T2, RV_REG_T2, 8, ctx);
> +               emit_srli(rd, rd, 8, ctx);
>                 if (imm == 32)
>                         goto out_be;
>
> -               emit(rv_andi(RV_REG_T1, rd, 0xff), ctx);
> -               emit(rv_add(RV_REG_T2, RV_REG_T2, RV_REG_T1), ctx);
> -               emit(rv_slli(RV_REG_T2, RV_REG_T2, 8), ctx);
> -               emit(rv_srli(rd, rd, 8), ctx);
> -
> -               emit(rv_andi(RV_REG_T1, rd, 0xff), ctx);
> -               emit(rv_add(RV_REG_T2, RV_REG_T2, RV_REG_T1), ctx);
> -               emit(rv_slli(RV_REG_T2, RV_REG_T2, 8), ctx);
> -               emit(rv_srli(rd, rd, 8), ctx);
> -
> -               emit(rv_andi(RV_REG_T1, rd, 0xff), ctx);
> -               emit(rv_add(RV_REG_T2, RV_REG_T2, RV_REG_T1), ctx);
> -               emit(rv_slli(RV_REG_T2, RV_REG_T2, 8), ctx);
> -               emit(rv_srli(rd, rd, 8), ctx);
> -
> -               emit(rv_andi(RV_REG_T1, rd, 0xff), ctx);
> -               emit(rv_add(RV_REG_T2, RV_REG_T2, RV_REG_T1), ctx);
> -               emit(rv_slli(RV_REG_T2, RV_REG_T2, 8), ctx);
> -               emit(rv_srli(rd, rd, 8), ctx);
> +               emit_andi(RV_REG_T1, rd, 0xff, ctx);
> +               emit_add(RV_REG_T2, RV_REG_T2, RV_REG_T1, ctx);
> +               emit_slli(RV_REG_T2, RV_REG_T2, 8, ctx);
> +               emit_srli(rd, rd, 8, ctx);
> +
> +               emit_andi(RV_REG_T1, rd, 0xff, ctx);
> +               emit_add(RV_REG_T2, RV_REG_T2, RV_REG_T1, ctx);
> +               emit_slli(RV_REG_T2, RV_REG_T2, 8, ctx);
> +               emit_srli(rd, rd, 8, ctx);
> +
> +               emit_andi(RV_REG_T1, rd, 0xff, ctx);
> +               emit_add(RV_REG_T2, RV_REG_T2, RV_REG_T1, ctx);
> +               emit_slli(RV_REG_T2, RV_REG_T2, 8, ctx);
> +               emit_srli(rd, rd, 8, ctx);
> +
> +               emit_andi(RV_REG_T1, rd, 0xff, ctx);
> +               emit_add(RV_REG_T2, RV_REG_T2, RV_REG_T1, ctx);
> +               emit_slli(RV_REG_T2, RV_REG_T2, 8, ctx);
> +               emit_srli(rd, rd, 8, ctx);
>  out_be:
> -               emit(rv_andi(RV_REG_T1, rd, 0xff), ctx);
> -               emit(rv_add(RV_REG_T2, RV_REG_T2, RV_REG_T1), ctx);
> +               emit_andi(RV_REG_T1, rd, 0xff, ctx);
> +               emit_add(RV_REG_T2, RV_REG_T2, RV_REG_T1, ctx);
>
> -               emit(rv_addi(rd, RV_REG_T2, 0), ctx);
> +               emit_mv(rd, RV_REG_T2, ctx);
>                 break;
>
>         /* dst = imm */
> @@ -617,12 +624,10 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx,
>         case BPF_ALU | BPF_ADD | BPF_K:
>         case BPF_ALU64 | BPF_ADD | BPF_K:
>                 if (is_12b_int(imm)) {
> -                       emit(is64 ? rv_addi(rd, rd, imm) :
> -                            rv_addiw(rd, rd, imm), ctx);
> +                       emit_addi(rd, rd, imm, ctx);
>                 } else {
>                         emit_imm(RV_REG_T1, imm, ctx);
> -                       emit(is64 ? rv_add(rd, rd, RV_REG_T1) :
> -                            rv_addw(rd, rd, RV_REG_T1), ctx);
> +                       emit_add(rd, rd, RV_REG_T1, ctx);
>                 }
>                 if (!is64 && !aux->verifier_zext)
>                         emit_zext_32(rd, ctx);
> @@ -630,12 +635,10 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx,
>         case BPF_ALU | BPF_SUB | BPF_K:
>         case BPF_ALU64 | BPF_SUB | BPF_K:
>                 if (is_12b_int(-imm)) {
> -                       emit(is64 ? rv_addi(rd, rd, -imm) :
> -                            rv_addiw(rd, rd, -imm), ctx);
> +                       emit_addi(rd, rd, -imm, ctx);
>                 } else {
>                         emit_imm(RV_REG_T1, imm, ctx);
> -                       emit(is64 ? rv_sub(rd, rd, RV_REG_T1) :
> -                            rv_subw(rd, rd, RV_REG_T1), ctx);
> +                       emit_sub(rd, rd, RV_REG_T1, ctx);
>                 }
>                 if (!is64 && !aux->verifier_zext)
>                         emit_zext_32(rd, ctx);
> @@ -643,10 +646,10 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx,
>         case BPF_ALU | BPF_AND | BPF_K:
>         case BPF_ALU64 | BPF_AND | BPF_K:
>                 if (is_12b_int(imm)) {
> -                       emit(rv_andi(rd, rd, imm), ctx);
> +                       emit_andi(rd, rd, imm, ctx);
>                 } else {
>                         emit_imm(RV_REG_T1, imm, ctx);
> -                       emit(rv_and(rd, rd, RV_REG_T1), ctx);
> +                       emit_and(rd, rd, RV_REG_T1, ctx);
>                 }
>                 if (!is64 && !aux->verifier_zext)
>                         emit_zext_32(rd, ctx);
> @@ -657,7 +660,7 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx,
>                         emit(rv_ori(rd, rd, imm), ctx);
>                 } else {
>                         emit_imm(RV_REG_T1, imm, ctx);
> -                       emit(rv_or(rd, rd, RV_REG_T1), ctx);
> +                       emit_or(rd, rd, RV_REG_T1, ctx);
>                 }
>                 if (!is64 && !aux->verifier_zext)
>                         emit_zext_32(rd, ctx);
> @@ -668,7 +671,7 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx,
>                         emit(rv_xori(rd, rd, imm), ctx);
>                 } else {
>                         emit_imm(RV_REG_T1, imm, ctx);
> -                       emit(rv_xor(rd, rd, RV_REG_T1), ctx);
> +                       emit_xor(rd, rd, RV_REG_T1, ctx);
>                 }
>                 if (!is64 && !aux->verifier_zext)
>                         emit_zext_32(rd, ctx);
> @@ -699,19 +702,28 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx,
>                 break;
>         case BPF_ALU | BPF_LSH | BPF_K:
>         case BPF_ALU64 | BPF_LSH | BPF_K:
> -               emit(is64 ? rv_slli(rd, rd, imm) : rv_slliw(rd, rd, imm), ctx);
> +               emit_slli(rd, rd, imm, ctx);
> +
>                 if (!is64 && !aux->verifier_zext)
>                         emit_zext_32(rd, ctx);
>                 break;
>         case BPF_ALU | BPF_RSH | BPF_K:
>         case BPF_ALU64 | BPF_RSH | BPF_K:
> -               emit(is64 ? rv_srli(rd, rd, imm) : rv_srliw(rd, rd, imm), ctx);
> +               if (is64)
> +                       emit_srli(rd, rd, imm, ctx);
> +               else
> +                       emit(rv_srliw(rd, rd, imm), ctx);
> +
>                 if (!is64 && !aux->verifier_zext)
>                         emit_zext_32(rd, ctx);
>                 break;
>         case BPF_ALU | BPF_ARSH | BPF_K:
>         case BPF_ALU64 | BPF_ARSH | BPF_K:
> -               emit(is64 ? rv_srai(rd, rd, imm) : rv_sraiw(rd, rd, imm), ctx);
> +               if (is64)
> +                       emit_srai(rd, rd, imm, ctx);
> +               else
> +                       emit(rv_sraiw(rd, rd, imm), ctx);
> +
>                 if (!is64 && !aux->verifier_zext)
>                         emit_zext_32(rd, ctx);
>                 break;
> @@ -763,7 +775,7 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx,
>                 if (BPF_OP(code) == BPF_JSET) {
>                         /* Adjust for and */
>                         rvoff -= 4;
> -                       emit(rv_and(RV_REG_T1, rd, rs), ctx);
> +                       emit_and(RV_REG_T1, rd, rs, ctx);
>                         emit_branch(BPF_JNE, RV_REG_T1, RV_REG_ZERO, rvoff,
>                                     ctx);
>                 } else {
> @@ -819,17 +831,17 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx,
>                 rvoff = rv_offset(i, off, ctx);
>                 s = ctx->ninsns;
>                 if (is_12b_int(imm)) {
> -                       emit(rv_andi(RV_REG_T1, rd, imm), ctx);
> +                       emit_andi(RV_REG_T1, rd, imm, ctx);
>                 } else {
>                         emit_imm(RV_REG_T1, imm, ctx);
> -                       emit(rv_and(RV_REG_T1, rd, RV_REG_T1), ctx);
> +                       emit_and(RV_REG_T1, rd, RV_REG_T1, ctx);
>                 }
>                 /* For jset32, we should clear the upper 32 bits of t1, but
>                  * sign-extension is sufficient here and saves one instruction,
>                  * as t1 is used only in comparison against zero.
>                  */
>                 if (!is64 && imm < 0)
> -                       emit(rv_addiw(RV_REG_T1, RV_REG_T1, 0), ctx);
> +                       emit_addiw(RV_REG_T1, RV_REG_T1, 0, ctx);
>                 e = ctx->ninsns;
>                 rvoff -= ninsns_rvoff(e - s);
>                 emit_branch(BPF_JNE, RV_REG_T1, RV_REG_ZERO, rvoff, ctx);
> @@ -887,7 +899,7 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx,
>                 }
>
>                 emit_imm(RV_REG_T1, off, ctx);
> -               emit(rv_add(RV_REG_T1, RV_REG_T1, rs), ctx);
> +               emit_add(RV_REG_T1, RV_REG_T1, rs, ctx);
>                 emit(rv_lbu(rd, 0, RV_REG_T1), ctx);
>                 if (insn_is_zext(&insn[1]))
>                         return 1;
> @@ -899,7 +911,7 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx,
>                 }
>
>                 emit_imm(RV_REG_T1, off, ctx);
> -               emit(rv_add(RV_REG_T1, RV_REG_T1, rs), ctx);
> +               emit_add(RV_REG_T1, RV_REG_T1, rs, ctx);
>                 emit(rv_lhu(rd, 0, RV_REG_T1), ctx);
>                 if (insn_is_zext(&insn[1]))
>                         return 1;
> @@ -911,20 +923,20 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx,
>                 }
>
>                 emit_imm(RV_REG_T1, off, ctx);
> -               emit(rv_add(RV_REG_T1, RV_REG_T1, rs), ctx);
> +               emit_add(RV_REG_T1, RV_REG_T1, rs, ctx);
>                 emit(rv_lwu(rd, 0, RV_REG_T1), ctx);
>                 if (insn_is_zext(&insn[1]))
>                         return 1;
>                 break;
>         case BPF_LDX | BPF_MEM | BPF_DW:
>                 if (is_12b_int(off)) {
> -                       emit(rv_ld(rd, off, rs), ctx);
> +                       emit_ld(rd, off, rs, ctx);
>                         break;
>                 }
>
>                 emit_imm(RV_REG_T1, off, ctx);
> -               emit(rv_add(RV_REG_T1, RV_REG_T1, rs), ctx);
> -               emit(rv_ld(rd, 0, RV_REG_T1), ctx);
> +               emit_add(RV_REG_T1, RV_REG_T1, rs, ctx);
> +               emit_ld(rd, 0, RV_REG_T1, ctx);
>                 break;
>
>         /* ST: *(size *)(dst + off) = imm */
> @@ -936,7 +948,7 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx,
>                 }
>
>                 emit_imm(RV_REG_T2, off, ctx);
> -               emit(rv_add(RV_REG_T2, RV_REG_T2, rd), ctx);
> +               emit_add(RV_REG_T2, RV_REG_T2, rd, ctx);
>                 emit(rv_sb(RV_REG_T2, 0, RV_REG_T1), ctx);
>                 break;
>
> @@ -948,30 +960,30 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx,
>                 }
>
>                 emit_imm(RV_REG_T2, off, ctx);
> -               emit(rv_add(RV_REG_T2, RV_REG_T2, rd), ctx);
> +               emit_add(RV_REG_T2, RV_REG_T2, rd, ctx);
>                 emit(rv_sh(RV_REG_T2, 0, RV_REG_T1), ctx);
>                 break;
>         case BPF_ST | BPF_MEM | BPF_W:
>                 emit_imm(RV_REG_T1, imm, ctx);
>                 if (is_12b_int(off)) {
> -                       emit(rv_sw(rd, off, RV_REG_T1), ctx);
> +                       emit_sw(rd, off, RV_REG_T1, ctx);
>                         break;
>                 }
>
>                 emit_imm(RV_REG_T2, off, ctx);
> -               emit(rv_add(RV_REG_T2, RV_REG_T2, rd), ctx);
> -               emit(rv_sw(RV_REG_T2, 0, RV_REG_T1), ctx);
> +               emit_add(RV_REG_T2, RV_REG_T2, rd, ctx);
> +               emit_sw(RV_REG_T2, 0, RV_REG_T1, ctx);
>                 break;
>         case BPF_ST | BPF_MEM | BPF_DW:
>                 emit_imm(RV_REG_T1, imm, ctx);
>                 if (is_12b_int(off)) {
> -                       emit(rv_sd(rd, off, RV_REG_T1), ctx);
> +                       emit_sd(rd, off, RV_REG_T1, ctx);
>                         break;
>                 }
>
>                 emit_imm(RV_REG_T2, off, ctx);
> -               emit(rv_add(RV_REG_T2, RV_REG_T2, rd), ctx);
> -               emit(rv_sd(RV_REG_T2, 0, RV_REG_T1), ctx);
> +               emit_add(RV_REG_T2, RV_REG_T2, rd, ctx);
> +               emit_sd(RV_REG_T2, 0, RV_REG_T1, ctx);
>                 break;
>
>         /* STX: *(size *)(dst + off) = src */
> @@ -982,7 +994,7 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx,
>                 }
>
>                 emit_imm(RV_REG_T1, off, ctx);
> -               emit(rv_add(RV_REG_T1, RV_REG_T1, rd), ctx);
> +               emit_add(RV_REG_T1, RV_REG_T1, rd, ctx);
>                 emit(rv_sb(RV_REG_T1, 0, rs), ctx);
>                 break;
>         case BPF_STX | BPF_MEM | BPF_H:
> @@ -992,28 +1004,28 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx,
>                 }
>
>                 emit_imm(RV_REG_T1, off, ctx);
> -               emit(rv_add(RV_REG_T1, RV_REG_T1, rd), ctx);
> +               emit_add(RV_REG_T1, RV_REG_T1, rd, ctx);
>                 emit(rv_sh(RV_REG_T1, 0, rs), ctx);
>                 break;
>         case BPF_STX | BPF_MEM | BPF_W:
>                 if (is_12b_int(off)) {
> -                       emit(rv_sw(rd, off, rs), ctx);
> +                       emit_sw(rd, off, rs, ctx);
>                         break;
>                 }
>
>                 emit_imm(RV_REG_T1, off, ctx);
> -               emit(rv_add(RV_REG_T1, RV_REG_T1, rd), ctx);
> -               emit(rv_sw(RV_REG_T1, 0, rs), ctx);
> +               emit_add(RV_REG_T1, RV_REG_T1, rd, ctx);
> +               emit_sw(RV_REG_T1, 0, rs, ctx);
>                 break;
>         case BPF_STX | BPF_MEM | BPF_DW:
>                 if (is_12b_int(off)) {
> -                       emit(rv_sd(rd, off, rs), ctx);
> +                       emit_sd(rd, off, rs, ctx);
>                         break;
>                 }
>
>                 emit_imm(RV_REG_T1, off, ctx);
> -               emit(rv_add(RV_REG_T1, RV_REG_T1, rd), ctx);
> -               emit(rv_sd(RV_REG_T1, 0, rs), ctx);
> +               emit_add(RV_REG_T1, RV_REG_T1, rd, ctx);
> +               emit_sd(RV_REG_T1, 0, rs, ctx);
>                 break;
>         /* STX XADD: lock *(u32 *)(dst + off) += src */
>         case BPF_STX | BPF_XADD | BPF_W:
> @@ -1021,10 +1033,10 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx,
>         case BPF_STX | BPF_XADD | BPF_DW:
>                 if (off) {
>                         if (is_12b_int(off)) {
> -                               emit(rv_addi(RV_REG_T1, rd, off), ctx);
> +                               emit_addi(RV_REG_T1, rd, off, ctx);
>                         } else {
>                                 emit_imm(RV_REG_T1, off, ctx);
> -                               emit(rv_add(RV_REG_T1, RV_REG_T1, rd), ctx);
> +                               emit_add(RV_REG_T1, RV_REG_T1, rd, ctx);
>                         }
>
>                         rd = RV_REG_T1;
> @@ -1073,52 +1085,53 @@ void bpf_jit_build_prologue(struct rv_jit_context *ctx)
>
>         /* First instruction is always setting the tail-call-counter
>          * (TCC) register. This instruction is skipped for tail calls.
> +        * Force using a 4-byte (non-compressed) instruction.
>          */
>         emit(rv_addi(RV_REG_TCC, RV_REG_ZERO, MAX_TAIL_CALL_CNT), ctx);
>
> -       emit(rv_addi(RV_REG_SP, RV_REG_SP, -stack_adjust), ctx);
> +       emit_addi(RV_REG_SP, RV_REG_SP, -stack_adjust, ctx);
>
>         if (seen_reg(RV_REG_RA, ctx)) {
> -               emit(rv_sd(RV_REG_SP, store_offset, RV_REG_RA), ctx);
> +               emit_sd(RV_REG_SP, store_offset, RV_REG_RA, ctx);
>                 store_offset -= 8;
>         }
> -       emit(rv_sd(RV_REG_SP, store_offset, RV_REG_FP), ctx);
> +       emit_sd(RV_REG_SP, store_offset, RV_REG_FP, ctx);
>         store_offset -= 8;
>         if (seen_reg(RV_REG_S1, ctx)) {
> -               emit(rv_sd(RV_REG_SP, store_offset, RV_REG_S1), ctx);
> +               emit_sd(RV_REG_SP, store_offset, RV_REG_S1, ctx);
>                 store_offset -= 8;
>         }
>         if (seen_reg(RV_REG_S2, ctx)) {
> -               emit(rv_sd(RV_REG_SP, store_offset, RV_REG_S2), ctx);
> +               emit_sd(RV_REG_SP, store_offset, RV_REG_S2, ctx);
>                 store_offset -= 8;
>         }
>         if (seen_reg(RV_REG_S3, ctx)) {
> -               emit(rv_sd(RV_REG_SP, store_offset, RV_REG_S3), ctx);
> +               emit_sd(RV_REG_SP, store_offset, RV_REG_S3, ctx);
>                 store_offset -= 8;
>         }
>         if (seen_reg(RV_REG_S4, ctx)) {
> -               emit(rv_sd(RV_REG_SP, store_offset, RV_REG_S4), ctx);
> +               emit_sd(RV_REG_SP, store_offset, RV_REG_S4, ctx);
>                 store_offset -= 8;
>         }
>         if (seen_reg(RV_REG_S5, ctx)) {
> -               emit(rv_sd(RV_REG_SP, store_offset, RV_REG_S5), ctx);
> +               emit_sd(RV_REG_SP, store_offset, RV_REG_S5, ctx);
>                 store_offset -= 8;
>         }
>         if (seen_reg(RV_REG_S6, ctx)) {
> -               emit(rv_sd(RV_REG_SP, store_offset, RV_REG_S6), ctx);
> +               emit_sd(RV_REG_SP, store_offset, RV_REG_S6, ctx);
>                 store_offset -= 8;
>         }
>
> -       emit(rv_addi(RV_REG_FP, RV_REG_SP, stack_adjust), ctx);
> +       emit_addi(RV_REG_FP, RV_REG_SP, stack_adjust, ctx);
>
>         if (bpf_stack_adjust)
> -               emit(rv_addi(RV_REG_S5, RV_REG_SP, bpf_stack_adjust), ctx);
> +               emit_addi(RV_REG_S5, RV_REG_SP, bpf_stack_adjust, ctx);
>
>         /* Program contains calls and tail calls, so RV_REG_TCC need
>          * to be saved across calls.
>          */
>         if (seen_tail_call(ctx) && seen_call(ctx))
> -               emit(rv_addi(RV_REG_TCC_SAVED, RV_REG_TCC, 0), ctx);
> +               emit_mv(RV_REG_TCC_SAVED, RV_REG_TCC, ctx);
>
>         ctx->stack_size = stack_adjust;
>  }
> --
> 2.25.1
>