[PATCH v2 4/4] lib: sbi: Rework misaligned vector load/store

Anup Patel apatel at ventanamicro.com
Tue Jun 16 23:16:14 PDT 2026


On Tue, Jun 9, 2026 at 12:44 PM Bo Gan <ganboing at gmail.com> wrote:
>
> Fix the following issues with misaligned vector load/store:
>
> a. Stack overflow: the mask[VLEN_MAX / 8] variable consumes 8K stack
> space, given VLEN_MAX=65536, overflowing the default-sized stack.
> There's no need to fetch the whole mask in one go; instead, fetch it
> on demand. Use a 128-byte local buffer to hold a sliding window of
> the mask. For rvv load, this is allowed -- from the spec:
>
>   "The destination vector register group for a masked vector
>    instruction cannot overlap the source mask register (v0),
>    unless the destination vector register is being written with
>    a mask value (e.g., compares) or the scalar result of a reduction"
>
> We don't need to worry about the mask getting overwritten.
>
> b. Maintain the value of vstart upon abort (uptrap) to avoid duplicate
> work. After fault resolution, the instruction can restart from the
> faulting vstart. For Fault-Only-First loads, reset vstart to 0, as
> was done previously, to conform to the spec.
>
> c. Explicitly set VS dirty in VSSTATUS with SET_VS_DIRTY() if faulting
> from V=1, and if any vector register, including vstart/vl/vtype, gets
> changed in the handler. This can add one unnecessary operation to set
> VS dirty in M/SSTATUS (not VSSTATUS) when the HW has already done so,
> but for code simplicity, do it anyway. The overhead should be negligible.
>
> Signed-off-by: Bo Gan <ganboing at gmail.com>

LGTM.

Reviewed-by: Anup Patel <anup at brainfault.org>

Regards,
Anup

> ---
>  include/sbi/sbi_vector.h  |   6 +++
>  lib/sbi/sbi_trap_v_ldst.c | 103 +++++++++++++++++++++++++-------------
>  2 files changed, 74 insertions(+), 35 deletions(-)
>
> diff --git a/include/sbi/sbi_vector.h b/include/sbi/sbi_vector.h
> index f00184f0..c14f3174 100644
> --- a/include/sbi/sbi_vector.h
> +++ b/include/sbi/sbi_vector.h
> @@ -20,6 +20,12 @@ struct sbi_vector_context {
>         uint8_t vregs[];
>  };
>
> +#define SET_VS_DIRTY(regs) do {                                \
> +       if (sbi_regs_from_virt(regs))                   \
> +               csr_set(CSR_VSSTATUS, MSTATUS_VS);      \
> +       regs->mstatus |= MSTATUS_VS;                    \
> +} while(0)
> +
>  #ifdef OPENSBI_CC_SUPPORT_VECTOR
>  void sbi_vector_save(struct sbi_vector_context *dst);
>  void sbi_vector_restore(const struct sbi_vector_context *src);
> diff --git a/lib/sbi/sbi_trap_v_ldst.c b/lib/sbi/sbi_trap_v_ldst.c
> index 02f7d6cc..0f29dcf9 100644
> --- a/lib/sbi/sbi_trap_v_ldst.c
> +++ b/lib/sbi/sbi_trap_v_ldst.c
> @@ -16,11 +16,11 @@
>  #include <sbi/sbi_trap_ldst.h>
>  #include <sbi/sbi_trap.h>
>  #include <sbi/sbi_unpriv.h>
> -#include <sbi/sbi_trap.h>
> +#include <sbi/sbi_vector.h>
>
>  #ifdef OPENSBI_CC_SUPPORT_VECTOR
>
> -#define VLEN_MAX 65536
> +#define MASK_BUFFLEN 1024
>
>  static inline void set_vreg(ulong vlenb, ulong which,
>                             ulong pos, ulong size, const uint8_t *bytes)
> @@ -168,7 +168,7 @@ int sbi_misaligned_v_ld_emulator(ulong insn, struct sbi_trap_context *tcntx)
>         ulong vl = csr_read(CSR_VL);
>         ulong vtype = csr_read(CSR_VTYPE);
>         ulong vlenb = csr_read(CSR_VLENB);
> -       ulong vstart = csr_read(CSR_VSTART);
> +       ulong vstart = csr_read(CSR_VSTART), orig_vstart = vstart;
>         ulong base = GET_RS1(insn, regs);
>         ulong stride = GET_RS2(insn, regs);
>         ulong vd = GET_VD(insn);
> @@ -178,8 +178,9 @@ int sbi_misaligned_v_ld_emulator(ulong insn, struct sbi_trap_context *tcntx)
>         ulong vlmul = GET_VLMUL(vtype);
>         bool illegal = GET_MEW(insn);
>         bool masked = IS_MASKED(insn);
> -       uint8_t mask[VLEN_MAX / 8];
> +       uint8_t mask[MASK_BUFFLEN / 8];
>         uint8_t bytes[8 * sizeof(uint64_t)];
> +       ulong mask_len = MASK_BUFFLEN < vlenb * 8 ? MASK_BUFFLEN : vlenb * 8;
>         ulong len = GET_LEN(view);
>         ulong nf = GET_NF(insn);
>         ulong vemul = GET_VEMUL(vlmul, view, vsew);
> @@ -200,7 +201,7 @@ int sbi_misaligned_v_ld_emulator(ulong insn, struct sbi_trap_context *tcntx)
>                 stride = nf * len;
>         }
>
> -       if (illegal || vlenb > VLEN_MAX / 8) {
> +       if (illegal) {
>                 struct sbi_trap_info trap = {
>                         uptrap.cause = CAUSE_ILLEGAL_INSTRUCTION,
>                         uptrap.tval = insn,
> @@ -208,12 +209,16 @@ int sbi_misaligned_v_ld_emulator(ulong insn, struct sbi_trap_context *tcntx)
>                 return sbi_trap_redirect(regs, &trap);
>         }
>
> -       if (masked)
> -               get_vreg(vlenb, 0, 0, vlenb, mask);
> -
>         do {
> -               if (masked && (~mask[vstart / 8] & BIT(vstart % 8)))
> -                       continue;
> +               if (masked) {
> +                       if (vstart == orig_vstart || vstart % mask_len == 0)
> +                               /* Fetch a mask_len chunk of mask */
> +                               get_vreg(vlenb, 0, vstart / mask_len * mask_len,
> +                                        mask_len, mask);
> +
> +                       if (~mask[vstart % mask_len / 8] & BIT(vstart % 8))
> +                               continue;
> +               }
>
>                 /* compute element address */
>                 ulong addr = base + vstart * stride;
> @@ -232,15 +237,21 @@ int sbi_misaligned_v_ld_emulator(ulong insn, struct sbi_trap_context *tcntx)
>                         sbi_load_loop(bytes + seg * len,
>                                        addr + seg * len, len, &uptrap);
>
> -                       if (uptrap.cause) {
> -                               if (IS_FAULT_ONLY_FIRST_LOAD(insn) && vstart != 0) {
> -                                       vl = vstart;
> -                                       break;
> -                               }
> -                               vsetvl(vl, vtype);
> -                               sbi_misaligned_v_tinst_fixup(&uptrap);
> -                               return sbi_trap_redirect(regs, &uptrap);
> +                       if (!uptrap.cause)
> +                               continue;
> +
> +                       if (IS_FAULT_ONLY_FIRST_LOAD(insn) && vstart != 0) {
> +                               vl = vstart;
> +                               goto done;
>                         }
> +
> +                       vsetvl(vl, vtype);
> +                       csr_write(CSR_VSTART, vstart);
> +                       /* Don't forget to set dirty if vstart has changed */
> +                       if (vstart != orig_vstart)
> +                               SET_VS_DIRTY(regs);
> +                       sbi_misaligned_v_tinst_fixup(&uptrap);
> +                       return sbi_trap_redirect(regs, &uptrap);
>                 }
>
>                 /* write load data to regfile */
> @@ -249,8 +260,15 @@ int sbi_misaligned_v_ld_emulator(ulong insn, struct sbi_trap_context *tcntx)
>                                  len, &bytes[seg * len]);
>         } while (++vstart < vl);
>
> +done:
>         /* restore clobbered vl/vtype */
> -       vsetvl(vl, vtype);
> +       vsetvl(vl, vtype); // VSTART resets to 0
> +
> +       /*
> +        * At least 1 element is processed, or vl is changed above in
> +        * the FAULT_ONLY_FIRST_LOAD path, thus set dirty.
> +        */
> +       SET_VS_DIRTY(regs);
>
>         /* Return a >0 value for the caller to advance mepc */
>         return 1;
> @@ -263,7 +281,7 @@ int sbi_misaligned_v_st_emulator(ulong insn, struct sbi_trap_context *tcntx)
>         ulong vl = csr_read(CSR_VL);
>         ulong vtype = csr_read(CSR_VTYPE);
>         ulong vlenb = csr_read(CSR_VLENB);
> -       ulong vstart = csr_read(CSR_VSTART);
> +       ulong vstart = csr_read(CSR_VSTART), orig_vstart = vstart;
>         ulong base = GET_RS1(insn, regs);
>         ulong stride = GET_RS2(insn, regs);
>         ulong vd = GET_VD(insn);
> @@ -273,8 +291,9 @@ int sbi_misaligned_v_st_emulator(ulong insn, struct sbi_trap_context *tcntx)
>         ulong vlmul = GET_VLMUL(vtype);
>         bool illegal = GET_MEW(insn);
>         bool masked = IS_MASKED(insn);
> -       uint8_t mask[VLEN_MAX / 8];
> +       uint8_t mask[MASK_BUFFLEN / 8];
>         uint8_t bytes[8 * sizeof(uint64_t)];
> +       ulong mask_len = MASK_BUFFLEN < vlenb * 8 ? MASK_BUFFLEN : vlenb * 8;
>         ulong len = GET_LEN(view);
>         ulong nf = GET_NF(insn);
>         ulong vemul = GET_VEMUL(vlmul, view, vsew);
> @@ -295,7 +314,7 @@ int sbi_misaligned_v_st_emulator(ulong insn, struct sbi_trap_context *tcntx)
>                 stride = nf * len;
>         }
>
> -       if (illegal || vlenb > VLEN_MAX / 8) {
> +       if (illegal) {
>                 struct sbi_trap_info trap = {
>                         uptrap.cause = CAUSE_ILLEGAL_INSTRUCTION,
>                         uptrap.tval = insn,
> @@ -303,12 +322,16 @@ int sbi_misaligned_v_st_emulator(ulong insn, struct sbi_trap_context *tcntx)
>                 return sbi_trap_redirect(regs, &trap);
>         }
>
> -       if (masked)
> -               get_vreg(vlenb, 0, 0, vlenb, mask);
> -
>         do {
> -               if (masked && (~mask[vstart / 8] & BIT(vstart % 8)))
> -                       continue;
> +               if (masked) {
> +                       if (vstart == orig_vstart || vstart % mask_len == 0)
> +                               /* Fetch a mask_len chunk of mask */
> +                               get_vreg(vlenb, 0, vstart / mask_len * mask_len,
> +                                        mask_len, mask);
> +
> +                       if (~mask[vstart % mask_len / 8] & BIT(vstart % 8))
> +                               continue;
> +               }
>
>                 /* compute element address */
>                 ulong addr = base + vstart * stride;
> @@ -325,23 +348,33 @@ int sbi_misaligned_v_st_emulator(ulong insn, struct sbi_trap_context *tcntx)
>                         get_vreg(vlenb, vd + seg * emul, vstart * len,
>                                  len, &bytes[seg * len]);
>
> -               csr_write(CSR_VSTART, vstart);
> -
>                 /* write store data to memory */
>                 for (ulong seg = 0; seg < nf; seg++) {
>                         sbi_store_loop(bytes + seg * len,
>                                         addr + seg * len, len, &uptrap);
>
> -                       if (uptrap.cause) {
> -                               vsetvl(vl, vtype);
> -                               sbi_misaligned_v_tinst_fixup(&uptrap);
> -                               return sbi_trap_redirect(regs, &uptrap);
> -                       }
> +                       if (!uptrap.cause)
> +                               continue;
> +
> +                       vsetvl(vl, vtype);
> +                       csr_write(CSR_VSTART, vstart);
> +                       /* Don't forget to set dirty if vstart has changed */
> +                       if (vstart != orig_vstart)
> +                               SET_VS_DIRTY(regs);
> +                       sbi_misaligned_v_tinst_fixup(&uptrap);
> +                       return sbi_trap_redirect(regs, &uptrap);
>                 }
>         } while (++vstart < vl);
>
>         /* restore clobbered vl/vtype */
> -       vsetvl(vl, vtype);
> +       vsetvl(vl, vtype); // VSTART resets to 0
> +
> +       /*
> +        * No need to set dirty for memory store, but as VSTART resets to
> +        * 0 above, need to set dirty if it's originally not 0.
> +        */
> +       if (orig_vstart != 0)
> +               SET_VS_DIRTY(regs);
>
>         /* Return a >0 value for the caller to advance mepc */
>         return 1;
> --
> 2.34.1
>
>
> --
> opensbi mailing list
> opensbi at lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/opensbi



More information about the opensbi mailing list