[PATCH v2 4/4] lib: sbi: Rework misaligned vector load/store
Anup Patel
apatel at ventanamicro.com
Tue Jun 16 23:16:14 PDT 2026
On Tue, Jun 9, 2026 at 12:44 PM Bo Gan <ganboing at gmail.com> wrote:
>
> Fix the following issues with misaligned vector load/store:
>
> a. Stack overflow: the mask[VLEN_MAX / 8] variable consumes 8K stack
> space, given VLEN_MAX=65536, overflowing the default-sized stack.
> There's no need to fetch the whole mask in one go; instead, fetch it
> on demand. Use a 128-byte local buffer to hold a sliding window of
> the mask. For RVV loads, this is allowed -- from the spec:
>
> "The destination vector register group for a masked vector
> instruction cannot overlap the source mask register (v0),
> unless the destination vector register is being written with
> a mask value (e.g., compares) or the scalar result of a reduction"
>
> We don't need to worry about the mask getting overwritten.
>
> b. Maintain the value of vstart upon abort (uptrap) to avoid duplicate
> work. After fault resolution, the instruction can restart from the
> faulting vstart. For Fault-Only-First loads, reset vstart to 0, as
> was done previously, to conform to the spec.
>
> c. Explicitly set VS dirty in VSSTATUS with SET_VS_DIRTY() if faulting
> from V=1, and if any vector register, including vstart/vl/vtype, gets
> changed in the handler. This can add one unnecessary op to set VS
> dirty in M/SSTATUS (not VSSTATUS), which the HW has already done,
> but for code simplicity, do it anyway. The overhead should be negligible.
>
> Signed-off-by: Bo Gan <ganboing at gmail.com>
LGTM.
Reviewed-by: Anup Patel <anup at brainfault.org>
Regards,
Anup
> ---
> include/sbi/sbi_vector.h | 6 +++
> lib/sbi/sbi_trap_v_ldst.c | 103 +++++++++++++++++++++++++-------------
> 2 files changed, 74 insertions(+), 35 deletions(-)
>
> diff --git a/include/sbi/sbi_vector.h b/include/sbi/sbi_vector.h
> index f00184f0..c14f3174 100644
> --- a/include/sbi/sbi_vector.h
> +++ b/include/sbi/sbi_vector.h
> @@ -20,6 +20,12 @@ struct sbi_vector_context {
> uint8_t vregs[];
> };
>
> +#define SET_VS_DIRTY(regs) do { \
> + if (sbi_regs_from_virt(regs)) \
> + csr_set(CSR_VSSTATUS, MSTATUS_VS); \
> + regs->mstatus |= MSTATUS_VS; \
> +} while(0)
> +
> #ifdef OPENSBI_CC_SUPPORT_VECTOR
> void sbi_vector_save(struct sbi_vector_context *dst);
> void sbi_vector_restore(const struct sbi_vector_context *src);
> diff --git a/lib/sbi/sbi_trap_v_ldst.c b/lib/sbi/sbi_trap_v_ldst.c
> index 02f7d6cc..0f29dcf9 100644
> --- a/lib/sbi/sbi_trap_v_ldst.c
> +++ b/lib/sbi/sbi_trap_v_ldst.c
> @@ -16,11 +16,11 @@
> #include <sbi/sbi_trap_ldst.h>
> #include <sbi/sbi_trap.h>
> #include <sbi/sbi_unpriv.h>
> -#include <sbi/sbi_trap.h>
> +#include <sbi/sbi_vector.h>
>
> #ifdef OPENSBI_CC_SUPPORT_VECTOR
>
> -#define VLEN_MAX 65536
> +#define MASK_BUFFLEN 1024
>
> static inline void set_vreg(ulong vlenb, ulong which,
> ulong pos, ulong size, const uint8_t *bytes)
> @@ -168,7 +168,7 @@ int sbi_misaligned_v_ld_emulator(ulong insn, struct sbi_trap_context *tcntx)
> ulong vl = csr_read(CSR_VL);
> ulong vtype = csr_read(CSR_VTYPE);
> ulong vlenb = csr_read(CSR_VLENB);
> - ulong vstart = csr_read(CSR_VSTART);
> + ulong vstart = csr_read(CSR_VSTART), orig_vstart = vstart;
> ulong base = GET_RS1(insn, regs);
> ulong stride = GET_RS2(insn, regs);
> ulong vd = GET_VD(insn);
> @@ -178,8 +178,9 @@ int sbi_misaligned_v_ld_emulator(ulong insn, struct sbi_trap_context *tcntx)
> ulong vlmul = GET_VLMUL(vtype);
> bool illegal = GET_MEW(insn);
> bool masked = IS_MASKED(insn);
> - uint8_t mask[VLEN_MAX / 8];
> + uint8_t mask[MASK_BUFFLEN / 8];
> uint8_t bytes[8 * sizeof(uint64_t)];
> + ulong mask_len = MASK_BUFFLEN < vlenb * 8 ? MASK_BUFFLEN : vlenb * 8;
> ulong len = GET_LEN(view);
> ulong nf = GET_NF(insn);
> ulong vemul = GET_VEMUL(vlmul, view, vsew);
> @@ -200,7 +201,7 @@ int sbi_misaligned_v_ld_emulator(ulong insn, struct sbi_trap_context *tcntx)
> stride = nf * len;
> }
>
> - if (illegal || vlenb > VLEN_MAX / 8) {
> + if (illegal) {
> struct sbi_trap_info trap = {
> uptrap.cause = CAUSE_ILLEGAL_INSTRUCTION,
> uptrap.tval = insn,
> @@ -208,12 +209,16 @@ int sbi_misaligned_v_ld_emulator(ulong insn, struct sbi_trap_context *tcntx)
> return sbi_trap_redirect(regs, &trap);
> }
>
> - if (masked)
> - get_vreg(vlenb, 0, 0, vlenb, mask);
> -
> do {
> - if (masked && (~mask[vstart / 8] & BIT(vstart % 8)))
> - continue;
> + if (masked) {
> + if (vstart == orig_vstart || vstart % mask_len == 0)
> + /* Fetch a mask_len chunk of mask */
> + get_vreg(vlenb, 0, vstart / mask_len * mask_len,
> + mask_len, mask);
> +
> + if (~mask[vstart % mask_len / 8] & BIT(vstart % 8))
> + continue;
> + }
>
> /* compute element address */
> ulong addr = base + vstart * stride;
> @@ -232,15 +237,21 @@ int sbi_misaligned_v_ld_emulator(ulong insn, struct sbi_trap_context *tcntx)
> sbi_load_loop(bytes + seg * len,
> addr + seg * len, len, &uptrap);
>
> - if (uptrap.cause) {
> - if (IS_FAULT_ONLY_FIRST_LOAD(insn) && vstart != 0) {
> - vl = vstart;
> - break;
> - }
> - vsetvl(vl, vtype);
> - sbi_misaligned_v_tinst_fixup(&uptrap);
> - return sbi_trap_redirect(regs, &uptrap);
> + if (!uptrap.cause)
> + continue;
> +
> + if (IS_FAULT_ONLY_FIRST_LOAD(insn) && vstart != 0) {
> + vl = vstart;
> + goto done;
> }
> +
> + vsetvl(vl, vtype);
> + csr_write(CSR_VSTART, vstart);
> + /* Don't forget to set dirty if vstart has changed */
> + if (vstart != orig_vstart)
> + SET_VS_DIRTY(regs);
> + sbi_misaligned_v_tinst_fixup(&uptrap);
> + return sbi_trap_redirect(regs, &uptrap);
> }
>
> /* write load data to regfile */
> @@ -249,8 +260,15 @@ int sbi_misaligned_v_ld_emulator(ulong insn, struct sbi_trap_context *tcntx)
> len, &bytes[seg * len]);
> } while (++vstart < vl);
>
> +done:
> /* restore clobbered vl/vtype */
> - vsetvl(vl, vtype);
> + vsetvl(vl, vtype); // VSTART resets to 0
> +
> + /*
> + * At least 1 element is processed, or vl is changed above in
> + * the FAULT_ONLY_FIRST_LOAD path, thus set dirty.
> + */
> + SET_VS_DIRTY(regs);
>
> /* Return a >0 value for the caller to advance mepc */
> return 1;
> @@ -263,7 +281,7 @@ int sbi_misaligned_v_st_emulator(ulong insn, struct sbi_trap_context *tcntx)
> ulong vl = csr_read(CSR_VL);
> ulong vtype = csr_read(CSR_VTYPE);
> ulong vlenb = csr_read(CSR_VLENB);
> - ulong vstart = csr_read(CSR_VSTART);
> + ulong vstart = csr_read(CSR_VSTART), orig_vstart = vstart;
> ulong base = GET_RS1(insn, regs);
> ulong stride = GET_RS2(insn, regs);
> ulong vd = GET_VD(insn);
> @@ -273,8 +291,9 @@ int sbi_misaligned_v_st_emulator(ulong insn, struct sbi_trap_context *tcntx)
> ulong vlmul = GET_VLMUL(vtype);
> bool illegal = GET_MEW(insn);
> bool masked = IS_MASKED(insn);
> - uint8_t mask[VLEN_MAX / 8];
> + uint8_t mask[MASK_BUFFLEN / 8];
> uint8_t bytes[8 * sizeof(uint64_t)];
> + ulong mask_len = MASK_BUFFLEN < vlenb * 8 ? MASK_BUFFLEN : vlenb * 8;
> ulong len = GET_LEN(view);
> ulong nf = GET_NF(insn);
> ulong vemul = GET_VEMUL(vlmul, view, vsew);
> @@ -295,7 +314,7 @@ int sbi_misaligned_v_st_emulator(ulong insn, struct sbi_trap_context *tcntx)
> stride = nf * len;
> }
>
> - if (illegal || vlenb > VLEN_MAX / 8) {
> + if (illegal) {
> struct sbi_trap_info trap = {
> uptrap.cause = CAUSE_ILLEGAL_INSTRUCTION,
> uptrap.tval = insn,
> @@ -303,12 +322,16 @@ int sbi_misaligned_v_st_emulator(ulong insn, struct sbi_trap_context *tcntx)
> return sbi_trap_redirect(regs, &trap);
> }
>
> - if (masked)
> - get_vreg(vlenb, 0, 0, vlenb, mask);
> -
> do {
> - if (masked && (~mask[vstart / 8] & BIT(vstart % 8)))
> - continue;
> + if (masked) {
> + if (vstart == orig_vstart || vstart % mask_len == 0)
> + /* Fetch a mask_len chunk of mask */
> + get_vreg(vlenb, 0, vstart / mask_len * mask_len,
> + mask_len, mask);
> +
> + if (~mask[vstart % mask_len / 8] & BIT(vstart % 8))
> + continue;
> + }
>
> /* compute element address */
> ulong addr = base + vstart * stride;
> @@ -325,23 +348,33 @@ int sbi_misaligned_v_st_emulator(ulong insn, struct sbi_trap_context *tcntx)
> get_vreg(vlenb, vd + seg * emul, vstart * len,
> len, &bytes[seg * len]);
>
> - csr_write(CSR_VSTART, vstart);
> -
> /* write store data to memory */
> for (ulong seg = 0; seg < nf; seg++) {
> sbi_store_loop(bytes + seg * len,
> addr + seg * len, len, &uptrap);
>
> - if (uptrap.cause) {
> - vsetvl(vl, vtype);
> - sbi_misaligned_v_tinst_fixup(&uptrap);
> - return sbi_trap_redirect(regs, &uptrap);
> - }
> + if (!uptrap.cause)
> + continue;
> +
> + vsetvl(vl, vtype);
> + csr_write(CSR_VSTART, vstart);
> + /* Don't forget to set dirty if vstart has changed */
> + if (vstart != orig_vstart)
> + SET_VS_DIRTY(regs);
> + sbi_misaligned_v_tinst_fixup(&uptrap);
> + return sbi_trap_redirect(regs, &uptrap);
> }
> } while (++vstart < vl);
>
> /* restore clobbered vl/vtype */
> - vsetvl(vl, vtype);
> + vsetvl(vl, vtype); // VSTART resets to 0
> +
> + /*
> + * No need to set dirty for memory store, but as VSTART resets to
> + * 0 above, need to set dirty if it's originally not 0.
> + */
> + if (orig_vstart != 0)
> + SET_VS_DIRTY(regs);
>
> /* Return a >0 value for the caller to advance mepc */
> return 1;
> --
> 2.34.1
>
>
> --
> opensbi mailing list
> opensbi at lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/opensbi
More information about the opensbi
mailing list