[PATCH v3 1/1] riscv: __asm_copy_to-from_user: Optimize unaligned memory access and pipeline stall
Palmer Dabbelt
palmer at dabbelt.com
Tue Jul 6 16:16:52 PDT 2021
On Wed, 23 Jun 2021 05:40:39 PDT (-0700), akira.tsukamoto at gmail.com wrote:
>
> This patch will reduce cpu usage dramatically in kernel space especially
> for application which use sys-call with large buffer size, such as network
> applications. The main reason behind this is that every unaligned memory
> access will raise exceptions and switch between s-mode and m-mode causing
> large overhead.
>
> First copy in bytes until reaches the first word aligned boundary in
> destination memory address. This is the preparation before the bulk
> aligned word copy.
>
> The destination address is aligned now, but oftentimes the source address
> is not in an aligned boundary. To reduce the unaligned memory access, it
> reads the data from source in aligned boundaries, which will cause the
> data to have an offset, and then combines the data in the next iteration
> by fixing offset with shifting before writing to destination. The majority
> of the improving copy speed comes from this shift copy.
>
> In the lucky situation that the both source and destination address are on
> the aligned boundary, perform load and store with register size to copy the
> data. Without the unrolling, it will reduce the speed since the next store
> instruction for the same register using from the load will stall the
> pipeline.
>
> At last, copying the remainder in one byte at a time.
>
> Signed-off-by: Akira Tsukamoto <akira.tsukamoto at gmail.com>
> ---
> arch/riscv/lib/uaccess.S | 181 +++++++++++++++++++++++++++++++--------
> 1 file changed, 146 insertions(+), 35 deletions(-)
>
> diff --git a/arch/riscv/lib/uaccess.S b/arch/riscv/lib/uaccess.S
> index fceaeb18cc64..bceb0629e440 100644
> --- a/arch/riscv/lib/uaccess.S
> +++ b/arch/riscv/lib/uaccess.S
> @@ -19,50 +19,161 @@ ENTRY(__asm_copy_from_user)
> li t6, SR_SUM
> csrs CSR_STATUS, t6
>
> - add a3, a1, a2
> - /* Use word-oriented copy only if low-order bits match */
> - andi t0, a0, SZREG-1
> - andi t1, a1, SZREG-1
> - bne t0, t1, 2f
> + /* Save for return value */
> + mv t5, a2
>
> - addi t0, a1, SZREG-1
> - andi t1, a3, ~(SZREG-1)
> - andi t0, t0, ~(SZREG-1)
> /*
> - * a3: terminal address of source region
> - * t0: lowest XLEN-aligned address in source
> - * t1: highest XLEN-aligned address in source
> + * Register allocation for code below:
> + * a0 - start of uncopied dst
> + * a1 - start of uncopied src
> + * a2 - size
> + * t0 - end of uncopied dst
> */
> - bgeu t0, t1, 2f
> - bltu a1, t0, 4f
> + add t0, a0, a2
> + bgtu a0, t0, 5f
> +
> + /*
> + * Use byte copy only if too small.
> + */
> + li a3, 8*SZREG /* size must be larger than size in word_copy */
> + bltu a2, a3, .Lbyte_copy_tail
> +
> + /*
> + * Copy first bytes until dst is align to word boundary.
> + * a0 - start of dst
> + * t1 - start of aligned dst
> + */
> + addi t1, a0, SZREG-1
> + andi t1, t1, ~(SZREG-1)
> + /* dst is already aligned, skip */
> + beq a0, t1, .Lskip_first_bytes
> 1:
> - fixup REG_L, t2, (a1), 10f
> - fixup REG_S, t2, (a0), 10f
> - addi a1, a1, SZREG
> - addi a0, a0, SZREG
> - bltu a1, t1, 1b
> + /* a5 - one byte for copying data */
> + fixup lb a5, 0(a1), 10f
> + addi a1, a1, 1 /* src */
> + fixup sb a5, 0(a0), 10f
> + addi a0, a0, 1 /* dst */
> + bltu a0, t1, 1b /* t1 - start of aligned dst */
> +
> +.Lskip_first_bytes:
> + /*
> + * Now dst is aligned.
> + * Use shift-copy if src is misaligned.
> + * Use word-copy if both src and dst are aligned because
> + * can not use shift-copy which do not require shifting
> + */
> + /* a1 - start of src */
> + andi a3, a1, SZREG-1
> + bnez a3, .Lshift_copy
> +
> +.Lword_copy:
> + /*
> + * Both src and dst are aligned, unrolled word copy
> + *
> + * a0 - start of aligned dst
> + * a1 - start of aligned src
> + * a3 - a1 & mask:(SZREG-1)
> + * t0 - end of aligned dst
> + */
> + addi t0, t0, -(8*SZREG-1) /* not to over run */
> 2:
> - bltu a1, a3, 5f
> + fixup REG_L a4, 0(a1), 10f
> + fixup REG_L a5, SZREG(a1), 10f
> + fixup REG_L a6, 2*SZREG(a1), 10f
> + fixup REG_L a7, 3*SZREG(a1), 10f
> + fixup REG_L t1, 4*SZREG(a1), 10f
> + fixup REG_L t2, 5*SZREG(a1), 10f
> + fixup REG_L t3, 6*SZREG(a1), 10f
> + fixup REG_L t4, 7*SZREG(a1), 10f
> + fixup REG_S a4, 0(a0), 10f
> + fixup REG_S a5, SZREG(a0), 10f
> + fixup REG_S a6, 2*SZREG(a0), 10f
> + fixup REG_S a7, 3*SZREG(a0), 10f
> + fixup REG_S t1, 4*SZREG(a0), 10f
> + fixup REG_S t2, 5*SZREG(a0), 10f
> + fixup REG_S t3, 6*SZREG(a0), 10f
> + fixup REG_S t4, 7*SZREG(a0), 10f
This seems like a suspiciously large unrolling factor, at least without
a fallback. My guess is that some workloads will want some smaller
unrolling factors, but given that we run on these single-issue in-order
processors it's probably best to have some big unrolling factors as well
since they're pretty limited WRT integer bandwidth.
> + addi a0, a0, 8*SZREG
> + addi a1, a1, 8*SZREG
> + bltu a0, t0, 2b
> +
> + addi t0, t0, 8*SZREG-1 /* revert to original value */
> + j .Lbyte_copy_tail
> +
> +.Lshift_copy:
> +
> + /*
> + * Word copy with shifting.
> + * For misaligned copy we still perform aligned word copy, but
> + * we need to use the value fetched from the previous iteration and
> + * do some shifts.
> + * This is safe because reading less than a word size.
> + *
> + * a0 - start of aligned dst
> + * a1 - start of src
> + * a3 - a1 & mask:(SZREG-1)
> + * t0 - end of uncopied dst
> + * t1 - end of aligned dst
> + */
> + /* calculating aligned word boundary for dst */
> + andi t1, t0, ~(SZREG-1)
> + /* Converting unaligned src to aligned arc */
> + andi a1, a1, ~(SZREG-1)
> +
> + /*
> + * Calculate shifts
> + * t3 - prev shift
> + * t4 - current shift
> + */
> + slli t3, a3, LGREG
> + li a5, SZREG*8
> + sub t4, a5, t3
> +
> + /* Load the first word to combine with seceond word */
> + fixup REG_L a5, 0(a1), 10f
>
> 3:
> + /* Main shifting copy
> + *
> + * a0 - start of aligned dst
> + * a1 - start of aligned src
> + * t1 - end of aligned dst
> + */
> +
> + /* At least one iteration will be executed */
> + srl a4, a5, t3
> + fixup REG_L a5, SZREG(a1), 10f
> + addi a1, a1, SZREG
> + sll a2, a5, t4
> + or a2, a2, a4
> + fixup REG_S a2, 0(a0), 10f
> + addi a0, a0, SZREG
> + bltu a0, t1, 3b
> +
> + /* Revert src to original unaligned value */
> + add a1, a1, a3
> +
> +.Lbyte_copy_tail:
> + /*
> + * Byte copy anything left.
> + *
> + * a0 - start of remaining dst
> + * a1 - start of remaining src
> + * t0 - end of remaining dst
> + */
> + bgeu a0, t0, 5f
> +4:
> + fixup lb a5, 0(a1), 10f
> + addi a1, a1, 1 /* src */
> + fixup sb a5, 0(a0), 10f
> + addi a0, a0, 1 /* dst */
> + bltu a0, t0, 4b /* t0 - end of dst */
> +
> +5:
> /* Disable access to user memory */
> csrc CSR_STATUS, t6
> - li a0, 0
> + li a0, 0
> ret
> -4: /* Edge case: unalignment */
> - fixup lbu, t2, (a1), 10f
> - fixup sb, t2, (a0), 10f
> - addi a1, a1, 1
> - addi a0, a0, 1
> - bltu a1, t0, 4b
> - j 1b
> -5: /* Edge case: remainder */
> - fixup lbu, t2, (a1), 10f
> - fixup sb, t2, (a0), 10f
> - addi a1, a1, 1
> - addi a0, a0, 1
> - bltu a1, a3, 5b
> - j 3b
> ENDPROC(__asm_copy_to_user)
> ENDPROC(__asm_copy_from_user)
> EXPORT_SYMBOL(__asm_copy_to_user)
> @@ -117,7 +228,7 @@ EXPORT_SYMBOL(__clear_user)
> 10:
> /* Disable access to user memory */
> csrs CSR_STATUS, t6
> - mv a0, a2
> + mv a0, t5
> ret
> 11:
> csrs CSR_STATUS, t6
That said, this is good enough for me. If someone comes up with a case
where the extra unrolling is an issue I'm happy to take something to fix
it, but until then I'm fine with this as-is. Like the string fuctions
it's probably best to eventually put this in C, but IIRC last time I
tried it was kind of a headache.
This is on for-next.
Thanks!
More information about the linux-riscv
mailing list