[PATCH 1/1] riscv: __asm_copy_to-from_user: Improve using word copy if size < 9*SZREG

Mon Aug 16 12:00:16 PDT 2021

On Aug 16 2021, Palmer Dabbelt wrote:

> On Fri, 30 Jul 2021 06:52:44 PDT (-0700), akira.tsukamoto at gmail.com wrote:
>> Reduce the number of slow byte_copy iterations when the size is between
>> 2*SZREG and 9*SZREG by using the non-unrolled word_copy.
>>
>> Without it, any size smaller than 9*SZREG uses the slow byte_copy
>> instead of the non-unrolled word_copy.
>>
>> Signed-off-by: Akira Tsukamoto <akira.tsukamoto at gmail.com>
>> ---
>>  arch/riscv/lib/uaccess.S | 46 ++++++++++++++++++++++++++++++++++++----
>>  1 file changed, 42 insertions(+), 4 deletions(-)
>>
>> diff --git a/arch/riscv/lib/uaccess.S b/arch/riscv/lib/uaccess.S
>> index 63bc691cff91..6a80d5517afc 100644
>> --- a/arch/riscv/lib/uaccess.S
>> +++ b/arch/riscv/lib/uaccess.S
>> @@ -34,8 +34,10 @@ ENTRY(__asm_copy_from_user)
>>  	/*
>>  	 * Use byte copy only if too small.
>>  	 * SZREG holds 4 for RV32 and 8 for RV64
>> +	 * a3 - 2*SZREG is minimum size for word_copy
>> +	 *      1*SZREG for aligning dst + 1*SZREG for word_copy
>>  	 */
>> -	li	a3, 9*SZREG /* size must be larger than size in word_copy */
>> +	li	a3, 2*SZREG
>>  	bltu	a2, a3, .Lbyte_copy_tail
>>
>>  	/*
>> @@ -66,9 +68,40 @@ ENTRY(__asm_copy_from_user)
>>  	andi	a3, a1, SZREG-1
>>  	bnez	a3, .Lshift_copy
>>
>> +.Lcheck_size_bulk:
>> +	/*
>> +	 * Evaluate the size if possible to use unrolled.
>> +	 * The word_copy_unlrolled requires larger than 8*SZREG
>> +	 */
>> +	li	a3, 8*SZREG
>> +	add	a4, a0, a3
>> +	bltu	a4, t0, .Lword_copy_unlrolled
>> +
>>  .Lword_copy:
>> -        /*
>> -	 * Both src and dst are aligned, unrolled word copy
>> +	/*
>> +	 * Both src and dst are aligned
>> +	 * None unrolled word copy with every 1*SZREG iteration
>> +	 *
>> +	 * a0 - start of aligned dst
>> +	 * a1 - start of aligned src
>> +	 * t0 - end of aligned dst
>> +	 */
>> +	bgeu	a0, t0, .Lbyte_copy_tail /* check if end of copy */
>> +	addi	t0, t0, -(SZREG) /* not to over run */
>> +1:
>> +	REG_L	a5, 0(a1)
>> +	addi	a1, a1, SZREG
>> +	REG_S	a5, 0(a0)
>> +	addi	a0, a0, SZREG
>> +	bltu	a0, t0, 1b
>> +
>> +	addi	t0, t0, SZREG /* revert to original value */
>> +	j	.Lbyte_copy_tail
>> +
>> +.Lword_copy_unlrolled:
>> +	/*
>> +	 * Both src and dst are aligned
>> +	 * Unrolled word copy with every 8*SZREG iteration
>>  	 *
>>  	 * a0 - start of aligned dst
>>  	 * a1 - start of aligned src
>> @@ -97,7 +130,12 @@ ENTRY(__asm_copy_from_user)
>>  	bltu	a0, t0, 2b
>>
>>  	addi	t0, t0, 8*SZREG /* revert to original value */
>> -	j	.Lbyte_copy_tail
>> +
>> +	/*
>> +	 * Remaining might large enough for word_copy to reduce slow byte
>> +	 * copy
>> +	 */
>> +	j	.Lcheck_size_bulk
>>
>>  .Lshift_copy:
>
> I'm still not convinced that going all the way to such a large unrolling
> factor is a net win, but this at least provides a much smoother cost 
> curve.
>
> That said, this is causing my 32-bit configs to hang.

It's missing fixups for the load and store in the new word-copy loop.

diff --git a/arch/riscv/lib/uaccess.S b/arch/riscv/lib/uaccess.S
index a835df6bd68f..12ed1f76bd1f 100644
--- a/arch/riscv/lib/uaccess.S
+++ b/arch/riscv/lib/uaccess.S
@@ -89,9 +89,9 @@ ENTRY(__asm_copy_from_user)
 	bgeu	a0, t0, .Lbyte_copy_tail /* check if end of copy */
 	addi	t0, t0, -(SZREG) /* not to over run */
 1:
-	REG_L	a5, 0(a1)
+	fixup REG_L	a5, 0(a1), 10f
 	addi	a1, a1, SZREG
-	REG_S	a5, 0(a0)
+	fixup REG_S	a5, 0(a0), 10f
 	addi	a0, a0, SZREG
 	bltu	a0, t0, 1b
 

Andreas.

-- 
Andreas Schwab, schwab at linux-m68k.org
GPG Key fingerprint = 7578 EB47 D4E5 4D69 2510  2552 DF73 E780 A9DA AEC1
"And now for something completely different."