[PATCH v2 2/2] RISC-V: lib: Optimize memset performance

Wed May 10 18:34:53 PDT 2023

From: zhangfei <zhangfei at nj.iscas.ac.cn>

Optimize memset performance when the data size is less than 16 bytes.
Compared to storing byte by byte in a loop, this achieves a significant performance improvement.
It allows the store instructions to be executed in parallel and reduces the number of branches.
Additional size checks avoid redundant stores.

Signed-off-by: Fei Zhang <zhangfei at nj.iscas.ac.cn>
---
 arch/riscv/lib/memset.S | 40 +++++++++++++++++++++++++++++++++++++---
 1 file changed, 37 insertions(+), 3 deletions(-)

diff --git a/arch/riscv/lib/memset.S b/arch/riscv/lib/memset.S
index e613c5c27998..452764bc9900 100644
--- a/arch/riscv/lib/memset.S
+++ b/arch/riscv/lib/memset.S
@@ -106,9 +106,43 @@ WEAK(memset)
 	beqz	a2, 6f
 	add	a3, t0, a2
 5:
-	sb	a1, 0(t0)
-	addi	t0, t0, 1
-	bltu	t0, a3, 5b
+       /* fill head and tail with minimal branching */
+       sb      a1,  0(t0)
+       sb      a1, -1(a3)
+       li 	a4, 2
+       bgeu 	a4, a2, 6f
+
+       sb 	a1,  1(t0)
+       sb 	a1,  2(t0)
+       sb 	a1, -2(a3)
+       sb 	a1, -3(a3)
+       li 	a4, 6
+       bgeu 	a4, a2, 6f
+
+       /*
+        * The additional size checks below return
+        * early to avoid redundant stores, which
+        * can lead to better performance
+        */
+       sb 	a1,  3(t0)
+       sb 	a1, -4(a3)
+       li 	a4, 8
+       bgeu 	a4, a2, 6f
+
+       sb 	a1,  4(t0)
+       sb 	a1, -5(a3)
+       li 	a4, 10
+       bgeu 	a4, a2, 6f
+
+       sb 	a1,  5(t0)
+       sb 	a1,  6(t0)
+       sb 	a1, -6(a3)
+       sb 	a1, -7(a3)
+       li 	a4, 14
+       bgeu 	a4, a2, 6f
+       
+       /* size is 15: store the one remaining middle byte */
+       sb 	a1,  7(t0)
 6:
 	ret
 END(__memset)
-- 
2.33.0