[RFC PATCH 07/13] ARCv2: memset: rewrite using double load/stores
Sergey Matyukevich
geomatsi at gmail.com
Tue Feb 22 06:15:00 PST 2022
From: Vineet Gupta <vgupta at kernel.org>
Signed-off-by: Vineet Gupta <vgupta at kernel.org>
---
arch/arc/lib/memset-archs.S | 112 ++++++++++++++----------------------
1 file changed, 43 insertions(+), 69 deletions(-)
diff --git a/arch/arc/lib/memset-archs.S b/arch/arc/lib/memset-archs.S
index 330e22f7cf3c..a9a0ccef761d 100644
--- a/arch/arc/lib/memset-archs.S
+++ b/arch/arc/lib/memset-archs.S
@@ -5,6 +5,7 @@
#include <linux/linkage.h>
#include <asm/cache.h>
+#include <asm/assembler.h>
/*
* The memset implementation below is optimized to use prefetchw and prealloc
@@ -55,7 +56,7 @@ ENTRY_CFI(memset)
1:
#endif
-;;; Destination is aligned
+ ; promote memset pattern from char to int (double actually for STD)
and r1, r1, 0xFF
asl r4, r1, 8
or r4, r4, r1
@@ -63,75 +64,48 @@ ENTRY_CFI(memset)
or r5, r5, r4
mov r4, r5
- sub3 lp_count, r2, 8
- cmp r2, 64
- bmsk.hi r2, r2, 5
- mov.ls lp_count, 0
- add3.hi r2, r2, 8
-
-;;; Convert len to Dwords, unfold x8
- lsr.f lp_count, lp_count, 6
-
- lpnz @.Lset64bytes
- ;; LOOP START
- PREALLOC_INSTR r3, 64 ; alloc next line w/o fetching
-
-#ifdef CONFIG_ARC_HAS_LL64
- std.ab r4, [r3, 8]
- std.ab r4, [r3, 8]
- std.ab r4, [r3, 8]
- std.ab r4, [r3, 8]
- std.ab r4, [r3, 8]
- std.ab r4, [r3, 8]
- std.ab r4, [r3, 8]
- std.ab r4, [r3, 8]
-#else
- st.ab r4, [r3, 4]
- st.ab r4, [r3, 4]
- st.ab r4, [r3, 4]
- st.ab r4, [r3, 4]
- st.ab r4, [r3, 4]
- st.ab r4, [r3, 4]
- st.ab r4, [r3, 4]
- st.ab r4, [r3, 4]
- st.ab r4, [r3, 4]
- st.ab r4, [r3, 4]
- st.ab r4, [r3, 4]
- st.ab r4, [r3, 4]
- st.ab r4, [r3, 4]
- st.ab r4, [r3, 4]
- st.ab r4, [r3, 4]
- st.ab r4, [r3, 4]
-#endif
-.Lset64bytes:
-
- lsr.f lp_count, r2, 5 ;Last remaining max 124 bytes
- lpnz .Lset32bytes
- ;; LOOP START
-#ifdef CONFIG_ARC_HAS_LL64
- std.ab r4, [r3, 8]
- std.ab r4, [r3, 8]
- std.ab r4, [r3, 8]
- std.ab r4, [r3, 8]
-#else
- st.ab r4, [r3, 4]
- st.ab r4, [r3, 4]
- st.ab r4, [r3, 4]
- st.ab r4, [r3, 4]
- st.ab r4, [r3, 4]
- st.ab r4, [r3, 4]
- st.ab r4, [r3, 4]
- st.ab r4, [r3, 4]
-#endif
-.Lset32bytes:
-
- and.f lp_count, r2, 0x1F ;Last remaining 31 bytes
-.Lsmallchunk:
- lpnz .Lcopy3bytes
- ;; LOOP START
+ ; Loop #a:
+ ; - Updates 1 cache line worth of data (64 bytes) per iteration
+ ; - PREALLOC the next line.
+ ;
+ ; = Only entered if at least 2 lines worth of work (i.e. >= 128 bytes),
+ ;   else the PREALLOC of the next line can "bleed" past the end of the
+ ;   buffer, corrupting data if that line is owned by some other core.
+ ; = The last 64 bytes (even for the minimum 128 bytes of work) are NOT
+ ;   done here, to avoid that PREALLOC issue.
+
+ sub r6, r2, 64
+ cmp r2, 64
+ bmsk.hi r2, r2, 5 ; trailing 63 bytes
+ mov.ls r6, 0
+ add.hi r2, r2, 64 ; line skipped in loop below
+
+ lsr.f lp_count, r6, 6
+ lpnz 2f
+ PREALLOCR r3, 64
+ ST64.ab r4, r3, 8
+ ST64.ab r4, r3, 8
+ ST64.ab r4, r3, 8
+ ST64.ab r4, r3, 8
+ ST64.ab r4, r3, 8
+ ST64.ab r4, r3, 8
+ ST64.ab r4, r3, 8
+ ST64.ab r4, r3, 8
+2:
+ ; Loop #b: remaining 32-byte chunks (up to 96 bytes)
+ lsr.f lp_count, r2, 5
+ lpnz .Lbyteloop
+ ST64.ab r4, r3, 8
+ ST64.ab r4, r3, 8
+ ST64.ab r4, r3, 8
+ ST64.ab r4, r3, 8
+
+.Lbyteloop:
+ ; Loop #c: up to 31 straggler bytes
+ and.f lp_count, r2, 0x1F
+ lpnz 4f
stb.ab r1, [r3, 1]
-.Lcopy3bytes:
-
+4:
j [blink]
END_CFI(memset)
--
2.25.1
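
For reference, the control flow of the rewritten routine corresponds roughly
to the C sketch below. It is only an illustration of the length partitioning
(byte pattern promotion, the 64-byte PREALLOC loop, the 32-byte loop, and the
straggler bytes), not the kernel implementation: memset_sketch and the test
harness are made-up names, plain memcpy-based 8-byte writes stand in for the
ST64.ab / STD stores, PREALLOC (presumably provided via the asm/assembler.h
include added above) is only noted in a comment, and the alignment /
very-short-length handling done before this hunk is assumed already complete.

    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Illustration only: mirrors the loop structure of the new
     * memset-archs.S, with plain 8-byte writes standing in for ST64.ab. */
    static void *memset_sketch(void *dst, int c, size_t len)
    {
        unsigned char *p = dst;
        uint64_t pat = (unsigned char)c;

        /* promote pattern from char to a 64-bit word (as for STD) */
        pat |= pat << 8;
        pat |= pat << 16;
        pat |= pat << 32;

        /* r6 := len - 64 and r2 := (len & 63) + 64, only when len > 64 */
        size_t bulk = (len > 64) ? len - 64 : 0;
        size_t tail = (len > 64) ? (len & 63) + 64 : len;

        /* Loop #a: one 64-byte line per iteration; runs only when there
         * are at least two lines of work (bulk >= 64), so the PREALLOC of
         * the next line in the asm can never reach past the buffer. */
        for (size_t i = bulk / 64; i; i--) {
            /* PREALLOC(p + 64) would sit here in the asm */
            for (int j = 0; j < 8; j++) {   /* 8 x ST64.ab */
                memcpy(p, &pat, 8);
                p += 8;
            }
        }

        /* Loop #b: remaining 32-byte chunks (up to 96 bytes of tail) */
        for (size_t i = tail / 32; i; i--) {
            for (int j = 0; j < 4; j++) {   /* 4 x ST64.ab */
                memcpy(p, &pat, 8);
                p += 8;
            }
        }

        /* Loop #c: up to 31 straggler bytes */
        for (size_t i = tail & 31; i; i--)
            *p++ = (unsigned char)c;

        return dst;
    }

    int main(void)
    {
        unsigned char buf[512];

        /* cross-check the sketch against the expected fill for each length */
        for (size_t n = 0; n <= 384; n++) {
            memset(buf, 0, sizeof(buf));
            memset_sketch(buf, 0xAB, n);
            for (size_t i = 0; i < sizeof(buf); i++)
                assert(buf[i] == (i < n ? 0xAB : 0));
        }
        puts("sketch matches expected fill for lengths 0..384");
        return 0;
    }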