[openwrt/openwrt] kernel: unroll MIPS r4k cache blast function
LEDE Commits
lede-commits at lists.infradead.org
Sat Mar 10 02:58:43 PST 2018
nbd pushed a commit to openwrt/openwrt.git, branch master:
https://git.lede-project.org/4e8f1e9f4ca088542fd2b861ea2f1a9dca0d845f
commit 4e8f1e9f4ca088542fd2b861ea2f1a9dca0d845f
Author: Felix Fietkau <nbd at nbd.name>
AuthorDate: Mon Dec 4 22:44:33 2017 +0100
kernel: unroll MIPS r4k cache blast function
Optimize the compiler output for larger cache blast cases that are
common for DMA-based networking.
On ar71xx, I measured a routing throughput increase of ~8%.
Signed-off-by: Ben Menchaca <ben.menchaca at qca.qualcomm.com>
Signed-off-by: Rosen Penev <rosenp at gmail.com>
Signed-off-by: Felix Fietkau <nbd at nbd.name>
---
.../linux/brcm47xx/patches-4.9/159-cpu_fixes.patch | 55 +++++++++++++++---
...-r4k_cache-use-more-efficient-cache-blast.patch | 66 ++++++++++++++++++++++
...-r4k_cache-use-more-efficient-cache-blast.patch | 66 ++++++++++++++++++++++
3 files changed, 180 insertions(+), 7 deletions(-)
diff --git a/target/linux/brcm47xx/patches-4.9/159-cpu_fixes.patch b/target/linux/brcm47xx/patches-4.9/159-cpu_fixes.patch
index 36d39fa..3102923 100644
--- a/target/linux/brcm47xx/patches-4.9/159-cpu_fixes.patch
+++ b/target/linux/brcm47xx/patches-4.9/159-cpu_fixes.patch
@@ -204,7 +204,7 @@
#define __BUILD_BLAST_USER_CACHE(pfx, desc, indexop, hitop, lsize) \
static inline void blast_##pfx##cache##lsize##_user_page(unsigned long page) \
-@@ -660,17 +744,19 @@ __BUILD_BLAST_USER_CACHE(d, dcache, Inde
+@@ -660,53 +744,23 @@ __BUILD_BLAST_USER_CACHE(d, dcache, Inde
__BUILD_BLAST_USER_CACHE(i, icache, Index_Invalidate_I, Hit_Invalidate_I, 64)
/* build blast_xxx_range, protected_blast_xxx_range */
@@ -214,18 +214,59 @@
unsigned long end) \
{ \
unsigned long lsize = cpu_##desc##_line_size(); \
+- unsigned long lsize_2 = lsize * 2; \
+- unsigned long lsize_3 = lsize * 3; \
+- unsigned long lsize_4 = lsize * 4; \
+- unsigned long lsize_5 = lsize * 5; \
+- unsigned long lsize_6 = lsize * 6; \
+- unsigned long lsize_7 = lsize * 7; \
+- unsigned long lsize_8 = lsize * 8; \
unsigned long addr = start & ~(lsize - 1); \
- unsigned long aend = (end - 1) & ~(lsize - 1); \
+- unsigned long aend = (end + lsize - 1) & ~(lsize - 1); \
+- int lines = (aend - addr) / lsize; \
++ unsigned long aend = (end - 1) & ~(lsize - 1); \
+ war \
\
__##pfx##flush_prologue \
\
- while (1) { \
+- while (lines >= 8) { \
+- prot##cache_op(hitop, addr); \
+- prot##cache_op(hitop, addr + lsize); \
+- prot##cache_op(hitop, addr + lsize_2); \
+- prot##cache_op(hitop, addr + lsize_3); \
+- prot##cache_op(hitop, addr + lsize_4); \
+- prot##cache_op(hitop, addr + lsize_5); \
+- prot##cache_op(hitop, addr + lsize_6); \
+- prot##cache_op(hitop, addr + lsize_7); \
+- addr += lsize_8; \
+- lines -= 8; \
+- } \
+- \
+- if (lines & 0x4) { \
+- prot##cache_op(hitop, addr); \
+- prot##cache_op(hitop, addr + lsize); \
+- prot##cache_op(hitop, addr + lsize_2); \
+- prot##cache_op(hitop, addr + lsize_3); \
+- addr += lsize_4; \
+- } \
+- \
+- if (lines & 0x2) { \
+- prot##cache_op(hitop, addr); \
+- prot##cache_op(hitop, addr + lsize); \
+- addr += lsize_2; \
+- } \
+- \
+- if (lines & 0x1) { \
++ while (1) { \
+ war2 \
prot##cache_op(hitop, addr); \
- if (addr == aend) \
- break; \
-@@ -682,8 +768,8 @@ static inline void prot##extra##blast_##
++ if (addr == aend) \
++ break; \
++ addr += lsize; \
+ } \
+ \
+ __##pfx##flush_epilogue \
+@@ -714,8 +768,8 @@ static inline void prot##extra##blast_##
#ifndef CONFIG_EVA
@@ -236,7 +277,7 @@
#else
-@@ -720,14 +806,14 @@ __BUILD_PROT_BLAST_CACHE_RANGE(d, dcache
+@@ -752,14 +806,14 @@ __BUILD_PROT_BLAST_CACHE_RANGE(d, dcache
__BUILD_PROT_BLAST_CACHE_RANGE(i, icache, Hit_Invalidate_I)
#endif
diff --git a/target/linux/generic/hack-4.14/300-MIPS-r4k_cache-use-more-efficient-cache-blast.patch b/target/linux/generic/hack-4.14/300-MIPS-r4k_cache-use-more-efficient-cache-blast.patch
new file mode 100644
index 0000000..860a7e0
--- /dev/null
+++ b/target/linux/generic/hack-4.14/300-MIPS-r4k_cache-use-more-efficient-cache-blast.patch
@@ -0,0 +1,66 @@
+From: Ben Menchaca <ben.menchaca at qca.qualcomm.com>
+Date: Fri, 7 Jun 2013 18:35:22 -0500
+Subject: MIPS: r4k_cache: use more efficient cache blast
+
+Optimize the compiler output for larger cache blast cases that are
+common for DMA-based networking.
+
+Signed-off-by: Ben Menchaca <ben.menchaca at qca.qualcomm.com>
+Signed-off-by: Felix Fietkau <nbd at nbd.name>
+---
+--- a/arch/mips/include/asm/r4kcache.h
++++ b/arch/mips/include/asm/r4kcache.h
+@@ -682,16 +682,48 @@ static inline void prot##extra##blast_##
+ unsigned long end) \
+ { \
+ unsigned long lsize = cpu_##desc##_line_size(); \
++ unsigned long lsize_2 = lsize * 2; \
++ unsigned long lsize_3 = lsize * 3; \
++ unsigned long lsize_4 = lsize * 4; \
++ unsigned long lsize_5 = lsize * 5; \
++ unsigned long lsize_6 = lsize * 6; \
++ unsigned long lsize_7 = lsize * 7; \
++ unsigned long lsize_8 = lsize * 8; \
+ unsigned long addr = start & ~(lsize - 1); \
+- unsigned long aend = (end - 1) & ~(lsize - 1); \
++ unsigned long aend = (end + lsize - 1) & ~(lsize - 1); \
++ int lines = (aend - addr) / lsize; \
+ \
+ __##pfx##flush_prologue \
+ \
+- while (1) { \
++ while (lines >= 8) { \
++ prot##cache_op(hitop, addr); \
++ prot##cache_op(hitop, addr + lsize); \
++ prot##cache_op(hitop, addr + lsize_2); \
++ prot##cache_op(hitop, addr + lsize_3); \
++ prot##cache_op(hitop, addr + lsize_4); \
++ prot##cache_op(hitop, addr + lsize_5); \
++ prot##cache_op(hitop, addr + lsize_6); \
++ prot##cache_op(hitop, addr + lsize_7); \
++ addr += lsize_8; \
++ lines -= 8; \
++ } \
++ \
++ if (lines & 0x4) { \
++ prot##cache_op(hitop, addr); \
++ prot##cache_op(hitop, addr + lsize); \
++ prot##cache_op(hitop, addr + lsize_2); \
++ prot##cache_op(hitop, addr + lsize_3); \
++ addr += lsize_4; \
++ } \
++ \
++ if (lines & 0x2) { \
++ prot##cache_op(hitop, addr); \
++ prot##cache_op(hitop, addr + lsize); \
++ addr += lsize_2; \
++ } \
++ \
++ if (lines & 0x1) { \
+ prot##cache_op(hitop, addr); \
+- if (addr == aend) \
+- break; \
+- addr += lsize; \
+ } \
+ \
+ __##pfx##flush_epilogue \
diff --git a/target/linux/generic/hack-4.9/300-MIPS-r4k_cache-use-more-efficient-cache-blast.patch b/target/linux/generic/hack-4.9/300-MIPS-r4k_cache-use-more-efficient-cache-blast.patch
new file mode 100644
index 0000000..ce7901a
--- /dev/null
+++ b/target/linux/generic/hack-4.9/300-MIPS-r4k_cache-use-more-efficient-cache-blast.patch
@@ -0,0 +1,66 @@
+From: Ben Menchaca <ben.menchaca at qca.qualcomm.com>
+Date: Fri, 7 Jun 2013 18:35:22 -0500
+Subject: MIPS: r4k_cache: use more efficient cache blast
+
+Optimize the compiler output for larger cache blast cases that are
+common for DMA-based networking.
+
+Signed-off-by: Ben Menchaca <ben.menchaca at qca.qualcomm.com>
+Signed-off-by: Felix Fietkau <nbd at nbd.name>
+---
+--- a/arch/mips/include/asm/r4kcache.h
++++ b/arch/mips/include/asm/r4kcache.h
+@@ -665,16 +665,48 @@ static inline void prot##extra##blast_##pfx##cache##_range(unsigned long start,
+ unsigned long end) \
+ { \
+ unsigned long lsize = cpu_##desc##_line_size(); \
++ unsigned long lsize_2 = lsize * 2; \
++ unsigned long lsize_3 = lsize * 3; \
++ unsigned long lsize_4 = lsize * 4; \
++ unsigned long lsize_5 = lsize * 5; \
++ unsigned long lsize_6 = lsize * 6; \
++ unsigned long lsize_7 = lsize * 7; \
++ unsigned long lsize_8 = lsize * 8; \
+ unsigned long addr = start & ~(lsize - 1); \
+- unsigned long aend = (end - 1) & ~(lsize - 1); \
++ unsigned long aend = (end + lsize - 1) & ~(lsize - 1); \
++ int lines = (aend - addr) / lsize; \
+ \
+ __##pfx##flush_prologue \
+ \
+- while (1) { \
++ while (lines >= 8) { \
++ prot##cache_op(hitop, addr); \
++ prot##cache_op(hitop, addr + lsize); \
++ prot##cache_op(hitop, addr + lsize_2); \
++ prot##cache_op(hitop, addr + lsize_3); \
++ prot##cache_op(hitop, addr + lsize_4); \
++ prot##cache_op(hitop, addr + lsize_5); \
++ prot##cache_op(hitop, addr + lsize_6); \
++ prot##cache_op(hitop, addr + lsize_7); \
++ addr += lsize_8; \
++ lines -= 8; \
++ } \
++ \
++ if (lines & 0x4) { \
++ prot##cache_op(hitop, addr); \
++ prot##cache_op(hitop, addr + lsize); \
++ prot##cache_op(hitop, addr + lsize_2); \
++ prot##cache_op(hitop, addr + lsize_3); \
++ addr += lsize_4; \
++ } \
++ \
++ if (lines & 0x2) { \
++ prot##cache_op(hitop, addr); \
++ prot##cache_op(hitop, addr + lsize); \
++ addr += lsize_2; \
++ } \
++ \
++ if (lines & 0x1) { \
+ prot##cache_op(hitop, addr); \
+- if (addr == aend) \
+- break; \
+- addr += lsize; \
+ } \
+ \
+ __##pfx##flush_epilogue \
More information about the lede-commits
mailing list