[PATCH 1/1] arm64:lib: Use Linaro's memset routine to avoid DC instruction

Wendy Liang wendy.liang at xilinx.com
Thu May 7 22:23:22 PDT 2015


From: Jason Wu <j.wu at xilinx.com>

Currently, DC ZVA is used to zero out memory, which causes an alignment
fault for the following reason:
"If the memory region being zeroed is any type of Device memory, these
instructions give an alignment fault which is prioritized in the same way
as other alignment faults that are determined by the memory type."
(quoted from the ARM Architecture Reference Manual).

This patch is taken from, and based on, the following links:
https://git.linaro.org/people/zhichang.yuan/cortex_string.git/blobdiff/de7ac2e7e8e1a742a6e4f5304621b7fec00b8c83..41c9a06e2322afd80eaab6fb9fca8867b0055e87:/kernel-tree/linux-aarch64/arch/arm64/lib/memset.S

https://git.linaro.org/people/zhichang.yuan/cortex_string.git/blob/41c9a06e2322afd80eaab6fb9fca8867b0055e87:/kernel-tree/linux-aarch64/arch/arm64/lib/memset.S

thread conversation:
http://lists.infradead.org/pipermail/linux-arm-kernel/2013-December/217997.html

Additional note:
We have carved out top memory from DDR as the memory for the device from DTS:
----
reserved-memory {
                #address-cells = <2>;
                #size-cells = <1>;
                ranges;
                rproc_0_reserved: rproc@3ed00000 {
                        no-map;
                        reg = <0x0 0x3ed00000 0x1000000>;
                };
        };

 amba {
                example@0 {
                        reg = <0x0 0x3ed00000 0x800000>;
			...
		};
};
----
We use dma_declare_coherent_memory() to declare the memory for DMA operations.
We use memset() to initialize the memory with 0.
memset() uses DC ZVA to zero the memory; however, the region is not treated
as normal system memory (even though it is part of DDR), which causes an
alignment fault.
Here is the log

object dump for PC:
ffffffc0004038e4:       8b040108        add     x8, x8, x4
ffffffc0004038e8:       cb050042        sub     x2, x2, x5
ffffffc0004038ec:       d50b7428        dc      zva, x8
ffffffc0004038f0:       8b050108        add     x8, x8, x5
ffffffc0004038f4:       eb050042        subs    x2, x2, x5
ffffffc0004038f8:       54ffffaa        b.ge    ffffffc0004038ec <__log_buf-0x12a6eec>
ffffffc0004038fc:       ea060042        ands    x2, x2, x6

If the DC instruction is used, DC ZVA fails to zero the memory and faults on
the access. The error log is shown below:

[  559.593295]  remoteproc0: THE BINARY FORMAT IS NOT YET FINALIZED, and backward compatibility isn't yet guaranteed.
[  559.677545] Internal error: : 96000061 [#1] SMP
[  559.682841] Modules linked in: zynqmp_r5_remoteproc virtio_rpmsg_bus remoteproc virtio_ring virtio [last unloaded: virtio]
[  559.698134] CPU: 0 PID: 167 Comm: kworker/0:1 Not tainted 3.18.0-13020-g2fc686f-dirty #2
[  559.707953] Workqueue: events request_firmware_work_func
[  559.714313] task: ffffffc03d68b040 ti: ffffffc03c9c8000 task.ti: ffffffc03c9c8000
[  559.722928] PC is at memset+0x1ac/0x200
[  559.727770] LR is at dma_alloc_from_coherent+0xb0/0x10c
[  559.736978] pc : [<ffffffc0004038ec>] lr : [<ffffffc0004729cc>] pstate: 400001c5
[  559.745017] sp : ffffffc03c9cb830
[  559.749004] x29: ffffffc03c9cb830 x28: ffffffc03a55c000
[  559.755613] x27: ffffffc0016a7000 x26: 0000000000003000
[  559.762212] x25: 0000000000000002 x24: 0000000000000140
[  559.768944] x23: ffffffc03c9cb8e8 x22: ffffffc03cb15a28
[  559.775464] x21: ffffffc03c9cb8e0 x20: 0000000000003000
[  559.781987] xs19: ffffffc03cb15ea00 x18: 0000007fd56d67a0
[  559.788550] x17: 00000000004a5c00 _x16: ffffffc0000a8dd94
[  559.795053] x15: 00000000ffffffff x14: 0fffffffffffffff
[  559.801482] x13: 0000000000000030 x12: 0000000000000030
[  559.807935] x11: 0101010101010101 dx10: ffffffff7fffr7f7
[  559.814404] x9 : 0000000000000000 x8 : ffffff8000c00000
[  559.820798] x7 : 0000000000000000 x6 : 000000000000003f
[  559.827138] x5 : 0000000000000040 x4 : 0000000000000000
[  559.833475] x3 : 0000000000000004 x2 : 0000000000002fc0
[  559.839823] x1 : 0000000000000000 x0 : ffffff8000c00000
[  559.846106]
[  559.848447] Process kworker/0:1 (pid: 167, stack limit = 0xffffffc03c9c8058)
[  559.856329] Stack: (0xffffffc03c9cb830 to 0xffffffc03c9cc000)
[  559.863117] b820:                                     3c9cb880 ffffffc0 fc030568 ffffffbf
[  559.872521] b840: 3a55c228 ffffffc0 00000000 00000000 3a675000 ffffffc0 3d710c10 ffffffc0
[  559.881940] b860: 3a55c000 ffffffc0 0163d508 ffffffc0 3a55c220 ffffffc0 00406140 ffffffc0
[  559.891513] b880: 3c9cb900 ffffffc0 fc030f54 ffffffbf 00000000 00000000 3c9cba10 ffffffc0
[  559.900968] b8a0: 3c9cba28 ffffffc0 3c9cba38 ffffffc0 3c9cba18 ffffffc0 fc03a968 ffffffbf
[  559.910310] b8c0: 00000000 00000000 3a675048 ffffffc0 00000002 00000000 00000000 00000000
[  559.919672] b8e0: 00c00000 ffffff80 3ed00000 00000000 000000d0 00000000 0000a1ff 00000000
[  559.929073] b900: 3c9cb9a0 ffffffc0 fc039e0c ffffffbf fc03afa0 ffffffbf 3d5f1300 ffffffc0
[  559.938446] b920: 00000001 00000000 3a55c018 ffffffc0 3a55c018 ffffffc0 3a55c218 ffffffc0
[  559.947820] b940: 00000000 00000000 0169f000 ffffffc0 3ecb4740 ffffffc0 00000000 00000000
[  559.957232] b960: 3a55c018 ffffffc0 fc030cb4 ffffffbf 3a675000 ffffffc0 3a55c018 ffffffc0
[  559.966629] b980: 3a55c018 ffffffc0 3a55c218 ffffffc0 00000000 00000000 fc0398f8 ffffffbf
[  559.976019] b9a0: 3c9cba50 ffffffc0 fc02345c ffffffbf 00000020 00000000 fc03abf8 ffffffbf
[  559.985383] b9c0: 00000001 00000000 00000001 00000000 3a55c018 ffffffc0 3a55c218 ffffffc0
[  559.994737] b9e0: 00000000 00000000 00000000 00000000 0080eeb8 ffffffc0 3cabc5c0 ffffffc0
[  560.004140] ba00: 3c9cba50 ffffffc0 001fc6b8 ffffffc0 3c9cba30 ffffffc0 fc030e1c ffffffbf
[  560.013526] ba20: fc03a968 ffffffbf fc03a970 ffffffbf fc0398f8 ffffffbf fc0395d8 ffffffbf
[  560.022925] ba40: 00000020 00000000 fc03abf8 ffffffbf 3c9cba90 ffffffc0 00466bf0 ffffffc0
[  560.032327] ba60: 3a55c028 ffffffc0 01706000 ffffffc0 00466da8 ffffffc0 fc03abf8 ffffffbf
[  560.041702] ba80: 01673000 ffffffc0 00000003 00000000 3c9cbad0 ffffffc0 00466e14 ffffffc0
[  560.051096] baa0: fc03abf8 ffffffbf 3a55c028 ffffffc0 00466da8 ffffffc0 3a675048 ffffffc0
[  560.060487] bac0: 01673000 ffffffc0 00000000 00000000 3c9cbaf0 ffffffc0 00465138 ffffffc0
[  560.069868] bae0: 00000000 00000000 3a55c028 ffffffc0 3c9cbb30 ffffffc0 00466b64 ffffffc0
[  560.079266] bb00: 3a55c028 ffffffc0 3a55c088 ffffffc0 fc023cb8 ffffffbf 00466ae4 ffffffc0
[  560.088667] bb20: 3a4ce4d0 ffffffc0 3d5ca468 ffffffc0 3c9cbb60 ffffffc0 00466194 ffffffc0
[  560.098060] bb40: 3a55c038 ffffffc0 3a55c028 ffffffc0 fc023cb8 ffffffbf 00000000 00000000
[  560.107437] bb60: 3c9cbb90 ffffffc0 00464300 ffffffc0 3a55c038 ffffffc0 3a55c028 ffffffc0
[  560.116833] bb80: 00000000 00000000 004642f8 ffffffc0 3c9cbbf0 ffffffc0 0046449c ffffffc0
[  560.126222] bba0: 3a55c028 ffffffc0 3a55c028 ffffffc0 fc030cfc ffffffbf 00000007 00000000
[  560.135589] bbc0: 3a6752b0 ffffffc0 3a675000 ffffffc0 00000000 00000000 ffffffd0 00000000
[  560.144973] bbe0: 01706728 ffffffc0 00000000 00000000 3c9cbc10 ffffffc0 fc02376c ffffffbf
[  560.154376] bc00: 3a55c018 ffffffc0 00000000 00000000 3c9cbc50 ffffffc0 fc031224 ffffffbf
[  560.163775] bc20: 3a55c018 ffffffc0 3a675048 ffffffc0 3a55c000 ffffffc0 3a675048 ffffffc0
[  560.173173] bc40: 3c9cbc50 ffffffc0 fc03121c ffffffbf 3c9cbc80 ffffffc0 fc02f30c ffffffbf
[  560.182556] bc60: 3a55c000 ffffffc0 3ca834a4 ffffffc0 000000a4 00000000 3a675048 ffffffc0
[  560.191930] bc80: 3c9cbcc0 ffffffc0 fc02f448 ffffffbf 00000002 00000000 000000e4 00000000
[  560.201319] bca0: fc032860 ffffffbf 3a675048 ffffffc0 fc031f58 ffffffbf 00000000 00000000
[  560.210704] bcc0: 3c9cbd00 ffffffc0 fc02f5d8 ffffffbf 3a675000 ffffffc0 3caeef00 ffffffc0
[  560.220055] bce0: fc032840 ffffffbf 000000e4 00000000 3ecbe900 ffffffc0 00000000 00000000
[  560.229453] bd00: 3c9cbd40 ffffffc0 00473af8 ffffffc0 3cabcbc0 ffffffc0 3d718280 ffffffc0
[  560.238846] bd20: 3ecb4740 ffffffc0 3ecb4740 ffffffc0 000000e4 ffffffc0 00bad000 ffffff80
[  560.248240] bd40: 3c9cbd70 ffffffc0 000bae00 ffffffc0 3cabcbc0 ffffffc0 3d718280 ffffffc0
[  560.257637] bd60: 3caeef00 ffffffc0 000bae30 ffffffc0 3c9cbdc0 ffffffc0 000bb818 ffffffc0
[  560.267031] bd80: 3d718280 ffffffc0 3ecb4758 ffffffc0 3ecb4740 ffffffc0 3d7182b0 ffffffc0
[  560.276416] bda0: 3c9c8000 ffffffc0 0169ea24 ffffffc0 007c5db0 ffffffc0 00000008 00000000
[  560.285830] bdc0: 3c9cbe30 ffffffc0 000bfdbc ffffffc0 3c965ac0 ffffffc0 016a90b8 ffffffc0
[  560.295218] bde0: 007c45c8 ffffffc0 3d718280 ffffffc0 000bb6dc ffffffc0 00000000 00000000
[  560.304510] be00: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
[  560.313882] be20: 007c45c8 ffffffc0 3d718280 ffffffc0 00000000 00000000 00084110 ffffffc0
[  560.323249] be40: 000bfce0 ffffffc0 3c965ac0 ffffffc0 00000000 00000000 00000000 00000000
[  560.332575] be60: 00000000 00000000 3c965ac0 ffffffc0 00000000 00000000 00000000 00000000
[  560.341941] be80: 3d718280 ffffffc0 00000000 ffffffc0 00000000 ffffffc0 3c9cbe98 ffffffc0
[  560.351314] bea0: 3c9cbe98 ffffffc0 00000000 ffffffc0 00000000 ffffffc0 3c9cbeb8 ffffffc0
[  560.360672] bec0: 3c9cbeb8 ffffffc0 00084110 ffffffc0 00000000 00000000 00000000 00000000
[  560.369955] bee0: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
[  560.379256] bf00: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
[  560.388553] bf20: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
[  560.397848] bf40: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
[  560.407132] bf60: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
[  560.416426] bf80: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
[  560.425714] bfa0: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
[  560.435016] bfc0: 00000000 00000000 00000000 00000000 00000000 00000000 00000005 00000000
[  560.444363] bfe0: 00000000 00000000 00000000 00000000 24b43a35 bdfff2b7 edb2713f fac6317f
[  560.453090] Call trace:
[  560.456656] [<ffffffc0004038ec>] memset+0x1ac/0x200
[  560.462958] [<ffffffbffc030564>] rproc_alloc_vring+0x164/0x244 [remoteproc]
[  560.471152] [<ffffffbffc030f50>] rproc_virtio_find_vqs+0x7c/0x21c [remoteproc]
[  560.479590] [<ffffffbffc039e08>] rpmsg_probe+0xd4/0x380 [virtio_rpmsg_bus]
[  560.487625] [<ffffffbffc023458>] virtio_dev_probe+0xfc/0x1c4 [virtio]
[  560.495116] [<ffffffc000466bec>] really_probe+0x68/0x224
[  560.501349] [<ffffffc000466e10>] __device_attach+0x68/0x80
[  560.507775] [<ffffffc000465134>] bus_for_each_drv+0x50/0x94
[  560.514253] [<ffffffc000466b60>] device_attach+0x9c/0xc0
[  560.520458] [<ffffffc000466190>] bus_probe_device+0x8c/0xb4
[  560.527136] [<ffffffc0004642fc>] device_add+0x364/0x4e8
[  560.533395] [<ffffffc000464498>] device_register+0x18/0x28
[  560.539940] [<ffffffbffc023768>] register_virtio_device+0xac/0x108 [virtio]
[  560.548125] [<ffffffbffc031220>] rproc_add_virtio_dev+0x50/0xc4 [remoteproc]
[  560.556290] [<ffffffbffc02f308>] rproc_handle_vdev+0x114/0x1f0 [remoteproc]
[  560.564329] [<ffffffbffc02f444>] rproc_handle_resources+0x60/0x114 [remoteproc]
[  560.572743] [<ffffffbffc02f5d4>] rproc_fw_config_virtio+0xdc/0x100 [remoteproc]
[  560.581138] [<ffffffc000473af4>] request_firmware_work_func+0x30/0x58
[  560.588719] [<ffffffc0000badfc>] process_one_work+0x15c/0x3a8
[  560.595415] [<ffffffc0000bb814>] worker_thread+0x138/0x494
[  560.602015] [<ffffffc0000bfdb8>] kthread+0xd8/0xf0
[  560.607843] Code: 91010108 54ffff4a 8b040108 cb050042 (d50b7428)
[  560.618408] ---[ end trace 790c1963053c7aca ]---
[  560.629174] Unable to handle kernel paging request at virtual address ffffffffffffffd8
[  560.637999] pgd = ffffffc03c806000
[  560.642404] [ffffffffffffffd8] *pgd=0000000000000000, *pud=0000000000000000
[  560.650796] Internal error: Oops: 96000005 [#2] SMP
[  560.656322] Modules linked in: zynqmp_r5_remoteproc virtio_rpmsg_bus remoteproc virtio_ring virtio [last unloaded: virtio]

Signed-off-by: Jason Wu <j.wu at xilinx.com>
Signed-off-by: Michal Simek <michal.simek at xilinx.com>
---
 arch/arm64/lib/memset.S | 191 +++++++++++++++++++++++++++++++-----------------
 1 file changed, 124 insertions(+), 67 deletions(-)

diff --git a/arch/arm64/lib/memset.S b/arch/arm64/lib/memset.S
index 7c72dfd..d2c6c51 100644
--- a/arch/arm64/lib/memset.S
+++ b/arch/arm64/lib/memset.S
@@ -24,7 +24,8 @@
 
 #include <linux/linkage.h>
 #include <asm/assembler.h>
-#include <asm/cache.h>
+
+#define DONT_USE_DC 1
 
 /*
  * Fill in the buffer with character c (alignment handled by the hardware)
@@ -37,22 +38,32 @@
  *	x0 - buf
  */
 
-dstin		.req	x0
-val		.req	w1
-count		.req	x2
-tmp1		.req	x3
-tmp1w		.req	w3
-tmp2		.req	x4
-tmp2w		.req	w4
-zva_len_x	.req	x5
-zva_len		.req	w5
-zva_bits_x	.req	x6
-
-A_l		.req	x7
-A_lw		.req	w7
-dst		.req	x8
-tmp3w		.req	w9
-tmp3		.req	x9
+/* By default we assume that the DC instruction can be used to zero
+   data blocks more efficiently.  In some circumstances this might be
+   unsafe, for example in an asymmetric multiprocessor environment with
+   different DC clear lengths (neither the upper nor lower lengths are
+   safe to use).  The feature can be disabled by defining DONT_USE_DC.
+
+   If code may be run in a virtualized environment, then define
+   MAYBE_VIRT.  This will cause the code to cache the system register
+   values rather than re-reading them each call.  */
+
+#define dstin		x0
+#define val		w1
+#define count		x2
+#define tmp1		x3
+#define tmp1w		w3
+#define tmp2		x4
+#define tmp2w		w4
+#define zva_len_x	x5
+#define zva_len		w5
+#define zva_bits_x	x6
+
+#define A_l		x7
+#define A_lw		w7
+#define dst		x8
+#define tmp3w		w9
+#define tmp3		x9
 
 ENTRY(memset)
 	mov	dst, dstin	/* Preserve return value.  */
@@ -61,47 +72,60 @@ ENTRY(memset)
 	orr	A_lw, A_lw, A_lw, lsl #16
 	orr	A_l, A_l, A_l, lsl #32
 
-	cmp	count, #15
-	b.hi	.Lover16_proc
-	/*All store maybe are non-aligned..*/
-	tbz	count, #3, 1f
-	str	A_l, [dst], #8
-1:
-	tbz	count, #2, 2f
-	str	A_lw, [dst], #4
-2:
-	tbz	count, #1, 3f
-	strh	A_lw, [dst], #2
-3:
-	tbz	count, #0, 4f
-	strb	A_lw, [dst]
-4:
-	ret
-
-.Lover16_proc:
-	/*Whether  the start address is aligned with 16.*/
+	/*first align dst with 16...*/
 	neg	tmp2, dst
 	ands	tmp2, tmp2, #15
 	b.eq	.Laligned
-/*
-* The count is not less than 16, we can use stp to store the start 16 bytes,
-* then adjust the dst aligned with 16.This process will make the current
-* memory address at alignment boundary.
-*/
-	stp	A_l, A_l, [dst] /*non-aligned store..*/
-	/*make the dst aligned..*/
-	sub	count, count, tmp2
-	add	dst, dst, tmp2
+	/*find the Most Significant Bit which is set as 1.*/
+	clz	tmp1,  count /*0~64. 0 means all 1s'; 64 means all 0s' */
+	ands	tmp3, tmp1, #64/*ne (Z==0) means tmp1 is 64*/
+	/*tmp3 is not 64, set tmp3 as NOT tmp1, otherwise will set tmp3 as 64*/
+	csinv	tmp3, tmp1, tmp1, ne
+	b.ne	.Lexitfunc
+	ands	tmp1, tmp3, #63
+	/*tmp1 is ZERO, set tmp3 as 1. otherwise keep the tmp1*/
+	csinc	tmp3, tmp1, tmp1, ne
+
+	/*tmp3 = 0: tmp1 will be all 1s' ; tmp3 = 1: tmp1 will be set bit0 as ZERO.
+		tmp3 = 2: the lowest 2 bits are ZERO*/
+	mov	tmp1, #~0
+	lslv	tmp1, tmp1, tmp3
+	/*tmp3 will save the align offset s(1~7) depended on the count's MSB*/
+	bic	tmp3, tmp2, tmp1
+
+	/*from low bit to high bit of tmp3 ...*/
+	tbz	tmp3, #0, 1f
+	strb	A_lw, [dst], #1
+	subs	count, count, #1
+	b.eq	.Lexitfunc
+1:
+	tbz	tmp3, #1, 1f
+	strh	A_lw, [dst], #2
+	subs	count, count, #2
+	b.eq	.Lexitfunc
+1:
+	tbz	tmp3, #2, 1f
+	str	A_lw, [dst], #4
+	subs	count, count, #4
+	b.eq	.Lexitfunc
+1:
+	tbz	tmp3, #3, .Laligned
+	str	A_l,  [dst], #8
+	subs	count, count, #8
+	b.eq	.Lexitfunc
 
+/*Here, dst is aligned 16 now...*/
 .Laligned:
-	cbz	A_l, .Lzero_mem
+#ifndef DONT_USE_DC
+	cbz	A_l,  .Lzero_mem
+#endif
 
 .Ltail_maybe_long:
 	cmp	count, #64
 	b.ge	.Lnot_short
 .Ltail63:
 	ands	tmp1, count, #0x30
-	b.eq	3f
+	b.eq	.Ltail15tiny
 	cmp	tmp1w, #0x20
 	b.eq	1f
 	b.lt	2f
@@ -110,24 +134,30 @@ ENTRY(memset)
 	stp	A_l, A_l, [dst], #16
 2:
 	stp	A_l, A_l, [dst], #16
-/*
-* The last store length is less than 16,use stp to write last 16 bytes.
-* It will lead some bytes written twice and the access is non-aligned.
-*/
-3:
-	ands	count, count, #15
-	cbz	count, 4f
-	add	dst, dst, count
-	stp	A_l, A_l, [dst, #-16]	/* Repeat some/all of last store. */
-4:
+
+.Ltail15tiny:
+	/* Set up to 15 bytes.  Does not assume earlier memory
+	   being set.  */
+	tbz	count, #3, 1f
+	str	A_l, [dst], #8
+1:
+	tbz	count, #2, 1f
+	str	A_lw, [dst], #4
+1:
+	tbz	count, #1, 1f
+	strh	A_lw, [dst], #2
+1:
+	tbz	count, #0, 1f
+	strb	A_lw, [dst]
+1:
 	ret
 
 	/*
 	* Critical loop. Start at a new cache line boundary. Assuming
 	* 64 bytes per line, this ensures the entire loop is in one line.
 	*/
-	.p2align	L1_CACHE_SHIFT
-.Lnot_short:
+	.p2align	6
+.Lnot_short: /*count must be not less than 64*/
 	sub	dst, dst, #16/* Pre-bias.  */
 	sub	count, count, #64
 1:
@@ -143,6 +173,7 @@ ENTRY(memset)
 .Lexitfunc:
 	ret
 
+#ifndef DONT_USE_DC
 	/*
 	* For zeroing memory, check to see if we can use the ZVA feature to
 	* zero entire 'cache' lines.
@@ -156,14 +187,31 @@ ENTRY(memset)
 	*/
 	cmp	count, #128
 	b.lt	.Lnot_short /*count is at least  128 bytes*/
-
+#ifdef MAYBE_VIRT
+	/*For efficiency when virtualized, we cache the ZVA capability.  */
+	adrp	tmp2, .Lcache_clear
+	ldr	zva_len, [tmp2, #:lo12:.Lcache_clear]
+	tbnz	zva_len, #31, .Lnot_short
+	cbnz	zva_len, .Lzero_by_line
 	mrs	tmp1, dczid_el0
-	tbnz	tmp1, #4, .Lnot_short
+	tbz	tmp1, #4, 1f
+	/* ZVA not available.  Remember this for next time.  */
+	mov	zva_len, #~0
+	str	zva_len, [tmp2, #:lo12:.Lcache_clear]
+	b	.Lnot_short
+1:
 	mov	tmp3w, #4
 	and	zva_len, tmp1w, #15	/* Safety: other bits reserved.  */
 	lsl	zva_len, tmp3w, zva_len
-
-	ands	tmp3w, zva_len, #63
+	str	zva_len, [tmp2, #:lo12:.Lcache_clear]
+#else
+	mrs	tmp1, dczid_el0
+	tbnz	tmp1, #4, .Lnot_short
+	mov	tmp3w, #4
+	and	zva_len, tmp1w, #15	/* Safety: other bits reserved. */
+	lsl	zva_len, tmp3w, zva_len
+#endif
+	ands  tmp3w, zva_len, #63
 	/*
 	* ensure the zva_len is not less than 64.
 	* It is not meaningful to use ZVA if the block size is less than 64.
@@ -179,7 +227,7 @@ ENTRY(memset)
 	sub	zva_bits_x, zva_len_x, #1
 	neg	tmp2, dst
 	ands	tmp2, tmp2, zva_bits_x
-	b.eq	2f			/* Already aligned.  */
+	b.eq	1f			/* Already aligned.  */
 	/* Not aligned, check that there's enough to copy after alignment.*/
 	sub	tmp1, count, tmp2
 	/*
@@ -193,17 +241,17 @@ ENTRY(memset)
 	* to overrun by 64 bytes.
 	*/
 	mov	count, tmp1
-1:
+2:
 	stp	A_l, A_l, [dst]
 	stp	A_l, A_l, [dst, #16]
 	stp	A_l, A_l, [dst, #32]
 	subs	tmp2, tmp2, #64
 	stp	A_l, A_l, [dst, #48]
 	add	dst, dst, #64
-	b.ge	1b
+	b.ge	2b
 	/* We've overrun a bit, so adjust dst downwards.*/
 	add	dst, dst, tmp2
-2:
+1:
 	sub	count, count, zva_len_x
 3:
 	dc	zva, dst
@@ -211,6 +259,15 @@ ENTRY(memset)
 	subs	count, count, zva_len_x
 	b.ge	3b
 	ands	count, count, zva_bits_x
+	/*if zva_len_x is less than 16,
+		it probably make dst not to align with 16 again*/
 	b.ne	.Ltail_maybe_long
 	ret
+#ifdef MAYBE_VIRT
+	.bss
+	.p2align 2
+.Lcache_clear:
+	.space 4
+#endif
+#endif /* DONT_USE_DC */
 ENDPROC(memset)
-- 
2.1.1




More information about the linux-arm-kernel mailing list