From 3263f90ef72a2edab765f3d91b91fc69427307be Mon Sep 17 00:00:00 2001
From: Wilco Dijkstra <wdijkstr@arm.com>
Date: Mon, 13 Jul 2015 13:13:34 +0100
Subject: [PATCH] [AArch64] Optimized memset.

This is an optimized memset for AArch64.  Memset is split into 4 main
cases: small sets of up to 16 bytes, medium of 16..96 bytes which are
fully unrolled.  Large memsets of more than 96 bytes align the
destination and use an unrolled loop processing 64 bytes per
iteration.  Memsets of zero of more than 256 use the dc zva
instruction, and there are faster versions for the common ZVA sizes 64
or 128.  STP of Q registers is used to reduce codesize without loss of
performance.
---
 newlib/ChangeLog                     |   5 +
 newlib/libc/machine/aarch64/memset.S | 373 +++++++++++++--------------
 2 files changed, 188 insertions(+), 190 deletions(-)

diff --git a/newlib/ChangeLog b/newlib/ChangeLog
index e851d9f4c..64caa071c 100644
--- a/newlib/ChangeLog
+++ b/newlib/ChangeLog
@@ -1,3 +1,8 @@
+2015-07-13  Wilco Dijkstra  <wdijkstr@arm.com>
+
+	* libc/machine/aarch64/memset.S (memset):
+	Rewrite of optimized memset.
+
 2015-07-13  Wilco Dijkstra  <wdijkstr@arm.com>
 
 	* libc/machine/aarch64/memcpy.S (memcpy):
diff --git a/newlib/libc/machine/aarch64/memset.S b/newlib/libc/machine/aarch64/memset.S
index 20372df94..9b04e83d4 100644
--- a/newlib/libc/machine/aarch64/memset.S
+++ b/newlib/libc/machine/aarch64/memset.S
@@ -24,10 +24,37 @@
    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
 
+/*
+ * Copyright (c) 2015 ARM Ltd
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the company may not be used to endorse or promote
+ *    products derived from this software without specific prior written
+ *    permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
 /* Assumptions:
  *
- * ARMv8-a, AArch64
- * Unaligned accesses
+ * ARMv8-a, AArch64, unaligned accesses
  *
  */
 
@@ -35,32 +62,20 @@
 /* See memset-stub.c  */
 #else
 
-/* By default we assume that the DC instruction can be used to zero
-   data blocks more efficiently.  In some circumstances this might be
-   unsafe, for example in an asymmetric multiprocessor environment with
-   different DC clear lengths (neither the upper nor lower lengths are
-   safe to use).  The feature can be disabled by defining DONT_USE_DC.
-
-   If code may be run in a virtualized environment, then define
-   MAYBE_VIRT.  This will cause the code to cache the system register
-   values rather than re-reading them each call.  */
-
-#define dstin		x0
-#define val		w1
-#define count		x2
-#define tmp1		x3
-#define tmp1w		w3
-#define tmp2		x4
-#define tmp2w		w4
-#define zva_len_x	x5
-#define zva_len		w5
-#define zva_bits_x	x6
-
-#define A_l		x7
-#define A_lw		w7
-#define dst		x8
-#define tmp3w		w9
+#define dstin	x0
+#define val	x1
+#define valw	w1
+#define count	x2
+#define dst	x3
+#define dstend	x4
+#define tmp1	x5
+#define tmp1w	w5
+#define tmp2	x6
+#define tmp2w	w6
+#define zva_len x7
+#define zva_lenw w7
 
+#define L(l) .L ## l
 
 	.macro def_fn f p2align=0
 	.text
@@ -72,175 +87,153 @@
 
 def_fn memset p2align=6
 
-	mov	dst, dstin		/* Preserve return value.  */
-	ands	A_lw, val, #255
-#ifndef DONT_USE_DC
-	b.eq	.Lzero_mem
-#endif
-	orr	A_lw, A_lw, A_lw, lsl #8
-	orr	A_lw, A_lw, A_lw, lsl #16
-	orr	A_l, A_l, A_l, lsl #32
-.Ltail_maybe_long:
-	cmp	count, #64
-	b.ge	.Lnot_short
-.Ltail_maybe_tiny:
-	cmp	count, #15
-	b.le	.Ltail15tiny
-.Ltail63:
-	ands	tmp1, count, #0x30
-	b.eq	.Ltail15
-	add	dst, dst, tmp1
-	cmp	tmp1w, #0x20
-	b.eq	1f
-	b.lt	2f
-	stp	A_l, A_l, [dst, #-48]
-1:
-	stp	A_l, A_l, [dst, #-32]
-2:
-	stp	A_l, A_l, [dst, #-16]
+	dup	v0.16B, valw
+	add	dstend, dstin, count
 
-.Ltail15:
-	and	count, count, #15
-	add	dst, dst, count
-	stp	A_l, A_l, [dst, #-16]	/* Repeat some/all of last store. */
+	cmp	count, 96
+	b.hi	L(set_long)
+	cmp	count, 16
+	b.hs	L(set_medium)
+	mov	val, v0.D[0]
+
+	/* Set 0..15 bytes.  */
+	tbz	count, 3, 1f
+	str	val, [dstin]
+	str	val, [dstend, -8]
+	ret
+	nop
+1:	tbz	count, 2, 2f
+	str	valw, [dstin]
+	str	valw, [dstend, -4]
+	ret
+2:	cbz	count, 3f
+	strb	valw, [dstin]
+	tbz	count, 1, 3f
+	strh	valw, [dstend, -2]
+3:	ret
+
+	/* Set 17..96 bytes.  */
+L(set_medium):
+	str	q0, [dstin]
+	tbnz	count, 6, L(set96)
+	str	q0, [dstend, -16]
+	tbz	count, 5, 1f
+	str	q0, [dstin, 16]
+	str	q0, [dstend, -32]
+1:	ret
+
+	.p2align 4
+	/* Set 64..96 bytes.  Write 64 bytes from the start and
+	   32 bytes from the end.  */
+L(set96):
+	str	q0, [dstin, 16]
+	stp	q0, q0, [dstin, 32]
+	stp	q0, q0, [dstend, -32]
 	ret
 
-.Ltail15tiny:
-	/* Set up to 15 bytes.  Does not assume earlier memory
-	   being set.  */
-	tbz	count, #3, 1f
-	str	A_l, [dst], #8
-1:
-	tbz	count, #2, 1f
-	str	A_lw, [dst], #4
-1:
-	tbz	count, #1, 1f
-	strh	A_lw, [dst], #2
-1:
-	tbz	count, #0, 1f
-	strb	A_lw, [dst]
-1:
+	.p2align 3
+	nop
+L(set_long):
+	and	valw, valw, 255
+	bic	dst, dstin, 15
+	str	q0, [dstin]
+	cmp	count, 256
+	ccmp	valw, 0, 0, cs
+	b.eq	L(try_zva)
+L(no_zva):
+	sub	count, dstend, dst	/* Count is 16 too large.  */
+	add	dst, dst, 16
+	sub	count, count, 64 + 16	/* Adjust count and bias for loop.  */
+1:	stp	q0, q0, [dst], 64
+	stp	q0, q0, [dst, -32]
+L(tail64):
+	subs	count, count, 64
+	b.hi	1b
+2:	stp	q0, q0, [dstend, -64]
+	stp	q0, q0, [dstend, -32]
 	ret
 
-	/* Critical loop.  Start at a new cache line boundary.  Assuming
-	 * 64 bytes per line, this ensures the entire loop is in one line.  */
-	.p2align 6
-.Lnot_short:
-	neg	tmp2, dst
-	ands	tmp2, tmp2, #15
-	b.eq	2f
-	/* Bring DST to 128-bit (16-byte) alignment.  We know that there's
-	 * more than that to set, so we simply store 16 bytes and advance by
-	 * the amount required to reach alignment.  */
-	sub	count, count, tmp2
-	stp	A_l, A_l, [dst]
-	add	dst, dst, tmp2
-	/* There may be less than 63 bytes to go now.  */
-	cmp	count, #63
-	b.le	.Ltail63
-2:
-	sub	dst, dst, #16		/* Pre-bias.  */
-	sub	count, count, #64
-1:
-	stp	A_l, A_l, [dst, #16]
-	stp	A_l, A_l, [dst, #32]
-	stp	A_l, A_l, [dst, #48]
-	stp	A_l, A_l, [dst, #64]!
-	subs	count, count, #64
-	b.ge	1b
-	tst	count, #0x3f
-	add	dst, dst, #16
-	b.ne	.Ltail63
-	ret
-
-#ifndef DONT_USE_DC
-	/* For zeroing memory, check to see if we can use the ZVA feature to
-	 * zero entire 'cache' lines.  */
-.Lzero_mem:
-	mov	A_l, #0
-	cmp	count, #63
-	b.le	.Ltail_maybe_tiny
-	neg	tmp2, dst
-	ands	tmp2, tmp2, #15
-	b.eq	1f
-	sub	count, count, tmp2
-	stp	A_l, A_l, [dst]
-	add	dst, dst, tmp2
-	cmp	count, #63
-	b.le	.Ltail63
-1:
-	/* For zeroing small amounts of memory, it's not worth setting up
-	 * the line-clear code.  */
-	cmp	count, #128
-	b.lt	.Lnot_short
-#ifdef MAYBE_VIRT
-	/* For efficiency when virtualized, we cache the ZVA capability.  */
-	adrp	tmp2, .Lcache_clear
-	ldr	zva_len, [tmp2, #:lo12:.Lcache_clear]
-	tbnz	zva_len, #31, .Lnot_short
-	cbnz	zva_len, .Lzero_by_line
+	.p2align 3
+L(try_zva):
 	mrs	tmp1, dczid_el0
-	tbz	tmp1, #4, 1f
-	/* ZVA not available.  Remember this for next time.  */
-	mov	zva_len, #~0
-	str	zva_len, [tmp2, #:lo12:.Lcache_clear]
-	b	.Lnot_short
-1:
-	mov	tmp3w, #4
-	and	zva_len, tmp1w, #15	/* Safety: other bits reserved.  */
-	lsl	zva_len, tmp3w, zva_len
-	str	zva_len, [tmp2, #:lo12:.Lcache_clear]
-#else
-	mrs	tmp1, dczid_el0
-	tbnz	tmp1, #4, .Lnot_short
-	mov	tmp3w, #4
-	and	zva_len, tmp1w, #15	/* Safety: other bits reserved.  */
-	lsl	zva_len, tmp3w, zva_len
-#endif
+	tbnz	tmp1w, 4, L(no_zva)
+	and	tmp1w, tmp1w, 4
+	cmp	tmp1w, 4	/* ZVA size is 64 bytes.  */
+	b.ne	 L(zva_128)
 
-.Lzero_by_line:
-	/* Compute how far we need to go to become suitably aligned.  We're
-	 * already at quad-word alignment.  */
-	cmp	count, zva_len_x
-	b.lt	.Lnot_short		/* Not enough to reach alignment.  */
-	sub	zva_bits_x, zva_len_x, #1
-	neg	tmp2, dst
-	ands	tmp2, tmp2, zva_bits_x
-	b.eq	1f			/* Already aligned.  */
-	/* Not aligned, check that there's enough to copy after alignment.  */
-	sub	tmp1, count, tmp2
-	cmp	tmp1, #64
-	ccmp	tmp1, zva_len_x, #8, ge	/* NZCV=0b1000 */
-	b.lt	.Lnot_short
-	/* We know that there's at least 64 bytes to zero and that it's safe
-	 * to overrun by 64 bytes.  */
-	mov	count, tmp1
-2:
-	stp	A_l, A_l, [dst]
-	stp	A_l, A_l, [dst, #16]
-	stp	A_l, A_l, [dst, #32]
-	subs	tmp2, tmp2, #64
-	stp	A_l, A_l, [dst, #48]
-	add	dst, dst, #64
-	b.ge	2b
-	/* We've overrun a bit, so adjust dst downwards.  */
-	add	dst, dst, tmp2
-1:
-	sub	count, count, zva_len_x
-3:
-	dc	zva, dst
-	add	dst, dst, zva_len_x
-	subs	count, count, zva_len_x
-	b.ge	3b
-	ands	count, count, zva_bits_x
-	b.ne	.Ltail_maybe_long
+	/* Write the first and last 64 byte aligned block using stp rather
+	   than using DC ZVA.  This is faster on some cores.
+	 */
+L(zva_64):
+	str	q0, [dst, 16]
+	stp	q0, q0, [dst, 32]
+	bic	dst, dst, 63
+	stp	q0, q0, [dst, 64]
+	stp	q0, q0, [dst, 96]
+	sub	count, dstend, dst	/* Count is now 128 too large.	*/
+	sub	count, count, 128+64+64	/* Adjust count and bias for loop.  */
+	add	dst, dst, 128
+	nop
+1:	dc	zva, dst
+	add	dst, dst, 64
+	subs	count, count, 64
+	b.hi	1b
+	stp	q0, q0, [dst, 0]
+	stp	q0, q0, [dst, 32]
+	stp	q0, q0, [dstend, -64]
+	stp	q0, q0, [dstend, -32]
 	ret
-	.size	memset, .-memset
-#ifdef MAYBE_VIRT
-	.bss
-	.p2align 2
-.Lcache_clear:
-	.space 4
-#endif
-#endif /* DONT_USE_DC */
+
+	.p2align 3
+L(zva_128):
+	cmp	tmp1w, 5	/* ZVA size is 128 bytes.  */
+	b.ne	L(zva_other)
+
+	str	q0, [dst, 16]
+	stp	q0, q0, [dst, 32]
+	stp	q0, q0, [dst, 64]
+	stp	q0, q0, [dst, 96]
+	bic	dst, dst, 127
+	sub	count, dstend, dst	/* Count is now 128 too large.	*/
+	sub	count, count, 128+128	/* Adjust count and bias for loop.  */
+	add	dst, dst, 128
+1:	dc	zva, dst
+	add	dst, dst, 128
+	subs	count, count, 128
+	b.hi	1b
+	stp	q0, q0, [dstend, -128]
+	stp	q0, q0, [dstend, -96]
+	stp	q0, q0, [dstend, -64]
+	stp	q0, q0, [dstend, -32]
+	ret
+
+L(zva_other):
+	mov	tmp2w, 4
+	lsl	zva_lenw, tmp2w, tmp1w
+	add	tmp1, zva_len, 64	/* Max alignment bytes written.	 */
+	cmp	count, tmp1
+	blo	L(no_zva)
+
+	sub	tmp2, zva_len, 1
+	add	tmp1, dst, zva_len
+	add	dst, dst, 16
+	subs	count, tmp1, dst	/* Actual alignment bytes to write.  */
+	bic	tmp1, tmp1, tmp2	/* Aligned dc zva start address.  */
+	beq	2f
+1:	stp	q0, q0, [dst], 64
+	stp	q0, q0, [dst, -32]
+	subs	count, count, 64
+	b.hi	1b
+2:	mov	dst, tmp1
+	sub	count, dstend, tmp1	/* Remaining bytes to write.  */
+	subs	count, count, zva_len
+	b.lo	4f
+3:	dc	zva, dst
+	add	dst, dst, zva_len
+	subs	count, count, zva_len
+	b.hs	3b
+4:	add	count, count, zva_len
+	b	L(tail64)
+
+	.size	memset, . - memset
 #endif