strlen-armv7.S: Import latest strlen cortex-strings code.

Import the latest version of strlen from the Linaro cortex-strings
package. This version is faster across a variety of block sizes and
alignments on ARMv7.
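
The speedup comes from examining the string a word at a time instead of
a byte at a time. Below is a rough C model of that idea only, not the
imported code: the real routine uses the ARM uadd8/sel instructions to
flag zero byte lanes in a single instruction pair and unrolls the scan
across 32-byte blocks with prefetching. The name strlen_model is ours.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Toy model of a word-at-a-time strlen: detect a NUL anywhere in a
   32-bit word with bit tricks, then pin down its exact position. */
size_t strlen_model(const char *s)
{
    const char *p = s;

    /* Walk to a 4-byte boundary byte by byte. (The assembly instead
       rounds the pointer down and masks the leading bytes.) */
    while ((uintptr_t)p % 4 != 0) {
        if (*p == '\0')
            return (size_t)(p - s);
        p++;
    }

    for (;;) {
        uint32_t w;
        memcpy(&w, p, sizeof w);  /* aligned 4-byte load */
        /* Nonzero iff some byte of w is zero: a lane only borrows
           through its 0x80 bit when it started as 0x00. */
        if ((w - 0x01010101u) & ~w & 0x80808080u)
            break;
        p += 4;
    }
    while (*p != '\0')
        p++;
    return (size_t)(p - s);
}

The ~w term suppresses false positives from byte values of 0x80 and
above, so the test fires only on genuine zero bytes.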

newlib/ChangeLog:

2013-06-21  Will Newton  <will.newton@linaro.org>

	* libc/machine/arm/strlen-armv7.S: Import latest strlen
	code from Linaro cortex-strings.
commit c8af057907 (parent a1a7a74e6b)
Will Newton <will.newton@linaro.org>, 2013-06-21 09:10:37 +00:00
2 changed files with 112 additions and 77 deletions

newlib/ChangeLog

@@ -1,3 +1,8 @@
+2013-06-21  Will Newton  <will.newton@linaro.org>
+
+	* libc/machine/arm/strlen-armv7.S: Import latest strlen
+	code from Linaro cortex-strings.
+
 2013-06-21  Will Newton  <will.newton@linaro.org>
 
 	* MAINTAINERS: Add Will Newton to Write After Approval.

newlib/libc/machine/arm/strlen-armv7.S

@@ -1,4 +1,4 @@
-/* Copyright (c) 2010-2011, Linaro Limited
+/* Copyright (c) 2010-2011,2013 Linaro Limited
    All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
@@ -28,100 +28,130 @@
    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-   Written by Dave Gilbert <david.gilbert@linaro.org>
-
-   This strlen routine is optimised on a Cortex-A9 and should work on
-   all ARMv7 processors.  This routine is reasonably fast for short
-   strings, but is probably slower than a simple implementation if all
-   your strings are very short */
-
-@ 2011-02-08 david.gilbert@linaro.org
-@    Extracted from local git 6848613a
-
-@ 2011-10-13 david.gilbert@linaro.org
-@    Extracted from cortex-strings bzr rev 63
-@    Integrate to newlib, flip to ldrd
-@    Pull in Endian macro from my memchr
+   Assumes:
+   ARMv6T2, AArch32
+ */
 
 #include "arm_asm.h"
 
-@ NOTE: This ifdef MUST match the ones in arm/strlen.c
-@ We fallback to the one in arm/strlen.c for size optimised or
-@ for older arch's
+/* NOTE: This ifdef MUST match the ones in arm/strlen.c
+   We fallback to the one in arm/strlen.c for size optimised or
+   for older architectures.  */
 #if defined(_ISA_ARM_7) || defined(__ARM_ARCH_6T2__) && \
 	!(defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED) || \
 	  (defined (__thumb__) && !defined (__thumb2__)))
 
-@ this lets us check a flag in a 00/ff byte easily in either endianness
+	.macro def_fn f p2align=0
+	.text
+	.p2align \p2align
+	.global \f
+	.type \f, %function
+\f:
+	.endm
+
 #ifdef __ARMEB__
-#define CHARTSTMASK(c) 1<<(31-(c*8))
+#define S2LO	lsl
+#define S2HI	lsr
 #else
-#define CHARTSTMASK(c) 1<<(c*8)
+#define S2LO	lsr
+#define S2HI	lsl
 #endif
 
-@------------------------------------------------------------------------------
+	/* This code requires Thumb.  */
+	.thumb
 	.syntax unified
+	.arch	armv7-a
 
-	.thumb_func
-	.align 2
-	.p2align 4,,15
-	.global strlen
-	.type strlen,%function
-strlen:
-	@ r0 = string
-	@ returns count of bytes in string not including terminator
-	mov	r1, r0
-	push	{ r4,r6 }
-	mvns	r6, #0		@ all F
-	movs	r4, #0
-	tst	r0, #7
-	beq	2f
-
-1:
-	ldrb	r2, [r1], #1
-	tst	r1, #7		@ Hit alignment yet?
-	cbz	r2, 10f		@ Exit if we found the 0
-	bne	1b
-
-	@ So we're now aligned
-2:
-	ldrd	r2,r3,[r1],#8
-	uadd8	r2, r2, r6	@ Par add 0xff - sets the GE bits for bytes!=0
-	sel	r2, r4, r6	@ bytes are 00 for none-00 bytes,
-				@ or ff for 00 bytes - NOTE INVERSION
-	uadd8	r3, r3, r6	@ Par add 0xff - sets the GE bits for bytes!=0
-	sel	r3, r2, r6	@ chained...bytes are 00 for none-00 bytes,
-				@ or ff for 00 bytes - NOTE INVERSION
-	cmp	r3, #0
-	beq	2b
-
-strlenendtmp:
-	@ One (or more) of the bytes we loaded was 0 - but which one?
-	@ r2 has the mask corresponding to the first loaded word
-	@ r3 has a combined mask of the two words - but if r2 was all-non 0
-	@ then it's just the 2nd words
-	cmp	r2, #0
-	itte	eq
-	moveq	r2, r3		@ the end is in the 2nd word
-	subeq	r1,r1,#3
-	subne	r1,r1,#7
-
-	@ r1 currently points to the 2nd byte of the word containing the 0
-	tst	r2, # CHARTSTMASK(0)	@ 1st character
-	bne	10f
-	adds	r1,r1,#1
-	tst	r2, # CHARTSTMASK(1)	@ 2nd character
-	ittt	eq
-	addeq	r1,r1,#1
-	tsteq	r2, # (3<<15)		@ 2nd & 3rd character
-	@ If not the 3rd must be the last one
-	addeq	r1,r1,#1
-
-10:
-	@ r0 is still at the beginning, r1 is pointing 1 byte after the nul
-	sub	r0, r1, r0
-	subs	r0, r0, #1
-	pop	{ r4, r6 }
+/* Parameters and result.  */
+#define srcin	r0
+#define result	r0
+
+/* Internal variables.  */
+#define src	r1
+#define data1a	r2
+#define data1b	r3
+#define const_m1 r12
+#define const_0	r4
+#define tmp1	r4	/* Overlaps const_0  */
+#define tmp2	r5
+
+def_fn	strlen p2align=6
+	pld	[srcin, #0]
+	strd	r4, r5, [sp, #-8]!
+	bic	src, srcin, #7
+	mvn	const_m1, #0
+	ands	tmp1, srcin, #7		/* (8 - bytes) to alignment.  */
+	pld	[src, #32]
+	bne.w	.Lmisaligned8
+	mov	const_0, #0
+	mov	result, #-8
+.Lloop_aligned:
+	/* Bytes 0-7.  */
+	ldrd	data1a, data1b, [src]
+	pld	[src, #64]
+	add	result, result, #8
+.Lstart_realigned:
+	uadd8	data1a, data1a, const_m1	/* Saturating GE<0:3> set.  */
+	sel	data1a, const_0, const_m1	/* Select based on GE<0:3>.  */
+	uadd8	data1b, data1b, const_m1
+	sel	data1b, data1a, const_m1	/* Only used if d1a == 0.  */
+	cbnz	data1b, .Lnull_found
+
+	/* Bytes 8-15.  */
+	ldrd	data1a, data1b, [src, #8]
+	uadd8	data1a, data1a, const_m1	/* Saturating GE<0:3> set.  */
+	add	result, result, #8
+	sel	data1a, const_0, const_m1	/* Select based on GE<0:3>.  */
+	uadd8	data1b, data1b, const_m1
+	sel	data1b, data1a, const_m1	/* Only used if d1a == 0.  */
+	cbnz	data1b, .Lnull_found
+
+	/* Bytes 16-23.  */
+	ldrd	data1a, data1b, [src, #16]
+	uadd8	data1a, data1a, const_m1	/* Saturating GE<0:3> set.  */
+	add	result, result, #8
+	sel	data1a, const_0, const_m1	/* Select based on GE<0:3>.  */
+	uadd8	data1b, data1b, const_m1
+	sel	data1b, data1a, const_m1	/* Only used if d1a == 0.  */
+	cbnz	data1b, .Lnull_found
+
+	/* Bytes 24-31.  */
+	ldrd	data1a, data1b, [src, #24]
+	add	src, src, #32
+	uadd8	data1a, data1a, const_m1	/* Saturating GE<0:3> set.  */
+	add	result, result, #8
+	sel	data1a, const_0, const_m1	/* Select based on GE<0:3>.  */
+	uadd8	data1b, data1b, const_m1
+	sel	data1b, data1a, const_m1	/* Only used if d1a == 0.  */
+	cmp	data1b, #0
+	beq	.Lloop_aligned
+
+.Lnull_found:
+	cmp	data1a, #0
+	itt	eq
+	addeq	result, result, #4
+	moveq	data1a, data1b
+#ifndef __ARMEB__
+	rev	data1a, data1a
+#endif
+	clz	data1a, data1a
+	ldrd	r4, r5, [sp], #8
+	add	result, result, data1a, lsr #3	/* Bits -> Bytes.  */
 	bx	lr
+
+.Lmisaligned8:
+	ldrd	data1a, data1b, [src]
+	and	tmp2, tmp1, #3
+	rsb	result, tmp1, #0
+	lsl	tmp2, tmp2, #3	/* Bytes -> bits.  */
+	tst	tmp1, #4
+	pld	[src, #64]
+	S2HI	tmp2, const_m1, tmp2
+	orn	data1a, data1a, tmp2
+	itt	ne
+	ornne	data1b, data1b, tmp2
+	movne	data1a, const_m1
+	mov	const_0, #0
+	b	.Lstart_realigned
+	.size	strlen, . - strlen
 #endif
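
A note on the misaligned entry path: .Lmisaligned8 rounds the pointer
down to an aligned boundary, loads the full doubleword, and ORs
all-ones into the byte lanes that precede the true start of the
string, so the zero-byte test cannot fire on them; result starts
negative to cancel the step back. Below is a minimal little-endian C
sketch of just the masking step, reduced to a single 32-bit word (the
assembly applies it across the data1a/data1b pair; the helper name is
ours, not part of the imported code).

#include <stdint.h>

/* Force the `misalign` bytes before the true string start to 0xff so
   they can never look like a NUL terminator. */
uint32_t mask_leading_bytes(uint32_t word, unsigned misalign /* 0..3 */)
{
    if (misalign == 0)
        return word;
    /* Set the low `misalign` byte lanes, mirroring the effect of
       "S2HI tmp2, const_m1, #8*misalign; orn data1a, data1a, tmp2"
       on a little-endian core (where S2HI is lsl). */
    uint32_t mask = (1u << (8 * misalign)) - 1u;
    return word | mask;
}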