strlen-armv7.S: Import latest strlen cortex-strings code.

Import the latest version of strlen from the Linaro cortex-strings
package. This version is faster across a variety of block sizes and
alignments on ARMv7.
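
The speedup comes from examining the string a word at a time instead of
a byte at a time. Below is a rough C model of that idea only, not the
imported code: the real routine uses the ARM uadd8/sel instructions to
flag zero byte lanes in a single instruction pair and unrolls the scan
across 32-byte blocks with prefetching. The name strlen_model is ours.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Toy model of a word-at-a-time strlen: detect a NUL anywhere in a
   32-bit word with bit tricks, then pin down its exact position. */
size_t strlen_model(const char *s)
{
    const char *p = s;

    /* Walk to a 4-byte boundary byte by byte. (The assembly instead
       rounds the pointer down and masks the leading bytes.) */
    while ((uintptr_t)p % 4 != 0) {
        if (*p == '\0')
            return (size_t)(p - s);
        p++;
    }

    for (;;) {
        uint32_t w;
        memcpy(&w, p, sizeof w);  /* aligned 4-byte load */
        /* Nonzero iff some byte of w is zero: a lane only borrows
           through its 0x80 bit when it started as 0x00. */
        if ((w - 0x01010101u) & ~w & 0x80808080u)
            break;
        p += 4;
    }
    while (*p != '\0')
        p++;
    return (size_t)(p - s);
}

The ~w term suppresses false positives from byte values of 0x80 and
above, so the test fires only on genuine zero bytes.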

newlib/ChangeLog:

2013-06-21  Will Newton  <will.newton@linaro.org>

	* libc/machine/arm/strlen-armv7.S: Import latest strlen
	code from Linaro cortex-strings.
commit c8af057907 (parent a1a7a74e6b)
Will Newton <will.newton@linaro.org>, 2013-06-21 09:10:37 +00:00
2 changed files with 112 additions and 77 deletions

newlib/ChangeLog

@@ -1,3 +1,8 @@
+2013-06-21  Will Newton  <will.newton@linaro.org>
+
+	* libc/machine/arm/strlen-armv7.S: Import latest strlen
+	code from Linaro cortex-strings.
+
 2013-06-21  Will Newton  <will.newton@linaro.org>
 
 	* MAINTAINERS: Add Will Newton to Write After Approval.

newlib/libc/machine/arm/strlen-armv7.S

@@ -1,4 +1,4 @@
-/* Copyright (c) 2010-2011, Linaro Limited
+/* Copyright (c) 2010-2011,2013 Linaro Limited
    All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
@@ -28,100 +28,130 @@
    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-   Written by Dave Gilbert <david.gilbert@linaro.org>
-
-   This strlen routine is optimised on a Cortex-A9 and should work on
-   all ARMv7 processors.  This routine is reasonably fast for short
-   strings, but is probably slower than a simple implementation if all
-   your strings are very short */
-
-@ 2011-02-08 david.gilbert@linaro.org
-@    Extracted from local git 6848613a
-
-@ 2011-10-13 david.gilbert@linaro.org
-@    Extracted from cortex-strings bzr rev 63
-@    Integrate to newlib, flip to ldrd
-@    Pull in Endian macro from my memchr
+   Assumes:
+   ARMv6T2, AArch32
+ */
 
 #include "arm_asm.h"
 
-@ NOTE: This ifdef MUST match the ones in arm/strlen.c
-@ We fallback to the one in arm/strlen.c for size optimised or
-@ for older arch's
+/* NOTE: This ifdef MUST match the ones in arm/strlen.c
+   We fallback to the one in arm/strlen.c for size optimised or
+   for older architectures.  */
 #if defined(_ISA_ARM_7) || defined(__ARM_ARCH_6T2__) && \
 	!(defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED) || \
 	  (defined (__thumb__) && !defined (__thumb2__)))
 
-@ this lets us check a flag in a 00/ff byte easily in either endianness
+	.macro def_fn f p2align=0
+	.text
+	.p2align \p2align
+	.global \f
+	.type \f, %function
+\f:
+	.endm
+
 #ifdef __ARMEB__
-#define CHARTSTMASK(c) 1<<(31-(c*8))
+#define S2LO	lsl
+#define S2HI	lsr
 #else
-#define CHARTSTMASK(c) 1<<(c*8)
+#define S2LO	lsr
+#define S2HI	lsl
 #endif
 
-@------------------------------------------------------------------------------
+	/* This code requires Thumb.  */
+	.thumb
 	.syntax unified
+	.arch	armv7-a
 
-	.thumb_func
-	.align 2
-	.p2align 4,,15
-	.global strlen
-	.type strlen,%function
-strlen:
-	@ r0 = string
-	@ returns count of bytes in string not including terminator
-	mov	r1, r0
-	push	{ r4,r6 }
-	mvns	r6, #0		@ all F
-	movs	r4, #0
-	tst	r0, #7
-	beq	2f
-
-1:
-	ldrb	r2, [r1], #1
-	tst	r1, #7		@ Hit alignment yet?
-	cbz	r2, 10f		@ Exit if we found the 0
-	bne	1b
-
-	@ So we're now aligned
-2:
-	ldrd	r2,r3,[r1],#8
-	uadd8	r2, r2, r6	@ Par add 0xff - sets the GE bits for bytes!=0
-	sel	r2, r4, r6	@ bytes are 00 for none-00 bytes,
-				@ or ff for 00 bytes - NOTE INVERSION
-	uadd8	r3, r3, r6	@ Par add 0xff - sets the GE bits for bytes!=0
-	sel	r3, r2, r6	@ chained...bytes are 00 for none-00 bytes,
-				@ or ff for 00 bytes - NOTE INVERSION
-	cmp	r3, #0
-	beq	2b
-
-strlenendtmp:
-	@ One (or more) of the bytes we loaded was 0 - but which one?
-	@ r2 has the mask corresponding to the first loaded word
-	@ r3 has a combined mask of the two words - but if r2 was all-non 0
-	@ then it's just the 2nd words
-	cmp	r2, #0
-	itte	eq
-	moveq	r2, r3		@ the end is in the 2nd word
-	subeq	r1,r1,#3
-	subne	r1,r1,#7
-
-	@ r1 currently points to the 2nd byte of the word containing the 0
-	tst	r2, # CHARTSTMASK(0)	@ 1st character
-	bne	10f
-	adds	r1,r1,#1
-	tst	r2, # CHARTSTMASK(1)	@ 2nd character
-	ittt	eq
-	addeq	r1,r1,#1
-	tsteq	r2, # (3<<15)		@ 2nd & 3rd character
-	@ If not the 3rd must be the last one
-	addeq	r1,r1,#1
-
-10:
-	@ r0 is still at the beginning, r1 is pointing 1 byte after the nul
-	sub	r0, r1, r0
-	subs	r0, r0, #1
-	pop	{ r4, r6 }
+/* Parameters and result.  */
+#define srcin	r0
+#define result	r0
+
+/* Internal variables.  */
+#define src	r1
+#define data1a	r2
+#define data1b	r3
+#define const_m1 r12
+#define const_0	r4
+#define tmp1	r4	/* Overlaps const_0  */
+#define tmp2	r5
+
+def_fn	strlen p2align=6
+	pld	[srcin, #0]
+	strd	r4, r5, [sp, #-8]!
+	bic	src, srcin, #7
+	mvn	const_m1, #0
+	ands	tmp1, srcin, #7		/* (8 - bytes) to alignment.  */
+	pld	[src, #32]
+	bne.w	.Lmisaligned8
+	mov	const_0, #0
+	mov	result, #-8
+.Lloop_aligned:
+	/* Bytes 0-7.  */
+	ldrd	data1a, data1b, [src]
+	pld	[src, #64]
+	add	result, result, #8
+.Lstart_realigned:
+	uadd8	data1a, data1a, const_m1	/* Saturating GE<0:3> set.  */
+	sel	data1a, const_0, const_m1	/* Select based on GE<0:3>.  */
+	uadd8	data1b, data1b, const_m1
+	sel	data1b, data1a, const_m1	/* Only used if d1a == 0.  */
+	cbnz	data1b, .Lnull_found
+
+	/* Bytes 8-15.  */
+	ldrd	data1a, data1b, [src, #8]
+	uadd8	data1a, data1a, const_m1	/* Saturating GE<0:3> set.  */
+	add	result, result, #8
+	sel	data1a, const_0, const_m1	/* Select based on GE<0:3>.  */
+	uadd8	data1b, data1b, const_m1
+	sel	data1b, data1a, const_m1	/* Only used if d1a == 0.  */
+	cbnz	data1b, .Lnull_found
+
+	/* Bytes 16-23.  */
+	ldrd	data1a, data1b, [src, #16]
+	uadd8	data1a, data1a, const_m1	/* Saturating GE<0:3> set.  */
+	add	result, result, #8
+	sel	data1a, const_0, const_m1	/* Select based on GE<0:3>.  */
+	uadd8	data1b, data1b, const_m1
+	sel	data1b, data1a, const_m1	/* Only used if d1a == 0.  */
+	cbnz	data1b, .Lnull_found
+
+	/* Bytes 24-31.  */
+	ldrd	data1a, data1b, [src, #24]
+	add	src, src, #32
+	uadd8	data1a, data1a, const_m1	/* Saturating GE<0:3> set.  */
+	add	result, result, #8
+	sel	data1a, const_0, const_m1	/* Select based on GE<0:3>.  */
+	uadd8	data1b, data1b, const_m1
+	sel	data1b, data1a, const_m1	/* Only used if d1a == 0.  */
+	cmp	data1b, #0
+	beq	.Lloop_aligned
+
+.Lnull_found:
+	cmp	data1a, #0
+	itt	eq
+	addeq	result, result, #4
+	moveq	data1a, data1b
+#ifndef __ARMEB__
+	rev	data1a, data1a
+#endif
+	clz	data1a, data1a
+	ldrd	r4, r5, [sp], #8
+	add	result, result, data1a, lsr #3	/* Bits -> Bytes.  */
 	bx	lr
+
+.Lmisaligned8:
+	ldrd	data1a, data1b, [src]
+	and	tmp2, tmp1, #3
+	rsb	result, tmp1, #0
+	lsl	tmp2, tmp2, #3	/* Bytes -> bits.  */
+	tst	tmp1, #4
+	pld	[src, #64]
+	S2HI	tmp2, const_m1, tmp2
+	orn	data1a, data1a, tmp2
+	itt	ne
+	ornne	data1b, data1b, tmp2
+	movne	data1a, const_m1
+	mov	const_0, #0
+	b	.Lstart_realigned
+	.size	strlen, . - strlen
 #endif
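
A note on the misaligned entry path: .Lmisaligned8 rounds the pointer
down to an aligned boundary, loads the full doubleword, and ORs
all-ones into the byte lanes that precede the true start of the
string, so the zero-byte test cannot fire on them; result starts
negative to cancel the step back. Below is a minimal little-endian C
sketch of just the masking step, reduced to a single 32-bit word (the
assembly applies it across the data1a/data1b pair; the helper name is
ours, not part of the imported code).

#include <stdint.h>

/* Force the `misalign` bytes before the true string start to 0xff so
   they can never look like a NUL terminator. */
uint32_t mask_leading_bytes(uint32_t word, unsigned misalign /* 0..3 */)
{
    if (misalign == 0)
        return word;
    /* Set the low `misalign` byte lanes, mirroring the effect of
       "S2HI tmp2, const_m1, #8*misalign; orn data1a, data1a, tmp2"
       on a little-endian core (where S2HI is lsl). */
    uint32_t mask = (1u << (8 * misalign)) - 1u;
    return word | mask;
}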