Optimized memcmp
This is an optimized memcmp for AArch64. This is a complete rewrite using a different algorithm. The previous version split into cases where both inputs were aligned, the inputs were mutually aligned and unaligned using a byte loop. The new version combines all these cases, while small inputs of less than 8 bytes are handled separately. This allows the main code to be sped up using unaligned loads since there are now at least 8 bytes to be compared. After the first 8 bytes, align the first input. This ensures each iteration does at most one unaligned access and mutually aligned inputs behave as aligned. After the main loop, process the last 8 bytes using unaligned accesses. This improves performance of (mutually) aligned cases by 25% and unaligned by >500% (yes >6 times faster) on large inputs. ChangeLog: 2017-06-28 Wilco Dijkstra <wdijkstr@arm.com> * newlib/libc/machine/aarch64/memcmp.S (memcmp): Rewrite of optimized memcmp. GLIBC benchtests/bench-memcmp.c performance comparison for Cortex-A53: Length 1, alignment 1/ 1: 153% Length 1, alignment 1/ 1: 119% Length 1, alignment 1/ 1: 154% Length 2, alignment 2/ 2: 121% Length 2, alignment 2/ 2: 140% Length 2, alignment 2/ 2: 121% Length 3, alignment 3/ 3: 105% Length 3, alignment 3/ 3: 105% Length 3, alignment 3/ 3: 105% Length 4, alignment 4/ 4: 155% Length 4, alignment 4/ 4: 154% Length 4, alignment 4/ 4: 161% Length 5, alignment 5/ 5: 173% Length 5, alignment 5/ 5: 173% Length 5, alignment 5/ 5: 173% Length 6, alignment 6/ 6: 145% Length 6, alignment 6/ 6: 145% Length 6, alignment 6/ 6: 145% Length 7, alignment 7/ 7: 125% Length 7, alignment 7/ 7: 125% Length 7, alignment 7/ 7: 125% Length 8, alignment 8/ 8: 111% Length 8, alignment 8/ 8: 130% Length 8, alignment 8/ 8: 124% Length 9, alignment 9/ 9: 160% Length 9, alignment 9/ 9: 160% Length 9, alignment 9/ 9: 150% Length 10, alignment 10/10: 170% Length 10, alignment 10/10: 137% Length 10, alignment 10/10: 150% Length 11, alignment 11/11: 160% Length 11, alignment 11/11: 160% Length 11, alignment 11/11: 160% Length 12, alignment 12/12: 146% Length 12, alignment 12/12: 168% Length 12, alignment 12/12: 156% Length 13, alignment 13/13: 167% Length 13, alignment 13/13: 167% Length 13, alignment 13/13: 173% Length 14, alignment 14/14: 167% Length 14, alignment 14/14: 168% Length 14, alignment 14/14: 168% Length 15, alignment 15/15: 168% Length 15, alignment 15/15: 173% Length 15, alignment 15/15: 173% Length 1, alignment 0/ 0: 134% Length 1, alignment 0/ 0: 127% Length 1, alignment 0/ 0: 119% Length 2, alignment 0/ 0: 94% Length 2, alignment 0/ 0: 94% Length 2, alignment 0/ 0: 106% Length 3, alignment 0/ 0: 82% Length 3, alignment 0/ 0: 87% Length 3, alignment 0/ 0: 82% Length 4, alignment 0/ 0: 115% Length 4, alignment 0/ 0: 115% Length 4, alignment 0/ 0: 122% Length 5, alignment 0/ 0: 127% Length 5, alignment 0/ 0: 119% Length 5, alignment 0/ 0: 127% Length 6, alignment 0/ 0: 103% Length 6, alignment 0/ 0: 100% Length 6, alignment 0/ 0: 100% Length 7, alignment 0/ 0: 82% Length 7, alignment 0/ 0: 91% Length 7, alignment 0/ 0: 87% Length 8, alignment 0/ 0: 111% Length 8, alignment 0/ 0: 124% Length 8, alignment 0/ 0: 124% Length 9, alignment 0/ 0: 136% Length 9, alignment 0/ 0: 136% Length 9, alignment 0/ 0: 136% Length 10, alignment 0/ 0: 136% Length 10, alignment 0/ 0: 135% Length 10, alignment 0/ 0: 136% Length 11, alignment 0/ 0: 136% Length 11, alignment 0/ 0: 136% Length 11, alignment 0/ 0: 135% Length 12, alignment 0/ 0: 136% Length 12, alignment 0/ 0: 136% Length 12, alignment 0/ 0: 136% Length 13, alignment 0/ 0: 135% Length 13, alignment 0/ 0: 136% Length 13, alignment 0/ 0: 136% Length 14, alignment 0/ 0: 136% Length 14, alignment 0/ 0: 136% Length 14, alignment 0/ 0: 136% Length 15, alignment 0/ 0: 136% Length 15, alignment 0/ 0: 136% Length 15, alignment 0/ 0: 136% Length 4, alignment 0/ 0: 115% Length 4, alignment 0/ 0: 115% Length 4, alignment 0/ 0: 115% Length 32, alignment 0/ 0: 127% Length 32, alignment 7/ 2: 395% Length 32, alignment 0/ 0: 127% Length 32, alignment 0/ 0: 127% Length 8, alignment 0/ 0: 111% Length 8, alignment 0/ 0: 124% Length 8, alignment 0/ 0: 124% Length 64, alignment 0/ 0: 128% Length 64, alignment 6/ 4: 475% Length 64, alignment 0/ 0: 131% Length 64, alignment 0/ 0: 134% Length 16, alignment 0/ 0: 128% Length 16, alignment 0/ 0: 119% Length 16, alignment 0/ 0: 128% Length 128, alignment 0/ 0: 129% Length 128, alignment 5/ 6: 475% Length 128, alignment 0/ 0: 130% Length 128, alignment 0/ 0: 129% Length 32, alignment 0/ 0: 126% Length 32, alignment 0/ 0: 126% Length 32, alignment 0/ 0: 126% Length 256, alignment 0/ 0: 127% Length 256, alignment 4/ 8: 545% Length 256, alignment 0/ 0: 126% Length 256, alignment 0/ 0: 128% Length 64, alignment 0/ 0: 171% Length 64, alignment 0/ 0: 171% Length 64, alignment 0/ 0: 174% Length 512, alignment 0/ 0: 126% Length 512, alignment 3/10: 585% Length 512, alignment 0/ 0: 126% Length 512, alignment 0/ 0: 127% Length 128, alignment 0/ 0: 129% Length 128, alignment 0/ 0: 128% Length 128, alignment 0/ 0: 129% Length 1024, alignment 0/ 0: 125% Length 1024, alignment 2/12: 611% Length 1024, alignment 0/ 0: 126% Length 1024, alignment 0/ 0: 126% Length 256, alignment 0/ 0: 128% Length 256, alignment 0/ 0: 127% Length 256, alignment 0/ 0: 128% Length 2048, alignment 0/ 0: 125% Length 2048, alignment 1/14: 625% Length 2048, alignment 0/ 0: 125% Length 2048, alignment 0/ 0: 125% Length 512, alignment 0/ 0: 126% Length 512, alignment 0/ 0: 127% Length 512, alignment 0/ 0: 127% Length 4096, alignment 0/ 0: 125% Length 4096, alignment 0/16: 125% Length 4096, alignment 0/ 0: 125% Length 4096, alignment 0/ 0: 125% Length 1024, alignment 0/ 0: 126% Length 1024, alignment 0/ 0: 126% Length 1024, alignment 0/ 0: 126% Length 8192, alignment 0/ 0: 125% Length 8192, alignment 63/18: 636% Length 8192, alignment 0/ 0: 125% Length 8192, alignment 0/ 0: 125% Length 16, alignment 1/ 2: 317% Length 16, alignment 1/ 2: 317% Length 16, alignment 1/ 2: 317% Length 32, alignment 2/ 4: 395% Length 32, alignment 2/ 4: 395% Length 32, alignment 2/ 4: 398% Length 64, alignment 3/ 6: 475% Length 64, alignment 3/ 6: 475% Length 64, alignment 3/ 6: 477% Length 128, alignment 4/ 8: 479% Length 128, alignment 4/ 8: 479% Length 128, alignment 4/ 8: 479% Length 256, alignment 5/10: 543% Length 256, alignment 5/10: 539% Length 256, alignment 5/10: 543% Length 512, alignment 6/12: 585% Length 512, alignment 6/12: 585% Length 512, alignment 6/12: 585% Length 1024, alignment 7/14: 611% Length 1024, alignment 7/14: 611% Length 1024, alignment 7/14: 611%
This commit is contained in:
		
				
					committed by
					
						 Corinna Vinschen
						Corinna Vinschen
					
				
			
			
				
	
			
			
			
						parent
						
							181d8393ae
						
					
				
				
					commit
					c86063bdc0
				
			| @@ -1,220 +1,140 @@ | |||||||
| /* memcmp - compare memory | /* | ||||||
|  |  * Copyright (c) 2017 ARM Ltd | ||||||
|    Copyright (c) 2013, Linaro Limited |  * All rights reserved. | ||||||
|    Copyright (c) 2017, Samsung Austin R&D Center |  * | ||||||
|    All rights reserved. |  * Redistribution and use in source and binary forms, with or without | ||||||
|  |  * modification, are permitted provided that the following conditions | ||||||
|    Redistribution and use in source and binary forms, with or without |  * are met: | ||||||
|    modification, are permitted provided that the following conditions are met: |  * 1. Redistributions of source code must retain the above copyright | ||||||
|        * Redistributions of source code must retain the above copyright |  *    notice, this list of conditions and the following disclaimer. | ||||||
|          notice, this list of conditions and the following disclaimer. |  * 2. Redistributions in binary form must reproduce the above copyright | ||||||
|        * Redistributions in binary form must reproduce the above copyright |  *    notice, this list of conditions and the following disclaimer in the | ||||||
|          notice, this list of conditions and the following disclaimer in the |  *    documentation and/or other materials provided with the distribution. | ||||||
|          documentation and/or other materials provided with the distribution. |  * 3. The name of the company may not be used to endorse or promote | ||||||
|        * Neither the name of the Linaro nor the |  *    products derived from this software without specific prior written | ||||||
|          names of its contributors may be used to endorse or promote products |  *    permission. | ||||||
|          derived from this software without specific prior written permission. |  * | ||||||
|  |  * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED | ||||||
|    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF | ||||||
|    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. | ||||||
|    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |  * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | ||||||
|    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED | ||||||
|    HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |  * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | ||||||
|    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF | ||||||
|    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING | ||||||
|    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | ||||||
|    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||||||
|    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |  */ | ||||||
|    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ |  | ||||||
|  |  | ||||||
| #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) | #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) | ||||||
| /* See memcmp-stub.c  */ | /* See memcmp-stub.c  */ | ||||||
| #else | #else | ||||||
|  |  | ||||||
| /* Assumptions: | /* Assumptions: | ||||||
|  * |  * | ||||||
|  * ARMv8-a, AArch64 |  * ARMv8-a, AArch64, unaligned accesses. | ||||||
|  */ |  */ | ||||||
|  |  | ||||||
| 	.macro def_fn f p2align=0 |  | ||||||
| 	.text |  | ||||||
| 	.p2align \p2align |  | ||||||
| 	.global \f |  | ||||||
| 	.type \f, %function |  | ||||||
| \f: |  | ||||||
| 	.endm |  | ||||||
|  |  | ||||||
| /* Parameters and result.  */ | /* Parameters and result.  */ | ||||||
| #define src1		x0 | #define src1		x0 | ||||||
| #define src2		x1 | #define src2		x1 | ||||||
| #define limit		x2 | #define limit		x2 | ||||||
| #define result		x0 | #define result		w0 | ||||||
|  |  | ||||||
| /* Internal variables.  */ | /* Internal variables.  */ | ||||||
| #define data1		x3 | #define data1		x3 | ||||||
| #define data1w		w3 | #define data1w		w3 | ||||||
| #define data2		x4 | #define data2		x4 | ||||||
| #define data2w		w4 | #define data2w		w4 | ||||||
| #define has_nul		x5 | #define tmp1		x5 | ||||||
| #define diff		x6 |  | ||||||
| #define endloop		x7 |         .macro def_fn f p2align=0 | ||||||
| #define tmp1		x8 |         .text | ||||||
| #define tmp2		x9 |         .p2align \p2align | ||||||
| #define tmp3		x10 |         .global \f | ||||||
| #define pos		x11 |         .type \f, %function | ||||||
| #define limit_wd	x12 | \f: | ||||||
| #define mask		x13 |         .endm | ||||||
|  |  | ||||||
|  | /* Small inputs of less than 8 bytes are handled separately.  This allows the | ||||||
|  |    main code to be sped up using unaligned loads since there are now at least | ||||||
|  |    8 bytes to be compared.  If the first 8 bytes are equal, align src1. | ||||||
|  |    This ensures each iteration does at most one unaligned access even if both | ||||||
|  |    src1 and src2 are unaligned, and mutually aligned inputs behave as if | ||||||
|  |    aligned.  After the main loop, process the last 8 bytes using unaligned | ||||||
|  |    accesses.  */ | ||||||
|  |  | ||||||
| def_fn memcmp p2align=6 | def_fn memcmp p2align=6 | ||||||
| 	cbz	limit, .Lret0 | 	subs	limit, limit, 8 | ||||||
| 	eor	tmp1, src1, src2 | 	b.lo	.Lless8 | ||||||
| 	tst	tmp1, #7 |  | ||||||
| 	b.ne	.Lmisaligned8 |  | ||||||
| 	ands	tmp1, src1, #7 |  | ||||||
| 	b.ne	.Lmutual_align |  | ||||||
| 	add	limit_wd, limit, #7 |  | ||||||
| 	lsr	limit_wd, limit_wd, #3 |  | ||||||
| 	/* Start of performance-critical section  -- one 64B cache line.  */ |  | ||||||
| .Lloop_aligned: |  | ||||||
| 	ldr	data1, [src1], #8 |  | ||||||
| 	ldr	data2, [src2], #8 |  | ||||||
| .Lstart_realigned: |  | ||||||
| 	subs	limit_wd, limit_wd, #1 |  | ||||||
| 	eor	diff, data1, data2	/* Non-zero if differences found.  */ |  | ||||||
| 	csinv	endloop, diff, xzr, ne	/* Last Dword or differences.  */ |  | ||||||
| 	cbz	endloop, .Lloop_aligned |  | ||||||
| 	/* End of performance-critical section  -- one 64B cache line.  */ |  | ||||||
|  |  | ||||||
| 	/* Not reached the limit, must have found a diff.  */ | 	/* Limit >= 8, so check first 8 bytes using unaligned loads.  */ | ||||||
| 	cbnz	limit_wd, .Lnot_limit | 	ldr	data1, [src1], 8 | ||||||
|  | 	ldr	data2, [src2], 8 | ||||||
|  | 	and	tmp1, src1, 7 | ||||||
|  | 	add	limit, limit, tmp1 | ||||||
|  | 	cmp	data1, data2 | ||||||
|  | 	bne	.Lreturn | ||||||
|  |  | ||||||
| 	/* Limit % 8 == 0 => all bytes significant.  */ | 	/* Align src1 and adjust src2 with bytes not yet done.  */ | ||||||
| 	ands	limit, limit, #7 | 	sub	src1, src1, tmp1 | ||||||
| 	b.eq	.Lnot_limit | 	sub	src2, src2, tmp1 | ||||||
|  |  | ||||||
| 	lsl	limit, limit, #3	/* Bits -> bytes.  */ | 	subs	limit, limit, 8 | ||||||
| 	mov	mask, #~0 | 	b.ls	.Llast_bytes | ||||||
| #ifdef __AARCH64EB__ |  | ||||||
| 	lsr	mask, mask, limit |  | ||||||
| #else |  | ||||||
| 	lsl	mask, mask, limit |  | ||||||
| #endif |  | ||||||
| 	bic	data1, data1, mask |  | ||||||
| 	bic	data2, data2, mask |  | ||||||
|  |  | ||||||
| 	orr	diff, diff, mask | 	/* Loop performing 8 bytes per iteration using aligned src1. | ||||||
| .Lnot_limit: | 	   Limit is pre-decremented by 8 and must be larger than zero. | ||||||
|  | 	   Exit if <= 8 bytes left to do or if the data is not equal.  */ | ||||||
|  | 	.p2align 4 | ||||||
|  | .Lloop8: | ||||||
|  | 	ldr	data1, [src1], 8 | ||||||
|  | 	ldr	data2, [src2], 8 | ||||||
|  | 	subs	limit, limit, 8 | ||||||
|  | 	ccmp	data1, data2, 0, hi  /* NZCV = 0b0000.  */ | ||||||
|  | 	b.eq	.Lloop8 | ||||||
|  |  | ||||||
| #ifndef	__AARCH64EB__ | 	cmp	data1, data2 | ||||||
| 	rev	diff, diff | 	bne	.Lreturn | ||||||
|  |  | ||||||
|  | 	/* Compare last 1-8 bytes using unaligned access.  */ | ||||||
|  | .Llast_bytes: | ||||||
|  | 	ldr	data1, [src1, limit] | ||||||
|  | 	ldr	data2, [src2, limit] | ||||||
|  |  | ||||||
|  | 	/* Compare data bytes and set return value to 0, -1 or 1.  */ | ||||||
|  | .Lreturn: | ||||||
|  | #ifndef __AARCH64EB__ | ||||||
| 	rev	data1, data1 | 	rev	data1, data1 | ||||||
| 	rev	data2, data2 | 	rev	data2, data2 | ||||||
| #endif | #endif | ||||||
| 	/* The MS-non-zero bit of DIFF marks either the first bit | 	cmp     data1, data2 | ||||||
| 	   that is different, or the end of the significant data. | .Lret_eq: | ||||||
| 	   Shifting left now will bring the critical information into the | 	cset	result, ne | ||||||
| 	   top bits.  */ | 	cneg	result, result, lo | ||||||
| 	clz	pos, diff |         ret | ||||||
| 	lsl	data1, data1, pos |  | ||||||
| 	lsl	data2, data2, pos | 	.p2align 4 | ||||||
| 	/* But we need to zero-extend (char is unsigned) the value and then | 	/* Compare up to 8 bytes.  Limit is [-8..-1].  */ | ||||||
| 	   perform a signed 32-bit subtraction.  */ | .Lless8: | ||||||
| 	lsr	data1, data1, #56 | 	adds	limit, limit, 4 | ||||||
| 	sub	result, data1, data2, lsr #56 | 	b.lo	.Lless4 | ||||||
|  | 	ldr	data1w, [src1], 4 | ||||||
|  | 	ldr	data2w, [src2], 4 | ||||||
|  | 	cmp	data1w, data2w | ||||||
|  | 	b.ne	.Lreturn | ||||||
|  | 	sub	limit, limit, 4 | ||||||
|  | .Lless4: | ||||||
|  | 	adds	limit, limit, 4 | ||||||
|  | 	beq	.Lret_eq | ||||||
|  | .Lbyte_loop: | ||||||
|  | 	ldrb	data1w, [src1], 1 | ||||||
|  | 	ldrb	data2w, [src2], 1 | ||||||
|  | 	subs	limit, limit, 1 | ||||||
|  | 	ccmp	data1w, data2w, 0, ne	/* NZCV = 0b0000.  */ | ||||||
|  | 	b.eq	.Lbyte_loop | ||||||
|  | 	sub	result, data1w, data2w | ||||||
| 	ret | 	ret | ||||||
|  |  | ||||||
| .Lmutual_align: | 	.size	memcmp, . - memcmp | ||||||
| 	/* Sources are mutually aligned, but are not currently at an |  | ||||||
| 	   alignment boundary.  Round down the addresses and then mask off |  | ||||||
| 	   the bytes that precede the start point.  */ |  | ||||||
| 	bic	src1, src1, #7 |  | ||||||
| 	bic	src2, src2, #7 |  | ||||||
| 	add	limit, limit, tmp1	/* Adjust the limit for the extra.  */ |  | ||||||
| 	lsl	tmp1, tmp1, #3		/* Bytes beyond alignment -> bits.  */ |  | ||||||
| 	ldr	data1, [src1], #8 |  | ||||||
| 	neg	tmp1, tmp1		/* Bits to alignment -64.  */ |  | ||||||
| 	ldr	data2, [src2], #8 |  | ||||||
| 	mov	tmp2, #~0 |  | ||||||
| #ifdef __AARCH64EB__ |  | ||||||
| 	/* Big-endian.  Early bytes are at MSB.  */ |  | ||||||
| 	lsl	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */ |  | ||||||
| #else |  | ||||||
| 	/* Little-endian.  Early bytes are at LSB.  */ |  | ||||||
| 	lsr	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */ |  | ||||||
| #endif |  | ||||||
| 	add	limit_wd, limit, #7 |  | ||||||
| 	orr	data1, data1, tmp2 |  | ||||||
| 	orr	data2, data2, tmp2 |  | ||||||
| 	lsr	limit_wd, limit_wd, #3 |  | ||||||
| 	b	.Lstart_realigned |  | ||||||
|  |  | ||||||
| .Lret0: |  | ||||||
| 	mov	result, #0 |  | ||||||
| 	ret |  | ||||||
|  |  | ||||||
| 	.p2align 6 |  | ||||||
| .Lmisaligned8: |  | ||||||
|  |  | ||||||
| 	cmp	limit, #8 |  | ||||||
| 	b.lo	.LmisalignedLt8 |  | ||||||
|  |  | ||||||
| .LunalignedGe8 : |  | ||||||
|  |  | ||||||
| 	/* Load the first dword with both src potentially unaligned. */ |  | ||||||
| 	ldr	data1, [src1] |  | ||||||
| 	ldr	data2, [src2] |  | ||||||
|  |  | ||||||
| 	eor	diff, data1, data2	/* Non-zero if differences found. */ |  | ||||||
| 	cbnz	diff, .Lnot_limit |  | ||||||
|  |  | ||||||
| 	/* Sources are not aligned: align one of the sources. */ |  | ||||||
|  |  | ||||||
| 	and	tmp1, src1, #0x7 |  | ||||||
| 	orr	tmp3, xzr, #0x8 |  | ||||||
| 	sub	pos, tmp3, tmp1 |  | ||||||
|  |  | ||||||
| 	/* Increment SRC pointers by POS so SRC1 is word-aligned. */ |  | ||||||
| 	add	src1, src1, pos |  | ||||||
| 	add	src2, src2, pos |  | ||||||
|  |  | ||||||
| 	sub	limit, limit, pos |  | ||||||
| 	lsr	limit_wd, limit, #3 |  | ||||||
|  |  | ||||||
| 	cmp limit_wd, #0 |  | ||||||
|  |  | ||||||
| 	/* save #bytes to go back to be able to read 8byte at end |  | ||||||
| 	   pos=negative offset position to read 8 bytes when len%8 != 0 */ |  | ||||||
| 	and	limit, limit, #7 |  | ||||||
| 	sub	pos, limit, #8 |  | ||||||
|  |  | ||||||
| 	b	.Lstart_part_realigned |  | ||||||
|  |  | ||||||
| 	.p2align 5 |  | ||||||
| .Lloop_part_aligned: |  | ||||||
| 	ldr	data1, [src1], #8 |  | ||||||
| 	ldr	data2, [src2], #8 |  | ||||||
| 	subs	limit_wd, limit_wd, #1 |  | ||||||
| .Lstart_part_realigned: |  | ||||||
| 	eor	diff, data1, data2	/* Non-zero if differences found. */ |  | ||||||
| 	cbnz	diff, .Lnot_limit |  | ||||||
| 	b.ne	.Lloop_part_aligned |  | ||||||
|  |  | ||||||
| 	/* process leftover bytes: read the leftover bytes, starting with |  | ||||||
| 	   negative offset - so we can load 8 bytes. */ |  | ||||||
| 	ldr	data1, [src1, pos] |  | ||||||
| 	ldr	data2, [src2, pos] |  | ||||||
| 	eor	diff, data1, data2	/* Non-zero if differences found.  */ |  | ||||||
| 	b .Lnot_limit |  | ||||||
|  |  | ||||||
| .LmisalignedLt8: |  | ||||||
| 	sub	limit, limit, #1 |  | ||||||
| 1: |  | ||||||
| 	ldrb	data1w, [src1], #1 |  | ||||||
| 	ldrb	data2w, [src2], #1 |  | ||||||
| 	subs	limit, limit, #1 |  | ||||||
| 	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */ |  | ||||||
| 	b.eq	1b |  | ||||||
| 	sub	result, data1, data2 |  | ||||||
| 	ret |  | ||||||
| 	.size memcmp, . - memcmp |  | ||||||
|  |  | ||||||
| #endif | #endif | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user