From c86063bdc0f83b4f333d6f4968818c6244d77983 Mon Sep 17 00:00:00 2001 From: Wilco Dijkstra Date: Thu, 29 Jun 2017 14:32:09 +0000 Subject: [PATCH] Optimized memcmp This is an optimized memcmp for AArch64. This is a complete rewrite using a different algorithm. The previous version split into cases where both inputs were aligned, the inputs were mutually aligned and unaligned using a byte loop. The new version combines all these cases, while small inputs of less than 8 bytes are handled separately. This allows the main code to be sped up using unaligned loads since there are now at least 8 bytes to be compared. After the first 8 bytes, align the first input. This ensures each iteration does at most one unaligned access and mutually aligned inputs behave as aligned. After the main loop, process the last 8 bytes using unaligned accesses. This improves performance of (mutually) aligned cases by 25% and unaligned by >500% (yes >6 times faster) on large inputs. ChangeLog: 2017-06-28 Wilco Dijkstra * newlib/libc/machine/aarch64/memcmp.S (memcmp): Rewrite of optimized memcmp. GLIBC benchtests/bench-memcmp.c performance comparison for Cortex-A53: Length 1, alignment 1/ 1: 153% Length 1, alignment 1/ 1: 119% Length 1, alignment 1/ 1: 154% Length 2, alignment 2/ 2: 121% Length 2, alignment 2/ 2: 140% Length 2, alignment 2/ 2: 121% Length 3, alignment 3/ 3: 105% Length 3, alignment 3/ 3: 105% Length 3, alignment 3/ 3: 105% Length 4, alignment 4/ 4: 155% Length 4, alignment 4/ 4: 154% Length 4, alignment 4/ 4: 161% Length 5, alignment 5/ 5: 173% Length 5, alignment 5/ 5: 173% Length 5, alignment 5/ 5: 173% Length 6, alignment 6/ 6: 145% Length 6, alignment 6/ 6: 145% Length 6, alignment 6/ 6: 145% Length 7, alignment 7/ 7: 125% Length 7, alignment 7/ 7: 125% Length 7, alignment 7/ 7: 125% Length 8, alignment 8/ 8: 111% Length 8, alignment 8/ 8: 130% Length 8, alignment 8/ 8: 124% Length 9, alignment 9/ 9: 160% Length 9, alignment 9/ 9: 160% Length 9, alignment 9/ 9: 150% Length 10, alignment 10/10: 170% Length 10, alignment 10/10: 137% Length 10, alignment 10/10: 150% Length 11, alignment 11/11: 160% Length 11, alignment 11/11: 160% Length 11, alignment 11/11: 160% Length 12, alignment 12/12: 146% Length 12, alignment 12/12: 168% Length 12, alignment 12/12: 156% Length 13, alignment 13/13: 167% Length 13, alignment 13/13: 167% Length 13, alignment 13/13: 173% Length 14, alignment 14/14: 167% Length 14, alignment 14/14: 168% Length 14, alignment 14/14: 168% Length 15, alignment 15/15: 168% Length 15, alignment 15/15: 173% Length 15, alignment 15/15: 173% Length 1, alignment 0/ 0: 134% Length 1, alignment 0/ 0: 127% Length 1, alignment 0/ 0: 119% Length 2, alignment 0/ 0: 94% Length 2, alignment 0/ 0: 94% Length 2, alignment 0/ 0: 106% Length 3, alignment 0/ 0: 82% Length 3, alignment 0/ 0: 87% Length 3, alignment 0/ 0: 82% Length 4, alignment 0/ 0: 115% Length 4, alignment 0/ 0: 115% Length 4, alignment 0/ 0: 122% Length 5, alignment 0/ 0: 127% Length 5, alignment 0/ 0: 119% Length 5, alignment 0/ 0: 127% Length 6, alignment 0/ 0: 103% Length 6, alignment 0/ 0: 100% Length 6, alignment 0/ 0: 100% Length 7, alignment 0/ 0: 82% Length 7, alignment 0/ 0: 91% Length 7, alignment 0/ 0: 87% Length 8, alignment 0/ 0: 111% Length 8, alignment 0/ 0: 124% Length 8, alignment 0/ 0: 124% Length 9, alignment 0/ 0: 136% Length 9, alignment 0/ 0: 136% Length 9, alignment 0/ 0: 136% Length 10, alignment 0/ 0: 136% Length 10, alignment 0/ 0: 135% Length 10, alignment 0/ 0: 136% Length 11, alignment 0/ 0: 136% Length 11, alignment 0/ 0: 136% Length 11, alignment 0/ 0: 135% Length 12, alignment 0/ 0: 136% Length 12, alignment 0/ 0: 136% Length 12, alignment 0/ 0: 136% Length 13, alignment 0/ 0: 135% Length 13, alignment 0/ 0: 136% Length 13, alignment 0/ 0: 136% Length 14, alignment 0/ 0: 136% Length 14, alignment 0/ 0: 136% Length 14, alignment 0/ 0: 136% Length 15, alignment 0/ 0: 136% Length 15, alignment 0/ 0: 136% Length 15, alignment 0/ 0: 136% Length 4, alignment 0/ 0: 115% Length 4, alignment 0/ 0: 115% Length 4, alignment 0/ 0: 115% Length 32, alignment 0/ 0: 127% Length 32, alignment 7/ 2: 395% Length 32, alignment 0/ 0: 127% Length 32, alignment 0/ 0: 127% Length 8, alignment 0/ 0: 111% Length 8, alignment 0/ 0: 124% Length 8, alignment 0/ 0: 124% Length 64, alignment 0/ 0: 128% Length 64, alignment 6/ 4: 475% Length 64, alignment 0/ 0: 131% Length 64, alignment 0/ 0: 134% Length 16, alignment 0/ 0: 128% Length 16, alignment 0/ 0: 119% Length 16, alignment 0/ 0: 128% Length 128, alignment 0/ 0: 129% Length 128, alignment 5/ 6: 475% Length 128, alignment 0/ 0: 130% Length 128, alignment 0/ 0: 129% Length 32, alignment 0/ 0: 126% Length 32, alignment 0/ 0: 126% Length 32, alignment 0/ 0: 126% Length 256, alignment 0/ 0: 127% Length 256, alignment 4/ 8: 545% Length 256, alignment 0/ 0: 126% Length 256, alignment 0/ 0: 128% Length 64, alignment 0/ 0: 171% Length 64, alignment 0/ 0: 171% Length 64, alignment 0/ 0: 174% Length 512, alignment 0/ 0: 126% Length 512, alignment 3/10: 585% Length 512, alignment 0/ 0: 126% Length 512, alignment 0/ 0: 127% Length 128, alignment 0/ 0: 129% Length 128, alignment 0/ 0: 128% Length 128, alignment 0/ 0: 129% Length 1024, alignment 0/ 0: 125% Length 1024, alignment 2/12: 611% Length 1024, alignment 0/ 0: 126% Length 1024, alignment 0/ 0: 126% Length 256, alignment 0/ 0: 128% Length 256, alignment 0/ 0: 127% Length 256, alignment 0/ 0: 128% Length 2048, alignment 0/ 0: 125% Length 2048, alignment 1/14: 625% Length 2048, alignment 0/ 0: 125% Length 2048, alignment 0/ 0: 125% Length 512, alignment 0/ 0: 126% Length 512, alignment 0/ 0: 127% Length 512, alignment 0/ 0: 127% Length 4096, alignment 0/ 0: 125% Length 4096, alignment 0/16: 125% Length 4096, alignment 0/ 0: 125% Length 4096, alignment 0/ 0: 125% Length 1024, alignment 0/ 0: 126% Length 1024, alignment 0/ 0: 126% Length 1024, alignment 0/ 0: 126% Length 8192, alignment 0/ 0: 125% Length 8192, alignment 63/18: 636% Length 8192, alignment 0/ 0: 125% Length 8192, alignment 0/ 0: 125% Length 16, alignment 1/ 2: 317% Length 16, alignment 1/ 2: 317% Length 16, alignment 1/ 2: 317% Length 32, alignment 2/ 4: 395% Length 32, alignment 2/ 4: 395% Length 32, alignment 2/ 4: 398% Length 64, alignment 3/ 6: 475% Length 64, alignment 3/ 6: 475% Length 64, alignment 3/ 6: 477% Length 128, alignment 4/ 8: 479% Length 128, alignment 4/ 8: 479% Length 128, alignment 4/ 8: 479% Length 256, alignment 5/10: 543% Length 256, alignment 5/10: 539% Length 256, alignment 5/10: 543% Length 512, alignment 6/12: 585% Length 512, alignment 6/12: 585% Length 512, alignment 6/12: 585% Length 1024, alignment 7/14: 611% Length 1024, alignment 7/14: 611% Length 1024, alignment 7/14: 611% --- newlib/libc/machine/aarch64/memcmp.S | 298 ++++++++++----------------- 1 file changed, 109 insertions(+), 189 deletions(-) diff --git a/newlib/libc/machine/aarch64/memcmp.S b/newlib/libc/machine/aarch64/memcmp.S index 09be4c344..1ffb79eb3 100644 --- a/newlib/libc/machine/aarch64/memcmp.S +++ b/newlib/libc/machine/aarch64/memcmp.S @@ -1,220 +1,140 @@ -/* memcmp - compare memory - - Copyright (c) 2013, Linaro Limited - Copyright (c) 2017, Samsung Austin R&D Center - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of the Linaro nor the - names of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +/* + * Copyright (c) 2017 ARM Ltd + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the company may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED + * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) /* See memcmp-stub.c */ #else + /* Assumptions: * - * ARMv8-a, AArch64 + * ARMv8-a, AArch64, unaligned accesses. */ - .macro def_fn f p2align=0 - .text - .p2align \p2align - .global \f - .type \f, %function -\f: - .endm - /* Parameters and result. */ #define src1 x0 #define src2 x1 #define limit x2 -#define result x0 +#define result w0 /* Internal variables. */ #define data1 x3 #define data1w w3 #define data2 x4 #define data2w w4 -#define has_nul x5 -#define diff x6 -#define endloop x7 -#define tmp1 x8 -#define tmp2 x9 -#define tmp3 x10 -#define pos x11 -#define limit_wd x12 -#define mask x13 +#define tmp1 x5 + + .macro def_fn f p2align=0 + .text + .p2align \p2align + .global \f + .type \f, %function +\f: + .endm + +/* Small inputs of less than 8 bytes are handled separately. This allows the + main code to be sped up using unaligned loads since there are now at least + 8 bytes to be compared. If the first 8 bytes are equal, align src1. + This ensures each iteration does at most one unaligned access even if both + src1 and src2 are unaligned, and mutually aligned inputs behave as if + aligned. After the main loop, process the last 8 bytes using unaligned + accesses. */ def_fn memcmp p2align=6 - cbz limit, .Lret0 - eor tmp1, src1, src2 - tst tmp1, #7 - b.ne .Lmisaligned8 - ands tmp1, src1, #7 - b.ne .Lmutual_align - add limit_wd, limit, #7 - lsr limit_wd, limit_wd, #3 - /* Start of performance-critical section -- one 64B cache line. */ -.Lloop_aligned: - ldr data1, [src1], #8 - ldr data2, [src2], #8 -.Lstart_realigned: - subs limit_wd, limit_wd, #1 - eor diff, data1, data2 /* Non-zero if differences found. */ - csinv endloop, diff, xzr, ne /* Last Dword or differences. */ - cbz endloop, .Lloop_aligned - /* End of performance-critical section -- one 64B cache line. */ + subs limit, limit, 8 + b.lo .Lless8 - /* Not reached the limit, must have found a diff. */ - cbnz limit_wd, .Lnot_limit + /* Limit >= 8, so check first 8 bytes using unaligned loads. */ + ldr data1, [src1], 8 + ldr data2, [src2], 8 + and tmp1, src1, 7 + add limit, limit, tmp1 + cmp data1, data2 + bne .Lreturn - /* Limit % 8 == 0 => all bytes significant. */ - ands limit, limit, #7 - b.eq .Lnot_limit + /* Align src1 and adjust src2 with bytes not yet done. */ + sub src1, src1, tmp1 + sub src2, src2, tmp1 - lsl limit, limit, #3 /* Bits -> bytes. */ - mov mask, #~0 -#ifdef __AARCH64EB__ - lsr mask, mask, limit -#else - lsl mask, mask, limit -#endif - bic data1, data1, mask - bic data2, data2, mask + subs limit, limit, 8 + b.ls .Llast_bytes - orr diff, diff, mask -.Lnot_limit: + /* Loop performing 8 bytes per iteration using aligned src1. + Limit is pre-decremented by 8 and must be larger than zero. + Exit if <= 8 bytes left to do or if the data is not equal. */ + .p2align 4 +.Lloop8: + ldr data1, [src1], 8 + ldr data2, [src2], 8 + subs limit, limit, 8 + ccmp data1, data2, 0, hi /* NZCV = 0b0000. */ + b.eq .Lloop8 -#ifndef __AARCH64EB__ - rev diff, diff + cmp data1, data2 + bne .Lreturn + + /* Compare last 1-8 bytes using unaligned access. */ +.Llast_bytes: + ldr data1, [src1, limit] + ldr data2, [src2, limit] + + /* Compare data bytes and set return value to 0, -1 or 1. */ +.Lreturn: +#ifndef __AARCH64EB__ rev data1, data1 rev data2, data2 #endif - /* The MS-non-zero bit of DIFF marks either the first bit - that is different, or the end of the significant data. - Shifting left now will bring the critical information into the - top bits. */ - clz pos, diff - lsl data1, data1, pos - lsl data2, data2, pos - /* But we need to zero-extend (char is unsigned) the value and then - perform a signed 32-bit subtraction. */ - lsr data1, data1, #56 - sub result, data1, data2, lsr #56 + cmp data1, data2 +.Lret_eq: + cset result, ne + cneg result, result, lo + ret + + .p2align 4 + /* Compare up to 8 bytes. Limit is [-8..-1]. */ +.Lless8: + adds limit, limit, 4 + b.lo .Lless4 + ldr data1w, [src1], 4 + ldr data2w, [src2], 4 + cmp data1w, data2w + b.ne .Lreturn + sub limit, limit, 4 +.Lless4: + adds limit, limit, 4 + beq .Lret_eq +.Lbyte_loop: + ldrb data1w, [src1], 1 + ldrb data2w, [src2], 1 + subs limit, limit, 1 + ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */ + b.eq .Lbyte_loop + sub result, data1w, data2w ret -.Lmutual_align: - /* Sources are mutually aligned, but are not currently at an - alignment boundary. Round down the addresses and then mask off - the bytes that precede the start point. */ - bic src1, src1, #7 - bic src2, src2, #7 - add limit, limit, tmp1 /* Adjust the limit for the extra. */ - lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */ - ldr data1, [src1], #8 - neg tmp1, tmp1 /* Bits to alignment -64. */ - ldr data2, [src2], #8 - mov tmp2, #~0 -#ifdef __AARCH64EB__ - /* Big-endian. Early bytes are at MSB. */ - lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ -#else - /* Little-endian. Early bytes are at LSB. */ - lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ -#endif - add limit_wd, limit, #7 - orr data1, data1, tmp2 - orr data2, data2, tmp2 - lsr limit_wd, limit_wd, #3 - b .Lstart_realigned - -.Lret0: - mov result, #0 - ret - - .p2align 6 -.Lmisaligned8: - - cmp limit, #8 - b.lo .LmisalignedLt8 - -.LunalignedGe8 : - - /* Load the first dword with both src potentially unaligned. */ - ldr data1, [src1] - ldr data2, [src2] - - eor diff, data1, data2 /* Non-zero if differences found. */ - cbnz diff, .Lnot_limit - - /* Sources are not aligned: align one of the sources. */ - - and tmp1, src1, #0x7 - orr tmp3, xzr, #0x8 - sub pos, tmp3, tmp1 - - /* Increment SRC pointers by POS so SRC1 is word-aligned. */ - add src1, src1, pos - add src2, src2, pos - - sub limit, limit, pos - lsr limit_wd, limit, #3 - - cmp limit_wd, #0 - - /* save #bytes to go back to be able to read 8byte at end - pos=negative offset position to read 8 bytes when len%8 != 0 */ - and limit, limit, #7 - sub pos, limit, #8 - - b .Lstart_part_realigned - - .p2align 5 -.Lloop_part_aligned: - ldr data1, [src1], #8 - ldr data2, [src2], #8 - subs limit_wd, limit_wd, #1 -.Lstart_part_realigned: - eor diff, data1, data2 /* Non-zero if differences found. */ - cbnz diff, .Lnot_limit - b.ne .Lloop_part_aligned - - /* process leftover bytes: read the leftover bytes, starting with - negative offset - so we can load 8 bytes. */ - ldr data1, [src1, pos] - ldr data2, [src2, pos] - eor diff, data1, data2 /* Non-zero if differences found. */ - b .Lnot_limit - -.LmisalignedLt8: - sub limit, limit, #1 -1: - ldrb data1w, [src1], #1 - ldrb data2w, [src2], #1 - subs limit, limit, #1 - ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ - b.eq 1b - sub result, data1, data2 - ret - .size memcmp, . - memcmp - + .size memcmp, . - memcmp #endif