[AArch64] Reverting recent optimized memset().

Marcus Shawcroft 2015-07-15 13:34:58 +01:00
parent c028685518
commit c7806ef76a
2 changed files with 203 additions and 191 deletions

ChangeLog

@@ -1,3 +1,8 @@
+2015-07-15  Wilco Dijkstra  <wdijkstr@arm.com>
+
+	* libc/machine/aarch64/memset.S (memset):
+	Revert: Rewrite of optimized memset.
+
 2015-07-13  Wilco Dijkstra  <wdijkstr@arm.com>
 
 	* libc/machine/aarch64/memset.S (memset):

libc/machine/aarch64/memset.S

@@ -24,37 +24,10 @@
    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  */
 
-/*
- * Copyright (c) 2015 ARM Ltd
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- * 3. The name of the company may not be used to endorse or promote
- *    products derived from this software without specific prior written
- *    permission.
- *
- * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
- * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
- * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
 /* Assumptions:
  *
- * ARMv8-a, AArch64, unaligned accesses
+ * ARMv8-a, AArch64
+ * Unaligned accesses
  *
  */
@@ -62,20 +35,32 @@
 /* See memset-stub.c  */
 #else
 
-#define dstin	x0
-#define val	x1
-#define valw	w1
-#define count	x2
-#define dst	x3
-#define dstend	x4
-#define tmp1	x5
-#define tmp1w	w5
-#define tmp2	x6
-#define tmp2w	w6
-#define zva_len x7
-#define zva_lenw w7
-
-#define L(l) .L ## l
+/* By default we assume that the DC instruction can be used to zero
+   data blocks more efficiently.  In some circumstances this might be
+   unsafe, for example in an asymmetric multiprocessor environment with
+   different DC clear lengths (neither the upper nor lower lengths are
+   safe to use).  The feature can be disabled by defining DONT_USE_DC.
+
+   If code may be run in a virtualized environment, then define
+   MAYBE_VIRT.  This will cause the code to cache the system register
+   values rather than re-reading them each call.  */
+
+#define dstin		x0
+#define val		w1
+#define count		x2
+#define tmp1		x3
+#define tmp1w		w3
+#define tmp2		x4
+#define tmp2w		w4
+#define zva_len_x	x5
+#define zva_len		w5
+#define zva_bits_x	x6
+
+#define A_l		x7
+#define A_lw		w7
+#define dst		x8
+#define tmp3w		w9
 
 .macro def_fn f p2align=0
 .text
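
The restored code gates its use of DC ZVA on DCZID_EL0: bit 4 (DZP) set means the
instruction is prohibited, and bits [3:0] give log2 of the block size in 4-byte
words, which is what the mov tmp3w, #4 / and / lsl sequence in the code below
turns into a byte count. As a rough C sketch of that probe (illustrative only,
not part of the patch; the helper names are invented here):

    #include <stdint.h>

    static inline uint64_t read_dczid_el0 (void)
    {
      uint64_t v;
      __asm__ ("mrs %0, dczid_el0" : "=r" (v));
      return v;
    }

    /* Returns the DC ZVA block size in bytes, or -1 if DC ZVA is
       prohibited (DCZID_EL0.DZP, bit 4, is set).  */
    static inline long dc_zva_block_size (void)
    {
      uint64_t dczid = read_dczid_el0 ();
      if (dczid & (1u << 4))
        return -1;
      return 4L << (dczid & 0xf);   /* e.g. BS = 4 -> 64-byte blocks.  */
    }

With MAYBE_VIRT defined, the assembly caches this result in .Lcache_clear so the
system register is read at most once per process rather than on every call.
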
@@ -87,153 +72,175 @@
 def_fn memset p2align=6
-        dup     v0.16B, valw
-        add     dstend, dstin, count
-
-        cmp     count, 96
-        b.hi    L(set_long)
-        cmp     count, 16
-        b.hs    L(set_medium)
-        mov     val, v0.D[0]
-
-        /* Set 0..15 bytes.  */
-        tbz     count, 3, 1f
-        str     val, [dstin]
-        str     val, [dstend, -8]
-        ret
-        nop
-1:      tbz     count, 2, 2f
-        str     valw, [dstin]
-        str     valw, [dstend, -4]
-        ret
-2:      cbz     count, 3f
-        strb    valw, [dstin]
-        tbz     count, 1, 3f
-        strh    valw, [dstend, -2]
-3:      ret
-
-        /* Set 17..96 bytes.  */
-L(set_medium):
-        str     q0, [dstin]
-        tbnz    count, 6, L(set96)
-        str     q0, [dstend, -16]
-        tbz     count, 5, 1f
-        str     q0, [dstin, 16]
-        str     q0, [dstend, -32]
-1:      ret
-
-        .p2align 4
-        /* Set 64..96 bytes.  Write 64 bytes from the start and
-           32 bytes from the end.  */
-L(set96):
-        str     q0, [dstin, 16]
-        stp     q0, q0, [dstin, 32]
-        stp     q0, q0, [dstend, -32]
-        ret
-
-        .p2align 3
-        nop
-L(set_long):
-        and     valw, valw, 255
-        bic     dst, dstin, 15
-        str     q0, [dstin]
-        cmp     count, 256
-        ccmp    valw, 0, 0, cs
-        b.eq    L(try_zva)
-L(no_zva):
-        sub     count, dstend, dst      /* Count is 16 too large.  */
-        add     dst, dst, 16
-        sub     count, count, 64 + 16   /* Adjust count and bias for loop.  */
-1:      stp     q0, q0, [dst], 64
-        stp     q0, q0, [dst, -32]
-L(tail64):
-        subs    count, count, 64
-        b.hi    1b
-2:      stp     q0, q0, [dstend, -64]
-        stp     q0, q0, [dstend, -32]
-        ret
-
-        .p2align 3
-L(try_zva):
-        mrs     tmp1, dczid_el0
-        tbnz    tmp1w, 4, L(no_zva)
-        and     tmp1w, tmp1w, 4
-        cmp     tmp1w, 4        /* ZVA size is 64 bytes.  */
-        b.ne    L(zva_128)
-
-        /* Write the first and last 64 byte aligned block using stp rather
-           than using DC ZVA.  This is faster on some cores.
-         */
-L(zva_64):
-        str     q0, [dst, 16]
-        stp     q0, q0, [dst, 32]
-        bic     dst, dst, 63
-        stp     q0, q0, [dst, 64]
-        stp     q0, q0, [dst, 96]
-        sub     count, dstend, dst      /* Count is now 128 too large.  */
-        sub     count, count, 128+64+64 /* Adjust count and bias for loop.  */
-        add     dst, dst, 128
-        nop
-1:      dc      zva, dst
-        add     dst, dst, 64
-        subs    count, count, 64
-        b.hi    1b
-        stp     q0, q0, [dst, 0]
-        stp     q0, q0, [dst, 32]
-        stp     q0, q0, [dstend, -64]
-        stp     q0, q0, [dstend, -32]
-        ret
-
-        .p2align 3
-L(zva_128):
-        cmp     tmp1w, 5        /* ZVA size is 128 bytes.  */
-        b.ne    L(zva_other)
-
-        str     q0, [dst, 16]
-        stp     q0, q0, [dst, 32]
-        stp     q0, q0, [dst, 64]
-        stp     q0, q0, [dst, 96]
-        bic     dst, dst, 127
-        sub     count, dstend, dst      /* Count is now 128 too large.  */
-        sub     count, count, 128+128   /* Adjust count and bias for loop.  */
-        add     dst, dst, 128
-1:      dc      zva, dst
-        add     dst, dst, 128
-        subs    count, count, 128
-        b.hi    1b
-        stp     q0, q0, [dstend, -128]
-        stp     q0, q0, [dstend, -96]
-        stp     q0, q0, [dstend, -64]
-        stp     q0, q0, [dstend, -32]
-        ret
-
-L(zva_other):
-        mov     tmp2w, 4
-        lsl     zva_lenw, tmp2w, tmp1w
-        add     tmp1, zva_len, 64       /* Max alignment bytes written.  */
-        cmp     count, tmp1
-        blo     L(no_zva)
-
-        sub     tmp2, zva_len, 1
-        add     tmp1, dst, zva_len
-        add     dst, dst, 16
-        subs    count, tmp1, dst        /* Actual alignment bytes to write.  */
-        bic     tmp1, tmp1, tmp2        /* Aligned dc zva start address.  */
-        beq     2f
-1:      stp     q0, q0, [dst], 64
-        stp     q0, q0, [dst, -32]
-        subs    count, count, 64
-        b.hi    1b
-2:      mov     dst, tmp1
-        sub     count, dstend, tmp1     /* Remaining bytes to write.  */
-        subs    count, count, zva_len
-        b.lo    4f
-3:      dc      zva, dst
-        add     dst, dst, zva_len
-        subs    count, count, zva_len
-        b.hs    3b
-4:      add     count, count, zva_len
-        b       L(tail64)
-
-        .size   memset, . - memset
+        mov     dst, dstin              /* Preserve return value.  */
+        ands    A_lw, val, #255
+#ifndef DONT_USE_DC
+        b.eq    .Lzero_mem
+#endif
+        orr     A_lw, A_lw, A_lw, lsl #8
+        orr     A_lw, A_lw, A_lw, lsl #16
+        orr     A_l, A_l, A_l, lsl #32
+.Ltail_maybe_long:
+        cmp     count, #64
+        b.ge    .Lnot_short
+.Ltail_maybe_tiny:
+        cmp     count, #15
+        b.le    .Ltail15tiny
+.Ltail63:
+        ands    tmp1, count, #0x30
+        b.eq    .Ltail15
+        add     dst, dst, tmp1
+        cmp     tmp1w, #0x20
+        b.eq    1f
+        b.lt    2f
+        stp     A_l, A_l, [dst, #-48]
+1:
+        stp     A_l, A_l, [dst, #-32]
+2:
+        stp     A_l, A_l, [dst, #-16]
+
+.Ltail15:
+        and     count, count, #15
+        add     dst, dst, count
+        stp     A_l, A_l, [dst, #-16]   /* Repeat some/all of last store.  */
+        ret
+
+.Ltail15tiny:
+        /* Set up to 15 bytes.  Does not assume earlier memory
+           being set.  */
+        tbz     count, #3, 1f
+        str     A_l, [dst], #8
+1:
+        tbz     count, #2, 1f
+        str     A_lw, [dst], #4
+1:
+        tbz     count, #1, 1f
+        strh    A_lw, [dst], #2
+1:
+        tbz     count, #0, 1f
+        strb    A_lw, [dst]
+1:
+        ret
+
+        /* Critical loop.  Start at a new cache line boundary.  Assuming
+         * 64 bytes per line, this ensures the entire loop is in one line.  */
+        .p2align 6
+.Lnot_short:
+        neg     tmp2, dst
+        ands    tmp2, tmp2, #15
+        b.eq    2f
+        /* Bring DST to 128-bit (16-byte) alignment.  We know that there's
+         * more than that to set, so we simply store 16 bytes and advance by
+         * the amount required to reach alignment.  */
+        sub     count, count, tmp2
+        stp     A_l, A_l, [dst]
+        add     dst, dst, tmp2
+        /* There may be less than 63 bytes to go now.  */
+        cmp     count, #63
+        b.le    .Ltail63
+2:
+        sub     dst, dst, #16           /* Pre-bias.  */
+        sub     count, count, #64
+1:
+        stp     A_l, A_l, [dst, #16]
+        stp     A_l, A_l, [dst, #32]
+        stp     A_l, A_l, [dst, #48]
+        stp     A_l, A_l, [dst, #64]!
+        subs    count, count, #64
+        b.ge    1b
+        tst     count, #0x3f
+        add     dst, dst, #16
+        b.ne    .Ltail63
+        ret
+
+#ifndef DONT_USE_DC
+        /* For zeroing memory, check to see if we can use the ZVA feature to
+         * zero entire 'cache' lines.  */
+.Lzero_mem:
+        mov     A_l, #0
+        cmp     count, #63
+        b.le    .Ltail_maybe_tiny
+        neg     tmp2, dst
+        ands    tmp2, tmp2, #15
+        b.eq    1f
+        sub     count, count, tmp2
+        stp     A_l, A_l, [dst]
+        add     dst, dst, tmp2
+        cmp     count, #63
+        b.le    .Ltail63
+1:
+        /* For zeroing small amounts of memory, it's not worth setting up
+         * the line-clear code.  */
+        cmp     count, #128
+        b.lt    .Lnot_short
+#ifdef MAYBE_VIRT
+        /* For efficiency when virtualized, we cache the ZVA capability.  */
+        adrp    tmp2, .Lcache_clear
+        ldr     zva_len, [tmp2, #:lo12:.Lcache_clear]
+        tbnz    zva_len, #31, .Lnot_short
+        cbnz    zva_len, .Lzero_by_line
+        mrs     tmp1, dczid_el0
+        tbz     tmp1, #4, 1f
+        /* ZVA not available.  Remember this for next time.  */
+        mov     zva_len, #~0
+        str     zva_len, [tmp2, #:lo12:.Lcache_clear]
+        b       .Lnot_short
+1:
+        mov     tmp3w, #4
+        and     zva_len, tmp1w, #15     /* Safety: other bits reserved.  */
+        lsl     zva_len, tmp3w, zva_len
+        str     zva_len, [tmp2, #:lo12:.Lcache_clear]
+#else
+        mrs     tmp1, dczid_el0
+        tbnz    tmp1, #4, .Lnot_short
+        mov     tmp3w, #4
+        and     zva_len, tmp1w, #15     /* Safety: other bits reserved.  */
+        lsl     zva_len, tmp3w, zva_len
+#endif
+
+.Lzero_by_line:
+        /* Compute how far we need to go to become suitably aligned.  We're
+         * already at quad-word alignment.  */
+        cmp     count, zva_len_x
+        b.lt    .Lnot_short             /* Not enough to reach alignment.  */
+        sub     zva_bits_x, zva_len_x, #1
+        neg     tmp2, dst
+        ands    tmp2, tmp2, zva_bits_x
+        b.eq    1f                      /* Already aligned.  */
+        /* Not aligned, check that there's enough to copy after alignment.  */
+        sub     tmp1, count, tmp2
+        cmp     tmp1, #64
+        ccmp    tmp1, zva_len_x, #8, ge /* NZCV=0b1000 */
+        b.lt    .Lnot_short
+        /* We know that there's at least 64 bytes to zero and that it's safe
+         * to overrun by 64 bytes.  */
+        mov     count, tmp1
+2:
+        stp     A_l, A_l, [dst]
+        stp     A_l, A_l, [dst, #16]
+        stp     A_l, A_l, [dst, #32]
+        subs    tmp2, tmp2, #64
+        stp     A_l, A_l, [dst, #48]
+        add     dst, dst, #64
+        b.ge    2b
+        /* We've overrun a bit, so adjust dst downwards.  */
+        add     dst, dst, tmp2
+1:
+        sub     count, count, zva_len_x
+3:
+        dc      zva, dst
+        add     dst, dst, zva_len_x
+        subs    count, count, zva_len_x
+        b.ge    3b
+        ands    count, count, zva_bits_x
+        b.ne    .Ltail_maybe_long
+        ret
+        .size   memset, .-memset
+#ifdef MAYBE_VIRT
+        .bss
+        .p2align 2
+.Lcache_clear:
+        .space 4
+#endif
+#endif /* DONT_USE_DC */
 #endif
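
Taken together, the restored implementation splats the fill byte across a 64-bit
register, handles short and unaligned heads and tails with ordinary stores, runs
a 64-byte stp loop for the bulk, and, when the fill value is zero and DC ZVA is
usable, clears whole ZVA blocks at a time. A hedged C sketch of that zeroing
path, reusing the hypothetical dc_zva_block_size() helper sketched earlier
(illustrative only, not the committed code):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* The three ORRs at the top of memset replicate the fill byte
       across a 64-bit register; in C:  */
    static uint64_t splat_byte (unsigned char c)
    {
      uint64_t v = c;
      v |= v << 8;
      v |= v << 16;
      v |= v << 32;
      return v;
    }

    /* Rough equivalent of the .Lzero_mem/.Lzero_by_line path.  memset()
       here stands in for the assembly's plain store loops.  */
    static void zero_mem_sketch (char *dst, size_t count)
    {
      long line = dc_zva_block_size ();

      /* Small buffers, or DC ZVA prohibited: line clearing isn't worth it.  */
      if (line < 0 || count < 128 || count < (size_t) line)
        {
          memset (dst, 0, count);
          return;
        }

      /* Advance dst to a ZVA block boundary with ordinary stores.  */
      size_t skew = (size_t) (-(uintptr_t) dst & ((uintptr_t) line - 1));
      memset (dst, 0, skew);
      dst += skew;
      count -= skew;

      /* Clear whole blocks with DC ZVA.  */
      while (count >= (size_t) line)
        {
          __asm__ volatile ("dc zva, %0" : : "r" (dst) : "memory");
          dst += line;
          count -= line;
        }

      /* Whatever is left is shorter than one block.  */
      memset (dst, 0, count);
    }

The assembly additionally insists on at least 64 bytes remaining after alignment
(the ccmp against zva_len_x) so that it can afford to overrun its alignment
stores by up to 64 bytes; the sketch above simply lets the plain-store fallback
handle those edges.
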