Add missing libc/machine/arm/aeabi_memcpy-armv7a.S file
This commit is contained in:
		
							
								
								
									
newlib/libc/machine/arm/aeabi_memcpy-armv7a.S | 286 lines added (new file)
							| @@ -0,0 +1,286 @@ | |||||||
|  | /* | ||||||
|  |  * Copyright (c) 2014 ARM Ltd | ||||||
|  |  * All rights reserved. | ||||||
|  |  * | ||||||
|  |  * Redistribution and use in source and binary forms, with or without | ||||||
|  |  * modification, are permitted provided that the following conditions | ||||||
|  |  * are met: | ||||||
|  |  * 1. Redistributions of source code must retain the above copyright | ||||||
|  |  *    notice, this list of conditions and the following disclaimer. | ||||||
|  |  * 2. Redistributions in binary form must reproduce the above copyright | ||||||
|  |  *    notice, this list of conditions and the following disclaimer in the | ||||||
|  |  *    documentation and/or other materials provided with the distribution. | ||||||
|  |  * 3. The name of the company may not be used to endorse or promote | ||||||
|  |  *    products derived from this software without specific prior written | ||||||
|  |  *    permission. | ||||||
|  |  * | ||||||
|  |  * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED | ||||||
|  |  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF | ||||||
|  |  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. | ||||||
|  |  * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | ||||||
|  |  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED | ||||||
|  |  * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | ||||||
|  |  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF | ||||||
|  |  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING | ||||||
|  |  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | ||||||
|  |  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||||||
|  |  */ | ||||||
|  |  | ||||||
|  | #include "arm_asm.h" | ||||||
|  |  | ||||||
|  | /* NOTE: This ifdef MUST match the one in aeabi_memcpy.c.  */ | ||||||
|  | #if defined (__ARM_ARCH_7A__) && defined (__ARM_FEATURE_UNALIGNED) && \ | ||||||
|  | 	(defined (__ARM_NEON__) || !defined (__SOFTFP__)) | ||||||
|  |  | ||||||
	.syntax unified
	.global __aeabi_memcpy
	.type   __aeabi_memcpy, %function

/*----------------------------------------------------------------------
   __aeabi_memcpy (ARM RTABI memory-copy helper)
     In:       r0 = dst, r1 = src, r2 = n (bytes, n >= 0)
     Out:      r0 restored to dst on return.  (The RTABI does not
               require a return value; this implementation preserves
               r0 like memcpy.)
     Clobbers: r3, ip, flags.  r4, r5 and lr are saved on the stack
               and restored by the common epilogue at "return".
     NOTE: the labels word_aligned / two_word_aligned / copy_less_than_*
     below are also branched to by __aeabi_memcpy4/__aeabi_memcpy8,
     which set up an identical stack frame first.
  ----------------------------------------------------------------------*/
__aeabi_memcpy:
	/* Assumes that n >= 0, and dst, src are valid pointers.
          If there is at least 8 bytes to copy, use LDRD/STRD.
          If src and dst are misaligned with different offsets,
          first copy byte by byte until dst is aligned,
          and then copy using LDRD/STRD and shift if needed.
          When less than 8 left, copy a word and then byte by byte.  */

       /* Save registers (r0 holds the return value):
          optimized push {r0, r4, r5, lr}.
          To try and improve performance, stack layout changed,
          i.e., not keeping the stack looking like users expect
          (highest numbered register at highest address).  */
        push {r0, lr}
        strd r4, r5, [sp, #-8]!

        /* Get copying of tiny blocks out of the way first.  */
        /* Is there at least 4 bytes to copy?  */
        subs    r2, r2, #4
        blt     copy_less_than_4       /* If n < 4.  */

        /* Check word alignment.  */
        ands    ip, r0, #3             /* ip = last 2 bits of dst.  */
        bne     dst_not_word_aligned   /* If dst is not word-aligned.  */

        /* Get here if dst is word-aligned.  */
        ands    ip, r1, #3             /* ip = last 2 bits of src.  */
        bne     src_not_word_aligned   /* If src is not word-aligned.  */
word_aligned:
        /* Get here if source and dst both are word-aligned.
           The number of bytes remaining to copy is r2+4.  */

        /* Is there is at least 64 bytes to copy?  */
        subs    r2, r2, #60
        blt     copy_less_than_64                /* If r2 + 4 < 64.  */

        /* First, align the destination buffer to 8-bytes,
           to make sure double loads and stores don't cross cache line boundary,
           as they are then more expensive even if the data is in the cache
           (require two load/store issue cycles instead of one).
           If only one of the buffers is not 8-bytes aligned,
           then it's more important to align dst than src,
           because there is more penalty for stores
           than loads that cross cacheline boundary.
           This check and realignment are only worth doing
           if there is a lot to copy.  */

        /* Get here if dst is word aligned,
           i.e., the 2 least significant bits are 0.
           If dst is not 2w aligned (i.e., the 3rd bit is not set in dst),
           then copy 1 word (4 bytes).  */
        ands    r3, r0, #4
        beq     two_word_aligned  /* If dst already two-word aligned.  */
        ldr     r3, [r1], #4
        str     r3, [r0], #4
        subs    r2, r2, #4
        blt     copy_less_than_64

two_word_aligned:
        /* TODO: Align to cacheline (useful for PLD optimization).  */

        /* Every loop iteration copies 64 bytes.
           Loop invariant here: r2 = (bytes remaining) - 64.  */
1:
        .irp    offset, #0, #8, #16, #24, #32, #40, #48, #56
        ldrd    r4, r5, [r1, \offset]
        strd    r4, r5, [r0, \offset]
        .endr

        add     r0, r0, #64
        add     r1, r1, #64
        subs    r2, r2, #64
        bge     1b                     /* If there is more to copy.  */

copy_less_than_64:

        /* Get here if less than 64 bytes to copy, -64 <= r2 < 0.
           Restore the count if there is more than 7 bytes to copy.  */
        adds    r2, r2, #56
        blt     copy_less_than_8

        /* Copy 8 bytes at a time.  */
2:
        ldrd    r4, r5, [r1], #8
        strd    r4, r5, [r0], #8
        subs    r2, r2, #8
        bge     2b                     /* If there is more to copy.  */

copy_less_than_8:

        /* Get here if less than 8 bytes to copy, -8 <= r2 < 0.
           Check if there is more to copy.  */
        cmn     r2, #8
        beq     return                          /* If r2 + 8 == 0.  */

        /* Restore the count if there is more than 3 bytes to copy.  */
        adds    r2, r2, #4
        blt     copy_less_than_4

        /* Copy 4 bytes.  */
        ldr     r3, [r1], #4
        str     r3, [r0], #4

copy_less_than_4:
        /* Get here if less than 4 bytes to copy, -4 <= r2 < 0.  */

        /* Restore the count, check if there is more to copy.  */
        adds    r2, r2, #4
        beq     return                          /* If r2 == 0.  */

        /* Get here with r2 is in {1,2,3}={01,10,11}.  */
        /* Logical shift left: bit 0 of r2 moves into bit 31
           (driving N/Z), bit 1 of r2 is shifted out into C.  */
        lsls    r2, r2, #31

        /* Copy byte by byte.
           Condition ne means bit 0 of r2 was set,
           i.e., r2 is 1 or 3: copy one byte.
           Condition cs means bit 1 of r2 was set,
           i.e., r2 is 2 or 3: copy two (more) bytes.  */
        itt     ne
        ldrbne  r3, [r1], #1
        strbne  r3, [r0], #1

        itttt   cs
        ldrbcs  r4, [r1], #1
        ldrbcs  r5, [r1]
        strbcs  r4, [r0], #1
        strbcs  r5, [r0]

return:
        /* Restore registers: optimized pop {r0, r4, r5, pc}.
           Popping the saved r0 restores the original dst pointer;
           popping the saved lr into pc returns to the caller.  */
        ldrd r4, r5, [sp], #8
        pop {r0, pc}         /* This is the only return point of memcpy.  */

dst_not_word_aligned:

       /* Get here when dst is not aligned and ip has the last 2 bits of dst,
          i.e., ip is the offset of dst from word.
          The number of bytes that remains to copy is r2 + 4,
          i.e., there are at least 4 bytes to copy.
          Write a partial word (0 to 3 bytes), such that dst becomes
	  word-aligned.  */

       /* If dst is at ip bytes offset from a word (with 0 < ip < 4),
          then there are (4 - ip) bytes to fill up to align dst to the next
	  word.  */
        rsb     ip, ip, #4                 /* ip = #4 - ip.  */
        cmp     ip, #2

       /* Copy byte by byte with conditionals:
          gt (ip == 3) copies one byte, ge (ip >= 2) copies another,
          and the unconditional pair always copies the last byte,
          so exactly ip bytes are transferred.  */
        itt     gt
        ldrbgt  r3, [r1], #1
        strbgt  r3, [r0], #1

        itt     ge
        ldrbge  r4, [r1], #1
        strbge  r4, [r0], #1

        /* lr is free scratch here: the caller's lr is already on the
           stack and the epilogue returns via pop {r0, pc}.  */
        ldrb    lr, [r1], #1
        strb    lr, [r0], #1

       /* Update the count.
          ip holds the number of bytes we have just copied.  */
        subs    r2, r2, ip                        /* r2 = r2 - ip.  */
        blt     copy_less_than_4                  /* If r2 < ip.  */

       /* Get here if there are more than 4 bytes to copy.
          Check if src is aligned.  If beforehand src and dst were not word
	  aligned but congruent (same offset), then now they are both
	  word-aligned, and we can copy the rest efficiently (without
	  shifting).  */
        ands    ip, r1, #3                    /* ip = last 2 bits of src.  */
        beq     word_aligned                  /* If r1 is word-aligned.  */

src_not_word_aligned:
       /* Get here when src is not word-aligned, but dst is word-aligned.
          The number of bytes that remains to copy is r2+4.  */

       /* Copy word by word using LDR: ARMv7-A performs the unaligned
          loads/stores in hardware when alignment checking is disabled
          (SCTLR.A clear), which the __ARM_FEATURE_UNALIGNED guard at
          the top of this file assumes.  */
        subs    r2, r2, #60
        blt     8f

7:
        /* Copy 64 bytes in every loop iteration.  */
        .irp    offset, #0, #4, #8, #12, #16, #20, #24, #28, #32, #36, #40, #44, #48, #52, #56, #60
        ldr     r3, [r1, \offset]
        str     r3, [r0, \offset]
        .endr

        add     r0, r0, #64
        add     r1, r1, #64
        subs    r2, r2, #64
        bge     7b

8:
        /* Get here if less than 64 bytes to copy, -64 <= r2 < 0.
           Check if there is more than 3 bytes to copy.  */
        adds    r2, r2, #60
        blt     copy_less_than_4

9:
       /* Get here if there is less than 64 but at least 4 bytes to copy,
          where the number of bytes to copy is r2+4.  */
        ldr     r3, [r1], #4
        str     r3, [r0], #4
        subs    r2, r2, #4
        bge     9b

        b       copy_less_than_4
|  |  | ||||||
|  |  | ||||||
|  | 	.syntax unified | ||||||
|  | 	.global __aeabi_memcpy4 | ||||||
|  | 	.type   __aeabi_memcpy4, %function | ||||||
|  | __aeabi_memcpy4: | ||||||
|  | 	/* Assumes that both of its arguments are 4-byte aligned.  */ | ||||||
|  |  | ||||||
|  |         push {r0, lr} | ||||||
|  |         strd r4, r5, [sp, #-8]! | ||||||
|  |  | ||||||
|  |         /* Is there at least 4 bytes to copy?  */ | ||||||
|  |         subs    r2, r2, #4 | ||||||
|  |         blt     copy_less_than_4       /* If n < 4.  */ | ||||||
|  |  | ||||||
|  | 	bl	word_aligned | ||||||
|  |  | ||||||
|  | 	.syntax unified | ||||||
|  | 	.global __aeabi_memcpy8 | ||||||
|  | 	.type   __aeabi_memcpy8, %function | ||||||
|  | __aeabi_memcpy8: | ||||||
|  | 	/* Assumes that both of its arguments are 8-byte aligned.  */ | ||||||
|  |  | ||||||
|  |         push {r0, lr} | ||||||
|  |         strd r4, r5, [sp, #-8]! | ||||||
|  |  | ||||||
|  | 	/* Is there at least 4 bytes to copy?  */ | ||||||
|  |         subs    r2, r2, #4 | ||||||
|  |         blt     copy_less_than_4	/* If n < 4.  */ | ||||||
|  |  | ||||||
|  |         /* Is there at least 8 bytes to copy?  */ | ||||||
|  |         subs    r2, r2, #4 | ||||||
|  |         blt     copy_less_than_8	/* If n < 8.  */ | ||||||
|  |  | ||||||
|  | 	/* Is there at least 64 bytes to copy?  */ | ||||||
|  | 	subs	r2, r2, #56 | ||||||
|  | 	blt	copy_less_than_64	/* if n + 8 < 64.  */ | ||||||
|  |  | ||||||
|  | 	bl	two_word_aligned | ||||||
|  |  | ||||||
|  | #endif | ||||||
		Reference in New Issue
	
	Block a user