Add missing libc/machine/arm/aeabi_memcpy-armv7a.S file
 newlib/libc/machine/arm/aeabi_memcpy-armv7a.S | 286 ++++++++++++++++++++++++++++++++++
 1 file changed, 286 insertions(+)

@@ -0,0 +1,286 @@
/*
 * Copyright (c) 2014 ARM Ltd
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the company may not be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "arm_asm.h"

/* NOTE: This ifdef MUST match the one in aeabi_memcpy.c.  */
#if defined (__ARM_ARCH_7A__) && defined (__ARM_FEATURE_UNALIGNED) && \
	(defined (__ARM_NEON__) || !defined (__SOFTFP__))

	.syntax unified
	.global __aeabi_memcpy
	.type   __aeabi_memcpy, %function
__aeabi_memcpy:
	/* Assumes that n >= 0, and dst, src are valid pointers.
	   If there is at least 8 bytes to copy, use LDRD/STRD.
	   If src and dst are misaligned with different offsets,
	   first copy byte by byte until dst is aligned,
	   and then copy using LDRD/STRD and shift if needed.
	   When less than 8 left, copy a word and then byte by byte.  */

       /* Save registers (r0 holds the return value):
          optimized push {r0, r4, r5, lr}.
          To try to improve performance, the stack layout is changed,
          i.e., it does not keep the layout users expect
          (highest numbered register at highest address).  */
        push {r0, lr}
        strd r4, r5, [sp, #-8]!

        /* Get copying of tiny blocks out of the way first.  */
        /* Is there at least 4 bytes to copy?  */
        subs    r2, r2, #4
        blt     copy_less_than_4       /* If n < 4.  */

        /* Check word alignment.  */
        ands    ip, r0, #3             /* ip = last 2 bits of dst.  */
        bne     dst_not_word_aligned   /* If dst is not word-aligned.  */

        /* Get here if dst is word-aligned.  */
        ands    ip, r1, #3             /* ip = last 2 bits of src.  */
        bne     src_not_word_aligned   /* If src is not word-aligned.  */
word_aligned:
        /* Get here if source and dst both are word-aligned.
           The number of bytes remaining to copy is r2+4.  */

        /* Is there at least 64 bytes to copy?  */
        subs    r2, r2, #60
        blt     copy_less_than_64                /* If r2 + 4 < 64.  */

        /* First, align the destination buffer to 8-bytes,
           to make sure double loads and stores don't cross cache line boundary,
           as they are then more expensive even if the data is in the cache
           (require two load/store issue cycles instead of one).
           If only one of the buffers is not 8-bytes aligned,
           then it's more important to align dst than src,
           because there is more penalty for stores
           than loads that cross cacheline boundary.
           This check and realignment are only worth doing
           if there is a lot to copy.  */

        /* Get here if dst is word aligned,
           i.e., the 2 least significant bits are 0.
           If dst is not 2w aligned (i.e., the 3rd bit of dst is set),
           then copy 1 word (4 bytes) to make it 2w aligned.  */
        ands    r3, r0, #4
        beq     two_word_aligned  /* If dst already two-word aligned.  */
        ldr     r3, [r1], #4
        str     r3, [r0], #4
        subs    r2, r2, #4
        blt     copy_less_than_64

two_word_aligned:
        /* TODO: Align to cacheline (useful for PLD optimization).  */

        /* Every loop iteration copies 64 bytes.  */
1:
        .irp    offset, #0, #8, #16, #24, #32, #40, #48, #56
        ldrd    r4, r5, [r1, \offset]
        strd    r4, r5, [r0, \offset]
        .endr
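        /* The .irp above expands to eight LDRD/STRD pairs with fixed
           offsets (#0, #8, ..., #56), so the loop body performs no
           address writeback until the explicit adds below, e.g.:
               ldrd    r4, r5, [r1, #0]
               strd    r4, r5, [r0, #0]
               ldrd    r4, r5, [r1, #8]
               strd    r4, r5, [r0, #8]
               ...  */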

        add     r0, r0, #64
        add     r1, r1, #64
        subs    r2, r2, #64
        bge     1b                     /* If there is more to copy.  */

copy_less_than_64:

        /* Get here if less than 64 bytes to copy, -64 <= r2 < 0.
           Restore the count if there is more than 7 bytes to copy.  */
        adds    r2, r2, #56
        blt     copy_less_than_8

        /* Copy 8 bytes at a time.  */
2:
        ldrd    r4, r5, [r1], #8
        strd    r4, r5, [r0], #8
        subs    r2, r2, #8
        bge     2b                     /* If there is more to copy.  */

copy_less_than_8:

        /* Get here if less than 8 bytes to copy, -8 <= r2 < 0.
           Check if there is more to copy.  */
        cmn     r2, #8
        beq     return                          /* If r2 + 8 == 0.  */

        /* Restore the count if there is more than 3 bytes to copy.  */
        adds    r2, r2, #4
        blt     copy_less_than_4

        /* Copy 4 bytes.  */
        ldr     r3, [r1], #4
        str     r3, [r0], #4

copy_less_than_4:
        /* Get here if less than 4 bytes to copy, -4 <= r2 < 0.  */

        /* Restore the count, check if there is more to copy.  */
        adds    r2, r2, #4
        beq     return                          /* If r2 == 0.  */

        /* Get here with r2 in {1,2,3} = {01,10,11}.  */
        /* Logical shift left r2, insert 0s, update flags.  */
        lsls    r2, r2, #31

        /* Copy byte by byte.
           Condition ne means the last bit of r2 is set,
           i.e., r2 is 1 or 3.
           Condition cs means the second to last bit of r2 is set
           (it was shifted into the carry flag), i.e., r2 is 2 or 3.  */
        itt     ne
        ldrbne  r3, [r1], #1
        strbne  r3, [r0], #1

        itttt   cs
        ldrbcs  r4, [r1], #1
        ldrbcs  r5, [r1]
        strbcs  r4, [r0], #1
        strbcs  r5, [r0]

return:
        /* Restore registers: optimized pop {r0, r4, r5, pc}.  */
        ldrd r4, r5, [sp], #8
        pop {r0, pc}         /* This is the only return point of memcpy.  */

dst_not_word_aligned:

       /* Get here when dst is not aligned and ip has the last 2 bits of dst,
          i.e., ip is the offset of dst from a word boundary.
          The number of bytes that remains to copy is r2 + 4,
          i.e., there are at least 4 bytes to copy.
          Write a partial word (1 to 3 bytes), such that dst becomes
          word-aligned.  */

       /* If dst is at ip bytes offset from a word (with 0 < ip < 4),
          then there are (4 - ip) bytes to fill up to align dst to the next
          word.  */
        rsb     ip, ip, #4                 /* ip = #4 - ip.  */
        cmp     ip, #2

       /* Copy byte by byte with conditionals: after cmp ip, #2,
          'gt' fires only for ip == 3 and 'ge' for ip >= 2, so together
          with the unconditional pair exactly ip bytes are copied.  */
        itt     gt
        ldrbgt  r3, [r1], #1
        strbgt  r3, [r0], #1

        itt     ge
        ldrbge  r4, [r1], #1
        strbge  r4, [r0], #1

        ldrb    lr, [r1], #1
        strb    lr, [r0], #1

       /* Update the count.
          ip holds the number of bytes we have just copied.  */
        subs    r2, r2, ip                        /* r2 = r2 - ip.  */
        blt     copy_less_than_4                  /* If r2 < ip.  */

       /* Get here if there are at least 4 bytes to copy.
          Check if src is aligned.  If beforehand src and dst were not word
          aligned but congruent (same offset), then now they are both
          word-aligned, and we can copy the rest efficiently (without
          shifting).  */
        ands    ip, r1, #3                    /* ip = last 2 bits of src.  */
        beq     word_aligned                  /* If r1 is word-aligned.  */

src_not_word_aligned:
       /* Get here when src is not word-aligned, but dst is word-aligned.
          The number of bytes that remains to copy is r2+4.  */

       /* Copy word by word, relying on hardware support for unaligned
          access, i.e., SCTLR.A is clear, so LDR and STR may take
          unaligned addresses.  */
        subs    r2, r2, #60
        blt     8f

7:
        /* Copy 64 bytes in every loop iteration.  */
        .irp    offset, #0, #4, #8, #12, #16, #20, #24, #28, #32, #36, #40, #44, #48, #52, #56, #60
        ldr     r3, [r1, \offset]
        str     r3, [r0, \offset]
        .endr

        add     r0, r0, #64
        add     r1, r1, #64
        subs    r2, r2, #64
        bge     7b

8:
        /* Get here if less than 64 bytes to copy, -64 <= r2 < 0.
           Check if there is more than 3 bytes to copy.  */
        adds    r2, r2, #60
        blt     copy_less_than_4

9:
       /* Get here if there is less than 64 but at least 4 bytes to copy,
          where the number of bytes to copy is r2+4.  */
        ldr     r3, [r1], #4
        str     r3, [r0], #4
        subs    r2, r2, #4
        bge     9b

        b       copy_less_than_4


	.syntax unified
	.global __aeabi_memcpy4
	.type   __aeabi_memcpy4, %function
__aeabi_memcpy4:
	/* Assumes that both of its arguments are 4-byte aligned.  */

        push {r0, lr}
        strd r4, r5, [sp, #-8]!

        /* Is there at least 4 bytes to copy?  */
        subs    r2, r2, #4
        blt     copy_less_than_4       /* If n < 4.  */

	bl	word_aligned

	.syntax unified
	.global __aeabi_memcpy8
	.type   __aeabi_memcpy8, %function
__aeabi_memcpy8:
	/* Assumes that both of its arguments are 8-byte aligned.  */

        push {r0, lr}
        strd r4, r5, [sp, #-8]!

	/* Is there at least 4 bytes to copy?  */
        subs    r2, r2, #4
        blt     copy_less_than_4	/* If n < 4.  */

        /* Is there at least 8 bytes to copy?  */
        subs    r2, r2, #4
        blt     copy_less_than_8	/* If n < 8.  */

	/* Is there at least 64 bytes to copy?  */
	subs	r2, r2, #56
	blt	copy_less_than_64	/* If n < 64.  */

	bl	two_word_aligned

#endif
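For reference, the copy_less_than_4 tail above encodes the final byte count
in the flags set by "lsls r2, r2, #31": bit 0 of the count determines the Z
flag (ne means copy 1 byte) and bit 1 lands in the carry flag (cs means copy
2 bytes).  A minimal C sketch of the same dispatch; the helper name
copy_tail is hypothetical, not part of this file:

    /* Tail copy of 0-3 bytes; n's low two bits select the work,
       mirroring the ne/cs conditional copies in the assembly.  */
    static void copy_tail (unsigned char *dst, const unsigned char *src,
                           unsigned int n)
    {
      if (n & 1)        /* asm: 'ne' after lsls (bit 0 of n was set).  */
        *dst++ = *src++;
      if (n & 2)        /* asm: 'cs' after lsls (bit 1 went to carry).  */
        {
          dst[0] = src[0];
          dst[1] = src[1];
        }
    }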
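For context, these entry points follow the ARM run-time ABI (RTABI): unlike
ISO C memcpy they return void, and the memcpy4/memcpy8 variants let the
compiler promise 4- or 8-byte alignment of both pointers.  A minimal sketch
of how a caller sees them; the copy_block example is hypothetical:

    #include <stddef.h>

    /* RTABI memcpy entry points, implemented by the assembly above.  */
    void __aeabi_memcpy  (void *dest, const void *src, size_t n);
    void __aeabi_memcpy4 (void *dest, const void *src, size_t n);
    void __aeabi_memcpy8 (void *dest, const void *src, size_t n);

    /* A compiler targeting AEABI may lower a 64-byte copy between
       known 8-byte-aligned objects into a call like this.  */
    void copy_block (unsigned long long dst[8], const unsigned long long src[8])
    {
      __aeabi_memcpy8 (dst, src, 64);
    }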