newlib/libgloss/arm/cpu-init/rdimon-aem.S
Jiong Wang 18b47e05d3 Initializing TTBR0 to inner/outer WB
While running tests on internal systems, we identified an issue in the
startup code for newlib on AArch32 systems with Multiprocessor
Extensions to the architecture.

The issue is that we were configuring the page table flags as Inner
cacheable/Outer non-cacheable, whereas, at least on architectures with the
Multiprocessor Extensions, they should be configured as Inner/Outer
write-back, no write-allocate, and cacheable.

The attached patch fixes this, and no regression on arm-none-eabi
bare-metal tests.

Adopted suggestion given by Richard offline to avoid using jump.

libgloss/
	* arm/cpu-init/rdimon-aem.S: Set TTBR0 to inner/outer
	cacheable WB, and no allocate on WB for arch with multiprocessor
	extension.
2016-03-26 12:45:07 +01:00

540 lines
19 KiB
ArmAsm

/* Copyright (c) 2005-2013 ARM Ltd. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
3. The name of the company may not be used to endorse or promote
products derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
/* This file gives a basic initialisation of a Cortex-A series core. It is
the bare minimum required to get Cortex-A core running with a semihosting
interface.
It sets up a basic 1:1 physical address to virtual address mapping;
turns the MMU on; enables branch prediction; activates any integrated
caches; enables the Advanced SIMD and VFP co-processors; and installs
basic exception handlers.
It does not handle peripherals, and assumes all memory is Normal.
It does not change processor state from the startup privilege and security
level.
This has only been tested to work in ARM state.
By default it assumes exception vectors are located from address 0.
However, if this is not true they can be moved by defining the
_rdimon_vector_base symbol. For example if you have HIVECS enabled you
may pass --defsym _rdimon_vector_base=0xffff0000 on the linker command
line. */
/* __ARM_ARCH_PROFILE is defined from GCC 4.8 onwards, however __ARM_ARCH_7A
has been defined since 4.2 onwards, which is when v7-a support was added
and hence 'A' profile support was added in the compiler. Allow for this
file to be built with older compilers. */
#if defined(__ARM_ARCH_7A__) || (__ARM_ARCH_PROFILE == 'A')
.syntax unified
.arch armv7-a
.arm
@ CPU Initialisation
@
@ _rdimon_hw_init_hook - one-time bring-up of the primary core.
@ Entered with the return address in LR; LR is parked in r10 and the
@ routine returns with BX r10.  In order it: parks every core whose
@ MPIDR[3:0] is non-zero, selects the data endianness, programs ACTLR on
@ Cortex-A15/Cortex-A7, selects the exception instruction set, resets the
@ caches, programs DACR/TTBCR/TTBR0/PRRR/NMRR, copies the vector code to
@ _rdimon_vector_base, enables the MMU, opens up CPACR, enables FP and
@ Advanced SIMD when CPACR accepted the access bits, and finally turns
@ the caches on.
@ r0-r10, ip and lr are used as scratch by this routine and by the helpers
@ it calls (is_a15_a7, __reset_caches, __enable_caches).
.globl _rdimon_hw_init_hook
.type _rdimon_hw_init_hook, %function
_rdimon_hw_init_hook:
@ Only run the code on CPU 0 - otherwise spin
mrc 15, 0, r4, cr0, cr0, 5 @ Read MPIDR
ands r4, r4, #15 @ Keep MPIDR[3:0]; Z is set only when they are all zero
spin:
bne spin @ Non-zero: branch to self forever (nothing changes the flags)
mov r10, lr @ Save LR for final return
#ifdef __ARMEB__
@ Setup for Big Endian
setend be
mrc 15, 0, r4, cr1, cr0, 0 @ Read SCTLR
orr r4, r4, #(1<<25) @ Switch to Big Endian (Set SCTLR.EE)
mcr 15, 0, r4, cr1, cr0, 0 @ Write SCTLR
#else
@ Setup for Little Endian
setend le
mrc 15, 0, r4, cr1, cr0, 0 @ Read SCTLR
bic r4, r4, #(1<<25) @ Switch to LE (unset SCTLR.EE)
mcr 15, 0, r4, cr1, cr0, 0 @ Write SCTLR
#endif
@ is_a15_a7 returns with Z set on Cortex-A15/Cortex-A7.  None of the
@ instructions up to the second IT block below alter the flags, so both
@ conditional sequences see the same result.
bl is_a15_a7
@ For Cortex-A15 and Cortex-A7 only:
@ Write zero into the ACTLR to turn everything on.
itt eq
moveq r4, #0
mcreq 15, 0, r4, c1, c0, 1
isb
@ For Cortex-A15 and Cortex-A7 only:
@ Set ACTLR:SMP bit before enabling the caches and MMU,
@ or performing any cache and TLB maintenance operations.
ittt eq
mrceq 15, 0, r4, c1, c0, 1 @ Read ACTLR
orreq r4, r4, #(1<<6) @ Enable ACTLR:SMP
mcreq 15, 0, r4, c1, c0, 1 @ Write ACTLR
isb
@ Setup for exceptions being taken to Thumb/ARM state
@ NOTE(review): with __thumb__ defined this sets SCTLR.TE, yet this file
@ is assembled with .arm and the vector code below is documented as ARM
@ state - confirm exceptions are not expected to reach VectorCode in that
@ configuration.
mrc 15, 0, r4, cr1, cr0, 0 @ Read SCTLR
#if defined(__thumb__)
orr r4, r4, #(1 << 30) @ Enable SCTLR.TE
#else
bic r4, r4, #(1 << 30) @ Disable SCTLR.TE
#endif
mcr 15, 0, r4, cr1, cr0, 0 @ Write SCTLR
@ Clean/invalidate the caches; this call overwrites r0-r7, ip and lr.
bl __reset_caches
mrc 15, 0, r4, cr1, cr0, 0 @ Read SCTLR
orr r4, r4, #(1<<22) @ Enable unaligned mode
bic r4, r4, #2 @ Disable alignment faults
bic r4, r4, #1 @ Disable MMU
mcr 15, 0, r4, cr1, cr0, 0 @ Write SCTLR
mov r4, #0
mcr 15, 0, r4, cr8, cr7, 0 @ Write TLBIALL - Invalidate unified
@ TLB
@ Setup MMU Primary table P=V mapping.
mvn r4, #0 @ r4 = 0xffffffff: every 2-bit domain field becomes 0b11
mcr 15, 0, r4, cr3, cr0, 0 @ Write DACR
mov r4, #0 @ Always use TTBR0, no LPAE
mcr 15, 0, r4, cr2, cr0, 2 @ Write TTBCR
adr r4, page_table_addr @ Address of the literal holding the table base
ldr r4, [r4] @ r4 = page_tables
mrc p15, 0, r0, c0, c0, 5 @ read MPIDR
tst r0, #0x80000000 @ bit[31]; NE below means the bit is set
@ Set page table flags - there are two page table flag formats for the
@ architecture. For systems without multiprocessor extensions we use 0x1
@ which is Inner cacheable/Outer non-cacheable. For systems with
@ multiprocessor extensions we use 0x59 which is Inner/Outer write-back,
@ no write-allocate, and cacheable. See the ARMARM-v7AR for more details.
@ The two adds build 0x59 (0x58 + 1) or 0x01 without a branch.  page_tables
@ is aligned with .p2align 14, so its low bits are zero and adding the
@ flags cannot carry into the base address.
it ne
addne r4, r4, #0x58
add r4, r4, #1
mcr 15, 0, r4, cr2, cr0, 0 @ Write TTBR0
@ NOTE(review): 0x22 sets bits [1:0] and [5:4]; in the PRRR layout of the
@ ARM ARM those look like the TR0 and TR2 fields rather than TR0 and TR1 -
@ confirm.  SCTLR.TRE is cleared just below.
mov r0, #34 @ 0x22 @ TR0 and TR1 - normal memory
orr r0, r0, #(1 << 19) @ Shareable
mcr 15, 0, r0, cr10, cr2, 0 @ Write PRRR
movw r0, #0x33 @ Low half of 0x00330033
movt r0, #0x33 @ High half of 0x00330033
mcr 15, 0, r0, cr10, cr2, 1 @ Write NMRR
mrc 15, 0, r0, cr1, cr0, 0 @ Read SCTLR
bic r0, r0, #(1 << 28) @ Clear TRE bit
mcr 15, 0, r0, cr1, cr0, 0 @ Write SCTLR
@ Now install the vector code - we move the Vector code from where it is
@ in the image to be based at _rdimon_vector_base. We have to do this copy
@ as the code is all PC-relative. We actually cheat and do a BX <reg> so
@ that we are at a known address relatively quickly and have to move as
@ little code as possible.
mov r7, #(VectorCode_Limit - VectorCode) @ r7 = bytes left to copy
adr r5, VectorCode @ r5 = source pointer
adr r6, vector_base_addr @ Load the base for vectors
ldr r6, [r6] @ r6 = destination pointer (_rdimon_vector_base)
@ Word-at-a-time copy.  The loop only terminates when r7 reaches exactly
@ zero, so the byte count must be a non-zero multiple of 4.
copy_loop: @ Do the copy
ldr r4, [r5], #4
str r4, [r6], #4
subs r7, r7, #4
bne copy_loop
mrc 15, 0, r4, cr1, cr0, 0 @ Read SCTLR
bic r4, r4, #0x1000 @ Disable I Cache
bic r4, r4, #4 @ Disable D Cache
orr r4, r4, #1 @ Enable MMU
bic r4, r4, #(1 << 28) @ Clear TRE bit
mcr 15, 0, r4, cr1, cr0, 0 @ Write SCTLR
mrc 15, 0, r4, cr1, cr0, 2 @ Read CPACR
orr r4, r4, #0x00f00000 @ Turn on VFP Co-procs
bic r4, r4, #0x80000000 @ Clear ASEDIS bit
mcr 15, 0, r4, cr1, cr0, 2 @ Write CPACR
isb
mov r4, #0
mcr 15, 0, r4, cr7, cr5, 4 @ Flush prefetch buffer
@ Read CPACR back to see whether the access bits written above stuck.
mrc 15, 0, r4, cr1, cr0, 2 @ Read CPACR
ubfx r4, r4, #20, #4 @ Extract bits [23:20]
cmp r4, #0xf @ If not all set then the CPU does not
itt eq @ have FP or Advanced SIMD.
moveq r4, #0x40000000 @ Enable FP and Advanced SIMD
mcreq 10, 7, r4, cr8, cr0, 0 @ vmsr fpexc, r4
@ This label is not referenced anywhere in the visible source.
skip_vfp_enable:
bl __enable_caches @ Turn caches on
bx r10 @ Return to CRT startup routine
@ Cache policy hooks.
@ __enable_caches and __reset_caches call these to ask whether the
@ instruction cache / data cache should be switched on.  Both entry points
@ share one body that answers yes: r0 = 1 on return.  The flags are left
@ untouched, which __reset_caches relies on.
init_cpu_client_enable_icache:
init_cpu_client_enable_dcache:
	mov	r0, #0x1			@ Non-zero means: enable this cache
	bx	lr
@ Literal words read by _rdimon_hw_init_hook through adr + ldr.
@ Address the vector code gets copied to.
vector_base_addr:
.word _rdimon_vector_base
@ Weak reference, so the symbol may be left undefined at link time
@ (presumably it then resolves to 0, matching the default of vectors at
@ address 0 described in the file header - confirm with the linker in use).
.weak _rdimon_vector_base
@ Address of the first-level translation table loaded into TTBR0.
page_table_addr:
.word page_tables
@ Vector code - must be PIC and in ARM state.
@
@ Everything between VectorCode and VectorCode_Limit is copied word by word
@ to _rdimon_vector_base by _rdimon_hw_init_hook.  Inside this range only
@ PC-relative references to labels that are also inside the range are
@ used; the single absolute address (vector_common_adr) is how the copied
@ code gets back to the handler that stays in the image.

@ vector_stub LABEL, INDEX
@ Emits the handler for one vector: point sp at the private vector stack,
@ save r0-r12 and lr there, record the vector number in r4, and join the
@ common tail.
	.macro	vector_stub label, index
\label\():
	adr	sp, vector_sp_base
	push	{r0-r12, lr}
	mov	r4, #\index
	b	vector_common
	.endm

@ The eight architectural vector slots, one branch each.
VectorCode:
	b	vector_reset
	b	vector_undef
	b	vector_swi
	b	vector_prefetch
	b	vector_dataabt
	b	vector_reserved
	b	vector_irq
	b	vector_fiq

@ One stub per vector; the index is the position of the vector in the table
@ above and also selects the name printed by vector_common_2.
	vector_stub	vector_reset, 0
	vector_stub	vector_undef, 1
	vector_stub	vector_swi, 2
	vector_stub	vector_prefetch, 3
	vector_stub	vector_dataabt, 4
	vector_stub	vector_reserved, 5
	vector_stub	vector_irq, 6
	vector_stub	vector_fiq, 7

@ Common tail: fetch the absolute address of vector_common_2 from the
@ literal below and jump to it, leaving the relocated copy.
vector_common:
	adr	r1, vector_common_adr
	ldr	r1, [r1]
	bx	r1

vector_common_adr:
	.word	vector_common_2		@ Common handling code

@ Vector stack: 32 zero-filled words, growing down from vector_sp_base.
	.p2align 3			@ Align to an 8 byte boundary to
					@ keep ABI compatibility
	.fill	32, 4, 0		@ 32-entry stack is enough for vector
					@ handlers.
vector_sp_base:
VectorCode_Limit:
@ End of PIC code for vectors
@ Common handling of vectors.
@
@ Reached from the relocated vector stubs with:
@   r4 = vector number (0-7)
@   sp = vector stack holding r0-r12 and lr (14 words)
@ Pushes APSR and SPSR on top of those, prints the vector name and the 16
@ saved words through semihosting, then issues semihosting operation 24
@ with reason code 0x20000 + vector number.  Nothing follows the final svc
@ in this routine; it is not written to return.
	.type	vector_common_2, %function
vector_common_2:
	mrs	r2, SPSR
	mrs	r1, APSR
	push	{r1, r2}		@ Save PSRs (APSR at the lower address)

	@ Report which vector was hit.
	bl	out_nl
	adr	r0, which_vector
	bl	out_string
	adr	r0, vector_names
	mov	r1, #11			@ Size of one vector_names entry
	mla	r0, r4, r1, r0		@ r0 = vector_names + 11 * vector
	bl	out_string
	bl	out_nl

	@ Dump the 16 saved words, each prefixed with its name.
	@ r6 = current name, r7 = word index into the stack frame.
	adrl	r6, register_names
	mov	r7, #0
dump_next_reg:
	mov	r0, r6
	add	r6, r6, #6		@ Advance to the next 6-byte name
	bl	out_string
	ldr	r0, [sp, r7, lsl #2]
	bl	out_word
	bl	out_nl
	add	r7, r7, #1
	cmp	r7, #16
	blt	dump_next_reg

	adr	r0, end
	bl	out_string

	@ And exit
	orr	r1, r4, #0x20000	@ r1 = 0x20000 + vector number
	mov	r0, #0x18		@ Semihosting operation 24
	svc	0x00123456
@ out_string
@ Prints the NUL-terminated string whose address is in r0, using
@ semihosting operation 4 (argument pointer passed in r1).
@ Writes r0 and r1; the svc may also change r0.
out_string:
	push	{lr}
	mov	r1, r0			@ r1 = string address
	mov	r0, #0x4		@ r0 = operation number
	svc	0x00123456
	pop	{pc}
@ out_nl
@ Prints a line feed: loads character 10 and drops straight into out_char.
out_nl:
	mov	r0, #0xa
	@ Fallthrough

@ out_char
@ Prints the character held in the low byte of r0 using semihosting
@ operation 3.  The operation takes a pointer, so the byte is parked in a
@ temporary stack slot and r1 is pointed at it.
@ Writes r0 and r1.
out_char:
	push	{lr}
	strb	r0, [sp, #-4]!		@ Allocate a word and store the byte
	mov	r1, sp			@ r1 = address of the character
	mov	r0, #0x3		@ r0 = operation number
	svc	0x00123456
	add	sp, sp, #4		@ Release the temporary slot
	pop	{pc}
@ out_word
@ Prints r0 as eight lower-case hexadecimal digits, most significant
@ nibble first, one out_char call per digit.
@ r4-r6 are saved and restored; r0 and r1 are overwritten.
@   r4 = value being printed
@   r5 = shift amount of the current nibble (28, 24, ... 0)
@   r6 = base of the hexchars lookup table
out_word:
	push	{r4, r5, r6, lr}
	mov	r5, #28
	mov	r4, r0
	adr	r6, hexchars
next_nibble:
	lsr	r0, r4, r5
	and	r0, r0, #0xf
	ldrb	r0, [r6, r0]		@ Nibble -> ASCII digit
	bl	out_char
	subs	r5, r5, #4
	bpl	next_nibble		@ Stop once the shift goes negative
	pop	{r4, r5, r6, pc}
@ Lookup table used by out_word: nibble value -> ASCII hex digit.
hexchars:
	.ascii	"0123456789abcdef"

@ Fixed message fragments printed by vector_common_2.
which_vector:
	.asciz	"Hit vector:"
end:
	.asciz	"End.\n"

@ Vector names, indexed by vector number.
@ vector_common_2 computes vector_names + 11 * index, so every entry has to
@ occupy exactly 11 bytes: 10 characters (space padded) plus the NUL added
@ by .asciz.  Shorter entries would make the index land inside a
@ neighbouring string.
vector_names:
	.asciz	"reset     "
	.asciz	"undef     "
	.asciz	"swi       "
	.asciz	"prefetch  "
	.asciz	"data abort"
	.asciz	"reserved  "
	.asciz	"irq       "
	.asciz	"fiq       "

@ Names for the 16 words dumped by vector_common_2, in stack order: the two
@ PSRs followed by r0-r12 and r14.
@ The dump loop steps through this table 6 bytes at a time, so every entry
@ has to occupy exactly 6 bytes: 5 characters (space padded) plus the NUL.
register_names:
	.asciz	"apsr "
	.asciz	"spsr "
	.asciz	"r0   "
	.asciz	"r1   "
	.asciz	"r2   "
	.asciz	"r3   "
	.asciz	"r4   "
	.asciz	"r5   "
	.asciz	"r6   "
	.asciz	"r7   "
	.asciz	"r8   "
	.asciz	"r9   "
	.asciz	"r10  "
	.asciz	"r11  "
	.asciz	"r12  "
	.asciz	"r14  "

@ Re-align before the code that follows the string data.
	.p2align 3
@ __enable_caches
@ Invalidates the unified TLB and the branch predictor, switches branch
@ prediction on, then sets the I-cache and D-cache enable bits in SCTLR
@ for whichever of the two init_cpu_client_enable_* hooks returns non-zero.
@ Writes r0, r4, r5 and lr; returns through the copy of lr kept in r5.
__enable_caches:
	mov	r5, lr				@ BL below overwrites lr

	mov	r0, #0
	mcr	p15, 0, r0, c8, c7, 0		@ Invalidate all unified-TLB
	mov	r0, #0
	mcr	p15, 0, r0, c7, c5, 6		@ Invalidate branch predictor

	mrc	p15, 0, r4, c1, c0, 0		@ Read SCTLR
	orr	r4, r4, #(1 << 11)		@ Enable branch predictor
	mcr	p15, 0, r4, c1, c0, 0		@ Write SCTLR

	@ Collect the cache enable bits in r4 and commit them in one write.
	mrc	p15, 0, r4, c1, c0, 0		@ Read SCTLR
	bl	init_cpu_client_enable_icache
	cmp	r0, #0
	it	ne
	orrne	r4, r4, #(1 << 12)		@ Enable I-Cache
	bl	init_cpu_client_enable_dcache
	cmp	r0, #0
	it	ne
	orrne	r4, r4, #(1 << 2)		@ Enable D-Cache
	mcr	p15, 0, r4, c1, c0, 0		@ Write SCTLR

	bx	r5				@ Return
@ __reset_caches
@ Invalidates the branch predictor and (when CLIDR reports one at level 1)
@ the instruction cache, then walks every data or unified cache level up to
@ the count held in CLIDR[26:24] by set/way - clean+invalidate when the
@ D-cache was enabled on entry, invalidate only otherwise - and finally
@ clears the D-cache enable bit in SCTLR.
@ The return address is kept in ip because lr is overwritten by the BLs
@ below and is then reused as the cache level counter.
@ Registers written: r0-r7, ip, lr.
__reset_caches:
mov ip, lr @ Save LR
mov r0, #0
mcr 15, 0, r0, cr7, cr5, 6 @ Invalidate branch predictor
mrc 15, 0, r6, cr1, cr0, 0 @ Read SCTLR; r6 keeps the entry value for the D-cache test below
mrc 15, 0, r0, cr1, cr0, 0 @ Read SCTLR again into a scratch register
bic r0, r0, #0x1000 @ Disable I cache
mcr 15, 0, r0, cr1, cr0, 0 @ Write SCTLR
mrc 15, 1, r0, cr0, cr0, 1 @ Read CLIDR
tst r0, #3 @ Harvard Cache?
mov r0, #0 @ mov leaves the flags from tst intact
it ne
mcrne 15, 0, r0, cr7, cr5, 0 @ Invalidate Instruction Cache?
mrc 15, 0, r1, cr1, cr0, 0 @ Read SCTLR (again!)
orr r1, r1, #0x800 @ Enable branch predictor
@ If we're not enabling caches we have
@ no more work to do.
@ The hooks defined in this file only write r0, so the flags set by the
@ cmp below are still valid after the conditional call to the D-cache
@ hook.  Net effect: Finished1 is taken early only when both hooks return 0.
bl init_cpu_client_enable_icache
cmp r0, #0
it ne
orrne r1, r1, #0x1000 @ Enable I-Cache now -
@ We actually only do this if we have a
@ Harvard style cache.
it eq
bleq init_cpu_client_enable_dcache
itt eq
cmpeq r0, #0
beq Finished1
mcr 15, 0, r1, cr1, cr0, 0 @ Write SCTLR (turn on Branch predictor & I-cache)
mrc 15, 1, r0, cr0, cr0, 1 @ Read CLIDR
ands r3, r0, #0x7000000 @ Isolate CLIDR[26:24]; Z set when there are no levels
lsr r3, r3, #23 @ Total cache levels << 1
beq Finished1 @ Uses the flags from the ands (lsr does not set flags)
mov lr, #0 @ lr = cache level << 1
@ Outer loop: one iteration per cache level.
Loop11:
mrc 15, 1, r0, cr0, cr0, 1 @ Read CLIDR
add r2, lr, lr, lsr #1 @ r2 = 3 * level: bit offset of this level's Ctype field
lsr r1, r0, r2 @ Bottom 3-bits are Ctype for this level
and r1, r1, #7 @ Get those 3-bits alone
cmp r1, #2
blt Skip1 @ No cache or only I-Cache at this level
mcr 15, 2, lr, cr0, cr0, 0 @ Write CSSELR
mov r1, #0
isb sy @ Make the CSSELR write visible before reading CCSIDR
mrc 15, 1, r1, cr0, cr0, 0 @ Read CCSIDR
and r2, r1, #7 @ Extract line length field
add r2, r2, #4 @ Add 4 for the line length offset (log2 16 bytes)
movw r0, #0x3ff
ands r0, r0, r1, lsr #3 @ r0 is the max number on the way size
clz r4, r0 @ r4 is the bit position of the way size increment
movw r5, #0x7fff
ands r5, r5, r1, lsr #13 @ r5 is the max number of the index size (right aligned)
@ Middle loop: sets, counting r5 down to 0.
Loop21:
mov r7, r0 @ r7 working copy of max way size
@ Inner loop: ways, counting r7 down to 0.
Loop31:
orr r1, lr, r7, lsl r4 @ factor in way number and cache number
orr r1, r1, r5, lsl r2 @ factor in set number
tst r6, #4 @ D-Cache on?
ite eq
mcreq 15, 0, r1, cr7, cr6, 2 @ No - invalidate by set/way
mcrne 15, 0, r1, cr7, cr14, 2 @ yes - clean + invalidate by set/way
subs r7, r7, #1 @ Decrement way number
bge Loop31
subs r5, r5, #1 @ Decrement set number
bge Loop21
Skip1:
add lr, lr, #2 @ increment cache number
cmp r3, lr
bgt Loop11
Finished1:
@ Now we know the caches are clean we can:
mrc 15, 0, r4, cr1, cr0, 0 @ Read SCTLR
bic r4, r4, #4 @ Disable D-Cache
mcr 15, 0, r4, cr1, cr0, 0 @ Write SCTLR
mov r4, #0
mcr 15, 0, r4, cr7, cr5, 6 @ Write BPIALL
bx ip @ Return
@ is_a15_a7
@ Returns with Z set when the core is a Cortex-A15 or a Cortex-A7, and
@ with Z clear otherwise.  The decision is made on MIDR with the variant
@ and revision fields masked out (mask 0xff0ffff0):
@   0x410fc0f0 -> Cortex-A15
@   0x410fc070 -> Cortex-A7
@ Writes r8 and r9; the other flags are corrupted.
is_a15_a7:
	mrc	p15, 0, r8, c0, c0, 0		@ r8 = MIDR
	movw	r9, #0xfff0			@ r9 = 0xff0ffff0
	movt	r9, #0xff0f
	and	r8, r8, r9			@ Drop variant and revision

	movw	r9, #0xc0f0			@ r9 = 0x410fc0f0
	movt	r9, #0x410f
	cmp	r8, r9				@ Cortex-A15?

	movw	r9, #0xc070			@ r9 = 0x410fc070
	movt	r9, #0x410f
	it	ne
	cmpne	r8, r9				@ If not, Cortex-A7?
	bx	lr
@ First-level translation table: 4096 section descriptors, one per MiB,
@ mapping the whole 4 GiB address space onto itself.  Every entry is the
@ section base address plus the attribute bits 0x1c0e:
@ Descriptor type: Section
@ Bufferable: True
@ Cacheable: True
@ Execute Never: False
@ Domain: 0
@ Impl. Defined: 0
@ Access: 0/11 Full access
@ TEX: 001
@ Shareable: False
@ Not Global: False
@ Supersection: False
@
@ The helper macros below each emit four copies of the next smaller one,
@ so the name of a macro states how much address space it covers.
#define MAP_1M(DESC) \
	.word DESC;
#define MAP_4M(DESC) \
	MAP_1M(DESC) MAP_1M(DESC + 0x100000) MAP_1M(DESC + 0x200000) MAP_1M(DESC + 0x300000)
#define MAP_16M(DESC) \
	MAP_4M(DESC) MAP_4M(DESC + 0x400000) MAP_4M(DESC + 0x800000) MAP_4M(DESC + 0xc00000)
#define MAP_64M(DESC) \
	MAP_16M(DESC) MAP_16M(DESC + 0x1000000) MAP_16M(DESC + 0x2000000) MAP_16M(DESC + 0x3000000)
#define MAP_256M(DESC) \
	MAP_64M(DESC) MAP_64M(DESC + 0x4000000) MAP_64M(DESC + 0x8000000) MAP_64M(DESC + 0xc000000)
#define MAP_1G(DESC) \
	MAP_256M(DESC) MAP_256M(DESC + 0x10000000) MAP_256M(DESC + 0x20000000) MAP_256M(DESC + 0x30000000)
#define MAP_4G(DESC) \
	MAP_1G(DESC) MAP_1G(DESC + 0x40000000) MAP_1G(DESC + 0x80000000) MAP_1G(DESC + 0xc0000000)

	.section page_tables_section, "aw", %progbits
	.p2align 14			@ 16 KiB alignment for the table base
page_tables:
	MAP_4G(0x1c0e)
#endif //#if defined(__ARM_ARCH_7A__) || __ARM_ARCH_PROFILE == 'A'