/* Copyright (c) 2005-2013 ARM Ltd.  All rights reserved.

 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions
 are met:
 1. Redistributions of source code must retain the above copyright
    notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
    notice, this list of conditions and the following disclaimer in the
    documentation and/or other materials provided with the distribution.
 3. The name of the company may not be used to endorse or promote
    products derived from this software without specific prior written
    permission.

 THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
 WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

/* This file gives a basic initialisation of a Cortex-A series core.  It is
   the bare minimum required to get a Cortex-A core running with a
   semihosting interface.

   It sets up a basic 1:1 physical address to virtual address mapping;
   turns the MMU on; enables branch prediction; activates any integrated
   caches; enables the Advanced SIMD and VFP co-processors; and installs
   basic exception handlers.

   It does not handle peripherals, and assumes all memory is Normal.

   It does not change processor state from the startup privilege and security
   level.

   By default it assumes exception vectors are located from address 0.
   However, if this is not true they can be moved by defining the
   _rdimon_vector_base symbol.  For example if you have HIVECS enabled you
   may pass --defsym _rdimon_vector_base=0xffff0000 on the linker command
   line.  */

    .syntax	unified
    .arch	armv7-a

#if defined(__thumb__)
    .thumb
#endif

    @ CPU Initialisation
    .globl	_rdimon_hw_init_hook
    .type	_rdimon_hw_init_hook, %function

_rdimon_hw_init_hook:
    @ One-time core set-up.  Entered with the return address in LR, which is
    @ parked in r10 because no stack is used here.  Clobbers r0-r10, ip, lr
    @ and the flags, either directly or through is_a15_a7, __reset_caches
    @ and __enable_caches.

    @ Only run the code on CPU 0 - otherwise spin
    mrc         15, 0, r4, cr0, cr0, 5  @ Read MPIDR
    ands        r4, r4, #15             @ Z is set only if bits [3:0] are zero
spin:
    bne spin                            @ The flags never change here, so any
                                        @ other CPU stays parked for good

    mov         r10, lr                 @ Save LR for final return

#ifdef __ARMEB__
    @ Setup for Big Endian
    setend      be
    mrc         15, 0, r4, cr1, cr0, 0  @ Read SCTLR
    orr         r4, r4, #(1<<25)        @ Switch to Big Endian (Set SCTLR.EE)
    mcr         15, 0, r4, cr1, cr0, 0  @ Write SCTLR
#else
    @ Setup for Little Endian
    setend      le
    mrc         15, 0, r4, cr1, cr0, 0  @ Read SCTLR
    bic         r4, r4, #(1<<25)        @ Switch to LE (unset SCTLR.EE)
    mcr         15, 0, r4, cr1, cr0, 0  @ Write SCTLR
#endif

    bl          is_a15_a7               @ Z set on Cortex-A15/A7; uses r8, r9

    @ For Cortex-A15 and Cortex-A7 only:
    @ Write zero into the ACTLR to turn everything on.
    @ The IT prefix emits no code for ARM but is needed for the conditional
    @ instructions to assemble as Thumb, matching the IT usage in the rest
    @ of this file.
    itt         eq
    moveq       r4, #0
    mcreq       15, 0, r4, c1, c0, 1
    isb                                 @ Leaves the flags alone, so EQ still
                                        @ selects the A15/A7 path below

    @ For Cortex-A15 and Cortex-A7 only:
    @ Set ACTLR:SMP bit before enabling the caches and MMU,
    @ or performing any cache and TLB maintenance operations.
    ittt        eq
    mrceq       15, 0, r4, c1, c0, 1    @ Read ACTLR
    orreq       r4, r4, #(1<<6)         @ Enable ACTLR:SMP
    mcreq       15, 0, r4, c1, c0, 1    @ Write ACTLR
    isb

    @ Setup for exceptions being taken to Thumb/ARM state
    mrc         15, 0, r4, cr1, cr0, 0  @ Read SCTLR
#if defined(__thumb__)
    orr         r4, r4, #(1 << 30)      @ Enable SCTLR.TE
#else
    bic         r4, r4, #(1 << 30)      @ Disable SCTLR.TE
#endif
    mcr         15, 0, r4, cr1, cr0, 0  @ Write SCTLR

    bl          __reset_caches          @ Clobbers r0-r7, ip and lr

    mrc         15, 0, r4, cr1, cr0, 0  @ Read SCTLR
    orr         r4, r4, #(1<<22)        @ Enable unaligned mode
    bic         r4, r4, #2              @ Disable alignment faults
    bic         r4, r4, #1              @ Disable MMU
    mcr         15, 0, r4, cr1, cr0, 0  @ Write SCTLR

    mov         r4, #0
    mcr         15, 0, r4, cr8, cr7, 0  @ Write TLBIALL - Invalidate unified
                                        @ TLB
    @ Setup MMU Primary table P=V mapping.
    mvn         r4, #0                  @ r4 = 0xffffffff
    mcr         15, 0, r4, cr3, cr0, 0  @ Write DACR

    mov         r4, #0                  @ Always use TTBR0, no LPAE
    mcr         15, 0, r4, cr2, cr0, 2  @ Write TTBCR
    adr         r4, page_table_addr     @ Literal holding the table address
    ldr         r4, [r4]                @ r4 = page_tables
    add         r4, r4, #1              @ Page tables inner cacheable

    mcr         15, 0, r4, cr2, cr0, 0  @ Write TTBR0

    mov         r0, #34 @ 0x22          @ TR0 and TR1 - normal memory
    orr         r0, r0, #(1 << 19)      @ Shareable
    mcr         15, 0, r0, cr10, cr2, 0 @ Write PRRR
    movw        r0, #0x33
    movt        r0, #0x33
    mcr         15, 0, r0, cr10, cr2, 1 @ Write NMRR
    mrc         15, 0, r0, cr1, cr0, 0  @ Read SCTLR
    bic         r0, r0, #(1 << 28)      @ Clear TRE bit
    mcr         15, 0, r0, cr1, cr0, 0  @ Write SCTLR

    @ Now install the vector code - we move the Vector code from where it is
    @ in the image to be based at _rdimon_vector_base.  We have to do this copy
    @ as the code is all PC-relative.  We actually cheat and do a BX <reg> so
    @ that we are at a known address relatively quickly and have to move as
    @ little code as possible.
    mov         r7, #(VectorCode_Limit - VectorCode)    @ Bytes to copy
    adr         r5, VectorCode          @ Source: the copy inside this image
    adr         r6, vector_base_addr    @ Load the base for vectors
    ldr         r6, [r6]                @ Destination: _rdimon_vector_base

copy_loop:                              @ Do the copy, one word per pass
    ldr         r4, [r5], #4
    str         r4, [r6], #4
    subs        r7, r7, #4
    bne         copy_loop               @ Ends only when the count hits zero,
                                        @ so the size must be a multiple of 4

    mrc         15, 0, r4, cr1, cr0, 0  @ Read SCTLR
    bic         r4, r4, #0x1000         @ Disable I Cache
    bic         r4, r4, #4              @ Disable D Cache
    orr         r4, r4, #1              @ Enable MMU
    bic         r4, r4, #(1 << 28)      @ Clear TRE bit
    mcr         15, 0, r4, cr1, cr0, 0  @ Write SCTLR
    mrc         15, 0, r4, cr1, cr0, 2  @ Read CPACR
    orr         r4, r4, #0x00f00000     @ Turn on VFP Co-procs
    bic         r4, r4, #0x80000000     @ Clear ASEDIS bit
    mcr         15, 0, r4, cr1, cr0, 2  @ Write CPACR
    isb
    mov         r4, #0
    mcr         15, 0, r4, cr7, cr5, 4  @ Flush prefetch buffer
    mrc         15, 0, r4, cr1, cr0, 2  @ Read CPACR back
    ubfx        r4, r4, #20, #4         @ Extract bits [23:20]
    cmp         r4, #0xf                @ If not all set then the CPU does not
    itt         eq                      @ have FP or Advanced SIMD.
    moveq       r4, #0x40000000         @ Enable FP and Advanced SIMD
    mcreq       10, 7, r4, cr8, cr0, 0  @ vmsr  fpexc, r4
skip_vfp_enable:
    bl          __enable_caches         @ Turn caches on
    bx          r10                     @ Return to CRT startup routine

    @ Cache policy hooks.  These let a port be more precise about which
    @ caches it wants: a non-zero r0 asks for the cache, zero declines it.
    @ Both labels share this body, which always answers 1.  Only r0 is
    @ written and the condition flags are left untouched.
init_cpu_client_enable_icache:
init_cpu_client_enable_dcache:
    movw        r0, #1                  @ r0 = 1: yes, enable this cache
    bx          lr

vector_base_addr:
    @ Literal words read PC-relatively (adr + ldr) by _rdimon_hw_init_hook.
    @ _rdimon_vector_base is weak so that it may be left undefined, or set
    @ with --defsym on the linker command line.
    .weak       _rdimon_vector_base
    .long       _rdimon_vector_base     @ Where the vector code is copied to
page_table_addr:
    .long       page_tables             @ Base of the translation table

    @ Vector code - must be PIC and in ARM state.
    @ Everything from VectorCode up to VectorCode_Limit is copied to
    @ _rdimon_vector_base by _rdimon_hw_init_hook, so it may only refer to
    @ labels inside this range PC-relatively.  The one absolute reference is
    @ the word at vector_common_adr, used to leave the copied region.

    @ One exception stub: switch to the private vector stack, save r0-r12
    @ and lr, put the vector number in r4 and join the shared tail.
    .macro      VECTOR_STUB label, id
\label:
    adr         sp, vector_sp_base
    push        {r0-r12, lr}
    mov         r4, #\id
    b           vector_common
    .endm

VectorCode:
    b           vector_reset
    b           vector_undef
    b           vector_swi
    b           vector_prefetch
    b           vector_dataabt
    b           vector_reserved
    b           vector_irq
    b           vector_fiq

    VECTOR_STUB vector_reset,    0
    VECTOR_STUB vector_undef,    1
    VECTOR_STUB vector_swi,      2
    VECTOR_STUB vector_prefetch, 3
    VECTOR_STUB vector_dataabt,  4
    VECTOR_STUB vector_reserved, 5
    VECTOR_STUB vector_irq,      6
    VECTOR_STUB vector_fiq,      7

vector_common:
    adr         r1, vector_common_adr   @ Locate the literal in the copy
    ldr         r1, [r1]                @ r1 = link-time address of the handler
    bx          r1                      @ Leave the copied region
vector_common_adr:
   .word        vector_common_2         @ Common handling code

                                        @ Vector stack
   .align       3                       @ Align to an 8 byte boundary to keep
                                        @ ABI compatibility
   .fill        32, 4, 0                @ 32-entry stack is enough for vector
                                        @ handlers.
vector_sp_base:
VectorCode_Limit:
    @ End of PIC code for vectors
    @ End of PIC code for vectors

    @ Common Handling of vectors
    @ Entered from vector_common with r4 = vector number (0-7) and with
    @ r0-r12 and lr already pushed on the vector stack.  Prints the vector
    @ name and the saved registers through the semihosting calls, then asks
    @ the host to exit.  Does not return.
    .type	vector_common_2, %function
vector_common_2:
    mrs         r1, APSR
    mrs         r2, SPSR
    push        {r1, r2}                @ Save PSRs - the stack now holds 16
                                        @ words: apsr, spsr, r0-r12, lr

    @ Output the vector we have caught
    bl          out_nl
    adr         r0, which_vector
    bl          out_string
    adr         r0, vector_names
    mov         r1, #11                 @ Each vector_names entry is 11 bytes
    mla         r0, r4, r1, r0          @ r0 = vector_names + r4 * 11
    bl          out_string
    bl          out_nl

    @ Dump the registers
    adrl        r6, register_names      @ r6 walks the 6-byte name entries
    mov         r7, #0                  @ r7 = index of the saved word
dump_r_loop:
    mov         r0, r6
    bl          out_string
    add         r6, r6, #6
    ldr         r0, [sp, r7, lsl #2]    @ Saved word r7, counted from sp
    bl          out_word
    bl          out_nl
    add         r7, r7, #1
    cmp         r7, #16
    blt         dump_r_loop
    adr         r0, end
    bl          out_string

    @ And exit
    mov         r0, #24                 @ Semihosting operation 24 (exit)
    orr         r1, r4, #0x20000        @ Reason code: 0x20000 + vector number
    svc         0x00123456

    @ The exit request should not come back.  If it does, park here rather
    @ than running on into out_string with a stale LR.
1:
    b           1b

    @ Output the string in r0
    @ r0 points at a NUL-terminated string, handed to semihosting
    @ operation 4 with the pointer in r1.  r0 and r1 are overwritten.
out_string:
    str         lr, [sp, #-4]!          @ Keep the return address
    mov         r1, r0                  @ r1 = string pointer
    mov         r0, #0x04               @ r0 = operation number
    svc         0x00123456
    ldr         pc, [sp], #4            @ Return

    @ Output a New-line
out_nl:
    mov         r0, #0x0a               @ Line feed
    @ Fallthrough

    @ Output the character in r0
    @ The character is stored in a temporary stack slot and that slot is
    @ passed in r1 to semihosting operation 3.  r0 and r1 are overwritten.
out_char:
    str         lr, [sp, #-4]!          @ Keep the return address
    sub         sp, sp, #4              @ Make room for the character
    strb        r0, [sp]
    mov         r1, sp                  @ r1 = address of the character
    mov         r0, #0x03               @ r0 = operation number
    svc         0x00123456
    add         sp, sp, #4              @ Drop the character slot
    ldr         pc, [sp], #4            @ Return

    @ Output the value of r0 as a hex-word
    @ Emits eight lower-case hex digits, most significant nibble first,
    @ through out_char.  r4-r6 are preserved; r0 and r1 are overwritten.
out_word:
    stmdb       sp!, {r4-r6, lr}
    adr         r6, hexchars            @ r6 = digit lookup table
    mov         r5, #28                 @ r5 = shift for the current nibble
    mov         r4, r0                  @ r4 = value being printed
word_loop:
    mov         r0, r4, lsr r5          @ Bring the nibble down to bits [3:0]
    and         r0, r0, #0xf
    ldrb        r0, [r6, r0]            @ Translate it to its ASCII digit
    bl          out_char
    subs        r5, r5, #4              @ Step to the next lower nibble
    bpl         word_loop               @ Goes negative after the shift of 0
    ldmia       sp!, {r4-r6, pc}

hexchars:
    @ Digit table indexed by nibble value in out_word; no terminator needed.
    .ascii	"0123456789abcdef"

    @ NUL-terminated messages printed by vector_common_2 via out_string.
which_vector:
    .string	"Hit vector:"
end:
    .string	"End.\n"

vector_names:
    @ One name per vector number, in vector order.  Every entry is 10
    @ characters plus the NUL, because vector_common_2 indexes this table
    @ with a fixed stride of 11 bytes.
    .string	"reset     "
    .string	"undef     "
    .string	"swi       "
    .string	"prefetch  "
    .string	"data abort"
    .string	"reserved  "
    .string	"irq       "
    .string	"fiq       "

register_names:
    @ Labels for the 16 words dumped by vector_common_2, in the order they
    @ sit on the vector stack.  Every entry is 5 characters plus the NUL,
    @ because the dump loop steps through this table 6 bytes at a time.
    .string	"apsr "
    .string	"spsr "
    .string	"r0   "
    .string	"r1   "
    .string	"r2   "
    .string	"r3   "
    .string	"r4   "
    .string	"r5   "
    .string	"r6   "
    .string	"r7   "
    .string	"r8   "
    .string	"r9   "
    .string	"r10  "
    .string	"r11  "
    .string	"r12  "
    .string	"r14  "

    .align

    @ Enable the caches
    @ Invalidates the TLB and the branch predictor, switches branch
    @ prediction on, then enables the I-cache and D-cache if the matching
    @ init_cpu_client_enable_* hook returns non-zero.
    @ Clobbers r0, r4, r5, lr and the flags; returns through r5.
__enable_caches:
    mov         r5, lr                  @ LR is lost to the BLs further down
    mov         r0, #0                  @ Zero operand for both invalidates
    mcr         p15, 0, r0, c8, c7, 0   @ Invalidate all unified-TLB
    mcr         p15, 0, r0, c7, c5, 6   @ Invalidate branch predictor
    mrc         p15, 0, r4, c1, c0, 0   @ Read SCTLR
    orr         r4, r4, #(1 << 11)      @ Enable branch predictor
    mcr         p15, 0, r4, c1, c0, 0   @ Set SCTLR
    mrc         p15, 0, r4, c1, c0, 0   @ Read SCTLR again as the working copy
    bl          init_cpu_client_enable_icache
    cmp         r0, #0
    it          ne
    orrne       r4, r4, #(1 << 12)      @ Enable I-Cache
    bl          init_cpu_client_enable_dcache
    cmp         r0, #0
    it          ne
    orrne       r4, r4, #(1 << 2)       @ Enable D-Cache
    mcr         p15, 0, r4, c1, c0, 0   @ Write SCTLR with the chosen caches
    bx          r5                      @ Return

__reset_caches:
    @ Puts the caches into a known state: invalidates the branch predictor
    @ and the I-cache, walks every data/unified cache level by set/way
    @ (clean + invalidate if the D-cache was on at entry, plain invalidate
    @ otherwise) and finally clears the D-cache enable bit.
    @ Clobbers r0-r7, ip, lr and the flags; returns through ip.
    mov         ip, lr                  @ Save LR
    mov         r0, #0
    mcr         15, 0, r0, cr7, cr5, 6  @ Invalidate branch predictor
    mrc         15, 0, r6, cr1, cr0, 0  @ Read SCTLR - kept in r6 so the loop
                                        @ below can tell if the D-cache was on
    mrc         15, 0, r0, cr1, cr0, 0  @ Read SCTLR!
    bic         r0, r0, #0x1000         @ Disable I cache
    mcr         15, 0, r0, cr1, cr0, 0  @ Write SCTLR
    mrc         15, 1, r0, cr0, cr0, 1  @ Read CLIDR
    tst         r0, #3                  @ Harvard Cache?
    mov         r0, #0
    it		ne
    mcrne       15, 0, r0, cr7, cr5, 0  @ Invalidate Instruction Cache?

    mrc         15, 0, r1, cr1, cr0, 0  @ Read SCTLR (again!)
    orr         r1, r1, #0x800          @ Enable branch predictor

                                        @ If we're not enabling caches we have
                                        @ no more work to do.
    bl          init_cpu_client_enable_icache
    cmp         r0, #0
    it		ne
    orrne       r1, r1, #0x1000         @ Enable I-Cache now -
                                        @ We actually only do this if we have a
                                        @ Harvard style cache.
    @ Only when the I-cache was declined (EQ) is the D-cache hook asked.
    @ The hook leaves the flags alone, so EQ is still valid after the call.
    @ Each conditional instruction has its own IT prefix so that this also
    @ assembles as Thumb, like the other IT blocks in this file; no code is
    @ emitted for ARM.
    it		eq
    bleq        init_cpu_client_enable_dcache
    it		eq
    cmpeq       r0, #0
    beq         Finished1               @ Neither cache wanted

    mcr         15, 0, r1, cr1, cr0, 0  @ Write SCTLR (turn on Branch predictor & I-cache)

    mrc         15, 1, r0, cr0, cr0, 1  @ Read CLIDR
    ands        r3, r0, #0x7000000
    lsr         r3, r3, #23             @ Total cache levels << 1
    beq         Finished1               @ Uses the flags from the ANDS above

    mov         lr, #0                  @ lr = cache level << 1
Loop11:
    mrc         15, 1, r0, cr0, cr0, 1  @ Read CLIDR
    add         r2, lr, lr, lsr #1      @ r2 holds cache 'set' position
    lsr         r1, r0, r2              @ Bottom 3-bits are Ctype for this level
    and         r1, r1, #7              @ Get those 3-bits alone
    cmp         r1, #2
    blt         Skip1                   @ No cache or only I-Cache at this level
    mcr         15, 2, lr, cr0, cr0, 0  @ Write CSSELR
    mov         r1, #0
    isb         sy                      @ CSSELR must take effect before the
                                        @ CCSIDR read
    mrc         15, 1, r1, cr0, cr0, 0  @ Read CCSIDR
    and         r2, r1, #7              @ Extract line length field
    add         r2, r2, #4              @ Add 4 for the line length offset (log2 16 bytes)
    movw        r0, #0x3ff
    ands        r0, r0, r1, lsr #3      @ r0 is the max number on the way size
    clz         r4, r0                  @ r4 is the bit position of the way size increment
    movw        r5, #0x7fff
    ands        r5, r5, r1, lsr #13     @ r5 is the max number of the index size (right aligned)
Loop21:
    mov r7, r0                          @ r7 working copy of max way size
Loop31:
    orr         r1, lr, r7, lsl r4      @ factor in way number and cache number
    orr         r1, r1, r5, lsl r2      @ factor in set number
    tst         r6, #4                  @ D-Cache on?
    ite         eq
    mcreq       15, 0, r1, cr7, cr6, 2  @ No - invalidate by set/way
    mcrne       15, 0, r1, cr7, cr14, 2 @ yes - clean + invalidate by set/way
    subs        r7, r7, #1              @ Decrement way number
    bge         Loop31
    subs        r5, r5, #1              @ Decrement set number
    bge         Loop21
Skip1:
    add         lr, lr, #2              @ increment cache number
    cmp         r3, lr
    bgt         Loop11
    dsb                                 @ Wait for the set/way operations to
                                        @ complete before SCTLR is changed
Finished1:
    @ Now we know the caches are clean we can:
    mrc         15, 0, r4, cr1, cr0, 0  @ Read SCTLR
    bic         r4, r4, #4              @ Disable D-Cache
    mcr         15, 0, r4, cr1, cr0, 0  @ Write SCTLR
    mov         r4, #0
    mcr         15, 0, r4, cr7, cr5, 6  @ Write BPIALL

    bx          ip                      @ Return

    @ Set Z if this is a Cortex-A15 or Cortex_A7
    @ Other flags corrupted
    @ Clobbers r8 and r9.
is_a15_a7:
    mrc         15, 0, r8, c0, c0, 0    @ Read MIDR
    movw        r9, #0xfff0
    movt        r9, #0xff0f             @ r9 = 0xff0ffff0
    and         r8, r8, r9              @ Drop bits [23:20] and [3:0]
    movw        r9, #0xc0f0
    movt        r9, #0x410f             @ r9 = 0x410fc0f0 (Cortex-A15)
    cmp         r8, r9
    movw        r9, #0xc070
    movt        r9, #0x410f             @ r9 = 0x410fc070 (Cortex-A7)
    @ The IT prefix emits no code for ARM; it is needed for the conditional
    @ compare to assemble as Thumb, like the other IT blocks in this file.
    it		ne
    cmpne       r8, r9                  @ Second chance only if not an A15
    bx          lr

    @ Translation table: 4096 section descriptors, one per 1MB, each
    @ mapping its virtual megabyte onto the same physical megabyte.
    @ Entry N is 0x1c0e + (N << 20).  The low bits 0x1c0e encode:
    @   Descriptor type: Section
    @   Bufferable: True
    @   Cacheable: True
    @   Execute Never: False
    @   Domain: 0
    @   Impl. Defined: 0
    @   Access: 0/11 Full access
    @   TEX: 001
    @   Shareable: False
    @   Not Global: False
    @   Supersection: False
    .section    page_tables_section, "aw", %progbits
    .p2align    14
page_tables:
    .set        .Lsection_index, 0
    .rept       4096
    .word       0x1c0e + (.Lsection_index << 20)
    .set        .Lsection_index, .Lsection_index + 1
    .endr