dnscrypt-proxy/vendor/github.com/aead/chacha20/chacha/chacha_386.s

288 lines
5.8 KiB
ArmAsm

// Copyright (c) 2016 Andreas Auernhammer. All rights reserved.
// Use of this source code is governed by a license that can be
// found in the LICENSE file.
// +build 386,!gccgo,!appengine,!nacl
#include "const.s"
#include "macro.s"
// FINALIZE xors len bytes from src and block using
// the temp. registers t0 and t1 and writes the result
// to dst.
#define FINALIZE(dst, src, block, len, t0, t1) \
XORL t0, t0; \
XORL t1, t1; \
FINALIZE_LOOP:; \
MOVB 0(src), t0; \
MOVB 0(block), t1; \
XORL t0, t1; \
MOVB t1, 0(dst); \
INCL src; \
INCL block; \
INCL dst; \
DECL len; \
JG FINALIZE_LOOP \
// func supportsSSE2() bool
TEXT ·supportsSSE2(SB), NOSPLIT, $0-1
XORL AX, AX
INCL AX
CPUID
SHRL $26, DX
ANDL $1, DX
MOVB DX, ret+0(FP)
RET
// func supportsSSSE3() bool
TEXT ·supportsSSSE3(SB), NOSPLIT, $0-1
XORL AX, AX
INCL AX
CPUID
SHRL $9, CX
ANDL $1, CX
MOVB CX, DX
MOVB DX, ret+0(FP)
RET
#define Dst DI
#define Nonce AX
#define Key BX
#define Rounds DX
// func hChaCha20SSE2(out *[32]byte, nonce *[16]byte, key *[32]byte)
TEXT ·hChaCha20SSE2(SB), 4, $0-12
MOVL out+0(FP), Dst
MOVL nonce+4(FP), Nonce
MOVL key+8(FP), Key
MOVOU ·sigma<>(SB), X0
MOVOU 0*16(Key), X1
MOVOU 1*16(Key), X2
MOVOU 0*16(Nonce), X3
MOVL $20, Rounds
chacha_loop:
CHACHA_QROUND_SSE2(X0, X1, X2, X3, X4)
CHACHA_SHUFFLE_SSE(X1, X2, X3)
CHACHA_QROUND_SSE2(X0, X1, X2, X3, X4)
CHACHA_SHUFFLE_SSE(X3, X2, X1)
SUBL $2, Rounds
JNZ chacha_loop
MOVOU X0, 0*16(Dst)
MOVOU X3, 1*16(Dst)
RET
// func hChaCha20SSSE3(out *[32]byte, nonce *[16]byte, key *[32]byte)
TEXT ·hChaCha20SSSE3(SB), 4, $0-12
MOVL out+0(FP), Dst
MOVL nonce+4(FP), Nonce
MOVL key+8(FP), Key
MOVOU ·sigma<>(SB), X0
MOVOU 0*16(Key), X1
MOVOU 1*16(Key), X2
MOVOU 0*16(Nonce), X3
MOVL $20, Rounds
MOVOU ·rol16<>(SB), X5
MOVOU ·rol8<>(SB), X6
chacha_loop:
CHACHA_QROUND_SSSE3(X0, X1, X2, X3, X4, X5, X6)
CHACHA_SHUFFLE_SSE(X1, X2, X3)
CHACHA_QROUND_SSSE3(X0, X1, X2, X3, X4, X5, X6)
CHACHA_SHUFFLE_SSE(X3, X2, X1)
SUBL $2, Rounds
JNZ chacha_loop
MOVOU X0, 0*16(Dst)
MOVOU X3, 1*16(Dst)
RET
#undef Dst
#undef Nonce
#undef Key
#undef Rounds
#define State AX
#define Dst DI
#define Src SI
#define Len CX
#define Rounds DX
#define Tmp0 BX
#define Tmp1 BP
// func xorKeyStreamSSE2(dst, src []byte, block, state *[64]byte, rounds int) int
TEXT ·xorKeyStreamSSE2(SB), 4, $0-40
MOVL dst_base+0(FP), Dst
MOVL src_base+12(FP), Src
MOVL state+28(FP), State
MOVL rounds+32(FP), Rounds
MOVL src_len+16(FP), Len
MOVOU 0*16(State), X0
MOVOU 1*16(State), X1
MOVOU 2*16(State), X2
MOVOU 3*16(State), X3
TESTL Len, Len
JZ DONE
GENERATE_KEYSTREAM:
MOVO X0, X4
MOVO X1, X5
MOVO X2, X6
MOVO X3, X7
MOVL Rounds, Tmp0
CHACHA_LOOP:
CHACHA_QROUND_SSE2(X4, X5, X6, X7, X0)
CHACHA_SHUFFLE_SSE(X5, X6, X7)
CHACHA_QROUND_SSE2(X4, X5, X6, X7, X0)
CHACHA_SHUFFLE_SSE(X7, X6, X5)
SUBL $2, Tmp0
JA CHACHA_LOOP
MOVOU 0*16(State), X0 // Restore X0 from state
PADDL X0, X4
PADDL X1, X5
PADDL X2, X6
PADDL X3, X7
MOVOU ·one<>(SB), X0
PADDQ X0, X3
CMPL Len, $64
JL BUFFER_KEYSTREAM
XOR_SSE(Dst, Src, 0, X4, X5, X6, X7, X0)
MOVOU 0*16(State), X0 // Restore X0 from state
ADDL $64, Src
ADDL $64, Dst
SUBL $64, Len
JZ DONE
JMP GENERATE_KEYSTREAM // There is at least one more plaintext byte
BUFFER_KEYSTREAM:
MOVL Len, Rounds // Use Rounds as tmp. register for Len - we don't need Rounds anymore
MOVL block+24(FP), State
MOVOU X4, 0(State)
MOVOU X5, 16(State)
MOVOU X6, 32(State)
MOVOU X7, 48(State)
FINALIZE(Dst, Src, State, Rounds, Tmp0, Tmp1)
DONE:
MOVL Len, Tmp0 // Number of bytes written to the keystream buffer - 0 iff Len mod 64 == 0
MOVL Tmp0, ret+36(FP)
MOVL state+28(FP), State
MOVOU X3, 3*16(State)
RET
#undef State
#undef Dst
#undef Src
#undef Len
#undef Rounds
#undef Tmp0
#undef Tmp1
#define Dst DI
#define Src SI
#define Len CX
#define Rounds DX
#define State SP
#define Stack State
#define Tmp0 AX
#define Tmp1 BX
#define Tmp2 BP
// func xorKeyStreamSSSE3(dst, src []byte, block, state *[64]byte, rounds int) int
TEXT ·xorKeyStreamSSSE3(SB), 4, $80-40
MOVL dst_base+0(FP), Dst
MOVL src_base+12(FP), Src
MOVL state+28(FP), Tmp0
MOVL rounds+32(FP), Rounds
MOVL src_len+16(FP), Len
MOVL Stack, Tmp2 // save stack pointer
ADDL $16, Stack // ensure 16 byte stack alignment
ANDL $-16, Stack
MOVOU 0*16(Tmp0), X0
MOVOU 1*16(Tmp0), X1
MOVOU 2*16(Tmp0), X2
MOVOU 3*16(Tmp0), X3
TESTL Len, Len
JZ DONE
MOVOU ·one<>(SB), X4
MOVO X0, 0*16(State)
MOVO X1, 1*16(State)
MOVO X2, 2*16(State)
MOVO X4, 3*16(Stack) // store constant on stack
MOVOU ·rol16<>(SB), X1
MOVOU ·rol8<>(SB), X2
GENERATE_KEYSTREAM:
MOVO 0*16(State), X4
MOVO 1*16(State), X5
MOVO 2*16(State), X6
MOVO X3, X7
MOVL Rounds, Tmp0
CHACHA_LOOP:
CHACHA_QROUND_SSSE3(X4, X5, X6, X7, X0, X1, X2)
CHACHA_SHUFFLE_SSE(X5, X6, X7)
CHACHA_QROUND_SSSE3(X4, X5, X6, X7, X0, X1, X2)
CHACHA_SHUFFLE_SSE(X7, X6, X5)
SUBL $2, Tmp0
JA CHACHA_LOOP
PADDL 0*16(State), X4
PADDL 1*16(State), X5
PADDL 2*16(State), X6
PADDL X3, X7
PADDQ 3*16(Stack), X3
CMPL Len, $64
JL BUFFER_KEYSTREAM
XOR_SSE(Dst, Src, 0, X4, X5, X6, X7, X0)
ADDL $64, Src
ADDL $64, Dst
SUBL $64, Len
JZ DONE
JMP GENERATE_KEYSTREAM
BUFFER_KEYSTREAM:
MOVL Tmp2, Stack // restore stack pointer
MOVL Len, Tmp2
MOVL block+24(FP), Tmp1
MOVOU X4, 0*16(Tmp1)
MOVOU X5, 1*16(Tmp1)
MOVOU X6, 2*16(Tmp1)
MOVOU X7, 3*16(Tmp1)
FINALIZE(DI, SI, Tmp1, Tmp2, Tmp0, Rounds)// we don't need the number of rounds anymore
MOVL Stack, Tmp2 // set BP to SP so that DONE resets SP correctly
DONE:
MOVL Len, Tmp0
MOVL Tmp0, ret+36(FP)
MOVL Tmp2, Stack // restore stack pointer
MOVL state+28(FP), Tmp0
MOVOU X3, 3*16(Tmp0)
RET
#undef Dst
#undef Src
#undef Len
#undef Rounds
#undef State
#undef Stack
#undef Tmp0
#undef Tmp1
#undef Tmp2