// Copyright (c) 2016 Andreas Auernhammer. All rights reserved.
// Use of this source code is governed by a license that can be
// found in the LICENSE file.

//go:build 386 && !gccgo && !appengine && !nacl
// +build 386,!gccgo,!appengine,!nacl

#include "const.s"
#include "macro.s"

// FINALIZE XORs len bytes of src with block, one byte per iteration, and
// writes the result to dst.
//
//   dst, src, block - byte pointers held in registers; each one has been
//                     advanced by the number of processed bytes on exit
//   len             - number of bytes to process; destroyed. The loop body
//                     runs before len is tested, so one byte is processed
//                     even if len is 0 - callers must pass len >= 1.
//   t0, t1          - scratch registers; destroyed
//
// The arithmetic flags are destroyed. The macro body defines the label
// FINALIZE_LOOP, so expanding it twice inside one function would define that
// label twice.
#define FINALIZE(dst, src, block, len, t0, t1) \
	XORL t0, t0;       \
	XORL t1, t1;       \
	FINALIZE_LOOP:;    \
	MOVB 0(src), t0;   \
	MOVB 0(block), t1; \
	XORL t0, t1;       \
	MOVB t1, 0(dst);   \
	INCL src;          \
	INCL block;        \
	INCL dst;          \
	DECL len;          \
	JG   FINALIZE_LOOP

// Register aliases used by hChaCha20SSE2 and hChaCha20SSSE3 (removed again
// with #undef after those two functions):
//   Dst    - pointer to the 32-byte output
//   Nonce  - pointer to the 16-byte nonce
//   Key    - pointer to the 32-byte key
//   Rounds - number of rounds still to run
#define Dst DI
#define Nonce AX
#define Key BX
#define Rounds DX

// func hChaCha20SSE2(out *[32]byte, nonce *[16]byte, key *[32]byte)
//
// hChaCha20SSE2 builds a four-row state from the ·sigma constant, the two
// 16-byte halves of key and the 16-byte nonce, runs 20 rounds over it and
// stores the first and the last row (32 bytes in total) to out. The initial
// state is not added back to the result after the rounds.
//
// Registers: X0-X3 hold the state rows, X4 is handed to the quarter-round
// macro as an additional register (presumably scratch - the macro is defined
// outside this file), Rounds counts the remaining rounds.
// The function has no local frame and takes 12 bytes of arguments (three
// 32-bit pointers). TEXT flag 4 is the numeric value of NOSPLIT.
TEXT ·hChaCha20SSE2(SB), 4, $0-12
	MOVL out+0(FP), Dst
	MOVL nonce+4(FP), Nonce
	MOVL key+8(FP), Key

	MOVOU ·sigma<>(SB), X0 // row 0: constant defined outside this file
	MOVOU 0*16(Key), X1    // row 1: key[0:16]
	MOVOU 1*16(Key), X2    // row 2: key[16:32]
	MOVOU 0*16(Nonce), X3  // row 3: nonce[0:16]
	MOVL  $20, Rounds

chacha_loop:
	// Each pass runs two rounds; the shuffle with reversed row arguments
	// after the second round presumably undoes the first shuffle.
	CHACHA_QROUND_SSE2(X0, X1, X2, X3, X4)
	CHACHA_SHUFFLE_SSE(X1, X2, X3)
	CHACHA_QROUND_SSE2(X0, X1, X2, X3, X4)
	CHACHA_SHUFFLE_SSE(X3, X2, X1)
	SUBL $2, Rounds
	JNZ  chacha_loop

	// Only rows 0 and 3 form the output.
	MOVOU X0, 0*16(Dst)
	MOVOU X3, 1*16(Dst)
	RET

// func hChaCha20SSSE3(out *[32]byte, nonce *[16]byte, key *[32]byte)
//
// hChaCha20SSSE3 performs the same steps as hChaCha20SSE2 - state built from
// the ·sigma constant, key and nonce, 20 rounds, rows 0 and 3 stored to out
// without adding the initial state back - but uses the SSSE3 quarter-round
// macro, which additionally receives the ·rol16 and ·rol8 constants in X5 and
// X6 (presumably byte-shuffle masks implementing the 16- and 8-bit rotations;
// both the macro and the constants are defined outside this file).
//
// Registers: X0-X3 hold the state rows, X4 is handed to the quarter-round
// macro as an additional register, X5/X6 hold the two constants for the whole
// function, Rounds counts the remaining rounds.
// The function has no local frame and takes 12 bytes of arguments (three
// 32-bit pointers). TEXT flag 4 is the numeric value of NOSPLIT.
TEXT ·hChaCha20SSSE3(SB), 4, $0-12
	MOVL out+0(FP), Dst
	MOVL nonce+4(FP), Nonce
	MOVL key+8(FP), Key

	MOVOU ·sigma<>(SB), X0 // row 0: constant defined outside this file
	MOVOU 0*16(Key), X1    // row 1: key[0:16]
	MOVOU 1*16(Key), X2    // row 2: key[16:32]
	MOVOU 0*16(Nonce), X3  // row 3: nonce[0:16]
	MOVL  $20, Rounds

	// Loaded once, outside the round loop.
	MOVOU ·rol16<>(SB), X5
	MOVOU ·rol8<>(SB), X6

chacha_loop:
	// Each pass runs two rounds; the shuffle with reversed row arguments
	// after the second round presumably undoes the first shuffle.
	CHACHA_QROUND_SSSE3(X0, X1, X2, X3, X4, X5, X6)
	CHACHA_SHUFFLE_SSE(X1, X2, X3)
	CHACHA_QROUND_SSSE3(X0, X1, X2, X3, X4, X5, X6)
	CHACHA_SHUFFLE_SSE(X3, X2, X1)
	SUBL $2, Rounds
	JNZ  chacha_loop

	// Only rows 0 and 3 form the output.
	MOVOU X0, 0*16(Dst)
	MOVOU X3, 1*16(Dst)
	RET

#undef Dst
#undef Nonce
#undef Key
#undef Rounds

// Register aliases used by xorKeyStreamSSE2 (removed again with #undef after
// that function):
//   State - pointer to the 64-byte state; temporarily repointed at the
//           keystream buffer while a trailing partial block is handled
//   Dst   - write position in dst
//   Src   - read position in src
//   Len   - number of source bytes still to process
//   Tmp0  - round counter; scratch inside FINALIZE
//   Tmp1  - scratch inside FINALIZE
#define State AX
#define Dst DI
#define Src SI
#define Len DX
#define Tmp0 BX
#define Tmp1 BP

// func xorKeyStreamSSE2(dst, src []byte, block, state *[64]byte, rounds int) int
//
// xorKeyStreamSSE2 XORs len(src) bytes of src with keystream derived from
// state and writes the result to dst. The length of dst is never read, so dst
// must provide room for at least len(src) bytes.
//
// One 64-byte keystream block is generated per pass of GENERATE_KEYSTREAM.
// After each block the ·one constant is added to row 3 of the state with a
// 64-bit lane add (presumably incrementing a 64-bit block counter - the
// constant is defined outside this file). Row 3 is written back to state
// before returning; rows 0-2 of state are only read.
//
// Return value: if len(src) is a multiple of 64 (including 0) the function
// returns 0 and does not touch block. Otherwise the last keystream block is
// stored to block in full, its first len(src)%64 bytes are used for the
// trailing bytes, and len(src)%64 is returned.
//
// Registers: X0-X3 hold the input state rows, X4-X7 the working copy. X0 is
// also passed to the round and XOR macros as an extra register and is
// therefore reloaded from state afterwards.
// The function has no local frame and takes 40 bytes of arguments and result.
// TEXT flag 4 is the numeric value of NOSPLIT.
TEXT ·xorKeyStreamSSE2(SB), 4, $0-40
	MOVL dst_base+0(FP), Dst
	MOVL src_base+12(FP), Src
	MOVL state+28(FP), State
	MOVL src_len+16(FP), Len
	MOVL $0, ret+36(FP)       // Number of bytes written to the keystream buffer - 0 iff len mod 64 == 0

	MOVOU 0*16(State), X0
	MOVOU 1*16(State), X1
	MOVOU 2*16(State), X2
	MOVOU 3*16(State), X3
	TESTL Len, Len
	JZ    DONE                // Nothing to process; row 3 is stored back unchanged

GENERATE_KEYSTREAM:
	// Work on a copy so the input rows stay available for the final addition.
	MOVO X0, X4
	MOVO X1, X5
	MOVO X2, X6
	MOVO X3, X7
	MOVL rounds+32(FP), Tmp0

CHACHA_LOOP:
	// Two rounds per pass; X0 is handed to the round macro and is restored below.
	CHACHA_QROUND_SSE2(X4, X5, X6, X7, X0)
	CHACHA_SHUFFLE_SSE(X5, X6, X7)
	CHACHA_QROUND_SSE2(X4, X5, X6, X7, X0)
	CHACHA_SHUFFLE_SSE(X7, X6, X5)
	SUBL $2, Tmp0
	JA   CHACHA_LOOP          // Continue while the counter is non-zero and did not borrow

	MOVOU 0*16(State), X0     // Restore X0 from state
	PADDL X0, X4              // Keystream block = working copy + input state
	PADDL X1, X5
	PADDL X2, X6
	PADDL X3, X7
	MOVOU ·one<>(SB), X0
	PADDQ X0, X3              // Advance row 3 for the next block

	CMPL Len, $64
	JL   BUFFER_KEYSTREAM     // Fewer than 64 bytes left: handle the tail bytewise

	XOR_SSE(Dst, Src, 0, X4, X5, X6, X7, X0)
	MOVOU 0*16(State), X0     // Restore X0 from state
	ADDL  $64, Src
	ADDL  $64, Dst
	SUBL  $64, Len
	JZ    DONE
	JMP   GENERATE_KEYSTREAM  // There is at least one more plaintext byte

BUFFER_KEYSTREAM:
	// State is repointed at the keystream buffer here; DONE reloads it.
	MOVL  block+24(FP), State
	MOVOU X4, 0(State)
	MOVOU X5, 16(State)
	MOVOU X6, 32(State)
	MOVOU X7, 48(State)
	MOVL  Len, ret+36(FP)     // Number of bytes written to the keystream buffer - 0 < Len < 64
	FINALIZE(Dst, Src, State, Len, Tmp0, Tmp1)

DONE:
	MOVL  state+28(FP), State
	MOVOU X3, 3*16(State)     // Persist the advanced row 3
	RET

#undef State
#undef Dst
#undef Src
#undef Len
#undef Tmp0
#undef Tmp1