// Copyright (c) 2016 Andreas Auernhammer. All rights reserved.
// Use of this source code is governed by a license that can be
// found in the LICENSE file.

// +build amd64,!gccgo,!appengine,!nacl

#include "const.s"
#include "macro.s"

// FINALIZE xors len bytes from src and block using
// the temp. registers t0 and t1 and writes the result
// to dst.
//
// The loop is bottom-tested, so len MUST be >= 1 on entry.
// src, block, dst and len are modified (the three pointers are advanced
// by len bytes and len ends at 0); t0 and t1 are clobbered.
// The loop branches on ZF produced by DECQ (JNZ). DEC does not write CF,
// so a CF-dependent condition such as JA would only work by relying on
// the CF value left behind by the preceding XORQ.
#define FINALIZE(dst, src, block, len, t0, t1) \
	XORQ t0, t0;       \
	XORQ t1, t1;       \
	FINALIZE_LOOP:;    \
	MOVB 0(src), t0;   \
	MOVB 0(block), t1; \
	XORQ t0, t1;       \
	MOVB t1, 0(dst);   \
	INCQ src;          \
	INCQ block;        \
	INCQ dst;          \
	DECQ len;          \
	JNZ  FINALIZE_LOOP

// func supportsSSSE3() bool
//
// Executes CPUID with EAX = 1 and returns bit 9 of ECX.
// Clobbers AX, BX, CX, DX (all written by CPUID).
TEXT ·supportsSSSE3(SB), NOSPLIT, $0-1
	XORQ AX, AX
	INCQ AX            // EAX = 1 selects the CPUID leaf
	CPUID
	SHRQ $9, CX        // move bit 9 of ECX to bit 0
	ANDQ $1, CX
	MOVB CX, ret+0(FP)
	RET

// func supportsAVX() bool
//
// Returns the low byte of runtime·support_avx. Only one byte is loaded
// because only one byte is stored to the bool result.
// NOTE(review): runtime·support_avx is a Go runtime-internal symbol that is
// not declared in this file - confirm it exists for the targeted Go versions.
TEXT ·supportsAVX(SB), 4, $0-1
	MOVB runtime·support_avx(SB), AX
	MOVB AX, ret+0(FP)
	RET

// func supportsAVX2() bool
//
// Returns the low byte of runtime·support_avx2 (see supportsAVX).
TEXT ·supportsAVX2(SB), 4, $0-1
	MOVB runtime·support_avx2(SB), AX
	MOVB AX, ret+0(FP)
	RET

// Register roles for initialize and the hChaCha20 functions.
#define Dst DI
#define Nonce AX
#define Key BX
#define Rounds CX

// func initialize(state *[64]byte, key []byte, nonce *[16]byte)
//
// Fills the 64 byte state with four 16 byte rows:
//   row 0 = ·sigma<> constant, row 1 = key[0:16],
//   row 2 = key[16:32],        row 3 = nonce[0:16].
// The length of key is not checked: 32 bytes are read from its base pointer.
TEXT ·initialize(SB), 4, $0-40
	MOVQ state+0(FP), Dst
	MOVQ key_base+8(FP), Key
	MOVQ nonce+32(FP), Nonce

	MOVOU ·sigma<>(SB), X0
	MOVOU 0*16(Key), X1
	MOVOU 1*16(Key), X2
	MOVOU 0*16(Nonce), X3

	MOVOU X0, 0*16(Dst)
	MOVOU X1, 1*16(Dst)
	MOVOU X2, 2*16(Dst)
	MOVOU X3, 3*16(Dst)
	RET

// func hChaCha20AVX(out *[32]byte, nonce *[16]byte, key *[32]byte)
//
// Builds the state (sigma, key, nonce) in X0-X3, runs 20 rounds
// (10 iterations of two quarter-round passes) and writes rows 0 and 3
// (X0, X3) to out. The initial state is not added back.
// NOTE(review): this function loads ·rol16_AVX2<> / ·rol8_AVX2<> into 128 bit
// registers while xorKeyStreamAVX loads ·rol16<> / ·rol8<> - confirm in
// const.s that both constant pairs are interchangeable for a 16 byte load.
TEXT ·hChaCha20AVX(SB), 4, $0-24
	MOVQ out+0(FP), Dst
	MOVQ nonce+8(FP), Nonce
	MOVQ key+16(FP), Key

	VMOVDQU ·sigma<>(SB), X0
	VMOVDQU 0*16(Key), X1
	VMOVDQU 1*16(Key), X2
	VMOVDQU 0*16(Nonce), X3
	VMOVDQU ·rol16_AVX2<>(SB), X5
	VMOVDQU ·rol8_AVX2<>(SB), X6
	MOVQ $20, Rounds

CHACHA_LOOP:
	CHACHA_QROUND_AVX(X0, X1, X2, X3, X4, X5, X6)
	CHACHA_SHUFFLE_AVX(X1, X2, X3)
	CHACHA_QROUND_AVX(X0, X1, X2, X3, X4, X5, X6)
	CHACHA_SHUFFLE_AVX(X3, X2, X1)
	SUBQ $2, Rounds
	JNZ CHACHA_LOOP

	VMOVDQU X0, 0*16(Dst)
	VMOVDQU X3, 1*16(Dst)
	VZEROUPPER
	RET

// func hChaCha20SSE2(out *[32]byte, nonce *[16]byte, key *[32]byte)
//
// SSE2 variant of hChaCha20: 20 rounds over (sigma, key, nonce),
// rows 0 and 3 (X0, X3) are written to out. X4 is scratch.
TEXT ·hChaCha20SSE2(SB), 4, $0-24
	MOVQ out+0(FP), Dst
	MOVQ nonce+8(FP), Nonce
	MOVQ key+16(FP), Key

	MOVOU ·sigma<>(SB), X0
	MOVOU 0*16(Key), X1
	MOVOU 1*16(Key), X2
	MOVOU 0*16(Nonce), X3
	MOVQ $20, Rounds

CHACHA_LOOP:
	CHACHA_QROUND_SSE2(X0, X1, X2, X3, X4)
	CHACHA_SHUFFLE_SSE(X1, X2, X3)
	CHACHA_QROUND_SSE2(X0, X1, X2, X3, X4)
	CHACHA_SHUFFLE_SSE(X3, X2, X1)
	SUBQ $2, Rounds
	JNZ CHACHA_LOOP

	MOVOU X0, 0*16(Dst)
	MOVOU X3, 1*16(Dst)
	RET

// func hChaCha20SSSE3(out *[32]byte, nonce *[16]byte, key *[32]byte)
//
// SSSE3 variant of hChaCha20: like hChaCha20SSE2 but the quarter round
// macro additionally receives the ·rol16<> / ·rol8<> constants in X5 / X6.
TEXT ·hChaCha20SSSE3(SB), 4, $0-24
	MOVQ out+0(FP), Dst
	MOVQ nonce+8(FP), Nonce
	MOVQ key+16(FP), Key

	MOVOU ·sigma<>(SB), X0
	MOVOU 0*16(Key), X1
	MOVOU 1*16(Key), X2
	MOVOU 0*16(Nonce), X3
	MOVOU ·rol16<>(SB), X5
	MOVOU ·rol8<>(SB), X6
	MOVQ $20, Rounds

CHACHA_LOOP:
	CHACHA_QROUND_SSSE3(X0, X1, X2, X3, X4, X5, X6)
	CHACHA_SHUFFLE_SSE(X1, X2, X3)
	CHACHA_QROUND_SSSE3(X0, X1, X2, X3, X4, X5, X6)
	CHACHA_SHUFFLE_SSE(X3, X2, X1)
	SUBQ $2, Rounds
	JNZ CHACHA_LOOP

	MOVOU X0, 0*16(Dst)
	MOVOU X3, 1*16(Dst)
	RET

#undef Dst
#undef Nonce
#undef Key
#undef Rounds

// Register roles for the xorKeyStream functions.
#define Dst DI
#define Src SI
#define Len CX
#define Rounds DX
#define Buffer BX
#define State AX
#define Stack SP
#define SavedSP R8
#define Tmp0 R9
#define Tmp1 R10
#define Tmp2 R11

// func xorKeyStreamSSE2(dst, src []byte, block, state *[64]byte, rounds int) int
//
// XORs len(src) bytes of src with keystream and writes them to dst.
// Up to four 64 byte blocks (256 bytes) are generated per iteration; the
// entry point is chosen by the remaining length (<=64, <=128, <=192, >192).
// If the input ends with a partial block (< 64 bytes), the whole keystream
// block is stored in block and the partial bytes are XORed byte-wise.
//
// Returns the number of bytes consumed from the buffered block
// (0 if len(src) is a multiple of 64, which includes len(src) == 0).
// Only row 3 of state (X3) is written back; it is advanced by adding the
// ·one<> constant with PADDQ once per generated block.
//
// rounds must be even and non-zero: the round loops do SUBQ $2 / JNZ.
//
// Stack layout (16 byte slots, relative to the manually aligned Stack):
//   0-3: copy of the state rows (row 3 is re-saved on every 256 byte pass)
//   4:   ·one<> constant
//   5:   spill slot used while a state register serves as scratch
// Frame size $112 = 6 slots + 16 bytes of slack for the alignment.
TEXT ·xorKeyStreamSSE2(SB), 4, $112-80
	MOVQ dst_base+0(FP), Dst
	MOVQ src_base+24(FP), Src
	MOVQ src_len+32(FP), Len
	MOVQ block+48(FP), Buffer
	MOVQ state+56(FP), State
	MOVQ rounds+64(FP), Rounds

	MOVOU 0*16(State), X0
	MOVOU 1*16(State), X1
	MOVOU 2*16(State), X2
	MOVOU 3*16(State), X3

	// Align the stack pointer to 16 bytes so that MOVO can be used on it.
	MOVQ Stack, SavedSP
	ADDQ $16, Stack
	ANDQ $-16, Stack

	TESTQ Len, Len
	JZ DONE

	MOVOU ·one<>(SB), X4
	MOVO X0, 0*16(Stack)
	MOVO X1, 1*16(Stack)
	MOVO X2, 2*16(Stack)
	MOVO X3, 3*16(Stack)
	MOVO X4, 4*16(Stack)

	CMPQ Len, $64
	JBE GENERATE_KEYSTREAM_64
	CMPQ Len, $128
	JBE GENERATE_KEYSTREAM_128
	CMPQ Len, $192
	JBE GENERATE_KEYSTREAM_192

GENERATE_KEYSTREAM_256:
	// Four blocks: X0-X3 (counter), X12-X15 (+1), X8-X11 (+2), X4-X7 (+3).
	MOVO X0, X12
	MOVO X1, X13
	MOVO X2, X14
	MOVO X3, X15
	PADDQ 4*16(Stack), X15
	MOVO X0, X8
	MOVO X1, X9
	MOVO X2, X10
	MOVO X15, X11
	PADDQ 4*16(Stack), X11
	MOVO X0, X4
	MOVO X1, X5
	MOVO X2, X6
	MOVO X11, X7
	PADDQ 4*16(Stack), X7
	MOVQ Rounds, Tmp0

	MOVO X3, 3*16(Stack) // Save X3

CHACHA_LOOP_256:
	// All 16 XMM registers hold state, so X4 resp. X0 is spilled to
	// slot 5 while it is used as the scratch register of the quarter round.
	MOVO X4, 5*16(Stack)
	CHACHA_QROUND_SSE2(X0, X1, X2, X3, X4)
	CHACHA_QROUND_SSE2(X12, X13, X14, X15, X4)
	MOVO 5*16(Stack), X4
	MOVO X0, 5*16(Stack)
	CHACHA_QROUND_SSE2(X8, X9, X10, X11, X0)
	CHACHA_QROUND_SSE2(X4, X5, X6, X7, X0)
	MOVO 5*16(Stack), X0
	CHACHA_SHUFFLE_SSE(X1, X2, X3)
	CHACHA_SHUFFLE_SSE(X13, X14, X15)
	CHACHA_SHUFFLE_SSE(X9, X10, X11)
	CHACHA_SHUFFLE_SSE(X5, X6, X7)
	MOVO X4, 5*16(Stack)
	CHACHA_QROUND_SSE2(X0, X1, X2, X3, X4)
	CHACHA_QROUND_SSE2(X12, X13, X14, X15, X4)
	MOVO 5*16(Stack), X4
	MOVO X0, 5*16(Stack)
	CHACHA_QROUND_SSE2(X8, X9, X10, X11, X0)
	CHACHA_QROUND_SSE2(X4, X5, X6, X7, X0)
	MOVO 5*16(Stack), X0
	CHACHA_SHUFFLE_SSE(X3, X2, X1)
	CHACHA_SHUFFLE_SSE(X15, X14, X13)
	CHACHA_SHUFFLE_SSE(X11, X10, X9)
	CHACHA_SHUFFLE_SSE(X7, X6, X5)
	SUBQ $2, Tmp0
	JNZ CHACHA_LOOP_256

	// Add the saved input state to the first block and xor it.
	PADDL 0*16(Stack), X0
	PADDL 1*16(Stack), X1
	PADDL 2*16(Stack), X2
	PADDL 3*16(Stack), X3
	MOVO X4, 5*16(Stack) // Save X4
	XOR_SSE(Dst, Src, 0, X0, X1, X2, X3, X4)
	MOVO 5*16(Stack), X4 // Restore X4

	// Reload the input state; X3 is advanced once per block.
	MOVO 0*16(Stack), X0
	MOVO 1*16(Stack), X1
	MOVO 2*16(Stack), X2
	MOVO 3*16(Stack), X3
	PADDQ 4*16(Stack), X3

	PADDL X0, X12
	PADDL X1, X13
	PADDL X2, X14
	PADDL X3, X15
	PADDQ 4*16(Stack), X3
	PADDL X0, X8
	PADDL X1, X9
	PADDL X2, X10
	PADDL X3, X11
	PADDQ 4*16(Stack), X3
	PADDL X0, X4
	PADDL X1, X5
	PADDL X2, X6
	PADDL X3, X7
	PADDQ 4*16(Stack), X3

	XOR_SSE(Dst, Src, 64, X12, X13, X14, X15, X0)
	XOR_SSE(Dst, Src, 128, X8, X9, X10, X11, X0)
	MOVO 0*16(Stack), X0 // Restore X0
	ADDQ $192, Dst
	ADDQ $192, Src
	SUBQ $192, Len

	// The fourth block (X4-X7) is either used completely or buffered.
	CMPQ Len, $64
	JB BUFFER_KEYSTREAM

	XOR_SSE(Dst, Src, 0, X4, X5, X6, X7, X8)
	ADDQ $64, Dst
	ADDQ $64, Src
	SUBQ $64, Len
	JZ DONE
	CMPQ Len, $64  // If Len <= 64 -> gen. only 64 byte keystream.
	JBE GENERATE_KEYSTREAM_64
	CMPQ Len, $128 // If 64 < Len <= 128 -> gen. only 128 byte keystream.
	JBE GENERATE_KEYSTREAM_128
	CMPQ Len, $192 // If Len > 192 -> repeat, otherwise Len > 128 && Len <= 192 -> gen. 192 byte keystream
	JA GENERATE_KEYSTREAM_256

GENERATE_KEYSTREAM_192:
	// Three blocks: X12-X15 (counter), X8-X11 (+1), X4-X7 (+2). X0 is scratch.
	MOVO X0, X12
	MOVO X1, X13
	MOVO X2, X14
	MOVO X3, X15
	MOVO X0, X8
	MOVO X1, X9
	MOVO X2, X10
	MOVO X3, X11
	PADDQ 4*16(Stack), X11
	MOVO X0, X4
	MOVO X1, X5
	MOVO X2, X6
	MOVO X11, X7
	PADDQ 4*16(Stack), X7
	MOVQ Rounds, Tmp0

CHACHA_LOOP_192:
	CHACHA_QROUND_SSE2(X12, X13, X14, X15, X0)
	CHACHA_QROUND_SSE2(X8, X9, X10, X11, X0)
	CHACHA_QROUND_SSE2(X4, X5, X6, X7, X0)
	CHACHA_SHUFFLE_SSE(X13, X14, X15)
	CHACHA_SHUFFLE_SSE(X9, X10, X11)
	CHACHA_SHUFFLE_SSE(X5, X6, X7)
	CHACHA_QROUND_SSE2(X12, X13, X14, X15, X0)
	CHACHA_QROUND_SSE2(X8, X9, X10, X11, X0)
	CHACHA_QROUND_SSE2(X4, X5, X6, X7, X0)
	CHACHA_SHUFFLE_SSE(X15, X14, X13)
	CHACHA_SHUFFLE_SSE(X11, X10, X9)
	CHACHA_SHUFFLE_SSE(X7, X6, X5)
	SUBQ $2, Tmp0
	JNZ CHACHA_LOOP_192

	MOVO 0*16(Stack), X0 // Restore X0
	PADDL X0, X12
	PADDL X1, X13
	PADDL X2, X14
	PADDL X3, X15
	PADDQ 4*16(Stack), X3
	PADDL X0, X8
	PADDL X1, X9
	PADDL X2, X10
	PADDL X3, X11
	PADDQ 4*16(Stack), X3
	PADDL X0, X4
	PADDL X1, X5
	PADDL X2, X6
	PADDL X3, X7
	PADDQ 4*16(Stack), X3

	XOR_SSE(Dst, Src, 0, X12, X13, X14, X15, X0)
	XOR_SSE(Dst, Src, 64, X8, X9, X10, X11, X0)
	MOVO 0*16(Stack), X0 // Restore X0
	ADDQ $128, Dst
	ADDQ $128, Src
	SUBQ $128, Len

	CMPQ Len, $64
	JB BUFFER_KEYSTREAM

	XOR_SSE(Dst, Src, 0, X4, X5, X6, X7, X8)
	ADDQ $64, Dst
	ADDQ $64, Src
	SUBQ $64, Len
	JZ DONE
	CMPQ Len, $64 // If Len <= 64 -> gen. only 64 byte keystream.
	JBE GENERATE_KEYSTREAM_64

GENERATE_KEYSTREAM_128:
	// Two blocks: X8-X11 (counter), X4-X7 (+1). X12 is scratch.
	MOVO X0, X8
	MOVO X1, X9
	MOVO X2, X10
	MOVO X3, X11
	MOVO X0, X4
	MOVO X1, X5
	MOVO X2, X6
	MOVO X3, X7
	PADDQ 4*16(Stack), X7
	MOVQ Rounds, Tmp0

CHACHA_LOOP_128:
	CHACHA_QROUND_SSE2(X8, X9, X10, X11, X12)
	CHACHA_QROUND_SSE2(X4, X5, X6, X7, X12)
	CHACHA_SHUFFLE_SSE(X9, X10, X11)
	CHACHA_SHUFFLE_SSE(X5, X6, X7)
	CHACHA_QROUND_SSE2(X8, X9, X10, X11, X12)
	CHACHA_QROUND_SSE2(X4, X5, X6, X7, X12)
	CHACHA_SHUFFLE_SSE(X11, X10, X9)
	CHACHA_SHUFFLE_SSE(X7, X6, X5)
	SUBQ $2, Tmp0
	JNZ CHACHA_LOOP_128

	PADDL X0, X8
	PADDL X1, X9
	PADDL X2, X10
	PADDL X3, X11
	PADDQ 4*16(Stack), X3
	PADDL X0, X4
	PADDL X1, X5
	PADDL X2, X6
	PADDL X3, X7
	PADDQ 4*16(Stack), X3

	XOR_SSE(Dst, Src, 0, X8, X9, X10, X11, X12)
	ADDQ $64, Dst
	ADDQ $64, Src
	SUBQ $64, Len

	CMPQ Len, $64
	JB BUFFER_KEYSTREAM

	XOR_SSE(Dst, Src, 0, X4, X5, X6, X7, X8)
	ADDQ $64, Dst
	ADDQ $64, Src
	SUBQ $64, Len
	JZ DONE // If Len == 0 -> DONE, otherwise Len <= 64 -> gen 64 byte keystream

GENERATE_KEYSTREAM_64:
	// One block in X4-X7. X8 is scratch.
	MOVO X0, X4
	MOVO X1, X5
	MOVO X2, X6
	MOVO X3, X7
	MOVQ Rounds, Tmp0

CHACHA_LOOP_64:
	CHACHA_QROUND_SSE2(X4, X5, X6, X7, X8)
	CHACHA_SHUFFLE_SSE(X5, X6, X7)
	CHACHA_QROUND_SSE2(X4, X5, X6, X7, X8)
	CHACHA_SHUFFLE_SSE(X7, X6, X5)
	SUBQ $2, Tmp0
	JNZ CHACHA_LOOP_64

	PADDL X0, X4
	PADDL X1, X5
	PADDL X2, X6
	PADDL X3, X7
	PADDQ 4*16(Stack), X3

	CMPQ Len, $64
	JB BUFFER_KEYSTREAM

	XOR_SSE(Dst, Src, 0, X4, X5, X6, X7, X8)
	ADDQ $64, Src
	ADDQ $64, Dst
	SUBQ $64, Len
	JMP DONE // jump directly to DONE - there is no keystream to buffer, Len == 0 always true.

BUFFER_KEYSTREAM:
	// 0 < Len < 64 on every path that reaches this label: store the whole
	// keystream block and xor the remaining Len bytes one by one.
	MOVOU X4, 0*16(Buffer)
	MOVOU X5, 1*16(Buffer)
	MOVOU X6, 2*16(Buffer)
	MOVOU X7, 3*16(Buffer)
	MOVQ Len, Tmp0 // FINALIZE counts Tmp0 down, Len is kept as return value
	FINALIZE(Dst, Src, Buffer, Tmp0, Tmp1, Tmp2)

DONE:
	MOVQ SavedSP, Stack // Restore stack pointer
	MOVOU X3, 3*16(State)
	MOVQ Len, ret+72(FP)
	RET

// func xorKeyStreamSSSE3(dst, src []byte, block, state *[64]byte, rounds int) int
//
// SSSE3 variant of xorKeyStreamSSE2 with the same control flow, return value
// and state update. The quarter round macro additionally takes the
// ·rol16<> / ·rol8<> constants, which are kept in two extra stack slots:
//   0-3: copy of the state rows (row 3 is re-saved on every 256 byte pass)
//   4:   ·one<> constant
//   5:   spill slot
//   6:   ·rol16<> constant
//   7:   ·rol8<> constant
// Frame size $144 = 8 slots + 16 bytes of slack for the alignment.
// rounds must be even and non-zero.
TEXT ·xorKeyStreamSSSE3(SB), 4, $144-80
	MOVQ dst_base+0(FP), Dst
	MOVQ src_base+24(FP), Src
	MOVQ src_len+32(FP), Len
	MOVQ block+48(FP), Buffer
	MOVQ state+56(FP), State
	MOVQ rounds+64(FP), Rounds

	MOVOU 0*16(State), X0
	MOVOU 1*16(State), X1
	MOVOU 2*16(State), X2
	MOVOU 3*16(State), X3

	// Align the stack pointer to 16 bytes so that MOVO can be used on it.
	MOVQ Stack, SavedSP
	ADDQ $16, Stack
	ANDQ $-16, Stack

	TESTQ Len, Len
	JZ DONE

	MOVOU ·one<>(SB), X4
	MOVOU ·rol16<>(SB), X5
	MOVOU ·rol8<>(SB), X6
	MOVO X0, 0*16(Stack)
	MOVO X1, 1*16(Stack)
	MOVO X2, 2*16(Stack)
	MOVO X3, 3*16(Stack)
	MOVO X4, 4*16(Stack)
	MOVO X5, 6*16(Stack)
	MOVO X6, 7*16(Stack)

	CMPQ Len, $64
	JBE GENERATE_KEYSTREAM_64
	CMPQ Len, $128
	JBE GENERATE_KEYSTREAM_128
	CMPQ Len, $192
	JBE GENERATE_KEYSTREAM_192

GENERATE_KEYSTREAM_256:
	// Four blocks: X0-X3 (counter), X12-X15 (+1), X8-X11 (+2), X4-X7 (+3).
	MOVO X0, X12
	MOVO X1, X13
	MOVO X2, X14
	MOVO X3, X15
	PADDQ 4*16(Stack), X15
	MOVO X0, X8
	MOVO X1, X9
	MOVO X2, X10
	MOVO X15, X11
	PADDQ 4*16(Stack), X11
	MOVO X0, X4
	MOVO X1, X5
	MOVO X2, X6
	MOVO X11, X7
	PADDQ 4*16(Stack), X7
	MOVQ Rounds, Tmp0

	MOVO X3, 3*16(Stack) // Save X3

CHACHA_LOOP_256:
	// No register is free here: the rotate constants are passed as memory
	// operands and X4 resp. X0 is spilled to slot 5 while used as scratch.
	MOVO X4, 5*16(Stack)
	CHACHA_QROUND_SSSE3(X0, X1, X2, X3, X4, 6*16(Stack), 7*16(Stack))
	CHACHA_QROUND_SSSE3(X12, X13, X14, X15, X4, 6*16(Stack), 7*16(Stack))
	MOVO 5*16(Stack), X4
	MOVO X0, 5*16(Stack)
	CHACHA_QROUND_SSSE3(X8, X9, X10, X11, X0, 6*16(Stack), 7*16(Stack))
	CHACHA_QROUND_SSSE3(X4, X5, X6, X7, X0, 6*16(Stack), 7*16(Stack))
	MOVO 5*16(Stack), X0
	CHACHA_SHUFFLE_SSE(X1, X2, X3)
	CHACHA_SHUFFLE_SSE(X13, X14, X15)
	CHACHA_SHUFFLE_SSE(X9, X10, X11)
	CHACHA_SHUFFLE_SSE(X5, X6, X7)
	MOVO X4, 5*16(Stack)
	CHACHA_QROUND_SSSE3(X0, X1, X2, X3, X4, 6*16(Stack), 7*16(Stack))
	CHACHA_QROUND_SSSE3(X12, X13, X14, X15, X4, 6*16(Stack), 7*16(Stack))
	MOVO 5*16(Stack), X4
	MOVO X0, 5*16(Stack)
	CHACHA_QROUND_SSSE3(X8, X9, X10, X11, X0, 6*16(Stack), 7*16(Stack))
	CHACHA_QROUND_SSSE3(X4, X5, X6, X7, X0, 6*16(Stack), 7*16(Stack))
	MOVO 5*16(Stack), X0
	CHACHA_SHUFFLE_SSE(X3, X2, X1)
	CHACHA_SHUFFLE_SSE(X15, X14, X13)
	CHACHA_SHUFFLE_SSE(X11, X10, X9)
	CHACHA_SHUFFLE_SSE(X7, X6, X5)
	SUBQ $2, Tmp0
	JNZ CHACHA_LOOP_256

	// Add the saved input state to the first block and xor it.
	PADDL 0*16(Stack), X0
	PADDL 1*16(Stack), X1
	PADDL 2*16(Stack), X2
	PADDL 3*16(Stack), X3
	MOVO X4, 5*16(Stack) // Save X4
	XOR_SSE(Dst, Src, 0, X0, X1, X2, X3, X4)
	MOVO 5*16(Stack), X4 // Restore X4

	// Reload the input state; X3 is advanced once per block.
	MOVO 0*16(Stack), X0
	MOVO 1*16(Stack), X1
	MOVO 2*16(Stack), X2
	MOVO 3*16(Stack), X3
	PADDQ 4*16(Stack), X3

	PADDL X0, X12
	PADDL X1, X13
	PADDL X2, X14
	PADDL X3, X15
	PADDQ 4*16(Stack), X3
	PADDL X0, X8
	PADDL X1, X9
	PADDL X2, X10
	PADDL X3, X11
	PADDQ 4*16(Stack), X3
	PADDL X0, X4
	PADDL X1, X5
	PADDL X2, X6
	PADDL X3, X7
	PADDQ 4*16(Stack), X3

	XOR_SSE(Dst, Src, 64, X12, X13, X14, X15, X0)
	XOR_SSE(Dst, Src, 128, X8, X9, X10, X11, X0)
	MOVO 0*16(Stack), X0 // Restore X0
	ADDQ $192, Dst
	ADDQ $192, Src
	SUBQ $192, Len

	// The fourth block (X4-X7) is either used completely or buffered.
	CMPQ Len, $64
	JB BUFFER_KEYSTREAM

	XOR_SSE(Dst, Src, 0, X4, X5, X6, X7, X8)
	ADDQ $64, Dst
	ADDQ $64, Src
	SUBQ $64, Len
	JZ DONE
	CMPQ Len, $64  // If Len <= 64 -> gen. only 64 byte keystream.
	JBE GENERATE_KEYSTREAM_64
	CMPQ Len, $128 // If 64 < Len <= 128 -> gen. only 128 byte keystream.
	JBE GENERATE_KEYSTREAM_128
	CMPQ Len, $192 // If Len > 192 -> repeat, otherwise Len > 128 && Len <= 192 -> gen. 192 byte keystream
	JA GENERATE_KEYSTREAM_256

GENERATE_KEYSTREAM_192:
	// Three blocks: X12-X15 (counter), X8-X11 (+1), X4-X7 (+2).
	// X0 is scratch; X1 / X2 temporarily hold the rotate constants.
	MOVO X0, X12
	MOVO X1, X13
	MOVO X2, X14
	MOVO X3, X15
	MOVO X0, X8
	MOVO X1, X9
	MOVO X2, X10
	MOVO X3, X11
	PADDQ 4*16(Stack), X11
	MOVO X0, X4
	MOVO X1, X5
	MOVO X2, X6
	MOVO X11, X7
	PADDQ 4*16(Stack), X7
	MOVQ Rounds, Tmp0

	MOVO 6*16(Stack), X1 // Load 16 bit rotate-left constant
	MOVO 7*16(Stack), X2 // Load 8 bit rotate-left constant

CHACHA_LOOP_192:
	CHACHA_QROUND_SSSE3(X12, X13, X14, X15, X0, X1, X2)
	CHACHA_QROUND_SSSE3(X8, X9, X10, X11, X0, X1, X2)
	CHACHA_QROUND_SSSE3(X4, X5, X6, X7, X0, X1, X2)
	CHACHA_SHUFFLE_SSE(X13, X14, X15)
	CHACHA_SHUFFLE_SSE(X9, X10, X11)
	CHACHA_SHUFFLE_SSE(X5, X6, X7)
	CHACHA_QROUND_SSSE3(X12, X13, X14, X15, X0, X1, X2)
	CHACHA_QROUND_SSSE3(X8, X9, X10, X11, X0, X1, X2)
	CHACHA_QROUND_SSSE3(X4, X5, X6, X7, X0, X1, X2)
	CHACHA_SHUFFLE_SSE(X15, X14, X13)
	CHACHA_SHUFFLE_SSE(X11, X10, X9)
	CHACHA_SHUFFLE_SSE(X7, X6, X5)
	SUBQ $2, Tmp0
	JNZ CHACHA_LOOP_192

	MOVO 0*16(Stack), X0 // Restore X0
	MOVO 1*16(Stack), X1 // Restore X1
	MOVO 2*16(Stack), X2 // Restore X2
	PADDL X0, X12
	PADDL X1, X13
	PADDL X2, X14
	PADDL X3, X15
	PADDQ 4*16(Stack), X3
	PADDL X0, X8
	PADDL X1, X9
	PADDL X2, X10
	PADDL X3, X11
	PADDQ 4*16(Stack), X3
	PADDL X0, X4
	PADDL X1, X5
	PADDL X2, X6
	PADDL X3, X7
	PADDQ 4*16(Stack), X3

	XOR_SSE(Dst, Src, 0, X12, X13, X14, X15, X0)
	XOR_SSE(Dst, Src, 64, X8, X9, X10, X11, X0)
	MOVO 0*16(Stack), X0 // Restore X0
	ADDQ $128, Dst
	ADDQ $128, Src
	SUBQ $128, Len

	CMPQ Len, $64
	JB BUFFER_KEYSTREAM

	XOR_SSE(Dst, Src, 0, X4, X5, X6, X7, X8)
	ADDQ $64, Dst
	ADDQ $64, Src
	SUBQ $64, Len
	JZ DONE
	CMPQ Len, $64 // If Len <= 64 -> gen. only 64 byte keystream.
	JBE GENERATE_KEYSTREAM_64

GENERATE_KEYSTREAM_128:
	// Two blocks: X8-X11 (counter), X4-X7 (+1). X12 is scratch.
	MOVO X0, X8
	MOVO X1, X9
	MOVO X2, X10
	MOVO X3, X11
	MOVO X0, X4
	MOVO X1, X5
	MOVO X2, X6
	MOVO X3, X7
	PADDQ 4*16(Stack), X7
	MOVQ Rounds, Tmp0

	MOVO 6*16(Stack), X13 // Load 16 bit rotate-left constant
	MOVO 7*16(Stack), X14 // Load 8 bit rotate-left constant

CHACHA_LOOP_128:
	CHACHA_QROUND_SSSE3(X8, X9, X10, X11, X12, X13, X14)
	CHACHA_QROUND_SSSE3(X4, X5, X6, X7, X12, X13, X14)
	CHACHA_SHUFFLE_SSE(X9, X10, X11)
	CHACHA_SHUFFLE_SSE(X5, X6, X7)
	CHACHA_QROUND_SSSE3(X8, X9, X10, X11, X12, X13, X14)
	CHACHA_QROUND_SSSE3(X4, X5, X6, X7, X12, X13, X14)
	CHACHA_SHUFFLE_SSE(X11, X10, X9)
	CHACHA_SHUFFLE_SSE(X7, X6, X5)
	SUBQ $2, Tmp0
	JNZ CHACHA_LOOP_128

	PADDL X0, X8
	PADDL X1, X9
	PADDL X2, X10
	PADDL X3, X11
	PADDQ 4*16(Stack), X3
	PADDL X0, X4
	PADDL X1, X5
	PADDL X2, X6
	PADDL X3, X7
	PADDQ 4*16(Stack), X3

	XOR_SSE(Dst, Src, 0, X8, X9, X10, X11, X12)
	ADDQ $64, Dst
	ADDQ $64, Src
	SUBQ $64, Len

	CMPQ Len, $64
	JB BUFFER_KEYSTREAM

	XOR_SSE(Dst, Src, 0, X4, X5, X6, X7, X8)
	ADDQ $64, Dst
	ADDQ $64, Src
	SUBQ $64, Len
	JZ DONE // If Len == 0 -> DONE, otherwise Len <= 64 -> gen 64 byte keystream

GENERATE_KEYSTREAM_64:
	// One block in X4-X7. X8 is scratch.
	MOVO X0, X4
	MOVO X1, X5
	MOVO X2, X6
	MOVO X3, X7
	MOVQ Rounds, Tmp0

	MOVO 6*16(Stack), X9  // Load 16 bit rotate-left constant
	MOVO 7*16(Stack), X10 // Load 8 bit rotate-left constant

CHACHA_LOOP_64:
	CHACHA_QROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10)
	CHACHA_SHUFFLE_SSE(X5, X6, X7)
	CHACHA_QROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10)
	CHACHA_SHUFFLE_SSE(X7, X6, X5)
	SUBQ $2, Tmp0
	JNZ CHACHA_LOOP_64

	PADDL X0, X4
	PADDL X1, X5
	PADDL X2, X6
	PADDL X3, X7
	PADDQ 4*16(Stack), X3

	CMPQ Len, $64
	JB BUFFER_KEYSTREAM

	XOR_SSE(Dst, Src, 0, X4, X5, X6, X7, X8)
	ADDQ $64, Src
	ADDQ $64, Dst
	SUBQ $64, Len
	JMP DONE // jump directly to DONE - there is no keystream to buffer, Len == 0 always true.

BUFFER_KEYSTREAM:
	// 0 < Len < 64 on every path that reaches this label: store the whole
	// keystream block and xor the remaining Len bytes one by one.
	MOVOU X4, 0*16(Buffer)
	MOVOU X5, 1*16(Buffer)
	MOVOU X6, 2*16(Buffer)
	MOVOU X7, 3*16(Buffer)
	MOVQ Len, Tmp0 // FINALIZE counts Tmp0 down, Len is kept as return value
	FINALIZE(Dst, Src, Buffer, Tmp0, Tmp1, Tmp2)

DONE:
	MOVQ SavedSP, Stack // Restore stack pointer
	MOVOU X3, 3*16(State)
	MOVQ Len, ret+72(FP)
	RET

// func xorKeyStreamAVX(dst, src []byte, block, state *[64]byte, rounds int) int
//
// AVX variant of xorKeyStreamSSSE3: same control flow, stack layout
// (slots 0-7, frame size $144), return value and state update, written with
// VEX encoded 128 bit instructions. VZEROUPPER is executed before returning.
// rounds must be even and non-zero.
TEXT ·xorKeyStreamAVX(SB), 4, $144-80
	MOVQ dst_base+0(FP), Dst
	MOVQ src_base+24(FP), Src
	MOVQ src_len+32(FP), Len
	MOVQ block+48(FP), Buffer
	MOVQ state+56(FP), State
	MOVQ rounds+64(FP), Rounds

	VMOVDQU 0*16(State), X0
	VMOVDQU 1*16(State), X1
	VMOVDQU 2*16(State), X2
	VMOVDQU 3*16(State), X3

	// Align the stack pointer to 16 bytes so that VMOVDQA can be used on it.
	MOVQ Stack, SavedSP
	ADDQ $16, Stack
	ANDQ $-16, Stack

	TESTQ Len, Len
	JZ DONE

	VMOVDQU ·one<>(SB), X4
	VMOVDQU ·rol16<>(SB), X5
	VMOVDQU ·rol8<>(SB), X6
	VMOVDQA X0, 0*16(Stack)
	VMOVDQA X1, 1*16(Stack)
	VMOVDQA X2, 2*16(Stack)
	VMOVDQA X3, 3*16(Stack)
	VMOVDQA X4, 4*16(Stack)
	VMOVDQA X5, 6*16(Stack)
	VMOVDQA X6, 7*16(Stack)

	CMPQ Len, $64
	JBE GENERATE_KEYSTREAM_64
	CMPQ Len, $128
	JBE GENERATE_KEYSTREAM_128
	CMPQ Len, $192
	JBE GENERATE_KEYSTREAM_192

GENERATE_KEYSTREAM_256:
	// Four blocks: X0-X3 (counter), X12-X15 (+1), X8-X11 (+2), X4-X7 (+3).
	VMOVDQA X0, X12
	VMOVDQA X1, X13
	VMOVDQA X2, X14
	VMOVDQA X3, X15
	VPADDQ 4*16(Stack), X15, X15
	VMOVDQA X0, X8
	VMOVDQA X1, X9
	VMOVDQA X2, X10
	VMOVDQA X15, X11
	VPADDQ 4*16(Stack), X11, X11
	VMOVDQA X0, X4
	VMOVDQA X1, X5
	VMOVDQA X2, X6
	VMOVDQA X11, X7
	VPADDQ 4*16(Stack), X7, X7
	MOVQ Rounds, Tmp0

	VMOVDQA X3, 3*16(Stack) // Save X3

CHACHA_LOOP_256:
	// No register is free here: the rotate constants are passed as memory
	// operands and X4 resp. X0 is spilled to slot 5 while used as scratch.
	VMOVDQA X4, 5*16(Stack)
	CHACHA_QROUND_AVX(X0, X1, X2, X3, X4, 6*16(Stack), 7*16(Stack))
	CHACHA_QROUND_AVX(X12, X13, X14, X15, X4, 6*16(Stack), 7*16(Stack))
	VMOVDQA 5*16(Stack), X4
	VMOVDQA X0, 5*16(Stack)
	CHACHA_QROUND_AVX(X8, X9, X10, X11, X0, 6*16(Stack), 7*16(Stack))
	CHACHA_QROUND_AVX(X4, X5, X6, X7, X0, 6*16(Stack), 7*16(Stack))
	VMOVDQA 5*16(Stack), X0
	CHACHA_SHUFFLE_AVX(X1, X2, X3)
	CHACHA_SHUFFLE_AVX(X13, X14, X15)
	CHACHA_SHUFFLE_AVX(X9, X10, X11)
	CHACHA_SHUFFLE_AVX(X5, X6, X7)
	VMOVDQA X4, 5*16(Stack)
	CHACHA_QROUND_AVX(X0, X1, X2, X3, X4, 6*16(Stack), 7*16(Stack))
	CHACHA_QROUND_AVX(X12, X13, X14, X15, X4, 6*16(Stack), 7*16(Stack))
	VMOVDQA 5*16(Stack), X4
	VMOVDQA X0, 5*16(Stack)
	CHACHA_QROUND_AVX(X8, X9, X10, X11, X0, 6*16(Stack), 7*16(Stack))
	CHACHA_QROUND_AVX(X4, X5, X6, X7, X0, 6*16(Stack), 7*16(Stack))
	VMOVDQA 5*16(Stack), X0
	CHACHA_SHUFFLE_AVX(X3, X2, X1)
	CHACHA_SHUFFLE_AVX(X15, X14, X13)
	CHACHA_SHUFFLE_AVX(X11, X10, X9)
	CHACHA_SHUFFLE_AVX(X7, X6, X5)
	SUBQ $2, Tmp0
	JNZ CHACHA_LOOP_256

	// Add the saved input state to the first block and xor it.
	VPADDD 0*16(Stack), X0, X0
	VPADDD 1*16(Stack), X1, X1
	VPADDD 2*16(Stack), X2, X2
	VPADDD 3*16(Stack), X3, X3
	VMOVDQA X4, 5*16(Stack) // Save X4
	XOR_AVX(Dst, Src, 0, X0, X1, X2, X3, X4)
	VMOVDQA 5*16(Stack), X4 // Restore X4

	// Reload the input state; X3 is advanced once per block.
	VMOVDQA 0*16(Stack), X0
	VMOVDQA 1*16(Stack), X1
	VMOVDQA 2*16(Stack), X2
	VMOVDQA 3*16(Stack), X3
	VPADDQ 4*16(Stack), X3, X3

	VPADDD X0, X12, X12
	VPADDD X1, X13, X13
	VPADDD X2, X14, X14
	VPADDD X3, X15, X15
	VPADDQ 4*16(Stack), X3, X3
	VPADDD X0, X8, X8
	VPADDD X1, X9, X9
	VPADDD X2, X10, X10
	VPADDD X3, X11, X11
	VPADDQ 4*16(Stack), X3, X3
	VPADDD X0, X4, X4
	VPADDD X1, X5, X5
	VPADDD X2, X6, X6
	VPADDD X3, X7, X7
	VPADDQ 4*16(Stack), X3, X3

	XOR_AVX(Dst, Src, 64, X12, X13, X14, X15, X0)
	XOR_AVX(Dst, Src, 128, X8, X9, X10, X11, X0)
	VMOVDQA 0*16(Stack), X0 // Restore X0
	ADDQ $192, Dst
	ADDQ $192, Src
	SUBQ $192, Len

	// The fourth block (X4-X7) is either used completely or buffered.
	CMPQ Len, $64
	JB BUFFER_KEYSTREAM

	XOR_AVX(Dst, Src, 0, X4, X5, X6, X7, X8)
	ADDQ $64, Dst
	ADDQ $64, Src
	SUBQ $64, Len
	JZ DONE
	CMPQ Len, $64  // If Len <= 64 -> gen. only 64 byte keystream.
	JBE GENERATE_KEYSTREAM_64
	CMPQ Len, $128 // If 64 < Len <= 128 -> gen. only 128 byte keystream.
	JBE GENERATE_KEYSTREAM_128
	CMPQ Len, $192 // If Len > 192 -> repeat, otherwise Len > 128 && Len <= 192 -> gen. 192 byte keystream
	JA GENERATE_KEYSTREAM_256

GENERATE_KEYSTREAM_192:
	// Three blocks: X12-X15 (counter), X8-X11 (+1), X4-X7 (+2).
	// X0 is scratch; X1 / X2 temporarily hold the rotate constants.
	VMOVDQA X0, X12
	VMOVDQA X1, X13
	VMOVDQA X2, X14
	VMOVDQA X3, X15
	VMOVDQA X0, X8
	VMOVDQA X1, X9
	VMOVDQA X2, X10
	VMOVDQA X3, X11
	VPADDQ 4*16(Stack), X11, X11
	VMOVDQA X0, X4
	VMOVDQA X1, X5
	VMOVDQA X2, X6
	VMOVDQA X11, X7
	VPADDQ 4*16(Stack), X7, X7
	MOVQ Rounds, Tmp0

	VMOVDQA 6*16(Stack), X1 // Load 16 bit rotate-left constant
	VMOVDQA 7*16(Stack), X2 // Load 8 bit rotate-left constant

CHACHA_LOOP_192:
	CHACHA_QROUND_AVX(X12, X13, X14, X15, X0, X1, X2)
	CHACHA_QROUND_AVX(X8, X9, X10, X11, X0, X1, X2)
	CHACHA_QROUND_AVX(X4, X5, X6, X7, X0, X1, X2)
	CHACHA_SHUFFLE_AVX(X13, X14, X15)
	CHACHA_SHUFFLE_AVX(X9, X10, X11)
	CHACHA_SHUFFLE_AVX(X5, X6, X7)
	CHACHA_QROUND_AVX(X12, X13, X14, X15, X0, X1, X2)
	CHACHA_QROUND_AVX(X8, X9, X10, X11, X0, X1, X2)
	CHACHA_QROUND_AVX(X4, X5, X6, X7, X0, X1, X2)
	CHACHA_SHUFFLE_AVX(X15, X14, X13)
	CHACHA_SHUFFLE_AVX(X11, X10, X9)
	CHACHA_SHUFFLE_AVX(X7, X6, X5)
	SUBQ $2, Tmp0
	JNZ CHACHA_LOOP_192

	VMOVDQA 0*16(Stack), X0 // Restore X0
	VMOVDQA 1*16(Stack), X1 // Restore X1
	VMOVDQA 2*16(Stack), X2 // Restore X2
	VPADDD X0, X12, X12
	VPADDD X1, X13, X13
	VPADDD X2, X14, X14
	VPADDD X3, X15, X15
	VPADDQ 4*16(Stack), X3, X3
	VPADDD X0, X8, X8
	VPADDD X1, X9, X9
	VPADDD X2, X10, X10
	VPADDD X3, X11, X11
	VPADDQ 4*16(Stack), X3, X3
	VPADDD X0, X4, X4
	VPADDD X1, X5, X5
	VPADDD X2, X6, X6
	VPADDD X3, X7, X7
	VPADDQ 4*16(Stack), X3, X3

	XOR_AVX(Dst, Src, 0, X12, X13, X14, X15, X0)
	XOR_AVX(Dst, Src, 64, X8, X9, X10, X11, X0)
	VMOVDQA 0*16(Stack), X0 // Restore X0
	ADDQ $128, Dst
	ADDQ $128, Src
	SUBQ $128, Len

	CMPQ Len, $64
	JB BUFFER_KEYSTREAM

	XOR_AVX(Dst, Src, 0, X4, X5, X6, X7, X8)
	ADDQ $64, Dst
	ADDQ $64, Src
	SUBQ $64, Len
	JZ DONE
	CMPQ Len, $64 // If Len <= 64 -> gen. only 64 byte keystream.
	JBE GENERATE_KEYSTREAM_64

GENERATE_KEYSTREAM_128:
	// Two blocks: X8-X11 (counter), X4-X7 (+1). X12 is scratch.
	VMOVDQA X0, X8
	VMOVDQA X1, X9
	VMOVDQA X2, X10
	VMOVDQA X3, X11
	VMOVDQA X0, X4
	VMOVDQA X1, X5
	VMOVDQA X2, X6
	VMOVDQA X3, X7
	VPADDQ 4*16(Stack), X7, X7
	MOVQ Rounds, Tmp0

	VMOVDQA 6*16(Stack), X13 // Load 16 bit rotate-left constant
	VMOVDQA 7*16(Stack), X14 // Load 8 bit rotate-left constant

CHACHA_LOOP_128:
	CHACHA_QROUND_AVX(X8, X9, X10, X11, X12, X13, X14)
	CHACHA_QROUND_AVX(X4, X5, X6, X7, X12, X13, X14)
	CHACHA_SHUFFLE_AVX(X9, X10, X11)
	CHACHA_SHUFFLE_AVX(X5, X6, X7)
	CHACHA_QROUND_AVX(X8, X9, X10, X11, X12, X13, X14)
	CHACHA_QROUND_AVX(X4, X5, X6, X7, X12, X13, X14)
	CHACHA_SHUFFLE_AVX(X11, X10, X9)
	CHACHA_SHUFFLE_AVX(X7, X6, X5)
	SUBQ $2, Tmp0
	JNZ CHACHA_LOOP_128

	VPADDD X0, X8, X8
	VPADDD X1, X9, X9
	VPADDD X2, X10, X10
	VPADDD X3, X11, X11
	VPADDQ 4*16(Stack), X3, X3
	VPADDD X0, X4, X4
	VPADDD X1, X5, X5
	VPADDD X2, X6, X6
	VPADDD X3, X7, X7
	VPADDQ 4*16(Stack), X3, X3

	XOR_AVX(Dst, Src, 0, X8, X9, X10, X11, X12)
	ADDQ $64, Dst
	ADDQ $64, Src
	SUBQ $64, Len

	CMPQ Len, $64
	JB BUFFER_KEYSTREAM

	XOR_AVX(Dst, Src, 0, X4, X5, X6, X7, X8)
	ADDQ $64, Dst
	ADDQ $64, Src
	SUBQ $64, Len
	JZ DONE // If Len == 0 -> DONE, otherwise Len <= 64 -> gen 64 byte keystream

GENERATE_KEYSTREAM_64:
	// One block in X4-X7. X8 is scratch.
	VMOVDQA X0, X4
	VMOVDQA X1, X5
	VMOVDQA X2, X6
	VMOVDQA X3, X7
	MOVQ Rounds, Tmp0

	VMOVDQA 6*16(Stack), X9  // Load 16 bit rotate-left constant
	VMOVDQA 7*16(Stack), X10 // Load 8 bit rotate-left constant

CHACHA_LOOP_64:
	CHACHA_QROUND_AVX(X4, X5, X6, X7, X8, X9, X10)
	CHACHA_SHUFFLE_AVX(X5, X6, X7)
	CHACHA_QROUND_AVX(X4, X5, X6, X7, X8, X9, X10)
	CHACHA_SHUFFLE_AVX(X7, X6, X5)
	SUBQ $2, Tmp0
	JNZ CHACHA_LOOP_64

	VPADDD X0, X4, X4
	VPADDD X1, X5, X5
	VPADDD X2, X6, X6
	VPADDD X3, X7, X7
	VPADDQ 4*16(Stack), X3, X3

	CMPQ Len, $64
	JB BUFFER_KEYSTREAM

	XOR_AVX(Dst, Src, 0, X4, X5, X6, X7, X8)
	ADDQ $64, Src
	ADDQ $64, Dst
	SUBQ $64, Len
	JMP DONE // jump directly to DONE - there is no keystream to buffer, Len == 0 always true.

BUFFER_KEYSTREAM:
	// 0 < Len < 64 on every path that reaches this label: store the whole
	// keystream block and xor the remaining Len bytes one by one.
	VMOVDQU X4, 0*16(Buffer)
	VMOVDQU X5, 1*16(Buffer)
	VMOVDQU X6, 2*16(Buffer)
	VMOVDQU X7, 3*16(Buffer)
	MOVQ Len, Tmp0 // FINALIZE counts Tmp0 down, Len is kept as return value
	FINALIZE(Dst, Src, Buffer, Tmp0, Tmp1, Tmp2)

DONE:
	MOVQ SavedSP, Stack // Restore stack pointer
	VMOVDQU X3, 3*16(State)
	VZEROUPPER
	MOVQ Len, ret+72(FP)
	RET

#undef Dst
#undef Src
#undef Len
#undef Rounds
#undef Buffer
#undef State
#undef Stack
#undef SavedSP
#undef Tmp0
#undef Tmp1
#undef Tmp2