// Copyright (c) 2018 Andreas Auernhammer. All rights reserved.
// Use of this source code is governed by a license that can be
// found in the LICENSE file.

// +build 386,!gccgo,!appengine,!nacl amd64,!gccgo,!appengine,!nacl

// ROTL_SSE rotates all 4 32 bit values of the XMM register v
// left by n bits using SSE2 instructions (0 <= n <= 32).
// The XMM register t is used as a temp. register.
#define ROTL_SSE(n, t, v) \
	MOVO v, t; \
	PSLLL $n, t; \
	PSRLL $(32-n), v; \
	PXOR t, v
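
// For each 32 bit lane the macro above computes v = (v << n) ^ (v >> (32-n)),
// i.e. a left rotation of v by n bits. A hypothetical invocation
// ROTL_SSE(12, X11, X1) would expand to
//	MOVO X1, X11; PSLLL $12, X11; PSRLL $20, X1; PXOR X11, X1
// (the register names are chosen for illustration only).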

// ROTL_AVX rotates all 4/8 32 bit values of the AVX/AVX2 register v
// left by n bits using AVX/AVX2 instructions (0 <= n <= 32).
// The AVX/AVX2 register t is used as a temp. register.
#define ROTL_AVX(n, t, v) \
	VPSLLD $n, v, t; \
	VPSRLD $(32-n), v, v; \
	VPXOR v, t, v
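
// Unlike ROTL_SSE no copy of v into t is needed here: the three-operand
// VEX forms write t = v << n and v = v >> (32-n) directly, and VPXOR
// combines them into the rotated result.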

// CHACHA_QROUND_SSE2 performs a ChaCha quarter-round using the
// 4 XMM registers v0, v1, v2 and v3. It uses only ROTL_SSE for
// rotations. The XMM register t is used as a temp. register.
#define CHACHA_QROUND_SSE2(v0, v1, v2, v3, t) \
	PADDL v1, v0; \
	PXOR v0, v3; \
	ROTL_SSE(16, t, v3); \
	PADDL v3, v2; \
	PXOR v2, v1; \
	ROTL_SSE(12, t, v1); \
	PADDL v1, v0; \
	PXOR v0, v3; \
	ROTL_SSE(8, t, v3); \
	PADDL v3, v2; \
	PXOR v2, v1; \
	ROTL_SSE(7, t, v1)
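
// Written out, the macro above computes (per 32 bit lane):
//	v0 += v1; v3 ^= v0; v3 <<<= 16
//	v2 += v3; v1 ^= v2; v1 <<<= 12
//	v0 += v1; v3 ^= v0; v3 <<<=  8
//	v2 += v3; v1 ^= v2; v1 <<<=  7
// which is the ChaCha quarter-round (see RFC 7539, section 2.1) applied to
// all four lanes in parallel.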

// CHACHA_QROUND_SSSE3 performs a ChaCha quarter-round using the
// 4 XMM registers v0, v1, v2 and v3. It uses PSHUFB for 8/16 bit
// rotations. The XMM register t is used as a temp. register.
//
// r16 holds the PSHUFB constant for a 16 bit left rotate.
// r8 holds the PSHUFB constant for an 8 bit left rotate.
#define CHACHA_QROUND_SSSE3(v0, v1, v2, v3, t, r16, r8) \
	PADDL v1, v0; \
	PXOR v0, v3; \
	PSHUFB r16, v3; \
	PADDL v3, v2; \
	PXOR v2, v1; \
	ROTL_SSE(12, t, v1); \
	PADDL v1, v0; \
	PXOR v0, v3; \
	PSHUFB r8, v3; \
	PADDL v3, v2; \
	PXOR v2, v1; \
	ROTL_SSE(7, t, v1)
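
// PSHUFB performs the 16 and 8 bit rotations as byte shuffles, replacing
// the three-instruction ROTL_SSE sequence with a single instruction. For a
// left rotation by 16 the mask must pick bytes 2,3,0,1 of every 32 bit
// lane, for a rotation by 8 bytes 3,0,1,2; the caller is expected to load
// these masks into r16 and r8.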

// CHACHA_QROUND_AVX performs a ChaCha quarter-round using the
// 4 AVX/AVX2 registers v0, v1, v2 and v3. It uses VPSHUFB for 8/16 bit
// rotations. The AVX/AVX2 register t is used as a temp. register.
//
// r16 holds the VPSHUFB constant for a 16 bit left rotate.
// r8 holds the VPSHUFB constant for an 8 bit left rotate.
#define CHACHA_QROUND_AVX(v0, v1, v2, v3, t, r16, r8) \
	VPADDD v0, v1, v0; \
	VPXOR v3, v0, v3; \
	VPSHUFB r16, v3, v3; \
	VPADDD v2, v3, v2; \
	VPXOR v1, v2, v1; \
	ROTL_AVX(12, t, v1); \
	VPADDD v0, v1, v0; \
	VPXOR v3, v0, v3; \
	VPSHUFB r8, v3, v3; \
	VPADDD v2, v3, v2; \
	VPXOR v1, v2, v1; \
	ROTL_AVX(7, t, v1)

// CHACHA_SHUFFLE_SSE performs a ChaCha shuffle using the
// 3 XMM registers v1, v2 and v3. The inverse shuffle is
// performed by switching v1 and v3: CHACHA_SHUFFLE_SSE(v3, v2, v1).
#define CHACHA_SHUFFLE_SSE(v1, v2, v3) \
	PSHUFL $0x39, v1, v1; \
	PSHUFL $0x4E, v2, v2; \
	PSHUFL $0x93, v3, v3
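
// The shuffle immediates rotate the four 32 bit lanes of a register:
// with $0x39 lane i receives the old lane (i+1) mod 4, with $0x4E the old
// lane (i+2) mod 4 and with $0x93 the old lane (i+3) mod 4. This is the
// usual re-arrangement between the column and the diagonal phase of a
// ChaCha double round, which is also why swapping v1 and v3 gives the
// inverse shuffle.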

// CHACHA_SHUFFLE_AVX performs a ChaCha shuffle using the
// 3 AVX/AVX2 registers v1, v2 and v3. The inverse shuffle is
// performed by switching v1 and v3: CHACHA_SHUFFLE_AVX(v3, v2, v1).
#define CHACHA_SHUFFLE_AVX(v1, v2, v3) \
	VPSHUFD $0x39, v1, v1; \
	VPSHUFD $0x4E, v2, v2; \
	VPSHUFD $0x93, v3, v3

// XOR_SSE extracts 4x16 byte vectors from src at
// off, xors all vectors with the corresponding XMM
// register (v0 - v3) and writes the result to dst
// at off.
// The XMM register t is used as a temp. register.
#define XOR_SSE(dst, src, off, v0, v1, v2, v3, t) \
	MOVOU 0+off(src), t; \
	PXOR v0, t; \
	MOVOU t, 0+off(dst); \
	MOVOU 16+off(src), t; \
	PXOR v1, t; \
	MOVOU t, 16+off(dst); \
	MOVOU 32+off(src), t; \
	PXOR v2, t; \
	MOVOU t, 32+off(dst); \
	MOVOU 48+off(src), t; \
	PXOR v3, t; \
	MOVOU t, 48+off(dst)
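
// A hypothetical invocation XOR_SSE(DI, SI, 64, X0, X1, X2, X3, X4) would
// xor the 64 bytes at 64(SI) with the contents of X0 - X3 and store the
// result at 64(DI); the register names and the offset are chosen for
// illustration only.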

// XOR_AVX extracts 4x16 byte vectors from src at
// off, xors all vectors with the corresponding AVX
// register (v0 - v3) and writes the result to dst
// at off.
// The XMM register t is used as a temp. register.
#define XOR_AVX(dst, src, off, v0, v1, v2, v3, t) \
	VPXOR 0+off(src), v0, t; \
	VMOVDQU t, 0+off(dst); \
	VPXOR 16+off(src), v1, t; \
	VMOVDQU t, 16+off(dst); \
	VPXOR 32+off(src), v2, t; \
	VMOVDQU t, 32+off(dst); \
	VPXOR 48+off(src), v3, t; \
	VMOVDQU t, 48+off(dst)
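
// Since VPXOR accepts a memory source operand, each 16 byte chunk is
// loaded, xored and stored with two instructions instead of the three
// used by XOR_SSE.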

// XOR_AVX2 loads 4x32 byte vectors from src at off, xors them with the
// (recombined) contents of the AVX2 registers v0 - v3 and writes the
// result to dst at off.
// The AVX2 registers t0 and t1 are used as temp. registers.
#define XOR_AVX2(dst, src, off, v0, v1, v2, v3, t0, t1) \
	VMOVDQU (0+off)(src), t0; \
	VPERM2I128 $32, v1, v0, t1; \
	VPXOR t0, t1, t0; \
	VMOVDQU t0, (0+off)(dst); \
	VMOVDQU (32+off)(src), t0; \
	VPERM2I128 $32, v3, v2, t1; \
	VPXOR t0, t1, t0; \
	VMOVDQU t0, (32+off)(dst); \
	VMOVDQU (64+off)(src), t0; \
	VPERM2I128 $49, v1, v0, t1; \
	VPXOR t0, t1, t0; \
	VMOVDQU t0, (64+off)(dst); \
	VMOVDQU (96+off)(src), t0; \
	VPERM2I128 $49, v3, v2, t1; \
	VPXOR t0, t1, t0; \
	VMOVDQU t0, (96+off)(dst)
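
// VPERM2I128 recombines the 128 bit halves of two YMM registers: $32
// gathers the lower halves of both sources, $49 the upper halves. The
// first 64 output bytes are therefore built from the lower halves of
// v0 - v3 and the second 64 bytes from the upper halves, which allows a
// caller to keep the matching rows of two consecutive keystream blocks in
// a single YMM register.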

// XOR_UPPER_AVX2 works like XOR_AVX2, but only processes the first
// 2x32 byte vectors, i.e. the 64 bytes built from the lower 128 bit
// halves of v0 - v3.
// The AVX2 registers t0 and t1 are used as temp. registers.
#define XOR_UPPER_AVX2(dst, src, off, v0, v1, v2, v3, t0, t1) \
	VMOVDQU (0+off)(src), t0; \
	VPERM2I128 $32, v1, v0, t1; \
	VPXOR t0, t1, t0; \
	VMOVDQU t0, (0+off)(dst); \
	VMOVDQU (32+off)(src), t0; \
	VPERM2I128 $32, v3, v2, t1; \
	VPXOR t0, t1, t0; \
	VMOVDQU t0, (32+off)(dst)

// EXTRACT_LOWER stores the two 32 byte vectors built from the upper
// 128 bit halves of v0/v1 and v2/v3 to dst.
// The AVX2 register t0 is used as a temp. register.
#define EXTRACT_LOWER(dst, v0, v1, v2, v3, t0) \
	VPERM2I128 $49, v1, v0, t0; \
	VMOVDQU t0, 0(dst); \
	VPERM2I128 $49, v3, v2, t0; \
	VMOVDQU t0, 32(dst)