[chore] bump go structr cache version -> v0.6.0 (#2773)

* update go-structr library -> v0.6.0, adding the wrapping types and code changes needed to support the new version

* update readme with go-structr package changes

* improved wrapping of the SliceCache type (a rough sketch of this wrapping pattern follows below)

* add code comments for the cache wrapper types

* remove test.out 😇
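
A rough sketch of the wrapping pattern described above, for illustration only: the type and method names here are assumptions, not the actual GoToSocial or go-structr API, and the real wrappers wrap go-structr's caches rather than a plain map. The point is the shape, a thin typed surface so call sites stay stable when the wrapped library's API changes between versions:

```go
package main

import (
	"fmt"
	"sync"
)

// SliceCache is a hypothetical wrapper caching slices of values under a
// string key. It hides the underlying cache implementation behind a small,
// typed surface.
type SliceCache[T any] struct {
	mu sync.Mutex
	m  map[string][]T
}

func NewSliceCache[T any]() *SliceCache[T] {
	return &SliceCache[T]{m: make(map[string][]T)}
}

// Load returns a copy of the cached slice for key, so callers
// cannot mutate the cached backing array.
func (c *SliceCache[T]) Load(key string) ([]T, bool) {
	c.mu.Lock()
	defer c.mu.Unlock()
	s, ok := c.m[key]
	if !ok {
		return nil, false
	}
	return append([]T(nil), s...), true
}

// Store caches its own copy of the given slice under key.
func (c *SliceCache[T]) Store(key string, s []T) {
	c.mu.Lock()
	defer c.mu.Unlock()
	c.m[key] = append([]T(nil), s...)
}

// Invalidate drops any cached slice for key.
func (c *SliceCache[T]) Invalidate(key string) {
	c.mu.Lock()
	defer c.mu.Unlock()
	delete(c.m, key)
}

func main() {
	c := NewSliceCache[string]()
	c.Store("followers", []string{"a", "b"})
	if ids, ok := c.Load("followers"); ok {
		fmt.Println(ids) // [a b]
	}
}
```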

---------

Co-authored-by: tobi <31960611+tsmethurst@users.noreply.github.com>
kim authored 2024-04-02 11:03:40 +01:00, committed by GitHub
parent f05874be30 · commit adf345f1ec
62 changed files with 2412 additions and 5857 deletions


@@ -1,6 +0,0 @@
upstream
*.pprof
xxh3.test
.vscode
*.txt
_compat

vendor/github.com/zeebo/xxh3/LICENSE (25 lines, generated, vendored)

@@ -1,25 +0,0 @@
xxHash Library
Copyright (c) 2012-2014, Yann Collet
Copyright (c) 2019, Jeff Wendling
All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice, this
list of conditions and the following disclaimer in the documentation and/or
other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


@@ -1,27 +0,0 @@
.PHONY: all vet
all: genasm _compat
genasm: avo/avx.go avo/sse.go
	cd ./avo; go generate gen.go
clean:
	rm accum_vector_avx_amd64.s
	rm accum_vector_sse_amd64.s
	rm _compat
upstream/xxhash.o: upstream/xxhash.h
	( cd upstream && make )
_compat: _compat.c upstream/xxhash.o
	gcc -o _compat _compat.c ./upstream/xxhash.o
vet:
	GOOS=linux GOARCH=386 GO386=softfloat go vet ./...
	GOOS=windows GOARCH=386 GO386=softfloat go vet ./...
	GOOS=linux GOARCH=amd64 go vet ./...
	GOOS=windows GOARCH=amd64 go vet ./...
	GOOS=darwin GOARCH=amd64 go vet ./...
	GOOS=linux GOARCH=arm go vet ./...
	GOOS=linux GOARCH=arm64 go vet ./...
	GOOS=windows GOARCH=arm64 go vet ./...
	GOOS=darwin GOARCH=arm64 go vet ./...


@@ -1,38 +0,0 @@
# XXH3
[![GoDoc](https://godoc.org/github.com/zeebo/xxh3?status.svg)](https://godoc.org/github.com/zeebo/xxh3)
[![Sourcegraph](https://sourcegraph.com/github.com/zeebo/xxh3/-/badge.svg)](https://sourcegraph.com/github.com/zeebo/xxh3?badge)
[![Go Report Card](https://goreportcard.com/badge/github.com/zeebo/xxh3)](https://goreportcard.com/report/github.com/zeebo/xxh3)
This package is a port of the [xxh3](https://github.com/Cyan4973/xxHash) library to Go.
Upstream has fixed the output as of v0.8.0, and this package matches that.
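For orientation, a minimal usage sketch using functions this package exports (`Hash`, `HashString`, `HashSeed`, and `Hash128`, all visible in the sources below):

```go
package main

import (
	"fmt"

	"github.com/zeebo/xxh3"
)

func main() {
	data := []byte("hello, world")

	// 64-bit hashes of a byte slice and of a string.
	fmt.Printf("%#016x\n", xxh3.Hash(data))
	fmt.Printf("%#016x\n", xxh3.HashString("hello, world"))

	// Seeded variant.
	fmt.Printf("%#016x\n", xxh3.HashSeed(data, 1234))

	// 128-bit hash, returned as a Uint128 with Hi and Lo words.
	h := xxh3.Hash128(data)
	fmt.Printf("%016x%016x\n", h.Hi, h.Lo)
}
```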
---
# Benchmarks
Run on my `i7-8850H CPU @ 2.60GHz`
## Small Sizes
| Bytes | Rate |
|-----------|--------------------------------------|
|` 0 ` |` 0.74 ns/op ` |
|` 1-3 ` |` 4.19 ns/op (0.24 GB/s - 0.71 GB/s) `|
|` 4-8 ` |` 4.16 ns/op (0.97 GB/s - 1.98 GB/s) `|
|` 9-16 ` |` 4.46 ns/op (2.02 GB/s - 3.58 GB/s) `|
|` 17-32 ` |` 6.22 ns/op (2.76 GB/s - 5.15 GB/s) `|
|` 33-64 ` |` 8.00 ns/op (4.13 GB/s - 8.13 GB/s) `|
|` 65-96 ` |` 11.0 ns/op (5.91 GB/s - 8.84 GB/s) `|
|` 97-128 ` |` 12.8 ns/op (7.68 GB/s - 10.0 GB/s) `|
## Large Sizes
| Bytes | Rate | SSE2 Rate | AVX2 Rate |
|---------|--------------------------|--------------------------|--------------------------|
|` 129 ` |` 13.6 ns/op (9.45 GB/s) `| | |
|` 240 ` |` 23.8 ns/op (10.1 GB/s) `| | |
|` 241 ` |` 40.5 ns/op (5.97 GB/s) `|` 23.3 ns/op (10.4 GB/s) `|` 20.1 ns/op (12.0 GB/s) `|
|` 512 ` |` 69.8 ns/op (7.34 GB/s) `|` 30.4 ns/op (16.9 GB/s) `|` 24.7 ns/op (20.7 GB/s) `|
|` 1024 ` |` 132 ns/op (7.77 GB/s) `|` 48.9 ns/op (20.9 GB/s) `|` 37.7 ns/op (27.2 GB/s) `|
|` 100KB `|` 13.0 us/op (7.88 GB/s) `|` 4.05 us/op (25.3 GB/s) `|` 2.31 us/op (44.3 GB/s) `|


@@ -1,39 +0,0 @@
#include "upstream/xxhash.h"
#include <stdio.h>
int main() {
unsigned char buf[4096];
for (int i = 0; i < 4096; i++) {
buf[i] = (unsigned char)((i+1)%251);
}
printf("var testVecs64 = []uint64{\n");
for (int i = 0; i < 4096; i++) {
if (i % 4 == 0) {
printf("\t");
}
uint64_t h = XXH3_64bits(buf, (size_t)i);
printf("0x%lx, ", h);
if (i % 4 == 3) {
printf("\n\t");
}
}
printf("}\n\n");
printf("var testVecs128 = [][2]uint64{\n");
for (int i = 0; i < 4096; i++) {
if (i % 4 == 0) {
printf("\t");
}
XXH128_hash_t h = XXH3_128bits(buf, (size_t)i);
printf("{0x%lx, 0x%lx}, ", h.high64, h.low64);
if (i % 4 == 3) {
printf("\n");
}
}
printf("}\n\n");
}


@@ -1,542 +0,0 @@
package xxh3
// avx512Switch is the size at which the avx512 code is used.
// Bigger blocks benefit more.
const avx512Switch = 1 << 10
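// (The length-based dispatch using this constant lives in the hashAny*
// functions in the hash*.go files later in this diff: inputs of at least
// avx512Switch bytes take the AVX-512 path when available, otherwise
// AVX2, SSE2, or the scalar code below.)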
func accumScalar(accs *[8]u64, p, secret ptr, l u64) {
if secret != key {
accumScalarSeed(accs, p, secret, l)
return
}
for l > _block {
k := secret
// accs
for i := 0; i < 16; i++ {
dv0 := readU64(p, 8*0)
dk0 := dv0 ^ readU64(k, 8*0)
accs[1] += dv0
accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32)
dv1 := readU64(p, 8*1)
dk1 := dv1 ^ readU64(k, 8*1)
accs[0] += dv1
accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32)
dv2 := readU64(p, 8*2)
dk2 := dv2 ^ readU64(k, 8*2)
accs[3] += dv2
accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32)
dv3 := readU64(p, 8*3)
dk3 := dv3 ^ readU64(k, 8*3)
accs[2] += dv3
accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32)
dv4 := readU64(p, 8*4)
dk4 := dv4 ^ readU64(k, 8*4)
accs[5] += dv4
accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32)
dv5 := readU64(p, 8*5)
dk5 := dv5 ^ readU64(k, 8*5)
accs[4] += dv5
accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32)
dv6 := readU64(p, 8*6)
dk6 := dv6 ^ readU64(k, 8*6)
accs[7] += dv6
accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32)
dv7 := readU64(p, 8*7)
dk7 := dv7 ^ readU64(k, 8*7)
accs[6] += dv7
accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32)
l -= _stripe
if l > 0 {
p, k = ptr(ui(p)+_stripe), ptr(ui(k)+8)
}
}
// scramble accs
accs[0] ^= accs[0] >> 47
accs[0] ^= key64_128
accs[0] *= prime32_1
accs[1] ^= accs[1] >> 47
accs[1] ^= key64_136
accs[1] *= prime32_1
accs[2] ^= accs[2] >> 47
accs[2] ^= key64_144
accs[2] *= prime32_1
accs[3] ^= accs[3] >> 47
accs[3] ^= key64_152
accs[3] *= prime32_1
accs[4] ^= accs[4] >> 47
accs[4] ^= key64_160
accs[4] *= prime32_1
accs[5] ^= accs[5] >> 47
accs[5] ^= key64_168
accs[5] *= prime32_1
accs[6] ^= accs[6] >> 47
accs[6] ^= key64_176
accs[6] *= prime32_1
accs[7] ^= accs[7] >> 47
accs[7] ^= key64_184
accs[7] *= prime32_1
}
if l > 0 {
t, k := (l-1)/_stripe, secret
for i := u64(0); i < t; i++ {
dv0 := readU64(p, 8*0)
dk0 := dv0 ^ readU64(k, 8*0)
accs[1] += dv0
accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32)
dv1 := readU64(p, 8*1)
dk1 := dv1 ^ readU64(k, 8*1)
accs[0] += dv1
accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32)
dv2 := readU64(p, 8*2)
dk2 := dv2 ^ readU64(k, 8*2)
accs[3] += dv2
accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32)
dv3 := readU64(p, 8*3)
dk3 := dv3 ^ readU64(k, 8*3)
accs[2] += dv3
accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32)
dv4 := readU64(p, 8*4)
dk4 := dv4 ^ readU64(k, 8*4)
accs[5] += dv4
accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32)
dv5 := readU64(p, 8*5)
dk5 := dv5 ^ readU64(k, 8*5)
accs[4] += dv5
accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32)
dv6 := readU64(p, 8*6)
dk6 := dv6 ^ readU64(k, 8*6)
accs[7] += dv6
accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32)
dv7 := readU64(p, 8*7)
dk7 := dv7 ^ readU64(k, 8*7)
accs[6] += dv7
accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32)
l -= _stripe
if l > 0 {
p, k = ptr(ui(p)+_stripe), ptr(ui(k)+8)
}
}
if l > 0 {
p = ptr(ui(p) - uintptr(_stripe-l))
dv0 := readU64(p, 8*0)
dk0 := dv0 ^ key64_121
accs[1] += dv0
accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32)
dv1 := readU64(p, 8*1)
dk1 := dv1 ^ key64_129
accs[0] += dv1
accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32)
dv2 := readU64(p, 8*2)
dk2 := dv2 ^ key64_137
accs[3] += dv2
accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32)
dv3 := readU64(p, 8*3)
dk3 := dv3 ^ key64_145
accs[2] += dv3
accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32)
dv4 := readU64(p, 8*4)
dk4 := dv4 ^ key64_153
accs[5] += dv4
accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32)
dv5 := readU64(p, 8*5)
dk5 := dv5 ^ key64_161
accs[4] += dv5
accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32)
dv6 := readU64(p, 8*6)
dk6 := dv6 ^ key64_169
accs[7] += dv6
accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32)
dv7 := readU64(p, 8*7)
dk7 := dv7 ^ key64_177
accs[6] += dv7
accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32)
}
}
}
func accumBlockScalar(accs *[8]u64, p, secret ptr) {
if secret != key {
accumBlockScalarSeed(accs, p, secret)
return
}
// accs
for i := 0; i < 16; i++ {
dv0 := readU64(p, 8*0)
dk0 := dv0 ^ readU64(secret, 8*0)
accs[1] += dv0
accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32)
dv1 := readU64(p, 8*1)
dk1 := dv1 ^ readU64(secret, 8*1)
accs[0] += dv1
accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32)
dv2 := readU64(p, 8*2)
dk2 := dv2 ^ readU64(secret, 8*2)
accs[3] += dv2
accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32)
dv3 := readU64(p, 8*3)
dk3 := dv3 ^ readU64(secret, 8*3)
accs[2] += dv3
accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32)
dv4 := readU64(p, 8*4)
dk4 := dv4 ^ readU64(secret, 8*4)
accs[5] += dv4
accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32)
dv5 := readU64(p, 8*5)
dk5 := dv5 ^ readU64(secret, 8*5)
accs[4] += dv5
accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32)
dv6 := readU64(p, 8*6)
dk6 := dv6 ^ readU64(secret, 8*6)
accs[7] += dv6
accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32)
dv7 := readU64(p, 8*7)
dk7 := dv7 ^ readU64(secret, 8*7)
accs[6] += dv7
accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32)
p, secret = ptr(ui(p)+_stripe), ptr(ui(secret)+8)
}
// scramble accs
accs[0] ^= accs[0] >> 47
accs[0] ^= key64_128
accs[0] *= prime32_1
accs[1] ^= accs[1] >> 47
accs[1] ^= key64_136
accs[1] *= prime32_1
accs[2] ^= accs[2] >> 47
accs[2] ^= key64_144
accs[2] *= prime32_1
accs[3] ^= accs[3] >> 47
accs[3] ^= key64_152
accs[3] *= prime32_1
accs[4] ^= accs[4] >> 47
accs[4] ^= key64_160
accs[4] *= prime32_1
accs[5] ^= accs[5] >> 47
accs[5] ^= key64_168
accs[5] *= prime32_1
accs[6] ^= accs[6] >> 47
accs[6] ^= key64_176
accs[6] *= prime32_1
accs[7] ^= accs[7] >> 47
accs[7] ^= key64_184
accs[7] *= prime32_1
}
// accumScalarSeed should be used with custom key.
func accumScalarSeed(accs *[8]u64, p, secret ptr, l u64) {
for l > _block {
k := secret
// accs
for i := 0; i < 16; i++ {
dv0 := readU64(p, 8*0)
dk0 := dv0 ^ readU64(k, 8*0)
accs[1] += dv0
accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32)
dv1 := readU64(p, 8*1)
dk1 := dv1 ^ readU64(k, 8*1)
accs[0] += dv1
accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32)
dv2 := readU64(p, 8*2)
dk2 := dv2 ^ readU64(k, 8*2)
accs[3] += dv2
accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32)
dv3 := readU64(p, 8*3)
dk3 := dv3 ^ readU64(k, 8*3)
accs[2] += dv3
accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32)
dv4 := readU64(p, 8*4)
dk4 := dv4 ^ readU64(k, 8*4)
accs[5] += dv4
accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32)
dv5 := readU64(p, 8*5)
dk5 := dv5 ^ readU64(k, 8*5)
accs[4] += dv5
accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32)
dv6 := readU64(p, 8*6)
dk6 := dv6 ^ readU64(k, 8*6)
accs[7] += dv6
accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32)
dv7 := readU64(p, 8*7)
dk7 := dv7 ^ readU64(k, 8*7)
accs[6] += dv7
accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32)
l -= _stripe
if l > 0 {
p, k = ptr(ui(p)+_stripe), ptr(ui(k)+8)
}
}
// scramble accs
accs[0] ^= accs[0] >> 47
accs[0] ^= readU64(secret, 128)
accs[0] *= prime32_1
accs[1] ^= accs[1] >> 47
accs[1] ^= readU64(secret, 136)
accs[1] *= prime32_1
accs[2] ^= accs[2] >> 47
accs[2] ^= readU64(secret, 144)
accs[2] *= prime32_1
accs[3] ^= accs[3] >> 47
accs[3] ^= readU64(secret, 152)
accs[3] *= prime32_1
accs[4] ^= accs[4] >> 47
accs[4] ^= readU64(secret, 160)
accs[4] *= prime32_1
accs[5] ^= accs[5] >> 47
accs[5] ^= readU64(secret, 168)
accs[5] *= prime32_1
accs[6] ^= accs[6] >> 47
accs[6] ^= readU64(secret, 176)
accs[6] *= prime32_1
accs[7] ^= accs[7] >> 47
accs[7] ^= readU64(secret, 184)
accs[7] *= prime32_1
}
if l > 0 {
t, k := (l-1)/_stripe, secret
for i := u64(0); i < t; i++ {
dv0 := readU64(p, 8*0)
dk0 := dv0 ^ readU64(k, 8*0)
accs[1] += dv0
accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32)
dv1 := readU64(p, 8*1)
dk1 := dv1 ^ readU64(k, 8*1)
accs[0] += dv1
accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32)
dv2 := readU64(p, 8*2)
dk2 := dv2 ^ readU64(k, 8*2)
accs[3] += dv2
accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32)
dv3 := readU64(p, 8*3)
dk3 := dv3 ^ readU64(k, 8*3)
accs[2] += dv3
accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32)
dv4 := readU64(p, 8*4)
dk4 := dv4 ^ readU64(k, 8*4)
accs[5] += dv4
accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32)
dv5 := readU64(p, 8*5)
dk5 := dv5 ^ readU64(k, 8*5)
accs[4] += dv5
accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32)
dv6 := readU64(p, 8*6)
dk6 := dv6 ^ readU64(k, 8*6)
accs[7] += dv6
accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32)
dv7 := readU64(p, 8*7)
dk7 := dv7 ^ readU64(k, 8*7)
accs[6] += dv7
accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32)
l -= _stripe
if l > 0 {
p, k = ptr(ui(p)+_stripe), ptr(ui(k)+8)
}
}
if l > 0 {
p = ptr(ui(p) - uintptr(_stripe-l))
dv0 := readU64(p, 8*0)
dk0 := dv0 ^ readU64(secret, 121)
accs[1] += dv0
accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32)
dv1 := readU64(p, 8*1)
dk1 := dv1 ^ readU64(secret, 129)
accs[0] += dv1
accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32)
dv2 := readU64(p, 8*2)
dk2 := dv2 ^ readU64(secret, 137)
accs[3] += dv2
accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32)
dv3 := readU64(p, 8*3)
dk3 := dv3 ^ readU64(secret, 145)
accs[2] += dv3
accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32)
dv4 := readU64(p, 8*4)
dk4 := dv4 ^ readU64(secret, 153)
accs[5] += dv4
accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32)
dv5 := readU64(p, 8*5)
dk5 := dv5 ^ readU64(secret, 161)
accs[4] += dv5
accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32)
dv6 := readU64(p, 8*6)
dk6 := dv6 ^ readU64(secret, 169)
accs[7] += dv6
accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32)
dv7 := readU64(p, 8*7)
dk7 := dv7 ^ readU64(secret, 177)
accs[6] += dv7
accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32)
}
}
}
// accumBlockScalarSeed should be used with custom key.
func accumBlockScalarSeed(accs *[8]u64, p, secret ptr) {
// accs
{
secret := secret
for i := 0; i < 16; i++ {
dv0 := readU64(p, 8*0)
dk0 := dv0 ^ readU64(secret, 8*0)
accs[1] += dv0
accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32)
dv1 := readU64(p, 8*1)
dk1 := dv1 ^ readU64(secret, 8*1)
accs[0] += dv1
accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32)
dv2 := readU64(p, 8*2)
dk2 := dv2 ^ readU64(secret, 8*2)
accs[3] += dv2
accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32)
dv3 := readU64(p, 8*3)
dk3 := dv3 ^ readU64(secret, 8*3)
accs[2] += dv3
accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32)
dv4 := readU64(p, 8*4)
dk4 := dv4 ^ readU64(secret, 8*4)
accs[5] += dv4
accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32)
dv5 := readU64(p, 8*5)
dk5 := dv5 ^ readU64(secret, 8*5)
accs[4] += dv5
accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32)
dv6 := readU64(p, 8*6)
dk6 := dv6 ^ readU64(secret, 8*6)
accs[7] += dv6
accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32)
dv7 := readU64(p, 8*7)
dk7 := dv7 ^ readU64(secret, 8*7)
accs[6] += dv7
accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32)
p, secret = ptr(ui(p)+_stripe), ptr(ui(secret)+8)
}
}
// scramble accs
accs[0] ^= accs[0] >> 47
accs[0] ^= readU64(secret, 128)
accs[0] *= prime32_1
accs[1] ^= accs[1] >> 47
accs[1] ^= readU64(secret, 136)
accs[1] *= prime32_1
accs[2] ^= accs[2] >> 47
accs[2] ^= readU64(secret, 144)
accs[2] *= prime32_1
accs[3] ^= accs[3] >> 47
accs[3] ^= readU64(secret, 152)
accs[3] *= prime32_1
accs[4] ^= accs[4] >> 47
accs[4] ^= readU64(secret, 160)
accs[4] *= prime32_1
accs[5] ^= accs[5] >> 47
accs[5] ^= readU64(secret, 168)
accs[5] *= prime32_1
accs[6] ^= accs[6] >> 47
accs[6] ^= readU64(secret, 176)
accs[6] *= prime32_1
accs[7] ^= accs[7] >> 47
accs[7] ^= readU64(secret, 184)
accs[7] *= prime32_1
}


@@ -1,40 +0,0 @@
package xxh3
import (
"unsafe"
"github.com/klauspost/cpuid/v2"
)
var (
hasAVX2 = cpuid.CPU.Has(cpuid.AVX2)
hasSSE2 = cpuid.CPU.Has(cpuid.SSE2) // Always true on amd64
hasAVX512 = cpuid.CPU.Has(cpuid.AVX512F)
)
//go:noescape
func accumAVX2(acc *[8]u64, data, key unsafe.Pointer, len u64)
//go:noescape
func accumAVX512(acc *[8]u64, data, key unsafe.Pointer, len u64)
//go:noescape
func accumSSE(acc *[8]u64, data, key unsafe.Pointer, len u64)
//go:noescape
func accumBlockAVX2(acc *[8]u64, data, key unsafe.Pointer)
//go:noescape
func accumBlockSSE(acc *[8]u64, data, key unsafe.Pointer)
func withOverrides(avx512, avx2, sse2 bool, cb func()) {
avx512Orig, avx2Orig, sse2Orig := hasAVX512, hasAVX2, hasSSE2
hasAVX512, hasAVX2, hasSSE2 = avx512, avx2, sse2
defer func() { hasAVX512, hasAVX2, hasSSE2 = avx512Orig, avx2Orig, sse2Orig }()
cb()
}
func withAVX512(cb func()) { withOverrides(hasAVX512, false, false, cb) }
func withAVX2(cb func()) { withOverrides(false, hasAVX2, false, cb) }
func withSSE2(cb func()) { withOverrides(false, false, hasSSE2, cb) }
func withGeneric(cb func()) { withOverrides(false, false, false, cb) }
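// The withOverrides/with* helpers above let callers (presumably tests) force
// a specific code path: they temporarily mask the detected CPU-feature flags,
// run the callback, then restore the originals. Note the globals are mutated
// without synchronization, so this is only safe for sequential use.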


@@ -1,25 +0,0 @@
//go:build !amd64
// +build !amd64
package xxh3
import (
"unsafe"
)
const (
hasAVX2 = false
hasSSE2 = false
hasAVX512 = false
)
func accumAVX2(acc *[8]u64, data, key unsafe.Pointer, len u64) { panic("unreachable") }
func accumSSE(acc *[8]u64, data, key unsafe.Pointer, len u64) { panic("unreachable") }
func accumBlockAVX2(acc *[8]u64, data, key unsafe.Pointer) { panic("unreachable") }
func accumBlockSSE(acc *[8]u64, data, key unsafe.Pointer) { panic("unreachable") }
func accumAVX512(acc *[8]u64, data, key unsafe.Pointer, len u64) { panic("unreachable") }
func withAVX512(cb func()) { cb() }
func withAVX2(cb func()) { cb() }
func withSSE2(cb func()) { cb() }
func withGeneric(cb func()) { cb() }


@@ -1,379 +0,0 @@
// Code generated by command: go run gen.go -avx512 -out ../accum_vector_avx512_amd64.s -pkg xxh3. DO NOT EDIT.
#include "textflag.h"
DATA prime_avx512<>+0(SB)/8, $0x000000009e3779b1
DATA prime_avx512<>+8(SB)/8, $0x000000009e3779b1
DATA prime_avx512<>+16(SB)/8, $0x000000009e3779b1
DATA prime_avx512<>+24(SB)/8, $0x000000009e3779b1
DATA prime_avx512<>+32(SB)/8, $0x000000009e3779b1
DATA prime_avx512<>+40(SB)/8, $0x000000009e3779b1
DATA prime_avx512<>+48(SB)/8, $0x000000009e3779b1
DATA prime_avx512<>+56(SB)/8, $0x000000009e3779b1
GLOBL prime_avx512<>(SB), RODATA|NOPTR, $64
// func accumAVX512(acc *[8]uint64, data *byte, key *byte, len uint64)
// Requires: AVX, AVX512F, MMX+
TEXT ·accumAVX512(SB), NOSPLIT, $0-32
MOVQ acc+0(FP), AX
MOVQ data+8(FP), CX
MOVQ key+16(FP), DX
MOVQ len+24(FP), BX
VMOVDQU64 (AX), Z1
VMOVDQU64 prime_avx512<>+0(SB), Z0
VMOVDQU64 (DX), Z2
VMOVDQU64 8(DX), Z3
VMOVDQU64 16(DX), Z4
VMOVDQU64 24(DX), Z5
VMOVDQU64 32(DX), Z6
VMOVDQU64 40(DX), Z7
VMOVDQU64 48(DX), Z8
VMOVDQU64 56(DX), Z9
VMOVDQU64 64(DX), Z10
VMOVDQU64 72(DX), Z11
VMOVDQU64 80(DX), Z12
VMOVDQU64 88(DX), Z13
VMOVDQU64 96(DX), Z14
VMOVDQU64 104(DX), Z15
VMOVDQU64 112(DX), Z16
VMOVDQU64 120(DX), Z17
VMOVDQU64 128(DX), Z18
VMOVDQU64 121(DX), Z19
accum_large:
CMPQ BX, $0x00000400
JLE accum
VMOVDQU64 (CX), Z20
PREFETCHT0 1024(CX)
VPXORD Z2, Z20, Z21
VPSHUFD $0x31, Z21, Z22
VPMULUDQ Z21, Z22, Z21
VPSHUFD $0x4e, Z20, Z20
VPADDQ Z1, Z20, Z1
VPADDQ Z1, Z21, Z1
VMOVDQU64 64(CX), Z20
PREFETCHT0 1088(CX)
VPXORD Z3, Z20, Z21
VPSHUFD $0x31, Z21, Z22
VPMULUDQ Z21, Z22, Z21
VPSHUFD $0x4e, Z20, Z20
VPADDQ Z1, Z20, Z1
VPADDQ Z1, Z21, Z1
VMOVDQU64 128(CX), Z20
PREFETCHT0 1152(CX)
VPXORD Z4, Z20, Z21
VPSHUFD $0x31, Z21, Z22
VPMULUDQ Z21, Z22, Z21
VPSHUFD $0x4e, Z20, Z20
VPADDQ Z1, Z20, Z1
VPADDQ Z1, Z21, Z1
VMOVDQU64 192(CX), Z20
PREFETCHT0 1216(CX)
VPXORD Z5, Z20, Z21
VPSHUFD $0x31, Z21, Z22
VPMULUDQ Z21, Z22, Z21
VPSHUFD $0x4e, Z20, Z20
VPADDQ Z1, Z20, Z1
VPADDQ Z1, Z21, Z1
VMOVDQU64 256(CX), Z20
PREFETCHT0 1280(CX)
VPXORD Z6, Z20, Z21
VPSHUFD $0x31, Z21, Z22
VPMULUDQ Z21, Z22, Z21
VPSHUFD $0x4e, Z20, Z20
VPADDQ Z1, Z20, Z1
VPADDQ Z1, Z21, Z1
VMOVDQU64 320(CX), Z20
PREFETCHT0 1344(CX)
VPXORD Z7, Z20, Z21
VPSHUFD $0x31, Z21, Z22
VPMULUDQ Z21, Z22, Z21
VPSHUFD $0x4e, Z20, Z20
VPADDQ Z1, Z20, Z1
VPADDQ Z1, Z21, Z1
VMOVDQU64 384(CX), Z20
PREFETCHT0 1408(CX)
VPXORD Z8, Z20, Z21
VPSHUFD $0x31, Z21, Z22
VPMULUDQ Z21, Z22, Z21
VPSHUFD $0x4e, Z20, Z20
VPADDQ Z1, Z20, Z1
VPADDQ Z1, Z21, Z1
VMOVDQU64 448(CX), Z20
PREFETCHT0 1472(CX)
VPXORD Z9, Z20, Z21
VPSHUFD $0x31, Z21, Z22
VPMULUDQ Z21, Z22, Z21
VPSHUFD $0x4e, Z20, Z20
VPADDQ Z1, Z20, Z1
VPADDQ Z1, Z21, Z1
VMOVDQU64 512(CX), Z20
PREFETCHT0 1536(CX)
VPXORD Z10, Z20, Z21
VPSHUFD $0x31, Z21, Z22
VPMULUDQ Z21, Z22, Z21
VPSHUFD $0x4e, Z20, Z20
VPADDQ Z1, Z20, Z1
VPADDQ Z1, Z21, Z1
VMOVDQU64 576(CX), Z20
PREFETCHT0 1600(CX)
VPXORD Z11, Z20, Z21
VPSHUFD $0x31, Z21, Z22
VPMULUDQ Z21, Z22, Z21
VPSHUFD $0x4e, Z20, Z20
VPADDQ Z1, Z20, Z1
VPADDQ Z1, Z21, Z1
VMOVDQU64 640(CX), Z20
PREFETCHT0 1664(CX)
VPXORD Z12, Z20, Z21
VPSHUFD $0x31, Z21, Z22
VPMULUDQ Z21, Z22, Z21
VPSHUFD $0x4e, Z20, Z20
VPADDQ Z1, Z20, Z1
VPADDQ Z1, Z21, Z1
VMOVDQU64 704(CX), Z20
PREFETCHT0 1728(CX)
VPXORD Z13, Z20, Z21
VPSHUFD $0x31, Z21, Z22
VPMULUDQ Z21, Z22, Z21
VPSHUFD $0x4e, Z20, Z20
VPADDQ Z1, Z20, Z1
VPADDQ Z1, Z21, Z1
VMOVDQU64 768(CX), Z20
PREFETCHT0 1792(CX)
VPXORD Z14, Z20, Z21
VPSHUFD $0x31, Z21, Z22
VPMULUDQ Z21, Z22, Z21
VPSHUFD $0x4e, Z20, Z20
VPADDQ Z1, Z20, Z1
VPADDQ Z1, Z21, Z1
VMOVDQU64 832(CX), Z20
PREFETCHT0 1856(CX)
VPXORD Z15, Z20, Z21
VPSHUFD $0x31, Z21, Z22
VPMULUDQ Z21, Z22, Z21
VPSHUFD $0x4e, Z20, Z20
VPADDQ Z1, Z20, Z1
VPADDQ Z1, Z21, Z1
VMOVDQU64 896(CX), Z20
PREFETCHT0 1920(CX)
VPXORD Z16, Z20, Z21
VPSHUFD $0x31, Z21, Z22
VPMULUDQ Z21, Z22, Z21
VPSHUFD $0x4e, Z20, Z20
VPADDQ Z1, Z20, Z1
VPADDQ Z1, Z21, Z1
VMOVDQU64 960(CX), Z20
PREFETCHT0 1984(CX)
VPXORD Z17, Z20, Z21
VPSHUFD $0x31, Z21, Z22
VPMULUDQ Z21, Z22, Z21
VPSHUFD $0x4e, Z20, Z20
VPADDQ Z1, Z20, Z1
VPADDQ Z1, Z21, Z1
ADDQ $0x00000400, CX
SUBQ $0x00000400, BX
VPSRLQ $0x2f, Z1, Z20
VPTERNLOGD $0x96, Z1, Z18, Z20
VPMULUDQ Z0, Z20, Z1
VPSHUFD $0xf5, Z20, Z20
VPMULUDQ Z0, Z20, Z20
VPSLLQ $0x20, Z20, Z20
VPADDQ Z1, Z20, Z1
JMP accum_large
accum:
CMPQ BX, $0x40
JLE finalize
VMOVDQU64 (CX), Z0
VPXORD Z2, Z0, Z2
VPSHUFD $0x31, Z2, Z18
VPMULUDQ Z2, Z18, Z2
VPSHUFD $0x4e, Z0, Z0
VPADDQ Z1, Z0, Z1
VPADDQ Z1, Z2, Z1
ADDQ $0x00000040, CX
SUBQ $0x00000040, BX
CMPQ BX, $0x40
JLE finalize
VMOVDQU64 (CX), Z0
VPXORD Z3, Z0, Z2
VPSHUFD $0x31, Z2, Z3
VPMULUDQ Z2, Z3, Z2
VPSHUFD $0x4e, Z0, Z0
VPADDQ Z1, Z0, Z1
VPADDQ Z1, Z2, Z1
ADDQ $0x00000040, CX
SUBQ $0x00000040, BX
CMPQ BX, $0x40
JLE finalize
VMOVDQU64 (CX), Z0
VPXORD Z4, Z0, Z2
VPSHUFD $0x31, Z2, Z3
VPMULUDQ Z2, Z3, Z2
VPSHUFD $0x4e, Z0, Z0
VPADDQ Z1, Z0, Z1
VPADDQ Z1, Z2, Z1
ADDQ $0x00000040, CX
SUBQ $0x00000040, BX
CMPQ BX, $0x40
JLE finalize
VMOVDQU64 (CX), Z0
VPXORD Z5, Z0, Z2
VPSHUFD $0x31, Z2, Z3
VPMULUDQ Z2, Z3, Z2
VPSHUFD $0x4e, Z0, Z0
VPADDQ Z1, Z0, Z1
VPADDQ Z1, Z2, Z1
ADDQ $0x00000040, CX
SUBQ $0x00000040, BX
CMPQ BX, $0x40
JLE finalize
VMOVDQU64 (CX), Z0
VPXORD Z6, Z0, Z2
VPSHUFD $0x31, Z2, Z3
VPMULUDQ Z2, Z3, Z2
VPSHUFD $0x4e, Z0, Z0
VPADDQ Z1, Z0, Z1
VPADDQ Z1, Z2, Z1
ADDQ $0x00000040, CX
SUBQ $0x00000040, BX
CMPQ BX, $0x40
JLE finalize
VMOVDQU64 (CX), Z0
VPXORD Z7, Z0, Z2
VPSHUFD $0x31, Z2, Z3
VPMULUDQ Z2, Z3, Z2
VPSHUFD $0x4e, Z0, Z0
VPADDQ Z1, Z0, Z1
VPADDQ Z1, Z2, Z1
ADDQ $0x00000040, CX
SUBQ $0x00000040, BX
CMPQ BX, $0x40
JLE finalize
VMOVDQU64 (CX), Z0
VPXORD Z8, Z0, Z2
VPSHUFD $0x31, Z2, Z3
VPMULUDQ Z2, Z3, Z2
VPSHUFD $0x4e, Z0, Z0
VPADDQ Z1, Z0, Z1
VPADDQ Z1, Z2, Z1
ADDQ $0x00000040, CX
SUBQ $0x00000040, BX
CMPQ BX, $0x40
JLE finalize
VMOVDQU64 (CX), Z0
VPXORD Z9, Z0, Z2
VPSHUFD $0x31, Z2, Z3
VPMULUDQ Z2, Z3, Z2
VPSHUFD $0x4e, Z0, Z0
VPADDQ Z1, Z0, Z1
VPADDQ Z1, Z2, Z1
ADDQ $0x00000040, CX
SUBQ $0x00000040, BX
CMPQ BX, $0x40
JLE finalize
VMOVDQU64 (CX), Z0
VPXORD Z10, Z0, Z2
VPSHUFD $0x31, Z2, Z3
VPMULUDQ Z2, Z3, Z2
VPSHUFD $0x4e, Z0, Z0
VPADDQ Z1, Z0, Z1
VPADDQ Z1, Z2, Z1
ADDQ $0x00000040, CX
SUBQ $0x00000040, BX
CMPQ BX, $0x40
JLE finalize
VMOVDQU64 (CX), Z0
VPXORD Z11, Z0, Z2
VPSHUFD $0x31, Z2, Z3
VPMULUDQ Z2, Z3, Z2
VPSHUFD $0x4e, Z0, Z0
VPADDQ Z1, Z0, Z1
VPADDQ Z1, Z2, Z1
ADDQ $0x00000040, CX
SUBQ $0x00000040, BX
CMPQ BX, $0x40
JLE finalize
VMOVDQU64 (CX), Z0
VPXORD Z12, Z0, Z2
VPSHUFD $0x31, Z2, Z3
VPMULUDQ Z2, Z3, Z2
VPSHUFD $0x4e, Z0, Z0
VPADDQ Z1, Z0, Z1
VPADDQ Z1, Z2, Z1
ADDQ $0x00000040, CX
SUBQ $0x00000040, BX
CMPQ BX, $0x40
JLE finalize
VMOVDQU64 (CX), Z0
VPXORD Z13, Z0, Z2
VPSHUFD $0x31, Z2, Z3
VPMULUDQ Z2, Z3, Z2
VPSHUFD $0x4e, Z0, Z0
VPADDQ Z1, Z0, Z1
VPADDQ Z1, Z2, Z1
ADDQ $0x00000040, CX
SUBQ $0x00000040, BX
CMPQ BX, $0x40
JLE finalize
VMOVDQU64 (CX), Z0
VPXORD Z14, Z0, Z2
VPSHUFD $0x31, Z2, Z3
VPMULUDQ Z2, Z3, Z2
VPSHUFD $0x4e, Z0, Z0
VPADDQ Z1, Z0, Z1
VPADDQ Z1, Z2, Z1
ADDQ $0x00000040, CX
SUBQ $0x00000040, BX
CMPQ BX, $0x40
JLE finalize
VMOVDQU64 (CX), Z0
VPXORD Z15, Z0, Z2
VPSHUFD $0x31, Z2, Z3
VPMULUDQ Z2, Z3, Z2
VPSHUFD $0x4e, Z0, Z0
VPADDQ Z1, Z0, Z1
VPADDQ Z1, Z2, Z1
ADDQ $0x00000040, CX
SUBQ $0x00000040, BX
CMPQ BX, $0x40
JLE finalize
VMOVDQU64 (CX), Z0
VPXORD Z16, Z0, Z2
VPSHUFD $0x31, Z2, Z3
VPMULUDQ Z2, Z3, Z2
VPSHUFD $0x4e, Z0, Z0
VPADDQ Z1, Z0, Z1
VPADDQ Z1, Z2, Z1
ADDQ $0x00000040, CX
SUBQ $0x00000040, BX
CMPQ BX, $0x40
JLE finalize
VMOVDQU64 (CX), Z0
VPXORD Z17, Z0, Z2
VPSHUFD $0x31, Z2, Z3
VPMULUDQ Z2, Z3, Z2
VPSHUFD $0x4e, Z0, Z0
VPADDQ Z1, Z0, Z1
VPADDQ Z1, Z2, Z1
ADDQ $0x00000040, CX
SUBQ $0x00000040, BX
finalize:
CMPQ BX, $0x00
JE return
SUBQ $0x40, CX
ADDQ BX, CX
VMOVDQU64 (CX), Z0
VPXORD Z19, Z0, Z2
VPSHUFD $0x31, Z2, Z3
VPMULUDQ Z2, Z3, Z2
VPSHUFD $0x4e, Z0, Z0
VPADDQ Z1, Z0, Z1
VPADDQ Z1, Z2, Z1
return:
VMOVDQU64 Z1, (AX)
VZEROUPPER
RET


@@ -1,586 +0,0 @@
// Code generated by command: go run gen.go -avx -out ../accum_vector_avx_amd64.s -pkg xxh3. DO NOT EDIT.
#include "textflag.h"
DATA prime_avx<>+0(SB)/8, $0x000000009e3779b1
DATA prime_avx<>+8(SB)/8, $0x000000009e3779b1
DATA prime_avx<>+16(SB)/8, $0x000000009e3779b1
DATA prime_avx<>+24(SB)/8, $0x000000009e3779b1
GLOBL prime_avx<>(SB), RODATA|NOPTR, $32
// func accumAVX2(acc *[8]uint64, data *byte, key *byte, len uint64)
// Requires: AVX, AVX2, MMX+
TEXT ·accumAVX2(SB), NOSPLIT, $0-32
MOVQ acc+0(FP), AX
MOVQ data+8(FP), CX
MOVQ key+16(FP), DX
MOVQ key+16(FP), BX
MOVQ len+24(FP), SI
VMOVDQU (AX), Y1
VMOVDQU 32(AX), Y2
VMOVDQU prime_avx<>+0(SB), Y0
accum_large:
CMPQ SI, $0x00000400
JLE accum
VMOVDQU (CX), Y3
VMOVDQU 32(CX), Y6
PREFETCHT0 512(CX)
VPXOR (DX), Y3, Y4
VPXOR 32(DX), Y6, Y7
VPSHUFD $0x31, Y4, Y5
VPSHUFD $0x31, Y7, Y8
VPMULUDQ Y4, Y5, Y4
VPMULUDQ Y7, Y8, Y7
VPSHUFD $0x4e, Y3, Y3
VPSHUFD $0x4e, Y6, Y6
VPADDQ Y1, Y3, Y1
VPADDQ Y1, Y4, Y1
VPADDQ Y2, Y6, Y2
VPADDQ Y2, Y7, Y2
VMOVDQU 64(CX), Y3
VMOVDQU 96(CX), Y6
PREFETCHT0 576(CX)
VPXOR 8(DX), Y3, Y4
VPXOR 40(DX), Y6, Y7
VPSHUFD $0x31, Y4, Y5
VPSHUFD $0x31, Y7, Y8
VPMULUDQ Y4, Y5, Y4
VPMULUDQ Y7, Y8, Y7
VPSHUFD $0x4e, Y3, Y3
VPSHUFD $0x4e, Y6, Y6
VPADDQ Y1, Y3, Y1
VPADDQ Y1, Y4, Y1
VPADDQ Y2, Y6, Y2
VPADDQ Y2, Y7, Y2
VMOVDQU 128(CX), Y3
VMOVDQU 160(CX), Y6
PREFETCHT0 640(CX)
VPXOR 16(DX), Y3, Y4
VPXOR 48(DX), Y6, Y7
VPSHUFD $0x31, Y4, Y5
VPSHUFD $0x31, Y7, Y8
VPMULUDQ Y4, Y5, Y4
VPMULUDQ Y7, Y8, Y7
VPSHUFD $0x4e, Y3, Y3
VPSHUFD $0x4e, Y6, Y6
VPADDQ Y1, Y3, Y1
VPADDQ Y1, Y4, Y1
VPADDQ Y2, Y6, Y2
VPADDQ Y2, Y7, Y2
VMOVDQU 192(CX), Y3
VMOVDQU 224(CX), Y6
PREFETCHT0 704(CX)
VPXOR 24(DX), Y3, Y4
VPXOR 56(DX), Y6, Y7
VPSHUFD $0x31, Y4, Y5
VPSHUFD $0x31, Y7, Y8
VPMULUDQ Y4, Y5, Y4
VPMULUDQ Y7, Y8, Y7
VPSHUFD $0x4e, Y3, Y3
VPSHUFD $0x4e, Y6, Y6
VPADDQ Y1, Y3, Y1
VPADDQ Y1, Y4, Y1
VPADDQ Y2, Y6, Y2
VPADDQ Y2, Y7, Y2
VMOVDQU 256(CX), Y3
VMOVDQU 288(CX), Y6
PREFETCHT0 768(CX)
VPXOR 32(DX), Y3, Y4
VPXOR 64(DX), Y6, Y7
VPSHUFD $0x31, Y4, Y5
VPSHUFD $0x31, Y7, Y8
VPMULUDQ Y4, Y5, Y4
VPMULUDQ Y7, Y8, Y7
VPSHUFD $0x4e, Y3, Y3
VPSHUFD $0x4e, Y6, Y6
VPADDQ Y1, Y3, Y1
VPADDQ Y1, Y4, Y1
VPADDQ Y2, Y6, Y2
VPADDQ Y2, Y7, Y2
VMOVDQU 320(CX), Y3
VMOVDQU 352(CX), Y6
PREFETCHT0 832(CX)
VPXOR 40(DX), Y3, Y4
VPXOR 72(DX), Y6, Y7
VPSHUFD $0x31, Y4, Y5
VPSHUFD $0x31, Y7, Y8
VPMULUDQ Y4, Y5, Y4
VPMULUDQ Y7, Y8, Y7
VPSHUFD $0x4e, Y3, Y3
VPSHUFD $0x4e, Y6, Y6
VPADDQ Y1, Y3, Y1
VPADDQ Y1, Y4, Y1
VPADDQ Y2, Y6, Y2
VPADDQ Y2, Y7, Y2
VMOVDQU 384(CX), Y3
VMOVDQU 416(CX), Y6
PREFETCHT0 896(CX)
VPXOR 48(DX), Y3, Y4
VPXOR 80(DX), Y6, Y7
VPSHUFD $0x31, Y4, Y5
VPSHUFD $0x31, Y7, Y8
VPMULUDQ Y4, Y5, Y4
VPMULUDQ Y7, Y8, Y7
VPSHUFD $0x4e, Y3, Y3
VPSHUFD $0x4e, Y6, Y6
VPADDQ Y1, Y3, Y1
VPADDQ Y1, Y4, Y1
VPADDQ Y2, Y6, Y2
VPADDQ Y2, Y7, Y2
VMOVDQU 448(CX), Y3
VMOVDQU 480(CX), Y6
PREFETCHT0 960(CX)
VPXOR 56(DX), Y3, Y4
VPXOR 88(DX), Y6, Y7
VPSHUFD $0x31, Y4, Y5
VPSHUFD $0x31, Y7, Y8
VPMULUDQ Y4, Y5, Y4
VPMULUDQ Y7, Y8, Y7
VPSHUFD $0x4e, Y3, Y3
VPSHUFD $0x4e, Y6, Y6
VPADDQ Y1, Y3, Y1
VPADDQ Y1, Y4, Y1
VPADDQ Y2, Y6, Y2
VPADDQ Y2, Y7, Y2
VMOVDQU 512(CX), Y3
VMOVDQU 544(CX), Y6
PREFETCHT0 1024(CX)
VPXOR 64(DX), Y3, Y4
VPXOR 96(DX), Y6, Y7
VPSHUFD $0x31, Y4, Y5
VPSHUFD $0x31, Y7, Y8
VPMULUDQ Y4, Y5, Y4
VPMULUDQ Y7, Y8, Y7
VPSHUFD $0x4e, Y3, Y3
VPSHUFD $0x4e, Y6, Y6
VPADDQ Y1, Y3, Y1
VPADDQ Y1, Y4, Y1
VPADDQ Y2, Y6, Y2
VPADDQ Y2, Y7, Y2
VMOVDQU 576(CX), Y3
VMOVDQU 608(CX), Y6
PREFETCHT0 1088(CX)
VPXOR 72(DX), Y3, Y4
VPXOR 104(DX), Y6, Y7
VPSHUFD $0x31, Y4, Y5
VPSHUFD $0x31, Y7, Y8
VPMULUDQ Y4, Y5, Y4
VPMULUDQ Y7, Y8, Y7
VPSHUFD $0x4e, Y3, Y3
VPSHUFD $0x4e, Y6, Y6
VPADDQ Y1, Y3, Y1
VPADDQ Y1, Y4, Y1
VPADDQ Y2, Y6, Y2
VPADDQ Y2, Y7, Y2
VMOVDQU 640(CX), Y3
VMOVDQU 672(CX), Y6
PREFETCHT0 1152(CX)
VPXOR 80(DX), Y3, Y4
VPXOR 112(DX), Y6, Y7
VPSHUFD $0x31, Y4, Y5
VPSHUFD $0x31, Y7, Y8
VPMULUDQ Y4, Y5, Y4
VPMULUDQ Y7, Y8, Y7
VPSHUFD $0x4e, Y3, Y3
VPSHUFD $0x4e, Y6, Y6
VPADDQ Y1, Y3, Y1
VPADDQ Y1, Y4, Y1
VPADDQ Y2, Y6, Y2
VPADDQ Y2, Y7, Y2
VMOVDQU 704(CX), Y3
VMOVDQU 736(CX), Y6
PREFETCHT0 1216(CX)
VPXOR 88(DX), Y3, Y4
VPXOR 120(DX), Y6, Y7
VPSHUFD $0x31, Y4, Y5
VPSHUFD $0x31, Y7, Y8
VPMULUDQ Y4, Y5, Y4
VPMULUDQ Y7, Y8, Y7
VPSHUFD $0x4e, Y3, Y3
VPSHUFD $0x4e, Y6, Y6
VPADDQ Y1, Y3, Y1
VPADDQ Y1, Y4, Y1
VPADDQ Y2, Y6, Y2
VPADDQ Y2, Y7, Y2
VMOVDQU 768(CX), Y3
VMOVDQU 800(CX), Y6
PREFETCHT0 1280(CX)
VPXOR 96(DX), Y3, Y4
VPXOR 128(DX), Y6, Y7
VPSHUFD $0x31, Y4, Y5
VPSHUFD $0x31, Y7, Y8
VPMULUDQ Y4, Y5, Y4
VPMULUDQ Y7, Y8, Y7
VPSHUFD $0x4e, Y3, Y3
VPSHUFD $0x4e, Y6, Y6
VPADDQ Y1, Y3, Y1
VPADDQ Y1, Y4, Y1
VPADDQ Y2, Y6, Y2
VPADDQ Y2, Y7, Y2
VMOVDQU 832(CX), Y3
VMOVDQU 864(CX), Y6
PREFETCHT0 1344(CX)
VPXOR 104(DX), Y3, Y4
VPXOR 136(DX), Y6, Y7
VPSHUFD $0x31, Y4, Y5
VPSHUFD $0x31, Y7, Y8
VPMULUDQ Y4, Y5, Y4
VPMULUDQ Y7, Y8, Y7
VPSHUFD $0x4e, Y3, Y3
VPSHUFD $0x4e, Y6, Y6
VPADDQ Y1, Y3, Y1
VPADDQ Y1, Y4, Y1
VPADDQ Y2, Y6, Y2
VPADDQ Y2, Y7, Y2
VMOVDQU 896(CX), Y3
VMOVDQU 928(CX), Y6
PREFETCHT0 1408(CX)
VPXOR 112(DX), Y3, Y4
VPXOR 144(DX), Y6, Y7
VPSHUFD $0x31, Y4, Y5
VPSHUFD $0x31, Y7, Y8
VPMULUDQ Y4, Y5, Y4
VPMULUDQ Y7, Y8, Y7
VPSHUFD $0x4e, Y3, Y3
VPSHUFD $0x4e, Y6, Y6
VPADDQ Y1, Y3, Y1
VPADDQ Y1, Y4, Y1
VPADDQ Y2, Y6, Y2
VPADDQ Y2, Y7, Y2
VMOVDQU 960(CX), Y3
VMOVDQU 992(CX), Y6
PREFETCHT0 1472(CX)
VPXOR 120(DX), Y3, Y4
VPXOR 152(DX), Y6, Y7
VPSHUFD $0x31, Y4, Y5
VPSHUFD $0x31, Y7, Y8
VPMULUDQ Y4, Y5, Y4
VPMULUDQ Y7, Y8, Y7
VPSHUFD $0x4e, Y3, Y3
VPSHUFD $0x4e, Y6, Y6
VPADDQ Y1, Y3, Y1
VPADDQ Y1, Y4, Y1
VPADDQ Y2, Y6, Y2
VPADDQ Y2, Y7, Y2
ADDQ $0x00000400, CX
SUBQ $0x00000400, SI
VPSRLQ $0x2f, Y1, Y3
VPXOR Y1, Y3, Y3
VPXOR 128(DX), Y3, Y3
VPMULUDQ Y0, Y3, Y1
VPSHUFD $0xf5, Y3, Y3
VPMULUDQ Y0, Y3, Y3
VPSLLQ $0x20, Y3, Y3
VPADDQ Y1, Y3, Y1
VPSRLQ $0x2f, Y2, Y3
VPXOR Y2, Y3, Y3
VPXOR 160(DX), Y3, Y3
VPMULUDQ Y0, Y3, Y2
VPSHUFD $0xf5, Y3, Y3
VPMULUDQ Y0, Y3, Y3
VPSLLQ $0x20, Y3, Y3
VPADDQ Y2, Y3, Y2
JMP accum_large
accum:
CMPQ SI, $0x40
JLE finalize
VMOVDQU (CX), Y0
VMOVDQU 32(CX), Y5
VPXOR (BX), Y0, Y3
VPXOR 32(BX), Y5, Y6
VPSHUFD $0x31, Y3, Y4
VPSHUFD $0x31, Y6, Y7
VPMULUDQ Y3, Y4, Y3
VPMULUDQ Y6, Y7, Y6
VPSHUFD $0x4e, Y0, Y0
VPSHUFD $0x4e, Y5, Y5
VPADDQ Y1, Y0, Y1
VPADDQ Y1, Y3, Y1
VPADDQ Y2, Y5, Y2
VPADDQ Y2, Y6, Y2
ADDQ $0x00000040, CX
SUBQ $0x00000040, SI
ADDQ $0x00000008, BX
JMP accum
finalize:
CMPQ SI, $0x00
JE return
SUBQ $0x40, CX
ADDQ SI, CX
VMOVDQU (CX), Y0
VMOVDQU 32(CX), Y5
VPXOR 121(DX), Y0, Y3
VPXOR 153(DX), Y5, Y6
VPSHUFD $0x31, Y3, Y4
VPSHUFD $0x31, Y6, Y7
VPMULUDQ Y3, Y4, Y3
VPMULUDQ Y6, Y7, Y6
VPSHUFD $0x4e, Y0, Y0
VPSHUFD $0x4e, Y5, Y5
VPADDQ Y1, Y0, Y1
VPADDQ Y1, Y3, Y1
VPADDQ Y2, Y5, Y2
VPADDQ Y2, Y6, Y2
return:
VMOVDQU Y1, (AX)
VMOVDQU Y2, 32(AX)
VZEROUPPER
RET
// func accumBlockAVX2(acc *[8]uint64, data *byte, key *byte)
// Requires: AVX, AVX2
TEXT ·accumBlockAVX2(SB), NOSPLIT, $0-24
MOVQ acc+0(FP), AX
MOVQ data+8(FP), CX
MOVQ key+16(FP), DX
VMOVDQU (AX), Y1
VMOVDQU 32(AX), Y2
VMOVDQU prime_avx<>+0(SB), Y0
VMOVDQU (CX), Y3
VMOVDQU 32(CX), Y6
VPXOR (DX), Y3, Y4
VPXOR 32(DX), Y6, Y7
VPSHUFD $0x31, Y4, Y5
VPSHUFD $0x31, Y7, Y8
VPMULUDQ Y4, Y5, Y4
VPMULUDQ Y7, Y8, Y7
VPSHUFD $0x4e, Y3, Y3
VPSHUFD $0x4e, Y6, Y6
VPADDQ Y1, Y3, Y1
VPADDQ Y1, Y4, Y1
VPADDQ Y2, Y6, Y2
VPADDQ Y2, Y7, Y2
VMOVDQU 64(CX), Y3
VMOVDQU 96(CX), Y6
VPXOR 8(DX), Y3, Y4
VPXOR 40(DX), Y6, Y7
VPSHUFD $0x31, Y4, Y5
VPSHUFD $0x31, Y7, Y8
VPMULUDQ Y4, Y5, Y4
VPMULUDQ Y7, Y8, Y7
VPSHUFD $0x4e, Y3, Y3
VPSHUFD $0x4e, Y6, Y6
VPADDQ Y1, Y3, Y1
VPADDQ Y1, Y4, Y1
VPADDQ Y2, Y6, Y2
VPADDQ Y2, Y7, Y2
VMOVDQU 128(CX), Y3
VMOVDQU 160(CX), Y6
VPXOR 16(DX), Y3, Y4
VPXOR 48(DX), Y6, Y7
VPSHUFD $0x31, Y4, Y5
VPSHUFD $0x31, Y7, Y8
VPMULUDQ Y4, Y5, Y4
VPMULUDQ Y7, Y8, Y7
VPSHUFD $0x4e, Y3, Y3
VPSHUFD $0x4e, Y6, Y6
VPADDQ Y1, Y3, Y1
VPADDQ Y1, Y4, Y1
VPADDQ Y2, Y6, Y2
VPADDQ Y2, Y7, Y2
VMOVDQU 192(CX), Y3
VMOVDQU 224(CX), Y6
VPXOR 24(DX), Y3, Y4
VPXOR 56(DX), Y6, Y7
VPSHUFD $0x31, Y4, Y5
VPSHUFD $0x31, Y7, Y8
VPMULUDQ Y4, Y5, Y4
VPMULUDQ Y7, Y8, Y7
VPSHUFD $0x4e, Y3, Y3
VPSHUFD $0x4e, Y6, Y6
VPADDQ Y1, Y3, Y1
VPADDQ Y1, Y4, Y1
VPADDQ Y2, Y6, Y2
VPADDQ Y2, Y7, Y2
VMOVDQU 256(CX), Y3
VMOVDQU 288(CX), Y6
VPXOR 32(DX), Y3, Y4
VPXOR 64(DX), Y6, Y7
VPSHUFD $0x31, Y4, Y5
VPSHUFD $0x31, Y7, Y8
VPMULUDQ Y4, Y5, Y4
VPMULUDQ Y7, Y8, Y7
VPSHUFD $0x4e, Y3, Y3
VPSHUFD $0x4e, Y6, Y6
VPADDQ Y1, Y3, Y1
VPADDQ Y1, Y4, Y1
VPADDQ Y2, Y6, Y2
VPADDQ Y2, Y7, Y2
VMOVDQU 320(CX), Y3
VMOVDQU 352(CX), Y6
VPXOR 40(DX), Y3, Y4
VPXOR 72(DX), Y6, Y7
VPSHUFD $0x31, Y4, Y5
VPSHUFD $0x31, Y7, Y8
VPMULUDQ Y4, Y5, Y4
VPMULUDQ Y7, Y8, Y7
VPSHUFD $0x4e, Y3, Y3
VPSHUFD $0x4e, Y6, Y6
VPADDQ Y1, Y3, Y1
VPADDQ Y1, Y4, Y1
VPADDQ Y2, Y6, Y2
VPADDQ Y2, Y7, Y2
VMOVDQU 384(CX), Y3
VMOVDQU 416(CX), Y6
VPXOR 48(DX), Y3, Y4
VPXOR 80(DX), Y6, Y7
VPSHUFD $0x31, Y4, Y5
VPSHUFD $0x31, Y7, Y8
VPMULUDQ Y4, Y5, Y4
VPMULUDQ Y7, Y8, Y7
VPSHUFD $0x4e, Y3, Y3
VPSHUFD $0x4e, Y6, Y6
VPADDQ Y1, Y3, Y1
VPADDQ Y1, Y4, Y1
VPADDQ Y2, Y6, Y2
VPADDQ Y2, Y7, Y2
VMOVDQU 448(CX), Y3
VMOVDQU 480(CX), Y6
VPXOR 56(DX), Y3, Y4
VPXOR 88(DX), Y6, Y7
VPSHUFD $0x31, Y4, Y5
VPSHUFD $0x31, Y7, Y8
VPMULUDQ Y4, Y5, Y4
VPMULUDQ Y7, Y8, Y7
VPSHUFD $0x4e, Y3, Y3
VPSHUFD $0x4e, Y6, Y6
VPADDQ Y1, Y3, Y1
VPADDQ Y1, Y4, Y1
VPADDQ Y2, Y6, Y2
VPADDQ Y2, Y7, Y2
VMOVDQU 512(CX), Y3
VMOVDQU 544(CX), Y6
VPXOR 64(DX), Y3, Y4
VPXOR 96(DX), Y6, Y7
VPSHUFD $0x31, Y4, Y5
VPSHUFD $0x31, Y7, Y8
VPMULUDQ Y4, Y5, Y4
VPMULUDQ Y7, Y8, Y7
VPSHUFD $0x4e, Y3, Y3
VPSHUFD $0x4e, Y6, Y6
VPADDQ Y1, Y3, Y1
VPADDQ Y1, Y4, Y1
VPADDQ Y2, Y6, Y2
VPADDQ Y2, Y7, Y2
VMOVDQU 576(CX), Y3
VMOVDQU 608(CX), Y6
VPXOR 72(DX), Y3, Y4
VPXOR 104(DX), Y6, Y7
VPSHUFD $0x31, Y4, Y5
VPSHUFD $0x31, Y7, Y8
VPMULUDQ Y4, Y5, Y4
VPMULUDQ Y7, Y8, Y7
VPSHUFD $0x4e, Y3, Y3
VPSHUFD $0x4e, Y6, Y6
VPADDQ Y1, Y3, Y1
VPADDQ Y1, Y4, Y1
VPADDQ Y2, Y6, Y2
VPADDQ Y2, Y7, Y2
VMOVDQU 640(CX), Y3
VMOVDQU 672(CX), Y6
VPXOR 80(DX), Y3, Y4
VPXOR 112(DX), Y6, Y7
VPSHUFD $0x31, Y4, Y5
VPSHUFD $0x31, Y7, Y8
VPMULUDQ Y4, Y5, Y4
VPMULUDQ Y7, Y8, Y7
VPSHUFD $0x4e, Y3, Y3
VPSHUFD $0x4e, Y6, Y6
VPADDQ Y1, Y3, Y1
VPADDQ Y1, Y4, Y1
VPADDQ Y2, Y6, Y2
VPADDQ Y2, Y7, Y2
VMOVDQU 704(CX), Y3
VMOVDQU 736(CX), Y6
VPXOR 88(DX), Y3, Y4
VPXOR 120(DX), Y6, Y7
VPSHUFD $0x31, Y4, Y5
VPSHUFD $0x31, Y7, Y8
VPMULUDQ Y4, Y5, Y4
VPMULUDQ Y7, Y8, Y7
VPSHUFD $0x4e, Y3, Y3
VPSHUFD $0x4e, Y6, Y6
VPADDQ Y1, Y3, Y1
VPADDQ Y1, Y4, Y1
VPADDQ Y2, Y6, Y2
VPADDQ Y2, Y7, Y2
VMOVDQU 768(CX), Y3
VMOVDQU 800(CX), Y6
VPXOR 96(DX), Y3, Y4
VPXOR 128(DX), Y6, Y7
VPSHUFD $0x31, Y4, Y5
VPSHUFD $0x31, Y7, Y8
VPMULUDQ Y4, Y5, Y4
VPMULUDQ Y7, Y8, Y7
VPSHUFD $0x4e, Y3, Y3
VPSHUFD $0x4e, Y6, Y6
VPADDQ Y1, Y3, Y1
VPADDQ Y1, Y4, Y1
VPADDQ Y2, Y6, Y2
VPADDQ Y2, Y7, Y2
VMOVDQU 832(CX), Y3
VMOVDQU 864(CX), Y6
VPXOR 104(DX), Y3, Y4
VPXOR 136(DX), Y6, Y7
VPSHUFD $0x31, Y4, Y5
VPSHUFD $0x31, Y7, Y8
VPMULUDQ Y4, Y5, Y4
VPMULUDQ Y7, Y8, Y7
VPSHUFD $0x4e, Y3, Y3
VPSHUFD $0x4e, Y6, Y6
VPADDQ Y1, Y3, Y1
VPADDQ Y1, Y4, Y1
VPADDQ Y2, Y6, Y2
VPADDQ Y2, Y7, Y2
VMOVDQU 896(CX), Y3
VMOVDQU 928(CX), Y6
VPXOR 112(DX), Y3, Y4
VPXOR 144(DX), Y6, Y7
VPSHUFD $0x31, Y4, Y5
VPSHUFD $0x31, Y7, Y8
VPMULUDQ Y4, Y5, Y4
VPMULUDQ Y7, Y8, Y7
VPSHUFD $0x4e, Y3, Y3
VPSHUFD $0x4e, Y6, Y6
VPADDQ Y1, Y3, Y1
VPADDQ Y1, Y4, Y1
VPADDQ Y2, Y6, Y2
VPADDQ Y2, Y7, Y2
VMOVDQU 960(CX), Y3
VMOVDQU 992(CX), Y6
VPXOR 120(DX), Y3, Y4
VPXOR 152(DX), Y6, Y7
VPSHUFD $0x31, Y4, Y5
VPSHUFD $0x31, Y7, Y8
VPMULUDQ Y4, Y5, Y4
VPMULUDQ Y7, Y8, Y7
VPSHUFD $0x4e, Y3, Y3
VPSHUFD $0x4e, Y6, Y6
VPADDQ Y1, Y3, Y1
VPADDQ Y1, Y4, Y1
VPADDQ Y2, Y6, Y2
VPADDQ Y2, Y7, Y2
VPSRLQ $0x2f, Y1, Y3
VPXOR Y1, Y3, Y3
VPXOR 128(DX), Y3, Y3
VPMULUDQ Y0, Y3, Y1
VPSHUFD $0xf5, Y3, Y3
VPMULUDQ Y0, Y3, Y3
VPSLLQ $0x20, Y3, Y3
VPADDQ Y1, Y3, Y1
VPSRLQ $0x2f, Y2, Y3
VPXOR Y2, Y3, Y3
VPXOR 160(DX), Y3, Y3
VPMULUDQ Y0, Y3, Y2
VPSHUFD $0xf5, Y3, Y3
VPMULUDQ Y0, Y3, Y3
VPSLLQ $0x20, Y3, Y3
VPADDQ Y2, Y3, Y2
VMOVDQU Y1, (AX)
VMOVDQU Y2, 32(AX)
VZEROUPPER
RET

File diff suppressed because it is too large


@@ -1,97 +0,0 @@
package xxh3
const (
_stripe = 64
_block = 1024
prime32_1 = 2654435761
prime32_2 = 2246822519
prime32_3 = 3266489917
prime64_1 = 11400714785074694791
prime64_2 = 14029467366897019727
prime64_3 = 1609587929392839161
prime64_4 = 9650029242287828579
prime64_5 = 2870177450012600261
)
var key = ptr(&[...]u8{
0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe /* 8 */, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c, /* 16 */
0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb /* 24 */, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f, /* 32 */
0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78 /* 40 */, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21, /* 48 */
0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e /* 56 */, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c, /* 64 */
0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb /* 72 */, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3, /* 80 */
0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e /* 88 */, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8, /* 96 */
0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f /* 104 */, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d, /* 112 */
0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31 /* 120 */, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64, /* 128 */
0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3 /* 136 */, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb, /* 144 */
0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49 /* 152 */, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e, /* 160 */
0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc /* 168 */, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce, /* 176 */
0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28 /* 184 */, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e, /* 192 */
})
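// The key64_* constants in the block below are the little-endian uint64
// values read from the key array above at the byte offset named in each
// constant (for example, key64_000 == readU64(key, 0) == 0xbe4ba423396cfeb8),
// presumably precomputed so the fixed-key paths avoid runtime reads.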
const (
key64_000 u64 = 0xbe4ba423396cfeb8
key64_008 u64 = 0x1cad21f72c81017c
key64_016 u64 = 0xdb979083e96dd4de
key64_024 u64 = 0x1f67b3b7a4a44072
key64_032 u64 = 0x78e5c0cc4ee679cb
key64_040 u64 = 0x2172ffcc7dd05a82
key64_048 u64 = 0x8e2443f7744608b8
key64_056 u64 = 0x4c263a81e69035e0
key64_064 u64 = 0xcb00c391bb52283c
key64_072 u64 = 0xa32e531b8b65d088
key64_080 u64 = 0x4ef90da297486471
key64_088 u64 = 0xd8acdea946ef1938
key64_096 u64 = 0x3f349ce33f76faa8
key64_104 u64 = 0x1d4f0bc7c7bbdcf9
key64_112 u64 = 0x3159b4cd4be0518a
key64_120 u64 = 0x647378d9c97e9fc8
key64_128 u64 = 0xc3ebd33483acc5ea
key64_136 u64 = 0xeb6313faffa081c5
key64_144 u64 = 0x49daf0b751dd0d17
key64_152 u64 = 0x9e68d429265516d3
key64_160 u64 = 0xfca1477d58be162b
key64_168 u64 = 0xce31d07ad1b8f88f
key64_176 u64 = 0x280416958f3acb45
key64_184 u64 = 0x7e404bbbcafbd7af
key64_103 u64 = 0x4f0bc7c7bbdcf93f
key64_111 u64 = 0x59b4cd4be0518a1d
key64_119 u64 = 0x7378d9c97e9fc831
key64_127 u64 = 0xebd33483acc5ea64
key64_121 u64 = 0xea647378d9c97e9f
key64_129 u64 = 0xc5c3ebd33483acc5
key64_137 u64 = 0x17eb6313faffa081
key64_145 u64 = 0xd349daf0b751dd0d
key64_153 u64 = 0x2b9e68d429265516
key64_161 u64 = 0x8ffca1477d58be16
key64_169 u64 = 0x45ce31d07ad1b8f8
key64_177 u64 = 0xaf280416958f3acb
key64_011 = 0x6dd4de1cad21f72c
key64_019 = 0xa44072db979083e9
key64_027 = 0xe679cb1f67b3b7a4
key64_035 = 0xd05a8278e5c0cc4e
key64_043 = 0x4608b82172ffcc7d
key64_051 = 0x9035e08e2443f774
key64_059 = 0x52283c4c263a81e6
key64_067 = 0x65d088cb00c391bb
key64_117 = 0xd9c97e9fc83159b4
key64_125 = 0x3483acc5ea647378
key64_133 = 0xfaffa081c5c3ebd3
key64_141 = 0xb751dd0d17eb6313
key64_149 = 0x29265516d349daf0
key64_157 = 0x7d58be162b9e68d4
key64_165 = 0x7ad1b8f88ffca147
key64_173 = 0x958f3acb45ce31d0
)
const (
key32_000 u32 = 0xbe4ba423
key32_004 u32 = 0x396cfeb8
key32_008 u32 = 0x1cad21f7
key32_012 u32 = 0x2c81017c
)


@@ -1,253 +0,0 @@
package xxh3
import (
"math/bits"
)
// Hash128 returns the 128-bit hash of the byte slice.
func Hash128(b []byte) Uint128 {
return hashAny128(*(*str)(ptr(&b)))
}
// HashString128 returns the 128-bit hash of the string.
func HashString128(s string) Uint128 {
return hashAny128(*(*str)(ptr(&s)))
}
func hashAny128(s str) (acc u128) {
p, l := s.p, s.l
switch {
case l <= 16:
switch {
case l > 8: // 9-16
const bitflipl = key64_032 ^ key64_040
const bitfliph = key64_048 ^ key64_056
input_lo := readU64(p, 0)
input_hi := readU64(p, ui(l)-8)
m128_h, m128_l := bits.Mul64(input_lo^input_hi^bitflipl, prime64_1)
m128_l += uint64(l-1) << 54
input_hi ^= bitfliph
m128_h += input_hi + uint64(uint32(input_hi))*(prime32_2-1)
m128_l ^= bits.ReverseBytes64(m128_h)
acc.Hi, acc.Lo = bits.Mul64(m128_l, prime64_2)
acc.Hi += m128_h * prime64_2
acc.Lo = xxh3Avalanche(acc.Lo)
acc.Hi = xxh3Avalanche(acc.Hi)
return acc
case l > 3: // 4-8
const bitflip = key64_016 ^ key64_024
input_lo := readU32(p, 0)
input_hi := readU32(p, ui(l)-4)
input_64 := u64(input_lo) + u64(input_hi)<<32
keyed := input_64 ^ bitflip
acc.Hi, acc.Lo = bits.Mul64(keyed, prime64_1+(uint64(l)<<2))
acc.Hi += acc.Lo << 1
acc.Lo ^= acc.Hi >> 3
acc.Lo ^= acc.Lo >> 35
acc.Lo *= 0x9fb21c651e98df25
acc.Lo ^= acc.Lo >> 28
acc.Hi = xxh3Avalanche(acc.Hi)
return acc
case l == 3: // 3
c12 := u64(readU16(p, 0))
c3 := u64(readU8(p, 2))
acc.Lo = c12<<16 + c3 + 3<<8
case l > 1: // 2
c12 := u64(readU16(p, 0))
acc.Lo = c12*(1<<24+1)>>8 + 2<<8
case l == 1: // 1
c1 := u64(readU8(p, 0))
acc.Lo = c1*(1<<24+1<<16+1) + 1<<8
default: // 0
return u128{0x99aa06d3014798d8, 0x6001c324468d497f}
}
acc.Hi = uint64(bits.RotateLeft32(bits.ReverseBytes32(uint32(acc.Lo)), 13))
acc.Lo ^= uint64(key32_000 ^ key32_004)
acc.Hi ^= uint64(key32_008 ^ key32_012)
acc.Lo = xxh64AvalancheSmall(acc.Lo)
acc.Hi = xxh64AvalancheSmall(acc.Hi)
return acc
case l <= 128:
acc.Lo = u64(l) * prime64_1
if l > 32 {
if l > 64 {
if l > 96 {
in8, in7 := readU64(p, ui(l)-8*8), readU64(p, ui(l)-7*8)
i6, i7 := readU64(p, 6*8), readU64(p, 7*8)
acc.Hi += mulFold64(in8^key64_112, in7^key64_120)
acc.Hi ^= i6 + i7
acc.Lo += mulFold64(i6^key64_096, i7^key64_104)
acc.Lo ^= in8 + in7
} // 96
in6, in5 := readU64(p, ui(l)-6*8), readU64(p, ui(l)-5*8)
i4, i5 := readU64(p, 4*8), readU64(p, 5*8)
acc.Hi += mulFold64(in6^key64_080, in5^key64_088)
acc.Hi ^= i4 + i5
acc.Lo += mulFold64(i4^key64_064, i5^key64_072)
acc.Lo ^= in6 + in5
} // 64
in4, in3 := readU64(p, ui(l)-4*8), readU64(p, ui(l)-3*8)
i2, i3 := readU64(p, 2*8), readU64(p, 3*8)
acc.Hi += mulFold64(in4^key64_048, in3^key64_056)
acc.Hi ^= i2 + i3
acc.Lo += mulFold64(i2^key64_032, i3^key64_040)
acc.Lo ^= in4 + in3
} // 32
in2, in1 := readU64(p, ui(l)-2*8), readU64(p, ui(l)-1*8)
i0, i1 := readU64(p, 0*8), readU64(p, 1*8)
acc.Hi += mulFold64(in2^key64_016, in1^key64_024)
acc.Hi ^= i0 + i1
acc.Lo += mulFold64(i0^key64_000, i1^key64_008)
acc.Lo ^= in2 + in1
acc.Hi, acc.Lo = (acc.Lo*prime64_1)+(acc.Hi*prime64_4)+(u64(l)*prime64_2), acc.Hi+acc.Lo
acc.Hi = -xxh3Avalanche(acc.Hi)
acc.Lo = xxh3Avalanche(acc.Lo)
return acc
case l <= 240:
acc.Lo = u64(l) * prime64_1
{
i0, i1, i2, i3 := readU64(p, 0*8), readU64(p, 1*8), readU64(p, 2*8), readU64(p, 3*8)
acc.Hi += mulFold64(i2^key64_016, i3^key64_024)
acc.Hi ^= i0 + i1
acc.Lo += mulFold64(i0^key64_000, i1^key64_008)
acc.Lo ^= i2 + i3
}
{
i0, i1, i2, i3 := readU64(p, 4*8), readU64(p, 5*8), readU64(p, 6*8), readU64(p, 7*8)
acc.Hi += mulFold64(i2^key64_048, i3^key64_056)
acc.Hi ^= i0 + i1
acc.Lo += mulFold64(i0^key64_032, i1^key64_040)
acc.Lo ^= i2 + i3
}
{
i0, i1, i2, i3 := readU64(p, 8*8), readU64(p, 9*8), readU64(p, 10*8), readU64(p, 11*8)
acc.Hi += mulFold64(i2^key64_080, i3^key64_088)
acc.Hi ^= i0 + i1
acc.Lo += mulFold64(i0^key64_064, i1^key64_072)
acc.Lo ^= i2 + i3
}
{
i0, i1, i2, i3 := readU64(p, 12*8), readU64(p, 13*8), readU64(p, 14*8), readU64(p, 15*8)
acc.Hi += mulFold64(i2^key64_112, i3^key64_120)
acc.Hi ^= i0 + i1
acc.Lo += mulFold64(i0^key64_096, i1^key64_104)
acc.Lo ^= i2 + i3
}
// avalanche
acc.Hi = xxh3Avalanche(acc.Hi)
acc.Lo = xxh3Avalanche(acc.Lo)
// trailing groups after 128
top := ui(l) &^ 31
for i := ui(4 * 32); i < top; i += 32 {
i0, i1, i2, i3 := readU64(p, i+0), readU64(p, i+8), readU64(p, i+16), readU64(p, i+24)
k0, k1, k2, k3 := readU64(key, i-125), readU64(key, i-117), readU64(key, i-109), readU64(key, i-101)
acc.Hi += mulFold64(i2^k2, i3^k3)
acc.Hi ^= i0 + i1
acc.Lo += mulFold64(i0^k0, i1^k1)
acc.Lo ^= i2 + i3
}
// last 32 bytes
{
i0, i1, i2, i3 := readU64(p, ui(l)-32), readU64(p, ui(l)-24), readU64(p, ui(l)-16), readU64(p, ui(l)-8)
acc.Hi += mulFold64(i0^key64_119, i1^key64_127)
acc.Hi ^= i2 + i3
acc.Lo += mulFold64(i2^key64_103, i3^key64_111)
acc.Lo ^= i0 + i1
}
acc.Hi, acc.Lo = (acc.Lo*prime64_1)+(acc.Hi*prime64_4)+(u64(l)*prime64_2), acc.Hi+acc.Lo
acc.Hi = -xxh3Avalanche(acc.Hi)
acc.Lo = xxh3Avalanche(acc.Lo)
return acc
default:
acc.Lo = u64(l) * prime64_1
acc.Hi = ^(u64(l) * prime64_2)
accs := [8]u64{
prime32_3, prime64_1, prime64_2, prime64_3,
prime64_4, prime32_2, prime64_5, prime32_1,
}
if hasAVX512 && l >= avx512Switch {
accumAVX512(&accs, p, key, u64(l))
} else if hasAVX2 {
accumAVX2(&accs, p, key, u64(l))
} else if hasSSE2 {
accumSSE(&accs, p, key, u64(l))
} else {
accumScalar(&accs, p, key, u64(l))
}
// merge accs
acc.Lo += mulFold64(accs[0]^key64_011, accs[1]^key64_019)
acc.Hi += mulFold64(accs[0]^key64_117, accs[1]^key64_125)
acc.Lo += mulFold64(accs[2]^key64_027, accs[3]^key64_035)
acc.Hi += mulFold64(accs[2]^key64_133, accs[3]^key64_141)
acc.Lo += mulFold64(accs[4]^key64_043, accs[5]^key64_051)
acc.Hi += mulFold64(accs[4]^key64_149, accs[5]^key64_157)
acc.Lo += mulFold64(accs[6]^key64_059, accs[7]^key64_067)
acc.Hi += mulFold64(accs[6]^key64_165, accs[7]^key64_173)
acc.Lo = xxh3Avalanche(acc.Lo)
acc.Hi = xxh3Avalanche(acc.Hi)
return acc
}
}


@@ -1,264 +0,0 @@
package xxh3
import (
"math/bits"
)
// Hash128Seed returns the 128-bit hash of the byte slice with the given seed.
func Hash128Seed(b []byte, seed uint64) Uint128 {
return hashAny128Seed(*(*str)(ptr(&b)), seed)
}
// HashString128Seed returns the 128-bit hash of the string with the given seed.
func HashString128Seed(s string, seed uint64) Uint128 {
return hashAny128Seed(*(*str)(ptr(&s)), seed)
}
func hashAny128Seed(s str, seed uint64) (acc u128) {
p, l := s.p, s.l
switch {
case l <= 16:
switch {
case l > 8: // 9-16
bitflipl := (key64_032 ^ key64_040) - seed
bitfliph := (key64_048 ^ key64_056) + seed
input_lo := readU64(p, 0)
input_hi := readU64(p, ui(l)-8)
m128_h, m128_l := bits.Mul64(input_lo^input_hi^bitflipl, prime64_1)
m128_l += uint64(l-1) << 54
input_hi ^= bitfliph
m128_h += input_hi + uint64(uint32(input_hi))*(prime32_2-1)
m128_l ^= bits.ReverseBytes64(m128_h)
acc.Hi, acc.Lo = bits.Mul64(m128_l, prime64_2)
acc.Hi += m128_h * prime64_2
acc.Lo = xxh3Avalanche(acc.Lo)
acc.Hi = xxh3Avalanche(acc.Hi)
return acc
case l > 3: // 4-8
seed ^= u64(bits.ReverseBytes32(u32(seed))) << 32
bitflip := (key64_016 ^ key64_024) + seed
input_lo := readU32(p, 0)
input_hi := readU32(p, ui(l)-4)
input_64 := u64(input_lo) + u64(input_hi)<<32
keyed := input_64 ^ bitflip
acc.Hi, acc.Lo = bits.Mul64(keyed, prime64_1+(uint64(l)<<2))
acc.Hi += acc.Lo << 1
acc.Lo ^= acc.Hi >> 3
acc.Lo ^= acc.Lo >> 35
acc.Lo *= 0x9fb21c651e98df25
acc.Lo ^= acc.Lo >> 28
acc.Hi = xxh3Avalanche(acc.Hi)
return acc
case l == 3: // 3
c12 := u64(readU16(p, 0))
c3 := u64(readU8(p, 2))
acc.Lo = c12<<16 + c3 + 3<<8
case l > 1: // 2
c12 := u64(readU16(p, 0))
acc.Lo = c12*(1<<24+1)>>8 + 2<<8
case l == 1: // 1
c1 := u64(readU8(p, 0))
acc.Lo = c1*(1<<24+1<<16+1) + 1<<8
default: // 0
bitflipl := key64_064 ^ key64_072 ^ seed
bitfliph := key64_080 ^ key64_088 ^ seed
return u128{Lo: xxh64AvalancheFull(bitflipl), Hi: xxh64AvalancheFull(bitfliph)}
}
acc.Hi = uint64(bits.RotateLeft32(bits.ReverseBytes32(uint32(acc.Lo)), 13))
acc.Lo ^= uint64(key32_000^key32_004) + seed
acc.Hi ^= uint64(key32_008^key32_012) - seed
acc.Lo = xxh64AvalancheFull(acc.Lo)
acc.Hi = xxh64AvalancheFull(acc.Hi)
return acc
case l <= 128:
acc.Lo = u64(l) * prime64_1
if l > 32 {
if l > 64 {
if l > 96 {
in8, in7 := readU64(p, ui(l)-8*8), readU64(p, ui(l)-7*8)
i6, i7 := readU64(p, 6*8), readU64(p, 7*8)
acc.Hi += mulFold64(in8^(key64_112+seed), in7^(key64_120-seed))
acc.Hi ^= i6 + i7
acc.Lo += mulFold64(i6^(key64_096+seed), i7^(key64_104-seed))
acc.Lo ^= in8 + in7
} // 96
in6, in5 := readU64(p, ui(l)-6*8), readU64(p, ui(l)-5*8)
i4, i5 := readU64(p, 4*8), readU64(p, 5*8)
acc.Hi += mulFold64(in6^(key64_080+seed), in5^(key64_088-seed))
acc.Hi ^= i4 + i5
acc.Lo += mulFold64(i4^(key64_064+seed), i5^(key64_072-seed))
acc.Lo ^= in6 + in5
} // 64
in4, in3 := readU64(p, ui(l)-4*8), readU64(p, ui(l)-3*8)
i2, i3 := readU64(p, 2*8), readU64(p, 3*8)
acc.Hi += mulFold64(in4^(key64_048+seed), in3^(key64_056-seed))
acc.Hi ^= i2 + i3
acc.Lo += mulFold64(i2^(key64_032+seed), i3^(key64_040-seed))
acc.Lo ^= in4 + in3
} // 32
in2, in1 := readU64(p, ui(l)-2*8), readU64(p, ui(l)-1*8)
i0, i1 := readU64(p, 0*8), readU64(p, 1*8)
acc.Hi += mulFold64(in2^(key64_016+seed), in1^(key64_024-seed))
acc.Hi ^= i0 + i1
acc.Lo += mulFold64(i0^(key64_000+seed), i1^(key64_008-seed))
acc.Lo ^= in2 + in1
acc.Hi, acc.Lo = (acc.Lo*prime64_1)+(acc.Hi*prime64_4)+((u64(l)-seed)*prime64_2), acc.Hi+acc.Lo
acc.Hi = -xxh3Avalanche(acc.Hi)
acc.Lo = xxh3Avalanche(acc.Lo)
return acc
case l <= 240:
acc.Lo = u64(l) * prime64_1
{
i0, i1, i2, i3 := readU64(p, 0*8), readU64(p, 1*8), readU64(p, 2*8), readU64(p, 3*8)
acc.Hi += mulFold64(i2^(key64_016+seed), i3^(key64_024-seed))
acc.Hi ^= i0 + i1
acc.Lo += mulFold64(i0^(key64_000+seed), i1^(key64_008-seed))
acc.Lo ^= i2 + i3
}
{
i0, i1, i2, i3 := readU64(p, 4*8), readU64(p, 5*8), readU64(p, 6*8), readU64(p, 7*8)
acc.Hi += mulFold64(i2^(key64_048+seed), i3^(key64_056-seed))
acc.Hi ^= i0 + i1
acc.Lo += mulFold64(i0^(key64_032+seed), i1^(key64_040-seed))
acc.Lo ^= i2 + i3
}
{
i0, i1, i2, i3 := readU64(p, 8*8), readU64(p, 9*8), readU64(p, 10*8), readU64(p, 11*8)
acc.Hi += mulFold64(i2^(key64_080+seed), i3^(key64_088-seed))
acc.Hi ^= i0 + i1
acc.Lo += mulFold64(i0^(key64_064+seed), i1^(key64_072-seed))
acc.Lo ^= i2 + i3
}
{
i0, i1, i2, i3 := readU64(p, 12*8), readU64(p, 13*8), readU64(p, 14*8), readU64(p, 15*8)
acc.Hi += mulFold64(i2^(key64_112+seed), i3^(key64_120-seed))
acc.Hi ^= i0 + i1
acc.Lo += mulFold64(i0^(key64_096+seed), i1^(key64_104-seed))
acc.Lo ^= i2 + i3
}
// avalanche
acc.Hi = xxh3Avalanche(acc.Hi)
acc.Lo = xxh3Avalanche(acc.Lo)
// trailing groups after 128
top := ui(l) &^ 31
for i := ui(4 * 32); i < top; i += 32 {
i0, i1, i2, i3 := readU64(p, i+0), readU64(p, i+8), readU64(p, i+16), readU64(p, i+24)
k0, k1, k2, k3 := readU64(key, i-125)+seed, readU64(key, i-117)-seed, readU64(key, i-109)+seed, readU64(key, i-101)-seed
acc.Hi += mulFold64(i2^k2, i3^k3)
acc.Hi ^= i0 + i1
acc.Lo += mulFold64(i0^k0, i1^k1)
acc.Lo ^= i2 + i3
}
// last 32 bytes
{
i0, i1, i2, i3 := readU64(p, ui(l)-32), readU64(p, ui(l)-24), readU64(p, ui(l)-16), readU64(p, ui(l)-8)
seed := 0 - seed
acc.Hi += mulFold64(i0^(key64_119+seed), i1^(key64_127-seed))
acc.Hi ^= i2 + i3
acc.Lo += mulFold64(i2^(key64_103+seed), i3^(key64_111-seed))
acc.Lo ^= i0 + i1
}
acc.Hi, acc.Lo = (acc.Lo*prime64_1)+(acc.Hi*prime64_4)+((u64(l)-seed)*prime64_2), acc.Hi+acc.Lo
acc.Hi = -xxh3Avalanche(acc.Hi)
acc.Lo = xxh3Avalanche(acc.Lo)
return acc
default:
acc.Lo = u64(l) * prime64_1
acc.Hi = ^(u64(l) * prime64_2)
secret := key
if seed != 0 {
secret = ptr(&[secretSize]byte{})
initSecret(secret, seed)
}
accs := [8]u64{
prime32_3, prime64_1, prime64_2, prime64_3,
prime64_4, prime32_2, prime64_5, prime32_1,
}
if hasAVX512 && l >= avx512Switch {
accumAVX512(&accs, p, secret, u64(l))
} else if hasAVX2 {
accumAVX2(&accs, p, secret, u64(l))
} else if hasSSE2 {
accumSSE(&accs, p, secret, u64(l))
} else {
accumScalar(&accs, p, secret, u64(l))
}
// merge accs
const hi_off = 117 - 11
acc.Lo += mulFold64(accs[0]^readU64(secret, 11), accs[1]^readU64(secret, 19))
acc.Hi += mulFold64(accs[0]^readU64(secret, 11+hi_off), accs[1]^readU64(secret, 19+hi_off))
acc.Lo += mulFold64(accs[2]^readU64(secret, 27), accs[3]^readU64(secret, 35))
acc.Hi += mulFold64(accs[2]^readU64(secret, 27+hi_off), accs[3]^readU64(secret, 35+hi_off))
acc.Lo += mulFold64(accs[4]^readU64(secret, 43), accs[5]^readU64(secret, 51))
acc.Hi += mulFold64(accs[4]^readU64(secret, 43+hi_off), accs[5]^readU64(secret, 51+hi_off))
acc.Lo += mulFold64(accs[6]^readU64(secret, 59), accs[7]^readU64(secret, 67))
acc.Hi += mulFold64(accs[6]^readU64(secret, 59+hi_off), accs[7]^readU64(secret, 67+hi_off))
acc.Lo = xxh3Avalanche(acc.Lo)
acc.Hi = xxh3Avalanche(acc.Hi)
return acc
}
}


@ -1,126 +0,0 @@
package xxh3
import "math/bits"
// Hash returns the hash of the byte slice.
func Hash(b []byte) uint64 {
return hashAny(*(*str)(ptr(&b)))
}
// HashString returns the hash of the string.
func HashString(s string) uint64 {
return hashAny(*(*str)(ptr(&s)))
}
func hashAny(s str) (acc u64) {
p, l := s.p, s.l
switch {
case l <= 16:
switch {
case l > 8: // 9-16
inputlo := readU64(p, 0) ^ (key64_024 ^ key64_032)
inputhi := readU64(p, ui(l)-8) ^ (key64_040 ^ key64_048)
folded := mulFold64(inputlo, inputhi)
return xxh3Avalanche(u64(l) + bits.ReverseBytes64(inputlo) + inputhi + folded)
case l > 3: // 4-8
input1 := readU32(p, 0)
input2 := readU32(p, ui(l)-4)
input64 := u64(input2) + u64(input1)<<32
keyed := input64 ^ (key64_008 ^ key64_016)
return rrmxmx(keyed, u64(l))
case l == 3: // 3
c12 := u64(readU16(p, 0))
c3 := u64(readU8(p, 2))
acc = c12<<16 + c3 + 3<<8
case l > 1: // 2
c12 := u64(readU16(p, 0))
acc = c12*(1<<24+1)>>8 + 2<<8
case l == 1: // 1
c1 := u64(readU8(p, 0))
acc = c1*(1<<24+1<<16+1) + 1<<8
default: // 0
return 0x2d06800538d394c2 // xxh_avalanche(key64_056 ^ key64_064)
}
acc ^= u64(key32_000 ^ key32_004)
return xxhAvalancheSmall(acc)
case l <= 128:
acc = u64(l) * prime64_1
if l > 32 {
if l > 64 {
if l > 96 {
acc += mulFold64(readU64(p, 6*8)^key64_096, readU64(p, 7*8)^key64_104)
acc += mulFold64(readU64(p, ui(l)-8*8)^key64_112, readU64(p, ui(l)-7*8)^key64_120)
} // 96
acc += mulFold64(readU64(p, 4*8)^key64_064, readU64(p, 5*8)^key64_072)
acc += mulFold64(readU64(p, ui(l)-6*8)^key64_080, readU64(p, ui(l)-5*8)^key64_088)
} // 64
acc += mulFold64(readU64(p, 2*8)^key64_032, readU64(p, 3*8)^key64_040)
acc += mulFold64(readU64(p, ui(l)-4*8)^key64_048, readU64(p, ui(l)-3*8)^key64_056)
} // 32
acc += mulFold64(readU64(p, 0*8)^key64_000, readU64(p, 1*8)^key64_008)
acc += mulFold64(readU64(p, ui(l)-2*8)^key64_016, readU64(p, ui(l)-1*8)^key64_024)
return xxh3Avalanche(acc)
case l <= 240:
acc = u64(l) * prime64_1
acc += mulFold64(readU64(p, 0*16+0)^key64_000, readU64(p, 0*16+8)^key64_008)
acc += mulFold64(readU64(p, 1*16+0)^key64_016, readU64(p, 1*16+8)^key64_024)
acc += mulFold64(readU64(p, 2*16+0)^key64_032, readU64(p, 2*16+8)^key64_040)
acc += mulFold64(readU64(p, 3*16+0)^key64_048, readU64(p, 3*16+8)^key64_056)
acc += mulFold64(readU64(p, 4*16+0)^key64_064, readU64(p, 4*16+8)^key64_072)
acc += mulFold64(readU64(p, 5*16+0)^key64_080, readU64(p, 5*16+8)^key64_088)
acc += mulFold64(readU64(p, 6*16+0)^key64_096, readU64(p, 6*16+8)^key64_104)
acc += mulFold64(readU64(p, 7*16+0)^key64_112, readU64(p, 7*16+8)^key64_120)
// avalanche
acc = xxh3Avalanche(acc)
// trailing groups after 128
top := ui(l) &^ 15
for i := ui(8 * 16); i < top; i += 16 {
acc += mulFold64(readU64(p, i+0)^readU64(key, i-125), readU64(p, i+8)^readU64(key, i-117))
}
// last 16 bytes
acc += mulFold64(readU64(p, ui(l)-16)^key64_119, readU64(p, ui(l)-8)^key64_127)
return xxh3Avalanche(acc)
default:
acc = u64(l) * prime64_1
accs := [8]u64{
prime32_3, prime64_1, prime64_2, prime64_3,
prime64_4, prime32_2, prime64_5, prime32_1,
}
if hasAVX512 && l >= avx512Switch {
accumAVX512(&accs, p, key, u64(l))
} else if hasAVX2 {
accumAVX2(&accs, p, key, u64(l))
} else if hasSSE2 {
accumSSE(&accs, p, key, u64(l))
} else {
accumScalar(&accs, p, key, u64(l))
}
// merge accs
acc += mulFold64(accs[0]^key64_011, accs[1]^key64_019)
acc += mulFold64(accs[2]^key64_027, accs[3]^key64_035)
acc += mulFold64(accs[4]^key64_043, accs[5]^key64_051)
acc += mulFold64(accs[6]^key64_059, accs[7]^key64_067)
return xxh3Avalanche(acc)
}
}
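
hashAny above dispatches purely on input length: inputs of 16 bytes or fewer hit the small-input mixers, 17–128 and 129–240 bytes use unrolled mulFold64 chains, and anything longer runs the vectorized accumulators. A minimal usage sketch (illustrative input; since HashString reuses the same core through the str header, it should agree with Hash on the same bytes):

    package main

    import (
    	"fmt"

    	"github.com/zeebo/xxh3"
    )

    func main() {
    	b := []byte("gotosocial")
    	// Same core for both entry points, no intermediate conversion needed.
    	fmt.Println(xxh3.Hash(b) == xxh3.HashString(string(b))) // true
    }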

View File

@ -1,134 +0,0 @@
package xxh3
import "math/bits"
// HashSeed returns the hash of the byte slice with the given seed.
func HashSeed(b []byte, seed uint64) uint64 {
return hashAnySeed(*(*str)(ptr(&b)), seed)
}
// HashStringSeed returns the hash of the string with the given seed.
func HashStringSeed(s string, seed uint64) uint64 {
return hashAnySeed(*(*str)(ptr(&s)), seed)
}
func hashAnySeed(s str, seed uint64) (acc u64) {
p, l := s.p, s.l
switch {
case l <= 16:
switch {
case l > 8:
inputlo := readU64(p, 0) ^ (key64_024 ^ key64_032 + seed)
inputhi := readU64(p, ui(l)-8) ^ (key64_040 ^ key64_048 - seed)
folded := mulFold64(inputlo, inputhi)
return xxh3Avalanche(u64(l) + bits.ReverseBytes64(inputlo) + inputhi + folded)
case l > 3:
seed ^= u64(bits.ReverseBytes32(u32(seed))) << 32
input1 := readU32(p, 0)
input2 := readU32(p, ui(l)-4)
input64 := u64(input2) + u64(input1)<<32
keyed := input64 ^ (key64_008 ^ key64_016 - seed)
return rrmxmx(keyed, u64(l))
case l == 3: // 3
c12 := u64(readU16(p, 0))
c3 := u64(readU8(p, 2))
acc = c12<<16 + c3 + 3<<8
case l > 1: // 2
c12 := u64(readU16(p, 0))
acc = c12*(1<<24+1)>>8 + 2<<8
case l == 1: // 1
c1 := u64(readU8(p, 0))
acc = c1*(1<<24+1<<16+1) + 1<<8
default:
return xxhAvalancheSmall(seed ^ key64_056 ^ key64_064)
}
acc ^= u64(key32_000^key32_004) + seed
return xxhAvalancheSmall(acc)
case l <= 128:
acc = u64(l) * prime64_1
if l > 32 {
if l > 64 {
if l > 96 {
acc += mulFold64(readU64(p, 6*8)^(key64_096+seed), readU64(p, 7*8)^(key64_104-seed))
acc += mulFold64(readU64(p, ui(l)-8*8)^(key64_112+seed), readU64(p, ui(l)-7*8)^(key64_120-seed))
} // 96
acc += mulFold64(readU64(p, 4*8)^(key64_064+seed), readU64(p, 5*8)^(key64_072-seed))
acc += mulFold64(readU64(p, ui(l)-6*8)^(key64_080+seed), readU64(p, ui(l)-5*8)^(key64_088-seed))
} // 64
acc += mulFold64(readU64(p, 2*8)^(key64_032+seed), readU64(p, 3*8)^(key64_040-seed))
acc += mulFold64(readU64(p, ui(l)-4*8)^(key64_048+seed), readU64(p, ui(l)-3*8)^(key64_056-seed))
} // 32
acc += mulFold64(readU64(p, 0*8)^(key64_000+seed), readU64(p, 1*8)^(key64_008-seed))
acc += mulFold64(readU64(p, ui(l)-2*8)^(key64_016+seed), readU64(p, ui(l)-1*8)^(key64_024-seed))
return xxh3Avalanche(acc)
case l <= 240:
acc = u64(l) * prime64_1
acc += mulFold64(readU64(p, 0*16+0)^(key64_000+seed), readU64(p, 0*16+8)^(key64_008-seed))
acc += mulFold64(readU64(p, 1*16+0)^(key64_016+seed), readU64(p, 1*16+8)^(key64_024-seed))
acc += mulFold64(readU64(p, 2*16+0)^(key64_032+seed), readU64(p, 2*16+8)^(key64_040-seed))
acc += mulFold64(readU64(p, 3*16+0)^(key64_048+seed), readU64(p, 3*16+8)^(key64_056-seed))
acc += mulFold64(readU64(p, 4*16+0)^(key64_064+seed), readU64(p, 4*16+8)^(key64_072-seed))
acc += mulFold64(readU64(p, 5*16+0)^(key64_080+seed), readU64(p, 5*16+8)^(key64_088-seed))
acc += mulFold64(readU64(p, 6*16+0)^(key64_096+seed), readU64(p, 6*16+8)^(key64_104-seed))
acc += mulFold64(readU64(p, 7*16+0)^(key64_112+seed), readU64(p, 7*16+8)^(key64_120-seed))
// avalanche
acc = xxh3Avalanche(acc)
// trailing groups after 128
top := ui(l) &^ 15
for i := ui(8 * 16); i < top; i += 16 {
acc += mulFold64(readU64(p, i+0)^(readU64(key, i-125)+seed), readU64(p, i+8)^(readU64(key, i-117)-seed))
}
// last 16 bytes
acc += mulFold64(readU64(p, ui(l)-16)^(key64_119+seed), readU64(p, ui(l)-8)^(key64_127-seed))
return xxh3Avalanche(acc)
default:
acc = u64(l) * prime64_1
secret := key
if seed != 0 {
secret = ptr(&[secretSize]byte{})
initSecret(secret, seed)
}
accs := [8]u64{
prime32_3, prime64_1, prime64_2, prime64_3,
prime64_4, prime32_2, prime64_5, prime32_1,
}
if hasAVX512 && l >= avx512Switch {
accumAVX512(&accs, p, secret, u64(l))
} else if hasAVX2 {
accumAVX2(&accs, p, secret, u64(l))
} else if hasSSE2 {
accumSSE(&accs, p, secret, u64(l))
} else {
accumScalarSeed(&accs, p, secret, u64(l))
}
// merge accs
acc += mulFold64(accs[0]^readU64(secret, 11), accs[1]^readU64(secret, 19))
acc += mulFold64(accs[2]^readU64(secret, 27), accs[3]^readU64(secret, 35))
acc += mulFold64(accs[4]^readU64(secret, 43), accs[5]^readU64(secret, 51))
acc += mulFold64(accs[6]^readU64(secret, 59), accs[7]^readU64(secret, 67))
return xxh3Avalanche(acc)
}
}
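
In hashAnySeed the seed is mixed in as paired +seed/-seed adjustments to the key words, and for inputs over 240 bytes a non-zero seed derives a whole fresh secret via initSecret. A hedged property check (assuming, as the code suggests, that seed 0 leaves every key word unchanged):

    package main

    import (
    	"fmt"

    	"github.com/zeebo/xxh3"
    )

    func main() {
    	b := make([]byte, 300) // >240 bytes, so the accumulator path runs
    	fmt.Println(xxh3.HashSeed(b, 0) == xxh3.Hash(b)) // true: zero seed reduces to the unseeded hash
    	fmt.Println(xxh3.HashSeed(b, 1) == xxh3.Hash(b)) // false: non-zero seed derives a fresh secret
    }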

View File

@ -1,239 +0,0 @@
package xxh3
import (
"encoding/binary"
"hash"
)
// Hasher implements the hash.Hash and hash.Hash64 interfaces.
type Hasher struct {
acc [8]u64
blk u64
len u64
key ptr
buf [_block + _stripe]byte
seed u64
}
var (
_ hash.Hash = (*Hasher)(nil)
_ hash.Hash64 = (*Hasher)(nil)
)
// New returns a new Hasher that implements the hash.Hash interface.
func New() *Hasher {
return new(Hasher)
}
// NewSeed returns a new Hasher, initialized with the given seed, that implements the hash.Hash interface.
func NewSeed(seed uint64) *Hasher {
var h Hasher
h.Reset()
h.seed = seed
h.key = key
// Only initialize the derived secret once, not on every reset.
if seed != 0 {
h.key = ptr(&[secretSize]byte{})
initSecret(h.key, seed)
}
return &h
}
// Reset resets the Hasher to its initial state.
func (h *Hasher) Reset() {
h.acc = [8]u64{
prime32_3, prime64_1, prime64_2, prime64_3,
prime64_4, prime32_2, prime64_5, prime32_1,
}
h.blk = 0
h.len = 0
}
// BlockSize returns the hash's underlying block size.
// The Write method will accept any amount of data, but
// it may operate more efficiently if all writes are a
// multiple of the block size.
func (h *Hasher) BlockSize() int { return _stripe }
// Size returns the number of bytes Sum will return.
func (h *Hasher) Size() int { return 8 }
// Sum appends the current hash to b and returns the resulting slice.
// It does not change the underlying hash state.
func (h *Hasher) Sum(b []byte) []byte {
var tmp [8]byte
binary.BigEndian.PutUint64(tmp[:], h.Sum64())
return append(b, tmp[:]...)
}
// Write adds more data to the running hash.
// It never returns an error.
func (h *Hasher) Write(buf []byte) (int, error) {
h.update(buf)
return len(buf), nil
}
// WriteString adds more data to the running hash.
// It never returns an error.
func (h *Hasher) WriteString(buf string) (int, error) {
h.updateString(buf)
return len(buf), nil
}
func (h *Hasher) update(buf []byte) {
// relies on the data pointer being the first word in the string header
h.updateString(*(*string)(ptr(&buf)))
}
func (h *Hasher) updateString(buf string) {
if h.key == nil {
h.key = key
h.Reset()
}
// On the first write, while the input still holds more than a full buffer, process whole blocks directly from it without copying.
for h.len == 0 && len(buf) > len(h.buf) {
if hasAVX2 {
accumBlockAVX2(&h.acc, *(*ptr)(ptr(&buf)), h.key)
} else if hasSSE2 {
accumBlockSSE(&h.acc, *(*ptr)(ptr(&buf)), h.key)
} else {
accumBlockScalar(&h.acc, *(*ptr)(ptr(&buf)), h.key)
}
buf = buf[_block:]
h.blk++
}
for len(buf) > 0 {
if h.len < u64(len(h.buf)) {
n := copy(h.buf[h.len:], buf)
h.len += u64(n)
buf = buf[n:]
continue
}
if hasAVX2 {
accumBlockAVX2(&h.acc, ptr(&h.buf), h.key)
} else if hasSSE2 {
accumBlockSSE(&h.acc, ptr(&h.buf), h.key)
} else {
accumBlockScalar(&h.acc, ptr(&h.buf), h.key)
}
h.blk++
h.len = _stripe
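// Keep the trailing stripe at the front of the buffer: finalization always
// needs the last stripe of input, so the buffer is never fully drained.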
copy(h.buf[:_stripe], h.buf[_block:])
}
}
// Sum64 returns the 64-bit hash of the written data.
func (h *Hasher) Sum64() uint64 {
if h.key == nil {
h.key = key
h.Reset()
}
if h.blk == 0 {
if h.seed == 0 {
return Hash(h.buf[:h.len])
}
return HashSeed(h.buf[:h.len], h.seed)
}
l := h.blk*_block + h.len
acc := l * prime64_1
accs := h.acc
if h.len > 0 {
// At most one block is processed here, so no AVX512.
if hasAVX2 {
accumAVX2(&accs, ptr(&h.buf[0]), h.key, h.len)
} else if hasSSE2 {
accumSSE(&accs, ptr(&h.buf[0]), h.key, h.len)
} else {
accumScalar(&accs, ptr(&h.buf[0]), h.key, h.len)
}
}
if h.seed == 0 {
acc += mulFold64(accs[0]^key64_011, accs[1]^key64_019)
acc += mulFold64(accs[2]^key64_027, accs[3]^key64_035)
acc += mulFold64(accs[4]^key64_043, accs[5]^key64_051)
acc += mulFold64(accs[6]^key64_059, accs[7]^key64_067)
} else {
secret := h.key
acc += mulFold64(accs[0]^readU64(secret, 11), accs[1]^readU64(secret, 19))
acc += mulFold64(accs[2]^readU64(secret, 27), accs[3]^readU64(secret, 35))
acc += mulFold64(accs[4]^readU64(secret, 43), accs[5]^readU64(secret, 51))
acc += mulFold64(accs[6]^readU64(secret, 59), accs[7]^readU64(secret, 67))
}
acc = xxh3Avalanche(acc)
return acc
}
// Sum128 returns the 128-bit hash of the written data.
func (h *Hasher) Sum128() Uint128 {
if h.key == nil {
h.key = key
h.Reset()
}
if h.blk == 0 {
if h.seed == 0 {
return Hash128(h.buf[:h.len])
}
return Hash128Seed(h.buf[:h.len], h.seed)
}
l := h.blk*_block + h.len
acc := Uint128{Lo: l * prime64_1, Hi: ^(l * prime64_2)}
accs := h.acc
if h.len > 0 {
// At most one block is processed here, so no AVX512.
if hasAVX2 {
accumAVX2(&accs, ptr(&h.buf[0]), h.key, h.len)
} else if hasSSE2 {
accumSSE(&accs, ptr(&h.buf[0]), h.key, h.len)
} else {
accumScalar(&accs, ptr(&h.buf[0]), h.key, h.len)
}
}
if h.seed == 0 {
acc.Lo += mulFold64(accs[0]^key64_011, accs[1]^key64_019)
acc.Hi += mulFold64(accs[0]^key64_117, accs[1]^key64_125)
acc.Lo += mulFold64(accs[2]^key64_027, accs[3]^key64_035)
acc.Hi += mulFold64(accs[2]^key64_133, accs[3]^key64_141)
acc.Lo += mulFold64(accs[4]^key64_043, accs[5]^key64_051)
acc.Hi += mulFold64(accs[4]^key64_149, accs[5]^key64_157)
acc.Lo += mulFold64(accs[6]^key64_059, accs[7]^key64_067)
acc.Hi += mulFold64(accs[6]^key64_165, accs[7]^key64_173)
} else {
secret := h.key
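// hi_off shifts the secret window by 106 bytes: the Lo lanes read bytes
// 11..67 and the Hi lanes read 117..173, matching the key64_117..key64_173
// constants used on the unseeded path above.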
const hi_off = 117 - 11
acc.Lo += mulFold64(accs[0]^readU64(secret, 11), accs[1]^readU64(secret, 19))
acc.Hi += mulFold64(accs[0]^readU64(secret, 11+hi_off), accs[1]^readU64(secret, 19+hi_off))
acc.Lo += mulFold64(accs[2]^readU64(secret, 27), accs[3]^readU64(secret, 35))
acc.Hi += mulFold64(accs[2]^readU64(secret, 27+hi_off), accs[3]^readU64(secret, 35+hi_off))
acc.Lo += mulFold64(accs[4]^readU64(secret, 43), accs[5]^readU64(secret, 51))
acc.Hi += mulFold64(accs[4]^readU64(secret, 43+hi_off), accs[5]^readU64(secret, 51+hi_off))
acc.Lo += mulFold64(accs[6]^readU64(secret, 59), accs[7]^readU64(secret, 67))
acc.Hi += mulFold64(accs[6]^readU64(secret, 59+hi_off), accs[7]^readU64(secret, 67+hi_off))
}
acc.Lo = xxh3Avalanche(acc.Lo)
acc.Hi = xxh3Avalanche(acc.Hi)
return acc
}
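
Sum64 and Sum128 work on a copy of the accumulator state, so a Hasher can be queried mid-stream and written to again. A short usage sketch (illustrative strings; streaming should match the one-shot functions):

    package main

    import (
    	"fmt"

    	"github.com/zeebo/xxh3"
    )

    func main() {
    	h := xxh3.New()
    	h.WriteString("hello ") // Write/WriteString never return an error
    	h.WriteString("world")
    	fmt.Println(h.Sum64() == xxh3.HashString("hello world")) // true
    }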

129
vendor/github.com/zeebo/xxh3/utils.go generated vendored
View File

@ -1,129 +0,0 @@
package xxh3
import (
"math/bits"
"unsafe"
)
// Uint128 is a 128-bit value.
// The actual value can be thought of as u.Hi<<64 | u.Lo.
type Uint128 struct {
Hi, Lo uint64
}
// Bytes returns the Uint128 as an array of bytes in canonical (big-endian) form.
func (u Uint128) Bytes() [16]byte {
return [16]byte{
byte(u.Hi >> 0x38), byte(u.Hi >> 0x30), byte(u.Hi >> 0x28), byte(u.Hi >> 0x20),
byte(u.Hi >> 0x18), byte(u.Hi >> 0x10), byte(u.Hi >> 0x08), byte(u.Hi),
byte(u.Lo >> 0x38), byte(u.Lo >> 0x30), byte(u.Lo >> 0x28), byte(u.Lo >> 0x20),
byte(u.Lo >> 0x18), byte(u.Lo >> 0x10), byte(u.Lo >> 0x08), byte(u.Lo),
}
}
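
The shift ladder above is just a hand-unrolled big-endian store: Hi first, then Lo. A hedged restatement with encoding/binary — bytesBE is a hypothetical helper, not part of this package:

    import "encoding/binary"

    // bytesBE writes Hi then Lo, both big-endian, exactly as Bytes does.
    func bytesBE(u Uint128) [16]byte {
    	var out [16]byte
    	binary.BigEndian.PutUint64(out[:8], u.Hi)
    	binary.BigEndian.PutUint64(out[8:], u.Lo)
    	return out
    }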
type (
ptr = unsafe.Pointer
ui = uintptr
u8 = uint8
u32 = uint32
u64 = uint64
u128 = Uint128
)
type str struct {
p ptr
l uint
}
func readU8(p ptr, o ui) uint8 {
return *(*uint8)(ptr(ui(p) + o))
}
func readU16(p ptr, o ui) uint16 {
b := (*[2]byte)(ptr(ui(p) + o))
return uint16(b[0]) | uint16(b[1])<<8
}
func readU32(p ptr, o ui) uint32 {
b := (*[4]byte)(ptr(ui(p) + o))
return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
}
func readU64(p ptr, o ui) uint64 {
b := (*[8]byte)(ptr(ui(p) + o))
return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 |
uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
}
func writeU64(p ptr, o ui, v u64) {
b := (*[8]byte)(ptr(ui(p) + o))
b[0] = byte(v)
b[1] = byte(v >> 8)
b[2] = byte(v >> 16)
b[3] = byte(v >> 24)
b[4] = byte(v >> 32)
b[5] = byte(v >> 40)
b[6] = byte(v >> 48)
b[7] = byte(v >> 56)
}
const secretSize = 192
func initSecret(secret ptr, seed u64) {
for i := ui(0); i < secretSize/16; i++ {
lo := readU64(key, 16*i) + seed
hi := readU64(key, 16*i+8) - seed
writeU64(secret, 16*i, lo)
writeU64(secret, 16*i+8, hi)
}
}
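
initSecret derives the per-seed secret from the default key: each 16-byte group gets the seed added to its low 8-byte word and subtracted from its high word (readU64/writeU64 are little-endian). A slice-based restatement under those assumptions — deriveSecret is a hypothetical name; the vendored code works through unsafe pointers instead:

    import "encoding/binary"

    func deriveSecret(key []byte, seed uint64) []byte {
    	out := make([]byte, len(key)) // len(key) == secretSize == 192
    	for i := 0; i+16 <= len(key); i += 16 {
    		binary.LittleEndian.PutUint64(out[i:], binary.LittleEndian.Uint64(key[i:])+seed)
    		binary.LittleEndian.PutUint64(out[i+8:], binary.LittleEndian.Uint64(key[i+8:])-seed)
    	}
    	return out
    }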
func xxh64AvalancheSmall(x u64) u64 {
// x ^= x >> 33 // x must be < 32 bits
// x ^= u64(key32_000 ^ key32_004) // caller must do this
x *= prime64_2
x ^= x >> 29
x *= prime64_3
x ^= x >> 32
return x
}
func xxhAvalancheSmall(x u64) u64 {
x ^= x >> 33
x *= prime64_2
x ^= x >> 29
x *= prime64_3
x ^= x >> 32
return x
}
func xxh64AvalancheFull(x u64) u64 {
x ^= x >> 33
x *= prime64_2
x ^= x >> 29
x *= prime64_3
x ^= x >> 32
return x
}
func xxh3Avalanche(x u64) u64 {
x ^= x >> 37
x *= 0x165667919e3779f9
x ^= x >> 32
return x
}
func rrmxmx(h64 u64, len u64) u64 {
h64 ^= bits.RotateLeft64(h64, 49) ^ bits.RotateLeft64(h64, 24)
h64 *= 0x9fb21c651e98df25
h64 ^= (h64 >> 35) + len
h64 *= 0x9fb21c651e98df25
h64 ^= (h64 >> 28)
return h64
}
func mulFold64(x, y u64) u64 {
hi, lo := bits.Mul64(x, y)
return hi ^ lo
}
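
mulFold64 is the core mixer: the full 128-bit product of x and y, folded back to 64 bits by XORing the halves. A worked case: with x = y = 1<<32 the product is exactly 1<<64, so bits.Mul64 returns hi = 1, lo = 0 and the fold is 1:

    package main

    import (
    	"fmt"
    	"math/bits"
    )

    func main() {
    	hi, lo := bits.Mul64(1<<32, 1<<32) // 128-bit product: 1<<64
    	fmt.Println(hi, lo, hi^lo)         // 1 0 1
    }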