diff --git a/.ci/ci-test.sh b/.ci/ci-test.sh index c3d163de..21066fe6 100755 --- a/.ci/ci-test.sh +++ b/.ci/ci-test.sh @@ -146,3 +146,25 @@ t || dig -p${DNS_PORT} A MICROSOFT.COM @127.0.0.1 | grep -Fq "NOERROR" || fail kill $(cat /tmp/dnscrypt-proxy.pidfile) sleep 5 + +section +../dnscrypt-proxy/dnscrypt-proxy -loglevel 3 -config test-odoh-direct.toml -pidfile /tmp/odoh-direct.pidfile & +sleep 5 + +section +t || dig -p${DNS_PORT} A microsoft.com @127.0.0.1 | grep -Fq "NOERROR" || fail +t || dig -p${DNS_PORT} A cloudflare.com @127.0.0.1 | grep -Fq "NOERROR" || fail + +kill $(cat /tmp/odoh-direct.pidfile) +sleep 5 + +section +../dnscrypt-proxy/dnscrypt-proxy -loglevel 3 -config test-odoh-proxied.toml -pidfile /tmp/odoh-proxied.pidfile & +sleep 5 + +section +t || dig -p${DNS_PORT} A microsoft.com @127.0.0.1 | grep -Fq "NOERROR" || fail +t || dig -p${DNS_PORT} A cloudflare.com @127.0.0.1 | grep -Fq "NOERROR" || fail + +kill $(cat /tmp/odoh-proxied.pidfile) +sleep 5 diff --git a/.github/workflows/releases.yml b/.github/workflows/releases.yml index 9d021277..26b8c115 100644 --- a/.github/workflows/releases.yml +++ b/.github/workflows/releases.yml @@ -39,6 +39,7 @@ jobs: - name: Test suite run: | go version + go mod vendor cd .ci ./ci-test.sh cd - diff --git a/dnscrypt-proxy/config.go b/dnscrypt-proxy/config.go index 0a4af124..8ce5d588 100644 --- a/dnscrypt-proxy/config.go +++ b/dnscrypt-proxy/config.go @@ -718,8 +718,8 @@ func ConfigLoad(proxy *Proxy, flags *ConfigFlags) error { hasSpecificRoutes := false for _, server := range proxy.registeredServers { if via, ok := (*proxy.routes)[server.name]; ok { - if server.stamp.Proto != stamps.StampProtoTypeDNSCrypt { - dlog.Errorf("DNS anonymization is only supported with the DNSCrypt protocol - Connections to [%v] cannot be anonymized", server.name) + if server.stamp.Proto != stamps.StampProtoTypeDNSCrypt && server.stamp.Proto != stamps.StampProtoTypeODoHTarget { + dlog.Errorf("DNS anonymization is only supported with the DNSCrypt and ODoH protocols - Connections to [%v] cannot be anonymized", server.name) } else { dlog.Noticef("Anonymized DNS: routing [%v] via %v", server.name, via) } diff --git a/dnscrypt-proxy/odoh.go b/dnscrypt-proxy/odoh.go new file mode 100644 index 00000000..d05c2ac5 --- /dev/null +++ b/dnscrypt-proxy/odoh.go @@ -0,0 +1,186 @@ +package main + +import ( + "crypto/aes" + "crypto/cipher" + "crypto/subtle" + "encoding/binary" + "fmt" + + hpkecompact "github.com/jedisct1/go-hpke-compact" +) + +const ( + odohVersion = uint16(0xff06) +) + +type ODoHTarget struct { + suite *hpkecompact.Suite + keyID []byte + publicKey []byte +} + +func encodeLengthValue(b []byte) []byte { + lengthBuffer := make([]byte, 2) + binary.BigEndian.PutUint16(lengthBuffer, uint16(len(b))) + return append(lengthBuffer, b...) +} + +func parseODoHTargetConfig(config []byte) (ODoHTarget, error) { + if len(config) < 8 { + return ODoHTarget{}, fmt.Errorf("Malformed config") + } + kemID := binary.BigEndian.Uint16(config[0:2]) + kdfID := binary.BigEndian.Uint16(config[2:4]) + aeadID := binary.BigEndian.Uint16(config[4:6]) + publicKeyLength := binary.BigEndian.Uint16(config[6:8]) + if len(config[8:]) != int(publicKeyLength) { + return ODoHTarget{}, fmt.Errorf("Malformed config") + } + + suite, err := hpkecompact.NewSuite(hpkecompact.KemID(kemID), hpkecompact.KdfID(kdfID), hpkecompact.AeadID(aeadID)) + if err != nil { + return ODoHTarget{}, err + } + + publicKey := config[8:] + _, _, err = suite.NewClientContext(publicKey, []byte("odoh query"), nil) + if err != nil { + return ODoHTarget{}, err + } + + keyID, err := suite.Expand(suite.Extract(config, nil), []byte("odoh key id"), uint16(suite.Hash().Size())) + if err != nil { + return ODoHTarget{}, err + } + + return ODoHTarget{ + suite: suite, + publicKey: publicKey, + keyID: encodeLengthValue(keyID), + }, nil +} + +func parseODoHTargetConfigs(configs []byte) ([]ODoHTarget, error) { + length := binary.BigEndian.Uint16(configs) + if len(configs) != int(length)+2 { + return nil, fmt.Errorf("Malformed configs") + } + + targets := make([]ODoHTarget, 0) + offset := 2 + for { + if offset+4 > len(configs) { + return targets, nil + } + configVersion := binary.BigEndian.Uint16(configs[offset : offset+2]) + configLength := binary.BigEndian.Uint16(configs[offset+2 : offset+4]) + if configVersion == odohVersion { + target, err := parseODoHTargetConfig(configs[offset+4 : offset+4+int(configLength)]) + if err == nil { + targets = append(targets, target) + } + } + + offset = offset + int(configLength) + 4 + } +} + +type ODoHQuery struct { + suite *hpkecompact.Suite + ctx hpkecompact.ClientContext + odohPlaintext []byte + odohMessage []byte +} + +func (t ODoHTarget) encryptQuery(query []byte) (ODoHQuery, error) { + clientCtx, encryptedSharedSecret, err := t.suite.NewClientContext(t.publicKey, []byte("odoh query"), nil) + if err != nil { + return ODoHQuery{}, err + } + + odohPlaintext := make([]byte, 4+len(query)) + binary.BigEndian.PutUint16(odohPlaintext[0:2], uint16(len(query))) + copy(odohPlaintext[2:], query) + + aad := append([]byte{0x01}, t.keyID...) + ciphertext, err := clientCtx.EncryptToServer(odohPlaintext, aad) + + encryptedMessage := encodeLengthValue(append(encryptedSharedSecret, ciphertext...)) + odohMessage := append(append([]byte{0x01}, t.keyID...), encryptedMessage...) + + return ODoHQuery{ + suite: t.suite, + odohPlaintext: odohPlaintext, + odohMessage: odohMessage, + ctx: clientCtx, + }, nil +} + +func (q ODoHQuery) decryptResponse(response []byte) ([]byte, error) { + if len(response) < 3 { + return nil, fmt.Errorf("Malformed response") + } + + messageType := response[0] + if messageType != uint8(0x02) { + return nil, fmt.Errorf("Malformed response") + } + + responseNonceLength := binary.BigEndian.Uint16(response[1:3]) + if len(response) < 5+int(responseNonceLength) { + return nil, fmt.Errorf("Malformed response") + } + + responseNonceEnc := response[1 : 3+responseNonceLength] + + secret, err := q.ctx.Export([]byte("odoh response"), q.suite.KeyBytes) + if err != nil { + return nil, err + } + + salt := append(q.odohPlaintext, responseNonceEnc...) + prk := q.suite.Extract(secret, salt) + key, err := q.suite.Expand(prk, []byte("odoh key"), q.suite.KeyBytes) + if err != nil { + return nil, err + } + nonce, err := q.suite.Expand(prk, []byte("odoh nonce"), q.suite.NonceBytes) + if err != nil { + return nil, err + } + + block, err := aes.NewCipher(key) + if err != nil { + return nil, err + } + + aesgcm, err := cipher.NewGCM(block) + if err != nil { + return nil, err + } + + ctLength := binary.BigEndian.Uint16(response[3+int(responseNonceLength) : 5+int(responseNonceLength)]) + if int(ctLength) != len(response[5+int(responseNonceLength):]) { + return nil, fmt.Errorf("Malformed response") + } + + ct := response[5+int(responseNonceLength):] + aad := response[0 : 3+int(responseNonceLength)] + + responsePlaintext, err := aesgcm.Open(nil, nonce, ct, aad) + if err != nil { + return nil, err + } + + responseLength := binary.BigEndian.Uint16(responsePlaintext[0:2]) + valid := 1 + for i := 4 + int(responseLength); i < len(responsePlaintext); i++ { + valid = valid & subtle.ConstantTimeByteEq(response[i], 0x00) + } + if valid != 1 { + return nil, fmt.Errorf("Malformed response") + } + + return responsePlaintext[2 : 2+int(responseLength)], nil +} diff --git a/dnscrypt-proxy/proxy.go b/dnscrypt-proxy/proxy.go index ec578808..e05727d4 100644 --- a/dnscrypt-proxy/proxy.go +++ b/dnscrypt-proxy/proxy.go @@ -658,7 +658,7 @@ func (proxy *Proxy) processIncomingQuery(clientProto string, serverProto string, tid := TransactionID(query) SetTransactionID(query, 0) serverInfo.noticeBegin(proxy) - serverResponse, tls, _, err := proxy.xTransport.DoHQuery(serverInfo.useGet, serverInfo.URL, query, proxy.timeout) + serverResponse, _, tls, _, err := proxy.xTransport.DoHQuery(serverInfo.useGet, serverInfo.URL, query, proxy.timeout) SetTransactionID(query, tid) if err == nil || tls == nil || !tls.HandshakeComplete { response = nil @@ -678,6 +678,42 @@ func (proxy *Proxy) processIncomingQuery(clientProto string, serverProto string, if len(response) >= MinDNSPacketSize { SetTransactionID(response, tid) } + } else if serverInfo.Proto == stamps.StampProtoTypeODoHTarget { + tid := TransactionID(query) + target := serverInfo.odohTargets[0] + odohQuery, err := target.encryptQuery(query) + if err != nil { + dlog.Error("Failed to encrypt query") + response = nil + } else { + targetURL := serverInfo.URL + if serverInfo.Relay != nil && serverInfo.Relay.ODoH != nil { + targetURL = serverInfo.Relay.ODoH.url + } + responseBody, responseCode, _, _, err := proxy.xTransport.ObliviousDoHQuery(targetURL, odohQuery.odohMessage, proxy.timeout) + if err == nil && len(responseBody) > 0 && responseCode == 200 { + response, err = odohQuery.decryptResponse(responseBody) + if err != nil { + dlog.Error("Failed to decrypt response") + response = nil + } + } else if responseCode == 401 { + dlog.Notice("Forcing key update") + go proxy.serversInfo.refresh(proxy) + response = nil + } else { + dlog.Error("Failed to receive successful response") + } + } + + if len(response) >= MinDNSPacketSize { + SetTransactionID(response, tid) + } else if response == nil { + pluginsState.returnCode = PluginsReturnCodeNetworkError + pluginsState.ApplyLoggingPlugins(&proxy.pluginsGlobals) + serverInfo.noticeFailure(proxy) + return + } } else { dlog.Fatal("Unsupported protocol") } diff --git a/dnscrypt-proxy/serversInfo.go b/dnscrypt-proxy/serversInfo.go index 6f631fd8..d6c4aa24 100644 --- a/dnscrypt-proxy/serversInfo.go +++ b/dnscrypt-proxy/serversInfo.go @@ -6,9 +6,11 @@ import ( "encoding/hex" "errors" "fmt" + "io/ioutil" "math/bits" "math/rand" "net" + "net/http" "net/url" "sort" "strings" @@ -61,6 +63,7 @@ type ServerInfo struct { knownBugs ServerBugs Proto stamps.StampProtoType useGet bool + odohTargets []ODoHTarget } type LBStrategy interface { @@ -104,7 +107,9 @@ type DNSCryptRelay struct { RelayTCPAddr *net.TCPAddr } -type ODoHRelay struct{} +type ODoHRelay struct { + url *url.URL +} type Relay struct { Proto stamps.StampProtoType @@ -277,6 +282,8 @@ func fetchServerInfo(proxy *Proxy, name string, stamp stamps.ServerStamp, isNew return fetchDNSCryptServerInfo(proxy, name, stamp, isNew) } else if stamp.Proto == stamps.StampProtoTypeDoH { return fetchDoHServerInfo(proxy, name, stamp, isNew) + } else if stamp.Proto == stamps.StampProtoTypeODoHTarget { + return fetchODoHTargetInfo(proxy, name, stamp, isNew) } return ServerInfo{}, fmt.Errorf("Unsupported protocol for [%s]: [%s]", name, stamp.Proto.String()) } @@ -412,7 +419,26 @@ func route(proxy *Proxy, name string) (*Relay, error) { dlog.Noticef("Anonymizing queries for [%v] via [%v]", name, relayName) return &Relay{Proto: stamps.StampProtoTypeDNSCryptRelay, Dnscrypt: &DNSCryptRelay{RelayUDPAddr: relayUDPAddr, RelayTCPAddr: relayTCPAddr}}, nil case stamps.StampProtoTypeODoHRelay: - return &Relay{Proto: stamps.StampProtoTypeODoHRelay, ODoH: &ODoHRelay{}}, nil + target, err := url.Parse("https://" + relayCandidateStamp.ProviderName + "/" + relayCandidateStamp.Path) + if err != nil { + return nil, err + } + + for _, server := range proxy.registeredServers { + if server.name == name && server.stamp.Proto == stamps.StampProtoTypeODoHTarget { + qs := target.Query() + qs.Add("targethost", server.stamp.ProviderName) + qs.Add("targetpath", server.stamp.Path) + target2 := *target + target2.RawQuery = qs.Encode() + target = &target2 + break + } + } + + return &Relay{Proto: stamps.StampProtoTypeODoHRelay, ODoH: &ODoHRelay{ + url: target, + }}, nil } return nil, fmt.Errorf("Invalid relay set for server [%v]", name) } @@ -550,15 +576,15 @@ func fetchDoHServerInfo(proxy *Proxy, name string, stamp stamps.ServerStamp, isN proxy.xTransport.rebuildTransport() } useGet := false - if _, _, _, err := proxy.xTransport.DoHQuery(useGet, url, body, proxy.timeout); err != nil { + if _, _, _, _, err := proxy.xTransport.DoHQuery(useGet, url, body, proxy.timeout); err != nil { useGet = true - if _, _, _, err := proxy.xTransport.DoHQuery(useGet, url, body, proxy.timeout); err != nil { + if _, _, _, _, err := proxy.xTransport.DoHQuery(useGet, url, body, proxy.timeout); err != nil { return ServerInfo{}, err } dlog.Debugf("Server [%s] doesn't appear to support POST; falling back to GET requests", name) } body = dohNXTestPacket(0xcafe) - serverResponse, tls, rtt, err := proxy.xTransport.DoHQuery(useGet, url, body, proxy.timeout) + serverResponse, _, tls, rtt, err := proxy.xTransport.DoHQuery(useGet, url, body, proxy.timeout) if err != nil { dlog.Infof("[%s] [%s]: %v", name, url, err) return ServerInfo{}, err @@ -632,6 +658,62 @@ func fetchDoHServerInfo(proxy *Proxy, name string, stamp stamps.ServerStamp, isN }, nil } +func fetchTargetConfigsFromWellKnown(url string) ([]ODoHTarget, error) { + req, err := http.NewRequest(http.MethodGet, url, nil) + if err != nil { + return nil, err + } + + client := http.Client{} + resp, err := client.Do(req) + if err != nil { + return nil, err + } + bodyBytes, err := ioutil.ReadAll(resp.Body) + defer resp.Body.Close() + if err != nil { + return nil, err + } + + return parseODoHTargetConfigs(bodyBytes) +} + +func fetchODoHTargetInfo(proxy *Proxy, name string, stamp stamps.ServerStamp, isNew bool) (ServerInfo, error) { + odohTargets, err := fetchTargetConfigsFromWellKnown("https://" + stamp.ProviderName + "/.well-known/odohconfigs") + if err != nil || len(odohTargets) == 0 { + return ServerInfo{}, fmt.Errorf("[%s] does not have an Oblivious DoH configuration", name) + } + + relay, err := route(proxy, name) + if err != nil { + return ServerInfo{}, err + } + if relay == nil || relay.ODoH == nil { + relay = nil + } + + if relay == nil { + dlog.Notice("Relay is empty for " + name) + } + + url := &url.URL{ + Scheme: "https", + Host: stamp.ProviderName, + Path: stamp.Path, + } + + return ServerInfo{ + Proto: stamps.StampProtoTypeODoHTarget, + Name: name, + Timeout: proxy.timeout, + URL: url, + HostName: stamp.ProviderName, + useGet: false, + odohTargets: odohTargets, + Relay: relay, + }, nil +} + func (serverInfo *ServerInfo) noticeFailure(proxy *Proxy) { proxy.serversInfo.Lock() serverInfo.rtt.Add(float64(proxy.timeout.Nanoseconds() / 1000000)) diff --git a/dnscrypt-proxy/sources.go b/dnscrypt-proxy/sources.go index c31f7379..a361001c 100644 --- a/dnscrypt-proxy/sources.go +++ b/dnscrypt-proxy/sources.go @@ -132,7 +132,7 @@ func (source *Source) parseURLs(urls []string) { } func fetchFromURL(xTransport *XTransport, u *url.URL) (bin []byte, err error) { - bin, _, _, err = xTransport.Get(u, "", DefaultTimeout) + bin, _, _, _, err = xTransport.Get(u, "", DefaultTimeout) return bin, err } diff --git a/dnscrypt-proxy/xtransport.go b/dnscrypt-proxy/xtransport.go index 183d3de2..85c663c7 100644 --- a/dnscrypt-proxy/xtransport.go +++ b/dnscrypt-proxy/xtransport.go @@ -338,7 +338,7 @@ func (xTransport *XTransport) resolveAndUpdateCache(host string) error { return nil } -func (xTransport *XTransport) Fetch(method string, url *url.URL, accept string, contentType string, body *[]byte, timeout time.Duration) ([]byte, *tls.ConnectionState, time.Duration, error) { +func (xTransport *XTransport) Fetch(method string, url *url.URL, accept string, contentType string, body *[]byte, timeout time.Duration) ([]byte, int, *tls.ConnectionState, time.Duration, error) { if timeout <= 0 { timeout = xTransport.timeout } @@ -361,11 +361,11 @@ func (xTransport *XTransport) Fetch(method string, url *url.URL, accept string, } host, _ := ExtractHostAndPort(url.Host, 0) if xTransport.proxyDialer == nil && strings.HasSuffix(host, ".onion") { - return nil, nil, 0, errors.New("Onion service is not reachable without Tor") + return nil, 0, nil, 0, errors.New("Onion service is not reachable without Tor") } if err := xTransport.resolveAndUpdateCache(host); err != nil { dlog.Errorf("Unable to resolve [%v] - Make sure that the system resolver works, or that `bootstrap_resolvers` has been set to resolvers that can be reached", host) - return nil, nil, 0, err + return nil, 0, nil, 0, err } req := &http.Request{ Method: method, @@ -396,26 +396,26 @@ func (xTransport *XTransport) Fetch(method string, url *url.URL, accept string, xTransport.tlsCipherSuite = nil xTransport.rebuildTransport() } - return nil, nil, 0, err + return nil, 0, nil, 0, err } tls := resp.TLS bin, err := ioutil.ReadAll(io.LimitReader(resp.Body, MaxHTTPBodyLength)) if err != nil { - return nil, tls, 0, err + return nil, resp.StatusCode, tls, 0, err } resp.Body.Close() - return bin, tls, rtt, err + return bin, resp.StatusCode, tls, rtt, err } -func (xTransport *XTransport) Get(url *url.URL, accept string, timeout time.Duration) ([]byte, *tls.ConnectionState, time.Duration, error) { +func (xTransport *XTransport) Get(url *url.URL, accept string, timeout time.Duration) ([]byte, int, *tls.ConnectionState, time.Duration, error) { return xTransport.Fetch("GET", url, accept, "", nil, timeout) } -func (xTransport *XTransport) Post(url *url.URL, accept string, contentType string, body *[]byte, timeout time.Duration) ([]byte, *tls.ConnectionState, time.Duration, error) { +func (xTransport *XTransport) Post(url *url.URL, accept string, contentType string, body *[]byte, timeout time.Duration) ([]byte, int, *tls.ConnectionState, time.Duration, error) { return xTransport.Fetch("POST", url, accept, contentType, body, timeout) } -func (xTransport *XTransport) DoHQuery(useGet bool, url *url.URL, body []byte, timeout time.Duration) ([]byte, *tls.ConnectionState, time.Duration, error) { +func (xTransport *XTransport) DoHQuery(useGet bool, url *url.URL, body []byte, timeout time.Duration) ([]byte, int, *tls.ConnectionState, time.Duration, error) { dataType := "application/dns-message" if useGet { qs := url.Query() @@ -427,3 +427,8 @@ func (xTransport *XTransport) DoHQuery(useGet bool, url *url.URL, body []byte, t } return xTransport.Post(url, dataType, dataType, &body, timeout) } + +func (xTransport *XTransport) ObliviousDoHQuery(url *url.URL, body []byte, timeout time.Duration) ([]byte, int, *tls.ConnectionState, time.Duration, error) { + dataType := "application/oblivious-dns-message" + return xTransport.Post(url, dataType, dataType, &body, timeout) +} diff --git a/go.mod b/go.mod index fa7ea0e1..8162ee24 100644 --- a/go.mod +++ b/go.mod @@ -5,6 +5,8 @@ go 1.16 require ( github.com/BurntSushi/toml v0.3.1 github.com/VividCortex/ewma v1.1.1 + github.com/cisco/go-hpke v0.0.0-20210215210317-01c430f1f302 // indirect + github.com/cloudflare/odoh-go v0.1.6 // indirect github.com/coreos/go-systemd v0.0.0-20191104093116-d3cd4ed1dbcf github.com/dchest/safefile v0.0.0-20151022103144-855e8d98f185 github.com/hashicorp/go-immutable-radix v1.3.0 @@ -13,6 +15,7 @@ require ( github.com/jedisct1/dlog v0.0.0-20210101122416-354ffe815216 github.com/jedisct1/go-clocksmith v0.0.0-20210101121932-da382b963868 github.com/jedisct1/go-dnsstamps v0.0.0-20210101121956-16fbdadcf8f5 + github.com/jedisct1/go-hpke-compact v0.0.0-20210329192501-7ceabaabca65 github.com/jedisct1/go-minisign v0.0.0-20210106175330-e54e81d562c7 github.com/jedisct1/xsecretbox v0.0.0-20210102102453-4ecb2081017a github.com/k-sone/critbitgo v1.4.0 diff --git a/go.sum b/go.sum index 329c0df0..d386227f 100644 --- a/go.sum +++ b/go.sum @@ -1,4 +1,6 @@ cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= +git.schwanenlied.me/yawning/x448.git v0.0.0-20170617130356-01b048fb03d6 h1:w8IZgCntCe0RuBJp+dENSMwEBl/k8saTgJ5hPca5IWw= +git.schwanenlied.me/yawning/x448.git v0.0.0-20170617130356-01b048fb03d6/go.mod h1:wQaGCqEu44ykB17jZHCevrgSVl3KJnwQBObUtrKU4uU= github.com/BurntSushi/toml v0.3.1 h1:WXkYYl6Yr3qBf1K79EBnL4mak0OimBfB0XUf9Vl28OQ= github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= github.com/VividCortex/ewma v1.1.1 h1:MnEK4VOv6n0RSY4vtRe3h11qjxL3+t0B8yOL8iMXdcM= @@ -8,9 +10,18 @@ github.com/aead/chacha20 v0.0.0-20180709150244-8b13a72661da/go.mod h1:eHEWzANqSi github.com/aead/poly1305 v0.0.0-20180717145839-3fee0db0b635 h1:52m0LGchQBBVqJRyYYufQuIbVqRawmubW3OFGqK1ekw= github.com/aead/poly1305 v0.0.0-20180717145839-3fee0db0b635/go.mod h1:lmLxL+FV291OopO93Bwf9fQLQeLyt33VJRUg5VJ30us= github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= +github.com/cisco/go-hpke v0.0.0-20210215210317-01c430f1f302 h1:unAbn7dpE8eeUfWRaOPl1qTfffhIcCNuKQuECGNGWtk= +github.com/cisco/go-hpke v0.0.0-20210215210317-01c430f1f302/go.mod h1:RSsoIHRMBe69FbF/fIbmWYa3rrC6vuPyC0MbNUpel3Q= +github.com/cisco/go-tls-syntax v0.0.0-20200617162716-46b0cfb76b9b h1:Ves2turKTX7zruivAcUOQg155xggcbv3suVdbKCBQNM= +github.com/cisco/go-tls-syntax v0.0.0-20200617162716-46b0cfb76b9b/go.mod h1:0AZAV7lYvynZQ5ErHlGMKH+4QYMyNCFd+AiL9MlrCYA= github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw= +github.com/cloudflare/circl v1.0.0 h1:64b6pyfCFbYm623ncIkYGNZaOcmIbyd+CjyMi2L9vdI= +github.com/cloudflare/circl v1.0.0/go.mod h1:MhjB3NEEhJbTOdLLq964NIUisXDxaE1WkQPUxtgZXiY= +github.com/cloudflare/odoh-go v0.1.6 h1:siTTv/pBXztJORDgVNDAYKdTboenwWFMDD1B9YQHkag= +github.com/cloudflare/odoh-go v0.1.6/go.mod h1:J3Doz827YDYvz4hEmJU6q45hRFOqxUBL6NRUuEfjMxA= github.com/coreos/go-systemd v0.0.0-20191104093116-d3cd4ed1dbcf h1:iW4rZ826su+pqaw19uhpSCzhj44qo35pNgKFGqzDKkU= github.com/coreos/go-systemd v0.0.0-20191104093116-d3cd4ed1dbcf/go.mod h1:F5haX7vjVVG0kc13fIWeqUViNPyEJxv/OmvnBo0Yme4= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/dchest/safefile v0.0.0-20151022103144-855e8d98f185 h1:3T8ZyTDp5QxTx3NU48JVb2u+75xc040fofcBaN+6jPA= @@ -54,6 +65,10 @@ github.com/jedisct1/go-clocksmith v0.0.0-20210101121932-da382b963868 h1:QZ79mRbN github.com/jedisct1/go-clocksmith v0.0.0-20210101121932-da382b963868/go.mod h1:SAINchklztk2jcLWJ4bpNF4KnwDUSUTX+cJbspWC2Rw= github.com/jedisct1/go-dnsstamps v0.0.0-20210101121956-16fbdadcf8f5 h1:FnAupK0Gm6PJZDhI5sGkbNZQ7DT4+tG8opjmoXfBu/o= github.com/jedisct1/go-dnsstamps v0.0.0-20210101121956-16fbdadcf8f5/go.mod h1:t35n6rsPE3nD3RXbc5hI5Ax1ci/SSYTpx0BdMXh/1aE= +github.com/jedisct1/go-hpke-compact v0.0.0-20210216141532-42fe67ec23b2 h1:5pGP8stsRZ3SHicLH189p/RroaZ43+m8P4+CBu3maRE= +github.com/jedisct1/go-hpke-compact v0.0.0-20210216141532-42fe67ec23b2/go.mod h1:Sc4df3OZCQgMxgbCVYLnIA16SRDKrGcSFcUqeu67MGs= +github.com/jedisct1/go-hpke-compact v0.0.0-20210329192501-7ceabaabca65 h1:qxey1Jfre+udaWyQI+lS3qPGuJDzmkBaHDIhmL9qef8= +github.com/jedisct1/go-hpke-compact v0.0.0-20210329192501-7ceabaabca65/go.mod h1:fFxJHJ4XTptLoMkie7bC9zjyKSKC8yycljJtKXGaAAI= github.com/jedisct1/go-minisign v0.0.0-20210106175330-e54e81d562c7 h1:qrPDNqqT76vs8oWL6Z1/D6hKvbXULvlD7FdNVTIUI8A= github.com/jedisct1/go-minisign v0.0.0-20210106175330-e54e81d562c7/go.mod h1:oPTyITpvr7hPx/9w76gWrgbZwbb+7gZ9/On8hFc+LNE= github.com/jedisct1/xsecretbox v0.0.0-20210102102453-4ecb2081017a h1:ptZ+a2DzulAgKEfMuqH2Ckfv6oio5BX14fRG5aShvXw= @@ -70,6 +85,7 @@ github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/powerman/check v1.3.0/go.mod h1:k/8NCUQwepaKJKctBBKjQo84jvGEvKiumD9pDl87RB0= github.com/powerman/check v1.3.1 h1:86xiEjMNn0K4b3bhQsLfrLqJb0DWLXHBK3wBd5PoJH4= github.com/powerman/check v1.3.1/go.mod h1:k/8NCUQwepaKJKctBBKjQo84jvGEvKiumD9pDl87RB0= github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= @@ -78,8 +94,12 @@ github.com/smartystreets/assertions v1.0.1 h1:voD4ITNjPL5jjBfgR/r8fPIIBrliWrWHei github.com/smartystreets/assertions v1.0.1/go.mod h1:kHHU4qYBaI3q23Pp3VPrmWhuIUrLW/7eUrw0BU5VaoM= github.com/smartystreets/goconvey v1.6.4 h1:fv0U8FUIMPNf1L9lnHLvLhgicrIVChEkdzIKYqbNC9s= github.com/smartystreets/goconvey v1.6.4/go.mod h1:syvi0/a8iFYH4r/RixwvyeAJjdLS9QV7WQ/tjFTllLA= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20200820211705-5c72a883971a/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/crypto v0.0.0-20201221181555-eec23a3978ad/go.mod h1:jdWPYTVW3xRLrWPugEBEK3UY2ZEsg3UU495nc5E+M+I= +golang.org/x/crypto v0.0.0-20210220033148-5ea612d1eb83/go.mod h1:jdWPYTVW3xRLrWPugEBEK3UY2ZEsg3UU495nc5E+M+I= golang.org/x/crypto v0.0.0-20210322153248-0c34fe9e7dc2 h1:It14KIkyBFYkHkwZ7k45minvA9aorojkyjGk9KJ5B/w= golang.org/x/crypto v0.0.0-20210322153248-0c34fe9e7dc2/go.mod h1:T9bdIzuCu7OtxOm1hfPfRQxPLYneinmdGuTeoZ9dtd4= golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= @@ -102,12 +122,15 @@ golang.org/x/sync v0.0.0-20210220032951-036812b2e83c h1:5KslGYwFpkhGh+Q16bwMP3cO golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190529164535-6a60838ec259/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190602015325-4c4f7f33c9ed/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20191026070338-33540a1f6037/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200124204421-9fbb57f87de9/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20201015000850-e3ed0017c211/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20201231184435-2d18734c6014/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210220050731-9a76102bfb43/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210303074136-134d130e1a04/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210315160823-c6e025ad8005/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210320140829-1e4c9ba3b0c4 h1:EZ2mChiOa8udjfp6rRmswTbtZN/QzUQp4ptM4rnjHvc= @@ -149,5 +172,6 @@ gopkg.in/natefinch/lumberjack.v2 v2.0.0 h1:1Lc07Kr7qY4U2YPouBjpCLxpiyxIVoxqXgkXL gopkg.in/natefinch/lumberjack.v2 v2.0.0/go.mod h1:l0ndWWf7gzL7RNwBG7wST/UCcT4T24xpD6X8LsfU/+k= gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= diff --git a/vendor/github.com/jedisct1/go-hpke-compact/.gitignore b/vendor/github.com/jedisct1/go-hpke-compact/.gitignore new file mode 100644 index 00000000..cfe0bb0f --- /dev/null +++ b/vendor/github.com/jedisct1/go-hpke-compact/.gitignore @@ -0,0 +1,2 @@ +*~ +go.sum diff --git a/vendor/github.com/jedisct1/go-hpke-compact/LICENSE b/vendor/github.com/jedisct1/go-hpke-compact/LICENSE new file mode 100644 index 00000000..729f611a --- /dev/null +++ b/vendor/github.com/jedisct1/go-hpke-compact/LICENSE @@ -0,0 +1,18 @@ +/* + * ISC License + * + * Copyright (c) 2020-2021 + * Frank Denis + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ diff --git a/vendor/github.com/jedisct1/go-hpke-compact/README.md b/vendor/github.com/jedisct1/go-hpke-compact/README.md new file mode 100644 index 00000000..ca2c52ec --- /dev/null +++ b/vendor/github.com/jedisct1/go-hpke-compact/README.md @@ -0,0 +1,129 @@ +[![CI status](https://github.com/jedisct1/go-hpke-compact/workflows/Go/badge.svg)](https://github.com/jedisct1/go-hpke-compact/actions) +[![Go Reference](https://pkg.go.dev/badge/github.com/jedisct1/go-hpke-compact.svg)](https://pkg.go.dev/github.com/jedisct1/go-hpke-compact) + +# ![HPKE-Compact](.assets/logo.png) + +# A compact HPKE implemention for Go + +`hpkecompact` is a small implementation of the [Hybrid Public Key Encryption](https://cfrg.github.io/draft-irtf-cfrg-hpke/draft-irtf-cfrg-hpke.html) (HPKE) draft. + +It fits in a single file and only uses the Go standard library and `x/crypto`. + +Suites are currently limited to `X25519-HKDF-SHA256` / `HKDF-SHA-256` / `{AES-{128,256}-GCM, CHACHA20-POLY1305}`; these are very likely to be the most commonly deployed ones for a forseable future. + +## Usage + +### Suite instantiation + +```go +suite, err := NewSuite(KemX25519HkdfSha256, KdfHkdfSha256, AeadAes128Gcm) +``` + +### Key pair creation + +```go +serverKp, err := ctx.GenerateKeyPair() +``` + +### Client: creation and encapsulation of the shared secret + +A _client_ initiates a connexion by sending an encrypted secret; a _server_ accepts an encrypted secret from a client, and decrypts it, so that both parties can eventually agree on a shared secret. + +```go +clientCtx, encryptedSharedSecret, err := + suite.NewClientContext(serverKp.PublicKey, []byte("application name"), nil) +``` + +* `encryptedSharedSecret` needs to be sent to the server. +* `clientCtx` can be used to encrypt/decrypt messages exchanged with the server. +* The last parameter is an optional pre-shared key (`Psk` type). + +To improve misuse resistance, this implementation uses distinct types for the client and the server context: `ClientContext` for the client, and `ServerContext` for the server. + +### Server: decapsulation of the shared secret + +```go +serverCtx, err := suite.NewServerContext(encryptedSharedSecret, + serverKp, []byte("application name"), nil) +``` + +* `serverCtx` can be used to encrypt/decrypt messages exchanged with the client +* The last parameter is an optional pre-shared key (`Psk` type). + +### Encryption of a message from the client to the server + +A message can be encrypted by the client for the server: + +```go +ciphertext, err := clientCtx.EncryptToServer([]byte("message"), nil) +``` + +Nonces are automatically incremented, so it is safe to call this function multiple times within the same context. + +Second parameter is optional associated data. + +### Decryption of a ciphertext received by the server + +The server can decrypt a ciphertext sent by the client: + +```go +decrypted, err := serverCtx.DecryptFromClient(ciphertext, nil) +``` + +Second parameter is optional associated data. + +### Encryption of a message from the server to the client + +A message can also be encrypted by the server for the client: + +```go +ciphertext, err := clientCtx.EncryptToClient([]byte("response"), nil) +``` + +Nonces are automatically incremented, so it is safe to call this function multiple times within the same context. + +Second parameter is optional associated data. + +### Decryption of a ciphertext received by the client + +The client can decrypt a ciphertext sent by the server: + +```go +decrypted, err := serverCtx.DecryptFromServer(ciphertext, nil) +``` + +Second parameter is optional associated data. + +## Authenticated modes + +Authenticated modes, with or without a PSK are supported. + +Just replace `NewClientContext()` with `NewAuthenticatedClientContext()` and `NewServerContext()` with `NewAuthenticatedServerContext()` for authentication. + +```go +clientKp, err := suite.GenerateKeyPair() +serverKp, err := suite.GenerateKeyPair() + +clientCtx, encryptedSharedSecret, err := suite.NewAuthenticatedClientContext( + clientKp, serverKp.PublicKey, []byte("app"), psk) + +serverCtx, err := suite.NewAuthenticatedServerContext( + clientKp.PublicKey, encryptedSharedSecret, serverKp, []byte("app"), psk) +``` + +### Exporter secret + +The exporter secret can be obtained with the `ExportedSecret()` function available both in the `ServerContext` and `ClientContext` structures: + +```go +exporter := serverCtx.ExporterSecret() +``` + +### Key derivation + +```go +secret1, err := clientCtx.Export("description 1") +secret2, err := serverCtx.Export("description 2"); +``` + +## That's it! \ No newline at end of file diff --git a/vendor/github.com/jedisct1/go-hpke-compact/go.mod b/vendor/github.com/jedisct1/go-hpke-compact/go.mod new file mode 100644 index 00000000..af15fdc6 --- /dev/null +++ b/vendor/github.com/jedisct1/go-hpke-compact/go.mod @@ -0,0 +1,8 @@ +module github.com/jedisct1/go-hpke-compact + +go 1.16 + +require ( + github.com/powerman/check v1.3.1 + golang.org/x/crypto v0.0.0-20210322153248-0c34fe9e7dc2 +) diff --git a/vendor/github.com/jedisct1/go-hpke-compact/hpke.go b/vendor/github.com/jedisct1/go-hpke-compact/hpke.go new file mode 100644 index 00000000..959ce8fa --- /dev/null +++ b/vendor/github.com/jedisct1/go-hpke-compact/hpke.go @@ -0,0 +1,656 @@ +package hpkecompact + +import ( + "crypto/aes" + "crypto/cipher" + crypto_rand "crypto/rand" + "crypto/sha256" + "encoding/binary" + "errors" + "hash" + + "golang.org/x/crypto/chacha20poly1305" + "golang.org/x/crypto/curve25519" + "golang.org/x/crypto/hkdf" +) + +var hpkeVersion = [7]byte{'H', 'P', 'K', 'E', '-', 'v', '1'} + +// Mode - Mode +type Mode byte + +const ( + // ModeBase - Base mode + ModeBase Mode = 0x00 + // ModePsk - PSK mode + ModePsk Mode = 0x01 + // ModeAuth - Auth mode + ModeAuth Mode = 0x02 + // ModeAuthPsk - PSK Auth mode + ModeAuthPsk Mode = 0x03 +) + +// KemID - KEM ID +type KemID uint16 + +const ( + // KemX25519HkdfSha256 - X25519 with HKDF-SHA256 + KemX25519HkdfSha256 KemID = 0x0020 +) + +// KdfID - KDF ID +type KdfID uint16 + +const ( + // KdfHkdfSha256 - HKDF-SHA256 + KdfHkdfSha256 KdfID = 0x0001 +) + +// AeadID - AEAD ID +type AeadID uint16 + +const ( + // AeadAes128Gcm - AES128-GCM + AeadAes128Gcm AeadID = 0x0001 + // AeadAes256Gcm - AES256-GCM + AeadAes256Gcm AeadID = 0x0002 + // AeadChaCha20Poly1305 - ChaCha20-Poly1305 + AeadChaCha20Poly1305 AeadID = 0x0003 + // AeadExportOnly - Don't use the HPKE encryption API + AeadExportOnly AeadID = 0xffff +) + +// Psk - Pre-shared key and key ID +type Psk struct { + Key []byte + ID []byte +} + +// KeyPair - A key pair (packed as a byte string) +type KeyPair struct { + // PublicKey - Public key + PublicKey []byte + // SecretKey - Secret key + SecretKey []byte +} + +type aeadState struct { + aead aeadImpl + baseNonce []byte + counter []byte +} + +// Suite - HPKE suite +type Suite struct { + SuiteIDContext [10]byte + SuiteIDKEM [5]byte + Hash func() hash.Hash + PrkBytes uint16 + KeyBytes uint16 + NonceBytes uint16 + KemHashBytes uint16 + AeadID AeadID +} + +// NewSuite - Create a new suite from its components +func NewSuite(kemID KemID, kdfID KdfID, aeadID AeadID) (*Suite, error) { + if kemID != KemX25519HkdfSha256 || kdfID != KdfHkdfSha256 { + return nil, errors.New("unimplemented suite") + } + hash := sha256.New + nonceBytes := uint16(12) + var keyBytes uint16 + switch aeadID { + case AeadAes128Gcm: + keyBytes = 16 + case AeadAes256Gcm: + keyBytes = 32 + case AeadChaCha20Poly1305: + keyBytes = 32 + case AeadExportOnly: + keyBytes = 0 + nonceBytes = 0 + default: + return nil, errors.New("unimplemented suite") + } + var prkBytes uint16 + switch kdfID { + case KdfHkdfSha256: + prkBytes = 32 + default: + return nil, errors.New("unimplemented suite") + } + var kemHashBytes uint16 + switch kemID { + case KemX25519HkdfSha256: + kemHashBytes = 32 + default: + return nil, errors.New("unimplemented suite") + } + suite := Suite{ + SuiteIDContext: getSuiteIDContext(kemID, kdfID, aeadID), + SuiteIDKEM: getSuiteIDKEM(kemID), + Hash: hash, + KeyBytes: keyBytes, + PrkBytes: prkBytes, + NonceBytes: nonceBytes, + KemHashBytes: kemHashBytes, + AeadID: aeadID, + } + return &suite, nil +} + +func getSuiteIDContext(kemID KemID, kdfID KdfID, aeadID AeadID) [10]byte { + suiteIDContext := [10]byte{'H', 'P', 'K', 'E', 0, 0, 0, 0, 0, 0} + binary.BigEndian.PutUint16(suiteIDContext[4:6], uint16(kemID)) + binary.BigEndian.PutUint16(suiteIDContext[6:8], uint16(kdfID)) + binary.BigEndian.PutUint16(suiteIDContext[8:10], uint16(aeadID)) + return suiteIDContext +} + +func getSuiteIDKEM(kemID KemID) [5]byte { + suiteIDKEM := [5]byte{'K', 'E', 'M', 0, 0} + binary.BigEndian.PutUint16(suiteIDKEM[3:5], uint16(kemID)) + return suiteIDKEM +} + +// Extract - KDF-Extract +func (suite *Suite) Extract(secret []byte, salt []byte) []byte { + return hkdf.Extract(suite.Hash, secret, salt) +} + +// Expand - KDF-Expand +func (suite *Suite) Expand(prk []byte, info []byte, length uint16) ([]byte, error) { + reader := hkdf.Expand(suite.Hash, prk, info) + out := make([]byte, length) + if readNb, err := reader.Read(out); err != nil { + return nil, err + } else if readNb != int(length) { + return nil, errors.New("unable to expand") + } + return out, nil +} + +func (suite *Suite) labeledExtract(suiteID []byte, salt []byte, label string, ikm []byte) []byte { + secret := append(hpkeVersion[:], suiteID...) + secret = append(secret, []byte(label)...) + secret = append(secret, ikm...) + return suite.Extract(secret, salt) +} + +func (suite *Suite) labeledExpand(suiteID []byte, prk []byte, label string, info []byte, length uint16) ([]byte, error) { + labeledInfo := []byte{0, 0} + binary.BigEndian.PutUint16(labeledInfo, length) + labeledInfo = append(labeledInfo, hpkeVersion[:]...) + labeledInfo = append(labeledInfo, suiteID...) + labeledInfo = append(labeledInfo, []byte(label)...) + labeledInfo = append(labeledInfo, info...) + return suite.Expand(prk, labeledInfo, length) +} + +func (suite *Suite) newAeadState(key []uint8, baseNonce []uint8) (*aeadState, error) { + var aead aeadImpl + var err error + switch suite.AeadID { + case AeadAes128Gcm, AeadAes256Gcm: + aead, err = newAesAead(key) + case AeadChaCha20Poly1305: + aead, err = newChaChaPolyAead(key) + default: + return nil, errors.New("unimplemented AEAD") + } + if err != nil { + return nil, err + } + return &aeadState{aead: aead, baseNonce: baseNonce, counter: make([]byte, suite.NonceBytes)}, nil +} + +func verifyPskInputs(mode Mode, psk *Psk) error { + if psk != nil && ((len(psk.Key) == 0) != (len(psk.ID) == 0)) { + return errors.New("a PSK and a PSK ID need both to be set") + } + if psk != nil { + if mode == ModeBase || mode == ModeAuth { + return errors.New("PSK input provided when not needed") + } + } else if mode == ModePsk || mode == ModeAuthPsk { + return errors.New("PSK required for that mode") + } + return nil +} + +// innerContext - An AEAD context +type innerContext struct { + suite *Suite + exporterSecret []byte + outboundState *aeadState + inboundState *aeadState +} + +func (inner *innerContext) export(exporterContext []byte, length uint16) ([]byte, error) { + return inner.suite.labeledExpand(inner.suite.SuiteIDContext[:], inner.exporterSecret, "sec", exporterContext, length) +} + +// ClientContext - A client encryption context +type ClientContext struct { + inner innerContext +} + +// ServerContext - A server encryption context +type ServerContext struct { + inner innerContext +} + +func (suite *Suite) keySchedule(mode Mode, dhSecret []byte, info []byte, psk *Psk) (innerContext, error) { + if err := verifyPskInputs(mode, psk); err != nil { + return innerContext{}, err + } + if psk == nil { + psk = &Psk{} + } + pskIDHash := suite.labeledExtract(suite.SuiteIDContext[:], nil, "psk_id_hash", psk.ID) + infoHash := suite.labeledExtract(suite.SuiteIDContext[:], nil, "info_hash", info) + keyScheduleContext := []byte{byte(mode)} + keyScheduleContext = append(keyScheduleContext, pskIDHash...) + keyScheduleContext = append(keyScheduleContext, infoHash...) + secret := suite.labeledExtract(suite.SuiteIDContext[:], dhSecret, "secret", psk.Key) + + exporterSecret, err := suite.labeledExpand(suite.SuiteIDContext[:], secret, "exp", keyScheduleContext, suite.PrkBytes) + if err != nil { + return innerContext{}, err + } + + var outboundState *aeadState + if suite.AeadID != AeadExportOnly { + outboundKey, err := suite.labeledExpand(suite.SuiteIDContext[:], secret, "key", keyScheduleContext, suite.KeyBytes) + if err != nil { + return innerContext{}, err + } + outboundBaseNonce, err := suite.labeledExpand(suite.SuiteIDContext[:], secret, "base_nonce", keyScheduleContext, suite.NonceBytes) + if err != nil { + return innerContext{}, err + } + outboundState, err = suite.newAeadState(outboundKey, outboundBaseNonce) + if err != nil { + return innerContext{}, err + } + } + return innerContext{ + suite: suite, + exporterSecret: exporterSecret, + outboundState: outboundState, + }, nil +} + +// GenerateKeyPair - Generate a random key pair +func (suite *Suite) GenerateKeyPair() (KeyPair, error) { + var pk, sk [32]byte + if _, err := crypto_rand.Read(sk[:]); err != nil { + return KeyPair{}, err + } + curve25519.ScalarBaseMult(&pk, &sk) + return KeyPair{PublicKey: pk[:], SecretKey: sk[:]}, nil +} + +// DeterministicKeyPair - Derive a deterministic key pair from a seed +func (suite *Suite) DeterministicKeyPair(seed []byte) (KeyPair, error) { + var pk, sk [32]byte + prk := suite.labeledExtract(suite.SuiteIDKEM[:], nil, "dkp_prk", seed) + xsk, err := suite.labeledExpand(suite.SuiteIDKEM[:], prk, "sk", nil, 32) + if err != nil { + return KeyPair{}, err + } + copy(sk[:], xsk) + curve25519.ScalarBaseMult(&pk, &sk) + return KeyPair{PublicKey: pk[:], SecretKey: sk[:]}, nil +} + +func (suite *Suite) dh(pk []byte, sk []byte) ([]byte, error) { + dhSecret, err := curve25519.X25519(sk, pk) + if err != nil { + return nil, err + } + return dhSecret, nil +} + +func (suite *Suite) extractAndExpandDH(dh []byte, kemContext []byte) ([]byte, error) { + prk := suite.labeledExtract(suite.SuiteIDKEM[:], nil, "eae_prk", dh) + dhSecret, err := suite.labeledExpand(suite.SuiteIDKEM[:], prk, "shared_secret", kemContext, suite.KemHashBytes) + if err != nil { + return nil, err + } + return dhSecret, nil +} + +func (suite *Suite) encap(serverPk []byte, seed []byte) ([]byte, []byte, error) { + var ephKp KeyPair + var err error + if len(seed) > 0 { + ephKp, err = suite.DeterministicKeyPair(seed) + } else { + ephKp, err = suite.GenerateKeyPair() + } + if err != nil { + return nil, nil, err + } + dh, err := suite.dh(serverPk, ephKp.SecretKey) + if err != nil { + return nil, nil, err + } + kemContext := append(ephKp.PublicKey, serverPk...) + dhSecret, err := suite.extractAndExpandDH(dh, kemContext) + if err != nil { + return nil, nil, err + } + return dhSecret, ephKp.PublicKey, nil +} + +func (suite *Suite) decap(ephPk []byte, serverKp KeyPair) ([]byte, error) { + dh, err := suite.dh(ephPk, serverKp.SecretKey) + if err != nil { + return nil, err + } + kemContext := append(ephPk, serverKp.PublicKey...) + dhSecret, err := suite.extractAndExpandDH(dh, kemContext) + if err != nil { + return nil, err + } + return dhSecret, nil +} + +func (suite *Suite) authEncap(serverPk []byte, clientKp KeyPair, seed []byte) ([]byte, []byte, error) { + var ephKp KeyPair + var err error + if len(seed) > 0 { + ephKp, err = suite.DeterministicKeyPair(seed) + } else { + ephKp, err = suite.GenerateKeyPair() + } + if err != nil { + return nil, nil, err + } + dh1, err := suite.dh(serverPk, ephKp.SecretKey) + if err != nil { + return nil, nil, err + } + dh2, err := suite.dh(serverPk, clientKp.SecretKey) + if err != nil { + return nil, nil, err + } + dh := append(dh1, dh2...) + kemContext := append(ephKp.PublicKey, serverPk...) + kemContext = append(kemContext, clientKp.PublicKey...) + dhSecret, err := suite.extractAndExpandDH(dh, kemContext) + if err != nil { + return nil, nil, err + } + return dhSecret, ephKp.PublicKey, nil +} + +func (suite *Suite) authDecap(ephPk []byte, serverKp KeyPair, clientPk []byte) ([]byte, error) { + dh1, err := suite.dh(ephPk, serverKp.SecretKey) + if err != nil { + return nil, err + } + dh2, err := suite.dh(clientPk, serverKp.SecretKey) + if err != nil { + return nil, err + } + dh := append(dh1, dh2...) + kemContext := append(ephPk, serverKp.PublicKey...) + kemContext = append(kemContext, clientPk...) + dhSecret, err := suite.extractAndExpandDH(dh, kemContext) + if err != nil { + return nil, err + } + return dhSecret, nil +} + +// NewClientContext - Create a new context for a client (aka "sender") +func (suite *Suite) NewClientContext(serverPk []byte, info []byte, psk *Psk) (ClientContext, []byte, error) { + dhSecret, enc, err := suite.encap(serverPk, nil) + if err != nil { + return ClientContext{}, nil, err + } + mode := ModeBase + if psk != nil { + mode = ModePsk + } + context, err := suite.keySchedule(mode, dhSecret, info, psk) + if err != nil { + return ClientContext{}, nil, err + } + return ClientContext{inner: context}, enc, nil +} + +// NewClientDeterministicContext - Create a new deterministic context for a client - Should only be used for testing purposes +func (suite *Suite) NewClientDeterministicContext(serverPk []byte, info []byte, psk *Psk, seed []byte) (ClientContext, []byte, error) { + dhSecret, enc, err := suite.encap(serverPk, seed) + if err != nil { + return ClientContext{}, nil, err + } + mode := ModeBase + if psk != nil { + mode = ModePsk + } + context, err := suite.keySchedule(mode, dhSecret, info, psk) + if err != nil { + return ClientContext{}, nil, err + } + return ClientContext{inner: context}, enc, nil +} + +// NewServerContext - Create a new context for a server (aka "recipient") +func (suite *Suite) NewServerContext(enc []byte, serverKp KeyPair, info []byte, psk *Psk) (ServerContext, error) { + dhSecret, err := suite.decap(enc, serverKp) + if err != nil { + return ServerContext{}, err + } + mode := ModeBase + if psk != nil { + mode = ModePsk + } + context, err := suite.keySchedule(mode, dhSecret, info, psk) + if err != nil { + return ServerContext{}, err + } + return ServerContext{inner: context}, nil +} + +// NewAuthenticatedClientContext - Create a new context for a client (aka "sender"), with authentication +func (suite *Suite) NewAuthenticatedClientContext(clientKp KeyPair, serverPk []byte, info []byte, psk *Psk) (ClientContext, []byte, error) { + dhSecret, enc, err := suite.authEncap(serverPk, clientKp, nil) + if err != nil { + return ClientContext{}, nil, err + } + mode := ModeAuth + if psk != nil { + mode = ModeAuthPsk + } + context, err := suite.keySchedule(mode, dhSecret, info, psk) + if err != nil { + return ClientContext{}, nil, err + } + return ClientContext{inner: context}, enc, nil +} + +// NewAuthenticatedClientDeterministicContext - Create a new deterministic context for a client, with authentication - Should only be used for testing purposes +func (suite *Suite) NewAuthenticatedClientDeterministicContext(clientKp KeyPair, serverPk []byte, info []byte, psk *Psk, seed []byte) (ClientContext, []byte, error) { + dhSecret, enc, err := suite.authEncap(serverPk, clientKp, seed) + if err != nil { + return ClientContext{}, nil, err + } + mode := ModeAuth + if psk != nil { + mode = ModeAuthPsk + } + context, err := suite.keySchedule(mode, dhSecret, info, psk) + if err != nil { + return ClientContext{}, nil, err + } + return ClientContext{inner: context}, enc, nil +} + +// NewAuthenticatedServerContext - Create a new context for a server (aka "recipient"), with authentication +func (suite *Suite) NewAuthenticatedServerContext(clientPk []byte, enc []byte, serverKp KeyPair, info []byte, psk *Psk) (ServerContext, error) { + dhSecret, err := suite.authDecap(enc, serverKp, clientPk) + if err != nil { + return ServerContext{}, err + } + mode := ModeAuth + if psk != nil { + mode = ModeAuthPsk + } + context, err := suite.keySchedule(mode, dhSecret, info, psk) + if err != nil { + return ServerContext{}, err + } + return ServerContext{inner: context}, nil +} + +func (state *aeadState) incrementCounter() error { + carry := uint16(1) + for i := len(state.counter); ; { + i-- + x := uint16(state.counter[i]) + carry + state.counter[i] = byte(x & 0xff) + carry = x >> 8 + if i == 0 { + break + } + } + if carry != 0 { + return errors.New("Overflow") + } + return nil +} + +// NextNonce - Get the next nonce to encrypt/decrypt a message with an AEAD +// Note: this is not thread-safe. +func (state *aeadState) NextNonce() []byte { + if len(state.counter) != len(state.baseNonce) { + panic("Inconsistent nonce length") + } + nonce := append(state.baseNonce[:0:0], state.baseNonce...) + for i := 0; i < len(nonce); i++ { + nonce[i] ^= state.counter[i] + } + state.incrementCounter() + return nonce +} + +// EncryptToServer - Encrypt and authenticate a message for the server, with optional associated data +func (context *ClientContext) EncryptToServer(message []byte, ad []byte) ([]byte, error) { + state := context.inner.outboundState + nonce := state.NextNonce() + return state.aead.internal().Seal(nil, nonce, message, ad), nil +} + +// DecryptFromClient - Verify and decrypt a ciphertext received from the client, with optional associated data +func (context *ServerContext) DecryptFromClient(ciphertext []byte, ad []byte) ([]byte, error) { + state := context.inner.outboundState + nonce := state.NextNonce() + return state.aead.internal().Open(nil, nonce, ciphertext, ad) +} + +func (inner *innerContext) responseState() (*aeadState, error) { + key, err := inner.export([]byte("response key"), inner.suite.KeyBytes) + if err != nil { + return nil, err + } + baseNonce, err := inner.export([]byte("response nonce"), inner.suite.NonceBytes) + if err != nil { + return nil, err + } + return inner.suite.newAeadState(key, baseNonce) +} + +// EncryptToClient - Encrypt and authenticate a message for the client, with optional associated data +func (context *ServerContext) EncryptToClient(message []byte, ad []byte) ([]byte, error) { + if context.inner.inboundState == nil { + var err error + context.inner.inboundState, err = context.inner.responseState() + if err != nil { + return nil, err + } + } + state := context.inner.inboundState + nonce := state.NextNonce() + return state.aead.internal().Seal(nil, nonce, message, ad), nil +} + +// DecryptFromServer - Verify and decrypt a ciphertext received from the server, with optional associated data +func (context *ClientContext) DecryptFromServer(ciphertext []byte, ad []byte) ([]byte, error) { + if context.inner.inboundState == nil { + var err error + context.inner.inboundState, err = context.inner.responseState() + if err != nil { + return nil, err + } + } + state := context.inner.inboundState + nonce := state.NextNonce() + return state.aead.internal().Open(nil, nonce, ciphertext, ad) +} + +// ExporterSecret - Return the exporter secret +func (context *ClientContext) ExporterSecret() []byte { + return context.inner.exporterSecret +} + +// ExporterSecret - Return the exporter secret +func (context *ServerContext) ExporterSecret() []byte { + return context.inner.exporterSecret +} + +// Export - Return the exporter secret +func (context *ClientContext) Export(exporterContext []byte, length uint16) ([]byte, error) { + return context.inner.export(exporterContext, length) +} + +// Export - Return the exporter secret +func (context *ServerContext) Export(exporterContext []byte, length uint16) ([]byte, error) { + return context.inner.export(exporterContext, length) +} + +type aeadImpl interface { + internal() cipher.AEAD +} + +type aeadAesImpl struct { + impl cipher.AEAD +} + +func newAesAead(key []byte) (aeadAesImpl, error) { + block, err := aes.NewCipher(key) + if err != nil { + return aeadAesImpl{}, nil + } + aesGcm, err := cipher.NewGCM(block) + if err != nil { + return aeadAesImpl{}, nil + } + aead := aeadAesImpl{impl: aesGcm} + return aead, nil +} + +func (aead aeadAesImpl) internal() cipher.AEAD { + return aead.impl +} + +type aeadChaChaPolyImpl struct { + impl cipher.AEAD +} + +func newChaChaPolyAead(key []byte) (aeadChaChaPolyImpl, error) { + impl, err := chacha20poly1305.New(key) + if err != nil { + return aeadChaChaPolyImpl{}, nil + } + aead := aeadChaChaPolyImpl{impl: impl} + return aead, nil +} + +func (aead aeadChaChaPolyImpl) internal() cipher.AEAD { + return aead.impl +} diff --git a/vendor/golang.org/x/crypto/chacha20/chacha_arm64.go b/vendor/golang.org/x/crypto/chacha20/chacha_arm64.go new file mode 100644 index 00000000..94c71ac1 --- /dev/null +++ b/vendor/golang.org/x/crypto/chacha20/chacha_arm64.go @@ -0,0 +1,17 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build go1.11 && gc && !purego +// +build go1.11,gc,!purego + +package chacha20 + +const bufSize = 256 + +//go:noescape +func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32) + +func (c *Cipher) xorKeyStreamBlocks(dst, src []byte) { + xorKeyStreamVX(dst, src, &c.key, &c.nonce, &c.counter) +} diff --git a/vendor/golang.org/x/crypto/chacha20/chacha_arm64.s b/vendor/golang.org/x/crypto/chacha20/chacha_arm64.s new file mode 100644 index 00000000..8fb49a13 --- /dev/null +++ b/vendor/golang.org/x/crypto/chacha20/chacha_arm64.s @@ -0,0 +1,307 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build go1.11,gc,!purego + +#include "textflag.h" + +#define NUM_ROUNDS 10 + +// func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32) +TEXT ·xorKeyStreamVX(SB), NOSPLIT, $0 + MOVD dst+0(FP), R1 + MOVD src+24(FP), R2 + MOVD src_len+32(FP), R3 + MOVD key+48(FP), R4 + MOVD nonce+56(FP), R6 + MOVD counter+64(FP), R7 + + MOVD $·constants(SB), R10 + MOVD $·incRotMatrix(SB), R11 + + MOVW (R7), R20 + + AND $~255, R3, R13 + ADD R2, R13, R12 // R12 for block end + AND $255, R3, R13 +loop: + MOVD $NUM_ROUNDS, R21 + VLD1 (R11), [V30.S4, V31.S4] + + // load contants + // VLD4R (R10), [V0.S4, V1.S4, V2.S4, V3.S4] + WORD $0x4D60E940 + + // load keys + // VLD4R 16(R4), [V4.S4, V5.S4, V6.S4, V7.S4] + WORD $0x4DFFE884 + // VLD4R 16(R4), [V8.S4, V9.S4, V10.S4, V11.S4] + WORD $0x4DFFE888 + SUB $32, R4 + + // load counter + nonce + // VLD1R (R7), [V12.S4] + WORD $0x4D40C8EC + + // VLD3R (R6), [V13.S4, V14.S4, V15.S4] + WORD $0x4D40E8CD + + // update counter + VADD V30.S4, V12.S4, V12.S4 + +chacha: + // V0..V3 += V4..V7 + // V12..V15 <<<= ((V12..V15 XOR V0..V3), 16) + VADD V0.S4, V4.S4, V0.S4 + VADD V1.S4, V5.S4, V1.S4 + VADD V2.S4, V6.S4, V2.S4 + VADD V3.S4, V7.S4, V3.S4 + VEOR V12.B16, V0.B16, V12.B16 + VEOR V13.B16, V1.B16, V13.B16 + VEOR V14.B16, V2.B16, V14.B16 + VEOR V15.B16, V3.B16, V15.B16 + VREV32 V12.H8, V12.H8 + VREV32 V13.H8, V13.H8 + VREV32 V14.H8, V14.H8 + VREV32 V15.H8, V15.H8 + // V8..V11 += V12..V15 + // V4..V7 <<<= ((V4..V7 XOR V8..V11), 12) + VADD V8.S4, V12.S4, V8.S4 + VADD V9.S4, V13.S4, V9.S4 + VADD V10.S4, V14.S4, V10.S4 + VADD V11.S4, V15.S4, V11.S4 + VEOR V8.B16, V4.B16, V16.B16 + VEOR V9.B16, V5.B16, V17.B16 + VEOR V10.B16, V6.B16, V18.B16 + VEOR V11.B16, V7.B16, V19.B16 + VSHL $12, V16.S4, V4.S4 + VSHL $12, V17.S4, V5.S4 + VSHL $12, V18.S4, V6.S4 + VSHL $12, V19.S4, V7.S4 + VSRI $20, V16.S4, V4.S4 + VSRI $20, V17.S4, V5.S4 + VSRI $20, V18.S4, V6.S4 + VSRI $20, V19.S4, V7.S4 + + // V0..V3 += V4..V7 + // V12..V15 <<<= ((V12..V15 XOR V0..V3), 8) + VADD V0.S4, V4.S4, V0.S4 + VADD V1.S4, V5.S4, V1.S4 + VADD V2.S4, V6.S4, V2.S4 + VADD V3.S4, V7.S4, V3.S4 + VEOR V12.B16, V0.B16, V12.B16 + VEOR V13.B16, V1.B16, V13.B16 + VEOR V14.B16, V2.B16, V14.B16 + VEOR V15.B16, V3.B16, V15.B16 + VTBL V31.B16, [V12.B16], V12.B16 + VTBL V31.B16, [V13.B16], V13.B16 + VTBL V31.B16, [V14.B16], V14.B16 + VTBL V31.B16, [V15.B16], V15.B16 + + // V8..V11 += V12..V15 + // V4..V7 <<<= ((V4..V7 XOR V8..V11), 7) + VADD V12.S4, V8.S4, V8.S4 + VADD V13.S4, V9.S4, V9.S4 + VADD V14.S4, V10.S4, V10.S4 + VADD V15.S4, V11.S4, V11.S4 + VEOR V8.B16, V4.B16, V16.B16 + VEOR V9.B16, V5.B16, V17.B16 + VEOR V10.B16, V6.B16, V18.B16 + VEOR V11.B16, V7.B16, V19.B16 + VSHL $7, V16.S4, V4.S4 + VSHL $7, V17.S4, V5.S4 + VSHL $7, V18.S4, V6.S4 + VSHL $7, V19.S4, V7.S4 + VSRI $25, V16.S4, V4.S4 + VSRI $25, V17.S4, V5.S4 + VSRI $25, V18.S4, V6.S4 + VSRI $25, V19.S4, V7.S4 + + // V0..V3 += V5..V7, V4 + // V15,V12-V14 <<<= ((V15,V12-V14 XOR V0..V3), 16) + VADD V0.S4, V5.S4, V0.S4 + VADD V1.S4, V6.S4, V1.S4 + VADD V2.S4, V7.S4, V2.S4 + VADD V3.S4, V4.S4, V3.S4 + VEOR V15.B16, V0.B16, V15.B16 + VEOR V12.B16, V1.B16, V12.B16 + VEOR V13.B16, V2.B16, V13.B16 + VEOR V14.B16, V3.B16, V14.B16 + VREV32 V12.H8, V12.H8 + VREV32 V13.H8, V13.H8 + VREV32 V14.H8, V14.H8 + VREV32 V15.H8, V15.H8 + + // V10 += V15; V5 <<<= ((V10 XOR V5), 12) + // ... + VADD V15.S4, V10.S4, V10.S4 + VADD V12.S4, V11.S4, V11.S4 + VADD V13.S4, V8.S4, V8.S4 + VADD V14.S4, V9.S4, V9.S4 + VEOR V10.B16, V5.B16, V16.B16 + VEOR V11.B16, V6.B16, V17.B16 + VEOR V8.B16, V7.B16, V18.B16 + VEOR V9.B16, V4.B16, V19.B16 + VSHL $12, V16.S4, V5.S4 + VSHL $12, V17.S4, V6.S4 + VSHL $12, V18.S4, V7.S4 + VSHL $12, V19.S4, V4.S4 + VSRI $20, V16.S4, V5.S4 + VSRI $20, V17.S4, V6.S4 + VSRI $20, V18.S4, V7.S4 + VSRI $20, V19.S4, V4.S4 + + // V0 += V5; V15 <<<= ((V0 XOR V15), 8) + // ... + VADD V5.S4, V0.S4, V0.S4 + VADD V6.S4, V1.S4, V1.S4 + VADD V7.S4, V2.S4, V2.S4 + VADD V4.S4, V3.S4, V3.S4 + VEOR V0.B16, V15.B16, V15.B16 + VEOR V1.B16, V12.B16, V12.B16 + VEOR V2.B16, V13.B16, V13.B16 + VEOR V3.B16, V14.B16, V14.B16 + VTBL V31.B16, [V12.B16], V12.B16 + VTBL V31.B16, [V13.B16], V13.B16 + VTBL V31.B16, [V14.B16], V14.B16 + VTBL V31.B16, [V15.B16], V15.B16 + + // V10 += V15; V5 <<<= ((V10 XOR V5), 7) + // ... + VADD V15.S4, V10.S4, V10.S4 + VADD V12.S4, V11.S4, V11.S4 + VADD V13.S4, V8.S4, V8.S4 + VADD V14.S4, V9.S4, V9.S4 + VEOR V10.B16, V5.B16, V16.B16 + VEOR V11.B16, V6.B16, V17.B16 + VEOR V8.B16, V7.B16, V18.B16 + VEOR V9.B16, V4.B16, V19.B16 + VSHL $7, V16.S4, V5.S4 + VSHL $7, V17.S4, V6.S4 + VSHL $7, V18.S4, V7.S4 + VSHL $7, V19.S4, V4.S4 + VSRI $25, V16.S4, V5.S4 + VSRI $25, V17.S4, V6.S4 + VSRI $25, V18.S4, V7.S4 + VSRI $25, V19.S4, V4.S4 + + SUB $1, R21 + CBNZ R21, chacha + + // VLD4R (R10), [V16.S4, V17.S4, V18.S4, V19.S4] + WORD $0x4D60E950 + + // VLD4R 16(R4), [V20.S4, V21.S4, V22.S4, V23.S4] + WORD $0x4DFFE894 + VADD V30.S4, V12.S4, V12.S4 + VADD V16.S4, V0.S4, V0.S4 + VADD V17.S4, V1.S4, V1.S4 + VADD V18.S4, V2.S4, V2.S4 + VADD V19.S4, V3.S4, V3.S4 + // VLD4R 16(R4), [V24.S4, V25.S4, V26.S4, V27.S4] + WORD $0x4DFFE898 + // restore R4 + SUB $32, R4 + + // load counter + nonce + // VLD1R (R7), [V28.S4] + WORD $0x4D40C8FC + // VLD3R (R6), [V29.S4, V30.S4, V31.S4] + WORD $0x4D40E8DD + + VADD V20.S4, V4.S4, V4.S4 + VADD V21.S4, V5.S4, V5.S4 + VADD V22.S4, V6.S4, V6.S4 + VADD V23.S4, V7.S4, V7.S4 + VADD V24.S4, V8.S4, V8.S4 + VADD V25.S4, V9.S4, V9.S4 + VADD V26.S4, V10.S4, V10.S4 + VADD V27.S4, V11.S4, V11.S4 + VADD V28.S4, V12.S4, V12.S4 + VADD V29.S4, V13.S4, V13.S4 + VADD V30.S4, V14.S4, V14.S4 + VADD V31.S4, V15.S4, V15.S4 + + VZIP1 V1.S4, V0.S4, V16.S4 + VZIP2 V1.S4, V0.S4, V17.S4 + VZIP1 V3.S4, V2.S4, V18.S4 + VZIP2 V3.S4, V2.S4, V19.S4 + VZIP1 V5.S4, V4.S4, V20.S4 + VZIP2 V5.S4, V4.S4, V21.S4 + VZIP1 V7.S4, V6.S4, V22.S4 + VZIP2 V7.S4, V6.S4, V23.S4 + VZIP1 V9.S4, V8.S4, V24.S4 + VZIP2 V9.S4, V8.S4, V25.S4 + VZIP1 V11.S4, V10.S4, V26.S4 + VZIP2 V11.S4, V10.S4, V27.S4 + VZIP1 V13.S4, V12.S4, V28.S4 + VZIP2 V13.S4, V12.S4, V29.S4 + VZIP1 V15.S4, V14.S4, V30.S4 + VZIP2 V15.S4, V14.S4, V31.S4 + VZIP1 V18.D2, V16.D2, V0.D2 + VZIP2 V18.D2, V16.D2, V4.D2 + VZIP1 V19.D2, V17.D2, V8.D2 + VZIP2 V19.D2, V17.D2, V12.D2 + VLD1.P 64(R2), [V16.B16, V17.B16, V18.B16, V19.B16] + + VZIP1 V22.D2, V20.D2, V1.D2 + VZIP2 V22.D2, V20.D2, V5.D2 + VZIP1 V23.D2, V21.D2, V9.D2 + VZIP2 V23.D2, V21.D2, V13.D2 + VLD1.P 64(R2), [V20.B16, V21.B16, V22.B16, V23.B16] + VZIP1 V26.D2, V24.D2, V2.D2 + VZIP2 V26.D2, V24.D2, V6.D2 + VZIP1 V27.D2, V25.D2, V10.D2 + VZIP2 V27.D2, V25.D2, V14.D2 + VLD1.P 64(R2), [V24.B16, V25.B16, V26.B16, V27.B16] + VZIP1 V30.D2, V28.D2, V3.D2 + VZIP2 V30.D2, V28.D2, V7.D2 + VZIP1 V31.D2, V29.D2, V11.D2 + VZIP2 V31.D2, V29.D2, V15.D2 + VLD1.P 64(R2), [V28.B16, V29.B16, V30.B16, V31.B16] + VEOR V0.B16, V16.B16, V16.B16 + VEOR V1.B16, V17.B16, V17.B16 + VEOR V2.B16, V18.B16, V18.B16 + VEOR V3.B16, V19.B16, V19.B16 + VST1.P [V16.B16, V17.B16, V18.B16, V19.B16], 64(R1) + VEOR V4.B16, V20.B16, V20.B16 + VEOR V5.B16, V21.B16, V21.B16 + VEOR V6.B16, V22.B16, V22.B16 + VEOR V7.B16, V23.B16, V23.B16 + VST1.P [V20.B16, V21.B16, V22.B16, V23.B16], 64(R1) + VEOR V8.B16, V24.B16, V24.B16 + VEOR V9.B16, V25.B16, V25.B16 + VEOR V10.B16, V26.B16, V26.B16 + VEOR V11.B16, V27.B16, V27.B16 + VST1.P [V24.B16, V25.B16, V26.B16, V27.B16], 64(R1) + VEOR V12.B16, V28.B16, V28.B16 + VEOR V13.B16, V29.B16, V29.B16 + VEOR V14.B16, V30.B16, V30.B16 + VEOR V15.B16, V31.B16, V31.B16 + VST1.P [V28.B16, V29.B16, V30.B16, V31.B16], 64(R1) + + ADD $4, R20 + MOVW R20, (R7) // update counter + + CMP R2, R12 + BGT loop + + RET + + +DATA ·constants+0x00(SB)/4, $0x61707865 +DATA ·constants+0x04(SB)/4, $0x3320646e +DATA ·constants+0x08(SB)/4, $0x79622d32 +DATA ·constants+0x0c(SB)/4, $0x6b206574 +GLOBL ·constants(SB), NOPTR|RODATA, $32 + +DATA ·incRotMatrix+0x00(SB)/4, $0x00000000 +DATA ·incRotMatrix+0x04(SB)/4, $0x00000001 +DATA ·incRotMatrix+0x08(SB)/4, $0x00000002 +DATA ·incRotMatrix+0x0c(SB)/4, $0x00000003 +DATA ·incRotMatrix+0x10(SB)/4, $0x02010003 +DATA ·incRotMatrix+0x14(SB)/4, $0x06050407 +DATA ·incRotMatrix+0x18(SB)/4, $0x0A09080B +DATA ·incRotMatrix+0x1c(SB)/4, $0x0E0D0C0F +GLOBL ·incRotMatrix(SB), NOPTR|RODATA, $32 diff --git a/vendor/golang.org/x/crypto/chacha20/chacha_generic.go b/vendor/golang.org/x/crypto/chacha20/chacha_generic.go new file mode 100644 index 00000000..a2ecf5c3 --- /dev/null +++ b/vendor/golang.org/x/crypto/chacha20/chacha_generic.go @@ -0,0 +1,398 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package chacha20 implements the ChaCha20 and XChaCha20 encryption algorithms +// as specified in RFC 8439 and draft-irtf-cfrg-xchacha-01. +package chacha20 + +import ( + "crypto/cipher" + "encoding/binary" + "errors" + "math/bits" + + "golang.org/x/crypto/internal/subtle" +) + +const ( + // KeySize is the size of the key used by this cipher, in bytes. + KeySize = 32 + + // NonceSize is the size of the nonce used with the standard variant of this + // cipher, in bytes. + // + // Note that this is too short to be safely generated at random if the same + // key is reused more than 2³² times. + NonceSize = 12 + + // NonceSizeX is the size of the nonce used with the XChaCha20 variant of + // this cipher, in bytes. + NonceSizeX = 24 +) + +// Cipher is a stateful instance of ChaCha20 or XChaCha20 using a particular key +// and nonce. A *Cipher implements the cipher.Stream interface. +type Cipher struct { + // The ChaCha20 state is 16 words: 4 constant, 8 of key, 1 of counter + // (incremented after each block), and 3 of nonce. + key [8]uint32 + counter uint32 + nonce [3]uint32 + + // The last len bytes of buf are leftover key stream bytes from the previous + // XORKeyStream invocation. The size of buf depends on how many blocks are + // computed at a time by xorKeyStreamBlocks. + buf [bufSize]byte + len int + + // overflow is set when the counter overflowed, no more blocks can be + // generated, and the next XORKeyStream call should panic. + overflow bool + + // The counter-independent results of the first round are cached after they + // are computed the first time. + precompDone bool + p1, p5, p9, p13 uint32 + p2, p6, p10, p14 uint32 + p3, p7, p11, p15 uint32 +} + +var _ cipher.Stream = (*Cipher)(nil) + +// NewUnauthenticatedCipher creates a new ChaCha20 stream cipher with the given +// 32 bytes key and a 12 or 24 bytes nonce. If a nonce of 24 bytes is provided, +// the XChaCha20 construction will be used. It returns an error if key or nonce +// have any other length. +// +// Note that ChaCha20, like all stream ciphers, is not authenticated and allows +// attackers to silently tamper with the plaintext. For this reason, it is more +// appropriate as a building block than as a standalone encryption mechanism. +// Instead, consider using package golang.org/x/crypto/chacha20poly1305. +func NewUnauthenticatedCipher(key, nonce []byte) (*Cipher, error) { + // This function is split into a wrapper so that the Cipher allocation will + // be inlined, and depending on how the caller uses the return value, won't + // escape to the heap. + c := &Cipher{} + return newUnauthenticatedCipher(c, key, nonce) +} + +func newUnauthenticatedCipher(c *Cipher, key, nonce []byte) (*Cipher, error) { + if len(key) != KeySize { + return nil, errors.New("chacha20: wrong key size") + } + if len(nonce) == NonceSizeX { + // XChaCha20 uses the ChaCha20 core to mix 16 bytes of the nonce into a + // derived key, allowing it to operate on a nonce of 24 bytes. See + // draft-irtf-cfrg-xchacha-01, Section 2.3. + key, _ = HChaCha20(key, nonce[0:16]) + cNonce := make([]byte, NonceSize) + copy(cNonce[4:12], nonce[16:24]) + nonce = cNonce + } else if len(nonce) != NonceSize { + return nil, errors.New("chacha20: wrong nonce size") + } + + key, nonce = key[:KeySize], nonce[:NonceSize] // bounds check elimination hint + c.key = [8]uint32{ + binary.LittleEndian.Uint32(key[0:4]), + binary.LittleEndian.Uint32(key[4:8]), + binary.LittleEndian.Uint32(key[8:12]), + binary.LittleEndian.Uint32(key[12:16]), + binary.LittleEndian.Uint32(key[16:20]), + binary.LittleEndian.Uint32(key[20:24]), + binary.LittleEndian.Uint32(key[24:28]), + binary.LittleEndian.Uint32(key[28:32]), + } + c.nonce = [3]uint32{ + binary.LittleEndian.Uint32(nonce[0:4]), + binary.LittleEndian.Uint32(nonce[4:8]), + binary.LittleEndian.Uint32(nonce[8:12]), + } + return c, nil +} + +// The constant first 4 words of the ChaCha20 state. +const ( + j0 uint32 = 0x61707865 // expa + j1 uint32 = 0x3320646e // nd 3 + j2 uint32 = 0x79622d32 // 2-by + j3 uint32 = 0x6b206574 // te k +) + +const blockSize = 64 + +// quarterRound is the core of ChaCha20. It shuffles the bits of 4 state words. +// It's executed 4 times for each of the 20 ChaCha20 rounds, operating on all 16 +// words each round, in columnar or diagonal groups of 4 at a time. +func quarterRound(a, b, c, d uint32) (uint32, uint32, uint32, uint32) { + a += b + d ^= a + d = bits.RotateLeft32(d, 16) + c += d + b ^= c + b = bits.RotateLeft32(b, 12) + a += b + d ^= a + d = bits.RotateLeft32(d, 8) + c += d + b ^= c + b = bits.RotateLeft32(b, 7) + return a, b, c, d +} + +// SetCounter sets the Cipher counter. The next invocation of XORKeyStream will +// behave as if (64 * counter) bytes had been encrypted so far. +// +// To prevent accidental counter reuse, SetCounter panics if counter is less +// than the current value. +// +// Note that the execution time of XORKeyStream is not independent of the +// counter value. +func (s *Cipher) SetCounter(counter uint32) { + // Internally, s may buffer multiple blocks, which complicates this + // implementation slightly. When checking whether the counter has rolled + // back, we must use both s.counter and s.len to determine how many blocks + // we have already output. + outputCounter := s.counter - uint32(s.len)/blockSize + if s.overflow || counter < outputCounter { + panic("chacha20: SetCounter attempted to rollback counter") + } + + // In the general case, we set the new counter value and reset s.len to 0, + // causing the next call to XORKeyStream to refill the buffer. However, if + // we're advancing within the existing buffer, we can save work by simply + // setting s.len. + if counter < s.counter { + s.len = int(s.counter-counter) * blockSize + } else { + s.counter = counter + s.len = 0 + } +} + +// XORKeyStream XORs each byte in the given slice with a byte from the +// cipher's key stream. Dst and src must overlap entirely or not at all. +// +// If len(dst) < len(src), XORKeyStream will panic. It is acceptable +// to pass a dst bigger than src, and in that case, XORKeyStream will +// only update dst[:len(src)] and will not touch the rest of dst. +// +// Multiple calls to XORKeyStream behave as if the concatenation of +// the src buffers was passed in a single run. That is, Cipher +// maintains state and does not reset at each XORKeyStream call. +func (s *Cipher) XORKeyStream(dst, src []byte) { + if len(src) == 0 { + return + } + if len(dst) < len(src) { + panic("chacha20: output smaller than input") + } + dst = dst[:len(src)] + if subtle.InexactOverlap(dst, src) { + panic("chacha20: invalid buffer overlap") + } + + // First, drain any remaining key stream from a previous XORKeyStream. + if s.len != 0 { + keyStream := s.buf[bufSize-s.len:] + if len(src) < len(keyStream) { + keyStream = keyStream[:len(src)] + } + _ = src[len(keyStream)-1] // bounds check elimination hint + for i, b := range keyStream { + dst[i] = src[i] ^ b + } + s.len -= len(keyStream) + dst, src = dst[len(keyStream):], src[len(keyStream):] + } + if len(src) == 0 { + return + } + + // If we'd need to let the counter overflow and keep generating output, + // panic immediately. If instead we'd only reach the last block, remember + // not to generate any more output after the buffer is drained. + numBlocks := (uint64(len(src)) + blockSize - 1) / blockSize + if s.overflow || uint64(s.counter)+numBlocks > 1<<32 { + panic("chacha20: counter overflow") + } else if uint64(s.counter)+numBlocks == 1<<32 { + s.overflow = true + } + + // xorKeyStreamBlocks implementations expect input lengths that are a + // multiple of bufSize. Platform-specific ones process multiple blocks at a + // time, so have bufSizes that are a multiple of blockSize. + + full := len(src) - len(src)%bufSize + if full > 0 { + s.xorKeyStreamBlocks(dst[:full], src[:full]) + } + dst, src = dst[full:], src[full:] + + // If using a multi-block xorKeyStreamBlocks would overflow, use the generic + // one that does one block at a time. + const blocksPerBuf = bufSize / blockSize + if uint64(s.counter)+blocksPerBuf > 1<<32 { + s.buf = [bufSize]byte{} + numBlocks := (len(src) + blockSize - 1) / blockSize + buf := s.buf[bufSize-numBlocks*blockSize:] + copy(buf, src) + s.xorKeyStreamBlocksGeneric(buf, buf) + s.len = len(buf) - copy(dst, buf) + return + } + + // If we have a partial (multi-)block, pad it for xorKeyStreamBlocks, and + // keep the leftover keystream for the next XORKeyStream invocation. + if len(src) > 0 { + s.buf = [bufSize]byte{} + copy(s.buf[:], src) + s.xorKeyStreamBlocks(s.buf[:], s.buf[:]) + s.len = bufSize - copy(dst, s.buf[:]) + } +} + +func (s *Cipher) xorKeyStreamBlocksGeneric(dst, src []byte) { + if len(dst) != len(src) || len(dst)%blockSize != 0 { + panic("chacha20: internal error: wrong dst and/or src length") + } + + // To generate each block of key stream, the initial cipher state + // (represented below) is passed through 20 rounds of shuffling, + // alternatively applying quarterRounds by columns (like 1, 5, 9, 13) + // or by diagonals (like 1, 6, 11, 12). + // + // 0:cccccccc 1:cccccccc 2:cccccccc 3:cccccccc + // 4:kkkkkkkk 5:kkkkkkkk 6:kkkkkkkk 7:kkkkkkkk + // 8:kkkkkkkk 9:kkkkkkkk 10:kkkkkkkk 11:kkkkkkkk + // 12:bbbbbbbb 13:nnnnnnnn 14:nnnnnnnn 15:nnnnnnnn + // + // c=constant k=key b=blockcount n=nonce + var ( + c0, c1, c2, c3 = j0, j1, j2, j3 + c4, c5, c6, c7 = s.key[0], s.key[1], s.key[2], s.key[3] + c8, c9, c10, c11 = s.key[4], s.key[5], s.key[6], s.key[7] + _, c13, c14, c15 = s.counter, s.nonce[0], s.nonce[1], s.nonce[2] + ) + + // Three quarters of the first round don't depend on the counter, so we can + // calculate them here, and reuse them for multiple blocks in the loop, and + // for future XORKeyStream invocations. + if !s.precompDone { + s.p1, s.p5, s.p9, s.p13 = quarterRound(c1, c5, c9, c13) + s.p2, s.p6, s.p10, s.p14 = quarterRound(c2, c6, c10, c14) + s.p3, s.p7, s.p11, s.p15 = quarterRound(c3, c7, c11, c15) + s.precompDone = true + } + + // A condition of len(src) > 0 would be sufficient, but this also + // acts as a bounds check elimination hint. + for len(src) >= 64 && len(dst) >= 64 { + // The remainder of the first column round. + fcr0, fcr4, fcr8, fcr12 := quarterRound(c0, c4, c8, s.counter) + + // The second diagonal round. + x0, x5, x10, x15 := quarterRound(fcr0, s.p5, s.p10, s.p15) + x1, x6, x11, x12 := quarterRound(s.p1, s.p6, s.p11, fcr12) + x2, x7, x8, x13 := quarterRound(s.p2, s.p7, fcr8, s.p13) + x3, x4, x9, x14 := quarterRound(s.p3, fcr4, s.p9, s.p14) + + // The remaining 18 rounds. + for i := 0; i < 9; i++ { + // Column round. + x0, x4, x8, x12 = quarterRound(x0, x4, x8, x12) + x1, x5, x9, x13 = quarterRound(x1, x5, x9, x13) + x2, x6, x10, x14 = quarterRound(x2, x6, x10, x14) + x3, x7, x11, x15 = quarterRound(x3, x7, x11, x15) + + // Diagonal round. + x0, x5, x10, x15 = quarterRound(x0, x5, x10, x15) + x1, x6, x11, x12 = quarterRound(x1, x6, x11, x12) + x2, x7, x8, x13 = quarterRound(x2, x7, x8, x13) + x3, x4, x9, x14 = quarterRound(x3, x4, x9, x14) + } + + // Add back the initial state to generate the key stream, then + // XOR the key stream with the source and write out the result. + addXor(dst[0:4], src[0:4], x0, c0) + addXor(dst[4:8], src[4:8], x1, c1) + addXor(dst[8:12], src[8:12], x2, c2) + addXor(dst[12:16], src[12:16], x3, c3) + addXor(dst[16:20], src[16:20], x4, c4) + addXor(dst[20:24], src[20:24], x5, c5) + addXor(dst[24:28], src[24:28], x6, c6) + addXor(dst[28:32], src[28:32], x7, c7) + addXor(dst[32:36], src[32:36], x8, c8) + addXor(dst[36:40], src[36:40], x9, c9) + addXor(dst[40:44], src[40:44], x10, c10) + addXor(dst[44:48], src[44:48], x11, c11) + addXor(dst[48:52], src[48:52], x12, s.counter) + addXor(dst[52:56], src[52:56], x13, c13) + addXor(dst[56:60], src[56:60], x14, c14) + addXor(dst[60:64], src[60:64], x15, c15) + + s.counter += 1 + + src, dst = src[blockSize:], dst[blockSize:] + } +} + +// HChaCha20 uses the ChaCha20 core to generate a derived key from a 32 bytes +// key and a 16 bytes nonce. It returns an error if key or nonce have any other +// length. It is used as part of the XChaCha20 construction. +func HChaCha20(key, nonce []byte) ([]byte, error) { + // This function is split into a wrapper so that the slice allocation will + // be inlined, and depending on how the caller uses the return value, won't + // escape to the heap. + out := make([]byte, 32) + return hChaCha20(out, key, nonce) +} + +func hChaCha20(out, key, nonce []byte) ([]byte, error) { + if len(key) != KeySize { + return nil, errors.New("chacha20: wrong HChaCha20 key size") + } + if len(nonce) != 16 { + return nil, errors.New("chacha20: wrong HChaCha20 nonce size") + } + + x0, x1, x2, x3 := j0, j1, j2, j3 + x4 := binary.LittleEndian.Uint32(key[0:4]) + x5 := binary.LittleEndian.Uint32(key[4:8]) + x6 := binary.LittleEndian.Uint32(key[8:12]) + x7 := binary.LittleEndian.Uint32(key[12:16]) + x8 := binary.LittleEndian.Uint32(key[16:20]) + x9 := binary.LittleEndian.Uint32(key[20:24]) + x10 := binary.LittleEndian.Uint32(key[24:28]) + x11 := binary.LittleEndian.Uint32(key[28:32]) + x12 := binary.LittleEndian.Uint32(nonce[0:4]) + x13 := binary.LittleEndian.Uint32(nonce[4:8]) + x14 := binary.LittleEndian.Uint32(nonce[8:12]) + x15 := binary.LittleEndian.Uint32(nonce[12:16]) + + for i := 0; i < 10; i++ { + // Diagonal round. + x0, x4, x8, x12 = quarterRound(x0, x4, x8, x12) + x1, x5, x9, x13 = quarterRound(x1, x5, x9, x13) + x2, x6, x10, x14 = quarterRound(x2, x6, x10, x14) + x3, x7, x11, x15 = quarterRound(x3, x7, x11, x15) + + // Column round. + x0, x5, x10, x15 = quarterRound(x0, x5, x10, x15) + x1, x6, x11, x12 = quarterRound(x1, x6, x11, x12) + x2, x7, x8, x13 = quarterRound(x2, x7, x8, x13) + x3, x4, x9, x14 = quarterRound(x3, x4, x9, x14) + } + + _ = out[31] // bounds check elimination hint + binary.LittleEndian.PutUint32(out[0:4], x0) + binary.LittleEndian.PutUint32(out[4:8], x1) + binary.LittleEndian.PutUint32(out[8:12], x2) + binary.LittleEndian.PutUint32(out[12:16], x3) + binary.LittleEndian.PutUint32(out[16:20], x12) + binary.LittleEndian.PutUint32(out[20:24], x13) + binary.LittleEndian.PutUint32(out[24:28], x14) + binary.LittleEndian.PutUint32(out[28:32], x15) + return out, nil +} diff --git a/vendor/golang.org/x/crypto/chacha20/chacha_noasm.go b/vendor/golang.org/x/crypto/chacha20/chacha_noasm.go new file mode 100644 index 00000000..025b4989 --- /dev/null +++ b/vendor/golang.org/x/crypto/chacha20/chacha_noasm.go @@ -0,0 +1,14 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build (!arm64 && !s390x && !ppc64le) || (arm64 && !go1.11) || !gc || purego +// +build !arm64,!s390x,!ppc64le arm64,!go1.11 !gc purego + +package chacha20 + +const bufSize = blockSize + +func (s *Cipher) xorKeyStreamBlocks(dst, src []byte) { + s.xorKeyStreamBlocksGeneric(dst, src) +} diff --git a/vendor/golang.org/x/crypto/chacha20/chacha_ppc64le.go b/vendor/golang.org/x/crypto/chacha20/chacha_ppc64le.go new file mode 100644 index 00000000..da420b2e --- /dev/null +++ b/vendor/golang.org/x/crypto/chacha20/chacha_ppc64le.go @@ -0,0 +1,17 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build gc && !purego +// +build gc,!purego + +package chacha20 + +const bufSize = 256 + +//go:noescape +func chaCha20_ctr32_vsx(out, inp *byte, len int, key *[8]uint32, counter *uint32) + +func (c *Cipher) xorKeyStreamBlocks(dst, src []byte) { + chaCha20_ctr32_vsx(&dst[0], &src[0], len(src), &c.key, &c.counter) +} diff --git a/vendor/golang.org/x/crypto/chacha20/chacha_ppc64le.s b/vendor/golang.org/x/crypto/chacha20/chacha_ppc64le.s new file mode 100644 index 00000000..3dad4b2f --- /dev/null +++ b/vendor/golang.org/x/crypto/chacha20/chacha_ppc64le.s @@ -0,0 +1,449 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Based on CRYPTOGAMS code with the following comment: +// # ==================================================================== +// # Written by Andy Polyakov for the OpenSSL +// # project. The module is, however, dual licensed under OpenSSL and +// # CRYPTOGAMS licenses depending on where you obtain it. For further +// # details see http://www.openssl.org/~appro/cryptogams/. +// # ==================================================================== + +// Code for the perl script that generates the ppc64 assembler +// can be found in the cryptogams repository at the link below. It is based on +// the original from openssl. + +// https://github.com/dot-asm/cryptogams/commit/a60f5b50ed908e91 + +// The differences in this and the original implementation are +// due to the calling conventions and initialization of constants. + +// +build gc,!purego + +#include "textflag.h" + +#define OUT R3 +#define INP R4 +#define LEN R5 +#define KEY R6 +#define CNT R7 +#define TMP R15 + +#define CONSTBASE R16 +#define BLOCKS R17 + +DATA consts<>+0x00(SB)/8, $0x3320646e61707865 +DATA consts<>+0x08(SB)/8, $0x6b20657479622d32 +DATA consts<>+0x10(SB)/8, $0x0000000000000001 +DATA consts<>+0x18(SB)/8, $0x0000000000000000 +DATA consts<>+0x20(SB)/8, $0x0000000000000004 +DATA consts<>+0x28(SB)/8, $0x0000000000000000 +DATA consts<>+0x30(SB)/8, $0x0a0b08090e0f0c0d +DATA consts<>+0x38(SB)/8, $0x0203000106070405 +DATA consts<>+0x40(SB)/8, $0x090a0b080d0e0f0c +DATA consts<>+0x48(SB)/8, $0x0102030005060704 +DATA consts<>+0x50(SB)/8, $0x6170786561707865 +DATA consts<>+0x58(SB)/8, $0x6170786561707865 +DATA consts<>+0x60(SB)/8, $0x3320646e3320646e +DATA consts<>+0x68(SB)/8, $0x3320646e3320646e +DATA consts<>+0x70(SB)/8, $0x79622d3279622d32 +DATA consts<>+0x78(SB)/8, $0x79622d3279622d32 +DATA consts<>+0x80(SB)/8, $0x6b2065746b206574 +DATA consts<>+0x88(SB)/8, $0x6b2065746b206574 +DATA consts<>+0x90(SB)/8, $0x0000000100000000 +DATA consts<>+0x98(SB)/8, $0x0000000300000002 +GLOBL consts<>(SB), RODATA, $0xa0 + +//func chaCha20_ctr32_vsx(out, inp *byte, len int, key *[8]uint32, counter *uint32) +TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40 + MOVD out+0(FP), OUT + MOVD inp+8(FP), INP + MOVD len+16(FP), LEN + MOVD key+24(FP), KEY + MOVD counter+32(FP), CNT + + // Addressing for constants + MOVD $consts<>+0x00(SB), CONSTBASE + MOVD $16, R8 + MOVD $32, R9 + MOVD $48, R10 + MOVD $64, R11 + SRD $6, LEN, BLOCKS + // V16 + LXVW4X (CONSTBASE)(R0), VS48 + ADD $80,CONSTBASE + + // Load key into V17,V18 + LXVW4X (KEY)(R0), VS49 + LXVW4X (KEY)(R8), VS50 + + // Load CNT, NONCE into V19 + LXVW4X (CNT)(R0), VS51 + + // Clear V27 + VXOR V27, V27, V27 + + // V28 + LXVW4X (CONSTBASE)(R11), VS60 + + // splat slot from V19 -> V26 + VSPLTW $0, V19, V26 + + VSLDOI $4, V19, V27, V19 + VSLDOI $12, V27, V19, V19 + + VADDUWM V26, V28, V26 + + MOVD $10, R14 + MOVD R14, CTR + +loop_outer_vsx: + // V0, V1, V2, V3 + LXVW4X (R0)(CONSTBASE), VS32 + LXVW4X (R8)(CONSTBASE), VS33 + LXVW4X (R9)(CONSTBASE), VS34 + LXVW4X (R10)(CONSTBASE), VS35 + + // splat values from V17, V18 into V4-V11 + VSPLTW $0, V17, V4 + VSPLTW $1, V17, V5 + VSPLTW $2, V17, V6 + VSPLTW $3, V17, V7 + VSPLTW $0, V18, V8 + VSPLTW $1, V18, V9 + VSPLTW $2, V18, V10 + VSPLTW $3, V18, V11 + + // VOR + VOR V26, V26, V12 + + // splat values from V19 -> V13, V14, V15 + VSPLTW $1, V19, V13 + VSPLTW $2, V19, V14 + VSPLTW $3, V19, V15 + + // splat const values + VSPLTISW $-16, V27 + VSPLTISW $12, V28 + VSPLTISW $8, V29 + VSPLTISW $7, V30 + +loop_vsx: + VADDUWM V0, V4, V0 + VADDUWM V1, V5, V1 + VADDUWM V2, V6, V2 + VADDUWM V3, V7, V3 + + VXOR V12, V0, V12 + VXOR V13, V1, V13 + VXOR V14, V2, V14 + VXOR V15, V3, V15 + + VRLW V12, V27, V12 + VRLW V13, V27, V13 + VRLW V14, V27, V14 + VRLW V15, V27, V15 + + VADDUWM V8, V12, V8 + VADDUWM V9, V13, V9 + VADDUWM V10, V14, V10 + VADDUWM V11, V15, V11 + + VXOR V4, V8, V4 + VXOR V5, V9, V5 + VXOR V6, V10, V6 + VXOR V7, V11, V7 + + VRLW V4, V28, V4 + VRLW V5, V28, V5 + VRLW V6, V28, V6 + VRLW V7, V28, V7 + + VADDUWM V0, V4, V0 + VADDUWM V1, V5, V1 + VADDUWM V2, V6, V2 + VADDUWM V3, V7, V3 + + VXOR V12, V0, V12 + VXOR V13, V1, V13 + VXOR V14, V2, V14 + VXOR V15, V3, V15 + + VRLW V12, V29, V12 + VRLW V13, V29, V13 + VRLW V14, V29, V14 + VRLW V15, V29, V15 + + VADDUWM V8, V12, V8 + VADDUWM V9, V13, V9 + VADDUWM V10, V14, V10 + VADDUWM V11, V15, V11 + + VXOR V4, V8, V4 + VXOR V5, V9, V5 + VXOR V6, V10, V6 + VXOR V7, V11, V7 + + VRLW V4, V30, V4 + VRLW V5, V30, V5 + VRLW V6, V30, V6 + VRLW V7, V30, V7 + + VADDUWM V0, V5, V0 + VADDUWM V1, V6, V1 + VADDUWM V2, V7, V2 + VADDUWM V3, V4, V3 + + VXOR V15, V0, V15 + VXOR V12, V1, V12 + VXOR V13, V2, V13 + VXOR V14, V3, V14 + + VRLW V15, V27, V15 + VRLW V12, V27, V12 + VRLW V13, V27, V13 + VRLW V14, V27, V14 + + VADDUWM V10, V15, V10 + VADDUWM V11, V12, V11 + VADDUWM V8, V13, V8 + VADDUWM V9, V14, V9 + + VXOR V5, V10, V5 + VXOR V6, V11, V6 + VXOR V7, V8, V7 + VXOR V4, V9, V4 + + VRLW V5, V28, V5 + VRLW V6, V28, V6 + VRLW V7, V28, V7 + VRLW V4, V28, V4 + + VADDUWM V0, V5, V0 + VADDUWM V1, V6, V1 + VADDUWM V2, V7, V2 + VADDUWM V3, V4, V3 + + VXOR V15, V0, V15 + VXOR V12, V1, V12 + VXOR V13, V2, V13 + VXOR V14, V3, V14 + + VRLW V15, V29, V15 + VRLW V12, V29, V12 + VRLW V13, V29, V13 + VRLW V14, V29, V14 + + VADDUWM V10, V15, V10 + VADDUWM V11, V12, V11 + VADDUWM V8, V13, V8 + VADDUWM V9, V14, V9 + + VXOR V5, V10, V5 + VXOR V6, V11, V6 + VXOR V7, V8, V7 + VXOR V4, V9, V4 + + VRLW V5, V30, V5 + VRLW V6, V30, V6 + VRLW V7, V30, V7 + VRLW V4, V30, V4 + BC 16, LT, loop_vsx + + VADDUWM V12, V26, V12 + + WORD $0x13600F8C // VMRGEW V0, V1, V27 + WORD $0x13821F8C // VMRGEW V2, V3, V28 + + WORD $0x10000E8C // VMRGOW V0, V1, V0 + WORD $0x10421E8C // VMRGOW V2, V3, V2 + + WORD $0x13A42F8C // VMRGEW V4, V5, V29 + WORD $0x13C63F8C // VMRGEW V6, V7, V30 + + XXPERMDI VS32, VS34, $0, VS33 + XXPERMDI VS32, VS34, $3, VS35 + XXPERMDI VS59, VS60, $0, VS32 + XXPERMDI VS59, VS60, $3, VS34 + + WORD $0x10842E8C // VMRGOW V4, V5, V4 + WORD $0x10C63E8C // VMRGOW V6, V7, V6 + + WORD $0x13684F8C // VMRGEW V8, V9, V27 + WORD $0x138A5F8C // VMRGEW V10, V11, V28 + + XXPERMDI VS36, VS38, $0, VS37 + XXPERMDI VS36, VS38, $3, VS39 + XXPERMDI VS61, VS62, $0, VS36 + XXPERMDI VS61, VS62, $3, VS38 + + WORD $0x11084E8C // VMRGOW V8, V9, V8 + WORD $0x114A5E8C // VMRGOW V10, V11, V10 + + WORD $0x13AC6F8C // VMRGEW V12, V13, V29 + WORD $0x13CE7F8C // VMRGEW V14, V15, V30 + + XXPERMDI VS40, VS42, $0, VS41 + XXPERMDI VS40, VS42, $3, VS43 + XXPERMDI VS59, VS60, $0, VS40 + XXPERMDI VS59, VS60, $3, VS42 + + WORD $0x118C6E8C // VMRGOW V12, V13, V12 + WORD $0x11CE7E8C // VMRGOW V14, V15, V14 + + VSPLTISW $4, V27 + VADDUWM V26, V27, V26 + + XXPERMDI VS44, VS46, $0, VS45 + XXPERMDI VS44, VS46, $3, VS47 + XXPERMDI VS61, VS62, $0, VS44 + XXPERMDI VS61, VS62, $3, VS46 + + VADDUWM V0, V16, V0 + VADDUWM V4, V17, V4 + VADDUWM V8, V18, V8 + VADDUWM V12, V19, V12 + + CMPU LEN, $64 + BLT tail_vsx + + // Bottom of loop + LXVW4X (INP)(R0), VS59 + LXVW4X (INP)(R8), VS60 + LXVW4X (INP)(R9), VS61 + LXVW4X (INP)(R10), VS62 + + VXOR V27, V0, V27 + VXOR V28, V4, V28 + VXOR V29, V8, V29 + VXOR V30, V12, V30 + + STXVW4X VS59, (OUT)(R0) + STXVW4X VS60, (OUT)(R8) + ADD $64, INP + STXVW4X VS61, (OUT)(R9) + ADD $-64, LEN + STXVW4X VS62, (OUT)(R10) + ADD $64, OUT + BEQ done_vsx + + VADDUWM V1, V16, V0 + VADDUWM V5, V17, V4 + VADDUWM V9, V18, V8 + VADDUWM V13, V19, V12 + + CMPU LEN, $64 + BLT tail_vsx + + LXVW4X (INP)(R0), VS59 + LXVW4X (INP)(R8), VS60 + LXVW4X (INP)(R9), VS61 + LXVW4X (INP)(R10), VS62 + VXOR V27, V0, V27 + + VXOR V28, V4, V28 + VXOR V29, V8, V29 + VXOR V30, V12, V30 + + STXVW4X VS59, (OUT)(R0) + STXVW4X VS60, (OUT)(R8) + ADD $64, INP + STXVW4X VS61, (OUT)(R9) + ADD $-64, LEN + STXVW4X VS62, (OUT)(V10) + ADD $64, OUT + BEQ done_vsx + + VADDUWM V2, V16, V0 + VADDUWM V6, V17, V4 + VADDUWM V10, V18, V8 + VADDUWM V14, V19, V12 + + CMPU LEN, $64 + BLT tail_vsx + + LXVW4X (INP)(R0), VS59 + LXVW4X (INP)(R8), VS60 + LXVW4X (INP)(R9), VS61 + LXVW4X (INP)(R10), VS62 + + VXOR V27, V0, V27 + VXOR V28, V4, V28 + VXOR V29, V8, V29 + VXOR V30, V12, V30 + + STXVW4X VS59, (OUT)(R0) + STXVW4X VS60, (OUT)(R8) + ADD $64, INP + STXVW4X VS61, (OUT)(R9) + ADD $-64, LEN + STXVW4X VS62, (OUT)(R10) + ADD $64, OUT + BEQ done_vsx + + VADDUWM V3, V16, V0 + VADDUWM V7, V17, V4 + VADDUWM V11, V18, V8 + VADDUWM V15, V19, V12 + + CMPU LEN, $64 + BLT tail_vsx + + LXVW4X (INP)(R0), VS59 + LXVW4X (INP)(R8), VS60 + LXVW4X (INP)(R9), VS61 + LXVW4X (INP)(R10), VS62 + + VXOR V27, V0, V27 + VXOR V28, V4, V28 + VXOR V29, V8, V29 + VXOR V30, V12, V30 + + STXVW4X VS59, (OUT)(R0) + STXVW4X VS60, (OUT)(R8) + ADD $64, INP + STXVW4X VS61, (OUT)(R9) + ADD $-64, LEN + STXVW4X VS62, (OUT)(R10) + ADD $64, OUT + + MOVD $10, R14 + MOVD R14, CTR + BNE loop_outer_vsx + +done_vsx: + // Increment counter by number of 64 byte blocks + MOVD (CNT), R14 + ADD BLOCKS, R14 + MOVD R14, (CNT) + RET + +tail_vsx: + ADD $32, R1, R11 + MOVD LEN, CTR + + // Save values on stack to copy from + STXVW4X VS32, (R11)(R0) + STXVW4X VS36, (R11)(R8) + STXVW4X VS40, (R11)(R9) + STXVW4X VS44, (R11)(R10) + ADD $-1, R11, R12 + ADD $-1, INP + ADD $-1, OUT + +looptail_vsx: + // Copying the result to OUT + // in bytes. + MOVBZU 1(R12), KEY + MOVBZU 1(INP), TMP + XOR KEY, TMP, KEY + MOVBU KEY, 1(OUT) + BC 16, LT, looptail_vsx + + // Clear the stack values + STXVW4X VS48, (R11)(R0) + STXVW4X VS48, (R11)(R8) + STXVW4X VS48, (R11)(R9) + STXVW4X VS48, (R11)(R10) + BR done_vsx diff --git a/vendor/golang.org/x/crypto/chacha20/chacha_s390x.go b/vendor/golang.org/x/crypto/chacha20/chacha_s390x.go new file mode 100644 index 00000000..c5898db4 --- /dev/null +++ b/vendor/golang.org/x/crypto/chacha20/chacha_s390x.go @@ -0,0 +1,27 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build gc && !purego +// +build gc,!purego + +package chacha20 + +import "golang.org/x/sys/cpu" + +var haveAsm = cpu.S390X.HasVX + +const bufSize = 256 + +// xorKeyStreamVX is an assembly implementation of XORKeyStream. It must only +// be called when the vector facility is available. Implementation in asm_s390x.s. +//go:noescape +func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32) + +func (c *Cipher) xorKeyStreamBlocks(dst, src []byte) { + if cpu.S390X.HasVX { + xorKeyStreamVX(dst, src, &c.key, &c.nonce, &c.counter) + } else { + c.xorKeyStreamBlocksGeneric(dst, src) + } +} diff --git a/vendor/golang.org/x/crypto/chacha20/chacha_s390x.s b/vendor/golang.org/x/crypto/chacha20/chacha_s390x.s new file mode 100644 index 00000000..81816118 --- /dev/null +++ b/vendor/golang.org/x/crypto/chacha20/chacha_s390x.s @@ -0,0 +1,224 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build gc,!purego + +#include "go_asm.h" +#include "textflag.h" + +// This is an implementation of the ChaCha20 encryption algorithm as +// specified in RFC 7539. It uses vector instructions to compute +// 4 keystream blocks in parallel (256 bytes) which are then XORed +// with the bytes in the input slice. + +GLOBL ·constants<>(SB), RODATA|NOPTR, $32 +// BSWAP: swap bytes in each 4-byte element +DATA ·constants<>+0x00(SB)/4, $0x03020100 +DATA ·constants<>+0x04(SB)/4, $0x07060504 +DATA ·constants<>+0x08(SB)/4, $0x0b0a0908 +DATA ·constants<>+0x0c(SB)/4, $0x0f0e0d0c +// J0: [j0, j1, j2, j3] +DATA ·constants<>+0x10(SB)/4, $0x61707865 +DATA ·constants<>+0x14(SB)/4, $0x3320646e +DATA ·constants<>+0x18(SB)/4, $0x79622d32 +DATA ·constants<>+0x1c(SB)/4, $0x6b206574 + +#define BSWAP V5 +#define J0 V6 +#define KEY0 V7 +#define KEY1 V8 +#define NONCE V9 +#define CTR V10 +#define M0 V11 +#define M1 V12 +#define M2 V13 +#define M3 V14 +#define INC V15 +#define X0 V16 +#define X1 V17 +#define X2 V18 +#define X3 V19 +#define X4 V20 +#define X5 V21 +#define X6 V22 +#define X7 V23 +#define X8 V24 +#define X9 V25 +#define X10 V26 +#define X11 V27 +#define X12 V28 +#define X13 V29 +#define X14 V30 +#define X15 V31 + +#define NUM_ROUNDS 20 + +#define ROUND4(a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3) \ + VAF a1, a0, a0 \ + VAF b1, b0, b0 \ + VAF c1, c0, c0 \ + VAF d1, d0, d0 \ + VX a0, a2, a2 \ + VX b0, b2, b2 \ + VX c0, c2, c2 \ + VX d0, d2, d2 \ + VERLLF $16, a2, a2 \ + VERLLF $16, b2, b2 \ + VERLLF $16, c2, c2 \ + VERLLF $16, d2, d2 \ + VAF a2, a3, a3 \ + VAF b2, b3, b3 \ + VAF c2, c3, c3 \ + VAF d2, d3, d3 \ + VX a3, a1, a1 \ + VX b3, b1, b1 \ + VX c3, c1, c1 \ + VX d3, d1, d1 \ + VERLLF $12, a1, a1 \ + VERLLF $12, b1, b1 \ + VERLLF $12, c1, c1 \ + VERLLF $12, d1, d1 \ + VAF a1, a0, a0 \ + VAF b1, b0, b0 \ + VAF c1, c0, c0 \ + VAF d1, d0, d0 \ + VX a0, a2, a2 \ + VX b0, b2, b2 \ + VX c0, c2, c2 \ + VX d0, d2, d2 \ + VERLLF $8, a2, a2 \ + VERLLF $8, b2, b2 \ + VERLLF $8, c2, c2 \ + VERLLF $8, d2, d2 \ + VAF a2, a3, a3 \ + VAF b2, b3, b3 \ + VAF c2, c3, c3 \ + VAF d2, d3, d3 \ + VX a3, a1, a1 \ + VX b3, b1, b1 \ + VX c3, c1, c1 \ + VX d3, d1, d1 \ + VERLLF $7, a1, a1 \ + VERLLF $7, b1, b1 \ + VERLLF $7, c1, c1 \ + VERLLF $7, d1, d1 + +#define PERMUTE(mask, v0, v1, v2, v3) \ + VPERM v0, v0, mask, v0 \ + VPERM v1, v1, mask, v1 \ + VPERM v2, v2, mask, v2 \ + VPERM v3, v3, mask, v3 + +#define ADDV(x, v0, v1, v2, v3) \ + VAF x, v0, v0 \ + VAF x, v1, v1 \ + VAF x, v2, v2 \ + VAF x, v3, v3 + +#define XORV(off, dst, src, v0, v1, v2, v3) \ + VLM off(src), M0, M3 \ + PERMUTE(BSWAP, v0, v1, v2, v3) \ + VX v0, M0, M0 \ + VX v1, M1, M1 \ + VX v2, M2, M2 \ + VX v3, M3, M3 \ + VSTM M0, M3, off(dst) + +#define SHUFFLE(a, b, c, d, t, u, v, w) \ + VMRHF a, c, t \ // t = {a[0], c[0], a[1], c[1]} + VMRHF b, d, u \ // u = {b[0], d[0], b[1], d[1]} + VMRLF a, c, v \ // v = {a[2], c[2], a[3], c[3]} + VMRLF b, d, w \ // w = {b[2], d[2], b[3], d[3]} + VMRHF t, u, a \ // a = {a[0], b[0], c[0], d[0]} + VMRLF t, u, b \ // b = {a[1], b[1], c[1], d[1]} + VMRHF v, w, c \ // c = {a[2], b[2], c[2], d[2]} + VMRLF v, w, d // d = {a[3], b[3], c[3], d[3]} + +// func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32) +TEXT ·xorKeyStreamVX(SB), NOSPLIT, $0 + MOVD $·constants<>(SB), R1 + MOVD dst+0(FP), R2 // R2=&dst[0] + LMG src+24(FP), R3, R4 // R3=&src[0] R4=len(src) + MOVD key+48(FP), R5 // R5=key + MOVD nonce+56(FP), R6 // R6=nonce + MOVD counter+64(FP), R7 // R7=counter + + // load BSWAP and J0 + VLM (R1), BSWAP, J0 + + // setup + MOVD $95, R0 + VLM (R5), KEY0, KEY1 + VLL R0, (R6), NONCE + VZERO M0 + VLEIB $7, $32, M0 + VSRLB M0, NONCE, NONCE + + // initialize counter values + VLREPF (R7), CTR + VZERO INC + VLEIF $1, $1, INC + VLEIF $2, $2, INC + VLEIF $3, $3, INC + VAF INC, CTR, CTR + VREPIF $4, INC + +chacha: + VREPF $0, J0, X0 + VREPF $1, J0, X1 + VREPF $2, J0, X2 + VREPF $3, J0, X3 + VREPF $0, KEY0, X4 + VREPF $1, KEY0, X5 + VREPF $2, KEY0, X6 + VREPF $3, KEY0, X7 + VREPF $0, KEY1, X8 + VREPF $1, KEY1, X9 + VREPF $2, KEY1, X10 + VREPF $3, KEY1, X11 + VLR CTR, X12 + VREPF $1, NONCE, X13 + VREPF $2, NONCE, X14 + VREPF $3, NONCE, X15 + + MOVD $(NUM_ROUNDS/2), R1 + +loop: + ROUND4(X0, X4, X12, X8, X1, X5, X13, X9, X2, X6, X14, X10, X3, X7, X15, X11) + ROUND4(X0, X5, X15, X10, X1, X6, X12, X11, X2, X7, X13, X8, X3, X4, X14, X9) + + ADD $-1, R1 + BNE loop + + // decrement length + ADD $-256, R4 + + // rearrange vectors + SHUFFLE(X0, X1, X2, X3, M0, M1, M2, M3) + ADDV(J0, X0, X1, X2, X3) + SHUFFLE(X4, X5, X6, X7, M0, M1, M2, M3) + ADDV(KEY0, X4, X5, X6, X7) + SHUFFLE(X8, X9, X10, X11, M0, M1, M2, M3) + ADDV(KEY1, X8, X9, X10, X11) + VAF CTR, X12, X12 + SHUFFLE(X12, X13, X14, X15, M0, M1, M2, M3) + ADDV(NONCE, X12, X13, X14, X15) + + // increment counters + VAF INC, CTR, CTR + + // xor keystream with plaintext + XORV(0*64, R2, R3, X0, X4, X8, X12) + XORV(1*64, R2, R3, X1, X5, X9, X13) + XORV(2*64, R2, R3, X2, X6, X10, X14) + XORV(3*64, R2, R3, X3, X7, X11, X15) + + // increment pointers + MOVD $256(R2), R2 + MOVD $256(R3), R3 + + CMPBNE R4, $0, chacha + + VSTEF $0, CTR, (R7) + RET diff --git a/vendor/golang.org/x/crypto/chacha20/xor.go b/vendor/golang.org/x/crypto/chacha20/xor.go new file mode 100644 index 00000000..c2d04851 --- /dev/null +++ b/vendor/golang.org/x/crypto/chacha20/xor.go @@ -0,0 +1,42 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found src the LICENSE file. + +package chacha20 + +import "runtime" + +// Platforms that have fast unaligned 32-bit little endian accesses. +const unaligned = runtime.GOARCH == "386" || + runtime.GOARCH == "amd64" || + runtime.GOARCH == "arm64" || + runtime.GOARCH == "ppc64le" || + runtime.GOARCH == "s390x" + +// addXor reads a little endian uint32 from src, XORs it with (a + b) and +// places the result in little endian byte order in dst. +func addXor(dst, src []byte, a, b uint32) { + _, _ = src[3], dst[3] // bounds check elimination hint + if unaligned { + // The compiler should optimize this code into + // 32-bit unaligned little endian loads and stores. + // TODO: delete once the compiler does a reliably + // good job with the generic code below. + // See issue #25111 for more details. + v := uint32(src[0]) + v |= uint32(src[1]) << 8 + v |= uint32(src[2]) << 16 + v |= uint32(src[3]) << 24 + v ^= a + b + dst[0] = byte(v) + dst[1] = byte(v >> 8) + dst[2] = byte(v >> 16) + dst[3] = byte(v >> 24) + } else { + a += b + dst[0] = src[0] ^ byte(a) + dst[1] = src[1] ^ byte(a>>8) + dst[2] = src[2] ^ byte(a>>16) + dst[3] = src[3] ^ byte(a>>24) + } +} diff --git a/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305.go b/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305.go new file mode 100644 index 00000000..0d7bac3f --- /dev/null +++ b/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305.go @@ -0,0 +1,94 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package chacha20poly1305 implements the ChaCha20-Poly1305 AEAD and its +// extended nonce variant XChaCha20-Poly1305, as specified in RFC 8439 and +// draft-irtf-cfrg-xchacha-01. +package chacha20poly1305 // import "golang.org/x/crypto/chacha20poly1305" + +import ( + "crypto/cipher" + "errors" +) + +const ( + // KeySize is the size of the key used by this AEAD, in bytes. + KeySize = 32 + + // NonceSize is the size of the nonce used with the standard variant of this + // AEAD, in bytes. + // + // Note that this is too short to be safely generated at random if the same + // key is reused more than 2³² times. + NonceSize = 12 + + // NonceSizeX is the size of the nonce used with the XChaCha20-Poly1305 + // variant of this AEAD, in bytes. + NonceSizeX = 24 +) + +type chacha20poly1305 struct { + key [KeySize]byte +} + +// New returns a ChaCha20-Poly1305 AEAD that uses the given 256-bit key. +func New(key []byte) (cipher.AEAD, error) { + if len(key) != KeySize { + return nil, errors.New("chacha20poly1305: bad key length") + } + ret := new(chacha20poly1305) + copy(ret.key[:], key) + return ret, nil +} + +func (c *chacha20poly1305) NonceSize() int { + return NonceSize +} + +func (c *chacha20poly1305) Overhead() int { + return 16 +} + +func (c *chacha20poly1305) Seal(dst, nonce, plaintext, additionalData []byte) []byte { + if len(nonce) != NonceSize { + panic("chacha20poly1305: bad nonce length passed to Seal") + } + + if uint64(len(plaintext)) > (1<<38)-64 { + panic("chacha20poly1305: plaintext too large") + } + + return c.seal(dst, nonce, plaintext, additionalData) +} + +var errOpen = errors.New("chacha20poly1305: message authentication failed") + +func (c *chacha20poly1305) Open(dst, nonce, ciphertext, additionalData []byte) ([]byte, error) { + if len(nonce) != NonceSize { + panic("chacha20poly1305: bad nonce length passed to Open") + } + if len(ciphertext) < 16 { + return nil, errOpen + } + if uint64(len(ciphertext)) > (1<<38)-48 { + panic("chacha20poly1305: ciphertext too large") + } + + return c.open(dst, nonce, ciphertext, additionalData) +} + +// sliceForAppend takes a slice and a requested number of bytes. It returns a +// slice with the contents of the given slice followed by that many bytes and a +// second slice that aliases into it and contains only the extra bytes. If the +// original slice has sufficient capacity then no allocation is performed. +func sliceForAppend(in []byte, n int) (head, tail []byte) { + if total := len(in) + n; cap(in) >= total { + head = in[:total] + } else { + head = make([]byte, total) + copy(head, in) + } + tail = head[len(in):] + return +} diff --git a/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.go b/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.go new file mode 100644 index 00000000..25959b9a --- /dev/null +++ b/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.go @@ -0,0 +1,87 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build gc && !purego +// +build gc,!purego + +package chacha20poly1305 + +import ( + "encoding/binary" + + "golang.org/x/crypto/internal/subtle" + "golang.org/x/sys/cpu" +) + +//go:noescape +func chacha20Poly1305Open(dst []byte, key []uint32, src, ad []byte) bool + +//go:noescape +func chacha20Poly1305Seal(dst []byte, key []uint32, src, ad []byte) + +var ( + useAVX2 = cpu.X86.HasAVX2 && cpu.X86.HasBMI2 +) + +// setupState writes a ChaCha20 input matrix to state. See +// https://tools.ietf.org/html/rfc7539#section-2.3. +func setupState(state *[16]uint32, key *[32]byte, nonce []byte) { + state[0] = 0x61707865 + state[1] = 0x3320646e + state[2] = 0x79622d32 + state[3] = 0x6b206574 + + state[4] = binary.LittleEndian.Uint32(key[0:4]) + state[5] = binary.LittleEndian.Uint32(key[4:8]) + state[6] = binary.LittleEndian.Uint32(key[8:12]) + state[7] = binary.LittleEndian.Uint32(key[12:16]) + state[8] = binary.LittleEndian.Uint32(key[16:20]) + state[9] = binary.LittleEndian.Uint32(key[20:24]) + state[10] = binary.LittleEndian.Uint32(key[24:28]) + state[11] = binary.LittleEndian.Uint32(key[28:32]) + + state[12] = 0 + state[13] = binary.LittleEndian.Uint32(nonce[0:4]) + state[14] = binary.LittleEndian.Uint32(nonce[4:8]) + state[15] = binary.LittleEndian.Uint32(nonce[8:12]) +} + +func (c *chacha20poly1305) seal(dst, nonce, plaintext, additionalData []byte) []byte { + if !cpu.X86.HasSSSE3 { + return c.sealGeneric(dst, nonce, plaintext, additionalData) + } + + var state [16]uint32 + setupState(&state, &c.key, nonce) + + ret, out := sliceForAppend(dst, len(plaintext)+16) + if subtle.InexactOverlap(out, plaintext) { + panic("chacha20poly1305: invalid buffer overlap") + } + chacha20Poly1305Seal(out[:], state[:], plaintext, additionalData) + return ret +} + +func (c *chacha20poly1305) open(dst, nonce, ciphertext, additionalData []byte) ([]byte, error) { + if !cpu.X86.HasSSSE3 { + return c.openGeneric(dst, nonce, ciphertext, additionalData) + } + + var state [16]uint32 + setupState(&state, &c.key, nonce) + + ciphertext = ciphertext[:len(ciphertext)-16] + ret, out := sliceForAppend(dst, len(ciphertext)) + if subtle.InexactOverlap(out, ciphertext) { + panic("chacha20poly1305: invalid buffer overlap") + } + if !chacha20Poly1305Open(out, state[:], ciphertext, additionalData) { + for i := range out { + out[i] = 0 + } + return nil, errOpen + } + + return ret, nil +} diff --git a/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.s b/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.s new file mode 100644 index 00000000..55226b0e --- /dev/null +++ b/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.s @@ -0,0 +1,2695 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// This file was originally from https://golang.org/cl/24717 by Vlad Krasnov of CloudFlare. + +// +build gc,!purego + +#include "textflag.h" +// General register allocation +#define oup DI +#define inp SI +#define inl BX +#define adp CX // free to reuse, after we hash the additional data +#define keyp R8 // free to reuse, when we copy the key to stack +#define itr2 R9 // general iterator +#define itr1 CX // general iterator +#define acc0 R10 +#define acc1 R11 +#define acc2 R12 +#define t0 R13 +#define t1 R14 +#define t2 R15 +#define t3 R8 +// Register and stack allocation for the SSE code +#define rStore (0*16)(BP) +#define sStore (1*16)(BP) +#define state1Store (2*16)(BP) +#define state2Store (3*16)(BP) +#define tmpStore (4*16)(BP) +#define ctr0Store (5*16)(BP) +#define ctr1Store (6*16)(BP) +#define ctr2Store (7*16)(BP) +#define ctr3Store (8*16)(BP) +#define A0 X0 +#define A1 X1 +#define A2 X2 +#define B0 X3 +#define B1 X4 +#define B2 X5 +#define C0 X6 +#define C1 X7 +#define C2 X8 +#define D0 X9 +#define D1 X10 +#define D2 X11 +#define T0 X12 +#define T1 X13 +#define T2 X14 +#define T3 X15 +#define A3 T0 +#define B3 T1 +#define C3 T2 +#define D3 T3 +// Register and stack allocation for the AVX2 code +#define rsStoreAVX2 (0*32)(BP) +#define state1StoreAVX2 (1*32)(BP) +#define state2StoreAVX2 (2*32)(BP) +#define ctr0StoreAVX2 (3*32)(BP) +#define ctr1StoreAVX2 (4*32)(BP) +#define ctr2StoreAVX2 (5*32)(BP) +#define ctr3StoreAVX2 (6*32)(BP) +#define tmpStoreAVX2 (7*32)(BP) // 256 bytes on stack +#define AA0 Y0 +#define AA1 Y5 +#define AA2 Y6 +#define AA3 Y7 +#define BB0 Y14 +#define BB1 Y9 +#define BB2 Y10 +#define BB3 Y11 +#define CC0 Y12 +#define CC1 Y13 +#define CC2 Y8 +#define CC3 Y15 +#define DD0 Y4 +#define DD1 Y1 +#define DD2 Y2 +#define DD3 Y3 +#define TT0 DD3 +#define TT1 AA3 +#define TT2 BB3 +#define TT3 CC3 +// ChaCha20 constants +DATA ·chacha20Constants<>+0x00(SB)/4, $0x61707865 +DATA ·chacha20Constants<>+0x04(SB)/4, $0x3320646e +DATA ·chacha20Constants<>+0x08(SB)/4, $0x79622d32 +DATA ·chacha20Constants<>+0x0c(SB)/4, $0x6b206574 +DATA ·chacha20Constants<>+0x10(SB)/4, $0x61707865 +DATA ·chacha20Constants<>+0x14(SB)/4, $0x3320646e +DATA ·chacha20Constants<>+0x18(SB)/4, $0x79622d32 +DATA ·chacha20Constants<>+0x1c(SB)/4, $0x6b206574 +// <<< 16 with PSHUFB +DATA ·rol16<>+0x00(SB)/8, $0x0504070601000302 +DATA ·rol16<>+0x08(SB)/8, $0x0D0C0F0E09080B0A +DATA ·rol16<>+0x10(SB)/8, $0x0504070601000302 +DATA ·rol16<>+0x18(SB)/8, $0x0D0C0F0E09080B0A +// <<< 8 with PSHUFB +DATA ·rol8<>+0x00(SB)/8, $0x0605040702010003 +DATA ·rol8<>+0x08(SB)/8, $0x0E0D0C0F0A09080B +DATA ·rol8<>+0x10(SB)/8, $0x0605040702010003 +DATA ·rol8<>+0x18(SB)/8, $0x0E0D0C0F0A09080B + +DATA ·avx2InitMask<>+0x00(SB)/8, $0x0 +DATA ·avx2InitMask<>+0x08(SB)/8, $0x0 +DATA ·avx2InitMask<>+0x10(SB)/8, $0x1 +DATA ·avx2InitMask<>+0x18(SB)/8, $0x0 + +DATA ·avx2IncMask<>+0x00(SB)/8, $0x2 +DATA ·avx2IncMask<>+0x08(SB)/8, $0x0 +DATA ·avx2IncMask<>+0x10(SB)/8, $0x2 +DATA ·avx2IncMask<>+0x18(SB)/8, $0x0 +// Poly1305 key clamp +DATA ·polyClampMask<>+0x00(SB)/8, $0x0FFFFFFC0FFFFFFF +DATA ·polyClampMask<>+0x08(SB)/8, $0x0FFFFFFC0FFFFFFC +DATA ·polyClampMask<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF +DATA ·polyClampMask<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF + +DATA ·sseIncMask<>+0x00(SB)/8, $0x1 +DATA ·sseIncMask<>+0x08(SB)/8, $0x0 +// To load/store the last < 16 bytes in a buffer +DATA ·andMask<>+0x00(SB)/8, $0x00000000000000ff +DATA ·andMask<>+0x08(SB)/8, $0x0000000000000000 +DATA ·andMask<>+0x10(SB)/8, $0x000000000000ffff +DATA ·andMask<>+0x18(SB)/8, $0x0000000000000000 +DATA ·andMask<>+0x20(SB)/8, $0x0000000000ffffff +DATA ·andMask<>+0x28(SB)/8, $0x0000000000000000 +DATA ·andMask<>+0x30(SB)/8, $0x00000000ffffffff +DATA ·andMask<>+0x38(SB)/8, $0x0000000000000000 +DATA ·andMask<>+0x40(SB)/8, $0x000000ffffffffff +DATA ·andMask<>+0x48(SB)/8, $0x0000000000000000 +DATA ·andMask<>+0x50(SB)/8, $0x0000ffffffffffff +DATA ·andMask<>+0x58(SB)/8, $0x0000000000000000 +DATA ·andMask<>+0x60(SB)/8, $0x00ffffffffffffff +DATA ·andMask<>+0x68(SB)/8, $0x0000000000000000 +DATA ·andMask<>+0x70(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+0x78(SB)/8, $0x0000000000000000 +DATA ·andMask<>+0x80(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+0x88(SB)/8, $0x00000000000000ff +DATA ·andMask<>+0x90(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+0x98(SB)/8, $0x000000000000ffff +DATA ·andMask<>+0xa0(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+0xa8(SB)/8, $0x0000000000ffffff +DATA ·andMask<>+0xb0(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+0xb8(SB)/8, $0x00000000ffffffff +DATA ·andMask<>+0xc0(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+0xc8(SB)/8, $0x000000ffffffffff +DATA ·andMask<>+0xd0(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+0xd8(SB)/8, $0x0000ffffffffffff +DATA ·andMask<>+0xe0(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+0xe8(SB)/8, $0x00ffffffffffffff + +GLOBL ·chacha20Constants<>(SB), (NOPTR+RODATA), $32 +GLOBL ·rol16<>(SB), (NOPTR+RODATA), $32 +GLOBL ·rol8<>(SB), (NOPTR+RODATA), $32 +GLOBL ·sseIncMask<>(SB), (NOPTR+RODATA), $16 +GLOBL ·avx2IncMask<>(SB), (NOPTR+RODATA), $32 +GLOBL ·avx2InitMask<>(SB), (NOPTR+RODATA), $32 +GLOBL ·polyClampMask<>(SB), (NOPTR+RODATA), $32 +GLOBL ·andMask<>(SB), (NOPTR+RODATA), $240 +// No PALIGNR in Go ASM yet (but VPALIGNR is present). +#define shiftB0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X3, X3 +#define shiftB1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x04 // PALIGNR $4, X4, X4 +#define shiftB2Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x04 // PALIGNR $4, X5, X5 +#define shiftB3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x04 // PALIGNR $4, X13, X13 +#define shiftC0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xf6; BYTE $0x08 // PALIGNR $8, X6, X6 +#define shiftC1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x08 // PALIGNR $8, X7, X7 +#define shiftC2Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc0; BYTE $0x08 // PALIGNR $8, X8, X8 +#define shiftC3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xf6; BYTE $0x08 // PALIGNR $8, X14, X14 +#define shiftD0Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc9; BYTE $0x0c // PALIGNR $12, X9, X9 +#define shiftD1Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xd2; BYTE $0x0c // PALIGNR $12, X10, X10 +#define shiftD2Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x0c // PALIGNR $12, X11, X11 +#define shiftD3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x0c // PALIGNR $12, X15, X15 +#define shiftB0Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x0c // PALIGNR $12, X3, X3 +#define shiftB1Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x0c // PALIGNR $12, X4, X4 +#define shiftB2Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x0c // PALIGNR $12, X5, X5 +#define shiftB3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x0c // PALIGNR $12, X13, X13 +#define shiftC0Right shiftC0Left +#define shiftC1Right shiftC1Left +#define shiftC2Right shiftC2Left +#define shiftC3Right shiftC3Left +#define shiftD0Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc9; BYTE $0x04 // PALIGNR $4, X9, X9 +#define shiftD1Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xd2; BYTE $0x04 // PALIGNR $4, X10, X10 +#define shiftD2Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X11, X11 +#define shiftD3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x04 // PALIGNR $4, X15, X15 +// Some macros +#define chachaQR(A, B, C, D, T) \ + PADDD B, A; PXOR A, D; PSHUFB ·rol16<>(SB), D \ + PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $12, T; PSRLL $20, B; PXOR T, B \ + PADDD B, A; PXOR A, D; PSHUFB ·rol8<>(SB), D \ + PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $7, T; PSRLL $25, B; PXOR T, B + +#define chachaQR_AVX2(A, B, C, D, T) \ + VPADDD B, A, A; VPXOR A, D, D; VPSHUFB ·rol16<>(SB), D, D \ + VPADDD D, C, C; VPXOR C, B, B; VPSLLD $12, B, T; VPSRLD $20, B, B; VPXOR T, B, B \ + VPADDD B, A, A; VPXOR A, D, D; VPSHUFB ·rol8<>(SB), D, D \ + VPADDD D, C, C; VPXOR C, B, B; VPSLLD $7, B, T; VPSRLD $25, B, B; VPXOR T, B, B + +#define polyAdd(S) ADDQ S, acc0; ADCQ 8+S, acc1; ADCQ $1, acc2 +#define polyMulStage1 MOVQ (0*8)(BP), AX; MOVQ AX, t2; MULQ acc0; MOVQ AX, t0; MOVQ DX, t1; MOVQ (0*8)(BP), AX; MULQ acc1; IMULQ acc2, t2; ADDQ AX, t1; ADCQ DX, t2 +#define polyMulStage2 MOVQ (1*8)(BP), AX; MOVQ AX, t3; MULQ acc0; ADDQ AX, t1; ADCQ $0, DX; MOVQ DX, acc0; MOVQ (1*8)(BP), AX; MULQ acc1; ADDQ AX, t2; ADCQ $0, DX +#define polyMulStage3 IMULQ acc2, t3; ADDQ acc0, t2; ADCQ DX, t3 +#define polyMulReduceStage MOVQ t0, acc0; MOVQ t1, acc1; MOVQ t2, acc2; ANDQ $3, acc2; MOVQ t2, t0; ANDQ $-4, t0; MOVQ t3, t1; SHRQ $2, t3, t2; SHRQ $2, t3; ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $0, acc2; ADDQ t2, acc0; ADCQ t3, acc1; ADCQ $0, acc2 + +#define polyMulStage1_AVX2 MOVQ (0*8)(BP), DX; MOVQ DX, t2; MULXQ acc0, t0, t1; IMULQ acc2, t2; MULXQ acc1, AX, DX; ADDQ AX, t1; ADCQ DX, t2 +#define polyMulStage2_AVX2 MOVQ (1*8)(BP), DX; MULXQ acc0, acc0, AX; ADDQ acc0, t1; MULXQ acc1, acc1, t3; ADCQ acc1, t2; ADCQ $0, t3 +#define polyMulStage3_AVX2 IMULQ acc2, DX; ADDQ AX, t2; ADCQ DX, t3 + +#define polyMul polyMulStage1; polyMulStage2; polyMulStage3; polyMulReduceStage +#define polyMulAVX2 polyMulStage1_AVX2; polyMulStage2_AVX2; polyMulStage3_AVX2; polyMulReduceStage +// ---------------------------------------------------------------------------- +TEXT polyHashADInternal<>(SB), NOSPLIT, $0 + // adp points to beginning of additional data + // itr2 holds ad length + XORQ acc0, acc0 + XORQ acc1, acc1 + XORQ acc2, acc2 + CMPQ itr2, $13 + JNE hashADLoop + +openFastTLSAD: + // Special treatment for the TLS case of 13 bytes + MOVQ (adp), acc0 + MOVQ 5(adp), acc1 + SHRQ $24, acc1 + MOVQ $1, acc2 + polyMul + RET + +hashADLoop: + // Hash in 16 byte chunks + CMPQ itr2, $16 + JB hashADTail + polyAdd(0(adp)) + LEAQ (1*16)(adp), adp + SUBQ $16, itr2 + polyMul + JMP hashADLoop + +hashADTail: + CMPQ itr2, $0 + JE hashADDone + + // Hash last < 16 byte tail + XORQ t0, t0 + XORQ t1, t1 + XORQ t2, t2 + ADDQ itr2, adp + +hashADTailLoop: + SHLQ $8, t0, t1 + SHLQ $8, t0 + MOVB -1(adp), t2 + XORQ t2, t0 + DECQ adp + DECQ itr2 + JNE hashADTailLoop + +hashADTailFinish: + ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2 + polyMul + + // Finished AD +hashADDone: + RET + +// ---------------------------------------------------------------------------- +// func chacha20Poly1305Open(dst, key, src, ad []byte) bool +TEXT ·chacha20Poly1305Open(SB), 0, $288-97 + // For aligned stack access + MOVQ SP, BP + ADDQ $32, BP + ANDQ $-32, BP + MOVQ dst+0(FP), oup + MOVQ key+24(FP), keyp + MOVQ src+48(FP), inp + MOVQ src_len+56(FP), inl + MOVQ ad+72(FP), adp + + // Check for AVX2 support + CMPB ·useAVX2(SB), $1 + JE chacha20Poly1305Open_AVX2 + + // Special optimization, for very short buffers + CMPQ inl, $128 + JBE openSSE128 // About 16% faster + + // For long buffers, prepare the poly key first + MOVOU ·chacha20Constants<>(SB), A0 + MOVOU (1*16)(keyp), B0 + MOVOU (2*16)(keyp), C0 + MOVOU (3*16)(keyp), D0 + MOVO D0, T1 + + // Store state on stack for future use + MOVO B0, state1Store + MOVO C0, state2Store + MOVO D0, ctr3Store + MOVQ $10, itr2 + +openSSEPreparePolyKey: + chachaQR(A0, B0, C0, D0, T0) + shiftB0Left; shiftC0Left; shiftD0Left + chachaQR(A0, B0, C0, D0, T0) + shiftB0Right; shiftC0Right; shiftD0Right + DECQ itr2 + JNE openSSEPreparePolyKey + + // A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded + PADDL ·chacha20Constants<>(SB), A0; PADDL state1Store, B0 + + // Clamp and store the key + PAND ·polyClampMask<>(SB), A0 + MOVO A0, rStore; MOVO B0, sStore + + // Hash AAD + MOVQ ad_len+80(FP), itr2 + CALL polyHashADInternal<>(SB) + +openSSEMainLoop: + CMPQ inl, $256 + JB openSSEMainLoopDone + + // Load state, increment counter blocks + MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0 + MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 + MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2 + MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3 + + // Store counters + MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store + + // There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we hash 2 blocks, and for the remaining 4 only 1 block - for a total of 16 + MOVQ $4, itr1 + MOVQ inp, itr2 + +openSSEInternalLoop: + MOVO C3, tmpStore + chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) + MOVO tmpStore, C3 + MOVO C1, tmpStore + chachaQR(A3, B3, C3, D3, C1) + MOVO tmpStore, C1 + polyAdd(0(itr2)) + shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left + shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left + shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left + polyMulStage1 + polyMulStage2 + LEAQ (2*8)(itr2), itr2 + MOVO C3, tmpStore + chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) + MOVO tmpStore, C3 + MOVO C1, tmpStore + polyMulStage3 + chachaQR(A3, B3, C3, D3, C1) + MOVO tmpStore, C1 + polyMulReduceStage + shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right + shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right + shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right + DECQ itr1 + JGE openSSEInternalLoop + + polyAdd(0(itr2)) + polyMul + LEAQ (2*8)(itr2), itr2 + + CMPQ itr1, $-6 + JG openSSEInternalLoop + + // Add in the state + PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3 + PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3 + PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3 + PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3 + + // Load - xor - store + MOVO D3, tmpStore + MOVOU (0*16)(inp), D3; PXOR D3, A0; MOVOU A0, (0*16)(oup) + MOVOU (1*16)(inp), D3; PXOR D3, B0; MOVOU B0, (1*16)(oup) + MOVOU (2*16)(inp), D3; PXOR D3, C0; MOVOU C0, (2*16)(oup) + MOVOU (3*16)(inp), D3; PXOR D3, D0; MOVOU D0, (3*16)(oup) + MOVOU (4*16)(inp), D0; PXOR D0, A1; MOVOU A1, (4*16)(oup) + MOVOU (5*16)(inp), D0; PXOR D0, B1; MOVOU B1, (5*16)(oup) + MOVOU (6*16)(inp), D0; PXOR D0, C1; MOVOU C1, (6*16)(oup) + MOVOU (7*16)(inp), D0; PXOR D0, D1; MOVOU D1, (7*16)(oup) + MOVOU (8*16)(inp), D0; PXOR D0, A2; MOVOU A2, (8*16)(oup) + MOVOU (9*16)(inp), D0; PXOR D0, B2; MOVOU B2, (9*16)(oup) + MOVOU (10*16)(inp), D0; PXOR D0, C2; MOVOU C2, (10*16)(oup) + MOVOU (11*16)(inp), D0; PXOR D0, D2; MOVOU D2, (11*16)(oup) + MOVOU (12*16)(inp), D0; PXOR D0, A3; MOVOU A3, (12*16)(oup) + MOVOU (13*16)(inp), D0; PXOR D0, B3; MOVOU B3, (13*16)(oup) + MOVOU (14*16)(inp), D0; PXOR D0, C3; MOVOU C3, (14*16)(oup) + MOVOU (15*16)(inp), D0; PXOR tmpStore, D0; MOVOU D0, (15*16)(oup) + LEAQ 256(inp), inp + LEAQ 256(oup), oup + SUBQ $256, inl + JMP openSSEMainLoop + +openSSEMainLoopDone: + // Handle the various tail sizes efficiently + TESTQ inl, inl + JE openSSEFinalize + CMPQ inl, $64 + JBE openSSETail64 + CMPQ inl, $128 + JBE openSSETail128 + CMPQ inl, $192 + JBE openSSETail192 + JMP openSSETail256 + +openSSEFinalize: + // Hash in the PT, AAD lengths + ADDQ ad_len+80(FP), acc0; ADCQ src_len+56(FP), acc1; ADCQ $1, acc2 + polyMul + + // Final reduce + MOVQ acc0, t0 + MOVQ acc1, t1 + MOVQ acc2, t2 + SUBQ $-5, acc0 + SBBQ $-1, acc1 + SBBQ $3, acc2 + CMOVQCS t0, acc0 + CMOVQCS t1, acc1 + CMOVQCS t2, acc2 + + // Add in the "s" part of the key + ADDQ 0+sStore, acc0 + ADCQ 8+sStore, acc1 + + // Finally, constant time compare to the tag at the end of the message + XORQ AX, AX + MOVQ $1, DX + XORQ (0*8)(inp), acc0 + XORQ (1*8)(inp), acc1 + ORQ acc1, acc0 + CMOVQEQ DX, AX + + // Return true iff tags are equal + MOVB AX, ret+96(FP) + RET + +// ---------------------------------------------------------------------------- +// Special optimization for buffers smaller than 129 bytes +openSSE128: + // For up to 128 bytes of ciphertext and 64 bytes for the poly key, we require to process three blocks + MOVOU ·chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0 + MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 + MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2 + MOVO B0, T1; MOVO C0, T2; MOVO D1, T3 + MOVQ $10, itr2 + +openSSE128InnerCipherLoop: + chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) + shiftB0Left; shiftB1Left; shiftB2Left + shiftC0Left; shiftC1Left; shiftC2Left + shiftD0Left; shiftD1Left; shiftD2Left + chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) + shiftB0Right; shiftB1Right; shiftB2Right + shiftC0Right; shiftC1Right; shiftC2Right + shiftD0Right; shiftD1Right; shiftD2Right + DECQ itr2 + JNE openSSE128InnerCipherLoop + + // A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded + PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2 + PADDL T1, B0; PADDL T1, B1; PADDL T1, B2 + PADDL T2, C1; PADDL T2, C2 + PADDL T3, D1; PADDL ·sseIncMask<>(SB), T3; PADDL T3, D2 + + // Clamp and store the key + PAND ·polyClampMask<>(SB), A0 + MOVOU A0, rStore; MOVOU B0, sStore + + // Hash + MOVQ ad_len+80(FP), itr2 + CALL polyHashADInternal<>(SB) + +openSSE128Open: + CMPQ inl, $16 + JB openSSETail16 + SUBQ $16, inl + + // Load for hashing + polyAdd(0(inp)) + + // Load for decryption + MOVOU (inp), T0; PXOR T0, A1; MOVOU A1, (oup) + LEAQ (1*16)(inp), inp + LEAQ (1*16)(oup), oup + polyMul + + // Shift the stream "left" + MOVO B1, A1 + MOVO C1, B1 + MOVO D1, C1 + MOVO A2, D1 + MOVO B2, A2 + MOVO C2, B2 + MOVO D2, C2 + JMP openSSE128Open + +openSSETail16: + TESTQ inl, inl + JE openSSEFinalize + + // We can safely load the CT from the end, because it is padded with the MAC + MOVQ inl, itr2 + SHLQ $4, itr2 + LEAQ ·andMask<>(SB), t0 + MOVOU (inp), T0 + ADDQ inl, inp + PAND -16(t0)(itr2*1), T0 + MOVO T0, 0+tmpStore + MOVQ T0, t0 + MOVQ 8+tmpStore, t1 + PXOR A1, T0 + + // We can only store one byte at a time, since plaintext can be shorter than 16 bytes +openSSETail16Store: + MOVQ T0, t3 + MOVB t3, (oup) + PSRLDQ $1, T0 + INCQ oup + DECQ inl + JNE openSSETail16Store + ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2 + polyMul + JMP openSSEFinalize + +// ---------------------------------------------------------------------------- +// Special optimization for the last 64 bytes of ciphertext +openSSETail64: + // Need to decrypt up to 64 bytes - prepare single block + MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store + XORQ itr2, itr2 + MOVQ inl, itr1 + CMPQ itr1, $16 + JB openSSETail64LoopB + +openSSETail64LoopA: + // Perform ChaCha rounds, while hashing the remaining input + polyAdd(0(inp)(itr2*1)) + polyMul + SUBQ $16, itr1 + +openSSETail64LoopB: + ADDQ $16, itr2 + chachaQR(A0, B0, C0, D0, T0) + shiftB0Left; shiftC0Left; shiftD0Left + chachaQR(A0, B0, C0, D0, T0) + shiftB0Right; shiftC0Right; shiftD0Right + + CMPQ itr1, $16 + JAE openSSETail64LoopA + + CMPQ itr2, $160 + JNE openSSETail64LoopB + + PADDL ·chacha20Constants<>(SB), A0; PADDL state1Store, B0; PADDL state2Store, C0; PADDL ctr0Store, D0 + +openSSETail64DecLoop: + CMPQ inl, $16 + JB openSSETail64DecLoopDone + SUBQ $16, inl + MOVOU (inp), T0 + PXOR T0, A0 + MOVOU A0, (oup) + LEAQ 16(inp), inp + LEAQ 16(oup), oup + MOVO B0, A0 + MOVO C0, B0 + MOVO D0, C0 + JMP openSSETail64DecLoop + +openSSETail64DecLoopDone: + MOVO A0, A1 + JMP openSSETail16 + +// ---------------------------------------------------------------------------- +// Special optimization for the last 128 bytes of ciphertext +openSSETail128: + // Need to decrypt up to 128 bytes - prepare two blocks + MOVO ·chacha20Constants<>(SB), A1; MOVO state1Store, B1; MOVO state2Store, C1; MOVO ctr3Store, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr0Store + MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr1Store + XORQ itr2, itr2 + MOVQ inl, itr1 + ANDQ $-16, itr1 + +openSSETail128LoopA: + // Perform ChaCha rounds, while hashing the remaining input + polyAdd(0(inp)(itr2*1)) + polyMul + +openSSETail128LoopB: + ADDQ $16, itr2 + chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0) + shiftB0Left; shiftC0Left; shiftD0Left + shiftB1Left; shiftC1Left; shiftD1Left + chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0) + shiftB0Right; shiftC0Right; shiftD0Right + shiftB1Right; shiftC1Right; shiftD1Right + + CMPQ itr2, itr1 + JB openSSETail128LoopA + + CMPQ itr2, $160 + JNE openSSETail128LoopB + + PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1 + PADDL state1Store, B0; PADDL state1Store, B1 + PADDL state2Store, C0; PADDL state2Store, C1 + PADDL ctr1Store, D0; PADDL ctr0Store, D1 + + MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3 + PXOR T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1 + MOVOU A1, (0*16)(oup); MOVOU B1, (1*16)(oup); MOVOU C1, (2*16)(oup); MOVOU D1, (3*16)(oup) + + SUBQ $64, inl + LEAQ 64(inp), inp + LEAQ 64(oup), oup + JMP openSSETail64DecLoop + +// ---------------------------------------------------------------------------- +// Special optimization for the last 192 bytes of ciphertext +openSSETail192: + // Need to decrypt up to 192 bytes - prepare three blocks + MOVO ·chacha20Constants<>(SB), A2; MOVO state1Store, B2; MOVO state2Store, C2; MOVO ctr3Store, D2; PADDL ·sseIncMask<>(SB), D2; MOVO D2, ctr0Store + MOVO A2, A1; MOVO B2, B1; MOVO C2, C1; MOVO D2, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store + MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr2Store + + MOVQ inl, itr1 + MOVQ $160, itr2 + CMPQ itr1, $160 + CMOVQGT itr2, itr1 + ANDQ $-16, itr1 + XORQ itr2, itr2 + +openSSLTail192LoopA: + // Perform ChaCha rounds, while hashing the remaining input + polyAdd(0(inp)(itr2*1)) + polyMul + +openSSLTail192LoopB: + ADDQ $16, itr2 + chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) + shiftB0Left; shiftC0Left; shiftD0Left + shiftB1Left; shiftC1Left; shiftD1Left + shiftB2Left; shiftC2Left; shiftD2Left + + chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) + shiftB0Right; shiftC0Right; shiftD0Right + shiftB1Right; shiftC1Right; shiftD1Right + shiftB2Right; shiftC2Right; shiftD2Right + + CMPQ itr2, itr1 + JB openSSLTail192LoopA + + CMPQ itr2, $160 + JNE openSSLTail192LoopB + + CMPQ inl, $176 + JB openSSLTail192Store + + polyAdd(160(inp)) + polyMul + + CMPQ inl, $192 + JB openSSLTail192Store + + polyAdd(176(inp)) + polyMul + +openSSLTail192Store: + PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2 + PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2 + PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2 + PADDL ctr2Store, D0; PADDL ctr1Store, D1; PADDL ctr0Store, D2 + + MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3 + PXOR T0, A2; PXOR T1, B2; PXOR T2, C2; PXOR T3, D2 + MOVOU A2, (0*16)(oup); MOVOU B2, (1*16)(oup); MOVOU C2, (2*16)(oup); MOVOU D2, (3*16)(oup) + + MOVOU (4*16)(inp), T0; MOVOU (5*16)(inp), T1; MOVOU (6*16)(inp), T2; MOVOU (7*16)(inp), T3 + PXOR T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1 + MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup) + + SUBQ $128, inl + LEAQ 128(inp), inp + LEAQ 128(oup), oup + JMP openSSETail64DecLoop + +// ---------------------------------------------------------------------------- +// Special optimization for the last 256 bytes of ciphertext +openSSETail256: + // Need to decrypt up to 256 bytes - prepare four blocks + MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0 + MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 + MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2 + MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3 + + // Store counters + MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store + XORQ itr2, itr2 + +openSSETail256Loop: + // This loop inteleaves 8 ChaCha quarter rounds with 1 poly multiplication + polyAdd(0(inp)(itr2*1)) + MOVO C3, tmpStore + chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) + MOVO tmpStore, C3 + MOVO C1, tmpStore + chachaQR(A3, B3, C3, D3, C1) + MOVO tmpStore, C1 + shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left + shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left + shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left + polyMulStage1 + polyMulStage2 + MOVO C3, tmpStore + chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) + MOVO tmpStore, C3 + MOVO C1, tmpStore + chachaQR(A3, B3, C3, D3, C1) + MOVO tmpStore, C1 + polyMulStage3 + polyMulReduceStage + shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right + shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right + shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right + ADDQ $2*8, itr2 + CMPQ itr2, $160 + JB openSSETail256Loop + MOVQ inl, itr1 + ANDQ $-16, itr1 + +openSSETail256HashLoop: + polyAdd(0(inp)(itr2*1)) + polyMul + ADDQ $2*8, itr2 + CMPQ itr2, itr1 + JB openSSETail256HashLoop + + // Add in the state + PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3 + PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3 + PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3 + PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3 + MOVO D3, tmpStore + + // Load - xor - store + MOVOU (0*16)(inp), D3; PXOR D3, A0 + MOVOU (1*16)(inp), D3; PXOR D3, B0 + MOVOU (2*16)(inp), D3; PXOR D3, C0 + MOVOU (3*16)(inp), D3; PXOR D3, D0 + MOVOU A0, (0*16)(oup) + MOVOU B0, (1*16)(oup) + MOVOU C0, (2*16)(oup) + MOVOU D0, (3*16)(oup) + MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0 + PXOR A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1 + MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup) + MOVOU (8*16)(inp), A0; MOVOU (9*16)(inp), B0; MOVOU (10*16)(inp), C0; MOVOU (11*16)(inp), D0 + PXOR A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2 + MOVOU A2, (8*16)(oup); MOVOU B2, (9*16)(oup); MOVOU C2, (10*16)(oup); MOVOU D2, (11*16)(oup) + LEAQ 192(inp), inp + LEAQ 192(oup), oup + SUBQ $192, inl + MOVO A3, A0 + MOVO B3, B0 + MOVO C3, C0 + MOVO tmpStore, D0 + + JMP openSSETail64DecLoop + +// ---------------------------------------------------------------------------- +// ------------------------- AVX2 Code ---------------------------------------- +chacha20Poly1305Open_AVX2: + VZEROUPPER + VMOVDQU ·chacha20Constants<>(SB), AA0 + BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14 + BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12 + BYTE $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4 + VPADDD ·avx2InitMask<>(SB), DD0, DD0 + + // Special optimization, for very short buffers + CMPQ inl, $192 + JBE openAVX2192 + CMPQ inl, $320 + JBE openAVX2320 + + // For the general key prepare the key first - as a byproduct we have 64 bytes of cipher stream + VMOVDQA BB0, state1StoreAVX2 + VMOVDQA CC0, state2StoreAVX2 + VMOVDQA DD0, ctr3StoreAVX2 + MOVQ $10, itr2 + +openAVX2PreparePolyKey: + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) + VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0 + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) + VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0 + DECQ itr2 + JNE openAVX2PreparePolyKey + + VPADDD ·chacha20Constants<>(SB), AA0, AA0 + VPADDD state1StoreAVX2, BB0, BB0 + VPADDD state2StoreAVX2, CC0, CC0 + VPADDD ctr3StoreAVX2, DD0, DD0 + + VPERM2I128 $0x02, AA0, BB0, TT0 + + // Clamp and store poly key + VPAND ·polyClampMask<>(SB), TT0, TT0 + VMOVDQA TT0, rsStoreAVX2 + + // Stream for the first 64 bytes + VPERM2I128 $0x13, AA0, BB0, AA0 + VPERM2I128 $0x13, CC0, DD0, BB0 + + // Hash AD + first 64 bytes + MOVQ ad_len+80(FP), itr2 + CALL polyHashADInternal<>(SB) + XORQ itr1, itr1 + +openAVX2InitialHash64: + polyAdd(0(inp)(itr1*1)) + polyMulAVX2 + ADDQ $16, itr1 + CMPQ itr1, $64 + JNE openAVX2InitialHash64 + + // Decrypt the first 64 bytes + VPXOR (0*32)(inp), AA0, AA0 + VPXOR (1*32)(inp), BB0, BB0 + VMOVDQU AA0, (0*32)(oup) + VMOVDQU BB0, (1*32)(oup) + LEAQ (2*32)(inp), inp + LEAQ (2*32)(oup), oup + SUBQ $64, inl + +openAVX2MainLoop: + CMPQ inl, $512 + JB openAVX2MainLoopDone + + // Load state, increment counter blocks, store the incremented counters + VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 + VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3 + VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3 + VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3 + VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2 + XORQ itr1, itr1 + +openAVX2InternalLoop: + // Lets just say this spaghetti loop interleaves 2 quarter rounds with 3 poly multiplications + // Effectively per 512 bytes of stream we hash 480 bytes of ciphertext + polyAdd(0*8(inp)(itr1*1)) + VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 + polyMulStage1_AVX2 + VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 + VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 + polyMulStage2_AVX2 + VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 + VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 + polyMulStage3_AVX2 + VMOVDQA CC3, tmpStoreAVX2 + VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 + VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 + VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 + VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 + VMOVDQA tmpStoreAVX2, CC3 + polyMulReduceStage + VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 + VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 + VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 + polyAdd(2*8(inp)(itr1*1)) + VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 + polyMulStage1_AVX2 + VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 + VMOVDQA CC3, tmpStoreAVX2 + VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 + VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 + VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 + VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 + VMOVDQA tmpStoreAVX2, CC3 + polyMulStage2_AVX2 + VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 + VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3 + VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 + polyMulStage3_AVX2 + VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 + VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 + polyMulReduceStage + VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 + VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 + polyAdd(4*8(inp)(itr1*1)) + LEAQ (6*8)(itr1), itr1 + VMOVDQA CC3, tmpStoreAVX2 + VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 + VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 + VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 + VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 + VMOVDQA tmpStoreAVX2, CC3 + polyMulStage1_AVX2 + VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 + VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 + polyMulStage2_AVX2 + VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 + VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 + polyMulStage3_AVX2 + VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 + VMOVDQA CC3, tmpStoreAVX2 + VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 + VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 + VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 + VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 + VMOVDQA tmpStoreAVX2, CC3 + polyMulReduceStage + VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 + VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3 + CMPQ itr1, $480 + JNE openAVX2InternalLoop + + VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3 + VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3 + VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3 + VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3 + VMOVDQA CC3, tmpStoreAVX2 + + // We only hashed 480 of the 512 bytes available - hash the remaining 32 here + polyAdd(480(inp)) + polyMulAVX2 + VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0 + VPXOR (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0 + VMOVDQU CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup) + VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 + VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0 + VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup) + + // and here + polyAdd(496(inp)) + polyMulAVX2 + VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0 + VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0 + VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup) + VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0 + VPXOR (12*32)(inp), AA0, AA0; VPXOR (13*32)(inp), BB0, BB0; VPXOR (14*32)(inp), CC0, CC0; VPXOR (15*32)(inp), DD0, DD0 + VMOVDQU AA0, (12*32)(oup); VMOVDQU BB0, (13*32)(oup); VMOVDQU CC0, (14*32)(oup); VMOVDQU DD0, (15*32)(oup) + LEAQ (32*16)(inp), inp + LEAQ (32*16)(oup), oup + SUBQ $(32*16), inl + JMP openAVX2MainLoop + +openAVX2MainLoopDone: + // Handle the various tail sizes efficiently + TESTQ inl, inl + JE openSSEFinalize + CMPQ inl, $128 + JBE openAVX2Tail128 + CMPQ inl, $256 + JBE openAVX2Tail256 + CMPQ inl, $384 + JBE openAVX2Tail384 + JMP openAVX2Tail512 + +// ---------------------------------------------------------------------------- +// Special optimization for buffers smaller than 193 bytes +openAVX2192: + // For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks + VMOVDQA AA0, AA1 + VMOVDQA BB0, BB1 + VMOVDQA CC0, CC1 + VPADDD ·avx2IncMask<>(SB), DD0, DD1 + VMOVDQA AA0, AA2 + VMOVDQA BB0, BB2 + VMOVDQA CC0, CC2 + VMOVDQA DD0, DD2 + VMOVDQA DD1, TT3 + MOVQ $10, itr2 + +openAVX2192InnerCipherLoop: + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 + VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1 + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 + VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1 + DECQ itr2 + JNE openAVX2192InnerCipherLoop + VPADDD AA2, AA0, AA0; VPADDD AA2, AA1, AA1 + VPADDD BB2, BB0, BB0; VPADDD BB2, BB1, BB1 + VPADDD CC2, CC0, CC0; VPADDD CC2, CC1, CC1 + VPADDD DD2, DD0, DD0; VPADDD TT3, DD1, DD1 + VPERM2I128 $0x02, AA0, BB0, TT0 + + // Clamp and store poly key + VPAND ·polyClampMask<>(SB), TT0, TT0 + VMOVDQA TT0, rsStoreAVX2 + + // Stream for up to 192 bytes + VPERM2I128 $0x13, AA0, BB0, AA0 + VPERM2I128 $0x13, CC0, DD0, BB0 + VPERM2I128 $0x02, AA1, BB1, CC0 + VPERM2I128 $0x02, CC1, DD1, DD0 + VPERM2I128 $0x13, AA1, BB1, AA1 + VPERM2I128 $0x13, CC1, DD1, BB1 + +openAVX2ShortOpen: + // Hash + MOVQ ad_len+80(FP), itr2 + CALL polyHashADInternal<>(SB) + +openAVX2ShortOpenLoop: + CMPQ inl, $32 + JB openAVX2ShortTail32 + SUBQ $32, inl + + // Load for hashing + polyAdd(0*8(inp)) + polyMulAVX2 + polyAdd(2*8(inp)) + polyMulAVX2 + + // Load for decryption + VPXOR (inp), AA0, AA0 + VMOVDQU AA0, (oup) + LEAQ (1*32)(inp), inp + LEAQ (1*32)(oup), oup + + // Shift stream left + VMOVDQA BB0, AA0 + VMOVDQA CC0, BB0 + VMOVDQA DD0, CC0 + VMOVDQA AA1, DD0 + VMOVDQA BB1, AA1 + VMOVDQA CC1, BB1 + VMOVDQA DD1, CC1 + VMOVDQA AA2, DD1 + VMOVDQA BB2, AA2 + JMP openAVX2ShortOpenLoop + +openAVX2ShortTail32: + CMPQ inl, $16 + VMOVDQA A0, A1 + JB openAVX2ShortDone + + SUBQ $16, inl + + // Load for hashing + polyAdd(0*8(inp)) + polyMulAVX2 + + // Load for decryption + VPXOR (inp), A0, T0 + VMOVDQU T0, (oup) + LEAQ (1*16)(inp), inp + LEAQ (1*16)(oup), oup + VPERM2I128 $0x11, AA0, AA0, AA0 + VMOVDQA A0, A1 + +openAVX2ShortDone: + VZEROUPPER + JMP openSSETail16 + +// ---------------------------------------------------------------------------- +// Special optimization for buffers smaller than 321 bytes +openAVX2320: + // For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks + VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD ·avx2IncMask<>(SB), DD0, DD1 + VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD ·avx2IncMask<>(SB), DD1, DD2 + VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3 + MOVQ $10, itr2 + +openAVX2320InnerCipherLoop: + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) + VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 + VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2 + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) + VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 + VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2 + DECQ itr2 + JNE openAVX2320InnerCipherLoop + + VMOVDQA ·chacha20Constants<>(SB), TT0 + VPADDD TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2 + VPADDD TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2 + VPADDD TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2 + VMOVDQA ·avx2IncMask<>(SB), TT0 + VPADDD TT3, DD0, DD0; VPADDD TT0, TT3, TT3 + VPADDD TT3, DD1, DD1; VPADDD TT0, TT3, TT3 + VPADDD TT3, DD2, DD2 + + // Clamp and store poly key + VPERM2I128 $0x02, AA0, BB0, TT0 + VPAND ·polyClampMask<>(SB), TT0, TT0 + VMOVDQA TT0, rsStoreAVX2 + + // Stream for up to 320 bytes + VPERM2I128 $0x13, AA0, BB0, AA0 + VPERM2I128 $0x13, CC0, DD0, BB0 + VPERM2I128 $0x02, AA1, BB1, CC0 + VPERM2I128 $0x02, CC1, DD1, DD0 + VPERM2I128 $0x13, AA1, BB1, AA1 + VPERM2I128 $0x13, CC1, DD1, BB1 + VPERM2I128 $0x02, AA2, BB2, CC1 + VPERM2I128 $0x02, CC2, DD2, DD1 + VPERM2I128 $0x13, AA2, BB2, AA2 + VPERM2I128 $0x13, CC2, DD2, BB2 + JMP openAVX2ShortOpen + +// ---------------------------------------------------------------------------- +// Special optimization for the last 128 bytes of ciphertext +openAVX2Tail128: + // Need to decrypt up to 128 bytes - prepare two blocks + VMOVDQA ·chacha20Constants<>(SB), AA1 + VMOVDQA state1StoreAVX2, BB1 + VMOVDQA state2StoreAVX2, CC1 + VMOVDQA ctr3StoreAVX2, DD1 + VPADDD ·avx2IncMask<>(SB), DD1, DD1 + VMOVDQA DD1, DD0 + + XORQ itr2, itr2 + MOVQ inl, itr1 + ANDQ $-16, itr1 + TESTQ itr1, itr1 + JE openAVX2Tail128LoopB + +openAVX2Tail128LoopA: + // Perform ChaCha rounds, while hashing the remaining input + polyAdd(0(inp)(itr2*1)) + polyMulAVX2 + +openAVX2Tail128LoopB: + ADDQ $16, itr2 + chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + VPALIGNR $4, BB1, BB1, BB1 + VPALIGNR $8, CC1, CC1, CC1 + VPALIGNR $12, DD1, DD1, DD1 + chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + VPALIGNR $12, BB1, BB1, BB1 + VPALIGNR $8, CC1, CC1, CC1 + VPALIGNR $4, DD1, DD1, DD1 + CMPQ itr2, itr1 + JB openAVX2Tail128LoopA + CMPQ itr2, $160 + JNE openAVX2Tail128LoopB + + VPADDD ·chacha20Constants<>(SB), AA1, AA1 + VPADDD state1StoreAVX2, BB1, BB1 + VPADDD state2StoreAVX2, CC1, CC1 + VPADDD DD0, DD1, DD1 + VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 + +openAVX2TailLoop: + CMPQ inl, $32 + JB openAVX2Tail + SUBQ $32, inl + + // Load for decryption + VPXOR (inp), AA0, AA0 + VMOVDQU AA0, (oup) + LEAQ (1*32)(inp), inp + LEAQ (1*32)(oup), oup + VMOVDQA BB0, AA0 + VMOVDQA CC0, BB0 + VMOVDQA DD0, CC0 + JMP openAVX2TailLoop + +openAVX2Tail: + CMPQ inl, $16 + VMOVDQA A0, A1 + JB openAVX2TailDone + SUBQ $16, inl + + // Load for decryption + VPXOR (inp), A0, T0 + VMOVDQU T0, (oup) + LEAQ (1*16)(inp), inp + LEAQ (1*16)(oup), oup + VPERM2I128 $0x11, AA0, AA0, AA0 + VMOVDQA A0, A1 + +openAVX2TailDone: + VZEROUPPER + JMP openSSETail16 + +// ---------------------------------------------------------------------------- +// Special optimization for the last 256 bytes of ciphertext +openAVX2Tail256: + // Need to decrypt up to 256 bytes - prepare four blocks + VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1 + VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1 + VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1 + VMOVDQA ctr3StoreAVX2, DD0 + VPADDD ·avx2IncMask<>(SB), DD0, DD0 + VPADDD ·avx2IncMask<>(SB), DD0, DD1 + VMOVDQA DD0, TT1 + VMOVDQA DD1, TT2 + + // Compute the number of iterations that will hash data + MOVQ inl, tmpStoreAVX2 + MOVQ inl, itr1 + SUBQ $128, itr1 + SHRQ $4, itr1 + MOVQ $10, itr2 + CMPQ itr1, $10 + CMOVQGT itr2, itr1 + MOVQ inp, inl + XORQ itr2, itr2 + +openAVX2Tail256LoopA: + polyAdd(0(inl)) + polyMulAVX2 + LEAQ 16(inl), inl + + // Perform ChaCha rounds, while hashing the remaining input +openAVX2Tail256LoopB: + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 + VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1 + INCQ itr2 + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 + VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1 + CMPQ itr2, itr1 + JB openAVX2Tail256LoopA + + CMPQ itr2, $10 + JNE openAVX2Tail256LoopB + + MOVQ inl, itr2 + SUBQ inp, inl + MOVQ inl, itr1 + MOVQ tmpStoreAVX2, inl + + // Hash the remainder of data (if any) +openAVX2Tail256Hash: + ADDQ $16, itr1 + CMPQ itr1, inl + JGT openAVX2Tail256HashEnd + polyAdd (0(itr2)) + polyMulAVX2 + LEAQ 16(itr2), itr2 + JMP openAVX2Tail256Hash + +// Store 128 bytes safely, then go to store loop +openAVX2Tail256HashEnd: + VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1 + VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1 + VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1 + VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1 + VPERM2I128 $0x02, AA0, BB0, AA2; VPERM2I128 $0x02, CC0, DD0, BB2; VPERM2I128 $0x13, AA0, BB0, CC2; VPERM2I128 $0x13, CC0, DD0, DD2 + VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 + + VPXOR (0*32)(inp), AA2, AA2; VPXOR (1*32)(inp), BB2, BB2; VPXOR (2*32)(inp), CC2, CC2; VPXOR (3*32)(inp), DD2, DD2 + VMOVDQU AA2, (0*32)(oup); VMOVDQU BB2, (1*32)(oup); VMOVDQU CC2, (2*32)(oup); VMOVDQU DD2, (3*32)(oup) + LEAQ (4*32)(inp), inp + LEAQ (4*32)(oup), oup + SUBQ $4*32, inl + + JMP openAVX2TailLoop + +// ---------------------------------------------------------------------------- +// Special optimization for the last 384 bytes of ciphertext +openAVX2Tail384: + // Need to decrypt up to 384 bytes - prepare six blocks + VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2 + VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2 + VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2 + VMOVDQA ctr3StoreAVX2, DD0 + VPADDD ·avx2IncMask<>(SB), DD0, DD0 + VPADDD ·avx2IncMask<>(SB), DD0, DD1 + VPADDD ·avx2IncMask<>(SB), DD1, DD2 + VMOVDQA DD0, ctr0StoreAVX2 + VMOVDQA DD1, ctr1StoreAVX2 + VMOVDQA DD2, ctr2StoreAVX2 + + // Compute the number of iterations that will hash two blocks of data + MOVQ inl, tmpStoreAVX2 + MOVQ inl, itr1 + SUBQ $256, itr1 + SHRQ $4, itr1 + ADDQ $6, itr1 + MOVQ $10, itr2 + CMPQ itr1, $10 + CMOVQGT itr2, itr1 + MOVQ inp, inl + XORQ itr2, itr2 + + // Perform ChaCha rounds, while hashing the remaining input +openAVX2Tail384LoopB: + polyAdd(0(inl)) + polyMulAVX2 + LEAQ 16(inl), inl + +openAVX2Tail384LoopA: + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) + VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 + VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2 + polyAdd(0(inl)) + polyMulAVX2 + LEAQ 16(inl), inl + INCQ itr2 + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) + VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 + VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2 + + CMPQ itr2, itr1 + JB openAVX2Tail384LoopB + + CMPQ itr2, $10 + JNE openAVX2Tail384LoopA + + MOVQ inl, itr2 + SUBQ inp, inl + MOVQ inl, itr1 + MOVQ tmpStoreAVX2, inl + +openAVX2Tail384Hash: + ADDQ $16, itr1 + CMPQ itr1, inl + JGT openAVX2Tail384HashEnd + polyAdd(0(itr2)) + polyMulAVX2 + LEAQ 16(itr2), itr2 + JMP openAVX2Tail384Hash + +// Store 256 bytes safely, then go to store loop +openAVX2Tail384HashEnd: + VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2 + VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2 + VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2 + VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2 + VPERM2I128 $0x02, AA0, BB0, TT0; VPERM2I128 $0x02, CC0, DD0, TT1; VPERM2I128 $0x13, AA0, BB0, TT2; VPERM2I128 $0x13, CC0, DD0, TT3 + VPXOR (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3 + VMOVDQU TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup) + VPERM2I128 $0x02, AA1, BB1, TT0; VPERM2I128 $0x02, CC1, DD1, TT1; VPERM2I128 $0x13, AA1, BB1, TT2; VPERM2I128 $0x13, CC1, DD1, TT3 + VPXOR (4*32)(inp), TT0, TT0; VPXOR (5*32)(inp), TT1, TT1; VPXOR (6*32)(inp), TT2, TT2; VPXOR (7*32)(inp), TT3, TT3 + VMOVDQU TT0, (4*32)(oup); VMOVDQU TT1, (5*32)(oup); VMOVDQU TT2, (6*32)(oup); VMOVDQU TT3, (7*32)(oup) + VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0 + LEAQ (8*32)(inp), inp + LEAQ (8*32)(oup), oup + SUBQ $8*32, inl + JMP openAVX2TailLoop + +// ---------------------------------------------------------------------------- +// Special optimization for the last 512 bytes of ciphertext +openAVX2Tail512: + VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 + VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3 + VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3 + VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3 + VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2 + XORQ itr1, itr1 + MOVQ inp, itr2 + +openAVX2Tail512LoopB: + polyAdd(0(itr2)) + polyMulAVX2 + LEAQ (2*8)(itr2), itr2 + +openAVX2Tail512LoopA: + VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 + VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 + VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 + VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 + VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 + VMOVDQA CC3, tmpStoreAVX2 + VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 + VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 + VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 + VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 + VMOVDQA tmpStoreAVX2, CC3 + polyAdd(0*8(itr2)) + polyMulAVX2 + VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 + VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 + VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 + VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 + VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 + VMOVDQA CC3, tmpStoreAVX2 + VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 + VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 + VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 + VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 + VMOVDQA tmpStoreAVX2, CC3 + VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 + VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3 + VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 + VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 + VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 + VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 + VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 + polyAdd(2*8(itr2)) + polyMulAVX2 + LEAQ (4*8)(itr2), itr2 + VMOVDQA CC3, tmpStoreAVX2 + VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 + VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 + VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 + VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 + VMOVDQA tmpStoreAVX2, CC3 + VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 + VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 + VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 + VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 + VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 + VMOVDQA CC3, tmpStoreAVX2 + VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 + VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 + VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 + VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 + VMOVDQA tmpStoreAVX2, CC3 + VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 + VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3 + INCQ itr1 + CMPQ itr1, $4 + JLT openAVX2Tail512LoopB + + CMPQ itr1, $10 + JNE openAVX2Tail512LoopA + + MOVQ inl, itr1 + SUBQ $384, itr1 + ANDQ $-16, itr1 + +openAVX2Tail512HashLoop: + TESTQ itr1, itr1 + JE openAVX2Tail512HashEnd + polyAdd(0(itr2)) + polyMulAVX2 + LEAQ 16(itr2), itr2 + SUBQ $16, itr1 + JMP openAVX2Tail512HashLoop + +openAVX2Tail512HashEnd: + VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3 + VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3 + VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3 + VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3 + VMOVDQA CC3, tmpStoreAVX2 + VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0 + VPXOR (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0 + VMOVDQU CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup) + VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 + VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0 + VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup) + VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0 + VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0 + VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup) + VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0 + + LEAQ (12*32)(inp), inp + LEAQ (12*32)(oup), oup + SUBQ $12*32, inl + + JMP openAVX2TailLoop + +// ---------------------------------------------------------------------------- +// ---------------------------------------------------------------------------- +// func chacha20Poly1305Seal(dst, key, src, ad []byte) +TEXT ·chacha20Poly1305Seal(SB), 0, $288-96 + // For aligned stack access + MOVQ SP, BP + ADDQ $32, BP + ANDQ $-32, BP + MOVQ dst+0(FP), oup + MOVQ key+24(FP), keyp + MOVQ src+48(FP), inp + MOVQ src_len+56(FP), inl + MOVQ ad+72(FP), adp + + CMPB ·useAVX2(SB), $1 + JE chacha20Poly1305Seal_AVX2 + + // Special optimization, for very short buffers + CMPQ inl, $128 + JBE sealSSE128 // About 15% faster + + // In the seal case - prepare the poly key + 3 blocks of stream in the first iteration + MOVOU ·chacha20Constants<>(SB), A0 + MOVOU (1*16)(keyp), B0 + MOVOU (2*16)(keyp), C0 + MOVOU (3*16)(keyp), D0 + + // Store state on stack for future use + MOVO B0, state1Store + MOVO C0, state2Store + + // Load state, increment counter blocks + MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 + MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2 + MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3 + + // Store counters + MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store + MOVQ $10, itr2 + +sealSSEIntroLoop: + MOVO C3, tmpStore + chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) + MOVO tmpStore, C3 + MOVO C1, tmpStore + chachaQR(A3, B3, C3, D3, C1) + MOVO tmpStore, C1 + shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left + shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left + shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left + + MOVO C3, tmpStore + chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) + MOVO tmpStore, C3 + MOVO C1, tmpStore + chachaQR(A3, B3, C3, D3, C1) + MOVO tmpStore, C1 + shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right + shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right + shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right + DECQ itr2 + JNE sealSSEIntroLoop + + // Add in the state + PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3 + PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3 + PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3 + PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3 + + // Clamp and store the key + PAND ·polyClampMask<>(SB), A0 + MOVO A0, rStore + MOVO B0, sStore + + // Hash AAD + MOVQ ad_len+80(FP), itr2 + CALL polyHashADInternal<>(SB) + + MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0 + PXOR A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1 + MOVOU A1, (0*16)(oup); MOVOU B1, (1*16)(oup); MOVOU C1, (2*16)(oup); MOVOU D1, (3*16)(oup) + MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0 + PXOR A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2 + MOVOU A2, (4*16)(oup); MOVOU B2, (5*16)(oup); MOVOU C2, (6*16)(oup); MOVOU D2, (7*16)(oup) + + MOVQ $128, itr1 + SUBQ $128, inl + LEAQ 128(inp), inp + + MOVO A3, A1; MOVO B3, B1; MOVO C3, C1; MOVO D3, D1 + + CMPQ inl, $64 + JBE sealSSE128SealHash + + MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0 + PXOR A0, A3; PXOR B0, B3; PXOR C0, C3; PXOR D0, D3 + MOVOU A3, (8*16)(oup); MOVOU B3, (9*16)(oup); MOVOU C3, (10*16)(oup); MOVOU D3, (11*16)(oup) + + ADDQ $64, itr1 + SUBQ $64, inl + LEAQ 64(inp), inp + + MOVQ $2, itr1 + MOVQ $8, itr2 + + CMPQ inl, $64 + JBE sealSSETail64 + CMPQ inl, $128 + JBE sealSSETail128 + CMPQ inl, $192 + JBE sealSSETail192 + +sealSSEMainLoop: + // Load state, increment counter blocks + MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0 + MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 + MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2 + MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3 + + // Store counters + MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store + +sealSSEInnerLoop: + MOVO C3, tmpStore + chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) + MOVO tmpStore, C3 + MOVO C1, tmpStore + chachaQR(A3, B3, C3, D3, C1) + MOVO tmpStore, C1 + polyAdd(0(oup)) + shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left + shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left + shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left + polyMulStage1 + polyMulStage2 + LEAQ (2*8)(oup), oup + MOVO C3, tmpStore + chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) + MOVO tmpStore, C3 + MOVO C1, tmpStore + polyMulStage3 + chachaQR(A3, B3, C3, D3, C1) + MOVO tmpStore, C1 + polyMulReduceStage + shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right + shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right + shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right + DECQ itr2 + JGE sealSSEInnerLoop + polyAdd(0(oup)) + polyMul + LEAQ (2*8)(oup), oup + DECQ itr1 + JG sealSSEInnerLoop + + // Add in the state + PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3 + PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3 + PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3 + PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3 + MOVO D3, tmpStore + + // Load - xor - store + MOVOU (0*16)(inp), D3; PXOR D3, A0 + MOVOU (1*16)(inp), D3; PXOR D3, B0 + MOVOU (2*16)(inp), D3; PXOR D3, C0 + MOVOU (3*16)(inp), D3; PXOR D3, D0 + MOVOU A0, (0*16)(oup) + MOVOU B0, (1*16)(oup) + MOVOU C0, (2*16)(oup) + MOVOU D0, (3*16)(oup) + MOVO tmpStore, D3 + + MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0 + PXOR A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1 + MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup) + MOVOU (8*16)(inp), A0; MOVOU (9*16)(inp), B0; MOVOU (10*16)(inp), C0; MOVOU (11*16)(inp), D0 + PXOR A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2 + MOVOU A2, (8*16)(oup); MOVOU B2, (9*16)(oup); MOVOU C2, (10*16)(oup); MOVOU D2, (11*16)(oup) + ADDQ $192, inp + MOVQ $192, itr1 + SUBQ $192, inl + MOVO A3, A1 + MOVO B3, B1 + MOVO C3, C1 + MOVO D3, D1 + CMPQ inl, $64 + JBE sealSSE128SealHash + MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0 + PXOR A0, A3; PXOR B0, B3; PXOR C0, C3; PXOR D0, D3 + MOVOU A3, (12*16)(oup); MOVOU B3, (13*16)(oup); MOVOU C3, (14*16)(oup); MOVOU D3, (15*16)(oup) + LEAQ 64(inp), inp + SUBQ $64, inl + MOVQ $6, itr1 + MOVQ $4, itr2 + CMPQ inl, $192 + JG sealSSEMainLoop + + MOVQ inl, itr1 + TESTQ inl, inl + JE sealSSE128SealHash + MOVQ $6, itr1 + CMPQ inl, $64 + JBE sealSSETail64 + CMPQ inl, $128 + JBE sealSSETail128 + JMP sealSSETail192 + +// ---------------------------------------------------------------------------- +// Special optimization for the last 64 bytes of plaintext +sealSSETail64: + // Need to encrypt up to 64 bytes - prepare single block, hash 192 or 256 bytes + MOVO ·chacha20Constants<>(SB), A1 + MOVO state1Store, B1 + MOVO state2Store, C1 + MOVO ctr3Store, D1 + PADDL ·sseIncMask<>(SB), D1 + MOVO D1, ctr0Store + +sealSSETail64LoopA: + // Perform ChaCha rounds, while hashing the previously encrypted ciphertext + polyAdd(0(oup)) + polyMul + LEAQ 16(oup), oup + +sealSSETail64LoopB: + chachaQR(A1, B1, C1, D1, T1) + shiftB1Left; shiftC1Left; shiftD1Left + chachaQR(A1, B1, C1, D1, T1) + shiftB1Right; shiftC1Right; shiftD1Right + polyAdd(0(oup)) + polyMul + LEAQ 16(oup), oup + + DECQ itr1 + JG sealSSETail64LoopA + + DECQ itr2 + JGE sealSSETail64LoopB + PADDL ·chacha20Constants<>(SB), A1 + PADDL state1Store, B1 + PADDL state2Store, C1 + PADDL ctr0Store, D1 + + JMP sealSSE128Seal + +// ---------------------------------------------------------------------------- +// Special optimization for the last 128 bytes of plaintext +sealSSETail128: + // Need to encrypt up to 128 bytes - prepare two blocks, hash 192 or 256 bytes + MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store + MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store + +sealSSETail128LoopA: + // Perform ChaCha rounds, while hashing the previously encrypted ciphertext + polyAdd(0(oup)) + polyMul + LEAQ 16(oup), oup + +sealSSETail128LoopB: + chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0) + shiftB0Left; shiftC0Left; shiftD0Left + shiftB1Left; shiftC1Left; shiftD1Left + polyAdd(0(oup)) + polyMul + LEAQ 16(oup), oup + chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0) + shiftB0Right; shiftC0Right; shiftD0Right + shiftB1Right; shiftC1Right; shiftD1Right + + DECQ itr1 + JG sealSSETail128LoopA + + DECQ itr2 + JGE sealSSETail128LoopB + + PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1 + PADDL state1Store, B0; PADDL state1Store, B1 + PADDL state2Store, C0; PADDL state2Store, C1 + PADDL ctr0Store, D0; PADDL ctr1Store, D1 + + MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3 + PXOR T0, A0; PXOR T1, B0; PXOR T2, C0; PXOR T3, D0 + MOVOU A0, (0*16)(oup); MOVOU B0, (1*16)(oup); MOVOU C0, (2*16)(oup); MOVOU D0, (3*16)(oup) + + MOVQ $64, itr1 + LEAQ 64(inp), inp + SUBQ $64, inl + + JMP sealSSE128SealHash + +// ---------------------------------------------------------------------------- +// Special optimization for the last 192 bytes of plaintext +sealSSETail192: + // Need to encrypt up to 192 bytes - prepare three blocks, hash 192 or 256 bytes + MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store + MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store + MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2; MOVO D2, ctr2Store + +sealSSETail192LoopA: + // Perform ChaCha rounds, while hashing the previously encrypted ciphertext + polyAdd(0(oup)) + polyMul + LEAQ 16(oup), oup + +sealSSETail192LoopB: + chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) + shiftB0Left; shiftC0Left; shiftD0Left + shiftB1Left; shiftC1Left; shiftD1Left + shiftB2Left; shiftC2Left; shiftD2Left + + polyAdd(0(oup)) + polyMul + LEAQ 16(oup), oup + + chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) + shiftB0Right; shiftC0Right; shiftD0Right + shiftB1Right; shiftC1Right; shiftD1Right + shiftB2Right; shiftC2Right; shiftD2Right + + DECQ itr1 + JG sealSSETail192LoopA + + DECQ itr2 + JGE sealSSETail192LoopB + + PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2 + PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2 + PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2 + PADDL ctr0Store, D0; PADDL ctr1Store, D1; PADDL ctr2Store, D2 + + MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3 + PXOR T0, A0; PXOR T1, B0; PXOR T2, C0; PXOR T3, D0 + MOVOU A0, (0*16)(oup); MOVOU B0, (1*16)(oup); MOVOU C0, (2*16)(oup); MOVOU D0, (3*16)(oup) + MOVOU (4*16)(inp), T0; MOVOU (5*16)(inp), T1; MOVOU (6*16)(inp), T2; MOVOU (7*16)(inp), T3 + PXOR T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1 + MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup) + + MOVO A2, A1 + MOVO B2, B1 + MOVO C2, C1 + MOVO D2, D1 + MOVQ $128, itr1 + LEAQ 128(inp), inp + SUBQ $128, inl + + JMP sealSSE128SealHash + +// ---------------------------------------------------------------------------- +// Special seal optimization for buffers smaller than 129 bytes +sealSSE128: + // For up to 128 bytes of ciphertext and 64 bytes for the poly key, we require to process three blocks + MOVOU ·chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0 + MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 + MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2 + MOVO B0, T1; MOVO C0, T2; MOVO D1, T3 + MOVQ $10, itr2 + +sealSSE128InnerCipherLoop: + chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) + shiftB0Left; shiftB1Left; shiftB2Left + shiftC0Left; shiftC1Left; shiftC2Left + shiftD0Left; shiftD1Left; shiftD2Left + chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) + shiftB0Right; shiftB1Right; shiftB2Right + shiftC0Right; shiftC1Right; shiftC2Right + shiftD0Right; shiftD1Right; shiftD2Right + DECQ itr2 + JNE sealSSE128InnerCipherLoop + + // A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded + PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2 + PADDL T1, B0; PADDL T1, B1; PADDL T1, B2 + PADDL T2, C1; PADDL T2, C2 + PADDL T3, D1; PADDL ·sseIncMask<>(SB), T3; PADDL T3, D2 + PAND ·polyClampMask<>(SB), A0 + MOVOU A0, rStore + MOVOU B0, sStore + + // Hash + MOVQ ad_len+80(FP), itr2 + CALL polyHashADInternal<>(SB) + XORQ itr1, itr1 + +sealSSE128SealHash: + // itr1 holds the number of bytes encrypted but not yet hashed + CMPQ itr1, $16 + JB sealSSE128Seal + polyAdd(0(oup)) + polyMul + + SUBQ $16, itr1 + ADDQ $16, oup + + JMP sealSSE128SealHash + +sealSSE128Seal: + CMPQ inl, $16 + JB sealSSETail + SUBQ $16, inl + + // Load for decryption + MOVOU (inp), T0 + PXOR T0, A1 + MOVOU A1, (oup) + LEAQ (1*16)(inp), inp + LEAQ (1*16)(oup), oup + + // Extract for hashing + MOVQ A1, t0 + PSRLDQ $8, A1 + MOVQ A1, t1 + ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2 + polyMul + + // Shift the stream "left" + MOVO B1, A1 + MOVO C1, B1 + MOVO D1, C1 + MOVO A2, D1 + MOVO B2, A2 + MOVO C2, B2 + MOVO D2, C2 + JMP sealSSE128Seal + +sealSSETail: + TESTQ inl, inl + JE sealSSEFinalize + + // We can only load the PT one byte at a time to avoid read after end of buffer + MOVQ inl, itr2 + SHLQ $4, itr2 + LEAQ ·andMask<>(SB), t0 + MOVQ inl, itr1 + LEAQ -1(inp)(inl*1), inp + XORQ t2, t2 + XORQ t3, t3 + XORQ AX, AX + +sealSSETailLoadLoop: + SHLQ $8, t2, t3 + SHLQ $8, t2 + MOVB (inp), AX + XORQ AX, t2 + LEAQ -1(inp), inp + DECQ itr1 + JNE sealSSETailLoadLoop + MOVQ t2, 0+tmpStore + MOVQ t3, 8+tmpStore + PXOR 0+tmpStore, A1 + MOVOU A1, (oup) + MOVOU -16(t0)(itr2*1), T0 + PAND T0, A1 + MOVQ A1, t0 + PSRLDQ $8, A1 + MOVQ A1, t1 + ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2 + polyMul + + ADDQ inl, oup + +sealSSEFinalize: + // Hash in the buffer lengths + ADDQ ad_len+80(FP), acc0 + ADCQ src_len+56(FP), acc1 + ADCQ $1, acc2 + polyMul + + // Final reduce + MOVQ acc0, t0 + MOVQ acc1, t1 + MOVQ acc2, t2 + SUBQ $-5, acc0 + SBBQ $-1, acc1 + SBBQ $3, acc2 + CMOVQCS t0, acc0 + CMOVQCS t1, acc1 + CMOVQCS t2, acc2 + + // Add in the "s" part of the key + ADDQ 0+sStore, acc0 + ADCQ 8+sStore, acc1 + + // Finally store the tag at the end of the message + MOVQ acc0, (0*8)(oup) + MOVQ acc1, (1*8)(oup) + RET + +// ---------------------------------------------------------------------------- +// ------------------------- AVX2 Code ---------------------------------------- +chacha20Poly1305Seal_AVX2: + VZEROUPPER + VMOVDQU ·chacha20Constants<>(SB), AA0 + BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14 + BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12 + BYTE $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4 + VPADDD ·avx2InitMask<>(SB), DD0, DD0 + + // Special optimizations, for very short buffers + CMPQ inl, $192 + JBE seal192AVX2 // 33% faster + CMPQ inl, $320 + JBE seal320AVX2 // 17% faster + + // For the general key prepare the key first - as a byproduct we have 64 bytes of cipher stream + VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 + VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3; VMOVDQA BB0, state1StoreAVX2 + VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3; VMOVDQA CC0, state2StoreAVX2 + VPADDD ·avx2IncMask<>(SB), DD0, DD1; VMOVDQA DD0, ctr0StoreAVX2 + VPADDD ·avx2IncMask<>(SB), DD1, DD2; VMOVDQA DD1, ctr1StoreAVX2 + VPADDD ·avx2IncMask<>(SB), DD2, DD3; VMOVDQA DD2, ctr2StoreAVX2 + VMOVDQA DD3, ctr3StoreAVX2 + MOVQ $10, itr2 + +sealAVX2IntroLoop: + VMOVDQA CC3, tmpStoreAVX2 + chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3) + VMOVDQA tmpStoreAVX2, CC3 + VMOVDQA CC1, tmpStoreAVX2 + chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1) + VMOVDQA tmpStoreAVX2, CC1 + + VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0 + VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $12, DD1, DD1, DD1 + VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $12, DD2, DD2, DD2 + VPALIGNR $4, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $12, DD3, DD3, DD3 + + VMOVDQA CC3, tmpStoreAVX2 + chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3) + VMOVDQA tmpStoreAVX2, CC3 + VMOVDQA CC1, tmpStoreAVX2 + chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1) + VMOVDQA tmpStoreAVX2, CC1 + + VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0 + VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $4, DD1, DD1, DD1 + VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $4, DD2, DD2, DD2 + VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3 + DECQ itr2 + JNE sealAVX2IntroLoop + + VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3 + VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3 + VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3 + VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3 + + VPERM2I128 $0x13, CC0, DD0, CC0 // Stream bytes 96 - 127 + VPERM2I128 $0x02, AA0, BB0, DD0 // The Poly1305 key + VPERM2I128 $0x13, AA0, BB0, AA0 // Stream bytes 64 - 95 + + // Clamp and store poly key + VPAND ·polyClampMask<>(SB), DD0, DD0 + VMOVDQA DD0, rsStoreAVX2 + + // Hash AD + MOVQ ad_len+80(FP), itr2 + CALL polyHashADInternal<>(SB) + + // Can store at least 320 bytes + VPXOR (0*32)(inp), AA0, AA0 + VPXOR (1*32)(inp), CC0, CC0 + VMOVDQU AA0, (0*32)(oup) + VMOVDQU CC0, (1*32)(oup) + + VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 + VPXOR (2*32)(inp), AA0, AA0; VPXOR (3*32)(inp), BB0, BB0; VPXOR (4*32)(inp), CC0, CC0; VPXOR (5*32)(inp), DD0, DD0 + VMOVDQU AA0, (2*32)(oup); VMOVDQU BB0, (3*32)(oup); VMOVDQU CC0, (4*32)(oup); VMOVDQU DD0, (5*32)(oup) + VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0 + VPXOR (6*32)(inp), AA0, AA0; VPXOR (7*32)(inp), BB0, BB0; VPXOR (8*32)(inp), CC0, CC0; VPXOR (9*32)(inp), DD0, DD0 + VMOVDQU AA0, (6*32)(oup); VMOVDQU BB0, (7*32)(oup); VMOVDQU CC0, (8*32)(oup); VMOVDQU DD0, (9*32)(oup) + + MOVQ $320, itr1 + SUBQ $320, inl + LEAQ 320(inp), inp + + VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, CC3, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, CC3, DD3, DD0 + CMPQ inl, $128 + JBE sealAVX2SealHash + + VPXOR (0*32)(inp), AA0, AA0; VPXOR (1*32)(inp), BB0, BB0; VPXOR (2*32)(inp), CC0, CC0; VPXOR (3*32)(inp), DD0, DD0 + VMOVDQU AA0, (10*32)(oup); VMOVDQU BB0, (11*32)(oup); VMOVDQU CC0, (12*32)(oup); VMOVDQU DD0, (13*32)(oup) + SUBQ $128, inl + LEAQ 128(inp), inp + + MOVQ $8, itr1 + MOVQ $2, itr2 + + CMPQ inl, $128 + JBE sealAVX2Tail128 + CMPQ inl, $256 + JBE sealAVX2Tail256 + CMPQ inl, $384 + JBE sealAVX2Tail384 + CMPQ inl, $512 + JBE sealAVX2Tail512 + + // We have 448 bytes to hash, but main loop hashes 512 bytes at a time - perform some rounds, before the main loop + VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 + VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3 + VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3 + VMOVDQA ctr3StoreAVX2, DD0 + VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3 + VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2 + + VMOVDQA CC3, tmpStoreAVX2 + chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3) + VMOVDQA tmpStoreAVX2, CC3 + VMOVDQA CC1, tmpStoreAVX2 + chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1) + VMOVDQA tmpStoreAVX2, CC1 + + VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0 + VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $12, DD1, DD1, DD1 + VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $12, DD2, DD2, DD2 + VPALIGNR $4, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $12, DD3, DD3, DD3 + + VMOVDQA CC3, tmpStoreAVX2 + chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3) + VMOVDQA tmpStoreAVX2, CC3 + VMOVDQA CC1, tmpStoreAVX2 + chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1) + VMOVDQA tmpStoreAVX2, CC1 + + VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0 + VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $4, DD1, DD1, DD1 + VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $4, DD2, DD2, DD2 + VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3 + VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 + VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 + VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 + VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 + VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 + VMOVDQA CC3, tmpStoreAVX2 + VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 + VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 + VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 + VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 + VMOVDQA tmpStoreAVX2, CC3 + + SUBQ $16, oup // Adjust the pointer + MOVQ $9, itr1 + JMP sealAVX2InternalLoopStart + +sealAVX2MainLoop: + // Load state, increment counter blocks, store the incremented counters + VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 + VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3 + VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3 + VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3 + VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2 + MOVQ $10, itr1 + +sealAVX2InternalLoop: + polyAdd(0*8(oup)) + VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 + polyMulStage1_AVX2 + VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 + VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 + polyMulStage2_AVX2 + VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 + VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 + polyMulStage3_AVX2 + VMOVDQA CC3, tmpStoreAVX2 + VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 + VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 + VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 + VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 + VMOVDQA tmpStoreAVX2, CC3 + polyMulReduceStage + +sealAVX2InternalLoopStart: + VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 + VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 + VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 + polyAdd(2*8(oup)) + VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 + polyMulStage1_AVX2 + VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 + VMOVDQA CC3, tmpStoreAVX2 + VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 + VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 + VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 + VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 + VMOVDQA tmpStoreAVX2, CC3 + polyMulStage2_AVX2 + VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 + VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3 + VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 + polyMulStage3_AVX2 + VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 + VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 + polyMulReduceStage + VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 + VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 + polyAdd(4*8(oup)) + LEAQ (6*8)(oup), oup + VMOVDQA CC3, tmpStoreAVX2 + VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 + VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 + VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 + VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 + VMOVDQA tmpStoreAVX2, CC3 + polyMulStage1_AVX2 + VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 + VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 + polyMulStage2_AVX2 + VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 + VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 + polyMulStage3_AVX2 + VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 + VMOVDQA CC3, tmpStoreAVX2 + VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 + VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 + VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 + VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 + VMOVDQA tmpStoreAVX2, CC3 + polyMulReduceStage + VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 + VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3 + DECQ itr1 + JNE sealAVX2InternalLoop + + VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3 + VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3 + VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3 + VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3 + VMOVDQA CC3, tmpStoreAVX2 + + // We only hashed 480 of the 512 bytes available - hash the remaining 32 here + polyAdd(0*8(oup)) + polyMulAVX2 + LEAQ (4*8)(oup), oup + VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0 + VPXOR (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0 + VMOVDQU CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup) + VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 + VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0 + VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup) + + // and here + polyAdd(-2*8(oup)) + polyMulAVX2 + VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0 + VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0 + VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup) + VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0 + VPXOR (12*32)(inp), AA0, AA0; VPXOR (13*32)(inp), BB0, BB0; VPXOR (14*32)(inp), CC0, CC0; VPXOR (15*32)(inp), DD0, DD0 + VMOVDQU AA0, (12*32)(oup); VMOVDQU BB0, (13*32)(oup); VMOVDQU CC0, (14*32)(oup); VMOVDQU DD0, (15*32)(oup) + LEAQ (32*16)(inp), inp + SUBQ $(32*16), inl + CMPQ inl, $512 + JG sealAVX2MainLoop + + // Tail can only hash 480 bytes + polyAdd(0*8(oup)) + polyMulAVX2 + polyAdd(2*8(oup)) + polyMulAVX2 + LEAQ 32(oup), oup + + MOVQ $10, itr1 + MOVQ $0, itr2 + CMPQ inl, $128 + JBE sealAVX2Tail128 + CMPQ inl, $256 + JBE sealAVX2Tail256 + CMPQ inl, $384 + JBE sealAVX2Tail384 + JMP sealAVX2Tail512 + +// ---------------------------------------------------------------------------- +// Special optimization for buffers smaller than 193 bytes +seal192AVX2: + // For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks + VMOVDQA AA0, AA1 + VMOVDQA BB0, BB1 + VMOVDQA CC0, CC1 + VPADDD ·avx2IncMask<>(SB), DD0, DD1 + VMOVDQA AA0, AA2 + VMOVDQA BB0, BB2 + VMOVDQA CC0, CC2 + VMOVDQA DD0, DD2 + VMOVDQA DD1, TT3 + MOVQ $10, itr2 + +sealAVX2192InnerCipherLoop: + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 + VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1 + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 + VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1 + DECQ itr2 + JNE sealAVX2192InnerCipherLoop + VPADDD AA2, AA0, AA0; VPADDD AA2, AA1, AA1 + VPADDD BB2, BB0, BB0; VPADDD BB2, BB1, BB1 + VPADDD CC2, CC0, CC0; VPADDD CC2, CC1, CC1 + VPADDD DD2, DD0, DD0; VPADDD TT3, DD1, DD1 + VPERM2I128 $0x02, AA0, BB0, TT0 + + // Clamp and store poly key + VPAND ·polyClampMask<>(SB), TT0, TT0 + VMOVDQA TT0, rsStoreAVX2 + + // Stream for up to 192 bytes + VPERM2I128 $0x13, AA0, BB0, AA0 + VPERM2I128 $0x13, CC0, DD0, BB0 + VPERM2I128 $0x02, AA1, BB1, CC0 + VPERM2I128 $0x02, CC1, DD1, DD0 + VPERM2I128 $0x13, AA1, BB1, AA1 + VPERM2I128 $0x13, CC1, DD1, BB1 + +sealAVX2ShortSeal: + // Hash aad + MOVQ ad_len+80(FP), itr2 + CALL polyHashADInternal<>(SB) + XORQ itr1, itr1 + +sealAVX2SealHash: + // itr1 holds the number of bytes encrypted but not yet hashed + CMPQ itr1, $16 + JB sealAVX2ShortSealLoop + polyAdd(0(oup)) + polyMul + SUBQ $16, itr1 + ADDQ $16, oup + JMP sealAVX2SealHash + +sealAVX2ShortSealLoop: + CMPQ inl, $32 + JB sealAVX2ShortTail32 + SUBQ $32, inl + + // Load for encryption + VPXOR (inp), AA0, AA0 + VMOVDQU AA0, (oup) + LEAQ (1*32)(inp), inp + + // Now can hash + polyAdd(0*8(oup)) + polyMulAVX2 + polyAdd(2*8(oup)) + polyMulAVX2 + LEAQ (1*32)(oup), oup + + // Shift stream left + VMOVDQA BB0, AA0 + VMOVDQA CC0, BB0 + VMOVDQA DD0, CC0 + VMOVDQA AA1, DD0 + VMOVDQA BB1, AA1 + VMOVDQA CC1, BB1 + VMOVDQA DD1, CC1 + VMOVDQA AA2, DD1 + VMOVDQA BB2, AA2 + JMP sealAVX2ShortSealLoop + +sealAVX2ShortTail32: + CMPQ inl, $16 + VMOVDQA A0, A1 + JB sealAVX2ShortDone + + SUBQ $16, inl + + // Load for encryption + VPXOR (inp), A0, T0 + VMOVDQU T0, (oup) + LEAQ (1*16)(inp), inp + + // Hash + polyAdd(0*8(oup)) + polyMulAVX2 + LEAQ (1*16)(oup), oup + VPERM2I128 $0x11, AA0, AA0, AA0 + VMOVDQA A0, A1 + +sealAVX2ShortDone: + VZEROUPPER + JMP sealSSETail + +// ---------------------------------------------------------------------------- +// Special optimization for buffers smaller than 321 bytes +seal320AVX2: + // For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks + VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD ·avx2IncMask<>(SB), DD0, DD1 + VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD ·avx2IncMask<>(SB), DD1, DD2 + VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3 + MOVQ $10, itr2 + +sealAVX2320InnerCipherLoop: + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) + VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 + VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2 + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) + VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 + VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2 + DECQ itr2 + JNE sealAVX2320InnerCipherLoop + + VMOVDQA ·chacha20Constants<>(SB), TT0 + VPADDD TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2 + VPADDD TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2 + VPADDD TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2 + VMOVDQA ·avx2IncMask<>(SB), TT0 + VPADDD TT3, DD0, DD0; VPADDD TT0, TT3, TT3 + VPADDD TT3, DD1, DD1; VPADDD TT0, TT3, TT3 + VPADDD TT3, DD2, DD2 + + // Clamp and store poly key + VPERM2I128 $0x02, AA0, BB0, TT0 + VPAND ·polyClampMask<>(SB), TT0, TT0 + VMOVDQA TT0, rsStoreAVX2 + + // Stream for up to 320 bytes + VPERM2I128 $0x13, AA0, BB0, AA0 + VPERM2I128 $0x13, CC0, DD0, BB0 + VPERM2I128 $0x02, AA1, BB1, CC0 + VPERM2I128 $0x02, CC1, DD1, DD0 + VPERM2I128 $0x13, AA1, BB1, AA1 + VPERM2I128 $0x13, CC1, DD1, BB1 + VPERM2I128 $0x02, AA2, BB2, CC1 + VPERM2I128 $0x02, CC2, DD2, DD1 + VPERM2I128 $0x13, AA2, BB2, AA2 + VPERM2I128 $0x13, CC2, DD2, BB2 + JMP sealAVX2ShortSeal + +// ---------------------------------------------------------------------------- +// Special optimization for the last 128 bytes of ciphertext +sealAVX2Tail128: + // Need to decrypt up to 128 bytes - prepare two blocks + // If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed + // If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed + VMOVDQA ·chacha20Constants<>(SB), AA0 + VMOVDQA state1StoreAVX2, BB0 + VMOVDQA state2StoreAVX2, CC0 + VMOVDQA ctr3StoreAVX2, DD0 + VPADDD ·avx2IncMask<>(SB), DD0, DD0 + VMOVDQA DD0, DD1 + +sealAVX2Tail128LoopA: + polyAdd(0(oup)) + polyMul + LEAQ 16(oup), oup + +sealAVX2Tail128LoopB: + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) + polyAdd(0(oup)) + polyMul + VPALIGNR $4, BB0, BB0, BB0 + VPALIGNR $8, CC0, CC0, CC0 + VPALIGNR $12, DD0, DD0, DD0 + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) + polyAdd(16(oup)) + polyMul + LEAQ 32(oup), oup + VPALIGNR $12, BB0, BB0, BB0 + VPALIGNR $8, CC0, CC0, CC0 + VPALIGNR $4, DD0, DD0, DD0 + DECQ itr1 + JG sealAVX2Tail128LoopA + DECQ itr2 + JGE sealAVX2Tail128LoopB + + VPADDD ·chacha20Constants<>(SB), AA0, AA1 + VPADDD state1StoreAVX2, BB0, BB1 + VPADDD state2StoreAVX2, CC0, CC1 + VPADDD DD1, DD0, DD1 + + VPERM2I128 $0x02, AA1, BB1, AA0 + VPERM2I128 $0x02, CC1, DD1, BB0 + VPERM2I128 $0x13, AA1, BB1, CC0 + VPERM2I128 $0x13, CC1, DD1, DD0 + JMP sealAVX2ShortSealLoop + +// ---------------------------------------------------------------------------- +// Special optimization for the last 256 bytes of ciphertext +sealAVX2Tail256: + // Need to decrypt up to 256 bytes - prepare two blocks + // If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed + // If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed + VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA ·chacha20Constants<>(SB), AA1 + VMOVDQA state1StoreAVX2, BB0; VMOVDQA state1StoreAVX2, BB1 + VMOVDQA state2StoreAVX2, CC0; VMOVDQA state2StoreAVX2, CC1 + VMOVDQA ctr3StoreAVX2, DD0 + VPADDD ·avx2IncMask<>(SB), DD0, DD0 + VPADDD ·avx2IncMask<>(SB), DD0, DD1 + VMOVDQA DD0, TT1 + VMOVDQA DD1, TT2 + +sealAVX2Tail256LoopA: + polyAdd(0(oup)) + polyMul + LEAQ 16(oup), oup + +sealAVX2Tail256LoopB: + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + polyAdd(0(oup)) + polyMul + VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 + VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1 + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + polyAdd(16(oup)) + polyMul + LEAQ 32(oup), oup + VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 + VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1 + DECQ itr1 + JG sealAVX2Tail256LoopA + DECQ itr2 + JGE sealAVX2Tail256LoopB + + VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1 + VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1 + VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1 + VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1 + VPERM2I128 $0x02, AA0, BB0, TT0 + VPERM2I128 $0x02, CC0, DD0, TT1 + VPERM2I128 $0x13, AA0, BB0, TT2 + VPERM2I128 $0x13, CC0, DD0, TT3 + VPXOR (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3 + VMOVDQU TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup) + MOVQ $128, itr1 + LEAQ 128(inp), inp + SUBQ $128, inl + VPERM2I128 $0x02, AA1, BB1, AA0 + VPERM2I128 $0x02, CC1, DD1, BB0 + VPERM2I128 $0x13, AA1, BB1, CC0 + VPERM2I128 $0x13, CC1, DD1, DD0 + + JMP sealAVX2SealHash + +// ---------------------------------------------------------------------------- +// Special optimization for the last 384 bytes of ciphertext +sealAVX2Tail384: + // Need to decrypt up to 384 bytes - prepare two blocks + // If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed + // If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed + VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2 + VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2 + VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2 + VMOVDQA ctr3StoreAVX2, DD0 + VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2 + VMOVDQA DD0, TT1; VMOVDQA DD1, TT2; VMOVDQA DD2, TT3 + +sealAVX2Tail384LoopA: + polyAdd(0(oup)) + polyMul + LEAQ 16(oup), oup + +sealAVX2Tail384LoopB: + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) + polyAdd(0(oup)) + polyMul + VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 + VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2 + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) + polyAdd(16(oup)) + polyMul + LEAQ 32(oup), oup + VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 + VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2 + DECQ itr1 + JG sealAVX2Tail384LoopA + DECQ itr2 + JGE sealAVX2Tail384LoopB + + VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2 + VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2 + VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2 + VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1; VPADDD TT3, DD2, DD2 + VPERM2I128 $0x02, AA0, BB0, TT0 + VPERM2I128 $0x02, CC0, DD0, TT1 + VPERM2I128 $0x13, AA0, BB0, TT2 + VPERM2I128 $0x13, CC0, DD0, TT3 + VPXOR (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3 + VMOVDQU TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup) + VPERM2I128 $0x02, AA1, BB1, TT0 + VPERM2I128 $0x02, CC1, DD1, TT1 + VPERM2I128 $0x13, AA1, BB1, TT2 + VPERM2I128 $0x13, CC1, DD1, TT3 + VPXOR (4*32)(inp), TT0, TT0; VPXOR (5*32)(inp), TT1, TT1; VPXOR (6*32)(inp), TT2, TT2; VPXOR (7*32)(inp), TT3, TT3 + VMOVDQU TT0, (4*32)(oup); VMOVDQU TT1, (5*32)(oup); VMOVDQU TT2, (6*32)(oup); VMOVDQU TT3, (7*32)(oup) + MOVQ $256, itr1 + LEAQ 256(inp), inp + SUBQ $256, inl + VPERM2I128 $0x02, AA2, BB2, AA0 + VPERM2I128 $0x02, CC2, DD2, BB0 + VPERM2I128 $0x13, AA2, BB2, CC0 + VPERM2I128 $0x13, CC2, DD2, DD0 + + JMP sealAVX2SealHash + +// ---------------------------------------------------------------------------- +// Special optimization for the last 512 bytes of ciphertext +sealAVX2Tail512: + // Need to decrypt up to 512 bytes - prepare two blocks + // If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed + // If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed + VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 + VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3 + VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3 + VMOVDQA ctr3StoreAVX2, DD0 + VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3 + VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2 + +sealAVX2Tail512LoopA: + polyAdd(0(oup)) + polyMul + LEAQ 16(oup), oup + +sealAVX2Tail512LoopB: + VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 + VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 + VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 + VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 + VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 + VMOVDQA CC3, tmpStoreAVX2 + VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 + VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 + VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 + VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 + VMOVDQA tmpStoreAVX2, CC3 + polyAdd(0*8(oup)) + polyMulAVX2 + VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 + VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 + VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 + VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 + VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 + VMOVDQA CC3, tmpStoreAVX2 + VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 + VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 + VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 + VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 + VMOVDQA tmpStoreAVX2, CC3 + VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 + VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3 + VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 + VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 + VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 + VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 + VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 + polyAdd(2*8(oup)) + polyMulAVX2 + LEAQ (4*8)(oup), oup + VMOVDQA CC3, tmpStoreAVX2 + VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 + VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 + VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 + VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 + VMOVDQA tmpStoreAVX2, CC3 + VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 + VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 + VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 + VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 + VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 + VMOVDQA CC3, tmpStoreAVX2 + VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 + VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 + VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 + VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 + VMOVDQA tmpStoreAVX2, CC3 + VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 + VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3 + + DECQ itr1 + JG sealAVX2Tail512LoopA + DECQ itr2 + JGE sealAVX2Tail512LoopB + + VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3 + VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3 + VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3 + VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3 + VMOVDQA CC3, tmpStoreAVX2 + VPERM2I128 $0x02, AA0, BB0, CC3 + VPXOR (0*32)(inp), CC3, CC3 + VMOVDQU CC3, (0*32)(oup) + VPERM2I128 $0x02, CC0, DD0, CC3 + VPXOR (1*32)(inp), CC3, CC3 + VMOVDQU CC3, (1*32)(oup) + VPERM2I128 $0x13, AA0, BB0, CC3 + VPXOR (2*32)(inp), CC3, CC3 + VMOVDQU CC3, (2*32)(oup) + VPERM2I128 $0x13, CC0, DD0, CC3 + VPXOR (3*32)(inp), CC3, CC3 + VMOVDQU CC3, (3*32)(oup) + + VPERM2I128 $0x02, AA1, BB1, AA0 + VPERM2I128 $0x02, CC1, DD1, BB0 + VPERM2I128 $0x13, AA1, BB1, CC0 + VPERM2I128 $0x13, CC1, DD1, DD0 + VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0 + VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup) + + VPERM2I128 $0x02, AA2, BB2, AA0 + VPERM2I128 $0x02, CC2, DD2, BB0 + VPERM2I128 $0x13, AA2, BB2, CC0 + VPERM2I128 $0x13, CC2, DD2, DD0 + VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0 + VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup) + + MOVQ $384, itr1 + LEAQ 384(inp), inp + SUBQ $384, inl + VPERM2I128 $0x02, AA3, BB3, AA0 + VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0 + VPERM2I128 $0x13, AA3, BB3, CC0 + VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0 + + JMP sealAVX2SealHash diff --git a/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_generic.go b/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_generic.go new file mode 100644 index 00000000..fe191d39 --- /dev/null +++ b/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_generic.go @@ -0,0 +1,81 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package chacha20poly1305 + +import ( + "encoding/binary" + + "golang.org/x/crypto/chacha20" + "golang.org/x/crypto/internal/subtle" + "golang.org/x/crypto/poly1305" +) + +func writeWithPadding(p *poly1305.MAC, b []byte) { + p.Write(b) + if rem := len(b) % 16; rem != 0 { + var buf [16]byte + padLen := 16 - rem + p.Write(buf[:padLen]) + } +} + +func writeUint64(p *poly1305.MAC, n int) { + var buf [8]byte + binary.LittleEndian.PutUint64(buf[:], uint64(n)) + p.Write(buf[:]) +} + +func (c *chacha20poly1305) sealGeneric(dst, nonce, plaintext, additionalData []byte) []byte { + ret, out := sliceForAppend(dst, len(plaintext)+poly1305.TagSize) + ciphertext, tag := out[:len(plaintext)], out[len(plaintext):] + if subtle.InexactOverlap(out, plaintext) { + panic("chacha20poly1305: invalid buffer overlap") + } + + var polyKey [32]byte + s, _ := chacha20.NewUnauthenticatedCipher(c.key[:], nonce) + s.XORKeyStream(polyKey[:], polyKey[:]) + s.SetCounter(1) // set the counter to 1, skipping 32 bytes + s.XORKeyStream(ciphertext, plaintext) + + p := poly1305.New(&polyKey) + writeWithPadding(p, additionalData) + writeWithPadding(p, ciphertext) + writeUint64(p, len(additionalData)) + writeUint64(p, len(plaintext)) + p.Sum(tag[:0]) + + return ret +} + +func (c *chacha20poly1305) openGeneric(dst, nonce, ciphertext, additionalData []byte) ([]byte, error) { + tag := ciphertext[len(ciphertext)-16:] + ciphertext = ciphertext[:len(ciphertext)-16] + + var polyKey [32]byte + s, _ := chacha20.NewUnauthenticatedCipher(c.key[:], nonce) + s.XORKeyStream(polyKey[:], polyKey[:]) + s.SetCounter(1) // set the counter to 1, skipping 32 bytes + + p := poly1305.New(&polyKey) + writeWithPadding(p, additionalData) + writeWithPadding(p, ciphertext) + writeUint64(p, len(additionalData)) + writeUint64(p, len(ciphertext)) + + ret, out := sliceForAppend(dst, len(ciphertext)) + if subtle.InexactOverlap(out, ciphertext) { + panic("chacha20poly1305: invalid buffer overlap") + } + if !p.Verify(tag) { + for i := range out { + out[i] = 0 + } + return nil, errOpen + } + + s.XORKeyStream(out, ciphertext) + return ret, nil +} diff --git a/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_noasm.go b/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_noasm.go new file mode 100644 index 00000000..f832b33d --- /dev/null +++ b/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_noasm.go @@ -0,0 +1,16 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !amd64 || !gc || purego +// +build !amd64 !gc purego + +package chacha20poly1305 + +func (c *chacha20poly1305) seal(dst, nonce, plaintext, additionalData []byte) []byte { + return c.sealGeneric(dst, nonce, plaintext, additionalData) +} + +func (c *chacha20poly1305) open(dst, nonce, ciphertext, additionalData []byte) ([]byte, error) { + return c.openGeneric(dst, nonce, ciphertext, additionalData) +} diff --git a/vendor/golang.org/x/crypto/chacha20poly1305/xchacha20poly1305.go b/vendor/golang.org/x/crypto/chacha20poly1305/xchacha20poly1305.go new file mode 100644 index 00000000..d9d46b96 --- /dev/null +++ b/vendor/golang.org/x/crypto/chacha20poly1305/xchacha20poly1305.go @@ -0,0 +1,86 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package chacha20poly1305 + +import ( + "crypto/cipher" + "errors" + + "golang.org/x/crypto/chacha20" +) + +type xchacha20poly1305 struct { + key [KeySize]byte +} + +// NewX returns a XChaCha20-Poly1305 AEAD that uses the given 256-bit key. +// +// XChaCha20-Poly1305 is a ChaCha20-Poly1305 variant that takes a longer nonce, +// suitable to be generated randomly without risk of collisions. It should be +// preferred when nonce uniqueness cannot be trivially ensured, or whenever +// nonces are randomly generated. +func NewX(key []byte) (cipher.AEAD, error) { + if len(key) != KeySize { + return nil, errors.New("chacha20poly1305: bad key length") + } + ret := new(xchacha20poly1305) + copy(ret.key[:], key) + return ret, nil +} + +func (*xchacha20poly1305) NonceSize() int { + return NonceSizeX +} + +func (*xchacha20poly1305) Overhead() int { + return 16 +} + +func (x *xchacha20poly1305) Seal(dst, nonce, plaintext, additionalData []byte) []byte { + if len(nonce) != NonceSizeX { + panic("chacha20poly1305: bad nonce length passed to Seal") + } + + // XChaCha20-Poly1305 technically supports a 64-bit counter, so there is no + // size limit. However, since we reuse the ChaCha20-Poly1305 implementation, + // the second half of the counter is not available. This is unlikely to be + // an issue because the cipher.AEAD API requires the entire message to be in + // memory, and the counter overflows at 256 GB. + if uint64(len(plaintext)) > (1<<38)-64 { + panic("chacha20poly1305: plaintext too large") + } + + c := new(chacha20poly1305) + hKey, _ := chacha20.HChaCha20(x.key[:], nonce[0:16]) + copy(c.key[:], hKey) + + // The first 4 bytes of the final nonce are unused counter space. + cNonce := make([]byte, NonceSize) + copy(cNonce[4:12], nonce[16:24]) + + return c.seal(dst, cNonce[:], plaintext, additionalData) +} + +func (x *xchacha20poly1305) Open(dst, nonce, ciphertext, additionalData []byte) ([]byte, error) { + if len(nonce) != NonceSizeX { + panic("chacha20poly1305: bad nonce length passed to Open") + } + if len(ciphertext) < 16 { + return nil, errOpen + } + if uint64(len(ciphertext)) > (1<<38)-48 { + panic("chacha20poly1305: ciphertext too large") + } + + c := new(chacha20poly1305) + hKey, _ := chacha20.HChaCha20(x.key[:], nonce[0:16]) + copy(c.key[:], hKey) + + // The first 4 bytes of the final nonce are unused counter space. + cNonce := make([]byte, NonceSize) + copy(cNonce[4:12], nonce[16:24]) + + return c.open(dst, cNonce[:], ciphertext, additionalData) +} diff --git a/vendor/golang.org/x/crypto/hkdf/hkdf.go b/vendor/golang.org/x/crypto/hkdf/hkdf.go new file mode 100644 index 00000000..dda3f143 --- /dev/null +++ b/vendor/golang.org/x/crypto/hkdf/hkdf.go @@ -0,0 +1,93 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package hkdf implements the HMAC-based Extract-and-Expand Key Derivation +// Function (HKDF) as defined in RFC 5869. +// +// HKDF is a cryptographic key derivation function (KDF) with the goal of +// expanding limited input keying material into one or more cryptographically +// strong secret keys. +package hkdf // import "golang.org/x/crypto/hkdf" + +import ( + "crypto/hmac" + "errors" + "hash" + "io" +) + +// Extract generates a pseudorandom key for use with Expand from an input secret +// and an optional independent salt. +// +// Only use this function if you need to reuse the extracted key with multiple +// Expand invocations and different context values. Most common scenarios, +// including the generation of multiple keys, should use New instead. +func Extract(hash func() hash.Hash, secret, salt []byte) []byte { + if salt == nil { + salt = make([]byte, hash().Size()) + } + extractor := hmac.New(hash, salt) + extractor.Write(secret) + return extractor.Sum(nil) +} + +type hkdf struct { + expander hash.Hash + size int + + info []byte + counter byte + + prev []byte + buf []byte +} + +func (f *hkdf) Read(p []byte) (int, error) { + // Check whether enough data can be generated + need := len(p) + remains := len(f.buf) + int(255-f.counter+1)*f.size + if remains < need { + return 0, errors.New("hkdf: entropy limit reached") + } + // Read any leftover from the buffer + n := copy(p, f.buf) + p = p[n:] + + // Fill the rest of the buffer + for len(p) > 0 { + f.expander.Reset() + f.expander.Write(f.prev) + f.expander.Write(f.info) + f.expander.Write([]byte{f.counter}) + f.prev = f.expander.Sum(f.prev[:0]) + f.counter++ + + // Copy the new batch into p + f.buf = f.prev + n = copy(p, f.buf) + p = p[n:] + } + // Save leftovers for next run + f.buf = f.buf[n:] + + return need, nil +} + +// Expand returns a Reader, from which keys can be read, using the given +// pseudorandom key and optional context info, skipping the extraction step. +// +// The pseudorandomKey should have been generated by Extract, or be a uniformly +// random or pseudorandom cryptographically strong key. See RFC 5869, Section +// 3.3. Most common scenarios will want to use New instead. +func Expand(hash func() hash.Hash, pseudorandomKey, info []byte) io.Reader { + expander := hmac.New(hash, pseudorandomKey) + return &hkdf{expander, expander.Size(), info, 1, nil, nil} +} + +// New returns a Reader, from which keys can be read, using the given hash, +// secret, salt and context info. Salt and info can be nil. +func New(hash func() hash.Hash, secret, salt, info []byte) io.Reader { + prk := Extract(hash, secret, salt) + return Expand(hash, prk, info) +} diff --git a/vendor/modules.txt b/vendor/modules.txt index 442119e1..bef8394d 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -8,6 +8,10 @@ github.com/VividCortex/ewma github.com/aead/chacha20/chacha # github.com/aead/poly1305 v0.0.0-20180717145839-3fee0db0b635 github.com/aead/poly1305 +# github.com/cisco/go-hpke v0.0.0-20210215210317-01c430f1f302 +## explicit +# github.com/cloudflare/odoh-go v0.1.6 +## explicit # github.com/coreos/go-systemd v0.0.0-20191104093116-d3cd4ed1dbcf ## explicit github.com/coreos/go-systemd/activation @@ -45,6 +49,9 @@ github.com/jedisct1/go-clocksmith # github.com/jedisct1/go-dnsstamps v0.0.0-20210101121956-16fbdadcf8f5 ## explicit github.com/jedisct1/go-dnsstamps +# github.com/jedisct1/go-hpke-compact v0.0.0-20210329192501-7ceabaabca65 +## explicit +github.com/jedisct1/go-hpke-compact # github.com/jedisct1/go-minisign v0.0.0-20210106175330-e54e81d562c7 ## explicit github.com/jedisct1/go-minisign @@ -73,9 +80,12 @@ github.com/smartystreets/goconvey/convey/reporting # golang.org/x/crypto v0.0.0-20210322153248-0c34fe9e7dc2 ## explicit golang.org/x/crypto/blake2b +golang.org/x/crypto/chacha20 +golang.org/x/crypto/chacha20poly1305 golang.org/x/crypto/curve25519 golang.org/x/crypto/ed25519 golang.org/x/crypto/ed25519/internal/edwards25519 +golang.org/x/crypto/hkdf golang.org/x/crypto/internal/subtle golang.org/x/crypto/nacl/box golang.org/x/crypto/nacl/secretbox